ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,925 @@
1
+ # BayesOpt Usage Guide (Framework + How-To)
2
+
3
+ This document explains the overall framework, config fields, and recommended usage for the training/tuning/stacking pipeline under `ins_pricing/modelling/`. It is mainly for:
4
+
5
+ - Batch training via JSON config using `ins_pricing/modelling/BayesOpt_entry.py` (can be combined with `torchrun`)
6
+ - Calling the Python API directly in notebooks/scripts via `ins_pricing.BayesOpt` or `ins_pricing.bayesopt`
7
+
8
+ ---
9
+
10
+ ## 1. Which file should you run?
11
+
12
+ Files related to this workflow in `ins_pricing/modelling/`:
13
+
14
+ - `ins_pricing/modelling/bayesopt/`: Core subpackage (data preprocessing, Trainer, Optuna tuning, FT embedding/self-supervised pretraining, plotting, SHAP, etc)
15
+ - `ins_pricing/modelling/BayesOpt.py`: Compatibility entry that re-exports the new subpackage for older import paths
16
+ - `ins_pricing/modelling/BayesOpt_entry.py`: CLI batch entry (reads multiple CSVs from config, trains/tunes/saves/plots; supports DDP)
17
+ - `ins_pricing/modelling/BayesOpt_incremental.py`: Incremental training entry (append data and reuse params/models; for production incremental scenarios)
18
+ - `ins_pricing/modelling/cli_common.py`: Shared CLI helpers (path resolution, model name generation, plotting selection)
19
+ - `ins_pricing/__init__.py`: Makes `ins_pricing/` importable (e.g. `from ins_pricing import BayesOptModel` or `from ins_pricing import bayesopt`)
20
+ - `ins_pricing/modelling/notebook_utils.py`: Notebook helpers (build and run BayesOpt_entry and watchdog commands)
21
+ - `ins_pricing/modelling/Pricing_Run.py`: Unified runner (notebook/script only needs a config; `runner` decides entry/incremental/DDP/watchdog)
22
+ - `ins_pricing/modelling/demo/config_template.json`: Common config template (recommended to copy and edit)
23
+ - `ins_pricing/modelling/demo/config_incremental_template.json`: Sample incremental training config (used by `Pricing_incremental.ipynb`)
24
+ - `ins_pricing/modelling/demo/config_explain_template.json`: Explain workflow config template
25
+ - `user_packages legacy/Try/config_Pricing_FT_Stack.json`: Historical "FT stacking" config example
26
+ - Notebooks (demo): `ins_pricing/modelling/demo/Pricing_Run.ipynb`, `ins_pricing/modelling/demo/PricingSingle.ipynb`, `ins_pricing/modelling/demo/Explain_Run.ipynb`
27
+ - Deprecated examples: see `user_packages legacy/Try/*_deprecate.ipynb`
28
+
29
+ Note: `ins_pricing/modelling/demo/` is kept in the repo only; the PyPI package does not include this directory.
30
+
31
+ ---
32
+
33
+ ## 2. Overall framework (from data to model pipeline)
34
+
35
+ ### 2.1 Typical flow for a single training job (BayesOpt_entry)
36
+
37
+ Core logic in `BayesOpt_entry.py` (each dataset `model_name.csv` runs once):
38
+
39
+ 1. Read `config.json`, build dataset names from `model_list x model_categories` (e.g. `od_bc`)
40
+ 2. Load data from `data_dir/<model_name>.csv`
41
+ 3. Split train/test with `train_test_split`
42
+ 4. Construct `BayesOptModel(train_df, test_df, ...)`
43
+ 5. Run by FT role and model selection:
44
+ - If `ft_role != "model"`: run FT first (tune/train/export embedding columns), then run base models (XGB/ResNet/GLM, etc)
45
+ - If `ft_role == "model"`: FT itself is a prediction model and can be tuned/trained in parallel with others
46
+ 6. Save models and parameter snapshots, optionally plot
47
+
48
+ Extra: `BayesOpt_entry.py` / `BayesOpt_incremental.py` resolve relative paths in config as "relative to the config.json directory" (for example, if config is in `ins_pricing/modelling/demo/`, then `./Data` means `ins_pricing/modelling/demo/Data`). Currently supported path fields: `data_dir` / `output_dir` / `optuna_storage` / `gnn_graph_cache` / `best_params_files`.
49
+
50
+ If you want notebook runs to only change config (no code changes), use `ins_pricing/modelling/demo/Pricing_Run.ipynb` (it calls `ins_pricing/modelling/Pricing_Run.py`). Add a `runner` field in config to control entry/incremental/DDP/watchdog.
51
+
52
+ ### 2.2 Core components in the BayesOpt subpackage
53
+
54
+ Under `ins_pricing/modelling/bayesopt/`:
55
+
56
+ - `BayesOptConfig`: unified config (epochs, feature lists, FT role, DDP/DP, etc)
57
+ - `DatasetPreprocessor`: runs preprocessing once during `BayesOptModel` init:
58
+ - create `w_act` (weighted actual), optional `w_binary_act`
59
+ - cast categorical columns to `category`
60
+ - create `train_oht_data/test_oht_data` (one-hot)
61
+ - create `train_oht_scl_data/test_oht_scl_data` (one-hot with standardized numeric columns)
62
+ - `TrainerBase`: base trainer with `tune()` (Optuna), `train()`, `save()/load()`, and distributed Optuna sync for DDP
63
+ - Trainers (`BayesOptModel.trainers`):
64
+ - `GLMTrainer`: statsmodels GLM
65
+ - `XGBTrainer`: xgboost
66
+ - `ResNetTrainer`: PyTorch MLP/ResNet style
67
+ - `FTTrainer`: FT-Transformer (supports 3 roles)
68
+ - `GNNTrainer`: GNN (standalone model `gnn`, or used to generate geo tokens for FT)
69
+ - `OutputManager`: unified output paths (`plot/`, `Results/`, `model/`)
70
+ - `VersionManager`: save/load snapshots (`Results/versions/*_ft_best.json`, etc)
71
+
72
+ ### 2.3 BayesOpt subpackage structure (read in code order)
73
+
74
+ `BayesOpt` is now a subpackage (`ins_pricing/modelling/bayesopt/`). Recommended order:
75
+
76
+ 1) **Tools and utilities**
77
+
78
+ - `IOUtils / TrainingUtils / PlotUtils`: I/O, training utilities (batch size, tweedie loss, free_cuda), plotting helpers
79
+ - `DistributedUtils`: DDP init, rank/world_size helpers
80
+
81
+ 2) **TorchTrainerMixin (common components for torch tabular training)**
82
+
83
+ - DataLoader: `_build_dataloader()` / `_build_val_dataloader()` (prints batch/accum/workers)
84
+ - Loss: `_compute_losses()` / `_compute_weighted_loss()` (regression uses tweedie; classification uses BCEWithLogits)
85
+ - Early stop: `_early_stop_update()`
86
+
87
+ 3) **Sklearn-style model classes (core training objects)**
88
+
89
+ - `ResNetSklearn`: `fit/predict/set_params`, holds `ResNetSequential`, supports DP/DDP
90
+ - `FTTransformerSklearn`: `fit/predict/fit_unsupervised`, supports embedding output, DP/DDP
91
+ - `GraphNeuralNetSklearn`: `fit/predict/set_params`, used for geo tokens (CPU/GPU graph build, adjacency cache)
92
+
93
+ 4) **Config and preprocessing/output management**
94
+
95
+ - `BayesOptConfig`: aggregated config for task, training, parallelism, FT role (built in `BayesOptModel`)
96
+ - `OutputManager`: manage `plot/Results/model` under output root
97
+ - `VersionManager`: write snapshots to `Results/versions/` and read latest (for best_params reuse)
98
+ - `DatasetPreprocessor`: runs in `BayesOptModel.__init__`, generates data views and derived columns
99
+
100
+ 5) **Trainer system (Optuna + training + cached predictions)**
101
+
102
+ - `TrainerBase`: `tune()` (Optuna), `save()/load()`, distributed Optuna sync for DDP
103
+ - `cross_val_generic()`: generic CV/holdout evaluation logic (trainer supplies model_builder/metric_fn/fit_predict_fn)
104
+ - `_fit_predict_cache()` / `_predict_and_cache()`: after training, write predictions back to `BayesOptModel.train_data/test_data`
105
+
106
+ 6) **Orchestrator BayesOptModel**
107
+
108
+ - `BayesOptModel.optimize_model(model_key, max_evals)`: unified entry, responsible for:
109
+ - selecting objective (e.g. self-supervised objective when `ft_role=unsupervised_embedding`)
110
+ - "FT as feature" mode: export `pred_<prefix>_*` and inject into downstream features
111
+ - saving snapshots (for reuse/backtracking)
112
+ - `save_model/load_model`, `plot_*`, `compute_shap_*`, etc
113
+
114
+ ### 2.4 Key call chain (from entry to disk)
115
+
116
+ Using `BayesOpt_entry.py` as an example:
117
+
118
+ 1. `BayesOpt_entry.train_from_config()` reads CSV and builds `BayesOptModel(...)`
119
+ 2. `BayesOptModel.optimize_model(model_key)`
120
+ 3. `TrainerBase.tune()` (if `reuse_best_params` is false or no historical params found)
121
+ - calls `Trainer.cross_val()` or FT self-supervised `Trainer.cross_val_unsupervised()`
122
+ - inside `cross_val_generic()`:
123
+ - sample Optuna params
124
+ - build model `model_builder(params)`
125
+ - train and evaluate on validation via `metric_fn(...)`
126
+ 4. `Trainer.train()` trains the final model with `best_params` and caches prediction columns
127
+ 5. `Trainer.save()` saves model files; `BayesOptModel.optimize_model()` saves parameter snapshots
128
+
129
+ **Optuna under DDP (distributed coordination)**:
130
+
131
+ - Only rank0 drives Optuna sampling; trial params are broadcast to other ranks
132
+ - Non-rank0 processes do not sample; they receive params and run the same objective (multi-GPU sync)
133
+
134
+ ### 2.5 Data views and cached columns (used by training/plotting)
135
+
136
+ `DatasetPreprocessor` creates common columns in `train_data/test_data`:
137
+
138
+ - `w_act`: `target * weight`
139
+ - (if `binary_resp_nme` provided) `w_binary_act`: `binary_target * weight`
140
+
141
+ After training, `TrainerBase._predict_and_cache()` writes predictions back:
142
+
143
+ - **Scalar prediction models**:
144
+ - `pred_<prefix>` (e.g. `pred_xgb/pred_resn/pred_ft`)
145
+ - `w_pred_<prefix>` (column name `w_pred_xgb`; computed as `pred_<prefix> * weight`)
146
+ - **Multi-dim output (embedding)**:
147
+ - `pred_<prefix>_0 .. pred_<prefix>_{k-1}` (e.g. `pred_ft_emb_0..`)
148
+ - these multi-dim columns do not have `w_` weighted columns
149
+
150
+ These prediction columns are used by lift/dlift/oneway plotting and downstream stacking.
151
+
152
+ ### 2.6 Sklearn-style model classes: details and usage
153
+
154
+ Below are the three sklearn-style model classes in `bayesopt` (usually created by trainers, but can be used directly).
155
+
156
+ #### 2.6.1 ResNetSklearn (`class ResNetSklearn`)
157
+
158
+ Purpose: train a residual MLP on one-hot/standardized tabular features (regression uses Softplus, classification outputs logits).
159
+
160
+ Key parameters (common):
161
+
162
+ - `input_dim`: input dimension (typically number of one-hot columns)
163
+ - `hidden_dim`, `block_num`: width and number of residual blocks
164
+ - `learning_rate`, `epochs`, `patience`
165
+ - `use_data_parallel` / `use_ddp`
166
+
167
+ Key methods:
168
+
169
+ - `fit(X_train, y_train, w_train, X_val, y_val, w_val, trial=...)`
170
+ - `predict(X_test)`: classification uses sigmoid; regression clips to positive
171
+ - `set_params(params: dict)`: trainer writes `best_params` back to model
172
+
173
+ Minimal manual example:
174
+
175
+ ```python
176
+ from ins_pricing.BayesOpt import ResNetSklearn
177
+
178
+ # Use the one-hot standardized view from DatasetPreprocessor for X_train/X_val.
179
+ resn = ResNetSklearn(model_nme="od_bc", input_dim=X_train.shape[1], task_type="regression", epochs=50)
180
+ resn.set_params({"hidden_dim": 32, "block_num": 4, "learning_rate": 1e-3})
181
+ resn.fit(X_train, y_train, w_train, X_val, y_val, w_val)
182
+ y_pred = resn.predict(X_val)
183
+ ```
184
+
185
+ #### 2.6.2 FTTransformerSklearn (`class FTTransformerSklearn`)
186
+
187
+ Purpose: learn Transformer representations on numeric/categorical features; supports three output modes:
188
+
189
+ - supervised prediction: `predict()` returns scalar predictions
190
+ - embedding output: `predict(return_embedding=True)` returns `(N, d_model)` embeddings
191
+ - self-supervised masked reconstruction: `fit_unsupervised()` (used by `ft_role=unsupervised_embedding`)
192
+
193
+ Key details:
194
+
195
+ - Numeric columns are passed through `nan_to_num` and standardized by train mean/std in `_tensorize_split()` (reduces AMP overflow risk)
196
+ - Categorical columns record train `categories` on first build; inference uses the same categories; unknown/missing maps to "unknown index" (`len(categories)`)
197
+ - DDP uses `DistributedSampler`; the self-supervised head is computed inside forward to avoid DDP "ready twice" errors
198
+
199
+ Key methods:
200
+
201
+ - `fit(X_train, y_train, w_train, X_val, y_val, w_val, trial=..., geo_train=..., geo_val=...)`
202
+ - `predict(X_test, geo_tokens=None, return_embedding=False)`
203
+ - `fit_unsupervised(X_train, X_val=None, mask_prob_num=..., mask_prob_cat=..., ...) -> float`
204
+
205
+ Minimal manual example (self-supervised pretrain + embeddings):
206
+
207
+ ```python
208
+ from ins_pricing.BayesOpt import FTTransformerSklearn
209
+
210
+ ft = FTTransformerSklearn(
211
+ model_nme="od_bc",
212
+ num_cols=num_cols,
213
+ cat_cols=cat_cols,
214
+ d_model=64,
215
+ n_heads=4,
216
+ n_layers=4,
217
+ dropout=0.1,
218
+ epochs=30,
219
+ use_ddp=False,
220
+ )
221
+
222
+ val_loss = ft.fit_unsupervised(train_df, X_val=test_df, mask_prob_num=0.2, mask_prob_cat=0.2)
223
+ emb = ft.predict(test_df, return_embedding=True) # shape: (N, d_model)
224
+ ```
225
+
226
+ #### 2.6.3 GraphNeuralNetSklearn (`class GraphNeuralNetSklearn`)
227
+
228
+ Purpose: build a graph from `geo_feature_nmes` and train a small GNN to generate geo tokens for FT.
229
+
230
+ Key details:
231
+
232
+ - Graph building: kNN (approx via pynndescent if available; GPU graph build with PyG when memory allows)
233
+ - Adjacency cache: `graph_cache_path`
234
+ - Training: full-graph training (one forward per epoch), good for moderate-size geo features
235
+
236
+ Key methods:
237
+
238
+ - `fit(X_train, y_train, w_train, X_val, y_val, w_val, trial=...)`
239
+ - `predict(X)`: regression clips to positive; classification uses sigmoid
240
+ - `set_params(params: dict)`: rebuilds the backbone after structural changes
241
+
242
+ > In most stacking workflows you do not need to call it manually: when `geo_feature_nmes` is provided in config, `BayesOptModel` builds and caches geo tokens during init.
243
+
244
+ ### 2.7 Mapping between Trainer and Sklearn models (who calls what)
245
+
246
+ To unify tuning and final training/saving, `bayesopt` uses two layers:
247
+
248
+ - **Trainer (tuning/scheduling layer)**: Optuna, CV/holdout, feature view selection, save/load, prediction caching
249
+ - **Sklearn-style model (execution layer)**: only fit/predict (plus minimal helpers), no Optuna or output paths
250
+
251
+ Mapping overview:
252
+
253
+ - `GLMTrainer` -> statsmodels GLM (not a `*Sklearn` class; trainer builds design matrix and caches `pred_glm/w_pred_glm`)
254
+ - `XGBTrainer` -> `xgb.XGBRegressor` (`enable_categorical=True`, choose `gpu_hist/hist` based on `use_gpu`)
255
+ - `ResNetTrainer` -> `ResNetSklearn`
256
+ - Feature view: usually `train_oht_scl_data/test_oht_scl_data` with `var_nmes` (one-hot + standardize)
257
+ - Cached columns: `pred_resn/w_pred_resn`
258
+ - `FTTrainer` -> `FTTransformerSklearn`
259
+ - Feature view: raw `train_data/test_data` with `factor_nmes` (numeric + category columns; category columns must be declared in `cate_list`)
260
+ - `ft_role=model`: cache `pred_ft/w_pred_ft`
261
+ - `ft_role=embedding/unsupervised_embedding`: cache `pred_<prefix>_0..` and inject into downstream `factor_nmes`
262
+ - `GraphNeuralNetSklearn`: primarily used by `BayesOptModel` to generate geo tokens (when `geo_feature_nmes` is set)
263
+
264
+ ---
265
+
266
+ ## 3. Three FT roles (decide whether to stack)
267
+
268
+ FT role is controlled by `ft_role` (from config or CLI `--ft-role`):
269
+
270
+ ### 3.1 `ft_role="model"` (FT as a prediction model)
271
+
272
+ - Goal: train FT directly from `X -> y`, generate `pred_ft` / `w_pred_ft`
273
+ - FT participates in lift/dlift/SHAP evaluation
274
+
275
+ ### 3.2 `ft_role="embedding"` (supervised training, export embeddings only)
276
+
277
+ - Goal: still train with `X -> y` (embedding quality is influenced by the supervised signal)
278
+ - Export pooled embedding feature columns: `pred_<ft_feature_prefix>_0..`
279
+ - These columns are injected into `factor_nmes` for downstream base models (stacking)
280
+ - FT itself is not evaluated as a standalone model in lift/SHAP
281
+
282
+ ### 3.3 `ft_role="unsupervised_embedding"` (masked pretrain + embeddings)
283
+
284
+ - Goal: do not use `y`; run masked reconstruction on inputs `X` (numeric + categorical)
285
+ - Export `pred_<ft_feature_prefix>_0..` and inject to downstream features
286
+ - Suitable for "representation first, base model decision" two-stage stacking
287
+
288
+ ---
289
+
290
+ ## 4. What does Optuna optimize?
291
+
292
+ ### 4.1 Supervised models (GLM/XGB/ResNet/FT-as-model)
293
+
294
+ - `TrainerBase.tune()` calls each trainer's `cross_val()` and minimizes validation metric (default direction `minimize`)
295
+ - Regression typically uses Tweedie deviance or related loss; classification uses logloss
296
+
297
+ ### 4.2 FT self-supervised (`unsupervised_embedding`)
298
+
299
+ When `ft_role="unsupervised_embedding"`, `BayesOptModel.optimize_model("ft")` calls:
300
+
301
+ - `FTTrainer.cross_val_unsupervised()` (Optuna objective)
302
+ - Objective: validation loss of masked reconstruction (smaller is better)
303
+ - Numeric: MSE only on masked positions (multiplied by `num_loss_weight`)
304
+ - Categorical: cross-entropy only on masked positions (multiplied by `cat_loss_weight`)
305
+
306
+ Note:
307
+ - `n_heads` is not searched by default; it is derived from `d_model` with divisibility guarantees (see `FTTrainer._resolve_adaptive_heads()`).
308
+
309
+ ---
310
+
311
+ ## 5. Output directories and files (convention)
312
+
313
+ Output root comes from `output_dir` (config) or CLI `--output-dir`. Under it:
314
+
315
+ - `plot/`: plots (loss curves, lift/dlift/oneway, etc)
316
+ - `Results/`: params, metrics, version snapshots
317
+ - `Results/<model>_bestparams_<trainer>.csv`: best params per trainer after tuning
318
+ - `Results/versions/<timestamp>_<model_key>_best.json`: snapshots (best_params and config)
319
+ - `model/`: model files
320
+ - GLM/XGB: `pkl`
321
+ - PyTorch: `pth` (ResNet usually saves state_dict; FT usually saves full object)
322
+
323
+ ---
324
+
325
+ ## 6. Config fields (JSON) - common
326
+
327
+ Start by copying `ins_pricing/modelling/demo/config_template.json`. Examples: `ins_pricing/modelling/demo/config_template.json`, `ins_pricing/modelling/demo/config_incremental_template.json`, `user_packages legacy/Try/config_Pricing_FT_Stack.json`.
328
+
329
+ ### 6.1 Path resolution rules (important)
330
+
331
+ - `BayesOpt_entry.py` / `BayesOpt_incremental.py` resolve relative paths in config as "relative to the config.json directory".
332
+ - Example: config in `ins_pricing/modelling/demo/` and `data_dir: "./Data"` means `ins_pricing/modelling/demo/Data`.
333
+ - Fields resolved: `data_dir` / `output_dir` / `optuna_storage` / `gnn_graph_cache` / `best_params_files`.
334
+ - If `optuna_storage` looks like a URL (contains `://`), it is passed to Optuna as-is; otherwise it is resolved as a file path and converted to absolute.
335
+
336
+ **Data and task**
337
+
338
+ - `data_dir` (str): directory of CSV files (`<model_name>.csv` per dataset)
339
+ - `model_list` (list[str]) / `model_categories` (list[str]): build dataset names (cartesian product)
340
+ - `target` (str): target column name
341
+ - `weight` (str): weight column name
342
+ - `feature_list` (list[str]): feature column names (recommended to provide explicitly; otherwise inferred in `BayesOptModel`)
343
+ - `categorical_features` (list[str]): categorical column names (if empty, inferred in `BayesOptModel`)
344
+ - `binary_resp_nme` (str|null, optional): binary target column (for conversion curves, etc)
345
+ - `task_type` (str, optional): `"regression"` / `"classification"`, default `"regression"`
346
+
347
+ **Training and split**
348
+
349
+ - `prop_test` (float): train/test split ratio (entry splits train/test; trainers also do CV/holdout), typical `(0, 0.5]`, default `0.25`
350
+ - `rand_seed` (int): random seed, default `13`
351
+ - `epochs` (int): NN epochs (ResNet/FT/GNN), default `50`
352
+ - `use_gpu` (bool, optional): prefer GPU (actual usage depends on `torch.cuda.is_available()`)
353
+ - `resn_weight_decay` (float, optional): ResNet weight decay (L2), default `1e-4`
354
+ - `final_ensemble` (bool, optional): enable k-fold model averaging during final training, default `false`
355
+ - `final_ensemble_k` (int, optional): number of folds for averaging, default `3`
356
+ - `final_refit` (bool, optional): enable refit after early stop with full data, default `true`
357
+
358
+ **FT stacking**
359
+
360
+ - `ft_role` (str): `"model"` / `"embedding"` / `"unsupervised_embedding"`
361
+ - `"model"`: FT acts as a prediction model and outputs `pred_ft`
362
+ - `"embedding"`: FT is supervised but only exports embedding feature columns `pred_<prefix>_*`, not evaluated as final model
363
+ - `"unsupervised_embedding"`: FT uses masked reconstruction pretraining, exports `pred_<prefix>_*`
364
+ - `ft_feature_prefix` (str): prefix for exported features (creates `pred_<prefix>_0..`)
365
+ - `ft_num_numeric_tokens` (int|null): number of numeric tokens for FT; default equals number of numeric features
366
+ - `stack_model_keys` (list[str]): when `ft_role != "model"` and you want base models after FT, specify trainers to run, e.g. `["xgb","resn"]` or `["all"]`
367
+
368
+ **Parallelism and DDP**
369
+
370
+ - `use_resn_ddp` / `use_ft_ddp` / `use_gnn_ddp` (bool): use DDP (requires `torchrun`/`nproc_per_node>1`)
371
+ - `use_resn_data_parallel` / `use_ft_data_parallel` / `use_gnn_data_parallel` (bool): allow DataParallel as fallback
372
+
373
+ **Reuse historical best params (skip Optuna)**
374
+
375
+ - `reuse_best_params` (bool): `true/false`
376
+ - `true`: try `Results/versions/*_<model_key>_best.json` first, else fall back to `Results/<model>_bestparams_*.csv`
377
+ - if not found, runs Optuna normally
378
+ - `best_params_files` (dict, optional): explicit best param files, format `{"xgb":"./Results/xxx.csv","ft":"./Results/xxx.json"}`
379
+ - supports `.csv/.tsv` (read first row) and `.json` (`{"best_params": {...}}` or direct dict)
380
+ - if provided, reads directly and skips Optuna
381
+
382
+ **Optuna resume (recommended)**
383
+
384
+ - `optuna_storage` (str|null): Optuna storage (sqlite recommended)
385
+ - example: `"./Results/optuna/bayesopt.sqlite3"` (resolved to absolute path)
386
+ - or: `"sqlite:///E:/path/to/bayesopt.sqlite3"` (URL passed as-is)
387
+ - `optuna_study_prefix` (str): study name prefix; keep fixed for resuming
388
+
389
+ **XGBoost search caps (avoid very slow trials)**
390
+
391
+ - `xgb_max_depth_max` (int): max depth cap, default `25`
392
+ - `xgb_n_estimators_max` (int): tree count cap, default `500`
393
+
394
+ **GNN and geo tokens (optional)**
395
+
396
+ - `gnn_use_approx_knn` (bool): prefer approximate kNN for large samples
397
+ - `gnn_approx_knn_threshold` (int): row threshold to switch to approximate kNN
398
+ - `gnn_graph_cache` (str|null): adjacency/graph cache path
399
+ - `gnn_max_gpu_knn_nodes` (int): force CPU kNN above this node count (avoid GPU OOM)
400
+ - `gnn_knn_gpu_mem_ratio` (float): fraction of free GPU memory allowed for kNN
401
+ - `gnn_knn_gpu_mem_overhead` (float): memory overhead multiplier for kNN
402
+ - `geo_feature_nmes` (list[str]): raw columns for geo tokens (empty means no geo tokens)
403
+ - `region_province_col` / `region_city_col` (str|null): province/city columns (for region_effect features)
404
+ - `region_effect_alpha` (float): partial pooling strength (>=0)
405
+
406
+ **Plotting (optional)**
407
+
408
+ - `plot_curves` (bool): plot at end of run
409
+ - `plot` (dict): recommended unified plot settings
410
+ - `plot.enable` (bool)
411
+ - `plot.n_bins` (int): bin count
412
+ - `plot.oneway` (bool)
413
+ - `plot.lift_models` (list[str]): model keys for lift plots (e.g. `["xgb","resn"]`), empty means all trained models
414
+ - `plot.double_lift` (bool)
415
+ - `plot.double_lift_pairs` (list): supports `["xgb,resn"]` or `[["xgb","resn"]]`
416
+
417
+ **Standalone plotting (recommended)**
418
+
419
+ `ins_pricing.plotting` provides plotting utilities decoupled from training. You can use DataFrames or arrays to compare models:
420
+
421
+ - `plotting.curves`: lift/double lift/ROC/PR/KS/calibration/conversion lift
422
+ - `plotting.diagnostics`: loss curve, one-way plots
423
+ - `plotting.importance`: feature importance (supports SHAP summary)
424
+ - `plotting.geo`: geo heatmaps/contours (with map tiles for heatmap/contour)
425
+
426
+ Example (standalone):
427
+
428
+ ```python
429
+ from ins_pricing.plotting import curves, importance, geo
430
+
431
+ # Lift / Double Lift
432
+ curves.plot_lift_curve(pred, w_act, weight, n_bins=10, save_path="plot/lift.png")
433
+ curves.plot_double_lift_curve(pred1, pred2, w_act, weight, n_bins=10, save_path="plot/dlift.png")
434
+
435
+ # ROC / PR (multi-model comparison)
436
+ curves.plot_roc_curves(y_true, {"xgb": pred_xgb, "resn": pred_resn}, save_path="plot/roc.png")
437
+ curves.plot_pr_curves(y_true, {"xgb": pred_xgb, "resn": pred_resn}, save_path="plot/pr.png")
438
+
439
+ # Feature importance
440
+ importance.plot_feature_importance({"x1": 0.32, "x2": 0.18}, save_path="plot/importance.png")
441
+
442
+ # Geo heat/contour
443
+ geo.plot_geo_heatmap(df, x_col="lon", y_col="lat", value_col="loss", bins=50, save_path="plot/geo_heat.png")
444
+ geo.plot_geo_contour(df, x_col="lon", y_col="lat", value_col="loss", levels=12, save_path="plot/geo_contour.png")
445
+
446
+ # Map heatmap (requires contextily)
447
+ geo.plot_geo_heatmap_on_map(df, lon_col="lon", lat_col="lat", value_col="loss", bins=80, save_path="plot/map_heat.png")
448
+ ```
449
+
450
+ Map functions use lat/lon (EPSG:4326) by default and auto-scale view to data bounds.
451
+
452
+ The training flow also uses this plotting package (`plot_oneway`/`plot_lift`/`plot_dlift`/`plot_conversion_lift`/loss curves) for consistent maintenance.
453
+
454
+ **Model explanation (standalone module, light + deep)**
455
+
456
+ `ins_pricing.explain` provides model explanation methods decoupled from training:
457
+
458
+ - Light: permutation importance (for XGB/ResNet/FT, global)
459
+ - Deep: integrated gradients (for ResNet/FT, mainly numeric features)
460
+ - Classic: SHAP (KernelExplainer, for GLM/XGB/ResNet/FT, requires `shap`)
461
+
462
+ SHAP is optional; a prompt appears if not installed.
463
+
464
+ Example:
465
+
466
+ ```python
467
+ from ins_pricing.explain import (
468
+ permutation_importance,
469
+ resnet_integrated_gradients,
470
+ ft_integrated_gradients,
471
+ compute_shap_xgb,
472
+ )
473
+
474
+ # permutation importance
475
+ imp = permutation_importance(
476
+ predict_fn=model.predict,
477
+ X=X_valid,
478
+ y=y_valid,
479
+ sample_weight=w_valid,
480
+ metric="rmse",
481
+ n_repeats=5,
482
+ )
483
+
484
+ # ResNet integrated gradients
485
+ ig_resn = resnet_integrated_gradients(resn_model, X_valid_scl, steps=50)
486
+
487
+ # FT integrated gradients (categorical fixed; numeric/geo participate)
488
+ ig_ft = ft_integrated_gradients(ft_model, X_valid, geo_tokens=geo_tokens, steps=50)
489
+
490
+ # SHAP for XGB (BayesOptModel as context)
491
+ shap_xgb = compute_shap_xgb(model, n_background=500, n_samples=200, on_train=False)
492
+ ```
493
+
494
+ BayesOptModel also provides convenience wrappers:
495
+
496
+ ```python
497
+ model.compute_permutation_importance("resn", on_train=False, metric="rmse")
498
+ model.compute_integrated_gradients_resn(on_train=False, steps=50)
499
+ model.compute_integrated_gradients_ft(on_train=False, steps=50)
500
+ model.compute_shap_xgb(on_train=False)
501
+ model.compute_shap_glm(on_train=False)
502
+ ```
503
+
504
+ **Explain batch via config**
505
+
506
+ Use `Explain_entry.py` with config to load trained models under `output_dir/model` and run explanations on the validation set:
507
+
508
+ ```bash
509
+ python ins_pricing/modelling/Explain_entry.py --config-json ins_pricing/modelling/demo/config_explain_template.json
510
+ ```
511
+
512
+ Notebook option: `ins_pricing/modelling/demo/Explain_Run.ipynb`.
513
+
514
+ **Environment variable injection (optional)**
515
+
516
+ - `env`: values are set via `os.environ.setdefault()` (e.g. thread limits, CUDA debug)
517
+
518
+ ### 6.2 Notebook unified run: runner field (recommended)
519
+
520
+ All `Pricing_*.ipynb` are thin wrappers: they only call `Pricing_Run.run("<config.json>")`, and the run mode is controlled by config `runner`.
521
+
522
+ Notebook usage (recommended):
523
+
524
+ ```python
525
+ from ins_pricing.Pricing_Run import run
526
+ run("modelling/demo/config_template.json")
527
+ ```
528
+
529
+ CLI usage (optional):
530
+
531
+ ```bash
532
+ python ins_pricing/modelling/Pricing_Run.py --config-json ins_pricing/modelling/demo/config_template.json
533
+ ```
534
+
535
+ `runner` supports three modes:
536
+
537
+ - `runner.mode="entry"`: run `BayesOpt_entry.py`
538
+ - `runner.model_keys` (list[str]): `["glm","xgb","resn","ft","gnn"]` or includes `"all"`
539
+ - `runner.nproc_per_node` (int): `1` (single process) or `>=2` (torchrun/DDP)
540
+ - `runner.max_evals` (int): Optuna trials per model (default `50`)
541
+ - `runner.plot_curves` (bool): add `--plot-curves`
542
+ - `runner.ft_role` (str|null): if set, overrides config `ft_role`
543
+
544
+ - `runner.mode="incremental"`: run `BayesOpt_incremental.py`
545
+ - `runner.incremental_args` (list[str]): equivalent to CLI args for the incremental script
546
+ - common: `--incremental-dir/--incremental-file`, `--merge-keys`, `--timestamp-col`, `--model-keys`, `--max-evals`, `--update-base-data`, `--summary-json`, etc
547
+
548
+ - `runner.mode="explain"`: run `Explain_entry.py`
549
+ - `runner.explain_args` (list[str]): equivalent to CLI args for the explain script
550
+
551
+ watchdog (available in both modes):
552
+
553
+ - `runner.use_watchdog` (bool): enable watchdog
554
+ - `runner.idle_seconds` (int): seconds without output to treat as stuck
555
+ - `runner.max_restarts` (int): max restarts
556
+ - `runner.restart_delay_seconds` (int): delay between restarts
557
+
558
+ ---
559
+
560
+ ## 7. CLI: BayesOpt_entry.py examples
561
+
562
+ ### 7.0 Quick args reference (BayesOpt_entry.py)
563
+
564
+ Common CLI args for `BayesOpt_entry.py` (`--config-json` is required):
565
+
566
+ - `--config-json` (required, str): config path (recommend `ins_pricing/modelling/demo/xxx.json` or absolute path)
567
+ - `--model-keys` (list[str]): `glm` / `xgb` / `resn` / `ft` / `gnn` / `all`
568
+ - `--stack-model-keys` (list[str]): only when `ft_role != model`; same values as `--model-keys`
569
+ - `--max-evals` (int): Optuna trials per dataset per model
570
+ - `--plot-curves` (flag): enable plotting (also controlled by `plot_curves`/`plot.enable` in config)
571
+ - `--output-dir` (str): override config `output_dir`
572
+ - `--reuse-best-params` (flag): override config and reuse historical params to skip Optuna
573
+
574
+ DDP/DP (override config):
575
+
576
+ - `--use-resn-ddp` / `--use-ft-ddp` / `--use-gnn-ddp` (flag): force DDP for trainer
577
+ - `--use-resn-dp` / `--use-ft-dp` / `--use-gnn-dp` (flag): enable DataParallel fallback
578
+
579
+ GNN graph build (override config):
580
+
581
+ - `--gnn-no-ann` (flag): disable approximate kNN
582
+ - `--gnn-ann-threshold` (int): override `gnn_approx_knn_threshold`
583
+ - `--gnn-graph-cache` (str): override `gnn_graph_cache`
584
+ - `--gnn-max-gpu-nodes` (int): override `gnn_max_gpu_knn_nodes`
585
+ - `--gnn-gpu-mem-ratio` (float): override `gnn_knn_gpu_mem_ratio`
586
+ - `--gnn-gpu-mem-overhead` (float): override `gnn_knn_gpu_mem_overhead`
587
+
588
+ FT feature mode:
589
+
590
+ - `--ft-role` (str): `model` / `embedding` / `unsupervised_embedding`
591
+ - `--ft-feature-prefix` (str): feature prefix (e.g. `ft_emb`)
592
+ - `--ft-as-feature` (flag): compatibility alias (if config ft_role is default, set to `embedding`)
593
+
594
+ ### 7.1 Direct train/tune (single machine)
595
+
596
+ ```bash
597
+ python ins_pricing/modelling/BayesOpt_entry.py ^
598
+ --config-json ins_pricing/modelling/demo/config_template.json ^
599
+ --model-keys xgb resn ^
600
+ --max-evals 50
601
+ ```
602
+
603
+ ### 7.2 FT stacking: self-supervised FT then base models (single machine or torchrun)
604
+
605
+ If config already has `ft_role=unsupervised_embedding`, you can omit `--ft-role`.
606
+
607
+ ```bash
608
+ python ins_pricing/modelling/BayesOpt_entry.py ^
609
+ --config-json "user_packages legacy/Try/config_Pricing_FT_Stack.json" ^
610
+ --model-keys xgb resn ^
611
+ --max-evals 50
612
+ ```
613
+
614
+ DDP (multi-GPU) example:
615
+
616
+ ```bash
617
+ torchrun --standalone --nproc_per_node=2 ^
618
+ ins_pricing/modelling/BayesOpt_entry.py ^
619
+ --config-json "user_packages legacy/Try/config_Pricing_FT_Stack.json" ^
620
+ --model-keys xgb resn ^
621
+ --use-ft-ddp ^
622
+ --max-evals 50
623
+ ```
624
+
625
+ ### 7.3 Reuse historical best params (skip tuning)
626
+
627
+ ```bash
628
+ python ins_pricing/modelling/BayesOpt_entry.py ^
629
+ --config-json "user_packages legacy/Try/config_Pricing_FT_Stack.json" ^
630
+ --model-keys xgb resn ^
631
+ --reuse-best-params
632
+ ```
633
+
634
+ ### 7.4 Quick args reference (BayesOpt_incremental.py)
635
+
636
+ `BayesOpt_incremental.py` has many args; the common combo is incremental data source + merge/dedupe + models to retrain.
637
+
638
+ Common args:
639
+
640
+ - `--config-json` (required, str): reuse the same config (must include `data_dir/model_list/model_categories/target/weight/feature_list/categorical_features`)
641
+ - `--model-names` (list[str], optional): update only certain datasets (default uses `model_list x model_categories`)
642
+ - `--model-keys` (list[str]): `glm` / `xgb` / `resn` / `ft` / `gnn` / `all`
643
+ - `--incremental-dir` (Path) or `--incremental-file` (Path): incremental CSV source (choose one)
644
+ - `--incremental-template` (str): filename template for `--incremental-dir` (default `{model_name}_incremental.csv`)
645
+ - `--merge-keys` (list[str]): primary keys for dedupe after merge
646
+ - `--dedupe-keep` (str): `first` / `last`
647
+ - `--timestamp-col` (str|null): timestamp column for ordering before dedupe
648
+ - `--timestamp-descending` (flag): descending timestamp (default ascending)
649
+ - `--max-evals` (int): trial count when re-tuning is needed
650
+ - `--force-retune` (flag): force retune even if historical params exist
651
+ - `--skip-retune-missing` (flag): skip if params missing (default re-tunes)
652
+ - `--update-base-data` (flag): overwrite base CSV with merged data after success
653
+ - `--persist-merged-dir` (Path|null): optionally save merged snapshot to a separate dir
654
+ - `--summary-json` (Path|null): output summary
655
+ - `--plot-curves` (flag): plot
656
+ - `--dry-run` (flag): only merge and stats, no training
657
+
658
+ ---
659
+
660
+ ## 8. Python API: minimal runnable example (recommended to get working first)
661
+
662
+ This example shows "self-supervised FT embeddings, then XGB" (only key calls shown):
663
+
664
+ ```python
665
+ import pandas as pd
666
+ from sklearn.model_selection import train_test_split
667
+
668
+ import ins_pricing.BayesOpt as ropt
669
+
670
+ df = pd.read_csv("./Data/od_bc.csv")
671
+ train_df, test_df = train_test_split(df, test_size=0.25, random_state=13)
672
+
673
+ model = ropt.BayesOptModel(
674
+ train_df=train_df,
675
+ test_df=test_df,
676
+ model_nme="od_bc",
677
+ resp_nme="response",
678
+ weight_nme="weights",
679
+ factor_nmes=[...], # same as config feature_list
680
+ cate_list=[...], # same as config categorical_features
681
+ epochs=50,
682
+ use_ft_ddp=False,
683
+ ft_role="unsupervised_embedding",
684
+ ft_feature_prefix="ft_emb",
685
+ output_dir="./Results",
686
+ )
687
+
688
+ # 1) FT masked self-supervised pretrain + export embeddings + inject to factor_nmes
689
+ model.optimize_model("ft", max_evals=30)
690
+
691
+ # 2) Base model tune/train (uses injected pred_ft_emb_* features)
692
+ model.optimize_model("xgb", max_evals=50)
693
+
694
+ # 3) Save (or save one model only)
695
+ model.save_model()
696
+ ```
697
+
698
+ ### 8.1 Tuning stuck / resume (recommended)
699
+
700
+ If a trial hangs for a long time (e.g. the 17th trial runs for hours), stop the run and add Optuna persistent storage in `config.json`. The next run will resume from completed trials and keep total trials equal to `max_evals`.
701
+
702
+ Some XGBoost parameter combos can be extremely slow; use the cap fields to narrow the search space.
703
+
704
+ **config.json example:**
705
+ ```json
706
+ {
707
+ "optuna_storage": "./Results/optuna/pricing.sqlite3",
708
+ "optuna_study_prefix": "pricing",
709
+ "xgb_max_depth_max": 12,
710
+ "xgb_n_estimators_max": 300
711
+ }
712
+ ```
713
+
714
+ **Continue training with current best params (no tuning)**
715
+ - Set `"reuse_best_params": true` in `config.json`: it prefers `Results/versions/*_xgb_best.json` or `Results/<model>_bestparams_xgboost.csv` and trains directly.
716
+ - Or specify `"best_params_files"` (by `model_key`) to read from files and skip Optuna:
717
+
718
+ ```json
719
+ {
720
+ "best_params_files": {
721
+ "xgb": "./Results/od_bc_bestparams_xgboost.csv",
722
+ "ft": "./Results/od_bc_bestparams_fttransformer.csv"
723
+ }
724
+ }
725
+ ```
726
+
727
+ **Auto-detect hangs and restart (Watchdog)**
728
+ If a trial hangs with no output for hours, use `ins_pricing/modelling/watchdog_run.py` to monitor output: when stdout/stderr is idle for `idle_seconds`, it kills the `torchrun` process tree and restarts. With `optuna_storage`, restarts resume remaining trials.
729
+
730
+ ```bash
731
+ python ins_pricing/modelling/watchdog_run.py --idle-seconds 7200 --max-restarts 50 -- ^
732
+ python -m torch.distributed.run --standalone --nproc_per_node=2 ^
733
+ ins_pricing/modelling/BayesOpt_entry.py --config-json config.json --model-keys xgb resn --max-evals 50
734
+ ```
735
+
736
+ ---
737
+
738
+ ## 9. Model usage examples (CLI and Python)
739
+
740
+ Examples by model/trainer. All examples follow the same data contract: CSV must include `target/weight/feature_list` columns; categorical columns listed in `categorical_features`.
741
+
742
+ > Note: `model_key` follows `BayesOpt_entry.py`: `glm` / `xgb` / `resn` / `ft` / `gnn`.
743
+
744
+ ### 9.1 GLM (`model_key="glm"`)
745
+
746
+ **CLI**
747
+
748
+ ```bash
749
+ python ins_pricing/modelling/BayesOpt_entry.py ^
750
+ --config-json ins_pricing/modelling/demo/config_template.json ^
751
+ --model-keys glm ^
752
+ --max-evals 50
753
+ ```
754
+
755
+ **Python**
756
+
757
+ ```python
758
+ model.optimize_model("glm", max_evals=50)
759
+ model.trainers["glm"].save()
760
+ ```
761
+
762
+ Use case: fast, interpretable baseline and sanity check.
763
+
764
+ ### 9.2 XGBoost (`model_key="xgb"`)
765
+
766
+ **CLI**
767
+
768
+ ```bash
769
+ python ins_pricing/modelling/BayesOpt_entry.py ^
770
+ --config-json ins_pricing/modelling/demo/config_template.json ^
771
+ --model-keys xgb ^
772
+ --max-evals 100
773
+ ```
774
+
775
+ **Python**
776
+
777
+ ```python
778
+ model.optimize_model("xgb", max_evals=100)
779
+ model.trainers["xgb"].save()
780
+ ```
781
+
782
+ Use case: strong baseline, friendly to feature engineering/stacked features (including FT embeddings).
783
+
784
+ ### 9.3 ResNet (`model_key="resn"`)
785
+
786
+ ResNetTrainer is built on PyTorch and performs training and CV on one-hot/standardized feature views (well suited to high-dimensional one-hot inputs).
787
+
788
+ **CLI (single machine)**
789
+
790
+ ```bash
791
+ python ins_pricing/modelling/BayesOpt_entry.py ^
792
+ --config-json ins_pricing/modelling/demo/config_template.json ^
793
+ --model-keys resn ^
794
+ --max-evals 50
795
+ ```
796
+
797
+ **CLI (DDP, multi-GPU)**
798
+
799
+ ```bash
800
+ torchrun --standalone --nproc_per_node=2 ^
801
+ ins_pricing/modelling/BayesOpt_entry.py ^
802
+ --config-json ins_pricing/modelling/demo/config_template.json ^
803
+ --model-keys resn ^
804
+ --use-resn-ddp ^
805
+ --max-evals 50
806
+ ```
807
+
808
+ **Python**
809
+
810
+ ```python
811
+ model.optimize_model("resn", max_evals=50)
812
+ model.trainers["resn"].save()
813
+ ```
814
+
815
+ ### 9.4 FT-Transformer: as prediction model (`ft_role="model"`)
816
+
817
+ FT outputs `pred_ft` and participates in lift/SHAP (if enabled).
818
+
819
+ **CLI**
820
+
821
+ ```bash
822
+ python ins_pricing/modelling/BayesOpt_entry.py ^
823
+ --config-json ins_pricing/modelling/demo/config_template.json ^
824
+ --model-keys ft ^
825
+ --ft-role model ^
826
+ --max-evals 50
827
+ ```
828
+
829
+ **Python**
830
+
831
+ ```python
832
+ model.config.ft_role = "model"
833
+ model.optimize_model("ft", max_evals=50)
834
+ ```
835
+
836
+ ### 9.5 FT-Transformer: supervised but export embeddings only (`ft_role="embedding"`)
837
+
838
+ FT is not evaluated as a standalone model; it writes embedding features (`pred_<prefix>_0..`) and injects them into downstream features.
839
+
840
+ **CLI (generate features with FT, then train base models)**
841
+
842
+ ```bash
843
+ python ins_pricing/modelling/BayesOpt_entry.py ^
844
+ --config-json "user_packages legacy/Try/config_Pricing_FT_Stack.json" ^
845
+ --model-keys xgb resn ^
846
+ --ft-role embedding ^
847
+ --max-evals 50
848
+ ```
849
+
850
+ **Python**
851
+
852
+ ```python
853
+ model.config.ft_role = "embedding"
854
+ model.config.ft_feature_prefix = "ft_emb"
855
+ model.optimize_model("ft", max_evals=50) # generate pred_ft_emb_* and inject to factor_nmes
856
+ model.optimize_model("xgb", max_evals=100) # train/tune with injected features
857
+ ```
858
+
859
+ ### 9.6 FT-Transformer: masked self-supervised pretrain + embeddings (`ft_role="unsupervised_embedding"`)
860
+
861
+ This is a two-stage stacking mode: representation learning first, base model decision later. Optuna objective is validation loss of masked reconstruction (not `tw_power`).
862
+
863
+ **CLI (recommended: use sample config)**
864
+
865
+ ```bash
866
+ python ins_pricing/modelling/BayesOpt_entry.py ^
867
+ --config-json "user_packages legacy/Try/config_Pricing_FT_Stack.json" ^
868
+ --model-keys xgb resn ^
869
+ --max-evals 50
870
+ ```
871
+
872
+ **CLI (DDP, multi-GPU)**
873
+
874
+ ```bash
875
+ torchrun --standalone --nproc_per_node=2 ^
876
+ ins_pricing/modelling/BayesOpt_entry.py ^
877
+ --config-json "user_packages legacy/Try/config_Pricing_FT_Stack.json" ^
878
+ --model-keys xgb resn ^
879
+ --use-ft-ddp ^
880
+ --max-evals 50
881
+ ```
882
+
883
+ **Python**
884
+
885
+ ```python
886
+ model.config.ft_role = "unsupervised_embedding"
887
+ model.config.ft_feature_prefix = "ft_emb"
888
+ model.optimize_model("ft", max_evals=50) # self-supervised pretrain + export pred_ft_emb_*
889
+ model.optimize_model("xgb", max_evals=100)
890
+ model.optimize_model("resn", max_evals=50)
891
+ ```
892
+
893
+ ### 9.7 GNN (`model_key="gnn"`) and geo tokens
894
+
895
+ GNN can run as a standalone model with Optuna tuning/training: it trains on one-hot/standardized features and writes `pred_gnn` / `w_pred_gnn` to `train_data/test_data`.
896
+
897
+ **CLI**
898
+
899
+ ```bash
900
+ python ins_pricing/modelling/BayesOpt_entry.py ^
901
+ --config-json ins_pricing/modelling/demo/config_template.json ^
902
+ --model-keys gnn ^
903
+ --max-evals 50
904
+ ```
905
+
906
+ GNN can also generate geo tokens: when config includes `geo_feature_nmes`, it trains a geo encoder to produce `geo_token_*` and injects those tokens into FT.
907
+
908
+ Implementation: geo token generation is handled by `GNNTrainer.prepare_geo_tokens()`. Tokens are stored in `BayesOptModel.train_geo_tokens/test_geo_tokens` and used as FT inputs during training/prediction.
909
+
910
+ ---
911
+
912
+ ## 10. FAQ (quick checks)
913
+
914
+ ### 10.1 torchrun OMP_NUM_THREADS warning
915
+
916
+ This is a common torchrun message: it sets per-process threads to 1 to avoid CPU overload. You can override it via config `env`.
917
+
918
+ ### 10.2 Optuna loss shows inf
919
+
920
+ This usually means NaN/inf during training or validation (numeric overflow, data issues, etc). Check:
921
+
922
+ - data ranges and NaNs (use `nan_to_num`, scaling)
923
+ - learning rate and AMP (reduce LR or disable AMP)
924
+ - gradient clipping (already enabled for torch models)
925
+ - unstable configs (cap XGBoost depth/estimators)