ins-pricing 0.1.11-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. ins_pricing/README.md +9 -6
  2. ins_pricing/__init__.py +3 -11
  3. ins_pricing/cli/BayesOpt_entry.py +24 -0
  4. ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
  5. ins_pricing/cli/Explain_Run.py +25 -0
  6. ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
  7. ins_pricing/cli/Pricing_Run.py +25 -0
  8. ins_pricing/cli/__init__.py +1 -0
  9. ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
  10. ins_pricing/cli/utils/__init__.py +1 -0
  11. ins_pricing/cli/utils/cli_common.py +320 -0
  12. ins_pricing/cli/utils/cli_config.py +375 -0
  13. ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
  14. {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
  15. ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
  16. ins_pricing/docs/modelling/README.md +34 -0
  17. ins_pricing/modelling/__init__.py +57 -6
  18. ins_pricing/modelling/core/__init__.py +1 -0
  19. ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
  20. ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
  21. ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
  22. ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
  23. ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
  24. ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
  25. ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
  26. ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
  27. ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
  28. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
  29. ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
  30. ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
  31. ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
  32. ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
  33. ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
  34. ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
  35. ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
  36. ins_pricing/modelling/core/evaluation.py +115 -0
  37. ins_pricing/production/__init__.py +4 -0
  38. ins_pricing/production/preprocess.py +71 -0
  39. ins_pricing/setup.py +10 -5
  40. {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
  41. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
  42. ins_pricing-0.2.0.dist-info/RECORD +125 -0
  43. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
  44. ins_pricing/modelling/BayesOpt_entry.py +0 -633
  45. ins_pricing/modelling/Explain_Run.py +0 -36
  46. ins_pricing/modelling/Pricing_Run.py +0 -36
  47. ins_pricing/modelling/README.md +0 -33
  48. ins_pricing/modelling/bayesopt/models.py +0 -2196
  49. ins_pricing/modelling/bayesopt/trainers.py +0 -2446
  50. ins_pricing/modelling/cli_common.py +0 -136
  51. ins_pricing/modelling/tests/test_plotting.py +0 -63
  52. ins_pricing/modelling/watchdog_run.py +0 -211
  53. ins_pricing-0.1.11.dist-info/RECORD +0 -169
  54. ins_pricing_gemini/__init__.py +0 -23
  55. ins_pricing_gemini/governance/__init__.py +0 -20
  56. ins_pricing_gemini/governance/approval.py +0 -93
  57. ins_pricing_gemini/governance/audit.py +0 -37
  58. ins_pricing_gemini/governance/registry.py +0 -99
  59. ins_pricing_gemini/governance/release.py +0 -159
  60. ins_pricing_gemini/modelling/Explain_Run.py +0 -36
  61. ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
  62. ins_pricing_gemini/modelling/__init__.py +0 -151
  63. ins_pricing_gemini/modelling/cli_common.py +0 -141
  64. ins_pricing_gemini/modelling/config.py +0 -249
  65. ins_pricing_gemini/modelling/config_preprocess.py +0 -254
  66. ins_pricing_gemini/modelling/core.py +0 -741
  67. ins_pricing_gemini/modelling/data_container.py +0 -42
  68. ins_pricing_gemini/modelling/explain/__init__.py +0 -55
  69. ins_pricing_gemini/modelling/explain/gradients.py +0 -334
  70. ins_pricing_gemini/modelling/explain/metrics.py +0 -176
  71. ins_pricing_gemini/modelling/explain/permutation.py +0 -155
  72. ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
  73. ins_pricing_gemini/modelling/features.py +0 -215
  74. ins_pricing_gemini/modelling/model_manager.py +0 -148
  75. ins_pricing_gemini/modelling/model_plotting.py +0 -463
  76. ins_pricing_gemini/modelling/models.py +0 -2203
  77. ins_pricing_gemini/modelling/notebook_utils.py +0 -294
  78. ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
  79. ins_pricing_gemini/modelling/plotting/common.py +0 -63
  80. ins_pricing_gemini/modelling/plotting/curves.py +0 -572
  81. ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
  82. ins_pricing_gemini/modelling/plotting/geo.py +0 -362
  83. ins_pricing_gemini/modelling/plotting/importance.py +0 -121
  84. ins_pricing_gemini/modelling/run_logging.py +0 -133
  85. ins_pricing_gemini/modelling/tests/conftest.py +0 -8
  86. ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
  87. ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
  88. ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
  89. ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
  90. ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
  91. ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
  92. ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
  93. ins_pricing_gemini/modelling/trainers.py +0 -2447
  94. ins_pricing_gemini/modelling/utils.py +0 -1020
  95. ins_pricing_gemini/pricing/__init__.py +0 -27
  96. ins_pricing_gemini/pricing/calibration.py +0 -39
  97. ins_pricing_gemini/pricing/data_quality.py +0 -117
  98. ins_pricing_gemini/pricing/exposure.py +0 -85
  99. ins_pricing_gemini/pricing/factors.py +0 -91
  100. ins_pricing_gemini/pricing/monitoring.py +0 -99
  101. ins_pricing_gemini/pricing/rate_table.py +0 -78
  102. ins_pricing_gemini/production/__init__.py +0 -21
  103. ins_pricing_gemini/production/drift.py +0 -30
  104. ins_pricing_gemini/production/monitoring.py +0 -143
  105. ins_pricing_gemini/production/scoring.py +0 -40
  106. ins_pricing_gemini/reporting/__init__.py +0 -11
  107. ins_pricing_gemini/reporting/report_builder.py +0 -72
  108. ins_pricing_gemini/reporting/scheduler.py +0 -45
  109. ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
  110. ins_pricing_gemini/scripts/Explain_entry.py +0 -545
  111. ins_pricing_gemini/scripts/__init__.py +0 -1
  112. ins_pricing_gemini/scripts/train.py +0 -568
  113. ins_pricing_gemini/setup.py +0 -55
  114. ins_pricing_gemini/smoke_test.py +0 -28
  115. /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
  116. /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
  117. /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
  118. /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
  119. /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
  120. /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
  121. /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
  122. /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
  123. /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
  124. /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
  125. /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
  126. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
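Taken together, the moves above are a repack of the 0.1.x layout: the bayesopt internals migrate from ins_pricing/modelling/bayesopt/ into ins_pricing/modelling/core/bayesopt/ (with models.py and trainers.py split into per-model modules), the CLI entry points consolidate under ins_pricing/cli/, tests move to ins_pricing/tests/modelling/, and the parallel ins_pricing_gemini tree is removed outright. A minimal compatibility sketch of what this implies for import paths follows; the module and class names are inferred from the renames listed above, not from documented API, so verify against the installed wheel:

    # Hypothetical shim for code written against 0.1.11 -- paths inferred
    # from the file moves in this diff, not from released documentation.
    try:
        # 0.2.0: bayesopt core now lives under modelling/core/
        from ins_pricing.modelling.core.bayesopt.core import BayesOptModel
    except ImportError:
        # 0.1.11: pre-refactor location
        from ins_pricing.modelling.bayesopt.core import BayesOptModel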
ins_pricing_gemini/modelling/core.py
@@ -1,741 +0,0 @@
- from __future__ import annotations
-
- from dataclasses import asdict
- from datetime import datetime
- import os
- from typing import Any, Dict, List, Optional
-
- try: # matplotlib is optional; avoid hard import failures in headless/minimal envs
-     import matplotlib
-     if os.name != "nt" and not os.environ.get("DISPLAY") and not os.environ.get("MPLBACKEND"):
-         matplotlib.use("Agg")
-     import matplotlib.pyplot as plt
-     _MPL_IMPORT_ERROR: Optional[BaseException] = None
- except Exception as exc: # pragma: no cover - optional dependency
-     plt = None # type: ignore[assignment]
-     _MPL_IMPORT_ERROR = exc
- import numpy as np
- import pandas as pd
- import torch
- import statsmodels.api as sm
- from sklearn.model_selection import ShuffleSplit
- from sklearn.preprocessing import StandardScaler
-
- from .config import BayesOptConfig
- from .config_preprocess import DatasetPreprocessor, OutputManager, VersionManager
- from .data_container import DataContainer
- from .model_manager import ModelManager
- from .models import GraphNeuralNetSklearn
- from .trainers import FTTrainer, GLMTrainer, GNNTrainer, ResNetTrainer, XGBTrainer
- from .utils import EPS, PlotUtils, infer_factor_and_cate_list, set_global_seed
-
- # Feature and Plotting modules
- from .features import _add_region_effect, _prepare_geo_tokens, _build_geo_tokens
- from .model_plotting import (
-     plot_oneway,
-     plot_lift,
-     plot_dlift,
-     plot_conversion_lift,
-     _plot_skip
- )
-
- try:
-     from .plotting import curves as plot_curves
-     from .plotting import diagnostics as plot_diagnostics
-     from .plotting.common import PlotStyle, finalize_figure
-     from .explain import gradients as explain_gradients
-     from .explain import permutation as explain_permutation
-     from .explain import shap_utils as explain_shap
- except Exception: # pragma: no cover - optional for legacy imports
-     try: # best-effort for non-package imports
-         from ins_pricing.modelling.plotting import curves as plot_curves
-         from ins_pricing.modelling.plotting import diagnostics as plot_diagnostics
-         from ins_pricing.modelling.plotting.common import PlotStyle, finalize_figure
-         from ins_pricing.modelling.explain import gradients as explain_gradients
-         from ins_pricing.modelling.explain import permutation as explain_permutation
-         from ins_pricing.modelling.explain import shap_utils as explain_shap
-     except Exception: # pragma: no cover
-         plot_curves = None
-         plot_diagnostics = None
-         PlotStyle = None
-         finalize_figure = None
-         explain_gradients = None
-         explain_permutation = None
-         explain_shap = None
-
-
- # BayesOpt orchestration and SHAP utilities
- # =============================================================================
- class BayesOptModel:
-
-     # Property proxies to maintain backward compatibility with Trainers
-     @property
-     def train_data(self): return self.data_container.train_data
-     @property
-     def test_data(self): return self.data_container.test_data
-     @property
-     def train_oht_data(self): return self.data_container.train_oht_data
-     @property
-     def test_oht_data(self): return self.data_container.test_oht_data
-     @property
-     def train_oht_scl_data(self): return self.data_container.train_oht_scl_data
-     @property
-     def test_oht_scl_data(self): return self.data_container.test_oht_scl_data
-     @property
-     def var_nmes(self): return self.data_container.var_nmes
-     @property
-     def num_features(self): return self.data_container.num_features
-     @property
-     def cat_categories_for_shap(self): return self.data_container.cat_categories_for_shap
-     @property
-     def train_geo_tokens(self): return self.data_container.train_geo_tokens
-     @train_geo_tokens.setter
-     def train_geo_tokens(self, val): self.data_container.train_geo_tokens = val
-     @property
-     def test_geo_tokens(self): return self.data_container.test_geo_tokens
-     @test_geo_tokens.setter
-     def test_geo_tokens(self, val): self.data_container.test_geo_tokens = val
-     @property
-     def geo_token_cols(self): return self.data_container.geo_token_cols
-     @geo_token_cols.setter
-     def geo_token_cols(self, val): self.data_container.geo_token_cols = val
-
-     def __init__(self, train_data, test_data,
-                  model_nme, resp_nme, weight_nme, factor_nmes: Optional[List[str]] = None, task_type='regression',
-                  binary_resp_nme=None,
-                  cate_list=None, prop_test=0.25, rand_seed=None,
-                  epochs=100, use_gpu=True,
-                  use_resn_data_parallel: bool = False, use_ft_data_parallel: bool = False,
-                  use_gnn_data_parallel: bool = False,
-                  use_resn_ddp: bool = False, use_ft_ddp: bool = False,
-                  use_gnn_ddp: bool = False,
-                  output_dir: Optional[str] = None,
-                  gnn_use_approx_knn: bool = True,
-                  gnn_approx_knn_threshold: int = 50000,
-                  gnn_graph_cache: Optional[str] = None,
-                  gnn_max_gpu_knn_nodes: Optional[int] = 200000,
-                  gnn_knn_gpu_mem_ratio: float = 0.9,
-                  gnn_knn_gpu_mem_overhead: float = 2.0,
-                  ft_role: str = "model",
-                  ft_feature_prefix: str = "ft_emb",
-                  ft_num_numeric_tokens: Optional[int] = None,
-                  infer_categorical_max_unique: int = 50,
-                  infer_categorical_max_ratio: float = 0.05,
-                  reuse_best_params: bool = False,
-                  xgb_max_depth_max: int = 25,
-                  xgb_n_estimators_max: int = 500,
-                  resn_weight_decay: Optional[float] = None,
-                  final_ensemble: bool = False,
-                  final_ensemble_k: int = 3,
-                  final_refit: bool = True,
-                  optuna_storage: Optional[str] = None,
-                  optuna_study_prefix: Optional[str] = None,
-                  best_params_files: Optional[Dict[str, str]] = None):
-         """Orchestrate BayesOpt training across multiple trainers."""
-         inferred_factors, inferred_cats = infer_factor_and_cate_list(
-             train_df=train_data,
-             test_df=test_data,
-             resp_nme=resp_nme,
-             weight_nme=weight_nme,
-             binary_resp_nme=binary_resp_nme,
-             factor_nmes=factor_nmes,
-             cate_list=cate_list,
-             infer_categorical_max_unique=int(infer_categorical_max_unique),
-             infer_categorical_max_ratio=float(infer_categorical_max_ratio),
-         )
-
-         config_args = {
-             "model_nme": model_nme,
-             "task_type": task_type,
-             "resp_nme": resp_nme,
-             "weight_nme": weight_nme,
-             "factor_nmes": list(inferred_factors),
-             "binary_resp_nme": binary_resp_nme,
-             "cate_list": list(inferred_cats) if inferred_cats else None,
-             "prop_test": prop_test,
-             "rand_seed": rand_seed,
-             "epochs": epochs,
-             "use_gpu": use_gpu,
-             "xgb_max_depth_max": int(xgb_max_depth_max),
-             "xgb_n_estimators_max": int(xgb_n_estimators_max),
-             "use_resn_data_parallel": use_resn_data_parallel,
-             "use_ft_data_parallel": use_ft_data_parallel,
-             "use_resn_ddp": use_resn_ddp,
-             "use_gnn_data_parallel": use_gnn_data_parallel,
-             "use_ft_ddp": use_ft_ddp,
-             "use_gnn_ddp": use_gnn_ddp,
-             "gnn_use_approx_knn": gnn_use_approx_knn,
-             "gnn_approx_knn_threshold": gnn_approx_knn_threshold,
-             "gnn_graph_cache": gnn_graph_cache,
-             "gnn_max_gpu_knn_nodes": gnn_max_gpu_knn_nodes,
-             "gnn_knn_gpu_mem_ratio": gnn_knn_gpu_mem_ratio,
-             "gnn_knn_gpu_mem_overhead": gnn_knn_gpu_mem_overhead,
-             "output_dir": output_dir,
-             "optuna_storage": optuna_storage,
-             "optuna_study_prefix": optuna_study_prefix,
-             "best_params_files": best_params_files,
-             "ft_role": str(ft_role or "model"),
-             "ft_feature_prefix": str(ft_feature_prefix or "ft_emb"),
-             "ft_num_numeric_tokens": ft_num_numeric_tokens,
-             "reuse_best_params": bool(reuse_best_params),
-             "resn_weight_decay": float(resn_weight_decay) if resn_weight_decay is not None else 1e-4,
-             "final_ensemble": bool(final_ensemble),
-             "final_ensemble_k": int(final_ensemble_k),
-             "final_refit": bool(final_refit),
-         }
-         cfg = BayesOptConfig.from_legacy_dict(config_args)
-         self.config = cfg
-         self.model_nme = cfg.model_nme
-         self.task_type = cfg.task_type
-         self.resp_nme = cfg.resp_nme
-         self.weight_nme = cfg.weight_nme
-         self.factor_nmes = cfg.factor_nmes
-         self.binary_resp_nme = cfg.binary_resp_nme
-         self.cate_list = list(cfg.cate_list or [])
-         self.prop_test = cfg.prop_test
-         self.epochs = cfg.epochs
-         self.rand_seed = cfg.rand_seed if cfg.rand_seed is not None else np.random.randint(
-             1, 10000)
-         set_global_seed(int(self.rand_seed))
-         self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
-         self.output_manager = OutputManager(
-             cfg.output_dir or os.getcwd(), self.model_nme)
-
-         preprocessor = DatasetPreprocessor(train_data, test_data, cfg).run()
-
-         self.data_container = DataContainer(
-             train_data=preprocessor.train_data,
-             test_data=preprocessor.test_data
-         )
-         self.data_container.set_preprocessed_data(preprocessor)
-
-         self.geo_gnn_model: Optional[GraphNeuralNetSklearn] = None
-
-         # Use extracted feature engineering logic
-         _add_region_effect(self)
-
-         self.cv = ShuffleSplit(n_splits=int(1/self.prop_test),
-                                test_size=self.prop_test,
-                                random_state=self.rand_seed)
-         if self.task_type == 'classification':
-             self.obj = 'binary:logistic'
-         else:
-             if 'f' in self.model_nme:
-                 self.obj = 'count:poisson'
-             elif 's' in self.model_nme:
-                 self.obj = 'reg:gamma'
-             elif 'bc' in self.model_nme:
-                 self.obj = 'reg:tweedie'
-             else:
-                 self.obj = 'reg:tweedie'
-         self.fit_params = {
-             'sample_weight': self.train_data[self.weight_nme].values
-         }
-         self.model_label: List[str] = []
-         self.optuna_storage = cfg.optuna_storage
-         self.optuna_study_prefix = cfg.optuna_study_prefix or "bayesopt"
-
-         self.version_manager = VersionManager(self.output_manager)
-
-         self.model_manager = ModelManager(self)
-         self._prepare_geo_tokens()
-         self.xgb_best = None
-         self.resn_best = None
-         self.gnn_best = None
-         self.glm_best = None
-         self.ft_best = None
-         self.best_xgb_params = None
-         self.best_resn_params = None
-         self.best_gnn_params = None
-         self.best_ft_params = None
-         self.best_xgb_trial = None
-         self.best_resn_trial = None
-         self.best_gnn_trial = None
-         self.best_ft_trial = None
-         self.best_glm_params = None
-         self.best_glm_trial = None
-         self.xgb_load = None
-         self.resn_load = None
-         self.gnn_load = None
-         self.ft_load = None
-         self.version_manager = VersionManager(self.output_manager)
-
-     def default_tweedie_power(self, obj: Optional[str] = None) -> Optional[float]:
-         if self.task_type == 'classification':
-             return None
-         objective = obj or getattr(self, "obj", None)
-         if objective == 'count:poisson':
-             return 1.0
-         if objective == 'reg:gamma':
-             return 2.0
-         return 1.5
-
-     def _build_geo_tokens(self, params_override: Optional[Dict[str, Any]] = None):
-         return _build_geo_tokens(self, params_override)
-
-     def _prepare_geo_tokens(self) -> None:
-         return _prepare_geo_tokens(self)
-
-     # Note: _add_region_effect was called in __init__ directly via the imported function.
-     # We remove the method definition here or keep it as a wrapper if called elsewhere.
-     # It seems it's only called in __init__, so we can remove strict method definition
-     # unless subclasses use it. To be safe, let's keep it wrapper.
-     def _add_region_effect(self) -> None:
-         _add_region_effect(self)
-
-     # Plotting wrappers
-     def plot_oneway(self, n_bins=10):
-         plot_oneway(self, n_bins)
-
-     def _require_trainer(self, model_key: str) -> "TrainerBase":
-         return self.model_manager.get_trainer(model_key)
-
-     def _pred_vector_columns(self, pred_prefix: str) -> List[str]:
-         col_prefix = f"pred_{pred_prefix}_"
-         cols = [c for c in self.train_data.columns if c.startswith(col_prefix)]
-         def sort_key(name: str):
-             tail = name.rsplit("_", 1)[-1]
-             try:
-                 return (0, int(tail))
-             except Exception:
-                 return (1, tail)
-         cols.sort(key=sort_key)
-         return cols
-
-     def _inject_pred_features(self, pred_prefix: str) -> List[str]:
-         cols = self._pred_vector_columns(pred_prefix)
-         if cols:
-             self.add_numeric_features_from_columns(cols)
-             return cols
-         scalar_col = f"pred_{pred_prefix}"
-         if scalar_col in self.train_data.columns:
-             self.add_numeric_feature_from_column(scalar_col)
-             return [scalar_col]
-         return []
-
-     def _maybe_load_best_params(self, model_key: str, trainer: "TrainerBase") -> None:
-         pass
-
-     def optimize_model(self, model_key: str, max_evals: int = 100):
-         self.model_manager.optimize(model_key, max_evals)
-
-     def add_numeric_feature_from_column(self, col_name: str) -> None:
-         if col_name not in self.train_data.columns or col_name not in self.test_data.columns:
-             raise KeyError(
-                 f"Column '{col_name}' must exist in both train_data and test_data.")
-
-         if col_name not in self.factor_nmes:
-             self.factor_nmes.append(col_name)
-         if col_name not in self.config.factor_nmes:
-             self.config.factor_nmes.append(col_name)
-
-         if col_name not in self.cate_list and col_name not in self.num_features:
-             self.num_features.append(col_name)
-
-         if self.train_oht_data is not None and self.test_oht_data is not None:
-             self.train_oht_data[col_name] = self.train_data[col_name].values
-             self.test_oht_data[col_name] = self.test_data[col_name].values
-         if self.train_oht_scl_data is not None and self.test_oht_scl_data is not None:
-             scaler = StandardScaler()
-             tr = self.train_data[col_name].to_numpy(
-                 dtype=np.float32, copy=False).reshape(-1, 1)
-             te = self.test_data[col_name].to_numpy(
-                 dtype=np.float32, copy=False).reshape(-1, 1)
-             self.train_oht_scl_data[col_name] = scaler.fit_transform(
-                 tr).astype(np.float32).reshape(-1)
-             self.test_oht_scl_data[col_name] = scaler.transform(te).astype(np.float32).reshape(-1)
-
-         if col_name not in self.var_nmes:
-             self.var_nmes.append(col_name)
-
-     def add_numeric_features_from_columns(self, col_names: List[str]) -> None:
-         if not col_names:
-             return
-         missing = [
-             col for col in col_names
-             if col not in self.train_data.columns or col not in self.test_data.columns
-         ]
-         if missing:
-             raise KeyError(
-                 f"Column(s) {missing} must exist in both train_data and test_data."
-             )
-
-         for col_name in col_names:
-             if col_name not in self.factor_nmes:
-                 self.factor_nmes.append(col_name)
-             if col_name not in self.config.factor_nmes:
-                 self.config.factor_nmes.append(col_name)
-             if col_name not in self.cate_list and col_name not in self.num_features:
-                 self.num_features.append(col_name)
-             if col_name not in self.var_nmes:
-                 self.var_nmes.append(col_name)
-
-         if self.train_oht_data is not None and self.test_oht_data is not None:
-             self.train_oht_data.loc[:, col_names] = self.train_data[col_names].to_numpy(copy=False)
-             self.test_oht_data.loc[:, col_names] = self.test_data[col_names].to_numpy(copy=False)
-
-         if self.train_oht_scl_data is not None and self.test_oht_scl_data is not None:
-             scaler = StandardScaler()
-             tr = self.train_data[col_names].to_numpy(dtype=np.float32, copy=False)
-             te = self.test_data[col_names].to_numpy(dtype=np.float32, copy=False)
-             self.train_oht_scl_data.loc[:, col_names] = scaler.fit_transform(tr).astype(np.float32)
-             self.test_oht_scl_data.loc[:, col_names] = scaler.transform(te).astype(np.float32)
-
-     def prepare_ft_as_feature(self, max_evals: int = 50, pred_prefix: str = "ft_feat") -> str:
-         ft_trainer = self._require_trainer("ft")
-         ft_trainer.tune(max_evals=max_evals)
-         if hasattr(ft_trainer, "train_as_feature"):
-             ft_trainer.train_as_feature(pred_prefix=pred_prefix)
-         else:
-             ft_trainer.train()
-         feature_col = f"pred_{pred_prefix}"
-         self.add_numeric_feature_from_column(feature_col)
-         return feature_col
-
-     def prepare_ft_embedding_as_features(self, max_evals: int = 50, pred_prefix: str = "ft_emb") -> List[str]:
-         ft_trainer = self._require_trainer("ft")
-         ft_trainer.tune(max_evals=max_evals)
-         if hasattr(ft_trainer, "train_as_feature"):
-             ft_trainer.train_as_feature(
-                 pred_prefix=pred_prefix, feature_mode="embedding")
-         else:
-             raise RuntimeError(
-                 "FT trainer does not support embedding feature mode.")
-         cols = self._pred_vector_columns(pred_prefix)
-         if not cols:
-             raise RuntimeError(
-                 f"No embedding columns were generated for prefix '{pred_prefix}'.")
-         self.add_numeric_features_from_columns(cols)
-         return cols
-
-     def prepare_ft_unsupervised_embedding_as_features(self,
-                                                       pred_prefix: str = "ft_uemb",
-                                                       params: Optional[Dict[str,
-                                                                             Any]] = None,
-                                                       mask_prob_num: float = 0.15,
-                                                       mask_prob_cat: float = 0.15,
-                                                       num_loss_weight: float = 1.0,
-                                                       cat_loss_weight: float = 1.0) -> List[str]:
-         ft_trainer = self._require_trainer("ft")
-         if not hasattr(ft_trainer, "pretrain_unsupervised_as_feature"):
-             raise RuntimeError(
-                 "FT trainer does not support unsupervised pretraining.")
-         ft_trainer.pretrain_unsupervised_as_feature(
-             pred_prefix=pred_prefix,
-             params=params,
-             mask_prob_num=mask_prob_num,
-             mask_prob_cat=mask_prob_cat,
-             num_loss_weight=num_loss_weight,
-             cat_loss_weight=cat_loss_weight
-         )
-         cols = self._pred_vector_columns(pred_prefix)
-         if not cols:
-             raise RuntimeError(
-                 f"No embedding columns were generated for prefix '{pred_prefix}'.")
-         self.add_numeric_features_from_columns(cols)
-         return cols
-
-     def bayesopt_glm(self, max_evals=50):
-         self.optimize_model('glm', max_evals)
-
-     def bayesopt_xgb(self, max_evals=100):
-         self.optimize_model('xgb', max_evals)
-
-     def bayesopt_resnet(self, max_evals=100):
-         self.optimize_model('resn', max_evals)
-
-     def bayesopt_gnn(self, max_evals=50):
-         self.optimize_model('gnn', max_evals)
-
-     def bayesopt_ft(self, max_evals=50):
-         self.optimize_model('ft', max_evals)
-
-     def plot_lift(self, model_label, pred_nme, n_bins=10):
-         plot_lift(self, model_label, pred_nme, n_bins)
-
-     def plot_dlift(self, model_comp: List[str] = ['xgb', 'resn'], n_bins: int = 10) -> None:
-         plot_dlift(self, model_comp, n_bins)
-
-     def plot_conversion_lift(self, model_pred_col: str, n_bins: int = 20):
-         plot_conversion_lift(self, model_pred_col, n_bins)
-
-     # ========= Lightweight explainability: Permutation Importance =========
-     def compute_permutation_importance(self,
-                                        model_key: str,
-                                        on_train: bool = True,
-                                        metric: Any = "auto",
-                                        n_repeats: int = 5,
-                                        max_rows: int = 5000,
-                                        random_state: Optional[int] = None):
-         if explain_permutation is None:
-             raise RuntimeError("explain.permutation is not available.")
-
-         model_key = str(model_key)
-         data = self.train_data if on_train else self.test_data
-         if self.resp_nme not in data.columns:
-             raise RuntimeError("Missing response column for permutation importance.")
-         y = data[self.resp_nme]
-         w = data[self.weight_nme] if self.weight_nme in data.columns else None
-
-         if model_key == "resn":
-             if self.resn_best is None:
-                 raise RuntimeError("ResNet model not trained.")
-             X = self.train_oht_scl_data if on_train else self.test_oht_scl_data
-             if X is None:
-                 raise RuntimeError("Missing standardized features for ResNet.")
-             X = X[self.var_nmes]
-             predict_fn = lambda df: self.resn_best.predict(df)
-         elif model_key == "ft":
-             if self.ft_best is None:
-                 raise RuntimeError("FT model not trained.")
-             if str(self.config.ft_role) != "model":
-                 raise RuntimeError("FT role is not 'model'; FT predictions unavailable.")
-             X = data[self.factor_nmes]
-             geo_tokens = self.train_geo_tokens if on_train else self.test_geo_tokens
-             geo_np = None
-             if geo_tokens is not None:
-                 geo_np = geo_tokens.to_numpy(dtype=np.float32, copy=False)
-             predict_fn = lambda df, geo=geo_np: self.ft_best.predict(df, geo_tokens=geo)
-         elif model_key == "xgb":
-             if self.xgb_best is None:
-                 raise RuntimeError("XGB model not trained.")
-             X = data[self.factor_nmes]
-             predict_fn = lambda df: self.xgb_best.predict(df)
-         else:
-             raise ValueError("Unsupported model_key for permutation importance.")
-
-         return explain_permutation.permutation_importance(
-             predict_fn,
-             X,
-             y,
-             sample_weight=w,
-             metric=metric,
-             task_type=self.task_type,
-             n_repeats=n_repeats,
-             random_state=random_state,
-             max_rows=max_rows,
-         )
-
-     # ========= Deep explainability: Integrated Gradients =========
-     def compute_integrated_gradients_resn(self,
-                                           on_train: bool = True,
-                                           baseline: Any = None,
-                                           steps: int = 50,
-                                           batch_size: int = 256,
-                                           target: Optional[int] = None):
-         if explain_gradients is None:
-             raise RuntimeError("explain.gradients is not available.")
-         if self.resn_best is None:
-             raise RuntimeError("ResNet model not trained.")
-         X = self.train_oht_scl_data if on_train else self.test_oht_scl_data
-         if X is None:
-             raise RuntimeError("Missing standardized features for ResNet.")
-         X = X[self.var_nmes]
-         return explain_gradients.resnet_integrated_gradients(
-             self.resn_best,
-             X,
-             baseline=baseline,
-             steps=steps,
-             batch_size=batch_size,
-             target=target,
-         )
-
-     def compute_integrated_gradients_ft(self,
-                                         on_train: bool = True,
-                                         geo_tokens: Optional[np.ndarray] = None,
-                                         baseline_num: Any = None,
-                                         baseline_geo: Any = None,
-                                         steps: int = 50,
-                                         batch_size: int = 256,
-                                         target: Optional[int] = None):
-         if explain_gradients is None:
-             raise RuntimeError("explain.gradients is not available.")
-         if self.ft_best is None:
-             raise RuntimeError("FT model not trained.")
-         if str(self.config.ft_role) != "model":
-             raise RuntimeError("FT role is not 'model'; FT explanations unavailable.")
-
-         data = self.train_data if on_train else self.test_data
-         X = data[self.factor_nmes]
-
-         if geo_tokens is None and getattr(self.ft_best, "num_geo", 0) > 0:
-             tokens_df = self.train_geo_tokens if on_train else self.test_geo_tokens
-             if tokens_df is not None:
-                 geo_tokens = tokens_df.to_numpy(dtype=np.float32, copy=False)
-
-         return explain_gradients.ft_integrated_gradients(
-             self.ft_best,
-             X,
-             geo_tokens=geo_tokens,
-             baseline_num=baseline_num,
-             baseline_geo=baseline_geo,
-             steps=steps,
-             batch_size=batch_size,
-             target=target,
-         )
-
-     def save_model(self, model_name=None):
-         keys = [model_name] if model_name else self.model_manager.trainers.keys()
-         for key in keys:
-             if key in self.model_manager.trainers:
-                 self.model_manager.trainers[key].save()
-             else:
-                 if model_name:
-                     print(f"[save_model] Warning: Unknown model key {key}")
-
-     def load_model(self, model_name=None):
-         keys = [model_name] if model_name else self.model_manager.trainers.keys()
-         for key in keys:
-             if key in self.model_manager.trainers:
-                 self.model_manager.trainers[key].load()
-                 # Restore to ctx for backward compat
-                 trainer = self.model_manager.trainers[key]
-                 if trainer.model is not None:
-                     setattr(self, f"{key}_best", trainer.model)
-                     if key in ['xgb', 'resn', 'ft', 'gnn']:
-                         setattr(self, f"{key}_load", trainer.model)
-             else:
-                 if model_name:
-                     print(f"[load_model] Warning: Unknown model key {key}")
-
-     def _sample_rows(self, data: pd.DataFrame, n: int) -> pd.DataFrame:
-         if len(data) == 0:
-             return data
-         return data.sample(min(len(data), n), random_state=self.rand_seed)
-
-     @staticmethod
-     def _shap_nsamples(arr: np.ndarray, max_nsamples: int = 300) -> int:
-         min_needed = arr.shape[1] + 2
-         return max(min_needed, min(max_nsamples, arr.shape[0] * arr.shape[1]))
-
-     def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
-         matrices = []
-         for col in self.factor_nmes:
-             s = data[col]
-             if col in self.cate_list:
-                 cats = pd.Categorical(
-                     s,
-                     categories=self.cat_categories_for_shap[col]
-                 )
-                 codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
-                 matrices.append(codes)
-             else:
-                 vals = pd.to_numeric(s, errors="coerce")
-                 arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
-                 matrices.append(arr)
-         X_mat = np.concatenate(matrices, axis=1) # Result shape (N, F)
-         return X_mat
-
-     def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
-         data_dict = {}
-         for j, col in enumerate(self.factor_nmes):
-             col_vals = X_mat[:, j]
-             if col in self.cate_list:
-                 cats = self.cat_categories_for_shap[col]
-                 codes = np.round(col_vals).astype(int)
-                 codes = np.clip(codes, -1, len(cats) - 1)
-                 cat_series = pd.Categorical.from_codes(
-                     codes,
-                     categories=cats
-                 )
-                 data_dict[col] = cat_series
-             else:
-                 data_dict[col] = col_vals.astype(float)
-
-         df = pd.DataFrame(data_dict, columns=self.factor_nmes)
-         for col in self.cate_list:
-             if col in df.columns:
-                 df[col] = df[col].astype("category")
-         return df
-
-     def _build_glm_design(self, data: pd.DataFrame) -> pd.DataFrame:
-         X = data[self.var_nmes]
-         return sm.add_constant(X, has_constant='add')
-
-     def _compute_shap_core(self,
-                            model_key: str,
-                            n_background: int,
-                            n_samples: int,
-                            on_train: bool,
-                            X_df: pd.DataFrame,
-                            prep_fn,
-                            predict_fn,
-                            cleanup_fn=None):
-         if explain_shap is None:
-             raise RuntimeError("explain.shap_utils is not available.")
-         return explain_shap.compute_shap_core(
-             self,
-             model_key,
-             n_background,
-             n_samples,
-             on_train,
-             X_df=X_df,
-             prep_fn=prep_fn,
-             predict_fn=predict_fn,
-             cleanup_fn=cleanup_fn,
-         )
-
-     def compute_shap_glm(self, n_background: int = 500,
-                          n_samples: int = 200,
-                          on_train: bool = True):
-         if explain_shap is None:
-             raise RuntimeError("explain.shap_utils is not available.")
-         self.shap_glm = explain_shap.compute_shap_glm(
-             self,
-             n_background=n_background,
-             n_samples=n_samples,
-             on_train=on_train,
-         )
-         return self.shap_glm
-
-     def compute_shap_xgb(self, n_background: int = 500,
-                          n_samples: int = 200,
-                          on_train: bool = True):
-         if explain_shap is None:
-             raise RuntimeError("explain.shap_utils is not available.")
-         self.shap_xgb = explain_shap.compute_shap_xgb(
-             self,
-             n_background=n_background,
-             n_samples=n_samples,
-             on_train=on_train,
-         )
-         return self.shap_xgb
-
-     def _resn_predict_wrapper(self, X_np):
-         model = self.resn_best.resnet.to("cpu")
-         with torch.no_grad():
-             X_tensor = torch.tensor(X_np, dtype=torch.float32)
-             y_pred = model(X_tensor).cpu().numpy()
-         y_pred = np.clip(y_pred, 1e-6, None)
-         return y_pred.reshape(-1)
-
-     def compute_shap_resn(self, n_background: int = 500,
-                           n_samples: int = 200,
-                           on_train: bool = True):
-         if explain_shap is None:
-             raise RuntimeError("explain.shap_utils is not available.")
-         self.shap_resn = explain_shap.compute_shap_resn(
-             self,
-             n_background=n_background,
-             n_samples=n_samples,
-             on_train=on_train,
-         )
-         return self.shap_resn
-
-     def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
-         df_input = self._decode_ft_shap_matrix_to_df(X_mat)
-         y_pred = self.ft_best.predict(df_input)
-         return np.asarray(y_pred, dtype=np.float64).reshape(-1)
-
-     def compute_shap_ft(self, n_background: int = 500,
-                         n_samples: int = 200,
-                         on_train: bool = True):
-         if explain_shap is None:
-             raise RuntimeError("explain.shap_utils is not available.")
-         self.shap_ft = explain_shap.compute_shap_ft(
-             self,
-             n_background=n_background,
-             n_samples=n_samples,
-             on_train=on_train,
-         )
-         return self.shap_ft
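
For orientation, the class deleted above was driven end to end: construct, tune one or more trainers, then explain and persist. Below is a minimal usage sketch reconstructed from the method signatures in the removed file; the dataset and column names are invented for illustration, and it targets the old ins_pricing_gemini layout rather than the 0.2.0 API:

    import numpy as np
    import pandas as pd

    # Toy frequency-style data; columns are illustrative only.
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "age": rng.integers(18, 80, size=1000),
        "region": rng.choice(["N", "S", "E", "W"], size=1000),
        "claim_ct": rng.poisson(0.1, size=1000),
        "exposure": np.ones(1000),
    })
    train, test = df.iloc[:800], df.iloc[800:]

    model = BayesOptModel(
        train, test,
        model_nme="freq_demo",   # 'f' in the name maps to count:poisson above
        resp_nme="claim_ct",
        weight_nme="exposure",
        cate_list=["region"],
        use_gpu=False,
    )
    model.bayesopt_xgb(max_evals=20)   # delegates to ModelManager.optimize
    imp = model.compute_permutation_importance("xgb", on_train=False)
    model.save_model("xgb")

In 0.2.0 this orchestration responsibility is split across the new trainers/ and models/ modules plus the explain and plotting mixins listed in the file table above.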