ins-pricing 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +9 -6
- ins_pricing/__init__.py +3 -11
- ins_pricing/cli/BayesOpt_entry.py +24 -0
- ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
- ins_pricing/cli/Explain_Run.py +25 -0
- ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
- ins_pricing/cli/Pricing_Run.py +25 -0
- ins_pricing/cli/__init__.py +1 -0
- ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
- ins_pricing/cli/utils/__init__.py +1 -0
- ins_pricing/cli/utils/cli_common.py +320 -0
- ins_pricing/cli/utils/cli_config.py +375 -0
- ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
- {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
- ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
- ins_pricing/docs/modelling/README.md +34 -0
- ins_pricing/modelling/__init__.py +57 -6
- ins_pricing/modelling/core/__init__.py +1 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
- ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
- ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
- ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
- ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
- ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
- ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
- ins_pricing/modelling/core/evaluation.py +115 -0
- ins_pricing/production/__init__.py +4 -0
- ins_pricing/production/preprocess.py +71 -0
- ins_pricing/setup.py +10 -5
- {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
- ins_pricing-0.2.0.dist-info/RECORD +125 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
- ins_pricing/modelling/BayesOpt_entry.py +0 -633
- ins_pricing/modelling/Explain_Run.py +0 -36
- ins_pricing/modelling/Pricing_Run.py +0 -36
- ins_pricing/modelling/README.md +0 -33
- ins_pricing/modelling/bayesopt/models.py +0 -2196
- ins_pricing/modelling/bayesopt/trainers.py +0 -2446
- ins_pricing/modelling/cli_common.py +0 -136
- ins_pricing/modelling/tests/test_plotting.py +0 -63
- ins_pricing/modelling/watchdog_run.py +0 -211
- ins_pricing-0.1.11.dist-info/RECORD +0 -169
- ins_pricing_gemini/__init__.py +0 -23
- ins_pricing_gemini/governance/__init__.py +0 -20
- ins_pricing_gemini/governance/approval.py +0 -93
- ins_pricing_gemini/governance/audit.py +0 -37
- ins_pricing_gemini/governance/registry.py +0 -99
- ins_pricing_gemini/governance/release.py +0 -159
- ins_pricing_gemini/modelling/Explain_Run.py +0 -36
- ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
- ins_pricing_gemini/modelling/__init__.py +0 -151
- ins_pricing_gemini/modelling/cli_common.py +0 -141
- ins_pricing_gemini/modelling/config.py +0 -249
- ins_pricing_gemini/modelling/config_preprocess.py +0 -254
- ins_pricing_gemini/modelling/core.py +0 -741
- ins_pricing_gemini/modelling/data_container.py +0 -42
- ins_pricing_gemini/modelling/explain/__init__.py +0 -55
- ins_pricing_gemini/modelling/explain/gradients.py +0 -334
- ins_pricing_gemini/modelling/explain/metrics.py +0 -176
- ins_pricing_gemini/modelling/explain/permutation.py +0 -155
- ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
- ins_pricing_gemini/modelling/features.py +0 -215
- ins_pricing_gemini/modelling/model_manager.py +0 -148
- ins_pricing_gemini/modelling/model_plotting.py +0 -463
- ins_pricing_gemini/modelling/models.py +0 -2203
- ins_pricing_gemini/modelling/notebook_utils.py +0 -294
- ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
- ins_pricing_gemini/modelling/plotting/common.py +0 -63
- ins_pricing_gemini/modelling/plotting/curves.py +0 -572
- ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
- ins_pricing_gemini/modelling/plotting/geo.py +0 -362
- ins_pricing_gemini/modelling/plotting/importance.py +0 -121
- ins_pricing_gemini/modelling/run_logging.py +0 -133
- ins_pricing_gemini/modelling/tests/conftest.py +0 -8
- ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
- ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
- ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
- ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
- ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
- ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
- ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
- ins_pricing_gemini/modelling/trainers.py +0 -2447
- ins_pricing_gemini/modelling/utils.py +0 -1020
- ins_pricing_gemini/pricing/__init__.py +0 -27
- ins_pricing_gemini/pricing/calibration.py +0 -39
- ins_pricing_gemini/pricing/data_quality.py +0 -117
- ins_pricing_gemini/pricing/exposure.py +0 -85
- ins_pricing_gemini/pricing/factors.py +0 -91
- ins_pricing_gemini/pricing/monitoring.py +0 -99
- ins_pricing_gemini/pricing/rate_table.py +0 -78
- ins_pricing_gemini/production/__init__.py +0 -21
- ins_pricing_gemini/production/drift.py +0 -30
- ins_pricing_gemini/production/monitoring.py +0 -143
- ins_pricing_gemini/production/scoring.py +0 -40
- ins_pricing_gemini/reporting/__init__.py +0 -11
- ins_pricing_gemini/reporting/report_builder.py +0 -72
- ins_pricing_gemini/reporting/scheduler.py +0 -45
- ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
- ins_pricing_gemini/scripts/Explain_entry.py +0 -545
- ins_pricing_gemini/scripts/__init__.py +0 -1
- ins_pricing_gemini/scripts/train.py +0 -568
- ins_pricing_gemini/setup.py +0 -55
- ins_pricing_gemini/smoke_test.py +0 -28
- /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
- /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
- /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
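Most of the churn above is a re-layout rather than a rewrite: the `{modelling → cli}` and `{bayesopt → core/bayesopt}` arrows mark moved files, and since Python import paths mirror file paths, code pinned to the 0.1.11 module paths will break on 0.2.0. A minimal compatibility sketch, grounded only in the file moves listed above (the alias name `bayesopt_core` is ours; 0.2.0 may also re-export these names at a stable location, which its `__init__.py` files would confirm):

# Import-path fallback for the 0.1.11 -> 0.2.0 re-layout shown above.
# The alias "bayesopt_core" is illustrative, not part of the package API.
try:
    # 0.2.0 layout: ins_pricing/modelling/core/bayesopt/core.py
    from ins_pricing.modelling.core.bayesopt import core as bayesopt_core
except ImportError:
    # 0.1.11 layout: ins_pricing/modelling/bayesopt/core.py
    from ins_pricing.modelling.bayesopt import core as bayesopt_core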
ins_pricing_gemini/modelling/core.py (deleted)
@@ -1,741 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import asdict
-from datetime import datetime
-import os
-from typing import Any, Dict, List, Optional
-
-try:  # matplotlib is optional; avoid hard import failures in headless/minimal envs
-    import matplotlib
-    if os.name != "nt" and not os.environ.get("DISPLAY") and not os.environ.get("MPLBACKEND"):
-        matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
-    _MPL_IMPORT_ERROR: Optional[BaseException] = None
-except Exception as exc:  # pragma: no cover - optional dependency
-    plt = None  # type: ignore[assignment]
-    _MPL_IMPORT_ERROR = exc
-import numpy as np
-import pandas as pd
-import torch
-import statsmodels.api as sm
-from sklearn.model_selection import ShuffleSplit
-from sklearn.preprocessing import StandardScaler
-
-from .config import BayesOptConfig
-from .config_preprocess import DatasetPreprocessor, OutputManager, VersionManager
-from .data_container import DataContainer
-from .model_manager import ModelManager
-from .models import GraphNeuralNetSklearn
-from .trainers import FTTrainer, GLMTrainer, GNNTrainer, ResNetTrainer, XGBTrainer
-from .utils import EPS, PlotUtils, infer_factor_and_cate_list, set_global_seed
-
-# Feature and Plotting modules
-from .features import _add_region_effect, _prepare_geo_tokens, _build_geo_tokens
-from .model_plotting import (
-    plot_oneway,
-    plot_lift,
-    plot_dlift,
-    plot_conversion_lift,
-    _plot_skip
-)
-
-try:
-    from .plotting import curves as plot_curves
-    from .plotting import diagnostics as plot_diagnostics
-    from .plotting.common import PlotStyle, finalize_figure
-    from .explain import gradients as explain_gradients
-    from .explain import permutation as explain_permutation
-    from .explain import shap_utils as explain_shap
-except Exception:  # pragma: no cover - optional for legacy imports
-    try:  # best-effort for non-package imports
-        from ins_pricing.modelling.plotting import curves as plot_curves
-        from ins_pricing.modelling.plotting import diagnostics as plot_diagnostics
-        from ins_pricing.modelling.plotting.common import PlotStyle, finalize_figure
-        from ins_pricing.modelling.explain import gradients as explain_gradients
-        from ins_pricing.modelling.explain import permutation as explain_permutation
-        from ins_pricing.modelling.explain import shap_utils as explain_shap
-    except Exception:  # pragma: no cover
-        plot_curves = None
-        plot_diagnostics = None
-        PlotStyle = None
-        finalize_figure = None
-        explain_gradients = None
-        explain_permutation = None
-        explain_shap = None
-
-
-# BayesOpt orchestration and SHAP utilities
-# =============================================================================
-class BayesOptModel:
-
-    # Property proxies to maintain backward compatibility with Trainers
-    @property
-    def train_data(self): return self.data_container.train_data
-    @property
-    def test_data(self): return self.data_container.test_data
-    @property
-    def train_oht_data(self): return self.data_container.train_oht_data
-    @property
-    def test_oht_data(self): return self.data_container.test_oht_data
-    @property
-    def train_oht_scl_data(self): return self.data_container.train_oht_scl_data
-    @property
-    def test_oht_scl_data(self): return self.data_container.test_oht_scl_data
-    @property
-    def var_nmes(self): return self.data_container.var_nmes
-    @property
-    def num_features(self): return self.data_container.num_features
-    @property
-    def cat_categories_for_shap(self): return self.data_container.cat_categories_for_shap
-    @property
-    def train_geo_tokens(self): return self.data_container.train_geo_tokens
-    @train_geo_tokens.setter
-    def train_geo_tokens(self, val): self.data_container.train_geo_tokens = val
-    @property
-    def test_geo_tokens(self): return self.data_container.test_geo_tokens
-    @test_geo_tokens.setter
-    def test_geo_tokens(self, val): self.data_container.test_geo_tokens = val
-    @property
-    def geo_token_cols(self): return self.data_container.geo_token_cols
-    @geo_token_cols.setter
-    def geo_token_cols(self, val): self.data_container.geo_token_cols = val
-
-    def __init__(self, train_data, test_data,
-                 model_nme, resp_nme, weight_nme, factor_nmes: Optional[List[str]] = None, task_type='regression',
-                 binary_resp_nme=None,
-                 cate_list=None, prop_test=0.25, rand_seed=None,
-                 epochs=100, use_gpu=True,
-                 use_resn_data_parallel: bool = False, use_ft_data_parallel: bool = False,
-                 use_gnn_data_parallel: bool = False,
-                 use_resn_ddp: bool = False, use_ft_ddp: bool = False,
-                 use_gnn_ddp: bool = False,
-                 output_dir: Optional[str] = None,
-                 gnn_use_approx_knn: bool = True,
-                 gnn_approx_knn_threshold: int = 50000,
-                 gnn_graph_cache: Optional[str] = None,
-                 gnn_max_gpu_knn_nodes: Optional[int] = 200000,
-                 gnn_knn_gpu_mem_ratio: float = 0.9,
-                 gnn_knn_gpu_mem_overhead: float = 2.0,
-                 ft_role: str = "model",
-                 ft_feature_prefix: str = "ft_emb",
-                 ft_num_numeric_tokens: Optional[int] = None,
-                 infer_categorical_max_unique: int = 50,
-                 infer_categorical_max_ratio: float = 0.05,
-                 reuse_best_params: bool = False,
-                 xgb_max_depth_max: int = 25,
-                 xgb_n_estimators_max: int = 500,
-                 resn_weight_decay: Optional[float] = None,
-                 final_ensemble: bool = False,
-                 final_ensemble_k: int = 3,
-                 final_refit: bool = True,
-                 optuna_storage: Optional[str] = None,
-                 optuna_study_prefix: Optional[str] = None,
-                 best_params_files: Optional[Dict[str, str]] = None):
-        """Orchestrate BayesOpt training across multiple trainers."""
-        inferred_factors, inferred_cats = infer_factor_and_cate_list(
-            train_df=train_data,
-            test_df=test_data,
-            resp_nme=resp_nme,
-            weight_nme=weight_nme,
-            binary_resp_nme=binary_resp_nme,
-            factor_nmes=factor_nmes,
-            cate_list=cate_list,
-            infer_categorical_max_unique=int(infer_categorical_max_unique),
-            infer_categorical_max_ratio=float(infer_categorical_max_ratio),
-        )
-
-        config_args = {
-            "model_nme": model_nme,
-            "task_type": task_type,
-            "resp_nme": resp_nme,
-            "weight_nme": weight_nme,
-            "factor_nmes": list(inferred_factors),
-            "binary_resp_nme": binary_resp_nme,
-            "cate_list": list(inferred_cats) if inferred_cats else None,
-            "prop_test": prop_test,
-            "rand_seed": rand_seed,
-            "epochs": epochs,
-            "use_gpu": use_gpu,
-            "xgb_max_depth_max": int(xgb_max_depth_max),
-            "xgb_n_estimators_max": int(xgb_n_estimators_max),
-            "use_resn_data_parallel": use_resn_data_parallel,
-            "use_ft_data_parallel": use_ft_data_parallel,
-            "use_resn_ddp": use_resn_ddp,
-            "use_gnn_data_parallel": use_gnn_data_parallel,
-            "use_ft_ddp": use_ft_ddp,
-            "use_gnn_ddp": use_gnn_ddp,
-            "gnn_use_approx_knn": gnn_use_approx_knn,
-            "gnn_approx_knn_threshold": gnn_approx_knn_threshold,
-            "gnn_graph_cache": gnn_graph_cache,
-            "gnn_max_gpu_knn_nodes": gnn_max_gpu_knn_nodes,
-            "gnn_knn_gpu_mem_ratio": gnn_knn_gpu_mem_ratio,
-            "gnn_knn_gpu_mem_overhead": gnn_knn_gpu_mem_overhead,
-            "output_dir": output_dir,
-            "optuna_storage": optuna_storage,
-            "optuna_study_prefix": optuna_study_prefix,
-            "best_params_files": best_params_files,
-            "ft_role": str(ft_role or "model"),
-            "ft_feature_prefix": str(ft_feature_prefix or "ft_emb"),
-            "ft_num_numeric_tokens": ft_num_numeric_tokens,
-            "reuse_best_params": bool(reuse_best_params),
-            "resn_weight_decay": float(resn_weight_decay) if resn_weight_decay is not None else 1e-4,
-            "final_ensemble": bool(final_ensemble),
-            "final_ensemble_k": int(final_ensemble_k),
-            "final_refit": bool(final_refit),
-        }
-        cfg = BayesOptConfig.from_legacy_dict(config_args)
-        self.config = cfg
-        self.model_nme = cfg.model_nme
-        self.task_type = cfg.task_type
-        self.resp_nme = cfg.resp_nme
-        self.weight_nme = cfg.weight_nme
-        self.factor_nmes = cfg.factor_nmes
-        self.binary_resp_nme = cfg.binary_resp_nme
-        self.cate_list = list(cfg.cate_list or [])
-        self.prop_test = cfg.prop_test
-        self.epochs = cfg.epochs
-        self.rand_seed = cfg.rand_seed if cfg.rand_seed is not None else np.random.randint(
-            1, 10000)
-        set_global_seed(int(self.rand_seed))
-        self.use_gpu = bool(cfg.use_gpu and torch.cuda.is_available())
-        self.output_manager = OutputManager(
-            cfg.output_dir or os.getcwd(), self.model_nme)
-
-        preprocessor = DatasetPreprocessor(train_data, test_data, cfg).run()
-
-        self.data_container = DataContainer(
-            train_data=preprocessor.train_data,
-            test_data=preprocessor.test_data
-        )
-        self.data_container.set_preprocessed_data(preprocessor)
-
-        self.geo_gnn_model: Optional[GraphNeuralNetSklearn] = None
-
-        # Use extracted feature engineering logic
-        _add_region_effect(self)
-
-        self.cv = ShuffleSplit(n_splits=int(1/self.prop_test),
-                               test_size=self.prop_test,
-                               random_state=self.rand_seed)
-        if self.task_type == 'classification':
-            self.obj = 'binary:logistic'
-        else:
-            if 'f' in self.model_nme:
-                self.obj = 'count:poisson'
-            elif 's' in self.model_nme:
-                self.obj = 'reg:gamma'
-            elif 'bc' in self.model_nme:
-                self.obj = 'reg:tweedie'
-            else:
-                self.obj = 'reg:tweedie'
-        self.fit_params = {
-            'sample_weight': self.train_data[self.weight_nme].values
-        }
-        self.model_label: List[str] = []
-        self.optuna_storage = cfg.optuna_storage
-        self.optuna_study_prefix = cfg.optuna_study_prefix or "bayesopt"
-
-        self.version_manager = VersionManager(self.output_manager)
-
-        self.model_manager = ModelManager(self)
-        self._prepare_geo_tokens()
-        self.xgb_best = None
-        self.resn_best = None
-        self.gnn_best = None
-        self.glm_best = None
-        self.ft_best = None
-        self.best_xgb_params = None
-        self.best_resn_params = None
-        self.best_gnn_params = None
-        self.best_ft_params = None
-        self.best_xgb_trial = None
-        self.best_resn_trial = None
-        self.best_gnn_trial = None
-        self.best_ft_trial = None
-        self.best_glm_params = None
-        self.best_glm_trial = None
-        self.xgb_load = None
-        self.resn_load = None
-        self.gnn_load = None
-        self.ft_load = None
-        self.version_manager = VersionManager(self.output_manager)
-
-    def default_tweedie_power(self, obj: Optional[str] = None) -> Optional[float]:
-        if self.task_type == 'classification':
-            return None
-        objective = obj or getattr(self, "obj", None)
-        if objective == 'count:poisson':
-            return 1.0
-        if objective == 'reg:gamma':
-            return 2.0
-        return 1.5
-
-    def _build_geo_tokens(self, params_override: Optional[Dict[str, Any]] = None):
-        return _build_geo_tokens(self, params_override)
-
-    def _prepare_geo_tokens(self) -> None:
-        return _prepare_geo_tokens(self)
-
-    # Note: _add_region_effect was called in __init__ directly via the imported function.
-    # We remove the method definition here or keep it as a wrapper if called elsewhere.
-    # It seems it's only called in __init__, so we can remove strict method definition
-    # unless subclasses use it. To be safe, let's keep it wrapper.
-    def _add_region_effect(self) -> None:
-        _add_region_effect(self)
-
-    # Plotting wrappers
-    def plot_oneway(self, n_bins=10):
-        plot_oneway(self, n_bins)
-
-    def _require_trainer(self, model_key: str) -> "TrainerBase":
-        return self.model_manager.get_trainer(model_key)
-
-    def _pred_vector_columns(self, pred_prefix: str) -> List[str]:
-        col_prefix = f"pred_{pred_prefix}_"
-        cols = [c for c in self.train_data.columns if c.startswith(col_prefix)]
-        def sort_key(name: str):
-            tail = name.rsplit("_", 1)[-1]
-            try:
-                return (0, int(tail))
-            except Exception:
-                return (1, tail)
-        cols.sort(key=sort_key)
-        return cols
-
-    def _inject_pred_features(self, pred_prefix: str) -> List[str]:
-        cols = self._pred_vector_columns(pred_prefix)
-        if cols:
-            self.add_numeric_features_from_columns(cols)
-            return cols
-        scalar_col = f"pred_{pred_prefix}"
-        if scalar_col in self.train_data.columns:
-            self.add_numeric_feature_from_column(scalar_col)
-            return [scalar_col]
-        return []
-
-    def _maybe_load_best_params(self, model_key: str, trainer: "TrainerBase") -> None:
-        pass
-
-    def optimize_model(self, model_key: str, max_evals: int = 100):
-        self.model_manager.optimize(model_key, max_evals)
-
-    def add_numeric_feature_from_column(self, col_name: str) -> None:
-        if col_name not in self.train_data.columns or col_name not in self.test_data.columns:
-            raise KeyError(
-                f"Column '{col_name}' must exist in both train_data and test_data.")
-
-        if col_name not in self.factor_nmes:
-            self.factor_nmes.append(col_name)
-        if col_name not in self.config.factor_nmes:
-            self.config.factor_nmes.append(col_name)
-
-        if col_name not in self.cate_list and col_name not in self.num_features:
-            self.num_features.append(col_name)
-
-        if self.train_oht_data is not None and self.test_oht_data is not None:
-            self.train_oht_data[col_name] = self.train_data[col_name].values
-            self.test_oht_data[col_name] = self.test_data[col_name].values
-        if self.train_oht_scl_data is not None and self.test_oht_scl_data is not None:
-            scaler = StandardScaler()
-            tr = self.train_data[col_name].to_numpy(
-                dtype=np.float32, copy=False).reshape(-1, 1)
-            te = self.test_data[col_name].to_numpy(
-                dtype=np.float32, copy=False).reshape(-1, 1)
-            self.train_oht_scl_data[col_name] = scaler.fit_transform(
-                tr).astype(np.float32).reshape(-1)
-            self.test_oht_scl_data[col_name] = scaler.transform(te).astype(np.float32).reshape(-1)
-
-        if col_name not in self.var_nmes:
-            self.var_nmes.append(col_name)
-
-    def add_numeric_features_from_columns(self, col_names: List[str]) -> None:
-        if not col_names:
-            return
-        missing = [
-            col for col in col_names
-            if col not in self.train_data.columns or col not in self.test_data.columns
-        ]
-        if missing:
-            raise KeyError(
-                f"Column(s) {missing} must exist in both train_data and test_data."
-            )
-
-        for col_name in col_names:
-            if col_name not in self.factor_nmes:
-                self.factor_nmes.append(col_name)
-            if col_name not in self.config.factor_nmes:
-                self.config.factor_nmes.append(col_name)
-            if col_name not in self.cate_list and col_name not in self.num_features:
-                self.num_features.append(col_name)
-            if col_name not in self.var_nmes:
-                self.var_nmes.append(col_name)
-
-        if self.train_oht_data is not None and self.test_oht_data is not None:
-            self.train_oht_data.loc[:, col_names] = self.train_data[col_names].to_numpy(copy=False)
-            self.test_oht_data.loc[:, col_names] = self.test_data[col_names].to_numpy(copy=False)
-
-        if self.train_oht_scl_data is not None and self.test_oht_scl_data is not None:
-            scaler = StandardScaler()
-            tr = self.train_data[col_names].to_numpy(dtype=np.float32, copy=False)
-            te = self.test_data[col_names].to_numpy(dtype=np.float32, copy=False)
-            self.train_oht_scl_data.loc[:, col_names] = scaler.fit_transform(tr).astype(np.float32)
-            self.test_oht_scl_data.loc[:, col_names] = scaler.transform(te).astype(np.float32)
-
-    def prepare_ft_as_feature(self, max_evals: int = 50, pred_prefix: str = "ft_feat") -> str:
-        ft_trainer = self._require_trainer("ft")
-        ft_trainer.tune(max_evals=max_evals)
-        if hasattr(ft_trainer, "train_as_feature"):
-            ft_trainer.train_as_feature(pred_prefix=pred_prefix)
-        else:
-            ft_trainer.train()
-        feature_col = f"pred_{pred_prefix}"
-        self.add_numeric_feature_from_column(feature_col)
-        return feature_col
-
-    def prepare_ft_embedding_as_features(self, max_evals: int = 50, pred_prefix: str = "ft_emb") -> List[str]:
-        ft_trainer = self._require_trainer("ft")
-        ft_trainer.tune(max_evals=max_evals)
-        if hasattr(ft_trainer, "train_as_feature"):
-            ft_trainer.train_as_feature(
-                pred_prefix=pred_prefix, feature_mode="embedding")
-        else:
-            raise RuntimeError(
-                "FT trainer does not support embedding feature mode.")
-        cols = self._pred_vector_columns(pred_prefix)
-        if not cols:
-            raise RuntimeError(
-                f"No embedding columns were generated for prefix '{pred_prefix}'.")
-        self.add_numeric_features_from_columns(cols)
-        return cols
-
-    def prepare_ft_unsupervised_embedding_as_features(self,
-                                                      pred_prefix: str = "ft_uemb",
-                                                      params: Optional[Dict[str,
-                                                                            Any]] = None,
-                                                      mask_prob_num: float = 0.15,
-                                                      mask_prob_cat: float = 0.15,
-                                                      num_loss_weight: float = 1.0,
-                                                      cat_loss_weight: float = 1.0) -> List[str]:
-        ft_trainer = self._require_trainer("ft")
-        if not hasattr(ft_trainer, "pretrain_unsupervised_as_feature"):
-            raise RuntimeError(
-                "FT trainer does not support unsupervised pretraining.")
-        ft_trainer.pretrain_unsupervised_as_feature(
-            pred_prefix=pred_prefix,
-            params=params,
-            mask_prob_num=mask_prob_num,
-            mask_prob_cat=mask_prob_cat,
-            num_loss_weight=num_loss_weight,
-            cat_loss_weight=cat_loss_weight
-        )
-        cols = self._pred_vector_columns(pred_prefix)
-        if not cols:
-            raise RuntimeError(
-                f"No embedding columns were generated for prefix '{pred_prefix}'.")
-        self.add_numeric_features_from_columns(cols)
-        return cols
-
-    def bayesopt_glm(self, max_evals=50):
-        self.optimize_model('glm', max_evals)
-
-    def bayesopt_xgb(self, max_evals=100):
-        self.optimize_model('xgb', max_evals)
-
-    def bayesopt_resnet(self, max_evals=100):
-        self.optimize_model('resn', max_evals)
-
-    def bayesopt_gnn(self, max_evals=50):
-        self.optimize_model('gnn', max_evals)
-
-    def bayesopt_ft(self, max_evals=50):
-        self.optimize_model('ft', max_evals)
-
-    def plot_lift(self, model_label, pred_nme, n_bins=10):
-        plot_lift(self, model_label, pred_nme, n_bins)
-
-    def plot_dlift(self, model_comp: List[str] = ['xgb', 'resn'], n_bins: int = 10) -> None:
-        plot_dlift(self, model_comp, n_bins)
-
-    def plot_conversion_lift(self, model_pred_col: str, n_bins: int = 20):
-        plot_conversion_lift(self, model_pred_col, n_bins)
-
-    # ========= Lightweight explainability: Permutation Importance =========
-    def compute_permutation_importance(self,
-                                       model_key: str,
-                                       on_train: bool = True,
-                                       metric: Any = "auto",
-                                       n_repeats: int = 5,
-                                       max_rows: int = 5000,
-                                       random_state: Optional[int] = None):
-        if explain_permutation is None:
-            raise RuntimeError("explain.permutation is not available.")
-
-        model_key = str(model_key)
-        data = self.train_data if on_train else self.test_data
-        if self.resp_nme not in data.columns:
-            raise RuntimeError("Missing response column for permutation importance.")
-        y = data[self.resp_nme]
-        w = data[self.weight_nme] if self.weight_nme in data.columns else None
-
-        if model_key == "resn":
-            if self.resn_best is None:
-                raise RuntimeError("ResNet model not trained.")
-            X = self.train_oht_scl_data if on_train else self.test_oht_scl_data
-            if X is None:
-                raise RuntimeError("Missing standardized features for ResNet.")
-            X = X[self.var_nmes]
-            predict_fn = lambda df: self.resn_best.predict(df)
-        elif model_key == "ft":
-            if self.ft_best is None:
-                raise RuntimeError("FT model not trained.")
-            if str(self.config.ft_role) != "model":
-                raise RuntimeError("FT role is not 'model'; FT predictions unavailable.")
-            X = data[self.factor_nmes]
-            geo_tokens = self.train_geo_tokens if on_train else self.test_geo_tokens
-            geo_np = None
-            if geo_tokens is not None:
-                geo_np = geo_tokens.to_numpy(dtype=np.float32, copy=False)
-            predict_fn = lambda df, geo=geo_np: self.ft_best.predict(df, geo_tokens=geo)
-        elif model_key == "xgb":
-            if self.xgb_best is None:
-                raise RuntimeError("XGB model not trained.")
-            X = data[self.factor_nmes]
-            predict_fn = lambda df: self.xgb_best.predict(df)
-        else:
-            raise ValueError("Unsupported model_key for permutation importance.")
-
-        return explain_permutation.permutation_importance(
-            predict_fn,
-            X,
-            y,
-            sample_weight=w,
-            metric=metric,
-            task_type=self.task_type,
-            n_repeats=n_repeats,
-            random_state=random_state,
-            max_rows=max_rows,
-        )
-
-    # ========= Deep explainability: Integrated Gradients =========
-    def compute_integrated_gradients_resn(self,
-                                          on_train: bool = True,
-                                          baseline: Any = None,
-                                          steps: int = 50,
-                                          batch_size: int = 256,
-                                          target: Optional[int] = None):
-        if explain_gradients is None:
-            raise RuntimeError("explain.gradients is not available.")
-        if self.resn_best is None:
-            raise RuntimeError("ResNet model not trained.")
-        X = self.train_oht_scl_data if on_train else self.test_oht_scl_data
-        if X is None:
-            raise RuntimeError("Missing standardized features for ResNet.")
-        X = X[self.var_nmes]
-        return explain_gradients.resnet_integrated_gradients(
-            self.resn_best,
-            X,
-            baseline=baseline,
-            steps=steps,
-            batch_size=batch_size,
-            target=target,
-        )
-
-    def compute_integrated_gradients_ft(self,
-                                        on_train: bool = True,
-                                        geo_tokens: Optional[np.ndarray] = None,
-                                        baseline_num: Any = None,
-                                        baseline_geo: Any = None,
-                                        steps: int = 50,
-                                        batch_size: int = 256,
-                                        target: Optional[int] = None):
-        if explain_gradients is None:
-            raise RuntimeError("explain.gradients is not available.")
-        if self.ft_best is None:
-            raise RuntimeError("FT model not trained.")
-        if str(self.config.ft_role) != "model":
-            raise RuntimeError("FT role is not 'model'; FT explanations unavailable.")
-
-        data = self.train_data if on_train else self.test_data
-        X = data[self.factor_nmes]
-
-        if geo_tokens is None and getattr(self.ft_best, "num_geo", 0) > 0:
-            tokens_df = self.train_geo_tokens if on_train else self.test_geo_tokens
-            if tokens_df is not None:
-                geo_tokens = tokens_df.to_numpy(dtype=np.float32, copy=False)
-
-        return explain_gradients.ft_integrated_gradients(
-            self.ft_best,
-            X,
-            geo_tokens=geo_tokens,
-            baseline_num=baseline_num,
-            baseline_geo=baseline_geo,
-            steps=steps,
-            batch_size=batch_size,
-            target=target,
-        )
-
-    def save_model(self, model_name=None):
-        keys = [model_name] if model_name else self.model_manager.trainers.keys()
-        for key in keys:
-            if key in self.model_manager.trainers:
-                self.model_manager.trainers[key].save()
-            else:
-                if model_name:
-                    print(f"[save_model] Warning: Unknown model key {key}")
-
-    def load_model(self, model_name=None):
-        keys = [model_name] if model_name else self.model_manager.trainers.keys()
-        for key in keys:
-            if key in self.model_manager.trainers:
-                self.model_manager.trainers[key].load()
-                # Restore to ctx for backward compat
-                trainer = self.model_manager.trainers[key]
-                if trainer.model is not None:
-                    setattr(self, f"{key}_best", trainer.model)
-                    if key in ['xgb', 'resn', 'ft', 'gnn']:
-                        setattr(self, f"{key}_load", trainer.model)
-            else:
-                if model_name:
-                    print(f"[load_model] Warning: Unknown model key {key}")
-
-    def _sample_rows(self, data: pd.DataFrame, n: int) -> pd.DataFrame:
-        if len(data) == 0:
-            return data
-        return data.sample(min(len(data), n), random_state=self.rand_seed)
-
-    @staticmethod
-    def _shap_nsamples(arr: np.ndarray, max_nsamples: int = 300) -> int:
-        min_needed = arr.shape[1] + 2
-        return max(min_needed, min(max_nsamples, arr.shape[0] * arr.shape[1]))
-
-    def _build_ft_shap_matrix(self, data: pd.DataFrame) -> np.ndarray:
-        matrices = []
-        for col in self.factor_nmes:
-            s = data[col]
-            if col in self.cate_list:
-                cats = pd.Categorical(
-                    s,
-                    categories=self.cat_categories_for_shap[col]
-                )
-                codes = np.asarray(cats.codes, dtype=np.float64).reshape(-1, 1)
-                matrices.append(codes)
-            else:
-                vals = pd.to_numeric(s, errors="coerce")
-                arr = vals.to_numpy(dtype=np.float64, copy=True).reshape(-1, 1)
-                matrices.append(arr)
-        X_mat = np.concatenate(matrices, axis=1)  # Result shape (N, F)
-        return X_mat
-
-    def _decode_ft_shap_matrix_to_df(self, X_mat: np.ndarray) -> pd.DataFrame:
-        data_dict = {}
-        for j, col in enumerate(self.factor_nmes):
-            col_vals = X_mat[:, j]
-            if col in self.cate_list:
-                cats = self.cat_categories_for_shap[col]
-                codes = np.round(col_vals).astype(int)
-                codes = np.clip(codes, -1, len(cats) - 1)
-                cat_series = pd.Categorical.from_codes(
-                    codes,
-                    categories=cats
-                )
-                data_dict[col] = cat_series
-            else:
-                data_dict[col] = col_vals.astype(float)
-
-        df = pd.DataFrame(data_dict, columns=self.factor_nmes)
-        for col in self.cate_list:
-            if col in df.columns:
-                df[col] = df[col].astype("category")
-        return df
-
-    def _build_glm_design(self, data: pd.DataFrame) -> pd.DataFrame:
-        X = data[self.var_nmes]
-        return sm.add_constant(X, has_constant='add')
-
-    def _compute_shap_core(self,
-                           model_key: str,
-                           n_background: int,
-                           n_samples: int,
-                           on_train: bool,
-                           X_df: pd.DataFrame,
-                           prep_fn,
-                           predict_fn,
-                           cleanup_fn=None):
-        if explain_shap is None:
-            raise RuntimeError("explain.shap_utils is not available.")
-        return explain_shap.compute_shap_core(
-            self,
-            model_key,
-            n_background,
-            n_samples,
-            on_train,
-            X_df=X_df,
-            prep_fn=prep_fn,
-            predict_fn=predict_fn,
-            cleanup_fn=cleanup_fn,
-        )
-
-    def compute_shap_glm(self, n_background: int = 500,
-                         n_samples: int = 200,
-                         on_train: bool = True):
-        if explain_shap is None:
-            raise RuntimeError("explain.shap_utils is not available.")
-        self.shap_glm = explain_shap.compute_shap_glm(
-            self,
-            n_background=n_background,
-            n_samples=n_samples,
-            on_train=on_train,
-        )
-        return self.shap_glm
-
-    def compute_shap_xgb(self, n_background: int = 500,
-                         n_samples: int = 200,
-                         on_train: bool = True):
-        if explain_shap is None:
-            raise RuntimeError("explain.shap_utils is not available.")
-        self.shap_xgb = explain_shap.compute_shap_xgb(
-            self,
-            n_background=n_background,
-            n_samples=n_samples,
-            on_train=on_train,
-        )
-        return self.shap_xgb
-
-    def _resn_predict_wrapper(self, X_np):
-        model = self.resn_best.resnet.to("cpu")
-        with torch.no_grad():
-            X_tensor = torch.tensor(X_np, dtype=torch.float32)
-            y_pred = model(X_tensor).cpu().numpy()
-        y_pred = np.clip(y_pred, 1e-6, None)
-        return y_pred.reshape(-1)
-
-    def compute_shap_resn(self, n_background: int = 500,
-                          n_samples: int = 200,
-                          on_train: bool = True):
-        if explain_shap is None:
-            raise RuntimeError("explain.shap_utils is not available.")
-        self.shap_resn = explain_shap.compute_shap_resn(
-            self,
-            n_background=n_background,
-            n_samples=n_samples,
-            on_train=on_train,
-        )
-        return self.shap_resn
-
-    def _ft_shap_predict_wrapper(self, X_mat: np.ndarray) -> np.ndarray:
-        df_input = self._decode_ft_shap_matrix_to_df(X_mat)
-        y_pred = self.ft_best.predict(df_input)
-        return np.asarray(y_pred, dtype=np.float64).reshape(-1)
-
-    def compute_shap_ft(self, n_background: int = 500,
-                        n_samples: int = 200,
-                        on_train: bool = True):
-        if explain_shap is None:
-            raise RuntimeError("explain.shap_utils is not available.")
-        self.shap_ft = explain_shap.compute_shap_ft(
-            self,
-            n_background=n_background,
-            n_samples=n_samples,
-            on_train=on_train,
-        )
-        return self.shap_ft