ins-pricing 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +9 -6
- ins_pricing/__init__.py +3 -11
- ins_pricing/cli/BayesOpt_entry.py +24 -0
- ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
- ins_pricing/cli/Explain_Run.py +25 -0
- ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
- ins_pricing/cli/Pricing_Run.py +25 -0
- ins_pricing/cli/__init__.py +1 -0
- ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
- ins_pricing/cli/utils/__init__.py +1 -0
- ins_pricing/cli/utils/cli_common.py +320 -0
- ins_pricing/cli/utils/cli_config.py +375 -0
- ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
- {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
- ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
- ins_pricing/docs/modelling/README.md +34 -0
- ins_pricing/modelling/__init__.py +57 -6
- ins_pricing/modelling/core/__init__.py +1 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
- ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
- ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
- ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
- ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
- ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
- ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
- ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
- ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
- ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
- ins_pricing/modelling/core/evaluation.py +115 -0
- ins_pricing/production/__init__.py +4 -0
- ins_pricing/production/preprocess.py +71 -0
- ins_pricing/setup.py +10 -5
- {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
- ins_pricing-0.2.0.dist-info/RECORD +125 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
- ins_pricing/modelling/BayesOpt_entry.py +0 -633
- ins_pricing/modelling/Explain_Run.py +0 -36
- ins_pricing/modelling/Pricing_Run.py +0 -36
- ins_pricing/modelling/README.md +0 -33
- ins_pricing/modelling/bayesopt/models.py +0 -2196
- ins_pricing/modelling/bayesopt/trainers.py +0 -2446
- ins_pricing/modelling/cli_common.py +0 -136
- ins_pricing/modelling/tests/test_plotting.py +0 -63
- ins_pricing/modelling/watchdog_run.py +0 -211
- ins_pricing-0.1.11.dist-info/RECORD +0 -169
- ins_pricing_gemini/__init__.py +0 -23
- ins_pricing_gemini/governance/__init__.py +0 -20
- ins_pricing_gemini/governance/approval.py +0 -93
- ins_pricing_gemini/governance/audit.py +0 -37
- ins_pricing_gemini/governance/registry.py +0 -99
- ins_pricing_gemini/governance/release.py +0 -159
- ins_pricing_gemini/modelling/Explain_Run.py +0 -36
- ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
- ins_pricing_gemini/modelling/__init__.py +0 -151
- ins_pricing_gemini/modelling/cli_common.py +0 -141
- ins_pricing_gemini/modelling/config.py +0 -249
- ins_pricing_gemini/modelling/config_preprocess.py +0 -254
- ins_pricing_gemini/modelling/core.py +0 -741
- ins_pricing_gemini/modelling/data_container.py +0 -42
- ins_pricing_gemini/modelling/explain/__init__.py +0 -55
- ins_pricing_gemini/modelling/explain/gradients.py +0 -334
- ins_pricing_gemini/modelling/explain/metrics.py +0 -176
- ins_pricing_gemini/modelling/explain/permutation.py +0 -155
- ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
- ins_pricing_gemini/modelling/features.py +0 -215
- ins_pricing_gemini/modelling/model_manager.py +0 -148
- ins_pricing_gemini/modelling/model_plotting.py +0 -463
- ins_pricing_gemini/modelling/models.py +0 -2203
- ins_pricing_gemini/modelling/notebook_utils.py +0 -294
- ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
- ins_pricing_gemini/modelling/plotting/common.py +0 -63
- ins_pricing_gemini/modelling/plotting/curves.py +0 -572
- ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
- ins_pricing_gemini/modelling/plotting/geo.py +0 -362
- ins_pricing_gemini/modelling/plotting/importance.py +0 -121
- ins_pricing_gemini/modelling/run_logging.py +0 -133
- ins_pricing_gemini/modelling/tests/conftest.py +0 -8
- ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
- ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
- ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
- ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
- ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
- ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
- ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
- ins_pricing_gemini/modelling/trainers.py +0 -2447
- ins_pricing_gemini/modelling/utils.py +0 -1020
- ins_pricing_gemini/pricing/__init__.py +0 -27
- ins_pricing_gemini/pricing/calibration.py +0 -39
- ins_pricing_gemini/pricing/data_quality.py +0 -117
- ins_pricing_gemini/pricing/exposure.py +0 -85
- ins_pricing_gemini/pricing/factors.py +0 -91
- ins_pricing_gemini/pricing/monitoring.py +0 -99
- ins_pricing_gemini/pricing/rate_table.py +0 -78
- ins_pricing_gemini/production/__init__.py +0 -21
- ins_pricing_gemini/production/drift.py +0 -30
- ins_pricing_gemini/production/monitoring.py +0 -143
- ins_pricing_gemini/production/scoring.py +0 -40
- ins_pricing_gemini/reporting/__init__.py +0 -11
- ins_pricing_gemini/reporting/report_builder.py +0 -72
- ins_pricing_gemini/reporting/scheduler.py +0 -45
- ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
- ins_pricing_gemini/scripts/Explain_entry.py +0 -545
- ins_pricing_gemini/scripts/__init__.py +0 -1
- ins_pricing_gemini/scripts/train.py +0 -568
- ins_pricing_gemini/setup.py +0 -55
- ins_pricing_gemini/smoke_test.py +0 -28
- /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
- /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
- /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
- /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
- {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Callable, Optional, Sequence
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
import pandas as pd
|
|
7
|
-
|
|
8
|
-
from .metrics import resolve_metric
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def _prepare_data(X, y, sample_weight, max_rows, rng):
|
|
12
|
-
y_arr = np.asarray(y)
|
|
13
|
-
if y_arr.ndim != 1:
|
|
14
|
-
y_arr = y_arr.reshape(-1)
|
|
15
|
-
|
|
16
|
-
w_arr = None
|
|
17
|
-
if sample_weight is not None:
|
|
18
|
-
w_arr = np.asarray(sample_weight).reshape(-1)
|
|
19
|
-
if w_arr.shape[0] != y_arr.shape[0]:
|
|
20
|
-
raise ValueError("sample_weight length must match y.")
|
|
21
|
-
|
|
22
|
-
if isinstance(X, pd.DataFrame):
|
|
23
|
-
X_data = X
|
|
24
|
-
if len(X_data) != len(y_arr):
|
|
25
|
-
raise ValueError("X and y must have the same length.")
|
|
26
|
-
if max_rows and len(X_data) > max_rows:
|
|
27
|
-
idx = rng.choice(len(X_data), size=int(max_rows), replace=False)
|
|
28
|
-
X_data = X_data.iloc[idx].copy()
|
|
29
|
-
y_arr = y_arr[idx]
|
|
30
|
-
if w_arr is not None:
|
|
31
|
-
w_arr = w_arr[idx]
|
|
32
|
-
return X_data, y_arr, w_arr
|
|
33
|
-
|
|
34
|
-
X_np = np.asarray(X)
|
|
35
|
-
if X_np.ndim != 2:
|
|
36
|
-
raise ValueError("X must be 2d when not a DataFrame.")
|
|
37
|
-
if X_np.shape[0] != y_arr.shape[0]:
|
|
38
|
-
raise ValueError("X and y must have the same length.")
|
|
39
|
-
if max_rows and X_np.shape[0] > max_rows:
|
|
40
|
-
idx = rng.choice(X_np.shape[0], size=int(max_rows), replace=False)
|
|
41
|
-
X_np = X_np[idx]
|
|
42
|
-
y_arr = y_arr[idx]
|
|
43
|
-
if w_arr is not None:
|
|
44
|
-
w_arr = w_arr[idx]
|
|
45
|
-
return X_np, y_arr, w_arr
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def permutation_importance(
|
|
49
|
-
predict_fn: Callable,
|
|
50
|
-
X,
|
|
51
|
-
y,
|
|
52
|
-
*,
|
|
53
|
-
sample_weight=None,
|
|
54
|
-
metric: str | Callable = "auto",
|
|
55
|
-
task_type: Optional[str] = None,
|
|
56
|
-
higher_is_better: Optional[bool] = None,
|
|
57
|
-
n_repeats: int = 5,
|
|
58
|
-
random_state: Optional[int] = None,
|
|
59
|
-
max_rows: Optional[int] = 5000,
|
|
60
|
-
features: Optional[Sequence[str]] = None,
|
|
61
|
-
return_scores: bool = False,
|
|
62
|
-
safe_copy: bool = False,
|
|
63
|
-
) -> pd.DataFrame:
|
|
64
|
-
"""Permutation importance on tabular data.
|
|
65
|
-
|
|
66
|
-
predict_fn should accept the same type as X (DataFrame or ndarray).
|
|
67
|
-
Set safe_copy=True if predict_fn mutates its input.
|
|
68
|
-
"""
|
|
69
|
-
rng = np.random.default_rng(random_state)
|
|
70
|
-
n_repeats = max(1, int(n_repeats))
|
|
71
|
-
|
|
72
|
-
X_data, y_arr, w_arr = _prepare_data(X, y, sample_weight, max_rows, rng)
|
|
73
|
-
metric_fn, higher_is_better, metric_name = resolve_metric(
|
|
74
|
-
metric, task_type=task_type, higher_is_better=higher_is_better
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
baseline_pred = predict_fn(X_data)
|
|
78
|
-
baseline_score = metric_fn(y_arr, baseline_pred, w_arr)
|
|
79
|
-
|
|
80
|
-
if isinstance(X_data, pd.DataFrame):
|
|
81
|
-
feature_names = list(X_data.columns)
|
|
82
|
-
if features is not None:
|
|
83
|
-
feature_names = [f for f in features if f in X_data.columns]
|
|
84
|
-
X_perm = X_data.copy()
|
|
85
|
-
results = []
|
|
86
|
-
for feat in feature_names:
|
|
87
|
-
orig_series = X_perm[feat].copy()
|
|
88
|
-
orig_values = orig_series.to_numpy(copy=True)
|
|
89
|
-
scores = []
|
|
90
|
-
for _ in range(n_repeats):
|
|
91
|
-
X_perm[feat] = rng.permutation(orig_values)
|
|
92
|
-
pred_input = X_perm.copy() if safe_copy else X_perm
|
|
93
|
-
pred = predict_fn(pred_input)
|
|
94
|
-
score = metric_fn(y_arr, pred, w_arr)
|
|
95
|
-
scores.append(float(score))
|
|
96
|
-
X_perm[feat] = orig_series
|
|
97
|
-
|
|
98
|
-
scores_arr = np.asarray(scores, dtype=float)
|
|
99
|
-
if higher_is_better:
|
|
100
|
-
delta = baseline_score - scores_arr
|
|
101
|
-
else:
|
|
102
|
-
delta = scores_arr - baseline_score
|
|
103
|
-
entry = {
|
|
104
|
-
"feature": feat,
|
|
105
|
-
"importance_mean": float(np.mean(delta)),
|
|
106
|
-
"importance_std": float(np.std(delta)),
|
|
107
|
-
"baseline_score": float(baseline_score),
|
|
108
|
-
"permutation_score_mean": float(np.mean(scores_arr)),
|
|
109
|
-
"metric": metric_name,
|
|
110
|
-
}
|
|
111
|
-
if return_scores:
|
|
112
|
-
entry["permutation_scores"] = scores
|
|
113
|
-
results.append(entry)
|
|
114
|
-
else:
|
|
115
|
-
if features is not None:
|
|
116
|
-
if len(features) != X_data.shape[1]:
|
|
117
|
-
raise ValueError("features length must match X columns for ndarray input.")
|
|
118
|
-
feature_names = list(features)
|
|
119
|
-
else:
|
|
120
|
-
feature_names = [f"x{i}" for i in range(X_data.shape[1])]
|
|
121
|
-
|
|
122
|
-
X_base = np.asarray(X_data)
|
|
123
|
-
X_perm = X_base.copy()
|
|
124
|
-
results = []
|
|
125
|
-
for idx, feat in enumerate(feature_names):
|
|
126
|
-
orig_col = X_base[:, idx].copy()
|
|
127
|
-
scores = []
|
|
128
|
-
for _ in range(n_repeats):
|
|
129
|
-
X_perm[:, idx] = rng.permutation(orig_col)
|
|
130
|
-
pred_input = X_perm.copy() if safe_copy else X_perm
|
|
131
|
-
pred = predict_fn(pred_input)
|
|
132
|
-
score = metric_fn(y_arr, pred, w_arr)
|
|
133
|
-
scores.append(float(score))
|
|
134
|
-
X_perm[:, idx] = orig_col
|
|
135
|
-
|
|
136
|
-
scores_arr = np.asarray(scores, dtype=float)
|
|
137
|
-
if higher_is_better:
|
|
138
|
-
delta = baseline_score - scores_arr
|
|
139
|
-
else:
|
|
140
|
-
delta = scores_arr - baseline_score
|
|
141
|
-
entry = {
|
|
142
|
-
"feature": feat,
|
|
143
|
-
"importance_mean": float(np.mean(delta)),
|
|
144
|
-
"importance_std": float(np.std(delta)),
|
|
145
|
-
"baseline_score": float(baseline_score),
|
|
146
|
-
"permutation_score_mean": float(np.mean(scores_arr)),
|
|
147
|
-
"metric": metric_name,
|
|
148
|
-
}
|
|
149
|
-
if return_scores:
|
|
150
|
-
entry["permutation_scores"] = scores
|
|
151
|
-
results.append(entry)
|
|
152
|
-
|
|
153
|
-
df = pd.DataFrame(results)
|
|
154
|
-
df = df.sort_values(by="importance_mean", ascending=False).reset_index(drop=True)
|
|
155
|
-
return df
|
|
@@ -1,146 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Callable, Optional
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
import pandas as pd
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def _require_shap():
|
|
10
|
-
try:
|
|
11
|
-
import shap # type: ignore
|
|
12
|
-
except Exception as exc: # pragma: no cover - optional dependency
|
|
13
|
-
raise ImportError("SHAP is required. Install with `pip install shap`.") from exc
|
|
14
|
-
return shap
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def compute_shap_core(
|
|
18
|
-
ctx,
|
|
19
|
-
model_key: str,
|
|
20
|
-
n_background: int,
|
|
21
|
-
n_samples: int,
|
|
22
|
-
on_train: bool,
|
|
23
|
-
X_df: pd.DataFrame,
|
|
24
|
-
prep_fn: Callable[[pd.DataFrame], np.ndarray],
|
|
25
|
-
predict_fn: Callable[[np.ndarray], np.ndarray],
|
|
26
|
-
cleanup_fn: Optional[Callable[[], None]] = None,
|
|
27
|
-
) -> dict:
|
|
28
|
-
"""Shared SHAP pipeline using KernelExplainer with lazy import."""
|
|
29
|
-
_ = on_train
|
|
30
|
-
if model_key not in ctx.trainers or ctx.trainers[model_key].model is None:
|
|
31
|
-
raise RuntimeError(f"Model {model_key} not trained.")
|
|
32
|
-
if cleanup_fn:
|
|
33
|
-
cleanup_fn()
|
|
34
|
-
shap = _require_shap()
|
|
35
|
-
bg_df = ctx._sample_rows(X_df, n_background)
|
|
36
|
-
bg_mat = prep_fn(bg_df)
|
|
37
|
-
explainer = shap.KernelExplainer(predict_fn, bg_mat)
|
|
38
|
-
ex_df = ctx._sample_rows(X_df, n_samples)
|
|
39
|
-
ex_mat = prep_fn(ex_df)
|
|
40
|
-
nsample_eff = ctx._shap_nsamples(ex_mat)
|
|
41
|
-
shap_values = explainer.shap_values(ex_mat, nsamples=nsample_eff)
|
|
42
|
-
bg_pred = predict_fn(bg_mat)
|
|
43
|
-
base_value = float(np.asarray(bg_pred).mean())
|
|
44
|
-
|
|
45
|
-
return {
|
|
46
|
-
"explainer": explainer,
|
|
47
|
-
"X_explain": ex_df,
|
|
48
|
-
"shap_values": shap_values,
|
|
49
|
-
"base_value": base_value,
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def compute_shap_glm(ctx, n_background: int = 500, n_samples: int = 200, on_train: bool = True):
|
|
54
|
-
data = ctx.train_oht_scl_data if on_train else ctx.test_oht_scl_data
|
|
55
|
-
design_all = ctx._build_glm_design(data)
|
|
56
|
-
design_cols = list(design_all.columns)
|
|
57
|
-
|
|
58
|
-
def predict_wrapper(x_np):
|
|
59
|
-
x_df = pd.DataFrame(x_np, columns=design_cols)
|
|
60
|
-
y_pred = ctx.glm_best.predict(x_df)
|
|
61
|
-
return np.asarray(y_pred, dtype=np.float64).reshape(-1)
|
|
62
|
-
|
|
63
|
-
return compute_shap_core(
|
|
64
|
-
ctx,
|
|
65
|
-
"glm",
|
|
66
|
-
n_background,
|
|
67
|
-
n_samples,
|
|
68
|
-
on_train,
|
|
69
|
-
X_df=design_all,
|
|
70
|
-
prep_fn=lambda df: df.to_numpy(dtype=np.float64),
|
|
71
|
-
predict_fn=predict_wrapper,
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def compute_shap_xgb(ctx, n_background: int = 500, n_samples: int = 200, on_train: bool = True):
|
|
76
|
-
data = ctx.train_data if on_train else ctx.test_data
|
|
77
|
-
X_raw = data[ctx.factor_nmes]
|
|
78
|
-
|
|
79
|
-
def predict_wrapper(x_mat):
|
|
80
|
-
df_input = ctx._decode_ft_shap_matrix_to_df(x_mat)
|
|
81
|
-
return ctx.xgb_best.predict(df_input)
|
|
82
|
-
|
|
83
|
-
return compute_shap_core(
|
|
84
|
-
ctx,
|
|
85
|
-
"xgb",
|
|
86
|
-
n_background,
|
|
87
|
-
n_samples,
|
|
88
|
-
on_train,
|
|
89
|
-
X_df=X_raw,
|
|
90
|
-
prep_fn=lambda df: ctx._build_ft_shap_matrix(df).astype(np.float64),
|
|
91
|
-
predict_fn=predict_wrapper,
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def compute_shap_resn(ctx, n_background: int = 500, n_samples: int = 200, on_train: bool = True):
|
|
96
|
-
data = ctx.train_oht_scl_data if on_train else ctx.test_oht_scl_data
|
|
97
|
-
X = data[ctx.var_nmes]
|
|
98
|
-
|
|
99
|
-
def cleanup():
|
|
100
|
-
import torch
|
|
101
|
-
|
|
102
|
-
ctx.resn_best.device = torch.device("cpu")
|
|
103
|
-
ctx.resn_best.resnet.to("cpu")
|
|
104
|
-
if torch.cuda.is_available():
|
|
105
|
-
torch.cuda.empty_cache()
|
|
106
|
-
|
|
107
|
-
return compute_shap_core(
|
|
108
|
-
ctx,
|
|
109
|
-
"resn",
|
|
110
|
-
n_background,
|
|
111
|
-
n_samples,
|
|
112
|
-
on_train,
|
|
113
|
-
X_df=X,
|
|
114
|
-
prep_fn=lambda df: df.to_numpy(dtype=np.float64),
|
|
115
|
-
predict_fn=lambda x: ctx._resn_predict_wrapper(x),
|
|
116
|
-
cleanup_fn=cleanup,
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def compute_shap_ft(ctx, n_background: int = 500, n_samples: int = 200, on_train: bool = True):
|
|
121
|
-
if str(ctx.config.ft_role) != "model":
|
|
122
|
-
raise RuntimeError(
|
|
123
|
-
"FT is configured as embedding-only (ft_role != 'model'); FT SHAP is disabled."
|
|
124
|
-
)
|
|
125
|
-
data = ctx.train_data if on_train else ctx.test_data
|
|
126
|
-
X_raw = data[ctx.factor_nmes]
|
|
127
|
-
|
|
128
|
-
def cleanup():
|
|
129
|
-
import torch
|
|
130
|
-
|
|
131
|
-
ctx.ft_best.device = torch.device("cpu")
|
|
132
|
-
ctx.ft_best.ft.to("cpu")
|
|
133
|
-
if torch.cuda.is_available():
|
|
134
|
-
torch.cuda.empty_cache()
|
|
135
|
-
|
|
136
|
-
return compute_shap_core(
|
|
137
|
-
ctx,
|
|
138
|
-
"ft",
|
|
139
|
-
n_background,
|
|
140
|
-
n_samples,
|
|
141
|
-
on_train,
|
|
142
|
-
X_df=X_raw,
|
|
143
|
-
prep_fn=lambda df: ctx._build_ft_shap_matrix(df).astype(np.float64),
|
|
144
|
-
predict_fn=ctx._ft_shap_predict_wrapper,
|
|
145
|
-
cleanup_fn=cleanup,
|
|
146
|
-
)
|
|
@@ -1,215 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
import numpy as np
|
|
5
|
-
from sklearn.preprocessing import StandardScaler
|
|
6
|
-
from typing import Optional, Dict, Any, TYPE_CHECKING
|
|
7
|
-
import sys
|
|
8
|
-
|
|
9
|
-
if TYPE_CHECKING:
|
|
10
|
-
from .core import BayesOptModel
|
|
11
|
-
|
|
12
|
-
def _add_region_effect(model: "BayesOptModel") -> None:
|
|
13
|
-
"""Partial pooling over province/city to create a smoothed region_effect feature."""
|
|
14
|
-
prov_col = model.config.region_province_col
|
|
15
|
-
city_col = model.config.region_city_col
|
|
16
|
-
if not prov_col or not city_col:
|
|
17
|
-
return
|
|
18
|
-
for col in [prov_col, city_col]:
|
|
19
|
-
if col not in model.train_data.columns:
|
|
20
|
-
print(f"[RegionEffect] Missing column {col}; skipped.")
|
|
21
|
-
return
|
|
22
|
-
|
|
23
|
-
def safe_mean(df: pd.DataFrame) -> float:
|
|
24
|
-
w = df[model.weight_nme]
|
|
25
|
-
y = df[model.resp_nme]
|
|
26
|
-
# EPS is imported from utils usually, but we can define local or import
|
|
27
|
-
denom = max(float(w.sum()), 1e-6)
|
|
28
|
-
return float((y * w).sum() / denom)
|
|
29
|
-
|
|
30
|
-
global_mean = safe_mean(model.train_data)
|
|
31
|
-
alpha = max(float(model.config.region_effect_alpha), 0.0)
|
|
32
|
-
|
|
33
|
-
w_all = model.train_data[model.weight_nme]
|
|
34
|
-
y_all = model.train_data[model.resp_nme]
|
|
35
|
-
yw_all = y_all * w_all
|
|
36
|
-
|
|
37
|
-
prov_sumw = w_all.groupby(model.train_data[prov_col]).sum()
|
|
38
|
-
prov_sumyw = yw_all.groupby(model.train_data[prov_col]).sum()
|
|
39
|
-
prov_mean = (prov_sumyw / prov_sumw.clip(lower=1e-6)).astype(float)
|
|
40
|
-
prov_mean = prov_mean.fillna(global_mean)
|
|
41
|
-
|
|
42
|
-
city_sumw = model.train_data.groupby([prov_col, city_col])[
|
|
43
|
-
model.weight_nme].sum()
|
|
44
|
-
city_sumyw = yw_all.groupby(
|
|
45
|
-
[model.train_data[prov_col], model.train_data[city_col]]).sum()
|
|
46
|
-
city_df = pd.DataFrame({
|
|
47
|
-
"sum_w": city_sumw,
|
|
48
|
-
"sum_yw": city_sumyw,
|
|
49
|
-
})
|
|
50
|
-
city_df["prior"] = city_df.index.get_level_values(0).map(
|
|
51
|
-
prov_mean).fillna(global_mean)
|
|
52
|
-
city_df["effect"] = (
|
|
53
|
-
city_df["sum_yw"] + alpha * city_df["prior"]
|
|
54
|
-
) / (city_df["sum_w"] + alpha).clip(lower=1e-6)
|
|
55
|
-
city_effect = city_df["effect"]
|
|
56
|
-
|
|
57
|
-
def lookup_effect(df: pd.DataFrame) -> pd.Series:
|
|
58
|
-
idx = pd.MultiIndex.from_frame(df[[prov_col, city_col]])
|
|
59
|
-
effects = city_effect.reindex(idx).to_numpy(dtype=np.float64)
|
|
60
|
-
prov_fallback = df[prov_col].map(
|
|
61
|
-
prov_mean).fillna(global_mean).to_numpy(dtype=np.float64)
|
|
62
|
-
effects = np.where(np.isfinite(effects), effects, prov_fallback)
|
|
63
|
-
effects = np.where(np.isfinite(effects), effects, global_mean)
|
|
64
|
-
return pd.Series(effects, index=df.index, dtype=np.float32)
|
|
65
|
-
|
|
66
|
-
re_train = lookup_effect(model.train_data)
|
|
67
|
-
re_test = lookup_effect(model.test_data)
|
|
68
|
-
|
|
69
|
-
col_name = "region_effect"
|
|
70
|
-
model.train_data[col_name] = re_train
|
|
71
|
-
model.test_data[col_name] = re_test
|
|
72
|
-
|
|
73
|
-
# Sync into one-hot and scaled variants.
|
|
74
|
-
for df in [model.train_oht_data, model.test_oht_data]:
|
|
75
|
-
if df is not None:
|
|
76
|
-
df[col_name] = re_train if df is model.train_oht_data else re_test
|
|
77
|
-
|
|
78
|
-
# Standardize region_effect and propagate.
|
|
79
|
-
scaler = StandardScaler()
|
|
80
|
-
re_train_s = scaler.fit_transform(
|
|
81
|
-
re_train.values.reshape(-1, 1)).astype(np.float32).reshape(-1)
|
|
82
|
-
re_test_s = scaler.transform(
|
|
83
|
-
re_test.values.reshape(-1, 1)).astype(np.float32).reshape(-1)
|
|
84
|
-
for df in [model.train_oht_scl_data, model.test_oht_scl_data]:
|
|
85
|
-
if df is not None:
|
|
86
|
-
df[col_name] = re_train_s if df is model.train_oht_scl_data else re_test_s
|
|
87
|
-
|
|
88
|
-
# Update feature lists.
|
|
89
|
-
if col_name not in model.factor_nmes:
|
|
90
|
-
model.factor_nmes.append(col_name)
|
|
91
|
-
if col_name not in model.num_features:
|
|
92
|
-
model.num_features.append(col_name)
|
|
93
|
-
if model.train_oht_scl_data is not None:
|
|
94
|
-
excluded = {model.weight_nme, model.resp_nme}
|
|
95
|
-
model.var_nmes = [
|
|
96
|
-
col for col in model.train_oht_scl_data.columns if col not in excluded
|
|
97
|
-
]
|
|
98
|
-
|
|
99
|
-
def _build_geo_tokens(model: "BayesOptModel", params_override: Optional[Dict[str, Any]] = None):
|
|
100
|
-
"""Internal builder; allows trial overrides and returns None on failure."""
|
|
101
|
-
from .models import GraphNeuralNetSklearn # lazy import to avoid circle if models imports core
|
|
102
|
-
|
|
103
|
-
geo_cols = list(model.config.geo_feature_nmes or [])
|
|
104
|
-
if not geo_cols:
|
|
105
|
-
return None
|
|
106
|
-
|
|
107
|
-
available = [c for c in geo_cols if c in model.train_data.columns]
|
|
108
|
-
if not available:
|
|
109
|
-
return None
|
|
110
|
-
|
|
111
|
-
# Preprocess text/numeric: fill numeric with median, label-encode text, map unknowns.
|
|
112
|
-
proc_train = {}
|
|
113
|
-
proc_test = {}
|
|
114
|
-
for col in available:
|
|
115
|
-
s_train = model.train_data[col]
|
|
116
|
-
s_test = model.test_data[col]
|
|
117
|
-
if pd.api.types.is_numeric_dtype(s_train):
|
|
118
|
-
tr = pd.to_numeric(s_train, errors="coerce")
|
|
119
|
-
te = pd.to_numeric(s_test, errors="coerce")
|
|
120
|
-
med = np.nanmedian(tr)
|
|
121
|
-
proc_train[col] = np.nan_to_num(tr, nan=med).astype(np.float32)
|
|
122
|
-
proc_test[col] = np.nan_to_num(te, nan=med).astype(np.float32)
|
|
123
|
-
else:
|
|
124
|
-
cats = pd.Categorical(s_train.astype(str))
|
|
125
|
-
tr_codes = cats.codes.astype(np.float32, copy=True)
|
|
126
|
-
tr_codes[tr_codes < 0] = len(cats.categories)
|
|
127
|
-
te_cats = pd.Categorical(
|
|
128
|
-
s_test.astype(str), categories=cats.categories)
|
|
129
|
-
te_codes = te_cats.codes.astype(np.float32, copy=True)
|
|
130
|
-
te_codes[te_codes < 0] = len(cats.categories)
|
|
131
|
-
proc_train[col] = tr_codes
|
|
132
|
-
proc_test[col] = te_codes
|
|
133
|
-
|
|
134
|
-
train_geo_raw = pd.DataFrame(proc_train, index=model.train_data.index)
|
|
135
|
-
test_geo_raw = pd.DataFrame(proc_test, index=model.test_data.index)
|
|
136
|
-
|
|
137
|
-
scaler = StandardScaler()
|
|
138
|
-
train_geo = pd.DataFrame(
|
|
139
|
-
scaler.fit_transform(train_geo_raw),
|
|
140
|
-
columns=available,
|
|
141
|
-
index=model.train_data.index
|
|
142
|
-
)
|
|
143
|
-
test_geo = pd.DataFrame(
|
|
144
|
-
scaler.transform(test_geo_raw),
|
|
145
|
-
columns=available,
|
|
146
|
-
index=model.test_data.index
|
|
147
|
-
)
|
|
148
|
-
|
|
149
|
-
tw_power = model.default_tweedie_power()
|
|
150
|
-
|
|
151
|
-
cfg = params_override or {}
|
|
152
|
-
try:
|
|
153
|
-
geo_gnn = GraphNeuralNetSklearn(
|
|
154
|
-
model_nme=f"{model.model_nme}_geo",
|
|
155
|
-
input_dim=len(available),
|
|
156
|
-
hidden_dim=cfg.get("geo_token_hidden_dim",
|
|
157
|
-
model.config.geo_token_hidden_dim),
|
|
158
|
-
num_layers=cfg.get("geo_token_layers",
|
|
159
|
-
model.config.geo_token_layers),
|
|
160
|
-
k_neighbors=cfg.get("geo_token_k_neighbors",
|
|
161
|
-
model.config.geo_token_k_neighbors),
|
|
162
|
-
dropout=cfg.get("geo_token_dropout",
|
|
163
|
-
model.config.geo_token_dropout),
|
|
164
|
-
learning_rate=cfg.get(
|
|
165
|
-
"geo_token_learning_rate", model.config.geo_token_learning_rate),
|
|
166
|
-
epochs=int(cfg.get("geo_token_epochs",
|
|
167
|
-
model.config.geo_token_epochs)),
|
|
168
|
-
patience=5,
|
|
169
|
-
task_type=model.task_type,
|
|
170
|
-
tweedie_power=tw_power,
|
|
171
|
-
use_data_parallel=False,
|
|
172
|
-
use_ddp=False,
|
|
173
|
-
use_approx_knn=model.config.gnn_use_approx_knn,
|
|
174
|
-
approx_knn_threshold=model.config.gnn_approx_knn_threshold,
|
|
175
|
-
graph_cache_path=None,
|
|
176
|
-
max_gpu_knn_nodes=model.config.gnn_max_gpu_knn_nodes,
|
|
177
|
-
knn_gpu_mem_ratio=model.config.gnn_knn_gpu_mem_ratio,
|
|
178
|
-
knn_gpu_mem_overhead=model.config.gnn_knn_gpu_mem_overhead
|
|
179
|
-
)
|
|
180
|
-
geo_gnn.fit(
|
|
181
|
-
train_geo,
|
|
182
|
-
model.train_data[model.resp_nme],
|
|
183
|
-
model.train_data[model.weight_nme]
|
|
184
|
-
)
|
|
185
|
-
train_embed = geo_gnn.encode(train_geo)
|
|
186
|
-
test_embed = geo_gnn.encode(test_geo)
|
|
187
|
-
cols = [f"geo_token_{i}" for i in range(train_embed.shape[1])]
|
|
188
|
-
train_tokens = pd.DataFrame(
|
|
189
|
-
train_embed, index=model.train_data.index, columns=cols)
|
|
190
|
-
test_tokens = pd.DataFrame(
|
|
191
|
-
test_embed, index=model.test_data.index, columns=cols)
|
|
192
|
-
return train_tokens, test_tokens, cols, geo_gnn
|
|
193
|
-
except Exception as exc:
|
|
194
|
-
print(f"[GeoToken] Generation failed: {exc}")
|
|
195
|
-
return None
|
|
196
|
-
|
|
197
|
-
def _prepare_geo_tokens(model: "BayesOptModel") -> None:
|
|
198
|
-
"""Build and persist geo tokens with default config values."""
|
|
199
|
-
gnn_trainer = model.model_manager.trainers.get("gnn")
|
|
200
|
-
if gnn_trainer is not None and hasattr(gnn_trainer, "prepare_geo_tokens"):
|
|
201
|
-
try:
|
|
202
|
-
gnn_trainer.prepare_geo_tokens(force=False) # type: ignore[attr-defined]
|
|
203
|
-
return
|
|
204
|
-
except Exception as exc:
|
|
205
|
-
print(f"[GeoToken] GNNTrainer generation failed: {exc}")
|
|
206
|
-
|
|
207
|
-
result = _build_geo_tokens(model)
|
|
208
|
-
if result is None:
|
|
209
|
-
return
|
|
210
|
-
train_tokens, test_tokens, cols, geo_gnn = result
|
|
211
|
-
model.train_geo_tokens = train_tokens
|
|
212
|
-
model.test_geo_tokens = test_tokens
|
|
213
|
-
model.geo_token_cols = cols
|
|
214
|
-
model.geo_gnn_model = geo_gnn
|
|
215
|
-
print(f"[GeoToken] Generated {len(cols)}-dim geo tokens; injecting into FT.")
|
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
from typing import Dict, Optional, Any
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from dataclasses import asdict
|
|
5
|
-
from .trainers import TrainerBase, GLMTrainer, XGBTrainer, ResNetTrainer, FTTrainer, GNNTrainer
|
|
6
|
-
from .utils import IOUtils
|
|
7
|
-
|
|
8
|
-
class ModelManager:
|
|
9
|
-
"""Manages lifecycle and access to model trainers."""
|
|
10
|
-
|
|
11
|
-
def __init__(self, context: Any) -> None:
|
|
12
|
-
# context is the BayesOptModel instance
|
|
13
|
-
self.ctx = context
|
|
14
|
-
self.trainers: Dict[str, TrainerBase] = {}
|
|
15
|
-
self._initialize_trainers()
|
|
16
|
-
|
|
17
|
-
def _initialize_trainers(self) -> None:
|
|
18
|
-
self.trainers['glm'] = GLMTrainer(self.ctx)
|
|
19
|
-
self.trainers['xgb'] = XGBTrainer(self.ctx)
|
|
20
|
-
self.trainers['resn'] = ResNetTrainer(self.ctx)
|
|
21
|
-
self.trainers['ft'] = FTTrainer(self.ctx)
|
|
22
|
-
self.trainers['gnn'] = GNNTrainer(self.ctx)
|
|
23
|
-
|
|
24
|
-
def get_trainer(self, key: str) -> TrainerBase:
|
|
25
|
-
trainer = self.trainers.get(key)
|
|
26
|
-
if trainer is None:
|
|
27
|
-
raise KeyError(f"Unknown model key: {key}")
|
|
28
|
-
return trainer
|
|
29
|
-
|
|
30
|
-
def _maybe_load_best_params(self, model_key: str, trainer: TrainerBase) -> None:
|
|
31
|
-
# 1) If best_params_files is specified, load and skip tuning.
|
|
32
|
-
best_params_files = getattr(self.ctx.config, "best_params_files", None) or {}
|
|
33
|
-
best_params_file = best_params_files.get(model_key)
|
|
34
|
-
if best_params_file and not trainer.best_params:
|
|
35
|
-
trainer.best_params = IOUtils.load_params_file(best_params_file)
|
|
36
|
-
trainer.best_trial = None
|
|
37
|
-
print(
|
|
38
|
-
f"[Optuna][{trainer.label}] Loaded best_params from {best_params_file}; skip tuning."
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
# 2) If reuse_best_params is enabled, prefer version snapshots.
|
|
42
|
-
reuse_params = bool(getattr(self.ctx.config, "reuse_best_params", False))
|
|
43
|
-
if reuse_params and not trainer.best_params:
|
|
44
|
-
payload = self.ctx.version_manager.load_latest(f"{model_key}_best")
|
|
45
|
-
best_params = None if payload is None else payload.get("best_params")
|
|
46
|
-
if best_params:
|
|
47
|
-
trainer.best_params = best_params
|
|
48
|
-
trainer.best_trial = None
|
|
49
|
-
trainer.study_name = payload.get(
|
|
50
|
-
"study_name") if isinstance(payload, dict) else None
|
|
51
|
-
print(
|
|
52
|
-
f"[Optuna][{trainer.label}] Reusing best_params from versions snapshot.")
|
|
53
|
-
return
|
|
54
|
-
|
|
55
|
-
# Fallback to legacy CSV (accessed via ctx.output_manager which is available on ctx)
|
|
56
|
-
params_path = self.ctx.output_manager.result_path(
|
|
57
|
-
f'{self.ctx.model_nme}_bestparams_{trainer.label.lower()}.csv'
|
|
58
|
-
)
|
|
59
|
-
# trainer.load_best_params_csv is not standard on TrainerBase but implemented on subclasses usually
|
|
60
|
-
# But checking core.py, it was loading locally.
|
|
61
|
-
# Actually core.py logic for (3) was omitted in my previous read view (lines 640+).
|
|
62
|
-
# Assuming I should rely on whatever logic was there or just omit legacy CSV if possible.
|
|
63
|
-
# But to be safe, let's stick to what we see: reusing snapshots is modern way.
|
|
64
|
-
# If logic requires CSV loading, I'd need to verify Trainer implementations.
|
|
65
|
-
# Ideally Trainer.load() or similar should handle this?
|
|
66
|
-
# For now, I'll rely on version snapshots as primary persistence.
|
|
67
|
-
|
|
68
|
-
def optimize(self, model_key: str, max_evals: int = 100) -> None:
    """Tune (if needed), train, optionally ensemble, and snapshot one model.

    Parameters
    ----------
    model_key:
        Key identifying the trainer in ``self.trainers`` (e.g. ``"ft"``).
        Unknown keys are reported with a warning and ignored.
    max_evals:
        Maximum number of tuning evaluations when tuning is required.

    Side effects
    ------------
    Mutates the trainer's state, writes ``<model_key>_best``,
    ``best_<model_key>_params`` and ``best_<model_key>_trial`` attributes on
    ``self.ctx`` (backward compatibility), and persists a version snapshot.
    """
    if model_key not in self.trainers:
        print(f"Warning: Unknown model key: {model_key}")
        return

    trainer = self.get_trainer(model_key)
    # Best params may already come from a params file or a version snapshot;
    # in that case tuning is skipped entirely.
    self._maybe_load_best_params(model_key, trainer)

    if not trainer.best_params:
        self._tune_trainer(model_key, trainer, max_evals)

    if model_key == "ft" and str(self.ctx.config.ft_role) != "model":
        self._train_ft_as_feature(trainer)
    else:
        trainer.train()

    self._maybe_run_ensemble(model_key, trainer)

    # Update context fields for backward compatibility
    setattr(self.ctx, f"{model_key}_best", trainer.model)
    setattr(self.ctx, f"best_{model_key}_params", trainer.best_params)
    setattr(self.ctx, f"best_{model_key}_trial", trainer.best_trial)

    self._snapshot_best(model_key, trainer)

def _tune_trainer(self, model_key: str, trainer, max_evals: int) -> None:
    """Run hyper-parameter tuning; FT in unsupervised-embedding role uses its unsupervised objective."""
    if model_key == "ft" and str(self.ctx.config.ft_role) == "unsupervised_embedding":
        if not hasattr(trainer, "cross_val_unsupervised"):
            raise RuntimeError(
                "FT trainer does not support unsupervised Optuna objective.")
        trainer.tune(max_evals, objective_fn=trainer.cross_val_unsupervised)
    else:
        trainer.tune(max_evals)

def _train_ft_as_feature(self, trainer) -> None:
    """Train the FT model in a feature-generating role and inject its outputs as features.

    Raises ``ValueError`` for an unsupported ``ft_role``.
    """
    prefix = str(self.ctx.config.ft_feature_prefix or "ft_emb")
    role = str(self.ctx.config.ft_role)
    if role == "embedding":
        trainer.train_as_feature(
            pred_prefix=prefix, feature_mode="embedding")
    elif role == "unsupervised_embedding":
        trainer.pretrain_unsupervised_as_feature(
            pred_prefix=prefix,
            params=trainer.best_params
        )
    else:
        raise ValueError(
            f"Unsupported ft_role='{role}', expected 'model'/'embedding'/'unsupervised_embedding'.")

    # Inject generated prediction/embedding columns as features (scalar or vector).
    # Callback to context since data state lives there (in DataContainer delegators)
    self.ctx._inject_pred_features(prefix)

def _maybe_run_ensemble(self, model_key: str, trainer) -> None:
    """Run k-model ensemble prediction when enabled in config; no-op otherwise."""
    if not bool(getattr(self.ctx.config, "final_ensemble", False)):
        return
    k = int(getattr(self.ctx.config, "final_ensemble_k", 3) or 3)
    if k <= 1:
        return
    # FT in a feature-generating role produces no final model to ensemble.
    if model_key == "ft" and str(self.ctx.config.ft_role) != "model":
        return
    if hasattr(trainer, "ensemble_predict"):
        trainer.ensemble_predict(k)
    else:
        print(
            f"[Ensemble] Trainer '{model_key}' does not support ensemble prediction.",
            flush=True,
        )

def _snapshot_best(self, model_key: str, trainer) -> None:
    """Persist best params, study name, and config via the version manager for traceability."""
    study_name = getattr(trainer, "study_name", None)
    if study_name is None and trainer.best_trial is not None:
        # Fall back to the study attached to the best trial, when available.
        study_obj = getattr(trainer.best_trial, "study", None)
        study_name = getattr(study_obj, "study_name", None)

    # Pydantic v2 config -> dict; dataclass fallback kept for safety.
    if hasattr(self.ctx.config, "model_dump"):
        config_dict = self.ctx.config.model_dump()
    else:
        config_dict = asdict(self.ctx.config)

    snapshot = {
        "model_key": model_key,
        "timestamp": datetime.now().isoformat(),
        "best_params": trainer.best_params,
        "study_name": study_name,
        "config": config_dict,
    }
    self.ctx.version_manager.save(f"{model_key}_best", snapshot)
|