ins-pricing: ins_pricing-0.4.5-py3-none-any.whl → ins_pricing-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ins_pricing/README.md +48 -22
- ins_pricing/__init__.py +142 -90
- ins_pricing/cli/BayesOpt_entry.py +58 -46
- ins_pricing/cli/BayesOpt_incremental.py +77 -110
- ins_pricing/cli/Explain_Run.py +42 -23
- ins_pricing/cli/Explain_entry.py +551 -577
- ins_pricing/cli/Pricing_Run.py +42 -23
- ins_pricing/cli/bayesopt_entry_runner.py +51 -16
- ins_pricing/cli/utils/bootstrap.py +23 -0
- ins_pricing/cli/utils/cli_common.py +256 -256
- ins_pricing/cli/utils/cli_config.py +379 -360
- ins_pricing/cli/utils/import_resolver.py +375 -358
- ins_pricing/cli/utils/notebook_utils.py +256 -242
- ins_pricing/cli/watchdog_run.py +216 -198
- ins_pricing/frontend/__init__.py +10 -10
- ins_pricing/frontend/app.py +132 -61
- ins_pricing/frontend/config_builder.py +33 -0
- ins_pricing/frontend/example_config.json +11 -0
- ins_pricing/frontend/example_workflows.py +1 -1
- ins_pricing/frontend/runner.py +340 -388
- ins_pricing/governance/__init__.py +20 -20
- ins_pricing/governance/release.py +159 -159
- ins_pricing/modelling/README.md +1 -1
- ins_pricing/modelling/__init__.py +147 -92
- ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
- ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
- ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
- ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
- ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
- ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
- ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
- ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
- ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
- ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
- ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
- ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
- ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
- ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
- ins_pricing/modelling/explain/__init__.py +55 -55
- ins_pricing/modelling/explain/metrics.py +27 -174
- ins_pricing/modelling/explain/permutation.py +237 -237
- ins_pricing/modelling/plotting/__init__.py +40 -36
- ins_pricing/modelling/plotting/compat.py +228 -0
- ins_pricing/modelling/plotting/curves.py +572 -572
- ins_pricing/modelling/plotting/diagnostics.py +163 -163
- ins_pricing/modelling/plotting/geo.py +362 -362
- ins_pricing/modelling/plotting/importance.py +121 -121
- ins_pricing/pricing/__init__.py +27 -27
- ins_pricing/pricing/factors.py +67 -56
- ins_pricing/production/__init__.py +35 -25
- ins_pricing/production/{predict.py → inference.py} +140 -57
- ins_pricing/production/monitoring.py +8 -21
- ins_pricing/reporting/__init__.py +11 -11
- ins_pricing/setup.py +1 -1
- ins_pricing/tests/production/test_inference.py +90 -0
- ins_pricing/utils/__init__.py +112 -78
- ins_pricing/utils/device.py +258 -237
- ins_pricing/utils/features.py +53 -0
- ins_pricing/utils/io.py +72 -0
- ins_pricing/utils/logging.py +34 -1
- ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
- ins_pricing/utils/metrics.py +158 -24
- ins_pricing/utils/numerics.py +76 -0
- ins_pricing/utils/paths.py +9 -1
- ins_pricing/utils/profiling.py +8 -4
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
- ins_pricing-0.5.1.dist-info/RECORD +132 -0
- ins_pricing/modelling/core/BayesOpt.py +0 -146
- ins_pricing/modelling/core/__init__.py +0 -1
- ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
- ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
- ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
- ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
- ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
- ins_pricing/modelling/core/bayesopt/utils.py +0 -105
- ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
- ins_pricing/tests/production/test_predict.py +0 -233
- ins_pricing-0.4.5.dist-info/RECORD +0 -130
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
- {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
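The renames above flatten `modelling/core/bayesopt` into `modelling/bayesopt` and move `production/predict.py` to `production/inference.py`. Below is a minimal compatibility sketch for downstream code, assuming only the package paths changed between 0.4.5 and 0.5.1 and that exported names such as `TrainerBase` are the same in both layouts:

```python
# Hypothetical downstream import shim for the 0.4.x -> 0.5.x package flattening.
# Assumption: TrainerBase is exported unchanged from both module layouts.
try:
    # 0.5.1 layout: ins_pricing/modelling/bayesopt/...
    from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
except ImportError:
    # 0.4.5 layout: ins_pricing/modelling/core/bayesopt/...
    from ins_pricing.modelling.core.bayesopt.trainers.trainer_base import TrainerBase
```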
ins_pricing/modelling/bayesopt/trainers/trainer_ft.py

```diff
@@ -8,20 +8,36 @@ import pandas as pd
 from sklearn.metrics import log_loss
 from sklearn.model_selection import GroupKFold, TimeSeriesSplit
 
-from .trainer_base import TrainerBase
-from
-from
+from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
+from ins_pricing.modelling.bayesopt.models import FTTransformerSklearn
+from ins_pricing.utils.losses import regression_loss
+from ins_pricing.utils import get_logger, log_print
+
+_logger = get_logger("ins_pricing.trainer.ft")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+
+class FTTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        if context.task_type == 'classification':
+            super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
+        else:
+            super().__init__(context, 'FTTransformer', 'FTTransformer')
+        self.model: Optional[FTTransformerSklearn] = None
+        self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
+        self._cv_geo_warned = False
+
+    def _maybe_cleanup_gpu(self, model: Optional[FTTransformerSklearn]) -> None:
+        if not bool(getattr(self.ctx.config, "ft_cleanup_per_fold", False)):
+            return
+        if model is not None:
+            getattr(getattr(model, "ft", None), "to",
+                    lambda *_args, **_kwargs: None)("cpu")
+        synchronize = bool(getattr(self.ctx.config, "ft_cleanup_synchronize", False))
+        self._clean_gpu(synchronize=synchronize)
 
     def _resolve_numeric_tokens(self) -> int:
         requested = getattr(self.ctx.config, "ft_num_numeric_tokens", None)
```
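The new imports appear to replace direct print-style calls with a module logger: `get_logger` names the logger and `log_print` forwards print-style arguments to it, so call sites only swap `print(` for `_log(`. A rough standalone sketch of that pattern follows; the real helpers live in `ins_pricing.utils` and their exact behaviour is an assumption here:

```python
# Standalone sketch of the print-to-logger shim adopted by the 0.5.1 trainers.
# Assumption: this mirrors, but is not, the ins_pricing.utils implementation.
import logging


def get_logger(name: str) -> logging.Logger:
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger


def log_print(logger: logging.Logger, *args, **kwargs) -> None:
    # Accept print()-style arguments (including flush=True) and emit one record.
    kwargs.pop("flush", None)
    logger.info(" ".join(str(arg) for arg in args))


_logger = get_logger("ins_pricing.trainer.ft")
log_print(_logger, "[FTTrainer] example message", flush=True)
```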
```diff
@@ -121,7 +137,7 @@ class FTTrainer(TrainerBase):
         if built is not None:
             geo_train, geo_val, _, _ = built
         elif not self._cv_geo_warned:
-
+            _log(
                 "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
                 flush=True,
             )
@@ -168,22 +184,20 @@ class FTTrainer(TrainerBase):
         )
         model = self._apply_dataloader_overrides(model)
         model.set_params(model_params)
-        try:
-            return float(model.fit_unsupervised(
-                X_train,
-                X_val=X_val,
-                trial=trial,
-                geo_train=geo_train,
-                geo_val=geo_val,
-                mask_prob_num=mask_prob_num,
-                mask_prob_cat=mask_prob_cat,
-                num_loss_weight=num_loss_weight,
-                cat_loss_weight=cat_loss_weight
-            ))
-        finally:
-
-                    lambda *_args, **_kwargs: None)("cpu")
-            self._clean_gpu()
+        try:
+            return float(model.fit_unsupervised(
+                X_train,
+                X_val=X_val,
+                trial=trial,
+                geo_train=geo_train,
+                geo_val=geo_val,
+                mask_prob_num=mask_prob_num,
+                mask_prob_cat=mask_prob_cat,
+                num_loss_weight=num_loss_weight,
+                cat_loss_weight=cat_loss_weight
+            ))
+        finally:
+            self._maybe_cleanup_gpu(model)
 
     def cross_val(self, trial: optuna.trial.Trial) -> float:
         # FT-Transformer CV also focuses on memory control:
@@ -229,7 +243,7 @@ class FTTrainer(TrainerBase):
                 token_count += 1
             approx_units = d_model * n_layers * max(1, token_count)
             if approx_units > 12_000_000:
-
+                _log(
                     f"[FTTrainer] Trial pruned early: d_model={d_model}, n_layers={n_layers} -> approx_units={approx_units}")
                 raise optuna.TrialPruned(
                     "config exceeds safe memory budget; prune before training")
@@ -285,7 +299,7 @@ class FTTrainer(TrainerBase):
         if built is not None:
             geo_train, geo_val, _, _ = built
         elif not self._cv_geo_warned:
-
+            _log(
                 "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
                 flush=True,
             )
@@ -338,7 +352,7 @@ class FTTrainer(TrainerBase):
             requested_heads=resolved_params.get("n_heads")
         )
         if heads_adjusted:
-
+            _log(f"[FTTrainer] Auto-adjusted n_heads from "
                  f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                  f"(d_model={d_model_value}).")
             resolved_params["n_heads"] = adaptive_heads
@@ -378,13 +392,11 @@ class FTTrainer(TrainerBase):
             geo_train=geo_train,
             geo_val=geo_val,
         )
-        refit_epochs = self._resolve_best_epoch(
-            getattr(tmp_model, "training_history", None),
-            default_epochs=int(self.ctx.epochs),
-        )
-
-                lambda *_args, **_kwargs: None)("cpu")
-        self._clean_gpu()
+        refit_epochs = self._resolve_best_epoch(
+            getattr(tmp_model, "training_history", None),
+            default_epochs=int(self.ctx.epochs),
+        )
+        self._maybe_cleanup_gpu(tmp_model)
 
         self.model = FTTransformerSklearn(
             model_nme=self.ctx.model_nme,
@@ -451,7 +463,7 @@ class FTTrainer(TrainerBase):
 
         split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
         if split_iter is None:
-
+            _log(
                 f"[FT Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
                 flush=True,
             )
@@ -494,15 +506,13 @@ class FTTrainer(TrainerBase):
 
             pred_train = model.predict(X_all, geo_tokens=geo_train_full)
             pred_test = model.predict(X_test, geo_tokens=geo_test_full)
-            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
-            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
-
-
-            self._clean_gpu()
-            split_count += 1
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            self._maybe_cleanup_gpu(model)
+            split_count += 1
 
         if split_count < 1:
-
+            _log(
                 f"[FT Ensemble] no CV splits generated; skip ensemble.",
                 flush=True,
             )
@@ -591,7 +601,7 @@ class FTTrainer(TrainerBase):
                 requested_heads=resolved_params.get("n_heads"),
             )
             if heads_adjusted:
-
+                _log(
                     f"[FTTrainer] Auto-adjusted n_heads from "
                     f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                     f"(d_model={resolved_params.get('d_model', model.d_model)})."
@@ -652,11 +662,9 @@ class FTTrainer(TrainerBase):
             if preds_train is None:
                 preds_train = np.empty(
                     (len(X_all),) + fold_pred.shape[1:], dtype=fold_pred.dtype)
-            preds_train[val_idx] = fold_pred
-
-
-                    lambda *_a, **_k: None)("cpu")
-            self._clean_gpu()
+            preds_train[val_idx] = fold_pred
+
+            self._maybe_cleanup_gpu(model)
 
         if preds_train is None:
             return None
@@ -773,7 +781,7 @@ class FTTrainer(TrainerBase):
             requested_heads=resolved_params.get("n_heads")
         )
         if heads_adjusted:
-
+            _log(f"[FTTrainer] Auto-adjusted n_heads from "
                  f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                  f"(d_model={resolved_params.get('d_model', self.model.d_model)}).")
             resolved_params["n_heads"] = adaptive_heads
```
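Across these hunks the unconditional per-fold `.to("cpu")` + `self._clean_gpu()` calls are consolidated into `_maybe_cleanup_gpu`, gated by the `ft_cleanup_per_fold` and `ft_cleanup_synchronize` config flags. Below is a simplified standalone sketch of that gating, assuming a torch-backed cleanup; the actual `_clean_gpu` is defined on `TrainerBase` and is not part of this diff:

```python
# Simplified sketch of the config-gated, per-fold GPU cleanup added in 0.5.1.
# Assumption: _clean_gpu() ultimately empties the CUDA allocator cache as shown.
import gc
from typing import Any, Optional

import torch


def maybe_cleanup_gpu(config: Any, model: Optional[torch.nn.Module]) -> None:
    # Per-fold teardown is opt-in; by default nothing happens between folds.
    if not bool(getattr(config, "ft_cleanup_per_fold", False)):
        return
    if model is not None:
        model.to("cpu")  # release GPU weights before clearing the allocator cache
    gc.collect()
    if torch.cuda.is_available():
        if bool(getattr(config, "ft_cleanup_synchronize", False)):
            torch.cuda.synchronize()  # optional: wait for in-flight kernels first
        torch.cuda.empty_cache()
```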
ins_pricing/modelling/bayesopt/trainers/trainer_glm.py

```diff
@@ -1,198 +1,203 @@
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import optuna
-import pandas as pd
-import statsmodels.api as sm
-from sklearn.metrics import log_loss
-
-from .trainer_base import TrainerBase
-from
-from
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import optuna
+import pandas as pd
+import statsmodels.api as sm
+from sklearn.metrics import log_loss
+
+from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
+from ins_pricing.utils import EPS, get_logger, log_print
+from ins_pricing.utils.losses import regression_loss
+
+_logger = get_logger("ins_pricing.trainer.glm")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+class GLMTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        super().__init__(context, 'GLM', 'GLM')
+        self.model = None
+
+    def _select_family(self, tweedie_power: Optional[float] = None):
+        if self.ctx.task_type == 'classification':
+            return sm.families.Binomial()
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if loss_name == "poisson":
+            return sm.families.Poisson()
+        if loss_name == "gamma":
+            return sm.families.Gamma()
+        if loss_name in {"mse", "mae"}:
+            return sm.families.Gaussian()
+        power = tweedie_power if tweedie_power is not None else 1.5
+        return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
+
+    def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
+        # Add intercept to the statsmodels design matrix.
+        X = data[self.ctx.var_nmes]
+        return sm.add_constant(X, has_constant='add')
+
+    def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
+        if isinstance(family, sm.families.Poisson):
+            return 1.0
+        if isinstance(family, sm.families.Gamma):
+            return 2.0
+        if isinstance(family, sm.families.Tweedie):
+            return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
+        return 1.5
+
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        param_space = {
+            "alpha": lambda t: t.suggest_float('alpha', 1e-6, 1e2, log=True),
+            "l1_ratio": lambda t: t.suggest_float('l1_ratio', 0.0, 1.0)
+        }
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if self.ctx.task_type == 'regression' and loss_name == 'tweedie':
+            param_space["tweedie_power"] = lambda t: t.suggest_float(
+                'tweedie_power', 1.0, 2.0)
+
+        def data_provider():
+            data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
+            assert data is not None, "Preprocessed training data is missing."
+            return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
+
+        def preprocess_fn(X_train, X_val):
+            X_train_s, X_val_s, _ = self._standardize_fold(
+                X_train, X_val, self.ctx.num_features)
+            return self._prepare_design(X_train_s), self._prepare_design(X_val_s)
+
+        metric_ctx: Dict[str, Any] = {}
+
+        def model_builder(params):
+            family = self._select_family(params.get("tweedie_power"))
+            metric_ctx["family"] = family
+            metric_ctx["tweedie_power"] = params.get("tweedie_power")
+            return {
+                "family": family,
+                "alpha": params["alpha"],
+                "l1_ratio": params["l1_ratio"],
+                "tweedie_power": params.get("tweedie_power")
+            }
+
+        def fit_predict(model_cfg, X_train, y_train, w_train, X_val, y_val, w_val, _trial):
+            glm = sm.GLM(y_train, X_train,
+                         family=model_cfg["family"],
+                         freq_weights=w_train)
+            result = glm.fit_regularized(
+                alpha=model_cfg["alpha"],
+                L1_wt=model_cfg["l1_ratio"],
+                maxiter=200
+            )
+            return result.predict(X_val)
+
+        def metric_fn(y_true, y_pred, weight):
+            if self.ctx.task_type == 'classification':
+                y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
+                return log_loss(y_true, y_pred_clipped, sample_weight=weight)
+            return regression_loss(
+                y_true,
+                y_pred,
+                weight,
+                loss_name=loss_name,
+                tweedie_power=metric_ctx.get("tweedie_power"),
+            )
+
+        return self.cross_val_generic(
+            trial=trial,
+            hyperparameter_space=param_space,
+            data_provider=data_provider,
+            model_builder=model_builder,
+            metric_fn=metric_fn,
+            preprocess_fn=preprocess_fn,
+            fit_predict_fn=fit_predict
+        )
+
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
+        tweedie_power = self.best_params.get('tweedie_power')
+        family = self._select_family(tweedie_power)
+
+        X_train = self._prepare_design(self.ctx.train_oht_scl_data)
+        y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+        w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+        glm = sm.GLM(y_train, X_train, family=family,
+                     freq_weights=w_train)
+        self.model = glm.fit_regularized(
+            alpha=self.best_params['alpha'],
+            L1_wt=self.best_params['l1_ratio'],
+            maxiter=300
+        )
+
+        self.ctx.glm_best = self.model
+        self.ctx.model_label += [self.label]
+        self._predict_and_cache(
+            self.model,
+            'glm',
+            design_fn=lambda train: self._prepare_design(
+                self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
+            )
+        )
+
+    def ensemble_predict(self, k: int) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
+        k = max(2, int(k))
+        data = self.ctx.train_oht_scl_data
+        if data is None:
+            raise RuntimeError("Missing standardized data for GLM ensemble.")
+        X_all = data[self.ctx.var_nmes]
+        y_all = data[self.ctx.resp_nme]
+        w_all = data[self.ctx.weight_nme]
+        X_test = self.ctx.test_oht_scl_data
+        if X_test is None:
+            raise RuntimeError("Missing standardized test data for GLM ensemble.")
+
+        n_samples = len(X_all)
+        X_all_design = self._prepare_design(data)
+        X_test_design = self._prepare_design(X_test)
+        tweedie_power = self.best_params.get('tweedie_power')
+        family = self._select_family(tweedie_power)
+
+        split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
+        if split_iter is None:
+            _log(
+                f"[GLM Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train_sum = np.zeros(n_samples, dtype=np.float64)
+        preds_test_sum = np.zeros(len(X_test_design), dtype=np.float64)
+
+        split_count = 0
+        for train_idx, _val_idx in split_iter:
+            X_train = X_all_design.iloc[train_idx]
+            y_train = y_all.iloc[train_idx]
+            w_train = w_all.iloc[train_idx]
+
+            glm = sm.GLM(y_train, X_train, family=family, freq_weights=w_train)
+            result = glm.fit_regularized(
+                alpha=self.best_params['alpha'],
+                L1_wt=self.best_params['l1_ratio'],
+                maxiter=300
+            )
+            pred_train = result.predict(X_all_design)
+            pred_test = result.predict(X_test_design)
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            split_count += 1
+
+        if split_count < 1:
+            _log(
+                f"[GLM Ensemble] no CV splits generated; skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train = preds_train_sum / float(split_count)
+        preds_test = preds_test_sum / float(split_count)
+        self._cache_predictions("glm", preds_train, preds_test)
```
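For orientation, the per-fold fit in the new `GLMTrainer` reduces to a statsmodels elastic-net GLM with frequency weights. Here is a toy, self-contained sketch of that call pattern; the data, column names, and hyperparameter values below are invented for illustration:

```python
# Toy illustration of the sm.GLM(...).fit_regularized(...) pattern used by
# GLMTrainer; feature names and numbers are made up.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 500
X = pd.DataFrame({
    "driver_age": rng.normal(40.0, 10.0, n),
    "veh_value": rng.gamma(2.0, 1.0, n),
})
weight = pd.Series(rng.uniform(0.5, 1.0, n), name="exposure")
y = pd.Series(rng.gamma(2.0, 50.0, n), name="pure_premium")  # strictly positive target

design = sm.add_constant(X, has_constant="add")  # same intercept handling as _prepare_design
family = sm.families.Tweedie(var_power=1.5, link=sm.families.links.log())
glm = sm.GLM(y, design, family=family, freq_weights=weight)
result = glm.fit_regularized(alpha=1e-3, L1_wt=0.5, maxiter=300)
print(result.predict(design)[:5])
```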