ins-pricing 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. ins_pricing/README.md +48 -22
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +58 -46
  4. ins_pricing/cli/BayesOpt_incremental.py +77 -110
  5. ins_pricing/cli/Explain_Run.py +42 -23
  6. ins_pricing/cli/Explain_entry.py +551 -577
  7. ins_pricing/cli/Pricing_Run.py +42 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +51 -16
  9. ins_pricing/cli/utils/bootstrap.py +23 -0
  10. ins_pricing/cli/utils/cli_common.py +256 -256
  11. ins_pricing/cli/utils/cli_config.py +379 -360
  12. ins_pricing/cli/utils/import_resolver.py +375 -358
  13. ins_pricing/cli/utils/notebook_utils.py +256 -242
  14. ins_pricing/cli/watchdog_run.py +216 -198
  15. ins_pricing/frontend/__init__.py +10 -10
  16. ins_pricing/frontend/app.py +132 -61
  17. ins_pricing/frontend/config_builder.py +33 -0
  18. ins_pricing/frontend/example_config.json +11 -0
  19. ins_pricing/frontend/example_workflows.py +1 -1
  20. ins_pricing/frontend/runner.py +340 -388
  21. ins_pricing/governance/__init__.py +20 -20
  22. ins_pricing/governance/release.py +159 -159
  23. ins_pricing/modelling/README.md +1 -1
  24. ins_pricing/modelling/__init__.py +147 -92
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/README.md +31 -13
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +12 -0
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +589 -552
  29. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +987 -958
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +488 -548
  32. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +349 -342
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +921 -913
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +794 -785
  36. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +454 -446
  37. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  38. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1294 -1282
  39. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +64 -56
  40. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +203 -198
  41. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +333 -325
  42. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +279 -267
  43. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +515 -313
  44. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  45. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  46. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +193 -186
  47. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  48. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  49. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  50. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +636 -623
  51. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  52. ins_pricing/modelling/explain/__init__.py +55 -55
  53. ins_pricing/modelling/explain/metrics.py +27 -174
  54. ins_pricing/modelling/explain/permutation.py +237 -237
  55. ins_pricing/modelling/plotting/__init__.py +40 -36
  56. ins_pricing/modelling/plotting/compat.py +228 -0
  57. ins_pricing/modelling/plotting/curves.py +572 -572
  58. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  59. ins_pricing/modelling/plotting/geo.py +362 -362
  60. ins_pricing/modelling/plotting/importance.py +121 -121
  61. ins_pricing/pricing/__init__.py +27 -27
  62. ins_pricing/pricing/factors.py +67 -56
  63. ins_pricing/production/__init__.py +35 -25
  64. ins_pricing/production/{predict.py → inference.py} +140 -57
  65. ins_pricing/production/monitoring.py +8 -21
  66. ins_pricing/reporting/__init__.py +11 -11
  67. ins_pricing/setup.py +1 -1
  68. ins_pricing/tests/production/test_inference.py +90 -0
  69. ins_pricing/utils/__init__.py +112 -78
  70. ins_pricing/utils/device.py +258 -237
  71. ins_pricing/utils/features.py +53 -0
  72. ins_pricing/utils/io.py +72 -0
  73. ins_pricing/utils/logging.py +34 -1
  74. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  75. ins_pricing/utils/metrics.py +158 -24
  76. ins_pricing/utils/numerics.py +76 -0
  77. ins_pricing/utils/paths.py +9 -1
  78. ins_pricing/utils/profiling.py +8 -4
  79. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/METADATA +1 -1
  80. ins_pricing-0.5.1.dist-info/RECORD +132 -0
  81. ins_pricing/modelling/core/BayesOpt.py +0 -146
  82. ins_pricing/modelling/core/__init__.py +0 -1
  83. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  84. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  85. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  86. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  87. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  88. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  89. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  90. ins_pricing/tests/production/test_predict.py +0 -233
  91. ins_pricing-0.4.5.dist-info/RECORD +0 -130
  92. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/WHEEL +0 -0
  93. {ins_pricing-0.4.5.dist-info → ins_pricing-0.5.1.dist-info}/top_level.txt +0 -0
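The largest structural change in 0.5.x is the flattening of ins_pricing.modelling.core.bayesopt into ins_pricing.modelling.bayesopt and the rename of production/predict.py to production/inference.py. A minimal compatibility-import sketch follows, assuming only the module paths visible in the file list above; the aliases and the try/except shape are illustrative, and the names re-exported by these modules are not confirmed by this diff.

# Hypothetical shim for downstream code that must run against both layouts.
# Module paths are taken from the file list above; everything else is an assumption.
try:
    # 0.5.x layout: bayesopt moved up one level, predict.py renamed to inference.py.
    from ins_pricing.modelling.bayesopt import core as bayesopt_core
    from ins_pricing.production import inference as production_api
except ImportError:
    # 0.4.x layout.
    from ins_pricing.modelling.core.bayesopt import core as bayesopt_core
    from ins_pricing.production import predict as production_api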
ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py
@@ -8,20 +8,36 @@ import pandas as pd
 from sklearn.metrics import log_loss
 from sklearn.model_selection import GroupKFold, TimeSeriesSplit
 
-from .trainer_base import TrainerBase
-from ..models import FTTransformerSklearn
-from ..utils.losses import regression_loss
-
-
-class FTTrainer(TrainerBase):
-    def __init__(self, context: "BayesOptModel") -> None:
-        if context.task_type == 'classification':
-            super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
-        else:
-            super().__init__(context, 'FTTransformer', 'FTTransformer')
-        self.model: Optional[FTTransformerSklearn] = None
-        self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
-        self._cv_geo_warned = False
+from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
+from ins_pricing.modelling.bayesopt.models import FTTransformerSklearn
+from ins_pricing.utils.losses import regression_loss
+from ins_pricing.utils import get_logger, log_print
+
+_logger = get_logger("ins_pricing.trainer.ft")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+
+class FTTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        if context.task_type == 'classification':
+            super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
+        else:
+            super().__init__(context, 'FTTransformer', 'FTTransformer')
+        self.model: Optional[FTTransformerSklearn] = None
+        self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
+        self._cv_geo_warned = False
+
+    def _maybe_cleanup_gpu(self, model: Optional[FTTransformerSklearn]) -> None:
+        if not bool(getattr(self.ctx.config, "ft_cleanup_per_fold", False)):
+            return
+        if model is not None:
+            getattr(getattr(model, "ft", None), "to",
+                    lambda *_args, **_kwargs: None)("cpu")
+        synchronize = bool(getattr(self.ctx.config, "ft_cleanup_synchronize", False))
+        self._clean_gpu(synchronize=synchronize)
 
     def _resolve_numeric_tokens(self) -> int:
         requested = getattr(self.ctx.config, "ft_num_numeric_tokens", None)
@@ -121,7 +137,7 @@ class FTTrainer(TrainerBase):
         if built is not None:
             geo_train, geo_val, _, _ = built
         elif not self._cv_geo_warned:
-            print(
+            _log(
                 "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
                 flush=True,
             )
@@ -168,22 +184,20 @@ class FTTrainer(TrainerBase):
         )
         model = self._apply_dataloader_overrides(model)
         model.set_params(model_params)
-        try:
-            return float(model.fit_unsupervised(
-                X_train,
-                X_val=X_val,
-                trial=trial,
-                geo_train=geo_train,
-                geo_val=geo_val,
-                mask_prob_num=mask_prob_num,
-                mask_prob_cat=mask_prob_cat,
-                num_loss_weight=num_loss_weight,
-                cat_loss_weight=cat_loss_weight
-            ))
-        finally:
-            getattr(getattr(model, "ft", None), "to",
-                    lambda *_args, **_kwargs: None)("cpu")
-            self._clean_gpu()
+        try:
+            return float(model.fit_unsupervised(
+                X_train,
+                X_val=X_val,
+                trial=trial,
+                geo_train=geo_train,
+                geo_val=geo_val,
+                mask_prob_num=mask_prob_num,
+                mask_prob_cat=mask_prob_cat,
+                num_loss_weight=num_loss_weight,
+                cat_loss_weight=cat_loss_weight
+            ))
+        finally:
+            self._maybe_cleanup_gpu(model)
 
     def cross_val(self, trial: optuna.trial.Trial) -> float:
         # FT-Transformer CV also focuses on memory control:
@@ -229,7 +243,7 @@
             token_count += 1
         approx_units = d_model * n_layers * max(1, token_count)
         if approx_units > 12_000_000:
-            print(
+            _log(
                 f"[FTTrainer] Trial pruned early: d_model={d_model}, n_layers={n_layers} -> approx_units={approx_units}")
             raise optuna.TrialPruned(
                 "config exceeds safe memory budget; prune before training")
@@ -285,7 +299,7 @@
         if built is not None:
             geo_train, geo_val, _, _ = built
         elif not self._cv_geo_warned:
-            print(
+            _log(
                 "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
                 flush=True,
             )
@@ -338,7 +352,7 @@
             requested_heads=resolved_params.get("n_heads")
         )
         if heads_adjusted:
-            print(f"[FTTrainer] Auto-adjusted n_heads from "
+            _log(f"[FTTrainer] Auto-adjusted n_heads from "
                   f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                   f"(d_model={d_model_value}).")
             resolved_params["n_heads"] = adaptive_heads
@@ -378,13 +392,11 @@
             geo_train=geo_train,
             geo_val=geo_val,
         )
-        refit_epochs = self._resolve_best_epoch(
-            getattr(tmp_model, "training_history", None),
-            default_epochs=int(self.ctx.epochs),
-        )
-        getattr(getattr(tmp_model, "ft", None), "to",
-                lambda *_args, **_kwargs: None)("cpu")
-        self._clean_gpu()
+        refit_epochs = self._resolve_best_epoch(
+            getattr(tmp_model, "training_history", None),
+            default_epochs=int(self.ctx.epochs),
+        )
+        self._maybe_cleanup_gpu(tmp_model)
 
         self.model = FTTransformerSklearn(
             model_nme=self.ctx.model_nme,
@@ -451,7 +463,7 @@
 
         split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
         if split_iter is None:
-            print(
+            _log(
                 f"[FT Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
                 flush=True,
             )
@@ -494,15 +506,13 @@
 
             pred_train = model.predict(X_all, geo_tokens=geo_train_full)
             pred_test = model.predict(X_test, geo_tokens=geo_test_full)
-            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
-            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
-            getattr(getattr(model, "ft", None), "to",
-                    lambda *_args, **_kwargs: None)("cpu")
-            self._clean_gpu()
-            split_count += 1
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            self._maybe_cleanup_gpu(model)
+            split_count += 1
 
         if split_count < 1:
-            print(
+            _log(
                 f"[FT Ensemble] no CV splits generated; skip ensemble.",
                 flush=True,
             )
@@ -591,7 +601,7 @@
             requested_heads=resolved_params.get("n_heads"),
         )
         if heads_adjusted:
-            print(
+            _log(
                 f"[FTTrainer] Auto-adjusted n_heads from "
                 f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                 f"(d_model={resolved_params.get('d_model', model.d_model)})."
@@ -652,11 +662,9 @@
             if preds_train is None:
                 preds_train = np.empty(
                     (len(X_all),) + fold_pred.shape[1:], dtype=fold_pred.dtype)
-            preds_train[val_idx] = fold_pred
-
-            getattr(getattr(model, "ft", None), "to",
-                    lambda *_a, **_k: None)("cpu")
-            self._clean_gpu()
+            preds_train[val_idx] = fold_pred
+
+            self._maybe_cleanup_gpu(model)
 
         if preds_train is None:
             return None
@@ -773,7 +781,7 @@
             requested_heads=resolved_params.get("n_heads")
        )
         if heads_adjusted:
-            print(f"[FTTrainer] Auto-adjusted n_heads from "
+            _log(f"[FTTrainer] Auto-adjusted n_heads from "
                   f"{resolved_params.get('n_heads')} to {adaptive_heads} "
                   f"(d_model={resolved_params.get('d_model', self.model.d_model)}).")
             resolved_params["n_heads"] = adaptive_heads
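In 0.4.5 the FT trainer always moved model.ft back to the CPU and called self._clean_gpu() after every fold; in 0.5.1 that cleanup runs only through the new _maybe_cleanup_gpu helper, gated by two config flags read with getattr. A minimal sketch of those flags follows; the flag names come from the hunks above, while the config class here is a hypothetical stand-in for whatever BayesOptModel.config actually is.

# Sketch only: ExampleConfig stands in for the real config object.
class ExampleConfig:
    ft_cleanup_per_fold = True       # opt in to per-fold CPU offload and GPU cache cleanup
    ft_cleanup_synchronize = False   # leave device synchronization off during cleanup

cfg = ExampleConfig()
print(bool(getattr(cfg, "ft_cleanup_per_fold", False)))      # True  -> cleanup runs each fold
print(bool(getattr(cfg, "ft_cleanup_synchronize", False)))   # False -> _clean_gpu() without sync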
ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py
@@ -1,198 +1,203 @@
-from __future__ import annotations
-
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import optuna
-import pandas as pd
-import statsmodels.api as sm
-from sklearn.metrics import log_loss
-
-from .trainer_base import TrainerBase
-from ..utils import EPS
-from ..utils.losses import regression_loss
-
-class GLMTrainer(TrainerBase):
-    def __init__(self, context: "BayesOptModel") -> None:
-        super().__init__(context, 'GLM', 'GLM')
-        self.model = None
-
-    def _select_family(self, tweedie_power: Optional[float] = None):
-        if self.ctx.task_type == 'classification':
-            return sm.families.Binomial()
-        loss_name = getattr(self.ctx, "loss_name", "tweedie")
-        if loss_name == "poisson":
-            return sm.families.Poisson()
-        if loss_name == "gamma":
-            return sm.families.Gamma()
-        if loss_name in {"mse", "mae"}:
-            return sm.families.Gaussian()
-        power = tweedie_power if tweedie_power is not None else 1.5
-        return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
-
-    def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
-        # Add intercept to the statsmodels design matrix.
-        X = data[self.ctx.var_nmes]
-        return sm.add_constant(X, has_constant='add')
-
-    def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
-        if isinstance(family, sm.families.Poisson):
-            return 1.0
-        if isinstance(family, sm.families.Gamma):
-            return 2.0
-        if isinstance(family, sm.families.Tweedie):
-            return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
-        return 1.5
-
-    def cross_val(self, trial: optuna.trial.Trial) -> float:
-        param_space = {
-            "alpha": lambda t: t.suggest_float('alpha', 1e-6, 1e2, log=True),
-            "l1_ratio": lambda t: t.suggest_float('l1_ratio', 0.0, 1.0)
-        }
-        loss_name = getattr(self.ctx, "loss_name", "tweedie")
-        if self.ctx.task_type == 'regression' and loss_name == 'tweedie':
-            param_space["tweedie_power"] = lambda t: t.suggest_float(
-                'tweedie_power', 1.0, 2.0)
-
-        def data_provider():
-            data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
-            assert data is not None, "Preprocessed training data is missing."
-            return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
-
-        def preprocess_fn(X_train, X_val):
-            X_train_s, X_val_s, _ = self._standardize_fold(
-                X_train, X_val, self.ctx.num_features)
-            return self._prepare_design(X_train_s), self._prepare_design(X_val_s)
-
-        metric_ctx: Dict[str, Any] = {}
-
-        def model_builder(params):
-            family = self._select_family(params.get("tweedie_power"))
-            metric_ctx["family"] = family
-            metric_ctx["tweedie_power"] = params.get("tweedie_power")
-            return {
-                "family": family,
-                "alpha": params["alpha"],
-                "l1_ratio": params["l1_ratio"],
-                "tweedie_power": params.get("tweedie_power")
-            }
-
-        def fit_predict(model_cfg, X_train, y_train, w_train, X_val, y_val, w_val, _trial):
-            glm = sm.GLM(y_train, X_train,
-                         family=model_cfg["family"],
-                         freq_weights=w_train)
-            result = glm.fit_regularized(
-                alpha=model_cfg["alpha"],
-                L1_wt=model_cfg["l1_ratio"],
-                maxiter=200
-            )
-            return result.predict(X_val)
-
-        def metric_fn(y_true, y_pred, weight):
-            if self.ctx.task_type == 'classification':
-                y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
-                return log_loss(y_true, y_pred_clipped, sample_weight=weight)
-            return regression_loss(
-                y_true,
-                y_pred,
-                weight,
-                loss_name=loss_name,
-                tweedie_power=metric_ctx.get("tweedie_power"),
-            )
-
-        return self.cross_val_generic(
-            trial=trial,
-            hyperparameter_space=param_space,
-            data_provider=data_provider,
-            model_builder=model_builder,
-            metric_fn=metric_fn,
-            preprocess_fn=preprocess_fn,
-            fit_predict_fn=fit_predict
-        )
-
-    def train(self) -> None:
-        if not self.best_params:
-            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
-        tweedie_power = self.best_params.get('tweedie_power')
-        family = self._select_family(tweedie_power)
-
-        X_train = self._prepare_design(self.ctx.train_oht_scl_data)
-        y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
-        w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
-
-        glm = sm.GLM(y_train, X_train, family=family,
-                     freq_weights=w_train)
-        self.model = glm.fit_regularized(
-            alpha=self.best_params['alpha'],
-            L1_wt=self.best_params['l1_ratio'],
-            maxiter=300
-        )
-
-        self.ctx.glm_best = self.model
-        self.ctx.model_label += [self.label]
-        self._predict_and_cache(
-            self.model,
-            'glm',
-            design_fn=lambda train: self._prepare_design(
-                self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
-            )
-        )
-
-    def ensemble_predict(self, k: int) -> None:
-        if not self.best_params:
-            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
-        k = max(2, int(k))
-        data = self.ctx.train_oht_scl_data
-        if data is None:
-            raise RuntimeError("Missing standardized data for GLM ensemble.")
-        X_all = data[self.ctx.var_nmes]
-        y_all = data[self.ctx.resp_nme]
-        w_all = data[self.ctx.weight_nme]
-        X_test = self.ctx.test_oht_scl_data
-        if X_test is None:
-            raise RuntimeError("Missing standardized test data for GLM ensemble.")
-
-        n_samples = len(X_all)
-        X_all_design = self._prepare_design(data)
-        X_test_design = self._prepare_design(X_test)
-        tweedie_power = self.best_params.get('tweedie_power')
-        family = self._select_family(tweedie_power)
-
-        split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
-        if split_iter is None:
-            print(
-                f"[GLM Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
-                flush=True,
-            )
-            return
-        preds_train_sum = np.zeros(n_samples, dtype=np.float64)
-        preds_test_sum = np.zeros(len(X_test_design), dtype=np.float64)
-
-        split_count = 0
-        for train_idx, _val_idx in split_iter:
-            X_train = X_all_design.iloc[train_idx]
-            y_train = y_all.iloc[train_idx]
-            w_train = w_all.iloc[train_idx]
-
-            glm = sm.GLM(y_train, X_train, family=family, freq_weights=w_train)
-            result = glm.fit_regularized(
-                alpha=self.best_params['alpha'],
-                L1_wt=self.best_params['l1_ratio'],
-                maxiter=300
-            )
-            pred_train = result.predict(X_all_design)
-            pred_test = result.predict(X_test_design)
-            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
-            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
-            split_count += 1
-
-        if split_count < 1:
-            print(
-                f"[GLM Ensemble] no CV splits generated; skip ensemble.",
-                flush=True,
-            )
-            return
-        preds_train = preds_train_sum / float(split_count)
-        preds_test = preds_test_sum / float(split_count)
-        self._cache_predictions("glm", preds_train, preds_test)
-
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import optuna
+import pandas as pd
+import statsmodels.api as sm
+from sklearn.metrics import log_loss
+
+from ins_pricing.modelling.bayesopt.trainers.trainer_base import TrainerBase
+from ins_pricing.utils import EPS, get_logger, log_print
+from ins_pricing.utils.losses import regression_loss
+
+_logger = get_logger("ins_pricing.trainer.glm")
+
+
+def _log(*args, **kwargs) -> None:
+    log_print(_logger, *args, **kwargs)
+
+class GLMTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        super().__init__(context, 'GLM', 'GLM')
+        self.model = None
+
+    def _select_family(self, tweedie_power: Optional[float] = None):
+        if self.ctx.task_type == 'classification':
+            return sm.families.Binomial()
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if loss_name == "poisson":
+            return sm.families.Poisson()
+        if loss_name == "gamma":
+            return sm.families.Gamma()
+        if loss_name in {"mse", "mae"}:
+            return sm.families.Gaussian()
+        power = tweedie_power if tweedie_power is not None else 1.5
+        return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
+
+    def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
+        # Add intercept to the statsmodels design matrix.
+        X = data[self.ctx.var_nmes]
+        return sm.add_constant(X, has_constant='add')
+
+    def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
+        if isinstance(family, sm.families.Poisson):
+            return 1.0
+        if isinstance(family, sm.families.Gamma):
+            return 2.0
+        if isinstance(family, sm.families.Tweedie):
+            return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
+        return 1.5
+
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        param_space = {
+            "alpha": lambda t: t.suggest_float('alpha', 1e-6, 1e2, log=True),
+            "l1_ratio": lambda t: t.suggest_float('l1_ratio', 0.0, 1.0)
+        }
+        loss_name = getattr(self.ctx, "loss_name", "tweedie")
+        if self.ctx.task_type == 'regression' and loss_name == 'tweedie':
+            param_space["tweedie_power"] = lambda t: t.suggest_float(
+                'tweedie_power', 1.0, 2.0)
+
+        def data_provider():
+            data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
+            assert data is not None, "Preprocessed training data is missing."
+            return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
+
+        def preprocess_fn(X_train, X_val):
+            X_train_s, X_val_s, _ = self._standardize_fold(
+                X_train, X_val, self.ctx.num_features)
+            return self._prepare_design(X_train_s), self._prepare_design(X_val_s)
+
+        metric_ctx: Dict[str, Any] = {}
+
+        def model_builder(params):
+            family = self._select_family(params.get("tweedie_power"))
+            metric_ctx["family"] = family
+            metric_ctx["tweedie_power"] = params.get("tweedie_power")
+            return {
+                "family": family,
+                "alpha": params["alpha"],
+                "l1_ratio": params["l1_ratio"],
+                "tweedie_power": params.get("tweedie_power")
+            }
+
+        def fit_predict(model_cfg, X_train, y_train, w_train, X_val, y_val, w_val, _trial):
+            glm = sm.GLM(y_train, X_train,
+                         family=model_cfg["family"],
+                         freq_weights=w_train)
+            result = glm.fit_regularized(
+                alpha=model_cfg["alpha"],
+                L1_wt=model_cfg["l1_ratio"],
+                maxiter=200
+            )
+            return result.predict(X_val)
+
+        def metric_fn(y_true, y_pred, weight):
+            if self.ctx.task_type == 'classification':
+                y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
+                return log_loss(y_true, y_pred_clipped, sample_weight=weight)
+            return regression_loss(
+                y_true,
+                y_pred,
+                weight,
+                loss_name=loss_name,
+                tweedie_power=metric_ctx.get("tweedie_power"),
+            )
+
+        return self.cross_val_generic(
+            trial=trial,
+            hyperparameter_space=param_space,
+            data_provider=data_provider,
+            model_builder=model_builder,
+            metric_fn=metric_fn,
+            preprocess_fn=preprocess_fn,
+            fit_predict_fn=fit_predict
+        )
+
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
+        tweedie_power = self.best_params.get('tweedie_power')
+        family = self._select_family(tweedie_power)
+
+        X_train = self._prepare_design(self.ctx.train_oht_scl_data)
+        y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
+        w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
+
+        glm = sm.GLM(y_train, X_train, family=family,
+                     freq_weights=w_train)
+        self.model = glm.fit_regularized(
+            alpha=self.best_params['alpha'],
+            L1_wt=self.best_params['l1_ratio'],
+            maxiter=300
+        )
+
+        self.ctx.glm_best = self.model
+        self.ctx.model_label += [self.label]
+        self._predict_and_cache(
+            self.model,
+            'glm',
+            design_fn=lambda train: self._prepare_design(
+                self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
+            )
+        )
+
+    def ensemble_predict(self, k: int) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best GLM parameters.")
+        k = max(2, int(k))
+        data = self.ctx.train_oht_scl_data
+        if data is None:
+            raise RuntimeError("Missing standardized data for GLM ensemble.")
+        X_all = data[self.ctx.var_nmes]
+        y_all = data[self.ctx.resp_nme]
+        w_all = data[self.ctx.weight_nme]
+        X_test = self.ctx.test_oht_scl_data
+        if X_test is None:
+            raise RuntimeError("Missing standardized test data for GLM ensemble.")
+
+        n_samples = len(X_all)
+        X_all_design = self._prepare_design(data)
+        X_test_design = self._prepare_design(X_test)
+        tweedie_power = self.best_params.get('tweedie_power')
+        family = self._select_family(tweedie_power)
+
+        split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
+        if split_iter is None:
+            _log(
+                f"[GLM Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train_sum = np.zeros(n_samples, dtype=np.float64)
+        preds_test_sum = np.zeros(len(X_test_design), dtype=np.float64)
+
+        split_count = 0
+        for train_idx, _val_idx in split_iter:
+            X_train = X_all_design.iloc[train_idx]
+            y_train = y_all.iloc[train_idx]
+            w_train = w_all.iloc[train_idx]
+
+            glm = sm.GLM(y_train, X_train, family=family, freq_weights=w_train)
+            result = glm.fit_regularized(
+                alpha=self.best_params['alpha'],
+                L1_wt=self.best_params['l1_ratio'],
+                maxiter=300
+            )
+            pred_train = result.predict(X_all_design)
+            pred_test = result.predict(X_test_design)
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            split_count += 1
+
+        if split_count < 1:
+            _log(
+                f"[GLM Ensemble] no CV splits generated; skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train = preds_train_sum / float(split_count)
+        preds_test = preds_test_sum / float(split_count)
+        self._cache_predictions("glm", preds_train, preds_test)
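Both trainers now route console output through the package logger instead of bare print calls. A minimal sketch of the same pattern follows, using get_logger and log_print only as they are imported and called in the hunks above; any behavior of log_print beyond forwarding print-style arguments to the given logger is an assumption.

from ins_pricing.utils import get_logger, log_print

_logger = get_logger("ins_pricing.trainer.example")


def _log(*args, **kwargs) -> None:
    # Forward print-style arguments (including flush=True) to the shared logger,
    # mirroring the helpers added to trainer_ft.py and trainer_glm.py.
    log_print(_logger, *args, **kwargs)


_log("[Example] messages now go through the ins_pricing logger.", flush=True)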