ins-pricing 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. ins_pricing/README.md +9 -6
  2. ins_pricing/__init__.py +3 -11
  3. ins_pricing/cli/BayesOpt_entry.py +24 -0
  4. ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
  5. ins_pricing/cli/Explain_Run.py +25 -0
  6. ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
  7. ins_pricing/cli/Pricing_Run.py +25 -0
  8. ins_pricing/cli/__init__.py +1 -0
  9. ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
  10. ins_pricing/cli/utils/__init__.py +1 -0
  11. ins_pricing/cli/utils/cli_common.py +320 -0
  12. ins_pricing/cli/utils/cli_config.py +375 -0
  13. ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
  14. {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
  15. ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
  16. ins_pricing/docs/modelling/README.md +34 -0
  17. ins_pricing/modelling/__init__.py +57 -6
  18. ins_pricing/modelling/core/__init__.py +1 -0
  19. ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
  20. ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
  21. ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
  22. ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
  23. ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
  24. ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
  25. ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
  26. ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
  27. ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
  28. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
  29. ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
  30. ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
  31. ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
  32. ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
  33. ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
  34. ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
  35. ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
  36. ins_pricing/modelling/core/evaluation.py +115 -0
  37. ins_pricing/production/__init__.py +4 -0
  38. ins_pricing/production/preprocess.py +71 -0
  39. ins_pricing/setup.py +10 -5
  40. {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
  41. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
  42. ins_pricing-0.2.0.dist-info/RECORD +125 -0
  43. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
  44. ins_pricing/modelling/BayesOpt_entry.py +0 -633
  45. ins_pricing/modelling/Explain_Run.py +0 -36
  46. ins_pricing/modelling/Pricing_Run.py +0 -36
  47. ins_pricing/modelling/README.md +0 -33
  48. ins_pricing/modelling/bayesopt/models.py +0 -2196
  49. ins_pricing/modelling/bayesopt/trainers.py +0 -2446
  50. ins_pricing/modelling/cli_common.py +0 -136
  51. ins_pricing/modelling/tests/test_plotting.py +0 -63
  52. ins_pricing/modelling/watchdog_run.py +0 -211
  53. ins_pricing-0.1.11.dist-info/RECORD +0 -169
  54. ins_pricing_gemini/__init__.py +0 -23
  55. ins_pricing_gemini/governance/__init__.py +0 -20
  56. ins_pricing_gemini/governance/approval.py +0 -93
  57. ins_pricing_gemini/governance/audit.py +0 -37
  58. ins_pricing_gemini/governance/registry.py +0 -99
  59. ins_pricing_gemini/governance/release.py +0 -159
  60. ins_pricing_gemini/modelling/Explain_Run.py +0 -36
  61. ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
  62. ins_pricing_gemini/modelling/__init__.py +0 -151
  63. ins_pricing_gemini/modelling/cli_common.py +0 -141
  64. ins_pricing_gemini/modelling/config.py +0 -249
  65. ins_pricing_gemini/modelling/config_preprocess.py +0 -254
  66. ins_pricing_gemini/modelling/core.py +0 -741
  67. ins_pricing_gemini/modelling/data_container.py +0 -42
  68. ins_pricing_gemini/modelling/explain/__init__.py +0 -55
  69. ins_pricing_gemini/modelling/explain/gradients.py +0 -334
  70. ins_pricing_gemini/modelling/explain/metrics.py +0 -176
  71. ins_pricing_gemini/modelling/explain/permutation.py +0 -155
  72. ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
  73. ins_pricing_gemini/modelling/features.py +0 -215
  74. ins_pricing_gemini/modelling/model_manager.py +0 -148
  75. ins_pricing_gemini/modelling/model_plotting.py +0 -463
  76. ins_pricing_gemini/modelling/models.py +0 -2203
  77. ins_pricing_gemini/modelling/notebook_utils.py +0 -294
  78. ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
  79. ins_pricing_gemini/modelling/plotting/common.py +0 -63
  80. ins_pricing_gemini/modelling/plotting/curves.py +0 -572
  81. ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
  82. ins_pricing_gemini/modelling/plotting/geo.py +0 -362
  83. ins_pricing_gemini/modelling/plotting/importance.py +0 -121
  84. ins_pricing_gemini/modelling/run_logging.py +0 -133
  85. ins_pricing_gemini/modelling/tests/conftest.py +0 -8
  86. ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
  87. ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
  88. ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
  89. ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
  90. ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
  91. ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
  92. ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
  93. ins_pricing_gemini/modelling/trainers.py +0 -2447
  94. ins_pricing_gemini/modelling/utils.py +0 -1020
  95. ins_pricing_gemini/pricing/__init__.py +0 -27
  96. ins_pricing_gemini/pricing/calibration.py +0 -39
  97. ins_pricing_gemini/pricing/data_quality.py +0 -117
  98. ins_pricing_gemini/pricing/exposure.py +0 -85
  99. ins_pricing_gemini/pricing/factors.py +0 -91
  100. ins_pricing_gemini/pricing/monitoring.py +0 -99
  101. ins_pricing_gemini/pricing/rate_table.py +0 -78
  102. ins_pricing_gemini/production/__init__.py +0 -21
  103. ins_pricing_gemini/production/drift.py +0 -30
  104. ins_pricing_gemini/production/monitoring.py +0 -143
  105. ins_pricing_gemini/production/scoring.py +0 -40
  106. ins_pricing_gemini/reporting/__init__.py +0 -11
  107. ins_pricing_gemini/reporting/report_builder.py +0 -72
  108. ins_pricing_gemini/reporting/scheduler.py +0 -45
  109. ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
  110. ins_pricing_gemini/scripts/Explain_entry.py +0 -545
  111. ins_pricing_gemini/scripts/__init__.py +0 -1
  112. ins_pricing_gemini/scripts/train.py +0 -568
  113. ins_pricing_gemini/setup.py +0 -55
  114. ins_pricing_gemini/smoke_test.py +0 -28
  115. /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
  116. /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
  117. /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
  118. /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
  119. /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
  120. /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
  121. /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
  122. /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
  123. /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
  124. /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
  125. /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
  126. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py
@@ -0,0 +1,787 @@
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+import optuna
+import pandas as pd
+from sklearn.metrics import log_loss, mean_tweedie_deviance
+from sklearn.model_selection import GroupKFold, KFold, TimeSeriesSplit
+
+from .trainer_base import TrainerBase
+from ..models import FTTransformerSklearn
+
+class FTTrainer(TrainerBase):
+    def __init__(self, context: "BayesOptModel") -> None:
+        if context.task_type == 'classification':
+            super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
+        else:
+            super().__init__(context, 'FTTransformer', 'FTTransformer')
+        self.model: Optional[FTTransformerSklearn] = None
+        self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
+        self._cv_geo_warned = False
+
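+    # Numeric-token sizing is delegated to the model class so the trainer and FTTransformerSklearn stay consistent.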
+    def _resolve_numeric_tokens(self) -> int:
+        requested = getattr(self.ctx.config, "ft_num_numeric_tokens", None)
+        return FTTransformerSklearn.resolve_numeric_token_count(
+            self.ctx.num_features,
+            self.ctx.cate_list,
+            requested,
+        )
+
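+    # Attention heads must divide d_model evenly; otherwise fall back to the largest divisor not above the requested count.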
+    def _resolve_adaptive_heads(self,
+                                d_model: int,
+                                requested_heads: Optional[int] = None) -> Tuple[int, bool]:
+        d_model = int(d_model)
+        if d_model <= 0:
+            raise ValueError(f"Invalid d_model={d_model}, expected > 0.")
+
+        default_heads = max(2, d_model // 16)
+        base_heads = default_heads if requested_heads is None else int(
+            requested_heads)
+        base_heads = max(1, min(base_heads, d_model))
+
+        if d_model % base_heads == 0:
+            return base_heads, False
+
+        for candidate in range(min(d_model, base_heads), 0, -1):
+            if d_model % candidate == 0:
+                return candidate, True
+        return 1, True
+
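+    # Rebuild geo tokens for a CV split by temporarily pointing ctx.train_data/test_data at the fold's train/validation rows.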
+    def _build_geo_tokens_for_split(self,
+                                    X_train: pd.DataFrame,
+                                    X_val: pd.DataFrame,
+                                    geo_params: Optional[Dict[str, Any]] = None):
+        if not self.ctx.config.geo_feature_nmes:
+            return None
+        orig_train = self.ctx.train_data
+        orig_test = self.ctx.test_data
+        try:
+            self.ctx.train_data = orig_train.loc[X_train.index].copy()
+            self.ctx.test_data = orig_train.loc[X_val.index].copy()
+            return self.ctx._build_geo_tokens(geo_params)
+        finally:
+            self.ctx.train_data = orig_train
+            self.ctx.test_data = orig_test
+
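+    # Objective for masked-reconstruction pretraining; params may come from _distributed_forced_params instead of being sampled from the trial.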
+    def cross_val_unsupervised(self, trial: Optional[optuna.trial.Trial]) -> float:
+        """Optuna objective A: minimize validation loss for masked reconstruction."""
+        param_space: Dict[str, Callable[[optuna.trial.Trial], Any]] = {
+            "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-5, 5e-3, log=True),
+            "d_model": lambda t: t.suggest_int('d_model', 16, 128, step=16),
+            "n_layers": lambda t: t.suggest_int('n_layers', 2, 8),
+            "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.3),
+            "weight_decay": lambda t: t.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
+            "mask_prob_num": lambda t: t.suggest_float('mask_prob_num', 0.05, 0.4),
+            "mask_prob_cat": lambda t: t.suggest_float('mask_prob_cat', 0.05, 0.4),
+            "num_loss_weight": lambda t: t.suggest_float('num_loss_weight', 0.25, 4.0, log=True),
+            "cat_loss_weight": lambda t: t.suggest_float('cat_loss_weight', 0.25, 4.0, log=True),
+        }
+
+        params: Optional[Dict[str, Any]] = None
+        if self._distributed_forced_params is not None:
+            params = self._distributed_forced_params
+            self._distributed_forced_params = None
+        else:
+            if trial is None:
+                raise RuntimeError(
+                    "Missing Optuna trial for parameter sampling.")
+            params = {name: sampler(trial)
+                      for name, sampler in param_space.items()}
+            if self._should_use_distributed_optuna():
+                self._distributed_prepare_trial(params)
+
+        X_all = self.ctx.train_data[self.ctx.factor_nmes]
+        max_rows_for_ft_bo = min(1_000_000, int(len(X_all) / 2))
+        if max_rows_for_ft_bo > 0 and len(X_all) > max_rows_for_ft_bo:
+            sampled_idx = self._resolve_time_sample_indices(X_all, max_rows_for_ft_bo)
+            if sampled_idx is None:
+                X_all = X_all.sample(
+                    n=max_rows_for_ft_bo,
+                    random_state=self.ctx.rand_seed,
+                )
+            else:
+                X_all = X_all.loc[sampled_idx]
+
+        split = self._resolve_train_val_indices(X_all, allow_default=True)
+        if split is None:
+            raise ValueError("Unable to build train/val split for FT unsupervised CV.")
+        train_idx, val_idx = split
+        X_train = X_all.iloc[train_idx]
+        X_val = X_all.iloc[val_idx]
+        geo_train = geo_val = None
+        if self.ctx.config.geo_feature_nmes:
+            built = self._build_geo_tokens_for_split(X_train, X_val, params)
+            if built is not None:
+                geo_train, geo_val, _, _ = built
+            elif not self._cv_geo_warned:
+                print(
+                    "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
+                    flush=True,
+                )
+                self._cv_geo_warned = True
+
+        d_model = int(params["d_model"])
+        n_layers = int(params["n_layers"])
+        num_numeric_tokens = self._resolve_numeric_tokens()
+        token_count = num_numeric_tokens + len(self.ctx.cate_list)
+        if geo_train is not None:
+            token_count += 1
+        approx_units = d_model * n_layers * max(1, token_count)
+        if approx_units > 12_000_000:
+            raise optuna.TrialPruned(
+                f"config exceeds safe memory budget (approx_units={approx_units})")
+
+        adaptive_heads, _ = self._resolve_adaptive_heads(
+            d_model=d_model,
+            requested_heads=params.get("n_heads")
+        )
+
+        mask_prob_num = float(params.get("mask_prob_num", 0.15))
+        mask_prob_cat = float(params.get("mask_prob_cat", 0.15))
+        num_loss_weight = float(params.get("num_loss_weight", 1.0))
+        cat_loss_weight = float(params.get("cat_loss_weight", 1.0))
+
+        model_params = dict(params)
+        model_params["n_heads"] = adaptive_heads
+        for k in ("mask_prob_num", "mask_prob_cat", "num_loss_weight", "cat_loss_weight"):
+            model_params.pop(k, None)
+
+        model = FTTransformerSklearn(
+            model_nme=self.ctx.model_nme,
+            num_cols=self.ctx.num_features,
+            cat_cols=self.ctx.cate_list,
+            task_type=self.ctx.task_type,
+            epochs=self.ctx.epochs,
+            patience=5,
+            weight_decay=float(params.get("weight_decay", 0.0)),
+            use_data_parallel=self.ctx.config.use_ft_data_parallel,
+            use_ddp=self.ctx.config.use_ft_ddp,
+            num_numeric_tokens=num_numeric_tokens,
+        )
+        model.set_params(model_params)
+        try:
+            return float(model.fit_unsupervised(
+                X_train,
+                X_val=X_val,
+                trial=trial,
+                geo_train=geo_train,
+                geo_val=geo_val,
+                mask_prob_num=mask_prob_num,
+                mask_prob_cat=mask_prob_cat,
+                num_loss_weight=num_loss_weight,
+                cat_loss_weight=cat_loss_weight
+            ))
+        finally:
+            getattr(getattr(model, "ft", None), "to",
+                    lambda *_args, **_kwargs: None)("cpu")
+            self._clean_gpu()
+
+    def cross_val(self, trial: optuna.trial.Trial) -> float:
+        # FT-Transformer CV also focuses on memory control:
+        # - Shrink search space to avoid oversized models.
+        # - Release GPU memory after each fold so the next trial can run.
+        # Slightly shrink hyperparameter space to avoid oversized models.
+        param_space: Dict[str, Callable[[optuna.trial.Trial], Any]] = {
+            "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-5, 5e-4, log=True),
+            # "d_model": lambda t: t.suggest_int('d_model', 8, 64, step=8),
+            "d_model": lambda t: t.suggest_int('d_model', 16, 128, step=16),
+            "n_layers": lambda t: t.suggest_int('n_layers', 2, 8),
+            "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.2),
+            "weight_decay": lambda t: t.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
+        }
+        if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
+            param_space["tw_power"] = lambda t: t.suggest_float(
+                'tw_power', 1.0, 2.0)
+        geo_enabled = bool(
+            self.ctx.geo_token_cols or self.ctx.config.geo_feature_nmes)
+        if geo_enabled:
+            # Only tune GNN-related hyperparams when geo tokens are enabled.
+            param_space.update({
+                "geo_token_hidden_dim": lambda t: t.suggest_int('geo_token_hidden_dim', 16, 128, step=16),
+                "geo_token_layers": lambda t: t.suggest_int('geo_token_layers', 1, 4),
+                "geo_token_k_neighbors": lambda t: t.suggest_int('geo_token_k_neighbors', 5, 20),
+                "geo_token_dropout": lambda t: t.suggest_float('geo_token_dropout', 0.0, 0.3),
+                "geo_token_learning_rate": lambda t: t.suggest_float('geo_token_learning_rate', 1e-4, 5e-3, log=True),
+            })
+
+        metric_ctx: Dict[str, Any] = {}
+
+        def data_provider():
+            data = self.ctx.train_data
+            return data[self.ctx.factor_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
+
+        def model_builder(params):
+            d_model = int(params["d_model"])
+            n_layers = int(params["n_layers"])
+            num_numeric_tokens = self._resolve_numeric_tokens()
+            token_count = num_numeric_tokens + len(self.ctx.cate_list)
+            if geo_enabled:
+                token_count += 1
+            approx_units = d_model * n_layers * max(1, token_count)
+            if approx_units > 12_000_000:
+                print(
+                    f"[FTTrainer] Trial pruned early: d_model={d_model}, n_layers={n_layers} -> approx_units={approx_units}")
+                raise optuna.TrialPruned(
+                    "config exceeds safe memory budget; prune before training")
+            geo_params_local = {k: v for k, v in params.items()
+                                if k.startswith("geo_token_")}
+
+            tw_power = params.get("tw_power")
+            if self.ctx.task_type == 'regression':
+                base_tw = self.ctx.default_tweedie_power()
+                if self.ctx.obj in ('count:poisson', 'reg:gamma'):
+                    tw_power = base_tw
+                elif tw_power is None:
+                    tw_power = base_tw
+            metric_ctx["tw_power"] = tw_power
+
+            adaptive_heads, _ = self._resolve_adaptive_heads(
+                d_model=d_model,
+                requested_heads=params.get("n_heads")
+            )
+
+            return FTTransformerSklearn(
+                model_nme=self.ctx.model_nme,
+                num_cols=self.ctx.num_features,
+                cat_cols=self.ctx.cate_list,
+                d_model=d_model,
+                n_heads=adaptive_heads,
+                n_layers=n_layers,
+                dropout=params["dropout"],
+                task_type=self.ctx.task_type,
+                epochs=self.ctx.epochs,
+                tweedie_power=tw_power,
+                learning_rate=params["learning_rate"],
+                patience=5,
+                weight_decay=float(params.get("weight_decay", 0.0)),
+                use_data_parallel=self.ctx.config.use_ft_data_parallel,
+                use_ddp=self.ctx.config.use_ft_ddp,
+                num_numeric_tokens=num_numeric_tokens,
+            ).set_params({"_geo_params": geo_params_local} if geo_enabled else {})
+
+        def fit_predict(model, X_train, y_train, w_train, X_val, y_val, w_val, trial_obj):
+            geo_train = geo_val = None
+            if geo_enabled:
+                geo_params = getattr(model, "_geo_params", {})
+                built = self._build_geo_tokens_for_split(
+                    X_train, X_val, geo_params)
+                if built is not None:
+                    geo_train, geo_val, _, _ = built
+                elif not self._cv_geo_warned:
+                    print(
+                        "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
+                        flush=True,
+                    )
+                    self._cv_geo_warned = True
+            model.fit(
+                X_train, y_train, w_train,
+                X_val, y_val, w_val,
+                trial=trial_obj,
+                geo_train=geo_train,
+                geo_val=geo_val
+            )
+            return model.predict(X_val, geo_tokens=geo_val)
+
+        def metric_fn(y_true, y_pred, weight):
+            if self.ctx.task_type == 'regression':
+                return mean_tweedie_deviance(
+                    y_true,
+                    y_pred,
+                    sample_weight=weight,
+                    power=metric_ctx.get("tw_power", 1.5)
+                )
+            return log_loss(y_true, y_pred, sample_weight=weight)
+
+        data_for_cap = data_provider()[0]
+        max_rows_for_ft_bo = min(1000000, int(len(data_for_cap)/2))
+
+        return self.cross_val_generic(
+            trial=trial,
+            hyperparameter_space=param_space,
+            data_provider=data_provider,
+            model_builder=model_builder,
+            metric_fn=metric_fn,
+            sample_limit=max_rows_for_ft_bo if len(
+                data_for_cap) > max_rows_for_ft_bo > 0 else None,
+            fit_predict_fn=fit_predict,
+            cleanup_fn=lambda m: getattr(
+                getattr(m, "ft", None), "to", lambda *_args, **_kwargs: None)("cpu")
+        )
+
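+    # Final training: when final_refit is enabled, probe a holdout split for the best epoch, then refit on the full training set.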
+    def train(self) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
+        resolved_params = dict(self.best_params)
+        d_model_value = resolved_params.get("d_model", 64)
+        adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
+            d_model=d_model_value,
+            requested_heads=resolved_params.get("n_heads")
+        )
+        if heads_adjusted:
+            print(f"[FTTrainer] Auto-adjusted n_heads from "
+                  f"{resolved_params.get('n_heads')} to {adaptive_heads} "
+                  f"(d_model={d_model_value}).")
+        resolved_params["n_heads"] = adaptive_heads
+
+        use_refit = bool(getattr(self.ctx.config, "final_refit", True))
+        refit_epochs = None
+        X_all = self.ctx.train_data[self.ctx.factor_nmes]
+        y_all = self.ctx.train_data[self.ctx.resp_nme]
+        w_all = self.ctx.train_data[self.ctx.weight_nme]
+        split = self._resolve_train_val_indices(X_all)
+        if use_refit and split is not None:
+            train_idx, val_idx = split
+            tmp_model = FTTransformerSklearn(
+                model_nme=self.ctx.model_nme,
+                num_cols=self.ctx.num_features,
+                cat_cols=self.ctx.cate_list,
+                task_type=self.ctx.task_type,
+                use_data_parallel=self.ctx.config.use_ft_data_parallel,
+                use_ddp=self.ctx.config.use_ft_ddp,
+                num_numeric_tokens=self._resolve_numeric_tokens(),
+                weight_decay=float(resolved_params.get("weight_decay", 0.0)),
+            )
+            tmp_model.set_params(resolved_params)
+            geo_train_full = self.ctx.train_geo_tokens
+            geo_train = None if geo_train_full is None else geo_train_full.iloc[train_idx]
+            geo_val = None if geo_train_full is None else geo_train_full.iloc[val_idx]
+            tmp_model.fit(
+                X_all.iloc[train_idx],
+                y_all.iloc[train_idx],
+                w_all.iloc[train_idx],
+                X_all.iloc[val_idx],
+                y_all.iloc[val_idx],
+                w_all.iloc[val_idx],
+                trial=None,
+                geo_train=geo_train,
+                geo_val=geo_val,
+            )
+            refit_epochs = self._resolve_best_epoch(
+                getattr(tmp_model, "training_history", None),
+                default_epochs=int(self.ctx.epochs),
+            )
+            getattr(getattr(tmp_model, "ft", None), "to",
+                    lambda *_args, **_kwargs: None)("cpu")
+            self._clean_gpu()
+
+        self.model = FTTransformerSklearn(
+            model_nme=self.ctx.model_nme,
+            num_cols=self.ctx.num_features,
+            cat_cols=self.ctx.cate_list,
+            task_type=self.ctx.task_type,
+            use_data_parallel=self.ctx.config.use_ft_data_parallel,
+            use_ddp=self.ctx.config.use_ft_ddp,
+            num_numeric_tokens=self._resolve_numeric_tokens(),
+            weight_decay=float(resolved_params.get("weight_decay", 0.0)),
+        )
+        if refit_epochs is not None:
+            self.model.epochs = int(refit_epochs)
+        self.model.set_params(resolved_params)
+        self.best_params = resolved_params
+        loss_plot_path = self.output.plot_path(
+            f'{self.ctx.model_nme}/loss/loss_{self.ctx.model_nme}_{self.model_name_prefix}.png')
+        self.model.loss_curve_path = loss_plot_path
+        geo_train = self.ctx.train_geo_tokens
+        geo_test = self.ctx.test_geo_tokens
+        fit_kwargs = {}
+        predict_kwargs_train = None
+        predict_kwargs_test = None
+        if geo_train is not None and geo_test is not None:
+            fit_kwargs["geo_train"] = geo_train
+            predict_kwargs_train = {"geo_tokens": geo_train}
+            predict_kwargs_test = {"geo_tokens": geo_test}
+        self._fit_predict_cache(
+            self.model,
+            self.ctx.train_data[self.ctx.factor_nmes],
+            self.ctx.train_data[self.ctx.resp_nme],
+            sample_weight=self.ctx.train_data[self.ctx.weight_nme],
+            pred_prefix='ft',
+            sample_weight_arg='w_train',
+            fit_kwargs=fit_kwargs,
+            predict_kwargs_train=predict_kwargs_train,
+            predict_kwargs_test=predict_kwargs_test
+        )
+        self.ctx.ft_best = self.model
+
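+    # Bagged alternative to train(): average predictions from k fold-specific models instead of a single refit model.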
+    def ensemble_predict(self, k: int) -> None:
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
+        k = max(2, int(k))
+        X_all = self.ctx.train_data[self.ctx.factor_nmes]
+        y_all = self.ctx.train_data[self.ctx.resp_nme]
+        w_all = self.ctx.train_data[self.ctx.weight_nme]
+        X_test = self.ctx.test_data[self.ctx.factor_nmes]
+        n_samples = len(X_all)
+        geo_train_full = self.ctx.train_geo_tokens
+        geo_test_full = self.ctx.test_geo_tokens
+
+        resolved_params = dict(self.best_params)
+        default_d_model = getattr(self.model, "d_model", 64)
+        adaptive_heads, _ = self._resolve_adaptive_heads(
+            d_model=resolved_params.get("d_model", default_d_model),
+            requested_heads=resolved_params.get("n_heads")
+        )
+        resolved_params["n_heads"] = adaptive_heads
+
+        split_iter, _ = self._resolve_ensemble_splits(X_all, k=k)
+        if split_iter is None:
+            print(
+                f"[FT Ensemble] unable to build CV split (n_samples={n_samples}); skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train_sum = np.zeros(n_samples, dtype=np.float64)
+        preds_test_sum = np.zeros(len(X_test), dtype=np.float64)
+
+        split_count = 0
+        for train_idx, val_idx in split_iter:
+            model = FTTransformerSklearn(
+                model_nme=self.ctx.model_nme,
+                num_cols=self.ctx.num_features,
+                cat_cols=self.ctx.cate_list,
+                task_type=self.ctx.task_type,
+                use_data_parallel=self.ctx.config.use_ft_data_parallel,
+                use_ddp=self.ctx.config.use_ft_ddp,
+                num_numeric_tokens=self._resolve_numeric_tokens(),
+                weight_decay=float(resolved_params.get("weight_decay", 0.0)),
+            )
+            model.set_params(resolved_params)
+
+            geo_train = geo_val = None
+            if geo_train_full is not None:
+                geo_train = geo_train_full.iloc[train_idx]
+                geo_val = geo_train_full.iloc[val_idx]
+
+            model.fit(
+                X_all.iloc[train_idx],
+                y_all.iloc[train_idx],
+                w_all.iloc[train_idx],
+                X_all.iloc[val_idx],
+                y_all.iloc[val_idx],
+                w_all.iloc[val_idx],
+                trial=None,
+                geo_train=geo_train,
+                geo_val=geo_val,
+            )
+
+            pred_train = model.predict(X_all, geo_tokens=geo_train_full)
+            pred_test = model.predict(X_test, geo_tokens=geo_test_full)
+            preds_train_sum += np.asarray(pred_train, dtype=np.float64)
+            preds_test_sum += np.asarray(pred_test, dtype=np.float64)
+            getattr(getattr(model, "ft", None), "to",
+                    lambda *_args, **_kwargs: None)("cpu")
+            self._clean_gpu()
+            split_count += 1
+
+        if split_count < 1:
+            print(
+                f"[FT Ensemble] no CV splits generated; skip ensemble.",
+                flush=True,
+            )
+            return
+        preds_train = preds_train_sum / float(split_count)
+        preds_test = preds_test_sum / float(split_count)
+        self._cache_predictions("ft", preds_train, preds_test)
+
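+    # Choose the out-of-fold splitter from config: KFold by default, GroupKFold or TimeSeriesSplit when configured; for random/group strategies the fold count defaults to round(1 / prop_test).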
+    def _resolve_oof_splitter(self, n_samples: int):
+        cfg = self.ctx.config
+        raw_strategy = str(getattr(cfg, "ft_oof_strategy", "auto") or "auto").strip().lower()
+        base_strategy = str(getattr(cfg, "cv_strategy", "random") or "random").strip().lower()
+        if raw_strategy == "auto":
+            strategy = base_strategy
+        else:
+            strategy = raw_strategy
+
+        oof_folds = getattr(cfg, "ft_oof_folds", None)
+        if oof_folds is None:
+            if strategy in {"random", "group", "grouped"}:
+                val_ratio = float(self.ctx.prop_test) if self.ctx.prop_test else 0.25
+                if not (0.0 < val_ratio < 1.0):
+                    val_ratio = 0.25
+                oof_folds = max(2, int(round(1 / val_ratio)))
+            else:
+                oof_folds = 0
+        oof_folds = int(oof_folds)
+
+        if oof_folds < 2 or n_samples < oof_folds:
+            return None, None, 0
+
+        if strategy in {"group", "grouped"}:
+            group_col = getattr(cfg, "cv_group_col", None)
+            if not group_col:
+                raise ValueError("cv_group_col is required for FT OOF group strategy.")
+            if group_col not in self.ctx.train_data.columns:
+                raise KeyError(f"cv_group_col '{group_col}' not in train_data.")
+            groups = self.ctx.train_data[group_col]
+            splitter = GroupKFold(n_splits=oof_folds)
+            return splitter, groups, oof_folds
+
+        if strategy in {"time", "timeseries", "temporal"}:
+            time_col = getattr(cfg, "cv_time_col", None)
+            if not time_col:
+                raise ValueError("cv_time_col is required for FT OOF time strategy.")
+            if time_col not in self.ctx.train_data.columns:
+                raise KeyError(f"cv_time_col '{time_col}' not in train_data.")
+            ascending = bool(getattr(cfg, "cv_time_ascending", True))
+            order_index = self.ctx.train_data[time_col].sort_values(ascending=ascending).index
+            order = self.ctx.train_data.index.get_indexer(order_index)
+            if n_samples <= oof_folds:
+                return None, None, 0
+            splitter = TimeSeriesSplit(n_splits=oof_folds)
+            return _OrderSplitter(splitter, order), None, oof_folds
+
+        shuffle = bool(getattr(cfg, "ft_oof_shuffle", True))
+        splitter = KFold(
+            n_splits=oof_folds,
+            shuffle=shuffle,
+            random_state=self.ctx.rand_seed if shuffle else None,
+        )
+        return splitter, None, oof_folds
+
+    def _build_ft_feature_model(self, resolved_params: Dict[str, Any]) -> FTTransformerSklearn:
+        model = FTTransformerSklearn(
+            model_nme=self.ctx.model_nme,
+            num_cols=self.ctx.num_features,
+            cat_cols=self.ctx.cate_list,
+            task_type=self.ctx.task_type,
+            use_data_parallel=self.ctx.config.use_ft_data_parallel,
+            use_ddp=self.ctx.config.use_ft_ddp,
+            num_numeric_tokens=self._resolve_numeric_tokens(),
+        )
+        adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
+            d_model=resolved_params.get("d_model", model.d_model),
+            requested_heads=resolved_params.get("n_heads"),
+        )
+        if heads_adjusted:
+            print(
+                f"[FTTrainer] Auto-adjusted n_heads from "
+                f"{resolved_params.get('n_heads')} to {adaptive_heads} "
+                f"(d_model={resolved_params.get('d_model', model.d_model)})."
+            )
+        resolved_params["n_heads"] = adaptive_heads
+        if resolved_params:
+            model.set_params(resolved_params)
+        return model
+
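+    # Out-of-fold predictions: each training row is predicted by a model that never saw it, avoiding leakage when the output is reused as a feature.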
+    def _oof_predict_train(
+        self,
+        resolved_params: Dict[str, Any],
+        *,
+        feature_mode: str,
+        geo_train_full: Optional[pd.DataFrame],
+    ) -> Optional[np.ndarray]:
+        X_all = self.ctx.train_data[self.ctx.factor_nmes]
+        y_all = self.ctx.train_data[self.ctx.resp_nme]
+        w_all = self.ctx.train_data[self.ctx.weight_nme]
+        splitter, groups, oof_folds = self._resolve_oof_splitter(len(X_all))
+        if splitter is None:
+            return None
+
+        preds_train = None
+        for fold_idx, (train_idx, val_idx) in enumerate(splitter.split(X_all, y_all, groups=groups), start=1):
+            X_train = X_all.iloc[train_idx]
+            y_train = y_all.iloc[train_idx]
+            w_train = w_all.iloc[train_idx]
+            X_val = X_all.iloc[val_idx]
+            y_val = y_all.iloc[val_idx]
+            w_val = w_all.iloc[val_idx]
+
+            geo_train = geo_val = None
+            if geo_train_full is not None:
+                geo_train = geo_train_full.iloc[train_idx]
+                geo_val = geo_train_full.iloc[val_idx]
+
+            model = self._build_ft_feature_model(dict(resolved_params))
+            model.fit(
+                X_train,
+                y_train,
+                w_train=w_train,
+                X_val=X_val,
+                y_val=y_val,
+                w_val=w_val,
+                trial=None,
+                geo_train=geo_train,
+                geo_val=geo_val,
+            )
+
+            predict_kwargs = {}
+            if geo_val is not None:
+                predict_kwargs["geo_tokens"] = geo_val
+            if feature_mode == "embedding":
+                predict_kwargs["return_embedding"] = True
+            fold_pred = model.predict(X_val, **predict_kwargs)
+            fold_pred = np.asarray(fold_pred)
+            if preds_train is None:
+                preds_train = np.empty((len(X_all),) + fold_pred.shape[1:], dtype=fold_pred.dtype)
+            preds_train[val_idx] = fold_pred
+
+            getattr(getattr(model, "ft", None), "to", lambda *_a, **_k: None)("cpu")
+            self._clean_gpu()
+
+        if preds_train is None:
+            return None
+        if oof_folds < 2:
+            return None
+        return preds_train
+
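+    # Prefer OOF predictions for the train-side feature; fall back to a single in-sample fit when no OOF splitter is available.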
+    def train_as_feature(self, pred_prefix: str = "ft_feat", feature_mode: str = "prediction") -> None:
+        """Train FT-Transformer only to generate features (not recorded as final model)."""
+        if not self.best_params:
+            raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
+        resolved_params = dict(self.best_params)
+        if feature_mode not in ("prediction", "embedding"):
+            raise ValueError(
+                f"Unsupported feature_mode='{feature_mode}', expected 'prediction' or 'embedding'.")
+
+        geo_train = self.ctx.train_geo_tokens
+        geo_test = self.ctx.test_geo_tokens
+        fit_kwargs = {}
+        predict_kwargs_train = None
+        predict_kwargs_test = None
+        if geo_train is not None and geo_test is not None:
+            fit_kwargs["geo_train"] = geo_train
+            predict_kwargs_train = {"geo_tokens": geo_train}
+            predict_kwargs_test = {"geo_tokens": geo_test}
+
+        if feature_mode == "embedding":
+            predict_kwargs_train = dict(predict_kwargs_train or {})
+            predict_kwargs_test = dict(predict_kwargs_test or {})
+            predict_kwargs_train["return_embedding"] = True
+            predict_kwargs_test["return_embedding"] = True
+
+        oof_preds = self._oof_predict_train(
+            resolved_params,
+            feature_mode=feature_mode,
+            geo_train_full=geo_train,
+        )
+        if oof_preds is not None:
+            self.model = self._build_ft_feature_model(resolved_params)
+            self.best_params = resolved_params
+            self.model.fit(
+                self.ctx.train_data[self.ctx.factor_nmes],
+                self.ctx.train_data[self.ctx.resp_nme],
+                w_train=self.ctx.train_data[self.ctx.weight_nme],
+                X_val=None,
+                y_val=None,
+                w_val=None,
+                trial=None,
+                geo_train=geo_train,
+                geo_val=None,
+            )
+            predict_kwargs = dict(predict_kwargs_test or {})
+            preds_test = self.model.predict(
+                self.ctx.test_data[self.ctx.factor_nmes],
+                **predict_kwargs,
+            )
+            self._cache_predictions(pred_prefix, oof_preds, preds_test)
+            return
+
+        self.model = self._build_ft_feature_model(resolved_params)
+        self.best_params = resolved_params
+        self._fit_predict_cache(
+            self.model,
+            self.ctx.train_data[self.ctx.factor_nmes],
+            self.ctx.train_data[self.ctx.resp_nme],
+            sample_weight=self.ctx.train_data[self.ctx.weight_nme],
+            pred_prefix=pred_prefix,
+            sample_weight_arg='w_train',
+            fit_kwargs=fit_kwargs,
+            predict_kwargs_train=predict_kwargs_train,
+            predict_kwargs_test=predict_kwargs_test,
+            record_label=False,
+        )
+
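+    # Pretraining ignores labels; the cached train/test embeddings can be reused downstream as additional features.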
+    def pretrain_unsupervised_as_feature(self,
+                                         pred_prefix: str = "ft_uemb",
+                                         params: Optional[Dict[str, Any]] = None,
+                                         mask_prob_num: float = 0.15,
+                                         mask_prob_cat: float = 0.15,
+                                         num_loss_weight: float = 1.0,
+                                         cat_loss_weight: float = 1.0) -> None:
+        """Self-supervised pretraining (masked reconstruction) and cache embeddings."""
+        self.model = FTTransformerSklearn(
+            model_nme=self.ctx.model_nme,
+            num_cols=self.ctx.num_features,
+            cat_cols=self.ctx.cate_list,
+            task_type=self.ctx.task_type,
+            use_data_parallel=self.ctx.config.use_ft_data_parallel,
+            use_ddp=self.ctx.config.use_ft_ddp,
+            num_numeric_tokens=self._resolve_numeric_tokens(),
+        )
+        resolved_params = dict(params or {})
+        # Reuse supervised tuning structure params unless explicitly overridden.
+        if not resolved_params and self.best_params:
+            resolved_params = dict(self.best_params)
+
+        # If params include masked reconstruction fields, they take precedence.
+        mask_prob_num = float(resolved_params.pop(
+            "mask_prob_num", mask_prob_num))
+        mask_prob_cat = float(resolved_params.pop(
+            "mask_prob_cat", mask_prob_cat))
+        num_loss_weight = float(resolved_params.pop(
+            "num_loss_weight", num_loss_weight))
+        cat_loss_weight = float(resolved_params.pop(
+            "cat_loss_weight", cat_loss_weight))
+
+        adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
+            d_model=resolved_params.get("d_model", self.model.d_model),
+            requested_heads=resolved_params.get("n_heads")
+        )
+        if heads_adjusted:
+            print(f"[FTTrainer] Auto-adjusted n_heads from "
+                  f"{resolved_params.get('n_heads')} to {adaptive_heads} "
+                  f"(d_model={resolved_params.get('d_model', self.model.d_model)}).")
+        resolved_params["n_heads"] = adaptive_heads
+        if resolved_params:
+            self.model.set_params(resolved_params)
+
+        loss_plot_path = self.output.plot_path(
+            f'{self.ctx.model_nme}/loss/loss_{self.ctx.model_nme}_FTTransformerUnsupervised.png')
+        self.model.loss_curve_path = loss_plot_path
+
+        # Build a simple holdout split for pretraining early stopping.
+        X_all = self.ctx.train_data[self.ctx.factor_nmes]
+        split = self._resolve_train_val_indices(X_all, allow_default=True)
+        if split is None:
+            raise ValueError("Unable to build train/val split for FT unsupervised training.")
+        train_idx, val_idx = split
+        X_tr = X_all.iloc[train_idx]
+        X_val = X_all.iloc[val_idx]
+
+        geo_all = self.ctx.train_geo_tokens
+        geo_tr = geo_val = None
+        if geo_all is not None:
+            geo_tr = geo_all.loc[X_tr.index]
+            geo_val = geo_all.loc[X_val.index]
+
+        self.model.fit_unsupervised(
+            X_tr,
+            X_val=X_val,
+            geo_train=geo_tr,
+            geo_val=geo_val,
+            mask_prob_num=mask_prob_num,
+            mask_prob_cat=mask_prob_cat,
+            num_loss_weight=num_loss_weight,
+            cat_loss_weight=cat_loss_weight
+        )
+
+        geo_train_full = self.ctx.train_geo_tokens
+        geo_test_full = self.ctx.test_geo_tokens
+        predict_kwargs_train = {"return_embedding": True}
+        predict_kwargs_test = {"return_embedding": True}
+        if geo_train_full is not None and geo_test_full is not None:
+            predict_kwargs_train["geo_tokens"] = geo_train_full
+            predict_kwargs_test["geo_tokens"] = geo_test_full
+
+        self._predict_and_cache(
+            self.model,
+            pred_prefix=pred_prefix,
+            predict_kwargs_train=predict_kwargs_train,
+            predict_kwargs_test=predict_kwargs_test
+        )
+
+
+# =============================================================================