ins-pricing 0.1.11-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. ins_pricing/README.md +9 -6
  2. ins_pricing/__init__.py +3 -11
  3. ins_pricing/cli/BayesOpt_entry.py +24 -0
  4. ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
  5. ins_pricing/cli/Explain_Run.py +25 -0
  6. ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
  7. ins_pricing/cli/Pricing_Run.py +25 -0
  8. ins_pricing/cli/__init__.py +1 -0
  9. ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
  10. ins_pricing/cli/utils/__init__.py +1 -0
  11. ins_pricing/cli/utils/cli_common.py +320 -0
  12. ins_pricing/cli/utils/cli_config.py +375 -0
  13. ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
  14. {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
  15. ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
  16. ins_pricing/docs/modelling/README.md +34 -0
  17. ins_pricing/modelling/__init__.py +57 -6
  18. ins_pricing/modelling/core/__init__.py +1 -0
  19. ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
  20. ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
  21. ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
  22. ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
  23. ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
  24. ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
  25. ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
  26. ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
  27. ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
  28. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
  29. ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
  30. ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
  31. ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
  32. ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
  33. ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
  34. ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
  35. ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
  36. ins_pricing/modelling/core/evaluation.py +115 -0
  37. ins_pricing/production/__init__.py +4 -0
  38. ins_pricing/production/preprocess.py +71 -0
  39. ins_pricing/setup.py +10 -5
  40. {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
  41. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
  42. ins_pricing-0.2.0.dist-info/RECORD +125 -0
  43. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
  44. ins_pricing/modelling/BayesOpt_entry.py +0 -633
  45. ins_pricing/modelling/Explain_Run.py +0 -36
  46. ins_pricing/modelling/Pricing_Run.py +0 -36
  47. ins_pricing/modelling/README.md +0 -33
  48. ins_pricing/modelling/bayesopt/models.py +0 -2196
  49. ins_pricing/modelling/bayesopt/trainers.py +0 -2446
  50. ins_pricing/modelling/cli_common.py +0 -136
  51. ins_pricing/modelling/tests/test_plotting.py +0 -63
  52. ins_pricing/modelling/watchdog_run.py +0 -211
  53. ins_pricing-0.1.11.dist-info/RECORD +0 -169
  54. ins_pricing_gemini/__init__.py +0 -23
  55. ins_pricing_gemini/governance/__init__.py +0 -20
  56. ins_pricing_gemini/governance/approval.py +0 -93
  57. ins_pricing_gemini/governance/audit.py +0 -37
  58. ins_pricing_gemini/governance/registry.py +0 -99
  59. ins_pricing_gemini/governance/release.py +0 -159
  60. ins_pricing_gemini/modelling/Explain_Run.py +0 -36
  61. ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
  62. ins_pricing_gemini/modelling/__init__.py +0 -151
  63. ins_pricing_gemini/modelling/cli_common.py +0 -141
  64. ins_pricing_gemini/modelling/config.py +0 -249
  65. ins_pricing_gemini/modelling/config_preprocess.py +0 -254
  66. ins_pricing_gemini/modelling/core.py +0 -741
  67. ins_pricing_gemini/modelling/data_container.py +0 -42
  68. ins_pricing_gemini/modelling/explain/__init__.py +0 -55
  69. ins_pricing_gemini/modelling/explain/gradients.py +0 -334
  70. ins_pricing_gemini/modelling/explain/metrics.py +0 -176
  71. ins_pricing_gemini/modelling/explain/permutation.py +0 -155
  72. ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
  73. ins_pricing_gemini/modelling/features.py +0 -215
  74. ins_pricing_gemini/modelling/model_manager.py +0 -148
  75. ins_pricing_gemini/modelling/model_plotting.py +0 -463
  76. ins_pricing_gemini/modelling/models.py +0 -2203
  77. ins_pricing_gemini/modelling/notebook_utils.py +0 -294
  78. ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
  79. ins_pricing_gemini/modelling/plotting/common.py +0 -63
  80. ins_pricing_gemini/modelling/plotting/curves.py +0 -572
  81. ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
  82. ins_pricing_gemini/modelling/plotting/geo.py +0 -362
  83. ins_pricing_gemini/modelling/plotting/importance.py +0 -121
  84. ins_pricing_gemini/modelling/run_logging.py +0 -133
  85. ins_pricing_gemini/modelling/tests/conftest.py +0 -8
  86. ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
  87. ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
  88. ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
  89. ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
  90. ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
  91. ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
  92. ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
  93. ins_pricing_gemini/modelling/trainers.py +0 -2447
  94. ins_pricing_gemini/modelling/utils.py +0 -1020
  95. ins_pricing_gemini/pricing/__init__.py +0 -27
  96. ins_pricing_gemini/pricing/calibration.py +0 -39
  97. ins_pricing_gemini/pricing/data_quality.py +0 -117
  98. ins_pricing_gemini/pricing/exposure.py +0 -85
  99. ins_pricing_gemini/pricing/factors.py +0 -91
  100. ins_pricing_gemini/pricing/monitoring.py +0 -99
  101. ins_pricing_gemini/pricing/rate_table.py +0 -78
  102. ins_pricing_gemini/production/__init__.py +0 -21
  103. ins_pricing_gemini/production/drift.py +0 -30
  104. ins_pricing_gemini/production/monitoring.py +0 -143
  105. ins_pricing_gemini/production/scoring.py +0 -40
  106. ins_pricing_gemini/reporting/__init__.py +0 -11
  107. ins_pricing_gemini/reporting/report_builder.py +0 -72
  108. ins_pricing_gemini/reporting/scheduler.py +0 -45
  109. ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
  110. ins_pricing_gemini/scripts/Explain_entry.py +0 -545
  111. ins_pricing_gemini/scripts/__init__.py +0 -1
  112. ins_pricing_gemini/scripts/train.py +0 -568
  113. ins_pricing_gemini/setup.py +0 -55
  114. ins_pricing_gemini/smoke_test.py +0 -28
  115. /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
  116. /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
  117. /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
  118. /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
  119. /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
  120. /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
  121. /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
  122. /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
  123. /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
  124. /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
  125. /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
  126. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
@@ -1,2447 +0,0 @@
1
- # =============================================================================
2
- from __future__ import annotations
3
-
4
- from datetime import timedelta
5
- import gc
6
- import os
7
- from pathlib import Path
8
- from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
9
-
10
- import joblib
11
- import numpy as np
12
- import optuna
13
- import pandas as pd
14
- import torch
15
- try: # pragma: no cover
16
- import torch.distributed as dist # type: ignore
17
- except Exception: # pragma: no cover
18
- dist = None # type: ignore
19
- import xgboost as xgb
20
- from sklearn.metrics import log_loss, mean_tweedie_deviance
21
- from sklearn.model_selection import KFold, ShuffleSplit
22
- from sklearn.preprocessing import StandardScaler
23
-
24
- import statsmodels.api as sm
25
-
26
- from .config import BayesOptConfig
27
- from .config_preprocess import BayesOptConfig, OutputManager
28
- from .models import FTTransformerSklearn, GraphNeuralNetSklearn, ResNetSklearn
29
- from .utils import DistributedUtils, EPS, ensure_parent_dir
30
-
31
- _XGB_CUDA_CHECKED = False
32
- _XGB_HAS_CUDA = False
33
-
34
-
35
- def _xgb_cuda_available() -> bool:
36
- # Best-effort check for XGBoost CUDA build; cached to avoid repeated checks.
37
- global _XGB_CUDA_CHECKED, _XGB_HAS_CUDA
38
- if _XGB_CUDA_CHECKED:
39
- return _XGB_HAS_CUDA
40
- _XGB_CUDA_CHECKED = True
41
- if not torch.cuda.is_available():
42
- _XGB_HAS_CUDA = False
43
- return False
44
- try:
45
- build_info = getattr(xgb, "build_info", None)
46
- if callable(build_info):
47
- info = build_info()
48
- for key in ("USE_CUDA", "use_cuda", "cuda"):
49
- if key in info:
50
- val = info[key]
51
- if isinstance(val, str):
52
- _XGB_HAS_CUDA = val.strip().upper() in (
53
- "ON", "YES", "TRUE", "1")
54
- else:
55
- _XGB_HAS_CUDA = bool(val)
56
- return _XGB_HAS_CUDA
57
- except Exception:
58
- pass
59
- try:
60
- has_cuda = getattr(getattr(xgb, "core", None), "_has_cuda_support", None)
61
- if callable(has_cuda):
62
- _XGB_HAS_CUDA = bool(has_cuda())
63
- return _XGB_HAS_CUDA
64
- except Exception:
65
- pass
66
- _XGB_HAS_CUDA = False
67
- return False
68
-
69
- # =============================================================================
70
- # Trainer system
71
- # =============================================================================
72
-
73
-
74
- class TrainerBase:
75
- def __init__(self, context: "BayesOptModel", label: str, model_name_prefix: str) -> None:
76
- self.ctx = context
77
- self.label = label
78
- self.model_name_prefix = model_name_prefix
79
- self.model = None
80
- self.best_params: Optional[Dict[str, Any]] = None
81
- self.best_trial = None
82
- self.study_name: Optional[str] = None
83
- self.enable_distributed_optuna: bool = False
84
- self._distributed_forced_params: Optional[Dict[str, Any]] = None
85
-
86
- def _dist_barrier(self, reason: str) -> None:
87
- """DDP barrier wrapper used by distributed Optuna.
88
-
89
- To debug "trial finished but next trial never starts" hangs, set these
90
- environment variables (either in shell or config.json `env`):
91
- - `BAYESOPT_DDP_BARRIER_DEBUG=1` to print barrier enter/exit per-rank
92
- - `BAYESOPT_DDP_BARRIER_TIMEOUT=300` to fail fast instead of waiting forever
93
- - `TORCH_DISTRIBUTED_DEBUG=DETAIL` and `NCCL_DEBUG=INFO` for PyTorch/NCCL logs
94
- """
95
- if dist is None:
96
- return
97
- try:
98
- if not getattr(dist, "is_available", lambda: False)():
99
- return
100
- if not dist.is_initialized():
101
- return
102
- except Exception:
103
- return
104
-
105
- timeout_seconds = int(os.environ.get("BAYESOPT_DDP_BARRIER_TIMEOUT", "1800"))
106
- debug_barrier = os.environ.get("BAYESOPT_DDP_BARRIER_DEBUG", "").strip() in {"1", "true", "TRUE", "yes", "YES"}
107
- rank = None
108
- world = None
109
- if debug_barrier:
110
- try:
111
- rank = dist.get_rank()
112
- world = dist.get_world_size()
113
- print(f"[DDP][{self.label}] entering barrier({reason}) rank={rank}/{world}", flush=True)
114
- except Exception:
115
- debug_barrier = False
116
- try:
117
- timeout = timedelta(seconds=timeout_seconds)
118
- backend = None
119
- try:
120
- backend = dist.get_backend()
121
- except Exception:
122
- backend = None
123
-
124
- # `monitored_barrier` is only implemented for GLOO; using it under NCCL
125
- # will raise and can itself trigger a secondary hang. Prefer an async
126
- # barrier with timeout for NCCL.
127
- monitored = getattr(dist, "monitored_barrier", None)
128
- if backend == "gloo" and callable(monitored):
129
- monitored(timeout=timeout)
130
- else:
131
- work = None
132
- try:
133
- work = dist.barrier(async_op=True)
134
- except TypeError:
135
- work = None
136
- if work is not None:
137
- wait = getattr(work, "wait", None)
138
- if callable(wait):
139
- try:
140
- wait(timeout=timeout)
141
- except TypeError:
142
- wait()
143
- else:
144
- dist.barrier()
145
- else:
146
- dist.barrier()
147
- if debug_barrier:
148
- print(f"[DDP][{self.label}] exit barrier({reason}) rank={rank}/{world}", flush=True)
149
- except Exception as exc:
150
- print(
151
- f"[DDP][{self.label}] barrier failed during {reason}: {exc}",
152
- flush=True,
153
- )
154
- raise
155
-
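
For reference, the barrier-debug switches described in the docstring above can be enabled before a run; a minimal sketch, assuming they are set from Python rather than the shell or the config.json `env` block (values are illustrative):

    import os

    # Print per-rank enter/exit messages around every DDP barrier.
    os.environ["BAYESOPT_DDP_BARRIER_DEBUG"] = "1"
    # Fail after 300 seconds instead of waiting indefinitely on a lost rank.
    os.environ["BAYESOPT_DDP_BARRIER_TIMEOUT"] = "300"
    # Extra PyTorch / NCCL diagnostics.
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
    os.environ["NCCL_DEBUG"] = "INFO"
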
156
- @property
157
- def config(self) -> BayesOptConfig:
158
- return self.ctx.config
159
-
160
- @property
161
- def output(self) -> OutputManager:
162
- return self.ctx.output_manager
163
-
164
- def _get_model_filename(self) -> str:
165
- ext = 'pkl' if self.label in ['Xgboost', 'GLM'] else 'pth'
166
- return f'01_{self.ctx.model_nme}_{self.model_name_prefix}.{ext}'
167
-
168
- def _resolve_optuna_storage_url(self) -> Optional[str]:
169
- storage = getattr(self.config, "optuna_storage", None)
170
- if not storage:
171
- return None
172
- storage_str = str(storage).strip()
173
- if not storage_str:
174
- return None
175
- if "://" in storage_str or storage_str == ":memory:":
176
- return storage_str
177
- path = Path(storage_str)
178
- path = path.resolve()
179
- ensure_parent_dir(str(path))
180
- return f"sqlite:///{path.as_posix()}"
181
-
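
As a point of reference for the storage resolution above: a bare filesystem path in `optuna_storage` is turned into a SQLite URL, while anything already containing "://" (or ":memory:") passes through unchanged. A minimal sketch with a hypothetical path:

    from pathlib import Path

    # Bare path -> resolved absolute path -> sqlite URL (mirrors the method above).
    db_path = Path("outputs/optuna/bayesopt.db").resolve()
    storage_url = f"sqlite:///{db_path.as_posix()}"
    # A full URL such as "postgresql://user@host/db" would be returned as-is.
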
182
- def _resolve_optuna_study_name(self) -> str:
183
- prefix = getattr(self.config, "optuna_study_prefix",
184
- None) or "bayesopt"
185
- raw = f"{prefix}_{self.ctx.model_nme}_{self.model_name_prefix}"
186
- safe = "".join([c if c.isalnum() or c in "._-" else "_" for c in raw])
187
- return safe.lower()
188
-
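
To illustrate the study-name sanitisation above with hypothetical names (prefix "bayesopt", model name "Motor TPL", model prefix "Xgboost"):

    # Hypothetical inputs; only characters outside [alnum . _ -] are replaced.
    raw = "bayesopt_Motor TPL_Xgboost"
    safe = "".join(c if c.isalnum() or c in "._-" else "_" for c in raw)
    assert safe.lower() == "bayesopt_motor_tpl_xgboost"
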
189
- def tune(self, max_evals: int, objective_fn=None) -> None:
190
- # Generic Optuna tuning loop.
191
- if objective_fn is None:
192
- # If subclass doesn't provide objective_fn, default to cross_val.
193
- objective_fn = self.cross_val
194
-
195
- if self._should_use_distributed_optuna():
196
- self._distributed_tune(max_evals, objective_fn)
197
- return
198
-
199
- total_trials = max(1, int(max_evals))
200
- progress_counter = {"count": 0}
201
-
202
- def objective_wrapper(trial: optuna.trial.Trial) -> float:
203
- should_log = DistributedUtils.is_main_process()
204
- if should_log:
205
- current_idx = progress_counter["count"] + 1
206
- print(
207
- f"[Optuna][{self.label}] Trial {current_idx}/{total_trials} started "
208
- f"(trial_id={trial.number})."
209
- )
210
- try:
211
- result = objective_fn(trial)
212
- except RuntimeError as exc:
213
- if "out of memory" in str(exc).lower():
214
- print(
215
- f"[Optuna][{self.label}] OOM detected. Pruning trial and clearing CUDA cache."
216
- )
217
- self._clean_gpu()
218
- raise optuna.TrialPruned() from exc
219
- raise
220
- finally:
221
- self._clean_gpu()
222
- if should_log:
223
- progress_counter["count"] = progress_counter["count"] + 1
224
- trial_state = getattr(trial, "state", None)
225
- state_repr = getattr(trial_state, "name", "OK")
226
- print(
227
- f"[Optuna][{self.label}] Trial {progress_counter['count']}/{total_trials} finished "
228
- f"(status={state_repr})."
229
- )
230
- return result
231
-
232
- storage_url = self._resolve_optuna_storage_url()
233
- study_name = self._resolve_optuna_study_name()
234
- study_kwargs: Dict[str, Any] = {
235
- "direction": "minimize",
236
- "sampler": optuna.samplers.TPESampler(seed=self.ctx.rand_seed),
237
- }
238
- if storage_url:
239
- study_kwargs.update(
240
- storage=storage_url,
241
- study_name=study_name,
242
- load_if_exists=True,
243
- )
244
-
245
- study = optuna.create_study(**study_kwargs)
246
- self.study_name = getattr(study, "study_name", None)
247
-
248
- def checkpoint_callback(check_study: optuna.study.Study, _trial) -> None:
249
- # Persist best_params after each trial to allow safe resume.
250
- try:
251
- best = getattr(check_study, "best_trial", None)
252
- if best is None:
253
- return
254
- best_params = getattr(best, "params", None)
255
- if not best_params:
256
- return
257
- params_path = self.output.result_path(
258
- f'{self.ctx.model_nme}_bestparams_{self.label.lower()}.csv'
259
- )
260
- pd.DataFrame(best_params, index=[0]).to_csv(
261
- params_path, index=False)
262
- except Exception:
263
- return
264
-
265
- completed_states = (
266
- optuna.trial.TrialState.COMPLETE,
267
- optuna.trial.TrialState.PRUNED,
268
- optuna.trial.TrialState.FAIL,
269
- )
270
- completed = len(study.get_trials(states=completed_states))
271
- progress_counter["count"] = completed
272
- remaining = max(0, total_trials - completed)
273
- if remaining > 0:
274
- study.optimize(
275
- objective_wrapper,
276
- n_trials=remaining,
277
- callbacks=[checkpoint_callback],
278
- )
279
- self.best_params = study.best_params
280
- self.best_trial = study.best_trial
281
-
282
- # Save best params to CSV for reproducibility.
283
- params_path = self.output.result_path(
284
- f'{self.ctx.model_nme}_bestparams_{self.label.lower()}.csv'
285
- )
286
- pd.DataFrame(self.best_params, index=[0]).to_csv(
287
- params_path, index=False)
288
-
289
- def train(self) -> None:
290
- raise NotImplementedError
291
-
292
- def save(self) -> None:
293
- if self.model is None:
294
- print(f"[save] Warning: No model to save for {self.label}")
295
- return
296
-
297
- path = self.output.model_path(self._get_model_filename())
298
- if self.label in ['Xgboost', 'GLM']:
299
- joblib.dump(self.model, path)
300
- else:
301
- # PyTorch models can save state_dict or the full object.
302
- # Legacy behavior: ResNetTrainer saves state_dict; FTTrainer saves full object.
303
- if hasattr(self.model, 'resnet'): # ResNetSklearn model
304
- torch.save(self.model.resnet.state_dict(), path)
305
- else: # FTTransformerSklearn or other PyTorch model
306
- torch.save(self.model, path)
307
-
308
- def load(self) -> None:
309
- path = self.output.model_path(self._get_model_filename())
310
- if not os.path.exists(path):
311
- print(f"[load] Warning: Model file not found: {path}")
312
- return
313
-
314
- if self.label in ['Xgboost', 'GLM']:
315
- self.model = joblib.load(path)
316
- else:
317
- # PyTorch loading depends on the model structure.
318
- if self.label == 'ResNet' or self.label == 'ResNetClassifier':
319
- # ResNet requires reconstructing the skeleton; handled by subclass.
320
- pass
321
- else:
322
- # FT-Transformer serializes the whole object; load then move to device.
323
- loaded = torch.load(path, map_location='cpu')
324
- self._move_to_device(loaded)
325
- self.model = loaded
326
-
327
- def _move_to_device(self, model_obj):
328
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
329
- if hasattr(model_obj, 'device'):
330
- model_obj.device = device
331
- if hasattr(model_obj, 'to'):
332
- model_obj.to(device)
333
- # Move nested submodules (ft/resnet/gnn) to the same device.
334
- if hasattr(model_obj, 'ft'):
335
- model_obj.ft.to(device)
336
- if hasattr(model_obj, 'resnet'):
337
- model_obj.resnet.to(device)
338
- if hasattr(model_obj, 'gnn'):
339
- model_obj.gnn.to(device)
340
-
341
- def _should_use_distributed_optuna(self) -> bool:
342
- if not self.enable_distributed_optuna:
343
- return False
344
- rank_env = os.environ.get("RANK")
345
- world_env = os.environ.get("WORLD_SIZE")
346
- local_env = os.environ.get("LOCAL_RANK")
347
- if rank_env is None or world_env is None or local_env is None:
348
- return False
349
- try:
350
- world_size = int(world_env)
351
- except Exception:
352
- return False
353
- return world_size > 1
354
-
355
- def _distributed_is_main(self) -> bool:
356
- return DistributedUtils.is_main_process()
357
-
358
- def _distributed_send_command(self, payload: Dict[str, Any]) -> None:
359
- if not self._should_use_distributed_optuna() or not self._distributed_is_main():
360
- return
361
- if dist is None:
362
- return
363
- DistributedUtils.setup_ddp()
364
- if not dist.is_initialized():
365
- return
366
- message = [payload]
367
- dist.broadcast_object_list(message, src=0)
368
-
369
- def _distributed_prepare_trial(self, params: Dict[str, Any]) -> None:
370
- if not self._should_use_distributed_optuna():
371
- return
372
- if not self._distributed_is_main():
373
- return
374
- if dist is None:
375
- return
376
- self._distributed_send_command({"type": "RUN", "params": params})
377
- if not dist.is_initialized():
378
- return
379
- # STEP 2 (DDP/Optuna): make sure all ranks start the trial together.
380
- self._dist_barrier("prepare_trial")
381
-
382
- def _distributed_worker_loop(self, objective_fn: Callable[[Optional[optuna.trial.Trial]], float]) -> None:
383
- if dist is None:
384
- print(
385
- f"[Optuna][Worker][{self.label}] torch.distributed unavailable. Worker exit.",
386
- flush=True,
387
- )
388
- return
389
- DistributedUtils.setup_ddp()
390
- if not dist.is_initialized():
391
- print(
392
- f"[Optuna][Worker][{self.label}] DDP init failed. Worker exit.",
393
- flush=True,
394
- )
395
- return
396
- while True:
397
- message = [None]
398
- dist.broadcast_object_list(message, src=0)
399
- payload = message[0]
400
- if not isinstance(payload, dict):
401
- continue
402
- cmd = payload.get("type")
403
- if cmd == "STOP":
404
- best_params = payload.get("best_params")
405
- if best_params is not None:
406
- self.best_params = best_params
407
- break
408
- if cmd == "RUN":
409
- params = payload.get("params") or {}
410
- self._distributed_forced_params = params
411
- # STEP 2 (DDP/Optuna): align worker with rank0 before running objective_fn.
412
- self._dist_barrier("worker_start")
413
- try:
414
- objective_fn(None)
415
- except optuna.TrialPruned:
416
- pass
417
- except Exception as exc:
418
- print(
419
- f"[Optuna][Worker][{self.label}] Exception: {exc}", flush=True)
420
- finally:
421
- self._clean_gpu()
422
- # STEP 2 (DDP/Optuna): align worker with rank0 after objective_fn returns/raises.
423
- self._dist_barrier("worker_end")
424
-
425
- def _distributed_tune(self, max_evals: int, objective_fn: Callable[[optuna.trial.Trial], float]) -> None:
426
- if dist is None:
427
- print(
428
- f"[Optuna][{self.label}] torch.distributed unavailable. Fallback to single-process.",
429
- flush=True,
430
- )
431
- prev = self.enable_distributed_optuna
432
- self.enable_distributed_optuna = False
433
- try:
434
- self.tune(max_evals, objective_fn)
435
- finally:
436
- self.enable_distributed_optuna = prev
437
- return
438
- DistributedUtils.setup_ddp()
439
- if not dist.is_initialized():
440
- rank_env = os.environ.get("RANK", "0")
441
- if str(rank_env) != "0":
442
- print(
443
- f"[Optuna][{self.label}] DDP init failed on worker. Skip.",
444
- flush=True,
445
- )
446
- return
447
- print(
448
- f"[Optuna][{self.label}] DDP init failed. Fallback to single-process.",
449
- flush=True,
450
- )
451
- prev = self.enable_distributed_optuna
452
- self.enable_distributed_optuna = False
453
- try:
454
- self.tune(max_evals, objective_fn)
455
- finally:
456
- self.enable_distributed_optuna = prev
457
- return
458
- if not self._distributed_is_main():
459
- self._distributed_worker_loop(objective_fn)
460
- return
461
-
462
- total_trials = max(1, int(max_evals))
463
- progress_counter = {"count": 0}
464
-
465
- def objective_wrapper(trial: optuna.trial.Trial) -> float:
466
- should_log = True
467
- if should_log:
468
- current_idx = progress_counter["count"] + 1
469
- print(
470
- f"[Optuna][{self.label}] Trial {current_idx}/{total_trials} started "
471
- f"(trial_id={trial.number})."
472
- )
473
- try:
474
- result = objective_fn(trial)
475
- except RuntimeError as exc:
476
- if "out of memory" in str(exc).lower():
477
- print(
478
- f"[Optuna][{self.label}] OOM detected. Pruning trial and clearing CUDA cache."
479
- )
480
- self._clean_gpu()
481
- raise optuna.TrialPruned() from exc
482
- raise
483
- finally:
484
- self._clean_gpu()
485
- if should_log:
486
- progress_counter["count"] = progress_counter["count"] + 1
487
- trial_state = getattr(trial, "state", None)
488
- state_repr = getattr(trial_state, "name", "OK")
489
- print(
490
- f"[Optuna][{self.label}] Trial {progress_counter['count']}/{total_trials} finished "
491
- f"(status={state_repr})."
492
- )
493
- # STEP 2 (DDP/Optuna): a trial-end sync point; debug with BAYESOPT_DDP_BARRIER_DEBUG=1.
494
- self._dist_barrier("trial_end")
495
- return result
496
-
497
- storage_url = self._resolve_optuna_storage_url()
498
- study_name = self._resolve_optuna_study_name()
499
- study_kwargs: Dict[str, Any] = {
500
- "direction": "minimize",
501
- "sampler": optuna.samplers.TPESampler(seed=self.ctx.rand_seed),
502
- }
503
- if storage_url:
504
- study_kwargs.update(
505
- storage=storage_url,
506
- study_name=study_name,
507
- load_if_exists=True,
508
- )
509
- study = optuna.create_study(**study_kwargs)
510
- self.study_name = getattr(study, "study_name", None)
511
-
512
- def checkpoint_callback(check_study: optuna.study.Study, _trial) -> None:
513
- try:
514
- best = getattr(check_study, "best_trial", None)
515
- if best is None:
516
- return
517
- best_params = getattr(best, "params", None)
518
- if not best_params:
519
- return
520
- params_path = self.output.result_path(
521
- f'{self.ctx.model_nme}_bestparams_{self.label.lower()}.csv'
522
- )
523
- pd.DataFrame(best_params, index=[0]).to_csv(
524
- params_path, index=False)
525
- except Exception:
526
- return
527
-
528
- completed_states = (
529
- optuna.trial.TrialState.COMPLETE,
530
- optuna.trial.TrialState.PRUNED,
531
- optuna.trial.TrialState.FAIL,
532
- )
533
- completed = len(study.get_trials(states=completed_states))
534
- progress_counter["count"] = completed
535
- remaining = max(0, total_trials - completed)
536
- try:
537
- if remaining > 0:
538
- study.optimize(
539
- objective_wrapper,
540
- n_trials=remaining,
541
- callbacks=[checkpoint_callback],
542
- )
543
- self.best_params = study.best_params
544
- self.best_trial = study.best_trial
545
- params_path = self.output.result_path(
546
- f'{self.ctx.model_nme}_bestparams_{self.label.lower()}.csv'
547
- )
548
- pd.DataFrame(self.best_params, index=[0]).to_csv(
549
- params_path, index=False)
550
- finally:
551
- self._distributed_send_command(
552
- {"type": "STOP", "best_params": self.best_params})
553
-
554
- def _clean_gpu(self):
555
- gc.collect()
556
- if torch.cuda.is_available():
557
- device = None
558
- try:
559
- device = getattr(self, "device", None)
560
- except Exception:
561
- device = None
562
- if isinstance(device, torch.device):
563
- try:
564
- torch.cuda.set_device(device)
565
- except Exception:
566
- pass
567
- torch.cuda.empty_cache()
568
- do_ipc_collect = os.environ.get("BAYESOPT_CUDA_IPC_COLLECT", "").strip() in {"1", "true", "TRUE", "yes", "YES"}
569
- do_sync = os.environ.get("BAYESOPT_CUDA_SYNC", "").strip() in {"1", "true", "TRUE", "yes", "YES"}
570
- if do_ipc_collect:
571
- torch.cuda.ipc_collect()
572
- if do_sync:
573
- torch.cuda.synchronize()
574
-
575
- def _standardize_fold(self,
576
- X_train: pd.DataFrame,
577
- X_val: pd.DataFrame,
578
- columns: Optional[List[str]] = None
579
- ) -> Tuple[pd.DataFrame, pd.DataFrame, StandardScaler]:
580
- """Fit StandardScaler on the training fold and transform train/val features.
581
-
582
- Args:
583
- X_train: training features.
584
- X_val: validation features.
585
- columns: columns to scale (default: all).
586
-
587
- Returns:
588
- Scaled train/val features and the fitted scaler.
589
- """
590
- scaler = StandardScaler()
591
- cols = list(columns) if columns else list(X_train.columns)
592
- X_train_scaled = X_train.copy(deep=True)
593
- X_val_scaled = X_val.copy(deep=True)
594
- if cols:
595
- scaler.fit(X_train_scaled[cols])
596
- X_train_scaled[cols] = scaler.transform(X_train_scaled[cols])
597
- X_val_scaled[cols] = scaler.transform(X_val_scaled[cols])
598
- return X_train_scaled, X_val_scaled, scaler
599
-
600
- def cross_val_generic(
601
- self,
602
- trial: optuna.trial.Trial,
603
- hyperparameter_space: Dict[str, Callable[[optuna.trial.Trial], Any]],
604
- data_provider: Callable[[], Tuple[pd.DataFrame, pd.Series, Optional[pd.Series]]],
605
- model_builder: Callable[[Dict[str, Any]], Any],
606
- metric_fn: Callable[[pd.Series, np.ndarray, Optional[pd.Series]], float],
607
- sample_limit: Optional[int] = None,
608
- preprocess_fn: Optional[Callable[[
609
- pd.DataFrame, pd.DataFrame], Tuple[pd.DataFrame, pd.DataFrame]]] = None,
610
- fit_predict_fn: Optional[
611
- Callable[[Any, pd.DataFrame, pd.Series, Optional[pd.Series],
612
- pd.DataFrame, pd.Series, Optional[pd.Series],
613
- optuna.trial.Trial], np.ndarray]
614
- ] = None,
615
- cleanup_fn: Optional[Callable[[Any], None]] = None,
616
- splitter: Optional[Iterable[Tuple[np.ndarray, np.ndarray]]] = None) -> float:
617
- """Generic holdout/CV helper to reuse tuning workflows.
618
-
619
- Args:
620
- trial: current Optuna trial.
621
- hyperparameter_space: sampler dict keyed by parameter name.
622
- data_provider: callback returning (X, y, sample_weight).
623
- model_builder: callback to build a model per fold.
624
- metric_fn: loss/score function taking y_true, y_pred, weight.
625
- sample_limit: optional sample cap; random sample if exceeded.
626
- preprocess_fn: optional per-fold preprocessing (X_train, X_val).
627
- fit_predict_fn: optional custom fit/predict logic for validation.
628
- cleanup_fn: optional cleanup callback per fold.
629
- splitter: optional (train_idx, val_idx) iterator; defaults to ShuffleSplit.
630
-
631
- Returns:
632
- Mean validation metric across folds.
633
- """
634
- params: Optional[Dict[str, Any]] = None
635
- if self._distributed_forced_params is not None:
636
- params = self._distributed_forced_params
637
- self._distributed_forced_params = None
638
- else:
639
- if trial is None:
640
- raise RuntimeError(
641
- "Missing Optuna trial for parameter sampling.")
642
- params = {name: sampler(trial)
643
- for name, sampler in hyperparameter_space.items()}
644
- if self._should_use_distributed_optuna():
645
- self._distributed_prepare_trial(params)
646
- X_all, y_all, w_all = data_provider()
647
- if sample_limit is not None and len(X_all) > sample_limit:
648
- sampled_idx = X_all.sample(
649
- n=sample_limit,
650
- random_state=self.ctx.rand_seed
651
- ).index
652
- X_all = X_all.loc[sampled_idx]
653
- y_all = y_all.loc[sampled_idx]
654
- w_all = w_all.loc[sampled_idx] if w_all is not None else None
655
-
656
- split_iter = splitter or ShuffleSplit(
657
- n_splits=int(1 / self.ctx.prop_test),
658
- test_size=self.ctx.prop_test,
659
- random_state=self.ctx.rand_seed
660
- ).split(X_all)
661
-
662
- losses: List[float] = []
663
- for train_idx, val_idx in split_iter:
664
- X_train = X_all.iloc[train_idx]
665
- y_train = y_all.iloc[train_idx]
666
- X_val = X_all.iloc[val_idx]
667
- y_val = y_all.iloc[val_idx]
668
- w_train = w_all.iloc[train_idx] if w_all is not None else None
669
- w_val = w_all.iloc[val_idx] if w_all is not None else None
670
-
671
- if preprocess_fn:
672
- X_train, X_val = preprocess_fn(X_train, X_val)
673
-
674
- model = model_builder(params)
675
- try:
676
- if fit_predict_fn:
677
- y_pred = fit_predict_fn(
678
- model, X_train, y_train, w_train,
679
- X_val, y_val, w_val, trial
680
- )
681
- else:
682
- fit_kwargs = {}
683
- if w_train is not None:
684
- fit_kwargs["sample_weight"] = w_train
685
- model.fit(X_train, y_train, **fit_kwargs)
686
- y_pred = model.predict(X_val)
687
- losses.append(metric_fn(y_val, y_pred, w_val))
688
- finally:
689
- if cleanup_fn:
690
- cleanup_fn(model)
691
- self._clean_gpu()
692
-
693
- return float(np.mean(losses))
694
-
695
- # Prediction + caching logic.
696
- def _predict_and_cache(self,
697
- model,
698
- pred_prefix: str,
699
- use_oht: bool = False,
700
- design_fn=None,
701
- predict_kwargs_train: Optional[Dict[str, Any]] = None,
702
- predict_kwargs_test: Optional[Dict[str, Any]] = None,
703
- predict_fn: Optional[Callable[..., Any]] = None) -> None:
704
- if design_fn:
705
- X_train = design_fn(train=True)
706
- X_test = design_fn(train=False)
707
- elif use_oht:
708
- X_train = self.ctx.train_oht_scl_data[self.ctx.var_nmes]
709
- X_test = self.ctx.test_oht_scl_data[self.ctx.var_nmes]
710
- else:
711
- X_train = self.ctx.train_data[self.ctx.factor_nmes]
712
- X_test = self.ctx.test_data[self.ctx.factor_nmes]
713
-
714
- predictor = predict_fn or model.predict
715
- preds_train = predictor(X_train, **(predict_kwargs_train or {}))
716
- preds_test = predictor(X_test, **(predict_kwargs_test or {}))
717
- preds_train = np.asarray(preds_train)
718
- preds_test = np.asarray(preds_test)
719
-
720
- if preds_train.ndim <= 1 or (preds_train.ndim == 2 and preds_train.shape[1] == 1):
721
- col_name = f'pred_{pred_prefix}'
722
- self.ctx.train_data[col_name] = preds_train.reshape(-1)
723
- self.ctx.test_data[col_name] = preds_test.reshape(-1)
724
- self.ctx.train_data[f'w_{col_name}'] = (
725
- self.ctx.train_data[col_name] *
726
- self.ctx.train_data[self.ctx.weight_nme]
727
- )
728
- self.ctx.test_data[f'w_{col_name}'] = (
729
- self.ctx.test_data[col_name] *
730
- self.ctx.test_data[self.ctx.weight_nme]
731
- )
732
- return
733
-
734
- # Vector outputs (e.g., embeddings) are expanded into pred_<prefix>_0.. columns.
735
- if preds_train.ndim != 2:
736
- raise ValueError(
737
- f"Unexpected prediction shape for '{pred_prefix}': {preds_train.shape}")
738
- if preds_test.ndim != 2 or preds_test.shape[1] != preds_train.shape[1]:
739
- raise ValueError(
740
- f"Train/test prediction dims mismatch for '{pred_prefix}': "
741
- f"{preds_train.shape} vs {preds_test.shape}")
742
- for j in range(preds_train.shape[1]):
743
- col_name = f'pred_{pred_prefix}_{j}'
744
- self.ctx.train_data[col_name] = preds_train[:, j]
745
- self.ctx.test_data[col_name] = preds_test[:, j]
746
-
747
- def _cache_predictions(self,
748
- pred_prefix: str,
749
- preds_train,
750
- preds_test) -> None:
751
- preds_train = np.asarray(preds_train)
752
- preds_test = np.asarray(preds_test)
753
- if preds_train.ndim <= 1 or (preds_train.ndim == 2 and preds_train.shape[1] == 1):
754
- if preds_test.ndim > 1:
755
- preds_test = preds_test.reshape(-1)
756
- col_name = f'pred_{pred_prefix}'
757
- self.ctx.train_data[col_name] = preds_train.reshape(-1)
758
- self.ctx.test_data[col_name] = preds_test.reshape(-1)
759
- self.ctx.train_data[f'w_{col_name}'] = (
760
- self.ctx.train_data[col_name] *
761
- self.ctx.train_data[self.ctx.weight_nme]
762
- )
763
- self.ctx.test_data[f'w_{col_name}'] = (
764
- self.ctx.test_data[col_name] *
765
- self.ctx.test_data[self.ctx.weight_nme]
766
- )
767
- return
768
-
769
- if preds_train.ndim != 2:
770
- raise ValueError(
771
- f"Unexpected prediction shape for '{pred_prefix}': {preds_train.shape}")
772
- if preds_test.ndim != 2 or preds_test.shape[1] != preds_train.shape[1]:
773
- raise ValueError(
774
- f"Train/test prediction dims mismatch for '{pred_prefix}': "
775
- f"{preds_train.shape} vs {preds_test.shape}")
776
- for j in range(preds_train.shape[1]):
777
- col_name = f'pred_{pred_prefix}_{j}'
778
- self.ctx.train_data[col_name] = preds_train[:, j]
779
- self.ctx.test_data[col_name] = preds_test[:, j]
780
-
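
As a quick illustration of the caching convention above (shapes are hypothetical): a 1-D prediction cached under prefix "gnn" produces a `pred_gnn` column plus a weight-multiplied `w_pred_gnn` column in both train and test frames, whereas a 2-D output of shape (n, 3) produces `pred_gnn_0`, `pred_gnn_1`, `pred_gnn_2` with no weighted copies.
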
781
- def _resolve_best_epoch(self,
782
- history: Optional[Dict[str, List[float]]],
783
- default_epochs: int) -> int:
784
- if not history:
785
- return max(1, int(default_epochs))
786
- vals = history.get("val") or []
787
- if not vals:
788
- return max(1, int(default_epochs))
789
- best_idx = int(np.nanargmin(vals))
790
- return max(1, best_idx + 1)
791
-
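
A worked example of the best-epoch rule above, using a made-up validation history:

    # Loss is lowest at index 1, so epoch 2 is kept.
    history = {"val": [0.52, 0.47, 0.50]}
    # int(np.nanargmin(history["val"])) == 1  ->  returned best epoch = 1 + 1 = 2
    # An empty or missing history falls back to max(1, default_epochs).
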
792
- def _fit_predict_cache(self,
793
- model,
794
- X_train,
795
- y_train,
796
- sample_weight,
797
- pred_prefix: str,
798
- use_oht: bool = False,
799
- design_fn=None,
800
- fit_kwargs: Optional[Dict[str, Any]] = None,
801
- sample_weight_arg: Optional[str] = 'sample_weight',
802
- predict_kwargs_train: Optional[Dict[str, Any]] = None,
803
- predict_kwargs_test: Optional[Dict[str, Any]] = None,
804
- predict_fn: Optional[Callable[..., Any]] = None,
805
- record_label: bool = True) -> None:
806
- fit_kwargs = fit_kwargs.copy() if fit_kwargs else {}
807
- if sample_weight is not None and sample_weight_arg:
808
- fit_kwargs.setdefault(sample_weight_arg, sample_weight)
809
- model.fit(X_train, y_train, **fit_kwargs)
810
- if record_label:
811
- self.ctx.model_label.append(self.label)
812
- self._predict_and_cache(
813
- model,
814
- pred_prefix,
815
- use_oht=use_oht,
816
- design_fn=design_fn,
817
- predict_kwargs_train=predict_kwargs_train,
818
- predict_kwargs_test=predict_kwargs_test,
819
- predict_fn=predict_fn)
820
-
821
-
822
- class GNNTrainer(TrainerBase):
823
- def __init__(self, context: "BayesOptModel") -> None:
824
- super().__init__(context, 'GNN', 'GNN')
825
- self.model: Optional[GraphNeuralNetSklearn] = None
826
- self.enable_distributed_optuna = bool(context.config.use_gnn_ddp)
827
-
828
- def _build_model(self, params: Optional[Dict[str, Any]] = None) -> GraphNeuralNetSklearn:
829
- params = params or {}
830
- base_tw_power = self.ctx.default_tweedie_power()
831
- model = GraphNeuralNetSklearn(
832
- model_nme=f"{self.ctx.model_nme}_gnn",
833
- input_dim=len(self.ctx.var_nmes),
834
- hidden_dim=int(params.get("hidden_dim", 64)),
835
- num_layers=int(params.get("num_layers", 2)),
836
- k_neighbors=int(params.get("k_neighbors", 10)),
837
- dropout=float(params.get("dropout", 0.1)),
838
- learning_rate=float(params.get("learning_rate", 1e-3)),
839
- epochs=int(params.get("epochs", self.ctx.epochs)),
840
- patience=int(params.get("patience", 5)),
841
- task_type=self.ctx.task_type,
842
- tweedie_power=float(params.get("tw_power", base_tw_power or 1.5)),
843
- weight_decay=float(params.get("weight_decay", 0.0)),
844
- use_data_parallel=bool(self.ctx.config.use_gnn_data_parallel),
845
- use_ddp=bool(self.ctx.config.use_gnn_ddp),
846
- use_approx_knn=bool(self.ctx.config.gnn_use_approx_knn),
847
- approx_knn_threshold=int(self.ctx.config.gnn_approx_knn_threshold),
848
- graph_cache_path=self.ctx.config.gnn_graph_cache,
849
- max_gpu_knn_nodes=self.ctx.config.gnn_max_gpu_knn_nodes,
850
- knn_gpu_mem_ratio=float(self.ctx.config.gnn_knn_gpu_mem_ratio),
851
- knn_gpu_mem_overhead=float(
852
- self.ctx.config.gnn_knn_gpu_mem_overhead),
853
- )
854
- return model
855
-
856
- def cross_val(self, trial: optuna.trial.Trial) -> float:
857
- base_tw_power = self.ctx.default_tweedie_power()
858
- metric_ctx: Dict[str, Any] = {}
859
-
860
- def data_provider():
861
- data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
862
- assert data is not None, "Preprocessed training data is missing."
863
- return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
864
-
865
- def model_builder(params: Dict[str, Any]):
866
- tw_power = params.get("tw_power", base_tw_power)
867
- metric_ctx["tw_power"] = tw_power
868
- return self._build_model(params)
869
-
870
- def preprocess_fn(X_train, X_val):
871
- X_train_s, X_val_s, _ = self._standardize_fold(
872
- X_train, X_val, self.ctx.num_features)
873
- return X_train_s, X_val_s
874
-
875
- def fit_predict(model, X_train, y_train, w_train, X_val, y_val, w_val, trial_obj):
876
- model.fit(
877
- X_train,
878
- y_train,
879
- w_train=w_train,
880
- X_val=X_val,
881
- y_val=y_val,
882
- w_val=w_val,
883
- trial=trial_obj,
884
- )
885
- return model.predict(X_val)
886
-
887
- def metric_fn(y_true, y_pred, weight):
888
- if self.ctx.task_type == 'classification':
889
- y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
890
- return log_loss(y_true, y_pred_clipped, sample_weight=weight)
891
- y_pred_safe = np.maximum(y_pred, EPS)
892
- power = metric_ctx.get("tw_power", base_tw_power or 1.5)
893
- return mean_tweedie_deviance(
894
- y_true,
895
- y_pred_safe,
896
- sample_weight=weight,
897
- power=power,
898
- )
899
-
900
- # Keep GNN BO lightweight: sample during CV, use full data for final training.
901
- X_cap = data_provider()[0]
902
- sample_limit = min(200000, len(X_cap)) if len(X_cap) > 200000 else None
903
-
904
- param_space: Dict[str, Callable[[optuna.trial.Trial], Any]] = {
905
- "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-4, 5e-3, log=True),
906
- "hidden_dim": lambda t: t.suggest_int('hidden_dim', 16, 128, step=16),
907
- "num_layers": lambda t: t.suggest_int('num_layers', 1, 4),
908
- "k_neighbors": lambda t: t.suggest_int('k_neighbors', 5, 30),
909
- "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.3),
910
- "weight_decay": lambda t: t.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
911
- }
912
- if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
913
- param_space["tw_power"] = lambda t: t.suggest_float(
914
- 'tw_power', 1.0, 2.0)
915
-
916
- return self.cross_val_generic(
917
- trial=trial,
918
- hyperparameter_space=param_space,
919
- data_provider=data_provider,
920
- model_builder=model_builder,
921
- metric_fn=metric_fn,
922
- sample_limit=sample_limit,
923
- preprocess_fn=preprocess_fn,
924
- fit_predict_fn=fit_predict,
925
- cleanup_fn=lambda m: getattr(
926
- getattr(m, "gnn", None), "to", lambda *_args, **_kwargs: None)("cpu")
927
- )
928
-
929
- def train(self) -> None:
930
- if not self.best_params:
931
- raise RuntimeError("Run tune() first to obtain best GNN parameters.")
932
-
933
- data = self.ctx.train_oht_scl_data
934
- assert data is not None, "Preprocessed training data is missing."
935
- X_all = data[self.ctx.var_nmes]
936
- y_all = data[self.ctx.resp_nme]
937
- w_all = data[self.ctx.weight_nme]
938
-
939
- use_refit = bool(getattr(self.ctx.config, "final_refit", True))
940
- refit_epochs = None
941
-
942
- if 0.0 < float(self.ctx.prop_test) < 1.0 and len(X_all) >= 10:
943
- splitter = ShuffleSplit(
944
- n_splits=1,
945
- test_size=self.ctx.prop_test,
946
- random_state=self.ctx.rand_seed,
947
- )
948
- train_idx, val_idx = next(splitter.split(X_all))
949
- X_train = X_all.iloc[train_idx]
950
- y_train = y_all.iloc[train_idx]
951
- w_train = w_all.iloc[train_idx]
952
- X_val = X_all.iloc[val_idx]
953
- y_val = y_all.iloc[val_idx]
954
- w_val = w_all.iloc[val_idx]
955
-
956
- if use_refit:
957
- tmp_model = self._build_model(self.best_params)
958
- tmp_model.fit(
959
- X_train,
960
- y_train,
961
- w_train=w_train,
962
- X_val=X_val,
963
- y_val=y_val,
964
- w_val=w_val,
965
- trial=None,
966
- )
967
- refit_epochs = int(getattr(tmp_model, "best_epoch", None) or self.ctx.epochs)
968
- getattr(getattr(tmp_model, "gnn", None), "to",
969
- lambda *_args, **_kwargs: None)("cpu")
970
- self._clean_gpu()
971
- else:
972
- self.model = self._build_model(self.best_params)
973
- self.model.fit(
974
- X_train,
975
- y_train,
976
- w_train=w_train,
977
- X_val=X_val,
978
- y_val=y_val,
979
- w_val=w_val,
980
- trial=None,
981
- )
982
- else:
983
- use_refit = False
984
-
985
- if use_refit:
986
- self.model = self._build_model(self.best_params)
987
- if refit_epochs is not None:
988
- self.model.epochs = int(refit_epochs)
989
- self.model.fit(
990
- X_all,
991
- y_all,
992
- w_train=w_all,
993
- X_val=None,
994
- y_val=None,
995
- w_val=None,
996
- trial=None,
997
- )
998
- elif self.model is None:
999
- self.model = self._build_model(self.best_params)
1000
- self.model.fit(
1001
- X_all,
1002
- y_all,
1003
- w_train=w_all,
1004
- X_val=None,
1005
- y_val=None,
1006
- w_val=None,
1007
- trial=None,
1008
- )
1009
- self.ctx.model_label.append(self.label)
1010
- self._predict_and_cache(self.model, pred_prefix='gnn', use_oht=True)
1011
- self.ctx.gnn_best = self.model
1012
-
1013
- # If geo_feature_nmes is set, refresh geo tokens for FT input.
1014
- if self.ctx.config.geo_feature_nmes:
1015
- self.prepare_geo_tokens(force=True)
1016
-
1017
- def ensemble_predict(self, k: int) -> None:
1018
- if not self.best_params:
1019
- raise RuntimeError("Run tune() first to obtain best GNN parameters.")
1020
- data = self.ctx.train_oht_scl_data
1021
- test_data = self.ctx.test_oht_scl_data
1022
- if data is None or test_data is None:
1023
- raise RuntimeError("Missing standardized data for GNN ensemble.")
1024
- X_all = data[self.ctx.var_nmes]
1025
- y_all = data[self.ctx.resp_nme]
1026
- w_all = data[self.ctx.weight_nme]
1027
- X_test = test_data[self.ctx.var_nmes]
1028
-
1029
- k = max(2, int(k))
1030
- n_samples = len(X_all)
1031
- if n_samples < k:
1032
- print(
1033
- f"[GNN Ensemble] n_samples={n_samples} < k={k}; skip ensemble.",
1034
- flush=True,
1035
- )
1036
- return
1037
-
1038
- splitter = KFold(
1039
- n_splits=k,
1040
- shuffle=True,
1041
- random_state=self.ctx.rand_seed,
1042
- )
1043
- preds_train_sum = np.zeros(n_samples, dtype=np.float64)
1044
- preds_test_sum = np.zeros(len(X_test), dtype=np.float64)
1045
-
1046
- for train_idx, val_idx in splitter.split(X_all):
1047
- model = self._build_model(self.best_params)
1048
- model.fit(
1049
- X_all.iloc[train_idx],
1050
- y_all.iloc[train_idx],
1051
- w_train=w_all.iloc[train_idx],
1052
- X_val=X_all.iloc[val_idx],
1053
- y_val=y_all.iloc[val_idx],
1054
- w_val=w_all.iloc[val_idx],
1055
- trial=None,
1056
- )
1057
- pred_train = model.predict(X_all)
1058
- pred_test = model.predict(X_test)
1059
- preds_train_sum += np.asarray(pred_train, dtype=np.float64)
1060
- preds_test_sum += np.asarray(pred_test, dtype=np.float64)
1061
- getattr(getattr(model, "gnn", None), "to",
1062
- lambda *_args, **_kwargs: None)("cpu")
1063
- self._clean_gpu()
1064
-
1065
- preds_train = preds_train_sum / float(k)
1066
- preds_test = preds_test_sum / float(k)
1067
- self._cache_predictions("gnn", preds_train, preds_test)
1068
-
1069
- def prepare_geo_tokens(self, force: bool = False) -> None:
1070
- """Train/update the GNN encoder for geo tokens and inject them into FT input."""
1071
- geo_cols = list(self.ctx.config.geo_feature_nmes or [])
1072
- if not geo_cols:
1073
- return
1074
- if (not force) and self.ctx.train_geo_tokens is not None and self.ctx.test_geo_tokens is not None:
1075
- return
1076
-
1077
- result = self.ctx._build_geo_tokens()
1078
- if result is None:
1079
- return
1080
- train_tokens, test_tokens, cols, geo_gnn = result
1081
- self.ctx.train_geo_tokens = train_tokens
1082
- self.ctx.test_geo_tokens = test_tokens
1083
- self.ctx.geo_token_cols = cols
1084
- self.ctx.geo_gnn_model = geo_gnn
1085
- print(f"[GeoToken][GNNTrainer] Generated {len(cols)} dims and injected into FT.", flush=True)
1086
-
1087
- def save(self) -> None:
1088
- if self.model is None:
1089
- print(f"[save] Warning: No model to save for {self.label}")
1090
- return
1091
- path = self.output.model_path(self._get_model_filename())
1092
- base_gnn = getattr(self.model, "_unwrap_gnn", lambda: None)()
1093
- state = None if base_gnn is None else base_gnn.state_dict()
1094
- payload = {
1095
- "best_params": self.best_params,
1096
- "state_dict": state,
1097
- }
1098
- torch.save(payload, path)
1099
-
1100
- def load(self) -> None:
1101
- path = self.output.model_path(self._get_model_filename())
1102
- if not os.path.exists(path):
1103
- print(f"[load] Warning: Model file not found: {path}")
1104
- return
1105
- payload = torch.load(path, map_location='cpu')
1106
- if not isinstance(payload, dict):
1107
- raise ValueError(f"Invalid GNN checkpoint: {path}")
1108
- params = payload.get("best_params") or {}
1109
- state_dict = payload.get("state_dict")
1110
- model = self._build_model(params)
1111
- if params:
1112
- model.set_params(dict(params))
1113
- base_gnn = getattr(model, "_unwrap_gnn", lambda: None)()
1114
- if base_gnn is not None and state_dict is not None:
1115
- base_gnn.load_state_dict(state_dict, strict=False)
1116
- self.model = model
1117
- self.best_params = dict(params) if isinstance(params, dict) else None
1118
- self.ctx.gnn_best = self.model
1119
-
1120
-
1121
- class XGBTrainer(TrainerBase):
1122
- def __init__(self, context: "BayesOptModel") -> None:
1123
- super().__init__(context, 'Xgboost', 'Xgboost')
1124
- self.model: Optional[xgb.XGBModel] = None
1125
- self._xgb_use_gpu = False
1126
- self._xgb_gpu_warned = False
1127
-
1128
- def _build_estimator(self) -> xgb.XGBModel:
1129
- use_gpu = bool(self.ctx.use_gpu and _xgb_cuda_available())
1130
- self._xgb_use_gpu = use_gpu
1131
- params = dict(
1132
- objective=self.ctx.obj,
1133
- random_state=self.ctx.rand_seed,
1134
- subsample=0.9,
1135
- tree_method='gpu_hist' if use_gpu else 'hist',
1136
- enable_categorical=True,
1137
- predictor='gpu_predictor' if use_gpu else 'cpu_predictor'
1138
- )
1139
- if self.ctx.use_gpu and not use_gpu and not self._xgb_gpu_warned:
1140
- print(
1141
- "[XGBoost] CUDA requested but not available; falling back to CPU.",
1142
- flush=True,
1143
- )
1144
- self._xgb_gpu_warned = True
1145
- if use_gpu:
1146
- params['gpu_id'] = 0
1147
- print(f">>> XGBoost using GPU ID: 0 (Single GPU Mode)")
1148
- if self.ctx.task_type == 'classification':
1149
- params.setdefault("eval_metric", "logloss")
1150
- return xgb.XGBClassifier(**params)
1151
- return xgb.XGBRegressor(**params)
1152
-
1153
- def _resolve_early_stopping_rounds(self, n_estimators: int) -> int:
1154
- n_estimators = max(1, int(n_estimators))
1155
- base = max(5, n_estimators // 10)
1156
- return min(50, base)
1157
-
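
A quick worked example of the early-stopping heuristic above (estimator counts are illustrative):

    # n_estimators=40   -> max(5, 40 // 10)   = 5 rounds
    # n_estimators=300  -> max(5, 300 // 10)  = 30 rounds
    # n_estimators=1000 -> max(5, 1000 // 10) = 100, capped at 50 rounds
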
1158
- def _build_fit_kwargs(self,
1159
- w_train,
1160
- X_val=None,
1161
- y_val=None,
1162
- w_val=None,
1163
- n_estimators: Optional[int] = None) -> Dict[str, Any]:
1164
- fit_kwargs = dict(self.ctx.fit_params or {})
1165
- fit_kwargs.pop("sample_weight", None)
1166
- fit_kwargs["sample_weight"] = w_train
1167
-
1168
- if "eval_set" not in fit_kwargs and X_val is not None and y_val is not None:
1169
- fit_kwargs["eval_set"] = [(X_val, y_val)]
1170
- if w_val is not None:
1171
- fit_kwargs["sample_weight_eval_set"] = [w_val]
1172
-
1173
- if "eval_metric" not in fit_kwargs:
1174
- fit_kwargs["eval_metric"] = "logloss" if self.ctx.task_type == 'classification' else "rmse"
1175
-
1176
- if "early_stopping_rounds" not in fit_kwargs and "eval_set" in fit_kwargs:
1177
- rounds = self._resolve_early_stopping_rounds(n_estimators or 100)
1178
- fit_kwargs["early_stopping_rounds"] = rounds
1179
-
1180
- fit_kwargs.setdefault("verbose", False)
1181
- return fit_kwargs
1182
-
1183
- def ensemble_predict(self, k: int) -> None:
1184
- if not self.best_params:
1185
- raise RuntimeError("Run tune() first to obtain best XGB parameters.")
1186
- k = max(2, int(k))
1187
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
1188
- y_all = self.ctx.train_data[self.ctx.resp_nme].values
1189
- w_all = self.ctx.train_data[self.ctx.weight_nme].values
1190
- X_test = self.ctx.test_data[self.ctx.factor_nmes]
1191
- n_samples = len(X_all)
1192
- if n_samples < k:
1193
- print(
1194
- f"[XGB Ensemble] n_samples={n_samples} < k={k}; skip ensemble.",
1195
- flush=True,
1196
- )
1197
- return
1198
-
1199
- splitter = KFold(
1200
- n_splits=k,
1201
- shuffle=True,
1202
- random_state=self.ctx.rand_seed,
1203
- )
1204
- preds_train_sum = np.zeros(n_samples, dtype=np.float64)
1205
- preds_test_sum = np.zeros(len(X_test), dtype=np.float64)
1206
-
1207
- for train_idx, val_idx in splitter.split(X_all):
1208
- X_train = X_all.iloc[train_idx]
1209
- y_train = y_all[train_idx]
1210
- w_train = w_all[train_idx]
1211
- X_val = X_all.iloc[val_idx]
1212
- y_val = y_all[val_idx]
1213
- w_val = w_all[val_idx]
1214
-
1215
- clf = self._build_estimator()
1216
- clf.set_params(**self.best_params)
1217
- fit_kwargs = self._build_fit_kwargs(
1218
- w_train=w_train,
1219
- X_val=X_val,
1220
- y_val=y_val,
1221
- w_val=w_val,
1222
- n_estimators=self.best_params.get("n_estimators", 100),
1223
- )
1224
- clf.fit(X_train, y_train, **fit_kwargs)
1225
-
1226
- if self.ctx.task_type == 'classification':
1227
- pred_train = clf.predict_proba(X_all)[:, 1]
1228
- pred_test = clf.predict_proba(X_test)[:, 1]
1229
- else:
1230
- pred_train = clf.predict(X_all)
1231
- pred_test = clf.predict(X_test)
1232
- preds_train_sum += np.asarray(pred_train, dtype=np.float64)
1233
- preds_test_sum += np.asarray(pred_test, dtype=np.float64)
1234
- self._clean_gpu()
1235
-
1236
- preds_train = preds_train_sum / float(k)
1237
- preds_test = preds_test_sum / float(k)
1238
- self._cache_predictions("xgb", preds_train, preds_test)
1239
-
1240
- def cross_val(self, trial: optuna.trial.Trial) -> float:
1241
- learning_rate = trial.suggest_float(
1242
- 'learning_rate', 1e-5, 1e-1, log=True)
1243
- gamma = trial.suggest_float('gamma', 0, 10000)
1244
- max_depth_max = max(
1245
- 3, int(getattr(self.config, "xgb_max_depth_max", 25)))
1246
- n_estimators_max = max(
1247
- 10, int(getattr(self.config, "xgb_n_estimators_max", 500)))
1248
- max_depth = trial.suggest_int('max_depth', 3, max_depth_max)
1249
- n_estimators = trial.suggest_int(
1250
- 'n_estimators', 10, n_estimators_max, step=10)
1251
- min_child_weight = trial.suggest_int(
1252
- 'min_child_weight', 100, 10000, step=100)
1253
- reg_alpha = trial.suggest_float('reg_alpha', 1e-10, 1, log=True)
1254
- reg_lambda = trial.suggest_float('reg_lambda', 1e-10, 1, log=True)
1255
- if trial is not None:
1256
- print(
1257
- f"[Optuna][Xgboost] trial_id={trial.number} max_depth={max_depth} "
1258
- f"n_estimators={n_estimators}",
1259
- flush=True,
1260
- )
1261
- if max_depth >= 20 and n_estimators >= 300:
1262
- raise optuna.TrialPruned(
1263
- "XGB config is likely too slow (max_depth>=20 & n_estimators>=300)")
1264
- clf = self._build_estimator()
1265
- params = {
1266
- 'learning_rate': learning_rate,
1267
- 'gamma': gamma,
1268
- 'max_depth': max_depth,
1269
- 'n_estimators': n_estimators,
1270
- 'min_child_weight': min_child_weight,
1271
- 'reg_alpha': reg_alpha,
1272
- 'reg_lambda': reg_lambda
1273
- }
1274
- tweedie_variance_power = None
1275
- if self.ctx.task_type != 'classification':
1276
- if self.ctx.obj == 'reg:tweedie':
1277
- tweedie_variance_power = trial.suggest_float(
1278
- 'tweedie_variance_power', 1, 2)
1279
- params['tweedie_variance_power'] = tweedie_variance_power
1280
- elif self.ctx.obj == 'count:poisson':
1281
- tweedie_variance_power = 1
1282
- elif self.ctx.obj == 'reg:gamma':
1283
- tweedie_variance_power = 2
1284
- else:
1285
- tweedie_variance_power = 1.5
1286
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
1287
- y_all = self.ctx.train_data[self.ctx.resp_nme].values
1288
- w_all = self.ctx.train_data[self.ctx.weight_nme].values
1289
-
1290
- losses: List[float] = []
1291
- for train_idx, val_idx in self.ctx.cv.split(X_all):
1292
- X_train = X_all.iloc[train_idx]
1293
- y_train = y_all[train_idx]
1294
- w_train = w_all[train_idx]
1295
- X_val = X_all.iloc[val_idx]
1296
- y_val = y_all[val_idx]
1297
- w_val = w_all[val_idx]
1298
-
1299
- clf = self._build_estimator()
1300
- clf.set_params(**params)
1301
- fit_kwargs = self._build_fit_kwargs(
1302
- w_train=w_train,
1303
- X_val=X_val,
1304
- y_val=y_val,
1305
- w_val=w_val,
1306
- n_estimators=n_estimators,
1307
- )
1308
- clf.fit(X_train, y_train, **fit_kwargs)
1309
-
1310
- if self.ctx.task_type == 'classification':
1311
- y_pred = clf.predict_proba(X_val)[:, 1]
1312
- y_pred = np.clip(y_pred, EPS, 1 - EPS)
1313
- loss = log_loss(y_val, y_pred, sample_weight=w_val)
1314
- else:
1315
- y_pred = clf.predict(X_val)
1316
- y_pred_safe = np.maximum(y_pred, EPS)
1317
- loss = mean_tweedie_deviance(
1318
- y_val,
1319
- y_pred_safe,
1320
- sample_weight=w_val,
1321
- power=tweedie_variance_power,
1322
- )
1323
- losses.append(float(loss))
1324
- self._clean_gpu()
1325
-
1326
- return float(np.mean(losses))
1327
-
1328
- def train(self) -> None:
1329
- if not self.best_params:
1330
- raise RuntimeError("Run tune() first to obtain best XGB parameters.")
1331
- self.model = self._build_estimator()
1332
- self.model.set_params(**self.best_params)
1333
- use_refit = bool(getattr(self.ctx.config, "final_refit", True))
1334
- predict_fn = None
1335
- if self.ctx.task_type == 'classification':
1336
- def _predict_proba(X, **_kwargs):
1337
- return self.model.predict_proba(X)[:, 1]
1338
- predict_fn = _predict_proba
1339
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
1340
- y_all = self.ctx.train_data[self.ctx.resp_nme].values
1341
- w_all = self.ctx.train_data[self.ctx.weight_nme].values
1342
-
1343
- use_split = 0.0 < float(self.ctx.prop_test) < 1.0 and len(X_all) >= 10
1344
- if use_split:
1345
- splitter = ShuffleSplit(
1346
- n_splits=1,
1347
- test_size=self.ctx.prop_test,
1348
- random_state=self.ctx.rand_seed,
1349
- )
1350
- train_idx, val_idx = next(splitter.split(X_all))
1351
- X_train = X_all.iloc[train_idx]
1352
- y_train = y_all[train_idx]
1353
- w_train = w_all[train_idx]
1354
- X_val = X_all.iloc[val_idx]
1355
- y_val = y_all[val_idx]
1356
- w_val = w_all[val_idx]
1357
- fit_kwargs = self._build_fit_kwargs(
1358
- w_train=w_train,
1359
- X_val=X_val,
1360
- y_val=y_val,
1361
- w_val=w_val,
1362
- n_estimators=self.best_params.get("n_estimators", 100),
1363
- )
1364
- self.model.fit(X_train, y_train, **fit_kwargs)
1365
- best_iter = getattr(self.model, "best_iteration", None)
1366
- if use_refit and best_iter is not None:
1367
- refit_model = self._build_estimator()
1368
- refit_params = dict(self.best_params)
1369
- refit_params["n_estimators"] = int(best_iter) + 1
1370
- refit_model.set_params(**refit_params)
1371
- refit_kwargs = dict(self.ctx.fit_params or {})
1372
- refit_kwargs.setdefault("sample_weight", w_all)
1373
- refit_kwargs.pop("eval_set", None)
1374
- refit_kwargs.pop("sample_weight_eval_set", None)
1375
- refit_kwargs.pop("early_stopping_rounds", None)
1376
- refit_kwargs.pop("eval_metric", None)
1377
- refit_kwargs.setdefault("verbose", False)
1378
- refit_model.fit(X_all, y_all, **refit_kwargs)
1379
- self.model = refit_model
1380
- else:
1381
- fit_kwargs = dict(self.ctx.fit_params or {})
1382
- fit_kwargs.setdefault("sample_weight", w_all)
1383
- self.model.fit(X_all, y_all, **fit_kwargs)
1384
-
1385
- self.ctx.model_label.append(self.label)
1386
- self._predict_and_cache(
1387
- self.model,
1388
- pred_prefix='xgb',
1389
- predict_fn=predict_fn
1390
- )
1391
- self.ctx.xgb_best = self.model
1392
-
1393
-
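The train() method above fits against a holdout with early stopping and, when final_refit is enabled, refits on every row using best_iteration + 1 trees. A compact sketch of that two-stage pattern, assuming xgboost >= 1.6 (where early_stopping_rounds is a constructor argument); the synthetic data and hyperparameters are illustrative:

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 8))
y = np.exp(X[:, 0]) + rng.gamma(shape=2.0, size=5000)

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Stage 1: early stopping on a holdout picks the effective number of trees.
model = xgb.XGBRegressor(
    n_estimators=1000, learning_rate=0.05,
    objective="reg:tweedie", tweedie_variance_power=1.5,
    early_stopping_rounds=20)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
best_iter = model.best_iteration

# Stage 2: refit on the full data with exactly best_iteration + 1 trees
# and no eval_set, mirroring the final_refit branch above.
refit = xgb.XGBRegressor(
    n_estimators=int(best_iter) + 1, learning_rate=0.05,
    objective="reg:tweedie", tweedie_variance_power=1.5)
refit.fit(X, y, verbose=False)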
1394
- class GLMTrainer(TrainerBase):
1395
- def __init__(self, context: "BayesOptModel") -> None:
1396
- super().__init__(context, 'GLM', 'GLM')
1397
- self.model = None
1398
-
1399
- def _select_family(self, tweedie_power: Optional[float] = None):
1400
- if self.ctx.task_type == 'classification':
1401
- return sm.families.Binomial()
1402
- if self.ctx.obj == 'count:poisson':
1403
- return sm.families.Poisson()
1404
- if self.ctx.obj == 'reg:gamma':
1405
- return sm.families.Gamma()
1406
- power = tweedie_power if tweedie_power is not None else 1.5
1407
- return sm.families.Tweedie(var_power=power, link=sm.families.links.log())
1408
-
1409
- def _prepare_design(self, data: pd.DataFrame) -> pd.DataFrame:
1410
- # Add intercept to the statsmodels design matrix.
1411
- X = data[self.ctx.var_nmes]
1412
- return sm.add_constant(X, has_constant='add')
1413
-
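The GLM methods above build a design matrix with an explicit intercept, map the objective to a statsmodels family, and fit with an elastic-net penalty. A minimal sketch of that flow on synthetic data; the Poisson-generated response and the penalty values are purely illustrative:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(1000, 3)), columns=["x1", "x2", "x3"])
mu = np.exp(0.3 * X["x1"] - 0.2 * X["x2"])
y = rng.poisson(mu)            # stand-in for a Tweedie-like response
w = np.ones(len(X))            # exposure / frequency weights

design = sm.add_constant(X, has_constant="add")          # explicit intercept
family = sm.families.Tweedie(var_power=1.5,
                             link=sm.families.links.log())

glm = sm.GLM(y, design, family=family, freq_weights=w)
result = glm.fit_regularized(alpha=0.01, L1_wt=0.5)      # elastic-net penalty
pred = result.predict(design)                            # mean on the response scale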
1414
- def _metric_power(self, family, tweedie_power: Optional[float]) -> float:
1415
- if isinstance(family, sm.families.Poisson):
1416
- return 1.0
1417
- if isinstance(family, sm.families.Gamma):
1418
- return 2.0
1419
- if isinstance(family, sm.families.Tweedie):
1420
- return tweedie_power if tweedie_power is not None else getattr(family, 'var_power', 1.5)
1421
- return 1.5
1422
-
1423
- def cross_val(self, trial: optuna.trial.Trial) -> float:
1424
- param_space = {
1425
- "alpha": lambda t: t.suggest_float('alpha', 1e-6, 1e2, log=True),
1426
- "l1_ratio": lambda t: t.suggest_float('l1_ratio', 0.0, 1.0)
1427
- }
1428
- if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
1429
- param_space["tweedie_power"] = lambda t: t.suggest_float(
1430
- 'tweedie_power', 1.0, 2.0)
1431
-
1432
- def data_provider():
1433
- data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
1434
- assert data is not None, "Preprocessed training data is missing."
1435
- return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
1436
-
1437
- def preprocess_fn(X_train, X_val):
1438
- X_train_s, X_val_s, _ = self._standardize_fold(
1439
- X_train, X_val, self.ctx.num_features)
1440
- return self._prepare_design(X_train_s), self._prepare_design(X_val_s)
1441
-
1442
- metric_ctx: Dict[str, Any] = {}
1443
-
1444
- def model_builder(params):
1445
- family = self._select_family(params.get("tweedie_power"))
1446
- metric_ctx["family"] = family
1447
- metric_ctx["tweedie_power"] = params.get("tweedie_power")
1448
- return {
1449
- "family": family,
1450
- "alpha": params["alpha"],
1451
- "l1_ratio": params["l1_ratio"],
1452
- "tweedie_power": params.get("tweedie_power")
1453
- }
1454
-
1455
- def fit_predict(model_cfg, X_train, y_train, w_train, X_val, y_val, w_val, _trial):
1456
- glm = sm.GLM(y_train, X_train,
1457
- family=model_cfg["family"],
1458
- freq_weights=w_train)
1459
- result = glm.fit_regularized(
1460
- alpha=model_cfg["alpha"],
1461
- L1_wt=model_cfg["l1_ratio"],
1462
- maxiter=200
1463
- )
1464
- return result.predict(X_val)
1465
-
1466
- def metric_fn(y_true, y_pred, weight):
1467
- if self.ctx.task_type == 'classification':
1468
- y_pred_clipped = np.clip(y_pred, EPS, 1 - EPS)
1469
- return log_loss(y_true, y_pred_clipped, sample_weight=weight)
1470
- y_pred_safe = np.maximum(y_pred, EPS)
1471
- return mean_tweedie_deviance(
1472
- y_true,
1473
- y_pred_safe,
1474
- sample_weight=weight,
1475
- power=self._metric_power(
1476
- metric_ctx.get("family"), metric_ctx.get("tweedie_power"))
1477
- )
1478
-
1479
- return self.cross_val_generic(
1480
- trial=trial,
1481
- hyperparameter_space=param_space,
1482
- data_provider=data_provider,
1483
- model_builder=model_builder,
1484
- metric_fn=metric_fn,
1485
- preprocess_fn=preprocess_fn,
1486
- fit_predict_fn=fit_predict,
1487
- splitter=self.ctx.cv.split(
- (self.ctx.train_oht_data if self.ctx.train_oht_data is not None
- else self.ctx.train_oht_scl_data)[self.ctx.var_nmes])

1489
- )
1490
-
1491
- def train(self) -> None:
1492
- if not self.best_params:
1493
- raise RuntimeError("Run tune() first to obtain best GLM parameters.")
1494
- tweedie_power = self.best_params.get('tweedie_power')
1495
- family = self._select_family(tweedie_power)
1496
-
1497
- X_train = self._prepare_design(self.ctx.train_oht_scl_data)
1498
- y_train = self.ctx.train_oht_scl_data[self.ctx.resp_nme]
1499
- w_train = self.ctx.train_oht_scl_data[self.ctx.weight_nme]
1500
-
1501
- glm = sm.GLM(y_train, X_train, family=family,
1502
- freq_weights=w_train)
1503
- self.model = glm.fit_regularized(
1504
- alpha=self.best_params['alpha'],
1505
- L1_wt=self.best_params['l1_ratio'],
1506
- maxiter=300
1507
- )
1508
-
1509
- self.ctx.glm_best = self.model
1510
- self.ctx.model_label += [self.label]
1511
- self._predict_and_cache(
1512
- self.model,
1513
- 'glm',
1514
- design_fn=lambda train: self._prepare_design(
1515
- self.ctx.train_oht_scl_data if train else self.ctx.test_oht_scl_data
1516
- )
1517
- )
1518
-
1519
- def ensemble_predict(self, k: int) -> None:
1520
- if not self.best_params:
1521
- raise RuntimeError("Run tune() first to obtain best GLM parameters.")
1522
- k = max(2, int(k))
1523
- data = self.ctx.train_oht_scl_data
1524
- if data is None:
1525
- raise RuntimeError("Missing standardized data for GLM ensemble.")
1526
- X_all = data[self.ctx.var_nmes]
1527
- y_all = data[self.ctx.resp_nme]
1528
- w_all = data[self.ctx.weight_nme]
1529
- X_test = self.ctx.test_oht_scl_data
1530
- if X_test is None:
1531
- raise RuntimeError("Missing standardized test data for GLM ensemble.")
1532
-
1533
- n_samples = len(X_all)
1534
- if n_samples < k:
1535
- print(
1536
- f"[GLM Ensemble] n_samples={n_samples} < k={k}; skip ensemble.",
1537
- flush=True,
1538
- )
1539
- return
1540
-
1541
- X_all_design = self._prepare_design(data)
1542
- X_test_design = self._prepare_design(X_test)
1543
- tweedie_power = self.best_params.get('tweedie_power')
1544
- family = self._select_family(tweedie_power)
1545
-
1546
- splitter = KFold(
1547
- n_splits=k,
1548
- shuffle=True,
1549
- random_state=self.ctx.rand_seed,
1550
- )
1551
- preds_train_sum = np.zeros(n_samples, dtype=np.float64)
1552
- preds_test_sum = np.zeros(len(X_test_design), dtype=np.float64)
1553
-
1554
- for train_idx, _val_idx in splitter.split(X_all):
1555
- X_train = X_all_design.iloc[train_idx]
1556
- y_train = y_all.iloc[train_idx]
1557
- w_train = w_all.iloc[train_idx]
1558
-
1559
- glm = sm.GLM(y_train, X_train, family=family, freq_weights=w_train)
1560
- result = glm.fit_regularized(
1561
- alpha=self.best_params['alpha'],
1562
- L1_wt=self.best_params['l1_ratio'],
1563
- maxiter=300
1564
- )
1565
- pred_train = result.predict(X_all_design)
1566
- pred_test = result.predict(X_test_design)
1567
- preds_train_sum += np.asarray(pred_train, dtype=np.float64)
1568
- preds_test_sum += np.asarray(pred_test, dtype=np.float64)
1569
-
1570
- preds_train = preds_train_sum / float(k)
1571
- preds_test = preds_test_sum / float(k)
1572
- self._cache_predictions("glm", preds_train, preds_test)
1573
-
1574
-
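ensemble_predict is plain k-fold bagging: refit the tuned configuration on each fold's training portion and average the k prediction vectors. A generic sketch of the averaging loop, with Ridge standing in for the tuned GLM and synthetic arrays standing in for the cached data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=500, n_features=5, random_state=0)
X_test, _ = make_regression(n_samples=100, n_features=5, random_state=1)

k = 5
splitter = KFold(n_splits=k, shuffle=True, random_state=0)
preds_test_sum = np.zeros(len(X_test))

for train_idx, _val_idx in splitter.split(X):
    model = Ridge(alpha=1.0)              # stand-in for the tuned model
    model.fit(X[train_idx], y[train_idx])
    preds_test_sum += model.predict(X_test)

preds_test = preds_test_sum / float(k)    # bagged prediction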
1575
- class ResNetTrainer(TrainerBase):
1576
- def __init__(self, context: "BayesOptModel") -> None:
1577
- if context.task_type == 'classification':
1578
- super().__init__(context, 'ResNetClassifier', 'ResNet')
1579
- else:
1580
- super().__init__(context, 'ResNet', 'ResNet')
1581
- self.model: Optional[ResNetSklearn] = None
1582
- self.enable_distributed_optuna = bool(context.config.use_resn_ddp)
1583
-
1584
- def _resolve_input_dim(self) -> int:
1585
- data = getattr(self.ctx, "train_oht_scl_data", None)
1586
- if data is not None and getattr(self.ctx, "var_nmes", None):
1587
- return int(data[self.ctx.var_nmes].shape[1])
1588
- return int(len(self.ctx.var_nmes or []))
1589
-
1590
- def _build_model(self, params: Optional[Dict[str, Any]] = None) -> ResNetSklearn:
1591
- params = params or {}
1592
- power = params.get("tw_power", self.ctx.default_tweedie_power())
1593
- if power is not None:
1594
- power = float(power)
1595
- resn_weight_decay = float(
1596
- params.get(
1597
- "weight_decay",
1598
- getattr(self.ctx.config, "resn_weight_decay", 1e-4),
1599
- )
1600
- )
1601
- return ResNetSklearn(
1602
- model_nme=self.ctx.model_nme,
1603
- input_dim=self._resolve_input_dim(),
1604
- hidden_dim=int(params.get("hidden_dim", 64)),
1605
- block_num=int(params.get("block_num", 2)),
1606
- task_type=self.ctx.task_type,
1607
- epochs=self.ctx.epochs,
1608
- tweedie_power=power,
1609
- learning_rate=float(params.get("learning_rate", 0.01)),
1610
- patience=int(params.get("patience", 10)),
1611
- use_layernorm=True,
1612
- dropout=float(params.get("dropout", 0.1)),
1613
- residual_scale=float(params.get("residual_scale", 0.1)),
1614
- stochastic_depth=float(params.get("stochastic_depth", 0.0)),
1615
- weight_decay=resn_weight_decay,
1616
- use_data_parallel=self.ctx.config.use_resn_data_parallel,
1617
- use_ddp=self.ctx.config.use_resn_ddp
1618
- )
1619
-
1620
- # ========= Cross-validation (for BayesOpt) =========
1621
- def cross_val(self, trial: optuna.trial.Trial) -> float:
1622
- # ResNet CV focuses on memory control:
1623
- # - Create a ResNetSklearn per fold and release it immediately after.
1624
- # - Move model to CPU, delete, and call gc/empty_cache after each fold.
1625
- # - Optionally sample part of training data during BayesOpt to reduce memory.
1626
-
1627
- base_tw_power = self.ctx.default_tweedie_power()
1628
-
1629
- def data_provider():
1630
- data = self.ctx.train_oht_data if self.ctx.train_oht_data is not None else self.ctx.train_oht_scl_data
1631
- assert data is not None, "Preprocessed training data is missing."
1632
- return data[self.ctx.var_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
1633
-
1634
- metric_ctx: Dict[str, Any] = {}
1635
-
1636
- def model_builder(params):
1637
- power = params.get("tw_power", base_tw_power)
1638
- metric_ctx["tw_power"] = power
1639
- params_local = dict(params)
1640
- params_local["tw_power"] = power
1641
- return self._build_model(params_local)
1642
-
1643
- def preprocess_fn(X_train, X_val):
1644
- X_train_s, X_val_s, _ = self._standardize_fold(
1645
- X_train, X_val, self.ctx.num_features)
1646
- return X_train_s, X_val_s
1647
-
1648
- def fit_predict(model, X_train, y_train, w_train, X_val, y_val, w_val, trial_obj):
1649
- model.fit(
1650
- X_train, y_train, w_train,
1651
- X_val, y_val, w_val,
1652
- trial=trial_obj
1653
- )
1654
- return model.predict(X_val)
1655
-
1656
- def metric_fn(y_true, y_pred, weight):
1657
- if self.ctx.task_type == 'regression':
1658
- return mean_tweedie_deviance(
1659
- y_true,
1660
- y_pred,
1661
- sample_weight=weight,
1662
- power=metric_ctx.get("tw_power", base_tw_power)
1663
- )
1664
- return log_loss(y_true, y_pred, sample_weight=weight)
1665
-
1666
- sample_cap = data_provider()[0]
1667
- max_rows_for_resnet_bo = min(100000, int(len(sample_cap)/5))
1668
-
1669
- return self.cross_val_generic(
1670
- trial=trial,
1671
- hyperparameter_space={
1672
- "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-6, 1e-2, log=True),
1673
- "hidden_dim": lambda t: t.suggest_int('hidden_dim', 8, 32, step=2),
1674
- "block_num": lambda t: t.suggest_int('block_num', 2, 10),
1675
- "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.3, step=0.05),
1676
- "residual_scale": lambda t: t.suggest_float('residual_scale', 0.05, 0.3, step=0.05),
1677
- "patience": lambda t: t.suggest_int('patience', 3, 12),
1678
- "stochastic_depth": lambda t: t.suggest_float('stochastic_depth', 0.0, 0.2, step=0.05),
1679
- **({"tw_power": lambda t: t.suggest_float('tw_power', 1.0, 2.0)} if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie' else {})
1680
- },
1681
- data_provider=data_provider,
1682
- model_builder=model_builder,
1683
- metric_fn=metric_fn,
1684
- sample_limit=max_rows_for_resnet_bo if len(
1685
- sample_cap) > max_rows_for_resnet_bo > 0 else None,
1686
- preprocess_fn=preprocess_fn,
1687
- fit_predict_fn=fit_predict,
1688
- cleanup_fn=lambda m: getattr(
1689
- getattr(m, "resnet", None), "to", lambda *_args, **_kwargs: None)("cpu")
1690
- )
1691
-
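The memory-control notes at the top of cross_val boil down to one pattern: after every fold, move the network off the GPU, drop the Python references, and ask both the garbage collector and the CUDA allocator to release memory. A minimal sketch of that cleanup, assuming PyTorch; build_model and the fold loop are placeholders:

import gc
import torch
import torch.nn as nn

def build_model() -> nn.Module:
    # Placeholder for ResNetSklearn's underlying network.
    return nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 1))

device = "cuda" if torch.cuda.is_available() else "cpu"

for fold in range(3):
    model = build_model().to(device)
    # ... train and evaluate the fold here ...
    model.to("cpu")              # move weights off the GPU first
    del model                    # drop the last reference
    gc.collect()                 # let Python reclaim the wrapper objects
    if torch.cuda.is_available():
        torch.cuda.empty_cache() # return cached blocks to the driver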
1692
- # ========= Train final ResNet with best hyperparameters =========
1693
- def train(self) -> None:
1694
- if not self.best_params:
1695
- raise RuntimeError("Run tune() first to obtain best ResNet parameters.")
1696
-
1697
- params = dict(self.best_params)
1698
- use_refit = bool(getattr(self.ctx.config, "final_refit", True))
1699
- data = self.ctx.train_oht_scl_data
1700
- if data is None:
1701
- raise RuntimeError("Missing standardized data for ResNet training.")
1702
- X_all = data[self.ctx.var_nmes]
1703
- y_all = data[self.ctx.resp_nme]
1704
- w_all = data[self.ctx.weight_nme]
1705
-
1706
- refit_epochs = None
1707
- if use_refit and 0.0 < float(self.ctx.prop_test) < 1.0 and len(X_all) >= 10:
1708
- splitter = ShuffleSplit(
1709
- n_splits=1,
1710
- test_size=self.ctx.prop_test,
1711
- random_state=self.ctx.rand_seed,
1712
- )
1713
- train_idx, val_idx = next(splitter.split(X_all))
1714
- tmp_model = self._build_model(params)
1715
- tmp_model.fit(
1716
- X_all.iloc[train_idx],
1717
- y_all.iloc[train_idx],
1718
- w_all.iloc[train_idx],
1719
- X_all.iloc[val_idx],
1720
- y_all.iloc[val_idx],
1721
- w_all.iloc[val_idx],
1722
- trial=None,
1723
- )
1724
- refit_epochs = self._resolve_best_epoch(
1725
- getattr(tmp_model, "training_history", None),
1726
- default_epochs=int(self.ctx.epochs),
1727
- )
1728
- getattr(getattr(tmp_model, "resnet", None), "to",
1729
- lambda *_args, **_kwargs: None)("cpu")
1730
- self._clean_gpu()
1731
-
1732
- self.model = self._build_model(params)
1733
- if refit_epochs is not None:
1734
- self.model.epochs = int(refit_epochs)
1735
- self.best_params = params
1736
- loss_plot_path = self.output.plot_path(
1737
- f'loss_{self.ctx.model_nme}_{self.model_name_prefix}.png')
1738
- self.model.loss_curve_path = loss_plot_path
1739
-
1740
- self._fit_predict_cache(
1741
- self.model,
1742
- X_all,
1743
- y_all,
1744
- sample_weight=w_all,
1745
- pred_prefix='resn',
1746
- use_oht=True,
1747
- sample_weight_arg='w_train'
1748
- )
1749
-
1750
- # Convenience wrapper for external callers.
1751
- self.ctx.resn_best = self.model
1752
-
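The final_refit flow above trains once against a holdout to find the best epoch count, then refits on the full dataset for exactly that many epochs so no rows are wasted on validation. A runnable sketch of the same recipe, with scikit-learn's SGDRegressor standing in for ResNetSklearn and one partial_fit call per epoch:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit

X, y = make_regression(n_samples=2000, n_features=10, noise=5.0, random_state=0)

# Stage 1: hold out part of the data and record validation loss per epoch.
splitter = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(X))
probe = SGDRegressor(random_state=0)
val_losses = []
for _ in range(100):
    probe.partial_fit(X[train_idx], y[train_idx])
    val_losses.append(mean_squared_error(y[val_idx], probe.predict(X[val_idx])))
best_epochs = int(np.argmin(val_losses)) + 1

# Stage 2: refit on every row for exactly the chosen number of epochs.
final = SGDRegressor(random_state=0)
for _ in range(best_epochs):
    final.partial_fit(X, y)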
1753
- def ensemble_predict(self, k: int) -> None:
1754
- if not self.best_params:
1755
- raise RuntimeError("Run tune() first to obtain best ResNet parameters.")
1756
- data = self.ctx.train_oht_scl_data
1757
- test_data = self.ctx.test_oht_scl_data
1758
- if data is None or test_data is None:
1759
- raise RuntimeError("Missing standardized data for ResNet ensemble.")
1760
- X_all = data[self.ctx.var_nmes]
1761
- y_all = data[self.ctx.resp_nme]
1762
- w_all = data[self.ctx.weight_nme]
1763
- X_test = test_data[self.ctx.var_nmes]
1764
-
1765
- k = max(2, int(k))
1766
- n_samples = len(X_all)
1767
- if n_samples < k:
1768
- print(
1769
- f"[ResNet Ensemble] n_samples={n_samples} < k={k}; skip ensemble.",
1770
- flush=True,
1771
- )
1772
- return
1773
-
1774
- splitter = KFold(
1775
- n_splits=k,
1776
- shuffle=True,
1777
- random_state=self.ctx.rand_seed,
1778
- )
1779
- preds_train_sum = np.zeros(n_samples, dtype=np.float64)
1780
- preds_test_sum = np.zeros(len(X_test), dtype=np.float64)
1781
-
1782
- for train_idx, val_idx in splitter.split(X_all):
1783
- model = self._build_model(self.best_params)
1784
- model.fit(
1785
- X_all.iloc[train_idx],
1786
- y_all.iloc[train_idx],
1787
- w_all.iloc[train_idx],
1788
- X_all.iloc[val_idx],
1789
- y_all.iloc[val_idx],
1790
- w_all.iloc[val_idx],
1791
- trial=None,
1792
- )
1793
- pred_train = model.predict(X_all)
1794
- pred_test = model.predict(X_test)
1795
- preds_train_sum += np.asarray(pred_train, dtype=np.float64)
1796
- preds_test_sum += np.asarray(pred_test, dtype=np.float64)
1797
- getattr(getattr(model, "resnet", None), "to",
1798
- lambda *_args, **_kwargs: None)("cpu")
1799
- self._clean_gpu()
1800
-
1801
- preds_train = preds_train_sum / float(k)
1802
- preds_test = preds_test_sum / float(k)
1803
- self._cache_predictions("resn", preds_train, preds_test)
1804
-
1805
- # ========= Save / Load =========
1806
- # ResNet is saved as state_dict and needs a custom load path.
1807
- # Save logic is implemented in TrainerBase (checks .resnet attribute).
1808
-
1809
- def load(self) -> None:
1810
- # Load ResNet weights to the current device to match context.
1811
- path = self.output.model_path(self._get_model_filename())
1812
- if os.path.exists(path):
1813
- resn_loaded = self._build_model(self.best_params)
1814
- state_dict = torch.load(path, map_location='cpu')
1815
- resn_loaded.resnet.load_state_dict(state_dict)
1816
-
1817
- self._move_to_device(resn_loaded)
1818
- self.model = resn_loaded
1819
- self.ctx.resn_best = self.model
1820
- else:
1821
- print(f"[ResNetTrainer.load] Model file not found: {path}")
1822
-
1823
-
1824
- class FTTrainer(TrainerBase):
1825
- def __init__(self, context: "BayesOptModel") -> None:
1826
- if context.task_type == 'classification':
1827
- super().__init__(context, 'FTTransformerClassifier', 'FTTransformer')
1828
- else:
1829
- super().__init__(context, 'FTTransformer', 'FTTransformer')
1830
- self.model: Optional[FTTransformerSklearn] = None
1831
- self.enable_distributed_optuna = bool(context.config.use_ft_ddp)
1832
- self._cv_geo_warned = False
1833
-
1834
- def _resolve_numeric_tokens(self) -> int:
1835
- requested = getattr(self.ctx.config, "ft_num_numeric_tokens", None)
1836
- return FTTransformerSklearn.resolve_numeric_token_count(
1837
- self.ctx.num_features,
1838
- self.ctx.cate_list,
1839
- requested,
1840
- )
1841
-
1842
- def _resolve_adaptive_heads(self,
1843
- d_model: int,
1844
- requested_heads: Optional[int] = None) -> Tuple[int, bool]:
1845
- d_model = int(d_model)
1846
- if d_model <= 0:
1847
- raise ValueError(f"Invalid d_model={d_model}, expected > 0.")
1848
-
1849
- default_heads = max(2, d_model // 16)
1850
- base_heads = default_heads if requested_heads is None else int(
1851
- requested_heads)
1852
- base_heads = max(1, min(base_heads, d_model))
1853
-
1854
- if d_model % base_heads == 0:
1855
- return base_heads, False
1856
-
1857
- for candidate in range(min(d_model, base_heads), 0, -1):
1858
- if d_model % candidate == 0:
1859
- return candidate, True
1860
- return 1, True
1861
-
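_resolve_adaptive_heads keeps multi-head attention valid by returning the largest head count, at or below the requested one, that divides d_model evenly. A standalone sketch of the same search with a few sanity checks:

from typing import Optional

def resolve_heads(d_model: int, requested: Optional[int] = None) -> int:
    """Largest head count <= requested (or d_model // 16) that divides d_model."""
    if d_model <= 0:
        raise ValueError(f"Invalid d_model={d_model}")
    base = max(2, d_model // 16) if requested is None else int(requested)
    base = max(1, min(base, d_model))
    for candidate in range(base, 0, -1):
        if d_model % candidate == 0:
            return candidate
    return 1

assert resolve_heads(64) == 4                 # default: 64 // 16
assert resolve_heads(48, requested=5) == 4    # 5 does not divide 48, fall back to 4
assert resolve_heads(7, requested=4) == 1     # prime d_model -> single head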
1862
- def _build_geo_tokens_for_split(self,
1863
- X_train: pd.DataFrame,
1864
- X_val: pd.DataFrame,
1865
- geo_params: Optional[Dict[str, Any]] = None):
1866
- if not self.ctx.config.geo_feature_nmes:
1867
- return None
1868
- orig_train = self.ctx.train_data
1869
- orig_test = self.ctx.test_data
1870
- try:
1871
- self.ctx.train_data = orig_train.loc[X_train.index].copy()
1872
- self.ctx.test_data = orig_train.loc[X_val.index].copy()
1873
- return self.ctx._build_geo_tokens(geo_params)
1874
- finally:
1875
- self.ctx.train_data = orig_train
1876
- self.ctx.test_data = orig_test
1877
-
1878
- def cross_val_unsupervised(self, trial: Optional[optuna.trial.Trial]) -> float:
1879
- """Optuna objective A: minimize validation loss for masked reconstruction."""
1880
- param_space: Dict[str, Callable[[optuna.trial.Trial], Any]] = {
1881
- "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-5, 5e-3, log=True),
1882
- "d_model": lambda t: t.suggest_int('d_model', 16, 128, step=16),
1883
- "n_layers": lambda t: t.suggest_int('n_layers', 2, 8),
1884
- "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.3),
1885
- "weight_decay": lambda t: t.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
1886
- "mask_prob_num": lambda t: t.suggest_float('mask_prob_num', 0.05, 0.4),
1887
- "mask_prob_cat": lambda t: t.suggest_float('mask_prob_cat', 0.05, 0.4),
1888
- "num_loss_weight": lambda t: t.suggest_float('num_loss_weight', 0.25, 4.0, log=True),
1889
- "cat_loss_weight": lambda t: t.suggest_float('cat_loss_weight', 0.25, 4.0, log=True),
1890
- }
1891
-
1892
- params: Optional[Dict[str, Any]] = None
1893
- if self._distributed_forced_params is not None:
1894
- params = self._distributed_forced_params
1895
- self._distributed_forced_params = None
1896
- else:
1897
- if trial is None:
1898
- raise RuntimeError(
1899
- "Missing Optuna trial for parameter sampling.")
1900
- params = {name: sampler(trial)
1901
- for name, sampler in param_space.items()}
1902
- if self._should_use_distributed_optuna():
1903
- self._distributed_prepare_trial(params)
1904
-
1905
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
1906
- max_rows_for_ft_bo = min(1_000_000, int(len(X_all) / 2))
1907
- if max_rows_for_ft_bo > 0 and len(X_all) > max_rows_for_ft_bo:
1908
- X_all = X_all.sample(n=max_rows_for_ft_bo,
1909
- random_state=self.ctx.rand_seed)
1910
-
1911
- splitter = ShuffleSplit(
1912
- n_splits=1,
1913
- test_size=self.ctx.prop_test,
1914
- random_state=self.ctx.rand_seed
1915
- )
1916
- train_idx, val_idx = next(splitter.split(X_all))
1917
- X_train = X_all.iloc[train_idx]
1918
- X_val = X_all.iloc[val_idx]
1919
- geo_train = geo_val = None
1920
- if self.ctx.config.geo_feature_nmes:
1921
- built = self._build_geo_tokens_for_split(X_train, X_val, params)
1922
- if built is not None:
1923
- geo_train, geo_val, _, _ = built
1924
- elif not self._cv_geo_warned:
1925
- print(
1926
- "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
1927
- flush=True,
1928
- )
1929
- self._cv_geo_warned = True
1930
-
1931
- d_model = int(params["d_model"])
1932
- n_layers = int(params["n_layers"])
1933
- num_numeric_tokens = self._resolve_numeric_tokens()
1934
- token_count = num_numeric_tokens + len(self.ctx.cate_list)
1935
- if geo_train is not None:
1936
- token_count += 1
1937
- approx_units = d_model * n_layers * max(1, token_count)
1938
- if approx_units > 12_000_000:
1939
- raise optuna.TrialPruned(
1940
- f"config exceeds safe memory budget (approx_units={approx_units})")
1941
-
1942
- adaptive_heads, _ = self._resolve_adaptive_heads(
1943
- d_model=d_model,
1944
- requested_heads=params.get("n_heads")
1945
- )
1946
-
1947
- mask_prob_num = float(params.get("mask_prob_num", 0.15))
1948
- mask_prob_cat = float(params.get("mask_prob_cat", 0.15))
1949
- num_loss_weight = float(params.get("num_loss_weight", 1.0))
1950
- cat_loss_weight = float(params.get("cat_loss_weight", 1.0))
1951
-
1952
- model_params = dict(params)
1953
- model_params["n_heads"] = adaptive_heads
1954
- for k in ("mask_prob_num", "mask_prob_cat", "num_loss_weight", "cat_loss_weight"):
1955
- model_params.pop(k, None)
1956
-
1957
- model = FTTransformerSklearn(
1958
- model_nme=self.ctx.model_nme,
1959
- num_cols=self.ctx.num_features,
1960
- cat_cols=self.ctx.cate_list,
1961
- task_type=self.ctx.task_type,
1962
- epochs=self.ctx.epochs,
1963
- patience=5,
1964
- weight_decay=float(params.get("weight_decay", 0.0)),
1965
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
1966
- use_ddp=self.ctx.config.use_ft_ddp,
1967
- num_numeric_tokens=num_numeric_tokens,
1968
- )
1969
- model.set_params(model_params)
1970
- try:
1971
- return float(model.fit_unsupervised(
1972
- X_train,
1973
- X_val=X_val,
1974
- trial=trial,
1975
- geo_train=geo_train,
1976
- geo_val=geo_val,
1977
- mask_prob_num=mask_prob_num,
1978
- mask_prob_cat=mask_prob_cat,
1979
- num_loss_weight=num_loss_weight,
1980
- cat_loss_weight=cat_loss_weight
1981
- ))
1982
- finally:
1983
- getattr(getattr(model, "ft", None), "to",
1984
- lambda *_args, **_kwargs: None)("cpu")
1985
- self._clean_gpu()
1986
-
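cross_val_unsupervised scores masked reconstruction: hide a fraction of numeric and categorical cells, reconstruct them, and combine a weighted numeric and categorical loss. A toy sketch of that loss on one batch, assuming PyTorch and using a plain MLP encoder instead of the FT-Transformer:

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
batch, n_num, n_cat, card = 32, 6, 3, 10     # numeric cols, categorical cols, cardinality

x_num = torch.randn(batch, n_num)
x_cat = torch.randint(0, card, (batch, n_cat))

# Randomly mask cells; masked numerics are zeroed, masked categories get a MASK id.
mask_num = torch.rand(batch, n_num) < 0.15
mask_cat = torch.rand(batch, n_cat) < 0.15
x_num_in = x_num.masked_fill(mask_num, 0.0)
x_cat_in = x_cat.masked_fill(mask_cat, card)          # index `card` = MASK token

embed = nn.Embedding(card + 1, 8)
encoder = nn.Sequential(nn.Linear(n_num + n_cat * 8, 64), nn.ReLU())
head_num = nn.Linear(64, n_num)                       # reconstruct numeric values
head_cat = nn.Linear(64, n_cat * card)                # logits per categorical column

h = encoder(torch.cat([x_num_in, embed(x_cat_in).flatten(1)], dim=1))
num_loss = F.mse_loss(head_num(h)[mask_num], x_num[mask_num])
cat_logits = head_cat(h).view(batch, n_cat, card)
cat_loss = F.cross_entropy(cat_logits[mask_cat], x_cat[mask_cat])
loss = 1.0 * num_loss + 1.0 * cat_loss                # num/cat loss weights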
1987
- def cross_val(self, trial: optuna.trial.Trial) -> float:
1988
- # FT-Transformer CV also focuses on memory control:
1989
- # - Slightly shrink the hyperparameter space to avoid oversized models.
1990
- # - Release GPU memory after each fold so the next trial can run.
1992
- param_space: Dict[str, Callable[[optuna.trial.Trial], Any]] = {
1993
- "learning_rate": lambda t: t.suggest_float('learning_rate', 1e-5, 5e-4, log=True),
1994
- # "d_model": lambda t: t.suggest_int('d_model', 8, 64, step=8),
1995
- "d_model": lambda t: t.suggest_int('d_model', 16, 128, step=16),
1996
- "n_layers": lambda t: t.suggest_int('n_layers', 2, 8),
1997
- "dropout": lambda t: t.suggest_float('dropout', 0.0, 0.2),
1998
- "weight_decay": lambda t: t.suggest_float('weight_decay', 1e-6, 1e-2, log=True),
1999
- }
2000
- if self.ctx.task_type == 'regression' and self.ctx.obj == 'reg:tweedie':
2001
- param_space["tw_power"] = lambda t: t.suggest_float(
2002
- 'tw_power', 1.0, 2.0)
2003
- geo_enabled = bool(
2004
- self.ctx.geo_token_cols or self.ctx.config.geo_feature_nmes)
2005
- if geo_enabled:
2006
- # Only tune GNN-related hyperparams when geo tokens are enabled.
2007
- param_space.update({
2008
- "geo_token_hidden_dim": lambda t: t.suggest_int('geo_token_hidden_dim', 16, 128, step=16),
2009
- "geo_token_layers": lambda t: t.suggest_int('geo_token_layers', 1, 4),
2010
- "geo_token_k_neighbors": lambda t: t.suggest_int('geo_token_k_neighbors', 5, 20),
2011
- "geo_token_dropout": lambda t: t.suggest_float('geo_token_dropout', 0.0, 0.3),
2012
- "geo_token_learning_rate": lambda t: t.suggest_float('geo_token_learning_rate', 1e-4, 5e-3, log=True),
2013
- })
2014
-
2015
- metric_ctx: Dict[str, Any] = {}
2016
-
2017
- def data_provider():
2018
- data = self.ctx.train_data
2019
- return data[self.ctx.factor_nmes], data[self.ctx.resp_nme], data[self.ctx.weight_nme]
2020
-
2021
- def model_builder(params):
2022
- d_model = int(params["d_model"])
2023
- n_layers = int(params["n_layers"])
2024
- num_numeric_tokens = self._resolve_numeric_tokens()
2025
- token_count = num_numeric_tokens + len(self.ctx.cate_list)
2026
- if geo_enabled:
2027
- token_count += 1
2028
- approx_units = d_model * n_layers * max(1, token_count)
2029
- if approx_units > 12_000_000:
2030
- print(
2031
- f"[FTTrainer] Trial pruned early: d_model={d_model}, n_layers={n_layers} -> approx_units={approx_units}")
2032
- raise optuna.TrialPruned(
2033
- "config exceeds safe memory budget; prune before training")
2034
- geo_params_local = {k: v for k, v in params.items()
2035
- if k.startswith("geo_token_")}
2036
-
2037
- tw_power = params.get("tw_power")
2038
- if self.ctx.task_type == 'regression':
2039
- base_tw = self.ctx.default_tweedie_power()
2040
- if self.ctx.obj in ('count:poisson', 'reg:gamma'):
2041
- tw_power = base_tw
2042
- elif tw_power is None:
2043
- tw_power = base_tw
2044
- metric_ctx["tw_power"] = tw_power
2045
-
2046
- adaptive_heads, _ = self._resolve_adaptive_heads(
2047
- d_model=d_model,
2048
- requested_heads=params.get("n_heads")
2049
- )
2050
-
2051
- return FTTransformerSklearn(
2052
- model_nme=self.ctx.model_nme,
2053
- num_cols=self.ctx.num_features,
2054
- cat_cols=self.ctx.cate_list,
2055
- d_model=d_model,
2056
- n_heads=adaptive_heads,
2057
- n_layers=n_layers,
2058
- dropout=params["dropout"],
2059
- task_type=self.ctx.task_type,
2060
- epochs=self.ctx.epochs,
2061
- tweedie_power=tw_power,
2062
- learning_rate=params["learning_rate"],
2063
- patience=5,
2064
- weight_decay=float(params.get("weight_decay", 0.0)),
2065
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
2066
- use_ddp=self.ctx.config.use_ft_ddp,
2067
- num_numeric_tokens=num_numeric_tokens,
2068
- ).set_params({"_geo_params": geo_params_local} if geo_enabled else {})
2069
-
2070
- def fit_predict(model, X_train, y_train, w_train, X_val, y_val, w_val, trial_obj):
2071
- geo_train = geo_val = None
2072
- if geo_enabled:
2073
- geo_params = getattr(model, "_geo_params", {})
2074
- built = self._build_geo_tokens_for_split(
2075
- X_train, X_val, geo_params)
2076
- if built is not None:
2077
- geo_train, geo_val, _, _ = built
2078
- elif not self._cv_geo_warned:
2079
- print(
2080
- "[FTTrainer] Geo tokens unavailable for CV split; continue without geo tokens.",
2081
- flush=True,
2082
- )
2083
- self._cv_geo_warned = True
2084
- model.fit(
2085
- X_train, y_train, w_train,
2086
- X_val, y_val, w_val,
2087
- trial=trial_obj,
2088
- geo_train=geo_train,
2089
- geo_val=geo_val
2090
- )
2091
- return model.predict(X_val, geo_tokens=geo_val)
2092
-
2093
- def metric_fn(y_true, y_pred, weight):
2094
- if self.ctx.task_type == 'regression':
2095
- return mean_tweedie_deviance(
2096
- y_true,
2097
- y_pred,
2098
- sample_weight=weight,
2099
- power=metric_ctx.get("tw_power", 1.5)
2100
- )
2101
- return log_loss(y_true, y_pred, sample_weight=weight)
2102
-
2103
- data_for_cap = data_provider()[0]
2104
- max_rows_for_ft_bo = min(1000000, int(len(data_for_cap)/2))
2105
-
2106
- return self.cross_val_generic(
2107
- trial=trial,
2108
- hyperparameter_space=param_space,
2109
- data_provider=data_provider,
2110
- model_builder=model_builder,
2111
- metric_fn=metric_fn,
2112
- sample_limit=max_rows_for_ft_bo if len(
2113
- data_for_cap) > max_rows_for_ft_bo > 0 else None,
2114
- fit_predict_fn=fit_predict,
2115
- cleanup_fn=lambda m: getattr(
2116
- getattr(m, "ft", None), "to", lambda *_args, **_kwargs: None)("cpu")
2117
- )
2118
-
2119
- def train(self) -> None:
2120
- if not self.best_params:
2121
- raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
2122
- resolved_params = dict(self.best_params)
2123
- d_model_value = resolved_params.get("d_model", 64)
2124
- adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
2125
- d_model=d_model_value,
2126
- requested_heads=resolved_params.get("n_heads")
2127
- )
2128
- if heads_adjusted:
2129
- print(f"[FTTrainer] Auto-adjusted n_heads from "
2130
- f"{resolved_params.get('n_heads')} to {adaptive_heads} "
2131
- f"(d_model={d_model_value}).")
2132
- resolved_params["n_heads"] = adaptive_heads
2133
-
2134
- use_refit = bool(getattr(self.ctx.config, "final_refit", True))
2135
- refit_epochs = None
2136
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
2137
- y_all = self.ctx.train_data[self.ctx.resp_nme]
2138
- w_all = self.ctx.train_data[self.ctx.weight_nme]
2139
- if use_refit and 0.0 < float(self.ctx.prop_test) < 1.0 and len(X_all) >= 10:
2140
- splitter = ShuffleSplit(
2141
- n_splits=1,
2142
- test_size=self.ctx.prop_test,
2143
- random_state=self.ctx.rand_seed,
2144
- )
2145
- train_idx, val_idx = next(splitter.split(X_all))
2146
- tmp_model = FTTransformerSklearn(
2147
- model_nme=self.ctx.model_nme,
2148
- num_cols=self.ctx.num_features,
2149
- cat_cols=self.ctx.cate_list,
2150
- task_type=self.ctx.task_type,
2151
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
2152
- use_ddp=self.ctx.config.use_ft_ddp,
2153
- num_numeric_tokens=self._resolve_numeric_tokens(),
2154
- weight_decay=float(resolved_params.get("weight_decay", 0.0)),
2155
- )
2156
- tmp_model.set_params(resolved_params)
2157
- geo_train_full = self.ctx.train_geo_tokens
2158
- geo_train = None if geo_train_full is None else geo_train_full.iloc[train_idx]
2159
- geo_val = None if geo_train_full is None else geo_train_full.iloc[val_idx]
2160
- tmp_model.fit(
2161
- X_all.iloc[train_idx],
2162
- y_all.iloc[train_idx],
2163
- w_all.iloc[train_idx],
2164
- X_all.iloc[val_idx],
2165
- y_all.iloc[val_idx],
2166
- w_all.iloc[val_idx],
2167
- trial=None,
2168
- geo_train=geo_train,
2169
- geo_val=geo_val,
2170
- )
2171
- refit_epochs = self._resolve_best_epoch(
2172
- getattr(tmp_model, "training_history", None),
2173
- default_epochs=int(self.ctx.epochs),
2174
- )
2175
- getattr(getattr(tmp_model, "ft", None), "to",
2176
- lambda *_args, **_kwargs: None)("cpu")
2177
- self._clean_gpu()
2178
-
2179
- self.model = FTTransformerSklearn(
2180
- model_nme=self.ctx.model_nme,
2181
- num_cols=self.ctx.num_features,
2182
- cat_cols=self.ctx.cate_list,
2183
- task_type=self.ctx.task_type,
2184
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
2185
- use_ddp=self.ctx.config.use_ft_ddp,
2186
- num_numeric_tokens=self._resolve_numeric_tokens(),
2187
- weight_decay=float(resolved_params.get("weight_decay", 0.0)),
2188
- )
2189
- if refit_epochs is not None:
2190
- self.model.epochs = int(refit_epochs)
2191
- self.model.set_params(resolved_params)
2192
- self.best_params = resolved_params
2193
- loss_plot_path = self.output.plot_path(
2194
- f'loss_{self.ctx.model_nme}_{self.model_name_prefix}.png')
2195
- self.model.loss_curve_path = loss_plot_path
2196
- geo_train = self.ctx.train_geo_tokens
2197
- geo_test = self.ctx.test_geo_tokens
2198
- fit_kwargs = {}
2199
- predict_kwargs_train = None
2200
- predict_kwargs_test = None
2201
- if geo_train is not None and geo_test is not None:
2202
- fit_kwargs["geo_train"] = geo_train
2203
- predict_kwargs_train = {"geo_tokens": geo_train}
2204
- predict_kwargs_test = {"geo_tokens": geo_test}
2205
- self._fit_predict_cache(
2206
- self.model,
2207
- self.ctx.train_data[self.ctx.factor_nmes],
2208
- self.ctx.train_data[self.ctx.resp_nme],
2209
- sample_weight=self.ctx.train_data[self.ctx.weight_nme],
2210
- pred_prefix='ft',
2211
- sample_weight_arg='w_train',
2212
- fit_kwargs=fit_kwargs,
2213
- predict_kwargs_train=predict_kwargs_train,
2214
- predict_kwargs_test=predict_kwargs_test
2215
- )
2216
- self.ctx.ft_best = self.model
2217
-
2218
- def ensemble_predict(self, k: int) -> None:
2219
- if not self.best_params:
2220
- raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
2221
- k = max(2, int(k))
2222
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
2223
- y_all = self.ctx.train_data[self.ctx.resp_nme]
2224
- w_all = self.ctx.train_data[self.ctx.weight_nme]
2225
- X_test = self.ctx.test_data[self.ctx.factor_nmes]
2226
- n_samples = len(X_all)
2227
- if n_samples < k:
2228
- print(
2229
- f"[FT Ensemble] n_samples={n_samples} < k={k}; skip ensemble.",
2230
- flush=True,
2231
- )
2232
- return
2233
-
2234
- geo_train_full = self.ctx.train_geo_tokens
2235
- geo_test_full = self.ctx.test_geo_tokens
2236
-
2237
- resolved_params = dict(self.best_params)
2238
- default_d_model = getattr(self.model, "d_model", 64)
2239
- adaptive_heads, _ = self._resolve_adaptive_heads(
2240
- d_model=resolved_params.get("d_model", default_d_model),
2241
- requested_heads=resolved_params.get("n_heads")
2242
- )
2243
- resolved_params["n_heads"] = adaptive_heads
2244
-
2245
- splitter = KFold(
2246
- n_splits=k,
2247
- shuffle=True,
2248
- random_state=self.ctx.rand_seed,
2249
- )
2250
- preds_train_sum = np.zeros(n_samples, dtype=np.float64)
2251
- preds_test_sum = np.zeros(len(X_test), dtype=np.float64)
2252
-
2253
- for train_idx, val_idx in splitter.split(X_all):
2254
- model = FTTransformerSklearn(
2255
- model_nme=self.ctx.model_nme,
2256
- num_cols=self.ctx.num_features,
2257
- cat_cols=self.ctx.cate_list,
2258
- task_type=self.ctx.task_type,
2259
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
2260
- use_ddp=self.ctx.config.use_ft_ddp,
2261
- num_numeric_tokens=self._resolve_numeric_tokens(),
2262
- weight_decay=float(resolved_params.get("weight_decay", 0.0)),
2263
- )
2264
- model.set_params(resolved_params)
2265
-
2266
- geo_train = geo_val = None
2267
- if geo_train_full is not None:
2268
- geo_train = geo_train_full.iloc[train_idx]
2269
- geo_val = geo_train_full.iloc[val_idx]
2270
-
2271
- model.fit(
2272
- X_all.iloc[train_idx],
2273
- y_all.iloc[train_idx],
2274
- w_all.iloc[train_idx],
2275
- X_all.iloc[val_idx],
2276
- y_all.iloc[val_idx],
2277
- w_all.iloc[val_idx],
2278
- trial=None,
2279
- geo_train=geo_train,
2280
- geo_val=geo_val,
2281
- )
2282
-
2283
- pred_train = model.predict(X_all, geo_tokens=geo_train_full)
2284
- pred_test = model.predict(X_test, geo_tokens=geo_test_full)
2285
- preds_train_sum += np.asarray(pred_train, dtype=np.float64)
2286
- preds_test_sum += np.asarray(pred_test, dtype=np.float64)
2287
- getattr(getattr(model, "ft", None), "to",
2288
- lambda *_args, **_kwargs: None)("cpu")
2289
- self._clean_gpu()
2290
-
2291
- preds_train = preds_train_sum / float(k)
2292
- preds_test = preds_test_sum / float(k)
2293
- self._cache_predictions("ft", preds_train, preds_test)
2294
-
2295
- def train_as_feature(self, pred_prefix: str = "ft_feat", feature_mode: str = "prediction") -> None:
2296
- """Train FT-Transformer only to generate features (not recorded as final model)."""
2297
- if not self.best_params:
2298
- raise RuntimeError("Run tune() first to obtain best FT-Transformer parameters.")
2299
- self.model = FTTransformerSklearn(
2300
- model_nme=self.ctx.model_nme,
2301
- num_cols=self.ctx.num_features,
2302
- cat_cols=self.ctx.cate_list,
2303
- task_type=self.ctx.task_type,
2304
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
2305
- use_ddp=self.ctx.config.use_ft_ddp,
2306
- num_numeric_tokens=self._resolve_numeric_tokens(),
2307
- )
2308
- resolved_params = dict(self.best_params)
2309
- adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
2310
- d_model=resolved_params.get("d_model", self.model.d_model),
2311
- requested_heads=resolved_params.get("n_heads")
2312
- )
2313
- if heads_adjusted:
2314
- print(f"[FTTrainer] Auto-adjusted n_heads from "
2315
- f"{resolved_params.get('n_heads')} to {adaptive_heads} "
2316
- f"(d_model={resolved_params.get('d_model', self.model.d_model)}).")
2317
- resolved_params["n_heads"] = adaptive_heads
2318
- self.model.set_params(resolved_params)
2319
- self.best_params = resolved_params
2320
-
2321
- geo_train = self.ctx.train_geo_tokens
2322
- geo_test = self.ctx.test_geo_tokens
2323
- fit_kwargs = {}
2324
- predict_kwargs_train = None
2325
- predict_kwargs_test = None
2326
- if geo_train is not None and geo_test is not None:
2327
- fit_kwargs["geo_train"] = geo_train
2328
- predict_kwargs_train = {"geo_tokens": geo_train}
2329
- predict_kwargs_test = {"geo_tokens": geo_test}
2330
-
2331
- if feature_mode not in ("prediction", "embedding"):
2332
- raise ValueError(
2333
- f"Unsupported feature_mode='{feature_mode}', expected 'prediction' or 'embedding'.")
2334
- if feature_mode == "embedding":
2335
- predict_kwargs_train = dict(predict_kwargs_train or {})
2336
- predict_kwargs_test = dict(predict_kwargs_test or {})
2337
- predict_kwargs_train["return_embedding"] = True
2338
- predict_kwargs_test["return_embedding"] = True
2339
-
2340
- self._fit_predict_cache(
2341
- self.model,
2342
- self.ctx.train_data[self.ctx.factor_nmes],
2343
- self.ctx.train_data[self.ctx.resp_nme],
2344
- sample_weight=self.ctx.train_data[self.ctx.weight_nme],
2345
- pred_prefix=pred_prefix,
2346
- sample_weight_arg='w_train',
2347
- fit_kwargs=fit_kwargs,
2348
- predict_kwargs_train=predict_kwargs_train,
2349
- predict_kwargs_test=predict_kwargs_test,
2350
- record_label=False
2351
- )
2352
-
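train_as_feature (and the unsupervised variant below) caches the transformer's predictions or embeddings so a downstream model can consume them as extra columns. A small sketch of that stacking idea, using out-of-fold predictions from a scikit-learn stand-in to avoid target leakage:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

X, y = make_regression(n_samples=1000, n_features=12, noise=10.0, random_state=0)

# Stage 1: out-of-fold predictions from the feature model never see their own target rows.
feature_model = RandomForestRegressor(n_estimators=50, random_state=0)
oof_pred = cross_val_predict(feature_model, X, y, cv=5)

# Stage 2: append the prediction as one extra column for the final model.
X_aug = np.column_stack([X, oof_pred])
final_model = Ridge(alpha=1.0).fit(X_aug, y)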
2353
- def pretrain_unsupervised_as_feature(self,
2354
- pred_prefix: str = "ft_uemb",
2355
- params: Optional[Dict[str, Any]] = None,
2357
- mask_prob_num: float = 0.15,
2358
- mask_prob_cat: float = 0.15,
2359
- num_loss_weight: float = 1.0,
2360
- cat_loss_weight: float = 1.0) -> None:
2361
- """Self-supervised pretraining (masked reconstruction) and cache embeddings."""
2362
- self.model = FTTransformerSklearn(
2363
- model_nme=self.ctx.model_nme,
2364
- num_cols=self.ctx.num_features,
2365
- cat_cols=self.ctx.cate_list,
2366
- task_type=self.ctx.task_type,
2367
- use_data_parallel=self.ctx.config.use_ft_data_parallel,
2368
- use_ddp=self.ctx.config.use_ft_ddp,
2369
- num_numeric_tokens=self._resolve_numeric_tokens(),
2370
- )
2371
- resolved_params = dict(params or {})
2372
- # Reuse supervised tuning structure params unless explicitly overridden.
2373
- if not resolved_params and self.best_params:
2374
- resolved_params = dict(self.best_params)
2375
-
2376
- # If params include masked reconstruction fields, they take precedence.
2377
- mask_prob_num = float(resolved_params.pop(
2378
- "mask_prob_num", mask_prob_num))
2379
- mask_prob_cat = float(resolved_params.pop(
2380
- "mask_prob_cat", mask_prob_cat))
2381
- num_loss_weight = float(resolved_params.pop(
2382
- "num_loss_weight", num_loss_weight))
2383
- cat_loss_weight = float(resolved_params.pop(
2384
- "cat_loss_weight", cat_loss_weight))
2385
-
2386
- adaptive_heads, heads_adjusted = self._resolve_adaptive_heads(
2387
- d_model=resolved_params.get("d_model", self.model.d_model),
2388
- requested_heads=resolved_params.get("n_heads")
2389
- )
2390
- if heads_adjusted:
2391
- print(f"[FTTrainer] Auto-adjusted n_heads from "
2392
- f"{resolved_params.get('n_heads')} to {adaptive_heads} "
2393
- f"(d_model={resolved_params.get('d_model', self.model.d_model)}).")
2394
- resolved_params["n_heads"] = adaptive_heads
2395
- if resolved_params:
2396
- self.model.set_params(resolved_params)
2397
-
2398
- loss_plot_path = self.output.plot_path(
2399
- f'loss_{self.ctx.model_nme}_FTTransformerUnsupervised.png')
2400
- self.model.loss_curve_path = loss_plot_path
2401
-
2402
- # Build a simple holdout split for pretraining early stopping.
2403
- X_all = self.ctx.train_data[self.ctx.factor_nmes]
2404
- idx = np.arange(len(X_all))
2405
- splitter = ShuffleSplit(
2406
- n_splits=1,
2407
- test_size=self.ctx.prop_test,
2408
- random_state=self.ctx.rand_seed
2409
- )
2410
- train_idx, val_idx = next(splitter.split(idx))
2411
- X_tr = X_all.iloc[train_idx]
2412
- X_val = X_all.iloc[val_idx]
2413
-
2414
- geo_all = self.ctx.train_geo_tokens
2415
- geo_tr = geo_val = None
2416
- if geo_all is not None:
2417
- geo_tr = geo_all.loc[X_tr.index]
2418
- geo_val = geo_all.loc[X_val.index]
2419
-
2420
- self.model.fit_unsupervised(
2421
- X_tr,
2422
- X_val=X_val,
2423
- geo_train=geo_tr,
2424
- geo_val=geo_val,
2425
- mask_prob_num=mask_prob_num,
2426
- mask_prob_cat=mask_prob_cat,
2427
- num_loss_weight=num_loss_weight,
2428
- cat_loss_weight=cat_loss_weight
2429
- )
2430
-
2431
- geo_train_full = self.ctx.train_geo_tokens
2432
- geo_test_full = self.ctx.test_geo_tokens
2433
- predict_kwargs_train = {"return_embedding": True}
2434
- predict_kwargs_test = {"return_embedding": True}
2435
- if geo_train_full is not None and geo_test_full is not None:
2436
- predict_kwargs_train["geo_tokens"] = geo_train_full
2437
- predict_kwargs_test["geo_tokens"] = geo_test_full
2438
-
2439
- self._predict_and_cache(
2440
- self.model,
2441
- pred_prefix=pred_prefix,
2442
- predict_kwargs_train=predict_kwargs_train,
2443
- predict_kwargs_test=predict_kwargs_test
2444
- )
2445
-
2446
-
2447
- # =============================================================================