ins-pricing 0.1.11__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. ins_pricing/README.md +9 -6
  2. ins_pricing/__init__.py +3 -11
  3. ins_pricing/cli/BayesOpt_entry.py +24 -0
  4. ins_pricing/{modelling → cli}/BayesOpt_incremental.py +197 -64
  5. ins_pricing/cli/Explain_Run.py +25 -0
  6. ins_pricing/{modelling → cli}/Explain_entry.py +169 -124
  7. ins_pricing/cli/Pricing_Run.py +25 -0
  8. ins_pricing/cli/__init__.py +1 -0
  9. ins_pricing/cli/bayesopt_entry_runner.py +1312 -0
  10. ins_pricing/cli/utils/__init__.py +1 -0
  11. ins_pricing/cli/utils/cli_common.py +320 -0
  12. ins_pricing/cli/utils/cli_config.py +375 -0
  13. ins_pricing/{modelling → cli/utils}/notebook_utils.py +74 -19
  14. {ins_pricing_gemini/modelling → ins_pricing/cli}/watchdog_run.py +2 -2
  15. ins_pricing/{modelling → docs/modelling}/BayesOpt_USAGE.md +69 -49
  16. ins_pricing/docs/modelling/README.md +34 -0
  17. ins_pricing/modelling/__init__.py +57 -6
  18. ins_pricing/modelling/core/__init__.py +1 -0
  19. ins_pricing/modelling/{bayesopt → core/bayesopt}/config_preprocess.py +64 -1
  20. ins_pricing/modelling/{bayesopt → core/bayesopt}/core.py +150 -810
  21. ins_pricing/modelling/core/bayesopt/model_explain_mixin.py +296 -0
  22. ins_pricing/modelling/core/bayesopt/model_plotting_mixin.py +548 -0
  23. ins_pricing/modelling/core/bayesopt/models/__init__.py +27 -0
  24. ins_pricing/modelling/core/bayesopt/models/model_ft_components.py +316 -0
  25. ins_pricing/modelling/core/bayesopt/models/model_ft_trainer.py +808 -0
  26. ins_pricing/modelling/core/bayesopt/models/model_gnn.py +675 -0
  27. ins_pricing/modelling/core/bayesopt/models/model_resn.py +435 -0
  28. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +19 -0
  29. ins_pricing/modelling/core/bayesopt/trainers/trainer_base.py +1020 -0
  30. ins_pricing/modelling/core/bayesopt/trainers/trainer_ft.py +787 -0
  31. ins_pricing/modelling/core/bayesopt/trainers/trainer_glm.py +195 -0
  32. ins_pricing/modelling/core/bayesopt/trainers/trainer_gnn.py +312 -0
  33. ins_pricing/modelling/core/bayesopt/trainers/trainer_resn.py +261 -0
  34. ins_pricing/modelling/core/bayesopt/trainers/trainer_xgb.py +348 -0
  35. ins_pricing/modelling/{bayesopt → core/bayesopt}/utils.py +2 -2
  36. ins_pricing/modelling/core/evaluation.py +115 -0
  37. ins_pricing/production/__init__.py +4 -0
  38. ins_pricing/production/preprocess.py +71 -0
  39. ins_pricing/setup.py +10 -5
  40. {ins_pricing_gemini/modelling/tests → ins_pricing/tests/modelling}/test_plotting.py +2 -2
  41. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/METADATA +4 -4
  42. ins_pricing-0.2.0.dist-info/RECORD +125 -0
  43. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/top_level.txt +0 -1
  44. ins_pricing/modelling/BayesOpt_entry.py +0 -633
  45. ins_pricing/modelling/Explain_Run.py +0 -36
  46. ins_pricing/modelling/Pricing_Run.py +0 -36
  47. ins_pricing/modelling/README.md +0 -33
  48. ins_pricing/modelling/bayesopt/models.py +0 -2196
  49. ins_pricing/modelling/bayesopt/trainers.py +0 -2446
  50. ins_pricing/modelling/cli_common.py +0 -136
  51. ins_pricing/modelling/tests/test_plotting.py +0 -63
  52. ins_pricing/modelling/watchdog_run.py +0 -211
  53. ins_pricing-0.1.11.dist-info/RECORD +0 -169
  54. ins_pricing_gemini/__init__.py +0 -23
  55. ins_pricing_gemini/governance/__init__.py +0 -20
  56. ins_pricing_gemini/governance/approval.py +0 -93
  57. ins_pricing_gemini/governance/audit.py +0 -37
  58. ins_pricing_gemini/governance/registry.py +0 -99
  59. ins_pricing_gemini/governance/release.py +0 -159
  60. ins_pricing_gemini/modelling/Explain_Run.py +0 -36
  61. ins_pricing_gemini/modelling/Pricing_Run.py +0 -36
  62. ins_pricing_gemini/modelling/__init__.py +0 -151
  63. ins_pricing_gemini/modelling/cli_common.py +0 -141
  64. ins_pricing_gemini/modelling/config.py +0 -249
  65. ins_pricing_gemini/modelling/config_preprocess.py +0 -254
  66. ins_pricing_gemini/modelling/core.py +0 -741
  67. ins_pricing_gemini/modelling/data_container.py +0 -42
  68. ins_pricing_gemini/modelling/explain/__init__.py +0 -55
  69. ins_pricing_gemini/modelling/explain/gradients.py +0 -334
  70. ins_pricing_gemini/modelling/explain/metrics.py +0 -176
  71. ins_pricing_gemini/modelling/explain/permutation.py +0 -155
  72. ins_pricing_gemini/modelling/explain/shap_utils.py +0 -146
  73. ins_pricing_gemini/modelling/features.py +0 -215
  74. ins_pricing_gemini/modelling/model_manager.py +0 -148
  75. ins_pricing_gemini/modelling/model_plotting.py +0 -463
  76. ins_pricing_gemini/modelling/models.py +0 -2203
  77. ins_pricing_gemini/modelling/notebook_utils.py +0 -294
  78. ins_pricing_gemini/modelling/plotting/__init__.py +0 -45
  79. ins_pricing_gemini/modelling/plotting/common.py +0 -63
  80. ins_pricing_gemini/modelling/plotting/curves.py +0 -572
  81. ins_pricing_gemini/modelling/plotting/diagnostics.py +0 -139
  82. ins_pricing_gemini/modelling/plotting/geo.py +0 -362
  83. ins_pricing_gemini/modelling/plotting/importance.py +0 -121
  84. ins_pricing_gemini/modelling/run_logging.py +0 -133
  85. ins_pricing_gemini/modelling/tests/conftest.py +0 -8
  86. ins_pricing_gemini/modelling/tests/test_cross_val_generic.py +0 -66
  87. ins_pricing_gemini/modelling/tests/test_distributed_utils.py +0 -18
  88. ins_pricing_gemini/modelling/tests/test_explain.py +0 -56
  89. ins_pricing_gemini/modelling/tests/test_geo_tokens_split.py +0 -49
  90. ins_pricing_gemini/modelling/tests/test_graph_cache.py +0 -33
  91. ins_pricing_gemini/modelling/tests/test_plotting_library.py +0 -150
  92. ins_pricing_gemini/modelling/tests/test_preprocessor.py +0 -48
  93. ins_pricing_gemini/modelling/trainers.py +0 -2447
  94. ins_pricing_gemini/modelling/utils.py +0 -1020
  95. ins_pricing_gemini/pricing/__init__.py +0 -27
  96. ins_pricing_gemini/pricing/calibration.py +0 -39
  97. ins_pricing_gemini/pricing/data_quality.py +0 -117
  98. ins_pricing_gemini/pricing/exposure.py +0 -85
  99. ins_pricing_gemini/pricing/factors.py +0 -91
  100. ins_pricing_gemini/pricing/monitoring.py +0 -99
  101. ins_pricing_gemini/pricing/rate_table.py +0 -78
  102. ins_pricing_gemini/production/__init__.py +0 -21
  103. ins_pricing_gemini/production/drift.py +0 -30
  104. ins_pricing_gemini/production/monitoring.py +0 -143
  105. ins_pricing_gemini/production/scoring.py +0 -40
  106. ins_pricing_gemini/reporting/__init__.py +0 -11
  107. ins_pricing_gemini/reporting/report_builder.py +0 -72
  108. ins_pricing_gemini/reporting/scheduler.py +0 -45
  109. ins_pricing_gemini/scripts/BayesOpt_incremental.py +0 -722
  110. ins_pricing_gemini/scripts/Explain_entry.py +0 -545
  111. ins_pricing_gemini/scripts/__init__.py +0 -1
  112. ins_pricing_gemini/scripts/train.py +0 -568
  113. ins_pricing_gemini/setup.py +0 -55
  114. ins_pricing_gemini/smoke_test.py +0 -28
  115. /ins_pricing/{modelling → cli/utils}/run_logging.py +0 -0
  116. /ins_pricing/modelling/{BayesOpt.py → core/BayesOpt.py} +0 -0
  117. /ins_pricing/modelling/{bayesopt → core/bayesopt}/__init__.py +0 -0
  118. /ins_pricing/{modelling/tests → tests/modelling}/conftest.py +0 -0
  119. /ins_pricing/{modelling/tests → tests/modelling}/test_cross_val_generic.py +0 -0
  120. /ins_pricing/{modelling/tests → tests/modelling}/test_distributed_utils.py +0 -0
  121. /ins_pricing/{modelling/tests → tests/modelling}/test_explain.py +0 -0
  122. /ins_pricing/{modelling/tests → tests/modelling}/test_geo_tokens_split.py +0 -0
  123. /ins_pricing/{modelling/tests → tests/modelling}/test_graph_cache.py +0 -0
  124. /ins_pricing/{modelling/tests → tests/modelling}/test_plotting_library.py +0 -0
  125. /ins_pricing/{modelling/tests → tests/modelling}/test_preprocessor.py +0 -0
  126. {ins_pricing-0.1.11.dist-info → ins_pricing-0.2.0.dist-info}/WHEEL +0 -0
@@ -1,722 +0,0 @@
1
- """Incremental training harness built on top of ``ins_pricing.bayesopt``
2
- (compat via ``BayesOpt.py``).
3
-
4
- This utility lets you append new observations to an existing dataset,
5
- reuse previously tuned hyperparameters and retrain a subset of models
6
- without re-running the full Optuna search. It can operate on a directory
7
- of per-model incremental CSVs or a single incremental file when updating
8
- one dataset.
9
-
10
- Example:
11
- python ins_pricing/modelling/BayesOpt_incremental.py \
12
- --config-json ins_pricing/modelling/demo/config_incremental_template.json \
13
- --incremental-dir ./incremental_batches \
14
- --merge-keys policy_id vehicle_id \
15
- --model-keys glm xgb resn --plot-curves
16
- """
17
-
18
- from __future__ import annotations
19
-
20
- import argparse
21
- import json
22
- from dataclasses import asdict
23
- from datetime import datetime
24
- from pathlib import Path
25
- from typing import Any, Dict, List, Optional, Sequence, Tuple
26
-
27
- import pandas as pd
28
- from sklearn.model_selection import train_test_split
29
-
30
- try:
31
- import ins_pricing.modelling as ropt
32
- from ins_pricing.modelling.cli_common import (
33
- PLOT_MODEL_LABELS,
34
- PYTORCH_TRAINERS,
35
- build_model_names,
36
- dedupe_preserve_order,
37
- load_config_json,
38
- normalize_config_paths,
39
- parse_model_pairs,
40
- resolve_config_path,
41
- resolve_path,
42
- set_env,
43
- )
44
- except ImportError:
45
- import sys
46
- from pathlib import Path
47
- _pkg_root = Path(__file__).resolve().parent.parent
48
- if str(_pkg_root) not in sys.path:
49
- sys.path.insert(0, str(_pkg_root))
50
-
51
- try:
52
- from modelling import core as ropt
53
- from modelling.cli_common import (
54
- PLOT_MODEL_LABELS,
55
- PYTORCH_TRAINERS,
56
- build_model_names,
57
- dedupe_preserve_order,
58
- load_config_json,
59
- normalize_config_paths,
60
- parse_model_pairs,
61
- resolve_config_path,
62
- resolve_path,
63
- set_env,
64
- )
65
- except ImportError:
66
- import ins_pricing.modelling as ropt
67
- from ins_pricing.modelling.cli_common import (
68
- PLOT_MODEL_LABELS,
69
- PYTORCH_TRAINERS,
70
- build_model_names,
71
- dedupe_preserve_order,
72
- load_config_json,
73
- normalize_config_paths,
74
- parse_model_pairs,
75
- resolve_config_path,
76
- resolve_path,
77
- set_env,
78
- )
79
-
80
- try:
81
- from .run_logging import configure_run_logging # type: ignore
82
- except Exception: # pragma: no cover
83
- try:
84
- from run_logging import configure_run_logging # type: ignore
85
- except Exception: # pragma: no cover
86
- configure_run_logging = None # type: ignore
87
-
88
-
89
- def _log(message: str) -> None:
90
- print(f"[Incremental] {message}")
91
-
92
-
93
- def _parse_args() -> argparse.Namespace:
94
- parser = argparse.ArgumentParser(
95
- description="Incrementally retrain BayesOpt models using new batches of data."
96
- )
97
- parser.add_argument(
98
- "--config-json",
99
- required=True,
100
- help="Path to the JSON config that BayesOpt_entry.py uses."
101
- )
102
- parser.add_argument(
103
- "--model-names",
104
- nargs="+",
105
- default=None,
106
- help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
107
- )
108
- parser.add_argument(
109
- "--model-keys",
110
- nargs="+",
111
- default=["glm", "xgb", "resn", "ft"],
112
- choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
113
- help="Which trainers to run for each dataset."
114
- )
115
- parser.add_argument(
116
- "--incremental-dir",
117
- type=Path,
118
- default=None,
119
- help="Directory containing <model_name> incremental CSVs."
120
- )
121
- parser.add_argument(
122
- "--incremental-file",
123
- type=Path,
124
- default=None,
125
- help="Single incremental CSV (requires --model-names with exactly one entry)."
126
- )
127
- parser.add_argument(
128
- "--incremental-template",
129
- default="{model_name}_incremental.csv",
130
- help="Filename template when --incremental-dir is provided."
131
- )
132
- parser.add_argument(
133
- "--merge-keys",
134
- nargs="+",
135
- default=None,
136
- help="Column(s) used to drop duplicate rows after merging base and incremental data."
137
- )
138
- parser.add_argument(
139
- "--dedupe-keep",
140
- choices=["first", "last"],
141
- default="last",
142
- help="How pandas.drop_duplicates resolves conflicts on merge keys."
143
- )
144
- parser.add_argument(
145
- "--timestamp-col",
146
- default=None,
147
- help="Optional column used to sort rows before deduplication."
148
- )
149
- parser.add_argument(
150
- "--timestamp-descending",
151
- action="store_true",
152
- help="Sort timestamp column in descending order before deduplication."
153
- )
154
- parser.add_argument(
155
- "--min-new-rows",
156
- type=int,
157
- default=1,
158
- help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
159
- )
160
- parser.add_argument(
161
- "--train-without-incremental",
162
- action="store_true",
163
- help="Always retrain even when no incremental file is present."
164
- )
165
- parser.add_argument(
166
- "--strict-incremental",
167
- action="store_true",
168
- help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
169
- )
170
- parser.add_argument(
171
- "--tag-new-column",
172
- default=None,
173
- help="If set, store 1 for incremental rows and 0 for historical rows in this column."
174
- )
175
- parser.add_argument(
176
- "--max-evals",
177
- type=int,
178
- default=25,
179
- help="Optuna trial count when retuning is required."
180
- )
181
- parser.add_argument(
182
- "--retune-missing",
183
- dest="retune_missing",
184
- action="store_true",
185
- default=True,
186
- help="Retune models whose best-params CSV is unavailable (default)."
187
- )
188
- parser.add_argument(
189
- "--skip-retune-missing",
190
- dest="retune_missing",
191
- action="store_false",
192
- help="Do not retune when best params are missing; such models are skipped."
193
- )
194
- parser.add_argument(
195
- "--force-retune",
196
- action="store_true",
197
- help="Run Optuna tuning even if historical best params exist."
198
- )
199
- parser.add_argument(
200
- "--prop-test",
201
- type=float,
202
- default=None,
203
- help="Override the test split proportion defined in the config file."
204
- )
205
- parser.add_argument(
206
- "--rand-seed",
207
- type=int,
208
- default=None,
209
- help="Override the random seed defined in the config."
210
- )
211
- parser.add_argument(
212
- "--epochs",
213
- type=int,
214
- default=None,
215
- help="Override the epoch count from the config."
216
- )
217
- parser.add_argument(
218
- "--output-dir",
219
- type=Path,
220
- default=None,
221
- help="Override the BayesOpt output root (models/results/plots)."
222
- )
223
- parser.add_argument(
224
- "--update-base-data",
225
- action="store_true",
226
- help="Overwrite the base CSVs with the merged dataset after a successful update."
227
- )
228
- parser.add_argument(
229
- "--persist-merged-dir",
230
- type=Path,
231
- default=None,
232
- help="Optional directory to store the merged dataset snapshots."
233
- )
234
- parser.add_argument(
235
- "--summary-json",
236
- type=Path,
237
- default=None,
238
- help="Write a JSON summary of processed datasets to this path."
239
- )
240
- parser.add_argument(
241
- "--plot-curves",
242
- action="store_true",
243
- help="Run one-way/lift plots after training (config plot settings also apply)."
244
- )
245
- parser.add_argument(
246
- "--dry-run",
247
- action="store_true",
248
- help="Merge and report counts but skip training, saving and plotting."
249
- )
250
- args = parser.parse_args()
251
-
252
- if args.incremental_file and args.incremental_dir:
253
- parser.error("Use either --incremental-dir or --incremental-file, not both.")
254
- if args.incremental_file and args.model_names and len(args.model_names) != 1:
255
- parser.error("--incremental-file can only be used when updating exactly one model.")
256
- if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
257
- parser.error(
258
- "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
259
- )
260
- return args
261
-
262
-
263
def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
    """Draw one-way, lift and double-lift plots for the trainers that just ran.

    Plot behaviour is read from ``cfg["plot"]``; the legacy top-level
    ``plot_lift_<key>`` flags act as fallbacks for ``enable`` and for the
    list of lift models.

    Args:
        model: Object exposing ``plot_oneway``, ``plot_lift`` and ``plot_dlift``.
        trained: Trainer keys that were executed for this dataset.
        cfg: The loaded configuration dictionary.
    """
    plot_cfg = cfg.get("plot", {})
    legacy_flags = {
        "glm": cfg.get("plot_lift_glm", False),
        "xgb": cfg.get("plot_lift_xgb", False),
        "resn": cfg.get("plot_lift_resn", False),
        "ft": cfg.get("plot_lift_ft", False),
    }
    # Plotting is on when explicitly enabled, or when any legacy flag is set.
    if not plot_cfg.get("enable", any(legacy_flags.values())):
        return

    n_bins = int(plot_cfg.get("n_bins", 10))
    plottable = [key for key in trained if key in PLOT_MODEL_LABELS]
    available = dedupe_preserve_order(plottable)

    if plot_cfg.get("oneway", True):
        model.plot_oneway(n_bins=n_bins)
    if not available:
        return

    # Lift curves: explicit list > legacy flags > everything that was trained.
    lift_models = plot_cfg.get("lift_models")
    if lift_models is None:
        flagged = [name for name, flag in legacy_flags.items() if flag]
        lift_models = flagged or available
    lift_models = dedupe_preserve_order([name for name in lift_models if name in available])
    for key in lift_models:
        label, pred_nme = PLOT_MODEL_LABELS[key]
        model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)

    # Double-lift needs at least two models to compare.
    if len(available) < 2 or not plot_cfg.get("double_lift", True):
        return

    raw_pairs = plot_cfg.get("double_lift_pairs")
    pairs = []
    if raw_pairs:
        for left, right in parse_model_pairs(raw_pairs):
            if left in available and right in available and left != right:
                pairs.append((left, right))
    else:
        # Default: every unordered pair, in the order the models were trained.
        for idx, left in enumerate(available):
            for right in available[idx + 1 :]:
                pairs.append((left, right))
    for first, second in pairs:
        model.plot_dlift([first, second], n_bins=n_bins)
307
-
308
-
309
- def _coerce_scalar(value: Any) -> Any:
310
- if isinstance(value, str):
311
- lowered = value.strip().lower()
312
- if lowered in {"", "none", "nan"}:
313
- return None
314
- if lowered in {"true", "false"}:
315
- return lowered == "true"
316
- return value
317
- if hasattr(value, "item"):
318
- try:
319
- return value.item()
320
- except Exception:
321
- return value
322
- return value
323
-
324
-
325
def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
    """Load previously saved best hyperparameters for one trainer.

    The file is looked up at
    ``<model.output_manager.result_dir>/<model.model_nme>_bestparams_<label>.csv``
    where ``label`` is ``trainer.label`` lower-cased.

    Args:
        model: Object providing ``output_manager.result_dir`` and ``model_nme``.
        trainer: Object providing a ``label`` string.
        silent: When true, do not log the "no historical params" message.
            Load *failures* are always logged, since an unreadable file is
            abnormal and would otherwise silently trigger a retune or a skip.

    Returns:
        A dict of coerced parameter values with missing (NaN/None) entries
        dropped, or ``None`` when the file is absent or cannot be loaded.
    """
    label = trainer.label.lower()
    path = Path(model.output_manager.result_dir) / f"{model.model_nme}_bestparams_{label}.csv"
    if not path.exists():
        if not silent:
            _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
        return None

    try:
        params_raw = ropt.IOUtils.load_params_file(str(path))
    except Exception as exc:
        # Best-effort: keep returning None, but leave a trace of why.
        _log(f"Failed to load params for {model.model_nme}/{label} from {path}: {exc}")
        return None

    def _is_missing(val: Any) -> bool:
        # pd.isna returns an array for list-like input; its truth value can be
        # ambiguous, so treat anything that cannot be reduced to a bool as
        # "present" rather than raising.
        try:
            return bool(pd.isna(val))
        except (TypeError, ValueError):
            return False

    return {
        key: _coerce_scalar(val)
        for key, val in (params_raw or {}).items()
        if not _is_missing(val)
    }
342
-
343
-
344
- def _to_serializable(obj: Any) -> Any:
345
- if isinstance(obj, dict):
346
- return {k: _to_serializable(v) for k, v in obj.items()}
347
- if isinstance(obj, list):
348
- return [_to_serializable(v) for v in obj]
349
- if hasattr(obj, "item"):
350
- try:
351
- return obj.item()
352
- except Exception:
353
- return str(obj)
354
- return obj
355
-
356
-
357
class IncrementalUpdateRunner:
    """Merge incremental batches into base datasets and retrain selected trainers.

    For every targeted dataset name the runner reads ``<data_dir>/<name>.csv``,
    optionally appends an incremental CSV, de-duplicates, and then trains the
    requested trainers, reusing stored best parameters when they can be loaded.
    One summary record per dataset is collected in ``summary_records``.
    """

    def __init__(self, args: argparse.Namespace) -> None:
        """Load the config and resolve every path/override from ``args``.

        Raises:
            ValueError: if ``--incremental-file`` is used while more than one
                dataset name is targeted.
        """
        self.args = args
        script_dir = Path(__file__).resolve().parent
        self.config_path = resolve_config_path(args.config_json, script_dir)
        cfg = load_config_json(
            self.config_path,
            required_keys=[
                "data_dir",
                "model_list",
                "model_categories",
                "target",
                "weight",
                "feature_list",
                "categorical_features",
            ],
        )
        self.cfg = normalize_config_paths(cfg, self.config_path)
        set_env(self.cfg.get("env", {}))
        self.data_dir = Path(self.cfg["data_dir"])
        self.data_dir.mkdir(parents=True, exist_ok=True)

        # CLI overrides win over config values, which win over built-in defaults.
        self.prop_test = args.prop_test if args.prop_test is not None else self.cfg.get("prop_test", 0.25)
        self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
        self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
        self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
        self.model_names = self._resolve_model_names(args.model_names)
        self.merge_keys = list(args.merge_keys or [])
        self.timestamp_col = args.timestamp_col
        self.timestamp_ascending = not args.timestamp_descending

        # Relative output roots are anchored at the config file's directory.
        output_root = args.output_dir or self.cfg.get("output_dir")
        if isinstance(output_root, Path) and not output_root.is_absolute():
            output_root = (self.config_path.parent / output_root).resolve()
        if isinstance(output_root, str) and output_root.strip():
            resolved = resolve_path(output_root, self.config_path.parent)
            if resolved is not None:
                output_root = str(resolved)
        self.output_root = output_root

        self.incremental_dir = self._anchor_to_config(args.incremental_dir)
        self.incremental_file = self._anchor_to_config(args.incremental_file)
        self.summary_records: List[Dict[str, Any]] = []
        self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")

        if self.incremental_file and len(self.model_names) != 1:
            raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")

    def _anchor_to_config(self, path: Optional[Path]) -> Optional[Path]:
        """Resolve a CLI path; relative paths are taken relative to the config directory."""
        if path is None:
            return None
        if not path.is_absolute():
            return (self.config_path.parent / path).resolve()
        return path.resolve()

    def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
        """Return the dataset names to process.

        An explicit ``override`` is de-duplicated (order kept); otherwise the
        names are built from the config's ``model_list`` and ``model_categories``.
        """
        if override:
            return dedupe_preserve_order([str(item) for item in override])
        prefixes = self.cfg["model_list"]
        suffixes = self.cfg["model_categories"]
        return build_model_names(prefixes, suffixes)

    def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
        """Read the incremental CSV for ``model_name``.

        Returns:
            ``(None, None)`` when no source is configured or the file does not
            exist, ``(None, path)`` when the file exists but is completely
            empty, and ``(frame, path)`` otherwise.
        """
        path: Optional[Path] = None
        if self.incremental_file:
            path = self.incremental_file
        elif self.incremental_dir:
            rel = self.args.incremental_template.format(model_name=model_name)
            path = (self.incremental_dir / rel).resolve()
        if not path or not path.exists():
            return None, None
        try:
            df = pd.read_csv(path, low_memory=False)
        except pd.errors.EmptyDataError:
            _log(f"Incremental file {path} is empty; treating as no-op.")
            return None, path
        return df, path

    def _check_strict_incremental(self, model_name: str, inc_path: Optional[Path]) -> None:
        """Enforce ``--strict-incremental`` for one dataset.

        The check keys on the *path*, not on the loaded frame, so a file that
        exists but is empty is not reported as missing.

        Raises:
            FileNotFoundError: when strict mode is active (and
                ``--train-without-incremental`` is not), an incremental source
                was configured, and no file was found for ``model_name``.
        """
        if not self.args.strict_incremental or self.args.train_without_incremental:
            return
        if inc_path is not None:
            return
        if self.incremental_dir:
            raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
        if self.incremental_file:
            raise FileNotFoundError(f"Missing incremental file for {model_name}: {self.incremental_file}.")

    def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
        """Concatenate base and incremental rows, then sort and de-duplicate.

        Without incremental rows a deep copy of ``base_df`` is returned. With
        them, rows are optionally tagged (0 = base, 1 = incremental), stably
        sorted by the timestamp column if it is present, and de-duplicated on
        the merge keys according to ``--dedupe-keep``.

        Raises:
            KeyError: if any merge key is absent from the merged frame.
        """
        if inc_df is None or inc_df.empty:
            merged = base_df.copy(deep=True)
            return merged.reset_index(drop=True)

        tag = self.args.tag_new_column
        if tag:
            base_part = base_df.copy(deep=True)
            base_part[tag] = 0
            inc_part = inc_df.copy(deep=True)
            inc_part[tag] = 1
            frames = [base_part, inc_part]
        else:
            frames = [base_df, inc_df]
        merged = pd.concat(frames, ignore_index=True, sort=False)

        if self.timestamp_col and self.timestamp_col in merged.columns:
            # mergesort is stable, so ties keep their base-then-incremental order.
            merged = merged.sort_values(
                self.timestamp_col,
                ascending=self.timestamp_ascending,
                kind="mergesort",
            )
        if self.merge_keys:
            missing = [col for col in self.merge_keys if col not in merged.columns]
            if missing:
                raise KeyError(f"Merge keys {missing} not found in merged frame for {self.merge_keys}.")
            merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
        return merged.reset_index(drop=True)

    def _should_train(self, new_rows: int) -> bool:
        """Return whether enough incremental rows arrived to justify retraining."""
        if self.args.train_without_incremental:
            return True
        min_needed = max(0, self.args.min_new_rows)
        return new_rows >= min_needed

    def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
        """Write ``df`` to ``dest`` as CSV (creating parent directories) and log it."""
        dest.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(dest, index=False)
        _log(f"Wrote {len(df)} rows to {dest} ({reason}).")

    def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``merged`` into train/test frames with fresh indices.

        Raises:
            ValueError: if ``prop_test`` is outside ``(0, 1)`` or fewer than
                two rows are available.
        """
        if not 0 < self.prop_test < 1:
            raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
        if len(merged) < 2:
            raise ValueError("Need at least two rows to form a train/test split.")
        train_df, test_df = train_test_split(
            merged,
            test_size=self.prop_test,
            random_state=self.rand_seed,
        )
        return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

    def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
        """Return the requested trainer keys that exist in ``trainer_map``.

        ``all`` expands to every known key; unavailable keys are logged and dropped.
        """
        requested = self.args.model_keys
        if "all" in requested:
            requested = ["glm", "xgb", "resn", "ft", "gnn"]
        requested = dedupe_preserve_order(requested)
        for key in requested:
            if key not in trainer_map:
                _log(f"Trainer '{key}' is not available for this context and will be skipped.")
        return [key for key in requested if key in trainer_map]

    def _train_single_model(
        self,
        model_name: str,
        merged: pd.DataFrame,
        new_rows: int,
        incremental_path: Optional[Path],
    ) -> Dict[str, Any]:
        """Build a ``BayesOptModel`` on ``merged`` and run the requested trainers.

        Returns:
            A dict with ``executed_keys`` (trainers that ran), ``param_sources``
            (``"loaded"`` or ``"retune"`` per trainer) and the ``model`` object.

        Raises:
            ValueError: when retuning is needed but ``--max-evals`` is not positive.
        """
        merged = merged.copy(deep=True)
        merged.fillna(0, inplace=True)
        train_df, test_df = self._prepare_splits(merged)
        model = ropt.BayesOptModel(
            train_df,
            test_df,
            model_name,
            self.cfg["target"],
            self.cfg["weight"],
            self.cfg["feature_list"],
            task_type=self.cfg.get("task_type", "regression"),
            binary_resp_nme=self.binary_resp,
            cate_list=self.cfg.get("categorical_features"),
            prop_test=self.prop_test,
            rand_seed=self.rand_seed,
            epochs=self.epochs,
            use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
            use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
            use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
            use_resn_ddp=self.cfg.get("use_resn_ddp", False),
            use_ft_ddp=self.cfg.get("use_ft_ddp", False),
            use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
            output_dir=str(self.output_root) if self.output_root else None,
            xgb_max_depth_max=int(self.cfg.get("xgb_max_depth_max", 25)),
            xgb_n_estimators_max=int(self.cfg.get("xgb_n_estimators_max", 500)),
            resn_weight_decay=self.cfg.get("resn_weight_decay"),
            final_ensemble=bool(self.cfg.get("final_ensemble", False)),
            final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
            final_refit=bool(self.cfg.get("final_refit", True)),
            optuna_storage=self.cfg.get("optuna_storage"),
            optuna_study_prefix=self.cfg.get("optuna_study_prefix"),
            best_params_files=self.cfg.get("best_params_files"),
            reuse_best_params=bool(self.cfg.get("reuse_best_params", False)),
            gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
            gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
            gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
            gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
            gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
            gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
            ft_role=str(self.cfg.get("ft_role", "model")),
            ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
            ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
            infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
            infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
        )

        requested_keys = self._requested_model_keys(model.trainers)
        executed_keys: List[str] = []
        param_sources: Dict[str, str] = {}

        if self.args.dry_run:
            _log(f"Dry run: would train {requested_keys} for {model_name}.")
            return {
                "executed_keys": executed_keys,
                "param_sources": param_sources,
                "model": model,
            }

        if self.args.force_retune and self.args.max_evals <= 0:
            raise ValueError("force_retune requires --max-evals > 0.")

        force_retune = bool(self.args.force_retune)
        if force_retune:
            model.config.reuse_best_params = False
            model.config.best_params_files = {}

        # When FT is not used as a plain model it is moved to the front of the
        # queue so it runs before the other trainers.
        ft_role = str(getattr(model.config, "ft_role", "model"))
        if ft_role != "model" and "ft" in requested_keys:
            requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]

        for key in requested_keys:
            trainer = model.trainers[key]

            if force_retune:
                trainer.best_params = None
                trainer.best_trial = None
                param_sources[key] = "retune"
            else:
                best_params = _load_best_params(model, trainer, silent=True)
                if best_params:
                    trainer.best_params = best_params
                    trainer.best_trial = None
                    param_sources[key] = "loaded"
                else:
                    if not self.args.retune_missing:
                        _log(
                            f"Skipping {model_name}/{key}: no best params and retuning disabled."
                        )
                        continue
                    param_sources[key] = "retune"

            if (trainer.best_params is None) and self.args.max_evals <= 0:
                raise ValueError("--max-evals must be positive when retuning is requested.")

            model.optimize_model(key, max_evals=self.args.max_evals)
            trainer.save()
            executed_keys.append(key)
            if key in PYTORCH_TRAINERS:
                ropt.free_cuda()

            snapshot = {
                "mode": "incremental_train",
                "model_name": model_name,
                "model_key": key,
                "timestamp": datetime.now().isoformat(),
                "param_source": param_sources[key],
                "best_params": _to_serializable(trainer.best_params or {}),
                "incremental_rows": new_rows,
                "train_rows": len(model.train_data),
                "test_rows": len(model.test_data),
                "incremental_path": str(incremental_path) if incremental_path else None,
                "config": asdict(model.config),
            }
            model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)

        if not executed_keys:
            _log(f"No trainers executed for {model_name}.")

        return {
            "executed_keys": executed_keys,
            "param_sources": param_sources,
            "model": model,
        }

    def process(self) -> None:
        """Process every targeted dataset and optionally write the JSON summary."""
        total_trained = 0
        for model_name in self.model_names:
            total_trained += self._process_single_model(model_name)
        if self.args.summary_json and self.summary_records:
            summary_path = self.args.summary_json.resolve()
            summary_path.parent.mkdir(parents=True, exist_ok=True)
            summary_payload = _to_serializable(self.summary_records)
            summary_path.write_text(json.dumps(summary_payload, indent=2, ensure_ascii=False), encoding="utf-8")
            _log(f"Summary written to {summary_path}.")
        _log(f"Finished incremental update for {total_trained} dataset(s).")

    def _process_single_model(self, model_name: str) -> int:
        """Merge, persist and (if warranted) retrain one dataset.

        Returns:
            ``1`` when at least one trainer ran for this dataset, else ``0``.

        Raises:
            FileNotFoundError: under ``--strict-incremental`` when the
                incremental file for ``model_name`` does not exist.
        """
        base_path = self.data_dir / f"{model_name}.csv"
        if not base_path.exists():
            _log(f"Base dataset {base_path} not found; skipping {model_name}.")
            self.summary_records.append({
                "model_name": model_name,
                "status": "missing_base",
            })
            return 0

        base_df = pd.read_csv(base_path, low_memory=False)
        inc_df, inc_path = self._load_incremental_df(model_name)
        self._check_strict_incremental(model_name, inc_path)

        # new_rows counts incremental rows before de-duplication.
        new_rows = 0 if inc_df is None else len(inc_df)
        _log(f"{model_name}: {len(base_df)} base rows, {new_rows} incremental rows.")
        merged_df = self._merge_frames(base_df, inc_df)
        merged_df.fillna(0, inplace=True)

        # NOTE(review): both writes happen before training and persist the
        # zero-filled frame, so a failed or skipped retrain still leaves the
        # base CSV overwritten. Confirm this is the intended contract.
        if self.args.update_base_data and not self.args.dry_run:
            self._write_dataset(merged_df, base_path, "update_base_data")
        if self.args.persist_merged_dir and not self.args.dry_run:
            dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}.csv"
            self._write_dataset(merged_df, dest, "persist_merged_dir")

        if not self._should_train(new_rows):
            _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
            self.summary_records.append({
                "model_name": model_name,
                "status": "skipped_no_incremental",
                "new_rows": new_rows,
                "total_rows": len(merged_df),
            })
            return 0

        try:
            train_result = self._train_single_model(model_name, merged_df, new_rows, inc_path)
        except Exception as exc:
            # Deliberate best-effort: one failing dataset must not stop the batch.
            _log(f"Training failed for {model_name}: {exc}")
            self.summary_records.append({
                "model_name": model_name,
                "status": "failed",
                "error": str(exc),
                "new_rows": new_rows,
                "total_rows": len(merged_df),
            })
            return 0

        executed = train_result["executed_keys"]
        param_sources = train_result["param_sources"]
        model = train_result["model"]
        status = "dry_run" if self.args.dry_run else "trained"

        summary = {
            "model_name": model_name,
            "status": status,
            "trained_models": executed,
            "param_sources": param_sources,
            "new_rows": new_rows,
            "total_rows": len(merged_df),
            "incremental_path": str(inc_path) if inc_path else None,
        }
        self.summary_records.append(summary)

        if not self.args.dry_run and self.plot_requested and executed:
            _plot_curves_for_model(model, executed, self.cfg)

        return 1 if executed else 0
711
-
712
-
713
def main() -> None:
    """CLI entry point: set up run logging if available, parse args, run the update."""
    # configure_run_logging is None when the optional run_logging module
    # could not be imported.
    if configure_run_logging:
        configure_run_logging(prefix="bayesopt_incremental")
    cli_args = _parse_args()
    IncrementalUpdateRunner(cli_args).process()
719
-
720
-
721
- if __name__ == "__main__":
722
- main()