ins-pricing 0.4.4-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. ins_pricing/README.md +74 -56
  2. ins_pricing/__init__.py +142 -90
  3. ins_pricing/cli/BayesOpt_entry.py +52 -50
  4. ins_pricing/cli/BayesOpt_incremental.py +832 -898
  5. ins_pricing/cli/Explain_Run.py +31 -23
  6. ins_pricing/cli/Explain_entry.py +532 -579
  7. ins_pricing/cli/Pricing_Run.py +31 -23
  8. ins_pricing/cli/bayesopt_entry_runner.py +1440 -1438
  9. ins_pricing/cli/utils/cli_common.py +256 -256
  10. ins_pricing/cli/utils/cli_config.py +375 -375
  11. ins_pricing/cli/utils/import_resolver.py +382 -365
  12. ins_pricing/cli/utils/notebook_utils.py +340 -340
  13. ins_pricing/cli/watchdog_run.py +209 -201
  14. ins_pricing/frontend/README.md +573 -419
  15. ins_pricing/frontend/__init__.py +10 -10
  16. ins_pricing/frontend/config_builder.py +1 -0
  17. ins_pricing/frontend/example_workflows.py +1 -1
  18. ins_pricing/governance/__init__.py +20 -20
  19. ins_pricing/governance/release.py +159 -159
  20. ins_pricing/modelling/README.md +67 -0
  21. ins_pricing/modelling/__init__.py +147 -92
  22. ins_pricing/modelling/bayesopt/README.md +59 -0
  23. ins_pricing/modelling/{core/bayesopt → bayesopt}/__init__.py +64 -102
  24. ins_pricing/modelling/{core/bayesopt → bayesopt}/config_preprocess.py +562 -550
  25. ins_pricing/modelling/{core/bayesopt → bayesopt}/core.py +965 -962
  26. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_explain_mixin.py +296 -296
  27. ins_pricing/modelling/{core/bayesopt → bayesopt}/model_plotting_mixin.py +482 -548
  28. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/__init__.py +27 -27
  29. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_trainer.py +915 -913
  30. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_gnn.py +788 -785
  31. ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_resn.py +448 -446
  32. ins_pricing/modelling/bayesopt/trainers/__init__.py +19 -0
  33. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_base.py +1308 -1308
  34. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_ft.py +3 -3
  35. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_glm.py +197 -198
  36. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_gnn.py +344 -344
  37. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_resn.py +283 -283
  38. ins_pricing/modelling/{core/bayesopt → bayesopt}/trainers/trainer_xgb.py +346 -347
  39. ins_pricing/modelling/bayesopt/utils/__init__.py +67 -0
  40. ins_pricing/modelling/bayesopt/utils/constants.py +21 -0
  41. ins_pricing/modelling/bayesopt/utils/io_utils.py +7 -0
  42. ins_pricing/modelling/bayesopt/utils/losses.py +27 -0
  43. ins_pricing/modelling/bayesopt/utils/metrics_and_devices.py +17 -0
  44. ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/torch_trainer_mixin.py +623 -623
  45. ins_pricing/modelling/{core/evaluation.py → evaluation.py} +113 -104
  46. ins_pricing/modelling/explain/__init__.py +55 -55
  47. ins_pricing/modelling/explain/metrics.py +27 -174
  48. ins_pricing/modelling/explain/permutation.py +237 -237
  49. ins_pricing/modelling/plotting/__init__.py +40 -36
  50. ins_pricing/modelling/plotting/compat.py +228 -0
  51. ins_pricing/modelling/plotting/curves.py +572 -572
  52. ins_pricing/modelling/plotting/diagnostics.py +163 -163
  53. ins_pricing/modelling/plotting/geo.py +362 -362
  54. ins_pricing/modelling/plotting/importance.py +121 -121
  55. ins_pricing/pricing/__init__.py +27 -27
  56. ins_pricing/production/__init__.py +35 -25
  57. ins_pricing/production/{predict.py → inference.py} +140 -57
  58. ins_pricing/production/monitoring.py +8 -21
  59. ins_pricing/reporting/__init__.py +11 -11
  60. ins_pricing/setup.py +1 -1
  61. ins_pricing/tests/production/test_inference.py +90 -0
  62. ins_pricing/utils/__init__.py +116 -83
  63. ins_pricing/utils/device.py +255 -255
  64. ins_pricing/utils/features.py +53 -0
  65. ins_pricing/utils/io.py +72 -0
  66. ins_pricing/{modelling/core/bayesopt/utils → utils}/losses.py +125 -129
  67. ins_pricing/utils/metrics.py +158 -24
  68. ins_pricing/utils/numerics.py +76 -0
  69. ins_pricing/utils/paths.py +9 -1
  70. {ins_pricing-0.4.4.dist-info → ins_pricing-0.5.0.dist-info}/METADATA +55 -35
  71. ins_pricing-0.5.0.dist-info/RECORD +131 -0
  72. ins_pricing/CHANGELOG.md +0 -272
  73. ins_pricing/RELEASE_NOTES_0.2.8.md +0 -344
  74. ins_pricing/docs/LOSS_FUNCTIONS.md +0 -78
  75. ins_pricing/docs/modelling/BayesOpt_USAGE.md +0 -945
  76. ins_pricing/docs/modelling/README.md +0 -34
  77. ins_pricing/frontend/QUICKSTART.md +0 -152
  78. ins_pricing/modelling/core/BayesOpt.py +0 -146
  79. ins_pricing/modelling/core/__init__.py +0 -1
  80. ins_pricing/modelling/core/bayesopt/PHASE2_REFACTORING_SUMMARY.md +0 -449
  81. ins_pricing/modelling/core/bayesopt/PHASE3_REFACTORING_SUMMARY.md +0 -406
  82. ins_pricing/modelling/core/bayesopt/REFACTORING_SUMMARY.md +0 -247
  83. ins_pricing/modelling/core/bayesopt/trainers/__init__.py +0 -19
  84. ins_pricing/modelling/core/bayesopt/utils/__init__.py +0 -86
  85. ins_pricing/modelling/core/bayesopt/utils/constants.py +0 -183
  86. ins_pricing/modelling/core/bayesopt/utils/io_utils.py +0 -126
  87. ins_pricing/modelling/core/bayesopt/utils/metrics_and_devices.py +0 -555
  88. ins_pricing/modelling/core/bayesopt/utils.py +0 -105
  89. ins_pricing/modelling/core/bayesopt/utils_backup.py +0 -1503
  90. ins_pricing/tests/production/test_predict.py +0 -233
  91. ins_pricing-0.4.4.dist-info/RECORD +0 -137
  92. /ins_pricing/modelling/{core/bayesopt → bayesopt}/config_components.py +0 -0
  93. /ins_pricing/modelling/{core/bayesopt → bayesopt}/models/model_ft_components.py +0 -0
  94. /ins_pricing/modelling/{core/bayesopt → bayesopt}/utils/distributed_utils.py +0 -0
  95. {ins_pricing-0.4.4.dist-info → ins_pricing-0.5.0.dist-info}/WHEEL +0 -0
  96. {ins_pricing-0.4.4.dist-info → ins_pricing-0.5.0.dist-info}/top_level.txt +0 -0
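The bulk of this release is a relocation of modelling/core/bayesopt to modelling/bayesopt and the rename of production/predict.py to production/inference.py; the old modelling/core compatibility shims are deleted outright. As a rough orientation only (the exact symbols re-exported by the new __init__.py files are not visible in this listing), downstream imports would move roughly like this:

    # Paths removed in 0.5.0:
    # from ins_pricing.modelling.core.bayesopt import BayesOptModel
    # from ins_pricing.production import predict

    # Assumed replacements, following the renames listed above:
    from ins_pricing.modelling.bayesopt import BayesOptModel
    from ins_pricing.production import inference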
@@ -1,904 +1,838 @@
1
- """Incremental training harness built on top of ``ins_pricing.bayesopt``
2
- (compat via ``BayesOpt.py``).
3
-
4
- This utility lets you append new observations to an existing dataset,
5
- reuse previously tuned hyperparameters and retrain a subset of models
6
- without re-running the full Optuna search. It can operate on a directory
7
- of per-model incremental CSVs or a single incremental file when updating
8
- one dataset.
9
-
10
- Example:
11
- python ins_pricing/cli/BayesOpt_incremental.py \
12
- --config-json ins_pricing/examples/modelling/config_incremental_template.json \
13
- --incremental-dir ./incremental_batches \
14
- --merge-keys policy_id vehicle_id \
15
- --model-keys glm xgb resn --plot-curves
16
- """
17
-
18
- from __future__ import annotations
19
-
1
+ """Incremental training harness built on top of ``ins_pricing.bayesopt``.
2
+
3
+ This utility lets you append new observations to an existing dataset,
4
+ reuse previously tuned hyperparameters and retrain a subset of models
5
+ without re-running the full Optuna search. It can operate on a directory
6
+ of per-model incremental CSVs or a single incremental file when updating
7
+ one dataset.
8
+
9
+ Example:
10
+ python ins_pricing/cli/BayesOpt_incremental.py \
11
+ --config-json examples/config_incremental_template.json \
12
+ --incremental-dir ./incremental_batches \
13
+ --merge-keys policy_id vehicle_id \
14
+ --model-keys glm xgb resn --plot-curves
15
+ """
16
+
17
+ from __future__ import annotations
18
+
20
19
  from pathlib import Path
20
+ import importlib.util
21
21
  import sys
22
22
 
23
23
  if __package__ in {None, ""}:
24
- repo_root = Path(__file__).resolve().parents[2]
25
- if str(repo_root) not in sys.path:
26
- sys.path.insert(0, str(repo_root))
27
-
28
- import argparse
29
- import json
30
- from dataclasses import asdict
31
- from datetime import datetime
32
- from typing import Any, Dict, List, Optional, Sequence, Tuple
33
-
24
+ if importlib.util.find_spec("ins_pricing") is None:
25
+ repo_root = Path(__file__).resolve().parents[2]
26
+ if str(repo_root) not in sys.path:
27
+ sys.path.insert(0, str(repo_root))
28
+
29
+ import argparse
30
+ import json
31
+ from dataclasses import asdict
32
+ from datetime import datetime
33
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
34
+
34
35
  import pandas as pd
35
36
 
36
- try:
37
- from .. import bayesopt as ropt # type: ignore
38
- from .utils.cli_common import ( # type: ignore
39
- PLOT_MODEL_LABELS,
40
- PYTORCH_TRAINERS,
41
- build_model_names,
42
- dedupe_preserve_order,
43
- load_dataset,
44
- parse_model_pairs,
45
- resolve_data_path,
46
- resolve_path,
47
- split_train_test,
48
- )
49
- from .utils.cli_config import ( # type: ignore
50
- add_config_json_arg,
51
- resolve_and_load_config,
52
- resolve_data_config,
53
- resolve_split_config,
54
- resolve_runtime_config,
55
- resolve_output_dirs,
56
- )
57
- except Exception: # pragma: no cover
58
- try:
59
- import bayesopt as ropt # type: ignore
60
- from utils.cli_common import ( # type: ignore
61
- PLOT_MODEL_LABELS,
62
- PYTORCH_TRAINERS,
63
- build_model_names,
64
- dedupe_preserve_order,
65
- load_dataset,
66
- parse_model_pairs,
67
- resolve_data_path,
68
- resolve_path,
69
- split_train_test,
70
- )
71
- from utils.cli_config import ( # type: ignore
72
- add_config_json_arg,
73
- resolve_and_load_config,
74
- resolve_data_config,
75
- resolve_split_config,
76
- resolve_runtime_config,
77
- resolve_output_dirs,
78
- )
79
- except Exception:
80
- try:
81
- import ins_pricing.modelling.core.bayesopt as ropt # type: ignore
82
- from ins_pricing.cli.utils.cli_common import ( # type: ignore
83
- PLOT_MODEL_LABELS,
84
- PYTORCH_TRAINERS,
85
- build_model_names,
86
- dedupe_preserve_order,
87
- load_dataset,
88
- parse_model_pairs,
89
- resolve_data_path,
90
- resolve_path,
91
- split_train_test,
92
- )
93
- from ins_pricing.cli.utils.cli_config import ( # type: ignore
94
- add_config_json_arg,
95
- resolve_and_load_config,
96
- resolve_data_config,
97
- resolve_split_config,
98
- resolve_runtime_config,
99
- resolve_output_dirs,
100
- )
101
- except Exception:
102
- import BayesOpt as ropt # type: ignore
103
- from utils.cli_common import ( # type: ignore
104
- PLOT_MODEL_LABELS,
105
- PYTORCH_TRAINERS,
106
- build_model_names,
107
- dedupe_preserve_order,
108
- load_dataset,
109
- parse_model_pairs,
110
- resolve_data_path,
111
- resolve_path,
112
- split_train_test,
113
- )
114
- from utils.cli_config import ( # type: ignore
115
- add_config_json_arg,
116
- resolve_and_load_config,
117
- resolve_data_config,
118
- resolve_split_config,
119
- resolve_runtime_config,
120
- resolve_output_dirs,
121
- )
122
-
123
- try:
124
- from .utils.run_logging import configure_run_logging # type: ignore
125
- except Exception: # pragma: no cover
126
- try:
127
- from utils.run_logging import configure_run_logging # type: ignore
128
- except Exception: # pragma: no cover
129
- configure_run_logging = None # type: ignore
130
-
131
-
132
- def _log(message: str) -> None:
133
- print(f"[Incremental] {message}")
134
-
135
-
136
- def _parse_args() -> argparse.Namespace:
137
- parser = argparse.ArgumentParser(
138
- description="Incrementally retrain BayesOpt models using new batches of data."
139
- )
140
- add_config_json_arg(
141
- parser,
142
- help_text="Path to the JSON config that cli/BayesOpt_entry.py uses.",
143
- )
144
- parser.add_argument(
145
- "--model-names",
146
- nargs="+",
147
- default=None,
148
- help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
149
- )
150
- parser.add_argument(
151
- "--model-keys",
152
- nargs="+",
153
- default=["glm", "xgb", "resn", "ft"],
154
- choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
155
- help="Which trainers to run for each dataset."
156
- )
157
- parser.add_argument(
158
- "--incremental-dir",
159
- type=Path,
160
- default=None,
161
- help="Directory containing <model_name> incremental CSVs."
162
- )
163
- parser.add_argument(
164
- "--incremental-file",
165
- type=Path,
166
- default=None,
167
- help="Single incremental CSV (requires --model-names with exactly one entry)."
168
- )
169
- parser.add_argument(
170
- "--incremental-template",
171
- default="{model_name}_incremental.csv",
172
- help="Filename template when --incremental-dir is provided."
173
- )
174
- parser.add_argument(
175
- "--merge-keys",
176
- nargs="+",
177
- default=None,
178
- help="Column(s) used to drop duplicate rows after merging base and incremental data."
179
- )
180
- parser.add_argument(
181
- "--dedupe-keep",
182
- choices=["first", "last"],
183
- default="last",
184
- help="How pandas.drop_duplicates resolves conflicts on merge keys."
185
- )
186
- parser.add_argument(
187
- "--timestamp-col",
188
- default=None,
189
- help="Optional column used to sort rows before deduplication."
190
- )
191
- parser.add_argument(
192
- "--timestamp-descending",
193
- action="store_true",
194
- help="Sort timestamp column in descending order before deduplication."
195
- )
196
- parser.add_argument(
197
- "--min-new-rows",
198
- type=int,
199
- default=1,
200
- help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
201
- )
202
- parser.add_argument(
203
- "--train-without-incremental",
204
- action="store_true",
205
- help="Always retrain even when no incremental file is present."
206
- )
207
- parser.add_argument(
208
- "--strict-incremental",
209
- action="store_true",
210
- help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
211
- )
212
- parser.add_argument(
213
- "--tag-new-column",
214
- default=None,
215
- help="If set, store 1 for incremental rows and 0 for historical rows in this column."
216
- )
217
- parser.add_argument(
218
- "--max-evals",
219
- type=int,
220
- default=25,
221
- help="Optuna trial count when retuning is required."
222
- )
223
- parser.add_argument(
224
- "--retune-missing",
225
- dest="retune_missing",
226
- action="store_true",
227
- default=True,
228
- help="Retune models whose best-params CSV is unavailable (default)."
229
- )
230
- parser.add_argument(
231
- "--skip-retune-missing",
232
- dest="retune_missing",
233
- action="store_false",
234
- help="Do not retune when best params are missing; such models are skipped."
235
- )
236
- parser.add_argument(
237
- "--force-retune",
238
- action="store_true",
239
- help="Run Optuna tuning even if historical best params exist."
240
- )
241
- parser.add_argument(
242
- "--prop-test",
243
- type=float,
244
- default=None,
245
- help="Override the test split proportion defined in the config file."
246
- )
247
- parser.add_argument(
248
- "--rand-seed",
249
- type=int,
250
- default=None,
251
- help="Override the random seed defined in the config."
252
- )
253
- parser.add_argument(
254
- "--epochs",
255
- type=int,
256
- default=None,
257
- help="Override the epoch count from the config."
258
- )
259
- parser.add_argument(
260
- "--output-dir",
261
- type=Path,
262
- default=None,
263
- help="Override the BayesOpt output root (models/results/plots)."
264
- )
265
- parser.add_argument(
266
- "--update-base-data",
267
- action="store_true",
268
- help="Overwrite the base CSVs with the merged dataset after a successful update."
269
- )
270
- parser.add_argument(
271
- "--persist-merged-dir",
272
- type=Path,
273
- default=None,
274
- help="Optional directory to store the merged dataset snapshots."
275
- )
276
- parser.add_argument(
277
- "--summary-json",
278
- type=Path,
279
- default=None,
280
- help="Write a JSON summary of processed datasets to this path."
281
- )
282
- parser.add_argument(
283
- "--plot-curves",
284
- action="store_true",
285
- help="Run one-way/lift plots after training (config plot settings also apply)."
286
- )
287
- parser.add_argument(
288
- "--dry-run",
289
- action="store_true",
290
- help="Merge and report counts but skip training, saving and plotting."
291
- )
292
- args = parser.parse_args()
293
-
294
- if args.incremental_file and args.incremental_dir:
295
- parser.error("Use either --incremental-dir or --incremental-file, not both.")
296
- if args.incremental_file and args.model_names and len(args.model_names) != 1:
297
- parser.error("--incremental-file can only be used when updating exactly one model.")
298
- if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
299
- parser.error(
300
- "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
301
- )
302
- return args
303
-
304
-
305
- def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
306
- plot_cfg = cfg.get("plot", {})
307
- legacy_flags = {
308
- "glm": cfg.get("plot_lift_glm", False),
309
- "xgb": cfg.get("plot_lift_xgb", False),
310
- "resn": cfg.get("plot_lift_resn", False),
311
- "ft": cfg.get("plot_lift_ft", False),
312
- }
313
- plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
314
- if not plot_enabled:
315
- return
316
-
317
- n_bins = int(plot_cfg.get("n_bins", 10))
318
- oneway_enabled = plot_cfg.get("oneway", True)
319
- available = dedupe_preserve_order([k for k in trained if k in PLOT_MODEL_LABELS])
320
-
321
- lift_models = plot_cfg.get("lift_models")
322
- if lift_models is None:
323
- lift_models = [m for m, flag in legacy_flags.items() if flag]
324
- if not lift_models:
325
- lift_models = available
326
- lift_models = dedupe_preserve_order([m for m in lift_models if m in available])
327
-
328
- if oneway_enabled:
329
- oneway_pred = bool(plot_cfg.get("oneway_pred", False))
330
- oneway_pred_models = plot_cfg.get("oneway_pred_models")
331
- pred_plotted = False
332
- if oneway_pred:
333
- if oneway_pred_models is None:
334
- oneway_pred_models = lift_models or available
335
- oneway_pred_models = dedupe_preserve_order(
336
- [m for m in oneway_pred_models if m in available]
337
- )
338
- for model_key in oneway_pred_models:
339
- label, pred_nme = PLOT_MODEL_LABELS[model_key]
340
- if pred_nme not in model.train_data.columns:
341
- print(
342
- f"[Oneway] Missing prediction column '{pred_nme}'; skip.",
343
- flush=True,
344
- )
345
- continue
346
- model.plot_oneway(
347
- n_bins=n_bins,
348
- pred_col=pred_nme,
349
- pred_label=label,
350
- plot_subdir="oneway/post",
351
- )
352
- pred_plotted = True
353
- if not oneway_pred or not pred_plotted:
354
- model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/post")
355
- if not available:
356
- return
357
-
358
- for key in lift_models:
359
- label, pred_nme = PLOT_MODEL_LABELS[key]
360
- model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)
361
-
362
- if not plot_cfg.get("double_lift", True) or len(available) < 2:
363
- return
364
-
365
- raw_pairs = plot_cfg.get("double_lift_pairs")
366
- if raw_pairs:
367
- pairs = [
368
- (a, b)
369
- for a, b in parse_model_pairs(raw_pairs)
370
- if a in available and b in available and a != b
371
- ]
372
- else:
373
- pairs = [(a, b) for i, a in enumerate(available) for b in available[i + 1 :]]
374
- for first, second in pairs:
375
- model.plot_dlift([first, second], n_bins=n_bins)
376
-
377
-
378
- def _coerce_scalar(value: Any) -> Any:
379
- if isinstance(value, str):
380
- lowered = value.strip().lower()
381
- if lowered in {"", "none", "nan"}:
382
- return None
383
- if lowered in {"true", "false"}:
384
- return lowered == "true"
385
- return value
386
- if hasattr(value, "item"):
387
- try:
388
- return value.item()
389
- except Exception:
390
- return value
391
- return value
392
-
393
-
394
- def _infer_format_from_path(path: Path) -> str:
395
- suffix = path.suffix.lower()
396
- if suffix in {".parquet", ".pq"}:
397
- return "parquet"
398
- if suffix in {".feather", ".ft"}:
399
- return "feather"
400
- return "csv"
401
-
402
-
403
- def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
404
- label = trainer.label.lower()
405
- result_dir = Path(model.output_manager.result_dir)
406
- path = result_dir / f"{model.model_nme}_bestparams_{label}.csv"
407
- if not path.exists():
408
- if not silent:
409
- _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
410
- return None
411
- try:
412
- params_raw = ropt.IOUtils.load_params_file(str(path))
413
- except Exception:
414
- return None
415
- return {
416
- key: _coerce_scalar(val)
417
- for key, val in (params_raw or {}).items()
418
- if not pd.isna(val)
419
- }
420
-
421
-
422
- def _to_serializable(obj: Any) -> Any:
423
- if isinstance(obj, dict):
424
- return {k: _to_serializable(v) for k, v in obj.items()}
425
- if isinstance(obj, list):
426
- return [_to_serializable(v) for v in obj]
427
- if hasattr(obj, "item"):
428
- try:
429
- return obj.item()
430
- except Exception:
431
- return str(obj)
432
- return obj
433
-
434
-
435
- class IncrementalUpdateRunner:
436
- def __init__(self, args: argparse.Namespace) -> None:
437
- self.args = args
438
- script_dir = Path(__file__).resolve().parents[1]
439
- self.config_path, self.cfg = resolve_and_load_config(
440
- args.config_json,
441
- script_dir,
442
- required_keys=[
443
- "data_dir",
444
- "model_list",
445
- "model_categories",
446
- "target",
447
- "weight",
448
- "feature_list",
449
- "categorical_features",
450
- ],
451
- )
452
- data_dir, data_format, data_path_template, dtype_map = resolve_data_config(
453
- self.cfg,
454
- self.config_path,
455
- create_data_dir=True,
456
- )
457
- self.data_dir = data_dir
458
- self.data_format = data_format
459
- self.data_path_template = data_path_template
460
- self.dtype_map = dtype_map
461
- split_cfg = resolve_split_config(self.cfg)
462
- runtime_cfg = resolve_runtime_config(self.cfg)
463
- output_cfg = resolve_output_dirs(
464
- self.cfg,
465
- self.config_path,
466
- output_override=args.output_dir,
467
- )
468
- self.runtime_cfg = runtime_cfg
469
- self.prop_test = args.prop_test if args.prop_test is not None else split_cfg["prop_test"]
470
- self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
471
- self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
472
- self.split_strategy = split_cfg["split_strategy"]
473
- self.split_group_col = split_cfg["split_group_col"]
474
- self.split_time_col = split_cfg["split_time_col"]
475
- self.split_time_ascending = split_cfg["split_time_ascending"]
476
- self.cv_strategy = split_cfg["cv_strategy"]
477
- self.cv_group_col = split_cfg["cv_group_col"]
478
- self.cv_time_col = split_cfg["cv_time_col"]
479
- self.cv_time_ascending = split_cfg["cv_time_ascending"]
480
- self.cv_splits = split_cfg["cv_splits"]
481
- self.ft_oof_folds = split_cfg["ft_oof_folds"]
482
- self.ft_oof_strategy = split_cfg["ft_oof_strategy"]
483
- self.ft_oof_shuffle = split_cfg["ft_oof_shuffle"]
484
- self.save_preprocess = runtime_cfg["save_preprocess"]
485
- self.preprocess_artifact_path = runtime_cfg["preprocess_artifact_path"]
486
- self.bo_sample_limit = runtime_cfg["bo_sample_limit"]
487
- self.cache_predictions = runtime_cfg["cache_predictions"]
488
- self.prediction_cache_dir = runtime_cfg["prediction_cache_dir"]
489
- self.prediction_cache_format = runtime_cfg["prediction_cache_format"]
490
- self.plot_path_style = runtime_cfg["plot_path_style"]
491
- self.xgb_max_depth_max = runtime_cfg["xgb_max_depth_max"]
492
- self.xgb_n_estimators_max = runtime_cfg["xgb_n_estimators_max"]
493
- self.optuna_storage = runtime_cfg["optuna_storage"]
494
- self.optuna_study_prefix = runtime_cfg["optuna_study_prefix"]
495
- self.best_params_files = runtime_cfg["best_params_files"]
496
- self.reuse_best_params = runtime_cfg["reuse_best_params"]
497
- self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
498
- self.model_names = self._resolve_model_names(args.model_names)
499
- self.merge_keys = list(args.merge_keys or [])
500
- self.timestamp_col = args.timestamp_col
501
- self.timestamp_ascending = not args.timestamp_descending
502
- self.output_root = output_cfg["output_dir"]
503
-
504
- self.incremental_dir = None
505
- if args.incremental_dir is not None:
506
- self.incremental_dir = args.incremental_dir
507
- if not self.incremental_dir.is_absolute():
508
- self.incremental_dir = (self.config_path.parent / self.incremental_dir).resolve()
509
- else:
510
- self.incremental_dir = self.incremental_dir.resolve()
511
- self.incremental_file = None
512
- if args.incremental_file is not None:
513
- self.incremental_file = args.incremental_file
514
- if not self.incremental_file.is_absolute():
515
- self.incremental_file = (self.config_path.parent / self.incremental_file).resolve()
516
- else:
517
- self.incremental_file = self.incremental_file.resolve()
518
- self.summary_records: List[Dict[str, Any]] = []
519
- self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")
520
-
521
- if self.incremental_file and len(self.model_names) != 1:
522
- raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")
523
-
524
- def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
525
- if override:
526
- return dedupe_preserve_order([str(item) for item in override])
527
- prefixes = self.cfg["model_list"]
528
- suffixes = self.cfg["model_categories"]
529
- return build_model_names(prefixes, suffixes)
530
-
531
- def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
532
- path: Optional[Path] = None
533
- if self.incremental_file:
534
- path = self.incremental_file
535
- elif self.incremental_dir:
536
- rel = self.args.incremental_template.format(model_name=model_name)
537
- path = (self.incremental_dir / rel).resolve()
538
- if not path or not path.exists():
539
- return None, None
540
- try:
541
- df = load_dataset(
542
- path,
543
- data_format="auto",
544
- dtype_map=self.dtype_map,
545
- low_memory=False,
546
- )
547
- except pd.errors.EmptyDataError:
548
- _log(f"Incremental file {path} is empty; treating as no-op.")
549
- return None, path
550
- except Exception as exc:
551
- _log(f"Failed to load incremental file {path}: {exc}")
552
- return None, path
553
- return df, path
554
-
555
- def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
556
- if inc_df is None or inc_df.empty:
557
- merged = base_df.copy(deep=True)
558
- return merged.reset_index(drop=True)
559
- frames = []
560
- tag = self.args.tag_new_column
561
- if tag:
562
- base_part = base_df.copy(deep=True)
563
- base_part[tag] = 0
564
- inc_part = inc_df.copy(deep=True)
565
- inc_part[tag] = 1
566
- frames = [base_part, inc_part]
567
- else:
568
- frames = [base_df, inc_df]
569
- merged = pd.concat(frames, ignore_index=True, sort=False)
570
- if self.timestamp_col and self.timestamp_col in merged.columns:
571
- merged = merged.sort_values(
572
- self.timestamp_col,
573
- ascending=self.timestamp_ascending,
574
- kind="mergesort",
575
- )
576
- if self.merge_keys:
577
- missing = [col for col in self.merge_keys if col not in merged.columns]
578
- if missing:
579
- raise KeyError(f"Merge keys {missing} not found in merged frame for {self.merge_keys}.")
580
- merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
581
- return merged.reset_index(drop=True)
582
-
583
- def _should_train(self, new_rows: int) -> bool:
584
- if self.args.train_without_incremental:
585
- return True
586
- min_needed = max(0, self.args.min_new_rows)
587
- return new_rows >= min_needed
588
-
589
- def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
590
- dest.parent.mkdir(parents=True, exist_ok=True)
591
- fmt = str(self.data_format or "csv").lower()
592
- if fmt == "auto":
593
- fmt = _infer_format_from_path(dest)
594
- if fmt == "parquet":
595
- df.to_parquet(dest, index=False)
596
- elif fmt == "feather":
597
- df.reset_index(drop=True).to_feather(dest)
598
- else:
599
- df.to_csv(dest, index=False)
600
- _log(f"Wrote {len(df)} rows to {dest} ({reason}).")
601
-
602
- def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
603
- if not 0 < self.prop_test < 1:
604
- raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
605
- if len(merged) < 2:
606
- raise ValueError("Need at least two rows to form a train/test split.")
607
- train_df, test_df = split_train_test(
608
- merged,
609
- holdout_ratio=self.prop_test,
610
- strategy=self.split_strategy,
611
- group_col=self.split_group_col,
612
- time_col=self.split_time_col,
613
- time_ascending=self.split_time_ascending,
614
- rand_seed=self.rand_seed,
615
- reset_index_mode="always",
616
- ratio_label="prop_test",
617
- validate_ratio=False,
618
- )
619
- return train_df, test_df
620
-
621
- def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
622
- requested = self.args.model_keys
623
- if "all" in requested:
624
- requested = ["glm", "xgb", "resn", "ft", "gnn"]
625
- requested = dedupe_preserve_order(requested)
626
- missing = [key for key in requested if key not in trainer_map]
627
- for key in missing:
628
- _log(f"Trainer '{key}' is not available for this context and will be skipped.")
629
- return [key for key in requested if key in trainer_map]
630
-
631
- def _train_single_model(
632
- self,
633
- model_name: str,
634
- merged: pd.DataFrame,
635
- new_rows: int,
636
- incremental_path: Optional[Path],
637
- ) -> Dict[str, Any]:
638
- merged = merged.copy(deep=True)
639
- merged.fillna(0, inplace=True)
640
- train_df, test_df = self._prepare_splits(merged)
641
- model = ropt.BayesOptModel(
642
- train_df,
643
- test_df,
644
- model_name,
645
- self.cfg["target"],
646
- self.cfg["weight"],
647
- self.cfg["feature_list"],
648
- task_type=self.cfg.get("task_type", "regression"),
649
- binary_resp_nme=self.binary_resp,
650
- cate_list=self.cfg.get("categorical_features"),
651
- prop_test=self.prop_test,
652
- rand_seed=self.rand_seed,
653
- epochs=self.epochs,
654
- use_gpu=bool(self.cfg.get("use_gpu", True)),
655
- use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
656
- use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
657
- use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
658
- use_resn_ddp=self.cfg.get("use_resn_ddp", False),
659
- use_ft_ddp=self.cfg.get("use_ft_ddp", False),
660
- use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
661
- output_dir=str(self.output_root) if self.output_root else None,
662
- xgb_max_depth_max=self.xgb_max_depth_max,
663
- xgb_n_estimators_max=self.xgb_n_estimators_max,
664
- resn_weight_decay=self.cfg.get("resn_weight_decay"),
665
- final_ensemble=bool(self.cfg.get("final_ensemble", False)),
666
- final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
667
- final_refit=bool(self.cfg.get("final_refit", True)),
668
- optuna_storage=self.optuna_storage,
669
- optuna_study_prefix=self.optuna_study_prefix,
670
- best_params_files=self.best_params_files,
671
- reuse_best_params=self.reuse_best_params,
672
- gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
673
- gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
674
- gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
675
- gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
676
- gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
677
- gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
678
- region_province_col=self.cfg.get("region_province_col"),
679
- region_city_col=self.cfg.get("region_city_col"),
680
- region_effect_alpha=self.cfg.get("region_effect_alpha"),
681
- geo_feature_nmes=self.cfg.get("geo_feature_nmes"),
682
- geo_token_hidden_dim=self.cfg.get("geo_token_hidden_dim"),
683
- geo_token_layers=self.cfg.get("geo_token_layers"),
684
- geo_token_dropout=self.cfg.get("geo_token_dropout"),
685
- geo_token_k_neighbors=self.cfg.get("geo_token_k_neighbors"),
686
- geo_token_learning_rate=self.cfg.get("geo_token_learning_rate"),
687
- geo_token_epochs=self.cfg.get("geo_token_epochs"),
688
- ft_role=str(self.cfg.get("ft_role", "model")),
689
- ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
690
- ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
691
- infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
692
- infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
693
- cv_strategy=self.cv_strategy or self.split_strategy,
694
- cv_group_col=self.cv_group_col or self.split_group_col,
695
- cv_time_col=self.cv_time_col or self.split_time_col,
696
- cv_time_ascending=self.cv_time_ascending,
697
- cv_splits=self.cv_splits,
698
- ft_oof_folds=self.ft_oof_folds,
699
- ft_oof_strategy=self.ft_oof_strategy,
700
- ft_oof_shuffle=self.ft_oof_shuffle,
701
- save_preprocess=self.save_preprocess,
702
- preprocess_artifact_path=self.preprocess_artifact_path,
703
- plot_path_style=self.plot_path_style,
704
- bo_sample_limit=self.bo_sample_limit,
705
- cache_predictions=self.cache_predictions,
706
- prediction_cache_dir=self.prediction_cache_dir,
707
- prediction_cache_format=self.prediction_cache_format,
708
- )
709
-
710
- if self.plot_requested and not self.args.dry_run:
711
- plot_cfg = self.cfg.get("plot", {})
712
- legacy_flags = {
713
- "glm": self.cfg.get("plot_lift_glm", False),
714
- "xgb": self.cfg.get("plot_lift_xgb", False),
715
- "resn": self.cfg.get("plot_lift_resn", False),
716
- "ft": self.cfg.get("plot_lift_ft", False),
717
- }
718
- plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
719
- if plot_enabled and plot_cfg.get("pre_oneway", False) and plot_cfg.get("oneway", True):
720
- n_bins = int(plot_cfg.get("n_bins", 10))
721
- model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/pre")
722
-
723
- requested_keys = self._requested_model_keys(model.trainers)
724
- executed_keys: List[str] = []
725
- param_sources: Dict[str, str] = {}
726
-
727
- if self.args.dry_run:
728
- _log(f"Dry run: would train {requested_keys} for {model_name}.")
729
- return {
730
- "executed_keys": executed_keys,
731
- "param_sources": param_sources,
732
- "model": model,
733
- }
734
-
735
- if self.args.force_retune and self.args.max_evals <= 0:
736
- raise ValueError("force_retune requires --max-evals > 0.")
737
-
738
- force_retune = bool(self.args.force_retune)
739
- if force_retune:
740
- model.config.reuse_best_params = False
741
- model.config.best_params_files = {}
742
-
743
- ft_role = str(getattr(model.config, "ft_role", "model"))
744
- if ft_role != "model" and "ft" in requested_keys:
745
- requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]
746
-
747
- for key in requested_keys:
748
- trainer = model.trainers[key]
749
-
750
- if force_retune:
751
- trainer.best_params = None
752
- trainer.best_trial = None
753
- param_sources[key] = "retune"
754
- else:
755
- best_params = _load_best_params(model, trainer, silent=True)
756
- if best_params:
757
- trainer.best_params = best_params
758
- trainer.best_trial = None
759
- param_sources[key] = "loaded"
760
- else:
761
- if not self.args.retune_missing:
762
- _log(
763
- f"Skipping {model_name}/{key}: no best params and retuning disabled."
764
- )
765
- continue
766
- param_sources[key] = "retune"
767
-
768
- if (trainer.best_params is None) and self.args.max_evals <= 0:
769
- raise ValueError("--max-evals must be positive when retuning is requested.")
770
-
771
- model.optimize_model(key, max_evals=self.args.max_evals)
772
- trainer.save()
773
- executed_keys.append(key)
774
- if key in PYTORCH_TRAINERS:
775
- ropt.free_cuda()
776
-
777
- snapshot = {
778
- "mode": "incremental_train",
779
- "model_name": model_name,
780
- "model_key": key,
781
- "timestamp": datetime.now().isoformat(),
782
- "param_source": param_sources[key],
783
- "best_params": _to_serializable(trainer.best_params or {}),
784
- "incremental_rows": new_rows,
785
- "train_rows": len(model.train_data),
786
- "test_rows": len(model.test_data),
787
- "incremental_path": str(incremental_path) if incremental_path else None,
788
- "config": asdict(model.config),
789
- }
790
- model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)
791
-
792
- if not executed_keys:
793
- _log(f"No trainers executed for {model_name}.")
794
-
795
- return {
796
- "executed_keys": executed_keys,
797
- "param_sources": param_sources,
798
- "model": model,
799
- }
800
-
801
- def process(self) -> None:
802
- total_trained = 0
803
- for model_name in self.model_names:
804
- total_trained += self._process_single_model(model_name)
805
- if self.args.summary_json and self.summary_records:
806
- summary_path = self.args.summary_json.resolve()
807
- summary_path.parent.mkdir(parents=True, exist_ok=True)
808
- summary_payload = _to_serializable(self.summary_records)
809
- summary_path.write_text(json.dumps(summary_payload, indent=2, ensure_ascii=False), encoding="utf-8")
810
- _log(f"Summary written to {summary_path}.")
811
- _log(f"Finished incremental update for {total_trained} dataset(s).")
812
-
813
- def _process_single_model(self, model_name: str) -> int:
814
- base_path = resolve_data_path(
815
- self.data_dir,
816
- model_name,
817
- data_format=self.data_format,
818
- path_template=self.data_path_template,
819
- )
820
- if not base_path.exists():
821
- _log(f"Base dataset {base_path} not found; skipping {model_name}.")
822
- self.summary_records.append({
823
- "model_name": model_name,
824
- "status": "missing_base",
825
- })
826
- return 0
827
-
828
- base_df = load_dataset(
829
- base_path,
830
- data_format=self.data_format,
831
- dtype_map=self.dtype_map,
832
- low_memory=False,
833
- )
834
- inc_df, inc_path = self._load_incremental_df(model_name)
835
- if inc_df is None and self.incremental_dir and self.args.strict_incremental and not self.args.train_without_incremental:
836
- raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
837
-
838
- new_rows = 0 if inc_df is None else len(inc_df)
839
- _log(f"{model_name}: {len(base_df)} base rows, {new_rows} incremental rows.")
840
- merged_df = self._merge_frames(base_df, inc_df)
841
- merged_df.fillna(0, inplace=True)
842
-
843
- if self.args.update_base_data and not self.args.dry_run:
844
- self._write_dataset(merged_df, base_path, "update_base_data")
845
- if self.args.persist_merged_dir and not self.args.dry_run:
846
- suffix = base_path.suffix or ".csv"
847
- dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}{suffix}"
848
- self._write_dataset(merged_df, dest, "persist_merged_dir")
849
-
850
- if not self._should_train(new_rows):
851
- _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
852
- self.summary_records.append({
853
- "model_name": model_name,
854
- "status": "skipped_no_incremental",
855
- "new_rows": new_rows,
856
- "total_rows": len(merged_df),
857
- })
858
- return 0
859
-
860
- try:
861
- train_result = self._train_single_model(model_name, merged_df, new_rows, inc_path)
862
- except Exception as exc:
863
- _log(f"Training failed for {model_name}: {exc}")
864
- self.summary_records.append({
865
- "model_name": model_name,
866
- "status": "failed",
867
- "error": str(exc),
868
- "new_rows": new_rows,
869
- "total_rows": len(merged_df),
870
- })
871
- return 0
872
-
873
- executed = train_result["executed_keys"]
874
- param_sources = train_result["param_sources"]
875
- model = train_result["model"]
876
- status = "dry_run" if self.args.dry_run else "trained"
877
-
878
- summary = {
879
- "model_name": model_name,
880
- "status": status,
881
- "trained_models": executed,
882
- "param_sources": param_sources,
883
- "new_rows": new_rows,
884
- "total_rows": len(merged_df),
885
- "incremental_path": str(inc_path) if inc_path else None,
886
- }
887
- self.summary_records.append(summary)
888
-
889
- if not self.args.dry_run and self.plot_requested and executed:
890
- _plot_curves_for_model(model, executed, self.cfg)
891
-
892
- return 1 if executed else 0
893
-
894
-
895
- def main() -> None:
896
- if configure_run_logging:
897
- configure_run_logging(prefix="bayesopt_incremental")
898
- args = _parse_args()
899
- runner = IncrementalUpdateRunner(args)
900
- runner.process()
901
-
902
-
903
- if __name__ == "__main__":
904
- main()
37
+ from ins_pricing.cli.utils.import_resolver import resolve_imports, setup_sys_path
38
+
39
+ setup_sys_path()
40
+ _imports = resolve_imports()
41
+
42
+ ropt = _imports.bayesopt
43
+ if ropt is None: # pragma: no cover
44
+ raise ImportError("Failed to resolve ins_pricing.bayesopt for incremental CLI.")
45
+
46
+ PLOT_MODEL_LABELS = _imports.PLOT_MODEL_LABELS
47
+ PYTORCH_TRAINERS = _imports.PYTORCH_TRAINERS
48
+ build_model_names = _imports.build_model_names
49
+ dedupe_preserve_order = _imports.dedupe_preserve_order
50
+ load_dataset = _imports.load_dataset
51
+ parse_model_pairs = _imports.parse_model_pairs
52
+ resolve_data_path = _imports.resolve_data_path
53
+ resolve_path = _imports.resolve_path
54
+ split_train_test = _imports.split_train_test
55
+
56
+ add_config_json_arg = _imports.add_config_json_arg
57
+ resolve_and_load_config = _imports.resolve_and_load_config
58
+ resolve_data_config = _imports.resolve_data_config
59
+ resolve_split_config = _imports.resolve_split_config
60
+ resolve_runtime_config = _imports.resolve_runtime_config
61
+ resolve_output_dirs = _imports.resolve_output_dirs
62
+
63
+ configure_run_logging = _imports.configure_run_logging
64
+
65
+
66
+ def _log(message: str) -> None:
67
+ print(f"[Incremental] {message}")
68
+
69
+
70
+ def _parse_args() -> argparse.Namespace:
71
+ parser = argparse.ArgumentParser(
72
+ description="Incrementally retrain BayesOpt models using new batches of data."
73
+ )
74
+ add_config_json_arg(
75
+ parser,
76
+ help_text="Path to the JSON config that cli/BayesOpt_entry.py uses.",
77
+ )
78
+ parser.add_argument(
79
+ "--model-names",
80
+ nargs="+",
81
+ default=None,
82
+ help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
83
+ )
84
+ parser.add_argument(
85
+ "--model-keys",
86
+ nargs="+",
87
+ default=["glm", "xgb", "resn", "ft"],
88
+ choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
89
+ help="Which trainers to run for each dataset."
90
+ )
91
+ parser.add_argument(
92
+ "--incremental-dir",
93
+ type=Path,
94
+ default=None,
95
+ help="Directory containing <model_name> incremental CSVs."
96
+ )
97
+ parser.add_argument(
98
+ "--incremental-file",
99
+ type=Path,
100
+ default=None,
101
+ help="Single incremental CSV (requires --model-names with exactly one entry)."
102
+ )
103
+ parser.add_argument(
104
+ "--incremental-template",
105
+ default="{model_name}_incremental.csv",
106
+ help="Filename template when --incremental-dir is provided."
107
+ )
108
+ parser.add_argument(
109
+ "--merge-keys",
110
+ nargs="+",
111
+ default=None,
112
+ help="Column(s) used to drop duplicate rows after merging base and incremental data."
113
+ )
114
+ parser.add_argument(
115
+ "--dedupe-keep",
116
+ choices=["first", "last"],
117
+ default="last",
118
+ help="How pandas.drop_duplicates resolves conflicts on merge keys."
119
+ )
120
+ parser.add_argument(
121
+ "--timestamp-col",
122
+ default=None,
123
+ help="Optional column used to sort rows before deduplication."
124
+ )
125
+ parser.add_argument(
126
+ "--timestamp-descending",
127
+ action="store_true",
128
+ help="Sort timestamp column in descending order before deduplication."
129
+ )
130
+ parser.add_argument(
131
+ "--min-new-rows",
132
+ type=int,
133
+ default=1,
134
+ help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
135
+ )
136
+ parser.add_argument(
137
+ "--train-without-incremental",
138
+ action="store_true",
139
+ help="Always retrain even when no incremental file is present."
140
+ )
141
+ parser.add_argument(
142
+ "--strict-incremental",
143
+ action="store_true",
144
+ help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
145
+ )
146
+ parser.add_argument(
147
+ "--tag-new-column",
148
+ default=None,
149
+ help="If set, store 1 for incremental rows and 0 for historical rows in this column."
150
+ )
151
+ parser.add_argument(
152
+ "--max-evals",
153
+ type=int,
154
+ default=25,
155
+ help="Optuna trial count when retuning is required."
156
+ )
157
+ parser.add_argument(
158
+ "--retune-missing",
159
+ dest="retune_missing",
160
+ action="store_true",
161
+ default=True,
162
+ help="Retune models whose best-params CSV is unavailable (default)."
163
+ )
164
+ parser.add_argument(
165
+ "--skip-retune-missing",
166
+ dest="retune_missing",
167
+ action="store_false",
168
+ help="Do not retune when best params are missing; such models are skipped."
169
+ )
170
+ parser.add_argument(
171
+ "--force-retune",
172
+ action="store_true",
173
+ help="Run Optuna tuning even if historical best params exist."
174
+ )
175
+ parser.add_argument(
176
+ "--prop-test",
177
+ type=float,
178
+ default=None,
179
+ help="Override the test split proportion defined in the config file."
180
+ )
181
+ parser.add_argument(
182
+ "--rand-seed",
183
+ type=int,
184
+ default=None,
185
+ help="Override the random seed defined in the config."
186
+ )
187
+ parser.add_argument(
188
+ "--epochs",
189
+ type=int,
190
+ default=None,
191
+ help="Override the epoch count from the config."
192
+ )
193
+ parser.add_argument(
194
+ "--output-dir",
195
+ type=Path,
196
+ default=None,
197
+ help="Override the BayesOpt output root (models/results/plots)."
198
+ )
199
+ parser.add_argument(
200
+ "--update-base-data",
201
+ action="store_true",
202
+ help="Overwrite the base CSVs with the merged dataset after a successful update."
203
+ )
204
+ parser.add_argument(
205
+ "--persist-merged-dir",
206
+ type=Path,
207
+ default=None,
208
+ help="Optional directory to store the merged dataset snapshots."
209
+ )
210
+ parser.add_argument(
211
+ "--summary-json",
212
+ type=Path,
213
+ default=None,
214
+ help="Write a JSON summary of processed datasets to this path."
215
+ )
216
+ parser.add_argument(
217
+ "--plot-curves",
218
+ action="store_true",
219
+ help="Run one-way/lift plots after training (config plot settings also apply)."
220
+ )
221
+ parser.add_argument(
222
+ "--dry-run",
223
+ action="store_true",
224
+ help="Merge and report counts but skip training, saving and plotting."
225
+ )
226
+ args = parser.parse_args()
227
+
228
+ if args.incremental_file and args.incremental_dir:
229
+ parser.error("Use either --incremental-dir or --incremental-file, not both.")
230
+ if args.incremental_file and args.model_names and len(args.model_names) != 1:
231
+ parser.error("--incremental-file can only be used when updating exactly one model.")
232
+ if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
233
+ parser.error(
234
+ "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
235
+ )
236
+ return args
237
+
238
+
239
+ def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
240
+ plot_cfg = cfg.get("plot", {})
241
+ legacy_flags = {
242
+ "glm": cfg.get("plot_lift_glm", False),
243
+ "xgb": cfg.get("plot_lift_xgb", False),
244
+ "resn": cfg.get("plot_lift_resn", False),
245
+ "ft": cfg.get("plot_lift_ft", False),
246
+ }
247
+ plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
248
+ if not plot_enabled:
249
+ return
250
+
251
+ n_bins = int(plot_cfg.get("n_bins", 10))
252
+ oneway_enabled = plot_cfg.get("oneway", True)
253
+ available = dedupe_preserve_order([k for k in trained if k in PLOT_MODEL_LABELS])
254
+
255
+ lift_models = plot_cfg.get("lift_models")
256
+ if lift_models is None:
257
+ lift_models = [m for m, flag in legacy_flags.items() if flag]
258
+ if not lift_models:
259
+ lift_models = available
260
+ lift_models = dedupe_preserve_order([m for m in lift_models if m in available])
261
+
262
+ if oneway_enabled:
263
+ oneway_pred = bool(plot_cfg.get("oneway_pred", False))
264
+ oneway_pred_models = plot_cfg.get("oneway_pred_models")
265
+ pred_plotted = False
266
+ if oneway_pred:
267
+ if oneway_pred_models is None:
268
+ oneway_pred_models = lift_models or available
269
+ oneway_pred_models = dedupe_preserve_order(
270
+ [m for m in oneway_pred_models if m in available]
271
+ )
272
+ for model_key in oneway_pred_models:
273
+ label, pred_nme = PLOT_MODEL_LABELS[model_key]
274
+ if pred_nme not in model.train_data.columns:
275
+ print(
276
+ f"[Oneway] Missing prediction column '{pred_nme}'; skip.",
277
+ flush=True,
278
+ )
279
+ continue
280
+ model.plot_oneway(
281
+ n_bins=n_bins,
282
+ pred_col=pred_nme,
283
+ pred_label=label,
284
+ plot_subdir="oneway/post",
285
+ )
286
+ pred_plotted = True
287
+ if not oneway_pred or not pred_plotted:
288
+ model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/post")
289
+ if not available:
290
+ return
291
+
292
+ for key in lift_models:
293
+ label, pred_nme = PLOT_MODEL_LABELS[key]
294
+ model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)
295
+
296
+ if not plot_cfg.get("double_lift", True) or len(available) < 2:
297
+ return
298
+
299
+ raw_pairs = plot_cfg.get("double_lift_pairs")
300
+ if raw_pairs:
301
+ pairs = [
302
+ (a, b)
303
+ for a, b in parse_model_pairs(raw_pairs)
304
+ if a in available and b in available and a != b
305
+ ]
306
+ else:
307
+ pairs = [(a, b) for i, a in enumerate(available) for b in available[i + 1 :]]
308
+ for first, second in pairs:
309
+ model.plot_dlift([first, second], n_bins=n_bins)
310
+
311
+
312
+ def _coerce_scalar(value: Any) -> Any:
313
+ if isinstance(value, str):
314
+ lowered = value.strip().lower()
315
+ if lowered in {"", "none", "nan"}:
316
+ return None
317
+ if lowered in {"true", "false"}:
318
+ return lowered == "true"
319
+ return value
320
+ if hasattr(value, "item"):
321
+ try:
322
+ return value.item()
323
+ except Exception:
324
+ return value
325
+ return value
326
+
327
+
328
+ def _infer_format_from_path(path: Path) -> str:
329
+ suffix = path.suffix.lower()
330
+ if suffix in {".parquet", ".pq"}:
331
+ return "parquet"
332
+ if suffix in {".feather", ".ft"}:
333
+ return "feather"
334
+ return "csv"
335
+
336
+
337
+ def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
338
+ label = trainer.label.lower()
339
+ result_dir = Path(model.output_manager.result_dir)
340
+ path = result_dir / f"{model.model_nme}_bestparams_{label}.csv"
341
+ if not path.exists():
342
+ if not silent:
343
+ _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
344
+ return None
345
+ try:
346
+ params_raw = ropt.IOUtils.load_params_file(str(path))
347
+ except Exception:
348
+ return None
349
+ return {
350
+ key: _coerce_scalar(val)
351
+ for key, val in (params_raw or {}).items()
352
+ if not pd.isna(val)
353
+ }
354
+
355
+
356
+ def _to_serializable(obj: Any) -> Any:
357
+ if isinstance(obj, dict):
358
+ return {k: _to_serializable(v) for k, v in obj.items()}
359
+ if isinstance(obj, list):
360
+ return [_to_serializable(v) for v in obj]
361
+ if hasattr(obj, "item"):
362
+ try:
363
+ return obj.item()
364
+ except Exception:
365
+ return str(obj)
366
+ return obj
367
+
368
+
369
+ class IncrementalUpdateRunner:
370
+ def __init__(self, args: argparse.Namespace) -> None:
371
+ self.args = args
372
+ script_dir = Path(__file__).resolve().parents[1]
373
+ self.config_path, self.cfg = resolve_and_load_config(
374
+ args.config_json,
375
+ script_dir,
376
+ required_keys=[
377
+ "data_dir",
378
+ "model_list",
379
+ "model_categories",
380
+ "target",
381
+ "weight",
382
+ "feature_list",
383
+ "categorical_features",
384
+ ],
385
+ )
386
+ data_dir, data_format, data_path_template, dtype_map = resolve_data_config(
387
+ self.cfg,
388
+ self.config_path,
389
+ create_data_dir=True,
390
+ )
391
+ self.data_dir = data_dir
392
+ self.data_format = data_format
393
+ self.data_path_template = data_path_template
394
+ self.dtype_map = dtype_map
395
+ split_cfg = resolve_split_config(self.cfg)
396
+ runtime_cfg = resolve_runtime_config(self.cfg)
397
+ output_cfg = resolve_output_dirs(
398
+ self.cfg,
399
+ self.config_path,
400
+ output_override=args.output_dir,
401
+ )
402
+ self.runtime_cfg = runtime_cfg
403
+ self.prop_test = args.prop_test if args.prop_test is not None else split_cfg["prop_test"]
404
+ self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
405
+ self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
406
+ self.split_strategy = split_cfg["split_strategy"]
407
+ self.split_group_col = split_cfg["split_group_col"]
408
+ self.split_time_col = split_cfg["split_time_col"]
409
+ self.split_time_ascending = split_cfg["split_time_ascending"]
410
+ self.cv_strategy = split_cfg["cv_strategy"]
411
+ self.cv_group_col = split_cfg["cv_group_col"]
412
+ self.cv_time_col = split_cfg["cv_time_col"]
413
+ self.cv_time_ascending = split_cfg["cv_time_ascending"]
414
+ self.cv_splits = split_cfg["cv_splits"]
415
+ self.ft_oof_folds = split_cfg["ft_oof_folds"]
416
+ self.ft_oof_strategy = split_cfg["ft_oof_strategy"]
417
+ self.ft_oof_shuffle = split_cfg["ft_oof_shuffle"]
418
+ self.save_preprocess = runtime_cfg["save_preprocess"]
419
+ self.preprocess_artifact_path = runtime_cfg["preprocess_artifact_path"]
420
+ self.bo_sample_limit = runtime_cfg["bo_sample_limit"]
421
+ self.cache_predictions = runtime_cfg["cache_predictions"]
422
+ self.prediction_cache_dir = runtime_cfg["prediction_cache_dir"]
423
+ self.prediction_cache_format = runtime_cfg["prediction_cache_format"]
424
+ self.plot_path_style = runtime_cfg["plot_path_style"]
425
+ self.xgb_max_depth_max = runtime_cfg["xgb_max_depth_max"]
426
+ self.xgb_n_estimators_max = runtime_cfg["xgb_n_estimators_max"]
427
+ self.optuna_storage = runtime_cfg["optuna_storage"]
428
+ self.optuna_study_prefix = runtime_cfg["optuna_study_prefix"]
429
+ self.best_params_files = runtime_cfg["best_params_files"]
430
+ self.reuse_best_params = runtime_cfg["reuse_best_params"]
431
+ self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
432
+ self.model_names = self._resolve_model_names(args.model_names)
433
+ self.merge_keys = list(args.merge_keys or [])
434
+ self.timestamp_col = args.timestamp_col
435
+ self.timestamp_ascending = not args.timestamp_descending
436
+ self.output_root = output_cfg["output_dir"]
437
+
438
+ self.incremental_dir = None
439
+ if args.incremental_dir is not None:
440
+ self.incremental_dir = args.incremental_dir
441
+ if not self.incremental_dir.is_absolute():
442
+ self.incremental_dir = (self.config_path.parent / self.incremental_dir).resolve()
443
+ else:
444
+ self.incremental_dir = self.incremental_dir.resolve()
445
+ self.incremental_file = None
446
+ if args.incremental_file is not None:
447
+ self.incremental_file = args.incremental_file
448
+ if not self.incremental_file.is_absolute():
449
+ self.incremental_file = (self.config_path.parent / self.incremental_file).resolve()
450
+ else:
451
+ self.incremental_file = self.incremental_file.resolve()
452
+ self.summary_records: List[Dict[str, Any]] = []
453
+ self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")
454
+
455
+ if self.incremental_file and len(self.model_names) != 1:
456
+ raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")
457
+
458
+ def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
459
+ if override:
460
+ return dedupe_preserve_order([str(item) for item in override])
461
+ prefixes = self.cfg["model_list"]
462
+ suffixes = self.cfg["model_categories"]
463
+ return build_model_names(prefixes, suffixes)
464
+
465
+    def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
+        path: Optional[Path] = None
+        if self.incremental_file:
+            path = self.incremental_file
+        elif self.incremental_dir:
+            rel = self.args.incremental_template.format(model_name=model_name)
+            path = (self.incremental_dir / rel).resolve()
+        if not path or not path.exists():
+            return None, None
+        try:
+            df = load_dataset(
+                path,
+                data_format="auto",
+                dtype_map=self.dtype_map,
+                low_memory=False,
+            )
+        except pd.errors.EmptyDataError:
+            _log(f"Incremental file {path} is empty; treating as no-op.")
+            return None, path
+        except Exception as exc:
+            _log(f"Failed to load incremental file {path}: {exc}")
+            return None, path
+        return df, path
+
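When an incremental directory is supplied rather than a single file, the per-model file name comes from `args.incremental_template`, a `str.format` pattern with a `model_name` placeholder. A quick illustration (the template value here is hypothetical):

```python
# Hypothetical template value; the real default is defined by the CLI parser elsewhere in this file.
template = "{model_name}_incremental.parquet"
print(template.format(model_name="motor_own_damage"))  # -> motor_own_damage_incremental.parquet
```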
+    def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
+        if inc_df is None or inc_df.empty:
+            merged = base_df.copy(deep=True)
+            return merged.reset_index(drop=True)
+        frames = []
+        tag = self.args.tag_new_column
+        if tag:
+            base_part = base_df.copy(deep=True)
+            base_part[tag] = 0
+            inc_part = inc_df.copy(deep=True)
+            inc_part[tag] = 1
+            frames = [base_part, inc_part]
+        else:
+            frames = [base_df, inc_df]
+        merged = pd.concat(frames, ignore_index=True, sort=False)
+        if self.timestamp_col and self.timestamp_col in merged.columns:
+            merged = merged.sort_values(
+                self.timestamp_col,
+                ascending=self.timestamp_ascending,
+                kind="mergesort",
+            )
+        if self.merge_keys:
+            missing = [col for col in self.merge_keys if col not in merged.columns]
+            if missing:
+                raise KeyError(f"Merge keys {missing} not found in merged frame (requested merge keys: {self.merge_keys}).")
+            merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
+        return merged.reset_index(drop=True)
+
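`_merge_frames` concatenates the base and incremental frames, optionally tags incremental rows, applies a stable sort on the timestamp column, and then de-duplicates on the merge keys so that `args.dedupe_keep` decides which copy of a duplicated key survives. A small pandas sketch of the same semantics, with made-up column names:

```python
import pandas as pd

base = pd.DataFrame({"policy_id": [1, 2], "premium": [100.0, 200.0], "asof": ["2024-01", "2024-01"]})
inc = pd.DataFrame({"policy_id": [2, 3], "premium": [210.0, 300.0], "asof": ["2024-02", "2024-02"]})

base["is_new"] = 0  # tag_new_column behaviour: 0 for base rows
inc["is_new"] = 1   # 1 for incremental rows

merged = pd.concat([base, inc], ignore_index=True, sort=False)
merged = merged.sort_values("asof", ascending=True, kind="mergesort")   # stable timestamp sort
merged = merged.drop_duplicates(subset=["policy_id"], keep="last")      # dedupe_keep="last": newest row wins
print(merged.reset_index(drop=True))  # policy 2 keeps the 2024-02 record
```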
+    def _should_train(self, new_rows: int) -> bool:
+        if self.args.train_without_incremental:
+            return True
+        min_needed = max(0, self.args.min_new_rows)
+        return new_rows >= min_needed
+
+    def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        fmt = str(self.data_format or "csv").lower()
+        if fmt == "auto":
+            fmt = _infer_format_from_path(dest)
+        if fmt == "parquet":
+            df.to_parquet(dest, index=False)
+        elif fmt == "feather":
+            df.reset_index(drop=True).to_feather(dest)
+        else:
+            df.to_csv(dest, index=False)
+        _log(f"Wrote {len(df)} rows to {dest} ({reason}).")
+
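`_write_dataset` dispatches on the configured data format and, when the format is `auto`, falls back to `_infer_format_from_path` (defined earlier in this module and not shown here). A rough stand-in for that dispatch, assuming suffix-based inference:

```python
from pathlib import Path
import pandas as pd

def write_by_suffix(df: pd.DataFrame, dest: Path) -> None:
    # Illustrative only: pick a writer from the destination suffix, defaulting to CSV.
    suffix = dest.suffix.lower()
    if suffix == ".parquet":
        df.to_parquet(dest, index=False)
    elif suffix == ".feather":
        df.reset_index(drop=True).to_feather(dest)
    else:
        df.to_csv(dest, index=False)
```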
+    def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        if not 0 < self.prop_test < 1:
+            raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
+        if len(merged) < 2:
+            raise ValueError("Need at least two rows to form a train/test split.")
+        train_df, test_df = split_train_test(
+            merged,
+            holdout_ratio=self.prop_test,
+            strategy=self.split_strategy,
+            group_col=self.split_group_col,
+            time_col=self.split_time_col,
+            time_ascending=self.split_time_ascending,
+            rand_seed=self.rand_seed,
+            reset_index_mode="always",
+            ratio_label="prop_test",
+            validate_ratio=False,
+        )
+        return train_df, test_df
+
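`_prepare_splits` validates `prop_test` locally and then delegates to `split_train_test`, which also handles grouped and time-ordered strategies. For the plain random strategy the effect is comparable to a seeded shuffle-and-slice; a simplified illustration (not the library's implementation):

```python
import pandas as pd

def random_holdout(df: pd.DataFrame, prop_test: float, rand_seed: int = 42):
    # Simplified random holdout: shuffle with a fixed seed, slice off the tail as the test set.
    if not 0 < prop_test < 1:
        raise ValueError(f"prop_test must fall in (0, 1); got {prop_test}.")
    shuffled = df.sample(frac=1.0, random_state=rand_seed).reset_index(drop=True)
    n_test = max(1, int(round(len(shuffled) * prop_test)))
    return shuffled.iloc[:-n_test], shuffled.iloc[-n_test:]
```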
+    def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
+        requested = self.args.model_keys
+        if "all" in requested:
+            requested = ["glm", "xgb", "resn", "ft", "gnn"]
+        requested = dedupe_preserve_order(requested)
+        missing = [key for key in requested if key not in trainer_map]
+        for key in missing:
+            _log(f"Trainer '{key}' is not available for this context and will be skipped.")
+        return [key for key in requested if key in trainer_map]
+
+    def _train_single_model(
+        self,
+        model_name: str,
+        merged: pd.DataFrame,
+        new_rows: int,
+        incremental_path: Optional[Path],
+    ) -> Dict[str, Any]:
+        merged = merged.copy(deep=True)
+        merged.fillna(0, inplace=True)
+        train_df, test_df = self._prepare_splits(merged)
+        model = ropt.BayesOptModel(
+            train_df,
+            test_df,
+            model_name,
+            self.cfg["target"],
+            self.cfg["weight"],
+            self.cfg["feature_list"],
+            task_type=self.cfg.get("task_type", "regression"),
+            binary_resp_nme=self.binary_resp,
+            cate_list=self.cfg.get("categorical_features"),
+            prop_test=self.prop_test,
+            rand_seed=self.rand_seed,
+            epochs=self.epochs,
+            use_gpu=bool(self.cfg.get("use_gpu", True)),
+            use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
+            use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
+            use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
+            use_resn_ddp=self.cfg.get("use_resn_ddp", False),
+            use_ft_ddp=self.cfg.get("use_ft_ddp", False),
+            use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
+            output_dir=str(self.output_root) if self.output_root else None,
+            xgb_max_depth_max=self.xgb_max_depth_max,
+            xgb_n_estimators_max=self.xgb_n_estimators_max,
+            resn_weight_decay=self.cfg.get("resn_weight_decay"),
+            final_ensemble=bool(self.cfg.get("final_ensemble", False)),
+            final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
+            final_refit=bool(self.cfg.get("final_refit", True)),
+            optuna_storage=self.optuna_storage,
+            optuna_study_prefix=self.optuna_study_prefix,
+            best_params_files=self.best_params_files,
+            reuse_best_params=self.reuse_best_params,
+            gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
+            gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
+            gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
+            gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
+            gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
+            gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
+            region_province_col=self.cfg.get("region_province_col"),
+            region_city_col=self.cfg.get("region_city_col"),
+            region_effect_alpha=self.cfg.get("region_effect_alpha"),
+            geo_feature_nmes=self.cfg.get("geo_feature_nmes"),
+            geo_token_hidden_dim=self.cfg.get("geo_token_hidden_dim"),
+            geo_token_layers=self.cfg.get("geo_token_layers"),
+            geo_token_dropout=self.cfg.get("geo_token_dropout"),
+            geo_token_k_neighbors=self.cfg.get("geo_token_k_neighbors"),
+            geo_token_learning_rate=self.cfg.get("geo_token_learning_rate"),
+            geo_token_epochs=self.cfg.get("geo_token_epochs"),
+            ft_role=str(self.cfg.get("ft_role", "model")),
+            ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
+            ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
+            infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
+            infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
+            cv_strategy=self.cv_strategy or self.split_strategy,
+            cv_group_col=self.cv_group_col or self.split_group_col,
+            cv_time_col=self.cv_time_col or self.split_time_col,
+            cv_time_ascending=self.cv_time_ascending,
+            cv_splits=self.cv_splits,
+            ft_oof_folds=self.ft_oof_folds,
+            ft_oof_strategy=self.ft_oof_strategy,
+            ft_oof_shuffle=self.ft_oof_shuffle,
+            save_preprocess=self.save_preprocess,
+            preprocess_artifact_path=self.preprocess_artifact_path,
+            plot_path_style=self.plot_path_style,
+            bo_sample_limit=self.bo_sample_limit,
+            cache_predictions=self.cache_predictions,
+            prediction_cache_dir=self.prediction_cache_dir,
+            prediction_cache_format=self.prediction_cache_format,
+        )
+
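The `BayesOptModel` constructor above reads most of its optional arguments from the run config, falling back to the defaults shown in the call. As a reference, here is a minimal config subset expressed as a Python dict: `target`, `weight`, and `feature_list` are accessed without defaults, `model_list`/`model_categories` are needed when no model-name override is given, and every value below is a placeholder, not a recommended setting:

```python
cfg = {
    # Required by the constructor call above
    "target": "claim_amount",
    "weight": "exposure",
    "feature_list": ["veh_age", "driver_age", "region"],
    # Required by _resolve_model_names when no model-name override is supplied
    "model_list": ["motor"],
    "model_categories": ["own_damage"],
    # Optional keys read via cfg.get(...) with the defaults shown in the call
    "task_type": "regression",
    "categorical_features": ["region"],
    "use_gpu": True,
    "final_ensemble": False,
}
```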
+        if self.plot_requested and not self.args.dry_run:
+            plot_cfg = self.cfg.get("plot", {})
+            legacy_flags = {
+                "glm": self.cfg.get("plot_lift_glm", False),
+                "xgb": self.cfg.get("plot_lift_xgb", False),
+                "resn": self.cfg.get("plot_lift_resn", False),
+                "ft": self.cfg.get("plot_lift_ft", False),
+            }
+            plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
+            if plot_enabled and plot_cfg.get("pre_oneway", False) and plot_cfg.get("oneway", True):
+                n_bins = int(plot_cfg.get("n_bins", 10))
+                model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/pre")
+
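The pre-training one-way plots are gated by a nested `plot` section of the config, with the legacy per-model `plot_lift_*` flags honoured as a fallback for `enable`. An illustrative `plot` block, using only the keys read above (values are placeholders):

```python
plot_cfg = {
    "enable": True,       # falls back to any(plot_lift_* legacy flags) when absent
    "oneway": True,       # one-way plots are on by default
    "pre_oneway": True,   # also plot before training, on the merged data
    "n_bins": 10,         # binning used for the one-way curves
}
```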
+        requested_keys = self._requested_model_keys(model.trainers)
+        executed_keys: List[str] = []
+        param_sources: Dict[str, str] = {}
+
+        if self.args.dry_run:
+            _log(f"Dry run: would train {requested_keys} for {model_name}.")
+            return {
+                "executed_keys": executed_keys,
+                "param_sources": param_sources,
+                "model": model,
+            }
+
+        if self.args.force_retune and self.args.max_evals <= 0:
+            raise ValueError("force_retune requires --max-evals > 0.")
+
+        force_retune = bool(self.args.force_retune)
+        if force_retune:
+            model.config.reuse_best_params = False
+            model.config.best_params_files = {}
+
+        ft_role = str(getattr(model.config, "ft_role", "model"))
+        if ft_role != "model" and "ft" in requested_keys:
+            requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]
+
+        for key in requested_keys:
+            trainer = model.trainers[key]
+
+            if force_retune:
+                trainer.best_params = None
+                trainer.best_trial = None
+                param_sources[key] = "retune"
+            else:
+                best_params = _load_best_params(model, trainer, silent=True)
+                if best_params:
+                    trainer.best_params = best_params
+                    trainer.best_trial = None
+                    param_sources[key] = "loaded"
+                else:
+                    if not self.args.retune_missing:
+                        _log(
+                            f"Skipping {model_name}/{key}: no best params and retuning disabled."
+                        )
+                        continue
+                    param_sources[key] = "retune"
+
+            if (trainer.best_params is None) and self.args.max_evals <= 0:
+                raise ValueError("--max-evals must be positive when retuning is requested.")
+
+            model.optimize_model(key, max_evals=self.args.max_evals)
+            trainer.save()
+            executed_keys.append(key)
+            if key in PYTORCH_TRAINERS:
+                ropt.free_cuda()
+
+            snapshot = {
+                "mode": "incremental_train",
+                "model_name": model_name,
+                "model_key": key,
+                "timestamp": datetime.now().isoformat(),
+                "param_source": param_sources[key],
+                "best_params": _to_serializable(trainer.best_params or {}),
+                "incremental_rows": new_rows,
+                "train_rows": len(model.train_data),
+                "test_rows": len(model.test_data),
+                "incremental_path": str(incremental_path) if incremental_path else None,
+                "config": asdict(model.config),
+            }
+            model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)
+
+        if not executed_keys:
+            _log(f"No trainers executed for {model_name}.")
+
+        return {
+            "executed_keys": executed_keys,
+            "param_sources": param_sources,
+            "model": model,
+        }
+
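Per trainer, the loop above decides between three outcomes: reuse previously saved best parameters, retune from scratch, or skip the trainer entirely. A compact restatement of that decision as a pure function (a hypothetical helper, mirroring the branches above, not part of the module):

```python
from typing import Optional

def param_source(force_retune: bool, saved_params: Optional[dict], retune_missing: bool) -> Optional[str]:
    """Return 'retune', 'loaded', or None (skip), mirroring the per-trainer branches."""
    if force_retune:
        return "retune"
    if saved_params:
        return "loaded"
    return "retune" if retune_missing else None
```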
+    def process(self) -> None:
+        total_trained = 0
+        for model_name in self.model_names:
+            total_trained += self._process_single_model(model_name)
+        if self.args.summary_json and self.summary_records:
+            summary_path = self.args.summary_json.resolve()
+            summary_path.parent.mkdir(parents=True, exist_ok=True)
+            summary_payload = _to_serializable(self.summary_records)
+            summary_path.write_text(json.dumps(summary_payload, indent=2, ensure_ascii=False), encoding="utf-8")
+            _log(f"Summary written to {summary_path}.")
+        _log(f"Finished incremental update for {total_trained} dataset(s).")
+
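When `args.summary_json` is set, `process()` writes `summary_records` as a JSON array, one record per dataset with the keys built in `_process_single_model`. Reading it back for monitoring is straightforward (the file name below is an example):

```python
import json
from pathlib import Path

records = json.loads(Path("summary.json").read_text(encoding="utf-8"))
for rec in records:
    print(rec["model_name"], rec["status"], rec.get("trained_models", []))
```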
+    def _process_single_model(self, model_name: str) -> int:
+        base_path = resolve_data_path(
+            self.data_dir,
+            model_name,
+            data_format=self.data_format,
+            path_template=self.data_path_template,
+        )
+        if not base_path.exists():
+            _log(f"Base dataset {base_path} not found; skipping {model_name}.")
+            self.summary_records.append({
+                "model_name": model_name,
+                "status": "missing_base",
+            })
+            return 0
+
+        base_df = load_dataset(
+            base_path,
+            data_format=self.data_format,
+            dtype_map=self.dtype_map,
+            low_memory=False,
+        )
+        inc_df, inc_path = self._load_incremental_df(model_name)
+        if inc_df is None and self.incremental_dir and self.args.strict_incremental and not self.args.train_without_incremental:
+            raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
+
+        new_rows = 0 if inc_df is None else len(inc_df)
+        _log(f"{model_name}: {len(base_df)} base rows, {new_rows} incremental rows.")
+        merged_df = self._merge_frames(base_df, inc_df)
+        merged_df.fillna(0, inplace=True)
+
+        if self.args.update_base_data and not self.args.dry_run:
+            self._write_dataset(merged_df, base_path, "update_base_data")
+        if self.args.persist_merged_dir and not self.args.dry_run:
+            suffix = base_path.suffix or ".csv"
+            dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}{suffix}"
+            self._write_dataset(merged_df, dest, "persist_merged_dir")
+
+        if not self._should_train(new_rows):
+            _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
+            self.summary_records.append({
+                "model_name": model_name,
+                "status": "skipped_no_incremental",
+                "new_rows": new_rows,
+                "total_rows": len(merged_df),
+            })
+            return 0
+
+        try:
+            train_result = self._train_single_model(model_name, merged_df, new_rows, inc_path)
+        except Exception as exc:
+            _log(f"Training failed for {model_name}: {exc}")
+            self.summary_records.append({
+                "model_name": model_name,
+                "status": "failed",
+                "error": str(exc),
+                "new_rows": new_rows,
+                "total_rows": len(merged_df),
+            })
+            return 0
+
+        executed = train_result["executed_keys"]
+        param_sources = train_result["param_sources"]
+        model = train_result["model"]
+        status = "dry_run" if self.args.dry_run else "trained"
+
+        summary = {
+            "model_name": model_name,
+            "status": status,
+            "trained_models": executed,
+            "param_sources": param_sources,
+            "new_rows": new_rows,
+            "total_rows": len(merged_df),
+            "incremental_path": str(inc_path) if inc_path else None,
+        }
+        self.summary_records.append(summary)
+
+        if not self.args.dry_run and self.plot_requested and executed:
+            _plot_curves_for_model(model, executed, self.cfg)
+
+        return 1 if executed else 0
+
+
+def main() -> None:
+    if configure_run_logging:
+        configure_run_logging(prefix="bayesopt_incremental")
+    args = _parse_args()
+    runner = IncrementalUpdateRunner(args)
+    runner.process()
+
+
+if __name__ == "__main__":
+    main()