ins-pricing 0.4.4-py3-none-any.whl → 0.4.5-py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
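In the portion of the hunk reproduced below, the module is rendered as a full deletion followed by a full re-addition, but the only line that actually differs between 0.4.4 and 0.4.5 is the docstring's example invocation: the `--config-json` path is shortened from `ins_pricing/examples/modelling/config_incremental_template.json` to `examples/config_incremental_template.json`. Every other line shown is identical on both sides.

For context while reading the harness: its core incremental step concatenates the base and incremental frames, optionally stable-sorts on a timestamp column, then drops duplicates on the merge keys (see `_merge_frames` in the diff). A minimal, self-contained sketch of that behavior, with hypothetical frame and column names:

    import pandas as pd

    base = pd.DataFrame({"policy_id": [1, 2], "premium": [100.0, 200.0]})
    incremental = pd.DataFrame({"policy_id": [2, 3], "premium": [210.0, 300.0]})

    # Mirrors _merge_frames with --merge-keys policy_id --dedupe-keep last:
    # when a key appears in both frames, the incremental row wins.
    merged = pd.concat([base, incremental], ignore_index=True, sort=False)
    merged = merged.drop_duplicates(subset=["policy_id"], keep="last")
    merged = merged.reset_index(drop=True)
    # policy_id 2 now carries premium 210.0 from the incremental batch.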
@@ -1,904 +1,904 @@
1
- """Incremental training harness built on top of ``ins_pricing.bayesopt``
2
- (compat via ``BayesOpt.py``).
3
-
4
- This utility lets you append new observations to an existing dataset,
5
- reuse previously tuned hyperparameters and retrain a subset of models
6
- without re-running the full Optuna search. It can operate on a directory
7
- of per-model incremental CSVs or a single incremental file when updating
8
- one dataset.
9
-
10
- Example:
11
- python ins_pricing/cli/BayesOpt_incremental.py \
12
- --config-json ins_pricing/examples/modelling/config_incremental_template.json \
13
- --incremental-dir ./incremental_batches \
14
- --merge-keys policy_id vehicle_id \
15
- --model-keys glm xgb resn --plot-curves
16
- """
17
-
18
- from __future__ import annotations
19
-
20
- from pathlib import Path
21
- import sys
22
-
23
- if __package__ in {None, ""}:
24
- repo_root = Path(__file__).resolve().parents[2]
25
- if str(repo_root) not in sys.path:
26
- sys.path.insert(0, str(repo_root))
27
-
28
- import argparse
29
- import json
30
- from dataclasses import asdict
31
- from datetime import datetime
32
- from typing import Any, Dict, List, Optional, Sequence, Tuple
33
-
34
- import pandas as pd
35
-
36
- try:
37
- from .. import bayesopt as ropt # type: ignore
38
- from .utils.cli_common import ( # type: ignore
39
- PLOT_MODEL_LABELS,
40
- PYTORCH_TRAINERS,
41
- build_model_names,
42
- dedupe_preserve_order,
43
- load_dataset,
44
- parse_model_pairs,
45
- resolve_data_path,
46
- resolve_path,
47
- split_train_test,
48
- )
49
- from .utils.cli_config import ( # type: ignore
50
- add_config_json_arg,
51
- resolve_and_load_config,
52
- resolve_data_config,
53
- resolve_split_config,
54
- resolve_runtime_config,
55
- resolve_output_dirs,
56
- )
57
- except Exception: # pragma: no cover
58
- try:
59
- import bayesopt as ropt # type: ignore
60
- from utils.cli_common import ( # type: ignore
61
- PLOT_MODEL_LABELS,
62
- PYTORCH_TRAINERS,
63
- build_model_names,
64
- dedupe_preserve_order,
65
- load_dataset,
66
- parse_model_pairs,
67
- resolve_data_path,
68
- resolve_path,
69
- split_train_test,
70
- )
71
- from utils.cli_config import ( # type: ignore
72
- add_config_json_arg,
73
- resolve_and_load_config,
74
- resolve_data_config,
75
- resolve_split_config,
76
- resolve_runtime_config,
77
- resolve_output_dirs,
78
- )
79
- except Exception:
80
- try:
81
- import ins_pricing.modelling.core.bayesopt as ropt # type: ignore
82
- from ins_pricing.cli.utils.cli_common import ( # type: ignore
83
- PLOT_MODEL_LABELS,
84
- PYTORCH_TRAINERS,
85
- build_model_names,
86
- dedupe_preserve_order,
87
- load_dataset,
88
- parse_model_pairs,
89
- resolve_data_path,
90
- resolve_path,
91
- split_train_test,
92
- )
93
- from ins_pricing.cli.utils.cli_config import ( # type: ignore
94
- add_config_json_arg,
95
- resolve_and_load_config,
96
- resolve_data_config,
97
- resolve_split_config,
98
- resolve_runtime_config,
99
- resolve_output_dirs,
100
- )
101
- except Exception:
102
- import BayesOpt as ropt # type: ignore
103
- from utils.cli_common import ( # type: ignore
104
- PLOT_MODEL_LABELS,
105
- PYTORCH_TRAINERS,
106
- build_model_names,
107
- dedupe_preserve_order,
108
- load_dataset,
109
- parse_model_pairs,
110
- resolve_data_path,
111
- resolve_path,
112
- split_train_test,
113
- )
114
- from utils.cli_config import ( # type: ignore
115
- add_config_json_arg,
116
- resolve_and_load_config,
117
- resolve_data_config,
118
- resolve_split_config,
119
- resolve_runtime_config,
120
- resolve_output_dirs,
121
- )
122
-
123
- try:
124
- from .utils.run_logging import configure_run_logging # type: ignore
125
- except Exception: # pragma: no cover
126
- try:
127
- from utils.run_logging import configure_run_logging # type: ignore
128
- except Exception: # pragma: no cover
129
- configure_run_logging = None # type: ignore
130
-
131
-
132
- def _log(message: str) -> None:
133
- print(f"[Incremental] {message}")
134
-
135
-
136
- def _parse_args() -> argparse.Namespace:
137
- parser = argparse.ArgumentParser(
138
- description="Incrementally retrain BayesOpt models using new batches of data."
139
- )
140
- add_config_json_arg(
141
- parser,
142
- help_text="Path to the JSON config that cli/BayesOpt_entry.py uses.",
143
- )
144
- parser.add_argument(
145
- "--model-names",
146
- nargs="+",
147
- default=None,
148
- help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
149
- )
150
- parser.add_argument(
151
- "--model-keys",
152
- nargs="+",
153
- default=["glm", "xgb", "resn", "ft"],
154
- choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
155
- help="Which trainers to run for each dataset."
156
- )
157
- parser.add_argument(
158
- "--incremental-dir",
159
- type=Path,
160
- default=None,
161
- help="Directory containing <model_name> incremental CSVs."
162
- )
163
- parser.add_argument(
164
- "--incremental-file",
165
- type=Path,
166
- default=None,
167
- help="Single incremental CSV (requires --model-names with exactly one entry)."
168
- )
169
- parser.add_argument(
170
- "--incremental-template",
171
- default="{model_name}_incremental.csv",
172
- help="Filename template when --incremental-dir is provided."
173
- )
174
- parser.add_argument(
175
- "--merge-keys",
176
- nargs="+",
177
- default=None,
178
- help="Column(s) used to drop duplicate rows after merging base and incremental data."
179
- )
180
- parser.add_argument(
181
- "--dedupe-keep",
182
- choices=["first", "last"],
183
- default="last",
184
- help="How pandas.drop_duplicates resolves conflicts on merge keys."
185
- )
186
- parser.add_argument(
187
- "--timestamp-col",
188
- default=None,
189
- help="Optional column used to sort rows before deduplication."
190
- )
191
- parser.add_argument(
192
- "--timestamp-descending",
193
- action="store_true",
194
- help="Sort timestamp column in descending order before deduplication."
195
- )
196
- parser.add_argument(
197
- "--min-new-rows",
198
- type=int,
199
- default=1,
200
- help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
201
- )
202
- parser.add_argument(
203
- "--train-without-incremental",
204
- action="store_true",
205
- help="Always retrain even when no incremental file is present."
206
- )
207
- parser.add_argument(
208
- "--strict-incremental",
209
- action="store_true",
210
- help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
211
- )
212
- parser.add_argument(
213
- "--tag-new-column",
214
- default=None,
215
- help="If set, store 1 for incremental rows and 0 for historical rows in this column."
216
- )
217
- parser.add_argument(
218
- "--max-evals",
219
- type=int,
220
- default=25,
221
- help="Optuna trial count when retuning is required."
222
- )
223
- parser.add_argument(
224
- "--retune-missing",
225
- dest="retune_missing",
226
- action="store_true",
227
- default=True,
228
- help="Retune models whose best-params CSV is unavailable (default)."
229
- )
230
- parser.add_argument(
231
- "--skip-retune-missing",
232
- dest="retune_missing",
233
- action="store_false",
234
- help="Do not retune when best params are missing; such models are skipped."
235
- )
236
- parser.add_argument(
237
- "--force-retune",
238
- action="store_true",
239
- help="Run Optuna tuning even if historical best params exist."
240
- )
241
- parser.add_argument(
242
- "--prop-test",
243
- type=float,
244
- default=None,
245
- help="Override the test split proportion defined in the config file."
246
- )
247
- parser.add_argument(
248
- "--rand-seed",
249
- type=int,
250
- default=None,
251
- help="Override the random seed defined in the config."
252
- )
253
- parser.add_argument(
254
- "--epochs",
255
- type=int,
256
- default=None,
257
- help="Override the epoch count from the config."
258
- )
259
- parser.add_argument(
260
- "--output-dir",
261
- type=Path,
262
- default=None,
263
- help="Override the BayesOpt output root (models/results/plots)."
264
- )
265
- parser.add_argument(
266
- "--update-base-data",
267
- action="store_true",
268
- help="Overwrite the base CSVs with the merged dataset after a successful update."
269
- )
270
- parser.add_argument(
271
- "--persist-merged-dir",
272
- type=Path,
273
- default=None,
274
- help="Optional directory to store the merged dataset snapshots."
275
- )
276
- parser.add_argument(
277
- "--summary-json",
278
- type=Path,
279
- default=None,
280
- help="Write a JSON summary of processed datasets to this path."
281
- )
282
- parser.add_argument(
283
- "--plot-curves",
284
- action="store_true",
285
- help="Run one-way/lift plots after training (config plot settings also apply)."
286
- )
287
- parser.add_argument(
288
- "--dry-run",
289
- action="store_true",
290
- help="Merge and report counts but skip training, saving and plotting."
291
- )
292
- args = parser.parse_args()
293
-
294
- if args.incremental_file and args.incremental_dir:
295
- parser.error("Use either --incremental-dir or --incremental-file, not both.")
296
- if args.incremental_file and args.model_names and len(args.model_names) != 1:
297
- parser.error("--incremental-file can only be used when updating exactly one model.")
298
- if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
299
- parser.error(
300
- "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
301
- )
302
- return args
303
-
304
-
305
- def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
306
- plot_cfg = cfg.get("plot", {})
307
- legacy_flags = {
308
- "glm": cfg.get("plot_lift_glm", False),
309
- "xgb": cfg.get("plot_lift_xgb", False),
310
- "resn": cfg.get("plot_lift_resn", False),
311
- "ft": cfg.get("plot_lift_ft", False),
312
- }
313
- plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
314
- if not plot_enabled:
315
- return
316
-
317
- n_bins = int(plot_cfg.get("n_bins", 10))
318
- oneway_enabled = plot_cfg.get("oneway", True)
319
- available = dedupe_preserve_order([k for k in trained if k in PLOT_MODEL_LABELS])
320
-
321
- lift_models = plot_cfg.get("lift_models")
322
- if lift_models is None:
323
- lift_models = [m for m, flag in legacy_flags.items() if flag]
324
- if not lift_models:
325
- lift_models = available
326
- lift_models = dedupe_preserve_order([m for m in lift_models if m in available])
327
-
328
- if oneway_enabled:
329
- oneway_pred = bool(plot_cfg.get("oneway_pred", False))
330
- oneway_pred_models = plot_cfg.get("oneway_pred_models")
331
- pred_plotted = False
332
- if oneway_pred:
333
- if oneway_pred_models is None:
334
- oneway_pred_models = lift_models or available
335
- oneway_pred_models = dedupe_preserve_order(
336
- [m for m in oneway_pred_models if m in available]
337
- )
338
- for model_key in oneway_pred_models:
339
- label, pred_nme = PLOT_MODEL_LABELS[model_key]
340
- if pred_nme not in model.train_data.columns:
341
- print(
342
- f"[Oneway] Missing prediction column '{pred_nme}'; skip.",
343
- flush=True,
344
- )
345
- continue
346
- model.plot_oneway(
347
- n_bins=n_bins,
348
- pred_col=pred_nme,
349
- pred_label=label,
350
- plot_subdir="oneway/post",
351
- )
352
- pred_plotted = True
353
- if not oneway_pred or not pred_plotted:
354
- model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/post")
355
- if not available:
356
- return
357
-
358
- for key in lift_models:
359
- label, pred_nme = PLOT_MODEL_LABELS[key]
360
- model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)
361
-
362
- if not plot_cfg.get("double_lift", True) or len(available) < 2:
363
- return
364
-
365
- raw_pairs = plot_cfg.get("double_lift_pairs")
366
- if raw_pairs:
367
- pairs = [
368
- (a, b)
369
- for a, b in parse_model_pairs(raw_pairs)
370
- if a in available and b in available and a != b
371
- ]
372
- else:
373
- pairs = [(a, b) for i, a in enumerate(available) for b in available[i + 1 :]]
374
- for first, second in pairs:
375
- model.plot_dlift([first, second], n_bins=n_bins)
376
-
377
-
378
- def _coerce_scalar(value: Any) -> Any:
379
- if isinstance(value, str):
380
- lowered = value.strip().lower()
381
- if lowered in {"", "none", "nan"}:
382
- return None
383
- if lowered in {"true", "false"}:
384
- return lowered == "true"
385
- return value
386
- if hasattr(value, "item"):
387
- try:
388
- return value.item()
389
- except Exception:
390
- return value
391
- return value
392
-
393
-
394
- def _infer_format_from_path(path: Path) -> str:
395
- suffix = path.suffix.lower()
396
- if suffix in {".parquet", ".pq"}:
397
- return "parquet"
398
- if suffix in {".feather", ".ft"}:
399
- return "feather"
400
- return "csv"
401
-
402
-
403
- def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
404
- label = trainer.label.lower()
405
- result_dir = Path(model.output_manager.result_dir)
406
- path = result_dir / f"{model.model_nme}_bestparams_{label}.csv"
407
- if not path.exists():
408
- if not silent:
409
- _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
410
- return None
411
- try:
412
- params_raw = ropt.IOUtils.load_params_file(str(path))
413
- except Exception:
414
- return None
415
- return {
416
- key: _coerce_scalar(val)
417
- for key, val in (params_raw or {}).items()
418
- if not pd.isna(val)
419
- }
420
-
421
-
422
- def _to_serializable(obj: Any) -> Any:
423
- if isinstance(obj, dict):
424
- return {k: _to_serializable(v) for k, v in obj.items()}
425
- if isinstance(obj, list):
426
- return [_to_serializable(v) for v in obj]
427
- if hasattr(obj, "item"):
428
- try:
429
- return obj.item()
430
- except Exception:
431
- return str(obj)
432
- return obj
433
-
434
-
435
- class IncrementalUpdateRunner:
436
- def __init__(self, args: argparse.Namespace) -> None:
437
- self.args = args
438
- script_dir = Path(__file__).resolve().parents[1]
439
- self.config_path, self.cfg = resolve_and_load_config(
440
- args.config_json,
441
- script_dir,
442
- required_keys=[
443
- "data_dir",
444
- "model_list",
445
- "model_categories",
446
- "target",
447
- "weight",
448
- "feature_list",
449
- "categorical_features",
450
- ],
451
- )
452
- data_dir, data_format, data_path_template, dtype_map = resolve_data_config(
453
- self.cfg,
454
- self.config_path,
455
- create_data_dir=True,
456
- )
457
- self.data_dir = data_dir
458
- self.data_format = data_format
459
- self.data_path_template = data_path_template
460
- self.dtype_map = dtype_map
461
- split_cfg = resolve_split_config(self.cfg)
462
- runtime_cfg = resolve_runtime_config(self.cfg)
463
- output_cfg = resolve_output_dirs(
464
- self.cfg,
465
- self.config_path,
466
- output_override=args.output_dir,
467
- )
468
- self.runtime_cfg = runtime_cfg
469
- self.prop_test = args.prop_test if args.prop_test is not None else split_cfg["prop_test"]
470
- self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
471
- self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
472
- self.split_strategy = split_cfg["split_strategy"]
473
- self.split_group_col = split_cfg["split_group_col"]
474
- self.split_time_col = split_cfg["split_time_col"]
475
- self.split_time_ascending = split_cfg["split_time_ascending"]
476
- self.cv_strategy = split_cfg["cv_strategy"]
477
- self.cv_group_col = split_cfg["cv_group_col"]
478
- self.cv_time_col = split_cfg["cv_time_col"]
479
- self.cv_time_ascending = split_cfg["cv_time_ascending"]
480
- self.cv_splits = split_cfg["cv_splits"]
481
- self.ft_oof_folds = split_cfg["ft_oof_folds"]
482
- self.ft_oof_strategy = split_cfg["ft_oof_strategy"]
483
- self.ft_oof_shuffle = split_cfg["ft_oof_shuffle"]
484
- self.save_preprocess = runtime_cfg["save_preprocess"]
485
- self.preprocess_artifact_path = runtime_cfg["preprocess_artifact_path"]
486
- self.bo_sample_limit = runtime_cfg["bo_sample_limit"]
487
- self.cache_predictions = runtime_cfg["cache_predictions"]
488
- self.prediction_cache_dir = runtime_cfg["prediction_cache_dir"]
489
- self.prediction_cache_format = runtime_cfg["prediction_cache_format"]
490
- self.plot_path_style = runtime_cfg["plot_path_style"]
491
- self.xgb_max_depth_max = runtime_cfg["xgb_max_depth_max"]
492
- self.xgb_n_estimators_max = runtime_cfg["xgb_n_estimators_max"]
493
- self.optuna_storage = runtime_cfg["optuna_storage"]
494
- self.optuna_study_prefix = runtime_cfg["optuna_study_prefix"]
495
- self.best_params_files = runtime_cfg["best_params_files"]
496
- self.reuse_best_params = runtime_cfg["reuse_best_params"]
497
- self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
498
- self.model_names = self._resolve_model_names(args.model_names)
499
- self.merge_keys = list(args.merge_keys or [])
500
- self.timestamp_col = args.timestamp_col
501
- self.timestamp_ascending = not args.timestamp_descending
502
- self.output_root = output_cfg["output_dir"]
503
-
504
- self.incremental_dir = None
505
- if args.incremental_dir is not None:
506
- self.incremental_dir = args.incremental_dir
507
- if not self.incremental_dir.is_absolute():
508
- self.incremental_dir = (self.config_path.parent / self.incremental_dir).resolve()
509
- else:
510
- self.incremental_dir = self.incremental_dir.resolve()
511
- self.incremental_file = None
512
- if args.incremental_file is not None:
513
- self.incremental_file = args.incremental_file
514
- if not self.incremental_file.is_absolute():
515
- self.incremental_file = (self.config_path.parent / self.incremental_file).resolve()
516
- else:
517
- self.incremental_file = self.incremental_file.resolve()
518
- self.summary_records: List[Dict[str, Any]] = []
519
- self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")
520
-
521
- if self.incremental_file and len(self.model_names) != 1:
522
- raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")
523
-
524
- def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
525
- if override:
526
- return dedupe_preserve_order([str(item) for item in override])
527
- prefixes = self.cfg["model_list"]
528
- suffixes = self.cfg["model_categories"]
529
- return build_model_names(prefixes, suffixes)
530
-
531
- def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
532
- path: Optional[Path] = None
533
- if self.incremental_file:
534
- path = self.incremental_file
535
- elif self.incremental_dir:
536
- rel = self.args.incremental_template.format(model_name=model_name)
537
- path = (self.incremental_dir / rel).resolve()
538
- if not path or not path.exists():
539
- return None, None
540
- try:
541
- df = load_dataset(
542
- path,
543
- data_format="auto",
544
- dtype_map=self.dtype_map,
545
- low_memory=False,
546
- )
547
- except pd.errors.EmptyDataError:
548
- _log(f"Incremental file {path} is empty; treating as no-op.")
549
- return None, path
550
- except Exception as exc:
551
- _log(f"Failed to load incremental file {path}: {exc}")
552
- return None, path
553
- return df, path
554
-
555
- def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
556
- if inc_df is None or inc_df.empty:
557
- merged = base_df.copy(deep=True)
558
- return merged.reset_index(drop=True)
559
- frames = []
560
- tag = self.args.tag_new_column
561
- if tag:
562
- base_part = base_df.copy(deep=True)
563
- base_part[tag] = 0
564
- inc_part = inc_df.copy(deep=True)
565
- inc_part[tag] = 1
566
- frames = [base_part, inc_part]
567
- else:
568
- frames = [base_df, inc_df]
569
- merged = pd.concat(frames, ignore_index=True, sort=False)
570
- if self.timestamp_col and self.timestamp_col in merged.columns:
571
- merged = merged.sort_values(
572
- self.timestamp_col,
573
- ascending=self.timestamp_ascending,
574
- kind="mergesort",
575
- )
576
- if self.merge_keys:
577
- missing = [col for col in self.merge_keys if col not in merged.columns]
578
- if missing:
579
- raise KeyError(f"Merge keys {missing} not found in merged frame for {self.merge_keys}.")
580
- merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
581
- return merged.reset_index(drop=True)
582
-
583
- def _should_train(self, new_rows: int) -> bool:
584
- if self.args.train_without_incremental:
585
- return True
586
- min_needed = max(0, self.args.min_new_rows)
587
- return new_rows >= min_needed
588
-
589
- def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
590
- dest.parent.mkdir(parents=True, exist_ok=True)
591
- fmt = str(self.data_format or "csv").lower()
592
- if fmt == "auto":
593
- fmt = _infer_format_from_path(dest)
594
- if fmt == "parquet":
595
- df.to_parquet(dest, index=False)
596
- elif fmt == "feather":
597
- df.reset_index(drop=True).to_feather(dest)
598
- else:
599
- df.to_csv(dest, index=False)
600
- _log(f"Wrote {len(df)} rows to {dest} ({reason}).")
601
-
602
- def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
603
- if not 0 < self.prop_test < 1:
604
- raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
605
- if len(merged) < 2:
606
- raise ValueError("Need at least two rows to form a train/test split.")
607
- train_df, test_df = split_train_test(
608
- merged,
609
- holdout_ratio=self.prop_test,
610
- strategy=self.split_strategy,
611
- group_col=self.split_group_col,
612
- time_col=self.split_time_col,
613
- time_ascending=self.split_time_ascending,
614
- rand_seed=self.rand_seed,
615
- reset_index_mode="always",
616
- ratio_label="prop_test",
617
- validate_ratio=False,
618
- )
619
- return train_df, test_df
620
-
621
- def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
622
- requested = self.args.model_keys
623
- if "all" in requested:
624
- requested = ["glm", "xgb", "resn", "ft", "gnn"]
625
- requested = dedupe_preserve_order(requested)
626
- missing = [key for key in requested if key not in trainer_map]
627
- for key in missing:
628
- _log(f"Trainer '{key}' is not available for this context and will be skipped.")
629
- return [key for key in requested if key in trainer_map]
630
-
631
- def _train_single_model(
632
- self,
633
- model_name: str,
634
- merged: pd.DataFrame,
635
- new_rows: int,
636
- incremental_path: Optional[Path],
637
- ) -> Dict[str, Any]:
638
- merged = merged.copy(deep=True)
639
- merged.fillna(0, inplace=True)
640
- train_df, test_df = self._prepare_splits(merged)
641
- model = ropt.BayesOptModel(
642
- train_df,
643
- test_df,
644
- model_name,
645
- self.cfg["target"],
646
- self.cfg["weight"],
647
- self.cfg["feature_list"],
648
- task_type=self.cfg.get("task_type", "regression"),
649
- binary_resp_nme=self.binary_resp,
650
- cate_list=self.cfg.get("categorical_features"),
651
- prop_test=self.prop_test,
652
- rand_seed=self.rand_seed,
653
- epochs=self.epochs,
654
- use_gpu=bool(self.cfg.get("use_gpu", True)),
655
- use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
656
- use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
657
- use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
658
- use_resn_ddp=self.cfg.get("use_resn_ddp", False),
659
- use_ft_ddp=self.cfg.get("use_ft_ddp", False),
660
- use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
661
- output_dir=str(self.output_root) if self.output_root else None,
662
- xgb_max_depth_max=self.xgb_max_depth_max,
663
- xgb_n_estimators_max=self.xgb_n_estimators_max,
664
- resn_weight_decay=self.cfg.get("resn_weight_decay"),
665
- final_ensemble=bool(self.cfg.get("final_ensemble", False)),
666
- final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
667
- final_refit=bool(self.cfg.get("final_refit", True)),
668
- optuna_storage=self.optuna_storage,
669
- optuna_study_prefix=self.optuna_study_prefix,
670
- best_params_files=self.best_params_files,
671
- reuse_best_params=self.reuse_best_params,
672
- gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
673
- gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
674
- gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
675
- gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
676
- gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
677
- gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
678
- region_province_col=self.cfg.get("region_province_col"),
679
- region_city_col=self.cfg.get("region_city_col"),
680
- region_effect_alpha=self.cfg.get("region_effect_alpha"),
681
- geo_feature_nmes=self.cfg.get("geo_feature_nmes"),
682
- geo_token_hidden_dim=self.cfg.get("geo_token_hidden_dim"),
683
- geo_token_layers=self.cfg.get("geo_token_layers"),
684
- geo_token_dropout=self.cfg.get("geo_token_dropout"),
685
- geo_token_k_neighbors=self.cfg.get("geo_token_k_neighbors"),
686
- geo_token_learning_rate=self.cfg.get("geo_token_learning_rate"),
687
- geo_token_epochs=self.cfg.get("geo_token_epochs"),
688
- ft_role=str(self.cfg.get("ft_role", "model")),
689
- ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
690
- ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
691
- infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
692
- infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
693
- cv_strategy=self.cv_strategy or self.split_strategy,
694
- cv_group_col=self.cv_group_col or self.split_group_col,
695
- cv_time_col=self.cv_time_col or self.split_time_col,
696
- cv_time_ascending=self.cv_time_ascending,
697
- cv_splits=self.cv_splits,
698
- ft_oof_folds=self.ft_oof_folds,
699
- ft_oof_strategy=self.ft_oof_strategy,
700
- ft_oof_shuffle=self.ft_oof_shuffle,
701
- save_preprocess=self.save_preprocess,
702
- preprocess_artifact_path=self.preprocess_artifact_path,
703
- plot_path_style=self.plot_path_style,
704
- bo_sample_limit=self.bo_sample_limit,
705
- cache_predictions=self.cache_predictions,
706
- prediction_cache_dir=self.prediction_cache_dir,
707
- prediction_cache_format=self.prediction_cache_format,
708
- )
709
-
710
- if self.plot_requested and not self.args.dry_run:
711
- plot_cfg = self.cfg.get("plot", {})
712
- legacy_flags = {
713
- "glm": self.cfg.get("plot_lift_glm", False),
714
- "xgb": self.cfg.get("plot_lift_xgb", False),
715
- "resn": self.cfg.get("plot_lift_resn", False),
716
- "ft": self.cfg.get("plot_lift_ft", False),
717
- }
718
- plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
719
- if plot_enabled and plot_cfg.get("pre_oneway", False) and plot_cfg.get("oneway", True):
720
- n_bins = int(plot_cfg.get("n_bins", 10))
721
- model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/pre")
722
-
723
- requested_keys = self._requested_model_keys(model.trainers)
724
- executed_keys: List[str] = []
725
- param_sources: Dict[str, str] = {}
726
-
727
- if self.args.dry_run:
728
- _log(f"Dry run: would train {requested_keys} for {model_name}.")
729
- return {
730
- "executed_keys": executed_keys,
731
- "param_sources": param_sources,
732
- "model": model,
733
- }
734
-
735
- if self.args.force_retune and self.args.max_evals <= 0:
736
- raise ValueError("force_retune requires --max-evals > 0.")
737
-
738
- force_retune = bool(self.args.force_retune)
739
- if force_retune:
740
- model.config.reuse_best_params = False
741
- model.config.best_params_files = {}
742
-
743
- ft_role = str(getattr(model.config, "ft_role", "model"))
744
- if ft_role != "model" and "ft" in requested_keys:
745
- requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]
746
-
747
- for key in requested_keys:
748
- trainer = model.trainers[key]
749
-
750
- if force_retune:
751
- trainer.best_params = None
752
- trainer.best_trial = None
753
- param_sources[key] = "retune"
754
- else:
755
- best_params = _load_best_params(model, trainer, silent=True)
756
- if best_params:
757
- trainer.best_params = best_params
758
- trainer.best_trial = None
759
- param_sources[key] = "loaded"
760
- else:
761
- if not self.args.retune_missing:
762
- _log(
763
- f"Skipping {model_name}/{key}: no best params and retuning disabled."
764
- )
765
- continue
766
- param_sources[key] = "retune"
767
-
768
- if (trainer.best_params is None) and self.args.max_evals <= 0:
769
- raise ValueError("--max-evals must be positive when retuning is requested.")
770
-
771
- model.optimize_model(key, max_evals=self.args.max_evals)
772
- trainer.save()
773
- executed_keys.append(key)
774
- if key in PYTORCH_TRAINERS:
775
- ropt.free_cuda()
776
-
777
- snapshot = {
778
- "mode": "incremental_train",
779
- "model_name": model_name,
780
- "model_key": key,
781
- "timestamp": datetime.now().isoformat(),
782
- "param_source": param_sources[key],
783
- "best_params": _to_serializable(trainer.best_params or {}),
784
- "incremental_rows": new_rows,
785
- "train_rows": len(model.train_data),
786
- "test_rows": len(model.test_data),
787
- "incremental_path": str(incremental_path) if incremental_path else None,
788
- "config": asdict(model.config),
789
- }
790
- model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)
791
-
792
- if not executed_keys:
793
- _log(f"No trainers executed for {model_name}.")
794
-
795
- return {
796
- "executed_keys": executed_keys,
797
- "param_sources": param_sources,
798
- "model": model,
799
- }
800
-
801
- def process(self) -> None:
802
- total_trained = 0
803
- for model_name in self.model_names:
804
- total_trained += self._process_single_model(model_name)
805
- if self.args.summary_json and self.summary_records:
806
- summary_path = self.args.summary_json.resolve()
807
- summary_path.parent.mkdir(parents=True, exist_ok=True)
808
- summary_payload = _to_serializable(self.summary_records)
809
- summary_path.write_text(json.dumps(summary_payload, indent=2, ensure_ascii=False), encoding="utf-8")
810
- _log(f"Summary written to {summary_path}.")
811
- _log(f"Finished incremental update for {total_trained} dataset(s).")
812
-
813
- def _process_single_model(self, model_name: str) -> int:
814
- base_path = resolve_data_path(
815
- self.data_dir,
816
- model_name,
817
- data_format=self.data_format,
818
- path_template=self.data_path_template,
819
- )
820
- if not base_path.exists():
821
- _log(f"Base dataset {base_path} not found; skipping {model_name}.")
822
- self.summary_records.append({
823
- "model_name": model_name,
824
- "status": "missing_base",
825
- })
826
- return 0
827
-
828
- base_df = load_dataset(
829
- base_path,
830
- data_format=self.data_format,
831
- dtype_map=self.dtype_map,
832
- low_memory=False,
833
- )
834
- inc_df, inc_path = self._load_incremental_df(model_name)
835
- if inc_df is None and self.incremental_dir and self.args.strict_incremental and not self.args.train_without_incremental:
836
- raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
837
-
838
- new_rows = 0 if inc_df is None else len(inc_df)
839
- _log(f"{model_name}: {len(base_df)} base rows, {new_rows} incremental rows.")
840
- merged_df = self._merge_frames(base_df, inc_df)
841
- merged_df.fillna(0, inplace=True)
842
-
843
- if self.args.update_base_data and not self.args.dry_run:
844
- self._write_dataset(merged_df, base_path, "update_base_data")
845
- if self.args.persist_merged_dir and not self.args.dry_run:
846
- suffix = base_path.suffix or ".csv"
847
- dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}{suffix}"
848
- self._write_dataset(merged_df, dest, "persist_merged_dir")
849
-
850
- if not self._should_train(new_rows):
851
- _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
852
- self.summary_records.append({
853
- "model_name": model_name,
854
- "status": "skipped_no_incremental",
855
- "new_rows": new_rows,
856
- "total_rows": len(merged_df),
857
- })
858
- return 0
859
-
860
- try:
861
- train_result = self._train_single_model(model_name, merged_df, new_rows, inc_path)
862
- except Exception as exc:
863
- _log(f"Training failed for {model_name}: {exc}")
864
- self.summary_records.append({
865
- "model_name": model_name,
866
- "status": "failed",
867
- "error": str(exc),
868
- "new_rows": new_rows,
869
- "total_rows": len(merged_df),
870
- })
871
- return 0
872
-
873
- executed = train_result["executed_keys"]
874
- param_sources = train_result["param_sources"]
875
- model = train_result["model"]
876
- status = "dry_run" if self.args.dry_run else "trained"
877
-
878
- summary = {
879
- "model_name": model_name,
880
- "status": status,
881
- "trained_models": executed,
882
- "param_sources": param_sources,
883
- "new_rows": new_rows,
884
- "total_rows": len(merged_df),
885
- "incremental_path": str(inc_path) if inc_path else None,
886
- }
887
- self.summary_records.append(summary)
888
-
889
- if not self.args.dry_run and self.plot_requested and executed:
890
- _plot_curves_for_model(model, executed, self.cfg)
891
-
892
- return 1 if executed else 0
893
-
894
-
895
- def main() -> None:
896
- if configure_run_logging:
897
- configure_run_logging(prefix="bayesopt_incremental")
898
- args = _parse_args()
899
- runner = IncrementalUpdateRunner(args)
900
- runner.process()
901
-
902
-
903
- if __name__ == "__main__":
904
- main()
1
+ """Incremental training harness built on top of ``ins_pricing.bayesopt``
2
+ (compat via ``BayesOpt.py``).
3
+
4
+ This utility lets you append new observations to an existing dataset,
5
+ reuse previously tuned hyperparameters and retrain a subset of models
6
+ without re-running the full Optuna search. It can operate on a directory
7
+ of per-model incremental CSVs or a single incremental file when updating
8
+ one dataset.
9
+
10
+ Example:
11
+ python ins_pricing/cli/BayesOpt_incremental.py \
12
+ --config-json examples/config_incremental_template.json \
13
+ --incremental-dir ./incremental_batches \
14
+ --merge-keys policy_id vehicle_id \
15
+ --model-keys glm xgb resn --plot-curves
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from pathlib import Path
21
+ import sys
22
+
23
+ if __package__ in {None, ""}:
24
+ repo_root = Path(__file__).resolve().parents[2]
25
+ if str(repo_root) not in sys.path:
26
+ sys.path.insert(0, str(repo_root))
27
+
28
+ import argparse
29
+ import json
30
+ from dataclasses import asdict
31
+ from datetime import datetime
32
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
33
+
34
+ import pandas as pd
35
+
36
+ try:
37
+ from .. import bayesopt as ropt # type: ignore
38
+ from .utils.cli_common import ( # type: ignore
39
+ PLOT_MODEL_LABELS,
40
+ PYTORCH_TRAINERS,
41
+ build_model_names,
42
+ dedupe_preserve_order,
43
+ load_dataset,
44
+ parse_model_pairs,
45
+ resolve_data_path,
46
+ resolve_path,
47
+ split_train_test,
48
+ )
49
+ from .utils.cli_config import ( # type: ignore
50
+ add_config_json_arg,
51
+ resolve_and_load_config,
52
+ resolve_data_config,
53
+ resolve_split_config,
54
+ resolve_runtime_config,
55
+ resolve_output_dirs,
56
+ )
57
+ except Exception: # pragma: no cover
58
+ try:
59
+ import bayesopt as ropt # type: ignore
60
+ from utils.cli_common import ( # type: ignore
61
+ PLOT_MODEL_LABELS,
62
+ PYTORCH_TRAINERS,
63
+ build_model_names,
64
+ dedupe_preserve_order,
65
+ load_dataset,
66
+ parse_model_pairs,
67
+ resolve_data_path,
68
+ resolve_path,
69
+ split_train_test,
70
+ )
71
+ from utils.cli_config import ( # type: ignore
72
+ add_config_json_arg,
73
+ resolve_and_load_config,
74
+ resolve_data_config,
75
+ resolve_split_config,
76
+ resolve_runtime_config,
77
+ resolve_output_dirs,
78
+ )
79
+ except Exception:
80
+ try:
81
+ import ins_pricing.modelling.core.bayesopt as ropt # type: ignore
82
+ from ins_pricing.cli.utils.cli_common import ( # type: ignore
83
+ PLOT_MODEL_LABELS,
84
+ PYTORCH_TRAINERS,
85
+ build_model_names,
86
+ dedupe_preserve_order,
87
+ load_dataset,
88
+ parse_model_pairs,
89
+ resolve_data_path,
90
+ resolve_path,
91
+ split_train_test,
92
+ )
93
+ from ins_pricing.cli.utils.cli_config import ( # type: ignore
94
+ add_config_json_arg,
95
+ resolve_and_load_config,
96
+ resolve_data_config,
97
+ resolve_split_config,
98
+ resolve_runtime_config,
99
+ resolve_output_dirs,
100
+ )
101
+ except Exception:
102
+ import BayesOpt as ropt # type: ignore
103
+ from utils.cli_common import ( # type: ignore
104
+ PLOT_MODEL_LABELS,
105
+ PYTORCH_TRAINERS,
106
+ build_model_names,
107
+ dedupe_preserve_order,
108
+ load_dataset,
109
+ parse_model_pairs,
110
+ resolve_data_path,
111
+ resolve_path,
112
+ split_train_test,
113
+ )
114
+ from utils.cli_config import ( # type: ignore
115
+ add_config_json_arg,
116
+ resolve_and_load_config,
117
+ resolve_data_config,
118
+ resolve_split_config,
119
+ resolve_runtime_config,
120
+ resolve_output_dirs,
121
+ )
122
+
123
+ try:
124
+ from .utils.run_logging import configure_run_logging # type: ignore
125
+ except Exception: # pragma: no cover
126
+ try:
127
+ from utils.run_logging import configure_run_logging # type: ignore
128
+ except Exception: # pragma: no cover
129
+ configure_run_logging = None # type: ignore
130
+
131
+
132
+ def _log(message: str) -> None:
133
+ print(f"[Incremental] {message}")
134
+
135
+
136
+ def _parse_args() -> argparse.Namespace:
137
+ parser = argparse.ArgumentParser(
138
+ description="Incrementally retrain BayesOpt models using new batches of data."
139
+ )
140
+ add_config_json_arg(
141
+ parser,
142
+ help_text="Path to the JSON config that cli/BayesOpt_entry.py uses.",
143
+ )
144
+ parser.add_argument(
145
+ "--model-names",
146
+ nargs="+",
147
+ default=None,
148
+ help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
149
+ )
150
+ parser.add_argument(
151
+ "--model-keys",
152
+ nargs="+",
153
+ default=["glm", "xgb", "resn", "ft"],
154
+ choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
155
+ help="Which trainers to run for each dataset."
156
+ )
157
+ parser.add_argument(
158
+ "--incremental-dir",
159
+ type=Path,
160
+ default=None,
161
+ help="Directory containing <model_name> incremental CSVs."
162
+ )
163
+ parser.add_argument(
164
+ "--incremental-file",
165
+ type=Path,
166
+ default=None,
167
+ help="Single incremental CSV (requires --model-names with exactly one entry)."
168
+ )
169
+ parser.add_argument(
170
+ "--incremental-template",
171
+ default="{model_name}_incremental.csv",
172
+ help="Filename template when --incremental-dir is provided."
173
+ )
174
+ parser.add_argument(
175
+ "--merge-keys",
176
+ nargs="+",
177
+ default=None,
178
+ help="Column(s) used to drop duplicate rows after merging base and incremental data."
179
+ )
180
+ parser.add_argument(
181
+ "--dedupe-keep",
182
+ choices=["first", "last"],
183
+ default="last",
184
+ help="How pandas.drop_duplicates resolves conflicts on merge keys."
185
+ )
186
+ parser.add_argument(
187
+ "--timestamp-col",
188
+ default=None,
189
+ help="Optional column used to sort rows before deduplication."
190
+ )
191
+ parser.add_argument(
192
+ "--timestamp-descending",
193
+ action="store_true",
194
+ help="Sort timestamp column in descending order before deduplication."
195
+ )
196
+ parser.add_argument(
197
+ "--min-new-rows",
198
+ type=int,
199
+ default=1,
200
+ help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
201
+ )
202
+ parser.add_argument(
203
+ "--train-without-incremental",
204
+ action="store_true",
205
+ help="Always retrain even when no incremental file is present."
206
+ )
207
+ parser.add_argument(
208
+ "--strict-incremental",
209
+ action="store_true",
210
+ help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
211
+ )
212
+ parser.add_argument(
213
+ "--tag-new-column",
214
+ default=None,
215
+ help="If set, store 1 for incremental rows and 0 for historical rows in this column."
216
+ )
217
+ parser.add_argument(
218
+ "--max-evals",
219
+ type=int,
220
+ default=25,
221
+ help="Optuna trial count when retuning is required."
222
+ )
223
+ parser.add_argument(
224
+ "--retune-missing",
225
+ dest="retune_missing",
226
+ action="store_true",
227
+ default=True,
228
+ help="Retune models whose best-params CSV is unavailable (default)."
229
+ )
230
+ parser.add_argument(
231
+ "--skip-retune-missing",
232
+ dest="retune_missing",
233
+ action="store_false",
234
+ help="Do not retune when best params are missing; such models are skipped."
235
+ )
236
+ parser.add_argument(
237
+ "--force-retune",
238
+ action="store_true",
239
+ help="Run Optuna tuning even if historical best params exist."
240
+ )
241
+ parser.add_argument(
242
+ "--prop-test",
243
+ type=float,
244
+ default=None,
245
+ help="Override the test split proportion defined in the config file."
246
+ )
247
+ parser.add_argument(
248
+ "--rand-seed",
249
+ type=int,
250
+ default=None,
251
+ help="Override the random seed defined in the config."
252
+ )
253
+ parser.add_argument(
254
+ "--epochs",
255
+ type=int,
256
+ default=None,
257
+ help="Override the epoch count from the config."
258
+ )
259
+ parser.add_argument(
260
+ "--output-dir",
261
+ type=Path,
262
+ default=None,
263
+ help="Override the BayesOpt output root (models/results/plots)."
264
+ )
265
+ parser.add_argument(
266
+ "--update-base-data",
267
+ action="store_true",
268
+ help="Overwrite the base CSVs with the merged dataset after a successful update."
269
+ )
270
+ parser.add_argument(
271
+ "--persist-merged-dir",
272
+ type=Path,
273
+ default=None,
274
+ help="Optional directory to store the merged dataset snapshots."
275
+ )
276
+ parser.add_argument(
277
+ "--summary-json",
278
+ type=Path,
279
+ default=None,
280
+ help="Write a JSON summary of processed datasets to this path."
281
+ )
282
+ parser.add_argument(
283
+ "--plot-curves",
284
+ action="store_true",
285
+ help="Run one-way/lift plots after training (config plot settings also apply)."
286
+ )
287
+ parser.add_argument(
288
+ "--dry-run",
289
+ action="store_true",
290
+ help="Merge and report counts but skip training, saving and plotting."
291
+ )
292
+ args = parser.parse_args()
293
+
294
+ if args.incremental_file and args.incremental_dir:
295
+ parser.error("Use either --incremental-dir or --incremental-file, not both.")
296
+ if args.incremental_file and args.model_names and len(args.model_names) != 1:
297
+ parser.error("--incremental-file can only be used when updating exactly one model.")
298
+ if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
299
+ parser.error(
300
+ "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
301
+ )
302
+ return args
303
+
304
+
305
+ def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
306
+ plot_cfg = cfg.get("plot", {})
307
+ legacy_flags = {
308
+ "glm": cfg.get("plot_lift_glm", False),
309
+ "xgb": cfg.get("plot_lift_xgb", False),
310
+ "resn": cfg.get("plot_lift_resn", False),
311
+ "ft": cfg.get("plot_lift_ft", False),
312
+ }
313
+ plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
314
+ if not plot_enabled:
315
+ return
316
+
317
+ n_bins = int(plot_cfg.get("n_bins", 10))
318
+ oneway_enabled = plot_cfg.get("oneway", True)
319
+ available = dedupe_preserve_order([k for k in trained if k in PLOT_MODEL_LABELS])
320
+
321
+ lift_models = plot_cfg.get("lift_models")
322
+ if lift_models is None:
323
+ lift_models = [m for m, flag in legacy_flags.items() if flag]
324
+ if not lift_models:
325
+ lift_models = available
326
+ lift_models = dedupe_preserve_order([m for m in lift_models if m in available])
327
+
328
+ if oneway_enabled:
329
+ oneway_pred = bool(plot_cfg.get("oneway_pred", False))
330
+ oneway_pred_models = plot_cfg.get("oneway_pred_models")
331
+ pred_plotted = False
332
+ if oneway_pred:
333
+ if oneway_pred_models is None:
334
+ oneway_pred_models = lift_models or available
335
+ oneway_pred_models = dedupe_preserve_order(
336
+ [m for m in oneway_pred_models if m in available]
337
+ )
338
+ for model_key in oneway_pred_models:
339
+ label, pred_nme = PLOT_MODEL_LABELS[model_key]
340
+ if pred_nme not in model.train_data.columns:
341
+ print(
342
+ f"[Oneway] Missing prediction column '{pred_nme}'; skip.",
343
+ flush=True,
344
+ )
345
+ continue
346
+ model.plot_oneway(
347
+ n_bins=n_bins,
348
+ pred_col=pred_nme,
349
+ pred_label=label,
350
+ plot_subdir="oneway/post",
351
+ )
352
+ pred_plotted = True
353
+ if not oneway_pred or not pred_plotted:
354
+ model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/post")
355
+ if not available:
356
+ return
357
+
358
+ for key in lift_models:
359
+ label, pred_nme = PLOT_MODEL_LABELS[key]
360
+ model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)
361
+
362
+ if not plot_cfg.get("double_lift", True) or len(available) < 2:
363
+ return
364
+
365
+ raw_pairs = plot_cfg.get("double_lift_pairs")
366
+ if raw_pairs:
367
+ pairs = [
368
+ (a, b)
369
+ for a, b in parse_model_pairs(raw_pairs)
370
+ if a in available and b in available and a != b
371
+ ]
372
+ else:
373
+ pairs = [(a, b) for i, a in enumerate(available) for b in available[i + 1 :]]
374
+ for first, second in pairs:
375
+ model.plot_dlift([first, second], n_bins=n_bins)
376
+
377
+
378
+ def _coerce_scalar(value: Any) -> Any:
379
+ if isinstance(value, str):
380
+ lowered = value.strip().lower()
381
+ if lowered in {"", "none", "nan"}:
382
+ return None
383
+ if lowered in {"true", "false"}:
384
+ return lowered == "true"
385
+ return value
386
+ if hasattr(value, "item"):
387
+ try:
388
+ return value.item()
389
+ except Exception:
390
+ return value
391
+ return value
392
+
393
+
394
+ def _infer_format_from_path(path: Path) -> str:
395
+ suffix = path.suffix.lower()
396
+ if suffix in {".parquet", ".pq"}:
397
+ return "parquet"
398
+ if suffix in {".feather", ".ft"}:
399
+ return "feather"
400
+ return "csv"
401
+
402
+
403
+ def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
404
+ label = trainer.label.lower()
405
+ result_dir = Path(model.output_manager.result_dir)
406
+ path = result_dir / f"{model.model_nme}_bestparams_{label}.csv"
407
+ if not path.exists():
408
+ if not silent:
409
+ _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
410
+ return None
411
+ try:
412
+ params_raw = ropt.IOUtils.load_params_file(str(path))
413
+ except Exception:
414
+ return None
415
+ return {
416
+ key: _coerce_scalar(val)
417
+ for key, val in (params_raw or {}).items()
418
+ if not pd.isna(val)
419
+ }
420
+
421
+
422
+ def _to_serializable(obj: Any) -> Any:
423
+ if isinstance(obj, dict):
424
+ return {k: _to_serializable(v) for k, v in obj.items()}
425
+ if isinstance(obj, list):
426
+ return [_to_serializable(v) for v in obj]
427
+ if hasattr(obj, "item"):
428
+ try:
429
+ return obj.item()
430
+ except Exception:
431
+ return str(obj)
432
+ return obj
433
+
434
+
435
+ class IncrementalUpdateRunner:
436
+ def __init__(self, args: argparse.Namespace) -> None:
437
+ self.args = args
438
+ script_dir = Path(__file__).resolve().parents[1]
439
+ self.config_path, self.cfg = resolve_and_load_config(
440
+ args.config_json,
441
+ script_dir,
442
+ required_keys=[
443
+ "data_dir",
444
+ "model_list",
445
+ "model_categories",
446
+ "target",
447
+ "weight",
448
+ "feature_list",
449
+ "categorical_features",
450
+ ],
451
+ )
452
+ data_dir, data_format, data_path_template, dtype_map = resolve_data_config(
453
+ self.cfg,
454
+ self.config_path,
455
+ create_data_dir=True,
456
+ )
457
+ self.data_dir = data_dir
458
+ self.data_format = data_format
459
+ self.data_path_template = data_path_template
460
+ self.dtype_map = dtype_map
461
+ split_cfg = resolve_split_config(self.cfg)
462
+ runtime_cfg = resolve_runtime_config(self.cfg)
463
+ output_cfg = resolve_output_dirs(
464
+ self.cfg,
465
+ self.config_path,
466
+ output_override=args.output_dir,
467
+ )
468
+ self.runtime_cfg = runtime_cfg
469
+ self.prop_test = args.prop_test if args.prop_test is not None else split_cfg["prop_test"]
470
+ self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
471
+ self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
472
+ self.split_strategy = split_cfg["split_strategy"]
473
+ self.split_group_col = split_cfg["split_group_col"]
474
+ self.split_time_col = split_cfg["split_time_col"]
475
+ self.split_time_ascending = split_cfg["split_time_ascending"]
476
+ self.cv_strategy = split_cfg["cv_strategy"]
477
+ self.cv_group_col = split_cfg["cv_group_col"]
478
+ self.cv_time_col = split_cfg["cv_time_col"]
479
+ self.cv_time_ascending = split_cfg["cv_time_ascending"]
480
+ self.cv_splits = split_cfg["cv_splits"]
481
+ self.ft_oof_folds = split_cfg["ft_oof_folds"]
482
+ self.ft_oof_strategy = split_cfg["ft_oof_strategy"]
483
+ self.ft_oof_shuffle = split_cfg["ft_oof_shuffle"]
484
+ self.save_preprocess = runtime_cfg["save_preprocess"]
485
+ self.preprocess_artifact_path = runtime_cfg["preprocess_artifact_path"]
486
+ self.bo_sample_limit = runtime_cfg["bo_sample_limit"]
487
+ self.cache_predictions = runtime_cfg["cache_predictions"]
488
+ self.prediction_cache_dir = runtime_cfg["prediction_cache_dir"]
489
+ self.prediction_cache_format = runtime_cfg["prediction_cache_format"]
490
+ self.plot_path_style = runtime_cfg["plot_path_style"]
491
+ self.xgb_max_depth_max = runtime_cfg["xgb_max_depth_max"]
492
+ self.xgb_n_estimators_max = runtime_cfg["xgb_n_estimators_max"]
493
+ self.optuna_storage = runtime_cfg["optuna_storage"]
494
+ self.optuna_study_prefix = runtime_cfg["optuna_study_prefix"]
495
+ self.best_params_files = runtime_cfg["best_params_files"]
496
+ self.reuse_best_params = runtime_cfg["reuse_best_params"]
497
+ self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
498
+ self.model_names = self._resolve_model_names(args.model_names)
499
+ self.merge_keys = list(args.merge_keys or [])
500
+ self.timestamp_col = args.timestamp_col
501
+ self.timestamp_ascending = not args.timestamp_descending
502
+ self.output_root = output_cfg["output_dir"]
503
+
504
+ self.incremental_dir = None
505
+ if args.incremental_dir is not None:
506
+ self.incremental_dir = args.incremental_dir
507
+ if not self.incremental_dir.is_absolute():
508
+ self.incremental_dir = (self.config_path.parent / self.incremental_dir).resolve()
509
+ else:
510
+ self.incremental_dir = self.incremental_dir.resolve()
511
+ self.incremental_file = None
512
+ if args.incremental_file is not None:
513
+ self.incremental_file = args.incremental_file
514
+ if not self.incremental_file.is_absolute():
515
+ self.incremental_file = (self.config_path.parent / self.incremental_file).resolve()
516
+ else:
517
+ self.incremental_file = self.incremental_file.resolve()
518
+ self.summary_records: List[Dict[str, Any]] = []
519
+ self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")
520
+
521
+ if self.incremental_file and len(self.model_names) != 1:
522
+ raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")
523
+
524
+ def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
525
+ if override:
526
+ return dedupe_preserve_order([str(item) for item in override])
527
+ prefixes = self.cfg["model_list"]
528
+ suffixes = self.cfg["model_categories"]
529
+ return build_model_names(prefixes, suffixes)
530
+
531
+ def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
532
+ path: Optional[Path] = None
533
+ if self.incremental_file:
534
+ path = self.incremental_file
535
+ elif self.incremental_dir:
536
+ rel = self.args.incremental_template.format(model_name=model_name)
537
+ path = (self.incremental_dir / rel).resolve()
538
+ if not path or not path.exists():
539
+ return None, None
540
+ try:
541
+ df = load_dataset(
542
+ path,
543
+ data_format="auto",
544
+ dtype_map=self.dtype_map,
545
+ low_memory=False,
546
+ )
547
+ except pd.errors.EmptyDataError:
548
+ _log(f"Incremental file {path} is empty; treating as no-op.")
549
+ return None, path
550
+ except Exception as exc:
551
+ _log(f"Failed to load incremental file {path}: {exc}")
552
+ return None, path
553
+ return df, path
554
+
555
+ def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
556
+ if inc_df is None or inc_df.empty:
557
+ merged = base_df.copy(deep=True)
558
+ return merged.reset_index(drop=True)
559
+ frames = []
560
+ tag = self.args.tag_new_column
561
+ if tag:
562
+ base_part = base_df.copy(deep=True)
563
+ base_part[tag] = 0
564
+ inc_part = inc_df.copy(deep=True)
565
+ inc_part[tag] = 1
566
+ frames = [base_part, inc_part]
567
+ else:
568
+ frames = [base_df, inc_df]
569
+ merged = pd.concat(frames, ignore_index=True, sort=False)
570
+ if self.timestamp_col and self.timestamp_col in merged.columns:
571
+ merged = merged.sort_values(
572
+ self.timestamp_col,
573
+ ascending=self.timestamp_ascending,
574
+ kind="mergesort",
575
+ )
576
+ if self.merge_keys:
577
+ missing = [col for col in self.merge_keys if col not in merged.columns]
578
+ if missing:
579
+ raise KeyError(f"Merge keys {missing} not found in merged frame for {self.merge_keys}.")
580
+ merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
581
+ return merged.reset_index(drop=True)
582
+
583
+ def _should_train(self, new_rows: int) -> bool:
584
+ if self.args.train_without_incremental:
585
+ return True
586
+ min_needed = max(0, self.args.min_new_rows)
587
+ return new_rows >= min_needed
588
+
589
+ def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
590
+ dest.parent.mkdir(parents=True, exist_ok=True)
591
+ fmt = str(self.data_format or "csv").lower()
592
+ if fmt == "auto":
593
+ fmt = _infer_format_from_path(dest)
594
+ if fmt == "parquet":
595
+ df.to_parquet(dest, index=False)
596
+ elif fmt == "feather":
597
+ df.reset_index(drop=True).to_feather(dest)
598
+ else:
599
+ df.to_csv(dest, index=False)
600
+ _log(f"Wrote {len(df)} rows to {dest} ({reason}).")
601
+
602
+ def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
603
+ if not 0 < self.prop_test < 1:
604
+ raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
605
+ if len(merged) < 2:
606
+ raise ValueError("Need at least two rows to form a train/test split.")
607
+ train_df, test_df = split_train_test(
608
+ merged,
609
+ holdout_ratio=self.prop_test,
610
+ strategy=self.split_strategy,
611
+ group_col=self.split_group_col,
612
+ time_col=self.split_time_col,
613
+ time_ascending=self.split_time_ascending,
614
+ rand_seed=self.rand_seed,
615
+ reset_index_mode="always",
616
+ ratio_label="prop_test",
617
+ validate_ratio=False,
618
+ )
619
+ return train_df, test_df
620
+
621
+ def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
622
+ requested = self.args.model_keys
623
+ if "all" in requested:
624
+ requested = ["glm", "xgb", "resn", "ft", "gnn"]
625
+ requested = dedupe_preserve_order(requested)
626
+ missing = [key for key in requested if key not in trainer_map]
627
+ for key in missing:
628
+ _log(f"Trainer '{key}' is not available for this context and will be skipped.")
629
+ return [key for key in requested if key in trainer_map]
630
+
631
+ def _train_single_model(
632
+ self,
633
+ model_name: str,
634
+ merged: pd.DataFrame,
635
+ new_rows: int,
636
+ incremental_path: Optional[Path],
637
+ ) -> Dict[str, Any]:
638
+ merged = merged.copy(deep=True)
639
+ merged.fillna(0, inplace=True)
640
+ train_df, test_df = self._prepare_splits(merged)
641
+         model = ropt.BayesOptModel(
+             train_df,
+             test_df,
+             model_name,
+             self.cfg["target"],
+             self.cfg["weight"],
+             self.cfg["feature_list"],
+             task_type=self.cfg.get("task_type", "regression"),
+             binary_resp_nme=self.binary_resp,
+             cate_list=self.cfg.get("categorical_features"),
+             prop_test=self.prop_test,
+             rand_seed=self.rand_seed,
+             epochs=self.epochs,
+             use_gpu=bool(self.cfg.get("use_gpu", True)),
+             use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
+             use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
+             use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
+             use_resn_ddp=self.cfg.get("use_resn_ddp", False),
+             use_ft_ddp=self.cfg.get("use_ft_ddp", False),
+             use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
+             output_dir=str(self.output_root) if self.output_root else None,
+             xgb_max_depth_max=self.xgb_max_depth_max,
+             xgb_n_estimators_max=self.xgb_n_estimators_max,
+             resn_weight_decay=self.cfg.get("resn_weight_decay"),
+             final_ensemble=bool(self.cfg.get("final_ensemble", False)),
+             final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
+             final_refit=bool(self.cfg.get("final_refit", True)),
+             optuna_storage=self.optuna_storage,
+             optuna_study_prefix=self.optuna_study_prefix,
+             best_params_files=self.best_params_files,
+             reuse_best_params=self.reuse_best_params,
+             gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
+             gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
+             gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
+             gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
+             gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
+             gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
+             region_province_col=self.cfg.get("region_province_col"),
+             region_city_col=self.cfg.get("region_city_col"),
+             region_effect_alpha=self.cfg.get("region_effect_alpha"),
+             geo_feature_nmes=self.cfg.get("geo_feature_nmes"),
+             geo_token_hidden_dim=self.cfg.get("geo_token_hidden_dim"),
+             geo_token_layers=self.cfg.get("geo_token_layers"),
+             geo_token_dropout=self.cfg.get("geo_token_dropout"),
+             geo_token_k_neighbors=self.cfg.get("geo_token_k_neighbors"),
+             geo_token_learning_rate=self.cfg.get("geo_token_learning_rate"),
+             geo_token_epochs=self.cfg.get("geo_token_epochs"),
+             ft_role=str(self.cfg.get("ft_role", "model")),
+             ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
+             ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
+             infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
+             infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
+             cv_strategy=self.cv_strategy or self.split_strategy,
+             cv_group_col=self.cv_group_col or self.split_group_col,
+             cv_time_col=self.cv_time_col or self.split_time_col,
+             cv_time_ascending=self.cv_time_ascending,
+             cv_splits=self.cv_splits,
+             ft_oof_folds=self.ft_oof_folds,
+             ft_oof_strategy=self.ft_oof_strategy,
+             ft_oof_shuffle=self.ft_oof_shuffle,
+             save_preprocess=self.save_preprocess,
+             preprocess_artifact_path=self.preprocess_artifact_path,
+             plot_path_style=self.plot_path_style,
+             bo_sample_limit=self.bo_sample_limit,
+             cache_predictions=self.cache_predictions,
+             prediction_cache_dir=self.prediction_cache_dir,
+             prediction_cache_format=self.prediction_cache_format,
+         )
+
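+         # Optional pre-training one-way plots; the legacy per-model plot_lift_*
+         # flags only act as the default when the newer plot.enable switch is absent.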
+         if self.plot_requested and not self.args.dry_run:
+             plot_cfg = self.cfg.get("plot", {})
+             legacy_flags = {
+                 "glm": self.cfg.get("plot_lift_glm", False),
+                 "xgb": self.cfg.get("plot_lift_xgb", False),
+                 "resn": self.cfg.get("plot_lift_resn", False),
+                 "ft": self.cfg.get("plot_lift_ft", False),
+             }
+             plot_enabled = plot_cfg.get("enable", any(legacy_flags.values()))
+             if plot_enabled and plot_cfg.get("pre_oneway", False) and plot_cfg.get("oneway", True):
+                 n_bins = int(plot_cfg.get("n_bins", 10))
+                 model.plot_oneway(n_bins=n_bins, plot_subdir="oneway/pre")
+
+         requested_keys = self._requested_model_keys(model.trainers)
+         executed_keys: List[str] = []
+         param_sources: Dict[str, str] = {}
+
+         if self.args.dry_run:
+             _log(f"Dry run: would train {requested_keys} for {model_name}.")
+             return {
+                 "executed_keys": executed_keys,
+                 "param_sources": param_sources,
+                 "model": model,
+             }
+
+         if self.args.force_retune and self.args.max_evals <= 0:
+             raise ValueError("force_retune requires --max-evals > 0.")
+
+         force_retune = bool(self.args.force_retune)
+         if force_retune:
+             model.config.reuse_best_params = False
+             model.config.best_params_files = {}
+
+         ft_role = str(getattr(model.config, "ft_role", "model"))
+         if ft_role != "model" and "ft" in requested_keys:
+             requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]
+
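+         # Per-trainer loop: reuse persisted best params when possible, retune
+         # only when forced or explicitly allowed via --retune-missing, and
+         # snapshot every executed trainer through the version manager.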
+         for key in requested_keys:
+             trainer = model.trainers[key]
+
+             if force_retune:
+                 trainer.best_params = None
+                 trainer.best_trial = None
+                 param_sources[key] = "retune"
+             else:
+                 best_params = _load_best_params(model, trainer, silent=True)
+                 if best_params:
+                     trainer.best_params = best_params
+                     trainer.best_trial = None
+                     param_sources[key] = "loaded"
+                 else:
+                     if not self.args.retune_missing:
+                         _log(
+                             f"Skipping {model_name}/{key}: no best params and retuning disabled."
+                         )
+                         continue
+                     param_sources[key] = "retune"
+
+             if (trainer.best_params is None) and self.args.max_evals <= 0:
+                 raise ValueError("--max-evals must be positive when retuning is requested.")
+
+             model.optimize_model(key, max_evals=self.args.max_evals)
+             trainer.save()
+             executed_keys.append(key)
+             if key in PYTORCH_TRAINERS:
+                 ropt.free_cuda()
+
+             snapshot = {
+                 "mode": "incremental_train",
+                 "model_name": model_name,
+                 "model_key": key,
+                 "timestamp": datetime.now().isoformat(),
+                 "param_source": param_sources[key],
+                 "best_params": _to_serializable(trainer.best_params or {}),
+                 "incremental_rows": new_rows,
+                 "train_rows": len(model.train_data),
+                 "test_rows": len(model.test_data),
+                 "incremental_path": str(incremental_path) if incremental_path else None,
+                 "config": asdict(model.config),
+             }
+             model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)
+
+         if not executed_keys:
+             _log(f"No trainers executed for {model_name}.")
+
+         return {
+             "executed_keys": executed_keys,
+             "param_sources": param_sources,
+             "model": model,
+         }
+
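+     # Entry point for a full run: process every dataset, then optionally dump
+     # the accumulated summary records as JSON. Each record is a plain dict,
+     # e.g. (illustrative values only):
+     # {"model_name": "motor", "status": "trained", "trained_models": ["glm"],
+     #  "new_rows": 1200, "total_rows": 50000}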
+     def process(self) -> None:
+         total_trained = 0
+         for model_name in self.model_names:
+             total_trained += self._process_single_model(model_name)
+         if self.args.summary_json and self.summary_records:
+             summary_path = self.args.summary_json.resolve()
+             summary_path.parent.mkdir(parents=True, exist_ok=True)
+             summary_payload = _to_serializable(self.summary_records)
+             summary_path.write_text(json.dumps(summary_payload, indent=2, ensure_ascii=False), encoding="utf-8")
+             _log(f"Summary written to {summary_path}.")
+         _log(f"Finished incremental update for {total_trained} dataset(s).")
+
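+     # Handle one dataset end to end: load base + incremental data, merge,
+     # optionally persist the merged frame, then retrain when enough new rows
+     # arrived. Returns 1 when at least one trainer executed, else 0.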
+     def _process_single_model(self, model_name: str) -> int:
+         base_path = resolve_data_path(
+             self.data_dir,
+             model_name,
+             data_format=self.data_format,
+             path_template=self.data_path_template,
+         )
+         if not base_path.exists():
+             _log(f"Base dataset {base_path} not found; skipping {model_name}.")
+             self.summary_records.append({
+                 "model_name": model_name,
+                 "status": "missing_base",
+             })
+             return 0
+
+         base_df = load_dataset(
+             base_path,
+             data_format=self.data_format,
+             dtype_map=self.dtype_map,
+             low_memory=False,
+         )
+         inc_df, inc_path = self._load_incremental_df(model_name)
+         if inc_df is None and self.incremental_dir and self.args.strict_incremental and not self.args.train_without_incremental:
+             raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
+
+         new_rows = 0 if inc_df is None else len(inc_df)
+         _log(f"{model_name}: {len(base_df)} base rows, {new_rows} incremental rows.")
+         merged_df = self._merge_frames(base_df, inc_df)
+         merged_df.fillna(0, inplace=True)
+
+         if self.args.update_base_data and not self.args.dry_run:
+             self._write_dataset(merged_df, base_path, "update_base_data")
+         if self.args.persist_merged_dir and not self.args.dry_run:
+             suffix = base_path.suffix or ".csv"
+             dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}{suffix}"
+             self._write_dataset(merged_df, dest, "persist_merged_dir")
+
+         if not self._should_train(new_rows):
+             _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
+             self.summary_records.append({
+                 "model_name": model_name,
+                 "status": "skipped_no_incremental",
+                 "new_rows": new_rows,
+                 "total_rows": len(merged_df),
+             })
+             return 0
+
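+         # Training failures are recorded in the summary instead of aborting the
+         # whole batch, so one bad dataset cannot sink the remaining models.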
+         try:
+             train_result = self._train_single_model(model_name, merged_df, new_rows, inc_path)
+         except Exception as exc:
+             _log(f"Training failed for {model_name}: {exc}")
+             self.summary_records.append({
+                 "model_name": model_name,
+                 "status": "failed",
+                 "error": str(exc),
+                 "new_rows": new_rows,
+                 "total_rows": len(merged_df),
+             })
+             return 0
+
+         executed = train_result["executed_keys"]
+         param_sources = train_result["param_sources"]
+         model = train_result["model"]
+         status = "dry_run" if self.args.dry_run else "trained"
+
+         summary = {
+             "model_name": model_name,
+             "status": status,
+             "trained_models": executed,
+             "param_sources": param_sources,
+             "new_rows": new_rows,
+             "total_rows": len(merged_df),
+             "incremental_path": str(inc_path) if inc_path else None,
+         }
+         self.summary_records.append(summary)
+
+         if not self.args.dry_run and self.plot_requested and executed:
+             _plot_curves_for_model(model, executed, self.cfg)
+
+         return 1 if executed else 0
+
+
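+ # CLI wiring: configure_run_logging is optional (it appears to be None when
+ # the logging helper cannot be imported), so it is guarded before use.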
+ def main() -> None:
+     if configure_run_logging:
+         configure_run_logging(prefix="bayesopt_incremental")
+     args = _parse_args()
+     runner = IncrementalUpdateRunner(args)
+     runner.process()
+
+
+ if __name__ == "__main__":
+     main()