ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,685 @@
1
+ """Incremental training harness built on top of ``BayesOpt.py``.
2
+
3
+ This utility lets you append new observations to an existing dataset,
4
+ reuse previously tuned hyperparameters and retrain a subset of models
5
+ without re-running the full Optuna search. It can operate on a directory
6
+ of per-model incremental CSVs or a single incremental file when updating
7
+ one dataset.
8
+
9
+ Example:
10
+ python user_packages/BayesOpt_incremental.py \
11
+ --config-json user_packages/config_BayesOpt.json \
12
+ --incremental-dir ./incremental_batches \
13
+ --merge-keys policy_id vehicle_id \
14
+ --model-keys glm xgb resn --plot-curves
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ from dataclasses import asdict
22
+ from datetime import datetime
23
+ from pathlib import Path
24
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
25
+
26
+ import pandas as pd
27
+ from sklearn.model_selection import train_test_split
28
+
29
+ try:
30
+ from . import BayesOpt as ropt # type: ignore
31
+ from .cli_common import ( # type: ignore
32
+ PLOT_MODEL_LABELS,
33
+ PYTORCH_TRAINERS,
34
+ build_model_names,
35
+ dedupe_preserve_order,
36
+ load_config_json,
37
+ normalize_config_paths,
38
+ parse_model_pairs,
39
+ resolve_config_path,
40
+ resolve_path,
41
+ set_env,
42
+ )
43
+ except Exception: # pragma: no cover
44
+ import BayesOpt as ropt # type: ignore
45
+ from cli_common import ( # type: ignore
46
+ PLOT_MODEL_LABELS,
47
+ PYTORCH_TRAINERS,
48
+ build_model_names,
49
+ dedupe_preserve_order,
50
+ load_config_json,
51
+ normalize_config_paths,
52
+ parse_model_pairs,
53
+ resolve_config_path,
54
+ resolve_path,
55
+ set_env,
56
+ )
57
+
58
+
59
def _log(message: str) -> None:
    """Print ``message`` to stdout, prefixed with the ``[Incremental]`` tag."""
    print("[Incremental] {}".format(message))
61
+
62
+
63
def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Build the CLI parser, parse the arguments and validate flag combinations.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the command-line entry point
            relies on; passing an explicit list allows programmatic use.

    Returns:
        The parsed namespace.

    Raises:
        SystemExit: Raised by ``parser.error`` when both incremental sources
            are given, when ``--incremental-file`` is combined with more than
            one ``--model-names`` entry, or when no incremental source is
            given and ``--train-without-incremental`` is not set.
    """
    parser = argparse.ArgumentParser(
        description="Incrementally retrain BayesOpt models using new batches of data."
    )

    # --- What to update -------------------------------------------------
    parser.add_argument(
        "--config-json",
        required=True,
        help="Path to the JSON config that BayesOpt_entry.py uses."
    )
    parser.add_argument(
        "--model-names",
        nargs="+",
        default=None,
        help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
    )
    parser.add_argument(
        "--model-keys",
        nargs="+",
        default=["glm", "xgb", "resn", "ft"],
        choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
        help="Which trainers to run for each dataset."
    )

    # --- Where the incremental data lives -------------------------------
    parser.add_argument(
        "--incremental-dir",
        type=Path,
        default=None,
        help="Directory containing <model_name> incremental CSVs."
    )
    parser.add_argument(
        "--incremental-file",
        type=Path,
        default=None,
        help="Single incremental CSV (requires --model-names with exactly one entry)."
    )
    parser.add_argument(
        "--incremental-template",
        default="{model_name}_incremental.csv",
        help="Filename template when --incremental-dir is provided."
    )

    # --- How base and incremental rows are merged -----------------------
    parser.add_argument(
        "--merge-keys",
        nargs="+",
        default=None,
        help="Column(s) used to drop duplicate rows after merging base and incremental data."
    )
    parser.add_argument(
        "--dedupe-keep",
        choices=["first", "last"],
        default="last",
        help="How pandas.drop_duplicates resolves conflicts on merge keys."
    )
    parser.add_argument(
        "--timestamp-col",
        default=None,
        help="Optional column used to sort rows before deduplication."
    )
    parser.add_argument(
        "--timestamp-descending",
        action="store_true",
        help="Sort timestamp column in descending order before deduplication."
    )
    parser.add_argument(
        "--min-new-rows",
        type=int,
        default=1,
        help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
    )
    parser.add_argument(
        "--train-without-incremental",
        action="store_true",
        help="Always retrain even when no incremental file is present."
    )
    parser.add_argument(
        "--strict-incremental",
        action="store_true",
        help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
    )
    parser.add_argument(
        "--tag-new-column",
        default=None,
        help="If set, store 1 for incremental rows and 0 for historical rows in this column."
    )

    # --- Tuning behaviour -----------------------------------------------
    parser.add_argument(
        "--max-evals",
        type=int,
        default=25,
        help="Optuna trial count when retuning is required."
    )
    # --retune-missing / --skip-retune-missing share one destination; the
    # default is True, so only the "skip" flag actually changes anything.
    parser.add_argument(
        "--retune-missing",
        dest="retune_missing",
        action="store_true",
        default=True,
        help="Retune models whose best-params CSV is unavailable (default)."
    )
    parser.add_argument(
        "--skip-retune-missing",
        dest="retune_missing",
        action="store_false",
        help="Do not retune when best params are missing; such models are skipped."
    )
    parser.add_argument(
        "--force-retune",
        action="store_true",
        help="Run Optuna tuning even if historical best params exist."
    )

    # --- Config overrides -----------------------------------------------
    parser.add_argument(
        "--prop-test",
        type=float,
        default=None,
        help="Override the test split proportion defined in the config file."
    )
    parser.add_argument(
        "--rand-seed",
        type=int,
        default=None,
        help="Override the random seed defined in the config."
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=None,
        help="Override the epoch count from the config."
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Override the BayesOpt output root (models/results/plots)."
    )

    # --- Outputs ----------------------------------------------------------
    parser.add_argument(
        "--update-base-data",
        action="store_true",
        help="Overwrite the base CSVs with the merged dataset after a successful update."
    )
    parser.add_argument(
        "--persist-merged-dir",
        type=Path,
        default=None,
        help="Optional directory to store the merged dataset snapshots."
    )
    parser.add_argument(
        "--summary-json",
        type=Path,
        default=None,
        help="Write a JSON summary of processed datasets to this path."
    )
    parser.add_argument(
        "--plot-curves",
        action="store_true",
        help="Run one-way/lift plots after training (config plot settings also apply)."
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Merge and report counts but skip training, saving and plotting."
    )

    # argparse falls back to sys.argv[1:] when argv is None.
    args = parser.parse_args(argv)

    if args.incremental_file and args.incremental_dir:
        parser.error("Use either --incremental-dir or --incremental-file, not both.")
    if args.incremental_file and args.model_names and len(args.model_names) != 1:
        parser.error("--incremental-file can only be used when updating exactly one model.")
    if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
        parser.error(
            "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
        )
    return args
231
+
232
+
233
def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
    """Draw one-way, lift and double-lift plots for the trainers that just ran.

    Plot settings come from ``cfg["plot"]``; the top-level ``plot_lift_<key>``
    flags act as fallbacks for both the enable switch and the lift model list.

    Args:
        model: Object exposing ``plot_oneway``, ``plot_lift`` and ``plot_dlift``.
        trained: Trainer keys that were executed for this dataset.
        cfg: The loaded configuration dictionary.
    """
    plot_cfg = cfg.get("plot", {})
    legacy_flags = {
        name: cfg.get(f"plot_lift_{name}", False)
        for name in ("glm", "xgb", "resn", "ft")
    }
    # Plotting is on when explicitly enabled, or when any legacy flag is set.
    if not plot_cfg.get("enable", any(legacy_flags.values())):
        return

    n_bins = int(plot_cfg.get("n_bins", 10))
    available = dedupe_preserve_order([k for k in trained if k in PLOT_MODEL_LABELS])

    if plot_cfg.get("oneway", True):
        model.plot_oneway(n_bins=n_bins)
    if not available:
        return

    # Lift plots: explicit list > legacy flags > everything that was trained.
    lift_models = plot_cfg.get("lift_models")
    if lift_models is None:
        flagged = [name for name, flag in legacy_flags.items() if flag]
        lift_models = flagged if flagged else available
    lift_models = dedupe_preserve_order([m for m in lift_models if m in available])

    for key in lift_models:
        label, pred_nme = PLOT_MODEL_LABELS[key]
        model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=n_bins)

    # Double-lift needs two distinct models to compare.
    if len(available) < 2 or not plot_cfg.get("double_lift", True):
        return

    raw_pairs = plot_cfg.get("double_lift_pairs")
    if raw_pairs:
        pairs = [
            (a, b)
            for a, b in parse_model_pairs(raw_pairs)
            if a in available and b in available and a != b
        ]
    else:
        # Default: every unordered pair, in the order the models were trained.
        pairs = []
        for idx, first in enumerate(available):
            pairs.extend((first, second) for second in available[idx + 1:])

    for first, second in pairs:
        model.plot_dlift([first, second], n_bins=n_bins)
277
+
278
+
279
def _coerce_scalar(value: Any) -> Any:
    """Normalise a single parameter value read back from a params file.

    Strings: blank/``none``/``nan`` (case-insensitive, surrounding whitespace
    ignored) become ``None``; ``true``/``false`` become booleans; any other
    string is returned unchanged. Non-strings exposing ``.item()`` are
    unwrapped through it, falling back to the original object if that call
    fails. Everything else is returned as-is.
    """
    if not isinstance(value, str):
        if not hasattr(value, "item"):
            return value
        try:
            return value.item()
        except Exception:
            return value

    token = value.strip().lower()
    if token in ("", "none", "nan"):
        return None
    if token == "true":
        return True
    if token == "false":
        return False
    return value
293
+
294
+
295
def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
    """Load previously saved best hyperparameters for one trainer.

    The file is looked up at
    ``<model.output_manager.result_dir>/<model.model_nme>_bestparams_<label>.csv``
    where ``label`` is ``trainer.label`` lower-cased.

    Args:
        model: Model wrapper providing ``output_manager.result_dir`` and ``model_nme``.
        trainer: Trainer object providing ``label``.
        silent: Suppress the notice printed when the params file does not exist.
            A file that exists but cannot be read is always reported, because
            the caller treats ``None`` as "no params" and may retune or skip.

    Returns:
        A dict of coerced parameter values with missing scalar values dropped,
        or ``None`` when the file is absent or unreadable.
    """
    label = trainer.label.lower()
    result_dir = Path(model.output_manager.result_dir)
    path = result_dir / f"{model.model_nme}_bestparams_{label}.csv"
    if not path.exists():
        if not silent:
            _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
        return None

    try:
        params_raw = ropt.IOUtils.load_params_file(str(path))
    except Exception as exc:
        # Best-effort load: keep returning None, but do not hide the reason.
        _log(f"Failed to load best params for {model.model_nme}/{label} from {path}: {exc}")
        return None

    def _is_missing(val: Any) -> bool:
        # pd.isna returns an array for list-like input, whose truth value is
        # ambiguous; only scalars can be "missing" here.
        return bool(pd.api.types.is_scalar(val) and pd.isna(val))

    return {
        key: _coerce_scalar(val)
        for key, val in (params_raw or {}).items()
        if not _is_missing(val)
    }
312
+
313
+
314
def _to_serializable(obj: Any) -> Any:
    """Recursively convert ``obj`` into something ``json.dumps`` can encode.

    * dict values are converted recursively (keys are left untouched);
    * lists, tuples, sets and frozensets become lists of converted items;
    * ``Path`` objects become strings;
    * objects exposing ``.item()`` are unwrapped through it, falling back to
      ``str(obj)`` when that call fails;
    * anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: _to_serializable(v) for k, v in obj.items()}
    # Tuples and sets must be walked too, otherwise scalars nested inside them
    # reach json.dumps unconverted (and sets are not JSON-encodable at all).
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [_to_serializable(v) for v in obj]
    if isinstance(obj, Path):
        return str(obj)
    if hasattr(obj, "item"):
        try:
            return obj.item()
        except Exception:
            return str(obj)
    return obj
325
+
326
+
327
class IncrementalUpdateRunner:
    """Merge incremental batches into base datasets and retrain selected trainers.

    One instance is built from the parsed CLI namespace. ``process`` then walks
    every targeted dataset name: it reads ``<data_dir>/<name>.csv``, appends the
    matching incremental CSV (if any), optionally persists the merged frame,
    and trains the requested trainers through ``ropt.BayesOptModel``.
    """

    def __init__(self, args: argparse.Namespace) -> None:
        """Load the config and resolve every path/override carried by ``args``.

        Raises:
            ValueError: If ``--incremental-file`` is used while more than one
                dataset name is targeted.
        """
        self.args = args
        script_dir = Path(__file__).resolve().parent
        self.config_path = resolve_config_path(args.config_json, script_dir)
        cfg = load_config_json(
            self.config_path,
            required_keys=[
                "data_dir",
                "model_list",
                "model_categories",
                "target",
                "weight",
                "feature_list",
                "categorical_features",
            ],
        )
        self.cfg = normalize_config_paths(cfg, self.config_path)
        set_env(self.cfg.get("env", {}))
        self.data_dir = Path(self.cfg["data_dir"])
        self.data_dir.mkdir(parents=True, exist_ok=True)

        # CLI overrides win over config values, which win over built-in defaults.
        self.prop_test = args.prop_test if args.prop_test is not None else self.cfg.get("prop_test", 0.25)
        self.rand_seed = args.rand_seed if args.rand_seed is not None else self.cfg.get("rand_seed", 13)
        self.epochs = args.epochs if args.epochs is not None else self.cfg.get("epochs", 50)
        self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
        self.model_names = self._resolve_model_names(args.model_names)
        self.merge_keys = list(args.merge_keys or [])
        self.timestamp_col = args.timestamp_col
        self.timestamp_ascending = not args.timestamp_descending

        # Relative output roots are anchored at the config file's directory.
        output_root = args.output_dir or self.cfg.get("output_dir")
        if isinstance(output_root, Path) and not output_root.is_absolute():
            output_root = (self.config_path.parent / output_root).resolve()
        if isinstance(output_root, str) and output_root.strip():
            resolved = resolve_path(output_root, self.config_path.parent)
            if resolved is not None:
                output_root = str(resolved)
        self.output_root = output_root

        self.incremental_dir = self._resolve_against_config(args.incremental_dir)
        self.incremental_file = self._resolve_against_config(args.incremental_file)
        self.summary_records: List[Dict[str, Any]] = []
        self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")

        if self.incremental_file and len(self.model_names) != 1:
            raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")

    def _resolve_against_config(self, path: Optional[Path]) -> Optional[Path]:
        """Resolve ``path`` relative to the config file's directory.

        Joining with an absolute ``path`` yields that path unchanged, so
        absolute inputs are simply normalised by ``resolve()``.
        """
        if path is None:
            return None
        return (self.config_path.parent / path).resolve()

    def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
        """Return the dataset names to process.

        An explicit ``override`` is de-duplicated in order; otherwise the names
        are built from the config's ``model_list`` and ``model_categories``.
        """
        if override:
            return dedupe_preserve_order([str(item) for item in override])
        prefixes = self.cfg["model_list"]
        suffixes = self.cfg["model_categories"]
        return build_model_names(prefixes, suffixes)

    def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
        """Read the incremental CSV for ``model_name``.

        Returns:
            ``(frame, path)`` on success, ``(None, None)`` when no file is
            configured or it does not exist, and ``(None, path)`` when the file
            exists but is completely empty.
        """
        path: Optional[Path] = None
        if self.incremental_file:
            path = self.incremental_file
        elif self.incremental_dir:
            rel = self.args.incremental_template.format(model_name=model_name)
            path = (self.incremental_dir / rel).resolve()
        if not path or not path.exists():
            return None, None
        try:
            df = pd.read_csv(path, low_memory=False)
        except pd.errors.EmptyDataError:
            _log(f"Incremental file {path} is empty; treating as no-op.")
            return None, path
        return df, path

    def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
        """Concatenate base and incremental rows, then sort and de-duplicate.

        When there are no incremental rows a deep copy of ``base_df`` is
        returned (no tag column is added in that case). Otherwise the optional
        tag column is set to 0 for base rows and 1 for incremental rows, rows
        are stably sorted by the timestamp column if it is present, and
        duplicates on the merge keys are dropped.

        Raises:
            KeyError: If any merge key is absent from the concatenated frame.
        """
        if inc_df is None or inc_df.empty:
            merged = base_df.copy(deep=True)
            return merged.reset_index(drop=True)

        tag = self.args.tag_new_column
        if tag:
            # Copy before tagging so the caller's frames are not mutated.
            base_part = base_df.copy(deep=True)
            base_part[tag] = 0
            inc_part = inc_df.copy(deep=True)
            inc_part[tag] = 1
            frames = [base_part, inc_part]
        else:
            frames = [base_df, inc_df]

        merged = pd.concat(frames, ignore_index=True, sort=False)
        if self.timestamp_col and self.timestamp_col in merged.columns:
            # mergesort is stable, so ties keep base-before-incremental order.
            merged = merged.sort_values(
                self.timestamp_col,
                ascending=self.timestamp_ascending,
                kind="mergesort",
            )
        if self.merge_keys:
            missing = [col for col in self.merge_keys if col not in merged.columns]
            if missing:
                raise KeyError(f"Merge keys {missing} not found in merged frame for {self.merge_keys}.")
            merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
        return merged.reset_index(drop=True)

    def _should_train(self, new_rows: int) -> bool:
        """Return True when training should run for a batch of ``new_rows`` rows."""
        if self.args.train_without_incremental:
            return True
        # Negative thresholds are treated as zero.
        min_needed = max(0, self.args.min_new_rows)
        return new_rows >= min_needed

    def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
        """Write ``df`` to ``dest`` as CSV (creating parent directories) and log it."""
        dest.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(dest, index=False)
        _log(f"Wrote {len(df)} rows to {dest} ({reason}).")

    def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split ``merged`` into train/test frames using ``prop_test`` and ``rand_seed``.

        Raises:
            ValueError: If ``prop_test`` is not strictly between 0 and 1, or
                the frame has fewer than two rows.
        """
        if not 0 < self.prop_test < 1:
            raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
        if len(merged) < 2:
            raise ValueError("Need at least two rows to form a train/test split.")
        train_df, test_df = train_test_split(
            merged,
            test_size=self.prop_test,
            random_state=self.rand_seed,
        )
        return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

    def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
        """Return the requested trainer keys that exist in ``trainer_map``.

        ``all`` expands to every known trainer key; unavailable keys are logged
        and dropped.
        """
        requested = self.args.model_keys
        if "all" in requested:
            requested = ["glm", "xgb", "resn", "ft", "gnn"]
        requested = dedupe_preserve_order(requested)
        for key in requested:
            if key not in trainer_map:
                _log(f"Trainer '{key}' is not available for this context and will be skipped.")
        return [key for key in requested if key in trainer_map]

    def _train_single_model(
        self,
        model_name: str,
        merged: pd.DataFrame,
        new_rows: int,
        incremental_path: Optional[Path],
    ) -> Dict[str, Any]:
        """Build a ``BayesOptModel`` on a fresh split and run the requested trainers.

        Args:
            model_name: Dataset name passed to the model wrapper.
            merged: Merged dataset; it is copied and NaNs are filled with 0.
            new_rows: Incremental row count, recorded in the version snapshot.
            incremental_path: Source of the incremental rows, if any.

        Returns:
            A dict with ``executed_keys`` (trainers that ran), ``param_sources``
            (``"loaded"`` or ``"retune"`` per key) and ``model``.

        Raises:
            ValueError: If retuning is required while ``--max-evals`` is not
                positive, or the split preconditions fail.
        """
        merged = merged.copy(deep=True)
        merged.fillna(0, inplace=True)
        train_df, test_df = self._prepare_splits(merged)
        model = ropt.BayesOptModel(
            train_df,
            test_df,
            model_name,
            self.cfg["target"],
            self.cfg["weight"],
            self.cfg["feature_list"],
            task_type=self.cfg.get("task_type", "regression"),
            binary_resp_nme=self.binary_resp,
            cate_list=self.cfg.get("categorical_features"),
            prop_test=self.prop_test,
            rand_seed=self.rand_seed,
            epochs=self.epochs,
            use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
            use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
            use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
            use_resn_ddp=self.cfg.get("use_resn_ddp", False),
            use_ft_ddp=self.cfg.get("use_ft_ddp", False),
            use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
            output_dir=str(self.output_root) if self.output_root else None,
            xgb_max_depth_max=int(self.cfg.get("xgb_max_depth_max", 25)),
            xgb_n_estimators_max=int(self.cfg.get("xgb_n_estimators_max", 500)),
            optuna_storage=self.cfg.get("optuna_storage"),
            optuna_study_prefix=self.cfg.get("optuna_study_prefix"),
            best_params_files=self.cfg.get("best_params_files"),
            reuse_best_params=bool(self.cfg.get("reuse_best_params", False)),
            gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
            gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
            gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
            gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
            gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
            gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
            ft_role=str(self.cfg.get("ft_role", "model")),
            ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
            infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
            infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
        )

        requested_keys = self._requested_model_keys(model.trainers)
        executed_keys: List[str] = []
        param_sources: Dict[str, str] = {}

        if self.args.dry_run:
            _log(f"Dry run: would train {requested_keys} for {model_name}.")
            return {
                "executed_keys": executed_keys,
                "param_sources": param_sources,
                "model": model,
            }

        if self.args.force_retune and self.args.max_evals <= 0:
            raise ValueError("force_retune requires --max-evals > 0.")

        force_retune = bool(self.args.force_retune)
        if force_retune:
            # Make sure the model does not quietly reuse stored params.
            model.config.reuse_best_params = False
            model.config.best_params_files = {}

        # When FT is not a standalone model it is moved to the front of the
        # run order (presumably because later trainers consume its output --
        # confirm against BayesOpt).
        ft_role = str(getattr(model.config, "ft_role", "model"))
        if ft_role != "model" and "ft" in requested_keys:
            requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]

        for key in requested_keys:
            trainer = model.trainers[key]

            if force_retune:
                trainer.best_params = None
                trainer.best_trial = None
                param_sources[key] = "retune"
            else:
                best_params = _load_best_params(model, trainer, silent=True)
                if best_params:
                    trainer.best_params = best_params
                    trainer.best_trial = None
                    param_sources[key] = "loaded"
                else:
                    if not self.args.retune_missing:
                        _log(
                            f"Skipping {model_name}/{key}: no best params and retuning disabled."
                        )
                        continue
                    param_sources[key] = "retune"

            if (trainer.best_params is None) and self.args.max_evals <= 0:
                raise ValueError("--max-evals must be positive when retuning is requested.")

            model.optimize_model(key, max_evals=self.args.max_evals)
            trainer.save()
            executed_keys.append(key)
            if key in PYTORCH_TRAINERS:
                ropt.free_cuda()

            snapshot = {
                "mode": "incremental_train",
                "model_name": model_name,
                "model_key": key,
                "timestamp": datetime.now().isoformat(),
                "param_source": param_sources[key],
                "best_params": _to_serializable(trainer.best_params or {}),
                "incremental_rows": new_rows,
                "train_rows": len(model.train_data),
                "test_rows": len(model.test_data),
                "incremental_path": str(incremental_path) if incremental_path else None,
                "config": asdict(model.config),
            }
            model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)

        if not executed_keys:
            _log(f"No trainers executed for {model_name}.")

        return {
            "executed_keys": executed_keys,
            "param_sources": param_sources,
            "model": model,
        }

    def process(self) -> None:
        """Process every targeted dataset and optionally write the JSON summary."""
        total_trained = 0
        for model_name in self.model_names:
            total_trained += self._process_single_model(model_name)
        if self.args.summary_json and self.summary_records:
            summary_path = self.args.summary_json.resolve()
            summary_path.parent.mkdir(parents=True, exist_ok=True)
            summary_payload = _to_serializable(self.summary_records)
            summary_path.write_text(json.dumps(summary_payload, indent=2, ensure_ascii=False), encoding="utf-8")
            _log(f"Summary written to {summary_path}.")
        _log(f"Finished incremental update for {total_trained} dataset(s).")

    def _process_single_model(self, model_name: str) -> int:
        """Merge, persist and (re)train one dataset; append a summary record.

        Returns:
            1 when at least one trainer was executed, otherwise 0.

        Raises:
            FileNotFoundError: In strict mode, when no incremental data could
                be loaded from the configured directory or file.
        """
        base_path = self.data_dir / f"{model_name}.csv"
        if not base_path.exists():
            _log(f"Base dataset {base_path} not found; skipping {model_name}.")
            self.summary_records.append({
                "model_name": model_name,
                "status": "missing_base",
            })
            return 0

        base_df = pd.read_csv(base_path, low_memory=False)
        inc_df, inc_path = self._load_incremental_df(model_name)

        # Strict mode applies to both incremental sources, not only the directory.
        if inc_df is None and self.args.strict_incremental and not self.args.train_without_incremental:
            if self.incremental_dir:
                raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")
            if self.incremental_file:
                raise FileNotFoundError(f"Missing incremental file for {model_name} at {self.incremental_file}.")

        new_rows = 0 if inc_df is None else len(inc_df)
        _log(f"{model_name}: {len(base_df)} base rows, {new_rows} incremental rows.")
        merged_df = self._merge_frames(base_df, inc_df)
        merged_df.fillna(0, inplace=True)

        writes_allowed = not self.args.dry_run
        overwrite_base = bool(self.args.update_base_data and writes_allowed)

        # Snapshots are written up front so they are available even if
        # training fails later.
        if self.args.persist_merged_dir and writes_allowed:
            dest = Path(self.args.persist_merged_dir).resolve() / f"{model_name}.csv"
            self._write_dataset(merged_df, dest, "persist_merged_dir")

        if not self._should_train(new_rows):
            # Training was skipped on purpose, not failed: keep the merged rows.
            if overwrite_base:
                self._write_dataset(merged_df, base_path, "update_base_data")
            _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
            self.summary_records.append({
                "model_name": model_name,
                "status": "skipped_no_incremental",
                "new_rows": new_rows,
                "total_rows": len(merged_df),
            })
            return 0

        try:
            train_result = self._train_single_model(model_name, merged_df, new_rows, inc_path)
        except Exception as exc:
            # The base CSV is deliberately left untouched here: overwriting it
            # before a failed run would make a rerun re-append the same batch.
            _log(f"Training failed for {model_name}: {exc}")
            self.summary_records.append({
                "model_name": model_name,
                "status": "failed",
                "error": str(exc),
                "new_rows": new_rows,
                "total_rows": len(merged_df),
            })
            return 0

        # Only now is the update considered successful.
        if overwrite_base:
            self._write_dataset(merged_df, base_path, "update_base_data")

        executed = train_result["executed_keys"]
        param_sources = train_result["param_sources"]
        model = train_result["model"]
        status = "dry_run" if self.args.dry_run else "trained"

        summary = {
            "model_name": model_name,
            "status": status,
            "trained_models": executed,
            "param_sources": param_sources,
            "new_rows": new_rows,
            "total_rows": len(merged_df),
            "incremental_path": str(inc_path) if inc_path else None,
        }
        self.summary_records.append(summary)

        if not self.args.dry_run and self.plot_requested and executed:
            _plot_curves_for_model(model, executed, self.cfg)

        return 1 if executed else 0
676
+
677
+
678
def main() -> None:
    """CLI entry point: parse the arguments, build the runner and process all datasets."""
    IncrementalUpdateRunner(_parse_args()).process()


if __name__ == "__main__":
    main()