ins-pricing 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. ins_pricing/README.md +60 -0
  2. ins_pricing/__init__.py +102 -0
  3. ins_pricing/governance/README.md +18 -0
  4. ins_pricing/governance/__init__.py +20 -0
  5. ins_pricing/governance/approval.py +93 -0
  6. ins_pricing/governance/audit.py +37 -0
  7. ins_pricing/governance/registry.py +99 -0
  8. ins_pricing/governance/release.py +159 -0
  9. ins_pricing/modelling/BayesOpt.py +146 -0
  10. ins_pricing/modelling/BayesOpt_USAGE.md +925 -0
  11. ins_pricing/modelling/BayesOpt_entry.py +575 -0
  12. ins_pricing/modelling/BayesOpt_incremental.py +731 -0
  13. ins_pricing/modelling/Explain_Run.py +36 -0
  14. ins_pricing/modelling/Explain_entry.py +539 -0
  15. ins_pricing/modelling/Pricing_Run.py +36 -0
  16. ins_pricing/modelling/README.md +33 -0
  17. ins_pricing/modelling/__init__.py +44 -0
  18. ins_pricing/modelling/bayesopt/__init__.py +98 -0
  19. ins_pricing/modelling/bayesopt/config_preprocess.py +303 -0
  20. ins_pricing/modelling/bayesopt/core.py +1476 -0
  21. ins_pricing/modelling/bayesopt/models.py +2196 -0
  22. ins_pricing/modelling/bayesopt/trainers.py +2446 -0
  23. ins_pricing/modelling/bayesopt/utils.py +1021 -0
  24. ins_pricing/modelling/cli_common.py +136 -0
  25. ins_pricing/modelling/explain/__init__.py +55 -0
  26. ins_pricing/modelling/explain/gradients.py +334 -0
  27. ins_pricing/modelling/explain/metrics.py +176 -0
  28. ins_pricing/modelling/explain/permutation.py +155 -0
  29. ins_pricing/modelling/explain/shap_utils.py +146 -0
  30. ins_pricing/modelling/notebook_utils.py +284 -0
  31. ins_pricing/modelling/plotting/__init__.py +45 -0
  32. ins_pricing/modelling/plotting/common.py +63 -0
  33. ins_pricing/modelling/plotting/curves.py +572 -0
  34. ins_pricing/modelling/plotting/diagnostics.py +139 -0
  35. ins_pricing/modelling/plotting/geo.py +362 -0
  36. ins_pricing/modelling/plotting/importance.py +121 -0
  37. ins_pricing/modelling/run_logging.py +133 -0
  38. ins_pricing/modelling/tests/conftest.py +8 -0
  39. ins_pricing/modelling/tests/test_cross_val_generic.py +66 -0
  40. ins_pricing/modelling/tests/test_distributed_utils.py +18 -0
  41. ins_pricing/modelling/tests/test_explain.py +56 -0
  42. ins_pricing/modelling/tests/test_geo_tokens_split.py +49 -0
  43. ins_pricing/modelling/tests/test_graph_cache.py +33 -0
  44. ins_pricing/modelling/tests/test_plotting.py +63 -0
  45. ins_pricing/modelling/tests/test_plotting_library.py +150 -0
  46. ins_pricing/modelling/tests/test_preprocessor.py +48 -0
  47. ins_pricing/modelling/watchdog_run.py +211 -0
  48. ins_pricing/pricing/README.md +44 -0
  49. ins_pricing/pricing/__init__.py +27 -0
  50. ins_pricing/pricing/calibration.py +39 -0
  51. ins_pricing/pricing/data_quality.py +117 -0
  52. ins_pricing/pricing/exposure.py +85 -0
  53. ins_pricing/pricing/factors.py +91 -0
  54. ins_pricing/pricing/monitoring.py +99 -0
  55. ins_pricing/pricing/rate_table.py +78 -0
  56. ins_pricing/production/__init__.py +21 -0
  57. ins_pricing/production/drift.py +30 -0
  58. ins_pricing/production/monitoring.py +143 -0
  59. ins_pricing/production/scoring.py +40 -0
  60. ins_pricing/reporting/README.md +20 -0
  61. ins_pricing/reporting/__init__.py +11 -0
  62. ins_pricing/reporting/report_builder.py +72 -0
  63. ins_pricing/reporting/scheduler.py +45 -0
  64. ins_pricing/setup.py +41 -0
  65. ins_pricing v2/__init__.py +23 -0
  66. ins_pricing v2/governance/__init__.py +20 -0
  67. ins_pricing v2/governance/approval.py +93 -0
  68. ins_pricing v2/governance/audit.py +37 -0
  69. ins_pricing v2/governance/registry.py +99 -0
  70. ins_pricing v2/governance/release.py +159 -0
  71. ins_pricing v2/modelling/Explain_Run.py +36 -0
  72. ins_pricing v2/modelling/Pricing_Run.py +36 -0
  73. ins_pricing v2/modelling/__init__.py +151 -0
  74. ins_pricing v2/modelling/cli_common.py +141 -0
  75. ins_pricing v2/modelling/config.py +249 -0
  76. ins_pricing v2/modelling/config_preprocess.py +254 -0
  77. ins_pricing v2/modelling/core.py +741 -0
  78. ins_pricing v2/modelling/data_container.py +42 -0
  79. ins_pricing v2/modelling/explain/__init__.py +55 -0
  80. ins_pricing v2/modelling/explain/gradients.py +334 -0
  81. ins_pricing v2/modelling/explain/metrics.py +176 -0
  82. ins_pricing v2/modelling/explain/permutation.py +155 -0
  83. ins_pricing v2/modelling/explain/shap_utils.py +146 -0
  84. ins_pricing v2/modelling/features.py +215 -0
  85. ins_pricing v2/modelling/model_manager.py +148 -0
  86. ins_pricing v2/modelling/model_plotting.py +463 -0
  87. ins_pricing v2/modelling/models.py +2203 -0
  88. ins_pricing v2/modelling/notebook_utils.py +294 -0
  89. ins_pricing v2/modelling/plotting/__init__.py +45 -0
  90. ins_pricing v2/modelling/plotting/common.py +63 -0
  91. ins_pricing v2/modelling/plotting/curves.py +572 -0
  92. ins_pricing v2/modelling/plotting/diagnostics.py +139 -0
  93. ins_pricing v2/modelling/plotting/geo.py +362 -0
  94. ins_pricing v2/modelling/plotting/importance.py +121 -0
  95. ins_pricing v2/modelling/run_logging.py +133 -0
  96. ins_pricing v2/modelling/tests/conftest.py +8 -0
  97. ins_pricing v2/modelling/tests/test_cross_val_generic.py +66 -0
  98. ins_pricing v2/modelling/tests/test_distributed_utils.py +18 -0
  99. ins_pricing v2/modelling/tests/test_explain.py +56 -0
  100. ins_pricing v2/modelling/tests/test_geo_tokens_split.py +49 -0
  101. ins_pricing v2/modelling/tests/test_graph_cache.py +33 -0
  102. ins_pricing v2/modelling/tests/test_plotting.py +63 -0
  103. ins_pricing v2/modelling/tests/test_plotting_library.py +150 -0
  104. ins_pricing v2/modelling/tests/test_preprocessor.py +48 -0
  105. ins_pricing v2/modelling/trainers.py +2447 -0
  106. ins_pricing v2/modelling/utils.py +1020 -0
  107. ins_pricing v2/modelling/watchdog_run.py +211 -0
  108. ins_pricing v2/pricing/__init__.py +27 -0
  109. ins_pricing v2/pricing/calibration.py +39 -0
  110. ins_pricing v2/pricing/data_quality.py +117 -0
  111. ins_pricing v2/pricing/exposure.py +85 -0
  112. ins_pricing v2/pricing/factors.py +91 -0
  113. ins_pricing v2/pricing/monitoring.py +99 -0
  114. ins_pricing v2/pricing/rate_table.py +78 -0
  115. ins_pricing v2/production/__init__.py +21 -0
  116. ins_pricing v2/production/drift.py +30 -0
  117. ins_pricing v2/production/monitoring.py +143 -0
  118. ins_pricing v2/production/scoring.py +40 -0
  119. ins_pricing v2/reporting/__init__.py +11 -0
  120. ins_pricing v2/reporting/report_builder.py +72 -0
  121. ins_pricing v2/reporting/scheduler.py +45 -0
  122. ins_pricing v2/scripts/BayesOpt_incremental.py +722 -0
  123. ins_pricing v2/scripts/Explain_entry.py +545 -0
  124. ins_pricing v2/scripts/__init__.py +1 -0
  125. ins_pricing v2/scripts/train.py +568 -0
  126. ins_pricing v2/setup.py +55 -0
  127. ins_pricing v2/smoke_test.py +28 -0
  128. ins_pricing-0.1.6.dist-info/METADATA +78 -0
  129. ins_pricing-0.1.6.dist-info/RECORD +169 -0
  130. ins_pricing-0.1.6.dist-info/WHEEL +5 -0
  131. ins_pricing-0.1.6.dist-info/top_level.txt +4 -0
  132. user_packages/__init__.py +105 -0
  133. user_packages legacy/BayesOpt.py +5659 -0
  134. user_packages legacy/BayesOpt_entry.py +513 -0
  135. user_packages legacy/BayesOpt_incremental.py +685 -0
  136. user_packages legacy/Pricing_Run.py +36 -0
  137. user_packages legacy/Try/BayesOpt Legacy251213.py +3719 -0
  138. user_packages legacy/Try/BayesOpt Legacy251215.py +3758 -0
  139. user_packages legacy/Try/BayesOpt lagecy251201.py +3506 -0
  140. user_packages legacy/Try/BayesOpt lagecy251218.py +3992 -0
  141. user_packages legacy/Try/BayesOpt legacy.py +3280 -0
  142. user_packages legacy/Try/BayesOpt.py +838 -0
  143. user_packages legacy/Try/BayesOptAll.py +1569 -0
  144. user_packages legacy/Try/BayesOptAllPlatform.py +909 -0
  145. user_packages legacy/Try/BayesOptCPUGPU.py +1877 -0
  146. user_packages legacy/Try/BayesOptSearch.py +830 -0
  147. user_packages legacy/Try/BayesOptSearchOrigin.py +829 -0
  148. user_packages legacy/Try/BayesOptV1.py +1911 -0
  149. user_packages legacy/Try/BayesOptV10.py +2973 -0
  150. user_packages legacy/Try/BayesOptV11.py +3001 -0
  151. user_packages legacy/Try/BayesOptV12.py +3001 -0
  152. user_packages legacy/Try/BayesOptV2.py +2065 -0
  153. user_packages legacy/Try/BayesOptV3.py +2209 -0
  154. user_packages legacy/Try/BayesOptV4.py +2342 -0
  155. user_packages legacy/Try/BayesOptV5.py +2372 -0
  156. user_packages legacy/Try/BayesOptV6.py +2759 -0
  157. user_packages legacy/Try/BayesOptV7.py +2832 -0
  158. user_packages legacy/Try/BayesOptV8Codex.py +2731 -0
  159. user_packages legacy/Try/BayesOptV8Gemini.py +2614 -0
  160. user_packages legacy/Try/BayesOptV9.py +2927 -0
  161. user_packages legacy/Try/BayesOpt_entry legacy.py +313 -0
  162. user_packages legacy/Try/ModelBayesOptSearch.py +359 -0
  163. user_packages legacy/Try/ResNetBayesOptSearch.py +249 -0
  164. user_packages legacy/Try/XgbBayesOptSearch.py +121 -0
  165. user_packages legacy/Try/xgbbayesopt.py +523 -0
  166. user_packages legacy/__init__.py +19 -0
  167. user_packages legacy/cli_common.py +124 -0
  168. user_packages legacy/notebook_utils.py +228 -0
  169. user_packages legacy/watchdog_run.py +202 -0
@@ -0,0 +1,731 @@
1
+ """Incremental training harness built on top of ``ins_pricing.bayesopt``
2
+ (compat via ``BayesOpt.py``).
3
+
4
+ This utility lets you append new observations to an existing dataset,
5
+ reuse previously tuned hyperparameters and retrain a subset of models
6
+ without re-running the full Optuna search. It can operate on a directory
7
+ of per-model incremental CSVs or a single incremental file when updating
8
+ one dataset.
9
+
10
+ Example:
11
+ python ins_pricing/modelling/BayesOpt_incremental.py \
12
+ --config-json ins_pricing/modelling/demo/config_incremental_template.json \
13
+ --incremental-dir ./incremental_batches \
14
+ --merge-keys policy_id vehicle_id \
15
+ --model-keys glm xgb resn --plot-curves
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ from dataclasses import asdict
23
+ from datetime import datetime
24
+ from pathlib import Path
25
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
26
+
27
+ import pandas as pd
28
+ from sklearn.model_selection import train_test_split
29
+
30
+ try:
31
+ from . import bayesopt as ropt # type: ignore
32
+ from .cli_common import ( # type: ignore
33
+ PLOT_MODEL_LABELS,
34
+ PYTORCH_TRAINERS,
35
+ build_model_names,
36
+ dedupe_preserve_order,
37
+ load_config_json,
38
+ normalize_config_paths,
39
+ parse_model_pairs,
40
+ resolve_config_path,
41
+ resolve_path,
42
+ set_env,
43
+ )
44
+ except Exception: # pragma: no cover
45
+ try:
46
+ import bayesopt as ropt # type: ignore
47
+ from cli_common import ( # type: ignore
48
+ PLOT_MODEL_LABELS,
49
+ PYTORCH_TRAINERS,
50
+ build_model_names,
51
+ dedupe_preserve_order,
52
+ load_config_json,
53
+ normalize_config_paths,
54
+ parse_model_pairs,
55
+ resolve_config_path,
56
+ resolve_path,
57
+ set_env,
58
+ )
59
+ except Exception:
60
+ try:
61
+ import ins_pricing.bayesopt as ropt # type: ignore
62
+ from ins_pricing.cli_common import ( # type: ignore
63
+ PLOT_MODEL_LABELS,
64
+ PYTORCH_TRAINERS,
65
+ build_model_names,
66
+ dedupe_preserve_order,
67
+ load_config_json,
68
+ normalize_config_paths,
69
+ parse_model_pairs,
70
+ resolve_config_path,
71
+ resolve_path,
72
+ set_env,
73
+ )
74
+ except Exception:
75
+ import BayesOpt as ropt # type: ignore
76
+ from cli_common import ( # type: ignore
77
+ PLOT_MODEL_LABELS,
78
+ PYTORCH_TRAINERS,
79
+ build_model_names,
80
+ dedupe_preserve_order,
81
+ load_config_json,
82
+ normalize_config_paths,
83
+ parse_model_pairs,
84
+ resolve_config_path,
85
+ resolve_path,
86
+ set_env,
87
+ )
88
+
89
+ try:
90
+ from .run_logging import configure_run_logging # type: ignore
91
+ except Exception: # pragma: no cover
92
+ try:
93
+ from run_logging import configure_run_logging # type: ignore
94
+ except Exception: # pragma: no cover
95
+ configure_run_logging = None # type: ignore
96
+
97
+
98
def _log(message: str) -> None:
    """Print ``message`` to stdout, prefixed with the ``[Incremental]`` tag."""
    tagged = f"[Incremental] {message}"
    print(tagged)
100
+
101
+
102
def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse and validate the command-line options of the incremental runner.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour; passing an
            explicit list allows programmatic use and testing.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: Raised by ``parser.error`` when
            * both ``--incremental-dir`` and ``--incremental-file`` are given,
            * ``--incremental-file`` is combined with more than one
              ``--model-names`` entry, or
            * no incremental source is given and
              ``--train-without-incremental`` is not set.
    """
    parser = argparse.ArgumentParser(
        description="Incrementally retrain BayesOpt models using new batches of data."
    )
    parser.add_argument(
        "--config-json",
        required=True,
        help="Path to the JSON config that BayesOpt_entry.py uses."
    )
    parser.add_argument(
        "--model-names",
        nargs="+",
        default=None,
        help="Optional subset of dataset names to update (defaults to model_list/model_categories Cartesian product)."
    )
    parser.add_argument(
        "--model-keys",
        nargs="+",
        default=["glm", "xgb", "resn", "ft"],
        choices=["glm", "xgb", "resn", "ft", "gnn", "all"],
        help="Which trainers to run for each dataset."
    )
    parser.add_argument(
        "--incremental-dir",
        type=Path,
        default=None,
        help="Directory containing <model_name> incremental CSVs."
    )
    parser.add_argument(
        "--incremental-file",
        type=Path,
        default=None,
        help="Single incremental CSV (requires --model-names with exactly one entry)."
    )
    parser.add_argument(
        "--incremental-template",
        default="{model_name}_incremental.csv",
        help="Filename template when --incremental-dir is provided."
    )
    parser.add_argument(
        "--merge-keys",
        nargs="+",
        default=None,
        help="Column(s) used to drop duplicate rows after merging base and incremental data."
    )
    parser.add_argument(
        "--dedupe-keep",
        choices=["first", "last"],
        default="last",
        help="How pandas.drop_duplicates resolves conflicts on merge keys."
    )
    parser.add_argument(
        "--timestamp-col",
        default=None,
        help="Optional column used to sort rows before deduplication."
    )
    parser.add_argument(
        "--timestamp-descending",
        action="store_true",
        help="Sort timestamp column in descending order before deduplication."
    )
    parser.add_argument(
        "--min-new-rows",
        type=int,
        default=1,
        help="Skip training if fewer new rows than this arrive (unless --train-without-incremental)."
    )
    parser.add_argument(
        "--train-without-incremental",
        action="store_true",
        help="Always retrain even when no incremental file is present."
    )
    parser.add_argument(
        "--strict-incremental",
        action="store_true",
        help="Raise an error when a dataset is missing its incremental CSV instead of skipping it."
    )
    parser.add_argument(
        "--tag-new-column",
        default=None,
        help="If set, store 1 for incremental rows and 0 for historical rows in this column."
    )
    parser.add_argument(
        "--max-evals",
        type=int,
        default=25,
        help="Optuna trial count when retuning is required."
    )
    # --retune-missing / --skip-retune-missing share one destination so the
    # last flag on the command line wins; retuning is on by default.
    parser.add_argument(
        "--retune-missing",
        dest="retune_missing",
        action="store_true",
        default=True,
        help="Retune models whose best-params CSV is unavailable (default)."
    )
    parser.add_argument(
        "--skip-retune-missing",
        dest="retune_missing",
        action="store_false",
        help="Do not retune when best params are missing; such models are skipped."
    )
    parser.add_argument(
        "--force-retune",
        action="store_true",
        help="Run Optuna tuning even if historical best params exist."
    )
    parser.add_argument(
        "--prop-test",
        type=float,
        default=None,
        help="Override the test split proportion defined in the config file."
    )
    parser.add_argument(
        "--rand-seed",
        type=int,
        default=None,
        help="Override the random seed defined in the config."
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=None,
        help="Override the epoch count from the config."
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Override the BayesOpt output root (models/results/plots)."
    )
    parser.add_argument(
        "--update-base-data",
        action="store_true",
        help="Overwrite the base CSVs with the merged dataset after a successful update."
    )
    parser.add_argument(
        "--persist-merged-dir",
        type=Path,
        default=None,
        help="Optional directory to store the merged dataset snapshots."
    )
    parser.add_argument(
        "--summary-json",
        type=Path,
        default=None,
        help="Write a JSON summary of processed datasets to this path."
    )
    parser.add_argument(
        "--plot-curves",
        action="store_true",
        help="Run one-way/lift plots after training (config plot settings also apply)."
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Merge and report counts but skip training, saving and plotting."
    )
    args = parser.parse_args(argv)

    if args.incremental_file and args.incremental_dir:
        parser.error("Use either --incremental-dir or --incremental-file, not both.")
    if args.incremental_file and args.model_names and len(args.model_names) != 1:
        parser.error("--incremental-file can only be used when updating exactly one model.")
    if (not args.incremental_dir and not args.incremental_file) and not args.train_without_incremental:
        parser.error(
            "Provide --incremental-dir/--incremental-file or enable --train-without-incremental."
        )
    return args
270
+
271
+
272
def _plot_curves_for_model(model: ropt.BayesOptModel, trained: List[str], cfg: Dict[str, Any]) -> None:
    """Draw one-way, lift and double-lift plots for the trained model keys.

    Plotting is driven by ``cfg["plot"]``; the legacy top-level
    ``plot_lift_<key>`` flags supply the defaults for ``enable`` and
    ``lift_models`` when the ``plot`` section does not set them.

    Args:
        model: Object exposing ``plot_oneway``, ``plot_lift`` and ``plot_dlift``.
        trained: Model keys that were trained in this run.
        cfg: Loaded configuration dictionary.
    """
    settings = cfg.get("plot", {})
    legacy_flags = {
        "glm": cfg.get("plot_lift_glm", False),
        "xgb": cfg.get("plot_lift_xgb", False),
        "resn": cfg.get("plot_lift_resn", False),
        "ft": cfg.get("plot_lift_ft", False),
    }
    if not settings.get("enable", any(legacy_flags.values())):
        return

    bins = int(settings.get("n_bins", 10))
    # Only keys that have a plot label can be drawn.
    plottable = dedupe_preserve_order([name for name in trained if name in PLOT_MODEL_LABELS])

    if settings.get("oneway", True):
        model.plot_oneway(n_bins=bins)
    if not plottable:
        return

    configured = settings.get("lift_models")
    if configured is None:
        # Fall back to the legacy flags, then to everything plottable.
        configured = [name for name, enabled in legacy_flags.items() if enabled] or plottable
    for name in dedupe_preserve_order([m for m in configured if m in plottable]):
        label, pred_nme = PLOT_MODEL_LABELS[name]
        model.plot_lift(model_label=label, pred_nme=pred_nme, n_bins=bins)

    if not (settings.get("double_lift", True) and len(plottable) >= 2):
        return

    explicit_pairs = settings.get("double_lift_pairs")
    if explicit_pairs:
        pairs = [
            (left, right)
            for left, right in parse_model_pairs(explicit_pairs)
            if left in plottable and right in plottable and left != right
        ]
    else:
        # Every unordered pair, in the order the keys appear in ``plottable``.
        pairs = []
        for idx, left in enumerate(plottable):
            for right in plottable[idx + 1 :]:
                pairs.append((left, right))
    for left, right in pairs:
        model.plot_dlift([left, right], n_bins=bins)
316
+
317
+
318
def _coerce_scalar(value: Any) -> Any:
    """Normalise one raw parameter value.

    * Strings equal (case-insensitively, after stripping) to ``""``,
      ``"none"`` or ``"nan"`` become ``None``; ``"true"``/``"false"`` become
      booleans; any other string is returned unchanged (not stripped).
    * Objects exposing ``.item()`` are unwrapped through it; if that call
      fails the object is returned as-is.
    * Everything else is returned unchanged.
    """
    if not isinstance(value, str):
        if not hasattr(value, "item"):
            return value
        try:
            return value.item()
        except Exception:
            return value

    token = value.strip().lower()
    if token in ("true", "false"):
        return token == "true"
    return None if token in ("", "none", "nan") else value
332
+
333
+
334
def _load_best_params(model: ropt.BayesOptModel, trainer, silent: bool = False) -> Optional[Dict[str, Any]]:
    """Load previously saved best hyperparameters for ``trainer``.

    The file is looked up at
    ``<model.output_manager.result_dir>/<model.model_nme>_bestparams_<label>.csv``
    where ``label`` is ``trainer.label`` lower-cased.

    Args:
        model: Model object providing ``output_manager.result_dir`` and ``model_nme``.
        trainer: Trainer object providing ``label``.
        silent: When true, suppress the log lines for a missing or unreadable file.

    Returns:
        A dict of parameter name to coerced value with missing values dropped,
        or ``None`` when the file does not exist or cannot be loaded.
    """
    label = trainer.label.lower()
    path = Path(model.output_manager.result_dir) / f"{model.model_nme}_bestparams_{label}.csv"
    if not path.exists():
        if not silent:
            _log(f"No historical params found for {model.model_nme}/{label} at {path}.")
        return None
    try:
        params_raw = ropt.IOUtils.load_params_file(str(path))
    except Exception as exc:
        # Loading stays best-effort (the caller falls back to retuning), but
        # the failure is now reported unless the caller asked for silence.
        if not silent:
            _log(f"Failed to load params for {model.model_nme}/{label} from {path}: {exc}")
        return None

    def _is_missing(val: Any) -> bool:
        # pd.isna returns an array for list-like input, whose truth value is
        # ambiguous; treat such values as present instead of raising.
        try:
            return bool(pd.isna(val))
        except (TypeError, ValueError):
            return False

    return {
        key: _coerce_scalar(val)
        for key, val in (params_raw or {}).items()
        if not _is_missing(val)
    }
351
+
352
+
353
def _to_serializable(obj: Any) -> Any:
    """Recursively convert ``obj`` into something ``json.dumps`` can encode.

    * dicts are rebuilt with converted values (keys are left untouched);
    * lists, tuples and sets become lists of converted items;
    * ``Path`` objects become strings;
    * objects exposing ``.item()`` (e.g. NumPy scalars) are unwrapped; when
      ``.item()`` fails, ``.tolist()`` is tried before falling back to
      ``str(obj)``;
    * anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: _to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [_to_serializable(v) for v in obj]
    if isinstance(obj, Path):
        return str(obj)
    if hasattr(obj, "item"):
        try:
            return obj.item()
        except Exception:
            pass
        # .item() rejects multi-element containers; prefer a real list over
        # the repr string so the values stay usable after a JSON round-trip.
        to_list = getattr(obj, "tolist", None)
        if callable(to_list):
            try:
                return _to_serializable(to_list())
            except Exception:
                pass
        return str(obj)
    return obj
364
+
365
+
366
+ class IncrementalUpdateRunner:
367
def __init__(self, args: argparse.Namespace) -> None:
    """Load the config and resolve every run setting from ``args``.

    CLI overrides (``--prop-test``, ``--rand-seed``, ``--epochs``,
    ``--output-dir``) take precedence over the config values. Relative
    output/incremental paths are anchored at the config file's directory.

    Raises:
        ValueError: If ``--incremental-file`` is used while more than one
            model name is targeted.
    """
    self.args = args
    self.config_path = resolve_config_path(args.config_json, Path(__file__).resolve().parent)
    raw_cfg = load_config_json(
        self.config_path,
        required_keys=[
            "data_dir",
            "model_list",
            "model_categories",
            "target",
            "weight",
            "feature_list",
            "categorical_features",
        ],
    )
    self.cfg = normalize_config_paths(raw_cfg, self.config_path)
    set_env(self.cfg.get("env", {}))

    self.data_dir = Path(self.cfg["data_dir"])
    self.data_dir.mkdir(parents=True, exist_ok=True)

    # CLI value wins whenever it was supplied; otherwise config, then default.
    self.prop_test = self.cfg.get("prop_test", 0.25) if args.prop_test is None else args.prop_test
    self.rand_seed = self.cfg.get("rand_seed", 13) if args.rand_seed is None else args.rand_seed
    self.epochs = self.cfg.get("epochs", 50) if args.epochs is None else args.epochs
    self.plot_requested = bool(args.plot_curves or self.cfg.get("plot_curves", False))
    self.model_names = self._resolve_model_names(args.model_names)
    self.merge_keys = list(args.merge_keys or [])
    self.timestamp_col = args.timestamp_col
    self.timestamp_ascending = not args.timestamp_descending

    config_dir = self.config_path.parent

    def _anchor(candidate: Path) -> Path:
        # Absolute paths are only normalised; relative ones hang off the config dir.
        if candidate.is_absolute():
            return candidate.resolve()
        return (config_dir / candidate).resolve()

    output_root = args.output_dir or self.cfg.get("output_dir")
    if isinstance(output_root, Path):
        if not output_root.is_absolute():
            output_root = (config_dir / output_root).resolve()
    elif isinstance(output_root, str) and output_root.strip():
        resolved = resolve_path(output_root, config_dir)
        if resolved is not None:
            output_root = str(resolved)
    self.output_root = output_root

    self.incremental_dir = None if args.incremental_dir is None else _anchor(args.incremental_dir)
    self.incremental_file = None if args.incremental_file is None else _anchor(args.incremental_file)
    self.summary_records: List[Dict[str, Any]] = []
    self.binary_resp = self.cfg.get("binary_resp_nme") or self.cfg.get("binary_target")

    if self.incremental_file and len(self.model_names) != 1:
        raise ValueError("--incremental-file can only be used when exactly one model name is targeted.")
423
+
424
def _resolve_model_names(self, override: Optional[Sequence[str]]) -> List[str]:
    """Return the dataset names to process.

    A non-empty ``override`` is stringified and de-duplicated in order;
    otherwise the names are built from the config's ``model_list`` and
    ``model_categories`` via ``build_model_names``.
    """
    if not override:
        return build_model_names(self.cfg["model_list"], self.cfg["model_categories"])
    names = [str(entry) for entry in override]
    return dedupe_preserve_order(names)
430
+
431
def _load_incremental_df(self, model_name: str) -> Tuple[Optional[pd.DataFrame], Optional[Path]]:
    """Locate and read the incremental CSV for ``model_name``.

    The single ``incremental_file`` takes precedence; otherwise the file name
    is rendered from ``args.incremental_template`` inside ``incremental_dir``.

    Returns:
        ``(frame, path)`` on success, ``(None, path)`` when the file exists
        but pandas reports it as empty, and ``(None, None)`` when no file is
        configured or it does not exist.
    """
    if self.incremental_file:
        candidate: Optional[Path] = self.incremental_file
    elif self.incremental_dir:
        filename = self.args.incremental_template.format(model_name=model_name)
        candidate = (self.incremental_dir / filename).resolve()
    else:
        candidate = None

    if candidate and candidate.exists():
        try:
            frame = pd.read_csv(candidate, low_memory=False)
        except pd.errors.EmptyDataError:
            _log(f"Incremental file {candidate} is empty; treating as no-op.")
            return None, candidate
        return frame, candidate
    return None, None
446
+
447
def _merge_frames(self, base_df: pd.DataFrame, inc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Combine historical and incremental rows into one frame.

    Steps, in order:
      1. If ``args.tag_new_column`` is set, write ``0`` for historical rows
         and ``1`` for incremental rows into that column (on copies, so the
         inputs are never mutated). The column is also written, all zeros,
         when there is no incremental data, so the output schema does not
         depend on whether a batch arrived.
      2. Concatenate base then incremental rows.
      3. If ``timestamp_col`` is present, stable-sort by it.
      4. If ``merge_keys`` are set, drop duplicates on them using
         ``args.dedupe_keep``.

    Args:
        base_df: Historical dataset.
        inc_df: Incremental batch, or ``None``/empty when nothing arrived.

    Returns:
        A new frame with a fresh ``RangeIndex``.

    Raises:
        KeyError: If any merge key is absent from the concatenated frame.
    """
    tag = self.args.tag_new_column
    if inc_df is None or inc_df.empty:
        merged = base_df.copy(deep=True)
        if tag:
            # Every row is historical when no new batch is present.
            merged[tag] = 0
        return merged.reset_index(drop=True)

    if tag:
        base_part = base_df.copy(deep=True)
        base_part[tag] = 0
        inc_part = inc_df.copy(deep=True)
        inc_part[tag] = 1
        frames = [base_part, inc_part]
    else:
        frames = [base_df, inc_df]
    merged = pd.concat(frames, ignore_index=True, sort=False)

    if self.timestamp_col and self.timestamp_col in merged.columns:
        # mergesort is stable, so rows with equal timestamps keep the
        # base-before-incremental order that drop_duplicates relies on.
        merged = merged.sort_values(
            self.timestamp_col,
            ascending=self.timestamp_ascending,
            kind="mergesort",
        )
    if self.merge_keys:
        missing = [col for col in self.merge_keys if col not in merged.columns]
        if missing:
            raise KeyError(f"Merge keys {missing} not found in merged frame for {self.merge_keys}.")
        merged = merged.drop_duplicates(subset=self.merge_keys, keep=self.args.dedupe_keep)
    return merged.reset_index(drop=True)
474
+
475
def _should_train(self, new_rows: int) -> bool:
    """Decide whether a dataset should be retrained.

    Training always runs when ``--train-without-incremental`` is set;
    otherwise ``new_rows`` must reach ``--min-new-rows`` (negative
    thresholds are treated as zero).
    """
    options = self.args
    if not options.train_without_incremental:
        threshold = max(0, options.min_new_rows)
        return new_rows >= threshold
    return True
480
+
481
def _write_dataset(self, df: pd.DataFrame, dest: Path, reason: str) -> None:
    """Write ``df`` to ``dest`` as CSV (no index), creating parent directories.

    ``reason`` is only used in the log line describing the write.
    """
    parent_dir = dest.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(dest, index=False)
    row_count = len(df)
    _log(f"Wrote {row_count} rows to {dest} ({reason}).")
485
+
486
def _prepare_splits(self, merged: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``merged`` into train and test frames with fresh indices.

    Uses ``train_test_split`` with ``self.prop_test`` as the test fraction
    and ``self.rand_seed`` as the random state.

    Raises:
        ValueError: If ``prop_test`` is not strictly between 0 and 1, or if
            ``merged`` has fewer than two rows.
    """
    fraction_ok = 0 < self.prop_test < 1
    if not fraction_ok:
        raise ValueError(f"prop_test must fall in (0, 1); got {self.prop_test}.")
    row_count = len(merged)
    if row_count < 2:
        raise ValueError("Need at least two rows to form a train/test split.")

    parts = train_test_split(
        merged,
        test_size=self.prop_test,
        random_state=self.rand_seed,
    )
    train_part, test_part = parts
    return train_part.reset_index(drop=True), test_part.reset_index(drop=True)
497
+
498
def _requested_model_keys(self, trainer_map: Dict[str, Any]) -> List[str]:
    """Return the requested trainer keys that exist in ``trainer_map``.

    ``"all"`` expands to every known key. Duplicates are removed in order,
    and each requested key missing from ``trainer_map`` is logged and dropped.
    """
    wanted = self.args.model_keys
    if "all" in wanted:
        wanted = ["glm", "xgb", "resn", "ft", "gnn"]
    wanted = dedupe_preserve_order(wanted)

    usable: List[str] = []
    for key in wanted:
        if key in trainer_map:
            usable.append(key)
        else:
            _log(f"Trainer '{key}' is not available for this context and will be skipped.")
    return usable
507
+
508
def _train_single_model(
    self,
    model_name: str,
    merged: pd.DataFrame,
    new_rows: int,
    incremental_path: Optional[Path],
) -> Dict[str, Any]:
    """Build a ``BayesOptModel`` on ``merged`` and run the requested trainers.

    Args:
        model_name: Dataset/model name passed to ``BayesOptModel`` and used
            in log lines and version snapshot names.
        merged: Merged dataset; a deep copy is taken and its missing values
            are replaced with 0 before splitting.
        new_rows: Number of incremental rows, recorded in each snapshot.
        incremental_path: Path of the incremental CSV (or ``None``),
            recorded in each snapshot.

    Returns:
        A dict with ``executed_keys`` (trainer keys that ran),
        ``param_sources`` (key -> ``"loaded"`` or ``"retune"``) and
        ``model`` (the ``BayesOptModel`` instance). In dry-run mode the
        first two are empty.

    Raises:
        ValueError: From ``_prepare_splits`` for an invalid split, when
            ``--force-retune`` is combined with ``--max-evals <= 0``, or when
            a trainer has no best params and ``--max-evals <= 0``.
    """
    # Work on a private copy so the caller's frame is not modified.
    merged = merged.copy(deep=True)
    # NOTE(review): fills every column (including any categorical/string
    # ones) with 0 — confirm this matches the preprocessing used upstream.
    merged.fillna(0, inplace=True)
    train_df, test_df = self._prepare_splits(merged)
    model = ropt.BayesOptModel(
        train_df,
        test_df,
        model_name,
        self.cfg["target"],
        self.cfg["weight"],
        self.cfg["feature_list"],
        task_type=self.cfg.get("task_type", "regression"),
        binary_resp_nme=self.binary_resp,
        cate_list=self.cfg.get("categorical_features"),
        prop_test=self.prop_test,
        rand_seed=self.rand_seed,
        epochs=self.epochs,
        use_resn_data_parallel=self.cfg.get("use_resn_data_parallel", False),
        use_ft_data_parallel=self.cfg.get("use_ft_data_parallel", True),
        use_gnn_data_parallel=self.cfg.get("use_gnn_data_parallel", False),
        use_resn_ddp=self.cfg.get("use_resn_ddp", False),
        use_ft_ddp=self.cfg.get("use_ft_ddp", False),
        use_gnn_ddp=self.cfg.get("use_gnn_ddp", False),
        output_dir=str(self.output_root) if self.output_root else None,
        xgb_max_depth_max=int(self.cfg.get("xgb_max_depth_max", 25)),
        xgb_n_estimators_max=int(self.cfg.get("xgb_n_estimators_max", 500)),
        resn_weight_decay=self.cfg.get("resn_weight_decay"),
        final_ensemble=bool(self.cfg.get("final_ensemble", False)),
        final_ensemble_k=int(self.cfg.get("final_ensemble_k", 3)),
        final_refit=bool(self.cfg.get("final_refit", True)),
        optuna_storage=self.cfg.get("optuna_storage"),
        optuna_study_prefix=self.cfg.get("optuna_study_prefix"),
        best_params_files=self.cfg.get("best_params_files"),
        reuse_best_params=bool(self.cfg.get("reuse_best_params", False)),
        gnn_use_approx_knn=self.cfg.get("gnn_use_approx_knn", True),
        gnn_approx_knn_threshold=self.cfg.get("gnn_approx_knn_threshold", 50000),
        gnn_graph_cache=self.cfg.get("gnn_graph_cache"),
        gnn_max_gpu_knn_nodes=self.cfg.get("gnn_max_gpu_knn_nodes", 200000),
        gnn_knn_gpu_mem_ratio=self.cfg.get("gnn_knn_gpu_mem_ratio", 0.9),
        gnn_knn_gpu_mem_overhead=self.cfg.get("gnn_knn_gpu_mem_overhead", 2.0),
        ft_role=str(self.cfg.get("ft_role", "model")),
        ft_feature_prefix=str(self.cfg.get("ft_feature_prefix", "ft_emb")),
        ft_num_numeric_tokens=self.cfg.get("ft_num_numeric_tokens"),
        infer_categorical_max_unique=int(self.cfg.get("infer_categorical_max_unique", 50)),
        infer_categorical_max_ratio=float(self.cfg.get("infer_categorical_max_ratio", 0.05)),
    )

    # Keep only the requested keys this model actually has a trainer for.
    requested_keys = self._requested_model_keys(model.trainers)
    executed_keys: List[str] = []
    param_sources: Dict[str, str] = {}

    # Dry run: the model object is built, but nothing is tuned, trained or saved.
    if self.args.dry_run:
        _log(f"Dry run: would train {requested_keys} for {model_name}.")
        return {
            "executed_keys": executed_keys,
            "param_sources": param_sources,
            "model": model,
        }

    if self.args.force_retune and self.args.max_evals <= 0:
        raise ValueError("force_retune requires --max-evals > 0.")

    force_retune = bool(self.args.force_retune)
    if force_retune:
        # Clear the config-level reuse settings so stored params are not picked up.
        model.config.reuse_best_params = False
        model.config.best_params_files = {}

    # When FT is not used as a plain model, run it before the other trainers.
    # NOTE(review): presumably because the other trainers consume FT-derived
    # features — confirm against BayesOptModel.
    ft_role = str(getattr(model.config, "ft_role", "model"))
    if ft_role != "model" and "ft" in requested_keys:
        requested_keys = ["ft"] + [k for k in requested_keys if k != "ft"]

    for key in requested_keys:
        trainer = model.trainers[key]

        if force_retune:
            trainer.best_params = None
            trainer.best_trial = None
            param_sources[key] = "retune"
        else:
            # silent=True: a missing params file is handled by the branches below.
            best_params = _load_best_params(model, trainer, silent=True)
            if best_params:
                trainer.best_params = best_params
                trainer.best_trial = None
                param_sources[key] = "loaded"
            else:
                if not self.args.retune_missing:
                    _log(
                        f"Skipping {model_name}/{key}: no best params and retuning disabled."
                    )
                    continue
                param_sources[key] = "retune"

        if (trainer.best_params is None) and self.args.max_evals <= 0:
            raise ValueError("--max-evals must be positive when retuning is requested.")

        # Called for loaded and retuned trainers alike.
        # NOTE(review): presumably optimize_model skips the search when
        # trainer.best_params is already set — confirm in BayesOptModel.
        model.optimize_model(key, max_evals=self.args.max_evals)
        trainer.save()
        executed_keys.append(key)
        if key in PYTORCH_TRAINERS:
            ropt.free_cuda()

        # Record what was trained, and with which parameters, for this key.
        snapshot = {
            "mode": "incremental_train",
            "model_name": model_name,
            "model_key": key,
            "timestamp": datetime.now().isoformat(),
            "param_source": param_sources[key],
            "best_params": _to_serializable(trainer.best_params or {}),
            "incremental_rows": new_rows,
            "train_rows": len(model.train_data),
            "test_rows": len(model.test_data),
            "incremental_path": str(incremental_path) if incremental_path else None,
            "config": asdict(model.config),
        }
        model.version_manager.save(f"{model_name}_{key}_incremental", snapshot)

    if not executed_keys:
        _log(f"No trainers executed for {model_name}.")

    return {
        "executed_keys": executed_keys,
        "param_sources": param_sources,
        "model": model,
    }
638
+
639
def process(self) -> None:
    """Run the incremental update for every configured model and report.

    Each name in ``self.model_names`` is passed to
    ``_process_single_model``; the integers it returns are summed and
    reported in the closing log line. When ``self.args.summary_json`` is set
    and at least one summary record was collected, the records are converted
    with ``_to_serializable`` and written to that path as indented UTF-8
    JSON, creating parent directories as needed.
    """
    # The generator is consumed in order, so models are still processed
    # strictly one after another.
    trained_count = sum(
        self._process_single_model(name) for name in self.model_names
    )

    summary_target = self.args.summary_json
    if summary_target and self.summary_records:
        out_path = summary_target.resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        rendered = json.dumps(
            _to_serializable(self.summary_records),
            indent=2,
            ensure_ascii=False,
        )
        out_path.write_text(rendered, encoding="utf-8")
        _log(f"Summary written to {out_path}.")

    _log(f"Finished incremental update for {trained_count} dataset(s).")
650
+
651
def _process_single_model(self, model_name: str) -> int:
    """Merge, optionally persist, and retrain the dataset of one model.

    Steps, in order:

    1. Read ``<data_dir>/<model_name>.csv``; if the file does not exist a
       ``missing_base`` record is appended and ``0`` is returned.
    2. Load the incremental frame via ``_load_incremental_df`` and merge it
       with the base frame via ``_merge_frames``; missing values in the
       merged frame are filled with ``0`` in place.
    3. Unless this is a dry run, write the merged frame back to the base
       path (``update_base_data``) and/or into ``persist_merged_dir``.
    4. If ``_should_train(new_rows)`` is falsy, append a
       ``skipped_no_incremental`` record and return ``0``.
    5. Otherwise call ``_train_single_model``. Any exception it raises is
       logged and recorded as ``failed`` (returning ``0``); on success a
       ``dry_run``/``trained`` record is appended and curves are plotted
       when requested.

    Args:
        model_name: Dataset/model identifier; also the CSV file stem.

    Returns:
        ``1`` if at least one trainer key was executed, otherwise ``0``.

    Raises:
        FileNotFoundError: No incremental frame was loaded while an
            incremental directory is configured, ``strict_incremental`` is
            set and ``train_without_incremental`` is not.
    """
    csv_name = f"{model_name}.csv"
    base_path = self.data_dir / csv_name
    if not base_path.exists():
        _log(f"Base dataset {base_path} not found; skipping {model_name}.")
        self.summary_records.append({
            "model_name": model_name,
            "status": "missing_base",
        })
        return 0

    base_frame = pd.read_csv(base_path, low_memory=False)
    inc_frame, inc_path = self._load_incremental_df(model_name)
    if inc_frame is None and self.incremental_dir:
        if self.args.strict_incremental and not self.args.train_without_incremental:
            raise FileNotFoundError(f"Missing incremental file for {model_name} under {self.incremental_dir}.")

    new_rows = len(inc_frame) if inc_frame is not None else 0
    _log(f"{model_name}: {len(base_frame)} base rows, {new_rows} incremental rows.")
    merged_df = self._merge_frames(base_frame, inc_frame)
    merged_df.fillna(0, inplace=True)

    def row_counts() -> dict:
        # Evaluated lazily so the frame length is read at recording time.
        return {"new_rows": new_rows, "total_rows": len(merged_df)}

    # Nothing is written to disk during a dry run.
    if not self.args.dry_run:
        if self.args.update_base_data:
            self._write_dataset(merged_df, base_path, "update_base_data")
        if self.args.persist_merged_dir:
            dest = Path(self.args.persist_merged_dir).resolve() / csv_name
            self._write_dataset(merged_df, dest, "persist_merged_dir")

    if not self._should_train(new_rows):
        _log(f"{model_name}: below min_new_rows ({self.args.min_new_rows}); skipping retrain.")
        self.summary_records.append({
            "model_name": model_name,
            "status": "skipped_no_incremental",
            **row_counts(),
        })
        return 0

    try:
        outcome = self._train_single_model(model_name, merged_df, new_rows, inc_path)
    except Exception as exc:
        # One failing model must not abort the remaining ones; the error is
        # logged and kept in the summary instead.
        _log(f"Training failed for {model_name}: {exc}")
        self.summary_records.append({
            "model_name": model_name,
            "status": "failed",
            "error": str(exc),
            **row_counts(),
        })
        return 0

    executed = outcome["executed_keys"]
    param_sources = outcome["param_sources"]
    model = outcome["model"]

    if self.args.dry_run:
        status = "dry_run"
    else:
        status = "trained"

    self.summary_records.append({
        "model_name": model_name,
        "status": status,
        "trained_models": executed,
        "param_sources": param_sources,
        **row_counts(),
        "incremental_path": str(inc_path) if inc_path else None,
    })

    if not self.args.dry_run:
        if self.plot_requested and executed:
            _plot_curves_for_model(model, executed, self.cfg)

    return int(bool(executed))
720
+
721
+
722
def main() -> None:
    """Command-line entry point for the incremental update workflow.

    Configures run logging when ``configure_run_logging`` is truthy, parses
    the command-line arguments, then builds an ``IncrementalUpdateRunner``
    from them and runs its ``process`` method.
    """
    if configure_run_logging:
        configure_run_logging(prefix="bayesopt_incremental")
    IncrementalUpdateRunner(_parse_args()).process()


# Only run the workflow when executed as a script, not on import.
if __name__ == "__main__":
    main()