claude-turing 4.2.0 → 4.4.0

@@ -0,0 +1,586 @@
+ #!/usr/bin/env python3
+ """Incremental model update for the autoresearch pipeline.
+
+ Updates the existing best model with new data without retraining from
+ scratch. For tree models: add boosting rounds. For neural nets: fine-tune
+ with replay buffer. For sklearn: partial_fit or warm_start.
+
+ Tracks accuracy on both old and new data to detect catastrophic forgetting.
+
+ Usage:
+     python scripts/incremental_update.py exp-089 --new-data data/new_batch.csv
+     python scripts/incremental_update.py exp-089 --new-data data/new.csv --replay-ratio 0.1
+     python scripts/incremental_update.py exp-089 --new-data data/new.csv --tolerance 0.005
+     python scripts/incremental_update.py exp-089 --new-data data/new.csv --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ DEFAULT_REPLAY_RATIO = 0.1
+ DEFAULT_FORGETTING_TOLERANCE = 0.005
+ DEFAULT_NEW_ROUNDS = 50
+
+ # Model type detection
+ TREE_MODELS = {"xgboost", "lightgbm", "catboost", "gradient_boosting", "gbm"}
+ NEURAL_MODELS = {"mlp", "neural_network", "nn", "pytorch", "tensorflow", "keras", "transformer"}
+ SKLEARN_PARTIAL_FIT = {
+     "sgd", "passive_aggressive", "perceptron", "multinomial_nb",
+     "bernoulli_nb", "minibatch_kmeans",
+ }
+ SKLEARN_WARM_START = {
+     "random_forest", "gradient_boosting", "bagging", "adaboost",
+ }
+
+
+ # --- Model Type Detection ---
+
+
+ def detect_model_type(experiment: dict) -> str:
+     """Detect model type category from an experiment.
+
+     Returns one of: 'tree', 'neural', 'sklearn_partial', 'sklearn_warm', 'unknown'.
+     """
+     config = experiment.get("config", {})
+     model_type = config.get("model_type", "").lower()
+
+     if any(t in model_type for t in TREE_MODELS):
+         return "tree"
+     if any(t in model_type for t in NEURAL_MODELS):
+         return "neural"
+     if any(t in model_type for t in SKLEARN_PARTIAL_FIT):
+         return "sklearn_partial"
+     if any(t in model_type for t in SKLEARN_WARM_START):
+         return "sklearn_warm"
+
+     # Check hyperparams for hints
+     hyperparams = config.get("hyperparams", {})
+     if "n_estimators" in hyperparams and ("max_depth" in hyperparams or "num_leaves" in hyperparams):
+         return "tree"
+     if "hidden_size" in hyperparams or "layers" in hyperparams:
+         return "neural"
+
+     return "unknown"
+
+
+ # --- Update Strategies ---
+
+
+ def plan_tree_update(
+     experiment: dict,
+     new_data_size: int,
+     new_rounds: int = DEFAULT_NEW_ROUNDS,
+ ) -> dict:
+     """Plan incremental update for tree-based models.
+
+     XGBoost/LightGBM support continued boosting with new data.
+     """
+     config = experiment.get("config", {})
+     hyperparams = config.get("hyperparams", {})
+     current_rounds = hyperparams.get("n_estimators", hyperparams.get("num_boost_round", 100))
+
+     return {
+         "strategy": "continued_boosting",
+         "model_type": "tree",
+         "current_rounds": current_rounds,
+         "additional_rounds": new_rounds,
+         "total_rounds": current_rounds + new_rounds,
+         "new_data_size": new_data_size,
+         "method": "xgb_model parameter for continued training",
+         "instructions": [
+             f"Load model from {experiment.get('experiment_id', 'exp-???')}",
+             "Set xgb_model/init_model to loaded model",
+             f"Train for {new_rounds} additional rounds on new data",
+             "Evaluate on old validation set + new data",
+         ],
+     }
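
The "continued_boosting" plan above only lists the steps. As a rough illustration (not part of this module), the execution could look like the following with the XGBoost learning API; the model path, `params`, and the `X_new`/`y_new` arrays are placeholders:

```python
import xgboost as xgb

# Resume boosting from the parent experiment's saved Booster.
booster = xgb.Booster()
booster.load_model("experiments/exp-089/model.json")  # hypothetical path

dtrain_new = xgb.DMatrix(X_new, label=y_new)  # new batch only
updated = xgb.train(
    params,                  # reuse the original hyperparameters
    dtrain_new,
    num_boost_round=50,      # "additional_rounds" from the plan
    xgb_model=booster,       # continue from the existing trees
)
```

LightGBM supports the same pattern through the `init_model` argument of `lightgbm.train`.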
+
+
+ def plan_neural_update(
+     experiment: dict,
+     new_data_size: int,
+     replay_ratio: float = DEFAULT_REPLAY_RATIO,
+ ) -> dict:
+     """Plan incremental update for neural network models.
+
+     Fine-tune on new data + replay buffer from old data.
+     """
+     config = experiment.get("config", {})
+     hyperparams = config.get("hyperparams", {})
+     original_lr = hyperparams.get("learning_rate", hyperparams.get("lr", 0.001))
+
+     replay_size = int(new_data_size * replay_ratio)
+
+     return {
+         "strategy": "fine_tune_with_replay",
+         "model_type": "neural",
+         "original_lr": original_lr,
+         "fine_tune_lr": original_lr * 0.1,
+         "new_data_size": new_data_size,
+         "replay_size": replay_size,
+         "replay_ratio": replay_ratio,
+         "total_training_size": new_data_size + replay_size,
+         "method": "Load weights, reduce LR by 10x, mix new data with replay buffer",
+         "instructions": [
+             f"Load model weights from {experiment.get('experiment_id', 'exp-???')}",
+             f"Reduce learning rate to {original_lr * 0.1}",
+             f"Sample {replay_size} examples from old training data (replay buffer)",
+             f"Train on {new_data_size} new + {replay_size} replay samples",
+             "Evaluate on old validation set + new data",
+         ],
+     }
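
A minimal PyTorch sketch of the "fine_tune_with_replay" strategy, assuming `model`, `loss_fn`, `old_dataset`, `new_dataset`, and `original_lr` are already defined (all names and the weights path are illustrative, not part of this script):

```python
import torch
from torch.utils.data import ConcatDataset, DataLoader, Subset

# Restore the parent experiment's weights (path is illustrative).
model.load_state_dict(torch.load("experiments/exp-089/weights.pt"))

# Replay buffer: a random sample of old data, sized relative to the new batch.
replay_size = int(len(new_dataset) * 0.1)  # replay_ratio = 0.1
replay_idx = torch.randperm(len(old_dataset))[:replay_size].tolist()
train_set = ConcatDataset([new_dataset, Subset(old_dataset, replay_idx)])

# Fine-tune at one tenth of the original learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=original_lr * 0.1)
model.train()
for xb, yb in DataLoader(train_set, batch_size=64, shuffle=True):
    optimizer.zero_grad()
    loss = loss_fn(model(xb), yb)
    loss.backward()
    optimizer.step()
```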
+
+
+ def plan_sklearn_update(
+     experiment: dict,
+     new_data_size: int,
+     model_category: str,
+ ) -> dict:
+     """Plan incremental update for scikit-learn models."""
+     if model_category == "sklearn_partial":
+         return {
+             "strategy": "partial_fit",
+             "model_type": "sklearn",
+             "new_data_size": new_data_size,
+             "method": "Call partial_fit() with new data batch",
+             "instructions": [
+                 f"Load model from {experiment.get('experiment_id', 'exp-???')}",
+                 "Call model.partial_fit(X_new, y_new)",
+                 "Evaluate on old validation set + new data",
+             ],
+         }
+     else:
+         return {
+             "strategy": "warm_start_retrain",
+             "model_type": "sklearn",
+             "new_data_size": new_data_size,
+             "method": "Set warm_start=True, retrain on combined old+new data",
+             "instructions": [
+                 f"Load model from {experiment.get('experiment_id', 'exp-???')}",
+                 "Set warm_start=True",
+                 "Fit on combined old + new data",
+                 "Evaluate on old validation set + new data",
+             ],
+         }
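
For reference, the two scikit-learn strategies named above look roughly like this in practice; the estimators and the `X_*`/`y_*` arrays are illustrative placeholders:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

# partial_fit: incremental learners consume new batches directly.
sgd = SGDClassifier()
sgd.partial_fit(X_old, y_old, classes=np.unique(y_old))  # first call needs classes
sgd.partial_fit(X_new, y_new)

# warm_start: keep the fitted state and continue training on combined data.
rf = RandomForestClassifier(n_estimators=100, warm_start=True)
rf.fit(X_old, y_old)
rf.n_estimators += 50  # grow additional trees
rf.fit(np.concatenate([X_old, X_new]), np.concatenate([y_old, y_new]))
```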
+
+
+ def plan_update(
+     experiment: dict,
+     new_data_size: int,
+     replay_ratio: float = DEFAULT_REPLAY_RATIO,
+     new_rounds: int = DEFAULT_NEW_ROUNDS,
+ ) -> dict:
+     """Plan the update strategy based on model type.
+
+     Args:
+         experiment: The experiment to update.
+         new_data_size: Number of new data samples.
+         replay_ratio: Replay buffer size, as a fraction of the new batch, sampled from old data (neural nets).
+         new_rounds: Additional boosting rounds (tree models).
+
+     Returns:
+         Update plan with strategy, instructions, and parameters.
+     """
+     model_category = detect_model_type(experiment)
+
+     if model_category == "tree":
+         return plan_tree_update(experiment, new_data_size, new_rounds)
+     elif model_category == "neural":
+         return plan_neural_update(experiment, new_data_size, replay_ratio)
+     elif model_category in ("sklearn_partial", "sklearn_warm"):
+         return plan_sklearn_update(experiment, new_data_size, model_category)
+     else:
+         return {
+             "strategy": "unknown",
+             "model_type": "unknown",
+             "error": "Cannot determine model type for incremental update",
+             "suggestion": "Add model_type to config (e.g., 'xgboost', 'lightgbm', 'mlp')",
+         }
+
+
+ # --- Forgetting Detection ---
+
+
+ def check_forgetting(
+     old_metrics: dict[str, float],
+     new_metrics: dict[str, float],
+     primary_metric: str,
+     tolerance: float = DEFAULT_FORGETTING_TOLERANCE,
+     lower_is_better: bool = False,
+ ) -> dict:
+     """Check for catastrophic forgetting after incremental update.
+
+     Compares old data metrics before and after update.
+
+     Args:
+         old_metrics: Metrics on old validation data BEFORE update.
+         new_metrics: Metrics on old validation data AFTER update.
+         primary_metric: Primary metric name.
+         tolerance: Maximum allowed degradation.
+         lower_is_better: Whether lower metric is better.
+
+     Returns:
+         Forgetting check result with verdict and details.
+     """
+     old_val = old_metrics.get(primary_metric)
+     new_val = new_metrics.get(primary_metric)
+
+     if old_val is None or new_val is None:
+         return {
+             "verdict": "UNKNOWN",
+             "reason": f"Missing {primary_metric} in old or new metrics",
+             "old_value": old_val,
+             "new_value": new_val,
+         }
+
+     if lower_is_better:
+         degradation = new_val - old_val  # Positive means worse
+     else:
+         degradation = old_val - new_val  # Positive means worse
+
+     if degradation <= 0:
+         verdict = "PASS"
+         reason = "No degradation on old data"
+     elif degradation <= tolerance:
+         verdict = "PASS"
+         reason = f"Degradation {degradation:.4f} within tolerance {tolerance}"
+     elif degradation <= tolerance * 2:
+         verdict = "WARNING"
+         reason = f"Degradation {degradation:.4f} exceeds tolerance {tolerance} but within 2x"
+     else:
+         verdict = "FAIL"
+         reason = f"Catastrophic forgetting: degradation {degradation:.4f} >> tolerance {tolerance}"
+
+     return {
+         "verdict": verdict,
+         "reason": reason,
+         "primary_metric": primary_metric,
+         "old_value": round(float(old_val), 6),
+         "new_value": round(float(new_val), 6),
+         "degradation": round(float(degradation), 6),
+         "tolerance": tolerance,
+         "within_tolerance": degradation <= tolerance,
+     }
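
A worked example of the thresholds (numbers invented for illustration): with the default tolerance of 0.005, a drop from 0.912 to 0.905 on the old validation set falls in the warning band.

```python
result = check_forgetting(
    old_metrics={"accuracy": 0.912},   # old validation set, before the update
    new_metrics={"accuracy": 0.905},   # same set, after the update
    primary_metric="accuracy",
    tolerance=0.005,
)
# degradation = 0.912 - 0.905 = 0.007 -> above 0.005 but within 2x: verdict "WARNING"
```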
+
+
+ # --- Update Report ---
+
+
+ def build_update_report(
+     experiment: dict,
+     plan: dict,
+     old_data_metrics_before: dict[str, float] | None = None,
+     old_data_metrics_after: dict[str, float] | None = None,
+     new_data_metrics: dict[str, float] | None = None,
+     combined_metrics: dict[str, float] | None = None,
+     primary_metric: str = "accuracy",
+     tolerance: float = DEFAULT_FORGETTING_TOLERANCE,
+     lower_is_better: bool = False,
+     update_time_seconds: float | None = None,
+     full_retrain_time_seconds: float | None = None,
+ ) -> dict:
+     """Build a complete update report.
+
+     Args:
+         experiment: Original experiment.
+         plan: Update plan from plan_update().
+         old_data_metrics_before: Metrics on old data before update.
+         old_data_metrics_after: Metrics on old data after update.
+         new_data_metrics: Metrics on new data after update.
+         combined_metrics: Metrics on combined old+new data after update.
+         primary_metric: Primary metric name.
+         tolerance: Forgetting tolerance.
+         lower_is_better: Whether lower metric is better.
+         update_time_seconds: Time for incremental update.
+         full_retrain_time_seconds: Estimated time for full retrain.
+
+     Returns:
+         Complete update report.
+     """
+     exp_id = experiment.get("experiment_id", "unknown")
+
+     # Forgetting check
+     forgetting = None
+     if old_data_metrics_before and old_data_metrics_after:
+         forgetting = check_forgetting(
+             old_data_metrics_before, old_data_metrics_after,
+             primary_metric, tolerance, lower_is_better,
+         )
+
+     # Metric comparison table
+     metric_table = []
+     if old_data_metrics_before and old_data_metrics_after:
+         before_val = old_data_metrics_before.get(primary_metric)
+         after_val = old_data_metrics_after.get(primary_metric)
+         if before_val is not None and after_val is not None:
+             delta = after_val - before_val
+             metric_table.append({
+                 "dataset": "Old data",
+                 "before": round(float(before_val), 4),
+                 "after": round(float(after_val), 4),
+                 "delta": round(float(delta), 4),
+             })
+
+     if new_data_metrics:
+         new_val = new_data_metrics.get(primary_metric)
+         if new_val is not None:
+             metric_table.append({
+                 "dataset": "New data",
+                 "before": None,
+                 "after": round(float(new_val), 4),
+                 "delta": None,
+             })
+
+     if combined_metrics and old_data_metrics_before:
+         combined_val = combined_metrics.get(primary_metric)
+         before_val = old_data_metrics_before.get(primary_metric)
+         if combined_val is not None and before_val is not None:
+             delta = combined_val - before_val
+             metric_table.append({
+                 "dataset": "Combined",
+                 "before": round(float(before_val), 4),
+                 "after": round(float(combined_val), 4),
+                 "delta": round(float(delta), 4),
+             })
+
+     # Speedup
+     speedup = None
+     if update_time_seconds and full_retrain_time_seconds and full_retrain_time_seconds > 0:
+         speedup = round(full_retrain_time_seconds / update_time_seconds, 1)
+
+     return {
+         "experiment_id": exp_id,
+         "parent_experiment": exp_id,
+         "family": "update",
+         "plan": plan,
+         "metric_table": metric_table,
+         "forgetting_check": forgetting,
+         "update_time_seconds": update_time_seconds,
+         "full_retrain_time_seconds": full_retrain_time_seconds,
+         "speedup": speedup,
+         "verdict": forgetting["verdict"] if forgetting else "PENDING",
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+     }
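
Once the update has actually been executed, the same builder can be called with the measured metrics. An illustrative call, with `experiment` and `plan` as produced earlier in the pipeline and all numbers invented:

```python
report = build_update_report(
    experiment=experiment,
    plan=plan,
    old_data_metrics_before={"accuracy": 0.912},
    old_data_metrics_after={"accuracy": 0.910},
    new_data_metrics={"accuracy": 0.897},
    combined_metrics={"accuracy": 0.908},
    primary_metric="accuracy",
    update_time_seconds=180.0,
    full_retrain_time_seconds=3600.0,
)
# Degradation on old data is 0.002 <= 0.005, so report["verdict"] == "PASS";
# report["speedup"] == 20.0 (3600 s full retrain vs 180 s update).
```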
+
+
+ # --- Full Pipeline ---
+
+
+ def incremental_update(
+     exp_id: str,
+     new_data_path: str | None = None,
+     new_data_size: int | None = None,
+     replay_ratio: float = DEFAULT_REPLAY_RATIO,
+     new_rounds: int = DEFAULT_NEW_ROUNDS,
+     tolerance: float = DEFAULT_FORGETTING_TOLERANCE,
+     config_path: str = "config.yaml",
+     log_path: str = DEFAULT_LOG_PATH,
+ ) -> dict:
+     """Plan and report an incremental model update.
+
+     Args:
+         exp_id: Experiment ID to update.
+         new_data_path: Path to new data file.
+         new_data_size: Number of new samples (auto-detected from file if available).
+         replay_ratio: Replay buffer size as a fraction of the new batch, sampled from old data.
+         new_rounds: Additional boosting rounds for tree models.
+         tolerance: Forgetting tolerance.
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+
+     Returns:
+         Update report with plan, instructions, and placeholder for results.
+     """
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     experiments = load_experiments(log_path)
+     experiment = None
+     for exp in experiments:
+         if exp.get("experiment_id") == exp_id:
+             experiment = exp
+             break
+
+     if experiment is None:
+         return {"error": f"Experiment {exp_id} not found in log"}
+
+     # Determine new data size
+     if new_data_size is None and new_data_path:
+         new_data_size = _count_data_samples(new_data_path)
+     if new_data_size is None:
+         new_data_size = 0
+
+     if new_data_size == 0 and new_data_path is None:
+         return {"error": "No new data provided. Use --new-data <path> or --new-data-size <N>"}
+
+     plan = plan_update(experiment, new_data_size, replay_ratio, new_rounds)
+
+     if "error" in plan:
+         return {
+             "experiment_id": exp_id,
+             "error": plan["error"],
+             "suggestion": plan.get("suggestion"),
+         }
+
+     # Build report (actual metrics filled in after execution)
+     report = build_update_report(
+         experiment=experiment,
+         plan=plan,
+         primary_metric=primary_metric,
+         tolerance=tolerance,
+         lower_is_better=lower_is_better,
+     )
+
+     return report
+
+
+ def _count_data_samples(path: str) -> int:
+     """Count samples in a data file (CSV/JSONL)."""
+     p = Path(path)
+     if not p.exists():
+         return 0
+     try:
+         with open(p) as f:
+             count = sum(1 for _ in f)
+         # Subtract header for CSV
+         if p.suffix == ".csv" and count > 0:
+             count -= 1
+         return max(count, 0)
+     except (OSError, UnicodeDecodeError):
+         return 0
+
+
+ # --- Report Formatting ---
+
+
+ def save_update_report(report: dict, output_dir: str = "experiments/updates") -> Path:
+     """Save update report to YAML."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+     exp_id = report.get("experiment_id", "unknown")
+     ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+     filepath = out_path / f"{exp_id}-update-{ts}.yaml"
+     with open(filepath, "w") as f:
+         yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+     return filepath
+
+
+ def format_update_report(report: dict) -> str:
+     """Format update report as readable markdown."""
+     if "error" in report:
+         lines = [f"ERROR: {report['error']}"]
+         if "suggestion" in report:
+             lines.append(f"\n{report['suggestion']}")
+         return "\n".join(lines)
+
+     lines = ["# Incremental Update Report", ""]
+     lines.append(f"**Experiment:** {report.get('experiment_id', 'N/A')}")
+
+     plan = report.get("plan", {})
+     lines.append(f"**Strategy:** {plan.get('strategy', 'N/A')}")
+     lines.append(f"**Model type:** {plan.get('model_type', 'N/A')}")
+     lines.append("")
+
+     # Instructions
+     instructions = plan.get("instructions", [])
+     if instructions:
+         lines.append("**Steps:**")
+         for step in instructions:
+             lines.append(f"1. {step}")
+         lines.append("")
+
+     # Metric table
+     metric_table = report.get("metric_table", [])
+     if metric_table:
+         lines.append("| Dataset | Before | After | Delta |")
+         lines.append("|---------|--------|-------|-------|")
+         for row in metric_table:
+             before = f"{row['before']:.4f}" if row["before"] is not None else "—"
+             after = f"{row['after']:.4f}" if row["after"] is not None else "—"
+             delta = f"{row['delta']:+.4f}" if row["delta"] is not None else "(first)"
+             lines.append(f"| {row['dataset']} | {before} | {after} | {delta} |")
+         lines.append("")
+
+     # Forgetting check
+     forgetting = report.get("forgetting_check")
+     if forgetting:
+         lines.append(f"**Forgetting check:** {forgetting['verdict']}")
+         lines.append(f"**Reason:** {forgetting['reason']}")
+         lines.append("")
+
+     # Speedup
+     speedup = report.get("speedup")
+     if speedup:
+         update_time = report.get("update_time_seconds", 0)
+         retrain_time = report.get("full_retrain_time_seconds", 0)
+         lines.append(f"**Update time:** {update_time:.0f}s (vs {retrain_time:.0f}s full retrain, {speedup}x faster)")
+     elif report.get("verdict") == "PENDING":
+         lines.append("**Status:** Plan generated — run the update to get metrics")
+
+     lines.append("")
+     lines.append(f"*Generated: {report.get('generated_at', 'N/A')}*")
+     return "\n".join(lines)
+
+
+ # --- CLI ---
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Incremental model update — add new data without full retraining"
+     )
+     parser.add_argument("exp_id", nargs="?", help="Experiment ID to update")
+     parser.add_argument("--new-data", help="Path to new data file")
+     parser.add_argument("--new-data-size", type=int, help="Number of new samples")
+     parser.add_argument("--replay-ratio", type=float, default=DEFAULT_REPLAY_RATIO,
+                         help="Replay buffer ratio for neural nets")
+     parser.add_argument("--new-rounds", type=int, default=DEFAULT_NEW_ROUNDS,
+                         help="Additional boosting rounds for tree models")
+     parser.add_argument("--tolerance", type=float, default=DEFAULT_FORGETTING_TOLERANCE,
+                         help="Max allowed metric degradation on old data")
+     parser.add_argument("--config", default="config.yaml", help="Path to config.yaml")
+     parser.add_argument("--log", default=DEFAULT_LOG_PATH, help="Path to experiment log")
+     parser.add_argument("--json", action="store_true", help="Output raw JSON")
+
+     args = parser.parse_args()
+
+     if not args.exp_id:
+         parser.error("Please provide an experiment ID")
+
+     report = incremental_update(
+         exp_id=args.exp_id,
+         new_data_path=args.new_data,
+         new_data_size=args.new_data_size,
+         replay_ratio=args.replay_ratio,
+         new_rounds=args.new_rounds,
+         tolerance=args.tolerance,
+         config_path=args.config,
+         log_path=args.log,
+     )
+
+     if args.json:
+         print(json.dumps(report, indent=2))
+     else:
+         print(format_update_report(report))
+
+     if "error" not in report:
+         saved = save_update_report(report)
+         if not args.json:
+             print(f"\nSaved: {saved}")
+
+
+ if __name__ == "__main__":
+     main()