claude-turing 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +5 -2
  3. package/commands/diff.md +48 -0
  4. package/commands/regress.md +53 -0
  5. package/commands/turing.md +6 -0
  6. package/commands/watch.md +60 -0
  7. package/config/watch_alerts.yaml +36 -0
  8. package/package.json +1 -1
  9. package/src/install.js +2 -0
  10. package/src/verify.js +4 -0
  11. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  12. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  13. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  14. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  18. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  19. package/templates/scripts/__pycache__/experiment_diff.cpython-314.pyc +0 -0
  20. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  21. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  22. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  23. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  24. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  25. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  26. package/templates/scripts/__pycache__/regression_gate.cpython-314.pyc +0 -0
  27. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  28. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  29. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  30. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  31. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  32. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  33. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  34. package/templates/scripts/__pycache__/training_monitor.cpython-314.pyc +0 -0
  35. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  36. package/templates/scripts/experiment_diff.py +703 -0
  37. package/templates/scripts/generate_brief.py +44 -0
  38. package/templates/scripts/regression_gate.py +536 -0
  39. package/templates/scripts/scaffold.py +6 -0
  40. package/templates/scripts/training_monitor.py +611 -0
  41. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  42. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  43. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  44. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
@@ -292,6 +292,23 @@ def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dic
292
292
  return reports
293
293
 
294
294
 
295
+ def load_regression_checks(regress_dir: str = "experiments/regressions") -> list[dict]:
296
+ """Load all regression check reports from YAML files."""
297
+ path = Path(regress_dir)
298
+ if not path.exists():
299
+ return []
300
+ reports = []
301
+ for f in sorted(path.glob("check-*.yaml")):
302
+ try:
303
+ with open(f) as fh:
304
+ report = yaml.safe_load(fh)
305
+ if report and isinstance(report, dict):
306
+ reports.append(report)
307
+ except (yaml.YAMLError, OSError):
308
+ continue
309
+ return reports
310
+
311
+
295
312
  def format_brief(
296
313
  campaign: dict,
297
314
  best: dict | None,
@@ -309,6 +326,7 @@ def format_brief(
309
326
  diagnoses: list[dict] | None = None,
310
327
  profiles: list[dict] | None = None,
311
328
  queue_summary: dict | None = None,
329
+ regression_checks: list[dict] | None = None,
312
330
  ) -> str:
313
331
  """Format the research briefing as markdown."""
314
332
  direction = "lower" if lower_is_better else "higher"
@@ -528,6 +546,30 @@ def format_brief(
528
546
  if auto_hyps:
529
547
  lines.append(f"\n*{auto_hyps} auto-generated hypotheses from failure analysis.*")
530
548
 
549
+ # Regression check history (stability)
550
+ if regression_checks:
551
+ lines.extend(["", "## Stability", ""])
552
+ verdict_markers = {
553
+ "pass": "PASS",
554
+ "warning": "WARNING",
555
+ "fail": "FAIL",
556
+ }
557
+ for check in regression_checks:
558
+ baseline = check.get("baseline_id", "?")
559
+ verdict = check.get("verdict", "unknown")
560
+ marker = verdict_markers.get(verdict, verdict)
561
+ date = check.get("checked_at", "")[:10]
562
+ mode = check.get("mode", "?")
563
+ lines.append(f"- **{date}** [{marker}] against {baseline} ({mode} mode)")
564
+ if verdict == "fail":
565
+ per_metric = check.get("per_metric", {})
566
+ failed = [k for k, v in per_metric.items() if v.get("verdict") == "fail"]
567
+ if failed:
568
+ lines.append(f" - Failed metrics: {', '.join(failed)}")
569
+ passed = sum(1 for c in regression_checks if c.get("verdict") == "pass")
570
+ total = len(regression_checks)
571
+ lines.append(f"\n*{passed}/{total} regression checks passed.*")
572
+
531
573
  lines.extend([
532
574
  "",
533
575
  "## Recommendations",
@@ -593,6 +635,7 @@ def generate_brief(
593
635
  diagnoses = load_diagnoses()
594
636
  profiles = load_profiles()
595
637
  queue_summary = load_queue_summary()
638
+ regression_checks = load_regression_checks()
596
639
 
597
640
  return format_brief(
598
641
  campaign, best, trajectory, model_types, hypotheses,
@@ -604,6 +647,7 @@ def generate_brief(
604
647
  diagnoses=diagnoses if diagnoses else None,
605
648
  profiles=profiles if profiles else None,
606
649
  queue_summary=queue_summary,
650
+ regression_checks=regression_checks if regression_checks else None,
607
651
  )
608
652
 
609
653
 
@@ -0,0 +1,536 @@
1
+ #!/usr/bin/env python3
2
+ """Performance regression gate for the autoresearch pipeline.
3
+
4
+ After any code or dependency change, re-runs the best experiment and
5
+ verifies metrics haven't degraded. CI for your model — catches silent
6
+ regressions from library upgrades, data pipeline changes, or accidental
7
+ train.py edits.
8
+
9
+ Verdicts:
10
+ pass — all metrics within tolerance
11
+ warning — some metrics degraded within 2x tolerance
12
+ fail — any metric degraded beyond tolerance
13
+
14
+ Usage:
15
+ python scripts/regression_gate.py
16
+ python scripts/regression_gate.py --tolerance 0.01
17
+ python scripts/regression_gate.py --against exp-042
18
+ python scripts/regression_gate.py --quick
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import json
25
+ import subprocess
26
+ import sys
27
+ from datetime import datetime, timezone
28
+ from pathlib import Path
29
+
30
+ import numpy as np
31
+ import yaml
32
+
33
+ from scripts.turing_io import load_config, load_experiments
34
+
35
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
36
+ DEFAULT_TOLERANCE = 0.01 # 1% relative
37
+ DEFAULT_RUNS = 3
38
+ QUICK_RUNS = 1
39
+
40
+
41
def find_best_experiment(
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool = False,
) -> dict | None:
    """Pick the best experiment according to *primary_metric*.

    Prefers experiments whose status is "kept"; when none exist, falls
    back to any experiment that reports the metric at all. Returns None
    when there is nothing to rank.
    """
    candidates = [exp for exp in experiments if exp.get("status") == "kept"]
    if not candidates:
        # Fall back to all experiments that carry the metric
        candidates = [
            exp for exp in experiments if primary_metric in exp.get("metrics", {})
        ]
    if not candidates:
        return None

    # Experiments missing the metric sort to the losing end.
    worst = float("inf") if lower_is_better else float("-inf")

    def score(exp: dict) -> float:
        return exp.get("metrics", {}).get(primary_metric, worst)

    chooser = min if lower_is_better else max
    return chooser(candidates, key=score)
61
+
62
+
63
def capture_environment() -> dict:
    """Snapshot the current runtime environment for the regression report.

    Returns a dict with:
        python_version: interpreter version string (e.g. "3.12.1").
        packages: mapping of lowercased package name -> version from
            ``pip freeze`` (always present; empty when pip is
            unavailable or fails).
        git_commit / git_dirty: present only when git metadata can be
            read from the working directory.
    """
    # Seed "packages" so the report schema is stable even when pip
    # exits non-zero (the original only set it on the exception path).
    env: dict = {"python_version": sys.version.split()[0], "packages": {}}

    # Invoke pip through the running interpreter so we snapshot *this*
    # environment, not whichever `pip` happens to be first on PATH.
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pip", "freeze"],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode == 0:
            packages = {}
            for line in result.stdout.strip().splitlines():
                if "==" in line:
                    pkg, ver = line.split("==", 1)
                    packages[pkg.lower()] = ver
            env["packages"] = packages
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass

    # Git info is best-effort: absent keys mean "not a git checkout".
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"], capture_output=True, text=True, timeout=10,
        )
        if result.returncode == 0:
            env["git_commit"] = result.stdout.strip()

        result = subprocess.run(
            ["git", "diff", "--stat"], capture_output=True, text=True, timeout=10,
        )
        if result.returncode == 0:
            # Any diff output at all means the tree has uncommitted edits.
            env["git_dirty"] = bool(result.stdout.strip())
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass

    return env
98
+
99
+
100
+ def diff_environments(original: dict | None, current: dict) -> list[dict]:
101
+ """Compare environments and return list of differences."""
102
+ if not original:
103
+ return [{"field": "environment", "detail": "No original environment snapshot"}]
104
+
105
+ diffs = []
106
+ orig_pkgs = original.get("packages", {})
107
+ curr_pkgs = current.get("packages", {})
108
+
109
+ critical_packages = {
110
+ "numpy", "scipy", "scikit-learn", "sklearn", "pandas",
111
+ "torch", "tensorflow", "xgboost", "lightgbm", "catboost",
112
+ }
113
+
114
+ for pkg in sorted(set(orig_pkgs) | set(curr_pkgs)):
115
+ orig_ver = orig_pkgs.get(pkg)
116
+ curr_ver = curr_pkgs.get(pkg)
117
+ if orig_ver and curr_ver and orig_ver != curr_ver:
118
+ severity = "critical" if pkg in critical_packages else "info"
119
+ diffs.append({
120
+ "field": f"package:{pkg}",
121
+ "original": orig_ver,
122
+ "current": curr_ver,
123
+ "severity": severity,
124
+ "detail": f"{pkg}: {orig_ver} -> {curr_ver}",
125
+ })
126
+
127
+ orig_py = original.get("python_version")
128
+ curr_py = current.get("python_version")
129
+ if orig_py and curr_py and orig_py != curr_py:
130
+ diffs.append({
131
+ "field": "python_version",
132
+ "original": orig_py,
133
+ "current": curr_py,
134
+ "severity": "warning",
135
+ "detail": f"Python: {orig_py} -> {curr_py}",
136
+ })
137
+
138
+ return diffs
139
+
140
+
141
def _parse_metrics_block(stdout: str) -> dict:
    """Parse the first ``---``-delimited ``key: value`` block in *stdout*.

    Values are coerced to float where possible; the metadata keys
    ``model_type`` and ``train_seconds`` are always kept as strings.
    """
    metrics: dict = {}
    in_block = False
    metadata_keys = {"model_type", "train_seconds"}

    for raw in stdout.splitlines():
        line = raw.strip()
        if line == "---":
            if in_block:
                break  # closing fence: stop after the first block
            in_block = True
            continue
        if in_block and ":" in line:
            key, value = line.split(":", 1)
            key = key.strip()
            value = value.strip()
            if key in metadata_keys:
                metrics[key] = value
            else:
                try:
                    metrics[key] = float(value)
                except ValueError:
                    metrics[key] = value

    return metrics


def run_regression_check(
    seed: int,
    timeout: int = 600,
) -> dict | None:
    """Run ``train.py`` once with *seed* and return its parsed metrics.

    Metrics are read from the first ``---``-delimited block on stdout.
    Returns None when the run times out, exits non-zero, or prints no
    metrics block.
    """
    try:
        # Use the current interpreter rather than whatever `python`
        # resolves to on PATH, so the re-run happens in the same
        # environment that is being checked for regressions.
        result = subprocess.run(
            [sys.executable, "train.py", "--seed", str(seed)],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return None

    if result.returncode != 0:
        return None

    metrics = _parse_metrics_block(result.stdout)
    return metrics if metrics else None
181
+
182
+
183
def determine_verdict(
    original_metrics: dict,
    new_metrics_list: list[dict],
    primary_metric: str,
    tolerance: float,
    lower_is_better: bool = False,
) -> dict:
    """Determine regression verdict by comparing metrics.

    Per-metric verdicts: "pass" when the metric improved or degraded
    within *tolerance*, "warning" within 2x tolerance, "fail" beyond.
    The overall verdict is the worst per-metric verdict.

    Args:
        original_metrics: Original experiment metrics.
        new_metrics_list: List of metric dicts from re-run(s).
        primary_metric: Name of primary metric.
        tolerance: Relative tolerance threshold.
        lower_is_better: Whether lower metric is better.

    Returns:
        Verdict dict with overall "verdict", "per_metric" details,
        "primary_metric" and "tolerance".
    """
    per_metric = {}
    overall_verdict = "pass"

    # Compare only metrics present in the baseline AND at least one re-run.
    all_keys = set()
    for nm in new_metrics_list:
        all_keys.update(nm.keys())
    all_keys &= set(original_metrics.keys())

    # Restrict to numeric metrics; metadata keys are never compared.
    numeric_keys = sorted(
        k for k in all_keys
        if isinstance(original_metrics.get(k), (int, float))
        and k not in {"model_type", "train_seconds"}
    )

    for key in numeric_keys:
        orig_val = original_metrics[key]
        new_vals = [
            nm[key] for nm in new_metrics_list
            if key in nm and isinstance(nm.get(key), (int, float))
        ]
        if not new_vals:
            continue

        new_mean = float(np.mean(new_vals))
        delta = new_mean - orig_val
        # Fall back to the absolute delta when the baseline is exactly
        # zero, to avoid division by zero.
        rel_diff = abs(delta) / abs(orig_val) if orig_val != 0 else abs(delta)

        # A metric only "degraded" when it moved in the bad direction.
        if lower_is_better:
            degraded = delta > 0  # Higher is worse
        else:
            degraded = delta < 0  # Lower is worse

        # Note: a separate `rel_diff <= 0` pass-branch in the original
        # was unreachable (degraded implies delta != 0, hence
        # rel_diff > 0), so improvement/within-tolerance collapses to
        # a single condition.
        if not degraded or rel_diff <= tolerance:
            metric_verdict = "pass"
        elif rel_diff <= 2 * tolerance:
            metric_verdict = "warning"
        else:
            metric_verdict = "fail"

        entry = {
            "original": round(orig_val, 6),
            "new_mean": round(new_mean, 6),
            "new_values": [round(v, 6) for v in new_vals],
            "delta": round(delta, 6),
            "relative_diff": round(rel_diff, 6),
            "degraded": degraded,
            "verdict": metric_verdict,
        }
        if len(new_vals) > 1:
            # Sample standard deviation (ddof=1) across re-runs.
            entry["new_std"] = round(float(np.std(new_vals, ddof=1)), 6)

        per_metric[key] = entry

        # Escalate the overall verdict: fail > warning > pass.
        if metric_verdict == "fail":
            overall_verdict = "fail"
        elif metric_verdict == "warning" and overall_verdict == "pass":
            overall_verdict = "warning"

    return {
        "verdict": overall_verdict,
        "per_metric": per_metric,
        "primary_metric": primary_metric,
        "tolerance": tolerance,
    }
272
+
273
+
274
def regression_gate(
    tolerance: float = DEFAULT_TOLERANCE,
    against: str | None = None,
    quick: bool = False,
    n_runs: int = DEFAULT_RUNS,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
    timeout: int = 600,
) -> dict:
    """Run a complete regression check.

    Selects a baseline experiment (either `against` or the best logged
    one), re-runs train.py `n_runs` times with seeds derived from the
    baseline's seed, and compares the resulting metrics against the
    baseline via determine_verdict(). Progress goes to stderr so stdout
    stays clean for machine-readable output.

    Args:
        tolerance: Relative tolerance for metric degradation.
        against: Specific experiment ID to check against (default: best).
        quick: Quick mode — 1 run instead of full seed study.
        n_runs: Number of runs (overridden by quick).
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        timeout: Per-run timeout in seconds.

    Returns:
        Complete regression check report, or a dict with an "error" key
        when the baseline cannot be resolved or all re-runs fail.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)

    if against:
        # Find specific experiment
        baseline = None
        for exp in experiments:
            if exp.get("experiment_id") == against:
                baseline = exp
                break
        if not baseline:
            return {"error": f"Experiment {against} not found in {log_path}"}
    else:
        baseline = find_best_experiment(experiments, primary_metric, lower_is_better)
        if not baseline:
            return {"error": f"No experiments found in {log_path}"}

    baseline_metrics = baseline.get("metrics", {})
    baseline_id = baseline.get("experiment_id", "unknown")
    baseline_value = baseline_metrics.get(primary_metric)

    # Without the primary metric there is nothing to gate on.
    if baseline_value is None:
        return {"error": f"Experiment {baseline_id} has no {primary_metric} metric"}

    # Determine number of runs
    actual_runs = QUICK_RUNS if quick else n_runs

    # All progress output goes to stderr (stdout is reserved for the
    # formatted/JSON report printed by the caller).
    print(f"Regression check against {baseline_id}", file=sys.stderr)
    print(f"Baseline {primary_metric}: {baseline_value:.4f}", file=sys.stderr)
    print(f"Tolerance: {tolerance * 100:.1f}%", file=sys.stderr)
    print(f"Runs: {actual_runs} ({'quick' if quick else 'full'})", file=sys.stderr)
    print(file=sys.stderr)

    # Capture current environment
    current_env = capture_environment()
    original_env = baseline.get("environment")
    env_diffs = diff_environments(original_env, current_env)

    # Run checks
    # Reuse the baseline's seed when recorded (default 42); each run
    # offsets it by i so multi-run checks sample seed variance.
    seed = baseline.get("config", {}).get("hyperparams", {}).get("seed", 42)
    new_metrics_list = []
    failed_runs = 0

    for i in range(actual_runs):
        run_seed = seed + i
        print(f" Run {i + 1}/{actual_runs} (seed={run_seed})...", end=" ", flush=True, file=sys.stderr)
        metrics = run_regression_check(run_seed, timeout=timeout)
        # A run only counts when it produced the primary metric.
        if metrics and primary_metric in metrics:
            new_metrics_list.append(metrics)
            print(f"{primary_metric}={metrics[primary_metric]:.4f}", file=sys.stderr)
        else:
            failed_runs += 1
            print("FAILED", file=sys.stderr)

    if not new_metrics_list:
        return {
            "error": f"All {actual_runs} regression runs failed",
            "baseline_id": baseline_id,
        }

    # Determine verdict
    verdict_info = determine_verdict(
        baseline_metrics, new_metrics_list, primary_metric, tolerance, lower_is_better,
    )

    report = {
        "baseline_id": baseline_id,
        "checked_at": datetime.now(timezone.utc).isoformat(),
        "verdict": verdict_info["verdict"],
        "primary_metric": primary_metric,
        "tolerance": tolerance,
        "mode": "quick" if quick else "full",
        "n_runs": len(new_metrics_list),
        "failed_runs": failed_runs,
        "per_metric": verdict_info["per_metric"],
        "environment_diffs": env_diffs,
        "current_environment": current_env,
    }

    return report
381
+
382
+
383
def save_regression_report(report: dict, output_dir: str = "experiments/regressions") -> Path:
    """Save the regression *report* as YAML and return the file path.

    The filename embeds a UTC timestamp (check-YYYY-MM-DD-HHMMSS.yaml)
    so multiple checks run on the same day each keep their own report
    instead of silently overwriting a single daily file. The name still
    matches the `check-*.yaml` glob used by downstream loaders and
    sorts chronologically.
    """
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M%S")
    filepath = out_path / f"check-{stamp}.yaml"

    with open(filepath, "w") as f:
        yaml.dump(report, f, default_flow_style=False, sort_keys=False)

    return filepath
395
+
396
+
397
def format_regression_report(report: dict) -> str:
    """Render a regression *report* as human-readable markdown."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    verdict = report["verdict"]
    headline = {
        "pass": "PASS — No regression detected",
        "warning": "WARNING — Minor regression, investigate",
        "fail": "FAIL — REGRESSION DETECTED",
    }.get(verdict, verdict)

    def cell(value) -> str:
        # Floats are shown to 4 places; anything else verbatim.
        return f"{value:.4f}" if isinstance(value, float) else str(value)

    out = [
        f"# Regression Check: {report.get('baseline_id', '?')}",
        "",
        f"**{headline}**",
        "",
        f"*Checked {report.get('checked_at', 'N/A')[:19]}*",
        f"*Mode: {report.get('mode', '?')}, Tolerance: {report.get('tolerance', 0) * 100:.1f}%*",
        "",
        "## Metric Comparison",
        "",
        "| Metric | Baseline | Current | Delta | Rel Diff | Verdict |",
        "|--------|----------|---------|-------|----------|---------|",
    ]

    per_metric = report.get("per_metric", {})
    for name, info in per_metric.items():
        delta = info.get("delta", 0)
        rel = info.get("relative_diff", 0)
        out.append(
            f"| {name} | {cell(info.get('original', 'N/A'))} "
            f"| {cell(info.get('new_mean', 'N/A'))} "
            f"| {delta:+.4f} | {rel * 100:.2f}% | {info.get('verdict', '?').upper()} |"
        )

    # Only severity critical/warning environment changes are surfaced.
    env_changes = [
        d for d in report.get("environment_diffs", [])
        if d.get("severity") in ("critical", "warning")
    ]
    if env_changes:
        out.extend(["", "## Environment Changes", ""])
        for change in env_changes:
            out.append(
                f"- **[{change.get('severity', 'info').upper()}]** {change.get('detail', 'N/A')}"
            )
        if verdict == "fail":
            out.append("")
            out.append("*Environment changes may explain the regression.*")

    run_count = report.get("n_runs", 0)
    failed = report.get("failed_runs", 0)
    if run_count > 1 or failed > 0:
        out.extend([
            "",
            "## Run Details",
            "",
            f"- **Successful runs:** {run_count}",
            f"- **Failed runs:** {failed}",
        ])
        for name, info in per_metric.items():
            if "new_std" in info:
                out.append(f"- **{name} std:** {info['new_std']:.6f}")

    return "\n".join(out)
466
+
467
+
468
def main() -> None:
    """CLI entry point: run the gate, persist and print the report, exit by verdict."""
    parser = argparse.ArgumentParser(
        description="Performance regression gate for ML experiments",
    )
    # (flag, options) pairs keep the argument table easy to scan.
    arg_table = [
        ("--tolerance", dict(
            type=float, default=DEFAULT_TOLERANCE,
            help=f"Relative tolerance for regression (default: {DEFAULT_TOLERANCE})",
        )),
        ("--against", dict(
            help="Specific experiment ID to check against (default: best)",
        )),
        ("--quick", dict(
            action="store_true",
            help="Quick mode: 1 run instead of full seed study",
        )),
        ("--runs", dict(
            type=int, default=DEFAULT_RUNS,
            help=f"Number of regression runs (default: {DEFAULT_RUNS})",
        )),
        ("--config", dict(
            default="config.yaml",
            help="Path to config.yaml",
        )),
        ("--log", dict(
            default=DEFAULT_LOG_PATH,
            help="Path to experiment log",
        )),
        ("--timeout", dict(
            type=int, default=600,
            help="Per-run timeout in seconds (default: 600)",
        )),
        ("--json", dict(
            action="store_true",
            help="Output raw JSON instead of formatted report",
        )),
    ]
    for flag, options in arg_table:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    report = regression_gate(
        tolerance=args.tolerance,
        against=args.against,
        quick=args.quick,
        n_runs=args.runs,
        config_path=args.config,
        log_path=args.log,
        timeout=args.timeout,
    )

    # Persist the report before printing; error reports are not saved.
    if "error" not in report:
        filepath = save_regression_report(report)
        print(f"\nSaved to {filepath}", file=sys.stderr)

    # Formatted report (or raw JSON) is the only thing on stdout.
    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_regression_report(report))

    # Exit code mirrors the verdict: 1 = fail, 2 = warning, 0 otherwise.
    exit_code = {"fail": 1, "warning": 2}.get(report.get("verdict"))
    if exit_code is not None:
        sys.exit(exit_code)
534
+
535
+ if __name__ == "__main__":
536
+ main()
@@ -107,6 +107,9 @@ TEMPLATE_DIRS = {
107
107
  "experiment_queue.py",
108
108
  "smart_retry.py",
109
109
  "fork_experiment.py",
110
+ "experiment_diff.py",
111
+ "training_monitor.py",
112
+ "regression_gate.py",
110
113
  ],
111
114
  "tests": ["__init__.py", "conftest.py"],
112
115
  }
@@ -127,6 +130,9 @@ DIRECTORIES_TO_CREATE = [
127
130
  "paper/sections",
128
131
  "experiments/retries",
129
132
  "experiments/forks",
133
+ "experiments/diffs",
134
+ "experiments/monitors",
135
+ "experiments/regressions",
130
136
  "experiments/logs",
131
137
  "models/best",
132
138
  "models/archive",