claude-turing 2.2.1 → 2.4.0

@@ -0,0 +1,536 @@
+#!/usr/bin/env python3
+"""Performance regression gate for the autoresearch pipeline.
+
+After any code or dependency change, re-runs the best experiment and
+verifies metrics haven't degraded. CI for your model — catches silent
+regressions from library upgrades, data pipeline changes, or accidental
+train.py edits.
+
+Verdicts:
+    pass — all metrics within tolerance
+    warning — some metrics degraded within 2x tolerance
+    fail — any metric degraded beyond tolerance
+
+Usage:
+    python scripts/regression_gate.py
+    python scripts/regression_gate.py --tolerance 0.01
+    python scripts/regression_gate.py --against exp-042
+    python scripts/regression_gate.py --quick
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_TOLERANCE = 0.01  # 1% relative
+DEFAULT_RUNS = 3
+QUICK_RUNS = 1
+
+
+def find_best_experiment(
+    experiments: list[dict],
+    primary_metric: str,
+    lower_is_better: bool = False,
+) -> dict | None:
+    """Find the best experiment by primary metric."""
+    kept = [e for e in experiments if e.get("status") == "kept"]
+    if not kept:
+        # Fall back to all experiments with the metric
+        kept = [e for e in experiments if primary_metric in e.get("metrics", {})]
+
+    if not kept:
+        return None
+
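+    # Missing metrics fall back to the worst possible sentinel value, so an
+    # experiment lacking the primary metric can never be selected as best.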
+    def metric_val(exp):
+        return exp.get("metrics", {}).get(primary_metric, float("inf") if lower_is_better else float("-inf"))
+
+    if lower_is_better:
+        return min(kept, key=metric_val)
+    return max(kept, key=metric_val)
+
+
+def capture_environment() -> dict:
+    """Capture current environment for regression report."""
+    env = {"python_version": sys.version.split()[0]}
+
+    try:
+        # Query pip via the running interpreter so the snapshot matches the
+        # environment this process actually uses.
+        result = subprocess.run(
+            [sys.executable, "-m", "pip", "freeze"], capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0:
+            packages = {}
+            for line in result.stdout.strip().splitlines():
+                if "==" in line:
+                    pkg, ver = line.split("==", 1)
+                    packages[pkg.lower()] = ver
+            env["packages"] = packages
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        env["packages"] = {}
+
+    # Git info
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "HEAD"], capture_output=True, text=True, timeout=10,
+        )
+        if result.returncode == 0:
+            env["git_commit"] = result.stdout.strip()
+
+        result = subprocess.run(
+            ["git", "diff", "--stat"], capture_output=True, text=True, timeout=10,
+        )
+        if result.returncode == 0:
+            env["git_dirty"] = bool(result.stdout.strip())
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass
+
+    return env
+
+
+def diff_environments(original: dict | None, current: dict) -> list[dict]:
+    """Compare environments and return list of differences."""
+    if not original:
+        return [{"field": "environment", "detail": "No original environment snapshot"}]
+
+    diffs = []
+    orig_pkgs = original.get("packages", {})
+    curr_pkgs = current.get("packages", {})
+
+    critical_packages = {
+        "numpy", "scipy", "scikit-learn", "sklearn", "pandas",
+        "torch", "tensorflow", "xgboost", "lightgbm", "catboost",
+    }
+
+    for pkg in sorted(set(orig_pkgs) | set(curr_pkgs)):
+        orig_ver = orig_pkgs.get(pkg)
+        curr_ver = curr_pkgs.get(pkg)
+        if orig_ver and curr_ver and orig_ver != curr_ver:
+            severity = "critical" if pkg in critical_packages else "info"
+            diffs.append({
+                "field": f"package:{pkg}",
+                "original": orig_ver,
+                "current": curr_ver,
+                "severity": severity,
+                "detail": f"{pkg}: {orig_ver} -> {curr_ver}",
+            })
+
+    orig_py = original.get("python_version")
+    curr_py = current.get("python_version")
+    if orig_py and curr_py and orig_py != curr_py:
+        diffs.append({
+            "field": "python_version",
+            "original": orig_py,
+            "current": curr_py,
+            "severity": "warning",
+            "detail": f"Python: {orig_py} -> {curr_py}",
+        })
+
+    return diffs
+
+
+def run_regression_check(
+    seed: int,
+    timeout: int = 600,
+) -> dict | None:
+    """Run train.py once and return parsed metrics."""
+    try:
+        # sys.executable instead of a bare "python" keeps the child process
+        # on the same interpreter (and virtualenv) as this gate.
+        result = subprocess.run(
+            [sys.executable, "train.py", "--seed", str(seed)],
+            capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        return None
+
+    if result.returncode != 0:
+        return None
+
+    metrics = {}
+    in_block = False
+    metadata_keys = {"model_type", "train_seconds"}
+
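+    # Expected train.py stdout contract, inferred from the parser below:
+    # one "key: value" pair per line between two "---" delimiter lines, e.g.
+    #   ---
+    #   accuracy: 0.9412
+    #   model_type: xgboost
+    #   train_seconds: 73.2
+    #   ---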
+    for line in result.stdout.splitlines():
+        line = line.strip()
+        if line == "---":
+            if in_block:
+                break
+            in_block = True
+            continue
+        if in_block and ":" in line:
+            key, value = line.split(":", 1)
+            key = key.strip()
+            value = value.strip()
+            if key in metadata_keys:
+                metrics[key] = value
+            else:
+                try:
+                    metrics[key] = float(value)
+                except ValueError:
+                    metrics[key] = value
+
+    return metrics if metrics else None
+
+
+def determine_verdict(
+    original_metrics: dict,
+    new_metrics_list: list[dict],
+    primary_metric: str,
+    tolerance: float,
+    lower_is_better: bool = False,
+) -> dict:
+    """Determine regression verdict by comparing metrics.
+
+    Args:
+        original_metrics: Original experiment metrics.
+        new_metrics_list: List of metric dicts from re-run(s).
+        primary_metric: Name of primary metric.
+        tolerance: Relative tolerance threshold.
+        lower_is_better: Whether lower metric is better.
+
+    Returns:
+        Verdict dict with pass/warning/fail, per-metric details.
+    """
+    per_metric = {}
+    overall_verdict = "pass"
+
+    # Get all numeric metric keys
+    all_keys = set()
+    for nm in new_metrics_list:
+        all_keys.update(nm.keys())
+    all_keys &= set(original_metrics.keys())
+
+    # Filter to numeric metrics only
+    numeric_keys = sorted(
+        k for k in all_keys
+        if isinstance(original_metrics.get(k), (int, float))
+        and k not in {"model_type", "train_seconds"}
+    )
+
+    for key in numeric_keys:
+        orig_val = original_metrics[key]
+        new_vals = [nm[key] for nm in new_metrics_list if key in nm and isinstance(nm.get(key), (int, float))]
+
+        if not new_vals:
+            continue
+
+        new_mean = float(np.mean(new_vals))
+        delta = new_mean - orig_val
+        rel_diff = abs(delta) / abs(orig_val) if orig_val != 0 else abs(delta)
+
+        # Determine direction (did it get worse?)
+        if lower_is_better:
+            degraded = delta > 0  # Higher is worse
+        else:
+            degraded = delta < 0  # Lower is worse
+
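+        # Tolerance bands, worked example (tolerance=0.01, baseline=0.900,
+        # higher is better):
+        #   new_mean >= 0.891  -> rel_diff <= 1%       -> pass
+        #   new_mean >= 0.882  -> rel_diff <= 2% (2x)  -> warning
+        #   new_mean <  0.882  -> rel_diff >  2%       -> fail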
+        # Determine per-metric verdict (improvement always passes)
+        if not degraded or rel_diff <= tolerance:
+            metric_verdict = "pass"
+        elif rel_diff <= 2 * tolerance:
+            metric_verdict = "warning"
+        else:
+            metric_verdict = "fail"
+
+        entry = {
+            "original": round(orig_val, 6),
+            "new_mean": round(new_mean, 6),
+            "new_values": [round(v, 6) for v in new_vals],
+            "delta": round(delta, 6),
+            "relative_diff": round(rel_diff, 6),
+            "degraded": degraded,
+            "verdict": metric_verdict,
+        }
+
+        if len(new_vals) > 1:
+            entry["new_std"] = round(float(np.std(new_vals, ddof=1)), 6)
+
+        per_metric[key] = entry
+
+        # Update overall verdict
+        if metric_verdict == "fail":
+            overall_verdict = "fail"
+        elif metric_verdict == "warning" and overall_verdict == "pass":
+            overall_verdict = "warning"
+
+    return {
+        "verdict": overall_verdict,
+        "per_metric": per_metric,
+        "primary_metric": primary_metric,
+        "tolerance": tolerance,
+    }
+
+
+def regression_gate(
+    tolerance: float = DEFAULT_TOLERANCE,
+    against: str | None = None,
+    quick: bool = False,
+    n_runs: int = DEFAULT_RUNS,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+    timeout: int = 600,
+) -> dict:
+    """Run a complete regression check.
+
+    Args:
+        tolerance: Relative tolerance for metric degradation.
+        against: Specific experiment ID to check against (default: best).
+        quick: Quick mode — 1 run instead of full seed study.
+        n_runs: Number of runs (overridden by quick).
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        timeout: Per-run timeout in seconds.
+
+    Returns:
+        Complete regression check report.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    experiments = load_experiments(log_path)
+
+    if against:
+        # Find specific experiment
+        baseline = None
+        for exp in experiments:
+            if exp.get("experiment_id") == against:
+                baseline = exp
+                break
+        if not baseline:
+            return {"error": f"Experiment {against} not found in {log_path}"}
+    else:
+        baseline = find_best_experiment(experiments, primary_metric, lower_is_better)
+        if not baseline:
+            return {"error": f"No experiments found in {log_path}"}
+
+    baseline_metrics = baseline.get("metrics", {})
+    baseline_id = baseline.get("experiment_id", "unknown")
+    baseline_value = baseline_metrics.get(primary_metric)
+
+    if baseline_value is None:
+        return {"error": f"Experiment {baseline_id} has no {primary_metric} metric"}
+
+    # Determine number of runs
+    actual_runs = QUICK_RUNS if quick else n_runs
+
+    print(f"Regression check against {baseline_id}", file=sys.stderr)
+    print(f"Baseline {primary_metric}: {baseline_value:.4f}", file=sys.stderr)
+    print(f"Tolerance: {tolerance * 100:.1f}%", file=sys.stderr)
+    print(f"Runs: {actual_runs} ({'quick' if quick else 'full'})", file=sys.stderr)
+    print(file=sys.stderr)
+
+    # Capture current environment
+    current_env = capture_environment()
+    original_env = baseline.get("environment")
+    env_diffs = diff_environments(original_env, current_env)
+
+    # Run checks
+    seed = baseline.get("config", {}).get("hyperparams", {}).get("seed", 42)
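+    # Runs reuse the baseline's seed, offset per run (seed, seed+1, ...), so
+    # multi-run mode samples run-to-run variance around the same baseline.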
+    new_metrics_list = []
+    failed_runs = 0
+
+    for i in range(actual_runs):
+        run_seed = seed + i
+        print(f" Run {i + 1}/{actual_runs} (seed={run_seed})...", end=" ", flush=True, file=sys.stderr)
+        metrics = run_regression_check(run_seed, timeout=timeout)
+        if metrics and primary_metric in metrics:
+            new_metrics_list.append(metrics)
+            print(f"{primary_metric}={metrics[primary_metric]:.4f}", file=sys.stderr)
+        else:
+            failed_runs += 1
+            print("FAILED", file=sys.stderr)
+
+    if not new_metrics_list:
+        return {
+            "error": f"All {actual_runs} regression runs failed",
+            "baseline_id": baseline_id,
+        }
+
+    # Determine verdict
+    verdict_info = determine_verdict(
+        baseline_metrics, new_metrics_list, primary_metric, tolerance, lower_is_better,
+    )
+
+    report = {
+        "baseline_id": baseline_id,
+        "checked_at": datetime.now(timezone.utc).isoformat(),
+        "verdict": verdict_info["verdict"],
+        "primary_metric": primary_metric,
+        "tolerance": tolerance,
+        "mode": "quick" if quick else "full",
+        "n_runs": len(new_metrics_list),
+        "failed_runs": failed_runs,
+        "per_metric": verdict_info["per_metric"],
+        "environment_diffs": env_diffs,
+        "current_environment": current_env,
+    }
+
+    return report
+
+
+def save_regression_report(report: dict, output_dir: str = "experiments/regressions") -> Path:
+    """Save regression report to YAML."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    filepath = out_path / f"check-{date}.yaml"
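+    # One report file per UTC date; a later check the same day overwrites it.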
+
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+
+    return filepath
+
+
+def format_regression_report(report: dict) -> str:
+    """Format regression report as human-readable markdown."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    verdict = report["verdict"]
+    verdict_markers = {
+        "pass": "PASS — No regression detected",
+        "warning": "WARNING — Minor regression, investigate",
+        "fail": "FAIL — REGRESSION DETECTED",
+    }
+    marker = verdict_markers.get(verdict, verdict)
+
+    lines = [
+        f"# Regression Check: {report.get('baseline_id', '?')}",
+        "",
+        f"**{marker}**",
+        "",
+        f"*Checked {report.get('checked_at', 'N/A')[:19]}*",
+        f"*Mode: {report.get('mode', '?')}, Tolerance: {report.get('tolerance', 0) * 100:.1f}%*",
+        "",
+        "## Metric Comparison",
+        "",
+        "| Metric | Baseline | Current | Delta | Rel Diff | Verdict |",
+        "|--------|----------|---------|-------|----------|---------|",
+    ]
+
+    per_metric = report.get("per_metric", {})
+    for key, m in per_metric.items():
+        orig = m.get("original", "N/A")
+        new = m.get("new_mean", "N/A")
+        delta = m.get("delta", 0)
+        rel = m.get("relative_diff", 0)
+        mv = m.get("verdict", "?").upper()
+
+        orig_str = f"{orig:.4f}" if isinstance(orig, float) else str(orig)
+        new_str = f"{new:.4f}" if isinstance(new, float) else str(new)
+
+        lines.append(
+            f"| {key} | {orig_str} | {new_str} | {delta:+.4f} | {rel * 100:.2f}% | {mv} |"
+        )
+
+    # Environment diffs
+    env_diffs = report.get("environment_diffs", [])
+    critical_env = [d for d in env_diffs if d.get("severity") in ("critical", "warning")]
+    if critical_env:
+        lines.extend(["", "## Environment Changes", ""])
+        for d in critical_env:
+            lines.append(f"- **[{d.get('severity', 'info').upper()}]** {d.get('detail', 'N/A')}")
+        if verdict == "fail":
+            lines.append("")
+            lines.append("*Environment changes may explain the regression.*")
+
+    # Run details
+    n_runs = report.get("n_runs", 0)
+    failed = report.get("failed_runs", 0)
+    if n_runs > 1 or failed > 0:
+        lines.extend([
+            "",
+            "## Run Details",
+            "",
+            f"- **Successful runs:** {n_runs}",
+            f"- **Failed runs:** {failed}",
+        ])
+        for key, m in per_metric.items():
+            if "new_std" in m:
+                lines.append(f"- **{key} std:** {m['new_std']:.6f}")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Performance regression gate for ML experiments",
+    )
+    parser.add_argument(
+        "--tolerance", type=float, default=DEFAULT_TOLERANCE,
+        help=f"Relative tolerance for regression (default: {DEFAULT_TOLERANCE})",
+    )
+    parser.add_argument(
+        "--against",
+        help="Specific experiment ID to check against (default: best)",
+    )
+    parser.add_argument(
+        "--quick", action="store_true",
+        help="Quick mode: 1 run instead of full seed study",
+    )
+    parser.add_argument(
+        "--runs", type=int, default=DEFAULT_RUNS,
+        help=f"Number of regression runs (default: {DEFAULT_RUNS})",
+    )
+    parser.add_argument(
+        "--config", default="config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument(
+        "--log", default=DEFAULT_LOG_PATH,
+        help="Path to experiment log",
+    )
+    parser.add_argument(
+        "--timeout", type=int, default=600,
+        help="Per-run timeout in seconds (default: 600)",
+    )
+    parser.add_argument(
+        "--json", action="store_true",
+        help="Output raw JSON instead of formatted report",
+    )
+    args = parser.parse_args()
+
+    report = regression_gate(
+        tolerance=args.tolerance,
+        against=args.against,
+        quick=args.quick,
+        n_runs=args.runs,
+        config_path=args.config,
+        log_path=args.log,
+        timeout=args.timeout,
+    )
+
+    # Save report
+    if "error" not in report:
+        filepath = save_regression_report(report)
+        print(f"\nSaved to {filepath}", file=sys.stderr)
+
+    # Output
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_regression_report(report))
+
+    # Exit code based on verdict
+    if report.get("verdict") == "fail":
+        sys.exit(1)
+    elif report.get("verdict") == "warning":
+        sys.exit(2)
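+    # Exit codes: 0 = pass, 2 = warning, 1 = fail. A CI step can gate on this
+    # directly; to treat warnings as non-blocking (illustrative):
+    #   python scripts/regression_gate.py --quick || [ $? -eq 2 ]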
+
+
+if __name__ == "__main__":
+    main()
@@ -107,6 +107,12 @@ TEMPLATE_DIRS = {
         "experiment_queue.py",
         "smart_retry.py",
         "fork_experiment.py",
+        "experiment_diff.py",
+        "training_monitor.py",
+        "regression_gate.py",
+        "build_ensemble.py",
+        "pipeline_manager.py",
+        "warm_start.py",
     ],
     "tests": ["__init__.py", "conftest.py"],
 }
@@ -127,6 +133,12 @@ DIRECTORIES_TO_CREATE = [
     "paper/sections",
     "experiments/retries",
     "experiments/forks",
+    "experiments/diffs",
+    "experiments/monitors",
+    "experiments/regressions",
+    "experiments/ensembles",
+    "experiments/cache",
+    "experiments/warm_starts",
     "experiments/logs",
     "models/best",
     "models/archive",