claude-turing 2.2.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +8 -2
- package/commands/diff.md +48 -0
- package/commands/ensemble.md +54 -0
- package/commands/regress.md +53 -0
- package/commands/stitch.md +49 -0
- package/commands/turing.md +12 -0
- package/commands/warm.md +53 -0
- package/commands/watch.md +60 -0
- package/config/watch_alerts.yaml +36 -0
- package/package.json +1 -1
- package/src/install.js +3 -0
- package/src/verify.js +7 -0
- package/templates/scripts/__pycache__/build_ensemble.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_diff.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/pipeline_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/regression_gate.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/training_monitor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/warm_start.cpython-314.pyc +0 -0
- package/templates/scripts/build_ensemble.py +696 -0
- package/templates/scripts/experiment_diff.py +703 -0
- package/templates/scripts/generate_brief.py +79 -0
- package/templates/scripts/pipeline_manager.py +457 -0
- package/templates/scripts/regression_gate.py +536 -0
- package/templates/scripts/scaffold.py +12 -0
- package/templates/scripts/training_monitor.py +611 -0
- package/templates/scripts/warm_start.py +493 -0
package/templates/scripts/regression_gate.py
@@ -0,0 +1,536 @@
+#!/usr/bin/env python3
+"""Performance regression gate for the autoresearch pipeline.
+
+After any code or dependency change, re-runs the best experiment and
+verifies metrics haven't degraded. CI for your model — catches silent
+regressions from library upgrades, data pipeline changes, or accidental
+train.py edits.
+
+Verdicts:
+    pass — all metrics within tolerance
+    warning — some metrics degraded within 2x tolerance
+    fail — any metric degraded beyond tolerance
+
+Usage:
+    python scripts/regression_gate.py
+    python scripts/regression_gate.py --tolerance 0.01
+    python scripts/regression_gate.py --against exp-042
+    python scripts/regression_gate.py --quick
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
+DEFAULT_LOG_PATH = "experiments/log.jsonl"
+DEFAULT_TOLERANCE = 0.01  # 1% relative
+DEFAULT_RUNS = 3
+QUICK_RUNS = 1
+
+
+def find_best_experiment(
+    experiments: list[dict],
+    primary_metric: str,
+    lower_is_better: bool = False,
+) -> dict | None:
+    """Find the best experiment by primary metric."""
+    kept = [e for e in experiments if e.get("status") == "kept"]
+    if not kept:
+        # Fall back to all experiments with the metric
+        kept = [e for e in experiments if primary_metric in e.get("metrics", {})]
+
+    if not kept:
+        return None
+
+    def metric_val(exp):
+        return exp.get("metrics", {}).get(primary_metric, float("inf") if lower_is_better else float("-inf"))
+
+    if lower_is_better:
+        return min(kept, key=metric_val)
+    return max(kept, key=metric_val)
+
+
+def capture_environment() -> dict:
+    """Capture current environment for regression report."""
+    env = {"python_version": sys.version.split()[0]}
+
+    try:
+        result = subprocess.run(
+            ["pip", "freeze"], capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0:
+            packages = {}
+            for line in result.stdout.strip().splitlines():
+                if "==" in line:
+                    pkg, ver = line.split("==", 1)
+                    packages[pkg.lower()] = ver
+            env["packages"] = packages
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        env["packages"] = {}
+
+    # Git info
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "HEAD"], capture_output=True, text=True, timeout=10,
+        )
+        if result.returncode == 0:
+            env["git_commit"] = result.stdout.strip()
+
+        result = subprocess.run(
+            ["git", "diff", "--stat"], capture_output=True, text=True, timeout=10,
+        )
+        if result.returncode == 0:
+            env["git_dirty"] = bool(result.stdout.strip())
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass
+
+    return env
+
+
+def diff_environments(original: dict | None, current: dict) -> list[dict]:
+    """Compare environments and return list of differences."""
+    if not original:
+        return [{"field": "environment", "detail": "No original environment snapshot"}]
+
+    diffs = []
+    orig_pkgs = original.get("packages", {})
+    curr_pkgs = current.get("packages", {})
+
+    critical_packages = {
+        "numpy", "scipy", "scikit-learn", "sklearn", "pandas",
+        "torch", "tensorflow", "xgboost", "lightgbm", "catboost",
+    }
+
+    for pkg in sorted(set(orig_pkgs) | set(curr_pkgs)):
+        orig_ver = orig_pkgs.get(pkg)
+        curr_ver = curr_pkgs.get(pkg)
+        if orig_ver and curr_ver and orig_ver != curr_ver:
+            severity = "critical" if pkg in critical_packages else "info"
+            diffs.append({
+                "field": f"package:{pkg}",
+                "original": orig_ver,
+                "current": curr_ver,
+                "severity": severity,
+                "detail": f"{pkg}: {orig_ver} -> {curr_ver}",
+            })
+
+    orig_py = original.get("python_version")
+    curr_py = current.get("python_version")
+    if orig_py and curr_py and orig_py != curr_py:
+        diffs.append({
+            "field": "python_version",
+            "original": orig_py,
+            "current": curr_py,
+            "severity": "warning",
+            "detail": f"Python: {orig_py} -> {curr_py}",
+        })
+
+    return diffs
+
+
+def run_regression_check(
+    seed: int,
+    timeout: int = 600,
+) -> dict | None:
+    """Run train.py once and return parsed metrics."""
+    try:
+        result = subprocess.run(
+            ["python", "train.py", "--seed", str(seed)],
+            capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        return None
+
+    if result.returncode != 0:
+        return None
+
+    metrics = {}
+    in_block = False
+    metadata_keys = {"model_type", "train_seconds"}
+
+    for line in result.stdout.splitlines():
+        line = line.strip()
+        if line == "---":
+            if in_block:
+                break
+            in_block = True
+            continue
+        if in_block and ":" in line:
+            key, value = line.split(":", 1)
+            key = key.strip()
+            value = value.strip()
+            if key in metadata_keys:
+                metrics[key] = value
+            else:
+                try:
+                    metrics[key] = float(value)
+                except ValueError:
+                    metrics[key] = value
+
+    return metrics if metrics else None
+
+
+def determine_verdict(
+    original_metrics: dict,
+    new_metrics_list: list[dict],
+    primary_metric: str,
+    tolerance: float,
+    lower_is_better: bool = False,
+) -> dict:
+    """Determine regression verdict by comparing metrics.
+
+    Args:
+        original_metrics: Original experiment metrics.
+        new_metrics_list: List of metric dicts from re-run(s).
+        primary_metric: Name of primary metric.
+        tolerance: Relative tolerance threshold.
+        lower_is_better: Whether lower metric is better.
+
+    Returns:
+        Verdict dict with pass/warning/fail, per-metric details.
+    """
+    per_metric = {}
+    overall_verdict = "pass"
+
+    # Get all numeric metric keys
+    all_keys = set()
+    for nm in new_metrics_list:
+        all_keys.update(nm.keys())
+    all_keys &= set(original_metrics.keys())
+
+    # Filter to numeric metrics only
+    numeric_keys = sorted(
+        k for k in all_keys
+        if isinstance(original_metrics.get(k), (int, float))
+        and k not in {"model_type", "train_seconds"}
+    )
+
+    for key in numeric_keys:
+        orig_val = original_metrics[key]
+        new_vals = [nm[key] for nm in new_metrics_list if key in nm and isinstance(nm.get(key), (int, float))]
+
+        if not new_vals:
+            continue
+
+        new_mean = float(np.mean(new_vals))
+        delta = new_mean - orig_val
+        rel_diff = abs(delta) / abs(orig_val) if orig_val != 0 else abs(delta)
+
+        # Determine direction (did it get worse?)
+        if lower_is_better:
+            degraded = delta > 0  # Higher is worse
+        else:
+            degraded = delta < 0  # Lower is worse
+
+        # Determine per-metric verdict
+        if not degraded or rel_diff <= 0:
+            metric_verdict = "pass"
+        elif rel_diff <= tolerance:
+            metric_verdict = "pass"
+        elif rel_diff <= 2 * tolerance:
+            metric_verdict = "warning"
+        else:
+            metric_verdict = "fail"
+
+        entry = {
+            "original": round(orig_val, 6),
+            "new_mean": round(new_mean, 6),
+            "new_values": [round(v, 6) for v in new_vals],
+            "delta": round(delta, 6),
+            "relative_diff": round(rel_diff, 6),
+            "degraded": degraded,
+            "verdict": metric_verdict,
+        }
+
+        if len(new_vals) > 1:
+            entry["new_std"] = round(float(np.std(new_vals, ddof=1)), 6)
+
+        per_metric[key] = entry
+
+        # Update overall verdict
+        if metric_verdict == "fail" and overall_verdict != "fail":
+            overall_verdict = "fail"
+        elif metric_verdict == "warning" and overall_verdict == "pass":
+            overall_verdict = "warning"
+
+    return {
+        "verdict": overall_verdict,
+        "per_metric": per_metric,
+        "primary_metric": primary_metric,
+        "tolerance": tolerance,
+    }
+
+
+def regression_gate(
+    tolerance: float = DEFAULT_TOLERANCE,
+    against: str | None = None,
+    quick: bool = False,
+    n_runs: int = DEFAULT_RUNS,
+    config_path: str = "config.yaml",
+    log_path: str = DEFAULT_LOG_PATH,
+    timeout: int = 600,
+) -> dict:
+    """Run a complete regression check.
+
+    Args:
+        tolerance: Relative tolerance for metric degradation.
+        against: Specific experiment ID to check against (default: best).
+        quick: Quick mode — 1 run instead of full seed study.
+        n_runs: Number of runs (overridden by quick).
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        timeout: Per-run timeout in seconds.
+
+    Returns:
+        Complete regression check report.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+    lower_is_better = eval_cfg.get("lower_is_better", False)
+
+    experiments = load_experiments(log_path)
+
+    if against:
+        # Find specific experiment
+        baseline = None
+        for exp in experiments:
+            if exp.get("experiment_id") == against:
+                baseline = exp
+                break
+        if not baseline:
+            return {"error": f"Experiment {against} not found in {log_path}"}
+    else:
+        baseline = find_best_experiment(experiments, primary_metric, lower_is_better)
+        if not baseline:
+            return {"error": f"No experiments found in {log_path}"}
+
+    baseline_metrics = baseline.get("metrics", {})
+    baseline_id = baseline.get("experiment_id", "unknown")
+    baseline_value = baseline_metrics.get(primary_metric)
+
+    if baseline_value is None:
+        return {"error": f"Experiment {baseline_id} has no {primary_metric} metric"}
+
+    # Determine number of runs
+    actual_runs = QUICK_RUNS if quick else n_runs
+
+    print(f"Regression check against {baseline_id}", file=sys.stderr)
+    print(f"Baseline {primary_metric}: {baseline_value:.4f}", file=sys.stderr)
+    print(f"Tolerance: {tolerance * 100:.1f}%", file=sys.stderr)
+    print(f"Runs: {actual_runs} ({'quick' if quick else 'full'})", file=sys.stderr)
+    print(file=sys.stderr)
+
+    # Capture current environment
+    current_env = capture_environment()
+    original_env = baseline.get("environment")
+    env_diffs = diff_environments(original_env, current_env)
+
+    # Run checks
+    seed = baseline.get("config", {}).get("hyperparams", {}).get("seed", 42)
+    new_metrics_list = []
+    failed_runs = 0
+
+    for i in range(actual_runs):
+        run_seed = seed + i
+        print(f" Run {i + 1}/{actual_runs} (seed={run_seed})...", end=" ", flush=True, file=sys.stderr)
+        metrics = run_regression_check(run_seed, timeout=timeout)
+        if metrics and primary_metric in metrics:
+            new_metrics_list.append(metrics)
+            print(f"{primary_metric}={metrics[primary_metric]:.4f}", file=sys.stderr)
+        else:
+            failed_runs += 1
+            print("FAILED", file=sys.stderr)
+
+    if not new_metrics_list:
+        return {
+            "error": f"All {actual_runs} regression runs failed",
+            "baseline_id": baseline_id,
+        }
+
+    # Determine verdict
+    verdict_info = determine_verdict(
+        baseline_metrics, new_metrics_list, primary_metric, tolerance, lower_is_better,
+    )
+
+    report = {
+        "baseline_id": baseline_id,
+        "checked_at": datetime.now(timezone.utc).isoformat(),
+        "verdict": verdict_info["verdict"],
+        "primary_metric": primary_metric,
+        "tolerance": tolerance,
+        "mode": "quick" if quick else "full",
+        "n_runs": len(new_metrics_list),
+        "failed_runs": failed_runs,
+        "per_metric": verdict_info["per_metric"],
+        "environment_diffs": env_diffs,
+        "current_environment": current_env,
+    }
+
+    return report
+
+
+def save_regression_report(report: dict, output_dir: str = "experiments/regressions") -> Path:
+    """Save regression report to YAML."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    filepath = out_path / f"check-{date}.yaml"
+
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+
+    return filepath
+
+
+def format_regression_report(report: dict) -> str:
+    """Format regression report as human-readable markdown."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    verdict = report["verdict"]
+    verdict_markers = {
+        "pass": "PASS — No regression detected",
+        "warning": "WARNING — Minor regression, investigate",
+        "fail": "FAIL — REGRESSION DETECTED",
+    }
+    marker = verdict_markers.get(verdict, verdict)
+
+    lines = [
+        f"# Regression Check: {report.get('baseline_id', '?')}",
+        "",
+        f"**{marker}**",
+        "",
+        f"*Checked {report.get('checked_at', 'N/A')[:19]}*",
+        f"*Mode: {report.get('mode', '?')}, Tolerance: {report.get('tolerance', 0) * 100:.1f}%*",
+        "",
+        "## Metric Comparison",
+        "",
+        f"| Metric | Baseline | Current | Delta | Rel Diff | Verdict |",
+        f"|--------|----------|---------|-------|----------|---------|",
+    ]
+
+    per_metric = report.get("per_metric", {})
+    for key, m in per_metric.items():
+        orig = m.get("original", "N/A")
+        new = m.get("new_mean", "N/A")
+        delta = m.get("delta", 0)
+        rel = m.get("relative_diff", 0)
+        mv = m.get("verdict", "?").upper()
+
+        orig_str = f"{orig:.4f}" if isinstance(orig, float) else str(orig)
+        new_str = f"{new:.4f}" if isinstance(new, float) else str(new)
+
+        lines.append(
+            f"| {key} | {orig_str} | {new_str} | {delta:+.4f} | {rel * 100:.2f}% | {mv} |"
+        )
+
+    # Environment diffs
+    env_diffs = report.get("environment_diffs", [])
+    critical_env = [d for d in env_diffs if d.get("severity") in ("critical", "warning")]
+    if critical_env:
+        lines.extend(["", "## Environment Changes", ""])
+        for d in critical_env:
+            lines.append(f"- **[{d.get('severity', 'info').upper()}]** {d.get('detail', 'N/A')}")
+        if verdict == "fail":
+            lines.append("")
+            lines.append("*Environment changes may explain the regression.*")
+
+    # Run details
+    n_runs = report.get("n_runs", 0)
+    failed = report.get("failed_runs", 0)
+    if n_runs > 1 or failed > 0:
+        lines.extend([
+            "",
+            "## Run Details",
+            "",
+            f"- **Successful runs:** {n_runs}",
+            f"- **Failed runs:** {failed}",
+        ])
+        for key, m in per_metric.items():
+            if "new_std" in m:
+                lines.append(f"- **{key} std:** {m['new_std']:.6f}")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Performance regression gate for ML experiments",
+    )
+    parser.add_argument(
+        "--tolerance", type=float, default=DEFAULT_TOLERANCE,
+        help=f"Relative tolerance for regression (default: {DEFAULT_TOLERANCE})",
+    )
+    parser.add_argument(
+        "--against",
+        help="Specific experiment ID to check against (default: best)",
+    )
+    parser.add_argument(
+        "--quick", action="store_true",
+        help="Quick mode: 1 run instead of full seed study",
+    )
+    parser.add_argument(
+        "--runs", type=int, default=DEFAULT_RUNS,
+        help=f"Number of regression runs (default: {DEFAULT_RUNS})",
+    )
+    parser.add_argument(
+        "--config", default="config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument(
+        "--log", default=DEFAULT_LOG_PATH,
+        help="Path to experiment log",
+    )
+    parser.add_argument(
+        "--timeout", type=int, default=600,
+        help="Per-run timeout in seconds (default: 600)",
+    )
+    parser.add_argument(
+        "--json", action="store_true",
+        help="Output raw JSON instead of formatted report",
+    )
+    args = parser.parse_args()
+
+    report = regression_gate(
+        tolerance=args.tolerance,
+        against=args.against,
+        quick=args.quick,
+        n_runs=args.runs,
+        config_path=args.config,
+        log_path=args.log,
+        timeout=args.timeout,
+    )
+
+    # Save report
+    if "error" not in report:
+        filepath = save_regression_report(report)
+        print(f"\nSaved to {filepath}", file=sys.stderr)
+
+    # Output
+    if args.json:
+        print(json.dumps(report, indent=2, default=str))
+    else:
+        print(format_regression_report(report))
+
+    # Exit code based on verdict
+    if report.get("verdict") == "fail":
+        sys.exit(1)
+    elif report.get("verdict") == "warning":
+        sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
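As a reference for how this template is meant to be consumed, here is a minimal sketch of driving the gate from a CI step instead of the CLI. It assumes the templates have been scaffolded into a project so that scripts/regression_gate.py, config.yaml, and experiments/log.jsonl exist and scripts/ is importable; the ci_gate.py wrapper name and the "warnings are advisory" policy are illustrative, not part of the package.

#!/usr/bin/env python3
# ci_gate.py: hypothetical CI wrapper around the scaffolded regression gate.
# Assumes the claude-turing templates were copied into the project, so
# scripts/regression_gate.py, config.yaml and experiments/log.jsonl exist.
import sys

from scripts.regression_gate import (
    format_regression_report,
    regression_gate,
    save_regression_report,
)

# Single quick run with a 2% relative tolerance.
report = regression_gate(tolerance=0.02, quick=True)
if "error" in report:
    print(format_regression_report(report))
    sys.exit(1)

save_regression_report(report)
print(format_regression_report(report))

# Fail the CI job only on a hard regression; treat "warning" as advisory.
sys.exit(1 if report["verdict"] == "fail" else 0)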
package/templates/scripts/scaffold.py
@@ -107,6 +107,12 @@ TEMPLATE_DIRS = {
         "experiment_queue.py",
         "smart_retry.py",
         "fork_experiment.py",
+        "experiment_diff.py",
+        "training_monitor.py",
+        "regression_gate.py",
+        "build_ensemble.py",
+        "pipeline_manager.py",
+        "warm_start.py",
     ],
     "tests": ["__init__.py", "conftest.py"],
 }
@@ -127,6 +133,12 @@ DIRECTORIES_TO_CREATE = [
     "paper/sections",
     "experiments/retries",
     "experiments/forks",
+    "experiments/diffs",
+    "experiments/monitors",
+    "experiments/regressions",
+    "experiments/ensembles",
+    "experiments/cache",
+    "experiments/warm_starts",
     "experiments/logs",
     "models/best",
     "models/archive",