claude-turing 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/diff.md +48 -0
- package/commands/regress.md +53 -0
- package/commands/turing.md +6 -0
- package/commands/watch.md +60 -0
- package/config/watch_alerts.yaml +36 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +4 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_diff.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/regression_gate.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/training_monitor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/experiment_diff.py +703 -0
- package/templates/scripts/generate_brief.py +44 -0
- package/templates/scripts/regression_gate.py +536 -0
- package/templates/scripts/scaffold.py +6 -0
- package/templates/scripts/training_monitor.py +611 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
|
@@ -292,6 +292,23 @@ def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dic
|
|
|
292
292
|
return reports
|
|
293
293
|
|
|
294
294
|
|
|
295
|
+
def load_regression_checks(regress_dir: str = "experiments/regressions") -> list[dict]:
    """Collect every regression check report found under *regress_dir*.

    Reads each ``check-*.yaml`` file in sorted (chronological) order and
    keeps only documents that parse to a non-empty mapping. Unreadable or
    malformed files are skipped silently — a corrupt report must not
    block brief generation.
    """
    base = Path(regress_dir)
    if not base.exists():
        return []

    collected: list[dict] = []
    for report_file in sorted(base.glob("check-*.yaml")):
        try:
            with open(report_file) as handle:
                loaded = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError):
            continue
        if loaded and isinstance(loaded, dict):
            collected.append(loaded)
    return collected
|
|
310
|
+
|
|
311
|
+
|
|
295
312
|
def format_brief(
|
|
296
313
|
campaign: dict,
|
|
297
314
|
best: dict | None,
|
|
@@ -309,6 +326,7 @@ def format_brief(
|
|
|
309
326
|
diagnoses: list[dict] | None = None,
|
|
310
327
|
profiles: list[dict] | None = None,
|
|
311
328
|
queue_summary: dict | None = None,
|
|
329
|
+
regression_checks: list[dict] | None = None,
|
|
312
330
|
) -> str:
|
|
313
331
|
"""Format the research briefing as markdown."""
|
|
314
332
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -528,6 +546,30 @@ def format_brief(
|
|
|
528
546
|
if auto_hyps:
|
|
529
547
|
lines.append(f"\n*{auto_hyps} auto-generated hypotheses from failure analysis.*")
|
|
530
548
|
|
|
549
|
+
# Regression check history (stability)
|
|
550
|
+
if regression_checks:
|
|
551
|
+
lines.extend(["", "## Stability", ""])
|
|
552
|
+
verdict_markers = {
|
|
553
|
+
"pass": "PASS",
|
|
554
|
+
"warning": "WARNING",
|
|
555
|
+
"fail": "FAIL",
|
|
556
|
+
}
|
|
557
|
+
for check in regression_checks:
|
|
558
|
+
baseline = check.get("baseline_id", "?")
|
|
559
|
+
verdict = check.get("verdict", "unknown")
|
|
560
|
+
marker = verdict_markers.get(verdict, verdict)
|
|
561
|
+
date = check.get("checked_at", "")[:10]
|
|
562
|
+
mode = check.get("mode", "?")
|
|
563
|
+
lines.append(f"- **{date}** [{marker}] against {baseline} ({mode} mode)")
|
|
564
|
+
if verdict == "fail":
|
|
565
|
+
per_metric = check.get("per_metric", {})
|
|
566
|
+
failed = [k for k, v in per_metric.items() if v.get("verdict") == "fail"]
|
|
567
|
+
if failed:
|
|
568
|
+
lines.append(f" - Failed metrics: {', '.join(failed)}")
|
|
569
|
+
passed = sum(1 for c in regression_checks if c.get("verdict") == "pass")
|
|
570
|
+
total = len(regression_checks)
|
|
571
|
+
lines.append(f"\n*{passed}/{total} regression checks passed.*")
|
|
572
|
+
|
|
531
573
|
lines.extend([
|
|
532
574
|
"",
|
|
533
575
|
"## Recommendations",
|
|
@@ -593,6 +635,7 @@ def generate_brief(
|
|
|
593
635
|
diagnoses = load_diagnoses()
|
|
594
636
|
profiles = load_profiles()
|
|
595
637
|
queue_summary = load_queue_summary()
|
|
638
|
+
regression_checks = load_regression_checks()
|
|
596
639
|
|
|
597
640
|
return format_brief(
|
|
598
641
|
campaign, best, trajectory, model_types, hypotheses,
|
|
@@ -604,6 +647,7 @@ def generate_brief(
|
|
|
604
647
|
diagnoses=diagnoses if diagnoses else None,
|
|
605
648
|
profiles=profiles if profiles else None,
|
|
606
649
|
queue_summary=queue_summary,
|
|
650
|
+
regression_checks=regression_checks if regression_checks else None,
|
|
607
651
|
)
|
|
608
652
|
|
|
609
653
|
|
|
@@ -0,0 +1,536 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Performance regression gate for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
After any code or dependency change, re-runs the best experiment and
|
|
5
|
+
verifies metrics haven't degraded. CI for your model — catches silent
|
|
6
|
+
regressions from library upgrades, data pipeline changes, or accidental
|
|
7
|
+
train.py edits.
|
|
8
|
+
|
|
9
|
+
Verdicts:
|
|
10
|
+
pass — all metrics within tolerance
|
|
11
|
+
warning — some metrics degraded within 2x tolerance
|
|
12
|
+
fail — any metric degraded beyond tolerance
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python scripts/regression_gate.py
|
|
16
|
+
python scripts/regression_gate.py --tolerance 0.01
|
|
17
|
+
python scripts/regression_gate.py --against exp-042
|
|
18
|
+
python scripts/regression_gate.py --quick
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import json
|
|
25
|
+
import subprocess
|
|
26
|
+
import sys
|
|
27
|
+
from datetime import datetime, timezone
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
import numpy as np
|
|
31
|
+
import yaml
|
|
32
|
+
|
|
33
|
+
from scripts.turing_io import load_config, load_experiments
|
|
34
|
+
|
|
35
|
+
DEFAULT_LOG_PATH = "experiments/log.jsonl"
|
|
36
|
+
DEFAULT_TOLERANCE = 0.01 # 1% relative
|
|
37
|
+
DEFAULT_RUNS = 3
|
|
38
|
+
QUICK_RUNS = 1
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def find_best_experiment(
    experiments: list[dict],
    primary_metric: str,
    lower_is_better: bool = False,
) -> dict | None:
    """Pick the experiment with the best *primary_metric* value.

    Prefers experiments whose status is ``"kept"``; when none exist,
    falls back to any experiment that reports the metric at all.
    Returns ``None`` when there is no candidate.
    """
    candidates = [exp for exp in experiments if exp.get("status") == "kept"]
    if not candidates:
        # Fall back to any experiment that actually has the metric.
        candidates = [exp for exp in experiments if primary_metric in exp.get("metrics", {})]

    if not candidates:
        return None

    # Candidates missing the metric sort to the losing end.
    worst = float("inf") if lower_is_better else float("-inf")

    def score(exp: dict) -> float:
        return exp.get("metrics", {}).get(primary_metric, worst)

    chooser = min if lower_is_better else max
    return chooser(candidates, key=score)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def capture_environment() -> dict:
    """Capture the current environment for the regression report.

    Returns a dict with the interpreter version, installed package
    versions (via ``pip freeze``), and — when available — the current
    git commit and whether the working tree is dirty. All subprocess
    calls are best-effort: failures degrade gracefully rather than
    raising.
    """
    env: dict = {"python_version": sys.version.split()[0]}

    # Use the running interpreter's pip (`python -m pip`) so we snapshot
    # the environment this process actually executes in, not whatever
    # `pip` happens to be first on PATH.
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pip", "freeze"],
            capture_output=True, text=True, timeout=30,
        )
        packages: dict = {}
        if result.returncode == 0:
            for line in result.stdout.strip().splitlines():
                if "==" in line:
                    pkg, ver = line.split("==", 1)
                    packages[pkg.lower()] = ver
        # Always present so downstream .get("packages", {}) callers see a
        # consistent shape even when pip fails.
        env["packages"] = packages
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers FileNotFoundError, PermissionError, etc.
        env["packages"] = {}

    # Git info — optional; the project may not be a git checkout.
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"], capture_output=True, text=True, timeout=10,
        )
        if result.returncode == 0:
            env["git_commit"] = result.stdout.strip()

            result = subprocess.run(
                ["git", "diff", "--stat"], capture_output=True, text=True, timeout=10,
            )
            if result.returncode == 0:
                # Any diff output means uncommitted changes.
                env["git_dirty"] = bool(result.stdout.strip())
    except (subprocess.TimeoutExpired, OSError):
        pass

    return env
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def diff_environments(original: dict | None, current: dict) -> list[dict]:
    """Compare two environment snapshots and list their differences.

    Reports package version changes, packages added or removed between
    snapshots, and Python interpreter changes. Changes to ML-critical
    packages get ``severity: critical``, other package changes ``info``,
    and a Python version change ``warning``.

    Args:
        original: Snapshot stored with the baseline experiment (may be None).
        current: Snapshot of the environment running the check.

    Returns:
        List of diff dicts with field/severity/detail keys.
    """
    if not original:
        return [{"field": "environment", "detail": "No original environment snapshot"}]

    diffs: list[dict] = []
    orig_pkgs = original.get("packages", {})
    curr_pkgs = current.get("packages", {})

    # Packages whose changes are most likely to move model metrics.
    critical_packages = {
        "numpy", "scipy", "scikit-learn", "sklearn", "pandas",
        "torch", "tensorflow", "xgboost", "lightgbm", "catboost",
    }

    for pkg in sorted(set(orig_pkgs) | set(curr_pkgs)):
        orig_ver = orig_pkgs.get(pkg)
        curr_ver = curr_pkgs.get(pkg)
        severity = "critical" if pkg in critical_packages else "info"
        if orig_ver and curr_ver and orig_ver != curr_ver:
            diffs.append({
                "field": f"package:{pkg}",
                "original": orig_ver,
                "current": curr_ver,
                "severity": severity,
                "detail": f"{pkg}: {orig_ver} -> {curr_ver}",
            })
        elif orig_ver and not curr_ver:
            # Previously installed, now gone — a removed critical
            # dependency is a classic cause of silent regressions.
            diffs.append({
                "field": f"package:{pkg}",
                "original": orig_ver,
                "current": None,
                "severity": severity,
                "detail": f"{pkg}: removed (was {orig_ver})",
            })
        elif curr_ver and not orig_ver:
            diffs.append({
                "field": f"package:{pkg}",
                "original": None,
                "current": curr_ver,
                "severity": severity,
                "detail": f"{pkg}: added ({curr_ver})",
            })

    orig_py = original.get("python_version")
    curr_py = current.get("python_version")
    if orig_py and curr_py and orig_py != curr_py:
        diffs.append({
            "field": "python_version",
            "original": orig_py,
            "current": curr_py,
            "severity": "warning",
            "detail": f"Python: {orig_py} -> {curr_py}",
        })

    return diffs
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def run_regression_check(
    seed: int,
    timeout: int = 600,
) -> dict | None:
    """Run train.py once with the given seed and return parsed metrics.

    Launches train.py with the running interpreter (``sys.executable``)
    so the re-run sees the same environment this gate executes in.
    Metrics are parsed from the first ``---``-delimited block on stdout:
    ``key: value`` lines, with values coerced to float where possible.

    Returns:
        Parsed metrics dict, or None when the run fails, times out,
        cannot be launched, or emits no metrics block.
    """
    try:
        result = subprocess.run(
            [sys.executable, "train.py", "--seed", str(seed)],
            capture_output=True, text=True, timeout=timeout,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing/broken interpreter launch; treat it
        # like any other failed run instead of crashing the gate.
        return None

    if result.returncode != 0:
        return None

    metrics: dict = {}
    in_block = False
    # Descriptive metadata keys are kept as strings, never floats.
    metadata_keys = {"model_type", "train_seconds"}

    for line in result.stdout.splitlines():
        line = line.strip()
        if line == "---":
            if in_block:
                break  # end of the first metrics block
            in_block = True
            continue
        if in_block and ":" in line:
            key, value = line.split(":", 1)
            key = key.strip()
            value = value.strip()
            if key in metadata_keys:
                metrics[key] = value
            else:
                try:
                    metrics[key] = float(value)
                except ValueError:
                    metrics[key] = value

    return metrics if metrics else None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def determine_verdict(
    original_metrics: dict,
    new_metrics_list: list[dict],
    primary_metric: str,
    tolerance: float,
    lower_is_better: bool = False,
) -> dict:
    """Determine regression verdict by comparing metrics.

    Every numeric metric shared by the baseline and the re-runs is
    compared. Improvements always pass; degradations within tolerance
    pass, within 2x tolerance warn, and beyond that fail. The overall
    verdict is the worst per-metric verdict.

    Args:
        original_metrics: Original experiment metrics.
        new_metrics_list: List of metric dicts from re-run(s).
        primary_metric: Name of primary metric.
        tolerance: Relative tolerance threshold.
        lower_is_better: Whether lower metric is better.

    Returns:
        Verdict dict with pass/warning/fail, per-metric details.
    """
    per_metric = {}
    overall_verdict = "pass"

    # Only compare metrics present in both the baseline and the re-runs.
    all_keys = set()
    for nm in new_metrics_list:
        all_keys.update(nm.keys())
    all_keys &= set(original_metrics.keys())

    # Filter to numeric metrics; metadata strings are excluded.
    numeric_keys = sorted(
        k for k in all_keys
        if isinstance(original_metrics.get(k), (int, float))
        and k not in {"model_type", "train_seconds"}
    )

    for key in numeric_keys:
        orig_val = original_metrics[key]
        new_vals = [nm[key] for nm in new_metrics_list if key in nm and isinstance(nm.get(key), (int, float))]

        if not new_vals:
            continue

        new_mean = float(np.mean(new_vals))
        delta = new_mean - orig_val
        # Relative diff; fall back to absolute delta for a zero baseline.
        rel_diff = abs(delta) / abs(orig_val) if orig_val != 0 else abs(delta)

        # Did the metric move in the "worse" direction?
        degraded = (delta > 0) if lower_is_better else (delta < 0)

        # Verdict ladder. (The original also tested `rel_diff <= 0`,
        # which abs() makes unreachable below zero — folded into the
        # tolerance check with identical behavior.)
        if not degraded or rel_diff <= tolerance:
            metric_verdict = "pass"
        elif rel_diff <= 2 * tolerance:
            metric_verdict = "warning"
        else:
            metric_verdict = "fail"

        entry = {
            "original": round(orig_val, 6),
            "new_mean": round(new_mean, 6),
            "new_values": [round(v, 6) for v in new_vals],
            "delta": round(delta, 6),
            "relative_diff": round(rel_diff, 6),
            "degraded": degraded,
            "verdict": metric_verdict,
        }

        # Sample std is only meaningful with more than one run.
        if len(new_vals) > 1:
            entry["new_std"] = round(float(np.std(new_vals, ddof=1)), 6)

        per_metric[key] = entry

        # Overall verdict is the worst seen: fail > warning > pass.
        if metric_verdict == "fail":
            overall_verdict = "fail"
        elif metric_verdict == "warning" and overall_verdict == "pass":
            overall_verdict = "warning"

    return {
        "verdict": overall_verdict,
        "per_metric": per_metric,
        "primary_metric": primary_metric,
        "tolerance": tolerance,
    }
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def regression_gate(
    tolerance: float = DEFAULT_TOLERANCE,
    against: str | None = None,
    quick: bool = False,
    n_runs: int = DEFAULT_RUNS,
    config_path: str = "config.yaml",
    log_path: str = DEFAULT_LOG_PATH,
    timeout: int = 600,
) -> dict:
    """Run a complete regression check.

    Loads the experiment log, picks a baseline (``against`` if given,
    otherwise the best experiment by the configured primary metric),
    re-runs train.py with consecutive seeds, and compares the new
    metrics to the baseline's.

    Args:
        tolerance: Relative tolerance for metric degradation.
        against: Specific experiment ID to check against (default: best).
        quick: Quick mode — 1 run instead of full seed study.
        n_runs: Number of runs (overridden by quick).
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        timeout: Per-run timeout in seconds.

    Returns:
        Complete regression check report, or a dict with an "error" key
        when no usable baseline exists or every re-run fails.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)

    if against:
        # Find the specific experiment the caller asked to compare against.
        baseline = None
        for exp in experiments:
            if exp.get("experiment_id") == against:
                baseline = exp
                break
        if not baseline:
            return {"error": f"Experiment {against} not found in {log_path}"}
    else:
        baseline = find_best_experiment(experiments, primary_metric, lower_is_better)
        if not baseline:
            return {"error": f"No experiments found in {log_path}"}

    baseline_metrics = baseline.get("metrics", {})
    baseline_id = baseline.get("experiment_id", "unknown")
    baseline_value = baseline_metrics.get(primary_metric)

    # A baseline without the primary metric cannot be compared against.
    if baseline_value is None:
        return {"error": f"Experiment {baseline_id} has no {primary_metric} metric"}

    # Determine number of runs (quick mode trades confidence for speed).
    actual_runs = QUICK_RUNS if quick else n_runs

    # Progress goes to stderr so stdout stays clean for report/JSON output.
    print(f"Regression check against {baseline_id}", file=sys.stderr)
    print(f"Baseline {primary_metric}: {baseline_value:.4f}", file=sys.stderr)
    print(f"Tolerance: {tolerance * 100:.1f}%", file=sys.stderr)
    print(f"Runs: {actual_runs} ({'quick' if quick else 'full'})", file=sys.stderr)
    print(file=sys.stderr)

    # Capture current environment and diff it against the baseline's
    # stored snapshot (if any) to help explain a regression later.
    current_env = capture_environment()
    original_env = baseline.get("environment")
    env_diffs = diff_environments(original_env, current_env)

    # Re-run with consecutive seeds starting from the baseline's seed
    # (falls back to 42 when the baseline config stored none).
    seed = baseline.get("config", {}).get("hyperparams", {}).get("seed", 42)
    new_metrics_list = []
    failed_runs = 0

    for i in range(actual_runs):
        run_seed = seed + i
        print(f"  Run {i + 1}/{actual_runs} (seed={run_seed})...", end=" ", flush=True, file=sys.stderr)
        metrics = run_regression_check(run_seed, timeout=timeout)
        # A run only counts as successful if it produced the primary metric.
        if metrics and primary_metric in metrics:
            new_metrics_list.append(metrics)
            print(f"{primary_metric}={metrics[primary_metric]:.4f}", file=sys.stderr)
        else:
            failed_runs += 1
            print("FAILED", file=sys.stderr)

    if not new_metrics_list:
        return {
            "error": f"All {actual_runs} regression runs failed",
            "baseline_id": baseline_id,
        }

    # Determine verdict by comparing baseline metrics to the re-runs.
    verdict_info = determine_verdict(
        baseline_metrics, new_metrics_list, primary_metric, tolerance, lower_is_better,
    )

    report = {
        "baseline_id": baseline_id,
        "checked_at": datetime.now(timezone.utc).isoformat(),
        "verdict": verdict_info["verdict"],
        "primary_metric": primary_metric,
        "tolerance": tolerance,
        "mode": "quick" if quick else "full",
        "n_runs": len(new_metrics_list),  # successful runs only
        "failed_runs": failed_runs,
        "per_metric": verdict_info["per_metric"],
        "environment_diffs": env_diffs,
        "current_environment": current_env,
    }

    return report
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def save_regression_report(report: dict, output_dir: str = "experiments/regressions") -> Path:
    """Save a regression report as YAML under *output_dir*.

    Files are named ``check-YYYY-MM-DD.yaml``. When a report for the
    same UTC date already exists, a numeric suffix is appended
    (``check-DATE-2.yaml``, ``check-DATE-3.yaml``, ...) so earlier
    checks from the same day are never silently overwritten. All names
    still match the ``check-*.yaml`` glob used by report consumers.

    Returns:
        Path of the written file.
    """
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    filepath = out_path / f"check-{date}.yaml"

    # Avoid clobbering a previous check from the same day.
    suffix = 2
    while filepath.exists():
        filepath = out_path / f"check-{date}-{suffix}.yaml"
        suffix += 1

    with open(filepath, "w") as f:
        yaml.dump(report, f, default_flow_style=False, sort_keys=False)

    return filepath
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def format_regression_report(report: dict) -> str:
    """Render a regression check report as human-readable markdown."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    verdict = report["verdict"]
    headline = {
        "pass": "PASS — No regression detected",
        "warning": "WARNING — Minor regression, investigate",
        "fail": "FAIL — REGRESSION DETECTED",
    }.get(verdict, verdict)

    out = [
        f"# Regression Check: {report.get('baseline_id', '?')}",
        "",
        f"**{headline}**",
        "",
        f"*Checked {report.get('checked_at', 'N/A')[:19]}*",
        f"*Mode: {report.get('mode', '?')}, Tolerance: {report.get('tolerance', 0) * 100:.1f}%*",
        "",
        "## Metric Comparison",
        "",
        "| Metric | Baseline | Current | Delta | Rel Diff | Verdict |",
        "|--------|----------|---------|-------|----------|---------|",
    ]

    def _cell(value) -> str:
        # Floats get fixed 4-decimal formatting; everything else is str().
        return f"{value:.4f}" if isinstance(value, float) else str(value)

    per_metric = report.get("per_metric", {})
    for metric_name, info in per_metric.items():
        delta = info.get("delta", 0)
        rel = info.get("relative_diff", 0)
        verdict_cell = info.get("verdict", "?").upper()
        baseline_cell = _cell(info.get("original", "N/A"))
        current_cell = _cell(info.get("new_mean", "N/A"))
        out.append(
            f"| {metric_name} | {baseline_cell} | {current_cell} | {delta:+.4f} | {rel * 100:.2f}% | {verdict_cell} |"
        )

    # Only surface environment changes that could plausibly matter.
    notable = [
        d for d in report.get("environment_diffs", [])
        if d.get("severity") in ("critical", "warning")
    ]
    if notable:
        out.extend(["", "## Environment Changes", ""])
        for d in notable:
            out.append(f"- **[{d.get('severity', 'info').upper()}]** {d.get('detail', 'N/A')}")
        if verdict == "fail":
            out.append("")
            out.append("*Environment changes may explain the regression.*")

    # Run details are only interesting for multi-run or flaky checks.
    successful = report.get("n_runs", 0)
    failed = report.get("failed_runs", 0)
    if successful > 1 or failed > 0:
        out.extend([
            "",
            "## Run Details",
            "",
            f"- **Successful runs:** {successful}",
            f"- **Failed runs:** {failed}",
        ])
        for metric_name, info in per_metric.items():
            if "new_std" in info:
                out.append(f"- **{metric_name} std:** {info['new_std']:.6f}")

    return "\n".join(out)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def main() -> None:
    """CLI entry point.

    Parses arguments, runs the regression gate, saves the report (unless
    the gate returned an error dict), prints the report, and exits with
    a verdict-based status code for CI: 1 = fail, 2 = warning, 0
    otherwise (including error reports, which are printed but not saved).
    """
    parser = argparse.ArgumentParser(
        description="Performance regression gate for ML experiments",
    )
    parser.add_argument(
        "--tolerance", type=float, default=DEFAULT_TOLERANCE,
        help=f"Relative tolerance for regression (default: {DEFAULT_TOLERANCE})",
    )
    parser.add_argument(
        "--against",
        help="Specific experiment ID to check against (default: best)",
    )
    parser.add_argument(
        "--quick", action="store_true",
        help="Quick mode: 1 run instead of full seed study",
    )
    parser.add_argument(
        "--runs", type=int, default=DEFAULT_RUNS,
        help=f"Number of regression runs (default: {DEFAULT_RUNS})",
    )
    parser.add_argument(
        "--config", default="config.yaml",
        help="Path to config.yaml",
    )
    parser.add_argument(
        "--log", default=DEFAULT_LOG_PATH,
        help="Path to experiment log",
    )
    parser.add_argument(
        "--timeout", type=int, default=600,
        help="Per-run timeout in seconds (default: 600)",
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Output raw JSON instead of formatted report",
    )
    args = parser.parse_args()

    report = regression_gate(
        tolerance=args.tolerance,
        against=args.against,
        quick=args.quick,
        n_runs=args.runs,
        config_path=args.config,
        log_path=args.log,
        timeout=args.timeout,
    )

    # Save report (skipped when the gate could not run at all).
    if "error" not in report:
        filepath = save_regression_report(report)
        print(f"\nSaved to {filepath}", file=sys.stderr)

    # Output: raw JSON for tooling, otherwise the markdown report.
    # (Progress/save messages went to stderr, so stdout is clean.)
    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_regression_report(report))

    # Exit code based on verdict so CI can gate on the result.
    if report.get("verdict") == "fail":
        sys.exit(1)
    elif report.get("verdict") == "warning":
        sys.exit(2)
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
if __name__ == "__main__":
|
|
536
|
+
main()
|
|
@@ -107,6 +107,9 @@ TEMPLATE_DIRS = {
|
|
|
107
107
|
"experiment_queue.py",
|
|
108
108
|
"smart_retry.py",
|
|
109
109
|
"fork_experiment.py",
|
|
110
|
+
"experiment_diff.py",
|
|
111
|
+
"training_monitor.py",
|
|
112
|
+
"regression_gate.py",
|
|
110
113
|
],
|
|
111
114
|
"tests": ["__init__.py", "conftest.py"],
|
|
112
115
|
}
|
|
@@ -127,6 +130,9 @@ DIRECTORIES_TO_CREATE = [
|
|
|
127
130
|
"paper/sections",
|
|
128
131
|
"experiments/retries",
|
|
129
132
|
"experiments/forks",
|
|
133
|
+
"experiments/diffs",
|
|
134
|
+
"experiments/monitors",
|
|
135
|
+
"experiments/regressions",
|
|
130
136
|
"experiments/logs",
|
|
131
137
|
"models/best",
|
|
132
138
|
"models/archive",
|