claude-turing 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +33 -2
- package/commands/ablate.md +47 -0
- package/commands/diagnose.md +52 -0
- package/commands/frontier.md +45 -0
- package/commands/reproduce.md +48 -0
- package/commands/seed.md +47 -0
- package/commands/turing.md +10 -0
- package/package.json +1 -1
- package/src/install.js +2 -1
- package/src/verify.js +5 -0
- package/templates/config.yaml +10 -0
- package/templates/program.md +5 -0
- package/templates/scripts/__pycache__/ablation_study.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/diagnose_errors.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/pareto_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/ablation_study.py +487 -0
- package/templates/scripts/diagnose_errors.py +601 -0
- package/templates/scripts/generate_brief.py +117 -0
- package/templates/scripts/generate_model_card.py +25 -0
- package/templates/scripts/leaderboard.py +10 -0
- package/templates/scripts/pareto_frontier.py +470 -0
- package/templates/scripts/reproduce_experiment.py +548 -0
- package/templates/scripts/scaffold.py +11 -0
- package/templates/scripts/seed_runner.py +414 -0
- package/templates/scripts/show_metrics.py +17 -0
- package/templates/scripts/turing_io.py +36 -0
- package/templates/scripts/update_state.py +13 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Reproducibility verification for ML experiments.
|
|
3
|
+
|
|
4
|
+
Given an experiment ID, re-runs it from the logged config and verifies
|
|
5
|
+
metrics fall within tolerance of the original. Catches non-determinism,
|
|
6
|
+
environment drift, and silent data changes.
|
|
7
|
+
|
|
8
|
+
Verdicts:
|
|
9
|
+
reproducible — metrics match within float tolerance (1e-6)
|
|
10
|
+
approximately_reproducible — metrics within user-specified tolerance
|
|
11
|
+
not_reproducible — metrics outside tolerance/CI
|
|
12
|
+
environment_changed — different library versions detected
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python scripts/reproduce_experiment.py exp-042
|
|
16
|
+
python scripts/reproduce_experiment.py exp-042 --tolerance 0.02
|
|
17
|
+
python scripts/reproduce_experiment.py exp-042 --strict
|
|
18
|
+
python scripts/reproduce_experiment.py exp-042 --runs 5
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import json
|
|
25
|
+
import subprocess
|
|
26
|
+
import sys
|
|
27
|
+
from datetime import datetime, timezone
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
import numpy as np
|
|
31
|
+
import yaml
|
|
32
|
+
|
|
33
|
+
from scripts.turing_io import load_config, load_experiments
|
|
34
|
+
|
|
35
|
+
# Absolute difference below which two metric values count as an exact match.
FLOAT_TOLERANCE = 1e-6
# Default relative tolerance for the "approximately_reproducible" verdict.
DEFAULT_TOLERANCE = 0.02  # 2% relative
# Default number of reproduction runs when not in strict mode.
DEFAULT_REPRO_RUNS = 3
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def find_experiment(experiments: list[dict], exp_id: str) -> dict | None:
|
|
41
|
+
"""Find an experiment by ID in the log."""
|
|
42
|
+
for exp in experiments:
|
|
43
|
+
if exp.get("experiment_id") == exp_id:
|
|
44
|
+
return exp
|
|
45
|
+
return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def capture_current_environment() -> dict:
    """Capture the current Python environment for comparison.

    Returns:
        Dict with "python_version" (e.g. "3.11.4") and "packages", a
        mapping of lowercased package name -> version string. "packages"
        is always present; it is empty when pip is unavailable, fails,
        or times out.
    """
    env = {}

    # Python version (strip build/compiler details from sys.version).
    env["python_version"] = sys.version.split()[0]

    # Installed packages via pip freeze. Invoke pip through the current
    # interpreter so we snapshot *this* environment, not whatever "pip"
    # happens to be first on PATH (they can differ inside venvs).
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pip", "freeze"],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode == 0:
            packages = {}
            for line in result.stdout.strip().splitlines():
                # Only plain "name==version" pins; editable/VCS installs
                # have no "==" and are skipped.
                if "==" in line:
                    pkg, ver = line.split("==", 1)
                    packages[pkg.lower()] = ver
            env["packages"] = packages
        else:
            # pip ran but failed — record an empty snapshot rather than
            # omitting the key (callers index env["packages"]).
            env["packages"] = {}
    except (subprocess.TimeoutExpired, FileNotFoundError):
        env["packages"] = {}

    return env
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def compare_environments(original_env: dict | None, current_env: dict) -> list[dict]:
|
|
74
|
+
"""Compare original experiment environment against current.
|
|
75
|
+
|
|
76
|
+
Returns list of diffs with severity (info, warning, critical).
|
|
77
|
+
"""
|
|
78
|
+
if not original_env:
|
|
79
|
+
return [{"field": "environment", "severity": "info",
|
|
80
|
+
"detail": "No environment snapshot in original experiment"}]
|
|
81
|
+
|
|
82
|
+
diffs = []
|
|
83
|
+
|
|
84
|
+
# Python version
|
|
85
|
+
orig_py = original_env.get("python_version", "unknown")
|
|
86
|
+
curr_py = current_env.get("python_version", "unknown")
|
|
87
|
+
if orig_py != curr_py:
|
|
88
|
+
diffs.append({
|
|
89
|
+
"field": "python_version",
|
|
90
|
+
"original": orig_py,
|
|
91
|
+
"current": curr_py,
|
|
92
|
+
"severity": "warning",
|
|
93
|
+
"detail": f"Python version changed: {orig_py} -> {curr_py}",
|
|
94
|
+
})
|
|
95
|
+
|
|
96
|
+
# Package versions
|
|
97
|
+
orig_pkgs = original_env.get("packages", {})
|
|
98
|
+
curr_pkgs = current_env.get("packages", {})
|
|
99
|
+
|
|
100
|
+
# Key ML packages that affect reproducibility
|
|
101
|
+
critical_packages = {
|
|
102
|
+
"numpy", "scipy", "scikit-learn", "sklearn", "pandas",
|
|
103
|
+
"torch", "tensorflow", "xgboost", "lightgbm", "catboost",
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
all_pkgs = set(orig_pkgs) | set(curr_pkgs)
|
|
107
|
+
for pkg in sorted(all_pkgs):
|
|
108
|
+
orig_ver = orig_pkgs.get(pkg)
|
|
109
|
+
curr_ver = curr_pkgs.get(pkg)
|
|
110
|
+
if orig_ver and curr_ver and orig_ver != curr_ver:
|
|
111
|
+
severity = "critical" if pkg in critical_packages else "info"
|
|
112
|
+
diffs.append({
|
|
113
|
+
"field": f"package:{pkg}",
|
|
114
|
+
"original": orig_ver,
|
|
115
|
+
"current": curr_ver,
|
|
116
|
+
"severity": severity,
|
|
117
|
+
"detail": f"{pkg}: {orig_ver} -> {curr_ver}",
|
|
118
|
+
})
|
|
119
|
+
elif orig_ver and not curr_ver:
|
|
120
|
+
severity = "warning" if pkg in critical_packages else "info"
|
|
121
|
+
diffs.append({
|
|
122
|
+
"field": f"package:{pkg}",
|
|
123
|
+
"original": orig_ver,
|
|
124
|
+
"current": "missing",
|
|
125
|
+
"severity": severity,
|
|
126
|
+
"detail": f"{pkg} {orig_ver} was present but is now missing",
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
return diffs
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def run_single_reproduction(seed: int, timeout: int = 600) -> dict | None:
|
|
133
|
+
"""Run train.py with given seed and return parsed metrics."""
|
|
134
|
+
cmd = ["python", "train.py", "--seed", str(seed)]
|
|
135
|
+
try:
|
|
136
|
+
proc = subprocess.run(
|
|
137
|
+
cmd, capture_output=True, text=True, timeout=timeout,
|
|
138
|
+
)
|
|
139
|
+
except subprocess.TimeoutExpired:
|
|
140
|
+
return None
|
|
141
|
+
|
|
142
|
+
if proc.returncode != 0:
|
|
143
|
+
return None
|
|
144
|
+
|
|
145
|
+
metrics = {}
|
|
146
|
+
in_block = False
|
|
147
|
+
metadata_keys = {"model_type", "train_seconds"}
|
|
148
|
+
|
|
149
|
+
for line in proc.stdout.splitlines():
|
|
150
|
+
line = line.strip()
|
|
151
|
+
if line == "---":
|
|
152
|
+
if in_block:
|
|
153
|
+
break
|
|
154
|
+
in_block = True
|
|
155
|
+
continue
|
|
156
|
+
if in_block and ":" in line:
|
|
157
|
+
key, value = line.split(":", 1)
|
|
158
|
+
key = key.strip()
|
|
159
|
+
value = value.strip()
|
|
160
|
+
if key in metadata_keys:
|
|
161
|
+
metrics[key] = value
|
|
162
|
+
else:
|
|
163
|
+
try:
|
|
164
|
+
metrics[key] = float(value)
|
|
165
|
+
except ValueError:
|
|
166
|
+
metrics[key] = value
|
|
167
|
+
|
|
168
|
+
return metrics if metrics else None
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def determine_verdict(
    original_value: float,
    new_values: list[float],
    tolerance: float,
    strict: bool,
) -> dict:
    """Determine reproducibility verdict.

    Args:
        original_value: Metric from the original experiment.
        new_values: Metrics from reproduction run(s).
        tolerance: Relative tolerance for approximate match.
        strict: If True, require exact match within float tolerance.

    Returns:
        Dict with verdict, reason, and statistical details.
    """
    samples = np.array(new_values)
    new_mean = float(np.mean(samples))
    n = len(samples)

    result = {
        "original_value": round(original_value, 6),
        "new_mean": round(new_mean, 6),
        "new_values": [round(v, 6) for v in new_values],
        "n_runs": n,
    }

    # 95% confidence interval: t-based for multiple runs, degenerate
    # (the single observed value) for a single run.
    if n > 1:
        new_std = float(np.std(samples, ddof=1))
        from scipy import stats as scipy_stats
        t_crit = scipy_stats.t.ppf(0.975, df=n - 1)
        margin = t_crit * new_std / np.sqrt(n)
        ci_lower, ci_upper = new_mean - margin, new_mean + margin
        result["new_std"] = round(new_std, 6)
    else:
        ci_lower = ci_upper = new_values[0]
        result["new_std"] = 0.0
    result["ci_95"] = [round(ci_lower, 6), round(ci_upper, 6)]

    # Exact-match check (deterministic case).
    if strict or n == 1:
        diff = abs(original_value - new_mean)
        if diff < FLOAT_TOLERANCE:
            result["verdict"] = "reproducible"
            result["reason"] = f"Exact match within float tolerance ({FLOAT_TOLERANCE})"
            return result
        if strict:
            result["verdict"] = "not_reproducible"
            result["reason"] = f"Strict mode: difference {diff:.6f} exceeds float tolerance {FLOAT_TOLERANCE}"
            return result

    # Approximate match within relative tolerance (absolute when the
    # original value is exactly zero).
    if original_value != 0:
        relative_diff = abs(original_value - new_mean) / abs(original_value)
    else:
        relative_diff = abs(new_mean)
    result["relative_difference"] = round(relative_diff, 6)

    if relative_diff <= tolerance:
        result["verdict"] = "approximately_reproducible"
        result["reason"] = (
            f"Within {tolerance*100:.1f}% tolerance "
            f"(actual difference: {relative_diff*100:.2f}%)"
        )
    elif n > 1 and ci_lower <= original_value <= ci_upper:
        # The original metric is statistically consistent with the
        # reproduction distribution even though the mean drifted.
        result["verdict"] = "approximately_reproducible"
        result["reason"] = (
            f"Original value {original_value:.4f} falls within 95% CI "
            f"[{ci_lower:.4f}, {ci_upper:.4f}] of reproduction distribution"
        )
    else:
        result["verdict"] = "not_reproducible"
        result["reason"] = (
            f"Difference {relative_diff*100:.2f}% exceeds {tolerance*100:.1f}% tolerance, "
            f"and original {original_value:.4f} outside 95% CI "
            f"[{ci_lower:.4f}, {ci_upper:.4f}]"
        )
    return result
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def reproduce_experiment(
    exp_id: str,
    tolerance: float = DEFAULT_TOLERANCE,
    strict: bool = False,
    n_runs: int = DEFAULT_REPRO_RUNS,
    config_path: str = "config.yaml",
    log_path: str = "experiments/log.jsonl",
    timeout: int = 600,
) -> dict:
    """Run a complete reproducibility verification.

    Looks up the original experiment in the log, snapshots and diffs the
    current environment, re-runs train.py n_runs times, and compares the
    primary metric via determine_verdict(). Progress is printed to stderr.

    Args:
        exp_id: Experiment ID to reproduce.
        tolerance: Relative tolerance for approximate match.
        strict: Require exact match (overrides tolerance); forces a single
            run with the original seed.
        n_runs: Number of reproduction runs (1 for strict/deterministic).
        config_path: Path to config.yaml.
        log_path: Path to experiment log.
        timeout: Per-run timeout in seconds.

    Returns:
        Complete reproduction report dict, or a dict with an "error" key
        when the experiment is missing, lacks the primary metric, or all
        reproduction runs fail.
    """
    # Primary metric name comes from the project config; defaults to accuracy.
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")

    experiments = load_experiments(log_path)
    original = find_experiment(experiments, exp_id)

    if not original:
        return {
            "error": f"Experiment {exp_id} not found in {log_path}",
            "experiment_id": exp_id,
        }

    original_metrics = original.get("metrics", {})
    original_value = original_metrics.get(primary_metric)
    if original_value is None:
        return {
            "error": f"Experiment {exp_id} has no {primary_metric} metric",
            "experiment_id": exp_id,
        }

    # Extract seed from original experiment.
    # NOTE(review): assumes log entries follow the
    # config.hyperparams.seed schema, falling back to 42 — confirm
    # against the logger that writes log.jsonl.
    original_seed = original.get("config", {}).get("hyperparams", {}).get("seed", 42)
    # An environment-level seed, when recorded, takes precedence.
    if original.get("environment", {}).get("seed") is not None:
        original_seed = original["environment"]["seed"]

    # If strict, run once with same seed (exact-match semantics).
    if strict:
        n_runs = 1

    print(f"Reproducing {exp_id}", file=sys.stderr)
    print(f"Original {primary_metric}: {original_value:.4f}", file=sys.stderr)
    print(f"Mode: {'strict (exact match)' if strict else f'tolerance={tolerance*100:.1f}%'}", file=sys.stderr)
    print(f"Runs: {n_runs} (seed={original_seed})", file=sys.stderr)
    print(file=sys.stderr)

    # Capture current environment and diff it against the original snapshot.
    current_env = capture_current_environment()
    original_env = original.get("environment")
    env_diffs = compare_environments(original_env, current_env)

    # Run reproductions. In strict mode every run reuses the original
    # seed; otherwise seeds are varied (seed, seed+1, ...) to sample the
    # run-to-run distribution.
    new_values = []
    failed_runs = 0
    for i in range(n_runs):
        seed = original_seed if strict else original_seed + i
        print(f"  Run {i + 1}/{n_runs} (seed={seed})...", end=" ", flush=True, file=sys.stderr)
        metrics = run_single_reproduction(seed, timeout=timeout)
        if metrics and primary_metric in metrics:
            val = metrics[primary_metric]
            new_values.append(val)
            print(f"{primary_metric}={val:.4f}", file=sys.stderr)
        else:
            # Timeout, non-zero exit, or missing metric all count as failed.
            failed_runs += 1
            print("FAILED", file=sys.stderr)

    if not new_values:
        return {
            "error": f"All {n_runs} reproduction runs failed",
            "experiment_id": exp_id,
        }

    # Determine verdict from the successful runs only.
    verdict_info = determine_verdict(original_value, new_values, tolerance, strict)

    # Check for environment changes: a failed reproduction under a drifted
    # environment is reported as environment_changed, not not_reproducible.
    has_env_changes = any(d["severity"] in ("warning", "critical") for d in env_diffs)
    if has_env_changes and verdict_info["verdict"] == "not_reproducible":
        verdict_info["verdict"] = "environment_changed"
        verdict_info["reason"] += " (environment differences detected — this may be the cause)"

    # Build full report.
    report = {
        "experiment_id": exp_id,
        "reproduced_at": datetime.now(timezone.utc).isoformat(),
        "original_metrics": {primary_metric: round(original_value, 6)},
        "original_config": original.get("config", {}),
        "original_git_commit": original.get("git_commit"),
        "new_metrics": {primary_metric: verdict_info["new_mean"]},
        "verdict": verdict_info["verdict"],
        "reason": verdict_info["reason"],
        "statistical_details": {
            k: v for k, v in verdict_info.items()
            if k not in ("verdict", "reason")
        },
        "strictness": f"exact (1e-6)" if strict else f"tolerance={tolerance}",
        "n_runs": len(new_values),
        "failed_runs": failed_runs,
        "environment_changed": has_env_changes,
        "environment_diffs": env_diffs if env_diffs else [],
    }

    return report
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def save_reproduction_report(report: dict, output_dir: str = "experiments/reproductions") -> Path:
    """Persist *report* as ``<exp_id>-repro.yaml`` under *output_dir*.

    Creates the directory if needed and returns the written file path.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir / f"{report.get('experiment_id', 'unknown')}-repro.yaml"

    with open(destination, "w") as handle:
        yaml.dump(report, handle, default_flow_style=False, sort_keys=False)

    return destination
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def format_reproduction_report(report: dict) -> str:
    """Format reproduction report as human-readable markdown."""
    if "error" in report:
        return f"ERROR: {report['error']}"

    exp_id = report["experiment_id"]
    verdict = report["verdict"]
    details = report.get("statistical_details", {})

    # Human-readable marker for each verdict; unknown verdicts pass through.
    verdict_markers = {
        "reproducible": "PASS",
        "approximately_reproducible": "PASS (approx)",
        "not_reproducible": "FAIL",
        "environment_changed": "WARN",
    }
    marker = verdict_markers.get(verdict, verdict)

    out = [
        f"# Reproducibility Report: {exp_id}",
        "",
        f"**Verdict: {marker}**",
        "",
        f"*{report['reason']}*",
        "",
        "## Comparison",
        "",
        "| Metric | Original | Reproduced |",
        "|--------|----------|------------|",
    ]

    for metric, orig_val in report.get("original_metrics", {}).items():
        new_val = report.get("new_metrics", {}).get(metric, "N/A")
        both_floats = isinstance(orig_val, float) and isinstance(new_val, float)
        out.append(
            f"| {metric} | {orig_val:.4f} | {new_val:.4f} |"
            if both_floats
            else f"| {metric} | {orig_val} | {new_val} |"
        )

    # Statistical details only make sense for multi-run reproductions.
    if details.get("n_runs", 0) > 1:
        out += [
            "",
            "## Statistical Details",
            "",
            f"- **Reproduction runs:** {details.get('n_runs', 'N/A')}",
            f"- **New values:** {details.get('new_values', [])}",
        ]
        if "new_std" in details:
            out.append(f"- **New std:** {details['new_std']:.6f}")
        if "ci_95" in details:
            ci = details["ci_95"]
            out.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
        if "relative_difference" in details:
            out.append(f"- **Relative difference:** {details['relative_difference']*100:.2f}%")

    # Environment section: list warning/critical diffs, surface the
    # "no snapshot" note, otherwise declare the environments matching.
    env_diffs = report.get("environment_diffs", [])
    if env_diffs:
        out += ["", "## Environment", ""]
        has_changes = False
        for diff in env_diffs:
            if diff["severity"] == "info" and "No environment snapshot" in diff.get("detail", ""):
                out.append(f"- {diff['detail']}")
            elif diff["severity"] != "info":
                has_changes = True
                out.append(f"- **[{diff['severity'].upper()}]** {diff['detail']}")
        if not has_changes and not any("No environment" in d.get("detail", "") for d in env_diffs):
            out.append("All packages match original environment.")
    else:
        out += ["", "## Environment", "", "All packages match original environment."]

    out += ["", f"*Strictness: {report.get('strictness', 'N/A')}*"]

    return "\n".join(out)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def main() -> None:
    """CLI entry point."""
    cli = argparse.ArgumentParser(
        description="Reproducibility verification for ML experiments",
    )
    cli.add_argument(
        "exp_id",
        help="Experiment ID to reproduce (e.g., exp-042)",
    )
    cli.add_argument(
        "--tolerance", type=float, default=DEFAULT_TOLERANCE,
        help=f"Relative tolerance for approximate match (default: {DEFAULT_TOLERANCE})",
    )
    cli.add_argument(
        "--strict", action="store_true",
        help="Strict mode: require exact match within float tolerance (1e-6)",
    )
    cli.add_argument(
        "--runs", type=int, default=DEFAULT_REPRO_RUNS,
        help=f"Number of reproduction runs (default: {DEFAULT_REPRO_RUNS})",
    )
    cli.add_argument(
        "--config", default="config.yaml",
        help="Path to config.yaml",
    )
    cli.add_argument(
        "--log", default="experiments/log.jsonl",
        help="Path to experiment log",
    )
    cli.add_argument(
        "--timeout", type=int, default=600,
        help="Per-run timeout in seconds (default: 600)",
    )
    cli.add_argument(
        "--json", action="store_true",
        help="Output raw JSON instead of formatted report",
    )
    args = cli.parse_args()

    report = reproduce_experiment(
        exp_id=args.exp_id,
        tolerance=args.tolerance,
        strict=args.strict,
        n_runs=args.runs,
        config_path=args.config,
        log_path=args.log,
        timeout=args.timeout,
    )

    # Persist successful reports alongside the experiment log.
    if "error" not in report:
        saved_to = save_reproduction_report(report)
        print(f"\nSaved to {saved_to}", file=sys.stderr)

    # Emit to stdout (reports/progress go to stderr).
    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(format_reproduction_report(report))

    # Exit code mirrors the verdict for scripting: 1 = not reproducible,
    # 2 = environment changed, 0 otherwise.
    exit_codes = {"not_reproducible": 1, "environment_changed": 2}
    code = exit_codes.get(report.get("verdict"))
    if code is not None:
        sys.exit(code)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
# Script entry point: `python scripts/reproduce_experiment.py <exp-id> [...]`.
if __name__ == "__main__":
    main()
|
|
@@ -90,6 +90,11 @@ TEMPLATE_DIRS = {
|
|
|
90
90
|
"export_results.py",
|
|
91
91
|
"plot_trajectory.py",
|
|
92
92
|
"treequest_suggest.py",
|
|
93
|
+
"seed_runner.py",
|
|
94
|
+
"reproduce_experiment.py",
|
|
95
|
+
"diagnose_errors.py",
|
|
96
|
+
"ablation_study.py",
|
|
97
|
+
"pareto_frontier.py",
|
|
93
98
|
],
|
|
94
99
|
"tests": ["__init__.py", "conftest.py"],
|
|
95
100
|
}
|
|
@@ -97,6 +102,12 @@ TEMPLATE_DIRS = {
|
|
|
97
102
|
DIRECTORIES_TO_CREATE = [
|
|
98
103
|
"data/splits",
|
|
99
104
|
"experiments",
|
|
105
|
+
"experiments/seed_studies",
|
|
106
|
+
"experiments/reproductions",
|
|
107
|
+
"experiments/diagnoses",
|
|
108
|
+
"experiments/ablations",
|
|
109
|
+
"experiments/frontiers",
|
|
110
|
+
"experiments/predictions",
|
|
100
111
|
"models/best",
|
|
101
112
|
"models/archive",
|
|
102
113
|
]
|