claude-turing 1.2.0 → 1.3.0

--- /dev/null
+++ scripts/reproduce_experiment.py
@@ -0,0 +1,548 @@
+#!/usr/bin/env python3
+"""Reproducibility verification for ML experiments.
+
+Given an experiment ID, re-runs it from the logged config and verifies
+metrics fall within tolerance of the original. Catches non-determinism,
+environment drift, and silent data changes.
+
+Verdicts:
+    reproducible — metrics match within float tolerance (1e-6)
+    approximately_reproducible — metrics within user-specified tolerance
+    not_reproducible — metrics outside tolerance/CI
+    environment_changed — different library versions detected
+
+Usage:
+    python scripts/reproduce_experiment.py exp-042
+    python scripts/reproduce_experiment.py exp-042 --tolerance 0.02
+    python scripts/reproduce_experiment.py exp-042 --strict
+    python scripts/reproduce_experiment.py exp-042 --runs 5
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import numpy as np
+import yaml
+
+from scripts.turing_io import load_config, load_experiments
+
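+# FLOAT_TOLERANCE gates the "reproducible" verdict (exact match);
+# DEFAULT_TOLERANCE gates the default "approximately_reproducible" check.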
+FLOAT_TOLERANCE = 1e-6
+DEFAULT_TOLERANCE = 0.02  # 2% relative
+DEFAULT_REPRO_RUNS = 3
+
+
+def find_experiment(experiments: list[dict], exp_id: str) -> dict | None:
+    """Find an experiment by ID in the log."""
+    for exp in experiments:
+        if exp.get("experiment_id") == exp_id:
+            return exp
+    return None
+
+
+def capture_current_environment() -> dict:
+    """Capture the current Python environment for comparison."""
+    env = {}
+
+    # Python version
+    env["python_version"] = sys.version.split()[0]
+
+    # Installed packages via pip freeze, run through the current interpreter
+    # so we inspect this environment rather than whatever "pip" is first on PATH.
+    try:
+        result = subprocess.run(
+            [sys.executable, "-m", "pip", "freeze"],
+            capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0:
+            packages = {}
+            for line in result.stdout.strip().splitlines():
+                if "==" in line:
+                    pkg, ver = line.split("==", 1)
+                    packages[pkg.lower()] = ver
+            env["packages"] = packages
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        env["packages"] = {}
+
+    return env
+
+
+def compare_environments(original_env: dict | None, current_env: dict) -> list[dict]:
+    """Compare original experiment environment against current.
+
+    Returns list of diffs with severity (info, warning, critical).
+    """
+    if not original_env:
+        return [{"field": "environment", "severity": "info",
+                 "detail": "No environment snapshot in original experiment"}]
+
+    diffs = []
+
+    # Python version
+    orig_py = original_env.get("python_version", "unknown")
+    curr_py = current_env.get("python_version", "unknown")
+    if orig_py != curr_py:
+        diffs.append({
+            "field": "python_version",
+            "original": orig_py,
+            "current": curr_py,
+            "severity": "warning",
+            "detail": f"Python version changed: {orig_py} -> {curr_py}",
+        })
+
+    # Package versions
+    orig_pkgs = original_env.get("packages", {})
+    curr_pkgs = current_env.get("packages", {})
+
+    # Key ML packages that affect reproducibility
+    critical_packages = {
+        "numpy", "scipy", "scikit-learn", "sklearn", "pandas",
+        "torch", "tensorflow", "xgboost", "lightgbm", "catboost",
+    }
+
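+    # Diff over the union of both package sets so removals are reported,
+    # not just version changes.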
+    all_pkgs = set(orig_pkgs) | set(curr_pkgs)
+    for pkg in sorted(all_pkgs):
+        orig_ver = orig_pkgs.get(pkg)
+        curr_ver = curr_pkgs.get(pkg)
+        if orig_ver and curr_ver and orig_ver != curr_ver:
+            severity = "critical" if pkg in critical_packages else "info"
+            diffs.append({
+                "field": f"package:{pkg}",
+                "original": orig_ver,
+                "current": curr_ver,
+                "severity": severity,
+                "detail": f"{pkg}: {orig_ver} -> {curr_ver}",
+            })
+        elif orig_ver and not curr_ver:
+            severity = "warning" if pkg in critical_packages else "info"
+            diffs.append({
+                "field": f"package:{pkg}",
+                "original": orig_ver,
+                "current": "missing",
+                "severity": severity,
+                "detail": f"{pkg} {orig_ver} was present but is now missing",
+            })
+
+    return diffs
+
+
+def run_single_reproduction(seed: int, timeout: int = 600) -> dict | None:
+    """Run train.py with given seed and return parsed metrics."""
+    # Use sys.executable so training runs under the same interpreter (and
+    # environment) that this script is verifying.
+    cmd = [sys.executable, "train.py", "--seed", str(seed)]
+    try:
+        proc = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        return None
+
+    if proc.returncode != 0:
+        return None
+
+    metrics = {}
+    in_block = False
+    metadata_keys = {"model_type", "train_seconds"}
+
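+    # Expected stdout contract (inferred from this parser; the metric names
+    # below are illustrative): one block delimited by "---" lines containing
+    # "key: value" pairs. Keys in metadata_keys stay strings; everything else
+    # is coerced to float when possible.
+    #   ---
+    #   accuracy: 0.9312
+    #   f1_macro: 0.9144
+    #   model_type: xgboost
+    #   train_seconds: 41.2
+    #   ---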
+    for line in proc.stdout.splitlines():
+        line = line.strip()
+        if line == "---":
+            if in_block:
+                break
+            in_block = True
+            continue
+        if in_block and ":" in line:
+            key, value = line.split(":", 1)
+            key = key.strip()
+            value = value.strip()
+            if key in metadata_keys:
+                metrics[key] = value
+            else:
+                try:
+                    metrics[key] = float(value)
+                except ValueError:
+                    metrics[key] = value
+
+    return metrics if metrics else None
+
+
+def determine_verdict(
+    original_value: float,
+    new_values: list[float],
+    tolerance: float,
+    strict: bool,
+) -> dict:
+    """Determine reproducibility verdict.
+
+    Args:
+        original_value: Metric from the original experiment.
+        new_values: Metrics from reproduction run(s).
+        tolerance: Relative tolerance for approximate match.
+        strict: If True, require exact match within float tolerance.
+
+    Returns:
+        Dict with verdict, reason, and statistical details.
+    """
+    arr = np.array(new_values)
+    new_mean = float(np.mean(arr))
+    n = len(arr)
+
+    result = {
+        "original_value": round(original_value, 6),
+        "new_mean": round(new_mean, 6),
+        "new_values": [round(v, 6) for v in new_values],
+        "n_runs": n,
+    }
+
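+    # With several runs, summarize the spread with a 95% confidence interval,
+    # mean +/- t(0.975, n-1) * s / sqrt(n), using the sample std (ddof=1).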
+    if n > 1:
+        new_std = float(np.std(arr, ddof=1))
+        # Imported lazily so scipy is only required for multi-run mode.
+        from scipy import stats as scipy_stats
+        t_crit = scipy_stats.t.ppf(0.975, df=n - 1)
+        margin = t_crit * new_std / np.sqrt(n)
+        ci_lower = new_mean - margin
+        ci_upper = new_mean + margin
+        result["new_std"] = round(new_std, 6)
+        result["ci_95"] = [round(ci_lower, 6), round(ci_upper, 6)]
+    else:
+        # Single run: the "interval" degenerates to the observed value.
+        ci_lower = new_values[0]
+        ci_upper = new_values[0]
+        result["new_std"] = 0.0
+        result["ci_95"] = [round(ci_lower, 6), round(ci_upper, 6)]
+
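+    # Verdict cascade: exact float match first, then relative tolerance, then
+    # (multi-run mode only) whether the original lies inside the 95% CI.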
+    # Check exact match (deterministic case)
+    if strict or n == 1:
+        diff = abs(original_value - new_mean)
+        if diff < FLOAT_TOLERANCE:
+            result["verdict"] = "reproducible"
+            result["reason"] = f"Exact match within float tolerance ({FLOAT_TOLERANCE})"
+            return result
+        if strict:
+            result["verdict"] = "not_reproducible"
+            result["reason"] = f"Strict mode: difference {diff:.6f} exceeds float tolerance {FLOAT_TOLERANCE}"
+            return result
+
+    # Check approximate match (within tolerance)
+    if original_value != 0:
+        relative_diff = abs(original_value - new_mean) / abs(original_value)
+    else:
+        relative_diff = abs(new_mean)
+    result["relative_difference"] = round(relative_diff, 6)
+
+    if relative_diff <= tolerance:
+        result["verdict"] = "approximately_reproducible"
+        result["reason"] = (
+            f"Within {tolerance*100:.1f}% tolerance "
+            f"(actual difference: {relative_diff*100:.2f}%)"
+        )
+        return result
+
+    # Check if original falls within CI of new distribution
+    if n > 1 and ci_lower <= original_value <= ci_upper:
+        result["verdict"] = "approximately_reproducible"
+        result["reason"] = (
+            f"Original value {original_value:.4f} falls within 95% CI "
+            f"[{ci_lower:.4f}, {ci_upper:.4f}] of reproduction distribution"
+        )
+        return result
+
+    # Not reproducible
+    result["verdict"] = "not_reproducible"
+    result["reason"] = (
+        f"Difference {relative_diff*100:.2f}% exceeds {tolerance*100:.1f}% tolerance, "
+        f"and original {original_value:.4f} outside 95% CI "
+        f"[{ci_lower:.4f}, {ci_upper:.4f}]"
+    )
+    return result
+
+
+def reproduce_experiment(
+    exp_id: str,
+    tolerance: float = DEFAULT_TOLERANCE,
+    strict: bool = False,
+    n_runs: int = DEFAULT_REPRO_RUNS,
+    config_path: str = "config.yaml",
+    log_path: str = "experiments/log.jsonl",
+    timeout: int = 600,
+) -> dict:
+    """Run a complete reproducibility verification.
+
+    Args:
+        exp_id: Experiment ID to reproduce.
+        tolerance: Relative tolerance for approximate match.
+        strict: Require exact match (overrides tolerance).
+        n_runs: Number of reproduction runs (1 for strict/deterministic).
+        config_path: Path to config.yaml.
+        log_path: Path to experiment log.
+        timeout: Per-run timeout in seconds.
+
+    Returns:
+        Complete reproduction report dict.
+    """
+    config = load_config(config_path)
+    eval_cfg = config.get("evaluation", {})
+    primary_metric = eval_cfg.get("primary_metric", "accuracy")
+
+    experiments = load_experiments(log_path)
+    original = find_experiment(experiments, exp_id)
+
+    if not original:
+        return {
+            "error": f"Experiment {exp_id} not found in {log_path}",
+            "experiment_id": exp_id,
+        }
+
+    original_metrics = original.get("metrics", {})
+    original_value = original_metrics.get(primary_metric)
+    if original_value is None:
+        return {
+            "error": f"Experiment {exp_id} has no {primary_metric} metric",
+            "experiment_id": exp_id,
+        }
+
+    # Extract seed from original experiment
+    original_seed = original.get("config", {}).get("hyperparams", {}).get("seed", 42)
+    # Also check environment for seed
+    if original.get("environment", {}).get("seed") is not None:
+        original_seed = original["environment"]["seed"]
+
+    # If strict, run once with same seed
+    if strict:
+        n_runs = 1
+
+    print(f"Reproducing {exp_id}", file=sys.stderr)
+    print(f"Original {primary_metric}: {original_value:.4f}", file=sys.stderr)
+    print(f"Mode: {'strict (exact match)' if strict else f'tolerance={tolerance*100:.1f}%'}", file=sys.stderr)
+    print(f"Runs: {n_runs} (seed={original_seed})", file=sys.stderr)
+    print(file=sys.stderr)
+
+    # Capture current environment
+    current_env = capture_current_environment()
+    original_env = original.get("environment")
+    env_diffs = compare_environments(original_env, current_env)
+
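+    # Run reproductions. Strict mode re-uses the original seed verbatim; the
+    # default mode offsets the seed per run to sample run-to-run variance.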
+    new_values = []
+    failed_runs = 0
+    for i in range(n_runs):
+        seed = original_seed if strict else original_seed + i
+        print(f" Run {i + 1}/{n_runs} (seed={seed})...", end=" ", flush=True, file=sys.stderr)
+        metrics = run_single_reproduction(seed, timeout=timeout)
+        if metrics and primary_metric in metrics:
+            val = metrics[primary_metric]
+            new_values.append(val)
+            print(f"{primary_metric}={val:.4f}", file=sys.stderr)
+        else:
+            failed_runs += 1
+            print("FAILED", file=sys.stderr)
+
+    if not new_values:
+        return {
+            "error": f"All {n_runs} reproduction runs failed",
+            "experiment_id": exp_id,
+        }
+
+    # Determine verdict
+    verdict_info = determine_verdict(original_value, new_values, tolerance, strict)
+
+    # Check for environment changes
+    has_env_changes = any(d["severity"] in ("warning", "critical") for d in env_diffs)
+    if has_env_changes and verdict_info["verdict"] == "not_reproducible":
+        verdict_info["verdict"] = "environment_changed"
+        verdict_info["reason"] += " (environment differences detected — this may be the cause)"
+
+    # Build full report
+    report = {
+        "experiment_id": exp_id,
+        "reproduced_at": datetime.now(timezone.utc).isoformat(),
+        "original_metrics": {primary_metric: round(original_value, 6)},
+        "original_config": original.get("config", {}),
+        "original_git_commit": original.get("git_commit"),
+        "new_metrics": {primary_metric: verdict_info["new_mean"]},
+        "verdict": verdict_info["verdict"],
+        "reason": verdict_info["reason"],
+        "statistical_details": {
+            k: v for k, v in verdict_info.items()
+            if k not in ("verdict", "reason")
+        },
+        "strictness": "exact (1e-6)" if strict else f"tolerance={tolerance}",
+        "n_runs": len(new_values),
+        "failed_runs": failed_runs,
+        "environment_changed": has_env_changes,
+        "environment_diffs": env_diffs,
+    }
+
+    return report
+
+
+def save_reproduction_report(report: dict, output_dir: str = "experiments/reproductions") -> Path:
+    """Save reproduction report to YAML file."""
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    exp_id = report.get("experiment_id", "unknown")
+    filename = f"{exp_id}-repro.yaml"
+    filepath = out_path / filename
+
+    with open(filepath, "w") as f:
+        yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+
+    return filepath
+
+
+def format_reproduction_report(report: dict) -> str:
+    """Format reproduction report as human-readable markdown."""
+    if "error" in report:
+        return f"ERROR: {report['error']}"
+
+    exp_id = report["experiment_id"]
+    verdict = report["verdict"]
+    details = report.get("statistical_details", {})
+
+    # Verdict emoji/marker
+    verdict_markers = {
+        "reproducible": "PASS",
+        "approximately_reproducible": "PASS (approx)",
+        "not_reproducible": "FAIL",
+        "environment_changed": "WARN",
+    }
+    marker = verdict_markers.get(verdict, verdict)
+
+    lines = [
+        f"# Reproducibility Report: {exp_id}",
+        "",
+        f"**Verdict: {marker}**",
+        "",
+        f"*{report['reason']}*",
+        "",
+        "## Comparison",
+        "",
+        "| Metric | Original | Reproduced |",
+        "|--------|----------|------------|",
+    ]
+
+    for metric, orig_val in report.get("original_metrics", {}).items():
+        new_val = report.get("new_metrics", {}).get(metric, "N/A")
+        if isinstance(orig_val, float) and isinstance(new_val, float):
+            lines.append(f"| {metric} | {orig_val:.4f} | {new_val:.4f} |")
+        else:
+            lines.append(f"| {metric} | {orig_val} | {new_val} |")
+
+    if details.get("n_runs", 0) > 1:
+        lines.extend([
+            "",
+            "## Statistical Details",
+            "",
+            f"- **Reproduction runs:** {details.get('n_runs', 'N/A')}",
+            f"- **New values:** {details.get('new_values', [])}",
+        ])
+        if "new_std" in details:
+            lines.append(f"- **New std:** {details['new_std']:.6f}")
+        if "ci_95" in details:
+            ci = details["ci_95"]
+            lines.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
+        if "relative_difference" in details:
+            lines.append(f"- **Relative difference:** {details['relative_difference']*100:.2f}%")
+
+    # Environment section
+    env_diffs = report.get("environment_diffs", [])
+    if env_diffs:
+        lines.extend([
+            "",
+            "## Environment",
+            "",
+        ])
+        has_changes = False
+        for diff in env_diffs:
+            if diff["severity"] == "info" and "No environment snapshot" in diff.get("detail", ""):
+                lines.append(f"- {diff['detail']}")
+            elif diff["severity"] != "info":
+                has_changes = True
+                sev = diff["severity"].upper()
+                lines.append(f"- **[{sev}]** {diff['detail']}")
+
+        if not has_changes and not any("No environment" in d.get("detail", "") for d in env_diffs):
+            lines.append("All packages match original environment.")
+    else:
+        lines.extend([
+            "",
+            "## Environment",
+            "",
+            "All packages match original environment.",
+        ])
+
+    lines.extend([
+        "",
+        f"*Strictness: {report.get('strictness', 'N/A')}*",
+    ])
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(
+        description="Reproducibility verification for ML experiments",
+    )
+    parser.add_argument(
+        "exp_id",
+        help="Experiment ID to reproduce (e.g., exp-042)",
+    )
+    parser.add_argument(
+        "--tolerance", type=float, default=DEFAULT_TOLERANCE,
+        help=f"Relative tolerance for approximate match (default: {DEFAULT_TOLERANCE})",
+    )
+    parser.add_argument(
+        "--strict", action="store_true",
+        help="Strict mode: require exact match within float tolerance (1e-6)",
+    )
+    parser.add_argument(
+        "--runs", type=int, default=DEFAULT_REPRO_RUNS,
+        help=f"Number of reproduction runs (default: {DEFAULT_REPRO_RUNS})",
+    )
+    parser.add_argument(
+        "--config", default="config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument(
+        "--log", default="experiments/log.jsonl",
+        help="Path to experiment log",
+    )
+    parser.add_argument(
+        "--timeout", type=int, default=600,
+        help="Per-run timeout in seconds (default: 600)",
+    )
+    parser.add_argument(
+        "--json", action="store_true",
+        help="Output raw JSON instead of formatted report",
+    )
+    args = parser.parse_args()
+
+    report = reproduce_experiment(
+        exp_id=args.exp_id,
+        tolerance=args.tolerance,
+        strict=args.strict,
+        n_runs=args.runs,
+        config_path=args.config,
+        log_path=args.log,
+        timeout=args.timeout,
+    )
+
+    # Save report
+    if "error" not in report:
+        filepath = save_reproduction_report(report)
+        print(f"\nSaved to {filepath}", file=sys.stderr)
+
+    # Output
+    if args.json:
+        print(json.dumps(report, indent=2))
+    else:
+        print(format_reproduction_report(report))
+
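+    # Exit codes: 0 = reproducible or approximately reproducible,
+    # 1 = not reproducible (or a hard error), 2 = environment changed.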
+    if "error" in report:
+        sys.exit(1)
+    if report.get("verdict") == "not_reproducible":
+        sys.exit(1)
+    elif report.get("verdict") == "environment_changed":
+        sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
@@ -90,6 +90,8 @@ TEMPLATE_DIRS = {
         "export_results.py",
         "plot_trajectory.py",
         "treequest_suggest.py",
+        "seed_runner.py",
+        "reproduce_experiment.py",
     ],
     "tests": ["__init__.py", "conftest.py"],
 }
@@ -97,6 +99,8 @@ TEMPLATE_DIRS = {
 DIRECTORIES_TO_CREATE = [
     "data/splits",
     "experiments",
+    "experiments/seed_studies",
+    "experiments/reproductions",
     "models/best",
     "models/archive",
 ]