claude-turing 2.5.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,451 @@
+ #!/usr/bin/env python3
+ """Pre-submission methodology audit for the autoresearch pipeline.
+
+ Checks for common ML paper methodology mistakes before submission:
+ data leakage, wrong CV strategy, missing baselines, unreported tuning
+ cost, cherry-picked seeds, train/test overlap. A reviewer checklist
+ you run before submitting.
+
+ Usage:
+     python scripts/methodology_audit.py
+     python scripts/methodology_audit.py --strict
+     python scripts/methodology_audit.py --checklist neurips
+     python scripts/methodology_audit.py --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+ # Severity levels
+ CRITICAL = "critical"
+ HIGH = "high"
+ MEDIUM = "medium"
+ LOW = "low"
+
+
+ # --- Audit Checks ---
+
+
+ def check_seed_sensitivity(
+     experiments: list[dict],
+     seed_dir: str = "experiments/seed_studies",
+ ) -> dict:
+     """Check that results are reported with error bars from multiple seeds."""
+     path = Path(seed_dir)
+     seed_studies = list(path.glob("*-seeds.yaml")) if path.exists() else []
+
+     best_kept = [e for e in experiments if e.get("status") == "kept"]
+     best_ids = {e.get("experiment_id") for e in best_kept[-3:]} if best_kept else set()
+
+     studied_ids = set()
+     for f in seed_studies:
+         exp_id = f.stem.replace("-seeds", "")
+         studied_ids.add(exp_id)
+
+     covered = best_ids & studied_ids
+
+     if not best_ids:
+         return {"check": "seed_sensitivity", "status": "skip", "reason": "No kept experiments", "severity": HIGH}
+
+     if covered == best_ids:
+         return {"check": "seed_sensitivity", "status": "pass", "reason": f"Seed studies exist for {len(covered)} best experiment(s)", "severity": HIGH}
+     elif covered:
+         return {"check": "seed_sensitivity", "status": "warn", "reason": f"Seed studies for {len(covered)}/{len(best_ids)} best experiments", "severity": HIGH, "fix": "/turing:seed"}
+     else:
+         return {"check": "seed_sensitivity", "status": "fail", "reason": "No seed studies for best experiments", "severity": HIGH, "fix": "/turing:seed"}
+
+
+ def check_ablation(
+     experiments: list[dict],
+     ablation_dir: str = "experiments/ablations",
+ ) -> dict:
+     """Check that major components have been ablated."""
+     path = Path(ablation_dir)
+     ablations = list(path.glob("*.yaml")) if path.exists() else []
+
+     if ablations:
+         return {"check": "ablation_completeness", "status": "pass", "reason": f"{len(ablations)} ablation study(ies) found", "severity": HIGH}
+     else:
+         return {"check": "ablation_completeness", "status": "fail", "reason": "No ablation studies found", "severity": HIGH, "fix": "/turing:ablate"}
+
+
+ def check_baseline(experiments: list[dict]) -> dict:
+     """Check that reasonable baselines were compared against."""
+     baseline_keywords = {"baseline", "majority", "random", "mean", "median", "dummy", "constant", "naive"}
+
+     baselines = []
+     for exp in experiments:
+         model_type = exp.get("config", {}).get("model_type", "").lower()
+         desc = exp.get("description", "").lower()
+         if any(kw in model_type or kw in desc for kw in baseline_keywords):
+             baselines.append(exp.get("experiment_id", "?"))
+
+     if baselines:
+         return {"check": "baseline_comparison", "status": "pass", "reason": f"Baseline experiments found: {', '.join(baselines[:3])}", "severity": HIGH}
+     else:
+         return {"check": "baseline_comparison", "status": "fail", "reason": "No baseline experiments found in log", "severity": HIGH, "fix": "/turing:try 'add majority class baseline'"}
+
+
+ def check_reproducibility(
+     experiments: list[dict],
+     repro_dir: str = "experiments/reproductions",
+ ) -> dict:
+     """Check that the best result has been reproduced."""
+     path = Path(repro_dir)
+     repros = list(path.glob("*-repro.yaml")) if path.exists() else []
+
+     if not repros:
+         return {"check": "reproducibility", "status": "fail", "reason": "No reproduction reports found", "severity": HIGH, "fix": "/turing:reproduce <best-exp-id>"}
+
+     # Check if any passed
+     for f in repros:
+         try:
+             with open(f) as fh:
+                 report = yaml.safe_load(fh)
+             if report and report.get("verdict") in ("reproducible", "approximately_reproducible"):
+                 return {"check": "reproducibility", "status": "pass", "reason": f"Experiment {report.get('experiment_id', '?')} reproduced successfully", "severity": HIGH}
+         except (yaml.YAMLError, OSError):
+             continue
+
+     return {"check": "reproducibility", "status": "warn", "reason": "Reproduction reports exist but none passed", "severity": HIGH, "fix": "/turing:reproduce <best-exp-id>"}
+
+
+ def check_hyperparameter_budget(experiments: list[dict]) -> dict:
+     """Check that total hyperparameter tuning budget is documented."""
+     n_experiments = len(experiments)
+     total_seconds = sum(
+         e.get("metrics", {}).get("train_seconds", 0)
+         for e in experiments
+         if isinstance(e.get("metrics", {}).get("train_seconds"), (int, float))
+     )
+     total_hours = total_seconds / 3600
+
+     if n_experiments > 0:
+         return {
+             "check": "hyperparameter_budget",
+             "status": "pass",
+             "reason": f"{n_experiments} experiments, {total_hours:.1f} compute hours logged",
+             "severity": MEDIUM,
+             "detail": {"n_experiments": n_experiments, "total_hours": round(total_hours, 2)},
+         }
+     return {"check": "hyperparameter_budget", "status": "warn", "reason": "No experiments logged", "severity": MEDIUM}
+
+
+ def check_data_leakage(config: dict) -> dict:
+     """Check for potential data leakage indicators.
+
+     This is a heuristic check: it verifies that the project layout suggests
+     proper train/test separation. Full leakage detection requires code analysis.
+     """
+     prepare_exists = Path("prepare.py").exists()
+     evaluate_exists = Path("evaluate.py").exists()
+
+     if prepare_exists and evaluate_exists:
+         return {"check": "data_leakage", "status": "pass", "reason": "Separate prepare.py and evaluate.py files exist (proper separation)", "severity": CRITICAL}
+     elif prepare_exists:
+         return {"check": "data_leakage", "status": "warn", "reason": "prepare.py exists but evaluate.py missing — verify evaluation uses held-out data", "severity": CRITICAL}
+     else:
+         return {"check": "data_leakage", "status": "warn", "reason": "No prepare.py found — verify data splitting is done before feature engineering", "severity": CRITICAL}
+
+
+ def check_cv_strategy(config: dict) -> dict:
+     """Check that the CV strategy is appropriate for the data type."""
+     eval_cfg = config.get("evaluation", {})
+     cv_strategy = eval_cfg.get("cv_strategy", eval_cfg.get("cv", ""))
+
+     if cv_strategy:
+         return {"check": "cv_strategy", "status": "pass", "reason": f"CV strategy specified: {cv_strategy}", "severity": CRITICAL}
+     else:
+         return {"check": "cv_strategy", "status": "warn", "reason": "No CV strategy specified in config — verify appropriate split method for data type", "severity": CRITICAL}
+
+
+ def check_regression_stability(
+     regress_dir: str = "experiments/regressions",
+ ) -> dict:
+     """Check that regression tests have been run."""
+     path = Path(regress_dir)
+     checks = list(path.glob("check-*.yaml")) if path.exists() else []
+
+     if checks:
+         return {"check": "regression_stability", "status": "pass", "reason": f"{len(checks)} regression check(s) performed", "severity": MEDIUM}
+     else:
+         return {"check": "regression_stability", "status": "warn", "reason": "No regression checks found", "severity": MEDIUM, "fix": "/turing:regress"}
+
+
+ # --- Venue-Specific Checklists ---
+
+
+ VENUE_CHECKS = {
+     "neurips": [
+         {"check": "broader_impact", "description": "Broader impact statement included", "severity": MEDIUM},
+         {"check": "reproducibility_checklist", "description": "NeurIPS reproducibility checklist completed", "severity": HIGH},
+         {"check": "code_availability", "description": "Code and data availability documented", "severity": MEDIUM},
+     ],
+     "icml": [
+         {"check": "reproducibility_checklist", "description": "ICML reproducibility checklist completed", "severity": HIGH},
+     ],
+     "iclr": [
+         {"check": "ethics_statement", "description": "Ethics statement included", "severity": MEDIUM},
+     ],
+ }
+
+
+ def get_venue_checks(venue: str | None) -> list[dict]:
+     """Get venue-specific additional checks."""
+     if not venue:
+         return []
+     checks = VENUE_CHECKS.get(venue.lower(), [])
+     # These are manual checks — mark as "manual" status
+     return [
+         {**c, "status": "manual", "reason": f"Manual check required: {c['description']}"}
+         for c in checks
+     ]
+
+
+ # --- Full Audit ---
+
+
+ def run_audit(
+     strict: bool = False,
+     venue: str | None = None,
+     config_path: str = "config.yaml",
+     log_path: str = DEFAULT_LOG_PATH,
+ ) -> dict:
+     """Run a complete methodology audit.
+
+     Args:
+         strict: Treat warnings as failures.
+         venue: Venue-specific checklist (neurips, icml, iclr).
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+
+     Returns:
+         Complete audit report.
+     """
+     config = load_config(config_path)
+     experiments = load_experiments(log_path)
+
+     checks = [
+         check_data_leakage(config),
+         check_cv_strategy(config),
+         check_seed_sensitivity(experiments),
+         check_ablation(experiments),
+         check_baseline(experiments),
+         check_reproducibility(experiments),
+         check_hyperparameter_budget(experiments),
+         check_regression_stability(),
+     ]
+
+     # Add venue-specific checks
+     venue_checks = get_venue_checks(venue)
+     checks.extend(venue_checks)
+
+     # Compute score
+     n_pass = sum(1 for c in checks if c["status"] == "pass")
+     n_fail = sum(1 for c in checks if c["status"] == "fail")
+     n_warn = sum(1 for c in checks if c["status"] == "warn")
+     n_skip = sum(1 for c in checks if c["status"] == "skip")
+     n_manual = sum(1 for c in checks if c["status"] == "manual")
+     total_checkable = len(checks) - n_skip - n_manual
+
+     if strict:
+         # Treat warnings as failures
+         effective_fail = n_fail + n_warn
+         effective_warn = 0
+     else:
+         effective_fail = n_fail
+         effective_warn = n_warn
+
+     # Overall verdict
+     critical_fails = [c for c in checks if c["status"] == "fail" and c.get("severity") == CRITICAL]
+     if critical_fails:
+         verdict = "fail"
+     elif effective_fail > 0:
+         verdict = "needs_work"
+     elif effective_warn > 2:
+         verdict = "needs_work"
+     elif effective_warn > 0:
+         verdict = "pass_with_warnings"
+     else:
+         verdict = "pass"
+
+     # Action items
+     actions = []
+     for c in checks:
+         if c["status"] in ("fail", "warn") and c.get("fix"):
+             actions.append({
+                 "check": c["check"],
+                 "fix": c["fix"],
+                 "severity": c.get("severity", MEDIUM),
+             })
+
+     return {
+         "audited_at": datetime.now(timezone.utc).isoformat(),
+         "strict_mode": strict,
+         "venue": venue,
+         "checks": checks,
+         "score": {
+             "pass": n_pass,
+             "fail": n_fail,
+             "warn": n_warn,
+             "skip": n_skip,
+             "manual": n_manual,
+             "total": len(checks),
+             "checkable": total_checkable,
+         },
+         "verdict": verdict,
+         "actions": actions,
+     }
+
+
+ # --- Report Formatting ---
+
+
+ def save_audit_report(report: dict, output_dir: str = "experiments/audits") -> Path:
+     """Save audit report to YAML."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+
+     date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+     filepath = out_path / f"audit-{date}.yaml"
+
+     with open(filepath, "w") as f:
+         yaml.dump(report, f, default_flow_style=False, sort_keys=False)
+
+     return filepath
+
+
+ def format_audit_report(report: dict) -> str:
+     """Format audit report as markdown."""
+     if "error" in report:
+         return f"ERROR: {report['error']}"
+
+     verdict = report.get("verdict", "?")
+     score = report.get("score", {})
+     strict = report.get("strict_mode", False)
+
+     verdict_labels = {
+         "pass": "PASS — Ready for submission",
+         "pass_with_warnings": "PASS (with warnings) — Address before submission",
+         "needs_work": "NEEDS WORK — Fix failures before submission",
+         "fail": "FAIL — Critical issues found",
+     }
+
+     lines = [
+         "# Methodology Audit Report",
+         "",
+         f"*Audited {report.get('audited_at', 'N/A')[:19]}*",
+         f"*Mode: {'strict' if strict else 'standard'}*",
+     ]
+
+     if report.get("venue"):
+         lines.append(f"*Venue: {report['venue']}*")
+
+     lines.extend([
+         "",
+         f"**{verdict_labels.get(verdict, verdict.upper())}**",
+         "",
+         "## Checks",
+         "",
+     ])
+
+     status_markers = {
+         "pass": "PASS",
+         "fail": "FAIL",
+         "warn": "WARN",
+         "skip": "SKIP",
+         "manual": "TODO",
+     }
+
+     for c in report.get("checks", []):
+         status = c.get("status", "?")
+         marker = status_markers.get(status, status.upper())
+         sev = c.get("severity", "medium")
+         lines.append(f"- **[{marker}]** {c.get('check', '?')} ({sev}): {c.get('reason', 'N/A')}")
+
+     # Score
+     lines.extend([
+         "",
+         "## Score",
+         "",
+         f"**{score.get('pass', 0)}/{score.get('checkable', 0)} pass**, "
+         f"{score.get('warn', 0)} warning(s), "
+         f"{score.get('fail', 0)} failure(s)",
+     ])
+
+     if score.get("manual", 0) > 0:
+         lines.append(f"*{score['manual']} manual check(s) required*")
+
+     # Actions
+     actions = report.get("actions", [])
+     if actions:
+         lines.extend(["", "## Required Actions", ""])
+         for a in actions:
+             lines.append(f"- **{a['check']}** ({a['severity']}): run `{a['fix']}`")
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Pre-submission methodology audit",
+     )
+     parser.add_argument(
+         "--strict", action="store_true",
+         help="Strict mode: treat warnings as failures",
+     )
+     parser.add_argument(
+         "--checklist",
+         help="Venue-specific checklist (neurips, icml, iclr)",
+     )
+     parser.add_argument(
+         "--config", default="config.yaml",
+         help="Path to config.yaml",
+     )
+     parser.add_argument(
+         "--log", default=DEFAULT_LOG_PATH,
+         help="Path to experiment log",
+     )
+     parser.add_argument(
+         "--json", action="store_true",
+         help="Output raw JSON instead of formatted report",
+     )
+     args = parser.parse_args()
+
+     report = run_audit(
+         strict=args.strict,
+         venue=args.checklist,
+         config_path=args.config,
+         log_path=args.log,
+     )
+
+     if "error" not in report:
+         filepath = save_audit_report(report)
+         print(f"Saved to {filepath}", file=sys.stderr)
+
+     if args.json:
+         print(json.dumps(report, indent=2, default=str))
+     else:
+         print(format_audit_report(report))
+
+     # Exit code based on verdict
+     if report.get("verdict") == "fail":
+         sys.exit(1)
+     elif report.get("verdict") == "needs_work":
+         sys.exit(2)
+
+
+ if __name__ == "__main__":
+     main()
@@ -116,6 +116,8 @@ TEMPLATE_DIRS = {
  "scaling_estimator.py",
  "budget_manager.py",
  "model_distiller.py",
+ "knowledge_transfer.py",
+ "methodology_audit.py",
  ],
  "tests": ["__init__.py", "conftest.py"],
  }
@@ -144,6 +146,8 @@ DIRECTORIES_TO_CREATE = [
  "experiments/warm_starts",
  "experiments/scaling",
  "experiments/distillations",
+ "experiments/transfers",
+ "experiments/audits",
  "experiments/logs",
  "models/best",
  "models/archive",