claude-turing 3.5.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +8 -2
- package/commands/changelog.md +22 -0
- package/commands/onboard.md +20 -0
- package/commands/review.md +20 -0
- package/commands/share.md +20 -0
- package/commands/turing.md +12 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +6 -0
- package/templates/scripts/__pycache__/citation_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_changelog.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_figures.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_onboarding.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/package_experiments.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/simulate_review.cpython-314.pyc +0 -0
- package/templates/scripts/generate_onboarding.py +284 -0
- package/templates/scripts/package_experiments.py +285 -0
- package/templates/scripts/scaffold.py +10 -0
- package/templates/scripts/simulate_review.py +342 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "4.1.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 66 commands, 2 specialized agents, collaboration (onboard + share + review), research communication (cite + present + changelog), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), model surgery (prune + quantize + merge + surgery), feature & training intelligence, model debugging, pre-training intelligence, meta-intelligence, scaling & efficiency, model composition, deep analysis, experiment orchestration, literature + paper, model export, profiling, checkpoints, experiment intelligence, statistical rigor, tree-search, cost-performance, model cards, hypothesis database, novelty guard, anti-cheating, taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -371,6 +371,12 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
371
371
|
| `/turing:search <query>` | Natural language experiment search — text + structured filters |
|
|
372
372
|
| `/turing:template <action>` | Experiment template library — save/list/apply reusable configs |
|
|
373
373
|
| `/turing:replay <exp-id>` | Experiment replay — re-run old approach with current infrastructure |
|
|
374
|
+
| `/turing:cite <action>` | Citation & attribution manager — track papers, audit missing citations, generate BibTeX |
|
|
375
|
+
| `/turing:present [--figures]` | Presentation figures — training curves, comparisons, ablation, Pareto, sensitivity |
|
|
376
|
+
| `/turing:changelog [--audience]` | Model changelog — version-grouped improvements for technical or stakeholder audiences |
|
|
377
|
+
| `/turing:onboard [--audience]` | Project onboarding — walkthrough for new collaborators |
|
|
378
|
+
| `/turing:share <exp-ids...>` | Experiment packaging — portable archive with manifest |
|
|
379
|
+
| `/turing:review [--venue]` | Peer review simulation — weaknesses, fix commands, score |
|
|
374
380
|
|
|
375
381
|
And for fully hands-off operation:
|
|
376
382
|
|
|
@@ -555,11 +561,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
555
561
|
|
|
556
562
|
## Architecture of Turing Itself
|
|
557
563
|
|
|
558
|
-
|
|
564
|
+
66 commands, 2 agents, 10 config files, 85 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), feature & training intelligence (feature + curriculum), model surgery (prune + quantize + merge + surgery), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), research communication (cite + present + changelog), collaboration (onboard + share + review), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
559
565
|
|
|
560
566
|
```
|
|
561
567
|
turing/
|
|
562
|
-
├── commands/
|
|
568
|
+
├── commands/ 66 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging + feature & training intelligence + model surgery + experiment archaeology + research communication + collaboration)
|
|
563
569
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
564
570
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
565
571
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: changelog
|
|
3
|
+
description: Model changelog generation — auto-generate human-readable progress narrative from experiment history for stakeholders.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--since exp-id|date] [--audience technical|stakeholder]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Translate experiment logs into a narrative that PMs and stakeholders can read in 2 minutes.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. **Activate environment:** `source .venv/bin/activate`
|
|
13
|
+
2. **Run:** `python scripts/generate_changelog.py $ARGUMENTS`
|
|
14
|
+
3. **Audience:** technical (experiment IDs, configs), stakeholder (plain English, percentages)
|
|
15
|
+
4. **Saved output:** `paper/CHANGELOG.md`
|
|
16
|
+
|
|
17
|
+
## Examples
|
|
18
|
+
```
|
|
19
|
+
/turing:changelog # Full changelog
|
|
20
|
+
/turing:changelog --audience stakeholder # Non-technical summary
|
|
21
|
+
/turing:changelog --since exp-042 # Since specific experiment
|
|
22
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: onboard
|
|
3
|
+
description: Project onboarding — generate a walkthrough for new collaborators. Task, history, decisions, next steps.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--audience researcher|engineer|stakeholder] [--depth brief|full]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Generate a 5-minute read that replaces a 1-hour onboarding meeting.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/generate_onboarding.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `ONBOARDING.md`
|
|
15
|
+
|
|
16
|
+
## Examples
|
|
17
|
+
```
|
|
18
|
+
/turing:onboard
|
|
19
|
+
/turing:onboard --audience engineer --depth brief
|
|
20
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: review
|
|
3
|
+
description: Peer review simulation — generate likely reviewer objections with severity ratings and fix commands.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--venue neurips|icml|general] [--harsh]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Simulate a conference reviewer before you submit. Each weakness links to the command that fixes it.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/simulate_review.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/reviews/`
|
|
15
|
+
|
|
16
|
+
## Examples
|
|
17
|
+
```
|
|
18
|
+
/turing:review
|
|
19
|
+
/turing:review --venue neurips --harsh
|
|
20
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: share
|
|
3
|
+
description: Experiment packaging — portable archive with config, metrics, seed study, annotations, reproduction instructions.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-ids...> [--include model,figures,code]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Package experiments for collaborator handoff or paper supplementary material.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/package_experiments.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `exports/packages/<name>/`
|
|
15
|
+
|
|
16
|
+
## Examples
|
|
17
|
+
```
|
|
18
|
+
/turing:share exp-089
|
|
19
|
+
/turing:share exp-042 exp-089 --include model,figures
|
|
20
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -62,6 +62,12 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
62
62
|
| "quantize", "quantization", "int8", "fp16", "reduce precision", "faster inference" | `/turing:quantize` | Optimize |
|
|
63
63
|
| "merge", "model soup", "merge weights", "average models", "TIES", "DARE" | `/turing:merge` | Compose |
|
|
64
64
|
| "surgery", "architecture", "add layer", "widen", "modify model", "swap activation" | `/turing:surgery` | Modify |
|
|
65
|
+
| "cite", "citation", "bibliography", "bibtex", "attribution", "references" | `/turing:cite` | Record |
|
|
66
|
+
| "present", "figures", "slides", "presentation", "charts", "plots" | `/turing:present` | Document |
|
|
67
|
+
| "changelog", "model changelog", "progress summary", "what improved" | `/turing:changelog` | Document |
|
|
68
|
+
| "onboard", "onboarding", "walkthrough", "new collaborator", "project overview" | `/turing:onboard` | Document |
|
|
69
|
+
| "share", "package", "export experiments", "send results", "portable" | `/turing:share` | Share |
|
|
70
|
+
| "review", "peer review", "reviewer", "simulate review", "weakness" | `/turing:review` | Validate |
|
|
65
71
|
| "trend", "trends", "research direction", "improvement rate", "diminishing returns", "what's working" | `/turing:trend` | Analyze |
|
|
66
72
|
| "flashback", "where was I", "context", "resume", "catch up", "what happened" | `/turing:flashback` | Recall |
|
|
67
73
|
| "archive", "cleanup", "compress old", "disk space", "archive experiments" | `/turing:archive` | Manage |
|
|
@@ -134,6 +140,12 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
134
140
|
| `/turing:search <query>` | Natural language experiment search with structured filters | (inline) |
|
|
135
141
|
| `/turing:template <action>` | Experiment template library: save/list/apply reusable configs across projects | (inline) |
|
|
136
142
|
| `/turing:replay <exp-id>` | Experiment replay: re-run old experiment with current infrastructure | (inline) |
|
|
143
|
+
| `/turing:cite <action>` | Citation manager: add/list/check/bib for papers, datasets, methods | (inline) |
|
|
144
|
+
| `/turing:present [--figures]` | Presentation figures: training curves, comparisons, ablation, Pareto, sensitivity | (inline) |
|
|
145
|
+
| `/turing:changelog [--audience]` | Model changelog: version-grouped improvements for technical or stakeholder audiences | (inline) |
|
|
146
|
+
| `/turing:onboard [--audience]` | Project onboarding: full walkthrough for new collaborators | (inline) |
|
|
147
|
+
| `/turing:share <exp-ids...>` | Experiment packaging: portable archive with manifest and README | (inline) |
|
|
148
|
+
| `/turing:review [--venue]` | Peer review simulation: weaknesses, questions, fix commands, score | (inline) |
|
|
137
149
|
|
|
138
150
|
## Proactive Detection
|
|
139
151
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "4.1.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -34,6 +34,8 @@ const SUB_COMMANDS = [
|
|
|
34
34
|
"feature", "curriculum",
|
|
35
35
|
"prune", "quantize", "merge", "surgery",
|
|
36
36
|
"trend", "flashback", "archive", "annotate", "search", "template", "replay",
|
|
37
|
+
"cite", "present", "changelog",
|
|
38
|
+
"onboard", "share", "review",
|
|
37
39
|
];
|
|
38
40
|
|
|
39
41
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
@@ -74,6 +74,12 @@ const EXPECTED_COMMANDS = [
|
|
|
74
74
|
"search/SKILL.md",
|
|
75
75
|
"template/SKILL.md",
|
|
76
76
|
"replay/SKILL.md",
|
|
77
|
+
"cite/SKILL.md",
|
|
78
|
+
"present/SKILL.md",
|
|
79
|
+
"changelog/SKILL.md",
|
|
80
|
+
"onboard/SKILL.md",
|
|
81
|
+
"share/SKILL.md",
|
|
82
|
+
"review/SKILL.md",
|
|
77
83
|
];
|
|
78
84
|
|
|
79
85
|
const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Project onboarding generator for new collaborators.
|
|
3
|
+
|
|
4
|
+
Reads config, experiments, annotations, and hypotheses to produce a
|
|
5
|
+
structured walkthrough: task description, what's been tried (grouped
|
|
6
|
+
by family), key decisions, where the project is heading, and how to get started.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/generate_onboarding.py --audience researcher --depth full
|
|
10
|
+
python scripts/generate_onboarding.py --audience stakeholder --depth brief --json
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import yaml
|
|
21
|
+
from scripts.turing_io import load_config, load_experiments, load_hypotheses
|
|
22
|
+
|
|
23
|
+
VALID_AUDIENCES = ["researcher", "engineer", "stakeholder"]
|
|
24
|
+
VALID_DEPTHS = ["brief", "full"]
|
|
25
|
+
DEFAULT_LOG = "experiments/log.jsonl"
|
|
26
|
+
DEFAULT_ANNOTATIONS = "experiments/annotations.yaml"
|
|
27
|
+
DEFAULT_HYPOTHESES = "hypotheses.yaml"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _load_yaml_list(path: str) -> list[dict]:
    """Load a YAML file whose top-level value is a list.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed list, or ``[]`` when the path is not a regular file, the
        file is empty, it cannot be read or parsed, or its top-level value
        is not a list.
    """
    p = Path(path)
    # is_file() also rejects directories, which exists() would let through
    # to open() and crash there.
    if not p.is_file() or p.stat().st_size == 0:
        return []
    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc:
        # This input is optional: warn and carry on rather than abort the run.
        print(f"Warning: could not read {p}: {exc}", file=sys.stderr)
        return []
    return data if isinstance(data, list) else []
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _load_yaml_dir(directory: str, glob: str) -> list[dict]:
    """Load every YAML mapping under ``directory`` whose name matches ``glob``.

    Files are visited in sorted path order. A file that cannot be read,
    decoded, or parsed is skipped, as is any file whose top-level value is
    empty or not a mapping.

    Args:
        directory: Directory to scan (not recursive unless ``glob`` says so).
        glob: Glob pattern passed to ``Path.glob``.

    Returns:
        The parsed mappings, or ``[]`` when ``directory`` is not a directory.
    """
    root = Path(directory)
    if not root.is_dir():
        return []
    items = []
    for f in sorted(root.glob(glob)):
        try:
            # Decode explicitly as UTF-8 instead of relying on the locale default.
            with open(f, encoding="utf-8") as fh:
                d = yaml.safe_load(fh)
        except (yaml.YAMLError, OSError, UnicodeDecodeError):
            # Best effort: one bad file must not hide the others.
            continue
        if d and isinstance(d, dict):
            items.append(d)
    return items
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _family_summary(exps: list[dict], metric: str, lower_is_better: bool) -> dict:
    """Summarise one experiment family.

    Args:
        exps: Experiment records belonging to the family.
        metric: Key looked up in each record's ``metrics`` mapping.
        lower_is_better: Direction used to pick the best value.

    Returns:
        Dict with ``total``, ``kept``, ``keep_rate`` (rounded to 2 places,
        0 for an empty family), ``best_metric`` and ``best_experiment``
        (both ``None`` when no kept experiment reports ``metric``).
    """
    total = len(exps)
    kept = [e for e in exps if e.get("status") == "kept"]
    best_val, best_id = None, None
    for e in kept:
        # "metrics" may be present but null; treat that like a missing mapping.
        val = (e.get("metrics") or {}).get(metric)
        if val is None:
            continue
        if best_val is None:
            improved = True
        elif lower_is_better:
            improved = val < best_val
        else:
            improved = val > best_val
        if improved:
            best_val, best_id = val, e.get("experiment_id")
    return {
        "total": total,
        "kept": len(kept),
        "keep_rate": round(len(kept) / total, 2) if total else 0,
        "best_metric": best_val,
        "best_experiment": best_id,
    }
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _find_best(experiments: list[dict], metric: str, lower_is_better: bool) -> dict | None:
|
|
70
|
+
best, best_val = None, float("inf") if lower_is_better else float("-inf")
|
|
71
|
+
for e in experiments:
|
|
72
|
+
if e.get("status") != "kept":
|
|
73
|
+
continue
|
|
74
|
+
val = e.get("metrics", {}).get(metric)
|
|
75
|
+
if val is not None and ((lower_is_better and val < best_val) or (not lower_is_better and val > best_val)):
|
|
76
|
+
best_val, best = val, e
|
|
77
|
+
return best
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _extract_decisions(packets: list[dict], annotations: list[dict]) -> list[dict]:
    """Collect key decisions from decision packets and tagged annotations.

    A packet counts when its ``action`` is ``promote``, ``abandon`` or
    ``replicate``. An annotation counts when any of its tags (compared
    case-insensitively) is ``decision``, ``key``, ``important`` or
    ``milestone``.

    Args:
        packets: Decision packet mappings.
        annotations: Annotation mappings.

    Returns:
        Entries sorted by ``date`` (first 10 characters, newest first). Each
        has ``type`` ``"decision"`` (with ``action`` and ``reason``) or
        ``"annotation"`` (with ``text``), plus ``experiment`` and ``date``.
    """
    def _day(value) -> str:
        # YAML turns unquoted dates and timestamps into date/datetime objects,
        # which cannot be sliced; go through str() so both forms yield
        # "YYYY-MM-DD".
        return str(value)[:10] if value else ""

    decisions = []
    for pkt in packets:
        if pkt.get("action") in ("promote", "abandon", "replicate"):
            decisions.append({
                "type": "decision",
                "experiment": pkt.get("experiment_id", "?"),
                "action": pkt["action"],
                "reason": pkt.get("reason", ""),
                "date": _day(pkt.get("timestamp")),
            })
    key_tags = {"decision", "key", "important", "milestone"}
    for ann in annotations:
        # "tags" may be present but null; str() tolerates non-string tags.
        tags = {str(t).lower() for t in (ann.get("tags") or [])}
        if tags & key_tags:
            decisions.append({
                "type": "annotation",
                "experiment": ann.get("experiment_id", "?"),
                "text": ann.get("text", ""),
                "date": _day(ann.get("date")),
            })
    decisions.sort(key=lambda d: d.get("date", ""), reverse=True)
    return decisions
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _project_direction(hypotheses: list[dict], experiments: list[dict]) -> dict:
    """Classify the project's current phase and list pending hypotheses.

    The phase is the first match of: ``not_started`` (no experiments),
    ``exhausted`` (nothing queued or promising), ``plateaued`` (at least
    three recent experiments, none kept), ``promising_leads``, and otherwise
    ``active_exploration``. "Recent" means the last five experiments.

    Returns:
        Dict with ``phase``, up to five ``queued`` and three ``promising``
        hypotheses, and the full counts ``n_queued`` / ``n_promising``.
    """
    queued, promising = [], []
    for hyp in hypotheses:
        state = hyp.get("status")
        if state == "queued":
            queued.append(hyp)
        elif state == "promising":
            promising.append(hyp)

    window = experiments[-5:]
    kept_in_window = len([exp for exp in window if exp.get("status") == "kept"])

    def _phase() -> str:
        if not experiments:
            return "not_started"
        if not (queued or promising):
            return "exhausted"
        if len(window) >= 3 and kept_in_window == 0:
            return "plateaued"
        if promising:
            return "promising_leads"
        return "active_exploration"

    return {
        "phase": _phase(),
        "queued": queued[:5],
        "promising": promising[:3],
        "n_queued": len(queued),
        "n_promising": len(promising),
    }
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def format_onboarding_report(config, experiments, families, best, decisions,
                             direction, annotations, seeds, audience, depth,
                             metric, lower_is_better) -> str:
    """Render the onboarding walkthrough as a Markdown string.

    Sections: task overview, what has been tried (overall and per family),
    key decisions, where the project is heading, and how to get started.

    Args:
        config: Project config mapping; ``task_description``, ``data`` and
            ``evaluation`` are read when present.
        experiments: Experiment records (``status``, ``family``,
            ``experiment_id`` are read).
        families: Family name -> summary with ``total``, ``kept``,
            ``keep_rate``, ``best_metric`` and ``best_experiment``.
        best: Champion experiment record, or a falsy value when there is none.
        decisions: Entries with ``type`` ``"decision"`` (``action``,
            ``reason``) or anything else (``text``), plus ``date`` and
            ``experiment``.
        direction: Mapping with ``phase``, ``queued``, ``promising``,
            ``n_queued`` and ``n_promising``.
        annotations: Annotation records; ``experiment_id`` and ``text`` are read.
        seeds: Seed-study records; ``seed_sensitive``, ``experiment_id`` and
            ``cv_percent`` are read.
        audience: ``"researcher"``, ``"engineer"`` or ``"stakeholder"``;
            controls detail level and the getting-started list.
        depth: ``"brief"`` or ``"full"``.
        metric: Name of the primary metric.
        lower_is_better: Direction of the primary metric.

    Returns:
        The report, one Markdown line per list entry, joined with newlines.
    """
    # NOTE(review): the nesting below was reconstructed from a listing without
    # indentation; everything after the experiment count is assumed to sit
    # under the `else` branch — confirm against the original file.
    better = "lower" if lower_is_better else "higher"
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    data_cfg, eval_cfg = config.get("data", {}), config.get("evaluation", {})

    # --- 1. Task overview ---
    lines = ["# Project Onboarding", "", f"*Generated {now} for audience: {audience}*", "", "---", "",
             "## 1. What This Project Does", ""]
    lines.append(f"**Task:** {config.get('task_description', 'N/A')}")
    lines.append(f"**Dataset:** {data_cfg.get('source', 'unknown')}")
    lines.append(f"**Primary metric:** `{metric}` ({better} is better)")
    extra = [m for m in eval_cfg.get("metrics", []) if m != metric]
    if extra:
        extra_md = ", ".join(f"`{m}`" for m in extra)
        lines.append(f"**Additional metrics:** {extra_md}")
    if depth == "full" and audience != "stakeholder":
        sr = data_cfg.get("split_ratios", {})
        if sr:
            # round(), not int(): 0.29 * 100 is 28.999999999999996 in binary
            # floating point and int() would report 28%.
            splits_md = " / ".join(f"{k}: {round(v * 100)}%" for k, v in sr.items())
            lines.append(f"**Data splits:** {splits_md}")
        if data_cfg.get("target_column"):
            lines.append(f"**Target column:** `{data_cfg['target_column']}`")

    # --- 2. Experiment history ---
    lines.extend(["", "## 2. What's Been Tried", ""])
    total = len(experiments)
    kept_n = sum(1 for e in experiments if e.get("status") == "kept")
    if total == 0:
        lines.append("No experiments yet. Start with `/turing:train`.")
    else:
        lines.append(f"**{total} experiments**, **{kept_n} kept** ({round(kept_n/total*100)}% keep rate).")
        lines.append("")
        if best:
            ms = ", ".join(f"{k}={v:.4f}" if isinstance(v, float) else f"{k}={v}"
                           for k, v in best.get("metrics", {}).items())
            lines.extend([f"**Champion:** `{best.get('experiment_id','?')}` "
                          f"({best.get('config',{}).get('model_type','?')}) — {ms}", ""])
        lines.extend(["### By Family", "", "| Family | Exps | Kept | Best | Status |",
                      "|--------|------|------|------|--------|"])
        for name, s in sorted(families.items()):
            bv = f"{s['best_metric']:.4f}" if s["best_metric"] is not None else "---"
            if s["keep_rate"] == 0 and s["total"] >= 3:
                st = "Exhausted"
            elif s["keep_rate"] >= 0.5:
                st = "Productive"
            else:
                st = "Mixed"
            lines.append(f"| {name} | {s['total']} | {s['kept']} | {bv} | {st} |")
        lines.append("")
        if depth == "full" and audience in ("researcher", "engineer"):
            for name, s in sorted(families.items()):
                if not s["total"]:
                    continue
                lines.append(f"#### {name}")
                if s["best_experiment"]:
                    lines.append(f"- Best: `{s['best_experiment']}` ({metric}={s['best_metric']:.4f})")
                lines.append(f"- {s['kept']}/{s['total']} kept ({s['keep_rate']:.0%})")
                fam_ids = {e.get("experiment_id") for e in experiments
                           if (e.get("family") or "untagged") == name}
                notes = [a for a in annotations if a.get("experiment_id") in fam_ids]
                for n in notes[:3]:
                    # Two-space indent nests the note under the bullet above.
                    lines.append(f"  - {n.get('text','')[:80]}")
                lines.append("")

    # --- 3. Decisions ---
    lines.extend(["## 3. Key Decisions", ""])
    if not decisions:
        lines.append("No major decisions recorded yet.")
    else:
        lim = 5 if depth == "brief" else 15
        for dec in decisions[:lim]:
            if dec["type"] == "decision":
                lines.append(f"- **{dec['date']}** `{dec['experiment']}`: **{dec['action']}** — {dec['reason']}")
            else:
                lines.append(f"- **{dec['date']}** `{dec['experiment']}`: {dec['text'][:100]}")
        if len(decisions) > lim:
            lines.append(f"  *...and {len(decisions)-lim} more*")

    # --- 4. Direction ---
    lines.extend(["", "## 4. Where We're Heading", ""])
    phases = {"not_started": "Project has not started experiments yet.",
              "exhausted": "All hypotheses tested. Need fresh ideas.",
              "plateaued": "Recent experiments not improving. Consider pivoting.",
              "promising_leads": "Promising directions identified and being pursued.",
              "active_exploration": "Actively exploring hypothesis space."}
    lines.extend([phases.get(direction["phase"], "Unknown phase."), ""])
    if direction["n_queued"]:
        lines.append(f"**{direction['n_queued']} hypotheses queued:**")
        for h in direction["queued"]:
            p = " (HIGH)" if h.get("priority") == "high" else ""
            lines.append(f"- {h.get('id','?')}: {h.get('description','?')}{p}")
        lines.append("")
    if direction["n_promising"]:
        lines.append(f"**{direction['n_promising']} promising lead(s):**")
        for h in direction["promising"]:
            lines.append(f"- {h.get('id','?')}: {h.get('description','?')}")
        lines.append("")
    sensitive = [s for s in seeds if s.get("seed_sensitive")]
    if sensitive and audience != "stakeholder":
        lines.append("**Seed sensitivity warnings:**")
        for s in sensitive:
            # cv_percent may be present but null; format it as 0 instead of crashing.
            cv = s.get("cv_percent") or 0
            lines.append(f"- `{s.get('experiment_id','?')}`: CV={cv:.1f}%")
        lines.append("")

    # --- 5. Getting started ---
    lines.extend(["## 5. How to Get Started", ""])
    cmds = {"researcher": [
        "1. Read `config.yaml` for task and evaluation setup",
        "2. `/turing:status` — current experiment state",
        "3. `/turing:brief` — full research intelligence report",
        "4. Review `hypotheses.yaml` for queued ideas",
        "5. `/turing:try \"your hypothesis\"` — inject ideas",
        "6. `/turing:train` — run next experiment",
    ], "engineer": [
        "1. `pip install -r requirements.txt`",
        "2. Review `config.yaml` for data paths",
        "3. `/turing:status` — where things stand",
        "4. Check `train.py` for current model",
        "5. `/turing:train` — execute experiments",
    ], "stakeholder": [
        "1. `/turing:brief` — high-level summary",
        "2. Check champion performance above",
        "3. Review 'Where We're Heading' for next steps",
    ]}
    lines.extend(cmds.get(audience, []))
    lines.extend(["", "---", f"*Generated by `/turing:onboard` — {audience}, {depth}*"])
    return "\n".join(lines)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def save_onboarding_report(content: str, path: str = "ONBOARDING.md") -> Path:
    """Write the report to ``path``, creating parent directories as needed.

    Args:
        content: Report text.
        path: Destination file; an existing file is overwritten.

    Returns:
        The destination as a ``Path``.
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    # The report contains non-ASCII characters (em dashes); write UTF-8
    # explicitly rather than whatever the locale default happens to be.
    p.write_text(content, encoding="utf-8")
    return p
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def generate_onboarding(config_path="config.yaml", log_path=DEFAULT_LOG,
                        hypotheses_path=DEFAULT_HYPOTHESES,
                        annotations_path=DEFAULT_ANNOTATIONS,
                        audience="researcher", depth="full") -> dict:
    """Build the onboarding report and its metadata.

    Loads config, experiment log, hypotheses and annotations from the given
    paths, plus decision packets and seed studies from the fixed
    ``experiments/decisions`` and ``experiments/seed_studies`` directories,
    then renders the Markdown report.

    Returns:
        Dict with ``timestamp`` (UTC ISO-8601), ``audience``, ``depth``,
        ``total_experiments``, ``project_phase`` and ``report``.
    """
    config = load_config(config_path)
    eval_cfg_primary = config.get("evaluation", {})
    metric = eval_cfg_primary.get("primary_metric", "accuracy")
    eval_cfg_direction = config.get("evaluation", {})
    lower = eval_cfg_direction.get("lower_is_better", False)

    experiments = load_experiments(log_path)
    hypotheses = load_hypotheses(hypotheses_path)
    annotations = _load_yaml_list(annotations_path)
    packets = _load_yaml_dir("experiments/decisions", "*.yaml")
    seeds = _load_yaml_dir("experiments/seed_studies", "*-seeds.yaml")

    # Bucket experiments by family; records without one go under "untagged".
    by_family = {}
    for record in experiments:
        family_name = record.get("family") or "untagged"
        if family_name not in by_family:
            by_family[family_name] = []
        by_family[family_name].append(record)

    families = {}
    for family_name, members in by_family.items():
        families[family_name] = _family_summary(members, metric, lower)

    best = _find_best(experiments, metric, lower)
    decisions = _extract_decisions(packets, annotations)
    direction = _project_direction(hypotheses, experiments)
    report = format_onboarding_report(
        config, experiments, families, best, decisions,
        direction, annotations, seeds, audience, depth, metric, lower,
    )

    result = {"timestamp": datetime.now(timezone.utc).isoformat()}
    result["audience"] = audience
    result["depth"] = depth
    result["total_experiments"] = len(experiments)
    result["project_phase"] = direction["phase"]
    result["report"] = report
    return result
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def main() -> None:
    """Command-line entry point.

    With ``--json`` the full result dict is printed as JSON and nothing is
    written to disk. Otherwise the report is saved to ``--output``, echoed
    to stdout, and the saved path is reported on stderr.
    """
    parser = argparse.ArgumentParser(description="Generate project onboarding for new collaborators")
    path_options = (
        ("--config", "config.yaml"),
        ("--log", DEFAULT_LOG),
        ("--hypotheses", DEFAULT_HYPOTHESES),
        ("--annotations", DEFAULT_ANNOTATIONS),
    )
    for flag, default_value in path_options:
        parser.add_argument(flag, default=default_value)
    parser.add_argument("--audience", default="researcher", choices=VALID_AUDIENCES)
    parser.add_argument("--depth", default="full", choices=VALID_DEPTHS)
    parser.add_argument("--output", default="ONBOARDING.md")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    result = generate_onboarding(
        config_path=args.config,
        log_path=args.log,
        hypotheses_path=args.hypotheses,
        annotations_path=args.annotations,
        audience=args.audience,
        depth=args.depth,
    )

    if args.json:
        print(json.dumps(result, indent=2, default=str))
        return

    report_text = result["report"]
    saved = save_onboarding_report(report_text, args.output)
    print(report_text)
    print(f"\nSaved to {saved}", file=sys.stderr)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
if __name__ == "__main__":
|
|
284
|
+
main()
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Package experiments into portable archive for sharing.
|
|
3
|
+
|
|
4
|
+
Collects config, metrics, seed studies, annotations, decision packets
|
|
5
|
+
per experiment. Generates manifest.yaml and README.md inside the
|
|
6
|
+
package directory. Does NOT create tar.gz -- just organizes files.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python scripts/package_experiments.py
|
|
10
|
+
python scripts/package_experiments.py --experiments exp-042,exp-043
|
|
11
|
+
python scripts/package_experiments.py --include model,data-hash,figures,code
|
|
12
|
+
python scripts/package_experiments.py --json
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
import shutil
|
|
20
|
+
import sys
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
import yaml
|
|
25
|
+
from scripts.turing_io import load_config, load_experiments
|
|
26
|
+
|
|
27
|
+
DEFAULT_LOG = "experiments/log.jsonl"
|
|
28
|
+
DEFAULT_OUTPUT = "exports/packages"
|
|
29
|
+
VALID_INCLUDES = ["model", "data-hash", "figures", "code"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _load_yaml_list(path: str) -> list[dict]:
    """Load a YAML file whose top-level value is a list.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed list, or ``[]`` when the path is not a regular file, the
        file is empty, it cannot be read or parsed, or its top-level value
        is not a list.
    """
    p = Path(path)
    # is_file() also rejects directories, which exists() would let through
    # to open() and crash there.
    if not p.is_file() or p.stat().st_size == 0:
        return []
    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc:
        # This input is optional: warn and carry on rather than abort the run.
        print(f"Warning: could not read {p}: {exc}", file=sys.stderr)
        return []
    return data if isinstance(data, list) else []
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _load_yaml_file(path: Path) -> dict | None:
|
|
42
|
+
if not path.exists():
|
|
43
|
+
return None
|
|
44
|
+
try:
|
|
45
|
+
with open(path) as f:
|
|
46
|
+
d = yaml.safe_load(f)
|
|
47
|
+
return d if isinstance(d, dict) else None
|
|
48
|
+
except (yaml.YAMLError, OSError):
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _file_hash(filepath: str) -> str | None:
|
|
53
|
+
p = Path(filepath)
|
|
54
|
+
if not p.exists():
|
|
55
|
+
return None
|
|
56
|
+
h = hashlib.sha256()
|
|
57
|
+
with open(p, "rb") as f:
|
|
58
|
+
for chunk in iter(lambda: f.read(8192), b""):
|
|
59
|
+
h.update(chunk)
|
|
60
|
+
return h.hexdigest()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def collect_experiment_artifacts(exp: dict, includes: list[str]) -> dict:
    """Collect all artifacts for a single experiment.

    Combines the experiment's log record with any side files found under
    ``experiments/`` (seed study, decision, ablation, reproduction), all
    resolved relative to the current working directory.

    Args:
        exp: One experiment record from the log.
        includes: Optional extras to gather: "model", "data-hash",
            "figures" and/or "code".

    Returns:
        A dict with the core fields (experiment_id, status, metrics, config,
        description, timestamp, family) plus whichever optional sections
        were found: seed_study, decision, ablation, reproduction,
        model_path, data_hash, figures, train_py_hash, code_snapshot_path.
    """
    eid = exp.get("experiment_id", "unknown")
    art: dict = {
        "experiment_id": eid,
        "status": exp.get("status", "unknown"),
        "metrics": exp.get("metrics", {}),
        "config": exp.get("config", {}),
        "description": exp.get("description", ""),
        "timestamp": exp.get("timestamp", ""),
        "family": exp.get("family"),
    }

    # Seed study
    seed = _load_yaml_file(Path(f"experiments/seed_studies/{eid}-seeds.yaml"))
    if seed:
        art["seed_study"] = {
            "mean": seed.get("mean"),
            "std": seed.get("std"),
            "cv_percent": seed.get("cv_percent"),
            "seed_sensitive": seed.get("seed_sensitive", False),
        }

    # Decision packet
    dec = _load_yaml_file(Path(f"experiments/decisions/{eid}-decision.yaml"))
    if dec:
        art["decision"] = {"action": dec.get("action"), "reason": dec.get("reason", "")}

    # Ablation
    abl = _load_yaml_file(Path(f"experiments/ablations/{eid}-ablation.yaml"))
    if abl:
        # `results: null` loads as None, which len() cannot take.
        art["ablation"] = {
            "metric": abl.get("metric"),
            "n_ablations": len(abl.get("results") or []),
        }

    # Reproduction
    repro = _load_yaml_file(Path(f"experiments/reproductions/{eid}-repro.yaml"))
    if repro:
        art["reproduction"] = {"verdict": repro.get("verdict"), "reason": repro.get("reason", "")}

    # Optional includes
    if "model" in includes:
        for pat in (f"models/{eid}", f"models/{eid}.*", f"checkpoints/{eid}/*"):
            # Sort so the chosen path does not depend on directory order.
            matches = sorted(Path(".").glob(pat))
            if matches:
                art["model_path"] = str(matches[0])
                break

    if "data-hash" in includes:
        # Both levels may be null or non-mappings in a hand-edited log.
        exp_cfg = exp.get("config")
        data_cfg = exp_cfg.get("data") if isinstance(exp_cfg, dict) else None
        dp = data_cfg.get("path") if isinstance(data_cfg, dict) else None
        if dp:
            h = _file_hash(dp)
            if h:
                art["data_hash"] = h

    if "figures" in includes:
        fig_dir = Path(f"experiments/figures/{eid}")
        if fig_dir.exists():
            art["figures"] = sorted(str(f) for f in fig_dir.glob("*") if f.is_file())
        else:
            art["figures"] = []

    if "code" in includes:
        # Hash of train.py as it is now; None when the file is absent.
        art["train_py_hash"] = _file_hash("train.py")
        snap = Path(f"experiments/code/{eid}")
        if snap.exists():
            art["code_snapshot_path"] = str(snap)

    return art
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def build_manifest(name: str, config: dict, artifacts: list[dict], includes: list[str]) -> dict:
    """Build the machine-readable manifest for a package.

    Args:
        name: Package name recorded in the manifest.
        config: Project configuration; ``task_description`` and the
            ``evaluation`` section are read from it.
        artifacts: Per-experiment artifact dicts; each must carry
            ``experiment_id``, ``status`` and ``metrics``.
        includes: Optional extras that were requested for the package.

    Returns:
        A dict with ``package``, ``project``, ``contents`` and
        ``experiments`` sections. ``package.created`` is the current UTC
        time in ISO-8601 form.
    """
    # `evaluation: null` loads as None; treat any non-mapping as absent so
    # the documented defaults still apply.
    eval_cfg = config.get("evaluation")
    if not isinstance(eval_cfg, dict):
        eval_cfg = {}
    return {
        "package": {
            "name": name,
            "created": datetime.now(timezone.utc).isoformat(),
            "generator": "turing:share",
            "version": "1.0",
        },
        "project": {
            "task": config.get("task_description", ""),
            "primary_metric": eval_cfg.get("primary_metric", "accuracy"),
            "lower_is_better": eval_cfg.get("lower_is_better", False),
        },
        "contents": {
            "experiments": len(artifacts),
            # Copy so later mutation of the caller's list cannot alter the manifest.
            "includes": list(includes or []),
            "has_seed_studies": any(a.get("seed_study") for a in artifacts),
            "has_decisions": any(a.get("decision") for a in artifacts),
        },
        "experiments": [
            {
                "id": a["experiment_id"],
                "status": a["status"],
                "family": a.get("family"),
                "metrics": a["metrics"],
            }
            for a in artifacts
        ],
    }
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def build_package_readme(config: dict, artifacts: list[dict], manifest: dict) -> str:
    """Render the package README as Markdown.

    Args:
        config: Project configuration; only ``task_description`` is read.
        artifacts: Per-experiment artifact dicts.
        manifest: Package manifest; ``package.name``, ``package.created``,
            ``project.primary_metric`` and ``project.lower_is_better`` are
            read from it.

    Returns:
        The README text. Missing or non-numeric values are rendered as
        ``---`` instead of raising.
    """

    def _num(value) -> str:
        # Seed-study and metric values may be absent or null in the source files.
        return f"{value:.4f}" if isinstance(value, (int, float)) else "---"

    metric = manifest["project"]["primary_metric"]
    direction = "lower" if manifest["project"]["lower_is_better"] else "higher"
    lines = [
        f"# Experiment Package: {manifest['package']['name']}",
        "",
        # First 19 characters of the ISO timestamp: date and time to the second.
        f"*Packaged {manifest['package']['created'][:19]} UTC*",
        "",
        "## Project",
        "",
        f"- **Task:** {config.get('task_description', 'N/A')}",
        f"- **Primary metric:** `{metric}` ({direction} is better)",
        "",
        "## Experiments",
        "",
        f"| ID | Status | Family | {metric} |",
        f"|----|--------|--------|{'---' * max(len(metric) // 3, 1)}--|",
    ]

    for a in artifacts:
        metrics = a.get("metrics")
        value = metrics.get(metric) if isinstance(metrics, dict) else None
        # The "family" key is always present but may be None, so a .get()
        # default would never apply; fall back explicitly.
        family = a.get("family") or "---"
        lines.append(f"| {a['experiment_id']} | {a['status']} | {family} | {_num(value)} |")

    seeds = [a for a in artifacts if a.get("seed_study")]
    if seeds:
        lines.extend(["", "## Seed Studies", ""])
        for a in seeds:
            s = a["seed_study"]
            tag = "SEED-SENSITIVE" if s.get("seed_sensitive") else "stable"
            lines.append(
                f"- `{a['experiment_id']}`: mean={_num(s.get('mean'))} "
                f"+/- {_num(s.get('std'))} [{tag}]"
            )

    decs = [a for a in artifacts if a.get("decision")]
    if decs:
        lines.extend(["", "## Decisions", ""])
        for a in decs:
            dec = a["decision"]
            lines.append(f"- `{a['experiment_id']}`: **{dec.get('action')}** — {dec.get('reason', '')}")

    lines.extend([
        "",
        "## Files",
        "",
        "- `manifest.yaml` — Machine-readable manifest",
        "- `README.md` — This file",
        "- `experiments/` — Per-experiment artifacts",
        "- `config.yaml` — Project config snapshot",
        "",
        "---",
        "*Generated by `/turing:share`*",
    ])
    return "\n".join(lines)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def write_package(pkg_dir: Path, config: dict, artifacts: list[dict],
                  manifest: dict, readme: str, includes: list[str]) -> None:
    """Write all package files to the directory.

    Creates ``manifest.yaml``, ``README.md`` and ``config.yaml`` in
    ``pkg_dir``, then one ``experiments/<id>/artifact.yaml`` per artifact.
    When requested via ``includes``, also copies figures, the code snapshot
    and the model next to the artifact file.

    Args:
        pkg_dir: Destination directory; created if missing.
        config: Project configuration to snapshot.
        artifacts: Per-experiment artifact dicts.
        manifest: Manifest dict to serialise.
        readme: README text to write.
        includes: Optional extras to copy ("figures", "code", "model").
    """
    pkg_dir.mkdir(parents=True, exist_ok=True)
    # The README contains em dashes, so pin UTF-8 instead of relying on the
    # platform's locale encoding.
    with open(pkg_dir / "manifest.yaml", "w", encoding="utf-8") as f:
        yaml.dump(manifest, f, default_flow_style=False, sort_keys=False)
    (pkg_dir / "README.md").write_text(readme, encoding="utf-8")
    with open(pkg_dir / "config.yaml", "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    exp_dir = pkg_dir / "experiments"
    exp_dir.mkdir(exist_ok=True)
    for art in artifacts:
        # str() tolerates numeric ids; parents=True tolerates ids that
        # contain a path separator.
        sub = exp_dir / str(art["experiment_id"])
        sub.mkdir(parents=True, exist_ok=True)
        with open(sub / "artifact.yaml", "w", encoding="utf-8") as f:
            yaml.dump(art, f, default_flow_style=False, sort_keys=False)

        if "figures" in includes and art.get("figures"):
            fd = sub / "figures"
            fd.mkdir(exist_ok=True)
            for fp in art["figures"]:
                src = Path(fp)
                # A figure may have been removed since it was collected.
                if src.exists():
                    shutil.copy2(src, fd / src.name)

        if "code" in includes and art.get("code_snapshot_path"):
            cs = Path(art["code_snapshot_path"])
            if cs.exists() and cs.is_dir():
                shutil.copytree(cs, sub / "code", dirs_exist_ok=True)

        if "model" in includes and art.get("model_path"):
            ms = Path(art["model_path"])
            if ms.exists():
                md = sub / "model"
                md.mkdir(exist_ok=True)
                if ms.is_dir():
                    shutil.copytree(ms, md, dirs_exist_ok=True)
                else:
                    shutil.copy2(ms, md / ms.name)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def save_package_report(result: dict, pkg_dir: Path) -> Path:
    """Persist a short YAML summary of a packaging run inside the package.

    Args:
        result: Packaging result; ``timestamp``, ``package_name``,
            ``package_dir``, ``experiments_packaged`` and ``includes`` are
            read from it.
        pkg_dir: Package directory that receives ``package-report.yaml``.

    Returns:
        Path of the written report file.
    """
    report_path = pkg_dir / "package-report.yaml"
    report_keys = ("timestamp", "package_name", "package_dir",
                   "experiments_packaged", "includes")
    with open(report_path, "w") as handle:
        summary = {key: result[key] for key in report_keys}
        # The directory may arrive as a Path; store it as plain text.
        summary["package_dir"] = str(summary["package_dir"])
        yaml.dump(summary, handle, default_flow_style=False, sort_keys=False)
    return report_path
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def format_package_report(result: dict) -> str:
    """Render a packaging result as a Markdown summary.

    Args:
        result: Packaging result with ``package_name``, ``package_dir``,
            ``experiments_packaged``, ``includes`` and, optionally,
            ``artifacts``.

    Returns:
        Markdown text: a header block followed by one bullet per artifact,
        tagged with whichever optional sections that artifact carries.
    """
    out = ["# Package Summary", ""]
    out.append(f"- **Package:** {result['package_name']}")
    out.append(f"- **Location:** `{result['package_dir']}`")
    out.append(f"- **Experiments:** {result['experiments_packaged']}")
    included = ", ".join(result["includes"])
    if not included:
        included = "metrics only"
    out.append(f"- **Includes:** {included}")
    out += ["", "## Contents", ""]

    optional_sections = ("seed_study", "decision", "ablation", "reproduction")
    for artifact in result.get("artifacts", []):
        present = [key for key in optional_sections if artifact.get(key)]
        suffix = ""
        if present:
            suffix = f" [{', '.join(present)}]"
        out.append(f"- `{artifact['experiment_id']}` ({artifact['status']}){suffix}")
    return "\n".join(out)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def package_experiments(experiment_ids=None, includes=None, config_path="config.yaml",
                        log_path=DEFAULT_LOG, output_dir=DEFAULT_OUTPUT) -> dict:
    """Package experiments into a portable directory.

    Args:
        experiment_ids: IDs to package. When falsy, every experiment whose
            status is "kept" is packaged instead.
        includes: Optional extras forwarded to artifact collection.
        config_path: Project config file handed to ``load_config``.
        log_path: Experiment log handed to ``load_experiments``.
        output_dir: Parent directory for the new package directory.

    Returns:
        On success, a dict with ``timestamp``, ``package_name``,
        ``package_dir``, ``experiments_packaged``, ``includes``,
        ``artifacts`` and ``missing_experiments`` (requested IDs that were
        not found in the log; empty when nothing was requested or all were
        found). On failure, a dict holding only an ``error`` message.
    """
    includes = includes or []
    config = load_config(config_path)
    experiments = load_experiments(log_path)
    annotations = _load_yaml_list("experiments/annotations.yaml")

    missing = []
    if experiment_ids:
        selected = [e for e in experiments if e.get("experiment_id") in experiment_ids]
        if not selected:
            return {"error": f"No matching experiments for: {experiment_ids}"}
        # A partial match used to be dropped silently; report it to the caller.
        found = [e.get("experiment_id") for e in selected]
        missing = [wanted for wanted in experiment_ids if wanted not in found]
    else:
        selected = [e for e in experiments if e.get("status") == "kept"]
        if not selected:
            return {"error": "No kept experiments to package."}

    artifacts = [collect_experiment_artifacts(e, includes) for e in selected]
    for art in artifacts:
        eid = art["experiment_id"]
        anns = [a for a in annotations if a.get("experiment_id") == eid]
        if anns:
            art["annotations"] = anns

    # One clock read so the directory name and the reported timestamp agree.
    now = datetime.now(timezone.utc)
    pkg_name = f"package-{len(artifacts)}exp-{now.strftime('%Y%m%d-%H%M%S')}"
    pkg_dir = Path(output_dir) / pkg_name
    manifest = build_manifest(pkg_name, config, artifacts, includes)
    readme = build_package_readme(config, artifacts, manifest)
    write_package(pkg_dir, config, artifacts, manifest, readme, includes)

    result = {
        "timestamp": now.isoformat(),
        "package_name": pkg_name,
        "package_dir": str(pkg_dir),
        "experiments_packaged": len(artifacts),
        "includes": includes,
        "artifacts": artifacts,
        "missing_experiments": missing,
    }
    save_package_report(result, pkg_dir)
    return result
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def main() -> None:
    """Command-line entry point: parse arguments, build the package, report.

    The summary goes to stdout (Markdown, or JSON with ``--json``);
    diagnostics and the final package location go to stderr. Exits with
    status 1 when an ``--include`` value is invalid, when ``--experiments``
    holds no usable ID, or when packaging reports an error.
    """
    parser = argparse.ArgumentParser(description="Package experiments into portable archive")
    parser.add_argument("--experiments", default=None, help="Comma-separated experiment IDs")
    parser.add_argument("--include", default=None, help="Extras: model,data-hash,figures,code")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--log", default=DEFAULT_LOG)
    parser.add_argument("--output", default=DEFAULT_OUTPUT)
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    exp_ids = None
    if args.experiments:
        # Drop blank tokens so a trailing or doubled comma is harmless.
        exp_ids = [tok.strip() for tok in args.experiments.split(",") if tok.strip()]
        if not exp_ids:
            print("ERROR: --experiments contains no experiment IDs", file=sys.stderr)
            sys.exit(1)

    includes = []
    if args.include:
        includes = [tok.strip() for tok in args.include.split(",") if tok.strip()]
        bad = [i for i in includes if i not in VALID_INCLUDES]
        if bad:
            print(f"ERROR: Invalid include(s): {bad}. Valid: {VALID_INCLUDES}", file=sys.stderr)
            sys.exit(1)

    result = package_experiments(exp_ids, includes, args.config, args.log, args.output)
    if "error" in result:
        print(f"ERROR: {result['error']}", file=sys.stderr)
        sys.exit(1)

    if exp_ids:
        # Tell the user when only some of the requested IDs were packaged.
        packaged = [a.get("experiment_id") for a in result.get("artifacts", [])]
        missing = [wanted for wanted in exp_ids if wanted not in packaged]
        if missing:
            print(f"WARNING: Requested experiment(s) not found: {missing}", file=sys.stderr)

    if args.json:
        # default=str keeps non-JSON values (for example Paths) printable.
        print(json.dumps(result, indent=2, default=str))
    else:
        print(format_package_report(result))
    print(f"\nPackage saved to: {result['package_dir']}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
|
@@ -137,6 +137,12 @@ TEMPLATE_DIRS = {
|
|
|
137
137
|
"experiment_search.py",
|
|
138
138
|
"experiment_templates.py",
|
|
139
139
|
"experiment_replay.py",
|
|
140
|
+
"citation_manager.py",
|
|
141
|
+
"generate_figures.py",
|
|
142
|
+
"generate_changelog.py",
|
|
143
|
+
"generate_onboarding.py",
|
|
144
|
+
"package_experiments.py",
|
|
145
|
+
"simulate_review.py",
|
|
140
146
|
],
|
|
141
147
|
"tests": ["__init__.py", "conftest.py"],
|
|
142
148
|
}
|
|
@@ -184,6 +190,10 @@ DIRECTORIES_TO_CREATE = [
|
|
|
184
190
|
"experiments/archive",
|
|
185
191
|
"experiments/searches",
|
|
186
192
|
"experiments/replays",
|
|
193
|
+
"experiments/citations",
|
|
194
|
+
"paper/figures",
|
|
195
|
+
"exports/packages",
|
|
196
|
+
"experiments/reviews",
|
|
187
197
|
"experiments/logs",
|
|
188
198
|
"models/best",
|
|
189
199
|
"models/archive",
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Simulated peer review for ML experiment campaigns.
|
|
3
|
+
|
|
4
|
+
Checks for: missing baselines, missing error bars, missing ablation,
|
|
5
|
+
overclaimed results, missing SOTA comparison, calibration, computational
|
|
6
|
+
cost. Generates structured review with strengths/weaknesses/questions,
|
|
7
|
+
each weakness linked to a /turing: fix command. Scores 1-10.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python scripts/simulate_review.py
|
|
11
|
+
python scripts/simulate_review.py --venue neurips --harsh
|
|
12
|
+
python scripts/simulate_review.py --venue icml --json
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import yaml
|
|
23
|
+
from scripts.turing_io import load_config, load_experiments
|
|
24
|
+
|
|
25
|
+
# Experiment log read when --log is not given.
DEFAULT_LOG = "experiments/log.jsonl"
# Values accepted by --venue.
VALID_VENUES = ["neurips", "icml", "general"]
# Points subtracted from the review score per weakness of each severity.
SEVERITY_WEIGHTS = {"critical": 3.0, "major": 2.0, "minor": 1.0, "nitpick": 0.3}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _load_yaml_dir(directory: str, glob: str) -> list[dict]:
    """Load every matching YAML file in a directory that parses to a mapping.

    Args:
        directory: Directory to scan.
        glob: Glob pattern, relative to ``directory``, selecting the files.

    Returns:
        The non-empty mappings, ordered by sorted file path. Files that
        cannot be opened or parsed, or that hold anything else, are skipped.
        A missing directory yields an empty list.
    """
    root = Path(directory)
    loaded = []
    if not root.exists():
        return loaded
    for yaml_path in sorted(root.glob(glob)):
        try:
            with open(yaml_path) as stream:
                doc = yaml.safe_load(stream)
        except (yaml.YAMLError, OSError):
            continue
        if isinstance(doc, dict) and doc:
            loaded.append(doc)
    return loaded
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _load_yaml_list(path: str) -> list[dict]:
    """Load a YAML file that is expected to hold a top-level list of mappings.

    Args:
        path: Location of the YAML file.

    Returns:
        The mapping entries of the top-level list, in file order. An empty
        list is returned when the path is not a regular file, the file is
        empty, unreadable or malformed, or its top level is not a list.
    """
    p = Path(path)
    try:
        if not p.is_file() or p.stat().st_size == 0:
            return []
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except (yaml.YAMLError, OSError, UnicodeDecodeError):
        # Best-effort load, matching _load_yaml_dir: a broken annotations
        # file must not abort the whole review.
        return []
    if not isinstance(data, list):
        return []
    # The review checks invoke .get() on every entry, so keep mappings only.
    return [item for item in data if isinstance(item, dict)]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _w(id, sev, title, detail, fix, venues=None):
    """Build a weakness record for the review.

    Args:
        id: Stable identifier of the weakness.
        sev: Severity label.
        title: One-line headline.
        detail: Longer explanation shown in the report.
        fix: Command suggested to address the weakness.
        venues: Venues the weakness applies to; when falsy, it applies to
            "neurips", "icml" and "general".

    Returns:
        A dict with keys id, severity, title, detail, fix_command and
        venue_relevance.
    """
    if not venues:
        venues = ["neurips", "icml", "general"]
    weakness = {
        "id": id,
        "severity": sev,
        "title": title,
        "detail": detail,
        "fix_command": fix,
    }
    weakness["venue_relevance"] = venues
    return weakness
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# --- Review checks ---
|
|
62
|
+
|
|
63
|
+
def check_baselines(experiments, config):
    """Flag a campaign whose kept experiments include no simple baseline.

    Args:
        experiments: Experiment records; only those with status "kept" count.
        config: Project configuration (not consulted by this check).

    Returns:
        A "missing-baselines" weakness when at least one kept experiment
        exists and none of their ``config.model_type`` values is a known
        baseline name; otherwise None.
    """
    simple_models = {"logistic_regression", "linear_regression", "dummy",
                     "majority_class", "random", "baseline"}
    kept_types = set()
    for exp in experiments:
        if exp.get("status") == "kept":
            kept_types.add(exp.get("config", {}).get("model_type", ""))
    if not kept_types or kept_types & simple_models:
        return None
    return _w("missing-baselines", "major", "No simple baseline comparison",
              f"Model types: {', '.join(sorted(kept_types))}. No simple baseline to calibrate expectations.",
              '/turing:try "Add logistic regression baseline"')
|
|
70
|
+
|
|
71
|
+
def check_error_bars(experiments, seeds):
    """Check that kept experiments are backed by seed studies.

    Args:
        experiments: Experiment records; only those with status "kept" count.
        seeds: Seed-study records, matched on ``experiment_id``.

    Returns:
        A critical "no-error-bars" weakness when no kept experiment has a
        seed study, a minor "partial-error-bars" weakness when only some
        lack one (listing up to five IDs), otherwise None.
    """
    kept = [exp for exp in experiments if exp.get("status") == "kept"]
    if not kept:
        return None
    studied_ids = {study.get("experiment_id") for study in seeds}
    unstudied = [exp for exp in kept if exp.get("experiment_id") not in studied_ids]
    if not unstudied:
        return None
    if len(unstudied) == len(kept):
        return _w("no-error-bars", "critical", "No error bars on any result",
                  f"{len(kept)} kept experiment(s) with no seed studies. Single-seed results not publishable.",
                  "/turing:seed")
    ids = ", ".join(exp.get("experiment_id", "?") for exp in unstudied[:5])
    return _w("partial-error-bars", "minor", "Some experiments lack error bars",
              f"{len(unstudied)}/{len(kept)} lack seed studies: {ids}.", "/turing:seed")
|
|
85
|
+
|
|
86
|
+
def check_ablation(experiments, ablations):
    """Flag a campaign with several kept experiments but no ablation study.

    Args:
        experiments: Experiment records; only those with status "kept" count.
        ablations: Loaded ablation-study records.

    Returns:
        A "no-ablation" weakness when there are at least two kept
        experiments and no ablation records; otherwise None.
    """
    kept_count = sum(1 for exp in experiments if exp.get("status") == "kept")
    if kept_count < 2 or ablations:
        return None
    return _w("no-ablation", "major", "No ablation study",
              "No ablation studies found. Component contributions unclear.", "/turing:ablate")
|
|
90
|
+
|
|
91
|
+
def check_overclaimed(experiments, seeds, metric, lower_is_better):
    """Flag results whose seed studies are marked as seed-sensitive.

    Args:
        experiments: Experiment records (not consulted by this check).
        seeds: Seed-study records; ``seed_sensitive``, ``experiment_id`` and
            ``cv_percent`` are read from each.
        metric: Primary metric name (not consulted by this check).
        lower_is_better: Metric direction (not consulted by this check).

    Returns:
        An "overclaimed-results" weakness listing each sensitive study with
        its coefficient of variation, or None when no study is sensitive.
    """
    sensitive = [s for s in seeds if s.get("seed_sensitive")]
    if not sensitive:
        return None

    def _cv(seed):
        # `cv_percent: null` bypasses a .get() default and would break the
        # numeric format; treat it like a missing key and show 0.
        value = seed.get("cv_percent")
        return value if isinstance(value, (int, float)) else 0

    details = "; ".join(f"{s.get('experiment_id', '?')}: CV={_cv(s):.1f}%" for s in sensitive)
    return _w("overclaimed-results", "major", "Seed-sensitive results may be overclaimed",
              f"{len(sensitive)} experiment(s) show high seed sensitivity: {details}. "
              "Report mean +/- std instead of point estimates.", "/turing:seed")
|
|
99
|
+
|
|
100
|
+
def check_sota(experiments, config, annotations):
    """Check for any reference to external or state-of-the-art results.

    Args:
        experiments: Experiment records (not consulted by this check).
        config: Project configuration; ``evaluation.reference_score`` and
            ``evaluation.sota_score`` are read.
        annotations: Annotation records; ``text`` and ``tags`` are searched
            case-insensitively for benchmark-related keywords.

    Returns:
        None when an annotation mentions a keyword or the config carries a
        truthy reference/SOTA score; otherwise a "no-sota-comparison"
        weakness relevant to the neurips and icml venues.
    """
    keywords = {"sota", "state-of-the-art", "benchmark", "leaderboard", "published"}
    for note in annotations:
        body = note.get("text", "").lower()
        if any(word in body for word in keywords):
            return None
        lowered_tags = [tag.lower() for tag in note.get("tags", [])]
        if any(word in lowered_tags for word in keywords):
            return None

    eval_cfg = config.get("evaluation", {})
    if eval_cfg.get("reference_score") or eval_cfg.get("sota_score"):
        return None

    return _w("no-sota-comparison", "minor", "No SOTA or external benchmark comparison",
              "No reference to published results. Add reference score or annotate with SOTA values.",
              '/turing:try "Add SOTA comparison from literature"', ["neurips", "icml"])
|
|
111
|
+
|
|
112
|
+
def check_calibration(cal_results, experiments):
    """Check whether calibration was assessed and is acceptable.

    Args:
        cal_results: Calibration records; ``ece`` is read from each, with a
            missing value counted as 0.
        experiments: Experiment records; only those with status "kept" count.

    Returns:
        None when there are no kept experiments or all ECE values are at
        most 0.1; a "no-calibration" weakness (neurips/icml only) when no
        calibration records exist; a "poor-calibration" weakness when any
        record has ECE above 0.1.
    """
    kept = [exp for exp in experiments if exp.get("status") == "kept"]
    if not kept:
        return None
    if not cal_results:
        return _w("no-calibration", "minor", "No calibration analysis",
                  "Model calibration not assessed. Reviewers expect ECE or reliability diagrams.",
                  '/turing:try "Add calibration analysis (ECE)"', ["neurips", "icml"])
    badly_calibrated = [rec for rec in cal_results if rec.get("ece", 0) > 0.1]
    if not badly_calibrated:
        return None
    return _w("poor-calibration", "minor", "Model poorly calibrated",
              f"{len(badly_calibrated)} model(s) have ECE > 0.1. Consider temperature scaling.",
              '/turing:try "Apply temperature scaling"')
|
|
125
|
+
|
|
126
|
+
def check_compute_cost(experiments):
    """Check that kept experiments record training time and environment.

    Args:
        experiments: Experiment records; only those with status "kept" count.

    Returns:
        None when there are no kept experiments or when at least one kept
        experiment has ``metrics.train_seconds`` and at least one has a
        truthy ``environment``. Otherwise a "no-compute-cost" weakness,
        major when training time is missing and minor when only the
        environment is missing.
    """
    kept = [exp for exp in experiments if exp.get("status") == "kept"]
    if not kept:
        return None

    time_reported = any(exp.get("metrics", {}).get("train_seconds") is not None for exp in kept)
    env_reported = any(exp.get("environment") for exp in kept)

    gaps = []
    if not time_reported:
        gaps.append("No training time reported")
    if not env_reported:
        gaps.append("No hardware info recorded")
    if not gaps:
        return None

    severity = "minor" if time_reported else "major"
    detail = "; ".join(gaps) + ". " + "Reporting compute cost is expected at all major venues."
    return _w("no-compute-cost", severity, "Computational cost not reported", detail,
              '/turing:try "Profile training and report compute cost"')
|
|
142
|
+
|
|
143
|
+
def check_diversity(experiments):
    """Flag a campaign whose kept experiments all share one model type.

    Args:
        experiments: Experiment records; only those with status "kept" count.

    Returns:
        A "low-diversity" weakness when there are at least three kept
        experiments and exactly one distinct ``config.model_type`` among
        them; otherwise None.
    """
    kept = [exp for exp in experiments if exp.get("status") == "kept"]
    model_types = {exp.get("config", {}).get("model_type", "") for exp in kept}
    kept_n = len(kept)
    if len(model_types) != 1 or kept_n < 3:
        return None
    only_type = next(iter(model_types))
    return _w("low-diversity", "minor", "Only one model family explored",
              f"All {kept_n} kept experiments use {only_type}. Alternatives not explored.",
              '/turing:try "Explore alternative model architecture"')
|
|
150
|
+
|
|
151
|
+
def check_leakage(experiments, annotations):
    """Flag annotations that mention possible data leakage.

    Args:
        experiments: Experiment records (not consulted by this check).
        annotations: Annotation records; ``text`` is searched
            case-insensitively for leakage-related phrases and ``tags`` for
            a "leakage" tag.

    Returns:
        A critical "leakage-concern" weakness when any annotation matches;
        otherwise None.
    """
    import re

    # "leak" must start a word and must not be the start of "LeakyReLU" in
    # any spelling: a plain substring test also fired on "bleak" and on
    # activation-function notes. The other phrases stay substring matches.
    pattern = re.compile(
        r"(?<![a-z])leak(?!y[ _-]?relu)|contamination|suspicious|too high|too good"
    )
    flagged = []
    for a in annotations:
        # text/tags may be null in a hand-edited annotations file.
        text = str(a.get("text") or "").lower()
        tags = [str(t).lower() for t in (a.get("tags") or [])]
        if pattern.search(text) or "leakage" in tags:
            flagged.append(a)
    if flagged:
        return _w("leakage-concern", "critical", "Data leakage flagged in annotations",
                  f"{len(flagged)} annotation(s) mention potential leakage. Must investigate before submission.",
                  '/turing:try "Investigate and rule out data leakage"')
    return None
|
|
159
|
+
|
|
160
|
+
def check_reproducibility(experiments, config):
    """Check basic reproducibility hygiene.

    Args:
        experiments: Experiment records; only their count is used.
        config: Project configuration; ``data.random_state`` is read.

    Returns:
        A "reproducibility-gaps" weakness when the config has no random
        state, or when there are at least five experiments and no YAML file
        exists under ``experiments/reproductions`` (relative to the current
        working directory); otherwise None.
    """
    gaps = []
    if config.get("data", {}).get("random_state") is None:
        gaps.append("No random state in config")

    if len(experiments) >= 5:
        repro_dir = Path("experiments/reproductions")
        has_repro = repro_dir.exists() and any(repro_dir.glob("*.yaml"))
        if not has_repro:
            gaps.append("No reproduction checks run")

    if not gaps:
        return None
    return _w("reproducibility-gaps", "minor", "Reproducibility not fully verified",
              "; ".join(gaps) + ".", "/turing:reproduce")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# --- Strengths & questions ---
|
|
173
|
+
|
|
174
|
+
def identify_strengths(experiments, seeds, ablations, config, metric, lower_is_better):
    """List the strengths a reviewer would credit to the campaign.

    Args:
        experiments: All experiment records.
        seeds: Seed-study records.
        ablations: Ablation-study records.
        config: Project configuration (not consulted here).
        metric: Primary metric name (not consulted here).
        lower_is_better: Metric direction (not consulted here).

    Returns:
        Strength sentences in a fixed order; a single generic sentence when
        none of the criteria is met.
    """
    kept = [exp for exp in experiments if exp.get("status") == "kept"]
    kept_n = len(kept)
    model_types = {exp.get("config", {}).get("model_type", "") for exp in kept}
    found = []

    if kept_n >= 5:
        found.append(
            f"Thorough experimentation: {kept_n} successful experiments across {len(model_types)} type(s)."
        )

    stable_n = sum(1 for study in seeds if not study.get("seed_sensitive"))
    if stable_n:
        found.append(f"Seed studies: {stable_n}/{len(seeds)} experiments show stable results.")

    if ablations:
        found.append(f"Ablation analysis provided ({len(ablations)} study/ies).")

    families = {exp.get("family") for exp in kept if exp.get("family")}
    if len(families) >= 3:
        found.append(f"Systematic exploration of {len(families)} research directions.")

    total = len(experiments)
    # The ratio is only computed once the count check has ruled out zero.
    if total >= 5 and kept_n / total >= 0.4:
        found.append(f"High experiment efficiency: {kept_n/total:.0%} keep rate.")

    if not found:
        return ["Experiments have been initiated on this problem."]
    return found
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def generate_questions(weaknesses, experiments, config, venue):
    """Derive reviewer questions from the detected weaknesses and the venue.

    Args:
        weaknesses: Weakness records; only ``id`` is read.
        experiments: Experiment records (not consulted here).
        config: Project configuration (not consulted here).
        venue: Target venue; "neurips" and "icml" each add one extra question.

    Returns:
        Question strings in a fixed order, each listed at most once.
    """
    wids = {w["id"] for w in weaknesses}
    qmap = {
        "missing-baselines": "How does performance compare to a simple baseline?",
        "no-error-bars": "Can you provide confidence intervals over multiple seeds?",
        "overclaimed-results": "Can you provide confidence intervals over multiple seeds?",
        "no-ablation": "What is the contribution of each component?",
        "no-sota-comparison": "How do results compare to published state-of-the-art?",
        "no-compute-cost": "What are the computational requirements (GPU hours)?",
    }
    questions = []
    for wid, q in qmap.items():
        # Two weaknesses share one question text; ask it only once.
        if wid in wids and q not in questions:
            questions.append(q)
    if venue == "neurips":
        questions.append("What is the broader impact? Are there negative societal implications?")
    elif venue == "icml":
        questions.append("Is there theoretical justification, or is this purely empirical?")
    return questions
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def compute_score(strengths, weaknesses, harsh):
    """Turn strengths and weaknesses into a 1-10 review score.

    Starts from 6.0, adds 0.4 per strength (capped at 2.0), subtracts the
    severity weight of each weakness (1.0 for an unknown severity) and a
    further 1.0 in harsh mode, then rounds and clamps to the range 1..10.

    Args:
        strengths: Strength sentences; only their count matters.
        weaknesses: Weakness records; ``severity`` is read from each.
        harsh: Whether to apply the harsh-mode penalty.

    Returns:
        The integer score.
    """
    strength_bonus = min(len(strengths) * 0.4, 2.0)
    total = 6.0 + strength_bonus
    # Subtract one weakness at a time so float results match step for step.
    for weakness in weaknesses:
        total -= SEVERITY_WEIGHTS.get(weakness["severity"], 1.0)
    if harsh:
        total -= 1.0
    rounded = round(total)
    return max(1, min(10, rounded))
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# --- Formatting ---
|
|
221
|
+
|
|
222
|
+
def format_review_report(strengths, weaknesses, questions, score, venue, harsh,
                         config, experiments, metric):
    """Render the simulated review as a Markdown document.

    Args:
        strengths: Strength sentences.
        weaknesses: Weakness records (severity, title, detail, fix_command).
        questions: Questions for the authors.
        score: Integer review score; mapped to a verdict label.
        venue: Venue name, shown upper-cased.
        harsh: Whether the review ran in harsh mode.
        config: Project configuration; ``task_description`` is read.
        experiments: All experiment records; kept ones are summarised.
        metric: Primary metric name.

    Returns:
        The Markdown report, stamped with the current UTC time.
    """
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    kept = [exp for exp in experiments if exp.get("status") == "kept"]
    model_types = {exp.get("config", {}).get("model_type", "") for exp in kept}
    labels = {
        1: "Strong Reject", 2: "Reject", 3: "Reject", 4: "Weak Reject",
        5: "Borderline Reject", 6: "Borderline Accept", 7: "Weak Accept",
        8: "Accept", 9: "Strong Accept", 10: "Strong Accept",
    }
    label = labels.get(score, "?")

    # Only these three severities appear in the headline counts.
    tally = {"critical": 0, "major": 0, "minor": 0}
    for weakness in weaknesses:
        severity = weakness["severity"]
        if severity in tally:
            tally[severity] += 1

    mode = "HARSH" if harsh else "Standard"
    task = config.get("task_description", "the given task")
    out = [
        "# Simulated Peer Review",
        "",
        f"*Generated {generated}*",
        f"*Venue: {venue.upper()} | Mode: {mode} | Score: {score}/10 ({label})*",
        "",
        "---",
        "",
        "## Summary",
        "",
        f"This work presents experiments on {task} "
        f"with {len(kept)} successful experiment(s) across {len(model_types)} model type(s). "
        f"Primary metric: `{metric}`.",
        "",
        "## Score",
        "",
        f"**{score}/10** — {label}",
        "",
        f"- Strengths: {len(strengths)}",
        f"- Weaknesses: {len(weaknesses)} ({tally['critical']} critical, "
        f"{tally['major']} major, {tally['minor']} minor)",
        "",
        "## Strengths",
        "",
    ]
    out.extend(f"**S{idx}.** {text}" for idx, text in enumerate(strengths, 1))

    out += ["", "## Weaknesses", ""]
    for idx, weakness in enumerate(weaknesses, 1):
        out.append(f"**W{idx}. [{weakness['severity'].upper()}] {weakness['title']}**")
        out.append("")
        out.append(weakness["detail"])
        out.append("")
        out.append(f"*Fix:* `{weakness['fix_command']}`")
        out.append("")
    if not weaknesses:
        out += ["No significant weaknesses identified.", ""]

    out += ["## Questions for Authors", ""]
    out.extend(f"**Q{idx}.** {text}" for idx, text in enumerate(questions, 1))

    blocking = [w for w in weaknesses if w["severity"] in ("critical", "major")]
    if blocking:
        out += ["", "## Recommended Action Plan", "", "Address before submission:", ""]
        for rank, weakness in enumerate(blocking, 1):
            out.append(
                f"{rank}. **[{weakness['severity'].upper()}]** {weakness['title']}: "
                f"`{weakness['fix_command']}`"
            )

    out += ["", "## Verdict", ""]
    if score >= 7:
        verdict = "Approaching publication quality. Address minor issues and consider submission."
    elif score >= 5:
        verdict = "Borderline. Significant improvements needed. Follow the action plan."
    else:
        verdict = "Not ready. Major methodology gaps. Focus on critical and major weaknesses."
    out.append(verdict)

    out += ["", "---",
            "*Simulated review by `/turing:review` — not a substitute for actual peer review.*"]
    return "\n".join(out)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def save_review_report(result: dict, output_dir="experiments/reviews") -> Path:
    """Persist a compact YAML record of a review run.

    Args:
        result: Review result; ``timestamp``, ``venue``, ``harsh``,
            ``score`` and ``weaknesses`` are read from it.
        output_dir: Directory that receives the file; created if missing.

    Returns:
        Path of the written ``review-<UTC timestamp>.yaml`` file.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    out_path = target_dir / f"review-{stamp}.yaml"
    # Only these weakness fields are stored; the detail text is left out.
    stored_fields = ("id", "severity", "title", "fix_command")
    with open(out_path, "w") as handle:
        record = {key: result[key] for key in ("timestamp", "venue", "harsh", "score")}
        record["weaknesses"] = [
            {field: weakness[field] for field in stored_fields}
            for weakness in result["weaknesses"]
        ]
        yaml.dump(record, handle, default_flow_style=False, sort_keys=False)
    return out_path
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# --- Orchestration ---
|
|
288
|
+
|
|
289
|
+
def simulate_review(venue="general", harsh=False, config_path="config.yaml",
                    log_path=DEFAULT_LOG) -> dict:
    """Run full simulated review pipeline.

    Loads the config, experiment log and side files, runs every review
    check, keeps the weaknesses relevant to ``venue`` ordered by severity,
    then derives strengths, questions, score and the Markdown report.

    Args:
        venue: Target venue used to filter weaknesses and add questions.
        harsh: Whether to apply the harsh-mode score penalty.
        config_path: Project config file handed to ``load_config``.
        log_path: Experiment log handed to ``load_experiments``.

    Returns:
        A dict with timestamp, venue, harsh, score, strengths, weaknesses,
        questions and report; or a dict with ``error`` and ``timestamp``
        when the log holds no experiments.
    """
    config = load_config(config_path)
    eval_cfg = config.get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")
    lower = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(log_path)
    if not experiments:
        return {"error": "No experiments found. Run /turing:train first.",
                "timestamp": datetime.now(timezone.utc).isoformat()}

    seeds = _load_yaml_dir("experiments/seed_studies", "*-seeds.yaml")
    ablations = _load_yaml_dir("experiments/ablations", "*-ablation.yaml")
    cal = _load_yaml_dir("experiments/calibration", "*.yaml")
    annotations = _load_yaml_list("experiments/annotations.yaml")

    findings = [
        check_baselines(experiments, config),
        check_error_bars(experiments, seeds),
        check_ablation(experiments, ablations),
        check_overclaimed(experiments, seeds, metric, lower),
        check_sota(experiments, config, annotations),
        check_calibration(cal, experiments),
        check_compute_cost(experiments),
        check_diversity(experiments),
        check_leakage(experiments, annotations),
        check_reproducibility(experiments, config),
    ]
    # Each check returns None when it has nothing to report.
    relevant = [f for f in findings if f and venue in f.get("venue_relevance", ["general"])]
    rank = {"critical": 0, "major": 1, "minor": 2, "nitpick": 3}
    weaknesses = sorted(relevant, key=lambda w: rank.get(w["severity"], 9))

    strengths = identify_strengths(experiments, seeds, ablations, config, metric, lower)
    questions = generate_questions(weaknesses, experiments, config, venue)
    score = compute_score(strengths, weaknesses, harsh)
    report = format_review_report(strengths, weaknesses, questions, score, venue, harsh,
                                  config, experiments, metric)
    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "venue": venue,
        "harsh": harsh,
        "score": score,
        "strengths": strengths,
        "weaknesses": weaknesses,
        "questions": questions,
        "report": report,
    }
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def main() -> None:
    """Command-line entry point: run the simulated review and print it.

    The review goes to stdout (Markdown, or JSON with ``--json``) and is
    also saved via ``save_review_report``; the saved path and any error go
    to stderr. Exits with status 1 when the review returns an error and no
    report.
    """
    cli = argparse.ArgumentParser(description="Simulate peer review of experiment campaign")
    cli.add_argument("--venue", default="general", choices=VALID_VENUES)
    cli.add_argument("--harsh", action="store_true", help="Stricter review criteria")
    cli.add_argument("--config", default="config.yaml")
    cli.add_argument("--log", default=DEFAULT_LOG)
    cli.add_argument("--json", action="store_true", help="Output raw JSON")
    opts = cli.parse_args()

    result = simulate_review(opts.venue, opts.harsh, opts.config, opts.log)
    failed = "error" in result and "report" not in result
    if failed:
        print(f"ERROR: {result['error']}", file=sys.stderr)
        sys.exit(1)

    if not opts.json:
        print(result["report"])
    else:
        print(json.dumps(result, indent=2, default=str))

    saved = save_review_report(result)
    print(f"\nReview saved to {saved}", file=sys.stderr)


if __name__ == "__main__":
    main()
|