claude-turing 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +30 -2
- package/commands/reproduce.md +48 -0
- package/commands/seed.md +47 -0
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +2 -0
- package/templates/config.yaml +10 -0
- package/templates/program.md +5 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/reproduce_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/seed_runner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/generate_brief.py +81 -0
- package/templates/scripts/generate_model_card.py +25 -0
- package/templates/scripts/leaderboard.py +10 -0
- package/templates/scripts/reproduce_experiment.py +548 -0
- package/templates/scripts/scaffold.py +4 -0
- package/templates/scripts/seed_runner.py +414 -0
- package/templates/scripts/show_metrics.py +17 -0
- package/templates/scripts/turing_io.py +36 -0
- package/templates/scripts/update_state.py +13 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "1.2.0",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "1.3.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 19 commands, 2 specialized agents, statistical rigor (multi-seed studies, reproducibility verification), tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -323,6 +323,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
323
323
|
| Command | What it does |
|
|
324
324
|
|---------|-------------|
|
|
325
325
|
| `/turing:validate [--auto]` | Check metric stability — auto-configure multi-run if noisy |
|
|
326
|
+
| `/turing:seed [N] [--quick]` | Multi-seed study — mean/std/CI, flag seed-sensitive results |
|
|
327
|
+
| `/turing:reproduce <exp-id>` | Reproducibility verification — re-run and check tolerance |
|
|
326
328
|
| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
|
|
327
329
|
| `/turing:logbook` | Generate HTML experiment logbook |
|
|
328
330
|
| `/turing:report` | Generate research report |
|
|
@@ -392,6 +394,32 @@ After N experiments with no meaningful improvement, the agent stops and reports
|
|
|
392
394
|
|
|
393
395
|
For noisy metrics, `/turing:validate` runs the pipeline multiple times and measures variance. If the coefficient of variation exceeds 5%, it auto-configures multi-run evaluation so the agent can't be rewarded for lucky single runs.
|
|
394
396
|
|
|
397
|
+
## Statistical Rigor
|
|
398
|
+
|
|
399
|
+
> *"Stop publishing lucky seeds. Start publishing distributions."*
|
|
400
|
+
|
|
401
|
+
Before claiming a result, run a seed study:
|
|
402
|
+
|
|
403
|
+
```
|
|
404
|
+
/turing:seed # 5 seeds on best experiment
|
|
405
|
+
/turing:seed --quick # 3 seeds for fast check
|
|
406
|
+
/turing:seed 10 # 10 seeds for thorough study
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
This runs the same experiment across multiple random seeds and reports mean +/- std with 95% confidence intervals. If the coefficient of variation exceeds 5%, the result is flagged as **seed-sensitive** — meaning you should report the distribution, not a single number.
|
|
410
|
+
|
|
411
|
+
To verify an experiment can be reproduced:
|
|
412
|
+
|
|
413
|
+
```
|
|
414
|
+
/turing:reproduce exp-042 # Default: 3 runs, 2% tolerance
|
|
415
|
+
/turing:reproduce exp-042 --strict # Exact match required
|
|
416
|
+
/turing:reproduce exp-042 --tolerance 0.05 # Custom tolerance
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
This re-runs the experiment from the logged config and checks that metrics fall within tolerance. It also detects environment drift — if library versions have changed since the original run, you'll know before a reviewer tells you.
|
|
420
|
+
|
|
421
|
+
Seed study results automatically appear in `/turing:brief` and `/turing:card`.
|
|
422
|
+
|
|
395
423
|
## Tree-Search Hypothesis Exploration
|
|
396
424
|
|
|
397
425
|
> *"The learned coin-flipper weaves through the quadrillion-coin room with a preternatural air."*
|
|
@@ -486,11 +514,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
486
514
|
|
|
487
515
|
## Architecture of Turing Itself
|
|
488
516
|
|
|
489
|
-
|
|
517
|
+
19 commands, 2 agents, 8 config files, 34 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor (seed studies + reproducibility), 407 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
490
518
|
|
|
491
519
|
```
|
|
492
520
|
turing/
|
|
493
|
-
├── commands/
|
|
521
|
+
├── commands/ 18 skill files (core + taste-leverage + reporting + exploration + statistical rigor)
|
|
494
522
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
495
523
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
496
524
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: reproduce
|
|
3
|
+
description: Verify reproducibility of a specific experiment by re-running from logged config and checking metrics fall within tolerance.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-id> [--tolerance 0.02] [--strict] [--runs 3]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Verify that a logged experiment can be reproduced with consistent results.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument is the experiment ID (required), e.g. `exp-042`
|
|
20
|
+
- `--tolerance 0.02` sets the relative tolerance (default 2%)
|
|
21
|
+
- `--strict` requires exact float match (1e-6), overrides tolerance
|
|
22
|
+
- `--runs 3` sets number of reproduction runs (default 3, 1 for strict)
|
|
23
|
+
|
|
24
|
+
3. **Run reproducibility verification:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/reproduce_experiment.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report results:**
|
|
30
|
+
- **reproducible:** metrics match exactly (deterministic algorithm)
|
|
31
|
+
- **approximately_reproducible:** metrics within tolerance or original falls in 95% CI
|
|
32
|
+
- **not_reproducible:** metrics outside tolerance and CI
|
|
33
|
+
- **environment_changed:** metrics diverge AND library versions differ
|
|
34
|
+
- Show environment diff if present (Python version, package versions)
|
|
35
|
+
|
|
36
|
+
5. **Saved output:** report written to `experiments/reproductions/exp-NNN-repro.yaml`
|
|
37
|
+
|
|
38
|
+
6. **If experiment ID not found:** list available experiment IDs from `experiments/log.jsonl`
|
|
39
|
+
|
|
40
|
+
7. **If no training pipeline exists:** suggest `/turing:init` first.
|
|
41
|
+
|
|
42
|
+
## Examples
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
/turing:reproduce exp-042 # Default: 3 runs, 2% tolerance
|
|
46
|
+
/turing:reproduce exp-042 --strict # Exact match required
|
|
47
|
+
/turing:reproduce exp-042 --tolerance 0.05 --runs 5 # Lenient, more runs
|
|
48
|
+
```
|
package/commands/seed.md
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: seed
|
|
3
|
+
description: Run multi-seed study on an experiment to compute mean/std/CI and flag seed-sensitive results. Prevents publishing lucky seeds.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[N] [--quick] [--exp-id <id>]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Run a multi-seed study to verify that experiment results are robust across random seeds.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- A bare number (e.g., `5`) sets the seed count
|
|
20
|
+
- `--quick` runs 3 seeds instead of 5
|
|
21
|
+
- `--exp-id exp-042` targets a specific experiment (defaults to best)
|
|
22
|
+
- `--seed-list 42,123,456` uses specific seed values
|
|
23
|
+
|
|
24
|
+
3. **Run seed study:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/seed_runner.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report results:**
|
|
30
|
+
- Show the per-seed results table
|
|
31
|
+
- Show mean +/- std with 95% CI
|
|
32
|
+
- **STABLE (CV < 5%):** result is robust, safe to report
|
|
33
|
+
- **SEED-SENSITIVE (CV >= 5%):** result varies too much across seeds — do not report single-seed numbers
|
|
34
|
+
- If seed-sensitive, recommend reporting as mean +/- std over N seeds
|
|
35
|
+
|
|
36
|
+
5. **Saved output:** results are written to `experiments/seed_studies/exp-NNN-seeds.yaml`
|
|
37
|
+
|
|
38
|
+
6. **If no training pipeline exists:** suggest `/turing:init` first.
|
|
39
|
+
|
|
40
|
+
## Examples
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
/turing:seed # 5 seeds on best experiment
|
|
44
|
+
/turing:seed --quick # 3 seeds for fast check
|
|
45
|
+
/turing:seed 10 # 10 seeds for thorough study
|
|
46
|
+
/turing:seed --exp-id exp-042 # Specific experiment
|
|
47
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -20,6 +20,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
20
20
|
| "poster", "presentation", "one-pager", "visual summary" | `/turing:poster` | Document |
|
|
21
21
|
| "report", "write-up", "findings", "document results" | `/turing:report` | Document |
|
|
22
22
|
| "validate", "stability", "check variance", "noisy" | `/turing:validate` | Validate |
|
|
23
|
+
| "seed", "seed study", "multi-seed", "lucky seed", "seed sensitivity" | `/turing:seed` | Validate |
|
|
24
|
+
| "reproduce", "reproducibility", "verify results", "re-run experiment", "repro" | `/turing:reproduce` | Validate |
|
|
23
25
|
| "suggest", "what model", "recommend", "which architecture", "literature" | `/turing:suggest` | Research |
|
|
24
26
|
| "explore hypotheses", "tree search", "treequest", "search hypothesis space", "MCTS" | `/turing:explore` | Research |
|
|
25
27
|
| "design", "plan experiment", "how should I test", "experiment design" | `/turing:design` | Design |
|
|
@@ -39,6 +41,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
39
41
|
| `/turing:brief` | Generate structured research intelligence report | @ml-evaluator |
|
|
40
42
|
| `/turing:init` | Scaffold a new ML project | (inline) |
|
|
41
43
|
| `/turing:validate` | Check metric stability, auto-fix if noisy | (inline) |
|
|
44
|
+
| `/turing:seed [N] [--quick]` | Multi-seed study: mean/std/CI, flag seed-sensitive results | (inline) |
|
|
45
|
+
| `/turing:reproduce <exp-id>` | Reproducibility verification with tolerance checking | (inline) |
|
|
42
46
|
| `/turing:suggest` | Literature-grounded model architecture suggestions | (inline, uses WebSearch) |
|
|
43
47
|
| `/turing:explore` | Tree-search hypothesis exploration via AB-MCTS | (inline) |
|
|
44
48
|
| `/turing:design <hyp-id>` | Generate structured experiment design from hypothesis | (inline, uses WebSearch) |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -22,7 +22,7 @@ const PLUGIN_ROOT = join(__dirname, "..");
|
|
|
22
22
|
const SUB_COMMANDS = [
|
|
23
23
|
"init", "train", "status", "compare", "sweep", "validate",
|
|
24
24
|
"try", "brief", "suggest", "explore", "design", "logbook", "poster",
|
|
25
|
-
"report", "mode", "preflight", "card",
|
|
25
|
+
"report", "mode", "preflight", "card", "seed", "reproduce",
|
|
26
26
|
];
|
|
27
27
|
|
|
28
28
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
package/templates/config.yaml
CHANGED
|
@@ -19,6 +19,16 @@ evaluation:
|
|
|
19
19
|
# Set to false for metrics where higher is better (accuracy, f1, auc)
|
|
20
20
|
lower_is_better: false # {{METRIC_DIRECTION}} -- change to true if lower is better
|
|
21
21
|
|
|
22
|
+
# Multi-seed configuration (Phase 10.1: /turing:seed)
|
|
23
|
+
# Seeds used for seed studies — diverse values for good coverage
|
|
24
|
+
seed_seeds: [42, 123, 456, 789, 1024, 1337, 2048, 3141, 4096, 7919]
|
|
25
|
+
seed_study_n_runs: 5 # Default number of seeds for /turing:seed
|
|
26
|
+
seed_sensitivity_threshold: 5.0 # CV% above this = seed-sensitive
|
|
27
|
+
|
|
28
|
+
# Reproducibility configuration (Phase 10.2: /turing:reproduce)
|
|
29
|
+
reproduce_tolerance: 0.02 # 2% relative tolerance for approximate match
|
|
30
|
+
reproduce_n_runs: 3 # Default reproduction runs for stochastic algorithms
|
|
31
|
+
|
|
22
32
|
convergence:
|
|
23
33
|
patience: 3 # Consecutive non-improvements before stopping
|
|
24
34
|
improvement_threshold: 0.005 # 0.5% relative improvement required
|
package/templates/program.md
CHANGED
|
@@ -170,6 +170,11 @@ The autoresearch experiment loop. Each iteration is one experiment — one hypot
|
|
|
170
170
|
- N consecutive non-improvements (`config.yaml` → `convergence.patience`) = STOP
|
|
171
171
|
- `max_iterations` reached = STOP
|
|
172
172
|
- Report final best model and recommend next steps
|
|
173
|
+
- **Before declaring final results**, run a seed study to verify robustness:
|
|
174
|
+
```bash
|
|
175
|
+
python scripts/seed_runner.py --quick
|
|
176
|
+
```
|
|
177
|
+
If CV > 5%, the result is seed-sensitive — report mean ± std, not a single-seed number.
|
|
173
178
|
|
|
174
179
|
10. **REPEAT** — return to step 1.
|
|
175
180
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -25,6 +25,7 @@ import yaml
|
|
|
25
25
|
|
|
26
26
|
from scripts.cost_frontier import compute_pareto_frontier, load_cost_data, _format_seconds
|
|
27
27
|
from scripts.turing_io import load_config, load_experiments, load_hypotheses
|
|
28
|
+
from scripts.seed_runner import CV_THRESHOLD
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
def compute_campaign_summary(experiments: list[dict]) -> dict:
|
|
@@ -211,6 +212,40 @@ def detect_environment_drift(experiments: list[dict]) -> list[str]:
|
|
|
211
212
|
return warnings
|
|
212
213
|
|
|
213
214
|
|
|
215
|
+
def load_seed_studies(seed_dir: str = "experiments/seed_studies") -> list[dict]:
|
|
216
|
+
"""Load all seed study results from YAML files."""
|
|
217
|
+
path = Path(seed_dir)
|
|
218
|
+
if not path.exists():
|
|
219
|
+
return []
|
|
220
|
+
studies = []
|
|
221
|
+
for f in sorted(path.glob("*-seeds.yaml")):
|
|
222
|
+
try:
|
|
223
|
+
with open(f) as fh:
|
|
224
|
+
study = yaml.safe_load(fh)
|
|
225
|
+
if study and isinstance(study, dict):
|
|
226
|
+
studies.append(study)
|
|
227
|
+
except (yaml.YAMLError, OSError):
|
|
228
|
+
continue
|
|
229
|
+
return studies
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def load_reproductions(repro_dir: str = "experiments/reproductions") -> list[dict]:
|
|
233
|
+
"""Load all reproduction reports from YAML files."""
|
|
234
|
+
path = Path(repro_dir)
|
|
235
|
+
if not path.exists():
|
|
236
|
+
return []
|
|
237
|
+
reports = []
|
|
238
|
+
for f in sorted(path.glob("*-repro.yaml")):
|
|
239
|
+
try:
|
|
240
|
+
with open(f) as fh:
|
|
241
|
+
report = yaml.safe_load(fh)
|
|
242
|
+
if report and isinstance(report, dict):
|
|
243
|
+
reports.append(report)
|
|
244
|
+
except (yaml.YAMLError, OSError):
|
|
245
|
+
continue
|
|
246
|
+
return reports
|
|
247
|
+
|
|
248
|
+
|
|
214
249
|
def format_brief(
|
|
215
250
|
campaign: dict,
|
|
216
251
|
best: dict | None,
|
|
@@ -223,6 +258,8 @@ def format_brief(
|
|
|
223
258
|
env_warnings: list[str] | None = None,
|
|
224
259
|
cost_data: list | None = None,
|
|
225
260
|
cost_frontier: list | None = None,
|
|
261
|
+
seed_studies: list[dict] | None = None,
|
|
262
|
+
reproductions: list[dict] | None = None,
|
|
226
263
|
) -> str:
|
|
227
264
|
"""Format the research briefing as markdown."""
|
|
228
265
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -361,6 +398,44 @@ def format_brief(
|
|
|
361
398
|
f"The {pct:.1f}% improvement costs {ratio:.0f}x more compute.",
|
|
362
399
|
])
|
|
363
400
|
|
|
401
|
+
# Seed studies
|
|
402
|
+
if seed_studies:
|
|
403
|
+
lines.extend(["", "## Seed Studies", ""])
|
|
404
|
+
for study in seed_studies:
|
|
405
|
+
exp_id = study.get("experiment_id", "?")
|
|
406
|
+
sensitive = study.get("seed_sensitive", False)
|
|
407
|
+
status = "SEED-SENSITIVE" if sensitive else "STABLE"
|
|
408
|
+
lines.append(
|
|
409
|
+
f"- **{exp_id}:** {study.get('metric', metric)} = "
|
|
410
|
+
f"{study.get('mean', 0):.4f} +/- {study.get('std', 0):.4f} "
|
|
411
|
+
f"(CV={study.get('cv_percent', 0):.1f}%) — **{status}**"
|
|
412
|
+
)
|
|
413
|
+
if sensitive:
|
|
414
|
+
lines.append(
|
|
415
|
+
f" - 95% CI: [{study['ci_95'][0]:.4f}, {study['ci_95'][1]:.4f}] "
|
|
416
|
+
f"over {len(study.get('seeds_run', []))} seeds"
|
|
417
|
+
)
|
|
418
|
+
if any(s.get("seed_sensitive") for s in seed_studies):
|
|
419
|
+
lines.extend(["", "*Some results are seed-sensitive. Report distributions, not point estimates.*"])
|
|
420
|
+
|
|
421
|
+
# Reproduction reports
|
|
422
|
+
if reproductions:
|
|
423
|
+
lines.extend(["", "## Reproducibility", ""])
|
|
424
|
+
verdict_markers = {
|
|
425
|
+
"reproducible": "PASS",
|
|
426
|
+
"approximately_reproducible": "PASS (approx)",
|
|
427
|
+
"not_reproducible": "FAIL",
|
|
428
|
+
"environment_changed": "WARN (env)",
|
|
429
|
+
}
|
|
430
|
+
for report in reproductions:
|
|
431
|
+
exp_id = report.get("experiment_id", "?")
|
|
432
|
+
verdict = report.get("verdict", "unknown")
|
|
433
|
+
marker = verdict_markers.get(verdict, verdict)
|
|
434
|
+
lines.append(f"- **{exp_id}:** {marker} — {report.get('reason', 'N/A')}")
|
|
435
|
+
failed = [r for r in reproductions if r.get("verdict") in ("not_reproducible", "environment_changed")]
|
|
436
|
+
if failed:
|
|
437
|
+
lines.extend(["", f"*{len(failed)} experiment(s) failed reproducibility checks.*"])
|
|
438
|
+
|
|
364
439
|
lines.extend([
|
|
365
440
|
"",
|
|
366
441
|
"## Recommendations",
|
|
@@ -420,11 +495,17 @@ def generate_brief(
|
|
|
420
495
|
cost_records = load_cost_data(log_path, metric)
|
|
421
496
|
pareto = compute_pareto_frontier(cost_records, lower_is_better) if cost_records else []
|
|
422
497
|
|
|
498
|
+
# Load seed studies and reproduction reports
|
|
499
|
+
seed_studies = load_seed_studies()
|
|
500
|
+
reproductions = load_reproductions()
|
|
501
|
+
|
|
423
502
|
return format_brief(
|
|
424
503
|
campaign, best, trajectory, model_types, hypotheses,
|
|
425
504
|
metric, lower_is_better, failures, env_warnings,
|
|
426
505
|
cost_data=cost_records if cost_records else None,
|
|
427
506
|
cost_frontier=pareto if cost_records else None,
|
|
507
|
+
seed_studies=seed_studies if seed_studies else None,
|
|
508
|
+
reproductions=reproductions if reproductions else None,
|
|
428
509
|
)
|
|
429
510
|
|
|
430
511
|
|
|
@@ -243,6 +243,31 @@ def generate_card(
|
|
|
243
243
|
else:
|
|
244
244
|
lines.append("No experiments completed yet.")
|
|
245
245
|
|
|
246
|
+
# --- Seed Study ---
|
|
247
|
+
if best:
|
|
248
|
+
seed_study_path = Path("experiments/seed_studies") / f"{best.get('experiment_id', 'unknown')}-seeds.yaml"
|
|
249
|
+
if seed_study_path.exists():
|
|
250
|
+
import yaml
|
|
251
|
+
with open(seed_study_path) as f:
|
|
252
|
+
seed_study = yaml.safe_load(f) or {}
|
|
253
|
+
if seed_study and "mean" in seed_study:
|
|
254
|
+
sensitive = seed_study.get("seed_sensitive", False)
|
|
255
|
+
status = "SEED-SENSITIVE" if sensitive else "STABLE"
|
|
256
|
+
lines.extend([
|
|
257
|
+
"",
|
|
258
|
+
"### Seed Study",
|
|
259
|
+
"",
|
|
260
|
+
f"- **Status:** {status}",
|
|
261
|
+
f"- **{metric}:** {seed_study['mean']:.4f} +/- {seed_study.get('std', 0):.4f}",
|
|
262
|
+
])
|
|
263
|
+
if "ci_95" in seed_study:
|
|
264
|
+
ci = seed_study["ci_95"]
|
|
265
|
+
lines.append(f"- **95% CI:** [{ci[0]:.4f}, {ci[1]:.4f}]")
|
|
266
|
+
lines.append(f"- **CV:** {seed_study.get('cv_percent', 0):.2f}%")
|
|
267
|
+
lines.append(f"- **Seeds tested:** {len(seed_study.get('seeds_run', []))}")
|
|
268
|
+
if sensitive:
|
|
269
|
+
lines.append("- *Result varies significantly across seeds. Report distribution, not point estimate.*")
|
|
270
|
+
|
|
246
271
|
# --- Training History ---
|
|
247
272
|
lines.extend([
|
|
248
273
|
"",
|
|
@@ -503,6 +503,16 @@ def main() -> None:
|
|
|
503
503
|
print()
|
|
504
504
|
print(footer)
|
|
505
505
|
|
|
506
|
+
# Show seed study status for #1 if available
|
|
507
|
+
if ranked and args.fmt not in ("csv",):
|
|
508
|
+
from scripts.turing_io import load_seed_study
|
|
509
|
+
best_id = ranked[0].get("experiment_id")
|
|
510
|
+
if best_id:
|
|
511
|
+
study = load_seed_study(best_id)
|
|
512
|
+
if study and "mean" in study:
|
|
513
|
+
sensitive = "SEED-SENSITIVE" if study.get("seed_sensitive") else "STABLE"
|
|
514
|
+
print(f"\n Seed study: {metric}={study['mean']:.4f}±{study.get('std',0):.4f} ({sensitive})")
|
|
515
|
+
|
|
506
516
|
|
|
507
517
|
if __name__ == "__main__":
|
|
508
518
|
main()
|