claude-turing 1.0.0
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0
package/templates/scripts/show_families.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""Experiment family viewer for the autoresearch pipeline.

Groups experiments by strategic theme (family tag) and shows
per-family performance summaries. Tells the human when an entire
research direction is exhausted.

Usage:
    python scripts/show_families.py [--log experiments/log.jsonl] [--config config.yaml]
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

import yaml

from scripts.turing_io import load_experiments


def group_by_family(experiments: list[dict]) -> dict[str, list[dict]]:
    """Group experiments by family tag."""
    families: dict[str, list[dict]] = {}
    for exp in experiments:
        family = exp.get("family") or "untagged"
        families.setdefault(family, []).append(exp)
    return families


def family_stats(
    experiments: list[dict],
    metric: str,
    lower_is_better: bool,
) -> dict:
    """Compute stats for a family of experiments."""
    total = len(experiments)
    kept = [e for e in experiments if e.get("status") == "kept"]
    discarded = [e for e in experiments if e.get("status") == "discarded"]

    metric_vals = [
        e.get("metrics", {}).get(metric)
        for e in kept
        if e.get("metrics", {}).get(metric) is not None
    ]

    best_val = None
    best_id = None
    if metric_vals:
        if lower_is_better:
            best_val = min(metric_vals)
        else:
            best_val = max(metric_vals)
        for e in kept:
            if e.get("metrics", {}).get(metric) == best_val:
                best_id = e.get("experiment_id")
                break

    # Compute recent trend (last 3 experiments)
    recent = experiments[-3:] if len(experiments) >= 3 else experiments
    recent_kept = sum(1 for e in recent if e.get("status") == "kept")
    recent_rate = recent_kept / len(recent) if recent else 0

    return {
        "total": total,
        "kept": len(kept),
        "discarded": len(discarded),
        "keep_rate": round(len(kept) / total, 2) if total > 0 else 0,
        "best_metric": best_val,
        "best_experiment": best_id,
        "recent_keep_rate": round(recent_rate, 2),
        "exhausted": total >= 3 and recent_rate == 0,
    }


def format_families(
    families: dict[str, dict],
    metric_name: str,
) -> str:
    """Format family summaries as a table."""
    if not families:
        return "No experiments logged yet."

    lines = [
        f"{'Family':<25} {'Total':>6} {'Kept':>6} {'Rate':>6} {'Best ' + metric_name:>15} {'Status':<12}",
        "-" * 80,
    ]

    for name, stats in sorted(families.items(), key=lambda x: -(x[1].get("best_metric") or 0)):
        best_str = f"{stats['best_metric']:.4f}" if stats["best_metric"] is not None else "N/A"
        status = "EXHAUSTED" if stats["exhausted"] else "active"
        line = f"{name:<25} {stats['total']:>6} {stats['kept']:>6} {stats['keep_rate']:>5.0%} {best_str:>15} {status:<12}"
        lines.append(line)

    # Summary
    total_exps = sum(s["total"] for s in families.values())
    exhausted = sum(1 for s in families.values() if s["exhausted"])
    lines.extend([
        "",
        f"Total: {total_exps} experiments across {len(families)} families ({exhausted} exhausted)",
    ])

    return "\n".join(lines)


def main() -> None:
    """CLI entry point."""
    parser = argparse.ArgumentParser(description="Show experiment families")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--config", default="config.yaml")
    args = parser.parse_args()

    config = {}
    if Path(args.config).exists():
        with open(args.config) as f:
            config = yaml.safe_load(f) or {}

    eval_cfg = config.get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(args.log)
    families = group_by_family(experiments)

    family_data = {name: family_stats(exps, metric, lower_is_better) for name, exps in families.items()}

    print(format_families(family_data, metric))


if __name__ == "__main__":
    main()
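A note on the data shape: show_families.py reads records produced elsewhere in the pipeline (see log_experiment.py in the listing above). Here is a minimal sketch of how group_by_family and family_stats compose, with invented values and assuming the module is importable as shown; the field names (experiment_id, family, status, metrics) are taken from the code above:

```python
# Hypothetical records exercising the functions above; values are invented.
from show_families import family_stats, group_by_family

experiments = [
    {"experiment_id": "exp-001", "family": "feature-eng", "status": "kept",
     "metrics": {"accuracy": 0.84}},
    {"experiment_id": "exp-002", "family": "feature-eng", "status": "discarded",
     "metrics": {"accuracy": 0.81}},
    {"experiment_id": "exp-003", "family": "model-arch", "status": "kept",
     "metrics": {"accuracy": 0.87}},
]

families = group_by_family(experiments)
stats = family_stats(families["feature-eng"], metric="accuracy", lower_is_better=False)
print(stats["keep_rate"], stats["best_experiment"])  # 0.5 exp-001
```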
package/templates/scripts/show_metrics.py
@@ -0,0 +1,157 @@
"""Display experiment metrics from experiments/log.jsonl.

Reads the JSONL experiment log and prints a formatted table,
highlighting the current best by primary metric. This is the
agent's primary observation tool at the start of each loop iteration.
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

import yaml

from scripts.turing_io import load_config, load_experiments

DEFAULT_LOG_PATH = "experiments/log.jsonl"


def find_best(experiments: list[dict], primary_metric: str, lower_is_better: bool) -> str | None:
    """Find experiment_id with best primary metric among 'kept' entries."""
    best_id = None
    best_value = float("inf") if lower_is_better else float("-inf")

    for exp in experiments:
        if exp.get("status") != "kept":
            continue
        value = exp.get("metrics", {}).get(primary_metric)
        if value is None:
            continue
        if lower_is_better and value < best_value:
            best_value = value
            best_id = exp["experiment_id"]
        elif not lower_is_better and value > best_value:
            best_value = value
            best_id = exp["experiment_id"]
    return best_id


def format_table(experiments: list[dict], best_id: str | None, metric_names: list[str]) -> str:
    """Format experiments as a text table."""
    if not experiments:
        return "No experiments logged yet."

    # Build dynamic header based on configured metrics
    metric_headers = "".join(f"{m:>12}" for m in metric_names)
    header = f"{'ID':<10} {'Status':<10} {'Model':<15}{metric_headers} {'Timestamp':<22}"
    sep = "-" * len(header)
    lines = [header, sep]

    for exp in experiments:
        metrics = exp.get("metrics", {})
        model_type = exp.get("config", {}).get("model_type", "unknown")

        metric_values = ""
        for m in metric_names:
            val = metrics.get(m)
            if isinstance(val, (int, float)):
                metric_values += f"{val:>12.4f}"
            else:
                metric_values += f"{'N/A':>12}"

        ts = exp.get("timestamp", "")[:19]
        marker = " *BEST*" if exp.get("experiment_id") == best_id else ""
        line = f"{exp.get('experiment_id', '?'):<10} {exp.get('status', '?'):<10} {model_type:<15}{metric_values} {ts}{marker}"
        lines.append(line)

    return "\n".join(lines)


def get_experiment_diffs(experiments: list[dict], max_diffs: int = 3) -> str:
    """Get git diffs for recent discarded experiments."""
    import subprocess

    discarded = [e for e in experiments if e.get("status") == "discarded"]
    if not discarded:
        return ""

    recent = discarded[-max_diffs:]
    lines = ["\n--- Recent Failed Experiment Diffs ---\n"]

    for exp in recent:
        exp_id = exp.get("experiment_id", "unknown")
        description = exp.get("description", "no description")
        lines.append(f"=== {exp_id}: {description} ===")

        # Try to find the experiment branch
        branch = f"exp/{exp_id.replace('exp-', '')}"
        try:
            result = subprocess.run(
                ["git", "diff", f"main...{branch}", "--", "train.py", "config.yaml"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            if result.returncode == 0 and result.stdout.strip():
                # Truncate long diffs
                diff_lines = result.stdout.strip().splitlines()
                if len(diff_lines) > 30:
                    diff_lines = diff_lines[:30] + [f"... ({len(diff_lines) - 30} more lines)"]
                lines.append("\n".join(diff_lines))
            else:
                lines.append(" (branch not found or no diff)")
        except (subprocess.TimeoutExpired, FileNotFoundError):
            lines.append(" (git not available)")

        lines.append("")

    return "\n".join(lines)


def main() -> None:
    """CLI entry point."""
    parser = argparse.ArgumentParser(description="Show experiment metrics")
    parser.add_argument(
        "--log",
        default=DEFAULT_LOG_PATH,
        help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
    )
    parser.add_argument(
        "--last",
        type=int,
        default=None,
        help="Show only last N experiments",
    )
    parser.add_argument(
        "--with-diffs",
        action="store_true",
        help="Include git diffs for discarded experiments",
    )
    args = parser.parse_args()

    config = load_config()
    eval_cfg = config.get("evaluation", {})
    primary_metric = eval_cfg.get("primary_metric", "accuracy")
    metric_names = eval_cfg.get("metrics", ["accuracy", "f1_weighted"])
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiments = load_experiments(args.log)

    if args.last and args.last > 0:
        experiments = experiments[-args.last:]

    best_id = find_best(experiments, primary_metric, lower_is_better)
    print(format_table(experiments, best_id, metric_names))

    if args.with_diffs:
        all_experiments = load_experiments(args.log)
        diffs = get_experiment_diffs(all_experiments)
        if diffs:
            print(diffs)


if __name__ == "__main__":
    main()
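A quick sanity check on find_best: only "kept" entries compete, so a discarded run with a higher score is ignored. A sketch with invented records, again assuming the module is importable:

```python
from show_metrics import find_best

experiments = [
    {"experiment_id": "exp-001", "status": "kept", "metrics": {"accuracy": 0.84}},
    {"experiment_id": "exp-002", "status": "discarded", "metrics": {"accuracy": 0.99}},
    {"experiment_id": "exp-003", "status": "kept", "metrics": {"accuracy": 0.87}},
]

# The discarded 0.99 run is skipped; the best kept run wins.
assert find_best(experiments, "accuracy", lower_is_better=False) == "exp-003"
```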
package/templates/scripts/statistical_compare.py
@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""Multi-run statistical comparison for the autoresearch pipeline.

Runs the same configuration N times with different random seeds and
compares metric distributions rather than point estimates. This prevents
keep/discard decisions based on noise.

A model that scored 0.87 vs 0.86 on a single run might just be seed
variance. This script runs both configurations multiple times and
uses the Mann-Whitney U test to determine if the difference is real.

Usage:
    python scripts/statistical_compare.py --n-runs 3 [--config config.yaml]
    python scripts/statistical_compare.py --compare <log1.jsonl> <log2.jsonl>
"""

from __future__ import annotations

import argparse
import json
import subprocess
import sys
from pathlib import Path

import numpy as np
import yaml


def run_multi_seed(
    n_runs: int,
    config_path: str = "config.yaml",
    base_seed: int = 42,
) -> list[dict]:
    """Run training N times with different seeds, collect metrics.

    Args:
        n_runs: Number of runs to execute.
        config_path: Path to config.yaml.
        base_seed: Starting seed (incremented for each run).

    Returns:
        List of metric dicts from each run.
    """
    results = []
    for i in range(n_runs):
        seed = base_seed + i
        print(f" Run {i+1}/{n_runs} (seed={seed})...", file=sys.stderr, end=" ")

        cmd = f"python train.py --config {config_path} --seed {seed}"
        proc = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=600,
        )

        if proc.returncode != 0:
            print(f"FAILED", file=sys.stderr)
            continue

        # Parse metrics from output (between --- delimiters)
        metrics = parse_metrics_from_output(proc.stdout)
        if metrics:
            metrics["seed"] = seed
            results.append(metrics)
            primary = next((v for k, v in metrics.items() if k not in ("seed", "model_type", "train_seconds")), None)
            print(f"done ({primary})", file=sys.stderr)
        else:
            print(f"no metrics parsed", file=sys.stderr)

    return results


def parse_metrics_from_output(output: str) -> dict:
    """Parse metrics from the --- delimited format in train.py output."""
    lines = output.strip().split("\n")
    in_block = False
    metrics = {}
    metadata_keys = {"model_type", "train_seconds"}

    for line in lines:
        line = line.strip()
        if line == "---":
            if in_block:
                break  # end of block
            in_block = True
            continue
        if in_block and ":" in line:
            key, value = line.split(":", 1)
            key = key.strip()
            value = value.strip()
            if key in metadata_keys:
                metrics[key] = value
            else:
                try:
                    metrics[key] = float(value)
                except ValueError:
                    metrics[key] = value

    return metrics


def compute_statistics(results: list[dict], metric_name: str) -> dict:
    """Compute summary statistics for a metric across runs.

    Returns dict with mean, std, min, max, ci_lower, ci_upper (95%).
    """
    values = [r[metric_name] for r in results if metric_name in r and isinstance(r[metric_name], (int, float))]

    if not values:
        return {"n": 0, "mean": None, "std": None}

    arr = np.array(values)
    n = len(arr)
    mean = float(np.mean(arr))
    std = float(np.std(arr, ddof=1)) if n > 1 else 0.0

    # 95% confidence interval (t-distribution approximation for small n)
    if n > 1:
        from scipy import stats as scipy_stats
        t_crit = scipy_stats.t.ppf(0.975, df=n - 1)
        margin = t_crit * std / np.sqrt(n)
    else:
        margin = 0.0

    return {
        "n": n,
        "mean": round(mean, 6),
        "std": round(std, 6),
        "min": round(float(np.min(arr)), 6),
        "max": round(float(np.max(arr)), 6),
        "ci_lower": round(mean - margin, 6),
        "ci_upper": round(mean + margin, 6),
        "values": [round(float(v), 6) for v in arr],
    }


def mann_whitney_test(values_a: list[float], values_b: list[float]) -> dict:
    """Run Mann-Whitney U test comparing two sets of metric values.

    Returns dict with statistic, p_value, and verdict.
    """
    if len(values_a) < 2 or len(values_b) < 2:
        return {
            "statistic": None,
            "p_value": None,
            "verdict": "insufficient_data",
            "detail": f"Need at least 2 values per group (got {len(values_a)} and {len(values_b)})",
        }

    from scipy import stats as scipy_stats
    stat, p_value = scipy_stats.mannwhitneyu(
        values_a, values_b, alternative="two-sided",
    )

    if p_value < 0.05:
        mean_a = np.mean(values_a)
        mean_b = np.mean(values_b)
        verdict = "significantly_different"
        detail = f"p={p_value:.4f} < 0.05 — the difference is statistically significant"
    else:
        verdict = "not_significant"
        detail = f"p={p_value:.4f} >= 0.05 — the difference could be random noise"

    return {
        "statistic": round(float(stat), 4),
        "p_value": round(float(p_value), 6),
        "verdict": verdict,
        "detail": detail,
    }


def format_comparison_report(
    stats_a: dict,
    stats_b: dict,
    test_result: dict,
    label_a: str,
    label_b: str,
    metric_name: str,
    lower_is_better: bool,
) -> str:
    """Format a statistical comparison report."""
    direction = "lower" if lower_is_better else "higher"
    lines = [
        f"# Statistical Comparison: {metric_name} ({direction} is better)",
        "",
        f"| Statistic | {label_a} | {label_b} |",
        f"|-----------|{'---' * len(label_a)}--|{'---' * len(label_b)}--|",
        f"| N runs | {stats_a['n']} | {stats_b['n']} |",
        f"| Mean | {stats_a['mean']:.4f} | {stats_b['mean']:.4f} |",
        f"| Std | {stats_a['std']:.4f} | {stats_b['std']:.4f} |",
        f"| Min | {stats_a['min']:.4f} | {stats_b['min']:.4f} |",
        f"| Max | {stats_a['max']:.4f} | {stats_b['max']:.4f} |",
        f"| 95% CI | [{stats_a['ci_lower']:.4f}, {stats_a['ci_upper']:.4f}] | [{stats_b['ci_lower']:.4f}, {stats_b['ci_upper']:.4f}] |",
        "",
        "## Mann-Whitney U Test",
        "",
        f"- **Verdict:** {test_result['verdict']}",
        f"- **Detail:** {test_result['detail']}",
    ]

    if test_result["statistic"] is not None:
        lines.append(f"- **U statistic:** {test_result['statistic']}")
        lines.append(f"- **p-value:** {test_result['p_value']}")

    lines.extend(["", "## Recommendation", ""])

    if test_result["verdict"] == "significantly_different":
        mean_a = stats_a["mean"]
        mean_b = stats_b["mean"]
        if lower_is_better:
            better = label_a if mean_a < mean_b else label_b
        else:
            better = label_a if mean_a > mean_b else label_b
        lines.append(f"**{better}** is statistically better. Safe to keep.")
    elif test_result["verdict"] == "not_significant":
        lines.append("The difference is not statistically significant. Consider:")
        lines.append("- Running more trials (increase n_runs)")
        lines.append("- Keeping the simpler/faster configuration")
        lines.append("- Treating this as a tie and moving to a different hypothesis")
    else:
        lines.append("Insufficient data for statistical comparison. Run more trials.")

    return "\n".join(lines)


def main() -> None:
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Multi-run statistical comparison for ML experiments"
    )
    parser.add_argument("--n-runs", type=int, default=3, help="Number of runs per configuration")
    parser.add_argument("--config", default="config.yaml", help="Config file path")
    parser.add_argument("--seed", type=int, default=42, help="Base random seed")
    args = parser.parse_args()

    print(f"Running {args.n_runs} trials with seeds {args.seed} to {args.seed + args.n_runs - 1}...", file=sys.stderr)
    results = run_multi_seed(args.n_runs, args.config, args.seed)

    if not results:
        print("No successful runs.", file=sys.stderr)
        sys.exit(1)

    # Load config for metric info
    config = {}
    if Path(args.config).exists():
        with open(args.config) as f:
            config = yaml.safe_load(f) or {}

    eval_cfg = config.get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")

    stats = compute_statistics(results, metric)
    print(f"\n{metric} over {stats['n']} runs:")
    print(f" Mean: {stats['mean']:.4f}")
    print(f" Std: {stats['std']:.4f}")
    print(f" 95% CI: [{stats['ci_lower']:.4f}, {stats['ci_upper']:.4f}]")
    print(f" Values: {stats['values']}")


if __name__ == "__main__":
    main()
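For intuition on the verdicts above, here is the same scipy call that mann_whitney_test wraps, run on two invented five-seed samples; more overlap between the samples, or fewer runs, would push p above 0.05:

```python
# Sketch: two invented metric samples; scipy.stats.mannwhitneyu is the
# same call the script makes.
from scipy import stats

baseline = [0.842, 0.851, 0.847, 0.839, 0.845]
candidate = [0.868, 0.874, 0.861, 0.870, 0.866]

stat, p = stats.mannwhitneyu(baseline, candidate, alternative="two-sided")
print(f"U={stat}, p={p:.4f}")  # p ~ 0.008 here, so "significantly_different"
```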
package/templates/scripts/stop-hook.sh
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Convergence detection hook for the autoresearch pipeline.
#
# Fired by Claude Code Stop hook after each training iteration.
# Thin wrapper around scripts/check_convergence.py — the actual
# algorithm lives in testable Python, not inline bash.
#
# This implements a discrete analogue of early stopping from gradient
# descent, adapted for the experiment loop context.
#
# Exit codes:
#   0 = not converged, agent should continue
#   2 = converged, agent should stop (signals /loop to halt)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ML_DIR="$(dirname "$SCRIPT_DIR")"
EXPERIMENT_LOG="${ML_DIR}/experiments/log.jsonl"
CONFIG_FILE="${ML_DIR}/config.yaml"

# Check if log.jsonl exists
if [[ ! -f "$EXPERIMENT_LOG" ]]; then
  echo "stop-hook: No log.jsonl found, not enough data to judge." >&2
  exit 0
fi

# Activate venv and delegate to Python module
cd "$ML_DIR"
source .venv/bin/activate 2>/dev/null || true

python3 scripts/check_convergence.py \
  --config "$CONFIG_FILE" \
  --log "$EXPERIMENT_LOG"