claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
package/templates/scripts/show_families.py
@@ -0,0 +1,133 @@
+ #!/usr/bin/env python3
+ """Experiment family viewer for the autoresearch pipeline.
+
+ Groups experiments by strategic theme (family tag) and shows
+ per-family performance summaries. Tells the human when an entire
+ research direction is exhausted.
+
+ Usage:
+     python scripts/show_families.py [--log experiments/log.jsonl] [--config config.yaml]
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ import yaml
+
+ from scripts.turing_io import load_experiments
+
+
+ def group_by_family(experiments: list[dict]) -> dict[str, list[dict]]:
+     """Group experiments by family tag."""
+     families: dict[str, list[dict]] = {}
+     for exp in experiments:
+         family = exp.get("family") or "untagged"
+         families.setdefault(family, []).append(exp)
+     return families
+
+
+ def family_stats(
+     experiments: list[dict],
+     metric: str,
+     lower_is_better: bool,
+ ) -> dict:
+     """Compute stats for a family of experiments."""
+     total = len(experiments)
+     kept = [e for e in experiments if e.get("status") == "kept"]
+     discarded = [e for e in experiments if e.get("status") == "discarded"]
+
+     metric_vals = [
+         e.get("metrics", {}).get(metric)
+         for e in kept
+         if e.get("metrics", {}).get(metric) is not None
+     ]
+
+     best_val = None
+     best_id = None
+     if metric_vals:
+         if lower_is_better:
+             best_val = min(metric_vals)
+         else:
+             best_val = max(metric_vals)
+         for e in kept:
+             if e.get("metrics", {}).get(metric) == best_val:
+                 best_id = e.get("experiment_id")
+                 break
+
+     # Compute recent trend (last 3 experiments)
+     recent = experiments[-3:] if len(experiments) >= 3 else experiments
+     recent_kept = sum(1 for e in recent if e.get("status") == "kept")
+     recent_rate = recent_kept / len(recent) if recent else 0
+
+     return {
+         "total": total,
+         "kept": len(kept),
+         "discarded": len(discarded),
+         "keep_rate": round(len(kept) / total, 2) if total > 0 else 0,
+         "best_metric": best_val,
+         "best_experiment": best_id,
+         "recent_keep_rate": round(recent_rate, 2),
+         "exhausted": total >= 3 and recent_rate == 0,
+     }
+
+
+ def format_families(
+     families: dict[str, dict],
+     metric_name: str,
+ ) -> str:
+     """Format family summaries as a table."""
+     if not families:
+         return "No experiments logged yet."
+
+     lines = [
+         f"{'Family':<25} {'Total':>6} {'Kept':>6} {'Rate':>6} {'Best ' + metric_name:>15} {'Status':<12}",
+         "-" * 80,
+     ]
+
+     for name, stats in sorted(families.items(), key=lambda x: -(x[1].get("best_metric") or 0)):
+         best_str = f"{stats['best_metric']:.4f}" if stats["best_metric"] is not None else "N/A"
+         status = "EXHAUSTED" if stats["exhausted"] else "active"
+         line = f"{name:<25} {stats['total']:>6} {stats['kept']:>6} {stats['keep_rate']:>5.0%} {best_str:>15} {status:<12}"
+         lines.append(line)
+
+     # Summary
+     total_exps = sum(s["total"] for s in families.values())
+     exhausted = sum(1 for s in families.values() if s["exhausted"])
+     lines.extend([
+         "",
+         f"Total: {total_exps} experiments across {len(families)} families ({exhausted} exhausted)",
+     ])
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Show experiment families")
+     parser.add_argument("--log", default="experiments/log.jsonl")
+     parser.add_argument("--config", default="config.yaml")
+     args = parser.parse_args()
+
+     config = {}
+     if Path(args.config).exists():
+         with open(args.config) as f:
+             config = yaml.safe_load(f) or {}
+
+     eval_cfg = config.get("evaluation", {})
+     metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     experiments = load_experiments(args.log)
+     families = group_by_family(experiments)
+
+     family_data = {name: family_stats(exps, metric, lower_is_better) for name, exps in families.items()}
+
+     print(format_families(family_data, metric))
+
+
+ if __name__ == "__main__":
+     main()
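
The viewer above keys on only a handful of fields per log record. A minimal sketch of what it expects from experiments/log.jsonl, with field names inferred from the accessors in the diff (the real schema is written by log_experiment.py, which is not shown here):

    # Hypothetical records; the field names come from the accessors above
    # (experiment_id, family, status, metrics), not from the package docs.
    records = [
        {"experiment_id": "exp-001", "family": "feature-eng", "status": "kept",
         "metrics": {"accuracy": 0.87}},
        {"experiment_id": "exp-002", "family": "feature-eng", "status": "discarded",
         "metrics": {"accuracy": 0.85}},
        {"experiment_id": "exp-003", "status": "discarded", "metrics": {}},
    ]

    # Same rule as group_by_family(): records without a "family" key
    # fall into the "untagged" bucket.
    families: dict[str, list[dict]] = {}
    for exp in records:
        families.setdefault(exp.get("family") or "untagged", []).append(exp)

    print(sorted(families))              # ['feature-eng', 'untagged']
    print(len(families["feature-eng"]))  # 2

A family is flagged EXHAUSTED once it holds at least three experiments and none of its last three were kept, which is the signal to abandon that research direction.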
package/templates/scripts/show_metrics.py
@@ -0,0 +1,157 @@
+ """Display experiment metrics from experiments/log.jsonl.
+
+ Reads the JSONL experiment log and prints a formatted table,
+ highlighting the current best by primary metric. This is the
+ agent's primary observation tool at the start of each loop iteration.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+
+
+ def find_best(experiments: list[dict], primary_metric: str, lower_is_better: bool) -> str | None:
+     """Find experiment_id with best primary metric among 'keep' entries."""
+     best_id = None
+     best_value = float("inf") if lower_is_better else float("-inf")
+
+     for exp in experiments:
+         if exp.get("status") != "kept":
+             continue
+         value = exp.get("metrics", {}).get(primary_metric)
+         if value is None:
+             continue
+         if lower_is_better and value < best_value:
+             best_value = value
+             best_id = exp["experiment_id"]
+         elif not lower_is_better and value > best_value:
+             best_value = value
+             best_id = exp["experiment_id"]
+     return best_id
+
+
+ def format_table(experiments: list[dict], best_id: str | None, metric_names: list[str]) -> str:
+     """Format experiments as a text table."""
+     if not experiments:
+         return "No experiments logged yet."
+
+     # Build dynamic header based on configured metrics
+     metric_headers = "".join(f"{m:>12}" for m in metric_names)
+     header = f"{'ID':<10} {'Status':<10} {'Model':<15}{metric_headers} {'Timestamp':<22}"
+     sep = "-" * len(header)
+     lines = [header, sep]
+
+     for exp in experiments:
+         metrics = exp.get("metrics", {})
+         model_type = exp.get("config", {}).get("model_type", "unknown")
+
+         metric_values = ""
+         for m in metric_names:
+             val = metrics.get(m)
+             if isinstance(val, (int, float)):
+                 metric_values += f"{val:>12.4f}"
+             else:
+                 metric_values += f"{'N/A':>12}"
+
+         ts = exp.get("timestamp", "")[:19]
+         marker = " *BEST*" if exp.get("experiment_id") == best_id else ""
+         line = f"{exp.get('experiment_id', '?'):<10} {exp.get('status', '?'):<10} {model_type:<15}{metric_values} {ts}{marker}"
+         lines.append(line)
+
+     return "\n".join(lines)
+
+
+ def get_experiment_diffs(experiments: list[dict], max_diffs: int = 3) -> str:
+     """Get git diffs for recent discarded experiments."""
+     import subprocess
+
+     discarded = [e for e in experiments if e.get("status") == "discarded"]
+     if not discarded:
+         return ""
+
+     recent = discarded[-max_diffs:]
+     lines = ["\n--- Recent Failed Experiment Diffs ---\n"]
+
+     for exp in recent:
+         exp_id = exp.get("experiment_id", "unknown")
+         description = exp.get("description", "no description")
+         lines.append(f"=== {exp_id}: {description} ===")
+
+         # Try to find the experiment branch
+         branch = f"exp/{exp_id.replace('exp-', '')}"
+         try:
+             result = subprocess.run(
+                 ["git", "diff", f"main...{branch}", "--", "train.py", "config.yaml"],
+                 capture_output=True,
+                 text=True,
+                 timeout=10,
+             )
+             if result.returncode == 0 and result.stdout.strip():
+                 # Truncate long diffs
+                 diff_lines = result.stdout.strip().splitlines()
+                 if len(diff_lines) > 30:
+                     diff_lines = diff_lines[:30] + [f"... ({len(diff_lines) - 30} more lines)"]
+                 lines.append("\n".join(diff_lines))
+             else:
+                 lines.append(" (branch not found or no diff)")
+         except (subprocess.TimeoutExpired, FileNotFoundError):
+             lines.append(" (git not available)")
+
+         lines.append("")
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Show experiment metrics")
+     parser.add_argument(
+         "--log",
+         default=DEFAULT_LOG_PATH,
+         help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
+     )
+     parser.add_argument(
+         "--last",
+         type=int,
+         default=None,
+         help="Show only last N experiments",
+     )
+     parser.add_argument(
+         "--with-diffs",
+         action="store_true",
+         help="Include git diffs for discarded experiments",
+     )
+     args = parser.parse_args()
+
+     config = load_config()
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     metric_names = eval_cfg.get("metrics", ["accuracy", "f1_weighted"])
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+
+     experiments = load_experiments(args.log)
+
+     if args.last and args.last > 0:
+         experiments = experiments[-args.last:]
+
+     best_id = find_best(experiments, primary_metric, lower_is_better)
+     print(format_table(experiments, best_id, metric_names))
+
+     if args.with_diffs:
+         all_experiments = load_experiments(args.log)
+         diffs = get_experiment_diffs(all_experiments)
+         if diffs:
+             print(diffs)
+
+
+ if __name__ == "__main__":
+     main()
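
For context, get_experiment_diffs() assumes each experiment was developed on a git branch derived from its ID. A minimal sketch of that naming convention and the command the function shells out to (the exp-003 ID is illustrative, not taken from the package):

    # Illustrative only: the ID is hypothetical; the "main" base branch and
    # the file filter mirror the subprocess call in get_experiment_diffs().
    exp_id = "exp-003"
    branch = f"exp/{exp_id.replace('exp-', '')}"  # -> "exp/003"
    cmd = ["git", "diff", f"main...{branch}", "--", "train.py", "config.yaml"]
    print(" ".join(cmd))  # git diff main...exp/003 -- train.py config.yaml

Each retrieved diff is truncated to its first 30 lines, so a verbose failed experiment cannot crowd out the metrics table that precedes it.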
package/templates/scripts/statistical_compare.py
@@ -0,0 +1,259 @@
+ #!/usr/bin/env python3
+ """Multi-run statistical comparison for the autoresearch pipeline.
+
+ Runs the same configuration N times with different random seeds and
+ compares metric distributions rather than point estimates. This prevents
+ keep/discard decisions based on noise.
+
+ A model that scored 0.87 vs 0.86 on a single run might just be seed
+ variance. This script runs both configurations multiple times and
+ uses the Mann-Whitney U test to determine if the difference is real.
+
+ Usage:
+     python scripts/statistical_compare.py --n-runs 3 [--config config.yaml]
+     python scripts/statistical_compare.py --compare <log1.jsonl> <log2.jsonl>
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import subprocess
+ import sys
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+
+ def run_multi_seed(
+     n_runs: int,
+     config_path: str = "config.yaml",
+     base_seed: int = 42,
+ ) -> list[dict]:
+     """Run training N times with different seeds, collect metrics.
+
+     Args:
+         n_runs: Number of runs to execute.
+         config_path: Path to config.yaml.
+         base_seed: Starting seed (incremented for each run).
+
+     Returns:
+         List of metric dicts from each run.
+     """
+     results = []
+     for i in range(n_runs):
+         seed = base_seed + i
+         print(f" Run {i+1}/{n_runs} (seed={seed})...", file=sys.stderr, end=" ")
+
+         cmd = f"python train.py --config {config_path} --seed {seed}"
+         proc = subprocess.run(
+             cmd, shell=True, capture_output=True, text=True, timeout=600,
+         )
+
+         if proc.returncode != 0:
+             print(f"FAILED", file=sys.stderr)
+             continue
+
+         # Parse metrics from output (between --- delimiters)
+         metrics = parse_metrics_from_output(proc.stdout)
+         if metrics:
+             metrics["seed"] = seed
+             results.append(metrics)
+             primary = next((v for k, v in metrics.items() if k not in ("seed", "model_type", "train_seconds")), None)
+             print(f"done ({primary})", file=sys.stderr)
+         else:
+             print(f"no metrics parsed", file=sys.stderr)
+
+     return results
+
+
+ def parse_metrics_from_output(output: str) -> dict:
+     """Parse metrics from the --- delimited format in train.py output."""
+     lines = output.strip().split("\n")
+     in_block = False
+     metrics = {}
+     metadata_keys = {"model_type", "train_seconds"}
+
+     for line in lines:
+         line = line.strip()
+         if line == "---":
+             if in_block:
+                 break # end of block
+             in_block = True
+             continue
+         if in_block and ":" in line:
+             key, value = line.split(":", 1)
+             key = key.strip()
+             value = value.strip()
+             if key in metadata_keys:
+                 metrics[key] = value
+             else:
+                 try:
+                     metrics[key] = float(value)
+                 except ValueError:
+                     metrics[key] = value
+
+     return metrics
+
+
+ def compute_statistics(results: list[dict], metric_name: str) -> dict:
+     """Compute summary statistics for a metric across runs.
+
+     Returns dict with mean, std, min, max, ci_lower, ci_upper (95%).
+     """
+     values = [r[metric_name] for r in results if metric_name in r and isinstance(r[metric_name], (int, float))]
+
+     if not values:
+         return {"n": 0, "mean": None, "std": None}
+
+     arr = np.array(values)
+     n = len(arr)
+     mean = float(np.mean(arr))
+     std = float(np.std(arr, ddof=1)) if n > 1 else 0.0
+
+     # 95% confidence interval (t-distribution approximation for small n)
+     if n > 1:
+         from scipy import stats as scipy_stats
+         t_crit = scipy_stats.t.ppf(0.975, df=n - 1)
+         margin = t_crit * std / np.sqrt(n)
+     else:
+         margin = 0.0
+
+     return {
+         "n": n,
+         "mean": round(mean, 6),
+         "std": round(std, 6),
+         "min": round(float(np.min(arr)), 6),
+         "max": round(float(np.max(arr)), 6),
+         "ci_lower": round(mean - margin, 6),
+         "ci_upper": round(mean + margin, 6),
+         "values": [round(float(v), 6) for v in arr],
+     }
+
+
+ def mann_whitney_test(values_a: list[float], values_b: list[float]) -> dict:
+     """Run Mann-Whitney U test comparing two sets of metric values.
+
+     Returns dict with statistic, p_value, and verdict.
+     """
+     if len(values_a) < 2 or len(values_b) < 2:
+         return {
+             "statistic": None,
+             "p_value": None,
+             "verdict": "insufficient_data",
+             "detail": f"Need at least 2 values per group (got {len(values_a)} and {len(values_b)})",
+         }
+
+     from scipy import stats as scipy_stats
+     stat, p_value = scipy_stats.mannwhitneyu(
+         values_a, values_b, alternative="two-sided",
+     )
+
+     if p_value < 0.05:
+         mean_a = np.mean(values_a)
+         mean_b = np.mean(values_b)
+         verdict = "significantly_different"
+         detail = f"p={p_value:.4f} < 0.05 — the difference is statistically significant"
+     else:
+         verdict = "not_significant"
+         detail = f"p={p_value:.4f} >= 0.05 — the difference could be random noise"
+
+     return {
+         "statistic": round(float(stat), 4),
+         "p_value": round(float(p_value), 6),
+         "verdict": verdict,
+         "detail": detail,
+     }
+
+
+ def format_comparison_report(
+     stats_a: dict,
+     stats_b: dict,
+     test_result: dict,
+     label_a: str,
+     label_b: str,
+     metric_name: str,
+     lower_is_better: bool,
+ ) -> str:
+     """Format a statistical comparison report."""
+     direction = "lower" if lower_is_better else "higher"
+     lines = [
+         f"# Statistical Comparison: {metric_name} ({direction} is better)",
+         "",
+         f"| Statistic | {label_a} | {label_b} |",
+         f"|-----------|{'---' * len(label_a)}--|{'---' * len(label_b)}--|",
+         f"| N runs | {stats_a['n']} | {stats_b['n']} |",
+         f"| Mean | {stats_a['mean']:.4f} | {stats_b['mean']:.4f} |",
+         f"| Std | {stats_a['std']:.4f} | {stats_b['std']:.4f} |",
+         f"| Min | {stats_a['min']:.4f} | {stats_b['min']:.4f} |",
+         f"| Max | {stats_a['max']:.4f} | {stats_b['max']:.4f} |",
+         f"| 95% CI | [{stats_a['ci_lower']:.4f}, {stats_a['ci_upper']:.4f}] | [{stats_b['ci_lower']:.4f}, {stats_b['ci_upper']:.4f}] |",
+         "",
+         "## Mann-Whitney U Test",
+         "",
+         f"- **Verdict:** {test_result['verdict']}",
+         f"- **Detail:** {test_result['detail']}",
+     ]
+
+     if test_result["statistic"] is not None:
+         lines.append(f"- **U statistic:** {test_result['statistic']}")
+         lines.append(f"- **p-value:** {test_result['p_value']}")
+
+     lines.extend(["", "## Recommendation", ""])
+
+     if test_result["verdict"] == "significantly_different":
+         mean_a = stats_a["mean"]
+         mean_b = stats_b["mean"]
+         if lower_is_better:
+             better = label_a if mean_a < mean_b else label_b
+         else:
+             better = label_a if mean_a > mean_b else label_b
+         lines.append(f"**{better}** is statistically better. Safe to keep.")
+     elif test_result["verdict"] == "not_significant":
+         lines.append("The difference is not statistically significant. Consider:")
+         lines.append("- Running more trials (increase n_runs)")
+         lines.append("- Keeping the simpler/faster configuration")
+         lines.append("- Treating this as a tie and moving to a different hypothesis")
+     else:
+         lines.append("Insufficient data for statistical comparison. Run more trials.")
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Multi-run statistical comparison for ML experiments"
+     )
+     parser.add_argument("--n-runs", type=int, default=3, help="Number of runs per configuration")
+     parser.add_argument("--config", default="config.yaml", help="Config file path")
+     parser.add_argument("--seed", type=int, default=42, help="Base random seed")
+     args = parser.parse_args()
+
+     print(f"Running {args.n_runs} trials with seeds {args.seed} to {args.seed + args.n_runs - 1}...", file=sys.stderr)
+     results = run_multi_seed(args.n_runs, args.config, args.seed)
+
+     if not results:
+         print("No successful runs.", file=sys.stderr)
+         sys.exit(1)
+
+     # Load config for metric info
+     config = {}
+     if Path(args.config).exists():
+         with open(args.config) as f:
+             config = yaml.safe_load(f) or {}
+
+     eval_cfg = config.get("evaluation", {})
+     metric = eval_cfg.get("primary_metric", "accuracy")
+
+     stats = compute_statistics(results, metric)
+     print(f"\n{metric} over {stats['n']} runs:")
+     print(f" Mean: {stats['mean']:.4f}")
+     print(f" Std: {stats['std']:.4f}")
+     print(f" 95% CI: [{stats['ci_lower']:.4f}, {stats['ci_upper']:.4f}]")
+     print(f" Values: {stats['values']}")
+
+
+ if __name__ == "__main__":
+     main()
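
As a standalone illustration of the decision rule mann_whitney_test() implements, here is a minimal sketch with made-up per-seed accuracies for two configurations; the values are hypothetical, and only the scipy call and the 0.05 threshold come from the code above:

    from scipy import stats

    baseline = [0.861, 0.858, 0.866]   # hypothetical accuracies over 3 seeds
    candidate = [0.872, 0.869, 0.875]  # hypothetical accuracies over 3 seeds

    stat, p_value = stats.mannwhitneyu(baseline, candidate, alternative="two-sided")
    print(f"U={stat}, p={p_value:.4f}")  # U=0.0, p=0.1000

With three runs per side, the smallest two-sided p-value the exact test can produce is 0.10, so even a perfectly consistent gap like this one lands in the "not_significant" bucket; raising --n-runs to 4 or 5 is what makes a "significantly_different" verdict reachable at the 0.05 threshold.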
package/templates/scripts/stop-hook.sh
@@ -0,0 +1,34 @@
+ #!/usr/bin/env bash
+ # Convergence detection hook for the autoresearch pipeline.
+ #
+ # Fired by Claude Code Stop hook after each training iteration.
+ # Thin wrapper around scripts/check_convergence.py — the actual
+ # algorithm lives in testable Python, not inline bash.
+ #
+ # This implements a discrete analogue of early stopping from gradient
+ # descent, adapted for the experiment loop context.
+ #
+ # Exit codes:
+ #   0 = not converged, agent should continue
+ #   2 = converged, agent should stop (signals /loop to halt)
+
+ set -euo pipefail
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ML_DIR="$(dirname "$SCRIPT_DIR")"
+ EXPERIMENT_LOG="${ML_DIR}/experiments/log.jsonl"
+ CONFIG_FILE="${ML_DIR}/config.yaml"
+
+ # Check if log.jsonl exists
+ if [[ ! -f "$EXPERIMENT_LOG" ]]; then
+     echo "stop-hook: No log.jsonl found, not enough data to judge." >&2
+     exit 0
+ fi
+
+ # Activate venv and delegate to Python module
+ cd "$ML_DIR"
+ source .venv/bin/activate 2>/dev/null || true
+
+ python3 scripts/check_convergence.py \
+     --config "$CONFIG_FILE" \
+     --log "$EXPERIMENT_LOG"
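
A minimal sketch of consuming the exit-code contract documented in the hook's header (0 = keep iterating, 2 = converged). The relative path is an assumption about where the hook sits in a scaffolded project; only the 0/2 convention comes from the comments above:

    import subprocess

    # Invoke the hook and branch on its documented exit codes; no check=True,
    # because an exit code of 2 is a signal here, not an error.
    proc = subprocess.run(["bash", "scripts/stop-hook.sh"])
    if proc.returncode == 2:
        print("Converged: halt the experiment loop.")
    elif proc.returncode == 0:
        print("Not converged: keep iterating.")
    else:
        print(f"Hook error (exit code {proc.returncode}).")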