claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
@@ -0,0 +1,216 @@
1
+ # Autoresearch: {{PROJECT_NAME}} Model Training
2
+
3
+ *"An experiment is a question which science poses to Nature, and a measurement is the recording of Nature's answer."*
4
+
5
+ ## Goal
6
+
7
+ {{TASK_DESCRIPTION}}
8
+
9
+ **Primary metric:** {{TARGET_METRIC}} ({{METRIC_DIRECTION}} is better)
10
+ **Secondary metrics:** as configured under `evaluation.metrics` in `config.yaml`
11
+
12
+ ## The Fundamental Constraint
13
+
14
+ **You modify `train.py` and `config.yaml`. You do NOT modify `prepare.py` or `evaluate.py`. Ever.**
15
+
16
+ This separation is not a convention — it is the architectural invariant that makes your results comparable. If you could change evaluation between experiments, no comparison would be valid. The measurement apparatus is sacred.
17
+
18
+ | Layer | Files | Your Access |
19
+ |-------|-------|-------------|
20
+ | Hidden | `evaluate.py` | NONE — do not read, reference, or access |
21
+ | Measurement | `prepare.py` | READ-ONLY |
22
+ | Hypothesis | `train.py`, `config.yaml` | READ-WRITE |
23
+ | Features | `features/featurizers.py` | READ-ONLY (modify how `train.py` uses it) |
24
+
25
+ ## Configuration
26
+
27
+ All hyperparameters live in `config.yaml`. Edit it for parameter changes — do NOT hardcode values in `train.py`.
28
+
29
+ Key sections:
30
+ - `model.type` — model framework (xgboost, lightgbm, etc.)
31
+ - `model.hyperparams` — all model hyperparameters
32
+ - `convergence.patience` — consecutive non-improvements before stopping
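+ 
+ As an illustration only (the shipped `config.yaml` template is the authoritative layout), `train.py` might read these keys rather than hardcoding values:
+ 
+ ```python
+ import yaml
+ 
+ # Sketch: pull hyperparameters from config.yaml instead of hardcoding them in train.py.
+ with open("config.yaml") as f:
+     cfg = yaml.safe_load(f)
+ 
+ model_type = cfg["model"]["type"]          # e.g. "xgboost" or "lightgbm"
+ hyperparams = cfg["model"]["hyperparams"]  # passed straight to the model constructor
+ patience = cfg["convergence"]["patience"]  # consecutive non-improvements before stopping
+ ```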
33
+
34
+ ## Branches
35
+
36
+ Create per-experiment branches to preserve all code variants:
37
+ ```
38
+ git checkout -b exp/NNN-description
39
+ # ... make changes, run experiment ...
40
+ # If improved: git checkout main && git merge exp/NNN-description
41
+ # If not improved: git checkout main (branch preserved)
42
+ ```
43
+
44
+ ## Memory
45
+
46
+ Read `.claude/agent-memory/ml-researcher/MEMORY.md` at the start of each session.
47
+ Update it after each experiment with:
48
+ - Best result (if improved)
49
+ - What was tried and why
50
+ - What worked / what failed
51
+ - Promising next directions
52
+
53
+ ## Sweep
54
+
55
+ For systematic hyperparameter search:
56
+ 1. Edit `sweep_config.yaml` with parameter ranges
57
+ 2. Generate queue: `python scripts/sweep.py`
58
+ 3. Check status: `python scripts/sweep.py --status`
59
+ 4. Get next: `python scripts/sweep.py --next`
60
+ 5. Apply overrides, create branch, run training
61
+ 6. Mark done: `python scripts/sweep.py --mark <name> complete|failed`
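+ 
+ Conceptually, the queue generation in step 2 is a Cartesian product over the parameter ranges. A minimal sketch, assuming `sweep_config.yaml` holds a `params` mapping of name to candidate values (the shipped template defines the real schema, and `scripts/sweep.py` also tracks queue status):
+ 
+ ```python
+ import itertools
+ 
+ import yaml
+ 
+ # Expand a hypothetical {"params": {"max_depth": [4, 8], "learning_rate": [0.1, 0.01]}} grid.
+ with open("sweep_config.yaml") as f:
+     grid = yaml.safe_load(f).get("params", {})
+ 
+ names = sorted(grid)
+ for values in itertools.product(*(grid[name] for name in names)):
+     print(dict(zip(names, values)))  # each combination becomes one queued experiment
+ ```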
62
+
63
+ ## THE LOOP
64
+
65
+ The autoresearch experiment loop. Each iteration is one experiment — one hypothesis tested.
66
+
67
+ 1. **OBSERVE** — Read recent results, check hypothesis queue, research plan, and review failed diffs:
68
+ ```bash
69
+ python scripts/show_metrics.py --last 5
70
+ python scripts/manage_hypotheses.py next 2>/dev/null || echo "No queued hypotheses"
71
+ cat RESEARCH_PLAN.md 2>/dev/null || true
72
+ ```
73
+
74
+ If `RESEARCH_PLAN.md` exists, use it for strategic direction (which model families to explore, in what order, what budget). The plan is advisory — deviate if evidence warrants, but note why.
75
+
76
+ For the most recent discarded experiments, read the actual git diff to understand what was tried and failed — do NOT rely on your own memory of what you changed:
77
+ ```bash
78
+ # Show diffs from recent discarded experiment branches
79
+ for branch in $(git branch --list 'exp/*' --sort=-committerdate --format='%(refname:short)' | head -3); do
80
+ echo "=== $branch ==="
81
+ git diff main...$branch -- train.py config.yaml 2>/dev/null | head -40
82
+ done
83
+ ```
84
+
85
+ 2. **HYPOTHESIZE** — Check the queue first. If a queued hypothesis exists (especially human-injected, high priority), use it. Otherwise, generate your own and **register it in the queue before executing**:
86
+
87
+ **If using a queued hypothesis:**
88
+ ```bash
89
+ python scripts/manage_hypotheses.py mark hyp-NNN in-progress
90
+ ```
91
+
92
+ **If generating your own hypothesis**, register it with structured detail:
93
+ ```bash
94
+ python scripts/manage_hypotheses.py add "your hypothesis description" \
95
+ --priority medium --source agent \
96
+ --model-type xgboost \
97
+ --hyperparams '{"max_depth": 8, "n_estimators": 200}' \
98
+ --family optimizer-sweep \
99
+ --tags "depth,estimators" \
100
+ --parent exp-NNN \
101
+ --expected "deeper trees should capture feature interactions"
102
+ python scripts/manage_hypotheses.py mark hyp-NNN in-progress
103
+ ```
104
+
105
+ This creates both an index entry in `hypotheses.yaml` and a detailed file at `hypotheses/hyp-NNN.yaml` with full architecture, hyperparameters, expected outcome, and lineage.
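+ 
+ The exact schema is owned by `manage_hypotheses.py`, but the detail file is plain YAML, so a sketch of inspecting one programmatically (file name and field names are assumptions mirroring the CLI flags above) would be:
+ 
+ ```python
+ import yaml
+ 
+ # Hypothetical: load a hypothesis detail file and print fields implied by the CLI flags.
+ with open("hypotheses/hyp-001.yaml") as f:
+     hyp = yaml.safe_load(f)
+ 
+ print(hyp.get("model_type"), hyp.get("hyperparams"))
+ print(hyp.get("family"), hyp.get("parent"), hyp.get("expected"))
+ ```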
106
+
107
+ Every experiment must have a corresponding hypothesis in the queue. This ensures the hypothesis database is a complete record of every idea — human and agent alike.
108
+
109
+ To read a hypothesis's full detail:
110
+ ```bash
111
+ python scripts/manage_hypotheses.py show hyp-NNN
112
+ ```
113
+
114
+ 3. **PREPARE** — Modify `config.yaml` for hyperparameter changes. Only modify `train.py` for structural code changes.
115
+
116
+ 4. **COMMIT** the experiment:
117
+ ```bash
118
+ git commit -am "exp: {description}"
119
+ ```
120
+
121
+ 5. **EXECUTE** training:
122
+ ```bash
123
+ source .venv/bin/activate && python train.py > run.log 2>&1
124
+ ```
125
+
126
+ 6. **MEASURE** — Parse metrics from run.log:
127
+ ```bash
128
+ grep -A 10 "^---" run.log | head -10
129
+ ```
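+ 
+ The shipped `scripts/parse_metrics.py` and `scripts/show_metrics.py` are the canonical parsers; as a sketch of the idea, assuming `train.py` prints `name: value` pairs after a `---` delimiter (which is what the grep above relies on):
+ 
+ ```python
+ import re
+ 
+ # Collect numeric "name: value" pairs that follow the last "---" delimiter in run.log.
+ with open("run.log") as f:
+     tail = f.read().rsplit("---", 1)[-1]
+ 
+ metrics = {
+     match.group(1): float(match.group(2))
+     for match in re.finditer(r"^(\w[\w.]*):\s*(-?\d+(?:\.\d+)?)\s*$", tail, re.MULTILINE)
+ }
+ print(metrics)
+ ```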
130
+
131
+ 7. **DECIDE:**
132
+
133
+ **If improved** over current best:
134
+ - Keep the commit
135
+ - Copy model: `cp models/model.joblib models/best/model.joblib`
136
+ - Update `models/best/metadata.json`
137
+
138
+ **If NOT improved:**
139
+ ```bash
140
+ git reset --hard HEAD~1
141
+ ```
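+ 
+ "Improved" should respect the metric direction; a sketch mirroring the relative-improvement logic in `scripts/check_convergence.py`:
+ 
+ ```python
+ def improved(current: float, best: float, lower_is_better: bool, threshold: float = 0.0) -> bool:
+     """True when `current` beats `best` by at least `threshold` relative improvement."""
+     if best == 0:
+         return current != 0
+     gain = (best - current) if lower_is_better else (current - best)
+     return gain / abs(best) >= threshold
+ ```
+ 
+ Passing `convergence.improvement_threshold` from `config.yaml` as `threshold` keeps the keep/discard decision consistent with the convergence check.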
142
+
143
+ 8. **RECORD** — Log the experiment (kept or discarded):
144
+ ```bash
145
+ python scripts/log_experiment.py experiments/log.jsonl exp-NNN kept|discarded \
146
+ '{"{{TARGET_METRIC}}": X.XX, ...}' \
147
+ '{"model_type": "xgboost", "hyperparams": {...}}' \
148
+ models/model.joblib "Description of hypothesis and outcome"
149
+ ```
150
+
151
+ Update the hypothesis status with result metrics:
152
+ ```bash
153
+ python scripts/manage_hypotheses.py mark hyp-NNN tested \
154
+ --result exp-NNN \
155
+ --metrics '{"{{TARGET_METRIC}}": X.XX, ...}' \
156
+ --notes "Brief explanation of what happened and why"
157
+ # or: mark hyp-NNN promising (if it improved significantly)
158
+ # or: mark hyp-NNN dead-end (if it clearly failed)
159
+ ```
160
+
161
+ Then synthesize a decision packet and auto-queue follow-ups:
162
+ ```bash
163
+ python scripts/synthesize_decision.py --experiment exp-NNN --auto-queue
164
+ ```
165
+ This produces a verdict (promote/branch_followup/abandon/fix_and_retry) and automatically queues follow-up hypotheses for `branch_followup` and `fix_and_retry` outcomes.
166
+
167
+ 9. **CONVERGE** — Check stopping conditions:
168
+ - N consecutive non-improvements (`config.yaml` → `convergence.patience`) = STOP
169
+ - `max_iterations` reached = STOP
170
+ - Report final best model and recommend next steps
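+ 
+ The first two conditions are what `scripts/check_convergence.py` evaluates; per its docstring it exits 2 when converged and 0 otherwise, so a loop driver can branch on the return code:
+ 
+ ```python
+ import subprocess
+ import sys
+ 
+ # Exit code 2 = converged (stop), 0 = keep going (see scripts/check_convergence.py).
+ result = subprocess.run([sys.executable, "scripts/check_convergence.py"])
+ if result.returncode == 2:
+     print("Converged: report the best model and stop the loop.")
+ ```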
171
+
172
+ 10. **REPEAT** — return to step 1.
173
+
174
+ ## Execution Rules
175
+
176
+ - **ALWAYS redirect output:** `python train.py > run.log 2>&1`
177
+ - **ALWAYS parse with grep:** `grep -A 10 "^---" run.log | head -10`
178
+ - **ALWAYS activate venv:** `source .venv/bin/activate`
179
+ - **NEVER install packages** without human approval
180
+
181
+ ## Strategy Escalation Protocol
182
+
183
+ When consecutive experiments fail to improve, escalate your approach rather than repeating similar attempts:
184
+
185
+ | Consecutive Failures | Strategy | Description |
186
+ |---------------------|----------|-------------|
187
+ | 0-1 | **EXPLOIT** | Push further in the current direction — small tweaks, parameter refinement |
188
+ | 2-3 | **RE-READ** | Stop. Re-read ALL code from scratch. Your mental model is likely stale. |
189
+ | 4-5 | **COMBINE** | Combine two previously successful ideas that haven't been tried together |
190
+ | 6+ | **RADICAL** | Abandon the current approach entirely. Try a fundamentally different model, architecture, or feature strategy. |
191
+
192
+ Track your consecutive failure count. When you hit a new tier, announce it: "Escalating to COMBINE strategy after 4 consecutive failures."
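+ 
+ A tiny helper that maps the failure count to a tier (thresholds taken directly from the table above):
+ 
+ ```python
+ def escalation_tier(consecutive_failures: int) -> str:
+     """Map consecutive non-improvements to the strategy table above."""
+     if consecutive_failures <= 1:
+         return "EXPLOIT"
+     if consecutive_failures <= 3:
+         return "RE-READ"
+     if consecutive_failures <= 5:
+         return "COMBINE"
+     return "RADICAL"
+ ```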
193
+
194
+ ## Experiment Ideas
195
+
196
+ Starting suggestions (ordered by expected impact):
197
+
198
+ 1. **Hyperparameter sweep:** max_depth, n_estimators, learning_rate
199
+ 2. **LightGBM:** often faster than XGBoost with comparable accuracy
200
+ 3. **Feature engineering:** domain-specific features via the featurizer pipeline
201
+ 4. **sklearn alternatives:** RandomForest, GradientBoosting
202
+ 5. **Learning rate schedule:** lower lr with more estimators (0.01 / 1000 trees)
203
+ 6. **Neural network:** if samples > 2000, try a small MLP
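+ 
+ For example, ideas 1 and 5 translate into hyperparameter overrides along these lines (placeholder values, not tuned recommendations):
+ 
+ ```python
+ # Illustrative override sets; apply them through config.yaml, not by editing train.py.
+ sweep_candidates = [
+     {"max_depth": 4, "n_estimators": 300, "learning_rate": 0.1},
+     {"max_depth": 8, "n_estimators": 300, "learning_rate": 0.1},
+ ]
+ slow_and_deep = {"learning_rate": 0.01, "n_estimators": 1000}  # idea 5: lower lr, more trees
+ ```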
204
+
205
+ ## Output Format
206
+
207
+ - **Model artifact:** `models/best/model.joblib`
208
+ - **Metadata:** `models/best/metadata.json`
209
+ - **Experiment log:** `experiments/log.jsonl` (append-only JSONL)
210
+ - **TSV summary:** `experiments/results.tsv`
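+ 
+ A minimal sketch of scanning the append-only log, using the field names that `scripts/check_convergence.py` and `scripts/compare_runs.py` read:
+ 
+ ```python
+ import json
+ 
+ # List every logged experiment with its status and metrics.
+ with open("experiments/log.jsonl") as f:
+     for line in f:
+         if line.strip():
+             entry = json.loads(line)
+             print(entry.get("experiment_id"), entry.get("status"), entry.get("metrics", {}))
+ ```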
211
+
212
+ ## Comparing Runs
213
+
214
+ ```bash
215
+ python scripts/compare_runs.py exp-001 exp-002
216
+ ```
@@ -0,0 +1,8 @@
1
+ [project]
2
+ name = "{{PROJECT_NAME}}-ml"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.12"
5
+
6
+ [tool.pytest.ini_options]
7
+ testpaths = ["tests"]
8
+ pythonpath = ["."]
@@ -0,0 +1,8 @@
1
+ scikit-learn>=1.6
2
+ xgboost>=3.2
3
+ lightgbm>=4.6
4
+ pandas>=2.2
5
+ numpy>=2.0
6
+ joblib>=1.4
7
+ pyyaml>=6.0
8
+ pytest>=8.0
File without changes
@@ -0,0 +1,230 @@
1
+ #!/usr/bin/env python3
2
+ """Convergence detection for the autoresearch pipeline.
3
+
4
+ Reads experiments/log.jsonl and checks if the last N experiments
5
+ (where N = convergence.patience) show insufficient improvement
6
+ over the best prior result.
7
+
8
+ This is a discrete analogue of early stopping from gradient descent,
9
+ adapted for the experiment loop context. The algorithm:
10
+
11
+ 1. Load all "kept" experiments from the JSONL log
12
+ 2. For each of the last N experiments, compute relative improvement
13
+ over the prior best
14
+ 3. If all N show < threshold improvement, declare convergence
15
+
16
+ Usage:
17
+ python scripts/check_convergence.py [--config config.yaml] [--log experiments/log.jsonl]
18
+
19
+ Exit codes:
20
+ 0 = not converged, agent should continue
21
+ 2 = converged, agent should stop (signals /loop to halt)
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from pathlib import Path
30
+
31
+ import yaml
32
+
33
+
34
+ def load_convergence_config(config_path: str) -> dict:
35
+ """Load convergence parameters from config.yaml.
36
+
37
+ Returns dict with keys: patience, improvement_threshold,
38
+ primary_metric, lower_is_better.
39
+
40
+ Falls back to conservative defaults if config cannot be loaded.
41
+ """
42
+ defaults = {
43
+ "patience": 3,
44
+ "improvement_threshold": 0.005,
45
+ "primary_metric": "accuracy",
46
+ "lower_is_better": False,
47
+ }
48
+
49
+ path = Path(config_path)
50
+ if not path.exists():
51
+ print(f"convergence: Config not found at {config_path}, using defaults.", file=sys.stderr)
52
+ return defaults
53
+
54
+ try:
55
+ with open(path) as f:
56
+ config = yaml.safe_load(f)
57
+
58
+ convergence_cfg = config.get("convergence", {})
59
+ eval_cfg = config.get("evaluation", {})
60
+
61
+ return {
62
+ "patience": convergence_cfg.get("patience", defaults["patience"]),
63
+ "improvement_threshold": convergence_cfg.get(
64
+ "improvement_threshold", defaults["improvement_threshold"]
65
+ ),
66
+ "primary_metric": eval_cfg.get("primary_metric", defaults["primary_metric"]),
67
+ "lower_is_better": eval_cfg.get("lower_is_better", defaults["lower_is_better"]),
68
+ }
69
+ except (yaml.YAMLError, AttributeError) as e:
70
+ print(f"convergence: Error reading config: {e}. Using defaults.", file=sys.stderr)
71
+ return defaults
72
+
73
+
74
+ def load_kept_experiments(log_path: str, primary_metric: str) -> list[dict]:
75
+ """Load all 'kept' experiments with valid primary metric values.
76
+
77
+ Args:
78
+ log_path: Path to experiments/log.jsonl.
79
+ primary_metric: Metric name to extract.
80
+
81
+ Returns:
82
+ List of dicts with 'id' and 'value' keys, in chronological order.
83
+ """
84
+ path = Path(log_path)
85
+ if not path.exists():
86
+ return []
87
+
88
+ experiments = []
89
+ with open(path) as f:
90
+ for line in f:
91
+ line = line.strip()
92
+ if not line:
93
+ continue
94
+ try:
95
+ entry = json.loads(line)
96
+ value = entry.get("metrics", {}).get(primary_metric)
97
+ if value is not None and entry.get("status") == "kept":
98
+ experiments.append({
99
+ "id": entry.get("experiment_id", "?"),
100
+ "value": float(value),
101
+ })
102
+ except (json.JSONDecodeError, ValueError, TypeError):
103
+ continue
104
+
105
+ return experiments
106
+
107
+
108
+ def compute_relative_improvement(
109
+ current: float,
110
+ prior_best: float,
111
+ lower_is_better: bool,
112
+ ) -> float:
113
+ """Compute relative improvement of current value over prior best.
114
+
115
+ Returns a float where positive = improvement, negative = regression.
116
+ Returns 1.0 if prior_best is zero (any non-zero value is infinite improvement).
117
+ """
118
+ if prior_best == 0:
119
+ return 1.0 if current != 0 else 0.0
120
+
121
+ if lower_is_better:
122
+ return (prior_best - current) / abs(prior_best)
123
+ else:
124
+ return (current - prior_best) / abs(prior_best)
125
+
126
+
127
+ def check_convergence(
128
+ experiments: list[dict],
129
+ patience: int,
130
+ improvement_threshold: float,
131
+ lower_is_better: bool,
132
+ ) -> tuple[bool, int, str]:
133
+ """Check if the experiment loop has converged.
134
+
135
+ Args:
136
+ experiments: List of dicts with 'id' and 'value' keys.
137
+ patience: Number of consecutive non-improvements required.
138
+ improvement_threshold: Minimum relative improvement to count.
139
+ lower_is_better: True for metrics like MAE/MSE.
140
+
141
+ Returns:
142
+ Tuple of (converged: bool, non_improvements: int, message: str).
143
+ """
144
+ total = len(experiments)
145
+
146
+ if total < patience:
147
+ return (
148
+ False,
149
+ 0,
150
+ f"Only {total} experiments, need {patience} to check convergence.",
151
+ )
152
+
153
+ # Find best value across all experiments
154
+ values = [e["value"] for e in experiments]
155
+ best_value = min(values) if lower_is_better else max(values)
156
+
157
+ # Check last N experiments for improvement over their respective prior bests
158
+ non_improvements = 0
159
+ for i in range(total - patience, total):
160
+ prior_values = [e["value"] for e in experiments[:i]]
161
+ if not prior_values:
162
+ continue
163
+
164
+ prior_best = min(prior_values) if lower_is_better else max(prior_values)
165
+ current_value = experiments[i]["value"]
166
+ improvement = compute_relative_improvement(current_value, prior_best, lower_is_better)
167
+
168
+ if improvement < improvement_threshold:
169
+ non_improvements += 1
170
+
171
+ last_n = experiments[-patience:]
172
+ last_values = [round(e["value"], 4) for e in last_n]
173
174
+
175
+ if non_improvements >= patience:
176
+ msg = (
177
+ f"CONVERGED: {patience} consecutive non-improvements "
178
+ f"(threshold: {improvement_threshold * 100:.1f}% relative gain). "
179
+ f"Best={best_value:.4f}, last {patience} values={last_values}"
180
+ )
181
+ return True, non_improvements, msg
182
+ else:
183
+ msg = (
184
+ f"Not converged ({non_improvements}/{patience} non-improvements). "
185
+ f"Best={best_value:.4f}, last {patience} values={last_values}"
186
+ )
187
+ return False, non_improvements, msg
188
+
189
+
190
+ def main() -> None:
191
+ """CLI entry point."""
192
+ parser = argparse.ArgumentParser(
193
+ description="Check experiment convergence for the autoresearch pipeline"
194
+ )
195
+ parser.add_argument(
196
+ "--config",
197
+ default="config.yaml",
198
+ help="Path to config.yaml (default: config.yaml)",
199
+ )
200
+ parser.add_argument(
201
+ "--log",
202
+ default="experiments/log.jsonl",
203
+ help="Path to experiment log (default: experiments/log.jsonl)",
204
+ )
205
+ args = parser.parse_args()
206
+
207
+ # Load config
208
+ cfg = load_convergence_config(args.config)
209
+
210
+ # Load experiments
211
+ experiments = load_kept_experiments(args.log, cfg["primary_metric"])
212
+
213
+ # Check convergence
214
+ converged, non_improvements, message = check_convergence(
215
+ experiments=experiments,
216
+ patience=cfg["patience"],
217
+ improvement_threshold=cfg["improvement_threshold"],
218
+ lower_is_better=cfg["lower_is_better"],
219
+ )
220
+
221
+ print(f"convergence: {message}", file=sys.stderr)
222
+
223
+ if converged:
224
+ sys.exit(2)
225
+ else:
226
+ sys.exit(0)
227
+
228
+
229
+ if __name__ == "__main__":
230
+ main()
@@ -0,0 +1,124 @@
1
+ """Side-by-side comparison of two experiments.
2
+
3
+ Shows configuration deltas and metric differences between two experiments,
4
+ enabling the agent (or human) to understand which changes caused which
5
+ metric movements.
6
+
7
+ Usage: python scripts/compare_runs.py exp-001 exp-002 [--log path/to/log.jsonl]
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
18
+
19
+ try:
+     from scripts.turing_io import load_config
+ except ImportError:  # fallback so `python scripts/compare_runs.py` also works when run directly
+     from turing_io import load_config
20
+
21
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
22
+
23
+
24
+ def load_experiment(log_path: str, experiment_id: str) -> dict | None:
25
+ """Load a single experiment by ID."""
26
+ path = Path(log_path)
27
+ if not path.exists():
28
+ return None
29
+
30
+ with open(path) as f:
31
+ for line in f:
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+ try:
36
+ entry = json.loads(line)
37
+ if entry.get("experiment_id") == experiment_id:
38
+ return entry
39
+ except json.JSONDecodeError:
40
+ continue
41
+ return None
42
+
43
+
44
+ def format_comparison(exp_a: dict, exp_b: dict, config: dict) -> str:
45
+ """Format side-by-side comparison of two experiments."""
46
+ id_a = exp_a.get("experiment_id", "?")
47
+ id_b = exp_b.get("experiment_id", "?")
48
+
49
+ eval_cfg = config.get("evaluation", {})
50
+ lower_is_better_metrics = set()
51
+ if eval_cfg.get("lower_is_better", False):
52
+ lower_is_better_metrics = set(eval_cfg.get("metrics", []))
53
+
54
+ lines = [
55
+ f"{'':20s} {id_a:<20s} {id_b:<20s}",
56
+ "=" * 60,
57
+ "",
58
+ "## Config",
59
+ ]
60
+
61
+ config_a = exp_a.get("config", {})
62
+ config_b = exp_b.get("config", {})
63
+ all_config_keys = sorted(set(list(config_a.keys()) + list(config_b.keys())))
64
+ for key in all_config_keys:
65
+ val_a = config_a.get(key, "N/A")
66
+ val_b = config_b.get(key, "N/A")
67
+ marker = " <--" if val_a != val_b else ""
68
+ lines.append(f" {key:<18s} {str(val_a):<20s} {str(val_b):<20s}{marker}")
69
+
70
+ lines.append("")
71
+ lines.append("## Metrics")
72
+
73
+ metrics_a = exp_a.get("metrics", {})
74
+ metrics_b = exp_b.get("metrics", {})
75
+ all_metric_keys = sorted(set(list(metrics_a.keys()) + list(metrics_b.keys())))
76
+ for key in all_metric_keys:
77
+ val_a = metrics_a.get(key, "N/A")
78
+ val_b = metrics_b.get(key, "N/A")
79
+ diff_marker = ""
80
+ if isinstance(val_a, (int, float)) and isinstance(val_b, (int, float)):
81
+ if key in lower_is_better_metrics:
82
+ diff_marker = " (better)" if val_b < val_a else " (worse)" if val_b > val_a else ""
83
+ else:
84
+ diff_marker = " (better)" if val_b > val_a else " (worse)" if val_b < val_a else ""
85
+ a_str = f"{val_a:.4f}" if isinstance(val_a, float) else str(val_a)
86
+ b_str = f"{val_b:.4f}" if isinstance(val_b, float) else str(val_b)
87
+ lines.append(f" {key:<18s} {a_str:<20s} {b_str:<20s}{diff_marker}")
88
+
89
+ lines.append("")
90
+ lines.append("## Status")
91
+ lines.append(f" {'status':<18s} {exp_a.get('status', '?'):<20s} {exp_b.get('status', '?'):<20s}")
92
+ lines.append(f" {'timestamp':<18s} {exp_a.get('timestamp', '?')[:19]:<20s} {exp_b.get('timestamp', '?')[:19]:<20s}")
93
+
94
+ return "\n".join(lines)
95
+
96
+
97
+ def main() -> None:
98
+ """CLI entry point."""
99
+ parser = argparse.ArgumentParser(description="Compare two experiment runs")
100
+ parser.add_argument("exp_a", help="First experiment ID (e.g., exp-001)")
101
+ parser.add_argument("exp_b", help="Second experiment ID (e.g., exp-002)")
102
+ parser.add_argument(
103
+ "--log",
104
+ default=DEFAULT_LOG_PATH,
105
+ help=f"Path to experiment log (default: {DEFAULT_LOG_PATH})",
106
+ )
107
+ args = parser.parse_args()
108
+
109
+ a = load_experiment(args.log, args.exp_a)
110
+ b = load_experiment(args.log, args.exp_b)
111
+
112
+ if a is None:
113
+ print(f"Experiment {args.exp_a} not found in {args.log}")
114
+ sys.exit(1)
115
+ if b is None:
116
+ print(f"Experiment {args.exp_b} not found in {args.log}")
117
+ sys.exit(1)
118
+
119
+ config = load_config()
120
+ print(format_comparison(a, b, config))
121
+
122
+
123
+ if __name__ == "__main__":
124
+ main()