claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
@@ -0,0 +1,300 @@
1
+ #!/usr/bin/env python3
2
+ """Decision packet synthesis for the autoresearch pipeline.
3
+
4
+ After each experiment, synthesizes a structured verdict combining:
5
+ - Run outcome and metrics
6
+ - Comparison to current champion
7
+ - A recommended next action
8
+
9
+ Actions: promote, branch_followup, replicate, abandon, fix_and_retry,
10
+ investigate_crash.
11
+
12
+ Inspired by pauldebdeep9/autoresearch MemoryLab decision packets.
13
+
14
+ Usage:
15
+ python scripts/synthesize_decision.py \\
16
+ --experiment exp-005 \\
17
+ --log experiments/log.jsonl \\
18
+ --config config.yaml
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import json
25
+ import sys
26
+ from pathlib import Path
27
+
28
+ import yaml
29
+
30
+ PROMISING_DELTA = 0.005 # 0.5% relative improvement = promising
31
+
32
+
33
+ def load_experiment(log_path: str, experiment_id: str) -> dict | None:
34
+ """Load a specific experiment from the JSONL log."""
35
+ path = Path(log_path)
36
+ if not path.exists():
37
+ return None
38
+ with open(path) as f:
39
+ for line in f:
40
+ line = line.strip()
41
+ if not line:
42
+ continue
43
+ try:
44
+ entry = json.loads(line)
45
+ if entry.get("experiment_id") == experiment_id:
46
+ return entry
47
+ except json.JSONDecodeError:
48
+ continue
49
+ return None
50
+
51
+
52
+ def find_champion(log_path: str, metric: str, lower_is_better: bool) -> dict | None:
53
+ """Find the current best (champion) experiment."""
54
+ path = Path(log_path)
55
+ if not path.exists():
56
+ return None
57
+ best = None
58
+ best_val = float("inf") if lower_is_better else float("-inf")
59
+ with open(path) as f:
60
+ for line in f:
61
+ line = line.strip()
62
+ if not line:
63
+ continue
64
+ try:
65
+ entry = json.loads(line)
66
+ if entry.get("status") != "kept":
67
+ continue
68
+ val = entry.get("metrics", {}).get(metric)
69
+ if val is None:
70
+ continue
71
+ if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
72
+ best_val = val
73
+ best = entry
74
+ except json.JSONDecodeError:
75
+ continue
76
+ return best
77
+
78
+
79
def compute_delta(current: float, champion: float, lower_is_better: bool) -> float:
    """Compute the relative improvement of *current* over *champion*.

    Positive values mean *current* is better; negative means worse.
    The delta is normalized by abs(champion).

    When champion == 0 the relative delta is undefined, so a sentinel is
    returned instead: 0.0 for no change, +1.0 for an improvement, -1.0
    for a regression. (Bug fix: the previous code returned +1.0 for any
    nonzero *current*, so a loss going from 0 to 5 with
    lower_is_better=True was wrongly reported as a full improvement.)
    """
    if champion == 0:
        if current == 0:
            return 0.0
        improved = current < champion if lower_is_better else current > champion
        return 1.0 if improved else -1.0
    if lower_is_better:
        return (champion - current) / abs(champion)
    return (current - champion) / abs(champion)
87
+
88
+
89
def classify_outcome(
    experiment: dict,
    champion: dict | None,
    metric: str,
    lower_is_better: bool,
) -> tuple[str, float | None]:
    """Classify the experiment outcome.

    Returns (outcome, delta_to_champion).
    Outcomes: new_champion, marginal_improvement, lateral, regression, crash.

    Note: when there is no champion baseline, any run whose status is not
    "kept" is classified as "regression" regardless of its metric value.
    """
    status = experiment.get("status", "")
    if status == "crash":
        return "crash", None

    # A run with no value for the primary metric is treated like a crash:
    # there is nothing to compare against.
    current_val = experiment.get("metrics", {}).get(metric)
    if current_val is None:
        return "crash", None

    # No champion yet: any kept run becomes the first champion.
    if champion is None:
        if status == "kept":
            return "new_champion", None
        return "regression", None

    # Champion exists but lacks this metric: fall back to status alone,
    # with no delta to report.
    champion_val = champion.get("metrics", {}).get(metric)
    if champion_val is None:
        return "new_champion" if status == "kept" else "regression", None

    delta = compute_delta(current_val, champion_val, lower_is_better)

    # Classification cascade — order matters: only "kept" runs can be
    # promoted; non-kept runs fall through to lateral/regression based on
    # whether the delta is within the PROMISING_DELTA noise band.
    if status == "kept" and delta > PROMISING_DELTA:
        return "new_champion", delta
    elif status == "kept" and delta > 0:
        return "marginal_improvement", delta
    elif abs(delta) <= PROMISING_DELTA:
        return "lateral", delta
    else:
        return "regression", delta
127
+
128
+
129
def recommend_action(
    outcome: str,
    experiment: dict,
) -> tuple[str, str]:
    """Recommend a next action based on the outcome.

    Returns (action, rationale).

    Cleanup: the crash case previously lived inside the lookup dict and
    tested experiment["description"] twice with subtly different
    expressions to keep the action and rationale in sync; it is now a
    single explicit branch, and the remaining tuples are no longer all
    built eagerly on every call.
    """
    if outcome == "crash":
        # A description gives us something concrete to retry with fixes;
        # without one we must investigate the crash first.
        if experiment.get("description"):
            return "fix_and_retry", "Experiment crashed — check error logs and retry with fixes"
        return "investigate_crash", "Experiment crashed — investigate the cause before retrying"

    actions = {
        "new_champion": (
            "promote",
            "New best result — update champion, consider replicating to confirm stability",
        ),
        "marginal_improvement": (
            "branch_followup",
            "Slight improvement — explore variations of this approach",
        ),
        "lateral": (
            "abandon",
            "No meaningful change — this direction is not productive",
        ),
        "regression": (
            "abandon",
            "Performance decreased — discard and try a different approach",
        ),
    }
    return actions.get(outcome, ("investigate_crash", "Unknown outcome"))
160
+
161
+
162
def synthesize_packet(
    experiment: dict,
    champion: dict | None,
    metric: str,
    lower_is_better: bool,
) -> dict:
    """Build the full decision packet for one experiment.

    Bundles the classified outcome, the recommended action with its
    rationale, and the metric comparison against the current champion.
    Keys: experiment_id, outcome, action, rationale, delta,
    current_metric, champion_metric, champion_id, status, description,
    family, hypothesis_id.
    """
    outcome, delta = classify_outcome(experiment, champion, metric, lower_is_better)
    action, rationale = recommend_action(outcome, experiment)

    metrics = experiment.get("metrics", {})
    champ_metrics = champion.get("metrics", {}) if champion else {}

    return {
        "experiment_id": experiment.get("experiment_id", "?"),
        "outcome": outcome,
        "action": action,
        "rationale": rationale,
        "delta": None if delta is None else round(delta, 6),
        "current_metric": metrics.get(metric),
        "champion_metric": champ_metrics.get(metric),
        "champion_id": champion.get("experiment_id", "?") if champion else None,
        "status": experiment.get("status", "?"),
        "description": experiment.get("description", ""),
        "family": experiment.get("family"),
        "hypothesis_id": experiment.get("hypothesis_id"),
    }
195
+
196
+
197
def auto_queue_followup(
    packet: dict,
    hypotheses_path: str = "hypotheses.yaml",
) -> str | None:
    """Queue a follow-up hypothesis when the decision calls for one.

    Only two actions trigger queuing:
      - branch_followup: medium-priority agent hypothesis
      - fix_and_retry:   high-priority agent hypothesis

    Returns the new hypothesis ID if queued, or None otherwise.
    """
    # Imported lazily to avoid a circular dependency at module load time.
    from scripts.manage_hypotheses import add_hypothesis

    action = packet.get("action", "")
    exp_id = packet.get("experiment_id", "?")
    snippet = packet.get("description", "")[:60]

    if action == "branch_followup":
        text = f"Follow up on {exp_id}: explore variations of '{snippet}'"
        priority = "medium"
    elif action == "fix_and_retry":
        text = f"Retry {exp_id} with fixes: '{snippet}' crashed — investigate and fix"
        priority = "high"
    else:
        return None

    return add_hypothesis(
        queue_path=hypotheses_path,
        description=text,
        source="agent",
        priority=priority,
        parent_experiment=exp_id,
    )
237
+
238
+
239
def format_packet(packet: dict, metric_name: str) -> str:
    """Render a decision packet as a small indented text report."""
    header = f"Decision Packet: {packet['experiment_id']}"
    body = [
        f" Outcome: {packet['outcome']}",
        f" Action: {packet['action']}",
        f" Rationale: {packet['rationale']}",
        f" {metric_name}: {packet['current_metric']} (champion: {packet['champion_metric']})",
    ]
    # Optional lines only appear when there is something to show.
    if packet["delta"] is not None:
        body.append(f" Delta: {packet['delta']:+.4f} relative")
    if packet["family"]:
        body.append(f" Family: {packet['family']}")
    return "\n".join([header] + body)
253
+
254
+
255
def main() -> None:
    """CLI entry point.

    Loads config (for metric name and direction), looks up the target
    experiment and the current champion in the JSONL log, prints the
    synthesized decision packet (text or JSON), and optionally queues a
    follow-up hypothesis.
    """
    parser = argparse.ArgumentParser(description="Synthesize decision packet")
    parser.add_argument("--experiment", required=True, help="Experiment ID")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--hypotheses", default="hypotheses.yaml", help="Hypothesis queue path")
    parser.add_argument("--auto-queue", action="store_true", help="Auto-queue follow-up hypotheses")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    # Missing config is tolerated: defaults below then apply.
    config = {}
    if Path(args.config).exists():
        with open(args.config) as f:
            config = yaml.safe_load(f) or {}

    eval_cfg = config.get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiment = load_experiment(args.log, args.experiment)
    if not experiment:
        print(f"Experiment {args.experiment} not found.", file=sys.stderr)
        sys.exit(1)

    champion = find_champion(args.log, metric, lower_is_better)
    # Don't compare champion to itself
    if champion and champion.get("experiment_id") == experiment.get("experiment_id"):
        champion = None

    packet = synthesize_packet(experiment, champion, metric, lower_is_better)

    if args.json:
        print(json.dumps(packet, indent=2))
    else:
        print(format_packet(packet, metric))

    # Auto-queue follow-up hypotheses if requested
    if args.auto_queue:
        hyp_id = auto_queue_followup(packet, args.hypotheses)
        if hyp_id:
            print(f"\n Auto-queued: {hyp_id} ({packet['action']})")
297
+
298
+
299
# Run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
@@ -0,0 +1,76 @@
1
+ """Shared data loading functions for the autoresearch pipeline.
2
+
3
+ Consolidates the duplicated load_experiments, load_config, and
4
+ load_hypotheses functions that were copy-pasted across 8+ scripts.
5
+ Every script that reads experiment logs, config, or hypothesis
6
+ queues should import from here.
7
+
8
+ Usage:
9
+ from scripts.turing_io import load_experiments, load_config, load_hypotheses
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from pathlib import Path
16
+
17
+ import yaml
18
+
19
+
20
+ def load_experiments(log_path: str) -> list[dict]:
21
+ """Load experiments from JSONL log.
22
+
23
+ Args:
24
+ log_path: Path to the experiments JSONL file.
25
+
26
+ Returns:
27
+ List of experiment dicts. Empty list if file missing or empty.
28
+ Malformed lines are silently skipped.
29
+ """
30
+ path = Path(log_path)
31
+ if not path.exists():
32
+ return []
33
+ experiments = []
34
+ with open(path) as f:
35
+ for line in f:
36
+ line = line.strip()
37
+ if line:
38
+ try:
39
+ experiments.append(json.loads(line))
40
+ except json.JSONDecodeError:
41
+ continue
42
+ return experiments
43
+
44
+
45
def load_config(path: str = "config.yaml") -> dict:
    """Read a YAML config file, tolerating a missing or empty file.

    Args:
        path: Config file location. Defaults to "config.yaml".

    Returns:
        The parsed mapping, or {} when the file is absent or the
        document parses to a falsy value (e.g. an empty file).
    """
    config_file = Path(path)
    if not config_file.exists():
        return {}
    with open(config_file) as handle:
        parsed = yaml.safe_load(handle)
    return parsed or {}
59
+
60
+
61
def load_hypotheses(queue_path: str) -> list[dict]:
    """Read the hypothesis queue YAML.

    Args:
        queue_path: Path to the hypotheses YAML file.

    Returns:
        The queue as a list of dicts; [] when the file is missing,
        empty, or its top-level value is not a list.
    """
    queue_file = Path(queue_path)
    # A zero-byte file is treated the same as a missing one.
    if not queue_file.exists() or queue_file.stat().st_size == 0:
        return []
    with open(queue_file) as handle:
        loaded = yaml.safe_load(handle)
    if isinstance(loaded, list):
        return loaded
    return []
@@ -0,0 +1,296 @@
1
+ #!/usr/bin/env python3
2
+ """Structured experiment state manager for the autoresearch pipeline.
3
+
4
+ Replaces free-text MEMORY.md with a validated experiment_state.yaml
5
+ that the agent reads and writes in a schema-enforced format. The state
6
+ is machine-readable and programmatically queryable.
7
+
8
+ MEMORY.md is kept as a human-readable companion, generated from the
9
+ structured state.
10
+
11
+ Usage:
12
+ python scripts/update_state.py init # Create empty state
13
+ python scripts/update_state.py show # Display current state
14
+ python scripts/update_state.py set-best <exp-id> <metrics_json>
15
+ python scripts/update_state.py add-observation <text> [--exp <exp-id>]
16
+ python scripts/update_state.py add-failure <text> --exp <exp-id> --reason <text>
17
+ python scripts/update_state.py add-direction <text> [--priority high|medium|low]
18
+ python scripts/update_state.py log-session <n_experiments> <best_metric>
19
+ python scripts/update_state.py generate-memory # Write MEMORY.md from state
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import copy
26
+ import json
27
+ import sys
28
+ from datetime import datetime, timezone
29
+ from pathlib import Path
30
+
31
+ import yaml
32
+
33
+ DEFAULT_STATE_PATH = "experiment_state.yaml"
34
+ DEFAULT_MEMORY_PATH = ".claude/agent-memory/ml-researcher/MEMORY.md"
35
+
36
+ EMPTY_STATE = {
37
+ "goal": "",
38
+ "primary_metric": "",
39
+ "metric_direction": "",
40
+ "best_result": None,
41
+ "observations": [],
42
+ "failed_approaches": [],
43
+ "promising_directions": [],
44
+ "session_history": [],
45
+ }
46
+
47
+ REQUIRED_KEYS = set(EMPTY_STATE.keys())
48
+
49
+
50
def load_state(path: str) -> dict:
    """Load experiment state from YAML, backfilling any missing keys.

    Returns a deep copy of EMPTY_STATE when the file is missing, empty,
    or does not parse to a mapping.
    """
    state_file = Path(path)
    if not state_file.exists() or state_file.stat().st_size == 0:
        return copy.deepcopy(EMPTY_STATE)
    with open(state_file) as handle:
        loaded = yaml.safe_load(handle)
    if not isinstance(loaded, dict):
        return copy.deepcopy(EMPTY_STATE)
    # Backfill any schema keys that an older/partial file lacks.
    for key in REQUIRED_KEYS:
        loaded.setdefault(key, copy.deepcopy(EMPTY_STATE[key]))
    return loaded
64
+
65
+
66
def save_state(path: str, state: dict) -> None:
    """Write the state dict to YAML, creating parent dirs as needed."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w") as handle:
        # Block style, insertion order preserved, unicode kept readable.
        yaml.dump(state, handle, default_flow_style=False, sort_keys=False, allow_unicode=True)
72
+
73
+
74
def validate_state(state: dict) -> list[str]:
    """Check state against the schema; return human-readable errors."""
    errors: list[str] = []

    errors.extend(
        f"Missing required key: {key}" for key in REQUIRED_KEYS if key not in state
    )

    best = state.get("best_result")
    if best is not None:
        if not isinstance(best, dict):
            errors.append("best_result must be a dict or null")
        elif not ("experiment_id" in best and "metrics" in best):
            errors.append("best_result must have experiment_id and metrics")

    # Each collection field must be a list (missing counts as valid).
    for list_key in ("observations", "failed_approaches", "promising_directions", "session_history"):
        if not isinstance(state.get(list_key, []), list):
            errors.append(f"{list_key} must be a list")

    return errors
95
+
96
+
97
def set_best(state: dict, experiment_id: str, metrics: dict) -> dict:
    """Record *experiment_id* (with its metrics) as the current best."""
    timestamp = datetime.now(timezone.utc).isoformat()
    state["best_result"] = {
        "experiment_id": experiment_id,
        "metrics": metrics,
        "updated_at": timestamp,
    }
    return state
105
+
106
+
107
+ def add_observation(state: dict, text: str, experiment_id: str | None = None) -> dict:
108
+ """Add an observation to state."""
109
+ entry = {
110
+ "text": text,
111
+ "timestamp": datetime.now(timezone.utc).isoformat(),
112
+ }
113
+ if experiment_id:
114
+ entry["experiment_id"] = experiment_id
115
+ state["observations"].append(entry)
116
+ return state
117
+
118
+
119
def add_failure(state: dict, description: str, experiment_id: str, reason: str) -> dict:
    """Append a failed approach with its experiment ID and failure reason."""
    entry = {
        "description": description,
        "experiment_id": experiment_id,
        "reason": reason,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    state["failed_approaches"].append(entry)
    return state
128
+
129
+
130
def add_direction(state: dict, description: str, priority: str = "medium") -> dict:
    """Append a promising direction with a priority tag (default medium)."""
    entry = {
        "description": description,
        "priority": priority,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    state["promising_directions"].append(entry)
    return state
138
+
139
+
140
+ def log_session(state: dict, n_experiments: int, best_metric: float | None) -> dict:
141
+ """Log a training session summary."""
142
+ state["session_history"].append({
143
+ "date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
144
+ "experiments_run": n_experiments,
145
+ "best_metric": best_metric,
146
+ })
147
+ return state
148
+
149
+
150
def generate_memory(state: dict, goal: str = "", metric: str = "", direction: str = "") -> str:
    """Render MEMORY.md content from the structured state.

    Explicit goal/metric/direction arguments win over the values stored
    in *state*; each falls back to the corresponding state field when
    empty.
    """
    goal = goal or state.get("goal", "")
    metric = metric or state.get("primary_metric", "")
    direction = direction or state.get("metric_direction", "")

    out: list[str] = ["# ML Researcher Memory"]

    def section(title: str) -> None:
        # Every section header is surrounded by blank lines.
        out.extend(["", f"## {title}", ""])

    section("Goal")
    out.append(goal or "(not set)")
    out.append("")
    out.append(f"Primary metric: {metric} ({direction} is better).")

    section("Best Result")
    best = state.get("best_result")
    if best:
        pairs = ", ".join(f"{k}={v}" for k, v in best.get("metrics", {}).items())
        out.append(f"Experiment {best.get('experiment_id', '?')}: {pairs}")
    else:
        out.append("No experiments completed yet.")

    section("Observations")
    for obs in state.get("observations", [])[-10:]:  # most recent 10 only
        tag = f" ({obs['experiment_id']})" if obs.get("experiment_id") else ""
        out.append(f"- {obs['text']}{tag}")
    if not state.get("observations"):
        out.append("(none yet)")

    section("Failed Approaches")
    for item in state.get("failed_approaches", []):
        out.append(f"- {item['description']} ({item['experiment_id']}): {item['reason']}")
    if not state.get("failed_approaches"):
        out.append("(none yet)")

    section("Promising Directions")
    for item in state.get("promising_directions", []):
        # Only non-default priorities are tagged.
        tag = f" [{item['priority']}]" if item.get("priority") != "medium" else ""
        out.append(f"- {item['description']}{tag}")
    if not state.get("promising_directions"):
        out.append("(none yet)")

    section("Session History")
    if state.get("session_history"):
        out.append("| Session | Experiments | Best Metric | Notes |")
        out.append("|---------|-------------|-------------|-------|")
        for row in state["session_history"]:
            best_cell = f"{row['best_metric']:.4f}" if row.get("best_metric") is not None else "N/A"
            out.append(f"| {row.get('date', '?')} | {row.get('experiments_run', 0)} | {best_cell} | |")
    else:
        out.append("(no sessions yet)")

    out.append("")
    return "\n".join(out)
208
+
209
+
210
def main() -> None:
    """CLI entry point: dispatch subcommands that read/update the state file.

    Bug fix: the generate-memory subcommand previously only printed the
    rendered memory to stdout and never wrote the companion file, even
    though its help text ("Write MEMORY.md from state") and the module
    docstring promise it — DEFAULT_MEMORY_PATH was defined but unused.
    It now writes the file (path overridable via --memory-path) while
    still printing the content, so existing pipelines keep working.
    """
    parser = argparse.ArgumentParser(description="Manage experiment state")
    parser.add_argument("--state", default=DEFAULT_STATE_PATH)
    subparsers = parser.add_subparsers(dest="command")

    subparsers.add_parser("init", help="Create empty state")
    subparsers.add_parser("show", help="Display current state")
    subparsers.add_parser("validate", help="Validate state schema")

    sb = subparsers.add_parser("set-best")
    sb.add_argument("experiment_id")
    sb.add_argument("metrics_json")

    obs = subparsers.add_parser("add-observation")
    obs.add_argument("text")
    obs.add_argument("--exp", default=None)

    fail = subparsers.add_parser("add-failure")
    fail.add_argument("text")
    fail.add_argument("--exp", required=True)
    fail.add_argument("--reason", required=True)

    d = subparsers.add_parser("add-direction")
    d.add_argument("text")
    d.add_argument("--priority", default="medium", choices=["high", "medium", "low"])

    sess = subparsers.add_parser("log-session")
    sess.add_argument("n_experiments", type=int)
    sess.add_argument("best_metric", type=float, nargs="?", default=None)

    gm = subparsers.add_parser("generate-memory", help="Write MEMORY.md from state")
    gm.add_argument("--memory-path", default=DEFAULT_MEMORY_PATH, help="Output path for MEMORY.md")

    args = parser.parse_args()
    state = load_state(args.state)

    if args.command == "init":
        # Save a fresh copy so the shared EMPTY_STATE constant can never
        # be mutated through the saved reference.
        save_state(args.state, copy.deepcopy(EMPTY_STATE))
        print(f"Initialized empty state at {args.state}")

    elif args.command == "show":
        print(yaml.dump(state, default_flow_style=False, sort_keys=False))

    elif args.command == "validate":
        errors = validate_state(state)
        if errors:
            for e in errors:
                print(f" ERROR: {e}", file=sys.stderr)
            sys.exit(1)
        print("State is valid.")

    elif args.command == "set-best":
        metrics = json.loads(args.metrics_json)
        set_best(state, args.experiment_id, metrics)
        save_state(args.state, state)
        print(f"Best result set to {args.experiment_id}")

    elif args.command == "add-observation":
        add_observation(state, args.text, args.exp)
        save_state(args.state, state)
        print("Observation added.")

    elif args.command == "add-failure":
        add_failure(state, args.text, args.exp, args.reason)
        save_state(args.state, state)
        print("Failed approach recorded.")

    elif args.command == "add-direction":
        add_direction(state, args.text, args.priority)
        save_state(args.state, state)
        print("Promising direction added.")

    elif args.command == "log-session":
        log_session(state, args.n_experiments, args.best_metric)
        save_state(args.state, state)
        print("Session logged.")

    elif args.command == "generate-memory":
        memory_content = generate_memory(state)
        # Actually write the companion MEMORY.md (previously only printed).
        memory_file = Path(args.memory_path)
        memory_file.parent.mkdir(parents=True, exist_ok=True)
        memory_file.write_text(memory_content)
        print(memory_content)

    else:
        parser.print_help()
293
+
294
+
295
# Run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()