claude-turing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Decision packet synthesis for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
After each experiment, synthesizes a structured verdict combining:
|
|
5
|
+
- Run outcome and metrics
|
|
6
|
+
- Comparison to current champion
|
|
7
|
+
- A recommended next action
|
|
8
|
+
|
|
9
|
+
Actions: promote, branch_followup, replicate, abandon, fix_and_retry,
|
|
10
|
+
investigate_crash.
|
|
11
|
+
|
|
12
|
+
Inspired by pauldebdeep9/autoresearch MemoryLab decision packets.
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
python scripts/synthesize_decision.py \\
|
|
16
|
+
--experiment exp-005 \\
|
|
17
|
+
--log experiments/log.jsonl \\
|
|
18
|
+
--config config.yaml
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import json
|
|
25
|
+
import sys
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
import yaml
|
|
29
|
+
|
|
30
|
+
PROMISING_DELTA = 0.005 # 0.5% relative improvement = promising
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def load_experiment(log_path: str, experiment_id: str) -> dict | None:
|
|
34
|
+
"""Load a specific experiment from the JSONL log."""
|
|
35
|
+
path = Path(log_path)
|
|
36
|
+
if not path.exists():
|
|
37
|
+
return None
|
|
38
|
+
with open(path) as f:
|
|
39
|
+
for line in f:
|
|
40
|
+
line = line.strip()
|
|
41
|
+
if not line:
|
|
42
|
+
continue
|
|
43
|
+
try:
|
|
44
|
+
entry = json.loads(line)
|
|
45
|
+
if entry.get("experiment_id") == experiment_id:
|
|
46
|
+
return entry
|
|
47
|
+
except json.JSONDecodeError:
|
|
48
|
+
continue
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def find_champion(log_path: str, metric: str, lower_is_better: bool) -> dict | None:
|
|
53
|
+
"""Find the current best (champion) experiment."""
|
|
54
|
+
path = Path(log_path)
|
|
55
|
+
if not path.exists():
|
|
56
|
+
return None
|
|
57
|
+
best = None
|
|
58
|
+
best_val = float("inf") if lower_is_better else float("-inf")
|
|
59
|
+
with open(path) as f:
|
|
60
|
+
for line in f:
|
|
61
|
+
line = line.strip()
|
|
62
|
+
if not line:
|
|
63
|
+
continue
|
|
64
|
+
try:
|
|
65
|
+
entry = json.loads(line)
|
|
66
|
+
if entry.get("status") != "kept":
|
|
67
|
+
continue
|
|
68
|
+
val = entry.get("metrics", {}).get(metric)
|
|
69
|
+
if val is None:
|
|
70
|
+
continue
|
|
71
|
+
if (lower_is_better and val < best_val) or (not lower_is_better and val > best_val):
|
|
72
|
+
best_val = val
|
|
73
|
+
best = entry
|
|
74
|
+
except json.JSONDecodeError:
|
|
75
|
+
continue
|
|
76
|
+
return best
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def compute_delta(current: float, champion: float, lower_is_better: bool) -> float:
    """Compute the relative improvement of ``current`` over ``champion``.

    Positive values always mean improvement, regardless of metric
    direction.

    Args:
        current: Metric value of the new experiment.
        champion: Metric value of the reigning champion.
        lower_is_better: True when smaller metric values are better.

    Returns:
        Signed relative improvement. When the champion value is exactly
        0 a relative delta is undefined, so +/-1.0 is returned depending
        on whether ``current`` is actually an improvement (0.0 when both
        are 0).
    """
    if champion == 0:
        if current == 0:
            return 0.0
        # Bug fix: previously any nonzero `current` returned +1.0 here,
        # which reported a regression (e.g. loss going 0 -> 5) as an
        # improvement. Respect the metric direction instead.
        improved = (current < 0) if lower_is_better else (current > 0)
        return 1.0 if improved else -1.0
    if lower_is_better:
        return (champion - current) / abs(champion)
    return (current - champion) / abs(champion)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def classify_outcome(
    experiment: dict,
    champion: dict | None,
    metric: str,
    lower_is_better: bool,
) -> tuple[str, float | None]:
    """Classify an experiment's result relative to the current champion.

    Returns:
        (outcome, delta_to_champion) where outcome is one of
        new_champion, marginal_improvement, lateral, regression, crash.
        delta is None whenever no numeric comparison was possible.
    """
    status = experiment.get("status", "")
    if status == "crash":
        return "crash", None

    current_val = experiment.get("metrics", {}).get(metric)
    if current_val is None:
        # No metric recorded at all — treat like a crash for decisions.
        return "crash", None

    kept = status == "kept"

    # Without a comparable champion value there is nothing to diff
    # against: a kept run becomes champion by default.
    if champion is None:
        return ("new_champion" if kept else "regression"), None
    champion_val = champion.get("metrics", {}).get(metric)
    if champion_val is None:
        return ("new_champion" if kept else "regression"), None

    delta = compute_delta(current_val, champion_val, lower_is_better)

    if kept and delta > PROMISING_DELTA:
        return "new_champion", delta
    if kept and delta > 0:
        return "marginal_improvement", delta
    if abs(delta) <= PROMISING_DELTA:
        return "lateral", delta
    return "regression", delta
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def recommend_action(
    outcome: str,
    experiment: dict,
) -> tuple[str, str]:
    """Map an outcome classification to a recommended next action.

    Returns:
        (action, rationale). Unknown outcomes fall back to
        investigate_crash.
    """
    if outcome == "new_champion":
        return (
            "promote",
            "New best result — update champion, consider replicating to confirm stability",
        )
    if outcome == "marginal_improvement":
        return (
            "branch_followup",
            "Slight improvement — explore variations of this approach",
        )
    if outcome == "lateral":
        return (
            "abandon",
            "No meaningful change — this direction is not productive",
        )
    if outcome == "regression":
        return (
            "abandon",
            "Performance decreased — discard and try a different approach",
        )
    if outcome == "crash":
        # With a description on record we can retry with targeted fixes;
        # otherwise the cause has to be investigated first.
        if experiment.get("description", ""):
            return (
                "fix_and_retry",
                "Experiment crashed — check error logs and retry with fixes",
            )
        return (
            "investigate_crash",
            "Experiment crashed — investigate the cause before retrying",
        )
    return ("investigate_crash", "Unknown outcome")
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def synthesize_packet(
    experiment: dict,
    champion: dict | None,
    metric: str,
    lower_is_better: bool,
) -> dict:
    """Build the full decision packet for one experiment.

    Combines outcome classification and the recommended action into a
    single machine-readable dict with keys: experiment_id, outcome,
    action, rationale, delta, current_metric, champion_metric,
    champion_id, status, description, family, hypothesis_id.
    """
    outcome, delta = classify_outcome(experiment, champion, metric, lower_is_better)
    action, rationale = recommend_action(outcome, experiment)

    champion_metric = champion.get("metrics", {}).get(metric) if champion else None
    champion_id = champion.get("experiment_id", "?") if champion else None

    return {
        "experiment_id": experiment.get("experiment_id", "?"),
        "outcome": outcome,
        "action": action,
        "rationale": rationale,
        "delta": None if delta is None else round(delta, 6),
        "current_metric": experiment.get("metrics", {}).get(metric),
        "champion_metric": champion_metric,
        "champion_id": champion_id,
        "status": experiment.get("status", "?"),
        "description": experiment.get("description", ""),
        "family": experiment.get("family"),
        "hypothesis_id": experiment.get("hypothesis_id"),
    }
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def auto_queue_followup(
|
|
198
|
+
packet: dict,
|
|
199
|
+
hypotheses_path: str = "hypotheses.yaml",
|
|
200
|
+
) -> str | None:
|
|
201
|
+
"""Auto-queue a follow-up hypothesis based on the decision packet.
|
|
202
|
+
|
|
203
|
+
Actions that trigger auto-queuing:
|
|
204
|
+
- branch_followup: queue a follow-up hypothesis as agent/medium priority
|
|
205
|
+
- fix_and_retry: queue a retry hypothesis as agent/high priority
|
|
206
|
+
|
|
207
|
+
Returns the new hypothesis ID if queued, or None if no action taken.
|
|
208
|
+
"""
|
|
209
|
+
action = packet.get("action", "")
|
|
210
|
+
exp_id = packet.get("experiment_id", "?")
|
|
211
|
+
description = packet.get("description", "")
|
|
212
|
+
family = packet.get("family")
|
|
213
|
+
|
|
214
|
+
# Import here to avoid circular dependency at module level
|
|
215
|
+
from scripts.manage_hypotheses import add_hypothesis
|
|
216
|
+
|
|
217
|
+
if action == "branch_followup":
|
|
218
|
+
desc = f"Follow up on {exp_id}: explore variations of '{description[:60]}'"
|
|
219
|
+
return add_hypothesis(
|
|
220
|
+
queue_path=hypotheses_path,
|
|
221
|
+
description=desc,
|
|
222
|
+
source="agent",
|
|
223
|
+
priority="medium",
|
|
224
|
+
parent_experiment=exp_id,
|
|
225
|
+
)
|
|
226
|
+
elif action == "fix_and_retry":
|
|
227
|
+
desc = f"Retry {exp_id} with fixes: '{description[:60]}' crashed — investigate and fix"
|
|
228
|
+
return add_hypothesis(
|
|
229
|
+
queue_path=hypotheses_path,
|
|
230
|
+
description=desc,
|
|
231
|
+
source="agent",
|
|
232
|
+
priority="high",
|
|
233
|
+
parent_experiment=exp_id,
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
return None
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def format_packet(packet: dict, metric_name: str) -> str:
    """Render a decision packet as human-readable multi-line text."""
    out = [
        f"Decision Packet: {packet['experiment_id']}",
        f" Outcome: {packet['outcome']}",
        f" Action: {packet['action']}",
        f" Rationale: {packet['rationale']}",
        f" {metric_name}: {packet['current_metric']} (champion: {packet['champion_metric']})",
    ]
    delta = packet["delta"]
    if delta is not None:
        out.append(f" Delta: {delta:+.4f} relative")
    if packet["family"]:
        out.append(f" Family: {packet['family']}")
    return "\n".join(out)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def main() -> None:
    """CLI entry point: build and print a decision packet."""
    parser = argparse.ArgumentParser(description="Synthesize decision packet")
    parser.add_argument("--experiment", required=True, help="Experiment ID")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--hypotheses", default="hypotheses.yaml", help="Hypothesis queue path")
    parser.add_argument("--auto-queue", action="store_true", help="Auto-queue follow-up hypotheses")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    # Evaluation settings come from the optional project config.
    config: dict = {}
    config_path = Path(args.config)
    if config_path.exists():
        with open(config_path) as f:
            config = yaml.safe_load(f) or {}
    eval_cfg = config.get("evaluation", {})
    metric = eval_cfg.get("primary_metric", "accuracy")
    lower_is_better = eval_cfg.get("lower_is_better", False)

    experiment = load_experiment(args.log, args.experiment)
    if not experiment:
        print(f"Experiment {args.experiment} not found.", file=sys.stderr)
        sys.exit(1)

    champion = find_champion(args.log, metric, lower_is_better)
    # Never compare the champion against itself.
    if champion and champion.get("experiment_id") == experiment.get("experiment_id"):
        champion = None

    packet = synthesize_packet(experiment, champion, metric, lower_is_better)

    if args.json:
        print(json.dumps(packet, indent=2))
    else:
        print(format_packet(packet, metric))

    # Optionally push a follow-up hypothesis onto the queue.
    if args.auto_queue:
        hyp_id = auto_queue_followup(packet, args.hypotheses)
        if hyp_id:
            print(f"\n Auto-queued: {hyp_id} ({packet['action']})")
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
if __name__ == "__main__":
|
|
300
|
+
main()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Shared data loading functions for the autoresearch pipeline.
|
|
2
|
+
|
|
3
|
+
Consolidates the duplicated load_experiments, load_config, and
|
|
4
|
+
load_hypotheses functions that were copy-pasted across 8+ scripts.
|
|
5
|
+
Every script that reads experiment logs, config, or hypothesis
|
|
6
|
+
queues should import from here.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from scripts.turing_io import load_experiments, load_config, load_hypotheses
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_experiments(log_path: str) -> list[dict]:
    """Read every experiment record from a JSONL log.

    Args:
        log_path: Path to the experiments JSONL file.

    Returns:
        All parseable records, in file order. A missing file yields an
        empty list; blank and malformed lines are skipped silently.
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return []
    records: list[dict] = []
    with open(log_file) as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError:
                pass
    return records
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def load_config(path: str = "config.yaml") -> dict:
    """Parse a YAML config file, tolerating absence.

    Args:
        path: Config file location. Defaults to "config.yaml".

    Returns:
        The parsed mapping, or {} when the file is missing or the
        document is empty/falsy.
    """
    config_file = Path(path)
    if not config_file.exists():
        return {}
    with open(config_file) as handle:
        return yaml.safe_load(handle) or {}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def load_hypotheses(queue_path: str) -> list[dict]:
    """Read the hypothesis queue from a YAML file.

    Args:
        queue_path: Path to the hypotheses YAML file.

    Returns:
        The queue as a list of dicts; [] when the file is missing,
        empty, or its top-level document is not a list.
    """
    queue_file = Path(queue_path)
    if not queue_file.exists() or queue_file.stat().st_size == 0:
        return []
    with open(queue_file) as handle:
        parsed = yaml.safe_load(handle)
    if isinstance(parsed, list):
        return parsed
    return []
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Structured experiment state manager for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Replaces free-text MEMORY.md with a validated experiment_state.yaml
|
|
5
|
+
that the agent reads and writes in a schema-enforced format. The state
|
|
6
|
+
is machine-readable and programmatically queryable.
|
|
7
|
+
|
|
8
|
+
MEMORY.md is kept as a human-readable companion, generated from the
|
|
9
|
+
structured state.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python scripts/update_state.py init # Create empty state
|
|
13
|
+
python scripts/update_state.py show # Display current state
|
|
14
|
+
python scripts/update_state.py set-best <exp-id> <metrics_json>
|
|
15
|
+
python scripts/update_state.py add-observation <text> [--exp <exp-id>]
|
|
16
|
+
python scripts/update_state.py add-failure <text> --exp <exp-id> --reason <text>
|
|
17
|
+
python scripts/update_state.py add-direction <text> [--priority high|medium|low]
|
|
18
|
+
python scripts/update_state.py log-session <n_experiments> <best_metric>
|
|
19
|
+
python scripts/update_state.py generate-memory # Write MEMORY.md from state
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import copy
|
|
26
|
+
import json
|
|
27
|
+
import sys
|
|
28
|
+
from datetime import datetime, timezone
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
|
|
31
|
+
import yaml
|
|
32
|
+
|
|
33
|
+
DEFAULT_STATE_PATH = "experiment_state.yaml"
|
|
34
|
+
DEFAULT_MEMORY_PATH = ".claude/agent-memory/ml-researcher/MEMORY.md"
|
|
35
|
+
|
|
36
|
+
EMPTY_STATE = {
|
|
37
|
+
"goal": "",
|
|
38
|
+
"primary_metric": "",
|
|
39
|
+
"metric_direction": "",
|
|
40
|
+
"best_result": None,
|
|
41
|
+
"observations": [],
|
|
42
|
+
"failed_approaches": [],
|
|
43
|
+
"promising_directions": [],
|
|
44
|
+
"session_history": [],
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
REQUIRED_KEYS = set(EMPTY_STATE.keys())
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def load_state(path: str) -> dict:
    """Load experiment state from YAML, filling in any missing keys.

    Missing, empty, or non-mapping files yield a fresh deep copy of
    EMPTY_STATE, so callers may always mutate the result safely.
    """
    state_file = Path(path)
    if not state_file.exists() or state_file.stat().st_size == 0:
        return copy.deepcopy(EMPTY_STATE)
    with open(state_file) as handle:
        loaded = yaml.safe_load(handle)
    if not isinstance(loaded, dict):
        return copy.deepcopy(EMPTY_STATE)
    # Backfill schema keys absent from older state files.
    for key in REQUIRED_KEYS:
        loaded.setdefault(key, copy.deepcopy(EMPTY_STATE[key]))
    return loaded
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def save_state(path: str, state: dict) -> None:
    """Write experiment state to YAML, creating parent dirs as needed."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w") as handle:
        yaml.dump(
            state,
            handle,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def validate_state(state: dict) -> list[str]:
    """Check state against the expected schema.

    Returns:
        Human-readable error messages; an empty list means valid.
    """
    problems: list[str] = []
    problems.extend(
        f"Missing required key: {key}" for key in REQUIRED_KEYS if key not in state
    )

    best = state.get("best_result")
    if best is not None:
        if not isinstance(best, dict):
            problems.append("best_result must be a dict or null")
        elif "experiment_id" not in best or "metrics" not in best:
            problems.append("best_result must have experiment_id and metrics")

    # Each collection field must be a list (absent fields default to []).
    for list_key in ("observations", "failed_approaches",
                     "promising_directions", "session_history"):
        if not isinstance(state.get(list_key, []), list):
            problems.append(f"{list_key} must be a list")

    return problems
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def set_best(state: dict, experiment_id: str, metrics: dict) -> dict:
    """Record a new best result (mutates and returns `state`)."""
    record = {
        "experiment_id": experiment_id,
        "metrics": metrics,
        "updated_at": datetime.now(timezone.utc).isoformat(),
    }
    state["best_result"] = record
    return state
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def add_observation(state: dict, text: str, experiment_id: str | None = None) -> dict:
|
|
108
|
+
"""Add an observation to state."""
|
|
109
|
+
entry = {
|
|
110
|
+
"text": text,
|
|
111
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
112
|
+
}
|
|
113
|
+
if experiment_id:
|
|
114
|
+
entry["experiment_id"] = experiment_id
|
|
115
|
+
state["observations"].append(entry)
|
|
116
|
+
return state
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def add_failure(state: dict, description: str, experiment_id: str, reason: str) -> dict:
    """Append a failed-approach record (mutates and returns `state`)."""
    entry = {
        "description": description,
        "experiment_id": experiment_id,
        "reason": reason,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    state["failed_approaches"].append(entry)
    return state
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def add_direction(state: dict, description: str, priority: str = "medium") -> dict:
    """Append a promising direction (mutates and returns `state`)."""
    entry = {
        "description": description,
        "priority": priority,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    state["promising_directions"].append(entry)
    return state
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def log_session(state: dict, n_experiments: int, best_metric: float | None) -> dict:
|
|
141
|
+
"""Log a training session summary."""
|
|
142
|
+
state["session_history"].append({
|
|
143
|
+
"date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
|
|
144
|
+
"experiments_run": n_experiments,
|
|
145
|
+
"best_metric": best_metric,
|
|
146
|
+
})
|
|
147
|
+
return state
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def generate_memory(state: dict, goal: str = "", metric: str = "", direction: str = "") -> str:
    """Render MEMORY.md markdown content from the structured state.

    Explicit goal/metric/direction arguments override the values stored
    in the state. Only the last 10 observations are included; all other
    sections list everything recorded.
    """
    goal = goal or state.get("goal", "")
    metric = metric or state.get("primary_metric", "")
    direction = direction or state.get("metric_direction", "")

    out: list[str] = [
        "# ML Researcher Memory",
        "",
        "## Goal",
        "",
        goal or "(not set)",
        "",
        f"Primary metric: {metric} ({direction} is better).",
        "",
        "## Best Result",
        "",
    ]

    best = state.get("best_result")
    if best:
        summary = ", ".join(f"{k}={v}" for k, v in best.get("metrics", {}).items())
        out.append(f"Experiment {best.get('experiment_id', '?')}: {summary}")
    else:
        out.append("No experiments completed yet.")

    out += ["", "## Observations", ""]
    for obs in state.get("observations", [])[-10:]:  # most recent 10 only
        tag = f" ({obs['experiment_id']})" if obs.get("experiment_id") else ""
        out.append(f"- {obs['text']}{tag}")
    if not state.get("observations"):
        out.append("(none yet)")

    out += ["", "## Failed Approaches", ""]
    for failure in state.get("failed_approaches", []):
        out.append(f"- {failure['description']} ({failure['experiment_id']}): {failure['reason']}")
    if not state.get("failed_approaches"):
        out.append("(none yet)")

    out += ["", "## Promising Directions", ""]
    for item in state.get("promising_directions", []):
        tag = f" [{item['priority']}]" if item.get("priority") != "medium" else ""
        out.append(f"- {item['description']}{tag}")
    if not state.get("promising_directions"):
        out.append("(none yet)")

    out += ["", "## Session History", ""]
    history = state.get("session_history")
    if history:
        out.append("| Session | Experiments | Best Metric | Notes |")
        out.append("|---------|-------------|-------------|-------|")
        for sess in history:
            best_str = f"{sess['best_metric']:.4f}" if sess.get("best_metric") is not None else "N/A"
            out.append(f"| {sess.get('date', '?')} | {sess.get('experiments_run', 0)} | {best_str} | |")
    else:
        out.append("(no sessions yet)")

    out.append("")
    return "\n".join(out)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def main() -> None:
    """CLI entry point for managing structured experiment state."""
    parser = argparse.ArgumentParser(description="Manage experiment state")
    parser.add_argument("--state", default=DEFAULT_STATE_PATH)
    subparsers = parser.add_subparsers(dest="command")

    subparsers.add_parser("init", help="Create empty state")
    subparsers.add_parser("show", help="Display current state")
    subparsers.add_parser("validate", help="Validate state schema")

    sb = subparsers.add_parser("set-best")
    sb.add_argument("experiment_id")
    sb.add_argument("metrics_json")

    obs = subparsers.add_parser("add-observation")
    obs.add_argument("text")
    obs.add_argument("--exp", default=None)

    fail = subparsers.add_parser("add-failure")
    fail.add_argument("text")
    fail.add_argument("--exp", required=True)
    fail.add_argument("--reason", required=True)

    d = subparsers.add_parser("add-direction")
    d.add_argument("text")
    d.add_argument("--priority", default="medium", choices=["high", "medium", "low"])

    sess = subparsers.add_parser("log-session")
    sess.add_argument("n_experiments", type=int)
    sess.add_argument("best_metric", type=float, nargs="?", default=None)

    subparsers.add_parser("generate-memory", help="Write MEMORY.md from state")

    args = parser.parse_args()
    state = load_state(args.state)

    if args.command == "init":
        # Save a copy so the shared EMPTY_STATE constant can never be
        # mutated through the reference we hand to save_state.
        save_state(args.state, copy.deepcopy(EMPTY_STATE))
        print(f"Initialized empty state at {args.state}")

    elif args.command == "show":
        print(yaml.dump(state, default_flow_style=False, sort_keys=False))

    elif args.command == "validate":
        errors = validate_state(state)
        if errors:
            for e in errors:
                print(f" ERROR: {e}", file=sys.stderr)
            sys.exit(1)
        print("State is valid.")

    elif args.command == "set-best":
        metrics = json.loads(args.metrics_json)
        set_best(state, args.experiment_id, metrics)
        save_state(args.state, state)
        print(f"Best result set to {args.experiment_id}")

    elif args.command == "add-observation":
        add_observation(state, args.text, args.exp)
        save_state(args.state, state)
        print("Observation added.")

    elif args.command == "add-failure":
        add_failure(state, args.text, args.exp, args.reason)
        save_state(args.state, state)
        print("Failed approach recorded.")

    elif args.command == "add-direction":
        add_direction(state, args.text, args.priority)
        save_state(args.state, state)
        print("Promising direction added.")

    elif args.command == "log-session":
        log_session(state, args.n_experiments, args.best_metric)
        save_state(args.state, state)
        print("Session logged.")

    elif args.command == "generate-memory":
        memory_content = generate_memory(state)
        # Bug fix: this command is documented as "Write MEMORY.md from
        # state" but previously only printed the content, leaving
        # DEFAULT_MEMORY_PATH defined yet unused. Write the file and
        # keep printing for backward compatibility.
        memory_file = Path(DEFAULT_MEMORY_PATH)
        memory_file.parent.mkdir(parents=True, exist_ok=True)
        memory_file.write_text(memory_content)
        print(memory_content)

    else:
        parser.print_help()
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
if __name__ == "__main__":
|
|
296
|
+
main()
|