claude-turing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0

package/templates/scripts/novelty_guard.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3
+"""History-aware novelty guard for the autoresearch pipeline.
+
+Checks a proposed experiment description against prior experiments
+to classify it as novel, known_success, incremental_followup,
+repeat_failure, or duplicate_run. Applies a mode policy (explore,
+exploit, replicate) to determine whether to proceed.
+
+Prevents the agent from wasting iterations re-trying things it has
+already tried, especially across /loop sessions where context is lost.
+
+The matcher is intentionally heuristic — rule-based token matching
+with configurable alias tables, not embedding search. Fast, inspectable,
+and dependency-free.
+
+Usage:
+    python scripts/novelty_guard.py check \\
+        --description "increase max_depth to 8" \\
+        --log experiments/log.jsonl \\
+        --mode exploit
+
+    python scripts/novelty_guard.py check \\
+        --description "switch to LightGBM" \\
+        --log experiments/log.jsonl \\
+        --mode explore
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+import yaml
+
+TOKEN_RE = re.compile(r"[a-z0-9_]+")
+NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
+
+DEFAULT_ALIASES_PATH = "config/novelty_aliases.yaml"
+# Fallback if running from a scaffolded project without the config dir
+PLUGIN_ALIASES_PATHS = [
+    Path(__file__).parent.parent.parent / "config" / "novelty_aliases.yaml",
+    Path(__file__).parent.parent / "config" / "novelty_aliases.yaml",
+]
+
+NOVELTY_CLASSES = ("duplicate_run", "repeat_failure", "incremental_followup", "known_success", "novel")
+
+
+def load_aliases(path: str | None = None) -> dict:
+    """Load alias configuration from YAML."""
+    candidates = [Path(path)] if path else [Path(DEFAULT_ALIASES_PATH)] + PLUGIN_ALIASES_PATHS
+    for p in candidates:
+        if p.exists():
+            with open(p) as f:
+                return yaml.safe_load(f) or {}
+    return {}
+
+
+def normalize_text(text: str, aliases: dict) -> str:
+    """Normalize experiment description for comparison.
+
+    1. Lowercase
+    2. Apply phrase aliases (multi-word → token)
+    3. Tokenize
+    4. Remove stopwords
+    5. Apply token aliases (synonym → canonical)
+    6. Sort and deduplicate
+    """
+    text = text.lower().strip()
+
+    # Apply phrase aliases first (before tokenization breaks multi-word phrases)
+    for phrase, replacement in aliases.get("phrase_aliases", {}).items():
+        text = text.replace(phrase.lower(), replacement.lower())
+
+    # Tokenize
+    tokens = TOKEN_RE.findall(text)
+
+    # Remove stopwords
+    stopwords = set(aliases.get("stopwords", []))
+    tokens = [t for t in tokens if t not in stopwords and (len(t) > 1 or t in ("up", "nn"))]
+
+    # Apply token aliases
+    token_map = aliases.get("token_aliases", {})
+    tokens = [token_map.get(t, t) for t in tokens]
+
+    return " ".join(sorted(set(tokens)))
+
+
+def extract_numbers(text: str) -> set[str]:
+    """Extract all numbers from text for numeric overlap scoring."""
+    return set(NUMBER_RE.findall(text.lower()))
+
+
+def token_similarity(text_a: str, text_b: str) -> float:
+    """Jaccard similarity between normalized token sets."""
+    tokens_a = set(text_a.split())
+    tokens_b = set(text_b.split())
+    if not tokens_a or not tokens_b:
+        return 0.0
+    intersection = tokens_a & tokens_b
+    union = tokens_a | tokens_b
+    return len(intersection) / len(union)
+
+
+def number_overlap(text_a: str, text_b: str) -> float:
+    """Fraction of numbers in text_a that also appear in text_b."""
+    nums_a = extract_numbers(text_a)
+    nums_b = extract_numbers(text_b)
+    if not nums_a:
+        return 0.0
+    return len(nums_a & nums_b) / len(nums_a)
+
+
+def concept_overlap(text_a: str, text_b: str, aliases: dict) -> float:
+    """Fraction of concept categories shared between two texts."""
+    patterns = aliases.get("concept_patterns", {})
+    if not patterns:
+        return 0.0
+
+    def concepts_for(text: str) -> set[str]:
+        found = set()
+        tokens = set(text.split())
+        for concept, keywords in patterns.items():
+            if any(kw in tokens or kw in text for kw in keywords):
+                found.add(concept)
+        return found
+
+    concepts_a = concepts_for(text_a)
+    concepts_b = concepts_for(text_b)
+    if not concepts_a or not concepts_b:
+        return 0.0
+    intersection = concepts_a & concepts_b
+    union = concepts_a | concepts_b
+    return len(intersection) / len(union)
+
+
+def similarity_score(
+    text_a: str,
+    text_b: str,
+    raw_a: str,
+    raw_b: str,
+    aliases: dict,
+) -> float:
+    """Blended similarity score combining token, number, and concept overlap.
+
+    Returns float in [0, 1]. Higher = more similar.
+    """
+    tok_sim = token_similarity(text_a, text_b)
+    num_sim = number_overlap(raw_a, raw_b)
+    con_sim = concept_overlap(text_a, text_b, aliases)
+
+    # Weighted blend: tokens matter most, concepts second, numbers third
+    return 0.5 * tok_sim + 0.3 * con_sim + 0.2 * num_sim
+
+
+def classify_against_history(
+    proposed_normalized: str,
+    proposed_raw: str,
+    history: list[dict],
+    aliases: dict,
+    threshold: float = 0.6,
+) -> tuple[str, float, dict | None]:
+    """Classify a proposed experiment against history.
+
+    Returns (classification, best_score, best_match_record).
+
+    Classifications:
+    - duplicate_run: score >= 0.9 (nearly identical)
+    - repeat_failure: score >= threshold and best match was discarded
+    - incremental_followup: score >= threshold * 0.8 and best match was kept
+    - known_success: score >= threshold and best match was kept
+    - novel: score < threshold (nothing similar enough)
+    """
+    best_score = 0.0
+    best_match = None
+
+    for record in history:
+        desc = record.get("description", "")
+        normalized = normalize_text(desc, aliases)
+        score = similarity_score(proposed_normalized, normalized, proposed_raw, desc, aliases)
+        if score > best_score:
+            best_score = score
+            best_match = record
+
+    if best_score >= 0.9:
+        return "duplicate_run", best_score, best_match
+    if best_score >= threshold:
+        status = best_match.get("status", "") if best_match else ""
+        if status == "discarded":
+            return "repeat_failure", best_score, best_match
+        elif status == "kept":
+            return "known_success", best_score, best_match
+    if best_score >= threshold * 0.8 and best_match:
+        status = best_match.get("status", "")
+        if status == "kept":
+            return "incremental_followup", best_score, best_match
+
+    return "novel", best_score, best_match
+
+
+def apply_mode_policy(
+    classification: str,
+    mode: str,
+    aliases: dict,
+) -> tuple[str, str]:
+    """Apply research mode policy to a novelty classification.
+
+    Returns (decision, reason) where decision is "allow", "block", or "caution".
+    """
+    policies = aliases.get("mode_policies", {})
+    mode_policy = policies.get(mode, {})
+
+    if not mode_policy:
+        return "allow", f"no policy defined for mode '{mode}'"
+
+    decision = mode_policy.get(classification, "allow")
+    reasons = {
+        ("explore", "novel"): "novel enough for exploration",
+        ("explore", "duplicate_run"): "duplicate work is not exploration",
+        ("explore", "repeat_failure"): "repeating a failed idea is not exploratory",
+        ("explore", "incremental_followup"): "follow-up work belongs in exploit mode",
+        ("explore", "known_success"): "reusing a known success belongs in exploit mode",
+        ("exploit", "novel"): "novel work is fine but not targeted exploitation",
+        ("exploit", "duplicate_run"): "exact duplicates belong in replicate mode",
+        ("exploit", "repeat_failure"): "prior failures are poor exploitation candidates",
+        ("exploit", "incremental_followup"): "close to a known success — good exploitation",
+        ("exploit", "known_success"): "refinement of proven approach",
+        ("replicate", "novel"): "nothing close enough in history to replicate",
+        ("replicate", "duplicate_run"): "intentional replication run",
+        ("replicate", "known_success"): "replicating a proven result",
+    }
+
+    reason = reasons.get((mode, classification), f"{classification} under {mode} mode")
+    return decision, reason
+
+
+def check_novelty(
+    description: str,
+    log_path: str,
+    mode: str = "exploit",
+    aliases_path: str | None = None,
+    threshold: float = 0.6,
+) -> dict:
+    """Main entry point: check a proposed experiment against history.
+
+    Returns dict with: classification, score, decision, reason, top_match.
+    """
+    aliases = load_aliases(aliases_path)
+    normalized = normalize_text(description, aliases)
+
+    # Load experiment history
+    history = []
+    path = Path(log_path)
+    if path.exists():
+        with open(path) as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    try:
+                        history.append(json.loads(line))
+                    except json.JSONDecodeError:
+                        continue
+
+    if not history:
+        return {
+            "classification": "novel",
+            "score": 0.0,
+            "decision": "allow",
+            "reason": "no prior experiments — everything is novel",
+            "top_match": None,
+        }
+
+    classification, score, top_match = classify_against_history(
+        normalized, description, history, aliases, threshold,
+    )
+    decision, reason = apply_mode_policy(classification, mode, aliases)
+
+    result = {
+        "classification": classification,
+        "score": round(score, 4),
+        "decision": decision,
+        "reason": reason,
+        "top_match": {
+            "experiment_id": top_match.get("experiment_id", "?"),
+            "description": top_match.get("description", ""),
+            "status": top_match.get("status", ""),
+        } if top_match else None,
+    }
+
+    return result
+
+
+def format_result(result: dict) -> str:
+    """Format novelty check result for display."""
+    lines = [
+        f"Novelty: {result['classification']} (score: {result['score']:.2f})",
+        f"Decision: {result['decision']} — {result['reason']}",
+    ]
+    if result.get("top_match"):
+        m = result["top_match"]
+        lines.append(f"Nearest: {m['experiment_id']} ({m['status']}) — {m['description'][:60]}")
+    return "\n".join(lines)
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Novelty guard for experiment proposals")
+    subparsers = parser.add_subparsers(dest="command")
+
+    check = subparsers.add_parser("check", help="Check a proposed experiment")
+    check.add_argument("--description", required=True, help="Experiment description")
+    check.add_argument("--log", default="experiments/log.jsonl", help="Experiment log path")
+    check.add_argument("--mode", default="exploit", choices=["explore", "exploit", "replicate"])
+    check.add_argument("--aliases", default=None, help="Path to novelty_aliases.yaml")
+    check.add_argument("--threshold", type=float, default=0.6, help="Similarity threshold")
+    check.add_argument("--json", action="store_true", help="Output as JSON")
+
+    args = parser.parse_args()
+
+    if args.command == "check":
+        result = check_novelty(
+            description=args.description,
+            log_path=args.log,
+            mode=args.mode,
+            aliases_path=args.aliases,
+            threshold=args.threshold,
+        )
+
+        if args.json:
+            print(json.dumps(result, indent=2))
+        else:
+            print(format_result(result))
+
+        if result["decision"] == "block":
+            sys.exit(1)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
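
For orientation, the same check can be driven as a library call instead of through the CLI. A minimal sketch, assuming the scaffolded project root is the working directory; the description, log path, and mode mirror the usage examples in the docstring above:

    # Minimal library-usage sketch; the description and paths are illustrative.
    from scripts.novelty_guard import check_novelty, format_result

    result = check_novelty(
        description="increase max_depth to 8",
        log_path="experiments/log.jsonl",
        mode="exploit",      # one of: explore, exploit, replicate
        threshold=0.6,       # similarity cut-off passed to classify_against_history
    )
    print(format_result(result))
    # result["decision"] is "allow", "caution", or "block"; the CLI turns
    # "block" into a non-zero exit code.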

package/templates/scripts/parse_metrics.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""Canonical metric parser for the autoresearch pipeline.
+
+Single source of truth for parsing the --- delimited metric format
+produced by evaluate.py's format_metrics(). Replaces three independent
+parsers (agent grep, bash awk, stop-hook reader) with one testable module.
+
+The format:
+    ---
+    metric_name: value
+    ...
+    model_type: xgboost
+    train_seconds: 2.5
+    ---
+
+Metadata keys (model_type, train_seconds) are separated from metric keys.
+
+Usage:
+    python scripts/parse_metrics.py <run.log>             # Print parsed JSON
+    python scripts/parse_metrics.py <run.log> --metrics   # Metrics only (no metadata)
+    python scripts/parse_metrics.py <run.log> --raw       # Full dict including metadata
+
+As a library:
+    from scripts.parse_metrics import parse_run_log
+    metrics, metadata = parse_run_log("run.log")
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+METADATA_KEYS = {"model_type", "train_seconds"}
+
+
+def parse_metrics_block(text: str) -> tuple[dict[str, float], dict[str, str]]:
+    """Parse a --- delimited metrics block into metrics and metadata.
+
+    Args:
+        text: String containing the --- delimited block (may have surrounding text).
+
+    Returns:
+        Tuple of (metrics_dict, metadata_dict).
+        metrics_dict has float values. metadata_dict has string values.
+    """
+    lines = text.strip().split("\n")
+    metrics: dict[str, float] = {}
+    metadata: dict[str, str] = {}
+    in_block = False
+
+    for line in lines:
+        line = line.strip()
+        if line == "---":
+            if in_block:
+                break  # end of block
+            in_block = True
+            continue
+
+        if in_block and ":" in line:
+            # Split on first colon only (handles values with colons)
+            key, value = line.split(":", 1)
+            key = key.strip()
+            value = value.strip()
+
+            if not key or not value:
+                continue
+
+            if key in METADATA_KEYS:
+                metadata[key] = value
+            else:
+                try:
+                    metrics[key] = float(value)
+                except ValueError:
+                    metadata[key] = value
+
+    return metrics, metadata
+
+
+def parse_run_log(log_path: str) -> tuple[dict[str, float], dict[str, str]]:
+    """Parse metrics from a run.log file.
+
+    Args:
+        log_path: Path to the run.log file.
+
+    Returns:
+        Tuple of (metrics_dict, metadata_dict).
+
+    Raises:
+        FileNotFoundError: If log_path does not exist.
+    """
+    path = Path(log_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Log file not found: {log_path}")
+
+    text = path.read_text(encoding="utf-8")
+    return parse_metrics_block(text)
+
+
+def metrics_to_json(metrics: dict[str, float], metadata: dict[str, str]) -> tuple[str, str]:
+    """Convert parsed metrics and metadata to JSON strings.
+
+    Returns (metrics_json, config_json) suitable for log_experiment.py CLI.
+    """
+    metrics_json = json.dumps(metrics)
+    config_json = json.dumps(metadata)
+    return metrics_json, config_json
+
+
+def main() -> None:
+    """CLI entry point."""
+    parser = argparse.ArgumentParser(description="Parse metrics from run.log")
+    parser.add_argument("log_file", help="Path to run.log")
+    parser.add_argument("--metrics", action="store_true", help="Print metrics only (no metadata)")
+    parser.add_argument("--raw", action="store_true", help="Print full dict including metadata")
+    args = parser.parse_args()
+
+    try:
+        metrics, metadata = parse_run_log(args.log_file)
+    except FileNotFoundError as e:
+        print(str(e), file=sys.stderr)
+        sys.exit(1)
+
+    if not metrics and not metadata:
+        print("No metrics block found.", file=sys.stderr)
+        sys.exit(1)
+
+    if args.metrics:
+        print(json.dumps(metrics, indent=2))
+    elif args.raw:
+        combined = {**metrics, **metadata}
+        print(json.dumps(combined, indent=2))
+    else:
+        print(json.dumps({"metrics": metrics, "metadata": metadata}, indent=2))
+
+
+if __name__ == "__main__":
+    main()
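
As a quick illustration of the block format documented above, the sketch below feeds a hand-written run.log excerpt through parse_metrics_block; the metric names and values are invented for the example:

    # Illustrative only: the metric names and values are made up.
    from scripts.parse_metrics import parse_metrics_block

    sample = """
    epoch 10 done, writing metrics
    ---
    rmse: 0.4213
    r2: 0.8731
    model_type: xgboost
    train_seconds: 2.5
    ---
    """
    metrics, metadata = parse_metrics_block(sample)
    # metrics  == {"rmse": 0.4213, "r2": 0.8731}
    # metadata == {"model_type": "xgboost", "train_seconds": "2.5"}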

package/templates/scripts/post-train-hook.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# Post-training hook for the autoresearch pipeline.
+#
+# Fired by Claude Code PostToolUse hook after train.py executes.
+# Delegates metric parsing to scripts/parse_metrics.py (the canonical
+# parser) rather than reimplementing it in bash.
+#
+# This hook ensures that every training run is logged, even if the
+# agent forgets to call log_experiment.py explicitly. Belt and suspenders.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ML_DIR="$(dirname "$SCRIPT_DIR")"
+EXPERIMENT_LOG="${ML_DIR}/experiments/log.jsonl"
+
+# Check for run.log in ML_DIR first, then current directory
+if [[ -f "${ML_DIR}/run.log" ]]; then
+    LOG_FILE="${ML_DIR}/run.log"
+elif [[ -f "run.log" ]]; then
+    LOG_FILE="run.log"
+else
+    echo "post-train-hook: No run.log found, skipping."
+    exit 0
+fi
+
+# Activate venv and delegate to Python
+cd "$ML_DIR"
+source .venv/bin/activate 2>/dev/null || true
+
+# Parse metrics using the canonical parser
+PARSED=$(python3 scripts/parse_metrics.py "$LOG_FILE" --raw 2>/dev/null) || {
+    echo "post-train-hook: No metrics block found in run.log, skipping."
+    exit 0
+}
+
+# Extract metrics and metadata via Python (avoids bash JSON construction)
+METRICS_JSON=$(python3 -c "
+import json, sys
+data = json.loads(sys.argv[1])
+metadata_keys = {'model_type', 'train_seconds'}
+metrics = {k: v for k, v in data.items() if k not in metadata_keys}
+print(json.dumps(metrics))
+" "$PARSED")
+
+CONFIG_JSON=$(python3 -c "
+import json, sys
+data = json.loads(sys.argv[1])
+metadata_keys = {'model_type', 'train_seconds'}
+config = {k: v for k, v in data.items() if k in metadata_keys}
+print(json.dumps(config))
+" "$PARSED")
+
+# Get git commit
+GIT_COMMIT=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
+
+# Get next experiment ID
+NEXT_ID=$(python3 -c "
+import sys; sys.path.insert(0, 'scripts')
+from log_experiment import get_next_experiment_id
+print(get_next_experiment_id('$EXPERIMENT_LOG'))
+")
+
+# Log the experiment
+python3 scripts/log_experiment.py \
+    "$EXPERIMENT_LOG" \
+    "$NEXT_ID" \
+    "kept" \
+    "$METRICS_JSON" \
+    "$CONFIG_JSON" \
+    "models/model.joblib" \
+    "Auto-logged by post-train-hook"
+
+echo "post-train-hook: Logged ${NEXT_ID}"
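
For completeness, here is a rough Python equivalent of the pipeline the hook automates, run from the project root. The import style and the positional-argument order for log_experiment.py are copied from the hook itself; the note string is illustrative:

    # Sketch of the hook's parse -> split -> log pipeline in plain Python.
    import subprocess
    import sys

    sys.path.insert(0, "scripts")
    from parse_metrics import parse_run_log, metrics_to_json
    from log_experiment import get_next_experiment_id  # same import the hook uses

    metrics, metadata = parse_run_log("run.log")
    metrics_json, config_json = metrics_to_json(metrics, metadata)
    next_id = get_next_experiment_id("experiments/log.jsonl")

    subprocess.run(
        [
            "python3", "scripts/log_experiment.py",
            "experiments/log.jsonl",   # experiment log (as in the hook)
            str(next_id),              # experiment id
            "kept",                    # status
            metrics_json,              # metrics JSON
            config_json,               # config / metadata JSON
            "models/model.joblib",     # model artifact path
            "Logged manually",         # note (illustrative)
        ],
        check=True,
    )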