claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
package/templates/scripts/novelty_guard.py
@@ -0,0 +1,343 @@
+ #!/usr/bin/env python3
+ """History-aware novelty guard for the autoresearch pipeline.
+
+ Checks a proposed experiment description against prior experiments
+ to classify it as novel, known_success, incremental_followup,
+ repeat_failure, or duplicate_run. Applies a mode policy (explore,
+ exploit, replicate) to determine whether to proceed.
+
+ Prevents the agent from wasting iterations re-trying things it has
+ already tried, especially across /loop sessions where context is lost.
+
+ The matcher is intentionally heuristic — rule-based token matching
+ with configurable alias tables, not embedding search. Fast, inspectable,
+ and free of heavy dependencies (PyYAML only).
+
+ Usage:
+     python scripts/novelty_guard.py check \\
+         --description "increase max_depth to 8" \\
+         --log experiments/log.jsonl \\
+         --mode exploit
+
+     python scripts/novelty_guard.py check \\
+         --description "switch to LightGBM" \\
+         --log experiments/log.jsonl \\
+         --mode explore
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import re
+ import sys
+ from pathlib import Path
+
+ import yaml
+
+ TOKEN_RE = re.compile(r"[a-z0-9_]+")
+ NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
+
+ DEFAULT_ALIASES_PATH = "config/novelty_aliases.yaml"
+ # Fallback if running from a scaffolded project without the config dir
+ PLUGIN_ALIASES_PATHS = [
+     Path(__file__).parent.parent.parent / "config" / "novelty_aliases.yaml",
+     Path(__file__).parent.parent / "config" / "novelty_aliases.yaml",
+ ]
+
+ NOVELTY_CLASSES = ("duplicate_run", "repeat_failure", "incremental_followup", "known_success", "novel")
+
+
+ def load_aliases(path: str | None = None) -> dict:
+     """Load alias configuration from YAML."""
+     candidates = [Path(path)] if path else [Path(DEFAULT_ALIASES_PATH)] + PLUGIN_ALIASES_PATHS
+     for p in candidates:
+         if p.exists():
+             with open(p) as f:
+                 return yaml.safe_load(f) or {}
+     return {}
+
+
+ def normalize_text(text: str, aliases: dict) -> str:
+     """Normalize experiment description for comparison.
+
+     1. Lowercase
+     2. Apply phrase aliases (multi-word → token)
+     3. Tokenize
+     4. Remove stopwords
+     5. Apply token aliases (synonym → canonical)
+     6. Sort and deduplicate
+     """
+     text = text.lower().strip()
+
+     # Apply phrase aliases first (before tokenization breaks multi-word phrases)
+     for phrase, replacement in aliases.get("phrase_aliases", {}).items():
+         text = text.replace(phrase.lower(), replacement.lower())
+
+     # Tokenize
+     tokens = TOKEN_RE.findall(text)
+
+     # Remove stopwords
+     stopwords = set(aliases.get("stopwords", []))
+     tokens = [t for t in tokens if t not in stopwords and (len(t) > 1 or t in ("up", "nn"))]
+
+     # Apply token aliases
+     token_map = aliases.get("token_aliases", {})
+     tokens = [token_map.get(t, t) for t in tokens]
+
+     return " ".join(sorted(set(tokens)))
+
+
+ def extract_numbers(text: str) -> set[str]:
+     """Extract all numbers from text for numeric overlap scoring."""
+     return set(NUMBER_RE.findall(text.lower()))
+
+
+ def token_similarity(text_a: str, text_b: str) -> float:
+     """Jaccard similarity between normalized token sets."""
+     tokens_a = set(text_a.split())
+     tokens_b = set(text_b.split())
+     if not tokens_a or not tokens_b:
+         return 0.0
+     intersection = tokens_a & tokens_b
+     union = tokens_a | tokens_b
+     return len(intersection) / len(union)
+
+
+ def number_overlap(text_a: str, text_b: str) -> float:
+     """Fraction of numbers in text_a that also appear in text_b."""
+     nums_a = extract_numbers(text_a)
+     nums_b = extract_numbers(text_b)
+     if not nums_a:
+         return 0.0
+     return len(nums_a & nums_b) / len(nums_a)
+
+
+ def concept_overlap(text_a: str, text_b: str, aliases: dict) -> float:
+     """Fraction of concept categories shared between two texts."""
+     patterns = aliases.get("concept_patterns", {})
+     if not patterns:
+         return 0.0
+
+     def concepts_for(text: str) -> set[str]:
+         found = set()
+         tokens = set(text.split())
+         for concept, keywords in patterns.items():
+             if any(kw in tokens or kw in text for kw in keywords):
+                 found.add(concept)
+         return found
+
+     concepts_a = concepts_for(text_a)
+     concepts_b = concepts_for(text_b)
+     if not concepts_a or not concepts_b:
+         return 0.0
+     intersection = concepts_a & concepts_b
+     union = concepts_a | concepts_b
+     return len(intersection) / len(union)
+
+
+ def similarity_score(
+     text_a: str,
+     text_b: str,
+     raw_a: str,
+     raw_b: str,
+     aliases: dict,
+ ) -> float:
+     """Blended similarity score combining token, number, and concept overlap.
+
+     Returns float in [0, 1]. Higher = more similar.
+     """
+     tok_sim = token_similarity(text_a, text_b)
+     num_sim = number_overlap(raw_a, raw_b)
+     con_sim = concept_overlap(text_a, text_b, aliases)
+
+     # Weighted blend: tokens matter most, concepts second, numbers third
+     return 0.5 * tok_sim + 0.3 * con_sim + 0.2 * num_sim
+
+
+ def classify_against_history(
+     proposed_normalized: str,
+     proposed_raw: str,
+     history: list[dict],
+     aliases: dict,
+     threshold: float = 0.6,
+ ) -> tuple[str, float, dict | None]:
+     """Classify a proposed experiment against history.
+
+     Returns (classification, best_score, best_match_record).
+
+     Classifications:
+     - duplicate_run: score >= 0.9 (nearly identical)
+     - repeat_failure: score >= threshold and best match was discarded
+     - known_success: score >= threshold and best match was kept
+     - incremental_followup: threshold * 0.8 <= score < threshold and best match was kept
+     - novel: nothing similar enough under the rules above
+     """
+     best_score = 0.0
+     best_match = None
+
+     for record in history:
+         desc = record.get("description", "")
+         normalized = normalize_text(desc, aliases)
+         score = similarity_score(proposed_normalized, normalized, proposed_raw, desc, aliases)
+         if score > best_score:
+             best_score = score
+             best_match = record
+
+     if best_score >= 0.9:
+         return "duplicate_run", best_score, best_match
+     if best_score >= threshold:
+         status = best_match.get("status", "") if best_match else ""
+         if status == "discarded":
+             return "repeat_failure", best_score, best_match
+         elif status == "kept":
+             return "known_success", best_score, best_match
+     if best_score >= threshold * 0.8 and best_match:
+         status = best_match.get("status", "")
+         if status == "kept":
+             return "incremental_followup", best_score, best_match
+
+     return "novel", best_score, best_match
+
+
+ def apply_mode_policy(
+     classification: str,
+     mode: str,
+     aliases: dict,
+ ) -> tuple[str, str]:
+     """Apply research mode policy to a novelty classification.
+
+     Returns (decision, reason) where decision is "allow", "block", or "caution".
+     """
+     policies = aliases.get("mode_policies", {})
+     mode_policy = policies.get(mode, {})
+
+     if not mode_policy:
+         return "allow", f"no policy defined for mode '{mode}'"
+
+     decision = mode_policy.get(classification, "allow")
+     reasons = {
+         ("explore", "novel"): "novel enough for exploration",
+         ("explore", "duplicate_run"): "duplicate work is not exploration",
+         ("explore", "repeat_failure"): "repeating a failed idea is not exploratory",
+         ("explore", "incremental_followup"): "follow-up work belongs in exploit mode",
+         ("explore", "known_success"): "reusing a known success belongs in exploit mode",
+         ("exploit", "novel"): "novel work is fine but not targeted exploitation",
+         ("exploit", "duplicate_run"): "exact duplicates belong in replicate mode",
+         ("exploit", "repeat_failure"): "prior failures are poor exploitation candidates",
+         ("exploit", "incremental_followup"): "close to a known success — good exploitation",
+         ("exploit", "known_success"): "refinement of proven approach",
+         ("replicate", "novel"): "nothing close enough in history to replicate",
+         ("replicate", "duplicate_run"): "intentional replication run",
+         ("replicate", "known_success"): "replicating a proven result",
+     }
+
+     reason = reasons.get((mode, classification), f"{classification} under {mode} mode")
+     return decision, reason
+
+
+ def check_novelty(
+     description: str,
+     log_path: str,
+     mode: str = "exploit",
+     aliases_path: str | None = None,
+     threshold: float = 0.6,
+ ) -> dict:
+     """Main entry point: check a proposed experiment against history.
+
+     Returns dict with: classification, score, decision, reason, top_match.
+     """
+     aliases = load_aliases(aliases_path)
+     normalized = normalize_text(description, aliases)
+
+     # Load experiment history
+     history = []
+     path = Path(log_path)
+     if path.exists():
+         with open(path) as f:
+             for line in f:
+                 line = line.strip()
+                 if line:
+                     try:
+                         history.append(json.loads(line))
+                     except json.JSONDecodeError:
+                         continue
+
+     if not history:
+         return {
+             "classification": "novel",
+             "score": 0.0,
+             "decision": "allow",
+             "reason": "no prior experiments — everything is novel",
+             "top_match": None,
+         }
+
+     classification, score, top_match = classify_against_history(
+         normalized, description, history, aliases, threshold,
+     )
+     decision, reason = apply_mode_policy(classification, mode, aliases)
+
+     result = {
+         "classification": classification,
+         "score": round(score, 4),
+         "decision": decision,
+         "reason": reason,
+         "top_match": {
+             "experiment_id": top_match.get("experiment_id", "?"),
+             "description": top_match.get("description", ""),
+             "status": top_match.get("status", ""),
+         } if top_match else None,
+     }
+
+     return result
+
+
+ def format_result(result: dict) -> str:
+     """Format novelty check result for display."""
+     lines = [
+         f"Novelty: {result['classification']} (score: {result['score']:.2f})",
+         f"Decision: {result['decision']} — {result['reason']}",
+     ]
+     if result.get("top_match"):
+         m = result["top_match"]
+         lines.append(f"Nearest: {m['experiment_id']} ({m['status']}) — {m['description'][:60]}")
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Novelty guard for experiment proposals")
+     subparsers = parser.add_subparsers(dest="command")
+
+     check = subparsers.add_parser("check", help="Check a proposed experiment")
+     check.add_argument("--description", required=True, help="Experiment description")
+     check.add_argument("--log", default="experiments/log.jsonl", help="Experiment log path")
+     check.add_argument("--mode", default="exploit", choices=["explore", "exploit", "replicate"])
+     check.add_argument("--aliases", default=None, help="Path to novelty_aliases.yaml")
+     check.add_argument("--threshold", type=float, default=0.6, help="Similarity threshold")
+     check.add_argument("--json", action="store_true", help="Output as JSON")
+
+     args = parser.parse_args()
+
+     if args.command == "check":
+         result = check_novelty(
+             description=args.description,
+             log_path=args.log,
+             mode=args.mode,
+             aliases_path=args.aliases,
+             threshold=args.threshold,
+         )
+
+         if args.json:
+             print(json.dumps(result, indent=2))
+         else:
+             print(format_result(result))
+
+         if result["decision"] == "block":
+             sys.exit(1)
+     else:
+         parser.print_help()
+
+
+ if __name__ == "__main__":
+     main()
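The guard is driven entirely by its alias tables. As a point of reference, here is a minimal sketch of the novelty_aliases.yaml schema the script reads; the five top-level keys are exactly the ones looked up by load_aliases(), normalize_text(), concept_overlap(), and apply_mode_policy() above, while every concrete entry is hypothetical and not copied from the shipped config/novelty_aliases.yaml:

    # Hypothetical entries for illustration; only the key structure is authoritative.
    stopwords: [the, a, an, to, and, use]
    phrase_aliases:
      "gradient boosting": gbm        # rewritten before tokenization
    token_aliases:
      lgbm: lightgbm                  # synonym -> canonical token
    concept_patterns:
      tree_model: [xgboost, lightgbm, gbm]
      depth_tuning: [max_depth, depth]
    mode_policies:
      exploit:                        # decision per classification
        duplicate_run: block
        repeat_failure: block
        incremental_followup: allow
        known_success: allow
        novel: caution

Any classification missing from a mode's policy falls back to "allow", and an unknown mode allows everything, so the tables fail open.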
package/templates/scripts/parse_metrics.py
@@ -0,0 +1,139 @@
+ #!/usr/bin/env python3
+ """Canonical metric parser for the autoresearch pipeline.
+
+ Single source of truth for parsing the --- delimited metric format
+ produced by evaluate.py's format_metrics(). Replaces three independent
+ parsers (agent grep, bash awk, stop-hook reader) with one testable module.
+
+ The format:
+     ---
+     metric_name: value
+     ...
+     model_type: xgboost
+     train_seconds: 2.5
+     ---
+
+ Metadata keys (model_type, train_seconds) are separated from metric keys.
+
+ Usage:
+     python scripts/parse_metrics.py <run.log>            # Print parsed JSON
+     python scripts/parse_metrics.py <run.log> --metrics  # Metrics only (no metadata)
+     python scripts/parse_metrics.py <run.log> --raw      # Full dict including metadata
+
+ As a library:
+     from scripts.parse_metrics import parse_run_log
+     metrics, metadata = parse_run_log("run.log")
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ METADATA_KEYS = {"model_type", "train_seconds"}
+
+
+ def parse_metrics_block(text: str) -> tuple[dict[str, float], dict[str, str]]:
+     """Parse a --- delimited metrics block into metrics and metadata.
+
+     Args:
+         text: String containing the --- delimited block (may have surrounding text).
+
+     Returns:
+         Tuple of (metrics_dict, metadata_dict).
+         metrics_dict has float values. metadata_dict has string values.
+     """
+     lines = text.strip().split("\n")
+     metrics: dict[str, float] = {}
+     metadata: dict[str, str] = {}
+     in_block = False
+
+     for line in lines:
+         line = line.strip()
+         if line == "---":
+             if in_block:
+                 break  # end of block
+             in_block = True
+             continue
+
+         if in_block and ":" in line:
+             # Split on first colon only (handles values with colons)
+             key, value = line.split(":", 1)
+             key = key.strip()
+             value = value.strip()
+
+             if not key or not value:
+                 continue
+
+             if key in METADATA_KEYS:
+                 metadata[key] = value
+             else:
+                 try:
+                     metrics[key] = float(value)
+                 except ValueError:
+                     metadata[key] = value
+
+     return metrics, metadata
+
+
+ def parse_run_log(log_path: str) -> tuple[dict[str, float], dict[str, str]]:
+     """Parse metrics from a run.log file.
+
+     Args:
+         log_path: Path to the run.log file.
+
+     Returns:
+         Tuple of (metrics_dict, metadata_dict).
+
+     Raises:
+         FileNotFoundError: If log_path does not exist.
+     """
+     path = Path(log_path)
+     if not path.exists():
+         raise FileNotFoundError(f"Log file not found: {log_path}")
+
+     text = path.read_text(encoding="utf-8")
+     return parse_metrics_block(text)
+
+
+ def metrics_to_json(metrics: dict[str, float], metadata: dict[str, str]) -> tuple[str, str]:
+     """Convert parsed metrics and metadata to JSON strings.
+
+     Returns (metrics_json, config_json) suitable for log_experiment.py CLI.
+     """
+     metrics_json = json.dumps(metrics)
+     config_json = json.dumps(metadata)
+     return metrics_json, config_json
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(description="Parse metrics from run.log")
+     parser.add_argument("log_file", help="Path to run.log")
+     parser.add_argument("--metrics", action="store_true", help="Print metrics only (no metadata)")
+     parser.add_argument("--raw", action="store_true", help="Print full dict including metadata")
+     args = parser.parse_args()
+
+     try:
+         metrics, metadata = parse_run_log(args.log_file)
+     except FileNotFoundError as e:
+         print(str(e), file=sys.stderr)
+         sys.exit(1)
+
+     if not metrics and not metadata:
+         print("No metrics block found.", file=sys.stderr)
+         sys.exit(1)
+
+     if args.metrics:
+         print(json.dumps(metrics, indent=2))
+     elif args.raw:
+         combined = {**metrics, **metadata}
+         print(json.dumps(combined, indent=2))
+     else:
+         print(json.dumps({"metrics": metrics, "metadata": metadata}, indent=2))
+
+
+ if __name__ == "__main__":
+     main()
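A quick worked example of the split the parser makes (a doctest-style sketch; the accuracy and f1 metric names are invented, while model_type and train_seconds mirror the format shown in the module docstring):

    >>> from scripts.parse_metrics import parse_metrics_block
    >>> text = "---\naccuracy: 0.9312\nf1: 0.8875\nmodel_type: xgboost\ntrain_seconds: 2.5\n---"
    >>> parse_metrics_block(text)
    ({'accuracy': 0.9312, 'f1': 0.8875}, {'model_type': 'xgboost', 'train_seconds': '2.5'})

Note that train_seconds stays a string: any key listed in METADATA_KEYS is routed to metadata before float coercion is attempted.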
package/templates/scripts/post-train-hook.sh
@@ -0,0 +1,74 @@
+ #!/usr/bin/env bash
+ # Post-training hook for the autoresearch pipeline.
+ #
+ # Fired by Claude Code PostToolUse hook after train.py executes.
+ # Delegates metric parsing to scripts/parse_metrics.py (the canonical
+ # parser) rather than reimplementing it in bash.
+ #
+ # This hook ensures that every training run is logged, even if the
+ # agent forgets to call log_experiment.py explicitly. Belt and suspenders.
+
+ set -euo pipefail
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ML_DIR="$(dirname "$SCRIPT_DIR")"
+ EXPERIMENT_LOG="${ML_DIR}/experiments/log.jsonl"
+
+ # Check for run.log in ML_DIR first, then current directory
+ if [[ -f "${ML_DIR}/run.log" ]]; then
+     LOG_FILE="${ML_DIR}/run.log"
+ elif [[ -f "run.log" ]]; then
+     LOG_FILE="run.log"
+ else
+     echo "post-train-hook: No run.log found, skipping."
+     exit 0
+ fi
+
+ # Activate venv and delegate to Python
+ cd "$ML_DIR"
+ source .venv/bin/activate 2>/dev/null || true
+
+ # Parse metrics using the canonical parser
+ PARSED=$(python3 scripts/parse_metrics.py "$LOG_FILE" --raw 2>/dev/null) || {
+     echo "post-train-hook: No metrics block found in run.log, skipping."
+     exit 0
+ }
+
+ # Extract metrics and metadata via Python (avoids bash JSON construction)
+ METRICS_JSON=$(python3 -c "
+ import json, sys
+ data = json.loads(sys.argv[1])
+ metadata_keys = {'model_type', 'train_seconds'}
+ metrics = {k: v for k, v in data.items() if k not in metadata_keys}
+ print(json.dumps(metrics))
+ " "$PARSED")
+
+ CONFIG_JSON=$(python3 -c "
+ import json, sys
+ data = json.loads(sys.argv[1])
+ metadata_keys = {'model_type', 'train_seconds'}
+ config = {k: v for k, v in data.items() if k in metadata_keys}
+ print(json.dumps(config))
+ " "$PARSED")
+
+ # Get git commit
+ GIT_COMMIT=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
+
+ # Get next experiment ID
+ NEXT_ID=$(python3 -c "
+ import sys; sys.path.insert(0, 'scripts')
+ from log_experiment import get_next_experiment_id
+ print(get_next_experiment_id('$EXPERIMENT_LOG'))
+ ")
+
+ # Log the experiment
+ python3 scripts/log_experiment.py \
+     "$EXPERIMENT_LOG" \
+     "$NEXT_ID" \
+     "kept" \
+     "$METRICS_JSON" \
+     "$CONFIG_JSON" \
+     "models/model.joblib" \
+     "Auto-logged by post-train-hook"
+
+ echo "post-train-hook: Logged ${NEXT_ID}"