claude-turing 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +34 -0
- package/LICENSE +21 -0
- package/README.md +457 -0
- package/agents/ml-evaluator.md +43 -0
- package/agents/ml-researcher.md +74 -0
- package/bin/cli.js +46 -0
- package/bin/turing-init.sh +57 -0
- package/commands/brief.md +83 -0
- package/commands/compare.md +24 -0
- package/commands/design.md +97 -0
- package/commands/init.md +123 -0
- package/commands/logbook.md +51 -0
- package/commands/mode.md +43 -0
- package/commands/poster.md +89 -0
- package/commands/preflight.md +75 -0
- package/commands/report.md +97 -0
- package/commands/rules/loop-protocol.md +91 -0
- package/commands/status.md +24 -0
- package/commands/suggest.md +95 -0
- package/commands/sweep.md +45 -0
- package/commands/train.md +66 -0
- package/commands/try.md +63 -0
- package/commands/turing.md +54 -0
- package/commands/validate.md +34 -0
- package/config/defaults.yaml +45 -0
- package/config/experiment_archetypes.yaml +127 -0
- package/config/lifecycle.toml +31 -0
- package/config/novelty_aliases.yaml +107 -0
- package/config/relationships.toml +125 -0
- package/config/state.toml +24 -0
- package/config/task_taxonomy.yaml +110 -0
- package/config/taxonomy.toml +37 -0
- package/package.json +54 -0
- package/src/claude-md.js +55 -0
- package/src/install.js +107 -0
- package/src/paths.js +20 -0
- package/src/postinstall.js +22 -0
- package/src/verify.js +109 -0
- package/templates/MEMORY.md +36 -0
- package/templates/README.md +93 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/config.yaml +48 -0
- package/templates/evaluate.py +237 -0
- package/templates/features/__init__.py +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/features/featurizers.py +138 -0
- package/templates/prepare.py +171 -0
- package/templates/program.md +216 -0
- package/templates/pyproject.toml +8 -0
- package/templates/requirements.txt +8 -0
- package/templates/scripts/__init__.py +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/check_convergence.py +230 -0
- package/templates/scripts/compare_runs.py +124 -0
- package/templates/scripts/critique_hypothesis.py +350 -0
- package/templates/scripts/experiment_index.py +288 -0
- package/templates/scripts/generate_brief.py +389 -0
- package/templates/scripts/generate_logbook.py +423 -0
- package/templates/scripts/log_experiment.py +243 -0
- package/templates/scripts/manage_hypotheses.py +543 -0
- package/templates/scripts/novelty_guard.py +343 -0
- package/templates/scripts/parse_metrics.py +139 -0
- package/templates/scripts/post-train-hook.sh +74 -0
- package/templates/scripts/preflight.py +549 -0
- package/templates/scripts/scaffold.py +409 -0
- package/templates/scripts/show_environment.py +92 -0
- package/templates/scripts/show_experiment_tree.py +144 -0
- package/templates/scripts/show_families.py +133 -0
- package/templates/scripts/show_metrics.py +157 -0
- package/templates/scripts/statistical_compare.py +259 -0
- package/templates/scripts/stop-hook.sh +34 -0
- package/templates/scripts/suggest_next.py +301 -0
- package/templates/scripts/sweep.py +276 -0
- package/templates/scripts/synthesize_decision.py +300 -0
- package/templates/scripts/turing_io.py +76 -0
- package/templates/scripts/update_state.py +296 -0
- package/templates/scripts/validate_stability.py +167 -0
- package/templates/scripts/verify_placeholders.py +119 -0
- package/templates/sweep_config.yaml +14 -0
- package/templates/tests/__init__.py +0 -0
- package/templates/tests/conftest.py +91 -0
- package/templates/train.py +240 -0

package/templates/scripts/critique_hypothesis.py
@@ -0,0 +1,350 @@
#!/usr/bin/env python3
"""Hypothesis critique engine for the autoresearch pipeline.

Scores a proposed hypothesis on novelty, feasibility, and expected impact
before committing to an expensive training run. A 30-second critique is
cheaper than a 30-minute wasted experiment.

Integrates with the novelty guard (novelty_guard.py) for duplicate
detection and with experiment history for feasibility assessment.

Usage:
    python scripts/critique_hypothesis.py score "increase max_depth to 12" \\
        --log experiments/log.jsonl --config config.yaml
    python scripts/critique_hypothesis.py score "switch to LightGBM" \\
        --log experiments/log.jsonl --verbose
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

import yaml

from scripts.turing_io import load_experiments


def score_novelty(
    description: str,
    experiments: list[dict],
    novelty_result: dict | None = None,
) -> dict:
    """Score how novel this hypothesis is relative to experiment history.

    Returns dict with score (0-10), rationale, and similar experiments.
    """
    desc_lower = description.lower()
    desc_words = set(desc_lower.split())

    # Check for near-duplicates in experiment descriptions
    similar = []
    for exp in experiments:
        exp_desc = exp.get("description", "").lower()
        if not exp_desc:
            continue
        exp_words = set(exp_desc.split())
        if desc_words and exp_words:
            overlap = len(desc_words & exp_words) / max(len(desc_words), 1)
            if overlap > 0.5:
                similar.append({
                    "experiment_id": exp.get("experiment_id", "?"),
                    "description": exp.get("description", ""),
                    "status": exp.get("status", "unknown"),
                    "overlap": round(overlap, 2),
                })

    # If novelty guard was run, use its classification
    if novelty_result:
        classification = novelty_result.get("classification", "novel")
        if classification == "duplicate_run":
            return {"score": 0, "rationale": "Exact duplicate of prior experiment",
                    "similar": similar}
        elif classification == "repeat_failure":
            return {"score": 1, "rationale": "Repeats a known failed approach",
                    "similar": similar}
        elif classification == "incremental_followup":
            return {"score": 5, "rationale": "Incremental follow-up to existing work",
                    "similar": similar}

    # Score based on similarity
    if not similar:
        return {"score": 9, "rationale": "No similar experiments found — appears novel",
                "similar": []}

    # Check if similar experiments failed
    failed_similar = [s for s in similar if s["status"] == "discarded"]
    if failed_similar:
        max_overlap = max(s["overlap"] for s in failed_similar)
        if max_overlap > 0.7:
            return {"score": 2, "rationale": f"Very similar to {len(failed_similar)} discarded experiment(s)",
                    "similar": similar}
        return {"score": 4, "rationale": f"Partially similar to {len(failed_similar)} discarded experiment(s)",
                "similar": similar}

    # Similar to kept experiments = incremental
    kept_similar = [s for s in similar if s["status"] == "kept"]
    if kept_similar:
        return {"score": 6, "rationale": f"Builds on {len(kept_similar)} successful experiment(s)",
                "similar": similar}

    return {"score": 7, "rationale": "Some overlap but distinct approach",
            "similar": similar}

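# Worked example of the overlap rule above: "increase max_depth to 12"
# vs. a prior "increase max_depth to 8" shares 3 of the hypothesis's
# 4 tokens, so overlap = 3/4 = 0.75 and the prior run is flagged.
# Note the ratio is asymmetric: it divides by the hypothesis's own
# word count, not by the union of both word sets.
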
def score_feasibility(
    description: str,
    experiments: list[dict],
    config: dict,
) -> dict:
    """Score how feasible this hypothesis is given current infrastructure.

    Returns dict with score (0-10), rationale, and concerns.
    """
    desc_lower = description.lower()
    concerns = []

    # Check for model types that might not be available
    exotic_models = ["transformer", "bert", "gpt", "diffusion", "gan",
                     "vae", "autoencoder", "resnet", "efficientnet"]
    current_model = config.get("model", {}).get("type", "").lower()

    for model in exotic_models:
        if model in desc_lower and model not in current_model:
            concerns.append(f"Switching to {model} may require significant code changes and new dependencies")

    # Check for mentions of GPU/hardware requirements
    gpu_terms = ["gpu", "cuda", "multi-gpu", "distributed", "tpu"]
    for term in gpu_terms:
        if term in desc_lower:
            concerns.append(f"Mentions '{term}' — verify hardware availability")

    # Check for mentions of external data
    data_terms = ["download", "external data", "pretrained", "pre-trained",
                  "huggingface", "kaggle", "new dataset"]
    for term in data_terms:
        if term in desc_lower:
            concerns.append(f"May require external resources: '{term}'")

    # Score
    if len(concerns) >= 3:
        score = 3
    elif len(concerns) >= 2:
        score = 5
    elif len(concerns) == 1:
        score = 7
    else:
        score = 9

    rationale = "No feasibility concerns" if not concerns else f"{len(concerns)} concern(s) identified"
    return {"score": score, "rationale": rationale, "concerns": concerns}


def score_expected_impact(
    description: str,
    experiments: list[dict],
    config: dict,
) -> dict:
    """Score the expected impact of this hypothesis.

    Based on what types of changes have historically improved metrics.
    Returns dict with score (0-10), rationale, and evidence.
    """
    if not experiments:
        return {"score": 5, "rationale": "No experiment history — cannot estimate impact",
                "evidence": []}

    desc_lower = description.lower()
    evidence = []

    # Analyze which experiment types have been successful
    type_stats: dict[str, dict] = {}
    for exp in experiments:
        exp_type = exp.get("config", {}).get("experiment_type", "unknown")
        if exp_type not in type_stats:
            type_stats[exp_type] = {"kept": 0, "discarded": 0}
        if exp.get("status") == "kept":
            type_stats[exp_type]["kept"] += 1
        else:
            type_stats[exp_type]["discarded"] += 1

    # Infer experiment type from description
    type_keywords = {
        "hyperparameter": ["learning rate", "lr", "max_depth", "depth", "n_estimators",
                           "estimators", "batch", "epochs", "regularization", "dropout"],
        "architecture": ["switch to", "replace", "lightgbm", "catboost", "random forest",
                         "neural", "transformer", "model type"],
        "feature": ["feature", "polynomial", "interaction", "encoding", "one-hot",
                    "embedding", "normalize", "scale"],
        "data": ["augment", "oversample", "undersample", "smote", "split",
                 "validation", "cross-validation"],
    }

    inferred_type = "unknown"
    for exp_type, keywords in type_keywords.items():
        if any(kw in desc_lower for kw in keywords):
            inferred_type = exp_type
            break

    # Check historical success rate for this type
    if inferred_type in type_stats:
        stats = type_stats[inferred_type]
        total = stats["kept"] + stats["discarded"]
        if total > 0:
            success_rate = stats["kept"] / total
            evidence.append(f"{inferred_type} experiments: {stats['kept']}/{total} kept ({success_rate:.0%})")
            if success_rate > 0.5:
                return {"score": 8, "rationale": f"{inferred_type} changes have been successful ({success_rate:.0%} keep rate)",
                        "evidence": evidence}
            elif success_rate > 0.2:
                return {"score": 5, "rationale": f"{inferred_type} changes have mixed results ({success_rate:.0%} keep rate)",
                        "evidence": evidence}
            else:
                return {"score": 3, "rationale": f"{inferred_type} changes have mostly failed ({success_rate:.0%} keep rate)",
                        "evidence": evidence}

    # Check if we're in a diminishing returns zone
    kept_experiments = [e for e in experiments if e.get("status") == "kept"]
    if len(kept_experiments) >= 3:
        metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
        recent_metrics = []
        for exp in kept_experiments[-5:]:
            val = exp.get("metrics", {}).get(metric_name)
            if isinstance(val, (int, float)):
                recent_metrics.append(val)

        if len(recent_metrics) >= 3:
            improvements = [recent_metrics[i] - recent_metrics[i-1]
                            for i in range(1, len(recent_metrics))]
            avg_improvement = sum(improvements) / len(improvements) if improvements else 0
            if abs(avg_improvement) < 0.001:
                evidence.append(f"Recent improvements are very small (avg {avg_improvement:.4f})")
                return {"score": 4, "rationale": "In diminishing returns zone — incremental changes may not help",
                        "evidence": evidence}

    return {"score": 6, "rationale": "Insufficient history to estimate impact precisely",
            "evidence": evidence}


def critique_hypothesis(
    description: str,
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    novelty_result: dict | None = None,
) -> dict:
    """Run full critique on a hypothesis.

    Returns dict with:
        overall_score: float (0-10)
        verdict: str ("proceed", "modify", "reject")
        novelty: dict
        feasibility: dict
        impact: dict
    """
    experiments = load_experiments(log_path)

    config = {}
    if Path(config_path).exists():
        with open(config_path) as f:
            config = yaml.safe_load(f) or {}

    novelty = score_novelty(description, experiments, novelty_result)
    feasibility = score_feasibility(description, experiments, config)
    impact = score_expected_impact(description, experiments, config)

    # Weighted average: novelty 30%, feasibility 30%, impact 40%
    overall = (novelty["score"] * 0.3 +
               feasibility["score"] * 0.3 +
               impact["score"] * 0.4)
    overall = round(overall, 1)

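    # e.g. novelty 9, feasibility 7, impact 5 gives
    # 9*0.3 + 7*0.3 + 5*0.4 = 2.7 + 2.1 + 2.0 = 6.8, which clears the
    # "proceed" threshold below.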
    if overall >= 6.0:
        verdict = "proceed"
    elif overall >= 4.0:
        verdict = "modify"
    else:
        verdict = "reject"

    return {
        "overall_score": overall,
        "verdict": verdict,
        "novelty": novelty,
        "feasibility": feasibility,
        "impact": impact,
        "description": description,
    }


def format_critique(result: dict) -> str:
    """Format critique result for display."""
    lines = []
    lines.append(f"Hypothesis Critique: {result['description']}")
    lines.append("=" * 60)
    lines.append("")
    lines.append(f"Overall Score: {result['overall_score']}/10 → {result['verdict'].upper()}")
    lines.append("")

    for dim in ("novelty", "feasibility", "impact"):
        d = result[dim]
        lines.append(f"  {dim.title()}: {d['score']}/10 — {d['rationale']}")
        if d.get("concerns"):
            for c in d["concerns"]:
                lines.append(f"    ⚠ {c}")
        if d.get("evidence"):
            for e in d["evidence"]:
                lines.append(f"    → {e}")
        if d.get("similar"):
            for s in d["similar"][:3]:
                lines.append(f"    ~ {s['experiment_id']} ({s['status']}, {s['overlap']:.0%} overlap)")

    lines.append("")
    if result["verdict"] == "reject":
        lines.append("Recommendation: Reject this hypothesis. Consider a different approach.")
    elif result["verdict"] == "modify":
        lines.append("Recommendation: Modify this hypothesis to address the concerns above.")
    else:
        lines.append("Recommendation: Proceed with this hypothesis.")

    return "\n".join(lines)


def main() -> None:
    parser = argparse.ArgumentParser(description="Critique a hypothesis before execution")
    subparsers = parser.add_subparsers(dest="command")

    score_parser = subparsers.add_parser("score", help="Score a hypothesis")
    score_parser.add_argument("description", help="Hypothesis description")
    score_parser.add_argument("--log", default="experiments/log.jsonl")
    score_parser.add_argument("--config", default="config.yaml")
    score_parser.add_argument("--verbose", action="store_true")
    score_parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)

    result = critique_hypothesis(args.description, args.log, args.config)

    if args.json:
        # Remove non-serializable parts for clean JSON output
        output = {
            "overall_score": result["overall_score"],
            "verdict": result["verdict"],
            "novelty_score": result["novelty"]["score"],
            "feasibility_score": result["feasibility"]["score"],
            "impact_score": result["impact"]["score"],
        }
        if args.verbose:
            output["novelty"] = result["novelty"]
            output["feasibility"] = result["feasibility"]
            output["impact"] = result["impact"]
        print(json.dumps(output, indent=2))
    else:
        print(format_critique(result))


if __name__ == "__main__":
    main()
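
The critique engine is importable as well as script-driven. A minimal sketch of calling it from Python, assuming the template's scripts package is on the import path; the hypothesis string is made up for illustration, and the paths are the function's own defaults:

    from scripts.critique_hypothesis import critique_hypothesis, format_critique

    # Hypothetical hypothesis; log/config paths are the defaults above.
    result = critique_hypothesis(
        "switch to LightGBM with early stopping",
        log_path="experiments/log.jsonl",
        config_path="config.yaml",
    )
    print(format_critique(result))  # human-readable report
    print(result["verdict"])        # "proceed", "modify", or "reject"
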
package/templates/scripts/experiment_index.py
@@ -0,0 +1,288 @@
#!/usr/bin/env python3
"""Semantic experiment index for the autoresearch pipeline.

Indexes experiment descriptions and results using TF-IDF vectors,
enabling semantic search over experiment history. Complementary to
the novelty guard's keyword matching — this finds experiments by
meaning, not just shared tokens.

Uses sklearn's TfidfVectorizer (already a dependency) rather than
sentence-transformers + FAISS to avoid new heavyweight dependencies.

Usage:
    python scripts/experiment_index.py build --log experiments/log.jsonl
    python scripts/experiment_index.py query "experiments with high learning rate"
    python scripts/experiment_index.py query "overfitting with deep trees" --top 3
    python scripts/experiment_index.py similar exp-005 --top 5
"""

from __future__ import annotations

import argparse
import pickle
import sys
from pathlib import Path

import numpy as np

from scripts.turing_io import load_experiments


def experiment_to_text(exp: dict) -> str:
    """Convert an experiment to a searchable text representation.

    Combines description, config summary, metrics, and status into
    a single string that captures the experiment's essence.
    """
    parts = []

    desc = exp.get("description", "")
    if desc:
        parts.append(desc)

    # Config summary
    config = exp.get("config", {})
    model_type = config.get("model_type", "")
    if model_type:
        parts.append(f"model: {model_type}")

    exp_type = config.get("experiment_type", "")
    if exp_type:
        parts.append(f"type: {exp_type}")

    hyperparams = config.get("hyperparams", {})
    if hyperparams:
        hp_str = " ".join(f"{k}={v}" for k, v in hyperparams.items()
                          if isinstance(v, (int, float, str)))
        if hp_str:
            parts.append(f"hyperparams: {hp_str}")

    # Metrics
    metrics = exp.get("metrics", {})
    if metrics:
        metric_str = " ".join(f"{k}={v}" for k, v in metrics.items()
                              if isinstance(v, (int, float)))
        if metric_str:
            parts.append(f"metrics: {metric_str}")

    # Status
    status = exp.get("status", "")
    if status:
        parts.append(f"status: {status}")

    # Family and tags
    family = exp.get("family", "")
    if family:
        parts.append(f"family: {family}")

    tags = exp.get("tags", [])
    if tags:
        parts.append(f"tags: {' '.join(tags)}")

    return " | ".join(parts) if parts else ""

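# Example: a hypothetical record {"description": "deeper trees", "status": "kept",
#   "config": {"model_type": "xgboost", "hyperparams": {"max_depth": 10}},
#   "metrics": {"accuracy": 0.91}}
# serializes to:
#   "deeper trees | model: xgboost | hyperparams: max_depth=10 | metrics: accuracy=0.91 | status: kept"
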
class ExperimentIndex:
    """TF-IDF based semantic index over experiment history."""

    def __init__(self):
        self.vectorizer = None
        self.tfidf_matrix = None
        self.experiment_ids: list[str] = []
        self.experiment_texts: list[str] = []
        self.experiment_data: list[dict] = []

    def build(self, experiments: list[dict]) -> int:
        """Build the index from a list of experiments.

        Returns the number of experiments indexed.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer

        texts = []
        ids = []
        data = []

        for exp in experiments:
            text = experiment_to_text(exp)
            if text:
                texts.append(text)
                ids.append(exp.get("experiment_id", "?"))
                data.append(exp)

        if not texts:
            return 0

        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words="english",
            ngram_range=(1, 2),  # unigrams and bigrams
            sublinear_tf=True,
        )
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        self.experiment_ids = ids
        self.experiment_texts = texts
        self.experiment_data = data

        return len(texts)

    def query(self, query_text: str, top_k: int = 5) -> list[dict]:
        """Query the index with natural language.

        Returns top-K most similar experiments with scores.
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            return []

        query_vec = self.vectorizer.transform([query_text])

        # Cosine similarity = dot product (TF-IDF vectors are already normalized)
        similarities = (self.tfidf_matrix @ query_vec.T).toarray().flatten()
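        # The dot product equals cosine similarity here because
        # TfidfVectorizer L2-normalizes its rows by default (norm="l2"),
        # so no division by vector magnitudes is needed.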

        # Get top-K indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            score = float(similarities[idx])
            if score <= 0:
                continue
            exp = self.experiment_data[idx]
            results.append({
                "experiment_id": self.experiment_ids[idx],
                "similarity": round(score, 4),
                "description": exp.get("description", ""),
                "status": exp.get("status", "unknown"),
                "metrics": exp.get("metrics", {}),
                "config": exp.get("config", {}),
            })

        return results

    def find_similar(self, experiment_id: str, top_k: int = 5) -> list[dict]:
        """Find experiments similar to a given experiment ID."""
        try:
            idx = self.experiment_ids.index(experiment_id)
        except ValueError:
            return []

        return self.query(self.experiment_texts[idx], top_k + 1)

    def save(self, path: str) -> None:
        """Save the index to disk."""
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        with open(p, "wb") as f:
            pickle.dump({
                "vectorizer": self.vectorizer,
                "tfidf_matrix": self.tfidf_matrix,
                "experiment_ids": self.experiment_ids,
                "experiment_texts": self.experiment_texts,
                "experiment_data": self.experiment_data,
            }, f)

    @classmethod
    def load(cls, path: str) -> ExperimentIndex | None:
        """Load an index from disk."""
        p = Path(path)
        if not p.exists():
            return None
        try:
            with open(p, "rb") as f:
                data = pickle.load(f)
            idx = cls()
            idx.vectorizer = data["vectorizer"]
            idx.tfidf_matrix = data["tfidf_matrix"]
            idx.experiment_ids = data["experiment_ids"]
            idx.experiment_texts = data["experiment_texts"]
            idx.experiment_data = data["experiment_data"]
            return idx
        except Exception:
            return None


def format_results(results: list[dict], metric_name: str = "accuracy") -> str:
    """Format query results for display."""
    if not results:
        return "No matching experiments found."

    lines = [f"Top {len(results)} similar experiments:", ""]
    for i, r in enumerate(results, 1):
        metric_val = r["metrics"].get(metric_name, "?")
        lines.append(f"  {i}. {r['experiment_id']} (similarity: {r['similarity']:.3f})")
        lines.append(f"     {r['description']}")
        lines.append(f"     Status: {r['status']}, {metric_name}: {metric_val}")
        lines.append("")

    return "\n".join(lines)


def main() -> None:
    parser = argparse.ArgumentParser(description="Semantic experiment index")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--index", default="experiments/index.pkl")

    subparsers = parser.add_subparsers(dest="command")

    # build
    subparsers.add_parser("build", help="Build/rebuild the index")

    # query
    query_parser = subparsers.add_parser("query", help="Search experiments by description")
    query_parser.add_argument("text", help="Query text")
    query_parser.add_argument("--top", type=int, default=5)

    # similar
    sim_parser = subparsers.add_parser("similar", help="Find experiments similar to a given one")
    sim_parser.add_argument("experiment_id", help="Experiment ID")
    sim_parser.add_argument("--top", type=int, default=5)

    args = parser.parse_args()

    if args.command == "build":
        experiments = load_experiments(args.log)
        if not experiments:
            print("No experiments found.", file=sys.stderr)
            sys.exit(1)

        idx = ExperimentIndex()
        count = idx.build(experiments)
        idx.save(args.index)
        print(f"Indexed {count} experiments → {args.index}")

    elif args.command == "query":
        idx = ExperimentIndex.load(args.index)
        if idx is None:
            # Build on the fly
            experiments = load_experiments(args.log)
            if not experiments:
                print("No experiments to search.", file=sys.stderr)
                sys.exit(1)
            idx = ExperimentIndex()
            idx.build(experiments)

        results = idx.query(args.text, args.top)
        print(format_results(results))

    elif args.command == "similar":
        idx = ExperimentIndex.load(args.index)
        if idx is None:
            experiments = load_experiments(args.log)
            if not experiments:
                print("No experiments to search.", file=sys.stderr)
                sys.exit(1)
            idx = ExperimentIndex()
            idx.build(experiments)

        results = idx.find_similar(args.experiment_id, args.top)
        # Filter out the query experiment itself
        results = [r for r in results if r["experiment_id"] != args.experiment_id]
        print(format_results(results))

    else:
        parser.print_help()


if __name__ == "__main__":
    main()
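
The index is likewise usable from Python directly. A minimal sketch, assuming experiments have already been logged to the default JSONL path; the query string is illustrative:

    from scripts.experiment_index import ExperimentIndex
    from scripts.turing_io import load_experiments

    # Build an in-memory index, as the CLI does when no saved index exists.
    experiments = load_experiments("experiments/log.jsonl")
    index = ExperimentIndex()
    print(f"indexed {index.build(experiments)} experiments")

    # Top-3 hits ranked by TF-IDF cosine similarity.
    for hit in index.query("overfitting with deep trees", top_k=3):
        print(hit["experiment_id"], hit["similarity"], hit["description"])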