claude-turing 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. package/.claude-plugin/plugin.json +34 -0
  2. package/LICENSE +21 -0
  3. package/README.md +457 -0
  4. package/agents/ml-evaluator.md +43 -0
  5. package/agents/ml-researcher.md +74 -0
  6. package/bin/cli.js +46 -0
  7. package/bin/turing-init.sh +57 -0
  8. package/commands/brief.md +83 -0
  9. package/commands/compare.md +24 -0
  10. package/commands/design.md +97 -0
  11. package/commands/init.md +123 -0
  12. package/commands/logbook.md +51 -0
  13. package/commands/mode.md +43 -0
  14. package/commands/poster.md +89 -0
  15. package/commands/preflight.md +75 -0
  16. package/commands/report.md +97 -0
  17. package/commands/rules/loop-protocol.md +91 -0
  18. package/commands/status.md +24 -0
  19. package/commands/suggest.md +95 -0
  20. package/commands/sweep.md +45 -0
  21. package/commands/train.md +66 -0
  22. package/commands/try.md +63 -0
  23. package/commands/turing.md +54 -0
  24. package/commands/validate.md +34 -0
  25. package/config/defaults.yaml +45 -0
  26. package/config/experiment_archetypes.yaml +127 -0
  27. package/config/lifecycle.toml +31 -0
  28. package/config/novelty_aliases.yaml +107 -0
  29. package/config/relationships.toml +125 -0
  30. package/config/state.toml +24 -0
  31. package/config/task_taxonomy.yaml +110 -0
  32. package/config/taxonomy.toml +37 -0
  33. package/package.json +54 -0
  34. package/src/claude-md.js +55 -0
  35. package/src/install.js +107 -0
  36. package/src/paths.js +20 -0
  37. package/src/postinstall.js +22 -0
  38. package/src/verify.js +109 -0
  39. package/templates/MEMORY.md +36 -0
  40. package/templates/README.md +93 -0
  41. package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
  42. package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
  43. package/templates/config.yaml +48 -0
  44. package/templates/evaluate.py +237 -0
  45. package/templates/features/__init__.py +0 -0
  46. package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
  47. package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
  48. package/templates/features/featurizers.py +138 -0
  49. package/templates/prepare.py +171 -0
  50. package/templates/program.md +216 -0
  51. package/templates/pyproject.toml +8 -0
  52. package/templates/requirements.txt +8 -0
  53. package/templates/scripts/__init__.py +0 -0
  54. package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
  56. package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
  57. package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
  58. package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
  59. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  60. package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
  61. package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
  62. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  63. package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
  64. package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
  65. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  66. package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
  67. package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
  68. package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
  69. package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
  70. package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
  71. package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
  72. package/templates/scripts/__pycache__/turing_io.cpython-314.pyc +0 -0
  73. package/templates/scripts/__pycache__/update_state.cpython-314.pyc +0 -0
  74. package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
  75. package/templates/scripts/check_convergence.py +230 -0
  76. package/templates/scripts/compare_runs.py +124 -0
  77. package/templates/scripts/critique_hypothesis.py +350 -0
  78. package/templates/scripts/experiment_index.py +288 -0
  79. package/templates/scripts/generate_brief.py +389 -0
  80. package/templates/scripts/generate_logbook.py +423 -0
  81. package/templates/scripts/log_experiment.py +243 -0
  82. package/templates/scripts/manage_hypotheses.py +543 -0
  83. package/templates/scripts/novelty_guard.py +343 -0
  84. package/templates/scripts/parse_metrics.py +139 -0
  85. package/templates/scripts/post-train-hook.sh +74 -0
  86. package/templates/scripts/preflight.py +549 -0
  87. package/templates/scripts/scaffold.py +409 -0
  88. package/templates/scripts/show_environment.py +92 -0
  89. package/templates/scripts/show_experiment_tree.py +144 -0
  90. package/templates/scripts/show_families.py +133 -0
  91. package/templates/scripts/show_metrics.py +157 -0
  92. package/templates/scripts/statistical_compare.py +259 -0
  93. package/templates/scripts/stop-hook.sh +34 -0
  94. package/templates/scripts/suggest_next.py +301 -0
  95. package/templates/scripts/sweep.py +276 -0
  96. package/templates/scripts/synthesize_decision.py +300 -0
  97. package/templates/scripts/turing_io.py +76 -0
  98. package/templates/scripts/update_state.py +296 -0
  99. package/templates/scripts/validate_stability.py +167 -0
  100. package/templates/scripts/verify_placeholders.py +119 -0
  101. package/templates/sweep_config.yaml +14 -0
  102. package/templates/tests/__init__.py +0 -0
  103. package/templates/tests/conftest.py +91 -0
  104. package/templates/train.py +240 -0
@@ -0,0 +1,350 @@
1
+ #!/usr/bin/env python3
2
+ """Hypothesis critique engine for the autoresearch pipeline.
3
+
4
+ Scores a proposed hypothesis on novelty, feasibility, and expected impact
5
+ before committing to an expensive training run. A 30-second critique is
6
+ cheaper than a 30-minute wasted experiment.
7
+
8
+ Integrates with the novelty guard (novelty_guard.py) for duplicate
9
+ detection and with experiment history for feasibility assessment.
10
+
11
+ Usage:
12
+ python scripts/critique_hypothesis.py score "increase max_depth to 12" \\
13
+ --log experiments/log.jsonl --config config.yaml
14
+ python scripts/critique_hypothesis.py score "switch to LightGBM" \\
15
+ --log experiments/log.jsonl --verbose
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ import yaml
26
+
27
+ from scripts.turing_io import load_experiments
28
+
29
+
30
+ def score_novelty(
31
+ description: str,
32
+ experiments: list[dict],
33
+ novelty_result: dict | None = None,
34
+ ) -> dict:
35
+ """Score how novel this hypothesis is relative to experiment history.
36
+
37
+ Returns dict with score (0-10), rationale, and similar experiments.
38
+ """
39
+ desc_lower = description.lower()
40
+ desc_words = set(desc_lower.split())
41
+
42
+ # Check for near-duplicates in experiment descriptions
43
+ similar = []
44
+ for exp in experiments:
45
+ exp_desc = exp.get("description", "").lower()
46
+ if not exp_desc:
47
+ continue
48
+ exp_words = set(exp_desc.split())
49
+ if desc_words and exp_words:
50
+ overlap = len(desc_words & exp_words) / max(len(desc_words), 1)
51
+ if overlap > 0.5:
52
+ similar.append({
53
+ "experiment_id": exp.get("experiment_id", "?"),
54
+ "description": exp.get("description", ""),
55
+ "status": exp.get("status", "unknown"),
56
+ "overlap": round(overlap, 2),
57
+ })
58
+
59
+ # If novelty guard was run, use its classification
60
+ if novelty_result:
61
+ classification = novelty_result.get("classification", "novel")
62
+ if classification == "duplicate_run":
63
+ return {"score": 0, "rationale": "Exact duplicate of prior experiment",
64
+ "similar": similar}
65
+ elif classification == "repeat_failure":
66
+ return {"score": 1, "rationale": f"Repeats a known failed approach",
67
+ "similar": similar}
68
+ elif classification == "incremental_followup":
69
+ return {"score": 5, "rationale": "Incremental follow-up to existing work",
70
+ "similar": similar}
71
+
72
+ # Score based on similarity
73
+ if not similar:
74
+ return {"score": 9, "rationale": "No similar experiments found — appears novel",
75
+ "similar": []}
76
+
77
+ # Check if similar experiments failed
78
+ failed_similar = [s for s in similar if s["status"] == "discarded"]
79
+ if failed_similar:
80
+ max_overlap = max(s["overlap"] for s in failed_similar)
81
+ if max_overlap > 0.7:
82
+ return {"score": 2, "rationale": f"Very similar to {len(failed_similar)} discarded experiment(s)",
83
+ "similar": similar}
84
+ return {"score": 4, "rationale": f"Partially similar to {len(failed_similar)} discarded experiment(s)",
85
+ "similar": similar}
86
+
87
+ # Similar to kept experiments = incremental
88
+ kept_similar = [s for s in similar if s["status"] == "kept"]
89
+ if kept_similar:
90
+ return {"score": 6, "rationale": f"Builds on {len(kept_similar)} successful experiment(s)",
91
+ "similar": similar}
92
+
93
+ return {"score": 7, "rationale": "Some overlap but distinct approach",
94
+ "similar": similar}
95
+
96
+
97
def score_feasibility(
    description: str,
    experiments: list[dict],
    config: dict,
) -> dict:
    """Score how feasible this hypothesis is given current infrastructure.

    Scans the hypothesis text for red flags: exotic model families that the
    current setup does not use, hardware requirements, and external-data
    needs. More flags mean a lower score.

    Returns dict with score (0-10), rationale, and concerns.
    """
    text = description.lower()
    current_model = config.get("model", {}).get("type", "").lower()
    concerns: list[str] = []

    # Model families that would likely need new code paths / dependencies,
    # unless the project is already using that family.
    for name in ("transformer", "bert", "gpt", "diffusion", "gan",
                 "vae", "autoencoder", "resnet", "efficientnet"):
        if name in text and name not in current_model:
            concerns.append(f"Switching to {name} may require significant code changes and new dependencies")

    # Hardware hints worth double-checking before committing to a run.
    concerns.extend(
        f"Mentions '{term}' — verify hardware availability"
        for term in ("gpu", "cuda", "multi-gpu", "distributed", "tpu")
        if term in text
    )

    # External-resource hints (datasets, pretrained weights, downloads).
    concerns.extend(
        f"May require external resources: '{term}'"
        for term in ("download", "external data", "pretrained", "pre-trained",
                     "huggingface", "kaggle", "new dataset")
        if term in text
    )

    # Fixed four-step scale: each extra concern drops the score.
    n = len(concerns)
    if n >= 3:
        score = 3
    elif n == 2:
        score = 5
    elif n == 1:
        score = 7
    else:
        score = 9

    rationale = f"{n} concern(s) identified" if concerns else "No feasibility concerns"
    return {"score": score, "rationale": rationale, "concerns": concerns}
143
+
144
+
145
def score_expected_impact(
    description: str,
    experiments: list[dict],
    config: dict,
) -> dict:
    """Score the expected impact of this hypothesis.

    Based on what types of changes have historically improved metrics.
    Returns dict with score (0-10), rationale, and evidence.
    """
    if not experiments:
        return {"score": 5, "rationale": "No experiment history — cannot estimate impact",
                "evidence": []}

    text = description.lower()
    evidence: list[str] = []

    # Tally keep/discard counts per experiment type seen in history.
    type_stats: dict[str, dict] = {}
    for exp in experiments:
        exp_type = exp.get("config", {}).get("experiment_type", "unknown")
        bucket = type_stats.setdefault(exp_type, {"kept": 0, "discarded": 0})
        bucket["kept" if exp.get("status") == "kept" else "discarded"] += 1

    # Guess what kind of change the hypothesis proposes; first match wins,
    # in this fixed priority order.
    type_keywords = {
        "hyperparameter": ["learning rate", "lr", "max_depth", "depth", "n_estimators",
                           "estimators", "batch", "epochs", "regularization", "dropout"],
        "architecture": ["switch to", "replace", "lightgbm", "catboost", "random forest",
                         "neural", "transformer", "model type"],
        "feature": ["feature", "polynomial", "interaction", "encoding", "one-hot",
                    "embedding", "normalize", "scale"],
        "data": ["augment", "oversample", "undersample", "smote", "split",
                 "validation", "cross-validation"],
    }
    inferred_type = next(
        (t for t, kws in type_keywords.items() if any(kw in text for kw in kws)),
        "unknown",
    )

    # Historical success rate for the inferred change type.
    if inferred_type in type_stats:
        stats = type_stats[inferred_type]
        total = stats["kept"] + stats["discarded"]
        if total > 0:
            success_rate = stats["kept"] / total
            evidence.append(f"{inferred_type} experiments: {stats['kept']}/{total} kept ({success_rate:.0%})")
            if success_rate > 0.5:
                return {"score": 8, "rationale": f"{inferred_type} changes have been successful ({success_rate:.0%} keep rate)",
                        "evidence": evidence}
            if success_rate > 0.2:
                return {"score": 5, "rationale": f"{inferred_type} changes have mixed results ({success_rate:.0%} keep rate)",
                        "evidence": evidence}
            return {"score": 3, "rationale": f"{inferred_type} changes have mostly failed ({success_rate:.0%} keep rate)",
                    "evidence": evidence}

    # Diminishing-returns check over the most recent kept runs.
    kept = [e for e in experiments if e.get("status") == "kept"]
    if len(kept) >= 3:
        metric_name = config.get("evaluation", {}).get("primary_metric", "accuracy")
        recent = [v for v in (e.get("metrics", {}).get(metric_name) for e in kept[-5:])
                  if isinstance(v, (int, float))]
        if len(recent) >= 3:
            deltas = [later - earlier for earlier, later in zip(recent, recent[1:])]
            avg_delta = sum(deltas) / len(deltas) if deltas else 0
            if abs(avg_delta) < 0.001:
                evidence.append(f"Recent improvements are very small (avg {avg_delta:.4f})")
                return {"score": 4, "rationale": "In diminishing returns zone — incremental changes may not help",
                        "evidence": evidence}

    return {"score": 6, "rationale": "Insufficient history to estimate impact precisely",
            "evidence": evidence}
229
+
230
+
231
def critique_hypothesis(
    description: str,
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    novelty_result: dict | None = None,
) -> dict:
    """Run full critique on a hypothesis.

    Returns dict with:
        overall_score: float (0-10)
        verdict: str ("proceed", "modify", "reject")
        novelty: dict
        feasibility: dict
        impact: dict
    """
    experiments = load_experiments(log_path)

    # Project config is optional; missing file just means empty config.
    config: dict = {}
    cfg = Path(config_path)
    if cfg.exists():
        with cfg.open() as f:
            config = yaml.safe_load(f) or {}

    novelty = score_novelty(description, experiments, novelty_result)
    feasibility = score_feasibility(description, experiments, config)
    impact = score_expected_impact(description, experiments, config)

    # Weighted average — impact counts slightly more than the other two.
    weighted = ((novelty, 0.3), (feasibility, 0.3), (impact, 0.4))
    overall = round(sum(dim["score"] * weight for dim, weight in weighted), 1)

    if overall >= 6.0:
        verdict = "proceed"
    elif overall >= 4.0:
        verdict = "modify"
    else:
        verdict = "reject"

    return {
        "overall_score": overall,
        "verdict": verdict,
        "novelty": novelty,
        "feasibility": feasibility,
        "impact": impact,
        "description": description,
    }
278
+
279
+
280
def format_critique(result: dict) -> str:
    """Render a critique result as a human-readable multi-line report."""
    out = [
        f"Hypothesis Critique: {result['description']}",
        "=" * 60,
        "",
        f"Overall Score: {result['overall_score']}/10 → {result['verdict'].upper()}",
        "",
    ]

    # One section per scoring dimension, followed by its supporting detail
    # (concerns, evidence, and up to three similar experiments).
    for dim in ("novelty", "feasibility", "impact"):
        d = result[dim]
        out.append(f" {dim.title()}: {d['score']}/10 — {d['rationale']}")
        out.extend(f" ⚠ {c}" for c in d.get("concerns") or [])
        out.extend(f" → {e}" for e in d.get("evidence") or [])
        out.extend(
            f" ~ {s['experiment_id']} ({s['status']}, {s['overlap']:.0%} overlap)"
            for s in (d.get("similar") or [])[:3]
        )

    out.append("")
    closing = {
        "reject": "Recommendation: Reject this hypothesis. Consider a different approach.",
        "modify": "Recommendation: Modify this hypothesis to address the concerns above.",
    }
    out.append(closing.get(result["verdict"],
                           "Recommendation: Proceed with this hypothesis."))

    return "\n".join(out)
311
+
312
+
313
def main() -> None:
    """CLI entry point: parse arguments, critique, and print the result."""
    parser = argparse.ArgumentParser(description="Critique a hypothesis before execution")
    subparsers = parser.add_subparsers(dest="command")

    score_parser = subparsers.add_parser("score", help="Score a hypothesis")
    score_parser.add_argument("description", help="Hypothesis description")
    score_parser.add_argument("--log", default="experiments/log.jsonl")
    score_parser.add_argument("--config", default="config.yaml")
    score_parser.add_argument("--verbose", action="store_true")
    score_parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)

    result = critique_hypothesis(args.description, args.log, args.config)

    if not args.json:
        print(format_critique(result))
        return

    # Flat score summary by default; nested detail dicts only with --verbose.
    payload = {
        "overall_score": result["overall_score"],
        "verdict": result["verdict"],
        "novelty_score": result["novelty"]["score"],
        "feasibility_score": result["feasibility"]["score"],
        "impact_score": result["impact"]["score"],
    }
    if args.verbose:
        for dim in ("novelty", "feasibility", "impact"):
            payload[dim] = result[dim]
    print(json.dumps(payload, indent=2))
347
+
348
+
349
# Run the CLI only when executed as a script; importing this module
# (e.g. from other scripts/) stays side-effect free.
if __name__ == "__main__":
    main()
@@ -0,0 +1,288 @@
1
+ #!/usr/bin/env python3
2
+ """Semantic experiment index for the autoresearch pipeline.
3
+
4
+ Indexes experiment descriptions and results using TF-IDF vectors,
5
+ enabling semantic search over experiment history. Complementary to
6
+ the novelty guard's keyword matching — this finds experiments by
7
+ meaning, not just shared tokens.
8
+
9
+ Uses sklearn's TfidfVectorizer (already a dependency) rather than
10
+ sentence-transformers + FAISS to avoid new heavyweight dependencies.
11
+
12
+ Usage:
13
+ python scripts/experiment_index.py build --log experiments/log.jsonl
14
+ python scripts/experiment_index.py query "experiments with high learning rate"
15
+ python scripts/experiment_index.py query "overfitting with deep trees" --top 3
16
+ python scripts/experiment_index.py similar exp-005 --top 5
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+ import pickle
24
+ import sys
25
+ from pathlib import Path
26
+
27
+ import numpy as np
28
+
29
+ from scripts.turing_io import load_experiments
30
+
31
+
32
def experiment_to_text(exp: dict) -> str:
    """Convert an experiment to a searchable text representation.

    Combines description, config summary, metrics, and status into
    a single string that captures the experiment's essence.
    """
    pieces: list[str] = []
    add = pieces.append

    if desc := exp.get("description", ""):
        add(desc)

    # Config summary: model family, experiment type, scalar hyperparams.
    config = exp.get("config", {})
    if model_type := config.get("model_type", ""):
        add(f"model: {model_type}")
    if exp_type := config.get("experiment_type", ""):
        add(f"type: {exp_type}")

    hyperparams = config.get("hyperparams", {})
    if hyperparams:
        hp_str = " ".join(f"{k}={v}" for k, v in hyperparams.items()
                          if isinstance(v, (int, float, str)))
        if hp_str:
            add(f"hyperparams: {hp_str}")

    # Numeric metrics only — nested/structured values are skipped.
    metrics = exp.get("metrics", {})
    if metrics:
        metric_str = " ".join(f"{k}={v}" for k, v in metrics.items()
                              if isinstance(v, (int, float)))
        if metric_str:
            add(f"metrics: {metric_str}")

    if status := exp.get("status", ""):
        add(f"status: {status}")
    if family := exp.get("family", ""):
        add(f"family: {family}")
    if tags := exp.get("tags", []):
        add(f"tags: {' '.join(tags)}")

    return " | ".join(pieces) if pieces else ""
84
+
85
+
86
class ExperimentIndex:
    """TF-IDF based semantic index over experiment history."""

    def __init__(self):
        # All populated by build() (or load()); None/empty until then.
        self.vectorizer = None
        self.tfidf_matrix = None
        self.experiment_ids: list[str] = []
        self.experiment_texts: list[str] = []
        self.experiment_data: list[dict] = []

    def build(self, experiments: list[dict]) -> int:
        """Build the index from a list of experiments.

        Returns the number of experiments indexed.
        """
        # Imported lazily so the module can be imported without sklearn.
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Keep only experiments that produce a non-empty text rendering.
        entries = [(experiment_to_text(exp), exp.get("experiment_id", "?"), exp)
                   for exp in experiments]
        entries = [e for e in entries if e[0]]
        if not entries:
            return 0

        corpus = [e[0] for e in entries]
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words="english",
            ngram_range=(1, 2),  # unigrams and bigrams
            sublinear_tf=True,
        )
        self.tfidf_matrix = self.vectorizer.fit_transform(corpus)
        self.experiment_ids = [e[1] for e in entries]
        self.experiment_texts = corpus
        self.experiment_data = [e[2] for e in entries]
        return len(corpus)

    def query(self, query_text: str, top_k: int = 5) -> list[dict]:
        """Query the index with natural language.

        Returns top-K most similar experiments with scores; empty list
        when the index has not been built.
        """
        if self.vectorizer is None or self.tfidf_matrix is None:
            return []

        query_vec = self.vectorizer.transform([query_text])
        # TF-IDF rows are L2-normalized, so the dot product IS cosine similarity.
        scores = (self.tfidf_matrix @ query_vec.T).toarray().ravel()

        hits = []
        for pos in np.argsort(scores)[::-1][:top_k]:
            sim = float(scores[pos])
            if sim <= 0:
                # Zero similarity means no shared vocabulary — not a match.
                continue
            exp = self.experiment_data[pos]
            hits.append({
                "experiment_id": self.experiment_ids[pos],
                "similarity": round(sim, 4),
                "description": exp.get("description", ""),
                "status": exp.get("status", "unknown"),
                "metrics": exp.get("metrics", {}),
                "config": exp.get("config", {}),
            })
        return hits

    def find_similar(self, experiment_id: str, top_k: int = 5) -> list[dict]:
        """Find experiments similar to a given experiment ID."""
        if experiment_id not in self.experiment_ids:
            return []
        pos = self.experiment_ids.index(experiment_id)
        # Ask for one extra hit: the experiment matches itself.
        return self.query(self.experiment_texts[pos], top_k + 1)

    def save(self, path: str) -> None:
        """Save the index to disk (pickle), creating parent dirs as needed."""
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        state = {
            "vectorizer": self.vectorizer,
            "tfidf_matrix": self.tfidf_matrix,
            "experiment_ids": self.experiment_ids,
            "experiment_texts": self.experiment_texts,
            "experiment_data": self.experiment_data,
        }
        with target.open("wb") as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, path: str) -> ExperimentIndex | None:
        """Load an index from disk; None if missing or unreadable.

        NOTE(review): pickle.load executes arbitrary code from the file —
        only load index files this tool itself produced.
        """
        source = Path(path)
        if not source.exists():
            return None
        try:
            with source.open("rb") as f:
                state = pickle.load(f)
            inst = cls()
            inst.vectorizer = state["vectorizer"]
            inst.tfidf_matrix = state["tfidf_matrix"]
            inst.experiment_ids = state["experiment_ids"]
            inst.experiment_texts = state["experiment_texts"]
            inst.experiment_data = state["experiment_data"]
            return inst
        except Exception:
            # Any corruption or schema mismatch degrades to "no index".
            return None
203
+
204
+
205
def format_results(results: list[dict], metric_name: str = "accuracy") -> str:
    """Format query results for display."""
    if not results:
        return "No matching experiments found."

    out: list[str] = [f"Top {len(results)} similar experiments:", ""]
    # Three detail lines plus a blank separator per hit.
    for rank, hit in enumerate(results, start=1):
        shown_metric = hit["metrics"].get(metric_name, "?")
        out += [
            f" {rank}. {hit['experiment_id']} (similarity: {hit['similarity']:.3f})",
            f" {hit['description']}",
            f" Status: {hit['status']}, {metric_name}: {shown_metric}",
            "",
        ]
    return "\n".join(out)
219
+
220
+
221
def main() -> None:
    """CLI entry point for building and querying the experiment index."""
    parser = argparse.ArgumentParser(description="Semantic experiment index")
    parser.add_argument("--log", default="experiments/log.jsonl")
    parser.add_argument("--index", default="experiments/index.pkl")

    subparsers = parser.add_subparsers(dest="command")

    # build
    subparsers.add_parser("build", help="Build/rebuild the index")

    # query
    query_parser = subparsers.add_parser("query", help="Search experiments by description")
    query_parser.add_argument("text", help="Query text")
    query_parser.add_argument("--top", type=int, default=5)

    # similar
    sim_parser = subparsers.add_parser("similar", help="Find experiments similar to a given one")
    sim_parser.add_argument("experiment_id", help="Experiment ID")
    sim_parser.add_argument("--top", type=int, default=5)

    args = parser.parse_args()

    def load_or_build() -> ExperimentIndex:
        """Load the saved index, or rebuild one in memory from the log."""
        idx = ExperimentIndex.load(args.index)
        if idx is not None:
            return idx
        experiments = load_experiments(args.log)
        if not experiments:
            print("No experiments to search.", file=sys.stderr)
            sys.exit(1)
        idx = ExperimentIndex()
        idx.build(experiments)
        return idx

    if args.command == "build":
        experiments = load_experiments(args.log)
        if not experiments:
            print("No experiments found.", file=sys.stderr)
            sys.exit(1)
        idx = ExperimentIndex()
        count = idx.build(experiments)
        idx.save(args.index)
        print(f"Indexed {count} experiments → {args.index}")

    elif args.command == "query":
        idx = load_or_build()
        print(format_results(idx.query(args.text, args.top)))

    elif args.command == "similar":
        idx = load_or_build()
        hits = idx.find_similar(args.experiment_id, args.top)
        # The experiment always matches itself; drop it from the report.
        hits = [h for h in hits if h["experiment_id"] != args.experiment_id]
        print(format_results(hits))

    else:
        parser.print_help()
285
+
286
+
287
# Run the CLI only when executed as a script; importing this module
# (e.g. from other scripts/) stays side-effect free.
if __name__ == "__main__":
    main()