claude-turing 1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +66 -3
- package/commands/card.md +36 -0
- package/commands/explore.md +107 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +2 -2
- package/src/verify.js +2 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +58 -3
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +8 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/scripts/treequest_suggest.py +520 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Tree-search-guided hypothesis exploration for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Uses TreeQuest's AB-MCTS (Adaptive Branching Monte Carlo Tree Search) to
|
|
5
|
+
explore the space of experiment hypotheses. Each tree node is a hypothesis
|
|
6
|
+
description + structured config. The generation function produces refinements
|
|
7
|
+
of a parent hypothesis, and the scoring function uses the critique engine
|
|
8
|
+
(novelty × feasibility × impact) as the reward signal.
|
|
9
|
+
|
|
10
|
+
This is the search-driven complement to suggest_next.py's surrogate model:
|
|
11
|
+
instead of fitting a Random Forest over hyperparameter space, we search
|
|
12
|
+
the space of *ideas* using MCTS with the critique score as reward.
|
|
13
|
+
|
|
14
|
+
Requires: pip install "treequest[all]"
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python scripts/treequest_suggest.py \\
|
|
18
|
+
--log experiments/log.jsonl \\
|
|
19
|
+
--config config.yaml \\
|
|
20
|
+
--top 5 \\
|
|
21
|
+
--iterations 30 \\
|
|
22
|
+
--strategy abmcts-a
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import json
|
|
29
|
+
import sys
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
import yaml
|
|
34
|
+
|
|
35
|
+
from scripts.critique_hypothesis import critique_hypothesis
|
|
36
|
+
from scripts.turing_io import load_experiments, load_config
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Node representation
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
@dataclass
class HypothesisNode:
    """A node in the hypothesis search tree.

    Each node is a concrete experiment hypothesis: a human-readable
    description plus optional structured fields (model type,
    hyperparameters, feature changes) that downstream tooling can feed
    into the hypothesis queue.
    """
    description: str
    model_type: str | None = None
    hyperparameters: dict | None = None
    feature_changes: dict | None = None
    parent_description: str | None = None
    depth: int = 0
    critique_scores: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize for logging and queue integration."""
        # Names listed in declaration order so serialized output is stable.
        names = (
            "description",
            "model_type",
            "hyperparameters",
            "feature_changes",
            "parent_description",
            "depth",
            "critique_scores",
        )
        return {name: getattr(self, name) for name in names}

    @staticmethod
    def from_dict(d: dict) -> "HypothesisNode":
        """Rebuild a node from a ``to_dict()``-style mapping."""
        node = HypothesisNode(description=d["description"])
        node.model_type = d.get("model_type")
        node.hyperparameters = d.get("hyperparameters")
        node.feature_changes = d.get("feature_changes")
        node.parent_description = d.get("parent_description")
        node.depth = d.get("depth", 0)
        node.critique_scores = d.get("critique_scores", {})
        return node
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Critique-based scoring
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def score_hypothesis(
    node: HypothesisNode,
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
) -> float:
    """Score a hypothesis node using the critique engine.

    Side effect: stores the per-axis scores and verdict on
    ``node.critique_scores`` for later ranking/serialization.

    Returns a float in [0, 10] — the weighted combination of
    novelty (30%), feasibility (30%), and expected impact (40%).
    """
    critique = critique_hypothesis(
        description=node.description,
        log_path=log_path,
        config_path=config_path,
    )
    overall = critique["overall_score"]
    node.critique_scores = {
        "overall": overall,
        "novelty": critique["novelty"]["score"],
        "feasibility": critique["feasibility"]["score"],
        "impact": critique["impact"]["score"],
        "verdict": critique["verdict"],
    }
    return overall
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# Seed hypothesis generation
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
def generate_seed_hypotheses(
    config: dict,
    experiments: list[dict],
) -> list[HypothesisNode]:
    """Generate initial seed hypotheses from config and experiment history.

    These form the root nodes of the search tree. Each represents a
    distinct direction worth exploring.

    Args:
        config: Project config; reads ``model.type`` and
            ``evaluation.primary_metric`` (defaults: "xgboost", "accuracy").
        experiments: Experiment log entries; the most recent entry with
            ``status == "kept"`` seeds an additional refinement hypothesis.

    Returns:
        Seed nodes covering: alternative model families, regularization,
        feature engineering, a learning-rate schedule, and (optionally)
        a refinement of the last kept experiment.
    """
    seeds: list[HypothesisNode] = []
    current_model = config.get("model", {}).get("type", "xgboost")
    metric = config.get("evaluation", {}).get("primary_metric", "accuracy")

    # Seed 1: alternative model families
    model_alternatives = {
        "xgboost": ["LightGBM with dart boosting", "CatBoost with ordered boosting",
                    "Random Forest with extra-trees"],
        "lightgbm": ["XGBoost with hist method", "CatBoost with ordered boosting",
                     "Random Forest with extra-trees"],
        "catboost": ["XGBoost with hist method", "LightGBM with GOSS sampling",
                     "Random Forest with extra-trees"],
        "random_forest": ["XGBoost with hist method", "LightGBM with dart boosting",
                          "CatBoost with ordered boosting"],
    }
    alternatives = model_alternatives.get(current_model.lower(), [
        "XGBoost with hist method", "LightGBM with dart boosting",
    ])
    for alt in alternatives:
        seeds.append(HypothesisNode(
            description=f"Switch to {alt} for {metric} optimization",
            # FIX: normalize multi-word names to the snake_case convention
            # used by the model_alternatives keys above ("random_forest"),
            # rather than collapsing to "randomforest".
            model_type=alt.split(" with ")[0].lower().replace(" ", "_"),
        ))

    # Seed 2: regularization exploration (plain string — no placeholders,
    # so the original's f-prefix was spurious)
    seeds.append(HypothesisNode(
        description="Increase regularization — add L2 penalty and reduce max_depth to combat potential overfitting",
        hyperparameters={"reg_lambda": 1.0, "max_depth": 4},
    ))

    # Seed 3: feature engineering
    seeds.append(HypothesisNode(
        description="Add polynomial interaction features for the top-5 most important numeric columns",
        feature_changes={"add": ["polynomial_interactions"]},
    ))

    # Seed 4: learning rate schedule
    seeds.append(HypothesisNode(
        description=f"Use low learning rate (0.01) with high n_estimators (2000) and early stopping for {metric}",
        hyperparameters={"learning_rate": 0.01, "n_estimators": 2000},
    ))

    # Seed 5: based on experiment history — what's been working?
    kept = [e for e in experiments if e.get("status") == "kept"]
    if kept:
        last_desc = kept[-1].get("description", "")
        if last_desc:
            seeds.append(HypothesisNode(
                description=f"Refine the approach from '{last_desc}' — try a more aggressive variant with doubled learning rate",
                parent_description=last_desc,
            ))

    return seeds
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
# Perturbation-based child generation (non-LLM fallback)
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
|
|
187
|
+
# Deterministic textual tweaks appended to a parent hypothesis when
# generating children (see generate_children). Order matters: children are
# selected by a hash-derived start index plus a fixed stride, so inserting
# or reordering entries changes which variants a given parent produces.
_PERTURBATION_STRATEGIES = [
    "increase learning rate by 2x",
    "decrease learning rate by 2x",
    "double n_estimators",
    "halve max_depth",
    "double max_depth",
    "add L1 regularization (reg_alpha=1.0)",
    "add L2 regularization (reg_lambda=1.0)",
    "increase subsample ratio to 0.9",
    "decrease subsample ratio to 0.6",
    "add column sampling (colsample_bytree=0.7)",
    "switch to dart boosting",
    "switch to GOSS sampling",
    "add polynomial features",
    "add target encoding for categorical columns",
    "remove low-importance features (bottom 20%)",
    "try log-transform on skewed numeric features",
    "add min_child_weight constraint",
    "increase early stopping patience",
]
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def generate_children(
    parent: HypothesisNode,
    n_children: int = 3,
    rng_seed: int = 42,
) -> list[HypothesisNode]:
    """Generate child hypotheses by perturbing a parent.

    Selection is deterministic but parent-dependent: the parent
    description is hashed to pick a starting strategy, and successive
    children step through ``_PERTURBATION_STRATEGIES`` with a fixed
    stride. Each child copies the parent's structured fields and is one
    level deeper in the tree.
    """
    import hashlib

    digest = hashlib.sha256(parent.description.encode()).hexdigest()
    base = (int(digest, 16) + rng_seed) % len(_PERTURBATION_STRATEGIES)

    def _variant(offset: int) -> HypothesisNode:
        # Stride of 7 spreads consecutive children across the strategy list.
        tweak = _PERTURBATION_STRATEGIES[(base + offset * 7) % len(_PERTURBATION_STRATEGIES)]
        return HypothesisNode(
            description=f"{parent.description}; additionally {tweak}",
            model_type=parent.model_type,
            hyperparameters=dict(parent.hyperparameters) if parent.hyperparameters else None,
            feature_changes=dict(parent.feature_changes) if parent.feature_changes else None,
            parent_description=parent.description,
            depth=parent.depth + 1,
        )

    return [_variant(i) for i in range(n_children)]
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
# TreeQuest integration
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
|
|
246
|
+
def run_treequest_search(
    seeds: list[HypothesisNode],
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    iterations: int = 30,
    top_k: int = 5,
    strategy: str = "abmcts-a",
    children_per_node: int = 3,
) -> list[HypothesisNode]:
    """Run TreeQuest MCTS search over the hypothesis space.

    Args:
        seeds: Initial hypothesis nodes (tree roots).
        log_path: Path to experiment log for critique scoring.
        config_path: Path to config for critique scoring.
        iterations: Number of MCTS iterations.
        top_k: Number of best hypotheses to return.
        strategy: TreeQuest algorithm — "abmcts-a" or "abmcts-m".
        children_per_node: Branching factor for child generation.
            NOTE(review): accepted for API symmetry with run_greedy_search
            but never used below — generation_fn always expands one child.

    Returns:
        Top-K hypothesis nodes ranked by critique score.

    Exits the process (status 1) if treequest is not importable; main()
    normally pre-checks the import and falls back to greedy search first.
    """
    try:
        import treequest
    except ImportError:
        print(
            "TreeQuest not installed. Install with: pip install 'treequest[all]'",
            file=sys.stderr,
        )
        sys.exit(1)

    # Select algorithm; any strategy other than "abmcts-m" (including the
    # default "abmcts-a") falls through to ABMCTSA.
    if strategy == "abmcts-m":
        algo = treequest.ABMCTSM()
    else:
        algo = treequest.ABMCTSA()

    # Every node scored during the search, in generation order; ranked at
    # the end instead of reading the tree back out of TreeQuest.
    all_scored: list[HypothesisNode] = []

    def generation_fn(parent_state: HypothesisNode | None) -> tuple[HypothesisNode, float]:
        """TreeQuest generation function.

        Given a parent node (or None for root), generate a child and score it.
        """
        if parent_state is None:
            # Pick a seed — cycles through seeds as scoring progresses, so a
            # seed may be scored more than once (deduplicated at the end).
            idx = len(all_scored) % len(seeds)
            node = seeds[idx]
        else:
            children = generate_children(
                parent_state,
                n_children=1,
                # Varying rng_seed with progress keeps repeated expansions of
                # the same parent from yielding identical children.
                rng_seed=len(all_scored),
            )
            node = children[0]

        score = score_hypothesis(node, log_path, config_path)
        all_scored.append(node)
        return node, score

    # Initialize and run the tree search.
    # NOTE(review): TreeQuest's published API is typically
    # `tree = algo.init_tree(); tree = algo.step(tree, {name: gen_fn})` —
    # the stateless calls below should be confirmed against the installed
    # treequest version.
    algo.init_tree()
    for i in range(iterations):
        try:
            algo.step(generation_fn)
        except Exception as e:
            # Best-effort: one failed expansion should not abort the run.
            print(f"Warning: iteration {i} failed: {e}", file=sys.stderr)
            continue

    # Rank all explored nodes by critique score (missing score sorts last).
    all_scored.sort(key=lambda n: n.critique_scores.get("overall", 0), reverse=True)

    # Deduplicate by normalized description (case/whitespace-insensitive),
    # keeping the highest-scoring instance of each.
    seen_descriptions: set[str] = set()
    unique_results: list[HypothesisNode] = []
    for node in all_scored:
        normalized = node.description.lower().strip()
        if normalized not in seen_descriptions:
            seen_descriptions.add(normalized)
            unique_results.append(node)
            if len(unique_results) >= top_k:
                break

    return unique_results
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# ---------------------------------------------------------------------------
|
|
336
|
+
# Fallback: greedy search without TreeQuest
|
|
337
|
+
# ---------------------------------------------------------------------------
|
|
338
|
+
|
|
339
|
+
def run_greedy_search(
    seeds: list[HypothesisNode],
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    iterations: int = 30,
    top_k: int = 5,
    children_per_node: int = 3,
) -> list[HypothesisNode]:
    """Greedy best-first search fallback when TreeQuest is not installed.

    Repeatedly expands the highest-scoring node from a priority queue of
    candidates. Less sophisticated than MCTS but needs no external
    dependency.
    """
    import heapq

    # Priority queue of (negated score, tie-break counter, node); heapq is a
    # min-heap, so negating the score pops the best node first.
    frontier: list[tuple[float, int, HypothesisNode]] = []
    for tie_break, seed in enumerate(seeds):
        seed_score = score_hypothesis(seed, log_path, config_path)
        heapq.heappush(frontier, (-seed_score, tie_break, seed))

    explored: list[HypothesisNode] = list(seeds)
    tie_break = len(seeds)

    for _ in range(iterations):
        if not frontier:
            break

        # Expand the current best candidate.
        _, _, best = heapq.heappop(frontier)
        for child in generate_children(best, n_children=children_per_node, rng_seed=tie_break):
            child_score = score_hypothesis(child, log_path, config_path)
            tie_break += 1
            heapq.heappush(frontier, (-child_score, tie_break, child))
            explored.append(child)

    # Rank everything we scored, then deduplicate by normalized description,
    # keeping the highest-scoring instance of each.
    explored.sort(key=lambda n: n.critique_scores.get("overall", 0), reverse=True)

    seen: set[str] = set()
    results: list[HypothesisNode] = []
    for node in explored:
        key = node.description.lower().strip()
        if key in seen:
            continue
        seen.add(key)
        results.append(node)
        if len(results) >= top_k:
            break

    return results
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
# ---------------------------------------------------------------------------
|
|
397
|
+
# Output formatting
|
|
398
|
+
# ---------------------------------------------------------------------------
|
|
399
|
+
|
|
400
|
+
def format_results(
    results: list[HypothesisNode],
    metric_name: str,
    strategy_used: str,
    total_explored: int,
) -> str:
    """Format search results for terminal display.

    NOTE: ``metric_name`` is currently unused by the formatter; it is kept
    for interface stability with callers.
    """
    header = [
        f"TreeQuest Hypothesis Exploration ({strategy_used})",
        "=" * 60,
        f"Nodes explored: {total_explored}",
        f"Top {len(results)} hypotheses by critique score:",
        "",
    ]

    body: list[str] = []
    for rank, node in enumerate(results, 1):
        sc = node.critique_scores
        body.append(f"  {rank}. [{sc.get('verdict', '?').upper()}] (score: {sc.get('overall', 0)}/10)")
        body.append(f"     {node.description}")
        body.append(
            f"     Novelty: {sc.get('novelty', 0)}  Feasibility: {sc.get('feasibility', 0)}  Impact: {sc.get('impact', 0)}"
        )
        if node.depth > 0:
            body.append(f"     Depth: {node.depth} (refined from parent)")
        body.append("")

    return "\n".join(header + body)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def results_to_json(results: list[HypothesisNode]) -> list[dict]:
    """Serialize results for machine consumption (e.g. --json output)."""
    # Delegate per-node serialization to HypothesisNode.to_dict().
    return [item.to_dict() for item in results]
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
# CLI
|
|
440
|
+
# ---------------------------------------------------------------------------
|
|
441
|
+
|
|
442
|
+
def main() -> None:
    """CLI entry point: parse args, generate seeds, run the search, print.

    Reads the experiment log and project config, builds seed hypotheses,
    then runs either TreeQuest MCTS or the greedy fallback, and prints the
    ranked results as text or JSON.
    """
    parser = argparse.ArgumentParser(
        description="Tree-search-guided hypothesis exploration",
    )
    parser.add_argument("--log", default="experiments/log.jsonl",
                        help="Path to experiment log")
    parser.add_argument("--config", default="config.yaml",
                        help="Path to project config")
    parser.add_argument("--top", type=int, default=5,
                        help="Number of top hypotheses to return")
    parser.add_argument("--iterations", type=int, default=30,
                        help="Number of search iterations")
    parser.add_argument("--strategy", default="abmcts-a",
                        choices=["abmcts-a", "abmcts-m", "greedy"],
                        help="Search strategy (abmcts-a, abmcts-m, or greedy fallback)")
    parser.add_argument("--children", type=int, default=3,
                        help="Children per node expansion")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON")
    parser.add_argument("--seeds-only", action="store_true",
                        help="Only show generated seeds, don't run search")
    args = parser.parse_args()

    config = load_config(args.config)
    experiments = load_experiments(args.log)
    metric = config.get("evaluation", {}).get("primary_metric", "accuracy")

    # Generate seeds (roots of the search tree).
    seeds = generate_seed_hypotheses(config, experiments)

    # --seeds-only: show the roots and exit without scoring anything.
    if args.seeds_only:
        if args.json:
            print(json.dumps([s.to_dict() for s in seeds], indent=2))
        else:
            print(f"Generated {len(seeds)} seed hypotheses:")
            for i, s in enumerate(seeds, 1):
                print(f"  {i}. {s.description}")
        return

    # Run search
    if args.strategy == "greedy":
        results = run_greedy_search(
            seeds, args.log, args.config,
            iterations=args.iterations,
            top_k=args.top,
            children_per_node=args.children,
        )
        strategy_label = "greedy best-first"
    else:
        # Probe the treequest import here so a missing dependency degrades
        # to greedy search instead of run_treequest_search's sys.exit(1).
        try:
            import treequest  # noqa: F401
            results = run_treequest_search(
                seeds, args.log, args.config,
                iterations=args.iterations,
                top_k=args.top,
                strategy=args.strategy,
                children_per_node=args.children,
            )
            strategy_label = f"TreeQuest {args.strategy.upper()}"
        except ImportError:
            print("TreeQuest not installed, falling back to greedy search.", file=sys.stderr)
            results = run_greedy_search(
                seeds, args.log, args.config,
                iterations=args.iterations,
                top_k=args.top,
                children_per_node=args.children,
            )
            strategy_label = "greedy best-first (fallback)"

    # Output
    if args.json:
        print(json.dumps(results_to_json(results), indent=2))
    else:
        # NOTE(review): this is an upper-bound estimate of nodes explored,
        # not an actual count — greedy search scores children_per_node nodes
        # per iteration, and failed MCTS iterations score none. Confirm
        # whether an exact count should be returned by the search functions.
        total = args.iterations + len(seeds)
        print(format_results(results, metric, strategy_label, total))
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
# Script entry point — importing this module has no side effects.
if __name__ == "__main__":
    main()
|
|
Binary file
|