claude-turing 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +66 -3
  3. package/commands/card.md +36 -0
  4. package/commands/explore.md +107 -0
  5. package/commands/suggest.md +68 -4
  6. package/commands/turing.md +4 -0
  7. package/package.json +1 -1
  8. package/src/claude-md.js +1 -0
  9. package/src/install.js +2 -2
  10. package/src/verify.js +2 -0
  11. package/templates/requirements.txt +4 -0
  12. package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
  13. package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
  14. package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
  15. package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
  16. package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
  17. package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
  18. package/templates/scripts/cleanup.py +599 -0
  19. package/templates/scripts/cost_frontier.py +292 -0
  20. package/templates/scripts/diff_configs.py +534 -0
  21. package/templates/scripts/export_results.py +457 -0
  22. package/templates/scripts/generate_brief.py +58 -3
  23. package/templates/scripts/generate_model_card.py +342 -0
  24. package/templates/scripts/leaderboard.py +508 -0
  25. package/templates/scripts/manage_hypotheses.py +2 -2
  26. package/templates/scripts/plot_trajectory.py +611 -0
  27. package/templates/scripts/scaffold.py +8 -0
  28. package/templates/scripts/show_metrics.py +23 -2
  29. package/templates/scripts/treequest_suggest.py +520 -0
  30. package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
  31. package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
  32. package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
  33. package/templates/tests/test_cost_frontier.py +222 -0
@@ -0,0 +1,520 @@
1
+ #!/usr/bin/env python3
2
+ """Tree-search-guided hypothesis exploration for the autoresearch pipeline.
3
+
4
+ Uses TreeQuest's AB-MCTS (Adaptive Branching Monte Carlo Tree Search) to
5
+ explore the space of experiment hypotheses. Each tree node is a hypothesis
6
+ description + structured config. The generation function produces refinements
7
+ of a parent hypothesis, and the scoring function uses the critique engine
8
+ (novelty × feasibility × impact) as the reward signal.
9
+
10
+ This is the search-driven complement to suggest_next.py's surrogate model:
11
+ instead of fitting a Random Forest over hyperparameter space, we search
12
+ the space of *ideas* using MCTS with the critique score as reward.
13
+
14
+ Requires: pip install "treequest[all]"
15
+
16
+ Usage:
17
+ python scripts/treequest_suggest.py \\
18
+ --log experiments/log.jsonl \\
19
+ --config config.yaml \\
20
+ --top 5 \\
21
+ --iterations 30 \\
22
+ --strategy abmcts-a
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import json
29
+ import sys
30
+ from dataclasses import dataclass, field
31
+ from pathlib import Path
32
+
33
+ import yaml
34
+
35
+ from scripts.critique_hypothesis import critique_hypothesis
36
+ from scripts.turing_io import load_experiments, load_config
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Node representation
41
+ # ---------------------------------------------------------------------------
42
+
43
+ @dataclass
44
+ class HypothesisNode:
45
+ """A node in the hypothesis search tree.
46
+
47
+ Each node represents a concrete experiment hypothesis with both a
48
+ human-readable description and optional structured fields (model type,
49
+ hyperparameters, feature changes) that can be passed to the hypothesis
50
+ queue.
51
+ """
52
+ description: str
53
+ model_type: str | None = None
54
+ hyperparameters: dict | None = None
55
+ feature_changes: dict | None = None
56
+ parent_description: str | None = None
57
+ depth: int = 0
58
+ critique_scores: dict = field(default_factory=dict)
59
+
60
+ def to_dict(self) -> dict:
61
+ """Serialize for logging and queue integration."""
62
+ return {
63
+ "description": self.description,
64
+ "model_type": self.model_type,
65
+ "hyperparameters": self.hyperparameters,
66
+ "feature_changes": self.feature_changes,
67
+ "parent_description": self.parent_description,
68
+ "depth": self.depth,
69
+ "critique_scores": self.critique_scores,
70
+ }
71
+
72
+ @staticmethod
73
+ def from_dict(d: dict) -> "HypothesisNode":
74
+ return HypothesisNode(
75
+ description=d["description"],
76
+ model_type=d.get("model_type"),
77
+ hyperparameters=d.get("hyperparameters"),
78
+ feature_changes=d.get("feature_changes"),
79
+ parent_description=d.get("parent_description"),
80
+ depth=d.get("depth", 0),
81
+ critique_scores=d.get("critique_scores", {}),
82
+ )
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Critique-based scoring
87
+ # ---------------------------------------------------------------------------
88
+
89
def score_hypothesis(
    node: HypothesisNode,
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
) -> float:
    """Score *node* with the critique engine and cache the breakdown.

    Side effect: overwrites node.critique_scores with the per-axis scores
    plus the overall score and verdict.

    Returns:
        The overall critique score, a float in [0, 10] — the weighted
        combination of novelty (30%), feasibility (30%) and impact (40%).
    """
    critique = critique_hypothesis(
        description=node.description,
        log_path=log_path,
        config_path=config_path,
    )

    # Build the cached breakdown in the established key order:
    # overall, novelty, feasibility, impact, verdict.
    breakdown = {"overall": critique["overall_score"]}
    for axis in ("novelty", "feasibility", "impact"):
        breakdown[axis] = critique[axis]["score"]
    breakdown["verdict"] = critique["verdict"]

    node.critique_scores = breakdown
    return critique["overall_score"]
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Seed hypothesis generation
116
+ # ---------------------------------------------------------------------------
117
+
118
def generate_seed_hypotheses(
    config: dict,
    experiments: list[dict],
) -> list[HypothesisNode]:
    """Generate initial seed hypotheses from config and experiment history.

    These form the root nodes of the search tree. Each represents a
    distinct direction worth exploring.

    Args:
        config: Parsed project config; reads model.type (default
            "xgboost") and evaluation.primary_metric (default "accuracy").
        experiments: Experiment log entries; the most recent entry with
            status == "kept" additionally seeds a refinement hypothesis.

    Returns:
        Unscored HypothesisNode roots: model-family switches, a
        regularization probe, a feature-engineering probe, a learning-rate
        schedule, and (optionally) a refinement of the last kept run.
    """
    seeds: list[HypothesisNode] = []
    current_model = config.get("model", {}).get("type", "xgboost")
    metric = config.get("evaluation", {}).get("primary_metric", "accuracy")

    # Seed 1: alternative model families — keyed by the current model so we
    # never suggest the family already in use.
    model_alternatives = {
        "xgboost": ["LightGBM with dart boosting", "CatBoost with ordered boosting",
                    "Random Forest with extra-trees"],
        "lightgbm": ["XGBoost with hist method", "CatBoost with ordered boosting",
                     "Random Forest with extra-trees"],
        "catboost": ["XGBoost with hist method", "LightGBM with GOSS sampling",
                     "Random Forest with extra-trees"],
        "random_forest": ["XGBoost with hist method", "LightGBM with dart boosting",
                          "CatBoost with ordered boosting"],
    }
    alternatives = model_alternatives.get(current_model.lower(), [
        "XGBoost with hist method", "LightGBM with dart boosting",
    ])
    for alt in alternatives:
        # NOTE(review): "Random Forest" normalizes to "randomforest" here,
        # while the lookup key above uses "random_forest" — confirm which
        # form downstream queue consumers expect.
        seeds.append(HypothesisNode(
            description=f"Switch to {alt} for {metric} optimization",
            model_type=alt.split(" with ")[0].lower().replace(" ", ""),
        ))

    # Seed 2: regularization exploration.
    # (fix: the literal had an extraneous f-prefix with no placeholders)
    seeds.append(HypothesisNode(
        description="Increase regularization — add L2 penalty and reduce max_depth to combat potential overfitting",
        hyperparameters={"reg_lambda": 1.0, "max_depth": 4},
    ))

    # Seed 3: feature engineering
    seeds.append(HypothesisNode(
        description="Add polynomial interaction features for the top-5 most important numeric columns",
        feature_changes={"add": ["polynomial_interactions"]},
    ))

    # Seed 4: learning rate schedule
    seeds.append(HypothesisNode(
        description=f"Use low learning rate (0.01) with high n_estimators (2000) and early stopping for {metric}",
        hyperparameters={"learning_rate": 0.01, "n_estimators": 2000},
    ))

    # Seed 5: based on experiment history — refine the most recent kept run.
    kept = [e for e in experiments if e.get("status") == "kept"]
    if kept:
        last_kept = kept[-1]
        last_desc = last_kept.get("description", "")
        if last_desc:
            seeds.append(HypothesisNode(
                description=f"Refine the approach from '{last_desc}' — try a more aggressive variant with doubled learning rate",
                parent_description=last_desc,
            ))

    return seeds
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # Perturbation-based child generation (non-LLM fallback)
185
+ # ---------------------------------------------------------------------------
186
+
187
# Deterministic perturbation phrases used by generate_children() as the
# non-LLM fallback: each child hypothesis is the parent's description with
# one of these refinements appended. Text only — nothing here mutates a
# config; downstream tooling interprets the wording.
_PERTURBATION_STRATEGIES = [
    "increase learning rate by 2x",
    "decrease learning rate by 2x",
    "double n_estimators",
    "halve max_depth",
    "double max_depth",
    "add L1 regularization (reg_alpha=1.0)",
    "add L2 regularization (reg_lambda=1.0)",
    "increase subsample ratio to 0.9",
    "decrease subsample ratio to 0.6",
    "add column sampling (colsample_bytree=0.7)",
    "switch to dart boosting",
    "switch to GOSS sampling",
    "add polynomial features",
    "add target encoding for categorical columns",
    "remove low-importance features (bottom 20%)",
    "try log-transform on skewed numeric features",
    "add min_child_weight constraint",
    "increase early stopping patience",
]
207
+
208
+
209
def generate_children(
    parent: HypothesisNode,
    n_children: int = 3,
    rng_seed: int = 42,
) -> list[HypothesisNode]:
    """Derive child hypotheses from *parent* via fixed perturbations.

    Fully deterministic: the parent description is hashed to pick a
    starting strategy, and successive children walk the strategy table
    with a stride of 7 for diversity. Despite the name, rng_seed only
    shifts the starting index — no randomness is involved.
    """
    import hashlib

    table_size = len(_PERTURBATION_STRATEGIES)
    digest = hashlib.sha256(parent.description.encode()).hexdigest()
    start = (int(digest, 16) + rng_seed) % table_size

    def _clone(mapping: dict | None) -> dict | None:
        # Shallow-copy structured fields so children never alias the parent.
        return dict(mapping) if mapping else None

    return [
        HypothesisNode(
            description=(
                f"{parent.description}; additionally "
                f"{_PERTURBATION_STRATEGIES[(start + offset * 7) % table_size]}"
            ),
            model_type=parent.model_type,
            hyperparameters=_clone(parent.hyperparameters),
            feature_changes=_clone(parent.feature_changes),
            parent_description=parent.description,
            depth=parent.depth + 1,
        )
        for offset in range(n_children)
    ]
240
+
241
+
242
+ # ---------------------------------------------------------------------------
243
+ # TreeQuest integration
244
+ # ---------------------------------------------------------------------------
245
+
246
def run_treequest_search(
    seeds: list[HypothesisNode],
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    iterations: int = 30,
    top_k: int = 5,
    strategy: str = "abmcts-a",
    children_per_node: int = 3,
) -> list[HypothesisNode]:
    """Run TreeQuest MCTS search over the hypothesis space.

    Exits the process (sys.exit(1)) if treequest is not importable; the
    CLI in main() pre-checks the import and falls back to greedy search
    instead, so this hard exit is only hit on direct calls.

    Args:
        seeds: Initial hypothesis nodes (tree roots).
        log_path: Path to experiment log for critique scoring.
        config_path: Path to config for critique scoring.
        iterations: Number of MCTS iterations.
        top_k: Number of best hypotheses to return.
        strategy: TreeQuest algorithm — "abmcts-a" or "abmcts-m".
        children_per_node: Branching factor for child generation.

    Returns:
        Top-K hypothesis nodes ranked by critique score.
    """
    try:
        import treequest
    except ImportError:
        print(
            "TreeQuest not installed. Install with: pip install 'treequest[all]'",
            file=sys.stderr,
        )
        sys.exit(1)

    # Select algorithm: anything other than "abmcts-m" falls through to
    # ABMCTS-A (including unrecognized strings).
    if strategy == "abmcts-m":
        algo = treequest.ABMCTSM()
    else:
        algo = treequest.ABMCTSA()

    # Track all scored nodes for final ranking; also serves as the step
    # counter that drives seed rotation and child rng_seed below.
    all_scored: list[HypothesisNode] = []

    def generation_fn(parent_state: HypothesisNode | None) -> tuple[HypothesisNode, float]:
        """TreeQuest generation function.

        Given a parent node (or None for root), generate a child and score it.
        """
        if parent_state is None:
            # Pick a seed, rotating through them as the search expands roots.
            # NOTE(review): if the same seed is requested twice, it is
            # re-scored and its critique_scores mutated in place — confirm
            # this is acceptable (scores should be deterministic per node).
            idx = len(all_scored) % len(seeds)
            node = seeds[idx]
        else:
            children = generate_children(
                parent_state,
                n_children=1,
                rng_seed=len(all_scored),
            )
            node = children[0]

        score = score_hypothesis(node, log_path, config_path)
        all_scored.append(node)
        return node, score

    # Initialize and run the tree search.
    # NOTE(review): treequest's published API returns a search state from
    # init_tree() and takes it in step(state, fn); confirm this mutating
    # call style matches the installed version.
    algo.init_tree()
    for i in range(iterations):
        try:
            algo.step(generation_fn)
        except Exception as e:
            # Deliberate best-effort: one failed iteration should not abort
            # the whole search; the warning preserves visibility.
            print(f"Warning: iteration {i} failed: {e}", file=sys.stderr)
            continue

    # Rank all explored nodes by critique score (missing score sorts last).
    all_scored.sort(key=lambda n: n.critique_scores.get("overall", 0), reverse=True)

    # Deduplicate by description similarity
    seen_descriptions: set[str] = set()
    unique_results: list[HypothesisNode] = []
    for node in all_scored:
        # Simple dedup: normalize and check
        normalized = node.description.lower().strip()
        if normalized not in seen_descriptions:
            seen_descriptions.add(normalized)
            unique_results.append(node)
            if len(unique_results) >= top_k:
                break

    return unique_results
333
+
334
+
335
+ # ---------------------------------------------------------------------------
336
+ # Fallback: greedy search without TreeQuest
337
+ # ---------------------------------------------------------------------------
338
+
339
def run_greedy_search(
    seeds: list[HypothesisNode],
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    iterations: int = 30,
    top_k: int = 5,
    children_per_node: int = 3,
) -> list[HypothesisNode]:
    """Greedy best-first fallback when TreeQuest is not installed.

    Keeps a max-heap frontier (min-heap of negated scores, with an
    insertion counter as tie-breaker), repeatedly expands the current
    best node, and finally returns the top_k unique hypotheses ranked by
    overall critique score. Less sophisticated than MCTS but dependency
    free.

    Args:
        seeds: Root hypotheses; all are scored up front.
        log_path: Experiment log forwarded to the critique engine.
        config_path: Project config forwarded to the critique engine.
        iterations: Number of expansion steps.
        top_k: Number of deduplicated results to return.
        children_per_node: Children generated per expansion.
    """
    import heapq

    # Score every seed and push it onto the frontier heap.
    frontier: list[tuple[float, int, HypothesisNode]] = []
    for order, seed in enumerate(seeds):
        seed_score = score_hypothesis(seed, log_path, config_path)
        heapq.heappush(frontier, (-seed_score, order, seed))

    explored: list[HypothesisNode] = list(seeds)
    tie_break = len(seeds)

    for _ in range(iterations):
        if not frontier:
            break

        # Pop the current best candidate and expand it (expanded nodes are
        # not re-queued; their children compete on the frontier instead).
        _, _, best = heapq.heappop(frontier)
        for child in generate_children(best, n_children=children_per_node,
                                       rng_seed=tie_break):
            child_score = score_hypothesis(child, log_path, config_path)
            tie_break += 1
            heapq.heappush(frontier, (-child_score, tie_break, child))
            explored.append(child)

    # Rank everything we touched, then deduplicate by normalized description.
    explored.sort(key=lambda node: node.critique_scores.get("overall", 0),
                  reverse=True)

    seen_keys: set[str] = set()
    top: list[HypothesisNode] = []
    for node in explored:
        key = node.description.lower().strip()
        if key in seen_keys:
            continue
        seen_keys.add(key)
        top.append(node)
        if len(top) >= top_k:
            break

    return top
394
+
395
+
396
+ # ---------------------------------------------------------------------------
397
+ # Output formatting
398
+ # ---------------------------------------------------------------------------
399
+
400
def format_results(
    results: list[HypothesisNode],
    metric_name: str,
    strategy_used: str,
    total_explored: int,
) -> str:
    """Render ranked hypotheses as a human-readable terminal report.

    Args:
        results: Nodes to display, already ranked best-first.
        metric_name: Accepted for interface compatibility; not currently
            used in the rendered output.
        strategy_used: Search-strategy label shown in the header.
        total_explored: Node count reported in the header.

    Returns:
        A newline-joined multi-line string.
    """
    report = [
        f"TreeQuest Hypothesis Exploration ({strategy_used})",
        "=" * 60,
        f"Nodes explored: {total_explored}",
        f"Top {len(results)} hypotheses by critique score:",
        "",
    ]

    for rank, node in enumerate(results, 1):
        card = node.critique_scores
        report.append(
            f"  {rank}. [{card.get('verdict', '?').upper()}]"
            f" (score: {card.get('overall', 0)}/10)"
        )
        report.append(f"     {node.description}")
        report.append(
            f"     Novelty: {card.get('novelty', 0)}"
            f"  Feasibility: {card.get('feasibility', 0)}"
            f"  Impact: {card.get('impact', 0)}"
        )
        # depth > 0 means this node was refined from a parent hypothesis.
        if node.depth > 0:
            report.append(f"     Depth: {node.depth} (refined from parent)")
        report.append("")

    return "\n".join(report)
431
+
432
+
433
def results_to_json(results: list[HypothesisNode]) -> list[dict]:
    """Convert ranked nodes into plain dicts for machine consumption."""
    return [entry.to_dict() for entry in results]
436
+
437
+
438
+ # ---------------------------------------------------------------------------
439
+ # CLI
440
+ # ---------------------------------------------------------------------------
441
+
442
def main() -> None:
    """CLI entry point: build seeds, run the chosen search, print results.

    Exits normally after printing; --json switches both the seeds-only and
    full-search outputs to machine-readable JSON on stdout.
    """
    parser = argparse.ArgumentParser(
        description="Tree-search-guided hypothesis exploration",
    )
    parser.add_argument("--log", default="experiments/log.jsonl",
                        help="Path to experiment log")
    parser.add_argument("--config", default="config.yaml",
                        help="Path to project config")
    parser.add_argument("--top", type=int, default=5,
                        help="Number of top hypotheses to return")
    parser.add_argument("--iterations", type=int, default=30,
                        help="Number of search iterations")
    parser.add_argument("--strategy", default="abmcts-a",
                        choices=["abmcts-a", "abmcts-m", "greedy"],
                        help="Search strategy (abmcts-a, abmcts-m, or greedy fallback)")
    parser.add_argument("--children", type=int, default=3,
                        help="Children per node expansion")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON")
    parser.add_argument("--seeds-only", action="store_true",
                        help="Only show generated seeds, don't run search")
    args = parser.parse_args()

    config = load_config(args.config)
    experiments = load_experiments(args.log)
    # metric is only used for the human-readable report footer.
    metric = config.get("evaluation", {}).get("primary_metric", "accuracy")

    # Generate seeds
    seeds = generate_seed_hypotheses(config, experiments)

    # --seeds-only: print the (unscored) roots and stop before any search.
    if args.seeds_only:
        if args.json:
            print(json.dumps([s.to_dict() for s in seeds], indent=2))
        else:
            print(f"Generated {len(seeds)} seed hypotheses:")
            for i, s in enumerate(seeds, 1):
                print(f"  {i}. {s.description}")
        return

    # Run search
    if args.strategy == "greedy":
        results = run_greedy_search(
            seeds, args.log, args.config,
            iterations=args.iterations,
            top_k=args.top,
            children_per_node=args.children,
        )
        strategy_label = "greedy best-first"
    else:
        # Probe the treequest import here so a missing dependency degrades
        # to the greedy fallback instead of the hard sys.exit(1) inside
        # run_treequest_search.
        try:
            import treequest  # noqa: F401
            results = run_treequest_search(
                seeds, args.log, args.config,
                iterations=args.iterations,
                top_k=args.top,
                strategy=args.strategy,
                children_per_node=args.children,
            )
            strategy_label = f"TreeQuest {args.strategy.upper()}"
        except ImportError:
            # NOTE(review): this also catches ImportError raised while the
            # search itself runs — confirm that is intended.
            print("TreeQuest not installed, falling back to greedy search.", file=sys.stderr)
            results = run_greedy_search(
                seeds, args.log, args.config,
                iterations=args.iterations,
                top_k=args.top,
                children_per_node=args.children,
            )
            strategy_label = "greedy best-first (fallback)"

    # Output
    if args.json:
        print(json.dumps(results_to_json(results), indent=2))
    else:
        # Approximate node count: seeds scored up front + one per iteration.
        total = args.iterations + len(seeds)
        print(format_results(results, metric, strategy_label, total))
517
+
518
+
519
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()