claude-turing 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +39 -3
- package/commands/explore.md +107 -0
- package/commands/suggest.md +68 -4
- package/commands/turing.md +2 -0
- package/package.json +1 -1
- package/src/claude-md.js +1 -0
- package/src/install.js +1 -1
- package/src/verify.js +1 -0
- package/templates/requirements.txt +4 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
- package/templates/scripts/generate_brief.py +4 -3
- package/templates/scripts/manage_hypotheses.py +2 -2
- package/templates/scripts/scaffold.py +1 -0
- package/templates/scripts/treequest_suggest.py +520 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 17 commands, 2 specialized agents, tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -313,6 +313,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
313
313
|
| `/turing:try <hypothesis>` | Inject a hypothesis — free text or `archetype:model_comparison` |
|
|
314
314
|
| `/turing:brief [--deep]` | Research briefing — campaign summary, failure patterns, literature-grounded suggestions |
|
|
315
315
|
| `/turing:suggest` | Literature-grounded model architecture suggestions with citations |
|
|
316
|
+
| `/turing:suggest --strategy treequest` | Tree-search hypothesis exploration (alias for `/turing:explore`) |
|
|
317
|
+
| `/turing:explore` | AB-MCTS tree search over critique-scored hypothesis space |
|
|
316
318
|
| `/turing:design <hyp-id>` | Generate structured experiment design from a hypothesis |
|
|
317
319
|
| `/turing:mode <explore\|exploit\|replicate>` | Set research strategy — drives novelty guard policy |
|
|
318
320
|
|
|
@@ -390,6 +392,39 @@ After N experiments with no meaningful improvement, the agent stops and reports
|
|
|
390
392
|
|
|
391
393
|
For noisy metrics, `/turing:validate` runs the pipeline multiple times and measures variance. If the coefficient of variation exceeds 5%, it auto-configures multi-run evaluation so the agent can't be rewarded for lucky single runs.
|
|
392
394
|
|
|
395
|
+
## Tree-Search Hypothesis Exploration
|
|
396
|
+
|
|
397
|
+
> *"The learned coin-flipper weaves through the quadrillion-coin room with a preternatural air."*
|
|
398
|
+
|
|
399
|
+
Sometimes the best experiment to try next isn't obvious from the literature or the agent's memory. `/turing:explore` uses [TreeQuest](https://github.com/SakanaAI/treequest)'s AB-MCTS (Adaptive Branching Monte Carlo Tree Search) to search the space of experiment *ideas* as a tree, scored by the critique engine (novelty x feasibility x impact).
|
|
400
|
+
|
|
401
|
+
```
|
|
402
|
+
/turing:explore # Run MCTS over hypothesis space
|
|
403
|
+
/turing:explore --strategy greedy # Greedy fallback (no TreeQuest needed)
|
|
404
|
+
/turing:explore --iterations 50 --top 8 # Deeper search, more results
|
|
405
|
+
/turing:suggest --strategy treequest # Same thing via suggest
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
How it works:
|
|
409
|
+
|
|
410
|
+
```
|
|
411
|
+
Seeds MCTS expands best-scoring branches
|
|
412
|
+
│
|
|
413
|
+
┌──────┼──────┐ Each node is a hypothesis scored by:
|
|
414
|
+
▼ ▼ ▼ - Novelty (vs experiment history)
|
|
415
|
+
LightGBM Reg Features - Feasibility (hardware, deps)
|
|
416
|
+
│ │ │ - Expected impact (type success rate)
|
|
417
|
+
▼ ▼ ▼
|
|
418
|
+
+dart +L1 +poly Top-K results queued as hypotheses
|
|
419
|
+
│ │ for the next /turing:train run
|
|
420
|
+
▼ ▼
|
|
421
|
+
+subsamp +target-enc
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
Unlike `/turing:suggest` (which searches the web for papers), `/turing:explore` searches the space of *refinement chains* — combinations and sequences of modifications that score well together. It discovers non-obvious experiment strategies that independent suggestions cannot find.
|
|
425
|
+
|
|
426
|
+
Falls back to greedy best-first search when TreeQuest is not installed.
|
|
427
|
+
|
|
393
428
|
## Cost-Performance Frontier
|
|
394
429
|
|
|
395
430
|
> *"This model is 2% better but takes 10x longer to train. Is that worth it?"*
|
|
@@ -451,11 +486,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
451
486
|
|
|
452
487
|
## Architecture of Turing Itself
|
|
453
488
|
|
|
454
|
-
|
|
489
|
+
17 commands, 2 agents, 8 config files, 31 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, 379 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
455
490
|
|
|
456
491
|
```
|
|
457
492
|
turing/
|
|
458
|
-
├── commands/
|
|
493
|
+
├── commands/ 16 skill files (core + taste-leverage + reporting + exploration)
|
|
459
494
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
460
495
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
461
496
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -464,7 +499,7 @@ turing/
|
|
|
464
499
|
│ ├── train.py Training code (AGENT-EDITABLE)
|
|
465
500
|
│ ├── model_contract.md Artifact schema for production consumers
|
|
466
501
|
│ ├── model_registry.yaml Available model architectures + hyperparams
|
|
467
|
-
│ └── scripts/
|
|
502
|
+
│ └── scripts/ 26 Python scripts (core loop + analysis + infra + tree search)
|
|
468
503
|
├── tests/ 338 tests (unit + integration + anti-pattern + manifest)
|
|
469
504
|
├── src/ 5 JS installer files (npm deployment)
|
|
470
505
|
├── bin/ CLI entry points
|
|
@@ -482,6 +517,7 @@ turing/
|
|
|
482
517
|
- **[Principle of Least Privilege](https://en.wikipedia.org/wiki/Principle_of_least_privilege)** (Saltzer & Schroeder, 1975) — each agent has exactly the capabilities needed for its role
|
|
483
518
|
- **[Early Stopping](https://en.wikipedia.org/wiki/Early_stopping)** (Prechelt, 1998) — convergence detection as discrete early stopping
|
|
484
519
|
- **[Multi-Armed Bandits](https://en.wikipedia.org/wiki/Multi-armed_bandit)** — the explore-exploit tradeoff
|
|
520
|
+
- **[TreeQuest](https://github.com/SakanaAI/treequest)** (Sakana AI, 2025) — AB-MCTS for inference-time scaling; repurposed here for hypothesis-space exploration
|
|
485
521
|
- **[Version Control as Lab Notebook](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004668)** (Ram, 2013) — git as a scientific record-keeping system
|
|
486
522
|
- **[Reproducibility Crisis](https://en.wikipedia.org/wiki/Replication_crisis)** — if the measurement can change between experiments, results are not reproducible
|
|
487
523
|
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: explore
|
|
3
|
+
description: Tree-search-guided hypothesis exploration using AB-MCTS. Explores the space of experiment ideas as a search tree, scored by the critique engine. Discovers non-obvious refinement chains that linear suggestion cannot find.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[ml/project] [--iterations N] [--top N] [--strategy abmcts-a|abmcts-m|greedy]"
|
|
6
|
+
allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Explore the hypothesis space using tree search. Instead of suggesting independent ideas, this builds and searches a tree of refinement chains — each node is a hypothesis scored by novelty, feasibility, and expected impact.
|
|
10
|
+
|
|
11
|
+
## Project Detection
|
|
12
|
+
|
|
13
|
+
0. **Detect project directory:**
|
|
14
|
+
- If `$ARGUMENTS` contains a path (e.g., `ml/coding`), use that as the project directory
|
|
15
|
+
- Else if cwd contains `config.yaml` and `train.py`, use cwd
|
|
16
|
+
- Else search for `ml/*/` subdirectories containing `config.yaml`
|
|
17
|
+
- If exactly one found, use it
|
|
18
|
+
- If multiple found, list them and ask the user which to target
|
|
19
|
+
- All subsequent commands run from the detected project directory
|
|
20
|
+
|
|
21
|
+
## Parse Options
|
|
22
|
+
|
|
23
|
+
Extract from `$ARGUMENTS`:
|
|
24
|
+
- `--iterations N` — search depth (default: 30)
|
|
25
|
+
- `--top N` — number of results to return (default: 5)
|
|
26
|
+
- `--strategy` — algorithm choice: `abmcts-a` (default), `abmcts-m` (Bayesian), or `greedy` (no TreeQuest needed)
|
|
27
|
+
- `--seeds-only` — just show generated seeds without running search
|
|
28
|
+
- `--json` — output as JSON for programmatic use
|
|
29
|
+
|
|
30
|
+
## Steps
|
|
31
|
+
|
|
32
|
+
### 1. Assess Current State
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Read `config.yaml` to understand the current model and metric.
|
|
39
|
+
|
|
40
|
+
### 2. Run Tree Search
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
source .venv/bin/activate && python scripts/treequest_suggest.py \
|
|
44
|
+
--log experiments/log.jsonl \
|
|
45
|
+
--config config.yaml \
|
|
46
|
+
--top <N> \
|
|
47
|
+
--iterations <N> \
|
|
48
|
+
--strategy <strategy>
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The script will:
|
|
52
|
+
- Generate seed hypotheses from config and experiment history
|
|
53
|
+
- Run AB-MCTS (or greedy fallback) over the hypothesis tree
|
|
54
|
+
- Score each node using the critique engine
|
|
55
|
+
- Return top-K ranked, deduplicated hypotheses
|
|
56
|
+
|
|
57
|
+
### 3. Queue Best Hypotheses
|
|
58
|
+
|
|
59
|
+
For each result, add to the hypothesis queue:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
source .venv/bin/activate && python scripts/manage_hypotheses.py add "<description>" \
|
|
63
|
+
--priority medium --source treequest
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 4. Show Results
|
|
67
|
+
|
|
68
|
+
Display the search output and confirm queuing:
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
TreeQuest Hypothesis Exploration (AB-MCTS-A)
|
|
72
|
+
============================================
|
|
73
|
+
Nodes explored: 35
|
|
74
|
+
Top 5 hypotheses by critique score:
|
|
75
|
+
|
|
76
|
+
1. [PROCEED] (score: 7.8/10)
|
|
77
|
+
Switch to LightGBM with dart boosting; additionally add polynomial features
|
|
78
|
+
Novelty: 8 Feasibility: 9 Impact: 7
|
|
79
|
+
-> Queued as hyp-NNN
|
|
80
|
+
|
|
81
|
+
2. [PROCEED] (score: 7.2/10)
|
|
82
|
+
Use low learning rate (0.01) with 2000 estimators; additionally add L2 regularization
|
|
83
|
+
Novelty: 7 Feasibility: 8 Impact: 7
|
|
84
|
+
Depth: 1 (refined from parent)
|
|
85
|
+
-> Queued as hyp-NNN
|
|
86
|
+
|
|
87
|
+
...
|
|
88
|
+
|
|
89
|
+
Queued N hypotheses. Run /turing:train to test them.
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## How It Differs From /turing:suggest
|
|
93
|
+
|
|
94
|
+
| | `/turing:suggest` | `/turing:explore` |
|
|
95
|
+
|---|---|---|
|
|
96
|
+
| **Source** | Web literature search | Tree search over critique scores |
|
|
97
|
+
| **Strategy** | Independent suggestions | Refinement chains (parent -> child) |
|
|
98
|
+
| **Requires internet** | Yes | No |
|
|
99
|
+
| **Discovers** | What papers recommend | What combinations score well |
|
|
100
|
+
| **Best for** | Early-stage exploration | Mid-experiment optimization |
|
|
101
|
+
|
|
102
|
+
## Integration
|
|
103
|
+
|
|
104
|
+
- Results feed into `hypotheses.yaml` — the next `/turing:train` picks them up
|
|
105
|
+
- `/turing:brief` shows queued treequest-sourced hypotheses
|
|
106
|
+
- `/turing:suggest --strategy treequest` is an alias for this command
|
|
107
|
+
- Human can override priority: `/turing:try` always takes precedence
|
package/commands/suggest.md
CHANGED
|
@@ -6,9 +6,16 @@ argument-hint: "[task description override]"
|
|
|
6
6
|
allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob, WebSearch, WebFetch
|
|
7
7
|
---
|
|
8
8
|
|
|
9
|
-
Suggest model architectures for the current ML task
|
|
9
|
+
Suggest model architectures for the current ML task. Supports two strategies:
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
- **literature** (default): Web search for recent papers, synthesize grounded suggestions with citations.
|
|
12
|
+
- **treequest**: Tree-search-guided hypothesis exploration using AB-MCTS over the critique scoring function. Explores refinement chains that literature search cannot find.
|
|
13
|
+
|
|
14
|
+
## Strategy Detection
|
|
15
|
+
|
|
16
|
+
If `$ARGUMENTS` contains `--strategy treequest` or `treequest`, use the TreeQuest strategy below. Otherwise use the default literature strategy.
|
|
17
|
+
|
|
18
|
+
## Steps (Literature Strategy — default)
|
|
12
19
|
|
|
13
20
|
### 1. Understand the Task
|
|
14
21
|
|
|
@@ -84,12 +91,69 @@ Sources consulted: <N papers/articles>
|
|
|
84
91
|
Queued N hypotheses. Run /turing:train to test them.
|
|
85
92
|
```
|
|
86
93
|
|
|
87
|
-
## Fallback
|
|
94
|
+
## Fallback (Literature Strategy)
|
|
88
95
|
|
|
89
96
|
If web search returns insufficient results, suggest model families from `config/taxonomy.toml` based on what hasn't been tried yet. Note that suggestions are taxonomy-based, not literature-backed, and queue with `--source taxonomy`.
|
|
90
97
|
|
|
98
|
+
## Steps (TreeQuest Strategy)
|
|
99
|
+
|
|
100
|
+
When using `--strategy treequest`:
|
|
101
|
+
|
|
102
|
+
### 1. Detect Project Directory
|
|
103
|
+
|
|
104
|
+
Same detection logic as the literature strategy — find `config.yaml` + `train.py`.
|
|
105
|
+
|
|
106
|
+
### 2. Run Tree Search
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
source .venv/bin/activate && python scripts/treequest_suggest.py \
|
|
110
|
+
--log experiments/log.jsonl \
|
|
111
|
+
--config config.yaml \
|
|
112
|
+
--top 5 \
|
|
113
|
+
--iterations 30 \
|
|
114
|
+
--strategy abmcts-a
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
If TreeQuest is not installed, the script automatically falls back to greedy best-first search.
|
|
118
|
+
|
|
119
|
+
### 3. Queue Results
|
|
120
|
+
|
|
121
|
+
For each result from the tree search, queue as a hypothesis:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
source .venv/bin/activate && python scripts/manage_hypotheses.py add "<description>" --priority medium --source treequest
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### 4. Show Results
|
|
128
|
+
|
|
129
|
+
Display the tree search output and confirm hypotheses were queued:
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
TreeQuest Hypothesis Exploration (AB-MCTS-A)
|
|
133
|
+
============================================
|
|
134
|
+
Nodes explored: 35
|
|
135
|
+
Top 5 hypotheses by critique score:
|
|
136
|
+
|
|
137
|
+
1. [PROCEED] (score: 7.8/10)
|
|
138
|
+
Switch to LightGBM with dart boosting; additionally add polynomial features
|
|
139
|
+
Novelty: 8 Feasibility: 9 Impact: 7
|
|
140
|
+
|
|
141
|
+
...
|
|
142
|
+
|
|
143
|
+
Queued N hypotheses. Run /turing:train to test them.
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### TreeQuest Options
|
|
147
|
+
|
|
148
|
+
Pass additional flags via `$ARGUMENTS`:
|
|
149
|
+
- `--iterations N` — search depth (default: 30)
|
|
150
|
+
- `--top N` — number of results (default: 5)
|
|
151
|
+
- `--strategy abmcts-m` — use Bayesian mixed model variant (requires PyMC)
|
|
152
|
+
- `--greedy` — force greedy fallback without TreeQuest
|
|
153
|
+
|
|
91
154
|
## Integration
|
|
92
155
|
|
|
93
156
|
- Suggestions feed into `hypotheses.yaml` — the next `/turing:train` picks them up
|
|
94
|
-
- `/turing:brief` shows queued literature-sourced hypotheses
|
|
157
|
+
- `/turing:brief` shows queued literature-sourced and treequest-sourced hypotheses
|
|
158
|
+
- `/turing:explore` runs the TreeQuest search as a standalone command
|
|
95
159
|
- Human can override priority: `/turing:try` always takes precedence
|
package/commands/turing.md
CHANGED
|
@@ -21,6 +21,7 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
21
21
|
| "report", "write-up", "findings", "document results" | `/turing:report` | Document |
|
|
22
22
|
| "validate", "stability", "check variance", "noisy" | `/turing:validate` | Validate |
|
|
23
23
|
| "suggest", "what model", "recommend", "which architecture", "literature" | `/turing:suggest` | Research |
|
|
24
|
+
| "explore hypotheses", "tree search", "treequest", "search hypothesis space", "MCTS" | `/turing:explore` | Research |
|
|
24
25
|
| "design", "plan experiment", "how should I test", "experiment design" | `/turing:design` | Design |
|
|
25
26
|
| "mode", "explore", "exploit", "replicate", "strategy" | `/turing:mode` | Strategy |
|
|
26
27
|
| "preflight", "resources", "VRAM", "memory", "can I run", "OOM", "GPU" | `/turing:preflight` | Check |
|
|
@@ -39,6 +40,7 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
39
40
|
| `/turing:init` | Scaffold a new ML project | (inline) |
|
|
40
41
|
| `/turing:validate` | Check metric stability, auto-fix if noisy | (inline) |
|
|
41
42
|
| `/turing:suggest` | Literature-grounded model architecture suggestions | (inline, uses WebSearch) |
|
|
43
|
+
| `/turing:explore` | Tree-search hypothesis exploration via AB-MCTS | (inline) |
|
|
42
44
|
| `/turing:design <hyp-id>` | Generate structured experiment design from hypothesis | (inline, uses WebSearch) |
|
|
43
45
|
| `/turing:logbook` | HTML/markdown logbook with trajectory chart | (inline) |
|
|
44
46
|
| `/turing:poster` | Single-page HTML research poster | (inline) |
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/claude-md.js
CHANGED
|
@@ -21,6 +21,7 @@ Autonomous ML research harness. The autoresearch loop as a formal protocol.
|
|
|
21
21
|
| \`/turing:validate\` | Check metric stability, auto-fix if noisy |
|
|
22
22
|
| \`/turing:try <hypothesis>\` | Inject a hypothesis into the experiment queue |
|
|
23
23
|
| \`/turing:brief\` | Generate research intelligence report |
|
|
24
|
+
| \`/turing:explore\` | Tree-search hypothesis exploration (AB-MCTS) |
|
|
24
25
|
| \`/turing:preflight\` | Pre-flight resource check (VRAM/RAM/disk) |
|
|
25
26
|
|
|
26
27
|
### Agents
|
package/src/install.js
CHANGED
|
@@ -21,7 +21,7 @@ const PLUGIN_ROOT = join(__dirname, "..");
|
|
|
21
21
|
// Single source of truth for sub-commands (DRY — used for dirs and file copy)
|
|
22
22
|
const SUB_COMMANDS = [
|
|
23
23
|
"init", "train", "status", "compare", "sweep", "validate",
|
|
24
|
-
"try", "brief", "suggest", "design", "logbook", "poster",
|
|
24
|
+
"try", "brief", "suggest", "explore", "design", "logbook", "poster",
|
|
25
25
|
"report", "mode", "preflight", "card",
|
|
26
26
|
];
|
|
27
27
|
|
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -286,7 +286,8 @@ def format_brief(
|
|
|
286
286
|
lines.append(f"**{len(queued)} queued:**")
|
|
287
287
|
for h in queued:
|
|
288
288
|
priority_marker = " (HIGH)" if h.get("priority") == "high" else ""
|
|
289
|
-
|
|
289
|
+
source = h.get("source", "")
|
|
290
|
+
source_marker = f" [{source}]" if source in ("human", "treequest", "literature") else ""
|
|
290
291
|
lines.append(f"- {h['id']}: {h.get('description', '?')}{priority_marker}{source_marker}")
|
|
291
292
|
else:
|
|
292
293
|
lines.append("No queued hypotheses. Use `/turing:try` to inject ideas.")
|
|
@@ -387,9 +388,9 @@ def format_brief(
|
|
|
387
388
|
|
|
388
389
|
# Check if hypotheses are exhausted
|
|
389
390
|
if not queued:
|
|
390
|
-
lines.append("- No hypotheses queued — inject ideas with `/turing:try`")
|
|
391
|
+
lines.append("- No hypotheses queued — inject ideas with `/turing:try` or explore with `/turing:explore`")
|
|
391
392
|
|
|
392
|
-
lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses
|
|
393
|
+
lines.extend(["", "---", "", "*Use `/turing:try` to inject hypotheses, `/turing:explore` for tree search, `/turing:train` to execute.*"])
|
|
393
394
|
|
|
394
395
|
return "\n".join(lines)
|
|
395
396
|
|
|
@@ -277,7 +277,7 @@ def get_next_hypothesis(queue_path: str) -> dict | None:
|
|
|
277
277
|
return None
|
|
278
278
|
|
|
279
279
|
priority_order = {"high": 0, "medium": 1, "low": 2}
|
|
280
|
-
source_order = {"human": 0, "literature": 1, "
|
|
280
|
+
source_order = {"human": 0, "literature": 1, "treequest": 2, "taxonomy": 3, "agent": 4}
|
|
281
281
|
|
|
282
282
|
queued.sort(key=lambda h: (
|
|
283
283
|
priority_order.get(h.get("priority", "medium"), 1),
|
|
@@ -376,7 +376,7 @@ def main() -> None:
|
|
|
376
376
|
add_parser.add_argument("description", nargs="?", default=None, help="What to try and why")
|
|
377
377
|
add_parser.add_argument("--archetype", default=None, help="Expand from archetype (e.g., model_comparison)")
|
|
378
378
|
add_parser.add_argument("--priority", default="high", choices=sorted(VALID_PRIORITIES))
|
|
379
|
-
add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "taxonomy"])
|
|
379
|
+
add_parser.add_argument("--source", default="human", choices=["human", "agent", "literature", "treequest", "taxonomy"])
|
|
380
380
|
add_parser.add_argument("--parent", default=None, help="Parent experiment ID")
|
|
381
381
|
add_parser.add_argument("--parent-hyp", default=None, help="Parent hypothesis ID")
|
|
382
382
|
add_parser.add_argument("--family", default=None, help="Experiment family (e.g., optimizer-sweep)")
|
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Tree-search-guided hypothesis exploration for the autoresearch pipeline.
|
|
3
|
+
|
|
4
|
+
Uses TreeQuest's AB-MCTS (Adaptive Branching Monte Carlo Tree Search) to
|
|
5
|
+
explore the space of experiment hypotheses. Each tree node is a hypothesis
|
|
6
|
+
description + structured config. The generation function produces refinements
|
|
7
|
+
of a parent hypothesis, and the scoring function uses the critique engine
|
|
8
|
+
(novelty × feasibility × impact) as the reward signal.
|
|
9
|
+
|
|
10
|
+
This is the search-driven complement to suggest_next.py's surrogate model:
|
|
11
|
+
instead of fitting a Random Forest over hyperparameter space, we search
|
|
12
|
+
the space of *ideas* using MCTS with the critique score as reward.
|
|
13
|
+
|
|
14
|
+
Requires: pip install "treequest[all]"
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python scripts/treequest_suggest.py \\
|
|
18
|
+
--log experiments/log.jsonl \\
|
|
19
|
+
--config config.yaml \\
|
|
20
|
+
--top 5 \\
|
|
21
|
+
--iterations 30 \\
|
|
22
|
+
--strategy abmcts-a
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import json
|
|
29
|
+
import sys
|
|
30
|
+
from dataclasses import dataclass, field
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
import yaml
|
|
34
|
+
|
|
35
|
+
from scripts.critique_hypothesis import critique_hypothesis
|
|
36
|
+
from scripts.turing_io import load_experiments, load_config
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Node representation
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
@dataclass
|
|
44
|
+
class HypothesisNode:
|
|
45
|
+
"""A node in the hypothesis search tree.
|
|
46
|
+
|
|
47
|
+
Each node represents a concrete experiment hypothesis with both a
|
|
48
|
+
human-readable description and optional structured fields (model type,
|
|
49
|
+
hyperparameters, feature changes) that can be passed to the hypothesis
|
|
50
|
+
queue.
|
|
51
|
+
"""
|
|
52
|
+
description: str
|
|
53
|
+
model_type: str | None = None
|
|
54
|
+
hyperparameters: dict | None = None
|
|
55
|
+
feature_changes: dict | None = None
|
|
56
|
+
parent_description: str | None = None
|
|
57
|
+
depth: int = 0
|
|
58
|
+
critique_scores: dict = field(default_factory=dict)
|
|
59
|
+
|
|
60
|
+
def to_dict(self) -> dict:
|
|
61
|
+
"""Serialize for logging and queue integration."""
|
|
62
|
+
return {
|
|
63
|
+
"description": self.description,
|
|
64
|
+
"model_type": self.model_type,
|
|
65
|
+
"hyperparameters": self.hyperparameters,
|
|
66
|
+
"feature_changes": self.feature_changes,
|
|
67
|
+
"parent_description": self.parent_description,
|
|
68
|
+
"depth": self.depth,
|
|
69
|
+
"critique_scores": self.critique_scores,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
@staticmethod
|
|
73
|
+
def from_dict(d: dict) -> "HypothesisNode":
|
|
74
|
+
return HypothesisNode(
|
|
75
|
+
description=d["description"],
|
|
76
|
+
model_type=d.get("model_type"),
|
|
77
|
+
hyperparameters=d.get("hyperparameters"),
|
|
78
|
+
feature_changes=d.get("feature_changes"),
|
|
79
|
+
parent_description=d.get("parent_description"),
|
|
80
|
+
depth=d.get("depth", 0),
|
|
81
|
+
critique_scores=d.get("critique_scores", {}),
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Critique-based scoring
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def score_hypothesis(
|
|
90
|
+
node: HypothesisNode,
|
|
91
|
+
log_path: str = "experiments/log.jsonl",
|
|
92
|
+
config_path: str = "config.yaml",
|
|
93
|
+
) -> float:
|
|
94
|
+
"""Score a hypothesis node using the critique engine.
|
|
95
|
+
|
|
96
|
+
Returns a float in [0, 10] — the weighted combination of
|
|
97
|
+
novelty (30%), feasibility (30%), and expected impact (40%).
|
|
98
|
+
"""
|
|
99
|
+
result = critique_hypothesis(
|
|
100
|
+
description=node.description,
|
|
101
|
+
log_path=log_path,
|
|
102
|
+
config_path=config_path,
|
|
103
|
+
)
|
|
104
|
+
node.critique_scores = {
|
|
105
|
+
"overall": result["overall_score"],
|
|
106
|
+
"novelty": result["novelty"]["score"],
|
|
107
|
+
"feasibility": result["feasibility"]["score"],
|
|
108
|
+
"impact": result["impact"]["score"],
|
|
109
|
+
"verdict": result["verdict"],
|
|
110
|
+
}
|
|
111
|
+
return result["overall_score"]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# Seed hypothesis generation
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
def generate_seed_hypotheses(
|
|
119
|
+
config: dict,
|
|
120
|
+
experiments: list[dict],
|
|
121
|
+
) -> list[HypothesisNode]:
|
|
122
|
+
"""Generate initial seed hypotheses from config and experiment history.
|
|
123
|
+
|
|
124
|
+
These form the root nodes of the search tree. Each represents a
|
|
125
|
+
distinct direction worth exploring.
|
|
126
|
+
"""
|
|
127
|
+
seeds: list[HypothesisNode] = []
|
|
128
|
+
current_model = config.get("model", {}).get("type", "xgboost")
|
|
129
|
+
metric = config.get("evaluation", {}).get("primary_metric", "accuracy")
|
|
130
|
+
|
|
131
|
+
# Seed 1: alternative model families
|
|
132
|
+
model_alternatives = {
|
|
133
|
+
"xgboost": ["LightGBM with dart boosting", "CatBoost with ordered boosting",
|
|
134
|
+
"Random Forest with extra-trees"],
|
|
135
|
+
"lightgbm": ["XGBoost with hist method", "CatBoost with ordered boosting",
|
|
136
|
+
"Random Forest with extra-trees"],
|
|
137
|
+
"catboost": ["XGBoost with hist method", "LightGBM with GOSS sampling",
|
|
138
|
+
"Random Forest with extra-trees"],
|
|
139
|
+
"random_forest": ["XGBoost with hist method", "LightGBM with dart boosting",
|
|
140
|
+
"CatBoost with ordered boosting"],
|
|
141
|
+
}
|
|
142
|
+
alternatives = model_alternatives.get(current_model.lower(), [
|
|
143
|
+
"XGBoost with hist method", "LightGBM with dart boosting",
|
|
144
|
+
])
|
|
145
|
+
for alt in alternatives:
|
|
146
|
+
seeds.append(HypothesisNode(
|
|
147
|
+
description=f"Switch to {alt} for {metric} optimization",
|
|
148
|
+
model_type=alt.split(" with ")[0].lower().replace(" ", ""),
|
|
149
|
+
))
|
|
150
|
+
|
|
151
|
+
# Seed 2: regularization exploration
|
|
152
|
+
seeds.append(HypothesisNode(
|
|
153
|
+
description=f"Increase regularization — add L2 penalty and reduce max_depth to combat potential overfitting",
|
|
154
|
+
hyperparameters={"reg_lambda": 1.0, "max_depth": 4},
|
|
155
|
+
))
|
|
156
|
+
|
|
157
|
+
# Seed 3: feature engineering
|
|
158
|
+
seeds.append(HypothesisNode(
|
|
159
|
+
description="Add polynomial interaction features for the top-5 most important numeric columns",
|
|
160
|
+
feature_changes={"add": ["polynomial_interactions"]},
|
|
161
|
+
))
|
|
162
|
+
|
|
163
|
+
# Seed 4: learning rate schedule
|
|
164
|
+
seeds.append(HypothesisNode(
|
|
165
|
+
description=f"Use low learning rate (0.01) with high n_estimators (2000) and early stopping for {metric}",
|
|
166
|
+
hyperparameters={"learning_rate": 0.01, "n_estimators": 2000},
|
|
167
|
+
))
|
|
168
|
+
|
|
169
|
+
# Seed 5: based on experiment history — what's been working?
|
|
170
|
+
kept = [e for e in experiments if e.get("status") == "kept"]
|
|
171
|
+
if kept:
|
|
172
|
+
last_kept = kept[-1]
|
|
173
|
+
last_desc = last_kept.get("description", "")
|
|
174
|
+
if last_desc:
|
|
175
|
+
seeds.append(HypothesisNode(
|
|
176
|
+
description=f"Refine the approach from '{last_desc}' — try a more aggressive variant with doubled learning rate",
|
|
177
|
+
parent_description=last_desc,
|
|
178
|
+
))
|
|
179
|
+
|
|
180
|
+
return seeds
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
# Perturbation-based child generation (non-LLM fallback)
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
|
|
187
|
+
_PERTURBATION_STRATEGIES = [
|
|
188
|
+
"increase learning rate by 2x",
|
|
189
|
+
"decrease learning rate by 2x",
|
|
190
|
+
"double n_estimators",
|
|
191
|
+
"halve max_depth",
|
|
192
|
+
"double max_depth",
|
|
193
|
+
"add L1 regularization (reg_alpha=1.0)",
|
|
194
|
+
"add L2 regularization (reg_lambda=1.0)",
|
|
195
|
+
"increase subsample ratio to 0.9",
|
|
196
|
+
"decrease subsample ratio to 0.6",
|
|
197
|
+
"add column sampling (colsample_bytree=0.7)",
|
|
198
|
+
"switch to dart boosting",
|
|
199
|
+
"switch to GOSS sampling",
|
|
200
|
+
"add polynomial features",
|
|
201
|
+
"add target encoding for categorical columns",
|
|
202
|
+
"remove low-importance features (bottom 20%)",
|
|
203
|
+
"try log-transform on skewed numeric features",
|
|
204
|
+
"add min_child_weight constraint",
|
|
205
|
+
"increase early stopping patience",
|
|
206
|
+
]
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def generate_children(
    parent: HypothesisNode,
    n_children: int = 3,
    rng_seed: int = 42,
) -> list[HypothesisNode]:
    """Generate child hypotheses by perturbing a parent.

    Perturbation strategies are picked deterministically from
    _PERTURBATION_STRATEGIES, keyed on a SHA-256 hash of the parent's
    description plus ``rng_seed``, so repeated runs over the same parent
    produce the same children.
    """
    import hashlib

    n_strategies = len(_PERTURBATION_STRATEGIES)
    digest = hashlib.sha256(parent.description.encode()).hexdigest()
    base = (int(digest, 16) + rng_seed) % n_strategies

    # A stride of 7 spreads consecutive picks across the strategy list,
    # giving more diverse siblings than adjacent indices would.
    picks = [(base + 7 * k) % n_strategies for k in range(n_children)]

    return [
        HypothesisNode(
            description=f"{parent.description}; additionally {_PERTURBATION_STRATEGIES[p]}",
            model_type=parent.model_type,
            hyperparameters=dict(parent.hyperparameters) if parent.hyperparameters else None,
            feature_changes=dict(parent.feature_changes) if parent.feature_changes else None,
            parent_description=parent.description,
            depth=parent.depth + 1,
        )
        for p in picks
    ]
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
# TreeQuest integration
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
|
|
246
|
+
def run_treequest_search(
    seeds: list[HypothesisNode],
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    iterations: int = 30,
    top_k: int = 5,
    strategy: str = "abmcts-a",
    children_per_node: int = 3,
) -> list[HypothesisNode]:
    """Run TreeQuest MCTS search over the hypothesis space.

    Args:
        seeds: Initial hypothesis nodes (tree roots).
        log_path: Path to experiment log for critique scoring.
        config_path: Path to config for critique scoring.
        iterations: Number of MCTS iterations.
        top_k: Number of best hypotheses to return.
        strategy: TreeQuest algorithm — "abmcts-a" or "abmcts-m".
        children_per_node: Branching factor for child generation. Currently
            unused here: each expansion generates a single child, and the
            MCTS loop decides how often to revisit a node.

    Returns:
        Top-K hypothesis nodes ranked by critique score.
    """
    try:
        import treequest
    except ImportError:
        print(
            "TreeQuest not installed. Install with: pip install 'treequest[all]'",
            file=sys.stderr,
        )
        sys.exit(1)

    # Select algorithm
    if strategy == "abmcts-m":
        algo = treequest.ABMCTSM()
    else:
        algo = treequest.ABMCTSA()

    # Track all scored nodes for final ranking
    all_scored: list[HypothesisNode] = []

    def generation_fn(parent_state: HypothesisNode | None) -> tuple[HypothesisNode, float]:
        """TreeQuest generation function.

        Given a parent node (or None for root), generate a child and score it.
        """
        if parent_state is None:
            # Root expansion: cycle through the seed hypotheses.
            idx = len(all_scored) % len(seeds)
            node = seeds[idx]
        else:
            children = generate_children(
                parent_state,
                n_children=1,
                rng_seed=len(all_scored),
            )
            node = children[0]

        score = score_hypothesis(node, log_path, config_path)
        all_scored.append(node)
        # TreeQuest expects rewards in [0, 1]; score_hypothesis appears to use
        # a 0-10 scale (format_results prints "score: N/10") — TODO confirm.
        # Our own ranking below uses node.critique_scores, so this clamp only
        # affects the MCTS reward, not the returned ordering.
        return node, min(max(score / 10.0, 0.0), 1.0)

    # FIX: TreeQuest's API is functional — init_tree() RETURNS the search-tree
    # state, and step() takes (state, {action_name: generate_fn}) and returns
    # the new state. The previous code discarded init_tree()'s return value and
    # called algo.step(generation_fn), which does not match the treequest API.
    search_tree = algo.init_tree()
    for i in range(iterations):
        try:
            search_tree = algo.step(search_tree, {"perturb": generation_fn})
        except Exception as e:
            print(f"Warning: iteration {i} failed: {e}", file=sys.stderr)
            continue

    # Rank all explored nodes by critique score
    all_scored.sort(key=lambda n: n.critique_scores.get("overall", 0), reverse=True)

    # Deduplicate by normalized description, keeping the best-scored instance.
    seen_descriptions: set[str] = set()
    unique_results: list[HypothesisNode] = []
    for node in all_scored:
        normalized = node.description.lower().strip()
        if normalized not in seen_descriptions:
            seen_descriptions.add(normalized)
            unique_results.append(node)
            if len(unique_results) >= top_k:
                break

    return unique_results
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# ---------------------------------------------------------------------------
|
|
336
|
+
# Fallback: greedy search without TreeQuest
|
|
337
|
+
# ---------------------------------------------------------------------------
|
|
338
|
+
|
|
339
|
+
def run_greedy_search(
    seeds: list[HypothesisNode],
    log_path: str = "experiments/log.jsonl",
    config_path: str = "config.yaml",
    iterations: int = 30,
    top_k: int = 5,
    children_per_node: int = 3,
) -> list[HypothesisNode]:
    """Greedy best-first search fallback when TreeQuest is not installed.

    Repeatedly expands the highest-scoring frontier node, maintaining a
    priority queue of candidates. Less sophisticated than MCTS but has no
    external dependency.
    """
    import heapq

    explored: list[HypothesisNode] = list(seeds)
    # Heap entries are (-score, tiebreak, node): negating the score turns the
    # min-heap into a max-heap, and the monotonically increasing tiebreak
    # keeps comparisons away from HypothesisNode itself.
    frontier: list[tuple[float, int, HypothesisNode]] = []
    for order, seed in enumerate(seeds):
        seed_score = score_hypothesis(seed, log_path, config_path)
        heapq.heappush(frontier, (-seed_score, order, seed))

    tick = len(seeds)
    for _ in range(iterations):
        if not frontier:
            break

        # Pop and expand the current best candidate.
        _, _, best = heapq.heappop(frontier)
        for child in generate_children(best, n_children=children_per_node, rng_seed=tick):
            child_score = score_hypothesis(child, log_path, config_path)
            tick += 1
            heapq.heappush(frontier, (-child_score, tick, child))
            explored.append(child)

    # Rank everything we touched, best critique score first.
    explored.sort(key=lambda n: n.critique_scores.get("overall", 0), reverse=True)

    # Deduplicate by normalized description, keeping at most top_k.
    seen: set[str] = set()
    results: list[HypothesisNode] = []
    for node in explored:
        normalized = node.description.lower().strip()
        if normalized in seen:
            continue
        seen.add(normalized)
        results.append(node)
        if len(results) >= top_k:
            break

    return results
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
# ---------------------------------------------------------------------------
|
|
397
|
+
# Output formatting
|
|
398
|
+
# ---------------------------------------------------------------------------
|
|
399
|
+
|
|
400
|
+
def format_results(
    results: list[HypothesisNode],
    metric_name: str,
    strategy_used: str,
    total_explored: int,
) -> str:
    """Render the ranked search results as a terminal-friendly report."""
    report = [
        f"TreeQuest Hypothesis Exploration ({strategy_used})",
        "=" * 60,
        f"Nodes explored: {total_explored}",
        f"Top {len(results)} hypotheses by critique score:",
        "",
    ]

    for rank, node in enumerate(results, 1):
        cs = node.critique_scores
        report.append(f" {rank}. [{cs.get('verdict', '?').upper()}] (score: {cs.get('overall', 0)}/10)")
        report.append(f" {node.description}")
        report.append(
            f" Novelty: {cs.get('novelty', 0)} Feasibility: {cs.get('feasibility', 0)} Impact: {cs.get('impact', 0)}"
        )
        if node.depth > 0:
            # Depth > 0 means this hypothesis was derived from a parent.
            report.append(f" Depth: {node.depth} (refined from parent)")
        report.append("")

    return "\n".join(report)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def results_to_json(results: list[HypothesisNode]) -> list[dict]:
    """Serialize each hypothesis node to its dict form for machine consumption."""
    serialized: list[dict] = []
    for node in results:
        serialized.append(node.to_dict())
    return serialized
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
# CLI
|
|
440
|
+
# ---------------------------------------------------------------------------
|
|
441
|
+
|
|
442
|
+
def main() -> None:
    """CLI entry point: generate seed hypotheses, run the search, print results."""
    parser = argparse.ArgumentParser(
        description="Tree-search-guided hypothesis exploration",
    )
    parser.add_argument("--log", default="experiments/log.jsonl",
                        help="Path to experiment log")
    parser.add_argument("--config", default="config.yaml",
                        help="Path to project config")
    parser.add_argument("--top", type=int, default=5,
                        help="Number of top hypotheses to return")
    parser.add_argument("--iterations", type=int, default=30,
                        help="Number of search iterations")
    parser.add_argument("--strategy", default="abmcts-a",
                        choices=["abmcts-a", "abmcts-m", "greedy"],
                        help="Search strategy (abmcts-a, abmcts-m, or greedy fallback)")
    parser.add_argument("--children", type=int, default=3,
                        help="Children per node expansion")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON")
    parser.add_argument("--seeds-only", action="store_true",
                        help="Only show generated seeds, don't run search")
    args = parser.parse_args()

    config = load_config(args.config)
    experiments = load_experiments(args.log)
    metric = config.get("evaluation", {}).get("primary_metric", "accuracy")

    # Generate seeds
    seeds = generate_seed_hypotheses(config, experiments)

    if args.seeds_only:
        if args.json:
            print(json.dumps([s.to_dict() for s in seeds], indent=2))
        else:
            print(f"Generated {len(seeds)} seed hypotheses:")
            for rank, seed in enumerate(seeds, 1):
                print(f" {rank}. {seed.description}")
        return

    # Both search backends share the same tuning knobs.
    search_opts = {
        "iterations": args.iterations,
        "top_k": args.top,
        "children_per_node": args.children,
    }

    if args.strategy == "greedy":
        results = run_greedy_search(seeds, args.log, args.config, **search_opts)
        strategy_label = "greedy best-first"
    else:
        try:
            import treequest  # noqa: F401
            results = run_treequest_search(
                seeds, args.log, args.config,
                strategy=args.strategy,
                **search_opts,
            )
            strategy_label = f"TreeQuest {args.strategy.upper()}"
        except ImportError:
            print("TreeQuest not installed, falling back to greedy search.", file=sys.stderr)
            results = run_greedy_search(seeds, args.log, args.config, **search_opts)
            strategy_label = "greedy best-first (fallback)"

    # Output
    if args.json:
        print(json.dumps(results_to_json(results), indent=2))
    else:
        # Approximate node count: every iteration scores one node, plus seeds.
        print(format_results(results, metric, strategy_label, args.iterations + len(seeds)))


if __name__ == "__main__":
    main()
|