npm - claude-turing - Versions diffs - 1.0.1 → 1.2.0 - Mend

claude-turing 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/.claude-plugin/plugin.json +2 -2
package/README.md +66 -3
package/commands/card.md +36 -0
package/commands/explore.md +107 -0
package/commands/suggest.md +68 -4
package/commands/turing.md +4 -0
package/package.json +1 -1
package/src/claude-md.js +1 -0
package/src/install.js +2 -2
package/src/verify.js +2 -0
package/templates/requirements.txt +4 -0
package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc +0 -0
package/templates/scripts/cleanup.py +599 -0
package/templates/scripts/cost_frontier.py +292 -0
package/templates/scripts/diff_configs.py +534 -0
package/templates/scripts/export_results.py +457 -0
package/templates/scripts/generate_brief.py +58 -3
package/templates/scripts/generate_model_card.py +342 -0
package/templates/scripts/leaderboard.py +508 -0
package/templates/scripts/manage_hypotheses.py +2 -2
package/templates/scripts/plot_trajectory.py +611 -0
package/templates/scripts/scaffold.py +8 -0
package/templates/scripts/show_metrics.py +23 -2
package/templates/scripts/treequest_suggest.py +520 -0
package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
package/templates/tests/test_cost_frontier.py +222 -0

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "turing",
-  "version": "1.0.1",
-  "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 14 commands, 2 specialized agents, structured experiment lifecycle with convergence detection, immutable evaluation infrastructure, novelty guard, decision synthesis, hypothesis database, and safety guardrails that separate the hypothesis space from the measurement apparatus. Inspired by Karpathy's autoresearch and the scientific method itself.",
+  "version": "1.2.0",
+  "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 17 commands, 2 specialized agents, tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
   "author": {
     "name": "pragnition"
   },

package/README.md CHANGED Viewed

@@ -313,6 +313,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
 | `/turing:try <hypothesis>` | Inject a hypothesis — free text or `archetype:model_comparison` |
 | `/turing:brief [--deep]` | Research briefing — campaign summary, failure patterns, literature-grounded suggestions |
 | `/turing:suggest` | Literature-grounded model architecture suggestions with citations |
+| `/turing:suggest --strategy treequest` | Tree-search hypothesis exploration (alias for `/turing:explore`) |
+| `/turing:explore` | AB-MCTS tree search over critique-scored hypothesis space |
 | `/turing:design <hyp-id>` | Generate structured experiment design from a hypothesis |
 | `/turing:mode <explore\|exploit\|replicate>` | Set research strategy — drives novelty guard policy |
@@ -321,6 +323,7 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
 | Command | What it does |
 |---------|-------------|
 | `/turing:validate [--auto]` | Check metric stability — auto-configure multi-run if noisy |
+| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
 | `/turing:logbook` | Generate HTML experiment logbook |
 | `/turing:report` | Generate research report |
 | `/turing:poster` | Generate research poster |
@@ -389,6 +392,65 @@ After N experiments with no meaningful improvement, the agent stops and reports
 For noisy metrics, `/turing:validate` runs the pipeline multiple times and measures variance. If the coefficient of variation exceeds 5%, it auto-configures multi-run evaluation so the agent can't be rewarded for lucky single runs.
+## Tree-Search Hypothesis Exploration
+> *"The learned coin-flipper weaves through the quadrillion-coin room with a preternatural air."*
+Sometimes the best experiment to try next isn't obvious from the literature or the agent's memory. `/turing:explore` uses [TreeQuest](https://github.com/SakanaAI/treequest)'s AB-MCTS (Adaptive Branching Monte Carlo Tree Search) to search the space of experiment *ideas* as a tree, scored by the critique engine (novelty x feasibility x impact).
+```
+/turing:explore                         # Run MCTS over hypothesis space
+/turing:explore --strategy greedy       # Greedy fallback (no TreeQuest needed)
+/turing:explore --iterations 50 --top 8 # Deeper search, more results
+/turing:suggest --strategy treequest    # Same thing via suggest
+```
+How it works:
+```
+         Seeds                    MCTS expands best-scoring branches
+           │
+    ┌──────┼──────┐               Each node is a hypothesis scored by:
+    ▼      ▼      ▼                 - Novelty (vs experiment history)
+  LightGBM Reg  Features            - Feasibility (hardware, deps)
+    │       │      │                - Expected impact (type success rate)
+    ▼       ▼      ▼
+  +dart   +L1   +poly             Top-K results queued as hypotheses
+    │              │              for the next /turing:train run
+    ▼              ▼
+  +subsamp      +target-enc
+```
+Unlike `/turing:suggest` (which searches the web for papers), `/turing:explore` searches the space of *refinement chains* — combinations and sequences of modifications that score well together. It discovers non-obvious experiment strategies that independent suggestions cannot find.
+Falls back to greedy best-first search when TreeQuest is not installed.
+## Cost-Performance Frontier
+> *"This model is 2% better but takes 10x longer to train. Is that worth it?"*
+The briefing now surfaces [Pareto-optimal](https://en.wikipedia.org/wiki/Pareto_efficiency) experiments — the efficient set where no other experiment is both faster AND has a better metric. The cost report tells you the tradeoff in plain language:
+```
+Best metric: exp-012 (accuracy=0.893, 2400s)
+Best efficiency: exp-003 (accuracy=0.871, 3s)
+The 2.5% improvement costs 800x more compute.
+```
+Run `python scripts/cost_frontier.py` directly, or read the "Cost-Performance Analysis" section in `/turing:brief`.
+## Model Cards
+When it's time to ship, `/turing:card` generates a standardized model card documenting:
+- Model type, framework, training time
+- Performance metrics (all configured metrics)
+- Training data source and split ratios
+- Limitations (including overfit detection)
+- Intended use and ethical considerations (user fills these in)
+- Artifact contract version for production consumers
+Inspired by [Google's Model Cards](https://arxiv.org/abs/1810.03993) and [Hugging Face model cards](https://huggingface.co/docs/hub/model-cards).
 ## Installation
 ```bash
@@ -424,11 +486,11 @@ Each project gets independent config, data, experiments, models, and agent memor
 ## Architecture of Turing Itself
-15 commands, 2 agents, 8 config files, 25 template scripts, model registry, artifact contract, 338 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
+17 commands, 2 agents, 8 config files, 31 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, 379 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
 ```
 turing/
-├── commands/              15 skill files (core + taste-leverage + reporting)
+├── commands/              17 skill files (core + taste-leverage + reporting + exploration)
 ├── agents/                2 agents (researcher: read/write, evaluator: read-only)
 ├── config/                8 files (lifecycle, taxonomy, archetypes, novelty aliases)
 ├── templates/             Scaffolded into user projects by /turing:init
@@ -437,7 +499,7 @@ turing/
 │   ├── train.py           Training code (AGENT-EDITABLE)
 │   ├── model_contract.md  Artifact schema for production consumers
 │   ├── model_registry.yaml  Available model architectures + hyperparams
-│   └── scripts/           25 Python scripts (core loop + analysis + infra)
+│   └── scripts/           31 Python scripts (core loop + analysis + infra + tree search)
 ├── tests/                 379 tests (unit + integration + anti-pattern + manifest)
 ├── src/                   5 JS installer files (npm deployment)
 ├── bin/                   CLI entry points
@@ -455,6 +517,7 @@ turing/
 - **[Principle of Least Privilege](https://en.wikipedia.org/wiki/Principle_of_least_privilege)** (Saltzer & Schroeder, 1975) — each agent has exactly the capabilities needed for its role
 - **[Early Stopping](https://en.wikipedia.org/wiki/Early_stopping)** (Prechelt, 1998) — convergence detection as discrete early stopping
 - **[Multi-Armed Bandits](https://en.wikipedia.org/wiki/Multi-armed_bandit)** — the explore-exploit tradeoff
+- **[TreeQuest](https://github.com/SakanaAI/treequest)** (Sakana AI, 2025) — AB-MCTS for inference-time scaling; repurposed here for hypothesis-space exploration
 - **[Version Control as Lab Notebook](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004668)** (Ram, 2013) — git as a scientific record-keeping system
 - **[Reproducibility Crisis](https://en.wikipedia.org/wiki/Replication_crisis)** — if the measurement can change between experiments, results are not reproducible

package/commands/card.md ADDED Viewed

@@ -0,0 +1,36 @@
+---
+name: card
+description: Generate a standardized model card documenting the trained model — type, performance, training data, limitations, intended use, and artifact contract.
+disable-model-invocation: true
+allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
+---
+You generate a standardized model card from the experiment log, model contract, and config.
+## Steps
+1. **Activate the virtual environment:**
+   ```bash
+   source .venv/bin/activate
+   ```
+2. **Run the model card generator:**
+   ```bash
+   python scripts/generate_model_card.py --config config.yaml --log experiments/log.jsonl --contract model_contract.md --output MODEL_CARD.md
+   ```
+3. **Read and present the generated card:**
+   - Read `MODEL_CARD.md` and display it to the user.
+   - If no experiments exist yet, inform the user and show the skeleton card.
+4. **Suggest next steps:**
+   - Review the **Ethical Considerations** section and fill in bias, fairness, and impact notes.
+   - Review the **Intended Use** section and document what the model is NOT intended for.
+   - If limitations mention overfitting, suggest running `/turing:validate` for stability checks.
+   - If the card looks complete, suggest committing it to version control.
+## Error Handling
+- If `config.yaml` is missing, tell the user to run `/turing:init` first.
+- If `experiments/log.jsonl` is missing or empty, generate a skeleton card and note that training is needed.
+- If `.venv` doesn't exist, try `python3 scripts/generate_model_card.py` directly.

package/commands/explore.md ADDED Viewed

@@ -0,0 +1,107 @@
+---
+name: explore
+description: Tree-search-guided hypothesis exploration using AB-MCTS. Explores the space of experiment ideas as a search tree, scored by the critique engine. Discovers non-obvious refinement chains that linear suggestion cannot find.
+disable-model-invocation: true
+argument-hint: "[ml/project] [--iterations N] [--top N] [--strategy abmcts-a|abmcts-m|greedy] [--seeds-only] [--json]"
+allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
+---
+Explore the hypothesis space using tree search. Instead of suggesting independent ideas, this builds and searches a tree of refinement chains — each node is a hypothesis scored by novelty, feasibility, and expected impact.
+## Project Detection
+0. **Detect project directory:**
+   - If `$ARGUMENTS` contains a path (e.g., `ml/coding`), use that as the project directory
+   - Else if cwd contains `config.yaml` and `train.py`, use cwd
+   - Else search for `ml/*/` subdirectories containing `config.yaml`
+     - If exactly one found, use it
+     - If multiple found, list them and ask the user which to target
+   - All subsequent commands run from the detected project directory
+## Parse Options
+Extract from `$ARGUMENTS`:
+- `--iterations N` — search depth (default: 30)
+- `--top N` — number of results to return (default: 5)
+- `--strategy` — algorithm choice: `abmcts-a` (default), `abmcts-m` (Bayesian), or `greedy` (no TreeQuest needed)
+- `--seeds-only` — just show generated seeds without running search
+- `--json` — output as JSON for programmatic use
+## Steps
+### 1. Assess Current State
+```bash
+source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
+```
+Read `config.yaml` to understand the current model and metric.
+### 2. Run Tree Search
+```bash
+source .venv/bin/activate && python scripts/treequest_suggest.py \
+    --log experiments/log.jsonl \
+    --config config.yaml \
+    --top <N> \
+    --iterations <N> \
+    --strategy <strategy>
+```
+The script will:
+- Generate seed hypotheses from config and experiment history
+- Run AB-MCTS (or greedy fallback) over the hypothesis tree
+- Score each node using the critique engine
+- Return top-K ranked, deduplicated hypotheses
+### 3. Queue Best Hypotheses
+For each result, add to the hypothesis queue:
+```bash
+source .venv/bin/activate && python scripts/manage_hypotheses.py add "<description>" \
+    --priority medium --source treequest
+```
+### 4. Show Results
+Display the search output and confirm queuing:
+```
+TreeQuest Hypothesis Exploration (AB-MCTS-A)
+============================================
+Nodes explored: 35
+Top 5 hypotheses by critique score:
+  1. [PROCEED] (score: 7.8/10)
+     Switch to LightGBM with dart boosting; additionally add polynomial features
+     Novelty: 8  Feasibility: 9  Impact: 7
+     -> Queued as hyp-NNN
+  2. [PROCEED] (score: 7.2/10)
+     Use low learning rate (0.01) with 2000 estimators; additionally add L2 regularization
+     Novelty: 7  Feasibility: 8  Impact: 7
+     Depth: 1 (refined from parent)
+     -> Queued as hyp-NNN
+  ...
+Queued N hypotheses. Run /turing:train to test them.
+```
+## How It Differs From /turing:suggest
+| | `/turing:suggest` | `/turing:explore` |
+|---|---|---|
+| **Source** | Web literature search | Tree search over critique scores |
+| **Strategy** | Independent suggestions | Refinement chains (parent -> child) |
+| **Requires internet** | Yes | No |
+| **Discovers** | What papers recommend | What combinations score well |
+| **Best for** | Early-stage exploration | Mid-experiment optimization |
+## Integration
+- Results feed into `hypotheses.yaml` — the next `/turing:train` picks them up
+- `/turing:brief` shows queued treequest-sourced hypotheses
+- `/turing:suggest --strategy treequest` is an alias for this command
+- Human can override priority: `/turing:try` always takes precedence

package/commands/suggest.md CHANGED Viewed

@@ -6,9 +6,16 @@ argument-hint: "[task description override]"
 allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob, WebSearch, WebFetch
 ---
-Suggest model architectures for the current ML task, grounded in recent literature. Hypotheses backed by papers, not vibes.
+Suggest model architectures for the current ML task. Supports two strategies:
-## Steps
+- **literature** (default): Web search for recent papers, synthesize grounded suggestions with citations.
+- **treequest**: Tree-search-guided hypothesis exploration using AB-MCTS over the critique scoring function. Explores refinement chains that literature search cannot find.
+## Strategy Detection
+If `$ARGUMENTS` contains `--strategy treequest` or `treequest`, use the TreeQuest strategy below. Otherwise use the default literature strategy.
+## Steps (Literature Strategy — default)
 ### 1. Understand the Task
@@ -84,12 +91,69 @@ Sources consulted: <N papers/articles>
 Queued N hypotheses. Run /turing:train to test them.
 ```
-## Fallback
+## Fallback (Literature Strategy)
 If web search returns insufficient results, suggest model families from `config/taxonomy.toml` based on what hasn't been tried yet. Note that suggestions are taxonomy-based, not literature-backed, and queue with `--source taxonomy`.
+## Steps (TreeQuest Strategy)
+When using `--strategy treequest`:
+### 1. Detect Project Directory
+Same detection logic as the literature strategy — find `config.yaml` + `train.py`.
+### 2. Run Tree Search
+```bash
+source .venv/bin/activate && python scripts/treequest_suggest.py \
+    --log experiments/log.jsonl \
+    --config config.yaml \
+    --top 5 \
+    --iterations 30 \
+    --strategy abmcts-a
+```
+If TreeQuest is not installed, the script automatically falls back to greedy best-first search.
+### 3. Queue Results
+For each result from the tree search, queue as a hypothesis:
+```bash
+source .venv/bin/activate && python scripts/manage_hypotheses.py add "<description>" --priority medium --source treequest
+```
+### 4. Show Results
+Display the tree search output and confirm hypotheses were queued:
+```
+TreeQuest Hypothesis Exploration (AB-MCTS-A)
+============================================
+Nodes explored: 35
+Top 5 hypotheses by critique score:
+  1. [PROCEED] (score: 7.8/10)
+     Switch to LightGBM with dart boosting; additionally add polynomial features
+     Novelty: 8  Feasibility: 9  Impact: 7
+  ...
+Queued N hypotheses. Run /turing:train to test them.
+```
+### TreeQuest Options
+Pass additional flags via `$ARGUMENTS`:
+- `--iterations N` — search depth (default: 30)
+- `--top N` — number of results (default: 5)
+- `--strategy abmcts-m` — use Bayesian mixed model variant (requires PyMC)
+- `--strategy greedy` — force greedy fallback without TreeQuest
 ## Integration
 - Suggestions feed into `hypotheses.yaml` — the next `/turing:train` picks them up
-- `/turing:brief` shows queued literature-sourced hypotheses
+- `/turing:brief` shows queued literature-sourced and treequest-sourced hypotheses
+- `/turing:explore` runs the TreeQuest search as a standalone command
 - Human can override priority: `/turing:try` always takes precedence

package/commands/turing.md CHANGED Viewed

@@ -21,9 +21,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
 | "report", "write-up", "findings", "document results" | `/turing:report` | Document |
 | "validate", "stability", "check variance", "noisy" | `/turing:validate` | Validate |
 | "suggest", "what model", "recommend", "which architecture", "literature" | `/turing:suggest` | Research |
+| "explore hypotheses", "tree search", "treequest", "search hypothesis space", "MCTS" | `/turing:explore` | Research |
 | "design", "plan experiment", "how should I test", "experiment design" | `/turing:design` | Design |
 | "mode", "explore", "exploit", "replicate", "strategy" | `/turing:mode` | Strategy |
 | "preflight", "resources", "VRAM", "memory", "can I run", "OOM", "GPU" | `/turing:preflight` | Check |
+| "card", "model card", "document model", "model documentation" | `/turing:card` | Document |
 ## Sub-commands
@@ -38,12 +40,14 @@ You are the Turing ML research router. Detect the user's intent and route to the
 | `/turing:init` | Scaffold a new ML project | (inline) |
 | `/turing:validate` | Check metric stability, auto-fix if noisy | (inline) |
 | `/turing:suggest` | Literature-grounded model architecture suggestions | (inline, uses WebSearch) |
+| `/turing:explore` | Tree-search hypothesis exploration via AB-MCTS | (inline) |
 | `/turing:design <hyp-id>` | Generate structured experiment design from hypothesis | (inline, uses WebSearch) |
 | `/turing:logbook` | HTML/markdown logbook with trajectory chart | (inline) |
 | `/turing:poster` | Single-page HTML research poster | (inline) |
 | `/turing:report` | Structured markdown research report | (inline) |
 | `/turing:mode <mode>` | Set research strategy (explore/exploit/replicate) | (inline) |
 | `/turing:preflight` | Pre-flight resource check (VRAM/RAM/disk) | (inline) |
+| `/turing:card` | Generate standardized model card (type, performance, data, limitations, contract) | (inline) |
 ## Proactive Detection

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-turing",
-  "version": "1.0.1",
+  "version": "1.2.0",
   "type": "module",
   "description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
   "bin": {

package/src/claude-md.js CHANGED Viewed

@@ -21,6 +21,7 @@ Autonomous ML research harness. The autoresearch loop as a formal protocol.
 | \`/turing:validate\` | Check metric stability, auto-fix if noisy |
 | \`/turing:try <hypothesis>\` | Inject a hypothesis into the experiment queue |
 | \`/turing:brief\` | Generate research intelligence report |
+| \`/turing:explore\` | Tree-search hypothesis exploration (AB-MCTS) |
 | \`/turing:preflight\` | Pre-flight resource check (VRAM/RAM/disk) |
 ### Agents

package/src/install.js CHANGED Viewed

@@ -21,8 +21,8 @@ const PLUGIN_ROOT = join(__dirname, "..");
 // Single source of truth for sub-commands (DRY — used for dirs and file copy)
 const SUB_COMMANDS = [
   "init", "train", "status", "compare", "sweep", "validate",
-  "try", "brief", "suggest", "design", "logbook", "poster",
-  "report", "mode", "preflight",
+  "try", "brief", "suggest", "explore", "design", "logbook", "poster",
+  "report", "mode", "preflight", "card",
 ];
 export async function install(opts = {}) {

package/src/verify.js CHANGED Viewed

@@ -23,12 +23,14 @@ const EXPECTED_COMMANDS = [
   "try/SKILL.md",
   "brief/SKILL.md",
   "suggest/SKILL.md",
+  "explore/SKILL.md",
   "design/SKILL.md",
   "logbook/SKILL.md",
   "poster/SKILL.md",
   "report/SKILL.md",
   "mode/SKILL.md",
   "preflight/SKILL.md",
+  "card/SKILL.md",
 ];
 const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];

package/templates/requirements.txt CHANGED Viewed

@@ -6,3 +6,7 @@ numpy>=2.0
 joblib>=1.4
 pyyaml>=6.0
 pytest>=8.0
+# Optional: tree-search-guided hypothesis exploration
+# Install with: pip install "treequest[all]"
+# treequest>=0.1

package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc CHANGED Viewed

Binary file

package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/manage_hypotheses.cpython-314.pyc CHANGED Viewed

Binary file

package/templates/scripts/__pycache__/scaffold.cpython-314.pyc CHANGED Viewed

Binary file

package/templates/scripts/__pycache__/treequest_suggest.cpython-314.pyc ADDED Viewed

Binary file