claude-turing 2.4.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
  {
  "name": "turing",
- "version": "2.4.0",
- "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 36 commands, 2 specialized agents, model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
+ "version": "3.0.0",
+ "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 41 commands, 2 specialized agents, meta-intelligence (cross-project knowledge transfer + methodology audit), scaling & efficiency (scaling laws + compute budget + model distillation), model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
  "author": {
  "name": "pragnition"
  },
package/README.md CHANGED
@@ -347,6 +347,11 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
  | `/turing:ensemble [--top-k]` | Automated ensemble — voting, stacking, blending from top-K models |
  | `/turing:stitch <action>` | Pipeline composition — show, swap, cache, and run stages independently |
  | `/turing:warm <exp-id>` | Warm-start from prior model — load checkpoint, freeze layers, adjust LR |
+ | `/turing:scale [--axis]` | Scaling law estimator — power-law fit, full-scale predictions, diminishing returns verdict |
+ | `/turing:budget <action>` | Compute budget manager — set limits, track allocation, auto-shift explore/exploit |
+ | `/turing:distill <exp-id>` | Model compression — distill teacher into smaller student with accuracy/size tradeoff |
+ | `/turing:transfer [--from]` | Cross-project knowledge transfer — find similar projects, surface what worked |
+ | `/turing:audit [--strict]` | Pre-submission methodology audit — data leakage, baselines, seeds, ablations, reproducibility |

  And for fully hands-off operation:

@@ -531,11 +536,11 @@ Each project gets independent config, data, experiments, models, and agent memor

  ## Architecture of Turing Itself

- 36 commands, 2 agents, 10 config files, 55 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
+ 41 commands, 2 agents, 10 config files, 60 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.

  ```
  turing/
- ├── commands/ 35 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition)
+ ├── commands/ 40 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence)
  ├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
  ├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
  ├── templates/ Scaffolded into user projects by /turing:init
@@ -0,0 +1,56 @@
+ ---
+ name: audit
+ description: Pre-submission methodology audit — catch data leakage, missing baselines, cherry-picked seeds, and incomplete ablations before a reviewer does.
+ disable-model-invocation: true
+ argument-hint: "[--strict] [--checklist neurips]"
+ allowed-tools: Read, Bash(*), Grep, Glob
+ ---
+
+ A reviewer checklist you run before submitting. Catches methodology mistakes that cause desk rejections.
+
+ ## Steps
+
+ 1. **Activate environment:**
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ 2. **Parse arguments from `$ARGUMENTS`:**
+ - `--strict` — treat warnings as failures
+ - `--checklist neurips|icml|iclr` — add venue-specific checks
+ - `--json` — raw JSON output
+
+ 3. **Run methodology audit:**
+ ```bash
+ python scripts/methodology_audit.py $ARGUMENTS
+ ```
+
+ 4. **Checks performed:**
+ - **Data leakage** (critical): verify prepare.py/evaluate.py separation
+ - **CV strategy** (critical): verify appropriate cross-validation for data type
+ - **Seed sensitivity** (high): seed studies exist for best experiments
+ - **Ablation completeness** (high): ablation studies performed
+ - **Baseline comparison** (high): simple baselines in experiment log
+ - **Reproducibility** (high): best result successfully reproduced
+ - **Hyperparameter budget** (medium): total tuning cost documented
+ - **Regression stability** (medium): regression checks performed
+
+ 5. **Verdicts:**
+ - **PASS** — ready for submission
+ - **PASS (with warnings)** — address before submission
+ - **NEEDS WORK** — fix failures first
+ - **FAIL** — critical issues found
+
+ 6. **Actions:** each failure suggests the `/turing:` command to fix it
+
+ 7. **Venue checklists:** `--checklist neurips` adds NeurIPS-specific checks (broader impact, reproducibility checklist, code availability)
+
+ 8. **Saved output:** report in `experiments/audits/audit-YYYY-MM-DD.yaml`
+
+ ## Examples
+
+ ```
+ /turing:audit # Standard audit
+ /turing:audit --strict # Warnings become failures
+ /turing:audit --checklist neurips # NeurIPS submission checklist
+ ```
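
As a rough illustration of how the severity-tiered checks in steps 4 and 5 could roll up into a single verdict, here is a minimal Python sketch. The check tuple format, severity labels, and `--strict` handling are assumptions for illustration; `scripts/methodology_audit.py` may implement this differently.

```python
# Illustrative sketch only; not part of the package. Assumes a hypothetical
# list of (check_name, severity, passed) results.
from typing import List, Tuple

def audit_verdict(results: List[Tuple[str, str, bool]], strict: bool = False) -> str:
    """Map per-check results to the PASS / NEEDS WORK / FAIL verdicts described above."""
    failed = [(name, sev) for name, sev, ok in results if not ok]
    if any(sev == "critical" for _, sev in failed):
        return "FAIL"                # critical issues found
    if any(sev == "high" for _, sev in failed):
        return "NEEDS WORK"          # fix high-severity failures first
    if failed:                       # only medium-severity warnings remain
        return "NEEDS WORK" if strict else "PASS (with warnings)"
    return "PASS"
```
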
@@ -0,0 +1,52 @@
+ ---
+ name: budget
+ description: Compute budget manager — set experiment/time limits, track allocation across explore/exploit phases, auto-shift modes, hard stop.
+ disable-model-invocation: true
+ argument-hint: "<set|status|reset> [--experiments 50] [--hours 8]"
+ allowed-tools: Read, Bash(*), Grep, Glob
+ ---
+
+ Set a compute ceiling and let the system optimize within it. Prevents runaway experiment loops.
+
+ ## Steps
+
+ 1. **Activate environment:**
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ 2. **Parse arguments from `$ARGUMENTS`:**
+ - First argument is action: `set`, `status`, `reset`, or `check`
+ - `--experiments 50` — max experiment count
+ - `--hours 8` — max wall-clock hours
+ - `--json` — raw JSON output
+
+ 3. **Run budget manager:**
+ ```bash
+ python scripts/budget_manager.py $ARGUMENTS
+ ```
+
+ 4. **Actions:**
+ - **set:** create a budget with experiment and/or time constraints
+ - **status:** show usage, burn rate, projected exhaustion, allocation breakdown
+ - **reset:** deactivate the current budget
+ - **check:** returns whether another experiment is allowed (used by `/turing:train`)
+
+ 5. **Budget allocation policy:**
+ - **0-50% budget:** EXPLORE — try diverse hypotheses
+ - **50-80% budget:** MIXED — explore promising, exploit best
+ - **80-100% budget:** EXPLOIT ONLY — refine the winner
+ - **100% budget:** HARD STOP — `/turing:train` refuses new experiments
+
+ 6. **Budget state** stored in `experiment_state.yaml` under the `budget` key.
+
+ 7. **If no budget exists:** `/turing:train` runs without limits.
+
+ ## Examples
+
+ ```
+ /turing:budget set --experiments 50 --hours 8 # Set both constraints
+ /turing:budget set --experiments 30 # Experiment count only
+ /turing:budget status # Show usage and projections
+ /turing:budget reset # Remove budget limits
+ ```
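
The allocation policy in step 5 amounts to a threshold function on the fraction of budget consumed. A minimal sketch, assuming the budget is tracked as an experiment count; `scripts/budget_manager.py` may also weigh wall-clock hours and track state differently.

```python
# Illustrative sketch only; not part of the package. Thresholds mirror the
# allocation policy in step 5 of the budget skill above.
def budget_phase(used_experiments: int, max_experiments: int) -> str:
    """Return the explore/exploit phase for the current budget usage."""
    frac = used_experiments / max_experiments
    if frac >= 1.0:
        return "HARD STOP"      # /turing:train refuses new experiments
    if frac >= 0.8:
        return "EXPLOIT ONLY"   # refine the winner
    if frac >= 0.5:
        return "MIXED"          # explore promising, exploit best
    return "EXPLORE"            # try diverse hypotheses
```
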
@@ -0,0 +1,56 @@
+ ---
+ name: distill
+ description: Model compression via distillation — train a smaller student model to match a larger teacher's predictions.
+ disable-model-invocation: true
+ argument-hint: "<teacher-exp-id> [--compression 4] [--method soft-labels]"
+ allowed-tools: Read, Bash(*), Grep, Glob
+ ---
+
+ Compress a large model into a smaller, faster one for production. Measures the accuracy/size/latency tradeoff.
+
+ ## Steps
+
+ 1. **Activate environment:**
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ 2. **Parse arguments from `$ARGUMENTS`:**
+ - First argument is teacher experiment ID (required)
+ - `--compression 4` — compression ratio (default: 4x)
+ - `--method soft_labels|feature_matching|dataset_distillation` — distillation method
+ - `--target-latency 5` — auto-adjust compression to meet latency target (ms)
+ - `--json` — raw JSON output
+
+ 3. **Run distillation planner:**
+ ```bash
+ python scripts/model_distiller.py $ARGUMENTS
+ ```
+
+ 4. **Report includes:**
+ - Teacher model metrics
+ - Auto-selected student architecture (fewer trees/layers/width)
+ - Estimated size reduction and latency improvement
+ - Distillation configuration (temperature, alpha, loss function)
+ - Verdict: EXCELLENT / ACCEPTABLE / MARGINAL / TOO MUCH LOSS
+
+ 5. **Student selection by model type:**
+ - **Tree models:** fewer estimators, shallower depth
+ - **Neural networks:** fewer layers, narrower hidden dims
+ - **scikit-learn:** simpler model family (RandomForest → DecisionTree)
+
+ 6. **Distillation methods:**
+ - **soft_labels:** train on teacher's probability outputs with temperature scaling
+ - **feature_matching:** align intermediate representations (neural only)
+ - **dataset_distillation:** train on teacher-labeled synthetic data
+
+ 7. **Saved output:** report written to `experiments/distillations/distill-<exp-id>.yaml`
+
+ ## Examples
+
+ ```
+ /turing:distill exp-042 # 4x compression, soft labels
+ /turing:distill exp-042 --compression 8 # Aggressive compression
+ /turing:distill exp-042 --method feature_matching # Neural feature alignment
+ /turing:distill exp-042 --target-latency 5 # Meet 5ms latency target
+ ```
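
For the `soft_labels` method, the standard formulation blends a temperature-scaled distillation term with the ordinary hard-label loss. A minimal PyTorch sketch of that loss for the neural-network case only; the tree and scikit-learn paths described above, and the actual `scripts/model_distiller.py`, will differ, and the argument names and defaults here are assumptions.

```python
# Illustrative sketch only; not part of the package. Hinton-style soft-label
# distillation loss with temperature scaling.
import torch
import torch.nn.functional as F

def soft_label_distillation_loss(
    student_logits: torch.Tensor,
    teacher_logits: torch.Tensor,
    labels: torch.Tensor,
    temperature: float = 4.0,
    alpha: float = 0.7,
) -> torch.Tensor:
    # Soften both distributions with the temperature, then match them with KL divergence.
    soft_teacher = F.softmax(teacher_logits / temperature, dim=-1)
    log_soft_student = F.log_softmax(student_logits / temperature, dim=-1)
    kd_term = F.kl_div(log_soft_student, soft_teacher, reduction="batchmean") * temperature ** 2
    # Keep a hard-label term so the student still fits the true labels.
    ce_term = F.cross_entropy(student_logits, labels)
    return alpha * kd_term + (1 - alpha) * ce_term
```
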
@@ -0,0 +1,55 @@
+ ---
+ name: scale
+ description: Scaling law estimator — run small experiments at different sizes, fit a power law, and predict full-scale performance before committing compute.
+ disable-model-invocation: true
+ argument-hint: "[--axis data|compute|params] [--points 4] [--analyze results.yaml]"
+ allowed-tools: Read, Bash(*), Grep, Glob
+ ---
+
+ Predict full-scale performance from a handful of small experiments. Answers "is it worth training on the full dataset?" in 30 minutes instead of 3 days.
+
+ ## Steps
+
+ 1. **Activate environment:**
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ 2. **Parse arguments from `$ARGUMENTS`:**
+ - `--axis data|compute|params` — scaling axis (default: data)
+ - `--points 4` — number of scale points (default: 4)
+ - `--analyze results.yaml` — analyze existing results instead of planning
+ - `--plot` — include ASCII scaling plot
+ - `--json` — raw JSON output
+
+ 3. **Plan or analyze:**
+ - **Plan mode (default):** generates scale point configs to run
+ ```bash
+ python scripts/scaling_estimator.py --axis data --points 4
+ ```
+ - **Analyze mode:** fits power law to completed results
+ ```bash
+ python scripts/scaling_estimator.py --analyze experiments/scaling/results.yaml
+ ```
+
+ 4. **Scaling axes:**
+ - **data:** train on 10%, 25%, 50%, 75% of dataset
+ - **compute:** train for 10%, 25%, 50%, 75% of max epochs
+ - **params:** scale model size (fewer estimators, shallower depth)
+
+ 5. **After planning:** run each scale point experiment, record results in YAML, then use `--analyze` to fit the curve
+
+ 6. **Report includes:**
+ - Power law fit: `metric = a × n^b` with R²
+ - Predictions for 100%, 150%, 200% scale
+ - Verdict: DIMINISHING RETURNS / MARGINAL GAINS / WORTH SCALING
+
+ 7. **Saved output:** report written to `experiments/scaling/scale-YYYY-MM-DD.yaml`
+
+ ## Examples
+
+ ```
+ /turing:scale # Plan: data axis, 4 points
+ /turing:scale --axis compute --points 3 # Plan: compute axis, 3 points
+ /turing:scale --analyze results.yaml --plot # Analyze with ASCII plot
+ ```
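
The power-law fit in step 6 (`metric = a × n^b`) can be estimated by ordinary least squares in log-log space. A minimal sketch, with made-up accuracies in the usage lines; `scripts/scaling_estimator.py` may use a different fitting routine.

```python
# Illustrative sketch only; not part of the package. Fits metric = a * n^b by
# linear regression on log-transformed values.
import numpy as np

def fit_power_law(fractions, metrics):
    """Fit metric = a * n^b to (scale fraction, metric) pairs; return a, b, R^2."""
    x, y = np.log(np.asarray(fractions)), np.log(np.asarray(metrics))
    b, log_a = np.polyfit(x, y, 1)              # slope b, intercept log(a)
    pred = log_a + b * x
    ss_res = np.sum((y - pred) ** 2)
    ss_tot = np.sum((y - y.mean()) ** 2)
    return np.exp(log_a), b, 1.0 - ss_res / ss_tot

# Hypothetical scale-point accuracies, for illustration only.
a, b, r2 = fit_power_law([0.10, 0.25, 0.50, 0.75], [0.71, 0.78, 0.83, 0.86])
pred_full = a * 1.0 ** b                        # predicted metric at 100% scale
pred_double = a * 2.0 ** b                      # predicted metric at 200% scale
```
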
@@ -0,0 +1,54 @@
+ ---
+ name: transfer
+ description: Cross-project knowledge transfer — find similar prior projects and surface what worked. Builds institutional ML memory.
+ disable-model-invocation: true
+ argument-hint: "[--from project-path] [--auto]"
+ allowed-tools: Read, Bash(*), Grep, Glob
+ ---
+
+ Find similar prior projects and surface what worked. "Last time you had tabular classification with class imbalance, LightGBM beat everything by 3%."
+
+ ## Steps
+
+ 1. **Activate environment:**
+ ```bash
+ source .venv/bin/activate
+ ```
+
+ 2. **Parse arguments from `$ARGUMENTS`:**
+ - `--from ~/projects/fraud-detection` — transfer from a specific project
+ - `--auto` — auto-queue hypotheses from recommendations
+ - `--index ~/.turing/project_index.yaml` — custom index path
+ - `--json` — raw JSON output
+
+ 3. **Run knowledge transfer:**
+ ```bash
+ python scripts/knowledge_transfer.py $ARGUMENTS
+ ```
+
+ 4. **Report includes:**
+ - Similar prior projects ranked by similarity score
+ - Per project: task type, winner model, key insights
+ - Suggested hypotheses from winning strategies
+ - Auto-queued hypotheses (with `--auto`)
+
+ 5. **Similarity matching** uses:
+ - Task type (classification/regression) — highest weight
+ - Dataset size (log-scale comparison)
+ - Feature types (tabular/image/text)
+ - Class balance characteristics
+ - Dimensionality
+
+ 6. **Project index** at `~/.turing/project_index.yaml` — local only, never uploaded
+
+ 7. **If no similar projects found:** suggest running on more projects first or specifying one with `--from`
+
+ 8. **Saved output:** report in `experiments/transfers/transfer-*.yaml`
+
+ ## Examples
+
+ ```
+ /turing:transfer # Search index for similar projects
+ /turing:transfer --from ~/projects/fraud-detection # Transfer from specific project
+ /turing:transfer --auto # Auto-queue hypotheses
+ ```
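
A rough sketch of the weighted similarity scoring described in step 5. The weights, field names, and normalization below are illustrative assumptions, not the actual `scripts/knowledge_transfer.py` logic.

```python
# Illustrative sketch only; not part of the package. Weighted similarity over
# the project profile features listed in step 5 of the transfer skill above.
import math

WEIGHTS = {"task_type": 0.4, "dataset_size": 0.2, "feature_types": 0.2,
           "class_balance": 0.1, "dimensionality": 0.1}

def project_similarity(a: dict, b: dict) -> float:
    """Weighted similarity between two project profiles, in [0, 1]."""
    score = 0.0
    score += WEIGHTS["task_type"] * (a["task_type"] == b["task_type"])        # highest weight
    size_gap = abs(math.log10(a["n_rows"]) - math.log10(b["n_rows"]))         # log-scale size comparison
    score += WEIGHTS["dataset_size"] * max(0.0, 1.0 - size_gap / 3.0)
    score += WEIGHTS["feature_types"] * (a["modality"] == b["modality"])      # tabular / image / text
    score += WEIGHTS["class_balance"] * (1.0 - abs(a["minority_frac"] - b["minority_frac"]))
    dim_gap = abs(math.log10(a["n_features"]) - math.log10(b["n_features"]))
    score += WEIGHTS["dimensionality"] * max(0.0, 1.0 - dim_gap / 2.0)
    return score
```
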
@@ -45,6 +45,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
  | "ensemble", "combine models", "voting", "stacking", "blending", "merge models" | `/turing:ensemble` | Compose |
  | "stitch", "pipeline", "swap stage", "cache stage", "pipeline composition" | `/turing:stitch` | Compose |
  | "warm", "warm start", "fine-tune", "continue training", "transfer learning", "from checkpoint" | `/turing:warm` | Compose |
+ | "scale", "scaling law", "how much data", "is more data worth it", "power law", "data efficiency" | `/turing:scale` | Analyze |
+ | "budget", "compute budget", "how many experiments", "spending limit", "stop after" | `/turing:budget` | Manage |
+ | "distill", "compress", "smaller model", "student model", "knowledge distillation", "model compression" | `/turing:distill` | Deploy |
+ | "transfer", "what worked before", "similar project", "cross-project", "institutional knowledge", "prior projects" | `/turing:transfer` | Research |
+ | "audit", "methodology check", "pre-submission", "reviewer checklist", "data leakage", "missing baselines" | `/turing:audit` | Validate |

  ## Sub-commands

@@ -86,6 +91,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
  | `/turing:ensemble [--top-k] [--methods]` | Automated ensemble: voting, weighted voting, stacking, blending from top-K models | (inline) |
  | `/turing:stitch <action> [stage]` | Pipeline composition: show/swap/cache/run stages independently | (inline) |
  | `/turing:warm <exp-id>` | Warm-start from prior model: load checkpoint, freeze layers, adjust LR | (inline) |
+ | `/turing:scale [--axis]` | Scaling law estimator: fit power law, predict full-scale performance | (inline) |
+ | `/turing:budget <action>` | Compute budget manager: set limits, track allocation, auto-shift modes | (inline) |
+ | `/turing:distill <exp-id>` | Model compression: distill teacher into smaller student model | (inline) |
+ | `/turing:transfer [--from]` | Cross-project knowledge transfer: find similar prior projects, surface what worked | (inline) |
+ | `/turing:audit [--strict]` | Pre-submission methodology audit: data leakage, baselines, seeds, ablations, reproducibility | (inline) |

  ## Proactive Detection

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "claude-turing",
- "version": "2.4.0",
+ "version": "3.0.0",
  "type": "module",
  "description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
  "bin": {
package/src/install.js CHANGED
@@ -27,6 +27,8 @@ const SUB_COMMANDS = [
  "lit", "paper", "queue", "retry", "fork",
  "diff", "watch", "regress",
  "ensemble", "stitch", "warm",
+ "scale", "budget", "distill",
+ "transfer", "audit",
  ];

  export async function install(opts = {}) {
package/src/verify.js CHANGED
@@ -50,6 +50,11 @@ const EXPECTED_COMMANDS = [
  "ensemble/SKILL.md",
  "stitch/SKILL.md",
  "warm/SKILL.md",
+ "scale/SKILL.md",
+ "budget/SKILL.md",
+ "distill/SKILL.md",
+ "transfer/SKILL.md",
+ "audit/SKILL.md",
  ];

  const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];