npm - claude-turing - Versions diffs - 4.2.0 → 4.4.0 - Mend

claude-turing 4.2.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/.claude-plugin/plugin.json +2 -2
package/README.md +7 -2
package/commands/doctor.md +30 -0
package/commands/plan.md +27 -0
package/commands/postmortem.md +28 -0
package/commands/registry.md +31 -0
package/commands/turing.md +10 -0
package/commands/update.md +27 -0
package/package.json +1 -1
package/src/install.js +2 -0
package/src/verify.js +5 -0
package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/research_planner.cpython-314.pyc +0 -0
package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
package/templates/scripts/failure_postmortem.py +510 -0
package/templates/scripts/generate_brief.py +122 -0
package/templates/scripts/generate_model_card.py +154 -3
package/templates/scripts/harness_doctor.py +466 -0
package/templates/scripts/incremental_update.py +586 -0
package/templates/scripts/model_lifecycle.py +549 -0
package/templates/scripts/research_planner.py +470 -0
package/templates/scripts/scaffold.py +10 -0

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "turing",
-  "version": "4.2.0",
-  "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 69 commands, 2 specialized agents, what-if analysis (whatif + counterfactual + simulate), collaboration (onboard + share + review), research communication (cite + present + changelog), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), model surgery (prune + quantize + merge + surgery), feature & training intelligence, model debugging, pre-training intelligence, meta-intelligence, scaling & efficiency, model composition, deep analysis, experiment orchestration, literature + paper, model export, profiling, checkpoints, experiment intelligence, statistical rigor, tree-search, cost-performance, model cards, hypothesis database, novelty guard, anti-cheating, taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
+  "version": "4.4.0",
+  "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 74 commands, 2 specialized agents, operational intelligence (postmortem + doctor + plan), model lifecycle (update + registry), what-if analysis (whatif + counterfactual + simulate), collaboration (onboard + share + review), research communication (cite + present + changelog), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), model surgery (prune + quantize + merge + surgery), feature & training intelligence, model debugging, pre-training intelligence, meta-intelligence, scaling & efficiency, model composition, deep analysis, experiment orchestration, literature + paper, model export, profiling, checkpoints, experiment intelligence, statistical rigor, tree-search, cost-performance, model cards, hypothesis database, novelty guard, anti-cheating, taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
   "author": {
     "name": "pragnition"
   },

package/README.md CHANGED Viewed

@@ -380,6 +380,11 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
 | `/turing:whatif "<question>"` | What-if analysis — answer hypotheticals from existing experiment data |
 | `/turing:counterfactual <exp-id>` | Counterfactual explanations — minimum input change to flip a prediction |
 | `/turing:simulate [--configs]` | Experiment outcome prediction — pre-filter configs, save budget |
+| `/turing:update <exp-id>` | Incremental model update — add new data without full retraining |
+| `/turing:registry [action]` | Model registry — track lifecycle from candidate to production with gates |
+| `/turing:postmortem` | Failure postmortem — diagnose why experiments stopped improving |
+| `/turing:doctor [--fix]` | Harness self-diagnosis — check environment, project, resources |
+| `/turing:plan [--budget N]` | Research planning — strategic experiment campaign by ROI |
 And for fully hands-off operation:
@@ -564,11 +569,11 @@ Each project gets independent config, data, experiments, models, and agent memor
 ## Architecture of Turing Itself
-69 commands, 2 agents, 10 config files, 88 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), feature & training intelligence (feature + curriculum), model surgery (prune + quantize + merge + surgery), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), research communication (cite + present + changelog), collaboration (onboard + share + review), what-if analysis (whatif + counterfactual + simulate), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
+74 commands, 2 agents, 10 config files, 93 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), feature & training intelligence (feature + curriculum), model surgery (prune + quantize + merge + surgery), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), research communication (cite + present + changelog), collaboration (onboard + share + review), what-if analysis (whatif + counterfactual + simulate), model lifecycle (update + registry), operational intelligence (postmortem + doctor + plan), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
 ```
 turing/
-├── commands/              65 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging + feature & training intelligence + model surgery + experiment archaeology + research communication + what-if analysis)
+├── commands/              70 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging + feature & training intelligence + model surgery + experiment archaeology + research communication + what-if analysis + model lifecycle + operational intelligence)
 ├── agents/                2 agents (researcher: read/write, evaluator: read-only)
 ├── config/                8 files (lifecycle, taxonomy, archetypes, novelty aliases)
 ├── templates/             Scaffolded into user projects by /turing:init

package/commands/doctor.md ADDED Viewed

@@ -0,0 +1,30 @@
+---
+name: doctor
+description: Harness self-diagnosis — check environment, project, resources, and git state. Auto-fix common issues.
+disable-model-invocation: true
+argument-hint: "[--fix] [--verbose]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+Is Turing healthy? Check everything and get a score.
+## Steps
+1. `source .venv/bin/activate`
+2. `python scripts/harness_doctor.py $ARGUMENTS`
+3. **Saved:** `experiments/doctor/`
+## Checks
+- **Environment:** Python version, venv status
+- **Dependencies:** all required packages importable
+- **Config:** config.yaml valid with required fields
+- **Experiment log:** JSONL integrity, corrupt line detection
+- **Scripts:** train.py, prepare.py, evaluate.py exist and parse
+- **Disk space:** warn if <1GB free
+- **Git state:** uncommitted changes to critical files
+## Examples
+```
+/turing:doctor
+/turing:doctor --fix
+/turing:doctor --verbose --json
+```

package/commands/plan.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+name: plan
+description: Research planning assistant — design a strategic experiment campaign with budget-aware ROI allocation.
+disable-model-invocation: true
+argument-hint: "[--budget 20] [--goal \"maximize F1 for production\"]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+Design the next N experiments strategically, not randomly. Allocates budget by expected ROI.
+## Steps
+1. `source .venv/bin/activate`
+2. `python scripts/research_planner.py $ARGUMENTS`
+3. **Saved:** `experiments/plans/`
+## How it works
+- Analyzes experiment history to compute per-family ROI
+- Adjusts strategy priorities based on project state and goal
+- Allocates budget across: feature engineering, model search, ensemble, calibration, verification
+- Generates phased plan with specific experiment descriptions
+## Examples
+```
+/turing:plan --budget 20
+/turing:plan --budget 10 --goal "maximize F1 for production deployment"
+/turing:plan --budget 30 --json
+```

package/commands/postmortem.md ADDED Viewed

@@ -0,0 +1,28 @@
+---
+name: postmortem
+description: Failure postmortem — diagnose why experiments stopped improving and get actionable next steps.
+disable-model-invocation: true
+argument-hint: "[--window 10] [--auto-trigger 5]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+When experiments stop improving, find out why. Diagnoses search space exhaustion, config errors, data issues, metric ceilings, and noise floors.
+## Steps
+1. `source .venv/bin/activate`
+2. `python scripts/failure_postmortem.py $ARGUMENTS`
+3. **Saved:** `experiments/postmortems/`
+## Diagnosis categories
+- **Search space exhaustion:** micro-tuning params that don't matter
+- **Systematic config error:** all experiments share a bad common config
+- **Data issue:** all model types fail similarly
+- **Metric ceiling:** near theoretical maximum
+- **Noise floor:** improvements within seed variance
+## Examples
+```
+/turing:postmortem
+/turing:postmortem --window 15
+/turing:postmortem --json
+```

package/commands/registry.md ADDED Viewed

@@ -0,0 +1,31 @@
+---
+name: registry
+description: Model registry — track, promote, and govern the model lifecycle from candidate to production.
+disable-model-invocation: true
+argument-hint: "[list|register|promote|demote|archive|history] [exp-id] [stage]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+Track which model is production, staging, candidate, or archived. Promotion requires passing gates.
+## Steps
+1. `source .venv/bin/activate`
+2. `python scripts/model_lifecycle.py $ARGUMENTS`
+3. **Registry:** `experiments/registry.yaml`
+## Promotion gates
+- **candidate → staging:** regression check + seed study must PASS
+- **staging → production:** audit + calibration check must PASS
+- Use `--force` to skip gate checks
+## Examples
+```
+/turing:registry list
+/turing:registry register exp-095 --version v4.1
+/turing:registry promote exp-089 staging
+/turing:registry promote exp-089 production --force
+/turing:registry demote exp-078 staging --reason "latency regression"
+/turing:registry archive exp-042 --reason "superseded by v4"
+/turing:registry history
+/turing:registry history exp-089
+```

package/commands/turing.md CHANGED Viewed

@@ -78,6 +78,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
 | "what if", "what-if", "hypothetical", "estimate impact", "would it help" | `/turing:whatif` | Analyze |
 | "counterfactual", "flip prediction", "why this prediction", "minimum change", "explanation" | `/turing:counterfactual` | Explain |
 | "simulate", "predict outcome", "pre-filter", "which configs will work", "forecast" | `/turing:simulate` | Predict |
+| "update", "incremental", "new data", "add data", "fine-tune existing", "partial update" | `/turing:update` | Update |
+| "registry", "promote", "demote", "staging", "production", "which model is deployed", "model lifecycle" | `/turing:registry` | Govern |
+| "postmortem", "why failing", "failure streak", "why no improvement", "what went wrong" | `/turing:postmortem` | Diagnose |
+| "doctor", "health check", "is it broken", "diagnose harness", "self-check" | `/turing:doctor` | Check |
+| "plan", "research plan", "campaign", "what next", "allocate budget", "strategic plan" | `/turing:plan` | Plan |
 ## Sub-commands
@@ -152,6 +157,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
 | `/turing:whatif "<question>"` | What-if analysis: route hypotheticals to existing estimators (scaling, ablation, sensitivity, ensemble, pruning) | (inline) |
 | `/turing:counterfactual <exp-id> --sample <index>` | Input-level counterfactual explanations: minimum input change to flip a prediction | (inline) |
 | `/turing:simulate [--configs] [--top-k]` | Experiment outcome prediction: pre-filter configs using surrogate model, save budget | (inline) |
+| `/turing:update <exp-id> --new-data <path>` | Incremental model update: add new data without full retraining, forgetting detection | (inline) |
+| `/turing:registry [list\|register\|promote\|demote\|history]` | Model registry: stage lifecycle (candidate → staging → production) with promotion gates | (inline) |
+| `/turing:postmortem [--window N]` | Failure postmortem: diagnose why experiments stopped improving (exhaustion, config error, data issue, ceiling, noise) | (inline) |
+| `/turing:doctor [--fix]` | Harness self-diagnosis: environment, dependencies, config, log integrity, scripts, disk, git state | (inline) |
+| `/turing:plan [--budget N] [--goal]` | Research planning assistant: strategic campaign design with budget-aware ROI allocation | (inline) |
 ## Proactive Detection

package/commands/update.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+name: update
+description: Incremental model update — add new data without full retraining, with forgetting detection.
+disable-model-invocation: true
+argument-hint: "<exp-id> --new-data <path> [--replay-ratio 0.1] [--tolerance 0.005]"
+allowed-tools: Read, Bash(*), Grep, Glob
+---
+Add new data to an existing model without starting from scratch. Detects catastrophic forgetting.
+## Steps
+1. `source .venv/bin/activate`
+2. `python scripts/incremental_update.py $ARGUMENTS`
+3. **Saved:** `experiments/updates/`
+## Model-specific strategies
+- **XGBoost/LightGBM:** continued boosting with additional rounds
+- **Neural networks:** fine-tune with reduced LR + replay buffer from old data
+- **scikit-learn:** partial_fit() or warm_start=True
+## Examples
+```
+/turing:update exp-089 --new-data data/new_batch.csv
+/turing:update exp-089 --new-data data/new.csv --replay-ratio 0.2
+/turing:update exp-089 --new-data data/new.csv --tolerance 0.01
+/turing:update exp-089 --new-data data/new.csv --json
+```

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-turing",
-  "version": "4.2.0",
+  "version": "4.4.0",
   "type": "module",
   "description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
   "bin": {

package/src/install.js CHANGED Viewed

@@ -37,6 +37,8 @@ const SUB_COMMANDS = [
   "cite", "present", "changelog",
   "onboard", "share", "review",
   "whatif", "counterfactual", "simulate",
+  "update", "registry",
+  "postmortem", "doctor", "plan",
 ];
 export async function install(opts = {}) {

package/src/verify.js CHANGED Viewed

@@ -83,6 +83,11 @@ const EXPECTED_COMMANDS = [
   "whatif/SKILL.md",
   "counterfactual/SKILL.md",
   "simulate/SKILL.md",
+  "update/SKILL.md",
+  "registry/SKILL.md",
+  "postmortem/SKILL.md",
+  "doctor/SKILL.md",
+  "plan/SKILL.md",
 ];
 const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];

package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc CHANGED Viewed

Binary file

package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc CHANGED Viewed

Binary file

package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/research_planner.cpython-314.pyc ADDED Viewed

Binary file

package/templates/scripts/__pycache__/scaffold.cpython-314.pyc CHANGED Viewed

Binary file