claude-turing 4.2.0 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/doctor.md +30 -0
- package/commands/plan.md +27 -0
- package/commands/postmortem.md +28 -0
- package/commands/registry.md +31 -0
- package/commands/turing.md +10 -0
- package/commands/update.md +27 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/failure_postmortem.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/harness_doctor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/research_planner.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/failure_postmortem.py +510 -0
- package/templates/scripts/generate_brief.py +122 -0
- package/templates/scripts/generate_model_card.py +154 -3
- package/templates/scripts/harness_doctor.py +466 -0
- package/templates/scripts/incremental_update.py +586 -0
- package/templates/scripts/model_lifecycle.py +549 -0
- package/templates/scripts/research_planner.py +470 -0
- package/templates/scripts/scaffold.py +10 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "4.2.0",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "4.4.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 74 commands, 2 specialized agents, operational intelligence (postmortem + doctor + plan), model lifecycle (update + registry), what-if analysis (whatif + counterfactual + simulate), collaboration (onboard + share + review), research communication (cite + present + changelog), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), model surgery (prune + quantize + merge + surgery), feature & training intelligence, model debugging, pre-training intelligence, meta-intelligence, scaling & efficiency, model composition, deep analysis, experiment orchestration, literature + paper, model export, profiling, checkpoints, experiment intelligence, statistical rigor, tree-search, cost-performance, model cards, hypothesis database, novelty guard, anti-cheating, taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -380,6 +380,11 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
380
380
|
| `/turing:whatif "<question>"` | What-if analysis — answer hypotheticals from existing experiment data |
|
|
381
381
|
| `/turing:counterfactual <exp-id>` | Counterfactual explanations — minimum input change to flip a prediction |
|
|
382
382
|
| `/turing:simulate [--configs]` | Experiment outcome prediction — pre-filter configs, save budget |
|
|
383
|
+
| `/turing:update <exp-id>` | Incremental model update — add new data without full retraining |
|
|
384
|
+
| `/turing:registry [action]` | Model registry — track lifecycle from candidate to production with gates |
|
|
385
|
+
| `/turing:postmortem` | Failure postmortem — diagnose why experiments stopped improving |
|
|
386
|
+
| `/turing:doctor [--fix]` | Harness self-diagnosis — check environment, project, resources |
|
|
387
|
+
| `/turing:plan [--budget N]` | Research planning — strategic experiment campaign by ROI |
|
|
383
388
|
|
|
384
389
|
And for fully hands-off operation:
|
|
385
390
|
|
|
@@ -564,11 +569,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
564
569
|
|
|
565
570
|
## Architecture of Turing Itself
|
|
566
571
|
|
|
567
|
-
|
|
572
|
+
74 commands, 2 agents, 10 config files, 93 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), feature & training intelligence (feature + curriculum), model surgery (prune + quantize + merge + surgery), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), research communication (cite + present + changelog), collaboration (onboard + share + review), what-if analysis (whatif + counterfactual + simulate), model lifecycle (update + registry), operational intelligence (postmortem + doctor + plan), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
568
573
|
|
|
569
574
|
```
|
|
570
575
|
turing/
|
|
571
|
-
├── commands/
|
|
576
|
+
├── commands/ 70 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging + feature & training intelligence + model surgery + experiment archaeology + research communication + what-if analysis + model lifecycle + operational intelligence)
|
|
572
577
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
573
578
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
574
579
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: doctor
|
|
3
|
+
description: Harness self-diagnosis — check environment, project, resources, and git state. Auto-fix common issues.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--fix] [--verbose]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Is Turing healthy? Check everything and get a score.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/harness_doctor.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/doctor/`
|
|
15
|
+
|
|
16
|
+
## Checks
|
|
17
|
+
- **Environment:** Python version, venv status
|
|
18
|
+
- **Dependencies:** all required packages importable
|
|
19
|
+
- **Config:** config.yaml valid with required fields
|
|
20
|
+
- **Experiment log:** JSONL integrity, corrupt line detection
|
|
21
|
+
- **Scripts:** train.py, prepare.py, evaluate.py exist and parse
|
|
22
|
+
- **Disk space:** warn if <1GB free
|
|
23
|
+
- **Git state:** uncommitted changes to critical files
|
|
24
|
+
|
|
25
|
+
## Examples
|
|
26
|
+
```
|
|
27
|
+
/turing:doctor
|
|
28
|
+
/turing:doctor --fix
|
|
29
|
+
/turing:doctor --verbose --json
|
|
30
|
+
```
|
package/commands/plan.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: plan
|
|
3
|
+
description: Research planning assistant — design a strategic experiment campaign with budget-aware ROI allocation.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--budget 20] [--goal \"maximize F1 for production\"]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Design the next N experiments strategically, not randomly. Allocates budget by expected ROI.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/research_planner.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/plans/`
|
|
15
|
+
|
|
16
|
+
## How it works
|
|
17
|
+
- Analyzes experiment history to compute per-family ROI
|
|
18
|
+
- Adjusts strategy priorities based on project state and goal
|
|
19
|
+
- Allocates budget across: feature engineering, model search, ensemble, calibration, verification
|
|
20
|
+
- Generates phased plan with specific experiment descriptions
|
|
21
|
+
|
|
22
|
+
## Examples
|
|
23
|
+
```
|
|
24
|
+
/turing:plan --budget 20
|
|
25
|
+
/turing:plan --budget 10 --goal "maximize F1 for production deployment"
|
|
26
|
+
/turing:plan --budget 30 --json
|
|
27
|
+
```
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: postmortem
|
|
3
|
+
description: Failure postmortem — diagnose why experiments stopped improving and get actionable next steps.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--window 10] [--auto-trigger 5]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
When experiments stop improving, find out why. Diagnoses search space exhaustion, config errors, data issues, metric ceilings, and noise floors.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/failure_postmortem.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/postmortems/`
|
|
15
|
+
|
|
16
|
+
## Diagnosis categories
|
|
17
|
+
- **Search space exhaustion:** micro-tuning params that don't matter
|
|
18
|
+
- **Systematic config error:** all experiments share a bad common config
|
|
19
|
+
- **Data issue:** all model types fail similarly
|
|
20
|
+
- **Metric ceiling:** near theoretical maximum
|
|
21
|
+
- **Noise floor:** improvements within seed variance
|
|
22
|
+
|
|
23
|
+
## Examples
|
|
24
|
+
```
|
|
25
|
+
/turing:postmortem
|
|
26
|
+
/turing:postmortem --window 15
|
|
27
|
+
/turing:postmortem --json
|
|
28
|
+
```
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: registry
|
|
3
|
+
description: Model registry — track, promote, and govern the model lifecycle from candidate to production.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[list|register|promote|demote|archive|history] [exp-id] [stage]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Track which model is production, staging, candidate, or archived. Promotion requires passing gates.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/model_lifecycle.py $ARGUMENTS`
|
|
14
|
+
3. **Registry:** `experiments/registry.yaml`
|
|
15
|
+
|
|
16
|
+
## Promotion gates
|
|
17
|
+
- **candidate → staging:** regression check + seed study must PASS
|
|
18
|
+
- **staging → production:** audit + calibration check must PASS
|
|
19
|
+
- Use `--force` to skip gate checks
|
|
20
|
+
|
|
21
|
+
## Examples
|
|
22
|
+
```
|
|
23
|
+
/turing:registry list
|
|
24
|
+
/turing:registry register exp-095 --version v4.1
|
|
25
|
+
/turing:registry promote exp-089 staging
|
|
26
|
+
/turing:registry promote exp-089 production --force
|
|
27
|
+
/turing:registry demote exp-078 staging --reason "latency regression"
|
|
28
|
+
/turing:registry archive exp-042 --reason "superseded by v4"
|
|
29
|
+
/turing:registry history
|
|
30
|
+
/turing:registry history exp-089
|
|
31
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -78,6 +78,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
78
78
|
| "what if", "what-if", "hypothetical", "estimate impact", "would it help" | `/turing:whatif` | Analyze |
|
|
79
79
|
| "counterfactual", "flip prediction", "why this prediction", "minimum change", "explanation" | `/turing:counterfactual` | Explain |
|
|
80
80
|
| "simulate", "predict outcome", "pre-filter", "which configs will work", "forecast" | `/turing:simulate` | Predict |
|
|
81
|
+
| "update", "incremental", "new data", "add data", "fine-tune existing", "partial update" | `/turing:update` | Update |
|
|
82
|
+
| "registry", "promote", "demote", "staging", "production", "which model is deployed", "model lifecycle" | `/turing:registry` | Govern |
|
|
83
|
+
| "postmortem", "why failing", "failure streak", "why no improvement", "what went wrong" | `/turing:postmortem` | Diagnose |
|
|
84
|
+
| "doctor", "health check", "is it broken", "diagnose harness", "self-check" | `/turing:doctor` | Check |
|
|
85
|
+
| "plan", "research plan", "campaign", "what next", "allocate budget", "strategic plan" | `/turing:plan` | Plan |
|
|
81
86
|
|
|
82
87
|
## Sub-commands
|
|
83
88
|
|
|
@@ -152,6 +157,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
152
157
|
| `/turing:whatif "<question>"` | What-if analysis: route hypotheticals to existing estimators (scaling, ablation, sensitivity, ensemble, pruning) | (inline) |
|
|
153
158
|
| `/turing:counterfactual <exp-id> --sample <index>` | Input-level counterfactual explanations: minimum input change to flip a prediction | (inline) |
|
|
154
159
|
| `/turing:simulate [--configs] [--top-k]` | Experiment outcome prediction: pre-filter configs using surrogate model, save budget | (inline) |
|
|
160
|
+
| `/turing:update <exp-id> --new-data <path>` | Incremental model update: add new data without full retraining, forgetting detection | (inline) |
|
|
161
|
+
| `/turing:registry [list\|register\|promote\|demote\|archive\|history]` | Model registry: stage lifecycle (candidate → staging → production) with promotion gates | (inline) |
|
|
162
|
+
| `/turing:postmortem [--window N]` | Failure postmortem: diagnose why experiments stopped improving (exhaustion, config error, data issue, ceiling, noise) | (inline) |
|
|
163
|
+
| `/turing:doctor [--fix]` | Harness self-diagnosis: environment, dependencies, config, log integrity, scripts, disk, git state | (inline) |
|
|
164
|
+
| `/turing:plan [--budget N] [--goal]` | Research planning assistant: strategic campaign design with budget-aware ROI allocation | (inline) |
|
|
155
165
|
|
|
156
166
|
## Proactive Detection
|
|
157
167
|
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: update
|
|
3
|
+
description: Incremental model update — add new data without full retraining, with forgetting detection.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-id> --new-data <path> [--replay-ratio 0.1] [--tolerance 0.005]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Add new data to an existing model without starting from scratch. Detects catastrophic forgetting.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/incremental_update.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/updates/`
|
|
15
|
+
|
|
16
|
+
## Model-specific strategies
|
|
17
|
+
- **XGBoost/LightGBM:** continued boosting with additional rounds
|
|
18
|
+
- **Neural networks:** fine-tune with reduced LR + replay buffer from old data
|
|
19
|
+
- **scikit-learn:** partial_fit() or warm_start=True
|
|
20
|
+
|
|
21
|
+
## Examples
|
|
22
|
+
```
|
|
23
|
+
/turing:update exp-089 --new-data data/new_batch.csv
|
|
24
|
+
/turing:update exp-089 --new-data data/new.csv --replay-ratio 0.2
|
|
25
|
+
/turing:update exp-089 --new-data data/new.csv --tolerance 0.01
|
|
26
|
+
/turing:update exp-089 --new-data data/new.csv --json
|
|
27
|
+
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "4.2.0",
|
|
3
|
+
"version": "4.4.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
package/src/verify.js
CHANGED
|
@@ -83,6 +83,11 @@ const EXPECTED_COMMANDS = [
|
|
|
83
83
|
"whatif/SKILL.md",
|
|
84
84
|
"counterfactual/SKILL.md",
|
|
85
85
|
"simulate/SKILL.md",
|
|
86
|
+
"update/SKILL.md",
|
|
87
|
+
"registry/SKILL.md",
|
|
88
|
+
"postmortem/SKILL.md",
|
|
89
|
+
"doctor/SKILL.md",
|
|
90
|
+
"plan/SKILL.md",
|
|
86
91
|
];
|
|
87
92
|
|
|
88
93
|
const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|