claude-turing 4.0.0 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +8 -2
- package/commands/counterfactual.md +27 -0
- package/commands/onboard.md +20 -0
- package/commands/review.md +20 -0
- package/commands/share.md +20 -0
- package/commands/simulate.md +28 -0
- package/commands/turing.md +12 -0
- package/commands/whatif.md +31 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +6 -0
- package/templates/scripts/__pycache__/counterfactual_explanation.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_simulator.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_onboarding.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/package_experiments.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/simulate_review.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/whatif_engine.cpython-314.pyc +0 -0
- package/templates/scripts/counterfactual_explanation.py +485 -0
- package/templates/scripts/experiment_simulator.py +463 -0
- package/templates/scripts/generate_brief.py +64 -0
- package/templates/scripts/generate_onboarding.py +284 -0
- package/templates/scripts/package_experiments.py +285 -0
- package/templates/scripts/scaffold.py +11 -0
- package/templates/scripts/simulate_review.py +342 -0
- package/templates/scripts/whatif_engine.py +763 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "4.0.0",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "4.2.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 69 commands, 2 specialized agents, what-if analysis (whatif + counterfactual + simulate), collaboration (onboard + share + review), research communication (cite + present + changelog), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), model surgery (prune + quantize + merge + surgery), feature & training intelligence, model debugging, pre-training intelligence, meta-intelligence, scaling & efficiency, model composition, deep analysis, experiment orchestration, literature + paper, model export, profiling, checkpoints, experiment intelligence, statistical rigor, tree-search, cost-performance, model cards, hypothesis database, novelty guard, anti-cheating, taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -374,6 +374,12 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
374
374
|
| `/turing:cite <action>` | Citation & attribution manager — track papers, audit missing citations, generate BibTeX |
|
|
375
375
|
| `/turing:present [--figures]` | Presentation figures — training curves, comparisons, ablation, Pareto, sensitivity |
|
|
376
376
|
| `/turing:changelog [--audience]` | Model changelog — version-grouped improvements for technical or stakeholder audiences |
|
|
377
|
+
| `/turing:onboard [--audience]` | Project onboarding — walkthrough for new collaborators |
|
|
378
|
+
| `/turing:share <exp-ids...>` | Experiment packaging — portable archive with manifest |
|
|
379
|
+
| `/turing:review [--venue]` | Peer review simulation — weaknesses, fix commands, score |
|
|
380
|
+
| `/turing:whatif "<question>"` | What-if analysis — answer hypotheticals from existing experiment data |
|
|
381
|
+
| `/turing:counterfactual <exp-id>` | Counterfactual explanations — minimum input change to flip a prediction |
|
|
382
|
+
| `/turing:simulate [--configs]` | Experiment outcome prediction — pre-filter configs, save budget |
|
|
377
383
|
|
|
378
384
|
And for fully hands-off operation:
|
|
379
385
|
|
|
@@ -558,11 +564,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
558
564
|
|
|
559
565
|
## Architecture of Turing Itself
|
|
560
566
|
|
|
561
|
-
|
|
567
|
+
69 commands, 2 agents, 10 config files, 88 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), feature & training intelligence (feature + curriculum), model surgery (prune + quantize + merge + surgery), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), research communication (cite + present + changelog), collaboration (onboard + share + review), what-if analysis (whatif + counterfactual + simulate), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
562
568
|
|
|
563
569
|
```
|
|
564
570
|
turing/
|
|
565
|
-
├── commands/
|
|
571
|
+
├── commands/ 65 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging + feature & training intelligence + model surgery + experiment archaeology + research communication + collaboration + what-if analysis)
|
|
566
572
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
567
573
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
568
574
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: counterfactual
|
|
3
|
+
description: Input-level counterfactual explanations — find the smallest input change to flip a prediction.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-id> [--sample <index>] [--target <class>] [--batch-misclassified] [--json]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
What would need to change to flip this prediction? Minimum-change counterfactual for individual predictions.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/counterfactual_explanation.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/counterfactuals/`
|
|
15
|
+
|
|
16
|
+
## Methods
|
|
17
|
+
- **Greedy perturbation:** change one feature at a time, find minimum flip
|
|
18
|
+
- **Prototype-based:** find nearest training sample from target class
|
|
19
|
+
- Both methods run and the best (smallest distance) is selected
|
|
20
|
+
|
|
21
|
+
## Examples
|
|
22
|
+
```
|
|
23
|
+
/turing:counterfactual exp-042 --sample 1247
|
|
24
|
+
/turing:counterfactual exp-042 --sample 1247 --target 0
|
|
25
|
+
/turing:counterfactual exp-042 --batch-misclassified
|
|
26
|
+
/turing:counterfactual exp-042 --sample 500 --json
|
|
27
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: onboard
|
|
3
|
+
description: Project onboarding — generate a walkthrough for new collaborators. Task, history, decisions, next steps.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--audience researcher|engineer|stakeholder] [--depth brief|full]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Generates a 5-minute read that replaces a 1-hour onboarding meeting.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/generate_onboarding.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `ONBOARDING.md`
|
|
15
|
+
|
|
16
|
+
## Examples
|
|
17
|
+
```
|
|
18
|
+
/turing:onboard
|
|
19
|
+
/turing:onboard --audience engineer --depth brief
|
|
20
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: review
|
|
3
|
+
description: Peer review simulation — generate likely reviewer objections with severity ratings and fix commands.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--venue neurips|icml|general] [--harsh]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Simulate a conference reviewer before you submit. Each weakness links to the command that fixes it.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/simulate_review.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/reviews/`
|
|
15
|
+
|
|
16
|
+
## Examples
|
|
17
|
+
```
|
|
18
|
+
/turing:review
|
|
19
|
+
/turing:review --venue neurips --harsh
|
|
20
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: share
|
|
3
|
+
description: Experiment packaging — portable archive with config, metrics, seed study, annotations, reproduction instructions.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-ids...> [--include model,figures,code]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Package experiments for collaborator handoff or paper supplementary material.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/package_experiments.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `exports/packages/<name>/`
|
|
15
|
+
|
|
16
|
+
## Examples
|
|
17
|
+
```
|
|
18
|
+
/turing:share exp-089
|
|
19
|
+
/turing:share exp-042 exp-089 --include model,figures
|
|
20
|
+
```
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: simulate
|
|
3
|
+
description: Experiment outcome prediction — predict which configs will beat the current best before running them.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--configs configs.yaml] [--top-k 5] [--threshold 0.001] [--json]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Predict outcomes before spending compute. Ranks proposed configs and recommends which to run vs skip.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/experiment_simulator.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/simulations/`
|
|
15
|
+
|
|
16
|
+
## How it works
|
|
17
|
+
- Builds a surrogate model from experiment history (weighted k-NN)
|
|
18
|
+
- Predicts metric for each proposed config
|
|
19
|
+
- Applies novelty penalty for configs far from training distribution
|
|
20
|
+
- Ranks and filters: only recommend configs predicted to improve
|
|
21
|
+
|
|
22
|
+
## Examples
|
|
23
|
+
```
|
|
24
|
+
/turing:simulate --configs sweep_configs.yaml
|
|
25
|
+
/turing:simulate --configs candidates.yaml --top-k 3
|
|
26
|
+
/turing:simulate --configs proposals.yaml --threshold 0.005
|
|
27
|
+
/turing:simulate --configs sweep.yaml --json
|
|
28
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -65,6 +65,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
65
65
|
| "cite", "citation", "bibliography", "bibtex", "attribution", "references" | `/turing:cite` | Record |
|
|
66
66
|
| "present", "figures", "slides", "presentation", "charts", "plots" | `/turing:present` | Document |
|
|
67
67
|
| "changelog", "model changelog", "progress summary", "what improved" | `/turing:changelog` | Document |
|
|
68
|
+
| "onboard", "onboarding", "walkthrough", "new collaborator", "project overview" | `/turing:onboard` | Document |
|
|
69
|
+
| "share", "package", "export experiments", "send results", "portable" | `/turing:share` | Share |
|
|
70
|
+
| "review", "peer review", "reviewer", "simulate review", "weakness" | `/turing:review` | Validate |
|
|
68
71
|
| "trend", "trends", "research direction", "improvement rate", "diminishing returns", "what's working" | `/turing:trend` | Analyze |
|
|
69
72
|
| "flashback", "where was I", "context", "resume", "catch up", "what happened" | `/turing:flashback` | Recall |
|
|
70
73
|
| "archive", "cleanup", "compress old", "disk space", "archive experiments" | `/turing:archive` | Manage |
|
|
@@ -72,6 +75,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
72
75
|
| "search", "find experiment", "query experiments", "which experiments" | `/turing:search` | Query |
|
|
73
76
|
| "template", "recipe", "save config", "reusable config", "starting point" | `/turing:template` | Manage |
|
|
74
77
|
| "replay", "re-run", "revisit", "retry old", "would it work now" | `/turing:replay` | Validate |
|
|
78
|
+
| "what if", "what-if", "hypothetical", "estimate impact", "would it help" | `/turing:whatif` | Analyze |
|
|
79
|
+
| "counterfactual", "flip prediction", "why this prediction", "minimum change", "explanation" | `/turing:counterfactual` | Explain |
|
|
80
|
+
| "simulate", "predict outcome", "pre-filter", "which configs will work", "forecast" | `/turing:simulate` | Predict |
|
|
75
81
|
|
|
76
82
|
## Sub-commands
|
|
77
83
|
|
|
@@ -140,6 +146,12 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
140
146
|
| `/turing:cite <action>` | Citation manager: add/list/check/bib for papers, datasets, methods | (inline) |
|
|
141
147
|
| `/turing:present [--figures]` | Presentation figures: training curves, comparisons, ablation, Pareto, sensitivity | (inline) |
|
|
142
148
|
| `/turing:changelog [--audience]` | Model changelog: version-grouped improvements for technical or stakeholder audiences | (inline) |
|
|
149
|
+
| `/turing:onboard [--audience]` | Project onboarding: full walkthrough for new collaborators | (inline) |
|
|
150
|
+
| `/turing:share <exp-ids...>` | Experiment packaging: portable archive with manifest and README | (inline) |
|
|
151
|
+
| `/turing:review [--venue]` | Peer review simulation: weaknesses, questions, fix commands, score | (inline) |
|
|
152
|
+
| `/turing:whatif "<question>"` | What-if analysis: route hypotheticals to existing estimators (scaling, ablation, sensitivity, ensemble, pruning) | (inline) |
|
|
153
|
+
| `/turing:counterfactual <exp-id> --sample <index>` | Input-level counterfactual explanations: minimum input change to flip a prediction | (inline) |
|
|
154
|
+
| `/turing:simulate [--configs] [--top-k]` | Experiment outcome prediction: pre-filter configs using surrogate model, save budget | (inline) |
|
|
143
155
|
|
|
144
156
|
## Proactive Detection
|
|
145
157
|
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: whatif
|
|
3
|
+
description: What-if analysis — answer hypotheticals from existing experiment data without running new experiments.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "\"<question>\" [--json]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Answer "what if?" questions using existing experiment data. Routes to the right estimator automatically.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/whatif_engine.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/whatif/`
|
|
15
|
+
|
|
16
|
+
## Supported question types
|
|
17
|
+
- **Data scaling:** "what if I had 2x more data" → scaling law extrapolation
|
|
18
|
+
- **Ablation:** "what if I removed class 3" → ablation study data
|
|
19
|
+
- **Pipeline stitch:** "what if I combined exp-031 with exp-042" → stitch estimation
|
|
20
|
+
- **Hyperparameters:** "what if learning_rate was 0.01" → sensitivity interpolation
|
|
21
|
+
- **Ensemble:** "what if I ensembled the top models" → correlation analysis
|
|
22
|
+
- **Pruning:** "what if I pruned to 50% sparsity" → pruning sweep interpolation
|
|
23
|
+
- **Budget:** "what if I spent my budget on X vs Y" → budget allocation
|
|
24
|
+
|
|
25
|
+
## Examples
|
|
26
|
+
```
|
|
27
|
+
/turing:whatif "what if I had 2x more data"
|
|
28
|
+
/turing:whatif "what if I removed class 3"
|
|
29
|
+
/turing:whatif "what if I combined exp-031 with exp-042"
|
|
30
|
+
/turing:whatif "what if learning_rate was 0.01" --json
|
|
31
|
+
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "4.0.0",
|
|
3
|
+
"version": "4.2.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -35,6 +35,8 @@ const SUB_COMMANDS = [
|
|
|
35
35
|
"prune", "quantize", "merge", "surgery",
|
|
36
36
|
"trend", "flashback", "archive", "annotate", "search", "template", "replay",
|
|
37
37
|
"cite", "present", "changelog",
|
|
38
|
+
"onboard", "share", "review",
|
|
39
|
+
"whatif", "counterfactual", "simulate",
|
|
38
40
|
];
|
|
39
41
|
|
|
40
42
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
@@ -77,6 +77,12 @@ const EXPECTED_COMMANDS = [
|
|
|
77
77
|
"cite/SKILL.md",
|
|
78
78
|
"present/SKILL.md",
|
|
79
79
|
"changelog/SKILL.md",
|
|
80
|
+
"onboard/SKILL.md",
|
|
81
|
+
"share/SKILL.md",
|
|
82
|
+
"review/SKILL.md",
|
|
83
|
+
"whatif/SKILL.md",
|
|
84
|
+
"counterfactual/SKILL.md",
|
|
85
|
+
"simulate/SKILL.md",
|
|
80
86
|
];
|
|
81
87
|
|
|
82
88
|
const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|