claude-turing 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +7 -2
- package/commands/ablate.md +47 -0
- package/commands/checkpoint.md +47 -0
- package/commands/diagnose.md +52 -0
- package/commands/frontier.md +45 -0
- package/commands/profile.md +43 -0
- package/commands/turing.md +10 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +5 -0
- package/templates/scripts/__pycache__/ablation_study.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/diagnose_errors.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/pareto_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/ablation_study.py +487 -0
- package/templates/scripts/checkpoint_manager.py +449 -0
- package/templates/scripts/diagnose_errors.py +601 -0
- package/templates/scripts/generate_brief.py +74 -1
- package/templates/scripts/pareto_frontier.py +470 -0
- package/templates/scripts/profile_training.py +533 -0
- package/templates/scripts/scaffold.py +11 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "1.
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "1.5.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 24 commands, 2 specialized agents, performance profiling, smart Pareto-based checkpoint management, experiment intelligence (error analysis, ablation studies, Pareto frontiers), statistical rigor (multi-seed studies, reproducibility verification), tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -325,6 +325,11 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
325
325
|
| `/turing:validate [--auto]` | Check metric stability — auto-configure multi-run if noisy |
|
|
326
326
|
| `/turing:seed [N] [--quick]` | Multi-seed study — mean/std/CI, flag seed-sensitive results |
|
|
327
327
|
| `/turing:reproduce <exp-id>` | Reproducibility verification — re-run and check tolerance |
|
|
328
|
+
| `/turing:diagnose [exp-id]` | Error analysis — failure modes, confused pairs, feature-range bias |
|
|
329
|
+
| `/turing:ablate [--components]` | Ablation study — remove components, measure impact, flag dead weight |
|
|
330
|
+
| `/turing:frontier [--metrics]` | Pareto frontier — multi-objective tradeoff visualization |
|
|
331
|
+
| `/turing:profile [exp-id]` | Computational profiling — timing, memory, throughput, bottleneck detection |
|
|
332
|
+
| `/turing:checkpoint <action>` | Smart checkpoint management — list, prune (Pareto), average, resume, stats |
|
|
328
333
|
| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
|
|
329
334
|
| `/turing:logbook` | Generate HTML experiment logbook |
|
|
330
335
|
| `/turing:report` | Generate research report |
|
|
@@ -514,11 +519,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
514
519
|
|
|
515
520
|
## Architecture of Turing Itself
|
|
516
521
|
|
|
517
|
-
|
|
522
|
+
24 commands, 2 agents, 8 config files, 39 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling + smart checkpoints, 542 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
518
523
|
|
|
519
524
|
```
|
|
520
525
|
turing/
|
|
521
|
-
├── commands/
|
|
526
|
+
├── commands/ 24 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance)
|
|
522
527
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
523
528
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
524
529
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ablate
|
|
3
|
+
description: Run systematic ablation study — remove components one at a time, measure impact, produce publication-ready table with dead-weight flagging.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--components \"X,Y\"] [--seeds 3] [--latex]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Run a systematic ablation study to measure the contribution of each model component.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument can be an experiment ID (e.g., `exp-042`); defaults to the best experiment
|
|
20
|
+
- `--components "dropout,feature_X,regularization"` specifies components to ablate
|
|
21
|
+
- `--seeds 3` runs each ablation 3 times for statistical robustness (uses seed runner)
|
|
22
|
+
- `--latex` outputs a LaTeX-formatted table instead of markdown
|
|
23
|
+
|
|
24
|
+
3. **Run ablation study:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/ablation_study.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report results:**
|
|
30
|
+
- Show the ablation table: Configuration | Metric | Δ from Full | % Change
|
|
31
|
+
- Rank by impact (largest Δ first)
|
|
32
|
+
- Flag **dead-weight** components (removing them improves the metric)
|
|
33
|
+
- If `--latex`, output ready for copy-paste into a paper
|
|
34
|
+
|
|
35
|
+
5. **Saved output:** results written to `experiments/ablations/exp-NNN-ablation.yaml`
|
|
36
|
+
|
|
37
|
+
6. **If no ablatable components detected:** suggest using `--components` explicitly.
|
|
38
|
+
|
|
39
|
+
## Examples
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
/turing:ablate # Auto-detect components
|
|
43
|
+
/turing:ablate exp-042 # Specific experiment
|
|
44
|
+
/turing:ablate --components "dropout,subsample" # Specific components
|
|
45
|
+
/turing:ablate --seeds 3 # Multi-seed for robustness
|
|
46
|
+
/turing:ablate --latex # LaTeX table output
|
|
47
|
+
```
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: checkpoint
|
|
3
|
+
description: Smart checkpoint management — list, prune (Pareto-based), average top-K, resume from any point, disk usage stats.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<list|prune|average|resume|stats> [exp-id] [--top 3] [--dry-run]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Manage model checkpoints intelligently using Pareto dominance.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First word is the action: `list`, `prune`, `average`, `resume`, `stats`
|
|
20
|
+
- `resume` requires an experiment ID as second argument
|
|
21
|
+
- `--top 3` sets the number of checkpoints for averaging
|
|
22
|
+
- `--dry-run` previews pruning without deleting
|
|
23
|
+
|
|
24
|
+
3. **Run checkpoint manager:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/checkpoint_manager.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report results by action:**
|
|
30
|
+
- **list:** Table of all checkpoints with metrics, size, and Pareto status
|
|
31
|
+
- **prune:** Removes dominated checkpoints, reports space saved
|
|
32
|
+
- **average:** Lists top-K checkpoints for weight averaging
|
|
33
|
+
- **resume:** Locates checkpoint for a specific experiment
|
|
34
|
+
- **stats:** Disk usage summary by total, average, and model type
|
|
35
|
+
|
|
36
|
+
5. **Saved output:** report written to `experiments/checkpoints/checkpoint-report.yaml`
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
/turing:checkpoint list # Show all checkpoints
|
|
42
|
+
/turing:checkpoint stats # Disk usage summary
|
|
43
|
+
/turing:checkpoint prune --dry-run # Preview what would be pruned
|
|
44
|
+
/turing:checkpoint prune # Remove dominated checkpoints
|
|
45
|
+
/turing:checkpoint average --top 5 # Top 5 for averaging
|
|
46
|
+
/turing:checkpoint resume exp-042 # Resume from checkpoint
|
|
47
|
+
```
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: diagnose
|
|
3
|
+
description: Error analysis — cluster failure cases, identify systematic failure modes, and suggest targeted fixes with auto-queued hypotheses.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--auto-queue] [--top 5]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Analyze where and why the model fails, beyond aggregate metrics.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Generate predictions if needed:**
|
|
19
|
+
Check if `experiments/predictions/exp-NNN-preds.yaml` exists. If not, run:
|
|
20
|
+
```bash
|
|
21
|
+
python train.py --predict-only --output experiments/predictions/
|
|
22
|
+
```
|
|
23
|
+
The predictions file must contain `y_true`, `y_pred`, `task_type`, and optionally `features`.
|
|
24
|
+
|
|
25
|
+
3. **Parse arguments from `$ARGUMENTS`:**
|
|
26
|
+
- First argument can be an experiment ID (e.g., `exp-042`); defaults to the best experiment
|
|
27
|
+
- `--auto-queue` auto-queues hypotheses from failure modes into `hypotheses.yaml`
|
|
28
|
+
- `--top 5` limits to top N failure modes (default 5)
|
|
29
|
+
|
|
30
|
+
4. **Run error analysis:**
|
|
31
|
+
```bash
|
|
32
|
+
python scripts/diagnose_errors.py $ARGUMENTS
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
5. **Report results:**
|
|
36
|
+
- **Classification:** confusion matrix, most-confused pairs, per-class P/R/F1, low-recall classes
|
|
37
|
+
- **Regression:** residual stats, P90/P95 errors, feature-range bias, systematic bias
|
|
38
|
+
- **Failure modes:** ranked by impact, with suggested fixes
|
|
39
|
+
- **Auto-hypotheses:** if `--auto-queue`, shows queued hypotheses targeting weaknesses
|
|
40
|
+
|
|
41
|
+
6. **Saved output:** report written to `experiments/diagnoses/exp-NNN-diagnosis.yaml`
|
|
42
|
+
|
|
43
|
+
7. **If no predictions file exists and one cannot be generated (step 2):** instruct the user to run the model on the validation set first.
|
|
44
|
+
|
|
45
|
+
## Examples
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
/turing:diagnose # Analyze best experiment
|
|
49
|
+
/turing:diagnose exp-042 # Specific experiment
|
|
50
|
+
/turing:diagnose --auto-queue # Queue fix hypotheses
|
|
51
|
+
/turing:diagnose --top 10 # Top 10 failure modes
|
|
52
|
+
```
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: frontier
|
|
3
|
+
description: Visualize Pareto frontier across multiple objectives — answers "which model is actually best?" when there are tradeoffs.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--metrics \"accuracy,train_seconds,n_params\"] [--ascii]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Visualize the Pareto frontier across multiple objectives from experiment history.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--metrics "accuracy,train_seconds,n_params"` specifies metrics to analyze
|
|
20
|
+
- Without `--metrics`, uses primary metric + train_seconds from config
|
|
21
|
+
- `--ascii` generates an ASCII scatter plot (2D projection)
|
|
22
|
+
|
|
23
|
+
3. **Run Pareto analysis:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/pareto_frontier.py $ARGUMENTS
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **Report results:**
|
|
29
|
+
- **Pareto-optimal experiments:** table with all metrics and what each is best at
|
|
30
|
+
- **Dominated experiments:** with their nearest Pareto neighbor
|
|
31
|
+
- **ASCII scatter plot** (if `--ascii`): 2D projection with * for Pareto, · for dominated
|
|
32
|
+
- Summary: "N Pareto-optimal of M experiments across K metrics"
|
|
33
|
+
|
|
34
|
+
5. **Saved output:** results written to `experiments/frontiers/frontier-YYYY-MM-DD.yaml`
|
|
35
|
+
|
|
36
|
+
6. **If no experiments have all requested metrics:** suggest which metrics are available.
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
/turing:frontier # Default: metric vs time
|
|
42
|
+
/turing:frontier --metrics "accuracy,train_seconds" # 2D frontier
|
|
43
|
+
/turing:frontier --metrics "accuracy,train_seconds,n_params" # 3D frontier
|
|
44
|
+
/turing:frontier --ascii # With scatter plot
|
|
45
|
+
```
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: profile
|
|
3
|
+
description: Profile a training run — timing breakdown, memory usage, throughput, bottleneck detection with actionable recommendations.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--seed 42]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Profile a training run to identify performance bottlenecks.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument can be an experiment ID (e.g., `exp-042`); defaults to the best experiment
|
|
20
|
+
- `--seed 42` sets the random seed for the profiling run
|
|
21
|
+
|
|
22
|
+
3. **Run profiling:**
|
|
23
|
+
```bash
|
|
24
|
+
python scripts/profile_training.py $ARGUMENTS
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
4. **Report results:**
|
|
28
|
+
- **Timing:** total time, training time, overhead breakdown
|
|
29
|
+
- **Memory:** peak RSS, Python peak, GPU peak (if applicable)
|
|
30
|
+
- **Throughput:** samples/sec
|
|
31
|
+
- **Bottleneck:** identified bottleneck type and severity
|
|
32
|
+
- **Recommendations:** actionable fixes for the detected bottleneck
|
|
33
|
+
|
|
34
|
+
5. **Saved output:** results written to `experiments/profiles/exp-NNN-profile.yaml`
|
|
35
|
+
|
|
36
|
+
6. **If no training pipeline exists:** suggest `/turing:init` first.
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
/turing:profile # Profile best experiment config
|
|
42
|
+
/turing:profile exp-042 # Profile specific experiment
|
|
43
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -28,6 +28,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
28
28
|
| "mode", "explore", "exploit", "replicate", "strategy" | `/turing:mode` | Strategy |
|
|
29
29
|
| "preflight", "resources", "VRAM", "memory", "can I run", "OOM", "GPU" | `/turing:preflight` | Check |
|
|
30
30
|
| "card", "model card", "document model", "model documentation" | `/turing:card` | Document |
|
|
31
|
+
| "diagnose", "error analysis", "failure modes", "where does it fail", "confusion matrix" | `/turing:diagnose` | Analyze |
|
|
32
|
+
| "ablate", "ablation", "remove component", "which features matter", "component impact" | `/turing:ablate` | Analyze |
|
|
33
|
+
| "frontier", "pareto", "tradeoff", "tradeoffs", "multi-objective", "which model is best" | `/turing:frontier` | Analyze |
|
|
34
|
+
| "profile", "profiling", "bottleneck", "slow training", "why is it slow", "timing" | `/turing:profile` | Check |
|
|
35
|
+
| "checkpoint", "checkpoints", "prune checkpoints", "disk space", "resume training" | `/turing:checkpoint` | Check |
|
|
31
36
|
|
|
32
37
|
## Sub-commands
|
|
33
38
|
|
|
@@ -52,6 +57,11 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
52
57
|
| `/turing:mode <mode>` | Set research strategy (explore/exploit/replicate) | (inline) |
|
|
53
58
|
| `/turing:preflight` | Pre-flight resource check (VRAM/RAM/disk) | (inline) |
|
|
54
59
|
| `/turing:card` | Generate standardized model card (type, performance, data, limitations, contract) | (inline) |
|
|
60
|
+
| `/turing:diagnose [exp-id]` | Error analysis: failure modes, confused pairs, feature-range bias | (inline) |
|
|
61
|
+
| `/turing:ablate [--components]` | Ablation study: remove components, measure impact, flag dead weight | (inline) |
|
|
62
|
+
| `/turing:frontier [--metrics]` | Pareto frontier: multi-objective tradeoff visualization | (inline) |
|
|
63
|
+
| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | (inline) |
|
|
64
|
+
| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | (inline) |
|
|
55
65
|
|
|
56
66
|
## Proactive Detection
|
|
57
67
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.5.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -23,6 +23,7 @@ const SUB_COMMANDS = [
|
|
|
23
23
|
"init", "train", "status", "compare", "sweep", "validate",
|
|
24
24
|
"try", "brief", "suggest", "explore", "design", "logbook", "poster",
|
|
25
25
|
"report", "mode", "preflight", "card", "seed", "reproduce",
|
|
26
|
+
"diagnose", "ablate", "frontier", "profile", "checkpoint",
|
|
26
27
|
];
|
|
27
28
|
|
|
28
29
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
@@ -33,6 +33,11 @@ const EXPECTED_COMMANDS = [
|
|
|
33
33
|
"card/SKILL.md",
|
|
34
34
|
"seed/SKILL.md",
|
|
35
35
|
"reproduce/SKILL.md",
|
|
36
|
+
"diagnose/SKILL.md",
|
|
37
|
+
"ablate/SKILL.md",
|
|
38
|
+
"frontier/SKILL.md",
|
|
39
|
+
"profile/SKILL.md",
|
|
40
|
+
"checkpoint/SKILL.md",
|
|
36
41
|
];
|
|
37
42
|
|
|
38
43
|
const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|