claude-turing 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/checkpoint.md +47 -0
- package/commands/export.md +48 -0
- package/commands/profile.md +43 -0
- package/commands/turing.md +6 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +3 -0
- package/templates/scripts/__pycache__/checkpoint_manager.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/equivalence_checker.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/export_formats.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/latency_benchmark.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/profile_training.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/checkpoint_manager.py +449 -0
- package/templates/scripts/equivalence_checker.py +158 -0
- package/templates/scripts/export_card.py +183 -0
- package/templates/scripts/export_formats.py +385 -0
- package/templates/scripts/export_model.py +324 -0
- package/templates/scripts/generate_brief.py +38 -1
- package/templates/scripts/latency_benchmark.py +167 -0
- package/templates/scripts/profile_training.py +533 -0
- package/templates/scripts/scaffold.py +10 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 25 commands, 2 specialized agents, production model export (6 formats, equivalence verification, latency benchmarking), performance profiling, smart Pareto-based checkpoint management, experiment intelligence (error analysis, ablation studies, Pareto frontiers), statistical rigor (multi-seed studies, reproducibility verification), tree-search hypothesis exploration (TreeQuest AB-MCTS), cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -328,6 +328,9 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
328
328
|
| `/turing:diagnose [exp-id]` | Error analysis — failure modes, confused pairs, feature-range bias |
|
|
329
329
|
| `/turing:ablate [--components]` | Ablation study — remove components, measure impact, flag dead weight |
|
|
330
330
|
| `/turing:frontier [--metrics]` | Pareto frontier — multi-objective tradeoff visualization |
|
|
331
|
+
| `/turing:profile [exp-id]` | Computational profiling — timing, memory, throughput, bottleneck detection |
|
|
332
|
+
| `/turing:checkpoint <action>` | Smart checkpoint management — list, prune (Pareto), average, resume, stats |
|
|
333
|
+
| `/turing:export [exp-id] [--format]` | Export model to production format with equivalence check + latency benchmark |
|
|
331
334
|
| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
|
|
332
335
|
| `/turing:logbook` | Generate HTML experiment logbook |
|
|
333
336
|
| `/turing:report` | Generate research report |
|
|
@@ -517,11 +520,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
517
520
|
|
|
518
521
|
## Architecture of Turing Itself
|
|
519
522
|
|
|
520
|
-
|
|
523
|
+
25 commands, 2 agents, 8 config files, 44 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, 611 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
521
524
|
|
|
522
525
|
```
|
|
523
526
|
turing/
|
|
524
|
-
├── commands/
|
|
527
|
+
├── commands/ 25 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment)
|
|
525
528
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
526
529
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
527
530
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: checkpoint
|
|
3
|
+
description: Smart checkpoint management — list, prune (Pareto-based), average top-K, resume from any point, disk usage stats.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<list|prune|average|resume|stats> [exp-id] [--top 3] [--dry-run]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Manage model checkpoints intelligently using Pareto dominance.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First word is the action: `list`, `prune`, `average`, `resume`, `stats`
|
|
20
|
+
- `resume` requires an experiment ID as the second argument
|
|
21
|
+
- `--top 3` sets the number of checkpoints for averaging
|
|
22
|
+
- `--dry-run` previews pruning without deleting
|
|
23
|
+
|
|
24
|
+
3. **Run checkpoint manager:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/checkpoint_manager.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report results by action:**
|
|
30
|
+
- **list:** Table of all checkpoints with metrics, size, and Pareto status
|
|
31
|
+
- **prune:** Removes dominated checkpoints, reports space saved
|
|
32
|
+
- **average:** Lists top-K checkpoints for weight averaging
|
|
33
|
+
- **resume:** Locates checkpoint for a specific experiment
|
|
34
|
+
- **stats:** Disk usage summary — total size, average size, and breakdown by model type
|
|
35
|
+
|
|
36
|
+
5. **Saved output:** report written to `experiments/checkpoints/checkpoint-report.yaml`
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
/turing:checkpoint list # Show all checkpoints
|
|
42
|
+
/turing:checkpoint stats # Disk usage summary
|
|
43
|
+
/turing:checkpoint prune --dry-run # Preview what would be pruned
|
|
44
|
+
/turing:checkpoint prune # Remove dominated checkpoints
|
|
45
|
+
/turing:checkpoint average --top 5 # Top 5 for averaging
|
|
46
|
+
/turing:checkpoint resume exp-042 # Locate the checkpoint to resume exp-042 from
|
|
47
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: export
|
|
3
|
+
description: Export model to production format with equivalence verification, latency benchmarking, and deployment model card.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--format joblib|xgboost_json|onnx|torchscript|tflite] [--skip-equivalence] [--skip-latency] [--samples 100]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Export a trained model to a production-ready format.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument can be an experiment ID (e.g., `exp-042`); defaults to the best experiment if omitted
|
|
20
|
+
- `--format joblib|xgboost_json|onnx|torchscript|tflite` specifies export format (auto-detected if omitted)
|
|
21
|
+
- `--skip-equivalence` skips inference equivalence check
|
|
22
|
+
- `--skip-latency` skips latency benchmark
|
|
23
|
+
- `--samples 100` sets test sample count
|
|
24
|
+
|
|
25
|
+
3. **Run export pipeline:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/export_model.py $ARGUMENTS
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **Report results:**
|
|
31
|
+
- **Export:** format, file size, output path, dependencies
|
|
32
|
+
- **Equivalence:** verdict (equivalent/approximately_equivalent/divergent), max delta
|
|
33
|
+
- **Latency:** p50/p95/p99 ms, speedup vs original
|
|
34
|
+
- **Model Card:** metrics, seed study, equivalence, latency, dependencies
|
|
35
|
+
|
|
36
|
+
5. **Output:** exported model + model_card.yaml written to `exports/exp-NNN/`
|
|
37
|
+
|
|
38
|
+
6. **If the model file is not found:** suggest checking the `models/best/` directory.
|
|
39
|
+
|
|
40
|
+
## Examples
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
/turing:export # Best experiment, default format
|
|
44
|
+
/turing:export exp-042 # Specific experiment
|
|
45
|
+
/turing:export --format xgboost_json # Native XGBoost JSON
|
|
46
|
+
/turing:export --format onnx # ONNX format
|
|
47
|
+
/turing:export --skip-equivalence --skip-latency # Fast export
|
|
48
|
+
```
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: profile
|
|
3
|
+
description: Profile a training run — timing breakdown, memory usage, throughput, bottleneck detection with actionable recommendations.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[exp-id] [--seed 42]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Profile a training run to identify performance bottlenecks.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First argument can be an experiment ID (e.g., `exp-042`); defaults to the best experiment if omitted
|
|
20
|
+
- `--seed 42` sets the random seed for the profiling run
|
|
21
|
+
|
|
22
|
+
3. **Run profiling:**
|
|
23
|
+
```bash
|
|
24
|
+
python scripts/profile_training.py $ARGUMENTS
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
4. **Report results:**
|
|
28
|
+
- **Timing:** total time, training time, overhead breakdown
|
|
29
|
+
- **Memory:** peak RSS, Python peak, GPU peak (if applicable)
|
|
30
|
+
- **Throughput:** samples/sec
|
|
31
|
+
- **Bottleneck:** identified bottleneck type and severity
|
|
32
|
+
- **Recommendations:** actionable fixes for the detected bottleneck
|
|
33
|
+
|
|
34
|
+
5. **Saved output:** results written to `experiments/profiles/exp-NNN-profile.yaml`
|
|
35
|
+
|
|
36
|
+
6. **If no training pipeline exists:** suggest `/turing:init` first.
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
/turing:profile # Profile best experiment config
|
|
42
|
+
/turing:profile exp-042 # Profile specific experiment
|
|
43
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -31,6 +31,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
31
31
|
| "diagnose", "error analysis", "failure modes", "where does it fail", "confusion matrix" | `/turing:diagnose` | Analyze |
|
|
32
32
|
| "ablate", "ablation", "remove component", "which features matter", "component impact" | `/turing:ablate` | Analyze |
|
|
33
33
|
| "frontier", "pareto", "tradeoff", "tradeoffs", "multi-objective", "which model is best" | `/turing:frontier` | Analyze |
|
|
34
|
+
| "export", "deploy", "production", "onnx", "torchscript", "tflite", "ship model" | `/turing:export` | Deploy |
|
|
35
|
+
| "profile", "profiling", "bottleneck", "slow training", "why is it slow", "timing" | `/turing:profile` | Check |
|
|
36
|
+
| "checkpoint", "checkpoints", "prune checkpoints", "disk space", "resume training" | `/turing:checkpoint` | Check |
|
|
34
37
|
|
|
35
38
|
## Sub-commands
|
|
36
39
|
|
|
@@ -58,6 +61,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
58
61
|
| `/turing:diagnose [exp-id]` | Error analysis: failure modes, confused pairs, feature-range bias | (inline) |
|
|
59
62
|
| `/turing:ablate [--components]` | Ablation study: remove components, measure impact, flag dead weight | (inline) |
|
|
60
63
|
| `/turing:frontier [--metrics]` | Pareto frontier: multi-objective tradeoff visualization | (inline) |
|
|
64
|
+
| `/turing:export [exp-id] [--format]` | Export model to production format with equivalence check + latency benchmark | (inline) |
|
|
65
|
+
| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | (inline) |
|
|
66
|
+
| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | (inline) |
|
|
61
67
|
|
|
62
68
|
## Proactive Detection
|
|
63
69
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "1.4.0",
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -23,7 +23,7 @@ const SUB_COMMANDS = [
|
|
|
23
23
|
"init", "train", "status", "compare", "sweep", "validate",
|
|
24
24
|
"try", "brief", "suggest", "explore", "design", "logbook", "poster",
|
|
25
25
|
"report", "mode", "preflight", "card", "seed", "reproduce",
|
|
26
|
-
"diagnose", "ablate", "frontier",
|
|
26
|
+
"diagnose", "ablate", "frontier", "profile", "checkpoint", "export",
|
|
27
27
|
];
|
|
28
28
|
|
|
29
29
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|