claude-turing 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +5 -2
- package/commands/diff.md +48 -0
- package/commands/regress.md +53 -0
- package/commands/turing.md +6 -0
- package/commands/watch.md +60 -0
- package/config/watch_alerts.yaml +36 -0
- package/package.json +1 -1
- package/src/install.js +2 -0
- package/src/verify.js +4 -0
- package/templates/__pycache__/evaluate.cpython-314.pyc +0 -0
- package/templates/__pycache__/prepare.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/features/__pycache__/featurizers.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/check_convergence.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/critique_hypothesis.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_diff.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/experiment_index.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_logbook.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/log_experiment.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/novelty_guard.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/parse_metrics.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/regression_gate.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_experiment_tree.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/show_families.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/statistical_compare.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/suggest_next.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/sweep.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/synthesize_decision.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/training_monitor.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/verify_placeholders.cpython-314.pyc +0 -0
- package/templates/scripts/experiment_diff.py +703 -0
- package/templates/scripts/generate_brief.py +44 -0
- package/templates/scripts/regression_gate.py +536 -0
- package/templates/scripts/scaffold.py +6 -0
- package/templates/scripts/training_monitor.py +611 -0
- package/templates/scripts/__pycache__/classify_task.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
package/.claude-plugin/plugin.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "2.2.0",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "2.3.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 33 commands, 2 specialized agents, deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -341,6 +341,9 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
341
341
|
| `/turing:report` | Generate research report |
|
|
342
342
|
| `/turing:poster` | Generate research poster |
|
|
343
343
|
| `/turing:preflight` | Pre-release validation checks |
|
|
344
|
+
| `/turing:diff <a> <b>` | Deep experiment comparison — config diffs, metric significance, per-class regressions, curve divergence |
|
|
345
|
+
| `/turing:watch [--analyze]` | Live training monitor — loss spikes, NaN detection, overfitting, plateau alerts |
|
|
346
|
+
| `/turing:regress [--tolerance]` | Performance regression gate — verify metrics haven't degraded after changes |
|
|
344
347
|
|
|
345
348
|
And for fully hands-off operation:
|
|
346
349
|
|
|
@@ -525,11 +528,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
525
528
|
|
|
526
529
|
## Architecture of Turing Itself
|
|
527
530
|
|
|
528
|
-
|
|
531
|
+
33 commands, 2 agents, 10 config files, 52 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
529
532
|
|
|
530
533
|
```
|
|
531
534
|
turing/
|
|
532
|
-
├── commands/
|
|
535
|
+
├── commands/ 32 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis)
|
|
533
536
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
534
537
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
535
538
|
├── templates/ Scaffolded into user projects by /turing:init
|
package/commands/diff.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: diff
|
|
3
|
+
description: Deep experiment comparison — config diffs, metric significance, per-class regressions, training curve divergence, feature importance shifts.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-a> <exp-b> [--code]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Deep diagnostic comparison of two experiments. Goes beyond "which metric is higher" to show where, when, and why two experiments diverge.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- First two arguments are experiment IDs (required), e.g. `exp-042 exp-053`
|
|
20
|
+
- `--code` includes git diff of train.py between the two experiments' commits
|
|
21
|
+
- `--json` outputs raw JSON instead of markdown
|
|
22
|
+
|
|
23
|
+
3. **Run deep comparison:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/experiment_diff.py $ARGUMENTS
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **Report results — the diff includes:**
|
|
29
|
+
- **Config diff:** which hyperparameters changed, with magnitude (e.g., `max_depth: 6 → 8 (+33%)`)
|
|
30
|
+
- **Metric diff:** all metrics with deltas and statistical significance (if seed studies exist)
|
|
31
|
+
- **Per-class diff:** which classes improved/regressed — flags regressions hidden by aggregate improvement
|
|
32
|
+
- **Training curve divergence:** the epoch where the two experiments' loss/metric curves separate
|
|
33
|
+
- **Feature importance shifts:** which features gained/lost importance
|
|
34
|
+
- **Code diff (--code):** git diff of train.py between the two commits
|
|
35
|
+
|
|
36
|
+
5. **Saved output:** report written to `experiments/diffs/<exp-a>-vs-<exp-b>.yaml`
|
|
37
|
+
|
|
38
|
+
6. **If experiment ID not found:** list available experiment IDs from `experiments/log.jsonl`
|
|
39
|
+
|
|
40
|
+
7. **If no training pipeline exists:** suggest `/turing:init` first.
|
|
41
|
+
|
|
42
|
+
## Examples
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
/turing:diff exp-042 exp-053 # Full diagnostic comparison
|
|
46
|
+
/turing:diff exp-042 exp-053 --code # Include train.py code changes
|
|
47
|
+
/turing:diff exp-001 exp-010 --json # Raw JSON output
|
|
48
|
+
```
|
package/commands/regress.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: regress
|
|
3
|
+
description: Performance regression gate — re-run best experiment after code/dependency changes and verify metrics haven't degraded.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--tolerance 0.01] [--against exp-id] [--quick]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
CI for your model. After any change to code, dependencies, or data, verify metrics haven't silently regressed.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--tolerance 0.01` sets the relative tolerance (default 1%)
|
|
20
|
+
- `--against exp-042` checks against a specific experiment (default: best)
|
|
21
|
+
- `--quick` runs 1 seed instead of 3 for fast checks
|
|
22
|
+
- `--runs 5` sets number of regression runs (default 3)
|
|
23
|
+
- `--json` outputs raw JSON
|
|
24
|
+
|
|
25
|
+
3. **Run regression gate:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/regression_gate.py $ARGUMENTS
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **Report results:**
|
|
31
|
+
- **PASS:** all metrics within tolerance — no regression
|
|
32
|
+
- **WARNING:** some metrics degraded within 2x tolerance — investigate
|
|
33
|
+
- **FAIL:** REGRESSION DETECTED — at least one metric degraded beyond tolerance
|
|
34
|
+
- Shows per-metric comparison with deltas and relative differences
|
|
35
|
+
- Shows environment diff if library versions changed (may explain regression)
|
|
36
|
+
|
|
37
|
+
5. **Saved output:** report written to `experiments/regressions/check-YYYY-MM-DD.yaml`
|
|
38
|
+
|
|
39
|
+
6. **If no experiments exist:** suggest running `/turing:train` first.
|
|
40
|
+
|
|
41
|
+
7. **On FAIL verdict:** suggest investigating with:
|
|
42
|
+
- `/turing:diff <baseline> <latest>` to see what changed
|
|
43
|
+
- `pip freeze` comparison to identify library version changes
|
|
44
|
+
- `git diff` to review code changes
|
|
45
|
+
|
|
46
|
+
## Examples
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
/turing:regress # Default: check best, 1% tolerance, 3 runs
|
|
50
|
+
/turing:regress --quick # Fast check: 1 run
|
|
51
|
+
/turing:regress --against exp-042 # Check specific experiment
|
|
52
|
+
/turing:regress --tolerance 0.005 --runs 5 # Strict: 0.5% tolerance, 5 runs
|
|
53
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -39,6 +39,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
39
39
|
| "fork", "branch", "try both", "parallel experiments", "A or B" | `/turing:fork` | Orchestrate |
|
|
40
40
|
| "profile", "profiling", "bottleneck", "slow training", "why is it slow", "timing" | `/turing:profile` | Check |
|
|
41
41
|
| "checkpoint", "checkpoints", "prune checkpoints", "disk space", "resume training" | `/turing:checkpoint` | Check |
|
|
42
|
+
| "diff", "deep compare", "what changed", "why did it diverge", "experiment diff" | `/turing:diff` | Analyze |
|
|
43
|
+
| "watch", "monitor", "live training", "loss spike", "is it overfitting", "training progress" | `/turing:watch` | Monitor |
|
|
44
|
+
| "regress", "regression", "did metrics degrade", "check for regression", "CI gate", "stability check" | `/turing:regress` | Validate |
|
|
42
45
|
|
|
43
46
|
## Sub-commands
|
|
44
47
|
|
|
@@ -74,6 +77,9 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
74
77
|
| `/turing:fork <exp-id> --branches` | Experiment branching: run parallel tracks, report winner | (inline) |
|
|
75
78
|
| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | (inline) |
|
|
76
79
|
| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | (inline) |
|
|
80
|
+
| `/turing:diff <exp-a> <exp-b>` | Deep experiment comparison: config diff, metric significance, per-class regressions, curve divergence | (inline) |
|
|
81
|
+
| `/turing:watch [--analyze]` | Live training monitor with early-warning alerts (loss spike, NaN, overfitting, plateau) | (inline) |
|
|
82
|
+
| `/turing:regress [--tolerance]` | Performance regression gate: re-run best experiment, verify metrics haven't degraded | (inline) |
|
|
77
83
|
|
|
78
84
|
## Proactive Detection
|
|
79
85
|
|
package/commands/watch.md
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: watch
|
|
3
|
+
description: Live training monitor with early-warning alerts for loss spikes, NaN, overfitting, and metric plateaus.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--alerts] [--interval 10] [--analyze run.log]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Stream metrics during training with early-warning alerts. Catches problems mid-run instead of at the end.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--analyze run.log` — post-hoc analysis of a completed log (non-blocking)
|
|
20
|
+
- `--alerts` — show only alert lines, suppress normal output
|
|
21
|
+
- `--interval 10` — check interval in seconds (default: 10)
|
|
22
|
+
- `--alerts-config config/watch_alerts.yaml` — custom alert rules
|
|
23
|
+
- `--json` — raw JSON output (for `--analyze` mode)
|
|
24
|
+
|
|
25
|
+
3. **For post-hoc analysis:**
|
|
26
|
+
```bash
|
|
27
|
+
python scripts/training_monitor.py --analyze run.log
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
4. **For live monitoring (inform user):**
|
|
31
|
+
Live monitoring requires a running training process. Suggest the user run in a separate terminal:
|
|
32
|
+
```bash
|
|
33
|
+
python scripts/training_monitor.py --log run.log --interval 10
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
5. **Alert types:**
|
|
37
|
+
- **Loss spike:** loss > 3x rolling mean (configurable multiplier)
|
|
38
|
+
- **NaN detected:** any metric is NaN — CRITICAL, suggests pausing
|
|
39
|
+
- **Overfitting onset:** train/val gap widening for 3+ consecutive epochs
|
|
40
|
+
- **Plateau:** metric improvement < 0.001 for 5+ consecutive epochs
|
|
41
|
+
|
|
42
|
+
6. **Dashboard line format:**
|
|
43
|
+
```
|
|
44
|
+
Epoch 23/100 | loss: 0.342 ↓ | acc: 0.865 ↑ | gap: 0.018 | ⚠ plateau
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
7. **Alert config:** rules are in `config/watch_alerts.yaml` — users can customize thresholds.
|
|
48
|
+
|
|
49
|
+
8. **Saved output:** analysis report written to `experiments/monitors/analysis-*.yaml`
|
|
50
|
+
|
|
51
|
+
9. **If no training log exists:** suggest running `/turing:train` first.
|
|
52
|
+
|
|
53
|
+
## Examples
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
/turing:watch --analyze run.log # Analyze completed training
|
|
57
|
+
/turing:watch --analyze run.log --json # JSON output for scripting
|
|
58
|
+
/turing:watch --alerts # Live: show only alerts
|
|
59
|
+
/turing:watch --interval 5 # Live: check every 5 seconds
|
|
60
|
+
```
|
package/config/watch_alerts.yaml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Training monitor alert rules for /turing:watch
|
|
2
|
+
#
|
|
3
|
+
# Each alert has:
|
|
4
|
+
# condition: alert type (loss_spike, nan_detected, overfitting, plateau)
|
|
5
|
+
# severity: info | warning | critical
|
|
6
|
+
# action: optional action on trigger (e.g., "pause")
|
|
7
|
+
# message: alert message template with {epoch}, {value}, {mean}, etc.
|
|
8
|
+
#
|
|
9
|
+
# Customize thresholds to match your training dynamics.
|
|
10
|
+
|
|
11
|
+
alerts:
|
|
12
|
+
loss_spike:
|
|
13
|
+
condition: loss_spike
|
|
14
|
+
multiplier: 3.0 # Trigger if loss > N * rolling_mean
|
|
15
|
+
severity: warning
|
|
16
|
+
message: "Loss spike at epoch {epoch}: {value} vs rolling mean {mean:.4f}"
|
|
17
|
+
|
|
18
|
+
nan_detected:
|
|
19
|
+
condition: nan_detected
|
|
20
|
+
severity: critical
|
|
21
|
+
action: pause # Suggest pausing training on NaN
|
|
22
|
+
message: "NaN detected in {metric} at epoch {epoch}"
|
|
23
|
+
|
|
24
|
+
overfitting_onset:
|
|
25
|
+
condition: overfitting
|
|
26
|
+
gap_ratio: 0.5 # train_loss / val_loss ratio threshold
|
|
27
|
+
consecutive: 3 # N consecutive epochs of widening gap
|
|
28
|
+
severity: warning
|
|
29
|
+
message: "Overfitting detected — train/val gap widening since epoch {onset}"
|
|
30
|
+
|
|
31
|
+
plateau:
|
|
32
|
+
condition: plateau
|
|
33
|
+
min_improvement: 0.001 # Minimum metric change per epoch
|
|
34
|
+
consecutive: 5 # N consecutive flat epochs
|
|
35
|
+
severity: info
|
|
36
|
+
message: "Metric plateaued — consider early stopping or learning rate reduction"
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "2.2.0",
|
|
3
|
+
"version": "2.3.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -25,6 +25,7 @@ const SUB_COMMANDS = [
|
|
|
25
25
|
"report", "mode", "preflight", "card", "seed", "reproduce",
|
|
26
26
|
"diagnose", "ablate", "frontier", "profile", "checkpoint", "export",
|
|
27
27
|
"lit", "paper", "queue", "retry", "fork",
|
|
28
|
+
"diff", "watch", "regress",
|
|
28
29
|
];
|
|
29
30
|
|
|
30
31
|
export async function install(opts = {}) {
|
|
@@ -79,6 +80,7 @@ export async function install(opts = {}) {
|
|
|
79
80
|
"experiment_archetypes.yaml", "novelty_aliases.yaml",
|
|
80
81
|
"relationships.toml", "state.toml", "task_taxonomy.yaml",
|
|
81
82
|
"failure_modes.yaml",
|
|
83
|
+
"watch_alerts.yaml",
|
|
82
84
|
];
|
|
83
85
|
for (const file of CONFIG_FILES) {
|
|
84
86
|
await copyFile(
|
package/src/verify.js
CHANGED
|
@@ -44,6 +44,9 @@ const EXPECTED_COMMANDS = [
|
|
|
44
44
|
"queue/SKILL.md",
|
|
45
45
|
"retry/SKILL.md",
|
|
46
46
|
"fork/SKILL.md",
|
|
47
|
+
"diff/SKILL.md",
|
|
48
|
+
"watch/SKILL.md",
|
|
49
|
+
"regress/SKILL.md",
|
|
47
50
|
];
|
|
48
51
|
|
|
49
52
|
const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];
|
|
@@ -53,6 +56,7 @@ const EXPECTED_CONFIG = [
|
|
|
53
56
|
"experiment_archetypes.yaml", "novelty_aliases.yaml",
|
|
54
57
|
"relationships.toml", "state.toml", "task_taxonomy.yaml",
|
|
55
58
|
"failure_modes.yaml",
|
|
59
|
+
"watch_alerts.yaml",
|
|
56
60
|
];
|
|
57
61
|
|
|
58
62
|
async function fileExists(path) {
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|