claude-turing 2.5.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +4 -2
- package/commands/audit.md +56 -0
- package/commands/transfer.md +54 -0
- package/commands/turing.md +4 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +2 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/knowledge_transfer.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/methodology_audit.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/generate_brief.py +41 -0
- package/templates/scripts/knowledge_transfer.py +618 -0
- package/templates/scripts/methodology_audit.py +451 -0
- package/templates/scripts/scaffold.py +4 -0
package/.claude-plugin/plugin.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "2.5.0",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "3.0.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 41 commands, 2 specialized agents, meta-intelligence (cross-project knowledge transfer + methodology audit), scaling & efficiency (scaling laws + compute budget + model distillation), model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -350,6 +350,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
350
350
|
| `/turing:scale [--axis]` | Scaling law estimator — power-law fit, full-scale predictions, diminishing returns verdict |
|
|
351
351
|
| `/turing:budget <action>` | Compute budget manager — set limits, track allocation, auto-shift explore/exploit |
|
|
352
352
|
| `/turing:distill <exp-id>` | Model compression — distill teacher into smaller student with accuracy/size tradeoff |
|
|
353
|
+
| `/turing:transfer [--from]` | Cross-project knowledge transfer — find similar projects, surface what worked |
|
|
354
|
+
| `/turing:audit [--strict]` | Pre-submission methodology audit — data leakage, baselines, seeds, ablations, reproducibility |
|
|
353
355
|
|
|
354
356
|
And for fully hands-off operation:
|
|
355
357
|
|
|
@@ -534,11 +536,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
534
536
|
|
|
535
537
|
## Architecture of Turing Itself
|
|
536
538
|
|
|
537
|
-
|
|
539
|
+
41 commands, 2 agents, 10 config files, 60 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
538
540
|
|
|
539
541
|
```
|
|
540
542
|
turing/
|
|
541
|
-
├── commands/
|
|
543
|
+
├── commands/ 40 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence)
|
|
542
544
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
543
545
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
544
546
|
├── templates/ Scaffolded into user projects by /turing:init
|
package/commands/audit.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: audit
|
|
3
|
+
description: Pre-submission methodology audit — catch data leakage, missing baselines, cherry-picked seeds, and incomplete ablations before a reviewer does.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--strict] [--checklist neurips]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
A reviewer checklist you run before submitting. Catches methodology mistakes that cause desk rejections.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--strict` — treat warnings as failures
|
|
20
|
+
- `--checklist neurips|icml|iclr` — add venue-specific checks
|
|
21
|
+
- `--json` — raw JSON output
|
|
22
|
+
|
|
23
|
+
3. **Run methodology audit:**
|
|
24
|
+
```bash
|
|
25
|
+
python scripts/methodology_audit.py $ARGUMENTS
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
4. **Checks performed:**
|
|
29
|
+
- **Data leakage** (critical): verify prepare.py/evaluate.py separation
|
|
30
|
+
- **CV strategy** (critical): verify appropriate cross-validation for data type
|
|
31
|
+
- **Seed sensitivity** (high): seed studies exist for best experiments
|
|
32
|
+
- **Ablation completeness** (high): ablation studies performed
|
|
33
|
+
- **Baseline comparison** (high): simple baselines in experiment log
|
|
34
|
+
- **Reproducibility** (high): best result successfully reproduced
|
|
35
|
+
- **Hyperparameter budget** (medium): total tuning cost documented
|
|
36
|
+
- **Regression stability** (medium): regression checks performed
|
|
37
|
+
|
|
38
|
+
5. **Verdicts:**
|
|
39
|
+
- **PASS** — ready for submission
|
|
40
|
+
- **PASS (with warnings)** — address before submission
|
|
41
|
+
- **NEEDS WORK** — fix failures first
|
|
42
|
+
- **FAIL** — critical issues found
|
|
43
|
+
|
|
44
|
+
6. **Actions:** each failure suggests the `/turing:` command to fix it
|
|
45
|
+
|
|
46
|
+
7. **Venue checklists:** `--checklist neurips` adds NeurIPS-specific checks (broader impact, reproducibility checklist, code availability)
|
|
47
|
+
|
|
48
|
+
8. **Saved output:** report in `experiments/audits/audit-YYYY-MM-DD.yaml`
|
|
49
|
+
|
|
50
|
+
## Examples
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
/turing:audit # Standard audit
|
|
54
|
+
/turing:audit --strict # Warnings become failures
|
|
55
|
+
/turing:audit --checklist neurips # NeurIPS submission checklist
|
|
56
|
+
```
|
package/commands/transfer.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: transfer
|
|
3
|
+
description: Cross-project knowledge transfer — find similar prior projects and surface what worked. Builds institutional ML memory.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[--from project-path] [--auto]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Find similar prior projects and surface what worked. "Last time you had tabular classification with class imbalance, LightGBM beat everything by 3%."
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
|
|
13
|
+
1. **Activate environment:**
|
|
14
|
+
```bash
|
|
15
|
+
source .venv/bin/activate
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
2. **Parse arguments from `$ARGUMENTS`:**
|
|
19
|
+
- `--from ~/projects/fraud-detection` — transfer from a specific project
|
|
20
|
+
- `--auto` — auto-queue hypotheses from recommendations
|
|
21
|
+
- `--index ~/.turing/project_index.yaml` — custom index path
|
|
22
|
+
- `--json` — raw JSON output
|
|
23
|
+
|
|
24
|
+
3. **Run knowledge transfer:**
|
|
25
|
+
```bash
|
|
26
|
+
python scripts/knowledge_transfer.py $ARGUMENTS
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
4. **Report includes:**
|
|
30
|
+
- Similar prior projects ranked by similarity score
|
|
31
|
+
- Per project: task type, winner model, key insights
|
|
32
|
+
- Suggested hypotheses from winning strategies
|
|
33
|
+
- Auto-queued hypotheses (with `--auto`)
|
|
34
|
+
|
|
35
|
+
5. **Similarity matching** uses:
|
|
36
|
+
- Task type (classification/regression) — highest weight
|
|
37
|
+
- Dataset size (log-scale comparison)
|
|
38
|
+
- Feature types (tabular/image/text)
|
|
39
|
+
- Class balance characteristics
|
|
40
|
+
- Dimensionality
|
|
41
|
+
|
|
42
|
+
6. **Project index** at `~/.turing/project_index.yaml` — local only, never uploaded
|
|
43
|
+
|
|
44
|
+
7. **If no similar projects found:** suggest running on more projects first or specifying one with `--from`
|
|
45
|
+
|
|
46
|
+
8. **Saved output:** report in `experiments/transfers/transfer-*.yaml`
|
|
47
|
+
|
|
48
|
+
## Examples
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
/turing:transfer # Search index for similar projects
|
|
52
|
+
/turing:transfer --from ~/projects/fraud-detection # Transfer from specific project
|
|
53
|
+
/turing:transfer --auto # Auto-queue hypotheses
|
|
54
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -48,6 +48,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
48
48
|
| "scale", "scaling law", "how much data", "is more data worth it", "power law", "data efficiency" | `/turing:scale` | Analyze |
|
|
49
49
|
| "budget", "compute budget", "how many experiments", "spending limit", "stop after" | `/turing:budget` | Manage |
|
|
50
50
|
| "distill", "compress", "smaller model", "student model", "knowledge distillation", "model compression" | `/turing:distill` | Deploy |
|
|
51
|
+
| "transfer", "what worked before", "similar project", "cross-project", "institutional knowledge", "prior projects" | `/turing:transfer` | Research |
|
|
52
|
+
| "audit", "methodology check", "pre-submission", "reviewer checklist", "data leakage", "missing baselines" | `/turing:audit` | Validate |
|
|
51
53
|
|
|
52
54
|
## Sub-commands
|
|
53
55
|
|
|
@@ -92,6 +94,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
92
94
|
| `/turing:scale [--axis]` | Scaling law estimator: fit power law, predict full-scale performance | (inline) |
|
|
93
95
|
| `/turing:budget <action>` | Compute budget manager: set limits, track allocation, auto-shift modes | (inline) |
|
|
94
96
|
| `/turing:distill <exp-id>` | Model compression: distill teacher into smaller student model | (inline) |
|
|
97
|
+
| `/turing:transfer [--from]` | Cross-project knowledge transfer: find similar prior projects, surface what worked | (inline) |
|
|
98
|
+
| `/turing:audit [--strict]` | Pre-submission methodology audit: data leakage, baselines, seeds, ablations, reproducibility | (inline) |
|
|
95
99
|
|
|
96
100
|
## Proactive Detection
|
|
97
101
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "2.5.0",
|
|
3
|
+
"version": "3.0.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/templates/scripts/generate_brief.py
CHANGED
|
@@ -355,6 +355,22 @@ def load_scaling_results(scaling_dir: str = "experiments/scaling") -> list[dict]
|
|
|
355
355
|
return reports
|
|
356
356
|
|
|
357
357
|
|
|
358
|
+
def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
|
|
359
|
+
"""Load the most recent audit report."""
|
|
360
|
+
path = Path(audit_dir)
|
|
361
|
+
if not path.exists():
|
|
362
|
+
return None
|
|
363
|
+
files = sorted(path.glob("audit-*.yaml"))
|
|
364
|
+
if not files:
|
|
365
|
+
return None
|
|
366
|
+
try:
|
|
367
|
+
with open(files[-1]) as f:
|
|
368
|
+
report = yaml.safe_load(f)
|
|
369
|
+
return report if isinstance(report, dict) else None
|
|
370
|
+
except (yaml.YAMLError, OSError):
|
|
371
|
+
return None
|
|
372
|
+
|
|
373
|
+
|
|
358
374
|
def format_brief(
|
|
359
375
|
campaign: dict,
|
|
360
376
|
best: dict | None,
|
|
@@ -376,6 +392,7 @@ def format_brief(
|
|
|
376
392
|
ensemble_results: list[dict] | None = None,
|
|
377
393
|
budget_status: dict | None = None,
|
|
378
394
|
scaling_results: list[dict] | None = None,
|
|
395
|
+
audit_report: dict | None = None,
|
|
379
396
|
) -> str:
|
|
380
397
|
"""Format the research briefing as markdown."""
|
|
381
398
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -635,6 +652,28 @@ def format_brief(
|
|
|
635
652
|
reason = verdict.get("reason", "")
|
|
636
653
|
lines.append(f"- **{v.upper()}**: {reason}")
|
|
637
654
|
|
|
655
|
+
# Methodology audit
|
|
656
|
+
if audit_report and audit_report.get("score"):
|
|
657
|
+
score = audit_report["score"]
|
|
658
|
+
verdict = audit_report.get("verdict", "?")
|
|
659
|
+
verdict_labels = {
|
|
660
|
+
"pass": "PASS",
|
|
661
|
+
"pass_with_warnings": "PASS (warnings)",
|
|
662
|
+
"needs_work": "NEEDS WORK",
|
|
663
|
+
"fail": "FAIL",
|
|
664
|
+
}
|
|
665
|
+
lines.extend(["", "## Methodology Audit", ""])
|
|
666
|
+
lines.append(
|
|
667
|
+
f"**{verdict_labels.get(verdict, verdict.upper())}** — "
|
|
668
|
+
f"{score.get('pass', 0)}/{score.get('checkable', 0)} checks passed, "
|
|
669
|
+
f"{score.get('fail', 0)} failure(s)"
|
|
670
|
+
)
|
|
671
|
+
actions = audit_report.get("actions", [])
|
|
672
|
+
if actions:
|
|
673
|
+
lines.append("")
|
|
674
|
+
for a in actions[:3]:
|
|
675
|
+
lines.append(f"- Fix: `{a['fix']}` ({a['check']})")
|
|
676
|
+
|
|
638
677
|
# Regression check history (stability)
|
|
639
678
|
if regression_checks:
|
|
640
679
|
lines.extend(["", "## Stability", ""])
|
|
@@ -728,6 +767,7 @@ def generate_brief(
|
|
|
728
767
|
ensemble_results = load_ensemble_results()
|
|
729
768
|
budget_status = load_budget_status(log_path=log_path)
|
|
730
769
|
scaling_results = load_scaling_results()
|
|
770
|
+
audit_report = load_audit_report()
|
|
731
771
|
|
|
732
772
|
return format_brief(
|
|
733
773
|
campaign, best, trajectory, model_types, hypotheses,
|
|
@@ -743,6 +783,7 @@ def generate_brief(
|
|
|
743
783
|
ensemble_results=ensemble_results if ensemble_results else None,
|
|
744
784
|
budget_status=budget_status,
|
|
745
785
|
scaling_results=scaling_results if scaling_results else None,
|
|
786
|
+
audit_report=audit_report,
|
|
746
787
|
)
|
|
747
788
|
|
|
748
789
|
|