claude-turing 2.5.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "turing",
3
- "version": "2.5.0",
4
- "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 39 commands, 2 specialized agents, scaling & efficiency (scaling laws + compute budget + model distillation), model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
3
+ "version": "3.0.0",
4
+ "description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 41 commands, 2 specialized agents, meta-intelligence (cross-project knowledge transfer + methodology audit), scaling & efficiency (scaling laws + compute budget + model distillation), model composition (ensemble + pipeline stitch + warm-start), deep analysis (experiment diff + live training monitor + regression gate), experiment orchestration (batch queue + smart retry + branching), literature integration + paper drafting, production model export, performance profiling, smart checkpoints, experiment intelligence, statistical rigor, tree-search hypothesis exploration, cost-performance frontier, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
5
5
  "author": {
6
6
  "name": "pragnition"
7
7
  },
package/README.md CHANGED
@@ -350,6 +350,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
350
350
  | `/turing:scale [--axis]` | Scaling law estimator — power-law fit, full-scale predictions, diminishing returns verdict |
351
351
  | `/turing:budget <action>` | Compute budget manager — set limits, track allocation, auto-shift explore/exploit |
352
352
  | `/turing:distill <exp-id>` | Model compression — distill teacher into smaller student with accuracy/size tradeoff |
353
+ | `/turing:transfer [--from]` | Cross-project knowledge transfer — find similar projects, surface what worked |
354
+ | `/turing:audit [--strict]` | Pre-submission methodology audit — data leakage, baselines, seeds, ablations, reproducibility |
353
355
 
354
356
  And for fully hands-off operation:
355
357
 
@@ -534,11 +536,11 @@ Each project gets independent config, data, experiments, models, and agent memor
534
536
 
535
537
  ## Architecture of Turing Itself
536
538
 
537
- 39 commands, 2 agents, 10 config files, 58 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
539
+ 41 commands, 2 agents, 10 config files, 60 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
538
540
 
539
541
  ```
540
542
  turing/
541
- ├── commands/ 38 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency)
543
+ ├── commands/ 40 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence)
542
544
  ├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
543
545
  ├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
544
546
  ├── templates/ Scaffolded into user projects by /turing:init
package/commands/audit/SKILL.md ADDED
@@ -0,0 +1,56 @@
1
+ ---
2
+ name: audit
3
+ description: Pre-submission methodology audit — catch data leakage, missing baselines, cherry-picked seeds, and incomplete ablations before a reviewer does.
4
+ disable-model-invocation: true
5
+ argument-hint: "[--strict] [--checklist neurips]"
6
+ allowed-tools: Read, Bash(*), Grep, Glob
7
+ ---
8
+
9
+ A reviewer checklist you run before submitting. Catches methodology mistakes that cause desk rejections.
10
+
11
+ ## Steps
12
+
13
+ 1. **Activate environment:**
14
+ ```bash
15
+ source .venv/bin/activate
16
+ ```
17
+
18
+ 2. **Parse arguments from `$ARGUMENTS`:**
19
+ - `--strict` — treat warnings as failures
20
+ - `--checklist neurips|icml|iclr` — add venue-specific checks
21
+ - `--json` — raw JSON output
22
+
23
+ 3. **Run methodology audit:**
24
+ ```bash
25
+ python scripts/methodology_audit.py $ARGUMENTS
26
+ ```
27
+
28
+ 4. **Checks performed:**
29
+ - **Data leakage** (critical): verify prepare.py/evaluate.py separation
30
+ - **CV strategy** (critical): verify appropriate cross-validation for data type
31
+ - **Seed sensitivity** (high): seed studies exist for best experiments
32
+ - **Ablation completeness** (high): ablation studies performed
33
+ - **Baseline comparison** (high): simple baselines in experiment log
34
+ - **Reproducibility** (high): best result successfully reproduced
35
+ - **Hyperparameter budget** (medium): total tuning cost documented
36
+ - **Regression stability** (medium): regression checks performed
37
+
38
+ 5. **Verdicts:**
39
+ - **PASS** — ready for submission
40
+ - **PASS (with warnings)** — address before submission
41
+ - **NEEDS WORK** — fix failures first
42
+ - **FAIL** — critical issues found
43
+
44
+ 6. **Actions:** each failure suggests the `/turing:` command to fix it
45
+
46
+ 7. **Venue checklists:** `--checklist neurips` adds NeurIPS-specific checks (broader impact, reproducibility checklist, code availability)
47
+
48
+ 8. **Saved output:** report in `experiments/audits/audit-YYYY-MM-DD.yaml`
49
+
50
+ ## Examples
51
+
52
+ ```
53
+ /turing:audit # Standard audit
54
+ /turing:audit --strict # Warnings become failures
55
+ /turing:audit --checklist neurips # NeurIPS submission checklist
56
+ ```
package/commands/transfer/SKILL.md ADDED
@@ -0,0 +1,54 @@
1
+ ---
2
+ name: transfer
3
+ description: Cross-project knowledge transfer — find similar prior projects and surface what worked. Builds institutional ML memory.
4
+ disable-model-invocation: true
5
+ argument-hint: "[--from project-path] [--auto]"
6
+ allowed-tools: Read, Bash(*), Grep, Glob
7
+ ---
8
+
9
+ Find similar prior projects and surface what worked. "Last time you had tabular classification with class imbalance, LightGBM beat everything by 3%."
10
+
11
+ ## Steps
12
+
13
+ 1. **Activate environment:**
14
+ ```bash
15
+ source .venv/bin/activate
16
+ ```
17
+
18
+ 2. **Parse arguments from `$ARGUMENTS`:**
19
+ - `--from ~/projects/fraud-detection` — transfer from a specific project
20
+ - `--auto` — auto-queue hypotheses from recommendations
21
+ - `--index ~/.turing/project_index.yaml` — custom index path
22
+ - `--json` — raw JSON output
23
+
24
+ 3. **Run knowledge transfer:**
25
+ ```bash
26
+ python scripts/knowledge_transfer.py $ARGUMENTS
27
+ ```
28
+
29
+ 4. **Report includes:**
30
+ - Similar prior projects ranked by similarity score
31
+ - Per project: task type, winner model, key insights
32
+ - Suggested hypotheses from winning strategies
33
+ - Auto-queued hypotheses (with `--auto`)
34
+
35
+ 5. **Similarity matching** uses:
36
+ - Task type (classification/regression) — highest weight
37
+ - Dataset size (log-scale comparison)
38
+ - Feature types (tabular/image/text)
39
+ - Class balance characteristics
40
+ - Dimensionality
41
+
42
+ 6. **Project index** at `~/.turing/project_index.yaml` — local only, never uploaded
43
+
44
+ 7. **If no similar projects are found:** suggest running Turing in more projects first (to grow the index), or specifying a source project explicitly with `--from`
45
+
46
+ 8. **Saved output:** report in `experiments/transfers/transfer-*.yaml`
47
+
48
+ ## Examples
49
+
50
+ ```
51
+ /turing:transfer # Search index for similar projects
52
+ /turing:transfer --from ~/projects/fraud-detection # Transfer from specific project
53
+ /turing:transfer --auto # Auto-queue hypotheses
54
+ ```
@@ -48,6 +48,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
48
48
  | "scale", "scaling law", "how much data", "is more data worth it", "power law", "data efficiency" | `/turing:scale` | Analyze |
49
49
  | "budget", "compute budget", "how many experiments", "spending limit", "stop after" | `/turing:budget` | Manage |
50
50
  | "distill", "compress", "smaller model", "student model", "knowledge distillation", "model compression" | `/turing:distill` | Deploy |
51
+ | "transfer", "what worked before", "similar project", "cross-project", "institutional knowledge", "prior projects" | `/turing:transfer` | Research |
52
+ | "audit", "methodology check", "pre-submission", "reviewer checklist", "data leakage", "missing baselines" | `/turing:audit` | Validate |
51
53
 
52
54
  ## Sub-commands
53
55
 
@@ -92,6 +94,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
92
94
  | `/turing:scale [--axis]` | Scaling law estimator: fit power law, predict full-scale performance | (inline) |
93
95
  | `/turing:budget <action>` | Compute budget manager: set limits, track allocation, auto-shift modes | (inline) |
94
96
  | `/turing:distill <exp-id>` | Model compression: distill teacher into smaller student model | (inline) |
97
+ | `/turing:transfer [--from]` | Cross-project knowledge transfer: find similar prior projects, surface what worked | (inline) |
98
+ | `/turing:audit [--strict]` | Pre-submission methodology audit: data leakage, baselines, seeds, ablations, reproducibility | (inline) |
95
99
 
96
100
  ## Proactive Detection
97
101
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-turing",
3
- "version": "2.5.0",
3
+ "version": "3.0.0",
4
4
  "type": "module",
5
5
  "description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
6
6
  "bin": {
package/src/install.js CHANGED
@@ -28,6 +28,7 @@ const SUB_COMMANDS = [
28
28
  "diff", "watch", "regress",
29
29
  "ensemble", "stitch", "warm",
30
30
  "scale", "budget", "distill",
31
+ "transfer", "audit",
31
32
  ];
32
33
 
33
34
  export async function install(opts = {}) {
package/src/verify.js CHANGED
@@ -53,6 +53,8 @@ const EXPECTED_COMMANDS = [
53
53
  "scale/SKILL.md",
54
54
  "budget/SKILL.md",
55
55
  "distill/SKILL.md",
56
+ "transfer/SKILL.md",
57
+ "audit/SKILL.md",
56
58
  ];
57
59
 
58
60
  const EXPECTED_AGENTS = ["ml-researcher.md", "ml-evaluator.md"];
@@ -355,6 +355,22 @@ def load_scaling_results(scaling_dir: str = "experiments/scaling") -> list[dict]
355
355
  return reports
356
356
 
357
357
 
358
+ def load_audit_report(audit_dir: str = "experiments/audits") -> dict | None:
359
+ """Load the most recent audit report."""
360
+ path = Path(audit_dir)
361
+ if not path.exists():
362
+ return None
363
+ files = sorted(path.glob("audit-*.yaml"))
364
+ if not files:
365
+ return None
366
+ try:
367
+ with open(files[-1]) as f:
368
+ report = yaml.safe_load(f)
369
+ return report if isinstance(report, dict) else None
370
+ except (yaml.YAMLError, OSError):
371
+ return None
372
+
373
+
358
374
  def format_brief(
359
375
  campaign: dict,
360
376
  best: dict | None,
@@ -376,6 +392,7 @@ def format_brief(
376
392
  ensemble_results: list[dict] | None = None,
377
393
  budget_status: dict | None = None,
378
394
  scaling_results: list[dict] | None = None,
395
+ audit_report: dict | None = None,
379
396
  ) -> str:
380
397
  """Format the research briefing as markdown."""
381
398
  direction = "lower" if lower_is_better else "higher"
@@ -635,6 +652,28 @@ def format_brief(
635
652
  reason = verdict.get("reason", "")
636
653
  lines.append(f"- **{v.upper()}**: {reason}")
637
654
 
655
+ # Methodology audit
656
+ if audit_report and audit_report.get("score"):
657
+ score = audit_report["score"]
658
+ verdict = audit_report.get("verdict", "?")
659
+ verdict_labels = {
660
+ "pass": "PASS",
661
+ "pass_with_warnings": "PASS (warnings)",
662
+ "needs_work": "NEEDS WORK",
663
+ "fail": "FAIL",
664
+ }
665
+ lines.extend(["", "## Methodology Audit", ""])
666
+ lines.append(
667
+ f"**{verdict_labels.get(verdict, verdict.upper())}** — "
668
+ f"{score.get('pass', 0)}/{score.get('checkable', 0)} checks passed, "
669
+ f"{score.get('fail', 0)} failure(s)"
670
+ )
671
+ actions = audit_report.get("actions", [])
672
+ if actions:
673
+ lines.append("")
674
+ for a in actions[:3]:
675
+ lines.append(f"- Fix: `{a['fix']}` ({a['check']})")
676
+
638
677
  # Regression check history (stability)
639
678
  if regression_checks:
640
679
  lines.extend(["", "## Stability", ""])
@@ -728,6 +767,7 @@ def generate_brief(
728
767
  ensemble_results = load_ensemble_results()
729
768
  budget_status = load_budget_status(log_path=log_path)
730
769
  scaling_results = load_scaling_results()
770
+ audit_report = load_audit_report()
731
771
 
732
772
  return format_brief(
733
773
  campaign, best, trajectory, model_types, hypotheses,
@@ -743,6 +783,7 @@ def generate_brief(
743
783
  ensemble_results=ensemble_results if ensemble_results else None,
744
784
  budget_status=budget_status,
745
785
  scaling_results=scaling_results if scaling_results else None,
786
+ audit_report=audit_report,
746
787
  )
747
788
 
748
789