npm - claude-turing - Versions diffs - 4.4.0 → 4.6.0 - Mend

claude-turing 4.4.0 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)

package/bin/cli.js (CHANGED)

@@ -1,9 +1,25 @@
 #!/usr/bin/env node
 import { createRequire } from "module";
+import { realpathSync } from "fs";
+import { fileURLToPath } from "url";
 const require = createRequire(import.meta.url);
 const { Command } = require("commander");
 const pkg = require("../package.json");
+export function buildInitArgs(name, dir) {
+  return [name, dir].filter(Boolean);
+}
+function isDirectRun() {
+  if (!process.argv[1]) return false;
+  try {
+    return realpathSync(fileURLToPath(import.meta.url)) === realpathSync(process.argv[1]);
+  } catch {
+    return false;
+  }
+}
 const program = new Command();
 program
@@ -34,13 +50,16 @@ program
   .command("init [name] [dir]")
   .description("Scaffold ML project (CLI mode, non-Claude-Code usage)")
   .action(async (name, dir) => {
-    const { execSync } = await import("child_process");
+    const { spawnSync } = await import("child_process");
     const { dirname, join } = await import("path");
     const { fileURLToPath } = await import("url");
     const __dirname = dirname(fileURLToPath(import.meta.url));
     const script = join(__dirname, "turing-init.sh");
-    const args = [name, dir].filter(Boolean).join(" ");
-    execSync(`bash "${script}" ${args}`, { stdio: "inherit" });
+    const args = buildInitArgs(name, dir);
+    const result = spawnSync("bash", [script, ...args], { stdio: "inherit" });
+    process.exit(result.status ?? 1);
   });
-program.parse();
+if (isDirectRun()) {
+  program.parse();
+}

package/commands/doctor.md (CHANGED)

@@ -21,6 +21,7 @@ Is Turing healthy? Check everything and get a score.
 - **Scripts:** train.py, prepare.py, evaluate.py exist and parse
 - **Disk space:** warn if <1GB free
 - **Git state:** uncommitted changes to critical files
+- **Claude hooks:** `.claude/settings.local.json` hook group schema; `--fix` migrates legacy bare command hooks
 ## Examples
 ```

package/commands/init.md (CHANGED)

@@ -37,18 +37,36 @@ python3 <templates_dir>/scripts/scaffold.py \
 The scaffold script handles everything in a single atomic operation:
 - Copies all template files with placeholder substitution
 - Creates data/, experiments/, models/ directories
-- Sets up agent memory at `.claude/agent-memory/ml-researcher/MEMORY.md`
+- Sets up agent memory at `.claude/agent-memory/ml-researcher-{project_name}/MEMORY.md`
 - Configures Claude Code hooks in `.claude/settings.local.json`
 - Creates Python virtual environment and installs requirements
 - Verifies all placeholders were replaced (fails loudly if any remain)
 ## Locating Templates
-Find the templates directory using Glob:
+Use the installed command-pack templates directory first:
+```
+.claude/commands/turing/templates/
+~/.claude/commands/turing/templates/
+```
+Then fall back to plugin or npm locations:
 ```
 ~/.claude/plugins/*/templates/
+node_modules/claude-turing/templates/
+```
+Example command:
+```bash
+python3 ~/.claude/commands/turing/templates/scripts/scaffold.py \
+    --project-name "<project_name>" \
+    --target-metric "<target_metric>" \
+    --metric-direction "<metric_direction>" \
+    --task-description "<task_description>" \
+    --ml-dir "<ml_dir>" \
+    --data-source "<data_source>" \
+    --templates-dir ~/.claude/commands/turing/templates
 ```
-Or check if installed via npm by looking for `node_modules/claude-turing/templates/`.
 ## After Scaffolding

package/commands/turing.md (CHANGED)

@@ -1,9 +1,17 @@
 ---
 name: turing
-description: Autonomous ML research harness. Thin router that detects ML training intent and dispatches to focused sub-commands. Each sub-command handles one phase of the experiment lifecycle.
+description: Autonomous ML research harness. Thin router that detects ML training intent and identifies the matching Turing sub-command execution path. Each sub-command handles one phase of the experiment lifecycle.
 ---
-You are the Turing ML research router. Detect the user's intent and route to the appropriate sub-command. Do not attempt to handle ML tasks directly — dispatch to the focused skill.
+You are the Turing ML research router. Detect the user's intent and identify the matching Turing sub-command execution path.
+## Execution Contract
+Turing sub-commands are explicit slash-command skills. Current sub-commands are `slash_only` and use `disable-model-invocation: true`, so router handling must not claim model dispatch into those skills.
+- If the user explicitly invokes `/turing:<cmd>`, Claude Code runtime handles that slash command.
+- If the user invokes `/turing` as a router and the detected command is `slash_only`, give the exact slash command to run.
+- If a command has a documented safe equivalent script, the assistant may execute those documented steps inline when safe and appropriate.
 ## Routing Table
@@ -86,82 +94,82 @@ You are the Turing ML research router. Detect the user's intent and route to the
 ## Sub-commands
-| Command | Purpose | Agent |
+| Command | Purpose | Invocation |
 |---|---|---|
-| `/turing:train [ml/project] [N]` | Run the autonomous experiment loop (auto-detects project from path or cwd) | @ml-researcher |
-| `/turing:status` | Show experiment status, best model, convergence | @ml-evaluator |
-| `/turing:compare <a> <b>` | Side-by-side experiment comparison | @ml-evaluator |
-| `/turing:sweep` | Generate and run hyperparameter sweep | @ml-researcher |
-| `/turing:try <hypothesis>` | Inject a hypothesis into the agent's queue | (inline) |
-| `/turing:brief` | Generate structured research intelligence report | @ml-evaluator |
-| `/turing:init` | Scaffold a new ML project | (inline) |
-| `/turing:validate` | Check metric stability, auto-fix if noisy | (inline) |
-| `/turing:seed [N] [--quick]` | Multi-seed study: mean/std/CI, flag seed-sensitive results | (inline) |
-| `/turing:reproduce <exp-id>` | Reproducibility verification with tolerance checking | (inline) |
-| `/turing:suggest` | Literature-grounded model architecture suggestions | (inline, uses WebSearch) |
-| `/turing:explore` | Tree-search hypothesis exploration via AB-MCTS | (inline) |
-| `/turing:design <hyp-id>` | Generate structured experiment design from hypothesis | (inline, uses WebSearch) |
-| `/turing:logbook` | HTML/markdown logbook with trajectory chart | (inline) |
-| `/turing:poster` | Single-page HTML research poster | (inline) |
-| `/turing:report` | Structured markdown research report | (inline) |
-| `/turing:mode <mode>` | Set research strategy (explore/exploit/replicate) | (inline) |
-| `/turing:preflight` | Pre-flight resource check (VRAM/RAM/disk) | (inline) |
-| `/turing:card` | Generate standardized model card (type, performance, data, limitations, contract) | (inline) |
-| `/turing:diagnose [exp-id]` | Error analysis: failure modes, confused pairs, feature-range bias | (inline) |
-| `/turing:ablate [--components]` | Ablation study: remove components, measure impact, flag dead weight | (inline) |
-| `/turing:frontier [--metrics]` | Pareto frontier: multi-objective tradeoff visualization | (inline) |
-| `/turing:lit <query>` | Literature search: papers, SOTA baselines, related work | (inline, uses WebSearch) |
-| `/turing:paper [--sections] [--format]` | Draft paper sections from experiment logs (setup, results, ablation, hyperparams) | (inline) |
-| `/turing:export [exp-id] [--format]` | Export model to production format with equivalence check + latency benchmark | (inline) |
-| `/turing:queue <action>` | Batch experiment scheduler: add, list, run, pause, clear | (inline) |
-| `/turing:retry <exp-id>` | Smart failure recovery: auto-diagnose crash, apply fix, re-run | (inline) |
-| `/turing:fork <exp-id> --branches` | Experiment branching: run parallel tracks, report winner | (inline) |
-| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | (inline) |
-| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | (inline) |
-| `/turing:diff <exp-a> <exp-b>` | Deep experiment comparison: config diff, metric significance, per-class regressions, curve divergence | (inline) |
-| `/turing:watch [--analyze]` | Live training monitor with early-warning alerts (loss spike, NaN, overfitting, plateau) | (inline) |
-| `/turing:regress [--tolerance]` | Performance regression gate: re-run best experiment, verify metrics haven't degraded | (inline) |
-| `/turing:ensemble [--top-k] [--methods]` | Automated ensemble: voting, weighted voting, stacking, blending from top-K models | (inline) |
-| `/turing:stitch <action> [stage]` | Pipeline composition: show/swap/cache/run stages independently | (inline) |
-| `/turing:warm <exp-id>` | Warm-start from prior model: load checkpoint, freeze layers, adjust LR | (inline) |
-| `/turing:scale [--axis]` | Scaling law estimator: fit power law, predict full-scale performance | (inline) |
-| `/turing:budget <action>` | Compute budget manager: set limits, track allocation, auto-shift modes | (inline) |
-| `/turing:distill <exp-id>` | Model compression: distill teacher into smaller student model | (inline) |
-| `/turing:transfer [--from]` | Cross-project knowledge transfer: find similar prior projects, surface what worked | (inline) |
-| `/turing:audit [--strict]` | Pre-submission methodology audit: data leakage, baselines, seeds, ablations, reproducibility | (inline) |
-| `/turing:sanity [--quick]` | Pre-training sanity checks: initial loss, overfit test, gradient flow, output validation | (inline) |
-| `/turing:baseline [--methods]` | Automatic baseline generation: random, majority/mean, linear, k-NN | (inline) |
-| `/turing:leak [--deep]` | Targeted leakage detection: single-feature tests, correlation, train/test overlap | (inline) |
-| `/turing:xray [exp-id]` | Internal model diagnostics: gradient flow, dead neurons, weight distributions, tree analysis | (inline) |
-| `/turing:sensitivity [exp-id]` | Hyperparameter sensitivity analysis: rank parameters by impact, detect non-monotonic responses | (inline) |
-| `/turing:calibrate [exp-id]` | Probability calibration: ECE/MCE, reliability diagrams, Platt/isotonic/temperature scaling | (inline) |
-| `/turing:feature [--method]` | Automated feature selection: multi-method consensus ranking, redundancy, interaction generation | (inline) |
-| `/turing:curriculum [exp-id]` | Training curriculum optimization: difficulty scoring, strategy comparison, impossible sample detection | (inline) |
-| `/turing:prune <exp-id>` | Weight pruning: magnitude/structured/lottery, sparsity sweep, knee point detection | (inline) |
-| `/turing:quantize <exp-id>` | Post-training quantization: FP16/INT8, accuracy-latency comparison, QAT suggestion | (inline) |
-| `/turing:merge <exp-ids...>` | Model merging: uniform/greedy soup, TIES, DARE — free accuracy, zero latency cost | (inline) |
-| `/turing:surgery <exp-id>` | Architecture modification: add/remove layer, widen/narrow, swap activation, skip connections | (inline) |
-| `/turing:trend` | Long-term trend analysis: improvement velocity, family ROI, diminishing returns detection | (inline) |
-| `/turing:flashback` | Session context restoration: "where was I?" after days away from the project | (inline) |
-| `/turing:archive` | Experiment lifecycle cleanup: compress old artifacts, prune checkpoints, summary index | (inline) |
-| `/turing:annotate <exp-id>` | Retrospective annotations: add human notes, tags, search by content | (inline) |
-| `/turing:search <query>` | Natural language experiment search with structured filters | (inline) |
-| `/turing:template <action>` | Experiment template library: save/list/apply reusable configs across projects | (inline) |
-| `/turing:replay <exp-id>` | Experiment replay: re-run old experiment with current infrastructure | (inline) |
-| `/turing:cite <action>` | Citation manager: add/list/check/bib for papers, datasets, methods | (inline) |
-| `/turing:present [--figures]` | Presentation figures: training curves, comparisons, ablation, Pareto, sensitivity | (inline) |
-| `/turing:changelog [--audience]` | Model changelog: version-grouped improvements for technical or stakeholder audiences | (inline) |
-| `/turing:onboard [--audience]` | Project onboarding: full walkthrough for new collaborators | (inline) |
-| `/turing:share <exp-ids...>` | Experiment packaging: portable archive with manifest and README | (inline) |
-| `/turing:review [--venue]` | Peer review simulation: weaknesses, questions, fix commands, score | (inline) |
-| `/turing:whatif "<question>"` | What-if analysis: route hypotheticals to existing estimators (scaling, ablation, sensitivity, ensemble, pruning) | (inline) |
-| `/turing:counterfactual <exp-id> --sample <index>` | Input-level counterfactual explanations: minimum input change to flip a prediction | (inline) |
-| `/turing:simulate [--configs] [--top-k]` | Experiment outcome prediction: pre-filter configs using surrogate model, save budget | (inline) |
-| `/turing:update <exp-id> --new-data <path>` | Incremental model update: add new data without full retraining, forgetting detection | (inline) |
-| `/turing:registry [list\|register\|promote\|demote\|history]` | Model registry: stage lifecycle (candidate → staging → production) with promotion gates | (inline) |
-| `/turing:postmortem [--window N]` | Failure postmortem: diagnose why experiments stopped improving (exhaustion, config error, data issue, ceiling, noise) | (inline) |
-| `/turing:doctor [--fix]` | Harness self-diagnosis: environment, dependencies, config, log integrity, scripts, disk, git state | (inline) |
-| `/turing:plan [--budget N] [--goal]` | Research planning assistant: strategic campaign design with budget-aware ROI allocation | (inline) |
+| `/turing:train [ml/project] [N]` | Run the autonomous experiment loop (auto-detects project from path or cwd) | slash_only |
+| `/turing:status` | Show experiment status, best model, convergence | slash_only |
+| `/turing:compare <a> <b>` | Side-by-side experiment comparison | slash_only |
+| `/turing:sweep` | Generate and run hyperparameter sweep | slash_only |
+| `/turing:try <hypothesis>` | Inject a hypothesis into the agent's queue | slash_only |
+| `/turing:brief` | Generate structured research intelligence report | slash_only |
+| `/turing:init` | Scaffold a new ML project | slash_only |
+| `/turing:validate` | Check metric stability, auto-fix if noisy | slash_only |
+| `/turing:seed [N] [--quick]` | Multi-seed study: mean/std/CI, flag seed-sensitive results | slash_only |
+| `/turing:reproduce <exp-id>` | Reproducibility verification with tolerance checking | slash_only |
+| `/turing:suggest` | Literature-grounded model architecture suggestions | slash_only |
+| `/turing:explore` | Tree-search hypothesis exploration via AB-MCTS | slash_only |
+| `/turing:design <hyp-id>` | Generate structured experiment design from hypothesis | slash_only |
+| `/turing:logbook` | HTML/markdown logbook with trajectory chart | slash_only |
+| `/turing:poster` | Single-page HTML research poster | slash_only |
+| `/turing:report` | Structured markdown research report | slash_only |
+| `/turing:mode <mode>` | Set research strategy (explore/exploit/replicate) | slash_only |
+| `/turing:preflight` | Pre-flight resource check (VRAM/RAM/disk) | slash_only |
+| `/turing:card` | Generate standardized model card (type, performance, data, limitations, contract) | slash_only |
+| `/turing:diagnose [exp-id]` | Error analysis: failure modes, confused pairs, feature-range bias | slash_only |
+| `/turing:ablate [--components]` | Ablation study: remove components, measure impact, flag dead weight | slash_only |
+| `/turing:frontier [--metrics]` | Pareto frontier: multi-objective tradeoff visualization | slash_only |
+| `/turing:lit <query>` | Literature search: papers, SOTA baselines, related work | slash_only |
+| `/turing:paper [--sections] [--format]` | Draft paper sections from experiment logs (setup, results, ablation, hyperparams) | slash_only |
+| `/turing:export [exp-id] [--format]` | Export model to production format with equivalence check + latency benchmark | slash_only |
+| `/turing:queue <action>` | Batch experiment scheduler: add, list, run, pause, clear | slash_only |
+| `/turing:retry <exp-id>` | Smart failure recovery: auto-diagnose crash, apply fix, re-run | slash_only |
+| `/turing:fork <exp-id> --branches` | Experiment branching: run parallel tracks, report winner | slash_only |
+| `/turing:profile [exp-id]` | Computational profiling: timing, memory, throughput, bottleneck detection | slash_only |
+| `/turing:checkpoint <action>` | Smart checkpoint management: list, prune (Pareto), average, resume, stats | slash_only |
+| `/turing:diff <exp-a> <exp-b>` | Deep experiment comparison: config diff, metric significance, per-class regressions, curve divergence | slash_only |
+| `/turing:watch [--analyze]` | Live training monitor with early-warning alerts (loss spike, NaN, overfitting, plateau) | slash_only |
+| `/turing:regress [--tolerance]` | Performance regression gate: re-run best experiment, verify metrics haven't degraded | slash_only |
+| `/turing:ensemble [--top-k] [--methods]` | Automated ensemble: voting, weighted voting, stacking, blending from top-K models | slash_only |
+| `/turing:stitch <action> [stage]` | Pipeline composition: show/swap/cache/run stages independently | slash_only |
+| `/turing:warm <exp-id>` | Warm-start from prior model: load checkpoint, freeze layers, adjust LR | slash_only |
+| `/turing:scale [--axis]` | Scaling law estimator: fit power law, predict full-scale performance | slash_only |
+| `/turing:budget <action>` | Compute budget manager: set limits, track allocation, auto-shift modes | slash_only |
+| `/turing:distill <exp-id>` | Model compression: distill teacher into smaller student model | slash_only |
+| `/turing:transfer [--from]` | Cross-project knowledge transfer: find similar prior projects, surface what worked | slash_only |
+| `/turing:audit [--strict]` | Pre-submission methodology audit: data leakage, baselines, seeds, ablations, reproducibility | slash_only |
+| `/turing:sanity [--quick]` | Pre-training sanity checks: initial loss, overfit test, gradient flow, output validation | slash_only |
+| `/turing:baseline [--methods]` | Automatic baseline generation: random, majority/mean, linear, k-NN | slash_only |
+| `/turing:leak [--deep]` | Targeted leakage detection: single-feature tests, correlation, train/test overlap | slash_only |
+| `/turing:xray [exp-id]` | Internal model diagnostics: gradient flow, dead neurons, weight distributions, tree analysis | slash_only |
+| `/turing:sensitivity [exp-id]` | Hyperparameter sensitivity analysis: rank parameters by impact, detect non-monotonic responses | slash_only |
+| `/turing:calibrate [exp-id]` | Probability calibration: ECE/MCE, reliability diagrams, Platt/isotonic/temperature scaling | slash_only |
+| `/turing:feature [--method]` | Automated feature selection: multi-method consensus ranking, redundancy, interaction generation | slash_only |
+| `/turing:curriculum [exp-id]` | Training curriculum optimization: difficulty scoring, strategy comparison, impossible sample detection | slash_only |
+| `/turing:prune <exp-id>` | Weight pruning: magnitude/structured/lottery, sparsity sweep, knee point detection | slash_only |
+| `/turing:quantize <exp-id>` | Post-training quantization: FP16/INT8, accuracy-latency comparison, QAT suggestion | slash_only |
+| `/turing:merge <exp-ids...>` | Model merging: uniform/greedy soup, TIES, DARE — free accuracy, zero latency cost | slash_only |
+| `/turing:surgery <exp-id>` | Architecture modification: add/remove layer, widen/narrow, swap activation, skip connections | slash_only |
+| `/turing:trend` | Long-term trend analysis: improvement velocity, family ROI, diminishing returns detection | slash_only |
+| `/turing:flashback` | Session context restoration: "where was I?" after days away from the project | slash_only |
+| `/turing:archive` | Experiment lifecycle cleanup: compress old artifacts, prune checkpoints, summary index | slash_only |
+| `/turing:annotate <exp-id>` | Retrospective annotations: add human notes, tags, search by content | slash_only |
+| `/turing:search <query>` | Natural language experiment search with structured filters | slash_only |
+| `/turing:template <action>` | Experiment template library: save/list/apply reusable configs across projects | slash_only |
+| `/turing:replay <exp-id>` | Experiment replay: re-run old experiment with current infrastructure | slash_only |
+| `/turing:cite <action>` | Citation manager: add/list/check/bib for papers, datasets, methods | slash_only |
+| `/turing:present [--figures]` | Presentation figures: training curves, comparisons, ablation, Pareto, sensitivity | slash_only |
+| `/turing:changelog [--audience]` | Model changelog: version-grouped improvements for technical or stakeholder audiences | slash_only |
+| `/turing:onboard [--audience]` | Project onboarding: full walkthrough for new collaborators | slash_only |
+| `/turing:share <exp-ids...>` | Experiment packaging: portable archive with manifest and README | slash_only |
+| `/turing:review [--venue]` | Peer review simulation: weaknesses, questions, fix commands, score | slash_only |
+| `/turing:whatif "<question>"` | What-if analysis: route hypotheticals to existing estimators (scaling, ablation, sensitivity, ensemble, pruning) | slash_only |
+| `/turing:counterfactual <exp-id> --sample <index>` | Input-level counterfactual explanations: minimum input change to flip a prediction | slash_only |
+| `/turing:simulate [--configs] [--top-k]` | Experiment outcome prediction: pre-filter configs using surrogate model, save budget | slash_only |
+| `/turing:update <exp-id> --new-data <path>` | Incremental model update: add new data without full retraining, forgetting detection | slash_only |
+| `/turing:registry [list\|register\|promote\|demote\|history]` | Model registry: stage lifecycle (candidate → staging → production) with promotion gates | slash_only |
+| `/turing:postmortem [--window N]` | Failure postmortem: diagnose why experiments stopped improving (exhaustion, config error, data issue, ceiling, noise) | slash_only |
+| `/turing:doctor [--fix]` | Harness self-diagnosis: environment, dependencies, config, log integrity, scripts, disk, git state, Claude hooks | slash_only |
+| `/turing:plan [--budget N] [--goal]` | Research planning assistant: strategic campaign design with budget-aware ROI allocation | slash_only |
 ## Proactive Detection