claude-turing 4.7.0 → 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +1 -1
  3. package/agents/ml-evaluator.md +4 -4
  4. package/agents/ml-researcher.md +2 -2
  5. package/bin/turing-init.sh +2 -2
  6. package/commands/ablate.md +3 -4
  7. package/commands/annotate.md +2 -3
  8. package/commands/archive.md +2 -3
  9. package/commands/audit.md +3 -4
  10. package/commands/baseline.md +3 -4
  11. package/commands/brief.md +5 -6
  12. package/commands/budget.md +3 -4
  13. package/commands/calibrate.md +3 -4
  14. package/commands/card.md +3 -4
  15. package/commands/changelog.md +2 -3
  16. package/commands/checkpoint.md +3 -4
  17. package/commands/cite.md +2 -3
  18. package/commands/compare.md +1 -2
  19. package/commands/counterfactual.md +2 -3
  20. package/commands/curriculum.md +3 -4
  21. package/commands/design.md +3 -4
  22. package/commands/diagnose.md +4 -5
  23. package/commands/diff.md +3 -4
  24. package/commands/distill.md +3 -4
  25. package/commands/doctor.md +2 -3
  26. package/commands/ensemble.md +3 -4
  27. package/commands/explore.md +4 -5
  28. package/commands/export.md +3 -4
  29. package/commands/feature.md +3 -4
  30. package/commands/flashback.md +2 -3
  31. package/commands/fork.md +3 -4
  32. package/commands/frontier.md +3 -4
  33. package/commands/init.md +5 -6
  34. package/commands/leak.md +3 -4
  35. package/commands/lit.md +3 -4
  36. package/commands/logbook.md +5 -6
  37. package/commands/merge.md +2 -3
  38. package/commands/mode.md +1 -2
  39. package/commands/onboard.md +2 -3
  40. package/commands/paper.md +3 -4
  41. package/commands/plan.md +2 -3
  42. package/commands/poster.md +3 -4
  43. package/commands/postmortem.md +2 -3
  44. package/commands/preflight.md +5 -6
  45. package/commands/present.md +2 -3
  46. package/commands/profile.md +3 -4
  47. package/commands/prune.md +2 -3
  48. package/commands/quantize.md +2 -3
  49. package/commands/queue.md +3 -4
  50. package/commands/registry.md +2 -3
  51. package/commands/regress.md +3 -4
  52. package/commands/replay.md +2 -3
  53. package/commands/report.md +3 -4
  54. package/commands/reproduce.md +3 -4
  55. package/commands/retry.md +3 -4
  56. package/commands/review.md +2 -3
  57. package/commands/rules/loop-protocol.md +11 -11
  58. package/commands/sanity.md +3 -4
  59. package/commands/scale.md +4 -5
  60. package/commands/search.md +2 -3
  61. package/commands/seed.md +3 -4
  62. package/commands/sensitivity.md +3 -4
  63. package/commands/share.md +2 -3
  64. package/commands/simulate.md +2 -3
  65. package/commands/status.md +1 -2
  66. package/commands/stitch.md +3 -4
  67. package/commands/suggest.md +5 -6
  68. package/commands/surgery.md +2 -3
  69. package/commands/sweep.md +8 -9
  70. package/commands/template.md +2 -3
  71. package/commands/train.md +5 -6
  72. package/commands/transfer.md +3 -4
  73. package/commands/trend.md +2 -3
  74. package/commands/try.md +4 -5
  75. package/commands/turing.md +3 -3
  76. package/commands/update.md +2 -3
  77. package/commands/validate.md +4 -5
  78. package/commands/warm.md +3 -4
  79. package/commands/watch.md +4 -5
  80. package/commands/whatif.md +2 -3
  81. package/commands/xray.md +3 -4
  82. package/config/commands.yaml +75 -75
  83. package/package.json +3 -2
  84. package/skills/turing/SKILL.md +3 -3
  85. package/skills/turing/ablate/SKILL.md +3 -4
  86. package/skills/turing/annotate/SKILL.md +2 -3
  87. package/skills/turing/archive/SKILL.md +2 -3
  88. package/skills/turing/audit/SKILL.md +3 -4
  89. package/skills/turing/baseline/SKILL.md +3 -4
  90. package/skills/turing/brief/SKILL.md +5 -6
  91. package/skills/turing/budget/SKILL.md +3 -4
  92. package/skills/turing/calibrate/SKILL.md +3 -4
  93. package/skills/turing/card/SKILL.md +3 -4
  94. package/skills/turing/changelog/SKILL.md +2 -3
  95. package/skills/turing/checkpoint/SKILL.md +3 -4
  96. package/skills/turing/cite/SKILL.md +2 -3
  97. package/skills/turing/compare/SKILL.md +1 -2
  98. package/skills/turing/counterfactual/SKILL.md +2 -3
  99. package/skills/turing/curriculum/SKILL.md +3 -4
  100. package/skills/turing/design/SKILL.md +3 -4
  101. package/skills/turing/diagnose/SKILL.md +4 -5
  102. package/skills/turing/diff/SKILL.md +3 -4
  103. package/skills/turing/distill/SKILL.md +3 -4
  104. package/skills/turing/doctor/SKILL.md +2 -3
  105. package/skills/turing/ensemble/SKILL.md +3 -4
  106. package/skills/turing/explore/SKILL.md +4 -5
  107. package/skills/turing/export/SKILL.md +3 -4
  108. package/skills/turing/feature/SKILL.md +3 -4
  109. package/skills/turing/flashback/SKILL.md +2 -3
  110. package/skills/turing/fork/SKILL.md +3 -4
  111. package/skills/turing/frontier/SKILL.md +3 -4
  112. package/skills/turing/init/SKILL.md +5 -6
  113. package/skills/turing/leak/SKILL.md +3 -4
  114. package/skills/turing/lit/SKILL.md +3 -4
  115. package/skills/turing/logbook/SKILL.md +5 -6
  116. package/skills/turing/merge/SKILL.md +2 -3
  117. package/skills/turing/mode/SKILL.md +1 -2
  118. package/skills/turing/onboard/SKILL.md +2 -3
  119. package/skills/turing/paper/SKILL.md +3 -4
  120. package/skills/turing/plan/SKILL.md +2 -3
  121. package/skills/turing/poster/SKILL.md +3 -4
  122. package/skills/turing/postmortem/SKILL.md +2 -3
  123. package/skills/turing/preflight/SKILL.md +5 -6
  124. package/skills/turing/present/SKILL.md +2 -3
  125. package/skills/turing/profile/SKILL.md +3 -4
  126. package/skills/turing/prune/SKILL.md +2 -3
  127. package/skills/turing/quantize/SKILL.md +2 -3
  128. package/skills/turing/queue/SKILL.md +3 -4
  129. package/skills/turing/registry/SKILL.md +2 -3
  130. package/skills/turing/regress/SKILL.md +3 -4
  131. package/skills/turing/replay/SKILL.md +2 -3
  132. package/skills/turing/report/SKILL.md +3 -4
  133. package/skills/turing/reproduce/SKILL.md +3 -4
  134. package/skills/turing/retry/SKILL.md +3 -4
  135. package/skills/turing/review/SKILL.md +2 -3
  136. package/skills/turing/rules/loop-protocol.md +11 -11
  137. package/skills/turing/sanity/SKILL.md +3 -4
  138. package/skills/turing/scale/SKILL.md +4 -5
  139. package/skills/turing/search/SKILL.md +2 -3
  140. package/skills/turing/seed/SKILL.md +3 -4
  141. package/skills/turing/sensitivity/SKILL.md +3 -4
  142. package/skills/turing/share/SKILL.md +2 -3
  143. package/skills/turing/simulate/SKILL.md +2 -3
  144. package/skills/turing/status/SKILL.md +1 -2
  145. package/skills/turing/stitch/SKILL.md +3 -4
  146. package/skills/turing/suggest/SKILL.md +5 -6
  147. package/skills/turing/surgery/SKILL.md +2 -3
  148. package/skills/turing/sweep/SKILL.md +8 -9
  149. package/skills/turing/template/SKILL.md +2 -3
  150. package/skills/turing/train/SKILL.md +5 -6
  151. package/skills/turing/transfer/SKILL.md +3 -4
  152. package/skills/turing/trend/SKILL.md +2 -3
  153. package/skills/turing/try/SKILL.md +4 -5
  154. package/skills/turing/update/SKILL.md +2 -3
  155. package/skills/turing/validate/SKILL.md +4 -5
  156. package/skills/turing/warm/SKILL.md +3 -4
  157. package/skills/turing/watch/SKILL.md +4 -5
  158. package/skills/turing/whatif/SKILL.md +2 -3
  159. package/skills/turing/xray/SKILL.md +3 -4
  160. package/src/command-registry.js +12 -0
  161. package/src/install.js +4 -3
  162. package/src/sync-commands-layout.js +149 -0
  163. package/src/sync-skills-layout.js +4 -133
  164. package/templates/README.md +5 -8
  165. package/templates/program.md +18 -18
  166. package/templates/pyproject.toml +10 -0
  167. package/templates/requirements.txt +4 -1
  168. package/templates/scripts/generate_onboarding.py +1 -1
  169. package/templates/scripts/post-train-hook.sh +7 -8
  170. package/templates/scripts/scaffold.py +24 -26
  171. package/templates/scripts/stop-hook.sh +2 -3
  172. package/templates/scripts/turing-run-python.sh +9 -0
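Note on the recurring pattern below: nearly every command and skill doc in this release replaces direct virtualenv activation with uv-managed execution, and drops `disable-model-invocation: true` from the frontmatter. A minimal before/after sketch of the change, assuming the `pyproject.toml` added under `package/templates/` (the experiment ID is a made-up placeholder):

```bash
# 4.7.0 style: activate the venv, then invoke python directly
source .venv/bin/activate
python scripts/ablation_study.py exp-001   # exp-001 is a made-up example ID

# 4.8.1 style: let uv create/refresh the project environment and run inside it
uv sync                                    # resolve and install dependencies from pyproject.toml
uv run python scripts/ablation_study.py exp-001
```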

package/commands/ablate.md
@@ -1,7 +1,6 @@
  ---
  name: ablate
  description: Run systematic ablation study — remove components one at a time, measure impact, produce publication-ready table with dead-weight flagging.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--components \"X,Y\"] [--seeds 3] [--latex]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Run a systematic ablation study to measure the contribution of each model compon

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -23,7 +22,7 @@ Run a systematic ablation study to measure the contribution of each model compon

  3. **Run ablation study:**
  ```bash
- python scripts/ablation_study.py $ARGUMENTS
+ uv run python scripts/ablation_study.py $ARGUMENTS
  ```

  4. **Report results:**

package/commands/annotate.md
@@ -1,7 +1,6 @@
  ---
  name: annotate
  description: Retrospective experiment annotations — add human notes, tags, and context that automated metrics can't capture.
- disable-model-invocation: true
  argument-hint: "<exp-id> \"note\" [--tag fragile] | --list | --search \"keyword\""
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Add context that experiment logs can't capture. "This only worked because the data was pre-sorted."

  ## Steps
- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/experiment_annotations.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/experiment_annotations.py $ARGUMENTS`
  3. **Operations:** add (text + tags), list (per-experiment or all), search (keyword or tag)
  4. **Stored in:** `experiments/annotations.yaml`


package/commands/archive.md
@@ -1,7 +1,6 @@
  ---
  name: archive
  description: Experiment lifecycle cleanup — compress old artifacts, prune checkpoints, create queryable summary index. Reclaim disk space.
- disable-model-invocation: true
  argument-hint: "[--older-than 30d] [--keep-best 10] [--dry-run]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Keep your project directory manageable after 200+ experiments.

  ## Steps
- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/experiment_archive.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/experiment_archive.py $ARGUMENTS`
  3. **Protected experiments:** Pareto-optimal, current best, recent, top-N by metric
  4. **Report:** archived count, preserved count, space reclaimed
  5. **Saved output:** `experiments/archive/index.yaml`

package/commands/audit.md
@@ -1,7 +1,6 @@
  ---
  name: audit
  description: Pre-submission methodology audit — catch data leakage, missing baselines, cherry-picked seeds, and incomplete ablations before a reviewer does.
- disable-model-invocation: true
  argument-hint: "[--strict] [--checklist neurips]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ A reviewer checklist you run before submitting. Catches methodology mistakes tha

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ A reviewer checklist you run before submitting. Catches methodology mistakes tha

  3. **Run methodology audit:**
  ```bash
- python scripts/methodology_audit.py $ARGUMENTS
+ uv run python scripts/methodology_audit.py $ARGUMENTS
  ```

  4. **Checks performed:**

package/commands/baseline.md
@@ -1,7 +1,6 @@
  ---
  name: baseline
  description: Automatic baseline generation — random, majority/mean, linear, k-NN baselines in 60 seconds. Every experiment needs a "is this better than dumb?" reference.
- disable-model-invocation: true
  argument-hint: "[--methods all|simple|linear] [--data data.npz]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Generate trivial baselines so you always know if your model is meaningfully bett

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Generate trivial baselines so you always know if your model is meaningfully bett

  3. **Run baseline generation:**
  ```bash
- python scripts/generate_baselines.py $ARGUMENTS
+ uv run python scripts/generate_baselines.py $ARGUMENTS
  ```

  4. **Baselines generated:**

package/commands/brief.md
@@ -1,9 +1,8 @@
  ---
  name: brief
  description: Generate a structured research intelligence report from experiment history — what's been learned, what's promising, what's exhausted, and what the human should consider next. Use --deep for literature-grounded suggestions.
- disable-model-invocation: true
  argument-hint: "[ml/project] [--deep]"
- allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob, WebSearch, WebFetch
+ allowed-tools: Read, Bash(uv run python scripts/*:*, uv sync:*), Grep, Glob, WebSearch, WebFetch
  ---

  Generate a research briefing that a human can read in 2 minutes and immediately decide what to inject next.
@@ -24,14 +23,14 @@ Before generating the briefing, detect which project to report on:

  1. **Generate the briefing:**
  ```bash
- source .venv/bin/activate && python scripts/generate_brief.py
+ uv run python scripts/generate_brief.py
  ```

  2. **Self-critique the briefing** before presenting. Review the generated output and check:
  - **Recommendations specificity:** Are they concrete enough to act on? "Try a different model" is bad. "Try LightGBM with leaf-wise growth because exp-004 showed depth sensitivity" is good. If vague, rewrite them with specific model/hyperparameter suggestions grounded in the experiment data.
  - **Exhausted directions coverage:** Cross-reference the "Model Types Explored" section against `experiments/log.jsonl`. Are there discarded experiments missing from the summary? If so, add them.
  - **Convergence estimate grounding:** If the briefing says "close to convergence" or "further improvement possible", verify against the actual metric trajectory. Is the claim supported by the numbers?
- - **Metric accuracy:** Spot-check that the "Current Best" metrics match the actual log. Run `python scripts/show_metrics.py --last 1` if uncertain.
+ - **Metric accuracy:** Spot-check that the "Current Best" metrics match the actual log. Run `uv run python scripts/show_metrics.py --last 1` if uncertain.

  If any section fails the check, regenerate just that section. Max 1 revision round — don't over-polish.

@@ -76,7 +75,7 @@ When `--deep` is requested, add a 7th section: **Literature-Grounded Suggestions

  4. **Queue suggestions** as hypotheses:
  ```bash
- source .venv/bin/activate && python scripts/manage_hypotheses.py add "<technique>: <rationale> (source: <citation>)" --priority medium --source literature
+ uv run python scripts/manage_hypotheses.py add "<technique>: <rationale> (source: <citation>)" --priority medium --source literature
  ```

  5. **Format as a section** appended to the briefing.
@@ -84,7 +83,7 @@ When `--deep` is requested, add a 7th section: **Literature-Grounded Suggestions
  ## Saving Briefs

  ```bash
- mkdir -p briefs && python scripts/generate_brief.py > briefs/brief-$(date +%Y-%m-%d).md
+ mkdir -p briefs && uv run python scripts/generate_brief.py > briefs/brief-$(date +%Y-%m-%d).md
  ```

  ## When to Use
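
Both the `--deep` step above and `/explore` later in this diff queue suggestions through the same `scripts/manage_hypotheses.py` template. A hypothetical filled-in call (technique and rationale invented for illustration; the `Chen2016` key is borrowed from the `/cite` example further down):

```bash
# Queue one literature-sourced hypothesis; description text is illustrative only
uv run python scripts/manage_hypotheses.py add \
  "Monotone constraints: literature reports gains on tabular GBMs (source: Chen2016)" \
  --priority medium --source literature
```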

package/commands/budget.md
@@ -1,7 +1,6 @@
  ---
  name: budget
  description: Compute budget manager — set experiment/time limits, track allocation across explore/exploit phases, auto-shift modes, hard stop.
- disable-model-invocation: true
  argument-hint: "<set|status|reset> [--experiments 50] [--hours 8]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Set a compute ceiling and let the system optimize within it. Prevents runaway ex

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -23,7 +22,7 @@ Set a compute ceiling and let the system optimize within it. Prevents runaway ex

  3. **Run budget manager:**
  ```bash
- python scripts/budget_manager.py $ARGUMENTS
+ uv run python scripts/budget_manager.py $ARGUMENTS
  ```

  4. **Actions:**

package/commands/calibrate.md
@@ -1,7 +1,6 @@
  ---
  name: calibrate
  description: Probability calibration — measure ECE, plot reliability diagrams, apply Platt scaling or isotonic regression.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--method platt|isotonic|temperature|auto]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Make model probabilities trustworthy. Does 80% confidence actually mean 80% corr

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Make model probabilities trustworthy. Does 80% confidence actually mean 80% corr

  3. **Run calibration:**
  ```bash
- python scripts/calibration.py $ARGUMENTS
+ uv run python scripts/calibration.py $ARGUMENTS
  ```

  4. **Report includes:**

package/commands/card.md
@@ -1,8 +1,7 @@
  ---
  name: card
  description: Generate a standardized model card documenting the trained model — type, performance, training data, limitations, intended use, and artifact contract.
- disable-model-invocation: true
- allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
+ allowed-tools: Read, Bash(uv run python scripts/*:*, uv sync:*), Grep, Glob
  ---

  You generate a standardized model card from the experiment log, model contract, and config.
@@ -11,12 +10,12 @@ You generate a standardized model card from the experiment log, model contract,

  1. **Activate the virtual environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Run the model card generator:**
  ```bash
- python scripts/generate_model_card.py --config config.yaml --log experiments/log.jsonl --contract model_contract.md --output MODEL_CARD.md
+ uv run python scripts/generate_model_card.py --config config.yaml --log experiments/log.jsonl --contract model_contract.md --output MODEL_CARD.md
  ```

  3. **Read and present the generated card:**

package/commands/changelog.md
@@ -1,7 +1,6 @@
  ---
  name: changelog
  description: Model changelog generation — auto-generate human-readable progress narrative from experiment history for stakeholders.
- disable-model-invocation: true
  argument-hint: "[--since exp-id|date] [--audience technical|stakeholder]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Translate experiment logs into a narrative that PMs and stakeholders can read in 2 minutes.

  ## Steps
- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/generate_changelog.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/generate_changelog.py $ARGUMENTS`
  3. **Audience:** technical (experiment IDs, configs), stakeholder (plain English, percentages)
  4. **Saved output:** `paper/CHANGELOG.md`


package/commands/checkpoint.md
@@ -1,7 +1,6 @@
  ---
  name: checkpoint
  description: Smart checkpoint management — list, prune (Pareto-based), average top-K, resume from any point, disk usage stats.
- disable-model-invocation: true
  argument-hint: "<list|prune|average|resume|stats> [exp-id] [--top 3] [--dry-run]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Manage model checkpoints intelligently using Pareto dominance.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -23,7 +22,7 @@ Manage model checkpoints intelligently using Pareto dominance.

  3. **Run checkpoint manager:**
  ```bash
- python scripts/checkpoint_manager.py $ARGUMENTS
+ uv run python scripts/checkpoint_manager.py $ARGUMENTS
  ```

  4. **Report results by action:**
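
For reference, the `<list|prune|average|resume|stats>` grammar in the argument-hint maps onto calls like the following sketch. The `exp-012` ID is made up, and the flag spellings are taken from the hint rather than verified against the script:

```bash
uv sync
# Preview what Pareto-based pruning would delete, without touching files
uv run python scripts/checkpoint_manager.py prune exp-012 --dry-run
# Average the top 3 checkpoints of one run into a single model
uv run python scripts/checkpoint_manager.py average exp-012 --top 3
```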

package/commands/cite.md
@@ -1,7 +1,6 @@
  ---
  name: cite
  description: Citation & attribution manager — track papers, datasets, methods. Audit for missing citations, generate BibTeX.
- disable-model-invocation: true
  argument-hint: "<add|list|check|bib> [--key Chen2016 --title XGBoost --url ...]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Track which papers and methods influenced each experiment. Catch missing citations before submission.

  ## Steps
- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/citation_manager.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/citation_manager.py $ARGUMENTS`
  3. **Operations:** add (associate citation with experiment), list (group by type), check (audit missing), bib (BibTeX)
  4. **Stored in:** `experiments/citations.yaml`


package/commands/compare.md
@@ -1,7 +1,6 @@
  ---
  name: compare
  description: Compare two ML experiment runs side-by-side — metrics, configuration deltas, and a verdict on which approach is more promising.
- disable-model-invocation: true
  argument-hint: "<exp-id-1> <exp-id-2>"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -12,7 +11,7 @@ Compare two ML experiment runs side-by-side to understand what changed and why o

  1. **Run comparison:**
  ```bash
- source .venv/bin/activate && python scripts/compare_runs.py $0 $1
+ uv run python scripts/compare_runs.py $0 $1
  ```

  2. **Analyze the delta:**
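
Here `$0` and `$1` appear to be the slash command's positional arguments (`<exp-id-1> <exp-id-2>` per the argument-hint), so a hypothetical `/compare exp-003 exp-007` would expand to roughly:

```bash
# Both experiment IDs are invented for illustration
uv run python scripts/compare_runs.py exp-003 exp-007
```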

package/commands/counterfactual.md
@@ -1,7 +1,6 @@
  ---
  name: counterfactual
  description: Input-level counterfactual explanations — find the smallest input change to flip a prediction.
- disable-model-invocation: true
  argument-hint: "<exp-id> --sample <index> [--target <class>]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  What would need to change to flip this prediction? Minimum-change counterfactual for individual predictions.

  ## Steps
- 1. `source .venv/bin/activate`
- 2. `python scripts/counterfactual_explanation.py $ARGUMENTS`
+ 1. `uv sync`
+ 2. `uv run python scripts/counterfactual_explanation.py $ARGUMENTS`
  3. **Saved:** `experiments/counterfactuals/`

  ## Methods

package/commands/curriculum.md
@@ -1,7 +1,6 @@
  ---
  name: curriculum
  description: Training curriculum optimization — order data by difficulty, compare easy-to-hard vs hard-to-easy vs self-paced strategies.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--strategies easy-to-hard,random]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Does the order your model sees data matter? Find out systematically.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Does the order your model sees data matter? Find out systematically.

  3. **Run curriculum analysis:**
  ```bash
- python scripts/curriculum_optimizer.py $ARGUMENTS
+ uv run python scripts/curriculum_optimizer.py $ARGUMENTS
  ```

  4. **Strategies tested:**

package/commands/design.md
@@ -1,9 +1,8 @@
  ---
  name: design
  description: Generate a structured experiment design for a hypothesis. Reads experiment history, searches literature for methodology, produces a scored design document at experiments/designs/.
- disable-model-invocation: true
  argument-hint: "<hypothesis-id or description>"
- allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*), Grep, Glob, WebSearch, WebFetch
+ allowed-tools: Read, Write, Bash(uv run python scripts/*:*, uv sync:*, mkdir:*), Grep, Glob, WebSearch, WebFetch
  ---

  Front-load the thinking before the coding. Given a hypothesis, produce a structured experiment design grounded in methodology from the literature.
@@ -14,7 +13,7 @@ Front-load the thinking before the coding. Given a hypothesis, produce a structu

  If `$ARGUMENTS` matches `hyp-NNN`, load the hypothesis:
  ```bash
- source .venv/bin/activate && python scripts/manage_hypotheses.py show $ARGUMENTS
+ uv run python scripts/manage_hypotheses.py show $ARGUMENTS
  ```

  If freeform text, use it directly as the hypothesis description.
@@ -24,7 +23,7 @@ Read the current config and experiment state:
  cat config.yaml
  ```
  ```bash
- source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
+ uv run python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
  ```
  ```bash
  cat experiment_state.yaml 2>/dev/null || echo "No experiment state yet"

package/commands/diagnose.md
@@ -1,7 +1,6 @@
  ---
  name: diagnose
  description: Error analysis — cluster failure cases, identify systematic failure modes, and suggest targeted fixes with auto-queued hypotheses.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--auto-queue] [--top 5]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,15 +9,15 @@ Analyze where and why the model fails, beyond aggregate metrics.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Generate predictions if needed:**
  Check if `experiments/predictions/exp-NNN-preds.yaml` exists. If not, run:
  ```bash
- python train.py --predict-only --output experiments/predictions/
+ uv run python train.py --predict-only --output experiments/predictions/
  ```
  The predictions file must contain `y_true`, `y_pred`, `task_type`, and optionally `features`.

@@ -29,7 +28,7 @@ Analyze where and why the model fails, beyond aggregate metrics.

  4. **Run error analysis:**
  ```bash
- python scripts/diagnose_errors.py $ARGUMENTS
+ uv run python scripts/diagnose_errors.py $ARGUMENTS
  ```

  5. **Report results:**
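
The predictions contract in step 2 is named but never shown. A minimal sketch of a file that would satisfy it, with field names taken from the step above and invented values and task type:

```bash
mkdir -p experiments/predictions
# Hypothetical shape of experiments/predictions/exp-001-preds.yaml
cat > experiments/predictions/exp-001-preds.yaml <<'EOF'
task_type: classification
y_true: [0, 1, 1, 0, 1]
y_pred: [0, 1, 0, 0, 1]
# `features` is optional per the contract; omitted here
EOF
```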

package/commands/diff.md
@@ -1,7 +1,6 @@
  ---
  name: diff
  description: Deep experiment comparison — config diffs, metric significance, per-class regressions, training curve divergence, feature importance shifts.
- disable-model-invocation: true
  argument-hint: "<exp-a> <exp-b> [--code]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Deep diagnostic comparison of two experiments. Goes beyond "which metric is high

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Deep diagnostic comparison of two experiments. Goes beyond "which metric is high

  3. **Run deep comparison:**
  ```bash
- python scripts/experiment_diff.py $ARGUMENTS
+ uv run python scripts/experiment_diff.py $ARGUMENTS
  ```

  4. **Report results — the diff includes:**

package/commands/distill.md
@@ -1,7 +1,6 @@
  ---
  name: distill
  description: Model compression via distillation — train a smaller student model to match a larger teacher's predictions.
- disable-model-invocation: true
  argument-hint: "<teacher-exp-id> [--compression 4] [--method soft-labels]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Compress a large model into a smaller, faster one for production. Measures the a

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -24,7 +23,7 @@ Compress a large model into a smaller, faster one for production. Measures the a

  3. **Run distillation planner:**
  ```bash
- python scripts/model_distiller.py $ARGUMENTS
+ uv run python scripts/model_distiller.py $ARGUMENTS
  ```

  4. **Report includes:**

package/commands/doctor.md
@@ -1,7 +1,6 @@
  ---
  name: doctor
  description: Harness self-diagnosis — check environment, project, resources, and git state. Auto-fix common issues.
- disable-model-invocation: true
  argument-hint: "[--fix] [--verbose]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Is Turing healthy? Check everything and get a score.

  ## Steps
- 1. `source .venv/bin/activate`
- 2. `python scripts/harness_doctor.py $ARGUMENTS`
+ 1. `uv sync`
+ 2. `uv run python scripts/harness_doctor.py $ARGUMENTS`
  3. **Saved:** `experiments/doctor/`

  ## Checks

package/commands/ensemble.md
@@ -1,7 +1,6 @@
  ---
  name: ensemble
  description: Automated ensemble construction — combines top-K models via voting, stacking, and blending for zero-cost improvement.
- disable-model-invocation: true
  argument-hint: "[--top-k 5] [--methods voting,stacking,blending]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Build ensembles from your best experiments automatically. Often yields 1-3% impr

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -23,7 +22,7 @@ Build ensembles from your best experiments automatically. Often yields 1-3% impr

  3. **Run ensemble construction:**
  ```bash
- python scripts/build_ensemble.py $ARGUMENTS
+ uv run python scripts/build_ensemble.py $ARGUMENTS
  ```

  4. **Report results:**

package/commands/explore.md
@@ -1,9 +1,8 @@
  ---
  name: explore
  description: Tree-search-guided hypothesis exploration using AB-MCTS. Explores the space of experiment ideas as a search tree, scored by the critique engine. Discovers non-obvious refinement chains that linear suggestion cannot find.
- disable-model-invocation: true
  argument-hint: "[ml/project] [--iterations N] [--top N] [--strategy abmcts-a|abmcts-m|greedy]"
- allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
+ allowed-tools: Read, Write, Bash(uv run python scripts/*:*, uv sync:*), Grep, Glob
  ---

  Explore the hypothesis space using tree search. Instead of suggesting independent ideas, this builds and searches a tree of refinement chains — each node is a hypothesis scored by novelty, feasibility, and expected impact.
@@ -32,7 +31,7 @@ Extract from `$ARGUMENTS`:
  ### 1. Assess Current State

  ```bash
- source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
+ uv run python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
  ```

  Read `config.yaml` to understand the current model and metric.
@@ -40,7 +39,7 @@ Read `config.yaml` to understand the current model and metric.
  ### 2. Run Tree Search

  ```bash
- source .venv/bin/activate && python scripts/treequest_suggest.py \
+ uv run python scripts/treequest_suggest.py \
    --log experiments/log.jsonl \
    --config config.yaml \
    --top <N> \
@@ -59,7 +58,7 @@ The script will:

  For each result, add to the hypothesis queue:
  ```bash
- source .venv/bin/activate && python scripts/manage_hypotheses.py add "<description>" \
+ uv run python scripts/manage_hypotheses.py add "<description>" \
    --priority medium --source treequest
  ```


package/commands/export.md
@@ -1,7 +1,6 @@
  ---
  name: export
  description: Export model to production format with equivalence verification, latency benchmarking, and deployment model card.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--format joblib|xgboost_json|onnx|torchscript|tflite]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Export a trained model to a production-ready format.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -24,7 +23,7 @@ Export a trained model to a production-ready format.

  3. **Run export pipeline:**
  ```bash
- python scripts/export_model.py $ARGUMENTS
+ uv run python scripts/export_model.py $ARGUMENTS
  ```

  4. **Report results:**

package/commands/feature.md
@@ -1,7 +1,6 @@
  ---
  name: feature
  description: Automated feature selection — multi-method importance consensus, redundancy detection, and interaction feature generation.
- disable-model-invocation: true
  argument-hint: "[--method all|importance] [--top-k 20]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Systematically evaluate which features matter and which are noise.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Systematically evaluate which features matter and which are noise.

  3. **Run feature analysis:**
  ```bash
- python scripts/feature_intelligence.py $ARGUMENTS
+ uv run python scripts/feature_intelligence.py $ARGUMENTS
  ```

  4. **Report includes:**