claude-turing 4.7.0 → 4.8.1

Files changed (172)
  1. package/.claude-plugin/plugin.json +2 -2
  2. package/README.md +1 -1
  3. package/agents/ml-evaluator.md +4 -4
  4. package/agents/ml-researcher.md +2 -2
  5. package/bin/turing-init.sh +2 -2
  6. package/commands/ablate.md +3 -4
  7. package/commands/annotate.md +2 -3
  8. package/commands/archive.md +2 -3
  9. package/commands/audit.md +3 -4
  10. package/commands/baseline.md +3 -4
  11. package/commands/brief.md +5 -6
  12. package/commands/budget.md +3 -4
  13. package/commands/calibrate.md +3 -4
  14. package/commands/card.md +3 -4
  15. package/commands/changelog.md +2 -3
  16. package/commands/checkpoint.md +3 -4
  17. package/commands/cite.md +2 -3
  18. package/commands/compare.md +1 -2
  19. package/commands/counterfactual.md +2 -3
  20. package/commands/curriculum.md +3 -4
  21. package/commands/design.md +3 -4
  22. package/commands/diagnose.md +4 -5
  23. package/commands/diff.md +3 -4
  24. package/commands/distill.md +3 -4
  25. package/commands/doctor.md +2 -3
  26. package/commands/ensemble.md +3 -4
  27. package/commands/explore.md +4 -5
  28. package/commands/export.md +3 -4
  29. package/commands/feature.md +3 -4
  30. package/commands/flashback.md +2 -3
  31. package/commands/fork.md +3 -4
  32. package/commands/frontier.md +3 -4
  33. package/commands/init.md +5 -6
  34. package/commands/leak.md +3 -4
  35. package/commands/lit.md +3 -4
  36. package/commands/logbook.md +5 -6
  37. package/commands/merge.md +2 -3
  38. package/commands/mode.md +1 -2
  39. package/commands/onboard.md +2 -3
  40. package/commands/paper.md +3 -4
  41. package/commands/plan.md +2 -3
  42. package/commands/poster.md +3 -4
  43. package/commands/postmortem.md +2 -3
  44. package/commands/preflight.md +5 -6
  45. package/commands/present.md +2 -3
  46. package/commands/profile.md +3 -4
  47. package/commands/prune.md +2 -3
  48. package/commands/quantize.md +2 -3
  49. package/commands/queue.md +3 -4
  50. package/commands/registry.md +2 -3
  51. package/commands/regress.md +3 -4
  52. package/commands/replay.md +2 -3
  53. package/commands/report.md +3 -4
  54. package/commands/reproduce.md +3 -4
  55. package/commands/retry.md +3 -4
  56. package/commands/review.md +2 -3
  57. package/commands/rules/loop-protocol.md +11 -11
  58. package/commands/sanity.md +3 -4
  59. package/commands/scale.md +4 -5
  60. package/commands/search.md +2 -3
  61. package/commands/seed.md +3 -4
  62. package/commands/sensitivity.md +3 -4
  63. package/commands/share.md +2 -3
  64. package/commands/simulate.md +2 -3
  65. package/commands/status.md +1 -2
  66. package/commands/stitch.md +3 -4
  67. package/commands/suggest.md +5 -6
  68. package/commands/surgery.md +2 -3
  69. package/commands/sweep.md +8 -9
  70. package/commands/template.md +2 -3
  71. package/commands/train.md +5 -6
  72. package/commands/transfer.md +3 -4
  73. package/commands/trend.md +2 -3
  74. package/commands/try.md +4 -5
  75. package/commands/turing.md +3 -3
  76. package/commands/update.md +2 -3
  77. package/commands/validate.md +4 -5
  78. package/commands/warm.md +3 -4
  79. package/commands/watch.md +4 -5
  80. package/commands/whatif.md +2 -3
  81. package/commands/xray.md +3 -4
  82. package/config/commands.yaml +75 -75
  83. package/package.json +3 -2
  84. package/skills/turing/SKILL.md +3 -3
  85. package/skills/turing/ablate/SKILL.md +3 -4
  86. package/skills/turing/annotate/SKILL.md +2 -3
  87. package/skills/turing/archive/SKILL.md +2 -3
  88. package/skills/turing/audit/SKILL.md +3 -4
  89. package/skills/turing/baseline/SKILL.md +3 -4
  90. package/skills/turing/brief/SKILL.md +5 -6
  91. package/skills/turing/budget/SKILL.md +3 -4
  92. package/skills/turing/calibrate/SKILL.md +3 -4
  93. package/skills/turing/card/SKILL.md +3 -4
  94. package/skills/turing/changelog/SKILL.md +2 -3
  95. package/skills/turing/checkpoint/SKILL.md +3 -4
  96. package/skills/turing/cite/SKILL.md +2 -3
  97. package/skills/turing/compare/SKILL.md +1 -2
  98. package/skills/turing/counterfactual/SKILL.md +2 -3
  99. package/skills/turing/curriculum/SKILL.md +3 -4
  100. package/skills/turing/design/SKILL.md +3 -4
  101. package/skills/turing/diagnose/SKILL.md +4 -5
  102. package/skills/turing/diff/SKILL.md +3 -4
  103. package/skills/turing/distill/SKILL.md +3 -4
  104. package/skills/turing/doctor/SKILL.md +2 -3
  105. package/skills/turing/ensemble/SKILL.md +3 -4
  106. package/skills/turing/explore/SKILL.md +4 -5
  107. package/skills/turing/export/SKILL.md +3 -4
  108. package/skills/turing/feature/SKILL.md +3 -4
  109. package/skills/turing/flashback/SKILL.md +2 -3
  110. package/skills/turing/fork/SKILL.md +3 -4
  111. package/skills/turing/frontier/SKILL.md +3 -4
  112. package/skills/turing/init/SKILL.md +5 -6
  113. package/skills/turing/leak/SKILL.md +3 -4
  114. package/skills/turing/lit/SKILL.md +3 -4
  115. package/skills/turing/logbook/SKILL.md +5 -6
  116. package/skills/turing/merge/SKILL.md +2 -3
  117. package/skills/turing/mode/SKILL.md +1 -2
  118. package/skills/turing/onboard/SKILL.md +2 -3
  119. package/skills/turing/paper/SKILL.md +3 -4
  120. package/skills/turing/plan/SKILL.md +2 -3
  121. package/skills/turing/poster/SKILL.md +3 -4
  122. package/skills/turing/postmortem/SKILL.md +2 -3
  123. package/skills/turing/preflight/SKILL.md +5 -6
  124. package/skills/turing/present/SKILL.md +2 -3
  125. package/skills/turing/profile/SKILL.md +3 -4
  126. package/skills/turing/prune/SKILL.md +2 -3
  127. package/skills/turing/quantize/SKILL.md +2 -3
  128. package/skills/turing/queue/SKILL.md +3 -4
  129. package/skills/turing/registry/SKILL.md +2 -3
  130. package/skills/turing/regress/SKILL.md +3 -4
  131. package/skills/turing/replay/SKILL.md +2 -3
  132. package/skills/turing/report/SKILL.md +3 -4
  133. package/skills/turing/reproduce/SKILL.md +3 -4
  134. package/skills/turing/retry/SKILL.md +3 -4
  135. package/skills/turing/review/SKILL.md +2 -3
  136. package/skills/turing/rules/loop-protocol.md +11 -11
  137. package/skills/turing/sanity/SKILL.md +3 -4
  138. package/skills/turing/scale/SKILL.md +4 -5
  139. package/skills/turing/search/SKILL.md +2 -3
  140. package/skills/turing/seed/SKILL.md +3 -4
  141. package/skills/turing/sensitivity/SKILL.md +3 -4
  142. package/skills/turing/share/SKILL.md +2 -3
  143. package/skills/turing/simulate/SKILL.md +2 -3
  144. package/skills/turing/status/SKILL.md +1 -2
  145. package/skills/turing/stitch/SKILL.md +3 -4
  146. package/skills/turing/suggest/SKILL.md +5 -6
  147. package/skills/turing/surgery/SKILL.md +2 -3
  148. package/skills/turing/sweep/SKILL.md +8 -9
  149. package/skills/turing/template/SKILL.md +2 -3
  150. package/skills/turing/train/SKILL.md +5 -6
  151. package/skills/turing/transfer/SKILL.md +3 -4
  152. package/skills/turing/trend/SKILL.md +2 -3
  153. package/skills/turing/try/SKILL.md +4 -5
  154. package/skills/turing/update/SKILL.md +2 -3
  155. package/skills/turing/validate/SKILL.md +4 -5
  156. package/skills/turing/warm/SKILL.md +3 -4
  157. package/skills/turing/watch/SKILL.md +4 -5
  158. package/skills/turing/whatif/SKILL.md +2 -3
  159. package/skills/turing/xray/SKILL.md +3 -4
  160. package/src/command-registry.js +12 -0
  161. package/src/install.js +4 -3
  162. package/src/sync-commands-layout.js +149 -0
  163. package/src/sync-skills-layout.js +4 -133
  164. package/templates/README.md +5 -8
  165. package/templates/program.md +18 -18
  166. package/templates/pyproject.toml +10 -0
  167. package/templates/requirements.txt +4 -1
  168. package/templates/scripts/generate_onboarding.py +1 -1
  169. package/templates/scripts/post-train-hook.sh +7 -8
  170. package/templates/scripts/scaffold.py +24 -26
  171. package/templates/scripts/stop-hook.sh +2 -3
  172. package/templates/scripts/turing-run-python.sh +9 -0
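Nearly every command and skill doc in this list makes the same mechanical swap: the `source .venv/bin/activate && python …` idiom becomes `uv sync` plus `uv run python …`. A minimal before/after sketch of the recurring pattern, using the `show_metrics.py` invocation that appears in the explore.md hunk below:

```bash
# 4.7.0 idiom: activate the virtualenv, then invoke python directly
source .venv/bin/activate && python scripts/show_metrics.py --last 10

# 4.8.1 idiom: uv manages the environment; no shell activation needed
uv sync                                          # materialize .venv from pyproject.toml / uv.lock
uv run python scripts/show_metrics.py --last 10  # execute inside the uv-managed environment
```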
package/commands/explore.md CHANGED
@@ -1,9 +1,8 @@
  ---
  name: explore
  description: Tree-search-guided hypothesis exploration using AB-MCTS. Explores the space of experiment ideas as a search tree, scored by the critique engine. Discovers non-obvious refinement chains that linear suggestion cannot find.
- disable-model-invocation: true
  argument-hint: "[ml/project] [--iterations N] [--top N] [--strategy abmcts-a|abmcts-m|greedy]"
- allowed-tools: Read, Write, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
+ allowed-tools: Read, Write, Bash(uv run python scripts/*:*, uv sync:*), Grep, Glob
  ---

  Explore the hypothesis space using tree search. Instead of suggesting independent ideas, this builds and searches a tree of refinement chains — each node is a hypothesis scored by novelty, feasibility, and expected impact.
@@ -32,7 +31,7 @@ Extract from `$ARGUMENTS`:
  ### 1. Assess Current State

  ```bash
- source .venv/bin/activate && python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
+ uv run python scripts/show_metrics.py --last 10 2>/dev/null || echo "No experiments yet"
  ```

  Read `config.yaml` to understand the current model and metric.
@@ -40,7 +39,7 @@ Read `config.yaml` to understand the current model and metric.
  ### 2. Run Tree Search

  ```bash
- source .venv/bin/activate && python scripts/treequest_suggest.py \
+ uv run python scripts/treequest_suggest.py \
    --log experiments/log.jsonl \
    --config config.yaml \
    --top <N> \
@@ -59,7 +58,7 @@ The script will:
  For each result, add to the hypothesis queue:

  ```bash
- source .venv/bin/activate && python scripts/manage_hypotheses.py add "<description>" \
+ uv run python scripts/manage_hypotheses.py add "<description>" \
    --priority medium --source treequest
  ```
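Why the activation step can disappear outright: in a uv project, `uv run` resolves and syncs the environment per invocation, so there is no shell state to prepare. Assuming a project that carries the new `templates/pyproject.toml` from this release, the standalone `uv sync` these docs keep as step 1 is a warm-up convenience rather than a requirement (the hypothesis text below is an illustrative placeholder):

```bash
# Explicit two-step, mirroring the "Sync environment" step in these docs
uv sync
uv run python scripts/manage_hypotheses.py add "try focal loss" --priority medium --source treequest

# Equivalent in one step: uv run syncs the environment on demand
uv run python scripts/manage_hypotheses.py add "try focal loss" --priority medium --source treequest
```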
package/commands/export.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: export
  description: Export model to production format with equivalence verification, latency benchmarking, and deployment model card.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--format joblib|xgboost_json|onnx|torchscript|tflite]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Export a trained model to a production-ready format.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -24,7 +23,7 @@ Export a trained model to a production-ready format.

  3. **Run export pipeline:**
  ```bash
- python scripts/export_model.py $ARGUMENTS
+ uv run python scripts/export_model.py $ARGUMENTS
  ```

  4. **Report results:**
package/commands/feature.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: feature
  description: Automated feature selection — multi-method importance consensus, redundancy detection, and interaction feature generation.
- disable-model-invocation: true
  argument-hint: "[--method all|importance] [--top-k 20]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Systematically evaluate which features matter and which are noise.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Systematically evaluate which features matter and which are noise.

  3. **Run feature analysis:**
  ```bash
- python scripts/feature_intelligence.py $ARGUMENTS
+ uv run python scripts/feature_intelligence.py $ARGUMENTS
  ```

  4. **Report includes:**
package/commands/flashback.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: flashback
  description: Session context restoration — "where was I?" summary after days away. Current best, pending hypotheses, last session, annotations.
- disable-model-invocation: true
  argument-hint: "[--days 7] [--last 10]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Come back to a project after a week and start working in 10 seconds instead of 30 minutes.

  ## Steps
- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/session_flashback.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/session_flashback.py $ARGUMENTS`
  3. **Report:** current best, last session experiments, pending hypotheses, annotations, budget, suggested next action
  4. **Saved output:** `experiments/flashbacks/flashback-*.yaml`
package/commands/fork.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: fork
  description: Branch an experiment into parallel tracks — run both A and B, report the winner.
- disable-model-invocation: true
  argument-hint: "<exp-id> --branches \"approach A\" \"approach B\" [--auto-promote]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Fork an experiment into parallel branches and compare results.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Fork an experiment into parallel branches and compare results.

  3. **Run fork:**
  ```bash
- python scripts/fork_experiment.py $ARGUMENTS
+ uv run python scripts/fork_experiment.py $ARGUMENTS
  ```

  4. **Report results:**
package/commands/frontier.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: frontier
  description: Visualize Pareto frontier across multiple objectives — answers "which model is actually best?" when there are tradeoffs.
- disable-model-invocation: true
  argument-hint: "[--metrics \"accuracy,train_seconds,n_params\"] [--ascii]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Visualize the Pareto frontier across multiple objectives from experiment history

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Visualize the Pareto frontier across multiple objectives from experiment history

  3. **Run Pareto analysis:**
  ```bash
- python scripts/pareto_frontier.py $ARGUMENTS
+ uv run python scripts/pareto_frontier.py $ARGUMENTS
  ```

  4. **Report results:**
package/commands/init.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: init
- description: Initialize a new ML project with the Turing autoresearch harness. Scaffolds the full experiment infrastructure — immutable evaluation pipeline, agent-editable training code, structured logging, convergence detection hooks, and a Python virtual environment. Use --plan to generate a research plan.
- disable-model-invocation: true
+ description: Initialize a new ML project with the Turing autoresearch harness. Scaffolds the full experiment infrastructure — immutable evaluation pipeline, agent-editable training code, structured logging, convergence detection hooks, and a uv-managed Python environment. Use --plan to generate a research plan.
  argument-hint: "[project_name] [--plan]"
  allowed-tools: Read, Write, Edit, Bash(*), Grep, Glob, WebSearch, WebFetch
  ---
@@ -24,7 +23,7 @@ Ask the user for the following (or accept from `$ARGUMENTS` if provided as JSON)
  Once you have all 6 values, delegate to the unified scaffolding script:

  ```bash
- python3 <templates_dir>/scripts/scaffold.py \
+ uv run python <templates_dir>/scripts/scaffold.py \
    --project-name "<project_name>" \
    --target-metric "<target_metric>" \
    --metric-direction "<metric_direction>" \
@@ -39,7 +38,7 @@ The scaffold script handles everything in a single atomic operation:
  - Creates data/, experiments/, models/ directories
  - Sets up agent memory at `.claude/agent-memory/ml-researcher-{project_name}/MEMORY.md`
  - Configures Claude Code hooks in `.claude/settings.local.json`
- - Creates Python virtual environment and installs requirements
+ - Runs `uv sync` from the ML directory when uv is available
  - Verifies all placeholders were replaced (fails loudly if any remain)

  ## Locating Templates
@@ -58,7 +57,7 @@ node_modules/claude-turing/templates/
  Example command:

  ```bash
- python3 ~/.claude/commands/turing/templates/scripts/scaffold.py \
+ uv run python ~/.claude/commands/turing/templates/scripts/scaffold.py \
    --project-name "<project_name>" \
    --target-metric "<target_metric>" \
    --metric-direction "<metric_direction>" \
@@ -72,7 +71,7 @@ python3 ~/.claude/commands/turing/templates/scripts/scaffold.py \

  Report what was created:
  - The separation: READ-ONLY (`prepare.py`, `evaluate.py`) vs AGENT-EDITABLE (`train.py`)
- - Next steps: add data to the configured data source path, run `python prepare.py`, then `/turing:train`
+ - Next steps: add data to the configured data source path, run `uv run python prepare.py`, then `/turing:train`
  - The taste-leverage loop: `/turing:try` to inject hypotheses, `/turing:brief` for intelligence reports

  ## Research Plan Generation (--plan flag)
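For concreteness, a hypothetical filled-in version of the scaffold delegation above. The project name and metric values are illustrative placeholders, and the remaining flags (the hunk is truncated after `--metric-direction`) are left elided:

```bash
# Illustrative values only; /turing:init collects six fields interactively.
uv run python ~/.claude/commands/turing/templates/scripts/scaffold.py \
  --project-name "churn-model" \
  --target-metric "f1" \
  --metric-direction "maximize"
  # ...plus the remaining flags not shown in the truncated hunk above
```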
package/commands/leak.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: leak
  description: Targeted leakage detection — probe for data leakage with single-feature tests, correlation checks, and train/test overlap detection.
- disable-model-invocation: true
  argument-hint: "[--deep] [--features feature_1,feature_2]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Actively probe for data leakage. The #1 cause of "too good to be true" results.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -22,7 +21,7 @@ Actively probe for data leakage. The #1 cause of "too good to be true" results.

  3. **Run leakage scan:**
  ```bash
- python scripts/leakage_detector.py $ARGUMENTS
+ uv run python scripts/leakage_detector.py $ARGUMENTS
  ```

  4. **Checks performed:**
package/commands/lit.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: lit
  description: Literature search scoped to the current experiment domain — find papers, SOTA baselines, and related work without leaving the terminal.
- disable-model-invocation: true
  argument-hint: "<query> | --baseline | --related <exp-id>"
  allowed-tools: Read, Bash(*), Grep, Glob, WebSearch
  ---
@@ -10,9 +9,9 @@ Search the literature for papers, baselines, and related work.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -24,7 +23,7 @@ Search the literature for papers, baselines, and related work.

  3. **Run literature search:**
  ```bash
- python scripts/literature_search.py $ARGUMENTS
+ uv run python scripts/literature_search.py $ARGUMENTS
  ```

  4. **Report results:**
package/commands/logbook.md CHANGED
@@ -1,9 +1,8 @@
  ---
  name: logbook
  description: Generate a research logbook showing the full experiment narrative — hypotheses proposed, experiments run, decisions made, and progress over time. Outputs HTML (with interactive chart) or markdown.
- disable-model-invocation: true
  argument-hint: "[--since YYYY-MM-DD] [--format html|markdown] [--output path]"
- allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*), Grep, Glob
+ allowed-tools: Read, Bash(uv run python scripts/*:*, uv sync:*, mkdir:*), Grep, Glob
  ---

  Generate a research logbook that captures the full narrative of the experiment campaign.
@@ -12,7 +11,7 @@ Generate a research logbook that captures the full narrative of the experiment c

  1. **Generate the logbook:**
  ```bash
- source .venv/bin/activate && python scripts/generate_logbook.py
+ uv run python scripts/generate_logbook.py
  ```

  **With options from `$ARGUMENTS`:**
@@ -23,13 +22,13 @@ Generate a research logbook that captures the full narrative of the experiment c
  **Common usage:**
  ```bash
  # HTML logbook with interactive trajectory chart
- source .venv/bin/activate && python scripts/generate_logbook.py --output logbook.html
+ uv run python scripts/generate_logbook.py --output logbook.html

  # Markdown for embedding in docs or READMEs
- source .venv/bin/activate && python scripts/generate_logbook.py --format markdown --output logbook.md
+ uv run python scripts/generate_logbook.py --format markdown --output logbook.md

  # Last week's activity
- source .venv/bin/activate && python scripts/generate_logbook.py --since 2026-03-24 --output logbook.html
+ uv run python scripts/generate_logbook.py --since 2026-03-24 --output logbook.html
  ```

  2. **Present the result:**
package/commands/merge.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: merge
  description: Model merging — average weights from multiple checkpoints into a single model (soups, TIES, DARE). Free accuracy, zero latency cost.
- disable-model-invocation: true
  argument-hint: "<exp-ids...> [--method uniform|greedy|ties|dare]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,8 +9,8 @@ Combine model weights (not predictions) into a single, better model with no late

  ## Steps

- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/model_merger.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/model_merger.py $ARGUMENTS`
  3. **Methods:** uniform soup (simple average), greedy soup (include only if improves), TIES (trim+elect+merge), DARE (drop+rescale)
  4. **Report:** compatibility check, per-model metrics, method comparison, improvement delta
  5. **Saved output:** `experiments/merges/merge-*.yaml`
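As a usage sketch of the new invocation, with hypothetical experiment IDs and a method taken from the argument-hint above:

```bash
# Hypothetical exp-ids; greedy soup keeps each checkpoint only if it improves the metric
uv run python scripts/model_merger.py exp-012 exp-015 exp-019 --method greedy
```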
package/commands/mode.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: mode
  description: Set the research strategy mode — explore (try new things), exploit (refine what works), or replicate (verify results). Drives novelty guard policy and agent behavior.
- disable-model-invocation: true
  argument-hint: "<explore|exploit|replicate>"
  ---
@@ -21,7 +20,7 @@ Set the research mode for the current project. The mode determines how the novel

  2. **Update experiment state:**
  ```bash
- source .venv/bin/activate
+ uv sync
  python -c "
  import yaml
  from pathlib import Path
package/commands/onboard.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: onboard
  description: Project onboarding — generate a walkthrough for new collaborators. Task, history, decisions, next steps.
- disable-model-invocation: true
  argument-hint: "[--audience researcher|engineer|stakeholder] [--depth brief|full]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  5-minute read that replaces a 1-hour onboarding meeting.

  ## Steps
- 1. `source .venv/bin/activate`
- 2. `python scripts/generate_onboarding.py $ARGUMENTS`
+ 1. `uv sync`
+ 2. `uv run python scripts/generate_onboarding.py $ARGUMENTS`
  3. **Saved:** `ONBOARDING.md`

  ## Examples
package/commands/paper.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: paper
  description: Draft mechanical paper sections (setup, results, ablation, hyperparameters) from experiment logs. LaTeX and markdown output.
- disable-model-invocation: true
  argument-hint: "[--sections setup,results,ablation] [--format latex|markdown]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Draft paper sections directly from experiment data.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -21,7 +20,7 @@ Draft paper sections directly from experiment data.

  3. **Run paper drafting:**
  ```bash
- python scripts/draft_paper_sections.py $ARGUMENTS
+ uv run python scripts/draft_paper_sections.py $ARGUMENTS
  ```

  4. **Report results:**
package/commands/plan.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: plan
  description: Research planning assistant — design a strategic experiment campaign with budget-aware ROI allocation.
- disable-model-invocation: true
  argument-hint: "[--budget 20] [--goal \"maximize F1 for production\"]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Design the next N experiments strategically, not randomly. Allocates budget by expected ROI.

  ## Steps
- 1. `source .venv/bin/activate`
- 2. `python scripts/research_planner.py $ARGUMENTS`
+ 1. `uv sync`
+ 2. `uv run python scripts/research_planner.py $ARGUMENTS`
  3. **Saved:** `experiments/plans/`

  ## How it works
package/commands/poster.md CHANGED
@@ -1,9 +1,8 @@
  ---
  name: poster
  description: Generate a single-page HTML research poster summarizing the experiment campaign — best result, trajectory, key findings, and methodology. Adapted from posterskill's self-contained HTML architecture.
- disable-model-invocation: true
  argument-hint: "[title override]"
- allowed-tools: Read, Write, Edit, Bash(python scripts/*:*, source .venv/bin/activate:*, mkdir:*, open:*), Grep, Glob
+ allowed-tools: Read, Write, Edit, Bash(uv run python scripts/*:*, uv sync:*, mkdir:*, open:*), Grep, Glob
  ---

  Generate a research poster summarizing the experiment campaign as a single self-contained HTML file. Adapted from [posterskill](https://github.com/ethanweber/posterskill)'s architecture — no build step, works when opened as `file://`.
@@ -16,8 +15,8 @@ Read the experiment history and project context:

  ```bash
  cat config.yaml
- source .venv/bin/activate && python scripts/generate_brief.py
- source .venv/bin/activate && python scripts/show_metrics.py --last 20
+ uv run python scripts/generate_brief.py
+ uv run python scripts/show_metrics.py --last 20
  cat experiment_state.yaml 2>/dev/null || true
  cat RESEARCH_PLAN.md 2>/dev/null || true
  ```
package/commands/postmortem.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: postmortem
  description: Failure postmortem — diagnose why experiments stopped improving and get actionable next steps.
- disable-model-invocation: true
  argument-hint: "[--window 10] [--auto-trigger 5]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  When experiments stop improving, find out why. Diagnoses search space exhaustion, config errors, data issues, metric ceilings, and noise floors.

  ## Steps
- 1. `source .venv/bin/activate`
- 2. `python scripts/failure_postmortem.py $ARGUMENTS`
+ 1. `uv sync`
+ 2. `uv run python scripts/failure_postmortem.py $ARGUMENTS`
  3. **Saved:** `experiments/postmortems/`

  ## Diagnosis categories
package/commands/preflight.md CHANGED
@@ -1,30 +1,29 @@
  ---
  name: preflight
  description: Pre-flight resource check — estimates VRAM, RAM, and disk requirements before running ML training. Compares against available system resources and issues PASS/WARN/FAIL verdict. Use before training to catch OOM errors before they happen.
- disable-model-invocation: true
  argument-hint: "[--model-type torch] [--params 10M] [--batch-size 32]"
- allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*, nvidia-smi:*), Grep, Glob
+ allowed-tools: Read, Bash(uv run python scripts/*:*, uv sync:*, nvidia-smi:*), Grep, Glob
  ---

  Check whether the current system has enough resources to run the planned experiment.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Run preflight check:**

  If `$ARGUMENTS` is empty (auto-detect from config.yaml):
  ```bash
- python scripts/preflight.py
+ uv run python scripts/preflight.py
  ```

  If `$ARGUMENTS` contains flags:
  ```bash
- python scripts/preflight.py $ARGUMENTS
+ uv run python scripts/preflight.py $ARGUMENTS
  ```

  3. **Interpret the verdict:**
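Instantiating the flag form with the argument-hint's own example values (illustrative, not a recommendation):

```bash
# Auto-detect estimate inputs from config.yaml:
uv run python scripts/preflight.py

# Or override them explicitly before a planned run:
uv run python scripts/preflight.py --model-type torch --params 10M --batch-size 32
```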
package/commands/present.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: present
  description: Presentation figure generation — training curves, comparison charts, ablation tables, Pareto plots, sensitivity heatmaps.
- disable-model-invocation: true
  argument-hint: "[--figures training,comparison] [--style light|dark|poster]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Generate presentation-ready figure specifications from experiment data in seconds.

  ## Steps
- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/generate_figures.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/generate_figures.py $ARGUMENTS`
  3. **Figure types:** training, comparison, ablation, pareto, sensitivity
  4. **Styles:** light (papers), dark (demos), poster (large fonts)
  5. **Saved output:** `paper/figures/`
package/commands/profile.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: profile
  description: Profile a training run — timing breakdown, memory usage, throughput, bottleneck detection with actionable recommendations.
- disable-model-invocation: true
  argument-hint: "[exp-id] [--seed 42]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Profile a training run to identify performance bottlenecks.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -21,7 +20,7 @@ Profile a training run to identify performance bottlenecks.

  3. **Run profiling:**
  ```bash
- python scripts/profile_training.py $ARGUMENTS
+ uv run python scripts/profile_training.py $ARGUMENTS
  ```

  4. **Report results:**
package/commands/prune.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: prune
  description: Weight pruning — measure accuracy at different sparsity levels, find the knee point, produce a smaller/faster model.
- disable-model-invocation: true
  argument-hint: "<exp-id> [--sparsity 0.5,0.75,0.9] [--method magnitude|structured|lottery]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,8 +9,8 @@ Remove redundant weights for faster inference and smaller models.

  ## Steps

- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/model_pruning.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/model_pruning.py $ARGUMENTS`
  3. **Methods:** magnitude (zero small weights), structured (remove neurons), lottery (iterative with rewind)
  4. **For tree models:** progressively reduces n_estimators
  5. **Report:** sparsity sweep table, knee point, recommended sparsity
package/commands/quantize.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: quantize
  description: Post-training quantization — FP32→INT8/FP16, measure accuracy loss, 2-4x speedup with <0.5% accuracy loss.
- disable-model-invocation: true
  argument-hint: "<exp-id> [--precision int8|fp16|dynamic]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,8 +9,8 @@ Quantize for production. Lowest-effort optimization: 2-4x speedup, 2-4x memory r

  ## Steps

- 1. **Activate environment:** `source .venv/bin/activate`
- 2. **Run:** `python scripts/model_quantization.py $ARGUMENTS`
+ 1. **Sync environment:** `uv sync`
+ 2. **Run:** `uv run python scripts/model_quantization.py $ARGUMENTS`
  3. **Precision levels:** FP32 (baseline), FP16 (GPU), INT8 dynamic (simplest), INT8 static (best accuracy)
  4. **Report:** precision comparison table, recommended level, QAT suggestion if needed
  5. **Saved output:** `experiments/quantization/<exp-id>-quantization.yaml`
package/commands/queue.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: queue
  description: Queue experiments for batch execution with priority ordering and dependency chains. Load the queue, walk away, read the summary.
- disable-model-invocation: true
  argument-hint: "<add|list|run|pause|clear> [description] [--priority high] [--after q-001]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ Manage the experiment queue for unattended batch execution.

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -24,7 +23,7 @@ Manage the experiment queue for unattended batch execution.

  3. **Run queue manager:**
  ```bash
- python scripts/experiment_queue.py $ARGUMENTS
+ uv run python scripts/experiment_queue.py $ARGUMENTS
  ```

  4. **Report results by action:**
package/commands/registry.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: registry
  description: Model registry — track, promote, and govern the model lifecycle from candidate to production.
- disable-model-invocation: true
  argument-hint: "[list|register|promote|demote|archive|history] [exp-id] [stage]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -9,8 +8,8 @@ allowed-tools: Read, Bash(*), Grep, Glob
  Track which model is production, staging, candidate, or archived. Promotion requires passing gates.

  ## Steps
- 1. `source .venv/bin/activate`
- 2. `python scripts/model_lifecycle.py $ARGUMENTS`
+ 1. `uv sync`
+ 2. `uv run python scripts/model_lifecycle.py $ARGUMENTS`
  3. **Registry:** `experiments/registry.yaml`

  ## Promotion gates
package/commands/regress.md CHANGED
@@ -1,7 +1,6 @@
  ---
  name: regress
  description: Performance regression gate — re-run best experiment after code/dependency changes and verify metrics haven't degraded.
- disable-model-invocation: true
  argument-hint: "[--tolerance 0.01] [--against exp-id] [--quick]"
  allowed-tools: Read, Bash(*), Grep, Glob
  ---
@@ -10,9 +9,9 @@ CI for your model. After any change to code, dependencies, or data, verify metri

  ## Steps

- 1. **Activate environment:**
+ 1. **Sync environment:**
  ```bash
- source .venv/bin/activate
+ uv sync
  ```

  2. **Parse arguments from `$ARGUMENTS`:**
@@ -24,7 +23,7 @@ CI for your model. After any change to code, dependencies, or data, verify metri

  3. **Run regression gate:**
  ```bash
- python scripts/regression_gate.py $ARGUMENTS
+ uv run python scripts/regression_gate.py $ARGUMENTS
  ```

  4. **Report results:**