selftune 0.2.9 → 0.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. package/README.md +35 -35
  2. package/apps/local-dashboard/dist/assets/index-BZVLv70T.js +16 -0
  3. package/apps/local-dashboard/dist/assets/{vendor-react-BQH_6WrG.js → vendor-react-BXP54cYo.js} +4 -4
  4. package/apps/local-dashboard/dist/assets/{vendor-table-dK1QMLq9.js → vendor-table-DTF_SXoy.js} +1 -1
  5. package/apps/local-dashboard/dist/assets/{vendor-ui-CO2mrx6e.js → vendor-ui-CWU0d1wd.js} +66 -66
  6. package/apps/local-dashboard/dist/index.html +15 -15
  7. package/bin/selftune.cjs +1 -1
  8. package/cli/selftune/activation-rules.ts +1 -0
  9. package/cli/selftune/alpha-upload/build-payloads.ts +18 -2
  10. package/cli/selftune/alpha-upload/stage-canonical.ts +94 -0
  11. package/cli/selftune/auth/device-code.ts +32 -0
  12. package/cli/selftune/auto-update.ts +12 -0
  13. package/cli/selftune/badge/badge.ts +1 -0
  14. package/cli/selftune/canonical-export.ts +5 -0
  15. package/cli/selftune/claude-agents.ts +154 -0
  16. package/cli/selftune/contribute/bundle.ts +1 -0
  17. package/cli/selftune/contribute/contribute.ts +1 -0
  18. package/cli/selftune/cron/setup.ts +2 -2
  19. package/cli/selftune/dashboard-server.ts +1 -0
  20. package/cli/selftune/eval/hooks-to-evals.ts +1 -0
  21. package/cli/selftune/eval/import-skillsbench.ts +1 -0
  22. package/cli/selftune/eval/synthetic-evals.ts +2 -3
  23. package/cli/selftune/eval/unit-test.ts +1 -0
  24. package/cli/selftune/evolution/deploy-proposal.ts +1 -0
  25. package/cli/selftune/evolution/evolve-body.ts +93 -6
  26. package/cli/selftune/evolution/evolve.ts +0 -1
  27. package/cli/selftune/evolution/propose-body.ts +3 -2
  28. package/cli/selftune/evolution/propose-routing.ts +3 -2
  29. package/cli/selftune/evolution/refine-body.ts +3 -2
  30. package/cli/selftune/export.ts +1 -0
  31. package/cli/selftune/grading/grade-session.ts +8 -0
  32. package/cli/selftune/hooks/auto-activate.ts +1 -0
  33. package/cli/selftune/hooks/evolution-guard.ts +1 -1
  34. package/cli/selftune/hooks/prompt-log.ts +1 -0
  35. package/cli/selftune/hooks/session-stop.ts +34 -40
  36. package/cli/selftune/hooks/skill-change-guard.ts +1 -0
  37. package/cli/selftune/hooks/skill-eval.ts +1 -1
  38. package/cli/selftune/index.ts +23 -14
  39. package/cli/selftune/ingestors/claude-replay.ts +1 -0
  40. package/cli/selftune/ingestors/codex-rollout.ts +1 -0
  41. package/cli/selftune/ingestors/codex-wrapper.ts +1 -0
  42. package/cli/selftune/ingestors/openclaw-ingest.ts +1 -0
  43. package/cli/selftune/ingestors/opencode-ingest.ts +1 -0
  44. package/cli/selftune/init.ts +121 -29
  45. package/cli/selftune/localdb/db.ts +1 -0
  46. package/cli/selftune/localdb/direct-write.ts +39 -0
  47. package/cli/selftune/localdb/materialize.ts +2 -0
  48. package/cli/selftune/localdb/queries.ts +53 -0
  49. package/cli/selftune/localdb/schema.ts +28 -0
  50. package/cli/selftune/normalization.ts +1 -0
  51. package/cli/selftune/observability.ts +1 -0
  52. package/cli/selftune/repair/skill-usage.ts +1 -0
  53. package/cli/selftune/routes/orchestrate-runs.ts +1 -0
  54. package/cli/selftune/routes/overview.ts +1 -0
  55. package/cli/selftune/routes/skill-report.ts +1 -0
  56. package/cli/selftune/sync.ts +30 -1
  57. package/cli/selftune/uninstall.ts +412 -0
  58. package/cli/selftune/utils/canonical-log.ts +2 -0
  59. package/cli/selftune/utils/jsonl.ts +1 -0
  60. package/cli/selftune/utils/llm-call.ts +131 -3
  61. package/cli/selftune/utils/skill-log.ts +1 -0
  62. package/cli/selftune/utils/transcript.ts +1 -0
  63. package/cli/selftune/utils/trigger-check.ts +1 -1
  64. package/cli/selftune/workflows/skill-md-writer.ts +5 -5
  65. package/cli/selftune/workflows/workflows.ts +1 -0
  66. package/package.json +37 -33
  67. package/packages/telemetry-contract/fixtures/golden.test.ts +1 -0
  68. package/packages/telemetry-contract/package.json +1 -1
  69. package/packages/telemetry-contract/src/schemas.ts +1 -0
  70. package/packages/telemetry-contract/tests/compatibility.test.ts +1 -0
  71. package/packages/ui/README.md +35 -34
  72. package/packages/ui/package.json +3 -3
  73. package/packages/ui/src/components/ActivityTimeline.tsx +49 -42
  74. package/packages/ui/src/components/EvidenceViewer.tsx +306 -182
  75. package/packages/ui/src/components/EvolutionTimeline.tsx +83 -72
  76. package/packages/ui/src/components/InfoTip.tsx +4 -3
  77. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +60 -53
  78. package/packages/ui/src/components/section-cards.tsx +19 -24
  79. package/packages/ui/src/components/skill-health-grid.tsx +213 -193
  80. package/packages/ui/src/lib/constants.tsx +1 -0
  81. package/packages/ui/src/primitives/badge.tsx +12 -15
  82. package/packages/ui/src/primitives/button.tsx +7 -7
  83. package/packages/ui/src/primitives/card.tsx +15 -26
  84. package/packages/ui/src/primitives/checkbox.tsx +7 -8
  85. package/packages/ui/src/primitives/collapsible.tsx +5 -5
  86. package/packages/ui/src/primitives/dropdown-menu.tsx +45 -55
  87. package/packages/ui/src/primitives/label.tsx +6 -6
  88. package/packages/ui/src/primitives/select.tsx +28 -37
  89. package/packages/ui/src/primitives/table.tsx +17 -44
  90. package/packages/ui/src/primitives/tabs.tsx +14 -21
  91. package/packages/ui/src/primitives/tooltip.tsx +10 -22
  92. package/skill/SKILL.md +70 -57
  93. package/skill/Workflows/AlphaUpload.md +4 -4
  94. package/skill/Workflows/AutoActivation.md +11 -6
  95. package/skill/Workflows/Badge.md +22 -16
  96. package/skill/Workflows/Baseline.md +34 -36
  97. package/skill/Workflows/Composability.md +16 -11
  98. package/skill/Workflows/Contribute.md +26 -21
  99. package/skill/Workflows/Cron.md +23 -22
  100. package/skill/Workflows/Dashboard.md +32 -27
  101. package/skill/Workflows/Doctor.md +33 -27
  102. package/skill/Workflows/Evals.md +48 -47
  103. package/skill/Workflows/EvolutionMemory.md +31 -21
  104. package/skill/Workflows/Evolve.md +84 -82
  105. package/skill/Workflows/EvolveBody.md +58 -47
  106. package/skill/Workflows/Grade.md +16 -13
  107. package/skill/Workflows/ImportSkillsBench.md +9 -6
  108. package/skill/Workflows/Ingest.md +36 -21
  109. package/skill/Workflows/Initialize.md +108 -40
  110. package/skill/Workflows/Orchestrate.md +22 -16
  111. package/skill/Workflows/Replay.md +12 -7
  112. package/skill/Workflows/Rollback.md +13 -6
  113. package/skill/Workflows/Schedule.md +6 -6
  114. package/skill/Workflows/Sync.md +18 -11
  115. package/skill/Workflows/UnitTest.md +28 -17
  116. package/skill/Workflows/Watch.md +28 -21
  117. package/skill/agents/diagnosis-analyst.md +11 -0
  118. package/skill/agents/evolution-reviewer.md +15 -1
  119. package/skill/agents/integration-guide.md +10 -0
  120. package/skill/agents/pattern-analyst.md +12 -1
  121. package/skill/references/grading-methodology.md +23 -24
  122. package/skill/references/interactive-config.md +7 -7
  123. package/skill/references/invocation-taxonomy.md +22 -20
  124. package/skill/references/logs.md +14 -6
  125. package/skill/references/setup-patterns.md +4 -2
  126. package/.claude/agents/diagnosis-analyst.md +0 -156
  127. package/.claude/agents/evolution-reviewer.md +0 -180
  128. package/.claude/agents/integration-guide.md +0 -212
  129. package/.claude/agents/pattern-analyst.md +0 -160
  130. package/apps/local-dashboard/dist/assets/index-C4UYGWKr.js +0 -15
@@ -7,6 +7,7 @@ natural-language queries without breaking existing triggers.
7
7
  ## When to Invoke
8
8
 
9
9
  Invoke this workflow when the user requests any of the following:
10
+
10
11
  - Improving or evolving a skill's trigger coverage
11
12
  - Fixing undertriggering or missed queries for a skill
12
13
  - Optimizing a skill description based on usage data
@@ -20,27 +21,27 @@ selftune evolve --skill <name> --skill-path <path> [options]
20
21
 
21
22
  ## Options
22
23
 
23
- | Flag | Description | Default |
24
- |------|-------------|---------|
25
- | `--skill <name>` | Skill name | Required |
26
- | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
27
- | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
28
- | `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
29
- | `--dry-run` | Propose and validate without deploying | Off |
30
- | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
31
- | `--max-iterations <n>` | Maximum retry iterations | 3 |
32
- | `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
33
- | `--pareto` | Generate multiple candidates per iteration | Off |
34
- | `--candidates <n>` | Number of candidates per iteration (with `--pareto`) | 3 |
35
- | `--token-efficiency` | Optimize for token efficiency in proposals | Off |
36
- | `--with-baseline` | Include a no-skill baseline comparison | Off |
37
- | `--cheap-loop` | Use cheap models for loop, expensive for final gate | On |
38
- | `--full-model` | Use full-cost model throughout (disables cheap-loop) | Off |
39
- | `--verbose` | Print detailed progress during evolution | Off |
40
- | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
41
- | `--proposal-model <model>` | Model for proposal generation LLM calls | None |
42
- | `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
43
- | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
24
+ | Flag | Description | Default |
25
+ | ---------------------------- | ----------------------------------------------------------------------- | ------------------------------ |
26
+ | `--skill <name>` | Skill name | Required |
27
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
28
+ | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
29
+ | `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
30
+ | `--dry-run` | Propose and validate without deploying | Off |
31
+ | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
32
+ | `--max-iterations <n>` | Maximum retry iterations | 3 |
33
+ | `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
34
+ | `--pareto` | Generate multiple candidates per iteration | Off |
35
+ | `--candidates <n>` | Number of candidates per iteration (with `--pareto`) | 3 |
36
+ | `--token-efficiency` | Optimize for token efficiency in proposals | Off |
37
+ | `--with-baseline` | Include a no-skill baseline comparison | Off |
38
+ | `--cheap-loop` | Use cheap models for loop, expensive for final gate | On |
39
+ | `--full-model` | Use full-cost model throughout (disables cheap-loop) | Off |
40
+ | `--verbose` | Print detailed progress during evolution | Off |
41
+ | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
42
+ | `--proposal-model <model>` | Model for proposal generation LLM calls | None |
43
+ | `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
44
+ | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
44
45
 
45
46
  ## Output Format
46
47
 
@@ -54,7 +55,7 @@ See `references/logs.md` for the audit log schema.
54
55
  "proposal_id": "evolve-pptx-1709125200000",
55
56
  "skill_name": "pptx",
56
57
  "iteration": 1,
57
- "original_pass_rate": 0.70,
58
+ "original_pass_rate": 0.7,
58
59
  "proposed_pass_rate": 0.92,
59
60
  "regression_count": 0,
60
61
  "confidence": 0.85,
@@ -67,11 +68,11 @@ See `references/logs.md` for the audit log schema.
67
68
 
68
69
  The evolution process writes multiple audit entries:
69
70
 
70
- | Action | When | Key details |
71
- |--------|------|-------------|
72
- | `created` | Proposal generated | `details` contains `original_description:` prefix |
73
- | `validated` | Proposal tested against eval set | `eval_snapshot` with before/after pass rates |
74
- | `deployed` | Updated SKILL.md written to disk | `eval_snapshot` with final rates |
71
+ | Action | When | Key details |
72
+ | ----------- | -------------------------------- | ------------------------------------------------- |
73
+ | `created` | Proposal generated | `details` contains `original_description:` prefix |
74
+ | `validated` | Proposal tested against eval set | `eval_snapshot` with before/after pass rates |
75
+ | `deployed` | Updated SKILL.md written to disk | `eval_snapshot` with final rates |
75
76
 
76
77
  ## Parsing Instructions
77
78
 
@@ -97,51 +98,45 @@ The evolution process writes multiple audit entries:
97
98
 
98
99
  Before running the evolve command, use the `AskUserQuestion` tool to present structured configuration options. If the user responds with "use defaults" or similar shorthand, skip to step 1 using the recommended defaults. If the user cancels, stop and do not continue.
99
100
 
100
- Use `AskUserQuestion` with these questions (max 4 per call — split if needed):
101
-
102
- **Call 1:**
103
-
104
- ```json
105
- {
106
- "questions": [
107
- {
108
- "question": "Execution Mode",
109
- "options": ["Dry run — preview without deploying (recommended for first run)", "Live validate and deploy if improved"]
110
- },
111
- {
112
- "question": "Model Tier (see SKILL.md reference)",
113
- "options": ["Fast (haiku)cheapest, ~2s/call (recommended with cheap-loop)", "Balanced (sonnet) good quality, ~5s/call", "Best (opus) — highest quality, ~10s/call"]
114
- },
115
- {
116
- "question": "Cost Optimization",
117
- "options": ["Cheap loop haiku for iteration, sonnet for final gate (recommended)", "Single model — use one model throughout"]
118
- },
119
- {
120
- "question": "Advanced Options",
121
- "options": ["Defaults (0.6 confidence, 3 iterations, single candidate) (recommended)", "Stricter (0.7 confidence, 5 iterations)", "Pareto mode (multiple candidates per iteration)"]
122
- }
123
- ]
124
- }
125
- ```
126
-
127
- If `AskUserQuestion` is not available, fall back to presenting these as inline numbered options.
101
+ Ask one `AskUserQuestion` at a time in this order:
102
+
103
+ 1. `Execution Mode`
104
+ Options:
105
+ - `Dry run — preview without deploying (recommended for first run)`
106
+ - `Live — validate and deploy if improved`
107
+ 2. `Model Tier (see SKILL.md reference)`
108
+ Options:
109
+ - `Fast (haiku) — cheapest, ~2s/call (recommended with cheap-loop)`
110
+ - `Balanced (sonnet) — good quality, ~5s/call`
111
+ - `Best (opus) — highest quality, ~10s/call`
112
+ 3. `Cost Optimization`
113
+ Options:
114
+ - `Cheap loop — haiku for iteration, sonnet for final gate (recommended)`
115
+ - `Single model — use one model throughout`
116
+ 4. `Advanced Options`
117
+ Options:
118
+ - `Defaults (0.6 confidence, 3 iterations, single candidate) (recommended)`
119
+ - `Stricter (0.7 confidence, 5 iterations)`
120
+ - `Pareto mode (multiple candidates per iteration)`
121
+
122
+ If `AskUserQuestion` is not available or Claude does not invoke it, fall back to presenting the same choices as inline numbered options.
128
123
 
129
124
  If the user cancels, stop -- do not proceed with defaults. If the user selects "use defaults", skip to step 1 with recommended defaults.
130
125
 
131
126
  After the user responds, parse their selections and map each choice to the corresponding CLI flags:
132
127
 
133
- | Selection | CLI Flag |
134
- |-----------|----------|
135
- | 1a (dry run) | `--dry-run` |
136
- | 1b (live) | _(no flag)_ |
137
- | 2a (haiku) | `--validation-model haiku` |
138
- | 2b (sonnet) | `--validation-model sonnet` |
139
- | 2c (opus) | `--validation-model opus` |
140
- | 3a (cheap loop) | `--cheap-loop` |
141
- | 3b (single model) | _(no flag)_ |
142
- | Custom confidence | `--confidence <value>` |
143
- | Custom iterations | `--max-iterations <value>` |
144
- | 6b (pareto) | `--pareto` |
128
+ | Selection | CLI Flag |
129
+ | ----------------- | --------------------------- |
130
+ | 1a (dry run) | `--dry-run` |
131
+ | 1b (live) | _(no flag)_ |
132
+ | 2a (haiku) | `--validation-model haiku` |
133
+ | 2b (sonnet) | `--validation-model sonnet` |
134
+ | 2c (opus) | `--validation-model opus` |
135
+ | 3a (cheap loop) | `--cheap-loop` |
136
+ | 3b (single model) | _(no flag)_ |
137
+ | Custom confidence | `--confidence <value>` |
138
+ | Custom iterations | `--max-iterations <value>` |
139
+ | 6b (pareto) | `--pareto` |
145
140
 
146
141
  Show a confirmation summary to the user:
147
142
 
@@ -161,6 +156,7 @@ Build the CLI command string with all selected flags and continue to step 1.
161
156
  ### 1. Read Evolution Context
162
157
 
163
158
  Before running, read `~/.selftune/memory/context.md` for session context:
159
+
164
160
  - Active evolutions and their current status
165
161
  - Known issues from previous runs
166
162
  - Last update timestamp
@@ -197,6 +193,7 @@ evolution cannot reliably measure improvement.
197
193
  ### 4. Extract Failure Patterns
198
194
 
199
195
  The command groups missed queries by invocation type:
196
+
200
197
  - Missed explicit: description is broken (rare, high priority)
201
198
  - Missed implicit: description is too narrow (common, evolve target)
202
199
  - Missed contextual: description lacks domain vocabulary (evolve target)
@@ -227,6 +224,7 @@ For body evolution (`evolve body`), only the size constraint applies.
227
224
 
228
225
  An LLM generates a candidate description that would catch the missed
229
226
  queries. The candidate:
227
+
230
228
  - Preserves existing trigger phrases that work
231
229
  - Adds new phrases covering missed patterns
232
230
  - Maintains the description's structure and tone
@@ -234,6 +232,7 @@ queries. The candidate:
234
232
  ### 6. Validate Against Eval Set
235
233
 
236
234
  The candidate is tested against the full eval set:
235
+
237
236
  - Must improve overall pass rate
238
237
  - Must not regress more than 5% on previously-passing entries
239
238
  - Must exceed the `--confidence` threshold
@@ -246,14 +245,14 @@ with adjusted proposals.
246
245
  When summarizing an evolution run, include these aggregate metrics rather
247
246
  than only saying "passed" or "failed":
248
247
 
249
- | Metric | Meaning |
250
- |--------|---------|
251
- | `original_pass_rate` | Baseline pass rate before the proposal |
252
- | `proposed_pass_rate` | Pass rate after applying the proposal |
253
- | `regression_count` | Eval entries that passed before and failed after |
254
- | `net_change` | Total passes gained minus regressions introduced |
255
- | `iteration` / `iterations_used` | Which retry produced the current candidate |
256
- | `baseline_lift` | Additional lift over the no-skill baseline when `--with-baseline` is enabled |
248
+ | Metric | Meaning |
249
+ | ------------------------------- | ---------------------------------------------------------------------------- |
250
+ | `original_pass_rate` | Baseline pass rate before the proposal |
251
+ | `proposed_pass_rate` | Pass rate after applying the proposal |
252
+ | `regression_count` | Eval entries that passed before and failed after |
253
+ | `net_change` | Total passes gained minus regressions introduced |
254
+ | `iteration` / `iterations_used` | Which retry produced the current candidate |
255
+ | `baseline_lift` | Additional lift over the no-skill baseline when `--with-baseline` is enabled |
257
256
 
258
257
  These metrics explain whether the proposal is genuinely better, merely
259
258
  different, or too risky to deploy.
@@ -264,6 +263,7 @@ If `--dry-run`, the proposal is printed but not deployed. The audit log
264
263
  still records `created` and `validated` entries for review.
265
264
 
266
265
  If deploying:
266
+
267
267
  1. The current SKILL.md is backed up to `SKILL.md.bak`
268
268
  2. The updated description is written to SKILL.md
269
269
  3. A `deployed` entry is logged to the evolution audit
@@ -271,6 +271,7 @@ If deploying:
271
271
  ### 8. Update Memory
272
272
 
273
273
  After evolution completes (deploy or dry-run), the memory writer updates:
274
+
274
275
  - `~/.selftune/memory/context.md` -- records the evolution outcome and current state
275
276
  - `~/.selftune/memory/decisions.md` -- logs the decision rationale and proposal details
276
277
 
@@ -281,13 +282,13 @@ even after a context window reset.
281
282
 
282
283
  The evolution loop stops when any of these conditions is met (priority order):
283
284
 
284
- | # | Condition | Meaning |
285
- |---|-----------|---------|
286
- | 1 | **Converged** | Pass rate >= 0.95 |
287
- | 2 | **Max iterations** | Reached `--max-iterations` limit |
288
- | 3 | **Low confidence** | Proposal confidence below `--confidence` threshold |
289
- | 4 | **Plateau** | Pass rate unchanged across 3 consecutive iterations |
290
- | 5 | **Continue** | None of the above -- keep iterating |
285
+ | # | Condition | Meaning |
286
+ | --- | ------------------ | --------------------------------------------------- |
287
+ | 1 | **Converged** | Pass rate >= 0.95 |
288
+ | 2 | **Max iterations** | Reached `--max-iterations` limit |
289
+ | 3 | **Low confidence** | Proposal confidence below `--confidence` threshold |
290
+ | 4 | **Plateau** | Pass rate unchanged across 3 consecutive iterations |
291
+ | 5 | **Continue** | None of the above -- keep iterating |
291
292
 
292
293
  ## Cheap Loop Mode
293
294
 
@@ -296,6 +297,7 @@ and only uses an expensive model (sonnet) for a final gate validation before
296
297
  deploying. This reduces cost while maintaining deployment quality.
297
298
 
298
299
  When `--cheap-loop` is set:
300
+
299
301
  - `--proposal-model` defaults to `haiku`
300
302
  - `--validation-model` defaults to `haiku`
301
303
  - `--gate-model` defaults to `sonnet`
@@ -12,21 +12,23 @@ selftune evolve body --skill <name> --skill-path <path> --target <target> [optio
12
12
 
13
13
  ## Options
14
14
 
15
- | Flag | Description | Default |
16
- |------|-------------|---------|
17
- | `--skill <name>` | Skill name | Required |
18
- | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
19
- | `--target <type>` | Evolution target: `routing` or `body` | Required |
20
- | `--teacher-agent <name>` | Agent CLI for proposal generation | Auto-detected |
21
- | `--student-agent <name>` | Agent CLI for validation | Same as teacher |
22
- | `--teacher-model <flag>` | Model flag for teacher (e.g. `opus`) | Agent default |
23
- | `--student-model <flag>` | Model flag for student (e.g. `haiku`) | Agent default |
24
- | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
25
- | `--dry-run` | Propose and validate without deploying | Off |
26
- | `--max-iterations <n>` | Maximum refinement iterations | 3 |
27
- | `--task-description <text>` | Context for the evolution goal | None |
28
- | `--validation-model <model>` | Model for trigger-check validation calls (overrides `--student-model` for validation) | None |
29
- | `--few-shot <paths>` | Comma-separated paths to example SKILL.md files | None |
15
+ | Flag | Description | Default |
16
+ | ---------------------------- | ------------------------------------------------------------------------------------- | ------------------------ |
17
+ | `--skill <name>` | Skill name | Required |
18
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
19
+ | `--target <type>` | Evolution target: `routing` or `body` | Required |
20
+ | `--teacher-agent <name>` | Agent CLI for proposal generation | Auto-detected |
21
+ | `--student-agent <name>` | Agent CLI for validation | Same as teacher |
22
+ | `--teacher-model <flag>` | Model flag for teacher | `opus` |
23
+ | `--student-model <flag>` | Model flag for student | `haiku` |
24
+ | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
25
+ | `--dry-run` | Propose and validate without deploying | Off |
26
+ | `--max-iterations <n>` | Maximum refinement iterations | 3 |
27
+ | `--task-description <text>` | Context for the evolution goal | None |
28
+ | `--validation-model <model>` | Model for trigger-check validation calls (overrides `--student-model` for validation) | None |
29
+ | `--teacher-effort <level>` | Effort level for teacher LLM: `low`, `medium`, `high`, `max` | `high` |
30
+ | `--review` | Run `evolution-reviewer` subagent as Gate 4 before deployment | Off |
31
+ | `--few-shot <paths>` | Comma-separated paths to example SKILL.md files | None |
30
32
 
31
33
  ## Evolution Targets
32
34
 
@@ -46,11 +48,12 @@ teacher generates a complete replacement, validated through 3 gates.
46
48
 
47
49
  Every proposal passes through three sequential gates:
48
50
 
49
- | Gate | Type | What it checks | Cost |
50
- |------|------|---------------|------|
51
- | **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
52
- | **Gate 2: Trigger Accuracy** | Student LLM | YES/NO trigger check per eval entry on the extracted description | Cheap |
53
- | **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
51
+ | Gate | Type | What it checks | Cost |
52
+ | ----------------------------- | ----------- | ----------------------------------------------------------------------------------------------- | -------- |
53
+ | **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
54
+ | **Gate 2: Trigger Accuracy** | Student LLM | YES/NO trigger check per eval entry on the extracted description | Cheap |
55
+ | **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
56
+ | **Gate 4: Reviewer** (opt-in) | Subagent | `evolution-reviewer` multi-turn review — reads files, checks evidence, APPROVE/REJECT verdict | Moderate |
54
57
 
55
58
  If any gate fails, the teacher receives structured feedback and generates
56
59
  a refined proposal. This repeats up to `--max-iterations` times.
@@ -62,32 +65,32 @@ a refined proposal. This repeats up to `--max-iterations` times.
62
65
  Before running evolve-body, use the `AskUserQuestion` tool to present structured configuration options.
63
66
  If the user says "use defaults" or similar, skip to step 1 with recommended defaults. If the user cancels, abort the workflow -- do not proceed with defaults.
64
67
 
65
- Use `AskUserQuestion` with these questions:
66
-
67
- ```json
68
- {
69
- "questions": [
70
- {
71
- "question": "Evolution Target",
72
- "options": ["Routing table — optimize workflow routing only (recommended)", "Full body — rewrite entire SKILL.md (more aggressive)"]
73
- },
74
- {
75
- "question": "Execution Mode",
76
- "options": ["Dry run — preview without deploying (recommended)", "Live — validate and deploy if improved"]
77
- },
78
- {
79
- "question": "Teacher Model (generates proposals)",
80
- "options": ["Balanced (sonnet) — good quality (recommended)", "Best (opus) — highest quality, slower"]
81
- },
82
- {
83
- "question": "Student Model & Iterations",
84
- "options": ["Fast (haiku) + 3 iterations (recommended)", "Balanced (sonnet) + 3 iterations", "Fast (haiku) + 5 iterations"]
85
- }
86
- ]
87
- }
88
- ```
89
-
90
- If `AskUserQuestion` is not available, fall back to presenting these as inline numbered options.
68
+ Ask one `AskUserQuestion` at a time in this order:
69
+
70
+ 1. `Evolution Target`
71
+ Options:
72
+ - `Routing table — optimize workflow routing only (recommended)`
73
+ - `Full body — rewrite entire SKILL.md (more aggressive)`
74
+ 2. `Execution Mode`
75
+ Options:
76
+ - `Dry run — preview without deploying (recommended)`
77
+ - `Live — validate and deploy if improved`
78
+ 3. `Teacher Model (generates proposals)`
79
+ Options:
80
+ - `Best (opus + high effort) — thinking-enabled, highest quality (recommended)`
81
+ - `Balanced (sonnet) — good quality, faster`
82
+ 4. `Student Model & Iterations`
83
+ Options:
84
+ - `Fast (haiku) + 3 iterations (recommended)`
85
+ - `Balanced (sonnet) + 3 iterations`
86
+ - `Fast (haiku) + 5 iterations`
87
+ 5. `Teacher Effort (thinking depth for proposal generation)`
88
+ Options:
89
+ - `High — extended thinking for better proposals (recommended)`
90
+ - `Max — maximum thinking depth (Opus 4.6 only, slower)`
91
+ - `Medium — standard reasoning`
92
+
93
+ If `AskUserQuestion` is not available or Claude does not invoke it, fall back to presenting the same choices as inline numbered options.
91
94
 
92
95
  After the user responds, show a confirmation summary:
93
96
 
@@ -95,7 +98,8 @@ After the user responds, show a confirmation summary:
95
98
  Configuration Summary:
96
99
  Target: routing
97
100
  Mode: dry-run
98
- Teacher model: sonnet
101
+ Teacher model: opus
102
+ Teacher effort: high
99
103
  Student model: haiku
100
104
  Iterations: 3
101
105
  Few-shot: none
@@ -106,6 +110,7 @@ Proceeding...
106
110
  ### 1. Parse Current Skill
107
111
 
108
112
  The command reads SKILL.md and splits it into sections using `parseSkillSections()`:
113
+
109
114
  - Frontmatter (YAML between `---` markers)
110
115
  - Title (first `# Heading`)
111
116
  - Description (text between title and first `## Section`)
@@ -125,6 +130,7 @@ pipeline. See `references/invocation-taxonomy.md`.
125
130
  ### 4. Generate Proposal (Teacher)
126
131
 
127
132
  The teacher LLM generates a proposal based on the target:
133
+
128
134
  - **routing**: Optimized `## Workflow Routing` markdown table
129
135
  - **body**: Complete SKILL.md body replacement
130
136
 
@@ -138,6 +144,7 @@ failure details and generates a refined proposal.
138
144
  ### 6. Deploy or Preview
139
145
 
140
146
  If `--dry-run`, prints the proposal without deploying. Otherwise:
147
+
141
148
  1. Creates a timestamped backup of the current SKILL.md
142
149
  2. Applies the change: `replaceSection()` for routing, `replaceBody()` for body
143
150
  3. Records audit entries
@@ -146,13 +153,17 @@ If `--dry-run`, prints the proposal without deploying. Otherwise:
146
153
  ## Common Patterns
147
154
 
148
155
  **"Evolve the routing table for the Research skill"**
156
+
149
157
  > `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing`
150
158
 
151
159
  **"Rewrite the entire skill body"**
160
+
152
161
  > `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target body --dry-run`
153
162
 
154
163
  **"Use a stronger model for generation"**
164
+
155
165
  > `selftune evolve body --skill pptx --skill-path /path/SKILL.md --target body --teacher-model opus --student-model haiku`
156
166
 
157
167
  **"Preview what would change"**
168
+
158
169
  > Always start with `--dry-run` to review the proposal before deploying.
@@ -11,13 +11,13 @@ selftune grade --skill <name> [options]
11
11
 
12
12
  ## Options
13
13
 
14
- | Flag | Description | Default |
15
- |------|-------------|---------|
16
- | `--skill <name>` | Skill name to grade | Required |
17
- | `--expectations "..."` | Explicit expectations (semicolon-separated) | Auto-derived |
18
- | `--evals-json <path>` | Pre-built eval set JSON file | None |
19
- | `--eval-id <n>` | Specific eval ID to grade from the eval set | None |
20
- | `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
14
+ | Flag | Description | Default |
15
+ | ---------------------- | ------------------------------------------- | ------------- |
16
+ | `--skill <name>` | Skill name to grade | Required |
17
+ | `--expectations "..."` | Explicit expectations (semicolon-separated) | Auto-derived |
18
+ | `--evals-json <path>` | Pre-built eval set JSON file | None |
19
+ | `--eval-id <n>` | Specific eval ID to grade from the eval set | None |
20
+ | `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
21
21
 
22
22
  ## Output Format
23
23
 
@@ -30,14 +30,10 @@ for the full schema. Key fields:
30
30
  "skill_name": "pptx",
31
31
  "transcript_path": "~/.claude/projects/.../abc123.jsonl",
32
32
  "graded_at": "2026-02-28T12:00:00Z",
33
- "expectations": [
34
- { "text": "...", "passed": true, "evidence": "..." }
35
- ],
33
+ "expectations": [{ "text": "...", "passed": true, "evidence": "..." }],
36
34
  "summary": { "passed": 2, "failed": 1, "total": 3, "pass_rate": 0.67 },
37
35
  "execution_metrics": { "tool_calls": {}, "total_tool_calls": 6, "errors_encountered": 0 },
38
- "claims": [
39
- { "claim": "...", "type": "factual", "verified": true, "evidence": "..." }
40
- ],
36
+ "claims": [{ "claim": "...", "type": "factual", "verified": true, "evidence": "..." }],
41
37
  "eval_feedback": { "suggestions": [], "overall": "..." }
42
38
  }
43
39
  ```
@@ -82,6 +78,7 @@ fields. See `references/logs.md` for the telemetry format.
82
78
  ### 2. Read the Transcript
83
79
 
84
80
  Parse the JSONL file at `transcript_path`. Extract:
81
+
85
82
  - User messages (what was asked)
86
83
  - Assistant tool calls (what the agent did)
87
84
  - Tool results (what happened)
@@ -99,6 +96,7 @@ Ensure at least one Process expectation and one Quality expectation.
99
96
 
100
97
  Search both the telemetry record and the transcript for evidence per
101
98
  expectation. Mark as:
99
+
102
100
  - **PASS** if evidence exists and supports the expectation
103
101
  - **FAIL** if evidence is absent or contradicts the expectation
104
102
 
@@ -123,6 +121,7 @@ Write the full grading result to `grading.json` in the current directory.
123
121
  ### 8. Report Results
124
122
 
125
123
  Report to the user:
124
+
126
125
  - Pass rate (e.g., "2/3 passed, 67%")
127
126
  - Failed expectations with evidence
128
127
  - Notable claims
@@ -133,19 +132,23 @@ Keep the summary concise. The full details are in `grading.json`.
133
132
  ## Common Patterns
134
133
 
135
134
  **User asks to grade a skill session**
135
+
136
136
  > Run `selftune grade --skill <name>` with default expectations. Results are
137
137
  > written to `grading.json`. Read that file and report the pass rate and any
138
138
  > failures to the user.
139
139
 
140
140
  **User provides specific expectations**
141
+
141
142
  > Run `selftune grade --skill <name> --expectations "expect1;expect2;expect3"`.
142
143
  > Parse results and report.
143
144
 
144
145
  **User wants to grade from an eval set**
146
+
145
147
  > Run `selftune grade --skill <name> --evals-json path/to/evals.json`.
146
148
  > Optionally add `--eval-id N` for a specific scenario.
147
149
 
148
150
  **Agent detection override needed**
151
+
149
152
  > The grader auto-detects the agent CLI. If detection fails or the user
150
153
  > specifies an agent, pass `--agent <name>` to override.
151
154
 
@@ -18,12 +18,12 @@ selftune eval import --dir <path> --skill <name> --output <path> [options]
18
18
 
19
19
  ## Options
20
20
 
21
- | Flag | Description | Default |
22
- |------|-------------|---------|
23
- | `--dir <path>` | Path to SkillsBench tasks directory | Required |
24
- | `--skill <name>` | Target skill to match tasks against | Required |
25
- | `--output <path>` | Output eval set JSON file | Required |
26
- | `--match-strategy <type>` | Matching strategy: `exact` or `fuzzy` | `exact` |
21
+ | Flag | Description | Default |
22
+ | ------------------------- | ------------------------------------- | -------- |
23
+ | `--dir <path>` | Path to SkillsBench tasks directory | Required |
24
+ | `--skill <name>` | Target skill to match tasks against | Required |
25
+ | `--output <path>` | Output eval set JSON file | Required |
26
+ | `--match-strategy <type>` | Matching strategy: `exact` or `fuzzy` | `exact` |
27
27
 
28
28
  ## Match Strategies
29
29
 
@@ -108,10 +108,13 @@ corpus. Use the merged set with `selftune evolve --eval-set merged-evals.json`.
108
108
  ## Common Patterns
109
109
 
110
110
  **"Import SkillsBench tasks for Research"**
111
+
111
112
  > `selftune eval import --dir /path/tasks --skill Research --output bench-evals.json`
112
113
 
113
114
  **"Use fuzzy matching for broader coverage"**
115
+
114
116
  > `selftune eval import --dir /path/tasks --skill pptx --output bench-evals.json --match-strategy fuzzy`
115
117
 
116
118
  **"Enrich my eval set with external benchmarks"**
119
+
117
120
  > Import with `eval import`, then pass the output to `evolve --eval-set`.