selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
package/skill/Workflows/Evolve.md
@@ -4,6 +4,14 @@ Improve a skill's description based on real usage signal. Analyzes failure
  patterns from eval sets and proposes description changes that catch more
  natural-language queries without breaking existing triggers.

+ ## When to Invoke
+
+ Invoke this workflow when the user requests any of the following:
+ - Improving or evolving a skill's trigger coverage
+ - Fixing undertriggering or missed queries for a skill
+ - Optimizing a skill description based on usage data
+ - Any request containing "evolve", "improve triggers", or "fix skill matching"
+
  ## Default Command

  ```bash
@@ -19,8 +27,14 @@ selftune evolve --skill <name> --skill-path <path> [options]
  | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
  | `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
  | `--dry-run` | Propose and validate without deploying | Off |
- | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.7 |
+ | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
  | `--max-iterations <n>` | Maximum retry iterations | 3 |
+ | `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
+ | `--cheap-loop` | Use cheap models for loop, expensive for final gate | Off |
+ | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
+ | `--proposal-model <model>` | Model for proposal generation LLM calls | None |
+ | `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
+ | `--sync-force` | Force a full source rescan during `--sync-first` | Off |

  ## Output Format

@@ -73,7 +87,96 @@ The evolution process writes multiple audit entries:

  ## Steps

- ### 1. Load or Generate Eval Set
+ ### 0. Pre-Flight Configuration
+
+ Before running the evolve command, present numbered configuration options to the user inline in your response, then wait for the user's answer before proceeding.
+
+ If the user responds with "use defaults", "just run it", or similar shorthand, skip to step 1 using the recommended defaults marked below.
+
+ Present the following options inline in your response:
+
+ 1. **Execution Mode**
+    - a) Dry run — preview proposal without deploying (recommended for first run)
+    - b) Live — validate and deploy if improved
+
+ 2. **Model Tier** (see SKILL.md Model Tier Reference)
+    - a) Fast (haiku) — cheapest, ~2s/call (recommended with cheap-loop)
+    - b) Balanced (sonnet) — good quality, ~5s/call
+    - c) Best (opus) — highest quality, ~10s/call
+
+ 3. **Cost Optimization**
+    - a) Cheap loop — haiku for iteration, sonnet for final gate (recommended)
+    - b) Single model — use one model throughout
+
+ 4. **Confidence Threshold:** 0.6 (default, higher = stricter)
+
+ 5. **Max Iterations:** 3 (default, more = longer but better results)
+
+ 6. **Multi-Candidate Selection**
+    - a) Single candidate — one proposal per iteration (recommended)
+    - b) Pareto mode — generate multiple candidates, pick best on frontier
+
+ Ask: "Reply with your choices (e.g., '1a, 2a, 3a, defaults for rest') or 'use defaults' for recommended settings."
+
+ After the user responds, parse their selections and map each choice to the corresponding CLI flags:
+
+ | Selection | CLI Flag |
+ |-----------|----------|
+ | 1a (dry run) | `--dry-run` |
+ | 1b (live) | _(no flag)_ |
+ | 2a (haiku) | `--validation-model haiku` |
+ | 2b (sonnet) | `--validation-model sonnet` |
+ | 2c (opus) | `--validation-model opus` |
+ | 3a (cheap loop) | `--cheap-loop` |
+ | 3b (single model) | _(no flag)_ |
+ | Custom confidence | `--confidence <value>` |
+ | Custom iterations | `--max-iterations <value>` |
+ | 6b (pareto) | `--pareto` |
+
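+ That mapping, as a minimal TypeScript sketch (flag names come from the table above; the `Selections` shape and helper are illustrative, not part of the CLI):
+
+ ```ts
+ // Map parsed pre-flight selections to evolve CLI flags.
+ interface Selections {
+   dryRun: boolean;                     // 1a vs 1b
+   model: "haiku" | "sonnet" | "opus"; // 2a-2c
+   cheapLoop: boolean;                  // 3a vs 3b
+   confidence?: number;                 // custom value for option 4
+   maxIterations?: number;              // custom value for option 5
+   pareto: boolean;                     // 6b
+ }
+
+ function buildFlags(s: Selections): string[] {
+   const flags: string[] = [];
+   if (s.dryRun) flags.push("--dry-run");
+   flags.push("--validation-model", s.model);
+   if (s.cheapLoop) flags.push("--cheap-loop");
+   if (s.confidence !== undefined) flags.push("--confidence", String(s.confidence));
+   if (s.maxIterations !== undefined) flags.push("--max-iterations", String(s.maxIterations));
+   if (s.pareto) flags.push("--pareto");
+   return flags;
+ }
+ ```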
+ Show a confirmation summary to the user:
+
+ ```
+ Configuration Summary:
+   Mode: dry-run
+   Model: haiku (cheap-loop: sonnet gate)
+   Confidence: 0.6
+   Iterations: 3
+   Pareto: off
+
+ Proceeding...
+ ```
+
+ Build the CLI command string with all selected flags and continue to step 1.
+
+ ### 1. Read Evolution Context
+
+ Before running, read `~/.selftune/memory/context.md` for session context:
+ - Active evolutions and their current status
+ - Known issues from previous runs
+ - Last update timestamp
+
+ This provides continuity across context resets. If the file doesn't exist,
+ proceed normally -- it will be created after the first evolution.
+
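+ A minimal sketch of that read, treating the missing file as the normal first-run
+ case (the helper name is illustrative, not a selftune export):
+
+ ```ts
+ import { readFile } from "node:fs/promises";
+ import { homedir } from "node:os";
+ import { join } from "node:path";
+
+ // Returns prior evolution context, or null on the first run.
+ async function readEvolutionContext(): Promise<string | null> {
+   const path = join(homedir(), ".selftune", "memory", "context.md");
+   try {
+     return await readFile(path, "utf8");
+   } catch {
+     return null; // created after the first evolution completes
+   }
+ }
+ ```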
+ The evolution-guard hook (`hooks/evolution-guard.ts`) blocks direct SKILL.md
+ edits on monitored skills during active evolution. This prevents conflicting
+ changes while the evolve process is running. The guard is automatically
+ engaged when evolve starts and released when it completes.
+
+ ### 2. Refresh Source Truth (Recommended)
+
+ If the host has accumulated significant agent activity or is known to be
+ polluted, prefer:
+
+ ```bash
+ selftune evolve --skill <name> --skill-path <path> --sync-first
+ ```
+
+ `--sync-first` runs the authoritative transcript/rollout sync before eval-set
+ generation and failure-pattern extraction. Use `--sync-force` when you need
+ to ignore markers and rescan everything.
+
+ ### 3. Load or Generate Eval Set

  If `--eval-set` is provided, use it directly. Otherwise, the command
  generates one from logs (equivalent to running `evals --skill <name>`).
@@ -81,7 +184,7 @@ generates one from logs (equivalent to running `evals --skill <name>`).
  An eval set is required for validation. Without enough telemetry data,
  evolution cannot reliably measure improvement.

- ### 2. Extract Failure Patterns
+ ### 4. Extract Failure Patterns

  The command groups missed queries by invocation type:
  - Missed explicit: description is broken (rare, high priority)
@@ -90,7 +193,7 @@ The command groups missed queries by invocation type:

  See `references/invocation-taxonomy.md` for the taxonomy.

- ### 3. Propose Description Changes
+ ### 5. Propose Description Changes

  An LLM generates a candidate description that would catch the missed
  queries. The candidate:
@@ -98,7 +201,7 @@ queries. The candidate:
  - Adds new phrases covering missed patterns
  - Maintains the description's structure and tone

- ### 4. Validate Against Eval Set
+ ### 6. Validate Against Eval Set

  The candidate is tested against the full eval set:
  - Must improve overall pass rate
@@ -108,7 +211,7 @@ The candidate is tested against the full eval set:
  If validation fails, the command retries up to `--max-iterations` times
  with adjusted proposals.

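+ The accept/reject decision, as a minimal TypeScript sketch (names are
+ illustrative, not the validate-proposal.ts API; the no-regression check is an
+ assumption based on the regression threshold mentioned under Autonomous Mode):
+
+ ```ts
+ interface EvalScores {
+   overallPassRate: number;  // fraction of eval entries passing
+   explicitPassRate: number; // pass rate on explicit-invocation entries
+ }
+
+ // Accept a candidate only if it improves overall pass rate without
+ // regressing explicit triggers; otherwise the loop retries.
+ function acceptCandidate(baseline: EvalScores, candidate: EvalScores): boolean {
+   return (
+     candidate.overallPassRate > baseline.overallPassRate &&
+     candidate.explicitPassRate >= baseline.explicitPassRate
+   );
+ }
+ ```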
- ### 5. Deploy (or Preview)
+ ### 7. Deploy (or Preview)

  If `--dry-run`, the proposal is printed but not deployed. The audit log
  still records `created` and `validated` entries for review.
@@ -118,6 +221,15 @@ If deploying:
  2. The updated description is written to SKILL.md
  3. A `deployed` entry is logged to the evolution audit

+ ### 8. Update Memory
+
+ After evolution completes (deploy or dry-run), the memory writer updates:
+ - `~/.selftune/memory/context.md` -- records the evolution outcome and current state
+ - `~/.selftune/memory/decisions.md` -- logs the decision rationale and proposal details
+
+ This ensures the next evolve, watch, or rollback workflow has full context
+ even after a context window reset.
+
  ### Stopping Criteria

  The evolution loop stops when any of these conditions is met (priority order):
@@ -130,24 +242,72 @@ The evolution loop stops when any of these conditions is met (priority order):
  | 4 | **Plateau** | Pass rate unchanged across 3 consecutive iterations |
  | 5 | **Continue** | None of the above -- keep iterating |

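+ The plateau condition (row 4), as a small TypeScript sketch (the function name
+ is illustrative):
+
+ ```ts
+ // True when the pass rate has not changed across the last 3 iterations.
+ function isPlateau(passRateHistory: number[]): boolean {
+   if (passRateHistory.length < 3) return false;
+   const recent = passRateHistory.slice(-3);
+   return recent.every((rate) => rate === recent[0]);
+ }
+ ```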
+ ## Cheap Loop Mode
+
+ The `--cheap-loop` flag runs the entire evolution loop with cheap models (haiku)
+ and only uses an expensive model (sonnet) for a final gate validation before
+ deploying. This reduces cost while maintaining deployment quality.
+
+ When `--cheap-loop` is set:
+ - `--proposal-model` defaults to `haiku`
+ - `--validation-model` defaults to `haiku`
+ - `--gate-model` defaults to `sonnet`
+
+ The gate validation is a new step between validation and deploy. It re-runs
+ `validateProposal` using the gate model. If the gate fails, the proposal is
+ not deployed.
+
+ ```bash
+ # Cheap loop with default models
+ selftune evolve --skill X --skill-path Y --cheap-loop
+
+ # Cheap loop with opus gate
+ selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus
+
+ # Manual model control without cheap-loop
+ selftune evolve --skill X --skill-path Y --proposal-model haiku --validation-model sonnet
+ ```
+
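+ The model-defaulting behavior above, sketched in TypeScript (grounded in the
+ listed defaults; the function itself is illustrative):
+
+ ```ts
+ interface ModelFlags {
+   cheapLoop?: boolean;
+   proposalModel?: string;
+   validationModel?: string;
+   gateModel?: string;
+ }
+
+ // Cheap loop iterates with haiku and reserves the gate model (sonnet by
+ // default) for the final validation before deploy.
+ function resolveModels(flags: ModelFlags) {
+   if (flags.cheapLoop) {
+     return {
+       proposal: flags.proposalModel ?? "haiku",
+       validation: flags.validationModel ?? "haiku",
+       gate: flags.gateModel ?? "sonnet",
+     };
+   }
+   return {
+     proposal: flags.proposalModel,
+     validation: flags.validationModel,
+     gate: undefined, // no separate gate step without --cheap-loop
+   };
+ }
+ ```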
  ## Common Patterns

- **"Evolve the pptx skill"**
- > Need `--skill pptx` and `--skill-path /path/to/pptx/SKILL.md`.
- > If the user hasn't specified the path, search for the SKILL.md file
- > in the workspace or ask.
+ **User asks to evolve a specific skill (e.g., "evolve the pptx skill"):**
+ Requires `--skill pptx` and `--skill-path /path/to/pptx/SKILL.md`.
+ If the user has not specified the path, search for the SKILL.md file
+ in the workspace. If not found, ask the user for the path.
+
+ **User wants a preview without deployment (e.g., "just show me what would change"):**
+ Add `--dry-run` to preview proposals without deploying.
+
+ **Evolution results are insufficient:**
+ Check the eval set quality. Missing contextual examples limit
+ what evolution can learn. Generate a richer eval set first using the Evals workflow.
+
+ **Evolution keeps failing validation:**
+ Lower `--confidence` slightly or increase `--max-iterations`.
+ Also check if the eval set has contradictory expectations.
+
+ **Agent CLI override needed:**
+ The evolve command auto-detects the installed agent CLI.
+ Use `--agent <name>` to override (claude, codex, opencode).
+
+ ## Subagent Escalation
+
+ For high-stakes evolutions, consider spawning the `evolution-reviewer` agent
+ as a subagent to review the proposal before deploying. This is especially
+ valuable when the skill has a history of regressions, the evolution touches
+ many trigger phrases, or the confidence score is near the threshold.

- **"Just show me what would change"**
- > Use `--dry-run` to preview proposals without deploying.
+ ## Autonomous Mode

- **"The evolution didn't help enough"**
- > Check the eval set quality. Missing contextual examples will limit
- > what evolution can learn. Generate a richer eval set first.
+ When called by `selftune orchestrate` (via cron or --loop), evolution runs
+ without user interaction:

- **"Evolution keeps failing validation"**
- > Lower `--confidence` slightly or increase `--max-iterations`.
- > Also check if the eval set has contradictory expectations.
+ - Pre-flight is skipped entirely — defaults are used
+ - The orchestrator selects candidate skills based on health scores
+ - Evolution uses cheap-loop mode (Haiku) by default
+ - Validation runs automatically against the eval set
+ - Deploy happens if validation passes the regression threshold
+ - Results are logged to orchestrate-runs.jsonl

- **"Which agent is being used?"**
- > The evolve command auto-detects your installed agent CLI.
- > Use `--agent <name>` to override the auto-detected agent.
+ No user confirmation is needed. The safety controls (regression threshold,
+ auto-rollback via watch, SKILL.md backup) provide the guardrails.
package/skill/Workflows/EvolveBody.md
@@ -0,0 +1,159 @@
+ # selftune Evolve Body Workflow
+
+ Evolve a skill's full body content or routing table, not just the description.
+ Uses a teacher-student model: a stronger LLM generates proposals, a cheaper
+ LLM validates them through a 3-gate pipeline.
+
+ ## Default Command
+
+ ```bash
+ selftune evolve body --skill <name> --skill-path <path> --target <target> [options]
+ ```
+
+ ## Options
+
+ | Flag | Description | Default |
+ |------|-------------|---------|
+ | `--skill <name>` | Skill name | Required |
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
+ | `--target <type>` | Evolution target: `routing_table` or `full_body` | Required |
+ | `--teacher-agent <name>` | Agent CLI for proposal generation | Auto-detected |
+ | `--student-agent <name>` | Agent CLI for validation | Same as teacher |
+ | `--teacher-model <flag>` | Model flag for teacher (e.g. `opus`) | Agent default |
+ | `--student-model <flag>` | Model flag for student (e.g. `haiku`) | Agent default |
+ | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
+ | `--dry-run` | Propose and validate without deploying | Off |
+ | `--max-iterations <n>` | Maximum refinement iterations | 3 |
+ | `--task-description <text>` | Context for the evolution goal | None |
+ | `--validation-model <model>` | Model for trigger-check validation calls (overrides `--student-model` for validation) | None |
+ | `--few-shot <paths>` | Comma-separated paths to example SKILL.md files | None |
+
+ ## Evolution Targets
+
+ ### `routing_table`
+
+ Optimizes the `## Workflow Routing` markdown table in SKILL.md. The teacher
+ LLM analyzes missed triggers and proposes new routing entries that map
+ trigger keywords to the correct workflow files.
+
+ ### `full_body`
+
+ Rewrites the entire SKILL.md body below the frontmatter. This includes
+ the description, routing table, examples, and all other sections. The
+ teacher generates a complete replacement, validated through 3 gates.
+
+ ## 3-Gate Validation Pipeline
+
+ Every proposal passes through three sequential gates:
+
+ | Gate | Type | What it checks | Cost |
+ |------|------|---------------|------|
+ | **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
+ | **Gate 2: Trigger Accuracy** | Student LLM | YES/NO trigger check per eval entry on the extracted description | Cheap |
+ | **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
+
+ If any gate fails, the teacher receives structured feedback and generates
+ a refined proposal. This repeats up to `--max-iterations` times.
+
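+ The gate sequence, as a minimal TypeScript sketch (the gate helpers are
+ illustrative stand-ins, not the evolve-body.ts exports):
+
+ ```ts
+ interface GateResult {
+   pass: boolean;
+   feedback?: string; // structured feedback returned to the teacher on failure
+ }
+
+ // Gate 1 is pure code and free; Gates 2 and 3 call the student LLM.
+ declare function checkStructure(proposal: string): GateResult;
+ declare function checkTriggerAccuracy(proposal: string): Promise<GateResult>;
+ declare function scoreQuality(proposal: string): Promise<GateResult>;
+
+ // Run the gates in order, stopping at the first failure so cheap LLM
+ // calls are only spent on structurally valid proposals.
+ async function runGates(proposal: string): Promise<GateResult> {
+   const structural = checkStructure(proposal);
+   if (!structural.pass) return structural;
+   const triggers = await checkTriggerAccuracy(proposal);
+   if (!triggers.pass) return triggers;
+   return scoreQuality(proposal);
+ }
+ ```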
+ ## Steps
+
+ ### 0. Pre-Flight Configuration
+
+ Before running evolve-body, present configuration options to the user.
+ If the user says "use defaults" or similar, skip to step 1 with recommended defaults.
+
+ Present these options:
+
+ ```
+ selftune evolve body — Pre-Flight Configuration
+
+ 1. Evolution Target
+    a) Routing table — optimize the workflow routing table only
+    b) Full body — rewrite entire SKILL.md body (more aggressive)
+
+ 2. Execution Mode
+    a) Dry run — preview proposal without deploying (recommended)
+    b) Live — validate and deploy if improved
+
+ 3. Teacher Model (generates proposals)
+    a) Balanced (sonnet) — good quality proposals (recommended)
+    b) Best (opus) — highest quality, slower and more expensive
+
+ 4. Student Model (validates proposals)
+    a) Fast (haiku) — cheap validation (recommended)
+    b) Balanced (sonnet) — higher quality validation
+
+ 5. Max Iterations: [3] (default)
+
+ 6. Few-Shot Examples: [none] (paths to example SKILL.md files for guidance)
+
+ → Reply with your choices or "use defaults" for recommended settings.
+ ```
+
+ After the user responds, show a confirmation summary:
+
+ ```
+ Configuration Summary:
+   Target: routing_table
+   Mode: dry-run
+   Teacher model: sonnet
+   Student model: haiku
+   Iterations: 3
+   Few-shot: none
+
+ Proceeding...
+ ```
+
+ ### 1. Parse Current Skill
+
+ The command reads SKILL.md and splits it into sections using `parseSkillSections()`:
+ - Frontmatter (YAML between `---` markers)
+ - Title (first `# Heading`)
+ - Description (text between title and first `## Section`)
+ - Workflow Routing (the `## Workflow Routing` section, if present)
+ - Remaining Body (everything else)
+
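+ A minimal sketch of that split (the real `parseSkillSections()` may differ in
+ edge-case handling):
+
+ ```ts
+ interface SkillSections {
+   frontmatter: string; // YAML between the leading --- markers
+   title: string;       // first "# Heading" line
+   description: string; // text between the title and the first "## Section"
+   routing?: string;    // the "## Workflow Routing" section, if present
+   body: string;        // everything from the first "## Section" onward
+ }
+
+ function parseSkillSections(md: string): SkillSections {
+   const fm = md.match(/^---\n([\s\S]*?)\n---\n/);
+   const rest = fm ? md.slice(fm[0].length) : md;
+
+   const title = rest.match(/^# .*$/m)?.[0] ?? "";
+   const afterTitle = title ? rest.slice(rest.indexOf(title) + title.length) : rest;
+
+   const firstSection = afterTitle.search(/^## /m);
+   const description =
+     (firstSection === -1 ? afterTitle : afterTitle.slice(0, firstSection)).trim();
+   const body = firstSection === -1 ? "" : afterTitle.slice(firstSection);
+
+   // Extract the routing section up to the next "## " heading, if present.
+   let routing: string | undefined;
+   const start = body.search(/^## Workflow Routing/m);
+   if (start !== -1) {
+     const tail = body.slice(start + 1); // skip one char so the heading itself can't rematch
+     const next = tail.search(/^## /m);
+     routing = next === -1 ? body.slice(start) : body.slice(start, start + 1 + next);
+   }
+
+   return { frontmatter: fm?.[1] ?? "", title, description, routing, body };
+ }
+ ```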
+ ### 2. Build Eval Set
+
+ If `--eval-set` is provided, use it directly. Otherwise, generate from logs
+ (same as `selftune eval generate --skill <name>`).
+
+ ### 3. Extract Failure Patterns
+
+ Groups missed queries by invocation type, same as the description evolution
+ pipeline. See `references/invocation-taxonomy.md`.
+
+ ### 4. Generate Proposal (Teacher)
+
+ The teacher LLM generates a proposal based on the target:
+ - **routing_table**: Optimized `## Workflow Routing` markdown table
+ - **full_body**: Complete SKILL.md body replacement
+
+ Few-shot examples from `--few-shot` paths provide structural guidance.
+
+ ### 5. Validate (3 Gates)
+
+ Each gate runs in sequence. If a gate fails, the teacher receives the
+ failure details and generates a refined proposal.
+
+ ### 6. Deploy or Preview
+
+ If `--dry-run`, prints the proposal without deploying. Otherwise:
+ 1. Creates a timestamped backup of the current SKILL.md
+ 2. Applies the change: `replaceSection()` for routing, `replaceBody()` for full_body
+ 3. Records audit entries
+ 4. Updates evolution memory
+
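+ The backup in deploy step 1, as a small sketch (the actual file-naming scheme
+ may differ):
+
+ ```ts
+ import { copyFileSync } from "node:fs";
+
+ // Copy SKILL.md to a timestamped sibling before applying the change,
+ // so rollback can restore the pre-evolution body.
+ function backupSkill(skillPath: string): string {
+   const stamp = new Date().toISOString().replace(/[:.]/g, "-");
+   const backupPath = `${skillPath}.${stamp}.bak`;
+   copyFileSync(skillPath, backupPath);
+   return backupPath;
+ }
+ ```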
+ ## Common Patterns
+
+ **"Evolve the routing table for the Research skill"**
+ > `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing_table`
+
+ **"Rewrite the entire skill body"**
+ > `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target full_body --dry-run`
+
+ **"Use a stronger model for generation"**
+ > `selftune evolve body --skill pptx --skill-path /path/SKILL.md --target full_body --teacher-model opus --student-model haiku`
+
+ **"Preview what would change"**
+ > Always start with `--dry-run` to review the proposal before deploying.
package/skill/Workflows/Grade.md
@@ -74,15 +74,14 @@ for the full schema. Key fields:

  ### 1. Find the Session

- Read `~/.claude/session_telemetry_log.jsonl` and find the most recent entry
- where `skills_triggered` contains the target skill name.
-
- Note the `transcript_path`, `tool_calls`, `errors_encountered`, and
- `session_id` fields. See `references/logs.md` for the telemetry format.
+ Read `~/.claude/session_telemetry_log.jsonl`. Find the most recent entry
+ where `skills_triggered` contains the target skill name. Extract the
+ `transcript_path`, `tool_calls`, `errors_encountered`, and `session_id`
+ fields. See `references/logs.md` for the telemetry format.

  ### 2. Read the Transcript

- Parse the JSONL file at `transcript_path`. Identify:
+ Parse the JSONL file at `transcript_path`. Extract:
  - User messages (what was asked)
  - Assistant tool calls (what the agent did)
  - Tool results (what happened)
@@ -92,16 +91,14 @@ See `references/logs.md` for transcript format variants.

  ### 3. Determine Expectations

- If the user provided `--expectations`, parse the semicolon-separated list.
- Otherwise, derive defaults. See `references/grading-methodology.md` for the
- full default expectations list.
-
- Always include at least one Process expectation and one Quality expectation.
+ If `--expectations` was provided, parse the semicolon-separated list.
+ Otherwise, derive defaults from `references/grading-methodology.md`.
+ Ensure at least one Process expectation and one Quality expectation.

  ### 4. Grade Each Expectation

- For each expectation, search both the telemetry record and the transcript
- for evidence. Mark as:
+ Search both the telemetry record and the transcript for evidence per
+ expectation. Mark as:
  - **PASS** if evidence exists and supports the expectation
  - **FAIL** if evidence is absent or contradicts the expectation

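+ One graded expectation, as an illustrative TypeScript shape (field names are
+ assumptions mirroring the PASS/FAIL rule above, not the exact grading.json schema):
+
+ ```ts
+ type Verdict = "PASS" | "FAIL";
+
+ interface GradedExpectation {
+   expectation: string;
+   verdict: Verdict;
+   // Cited evidence: transcript line numbers, tool call names, bash output.
+   evidence: string[];
+ }
+
+ function gradeExpectation(
+   expectation: string,
+   evidence: string[],
+   contradicts: boolean,
+ ): GradedExpectation {
+   // PASS only when supporting evidence exists and nothing contradicts it.
+   const verdict: Verdict = evidence.length > 0 && !contradicts ? "PASS" : "FAIL";
+   return { expectation, verdict, evidence };
+ }
+ ```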
@@ -109,22 +106,21 @@ Cite specific evidence: transcript line numbers, tool call names, bash output.

  ### 5. Extract Implicit Claims

- Pull 2-4 claims from the transcript that are not covered by the explicit
- expectations. Classify each as factual, process, or quality. Verify each
- against the transcript. See `references/grading-methodology.md` for claim
- types and examples.
+ Pull 2-4 claims from the transcript not covered by explicit expectations.
+ Classify each as factual, process, or quality. Verify each against the
+ transcript. See `references/grading-methodology.md` for claim types.

  ### 6. Flag Eval Gaps

  Review each passed expectation. If it would also pass for wrong output,
- note it in `eval_feedback.suggestions`. See `references/grading-methodology.md`
- for gap flagging criteria.
+ record it in `eval_feedback.suggestions`. See
+ `references/grading-methodology.md` for gap flagging criteria.

  ### 7. Write grading.json

  Write the full grading result to `grading.json` in the current directory.

- ### 8. Summarize
+ ### 8. Report Results

  Report to the user:
  - Pass rate (e.g., "2/3 passed, 67%")
@@ -136,17 +132,26 @@ Keep the summary concise. The full details are in `grading.json`.

  ## Common Patterns

- **"Grade my last pptx session"**
- > Find the most recent telemetry entry for `pptx`. Use default expectations.
- > Ask if the user wants custom expectations or proceed with defaults.
+ **User asks to grade a skill session**
+ > Run `selftune grade --skill <name>` with default expectations. Results are
+ > written to `grading.json`. Read that file and report the pass rate and any
+ > failures to the user.
+
+ **User provides specific expectations**
+ > Run `selftune grade --skill <name> --expectations "expect1;expect2;expect3"`.
+ > Parse results and report.
+
+ **User wants to grade from an eval set**
+ > Run `selftune grade --skill <name> --evals-json path/to/evals.json`.
+ > Optionally add `--eval-id N` for a specific scenario.

- **"Grade with these specific expectations"**
- > Pass `--expectations "expect1;expect2;expect3"` to override defaults.
+ **Agent detection override needed**
+ > The grader auto-detects the agent CLI. If detection fails or the user
+ > specifies an agent, pass `--agent <name>` to override.

- **"Grade using an eval set"**
- > Pass `--evals-json path/to/evals.json` and optionally `--eval-id N`
- > to grade a specific eval scenario.
+ ## Autonomous Mode

- **"Which agent is being used?"**
- > The grader auto-detects your installed agent CLI (claude, codex, opencode).
- > Use `--agent <name>` to override the auto-detected agent.
+ Grading runs implicitly during orchestrate as part of status computation.
+ The orchestrator reads grading results to determine which skills are
+ candidates for evolution. No explicit grade command is called — the
+ grading results from previous sessions feed into candidate selection.
package/skill/Workflows/ImportSkillsBench.md
@@ -0,0 +1,117 @@
+ # selftune Import SkillsBench Workflow
+
+ ## When to Use
+
+ When the user wants to enrich eval sets with external benchmark tasks or import SkillsBench corpora.
+
+ ## Overview
+
+ Import evaluation tasks from the SkillsBench corpus (87 real-world agent
+ benchmarks) and convert them to selftune eval entries. This enriches
+ your skill's eval set with externally validated test cases.
+
+ ## Default Command
+
+ ```bash
+ selftune eval import --dir <path> --skill <name> --output <path> [options]
+ ```
+
+ ## Options
+
+ | Flag | Description | Default |
+ |------|-------------|---------|
+ | `--dir <path>` | Path to SkillsBench tasks directory | Required |
+ | `--skill <name>` | Target skill to match tasks against | Required |
+ | `--output <path>` | Output eval set JSON file | Required |
+ | `--match-strategy <type>` | Matching strategy: `exact` or `fuzzy` | `exact` |
+
+ ## Match Strategies
+
+ ### `exact`
+
+ Matches tasks where `expected_skill` in `task.toml` exactly matches the
+ target skill name. Precise but may miss relevant tasks.
+
+ ### `fuzzy`
+
+ Uses keyword overlap between the task's category/tags and the skill name.
+ Casts a wider net but may include marginally relevant tasks. Review the
+ output and remove false matches.
+
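+ Both strategies, sketched in TypeScript (the metadata shape follows the
+ `task.toml` format below; the helper itself is illustrative):
+
+ ```ts
+ interface TaskMeta {
+   expected_skill?: string;
+   category?: string;
+   tags?: string[];
+ }
+
+ function taskMatches(task: TaskMeta, skill: string, strategy: "exact" | "fuzzy"): boolean {
+   if (strategy === "exact") {
+     // Exact: expected_skill must equal the target skill name.
+     return task.expected_skill === skill;
+   }
+   // Fuzzy: keyword overlap between category/tags and the skill name.
+   const keywords = [task.category ?? "", ...(task.tags ?? [])].join(" ").toLowerCase();
+   return skill
+     .toLowerCase()
+     .split(/[-_\s]+/)
+     .some((word) => word.length > 0 && keywords.includes(word));
+ }
+ ```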
+ ## SkillsBench Directory Structure
+
+ The importer expects this layout:
+
+ ```
+ tasks/
+ ├── task-001/
+ │   ├── instruction.md   # Task description (used as query)
+ │   └── task.toml        # Metadata (difficulty, category, tags, expected_skill)
+ ├── task-002/
+ │   ├── instruction.md
+ │   └── task.toml
+ └── ...
+ ```
+
+ ### `task.toml` Format
+
+ ```toml
+ difficulty = "medium"
+ category = "research"
+ tags = ["web-search", "analysis", "summarization"]
+ expected_skill = "Research"
+ expected_tools = ["WebSearch", "Read"]
+ ```
+
+ All fields are optional. Tasks without `task.toml` use default values.
+
+ ## Output Format
+
+ Standard selftune eval entries:
+
+ ```json
+ [
+   {
+     "id": 1,
+     "query": "Find and summarize the latest papers on transformer architectures",
+     "expected": true,
+     "invocation_type": "implicit",
+     "skill_name": "Research",
+     "source_session": null,
+     "source": "skillsbench"
+   }
+ ]
+ ```
+
+ ## Steps
+
+ ### 1. Obtain SkillsBench Corpus
+
+ Clone or download the SkillsBench repository containing the task directory.
+
+ ### 2. Import Tasks
+
+ ```bash
+ selftune eval import --dir /path/to/skillsbench/tasks --skill Research --output evals-bench.json
+ ```
+
+ ### 3. Review Output
+
+ Inspect the generated eval entries. Remove any that don't match your skill's
+ intended scope. Adjust match strategy if needed.
+
+ ### 4. Merge with Existing Evals
+
+ Combine imported entries with your existing eval set for a richer validation
+ corpus. Use the merged set with `selftune evolve --eval-set merged-evals.json`.
+
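+ A minimal merge sketch (assumes both files are JSON arrays of eval entries as
+ shown under Output Format; the file names are examples):
+
+ ```ts
+ import { readFileSync, writeFileSync } from "node:fs";
+
+ const existing = JSON.parse(readFileSync("evals.json", "utf8"));
+ const imported = JSON.parse(readFileSync("bench-evals.json", "utf8"));
+
+ // Concatenate and renumber ids so the merged set has no collisions.
+ const merged = [...existing, ...imported].map((entry, i) => ({ ...entry, id: i + 1 }));
+ writeFileSync("merged-evals.json", JSON.stringify(merged, null, 2));
+ ```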
+ ## Common Patterns
+
+ **"Import SkillsBench tasks for Research"**
+ > `selftune eval import --dir /path/tasks --skill Research --output bench-evals.json`
+
+ **"Use fuzzy matching for broader coverage"**
+ > `selftune eval import --dir /path/tasks --skill pptx --output bench-evals.json --match-strategy fuzzy`
+
+ **"Enrich my eval set with external benchmarks"**
+ > Import with `eval import`, then pass the output to `evolve --eval-set`.