selftune 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/.claude/agents/diagnosis-analyst.md +20 -10
  2. package/.claude/agents/evolution-reviewer.md +14 -1
  3. package/.claude/agents/integration-guide.md +18 -6
  4. package/.claude/agents/pattern-analyst.md +18 -5
  5. package/CHANGELOG.md +12 -4
  6. package/README.md +43 -35
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/cli/selftune/badge/badge-data.ts +1 -1
  20. package/cli/selftune/badge/badge.ts +4 -8
  21. package/cli/selftune/canonical-export.ts +183 -0
  22. package/cli/selftune/constants.ts +28 -0
  23. package/cli/selftune/contribute/contribute.ts +1 -1
  24. package/cli/selftune/cron/setup.ts +17 -17
  25. package/cli/selftune/dashboard-contract.ts +202 -0
  26. package/cli/selftune/dashboard-server.ts +653 -186
  27. package/cli/selftune/dashboard.ts +41 -176
  28. package/cli/selftune/eval/baseline.ts +5 -4
  29. package/cli/selftune/eval/composability-v2.ts +273 -0
  30. package/cli/selftune/eval/hooks-to-evals.ts +34 -15
  31. package/cli/selftune/eval/unit-test-cli.ts +1 -1
  32. package/cli/selftune/evolution/evidence.ts +26 -0
  33. package/cli/selftune/evolution/evolve-body.ts +105 -11
  34. package/cli/selftune/evolution/evolve.ts +371 -25
  35. package/cli/selftune/evolution/extract-patterns.ts +87 -29
  36. package/cli/selftune/evolution/rollback.ts +2 -2
  37. package/cli/selftune/grading/auto-grade.ts +200 -0
  38. package/cli/selftune/grading/grade-session.ts +448 -97
  39. package/cli/selftune/grading/results.ts +42 -0
  40. package/cli/selftune/hooks/prompt-log.ts +172 -2
  41. package/cli/selftune/hooks/session-stop.ts +123 -3
  42. package/cli/selftune/hooks/skill-eval.ts +119 -3
  43. package/cli/selftune/index.ts +395 -116
  44. package/cli/selftune/ingestors/claude-replay.ts +140 -114
  45. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  46. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  47. package/cli/selftune/ingestors/openclaw-ingest.ts +141 -8
  48. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  49. package/cli/selftune/init.ts +227 -14
  50. package/cli/selftune/last.ts +14 -5
  51. package/cli/selftune/localdb/db.ts +63 -0
  52. package/cli/selftune/localdb/materialize.ts +428 -0
  53. package/cli/selftune/localdb/queries.ts +376 -0
  54. package/cli/selftune/localdb/schema.ts +204 -0
  55. package/cli/selftune/monitoring/watch.ts +66 -15
  56. package/cli/selftune/normalization.ts +682 -0
  57. package/cli/selftune/observability.ts +19 -44
  58. package/cli/selftune/orchestrate.ts +1073 -0
  59. package/cli/selftune/quickstart.ts +203 -0
  60. package/cli/selftune/repair/skill-usage.ts +576 -0
  61. package/cli/selftune/schedule.ts +561 -0
  62. package/cli/selftune/status.ts +48 -26
  63. package/cli/selftune/sync.ts +627 -0
  64. package/cli/selftune/types.ts +148 -0
  65. package/cli/selftune/utils/canonical-log.ts +45 -0
  66. package/cli/selftune/utils/hooks.ts +41 -0
  67. package/cli/selftune/utils/html.ts +27 -0
  68. package/cli/selftune/utils/llm-call.ts +78 -20
  69. package/cli/selftune/utils/math.ts +10 -0
  70. package/cli/selftune/utils/query-filter.ts +139 -0
  71. package/cli/selftune/utils/skill-discovery.ts +340 -0
  72. package/cli/selftune/utils/skill-log.ts +68 -0
  73. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  74. package/cli/selftune/utils/transcript.ts +272 -26
  75. package/cli/selftune/workflows/discover.ts +254 -0
  76. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  77. package/cli/selftune/workflows/workflows.ts +188 -0
  78. package/package.json +21 -8
  79. package/packages/telemetry-contract/README.md +11 -0
  80. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  81. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  82. package/packages/telemetry-contract/index.ts +1 -0
  83. package/packages/telemetry-contract/package.json +19 -0
  84. package/packages/telemetry-contract/src/index.ts +2 -0
  85. package/packages/telemetry-contract/src/types.ts +163 -0
  86. package/packages/telemetry-contract/src/validators.ts +109 -0
  87. package/skill/SKILL.md +84 -53
  88. package/skill/Workflows/AutoActivation.md +17 -16
  89. package/skill/Workflows/Badge.md +6 -0
  90. package/skill/Workflows/Baseline.md +46 -23
  91. package/skill/Workflows/Composability.md +12 -5
  92. package/skill/Workflows/Contribute.md +17 -14
  93. package/skill/Workflows/Cron.md +56 -79
  94. package/skill/Workflows/Dashboard.md +45 -34
  95. package/skill/Workflows/Doctor.md +30 -17
  96. package/skill/Workflows/Evals.md +64 -40
  97. package/skill/Workflows/EvolutionMemory.md +2 -0
  98. package/skill/Workflows/Evolve.md +102 -47
  99. package/skill/Workflows/EvolveBody.md +6 -6
  100. package/skill/Workflows/Grade.md +36 -31
  101. package/skill/Workflows/ImportSkillsBench.md +11 -5
  102. package/skill/Workflows/Ingest.md +43 -36
  103. package/skill/Workflows/Initialize.md +44 -30
  104. package/skill/Workflows/Orchestrate.md +139 -0
  105. package/skill/Workflows/Replay.md +39 -18
  106. package/skill/Workflows/Rollback.md +3 -3
  107. package/skill/Workflows/Schedule.md +61 -0
  108. package/skill/Workflows/Sync.md +88 -0
  109. package/skill/Workflows/UnitTest.md +34 -22
  110. package/skill/Workflows/Watch.md +14 -4
  111. package/skill/Workflows/Workflows.md +129 -0
  112. package/skill/assets/activation-rules-default.json +26 -0
  113. package/skill/assets/multi-skill-settings.json +63 -0
  114. package/skill/assets/single-skill-settings.json +57 -0
  115. package/skill/references/invocation-taxonomy.md +2 -2
  116. package/skill/references/logs.md +164 -2
  117. package/skill/references/setup-patterns.md +65 -0
  118. package/skill/references/version-history.md +40 -0
  119. package/skill/settings_snippet.json +1 -1
  120. package/templates/multi-skill-settings.json +7 -7
  121. package/templates/single-skill-settings.json +6 -6
  122. package/dashboard/index.html +0 -1680
@@ -4,10 +4,19 @@ Generate eval sets from hook logs. Detects false negatives (queries that
4
4
  should have triggered a skill but did not) and annotates each entry with
5
5
  its invocation type.
6
6
 
7
+ ## When to Invoke
8
+
9
+ Invoke this workflow when the user requests any of the following:
10
+ - Generating eval sets or test data for a skill
11
+ - Checking which skills are undertriggering
12
+ - Viewing skill telemetry or usage stats
13
+ - Preparing data before running the Evolve workflow
14
+ - Any request containing "evals", "eval set", "test queries", or "skill stats"
15
+
7
16
  ## Default Command
8
17
 
9
18
  ```bash
10
- selftune evals --skill <name> [options]
19
+ selftune eval generate --skill <name> [options]
11
20
  ```
12
21
 
13
22
  ## Options
@@ -101,10 +110,10 @@ selftune evals --skill <name> [options]
101
110
  Discover which skills have telemetry data and how many queries each has.
102
111
 
103
112
  ```bash
104
- selftune evals --list-skills
113
+ selftune eval generate --list-skills
105
114
  ```
106
115
 
107
- Use this first to identify which skills have enough data for eval generation.
116
+ Run this first to identify which skills have enough data for eval generation.
108
117
 
109
118
  ### Generate Synthetic Evals (Cold Start)
110
119
 
@@ -112,7 +121,7 @@ When a skill has no telemetry data yet, use `--synthetic` to generate eval
112
121
  queries directly from the SKILL.md content via an LLM.
113
122
 
114
123
  ```bash
115
- selftune evals --skill pptx --synthetic --skill-path /path/to/skills/pptx/SKILL.md
124
+ selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/pptx/SKILL.md
116
125
  ```
117
126
 
118
127
  The command:
@@ -125,7 +134,7 @@ The command:
125
134
  Use `--model` to override the default LLM model:
126
135
 
127
136
  ```bash
128
- selftune evals --skill pptx --synthetic --skill-path ./skills/pptx/SKILL.md --model claude-sonnet-4-5-20250514
137
+ selftune eval generate --skill pptx --synthetic --skill-path ./skills/pptx/SKILL.md --model claude-sonnet-4-5-20250514
129
138
  ```
130
139
 
131
140
  ### Generate Evals (Log-Based)
@@ -135,7 +144,7 @@ Cross-reference `skill_usage_log.jsonl` (positive triggers) against
135
144
  an eval set annotated with invocation types.
136
145
 
137
146
  ```bash
138
- selftune evals --skill pptx --max 50 --out evals-pptx.json
147
+ selftune eval generate --skill pptx --max 50 --out evals-pptx.json
139
148
  ```
140
149
 
141
150
  The command:
@@ -152,40 +161,53 @@ View aggregate telemetry for a skill: average turns, tool call breakdown,
152
161
  error rates, and common bash command patterns.
153
162
 
154
163
  ```bash
155
- selftune evals --skill pptx --stats
164
+ selftune eval generate --skill pptx --stats
156
165
  ```
157
166
 
158
167
  ## Steps
159
168
 
160
169
  ### 0. Pre-Flight Configuration
161
170
 
162
- Before generating evals, present configuration options to the user.
163
- If the user says "use defaults" or similar, skip to step 1 with recommended defaults.
171
+ Before generating evals, present numbered configuration options to the user inline in your response, then wait for the user's answer before proceeding.
164
172
 
165
- For `--list-skills` or `--stats` requests, skip pre-flight entirely (read-only).
173
+ If the user responds with "use defaults", "just do it", or similar shorthand, skip to step 1 using the recommended defaults.
166
174
 
167
- Present these options:
175
+ For `--list-skills` or `--stats` requests, skip pre-flight entirely — these are read-only operations.
168
176
 
169
- ```text
170
- selftune evals — Pre-Flight Configuration
177
+ Present the following options inline in your response:
171
178
 
172
- 1. Generation Mode
173
- a) Log-based — build evals from real usage logs (recommended if logs exist)
174
- b) Synthetic — generate evals from SKILL.md via LLM (for new skills with no data)
179
+ 1. **Generation Mode**
180
+ - a) Log-based — build evals from real usage logs (recommended if logs exist)
181
+ - b) Synthetic — generate evals from SKILL.md via LLM (for new skills with no data)
175
182
 
176
- 2. Max Entries: [50] (default how many eval entries to generate)
183
+ 2. **Skill Path** (synthetic mode only)
184
+ - Provide absolute or relative path to the target SKILL.md
185
+ - Example: `./skills/pptx/SKILL.md`
177
186
 
178
- 3. Model (synthetic mode only)
179
- a) Fast (haiku) — quick generation
180
- b) Balanced (sonnet) — better query diversity (recommended)
181
- c) Best (opus) — highest quality synthetic queries
187
+ 3. **Max Entries:** 50 (default; how many eval entries to generate)
182
188
 
183
- 4. Output Path: [evals-<skill>.json] (default)
189
+ 4. **Model** (synthetic mode only)
190
+ - a) Fast (haiku) — quick generation
191
+ - b) Balanced (sonnet) — better query diversity (recommended)
192
+ - c) Best (opus) — highest quality synthetic queries
184
193
 
185
- Reply with your choices or "use defaults" for recommended settings.
186
- ```
194
+ 5. **Output Path:** `evals-<skill>.json` (default)
187
195
 
188
- After the user responds, show a confirmation summary:
196
+ Ask: "Reply with your choices or 'use defaults' for recommended settings."
197
+
198
+ After the user responds, parse their selections and map each choice to the corresponding CLI flags:
199
+
200
+ | Selection | CLI Flag |
201
+ |-----------|----------|
202
+ | 1a (log-based) | _(no flag, default)_ |
203
+ | 1b (synthetic) | `--synthetic --skill-path <path>` |
204
+ | Custom max entries | `--max <value>` |
205
+ | 4a (haiku) | `--model haiku` (resolved internally by selftune) |
206
+ | 4b (sonnet) | `--model sonnet` |
207
+ | 4c (opus) | `--model opus` |
208
+ | Custom output path | `--out <path>` |
209
+
210
+ Show a confirmation summary to the user:
189
211
 
190
212
  ```text
191
213
  Configuration Summary:
@@ -196,15 +218,17 @@ Configuration Summary:
196
218
  Proceeding...
197
219
  ```
198
220
 
221
+ Build the CLI command string with all selected flags and continue to step 1.
222
+
199
223
  ### 1. List Available Skills
200
224
 
201
- Run `selftune evals --list-skills` to see what skills have telemetry data. If the target
225
+ Run `selftune eval generate --list-skills` to see what skills have telemetry data. If the target
202
226
  skill has zero or very few queries, more sessions are needed before
203
227
  eval generation is useful.
204
228
 
205
229
  ### 2. Generate the Eval Set
206
230
 
207
- Run with `--skill <name>`. Review the output file for:
231
+ Run with `--skill <name>`. Parse the JSON output and review for:
208
232
  - Balance between positive and negative entries
209
233
  - Coverage of all three positive invocation types (explicit, implicit, contextual)
210
234
  - Reasonable negative examples (keyword overlap but wrong intent)
@@ -233,20 +257,20 @@ beyond trigger coverage.
233
257
 
234
258
  ## Common Patterns
235
259
 
236
- **"What skills are undertriggering?"**
237
- > Run `selftune evals --list-skills`, then for each skill with significant query counts,
238
- > generate evals and check for missed implicit/contextual queries.
260
+ **User asks which skills are undertriggering:**
261
+ Run `selftune eval generate --list-skills`, then for each skill with significant query counts,
262
+ generate evals and check for missed implicit/contextual queries.
239
263
 
240
- **"Generate evals for pptx"**
241
- > Run `selftune evals --skill pptx`. Review the invocation type distribution.
242
- > Feed the output to `evolve` if coverage gaps exist.
264
+ **User asks to generate evals for a specific skill:**
265
+ Run `selftune eval generate --skill <name>`. Parse the JSON output and review the invocation type distribution.
266
+ Feed the output to the Evolve workflow if coverage gaps exist.
243
267
 
244
- **"Show me skill stats"**
245
- > Run `selftune evals --skill <name> --stats` for aggregate telemetry.
268
+ **User asks for skill telemetry or stats:**
269
+ Run `selftune eval generate --skill <name> --stats` for aggregate telemetry.
246
270
 
247
- **"I have a new skill with no usage data"**
248
- > Use `selftune evals --skill <name> --synthetic --skill-path /path/to/SKILL.md`.
249
- > This generates eval queries from the skill description without needing session logs.
271
+ **User has a new skill with no usage data:**
272
+ Use `selftune eval generate --skill <name> --synthetic --skill-path /path/to/SKILL.md`.
273
+ This generates eval queries from the skill description without needing session logs.
250
274
 
251
- **"I want reproducible evals"**
252
- > Use `--seed <n>` to fix the random sampling of negative examples.
275
+ **User wants reproducible evals:**
276
+ Add `--seed <n>` to fix the random sampling of negative examples.
@@ -1,5 +1,7 @@
1
1
  # selftune Evolution Memory
2
2
 
3
+ This reference documents the evolution memory system. The agent reads these files automatically during evolve, watch, and rollback workflows for session continuity.
4
+
3
5
  Human-readable session context that survives context window resets. Provides
4
6
  continuity across evolve, watch, and rollback workflows by recording outcomes,
5
7
  decisions, and known issues in plain markdown files.
@@ -4,6 +4,14 @@ Improve a skill's description based on real usage signal. Analyzes failure
4
4
  patterns from eval sets and proposes description changes that catch more
5
5
  natural-language queries without breaking existing triggers.
6
6
 
7
+ ## When to Invoke
8
+
9
+ Invoke this workflow when the user requests any of the following:
10
+ - Improving or evolving a skill's trigger coverage
11
+ - Fixing undertriggering or missed queries for a skill
12
+ - Optimizing a skill description based on usage data
13
+ - Any request containing "evolve", "improve triggers", or "fix skill matching"
14
+
7
15
  ## Default Command
8
16
 
9
17
  ```bash
@@ -25,6 +33,8 @@ selftune evolve --skill <name> --skill-path <path> [options]
25
33
  | `--cheap-loop` | Use cheap models for loop, expensive for final gate | Off |
26
34
  | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
27
35
  | `--proposal-model <model>` | Model for proposal generation LLM calls | None |
36
+ | `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off |
37
+ | `--sync-force` | Force a full source rescan during `--sync-first` | Off |
28
38
 
29
39
  ## Output Format
30
40
 
@@ -79,41 +89,51 @@ The evolution process writes multiple audit entries:
79
89
 
80
90
  ### 0. Pre-Flight Configuration
81
91
 
82
- Before running the evolve command, present configuration options to the user.
83
- If the user says "use defaults", "just run it", or similar, skip to step 1
84
- with the recommended defaults marked below.
92
+ Before running the evolve command, present numbered configuration options to the user inline in your response, then wait for the user's answer before proceeding.
85
93
 
86
- Present these options (use AskUserQuestion or inline prompt):
94
+ If the user responds with "use defaults", "just run it", or similar shorthand, skip to step 1 using the recommended defaults marked below.
87
95
 
88
- ```
89
- selftune evolve — Pre-Flight Configuration
96
+ Present the following options inline in your response:
90
97
 
91
- 1. Execution Mode
92
- a) Dry run — preview proposal without deploying (recommended for first run)
93
- b) Live — validate and deploy if improved
98
+ 1. **Execution Mode**
99
+ - a) Dry run — preview proposal without deploying (recommended for first run)
100
+ - b) Live — validate and deploy if improved
94
101
 
95
- 2. Model Tier (see SKILL.md Model Tier Reference)
96
- a) Fast (haiku) — cheapest, ~2s/call (recommended with cheap-loop)
97
- b) Balanced (sonnet) — good quality, ~5s/call
98
- c) Best (opus) — highest quality, ~10s/call
102
+ 2. **Model Tier** (see SKILL.md Model Tier Reference)
103
+ - a) Fast (haiku) — cheapest, ~2s/call (recommended with cheap-loop)
104
+ - b) Balanced (sonnet) — good quality, ~5s/call
105
+ - c) Best (opus) — highest quality, ~10s/call
99
106
 
100
- 3. Cost Optimization
101
- a) Cheap loop — haiku for iteration, sonnet for final gate (recommended)
102
- b) Single model — use one model throughout
107
+ 3. **Cost Optimization**
108
+ - a) Cheap loop — haiku for iteration, sonnet for final gate (recommended)
109
+ - b) Single model — use one model throughout
103
110
 
104
- 4. Confidence Threshold: [0.6] (default, higher = stricter)
111
+ 4. **Confidence Threshold:** 0.6 (default, higher = stricter)
105
112
 
106
- 5. Max Iterations: [3] (default, more = longer but better results)
113
+ 5. **Max Iterations:** 3 (default, more = longer but better results)
107
114
 
108
- 6. Multi-Candidate Selection
109
- a) Single candidate — one proposal per iteration (recommended)
110
- b) Pareto mode — generate multiple candidates, pick best on frontier
115
+ 6. **Multi-Candidate Selection**
116
+ - a) Single candidate — one proposal per iteration (recommended)
117
+ - b) Pareto mode — generate multiple candidates, pick best on frontier
111
118
 
112
- Reply with your choices (e.g., "1a, 2a, 3a, defaults for rest")
113
- or "use defaults" for recommended settings.
114
- ```
119
+ Ask: "Reply with your choices (e.g., '1a, 2a, 3a, defaults for rest') or 'use defaults' for recommended settings."
120
+
121
+ After the user responds, parse their selections and map each choice to the corresponding CLI flags:
115
122
 
116
- After the user responds, show a confirmation summary:
123
+ | Selection | CLI Flag |
124
+ |-----------|----------|
125
+ | 1a (dry run) | `--dry-run` |
126
+ | 1b (live) | _(no flag)_ |
127
+ | 2a (haiku) | `--validation-model haiku` |
128
+ | 2b (sonnet) | `--validation-model sonnet` |
129
+ | 2c (opus) | `--validation-model opus` |
130
+ | 3a (cheap loop) | `--cheap-loop` |
131
+ | 3b (single model) | _(no flag)_ |
132
+ | Custom confidence | `--confidence <value>` |
133
+ | Custom iterations | `--max-iterations <value>` |
134
+ | 6b (pareto) | `--pareto` |
135
+
136
+ Show a confirmation summary to the user:
117
137
 
118
138
  ```
119
139
  Configuration Summary:
@@ -126,7 +146,7 @@ Configuration Summary:
126
146
  Proceeding...
127
147
  ```
128
148
 
129
- Then build the CLI command with the selected flags and continue to step 1.
149
+ Build the CLI command string with all selected flags and continue to step 1.
130
150
 
131
151
  ### 1. Read Evolution Context
132
152
 
@@ -143,7 +163,20 @@ edits on monitored skills during active evolution. This prevents conflicting
143
163
  changes while the evolve process is running. The guard is automatically
144
164
  engaged when evolve starts and released when it completes.
145
165
 
146
- ### 2. Load or Generate Eval Set
166
+ ### 2. Refresh Source Truth (Recommended)
167
+
168
+ If the host has accumulated significant agent activity or is known to be
169
+ polluted, prefer:
170
+
171
+ ```bash
172
+ selftune evolve --skill <name> --skill-path <path> --sync-first
173
+ ```
174
+
175
+ `--sync-first` runs the authoritative transcript/rollout sync before eval-set
176
+ generation and failure-pattern extraction. Use `--sync-force` when you need
177
+ to ignore markers and rescan everything.
178
+
179
+ ### 3. Load or Generate Eval Set
147
180
 
148
181
  If `--eval-set` is provided, use it directly. Otherwise, the command
149
182
  generates one from logs (equivalent to running `evals --skill <name>`).
@@ -151,7 +184,7 @@ generates one from logs (equivalent to running `evals --skill <name>`).
151
184
  An eval set is required for validation. Without enough telemetry data,
152
185
  evolution cannot reliably measure improvement.
153
186
 
154
- ### 3. Extract Failure Patterns
187
+ ### 4. Extract Failure Patterns
155
188
 
156
189
  The command groups missed queries by invocation type:
157
190
  - Missed explicit: description is broken (rare, high priority)
@@ -160,7 +193,7 @@ The command groups missed queries by invocation type:
160
193
 
161
194
  See `references/invocation-taxonomy.md` for the taxonomy.
162
195
 
163
- ### 4. Propose Description Changes
196
+ ### 5. Propose Description Changes
164
197
 
165
198
  An LLM generates a candidate description that would catch the missed
166
199
  queries. The candidate:
@@ -168,7 +201,7 @@ queries. The candidate:
168
201
  - Adds new phrases covering missed patterns
169
202
  - Maintains the description's structure and tone
170
203
 
171
- ### 5. Validate Against Eval Set
204
+ ### 6. Validate Against Eval Set
172
205
 
173
206
  The candidate is tested against the full eval set:
174
207
  - Must improve overall pass rate
@@ -178,7 +211,7 @@ The candidate is tested against the full eval set:
178
211
  If validation fails, the command retries up to `--max-iterations` times
179
212
  with adjusted proposals.
180
213
 
181
- ### 6. Deploy (or Preview)
214
+ ### 7. Deploy (or Preview)
182
215
 
183
216
  If `--dry-run`, the proposal is printed but not deployed. The audit log
184
217
  still records `created` and `validated` entries for review.
@@ -188,7 +221,7 @@ If deploying:
188
221
  2. The updated description is written to SKILL.md
189
222
  3. A `deployed` entry is logged to the evolution audit
190
223
 
191
- ### 7. Update Memory
224
+ ### 8. Update Memory
192
225
 
193
226
  After evolution completes (deploy or dry-run), the memory writer updates:
194
227
  - `~/.selftune/memory/context.md` -- records the evolution outcome and current state
@@ -237,22 +270,44 @@ selftune evolve --skill X --skill-path Y --proposal-model haiku --validation-mod
237
270
 
238
271
  ## Common Patterns
239
272
 
240
- **"Evolve the pptx skill"**
241
- > Need `--skill pptx` and `--skill-path /path/to/pptx/SKILL.md`.
242
- > If the user hasn't specified the path, search for the SKILL.md file
243
- > in the workspace or ask.
273
+ **User asks to evolve a specific skill (e.g., "evolve the pptx skill"):**
274
+ Requires `--skill pptx` and `--skill-path /path/to/pptx/SKILL.md`.
275
+ If the user has not specified the path, search for the SKILL.md file
276
+ in the workspace. If not found, ask the user for the path.
277
+
278
+ **User wants a preview without deployment (e.g., "just show me what would change"):**
279
+ Add `--dry-run` to preview proposals without deploying.
280
+
281
+ **Evolution results are insufficient:**
282
+ Check the eval set quality. Missing contextual examples limit
283
+ what evolution can learn. Generate a richer eval set first using the Evals workflow.
284
+
285
+ **Evolution keeps failing validation:**
286
+ Lower `--confidence` slightly or increase `--max-iterations`.
287
+ Also check if the eval set has contradictory expectations.
288
+
289
+ **Agent CLI override needed:**
290
+ The evolve command auto-detects the installed agent CLI.
291
+ Use `--agent <name>` to override (claude, codex, opencode).
292
+
293
+ ## Subagent Escalation
294
+
295
+ For high-stakes evolutions, consider spawning the `evolution-reviewer` agent
296
+ as a subagent to review the proposal before deploying. This is especially
297
+ valuable when the skill has a history of regressions, the evolution touches
298
+ many trigger phrases, or the confidence score is near the threshold.
244
299
 
245
- **"Just show me what would change"**
246
- > Use `--dry-run` to preview proposals without deploying.
300
+ ## Autonomous Mode
247
301
 
248
- **"The evolution didn't help enough"**
249
- > Check the eval set quality. Missing contextual examples will limit
250
- > what evolution can learn. Generate a richer eval set first.
302
+ When called by `selftune orchestrate` (via cron or --loop), evolution runs
303
+ without user interaction:
251
304
 
252
- **"Evolution keeps failing validation"**
253
- > Lower `--confidence` slightly or increase `--max-iterations`.
254
- > Also check if the eval set has contradictory expectations.
305
+ - Pre-flight is skipped entirely — defaults are used
306
+ - The orchestrator selects candidate skills based on health scores
307
+ - Evolution uses cheap-loop mode (Haiku) by default
308
+ - Validation runs automatically against the eval set
309
+ - Deploy happens if validation passes the regression threshold
310
+ - Results are logged to `orchestrate-runs.jsonl`
255
311
 
256
- **"Which agent is being used?"**
257
- > The evolve command auto-detects your installed agent CLI.
258
- > Use `--agent <name>` to override (claude, codex, opencode).
312
+ No user confirmation is needed. The safety controls (regression threshold,
313
+ auto-rollback via watch, SKILL.md backup) provide the guardrails.
@@ -7,7 +7,7 @@ LLM validates them through a 3-gate pipeline.
7
7
  ## Default Command
8
8
 
9
9
  ```bash
10
- selftune evolve-body --skill <name> --skill-path <path> --target <target> [options]
10
+ selftune evolve body --skill <name> --skill-path <path> --target <target> [options]
11
11
  ```
12
12
 
13
13
  ## Options
@@ -65,7 +65,7 @@ If the user says "use defaults" or similar, skip to step 1 with recommended defa
65
65
  Present these options:
66
66
 
67
67
  ```
68
- selftune evolve-body — Pre-Flight Configuration
68
+ selftune evolve body — Pre-Flight Configuration
69
69
 
70
70
  1. Evolution Target
71
71
  a) Routing table — optimize the workflow routing table only
@@ -116,7 +116,7 @@ The command reads SKILL.md and splits it into sections using `parseSkillSections
116
116
  ### 2. Build Eval Set
117
117
 
118
118
  If `--eval-set` is provided, use it directly. Otherwise, generate from logs
119
- (same as `selftune evals --skill <name>`).
119
+ (same as `selftune eval generate --skill <name>`).
120
120
 
121
121
  ### 3. Extract Failure Patterns
122
122
 
@@ -147,13 +147,13 @@ If `--dry-run`, prints the proposal without deploying. Otherwise:
147
147
  ## Common Patterns
148
148
 
149
149
  **"Evolve the routing table for the Research skill"**
150
- > `selftune evolve-body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing_table`
150
+ > `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing_table`
151
151
 
152
152
  **"Rewrite the entire skill body"**
153
- > `selftune evolve-body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target full_body --dry-run`
153
+ > `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target full_body --dry-run`
154
154
 
155
155
  **"Use a stronger model for generation"**
156
- > `selftune evolve-body --skill pptx --skill-path /path/SKILL.md --target full_body --teacher-model opus --student-model haiku`
156
+ > `selftune evolve body --skill pptx --skill-path /path/SKILL.md --target full_body --teacher-model opus --student-model haiku`
157
157
 
158
158
  **"Preview what would change"**
159
159
  > Always start with `--dry-run` to review the proposal before deploying.
@@ -74,15 +74,14 @@ for the full schema. Key fields:
74
74
 
75
75
  ### 1. Find the Session
76
76
 
77
- Read `~/.claude/session_telemetry_log.jsonl` and find the most recent entry
78
- where `skills_triggered` contains the target skill name.
79
-
80
- Note the `transcript_path`, `tool_calls`, `errors_encountered`, and
81
- `session_id` fields. See `references/logs.md` for the telemetry format.
77
+ Read `~/.claude/session_telemetry_log.jsonl`. Find the most recent entry
78
+ where `skills_triggered` contains the target skill name. Extract the
79
+ `transcript_path`, `tool_calls`, `errors_encountered`, and `session_id`
80
+ fields. See `references/logs.md` for the telemetry format.
82
81
 
83
82
  ### 2. Read the Transcript
84
83
 
85
- Parse the JSONL file at `transcript_path`. Identify:
84
+ Parse the JSONL file at `transcript_path`. Extract:
86
85
  - User messages (what was asked)
87
86
  - Assistant tool calls (what the agent did)
88
87
  - Tool results (what happened)
@@ -92,16 +91,14 @@ See `references/logs.md` for transcript format variants.
92
91
 
93
92
  ### 3. Determine Expectations
94
93
 
95
- If the user provided `--expectations`, parse the semicolon-separated list.
96
- Otherwise, derive defaults. See `references/grading-methodology.md` for the
97
- full default expectations list.
98
-
99
- Always include at least one Process expectation and one Quality expectation.
94
+ If `--expectations` was provided, parse the semicolon-separated list.
95
+ Otherwise, derive defaults from `references/grading-methodology.md`.
96
+ Ensure at least one Process expectation and one Quality expectation.
100
97
 
101
98
  ### 4. Grade Each Expectation
102
99
 
103
- For each expectation, search both the telemetry record and the transcript
104
- for evidence. Mark as:
100
+ Search both the telemetry record and the transcript for evidence per
101
+ expectation. Mark as:
105
102
  - **PASS** if evidence exists and supports the expectation
106
103
  - **FAIL** if evidence is absent or contradicts the expectation
107
104
 
@@ -109,22 +106,21 @@ Cite specific evidence: transcript line numbers, tool call names, bash output.
109
106
 
110
107
  ### 5. Extract Implicit Claims
111
108
 
112
- Pull 2-4 claims from the transcript that are not covered by the explicit
113
- expectations. Classify each as factual, process, or quality. Verify each
114
- against the transcript. See `references/grading-methodology.md` for claim
115
- types and examples.
109
+ Pull 2-4 claims from the transcript not covered by explicit expectations.
110
+ Classify each as factual, process, or quality. Verify each against the
111
+ transcript. See `references/grading-methodology.md` for claim types.
116
112
 
117
113
  ### 6. Flag Eval Gaps
118
114
 
119
115
  Review each passed expectation. If it would also pass for wrong output,
120
- note it in `eval_feedback.suggestions`. See `references/grading-methodology.md`
121
- for gap flagging criteria.
116
+ record it in `eval_feedback.suggestions`. See
117
+ `references/grading-methodology.md` for gap flagging criteria.
122
118
 
123
119
  ### 7. Write grading.json
124
120
 
125
121
  Write the full grading result to `grading.json` in the current directory.
126
122
 
127
- ### 8. Summarize
123
+ ### 8. Report Results
128
124
 
129
125
  Report to the user:
130
126
  - Pass rate (e.g., "2/3 passed, 67%")
@@ -136,17 +132,26 @@ Keep the summary concise. The full details are in `grading.json`.
136
132
 
137
133
  ## Common Patterns
138
134
 
139
- **"Grade my last pptx session"**
140
- > Find the most recent telemetry entry for `pptx`. Use default expectations.
141
- > Ask if the user wants custom expectations or proceed with defaults.
135
+ **User asks to grade a skill session**
136
+ > Run `selftune grade --skill <name>` with default expectations. Results are
137
+ > written to `grading.json`. Read that file and report the pass rate and any
138
+ > failures to the user.
139
+
140
+ **User provides specific expectations**
141
+ > Run `selftune grade --skill <name> --expectations "expect1;expect2;expect3"`.
142
+ > Parse results and report.
143
+
144
+ **User wants to grade from an eval set**
145
+ > Run `selftune grade --skill <name> --evals-json path/to/evals.json`.
146
+ > Optionally add `--eval-id N` for a specific scenario.
142
147
 
143
- **"Grade with these specific expectations"**
144
- > Pass `--expectations "expect1;expect2;expect3"` to override defaults.
148
+ **Agent detection override needed**
149
+ > The grader auto-detects the agent CLI. If detection fails or the user
150
+ > specifies an agent, pass `--agent <name>` to override.
145
151
 
146
- **"Grade using an eval set"**
147
- > Pass `--evals-json path/to/evals.json` and optionally `--eval-id N`
148
- > to grade a specific eval scenario.
152
+ ## Autonomous Mode
149
153
 
150
- **"Which agent is being used?"**
151
- > The grader auto-detects your installed agent CLI (claude, codex, opencode).
152
- > Use `--agent <name>` to override the auto-detected agent.
154
+ Grading runs implicitly during orchestrate as part of status computation.
155
+ The orchestrator reads grading results to determine which skills are
156
+ candidates for evolution. No explicit grade command is called — the
157
+ grading results from previous sessions feed into candidate selection.
@@ -1,5 +1,11 @@
1
1
  # selftune Import SkillsBench Workflow
2
2
 
3
+ ## When to Use
4
+
5
+ When the user wants to enrich eval sets with external benchmark tasks or import SkillsBench corpora.
6
+
7
+ ## Overview
8
+
3
9
  Import evaluation tasks from the SkillsBench corpus (87 real-world agent
4
10
  benchmarks) and convert them to selftune eval entries. This enriches
5
11
  your skill's eval set with externally validated test cases.
@@ -7,7 +13,7 @@ your skill's eval set with externally validated test cases.
7
13
  ## Default Command
8
14
 
9
15
  ```bash
10
- selftune import-skillsbench --dir <path> --skill <name> --output <path> [options]
16
+ selftune eval import --dir <path> --skill <name> --output <path> [options]
11
17
  ```
12
18
 
13
19
  ## Options
@@ -86,7 +92,7 @@ Clone or download the SkillsBench repository containing the task directory.
86
92
  ### 2. Import Tasks
87
93
 
88
94
  ```bash
89
- selftune import-skillsbench --dir /path/to/skillsbench/tasks --skill Research --output evals-bench.json
95
+ selftune eval import --dir /path/to/skillsbench/tasks --skill Research --output evals-bench.json
90
96
  ```
91
97
 
92
98
  ### 3. Review Output
@@ -102,10 +108,10 @@ corpus. Use the merged set with `selftune evolve --eval-set merged-evals.json`.
102
108
  ## Common Patterns
103
109
 
104
110
  **"Import SkillsBench tasks for Research"**
105
- > `selftune import-skillsbench --dir /path/tasks --skill Research --output bench-evals.json`
111
+ > `selftune eval import --dir /path/tasks --skill Research --output bench-evals.json`
106
112
 
107
113
  **"Use fuzzy matching for broader coverage"**
108
- > `selftune import-skillsbench --dir /path/tasks --skill pptx --output bench-evals.json --match-strategy fuzzy`
114
+ > `selftune eval import --dir /path/tasks --skill pptx --output bench-evals.json --match-strategy fuzzy`
109
115
 
110
116
  **"Enrich my eval set with external benchmarks"**
111
- > Import with `import-skillsbench`, then pass the output to `evolve --eval-set`.
117
+ > Import with `eval import`, then pass the output to `evolve --eval-set`.