selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
@@ -0,0 +1,152 @@
1
+ # selftune Evolution Memory
2
+
3
+ Human-readable session context that survives context window resets. Provides
4
+ continuity across evolve, watch, and rollback workflows by recording outcomes,
5
+ decisions, and known issues in plain markdown files.
6
+
7
+ ## When to Use
8
+
9
+ - **Reading evolution context for continuity** -- an early step in the Evolve,
10
+ Watch, and Rollback workflows reads memory before any command is run.
11
+ - **Diagnosing what happened in previous sessions** -- the decision log provides
12
+ a chronological record of every evolution action and its outcome.
13
+
14
+ ## Location
15
+
16
+ ```text
17
+ ~/.selftune/memory/
18
+ ```
19
+
20
+ All memory files live in this directory. The directory is created automatically
21
+ on the first write.
22
+
23
+ ## The Three Files
24
+
25
+ ### 1. context.md -- Active Evolutions
26
+
27
+ Tracks the current state of every skill that has been evolved, watched, or
28
+ rolled back.
29
+
30
+ **Format:** Markdown with `##` sections.
31
+
32
+ ```markdown
33
+ # Selftune Context
34
+
35
+ ## Active Evolutions
36
+ - pptx: deployed -- Added implicit triggers for slide deck queries
37
+ - csv-parser: regression -- pass_rate=0.65, baseline=0.88
38
+
39
+ ## Known Issues
40
+ - Regression detected for csv-parser: pass_rate=0.65 below baseline=0.88
41
+
42
+ ## Last Updated
43
+ 2026-03-01T14:00:00.000Z
44
+ ```
45
+
46
+ **Status values:**
47
+
48
+ | Status | Meaning |
49
+ |--------|---------|
50
+ | `deployed` | Evolution was deployed successfully |
51
+ | `failed` | Evolution attempted but did not deploy |
52
+ | `regression` | Watch detected a regression in pass rate |
53
+ | `healthy` | Watch confirmed pass rate is within threshold |
54
+ | `rolled-back` | Rollback completed successfully |
55
+ | `rollback-failed` | Rollback was attempted but failed |
56
+
57
+ ### 2. plan.md -- Current Priorities
58
+
59
+ Records evolution priorities and strategy.
60
+
61
+ **Format:** Markdown with `##` sections.
62
+
63
+ ```markdown
64
+ # Evolution Plan
65
+
66
+ ## Current Priorities
67
+ 1. Improve csv-parser implicit trigger coverage
68
+ 2. Re-evolve pptx after eval set expansion
69
+
70
+ ## Strategy
71
+ Focus on skills with highest session volume first.
72
+
73
+ ## Last Updated
74
+ 2026-03-01T14:00:00.000Z
75
+ ```
76
+
77
+ ### 3. decisions.md -- Append-Only Decision Log
78
+
79
+ Chronological record of every evolution action. Entries are never removed,
80
+ only appended.
81
+
82
+ **Entry format:**
83
+
84
+ ```markdown
85
+ ## 2026-03-01T14:00:00.000Z -- evolve
86
+ - **Skill:** pptx
87
+ - **Action:** evolved
88
+ - **Rationale:** Missed implicit triggers for slide deck queries
89
+ - **Result:** Deployed with pass_rate improvement 0.70 -> 0.92
90
+
91
+ ---
92
+ ```
93
+
94
+ Each entry contains:
95
+
96
+ | Field | Description |
97
+ |-------|-------------|
98
+ | Timestamp | ISO 8601 timestamp in the `##` heading |
99
+ | Action type | `evolve`, `rollback`, or `watch` in the heading |
100
+ | Skill | The skill name |
101
+ | Action | Past-tense result: `evolved`, `rolled-back`, or `watched` |
102
+ | Rationale | Why the action was taken |
103
+ | Result | What happened |
104
+
105
+ Entries are separated by `---` markers.
106
+
107
+ ## Auto-Update Triggers
108
+
109
+ Memory is updated automatically by the memory writer (`cli/selftune/memory/writer.ts`).
110
+ No manual editing is required during normal operation.
111
+
112
+ | Trigger | Function | Updates |
113
+ |---------|----------|---------|
114
+ | After evolve completes | `updateContextAfterEvolve` | context.md + decisions.md |
115
+ | After rollback completes | `updateContextAfterRollback` | context.md + decisions.md |
116
+ | After watch completes | `updateContextAfterWatch` | context.md + decisions.md, adds known issues on regression |
117
+
118
+ ## Reading Memory
119
+
120
+ An early step in the Evolve, Watch, and Rollback workflows (Step 1 in Evolve, after
121
+ pre-flight configuration) reads `~/.selftune/memory/context.md` before starting any operation. This provides:
122
+
123
+ - Active evolutions and their current status
124
+ - Known issues from previous runs
125
+ - Last update timestamp
126
+
127
+ If the file does not exist, the workflow proceeds normally. Memory files are
128
+ created automatically after the first evolve, watch, or rollback operation.
129
+
130
+ ## Resetting Memory
131
+
132
+ Delete the `~/.selftune/memory/` directory and the files in it to start fresh:
133
+
134
+ ```bash
135
+ rm -rf ~/.selftune/memory/
136
+ ```
137
+
138
+ They will be recreated automatically on the next evolve, watch, or rollback run.
139
+
140
+ ## Common Patterns
141
+
142
+ **"What happened in the last evolution?"**
143
+ > Read `~/.selftune/memory/decisions.md`. The most recent entry at the bottom
144
+ > of the file contains the last action, skill, rationale, and result.
145
+
146
+ **"What's the current state?"**
147
+ > Read `~/.selftune/memory/context.md`. The Active Evolutions section lists
148
+ > every tracked skill and its current status.
149
+
150
+ **"Memory seems stale"**
151
+ > Delete the files in `~/.selftune/memory/` and run `selftune evolve` or
152
+ > `selftune watch` to recreate them with fresh data.
@@ -19,8 +19,12 @@ selftune evolve --skill <name> --skill-path <path> [options]
19
19
  | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
20
20
  | `--agent <name>` | Agent CLI to use (claude, codex, opencode) | Auto-detected |
21
21
  | `--dry-run` | Propose and validate without deploying | Off |
22
- | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.7 |
22
+ | `--confidence <n>` | Minimum confidence threshold (0-1) | 0.6 |
23
23
  | `--max-iterations <n>` | Maximum retry iterations | 3 |
24
+ | `--validation-model <model>` | Model for trigger-check validation LLM calls | `haiku` |
25
+ | `--cheap-loop` | Use cheap models for loop, expensive for final gate | Off |
26
+ | `--gate-model <model>` | Model for final gate validation | `sonnet` (when `--cheap-loop`) |
27
+ | `--proposal-model <model>` | Model for proposal generation LLM calls | None (`haiku` when `--cheap-loop`) |
24
28
 
25
29
  ## Output Format
26
30
 
@@ -73,7 +77,73 @@ The evolution process writes multiple audit entries:
73
77
 
74
78
  ## Steps
75
79
 
76
- ### 1. Load or Generate Eval Set
80
+ ### 0. Pre-Flight Configuration
81
+
82
+ Before running the evolve command, present configuration options to the user.
83
+ If the user says "use defaults", "just run it", or similar, skip to step 1
84
+ with the recommended defaults marked below.
85
+
86
+ Present these options (use AskUserQuestion or inline prompt):
87
+
88
+ ```
89
+ selftune evolve — Pre-Flight Configuration
90
+
91
+ 1. Execution Mode
92
+ a) Dry run — preview proposal without deploying (recommended for first run)
93
+ b) Live — validate and deploy if improved
94
+
95
+ 2. Model Tier (see SKILL.md Model Tier Reference)
96
+ a) Fast (haiku) — cheapest, ~2s/call (recommended with cheap-loop)
97
+ b) Balanced (sonnet) — good quality, ~5s/call
98
+ c) Best (opus) — highest quality, ~10s/call
99
+
100
+ 3. Cost Optimization
101
+ a) Cheap loop — haiku for iteration, sonnet for final gate (recommended)
102
+ b) Single model — use one model throughout
103
+
104
+ 4. Confidence Threshold: [0.6] (default, higher = stricter)
105
+
106
+ 5. Max Iterations: [3] (default, more = longer but better results)
107
+
108
+ 6. Multi-Candidate Selection
109
+ a) Single candidate — one proposal per iteration (recommended)
110
+ b) Pareto mode — generate multiple candidates, pick best on frontier
111
+
112
+ → Reply with your choices (e.g., "1a, 2a, 3a, defaults for rest")
113
+ or "use defaults" for recommended settings.
114
+ ```
115
+
116
+ After the user responds, show a confirmation summary:
117
+
118
+ ```
119
+ Configuration Summary:
120
+ Mode: dry-run
121
+ Model: haiku (cheap-loop: sonnet gate)
122
+ Confidence: 0.6
123
+ Iterations: 3
124
+ Pareto: off
125
+
126
+ Proceeding...
127
+ ```
128
+
129
+ Then build the CLI command with the selected flags and continue to step 1.
130
+
131
+ ### 1. Read Evolution Context
132
+
133
+ Before running, read `~/.selftune/memory/context.md` for session context:
134
+ - Active evolutions and their current status
135
+ - Known issues from previous runs
136
+ - Last update timestamp
137
+
138
+ This provides continuity across context resets. If the file doesn't exist,
139
+ proceed normally -- it will be created after the first evolution.
140
+
141
+ The evolution-guard hook (`hooks/evolution-guard.ts`) blocks direct SKILL.md
142
+ edits on monitored skills during active evolution. This prevents conflicting
143
+ changes while the evolve process is running. The guard is automatically
144
+ engaged when evolve starts and released when it completes.
145
+
146
+ ### 2. Load or Generate Eval Set
77
147
 
78
148
  If `--eval-set` is provided, use it directly. Otherwise, the command
79
149
  generates one from logs (equivalent to running `evals --skill <name>`).
@@ -81,7 +151,7 @@ generates one from logs (equivalent to running `evals --skill <name>`).
81
151
  An eval set is required for validation. Without enough telemetry data,
82
152
  evolution cannot reliably measure improvement.
83
153
 
84
- ### 2. Extract Failure Patterns
154
+ ### 3. Extract Failure Patterns
85
155
 
86
156
  The command groups missed queries by invocation type:
87
157
  - Missed explicit: description is broken (rare, high priority)
@@ -90,7 +160,7 @@ The command groups missed queries by invocation type:
90
160
 
91
161
  See `references/invocation-taxonomy.md` for the taxonomy.
92
162
 
93
- ### 3. Propose Description Changes
163
+ ### 4. Propose Description Changes
94
164
 
95
165
  An LLM generates a candidate description that would catch the missed
96
166
  queries. The candidate:
@@ -98,7 +168,7 @@ queries. The candidate:
98
168
  - Adds new phrases covering missed patterns
99
169
  - Maintains the description's structure and tone
100
170
 
101
- ### 4. Validate Against Eval Set
171
+ ### 5. Validate Against Eval Set
102
172
 
103
173
  The candidate is tested against the full eval set:
104
174
  - Must improve overall pass rate
@@ -108,7 +178,7 @@ The candidate is tested against the full eval set:
108
178
  If validation fails, the command retries up to `--max-iterations` times
109
179
  with adjusted proposals.
110
180
 
111
- ### 5. Deploy (or Preview)
181
+ ### 6. Deploy (or Preview)
112
182
 
113
183
  If `--dry-run`, the proposal is printed but not deployed. The audit log
114
184
  still records `created` and `validated` entries for review.
@@ -118,6 +188,15 @@ If deploying:
118
188
  2. The updated description is written to SKILL.md
119
189
  3. A `deployed` entry is logged to the evolution audit
120
190
 
191
+ ### 7. Update Memory
192
+
193
+ After evolution completes (deploy or dry-run), the memory writer updates:
194
+ - `~/.selftune/memory/context.md` -- records the evolution outcome and current state
195
+ - `~/.selftune/memory/decisions.md` -- logs the decision rationale and proposal details
196
+
197
+ This ensures the next evolve, watch, or rollback workflow has full context
198
+ even after a context window reset.
199
+
121
200
  ### Stopping Criteria
122
201
 
123
202
  The evolution loop stops when any of these conditions is met (priority order):
@@ -130,6 +209,32 @@ The evolution loop stops when any of these conditions is met (priority order):
130
209
  | 4 | **Plateau** | Pass rate unchanged across 3 consecutive iterations |
131
210
  | 5 | **Continue** | None of the above -- keep iterating |
132
211
 
212
+ ## Cheap Loop Mode
213
+
214
+ The `--cheap-loop` flag runs the entire evolution loop with cheap models (haiku)
215
+ and only uses an expensive model (sonnet) for a final gate validation before
216
+ deploying. This reduces cost while maintaining deployment quality.
217
+
218
+ When `--cheap-loop` is set:
219
+ - `--proposal-model` defaults to `haiku`
220
+ - `--validation-model` defaults to `haiku`
221
+ - `--gate-model` defaults to `sonnet`
222
+
223
+ The gate validation is a new step between validation and deploy. It re-runs
224
+ `validateProposal` using the gate model. If the gate fails, the proposal is
225
+ not deployed.
226
+
227
+ ```bash
228
+ # Cheap loop with default models
229
+ selftune evolve --skill X --skill-path Y --cheap-loop
230
+
231
+ # Cheap loop with opus gate
232
+ selftune evolve --skill X --skill-path Y --cheap-loop --gate-model opus
233
+
234
+ # Manual model control without cheap-loop
235
+ selftune evolve --skill X --skill-path Y --proposal-model haiku --validation-model sonnet
236
+ ```
237
+
133
238
  ## Common Patterns
134
239
 
135
240
  **"Evolve the pptx skill"**
@@ -0,0 +1,159 @@
1
+ # selftune Evolve Body Workflow
2
+
3
+ Evolve a skill's full body content or routing table, not just the description.
4
+ Uses a teacher-student model: a stronger LLM generates proposals, a cheaper
5
+ LLM validates them through a 3-gate pipeline.
6
+
7
+ ## Default Command
8
+
9
+ ```bash
10
+ selftune evolve-body --skill <name> --skill-path <path> --target <target> [options]
11
+ ```
12
+
13
+ ## Options
14
+
15
+ | Flag | Description | Default |
16
+ |------|-------------|---------|
17
+ | `--skill <name>` | Skill name | Required |
18
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
19
+ | `--target <type>` | Evolution target: `routing_table` or `full_body` | Required |
20
+ | `--teacher-agent <name>` | Agent CLI for proposal generation | Auto-detected |
21
+ | `--student-agent <name>` | Agent CLI for validation | Same as teacher |
22
+ | `--teacher-model <flag>` | Model flag for teacher (e.g. `opus`) | Agent default |
23
+ | `--student-model <flag>` | Model flag for student (e.g. `haiku`) | Agent default |
24
+ | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
25
+ | `--dry-run` | Propose and validate without deploying | Off |
26
+ | `--max-iterations <n>` | Maximum refinement iterations | 3 |
27
+ | `--task-description <text>` | Context for the evolution goal | None |
28
+ | `--validation-model <model>` | Model for trigger-check validation calls (overrides `--student-model` for validation) | None |
29
+ | `--few-shot <paths>` | Comma-separated paths to example SKILL.md files | None |
30
+
31
+ ## Evolution Targets
32
+
33
+ ### `routing_table`
34
+
35
+ Optimizes the `## Workflow Routing` markdown table in SKILL.md. The teacher
36
+ LLM analyzes missed triggers and proposes new routing entries that map
37
+ trigger keywords to the correct workflow files.
38
+
39
+ ### `full_body`
40
+
41
+ Rewrites the entire SKILL.md body below the frontmatter. This includes
42
+ the description, routing table, examples, and all other sections. The
43
+ teacher generates a complete replacement, validated through 3 gates.
44
+
45
+ ## 3-Gate Validation Pipeline
46
+
47
+ Every proposal passes through three sequential gates:
48
+
49
+ | Gate | Type | What it checks | Cost |
50
+ |------|------|---------------|------|
51
+ | **Gate 1: Structural** | Pure code | YAML frontmatter present, `# Title` exists, `## Workflow Routing` preserved if original had one | Free |
52
+ | **Gate 2: Trigger Accuracy** | Student LLM | YES/NO trigger check per eval entry on the extracted description | Cheap |
53
+ | **Gate 3: Quality** | Student LLM | Body clarity and completeness score (0.0-1.0) | Cheap |
54
+
55
+ If any gate fails, the teacher receives structured feedback and generates
56
+ a refined proposal. This repeats up to `--max-iterations` times.
57
+
58
+ ## Steps
59
+
60
+ ### 0. Pre-Flight Configuration
61
+
62
+ Before running evolve-body, present configuration options to the user.
63
+ If the user says "use defaults" or similar, skip to step 1 with recommended defaults.
64
+
65
+ Present these options:
66
+
67
+ ```
68
+ selftune evolve-body — Pre-Flight Configuration
69
+
70
+ 1. Evolution Target
71
+ a) Routing table — optimize the workflow routing table only
72
+ b) Full body — rewrite entire SKILL.md body (more aggressive)
73
+
74
+ 2. Execution Mode
75
+ a) Dry run — preview proposal without deploying (recommended)
76
+ b) Live — validate and deploy if improved
77
+
78
+ 3. Teacher Model (generates proposals)
79
+ a) Balanced (sonnet) — good quality proposals (recommended)
80
+ b) Best (opus) — highest quality, slower and more expensive
81
+
82
+ 4. Student Model (validates proposals)
83
+ a) Fast (haiku) — cheap validation (recommended)
84
+ b) Balanced (sonnet) — higher quality validation
85
+
86
+ 5. Max Iterations: [3] (default)
87
+
88
+ 6. Few-Shot Examples: [none] (paths to example SKILL.md files for guidance)
89
+
90
+ → Reply with your choices or "use defaults" for recommended settings.
91
+ ```
92
+
93
+ After the user responds, show a confirmation summary:
94
+
95
+ ```
96
+ Configuration Summary:
97
+ Target: routing_table
98
+ Mode: dry-run
99
+ Teacher model: sonnet
100
+ Student model: haiku
101
+ Iterations: 3
102
+ Few-shot: none
103
+
104
+ Proceeding...
105
+ ```
106
+
107
+ ### 1. Parse Current Skill
108
+
109
+ The command reads SKILL.md and splits it into sections using `parseSkillSections()`:
110
+ - Frontmatter (YAML between `---` markers)
111
+ - Title (first `# Heading`)
112
+ - Description (text between title and first `## Section`)
113
+ - Workflow Routing (the `## Workflow Routing` section, if present)
114
+ - Remaining Body (everything else)
115
+
116
+ ### 2. Build Eval Set
117
+
118
+ If `--eval-set` is provided, use it directly. Otherwise, generate from logs
119
+ (same as `selftune evals --skill <name>`).
120
+
121
+ ### 3. Extract Failure Patterns
122
+
123
+ Groups missed queries by invocation type, same as the description evolution
124
+ pipeline. See `references/invocation-taxonomy.md`.
125
+
126
+ ### 4. Generate Proposal (Teacher)
127
+
128
+ The teacher LLM generates a proposal based on the target:
129
+ - **routing_table**: Optimized `## Workflow Routing` markdown table
130
+ - **full_body**: Complete SKILL.md body replacement
131
+
132
+ Few-shot examples from `--few-shot` paths provide structural guidance.
133
+
134
+ ### 5. Validate (3 Gates)
135
+
136
+ Each gate runs in sequence. If a gate fails, the teacher receives the
137
+ failure details and generates a refined proposal.
138
+
139
+ ### 6. Deploy or Preview
140
+
141
+ If `--dry-run`, prints the proposal without deploying. Otherwise:
142
+ 1. Creates a timestamped backup of the current SKILL.md
143
+ 2. Applies the change: `replaceSection()` for `routing_table`, `replaceBody()` for `full_body`
144
+ 3. Records audit entries
145
+ 4. Updates evolution memory
146
+
147
+ ## Common Patterns
148
+
149
+ **"Evolve the routing table for the Research skill"**
150
+ > `selftune evolve-body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing_table`
151
+
152
+ **"Rewrite the entire skill body"**
153
+ > `selftune evolve-body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target full_body --dry-run`
154
+
155
+ **"Use a stronger model for generation"**
156
+ > `selftune evolve-body --skill pptx --skill-path /path/SKILL.md --target full_body --teacher-model opus --student-model haiku`
157
+
158
+ **"Preview what would change"**
159
+ > Always start with `--dry-run` to review the proposal before deploying.
@@ -0,0 +1,111 @@
1
+ # selftune Import SkillsBench Workflow
2
+
3
+ Import evaluation tasks from the SkillsBench corpus (87 real-world agent
4
+ benchmarks) and convert them to selftune eval entries. This enriches
5
+ your skill's eval set with externally validated test cases.
6
+
7
+ ## Default Command
8
+
9
+ ```bash
10
+ selftune import-skillsbench --dir <path> --skill <name> --output <path> [options]
11
+ ```
12
+
13
+ ## Options
14
+
15
+ | Flag | Description | Default |
16
+ |------|-------------|---------|
17
+ | `--dir <path>` | Path to SkillsBench tasks directory | Required |
18
+ | `--skill <name>` | Target skill to match tasks against | Required |
19
+ | `--output <path>` | Output eval set JSON file | Required |
20
+ | `--match-strategy <type>` | Matching strategy: `exact` or `fuzzy` | `exact` |
21
+
22
+ ## Match Strategies
23
+
24
+ ### `exact`
25
+
26
+ Matches tasks where `expected_skill` in `task.toml` exactly matches the
27
+ target skill name. Precise but may miss relevant tasks.
28
+
29
+ ### `fuzzy`
30
+
31
+ Uses keyword overlap between the task's category/tags and the skill name.
32
+ Casts a wider net but may include marginally relevant tasks. Review the
33
+ output and remove false matches.
34
+
35
+ ## SkillsBench Directory Structure
36
+
37
+ The importer expects this layout:
38
+
39
+ ```
40
+ tasks/
41
+ ├── task-001/
42
+ │ ├── instruction.md # Task description (used as query)
43
+ │ └── task.toml # Metadata (difficulty, category, tags, expected_skill)
44
+ ├── task-002/
45
+ │ ├── instruction.md
46
+ │ └── task.toml
47
+ └── ...
48
+ ```
49
+
50
+ ### `task.toml` Format
51
+
52
+ ```toml
53
+ difficulty = "medium"
54
+ category = "research"
55
+ tags = ["web-search", "analysis", "summarization"]
56
+ expected_skill = "Research"
57
+ expected_tools = ["WebSearch", "Read"]
58
+ ```
59
+
60
+ All fields are optional. Tasks without `task.toml` use default values.
61
+
62
+ ## Output Format
63
+
64
+ Standard selftune eval entries:
65
+
66
+ ```json
67
+ [
68
+ {
69
+ "id": 1,
70
+ "query": "Find and summarize the latest papers on transformer architectures",
71
+ "expected": true,
72
+ "invocation_type": "implicit",
73
+ "skill_name": "Research",
74
+ "source_session": null,
75
+ "source": "skillsbench"
76
+ }
77
+ ]
78
+ ```
79
+
80
+ ## Steps
81
+
82
+ ### 1. Obtain SkillsBench Corpus
83
+
84
+ Clone or download the SkillsBench repository containing the task directory.
85
+
86
+ ### 2. Import Tasks
87
+
88
+ ```bash
89
+ selftune import-skillsbench --dir /path/to/skillsbench/tasks --skill Research --output evals-bench.json
90
+ ```
91
+
92
+ ### 3. Review Output
93
+
94
+ Inspect the generated eval entries. Remove any that don't match your skill's
95
+ intended scope. Adjust match strategy if needed.
96
+
97
+ ### 4. Merge with Existing Evals
98
+
99
+ Combine imported entries with your existing eval set for a richer validation
100
+ corpus. Use the merged set with `selftune evolve --skill <name> --skill-path <path> --eval-set merged-evals.json`.
101
+
102
+ ## Common Patterns
103
+
104
+ **"Import SkillsBench tasks for Research"**
105
+ > `selftune import-skillsbench --dir /path/tasks --skill Research --output bench-evals.json`
106
+
107
+ **"Use fuzzy matching for broader coverage"**
108
+ > `selftune import-skillsbench --dir /path/tasks --skill pptx --output bench-evals.json --match-strategy fuzzy`
109
+
110
+ **"Enrich my eval set with external benchmarks"**
111
+ > Import with `import-skillsbench`, then pass the output to `evolve --eval-set`.