selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "selftune",
3
- "version": "0.1.4",
4
- "description": "Skill observability and continuous improvement CLI for agent platforms",
3
+ "version": "0.2.0",
4
+ "description": "Self-improving skills CLI for AI agents",
5
5
  "type": "module",
6
6
  "license": "MIT",
7
7
  "author": "Daniel Petro",
@@ -20,7 +20,7 @@
20
20
  "keywords": [
21
21
  "selftune",
22
22
  "skill",
23
- "observability",
23
+ "self-improving",
24
24
  "claude-code",
25
25
  "codex",
26
26
  "opencode",
@@ -37,9 +37,12 @@
37
37
  "selftune": "bin/selftune.cjs"
38
38
  },
39
39
  "files": [
40
+ "assets/",
40
41
  "bin/",
41
42
  "cli/selftune/",
42
43
  "dashboard/",
44
+ "templates/",
45
+ ".claude/agents/",
43
46
  "skill/",
44
47
  "README.md",
45
48
  "CHANGELOG.md"
@@ -49,7 +52,8 @@
49
52
  "lint:fix": "bunx biome check --write .",
50
53
  "lint:arch": "bun run lint-architecture.ts",
51
54
  "test": "bun test",
52
- "check": "bun run lint && bun run lint:arch && bun test"
55
+ "check": "bun run lint && bun run lint:arch && bun test",
56
+ "start": "bun run cli/selftune/index.ts --help"
53
57
  },
54
58
  "devDependencies": {
55
59
  "@biomejs/biome": "^2.4.4",
package/skill/SKILL.md CHANGED
@@ -1,11 +1,17 @@
1
1
  ---
2
2
  name: selftune
3
3
  description: >
4
- Skill observability and continuous improvement. Use when the user wants to:
4
+ Self-improving skills toolkit. Use when the user wants to:
5
5
  grade a session, generate evals, check undertriggering, evolve a skill
6
- description, rollback an evolution, monitor post-deploy performance, check
7
- skill health status, view last session insight, open the dashboard, run
8
- health checks, or ingest sessions from Codex/OpenCode.
6
+ description or full body, evolve routing tables, rollback an evolution,
7
+ monitor post-deploy performance, check skill health status, view last
8
+ session insight, open the dashboard, serve the live dashboard, run health
9
+ checks, manage activation rules, ingest sessions from Codex/OpenCode/OpenClaw,
10
+ replay Claude Code transcripts, contribute anonymized data to the community,
11
+ set up autonomous cron jobs, manage evolution memory, configure auto-activation
12
+ suggestions, diagnose underperforming skills, analyze cross-skill patterns,
13
+ review evolution proposals, measure baseline lift, run skill unit tests,
14
+ analyze skill composability, or import SkillsBench evaluation corpora.
9
15
  ---
10
16
 
11
17
  # selftune
@@ -40,10 +46,22 @@ selftune watch --skill <name> --skill-path <path> [--auto-rollback]
40
46
  selftune status
41
47
  selftune last
42
48
  selftune doctor
43
- selftune dashboard [--export] [--out FILE]
49
+ selftune dashboard [--export] [--out FILE] [--serve]
44
50
  selftune ingest-codex
45
51
  selftune ingest-opencode
52
+ selftune ingest-openclaw [--agents-dir PATH] [--since DATE] [--dry-run] [--force] [--verbose]
46
53
  selftune wrap-codex -- <codex args>
54
+ selftune replay [--since DATE] [--dry-run] [--force] [--verbose]
55
+ selftune contribute [--skill NAME] [--preview] [--sanitize LEVEL] [--submit]
56
+ selftune cron setup [--dry-run] [--tz <timezone>]
57
+ selftune cron list
58
+ selftune cron remove [--dry-run]
59
+ selftune dashboard --serve [--port <port>]
60
+ selftune evolve-body --skill <name> --skill-path <path> --target <routing_table|full_body> [--dry-run]
61
+ selftune baseline --skill <name> --skill-path <path> [--eval-set <path>] [--agent <name>]
62
+ selftune unit-test --skill <name> --tests <path> [--run-agent] [--generate]
63
+ selftune composability --skill <name> [--window N] [--telemetry-log <path>]
64
+ selftune import-skillsbench --dir <path> --skill <name> --output <path> [--match-strategy exact|fuzzy]
47
65
  ```
48
66
 
49
67
  ## Workflow Routing
@@ -56,11 +74,59 @@ selftune wrap-codex -- <codex args>
56
74
  | rollback, undo, restore, revert evolution | Rollback | Workflows/Rollback.md |
57
75
  | watch, monitor, regression, post-deploy, performing | Watch | Workflows/Watch.md |
58
76
  | doctor, health, hooks, broken, diagnose | Doctor | Workflows/Doctor.md |
59
- | ingest, import, codex logs, opencode, wrap codex | Ingest | Workflows/Ingest.md |
77
+ | ingest, import, codex logs, opencode, openclaw, wrap codex | Ingest | Workflows/Ingest.md |
78
+ | replay, backfill, claude transcripts, historical sessions | Replay | Workflows/Replay.md |
79
+ | contribute, share, community, export data, anonymized | Contribute | Workflows/Contribute.md |
60
80
  | init, setup, bootstrap, first time | Initialize | Workflows/Initialize.md |
81
+ | cron, schedule, autonomous, automate evolution | Cron | Workflows/Cron.md |
82
+ | auto-activate, suggestions, activation rules, nag, why suggest | AutoActivation | Workflows/AutoActivation.md |
83
+ | dashboard, visual, open dashboard, skill grid, serve dashboard, live dashboard | Dashboard | Workflows/Dashboard.md |
84
+ | evolution memory, context memory, session continuity, what happened last | EvolutionMemory | Workflows/EvolutionMemory.md |
85
+ | evolve body, evolve routing, full body evolution, rewrite skill, teacher student | EvolveBody | Workflows/EvolveBody.md |
86
+ | baseline, baseline lift, adds value, skill value, no-skill comparison | Baseline | Workflows/Baseline.md |
87
+ | unit test, skill test, test skill, generate tests, run tests, assertions | UnitTest | Workflows/UnitTest.md |
88
+ | composability, co-occurrence, skill conflicts, skills together, conflict score | Composability | Workflows/Composability.md |
89
+ | import skillsbench, skillsbench, external evals, benchmark tasks, import corpus | ImportSkillsBench | Workflows/ImportSkillsBench.md |
61
90
  | status, health summary, skill health, pass rates, how are skills | Status | *(direct command — no workflow file)* |
62
91
  | last, last session, recent session, what happened | Last | *(direct command — no workflow file)* |
63
- | dashboard, visual, open dashboard, skill grid | Dashboard | *(direct command — no workflow file)* |
92
+
93
+ ## Interactive Configuration
94
+
95
+ Before running mutating workflows (evolve, evolve-body, evals, baseline), present
96
+ a pre-flight configuration prompt to the user. This gives them control over
97
+ execution mode, model selection, and key parameters.
98
+
99
+ ### Pre-Flight Pattern
100
+
101
+ Each mutating workflow has a **Pre-Flight Configuration** step. Follow this pattern:
102
+
103
+ 1. Present a summary of what the command will do
104
+ 2. Show numbered options with `(recommended)` markers for suggested defaults
105
+ 3. Ask the user to pick options or say "use defaults" / "go with defaults"
106
+ 4. Show a confirmation summary of selected options before executing
107
+
108
+ ### Model Tier Reference
109
+
110
+ When presenting model choices, use this table:
111
+
112
+ | Tier | Model | Speed | Cost | Quality | Best for |
113
+ |------|-------|-------|------|---------|----------|
114
+ | Fast | `haiku` | ~2s/call | $ | Good | Iteration loops, bulk validation |
115
+ | Balanced | `sonnet` | ~5s/call | $$ | Great | Single-pass proposals, gate checks |
116
+ | Best | `opus` | ~10s/call | $$$ | Excellent | High-stakes final validation |
117
+
118
+ ### Quick Path
119
+
120
+ If the user says "use defaults", "just do it", or similar — skip the pre-flight
121
+ and run with recommended defaults. The pre-flight is for users who want control,
122
+ not a mandatory gate.
123
+
124
+ ### Workflows That Skip Pre-Flight
125
+
126
+ These read-only or simple workflows run immediately without prompting:
127
+ `status`, `last`, `doctor`, `dashboard`, `watch`, `rollback`, `grade`,
128
+ `ingest-*`, `replay`, `contribute`, `cron`, `composability`, `unit-test`,
129
+ `import-skillsbench`.
64
130
 
65
131
  ## The Feedback Loop
66
132
 
@@ -94,7 +160,30 @@ Observe --> Detect --> Diagnose --> Propose --> Validate --> Deploy --> Watch
94
160
  | `Workflows/Rollback.md` | Undo an evolution, restore previous description |
95
161
  | `Workflows/Watch.md` | Post-deploy regression monitoring |
96
162
  | `Workflows/Doctor.md` | Health checks on logs, hooks, schema |
97
- | `Workflows/Ingest.md` | Import sessions from Codex and OpenCode |
163
+ | `Workflows/Ingest.md` | Import sessions from Codex, OpenCode, and OpenClaw |
164
+ | `Workflows/Replay.md` | Backfill logs from Claude Code transcripts |
165
+ | `Workflows/Contribute.md` | Export anonymized data for community contribution |
166
+ | `Workflows/Cron.md` | Manage OpenClaw cron jobs for autonomous evolution |
167
+ | `Workflows/AutoActivation.md` | Auto-activation hook behavior and rules |
168
+ | `Workflows/Dashboard.md` | Dashboard modes: static, export, live server |
169
+ | `Workflows/EvolutionMemory.md` | Evolution memory system for session continuity |
170
+ | `Workflows/EvolveBody.md` | Full body and routing table evolution |
171
+ | `Workflows/Baseline.md` | No-skill baseline comparison and lift measurement |
172
+ | `Workflows/UnitTest.md` | Skill-level unit test runner and generator |
173
+ | `Workflows/Composability.md` | Multi-skill co-occurrence conflict analysis |
174
+ | `Workflows/ImportSkillsBench.md` | SkillsBench task corpus importer |
175
+
176
+ ## Specialized Agents
177
+
178
+ selftune provides focused agents for deeper analysis. These live in
179
+ `.claude/agents/` and can be spawned as subagents for specialized tasks.
180
+
181
+ | Trigger keywords | Agent | Purpose |
182
+ |------------------|-------|---------|
183
+ | diagnose, root cause, why failing, skill failure, debug performance | diagnosis-analyst | Deep-dive analysis of underperforming skills |
184
+ | patterns, conflicts, cross-skill, overlap, trigger conflicts, optimize skills | pattern-analyst | Cross-skill pattern analysis and conflict detection |
185
+ | review evolution, check proposal, safe to deploy, approve evolution | evolution-reviewer | Safety gate review of pending evolution proposals |
186
+ | set up selftune, integrate, configure project, install selftune | integration-guide | Guided interactive setup for specific project types |
98
187
 
99
188
  ## Examples
100
189
 
@@ -110,7 +199,34 @@ Observe --> Detect --> Diagnose --> Propose --> Validate --> Deploy --> Watch
110
199
  - "How are my skills performing?"
111
200
  - "What happened in my last session?"
112
201
  - "Open the selftune dashboard"
202
+ - "Serve the dashboard at http://localhost:3141"
113
203
  - "Show skill health status"
204
+ - "Replay my Claude Code transcripts"
205
+ - "Backfill logs from historical sessions"
206
+ - "Contribute my selftune data to the community"
207
+ - "Share anonymized skill data"
208
+ - "Set up cron jobs for autonomous evolution"
209
+ - "Schedule selftune to run automatically"
210
+ - "Ingest my OpenClaw sessions"
211
+ - "Why is selftune suggesting things?"
212
+ - "Customize activation rules"
213
+ - "Start the live dashboard"
214
+ - "Serve the dashboard on port 8080"
215
+ - "What happened in the last evolution?"
216
+ - "Read the evolution memory"
217
+ - "Why is this skill underperforming?"
218
+ - "Are there conflicts between my skills?"
219
+ - "Review this evolution before deploying"
220
+ - "Set up selftune for my project"
221
+ - "Evolve the full body of the Research skill"
222
+ - "Rewrite the routing table for pptx"
223
+ - "Does this skill add value over no-skill baseline?"
224
+ - "Measure baseline lift for the Research skill"
225
+ - "Generate unit tests for the pptx skill"
226
+ - "Run skill unit tests"
227
+ - "Which skills conflict with each other?"
228
+ - "Analyze composability for the Research skill"
229
+ - "Import SkillsBench tasks for my skill"
114
230
 
115
231
  ## Negative Examples
116
232
 
@@ -0,0 +1,144 @@
1
+ # selftune Auto-Activation Workflow
2
+
3
+ Automatically suggests selftune commands during a session based on
4
+ activation rules. Runs as a `UserPromptSubmit` hook, evaluates rules
5
+ against session context, and outputs advisory suggestions to stderr.
6
+
7
+ ## How It Works
8
+
9
+ The `hooks/auto-activate.ts` script runs on every `UserPromptSubmit` event.
10
+ It reads session telemetry, query logs, and evolution audit data, then
11
+ evaluates a set of activation rules against the current context. When a
12
+ rule fires, the suggestion is written to stderr (shown to Claude as a
13
+ system message). The hook always exits 0 -- suggestions are advisory and
14
+ never block the user.
15
+
16
+ Flow:
17
+
18
+ 1. Claude Code triggers `UserPromptSubmit` hook
19
+ 2. Hook receives `{ session_id }` payload on stdin
20
+ 3. Checks PAI coexistence (see below)
21
+ 4. Loads default activation rules
22
+ 5. Evaluates each rule against session context
23
+ 6. Outputs suggestions to stderr (if any)
24
+ 7. Exits 0
25
+
26
+ ## PAI Coexistence
27
+
28
+ If PAI's `skill-activation-prompt` hook is detected in
29
+ `~/.claude/settings.json`, selftune skips all suggestions. PAI handles
30
+ skill-level activation; selftune handles observability. This prevents
31
+ duplicate or conflicting suggestions.
32
+
33
+ Detection scans all hook entries in settings for any command containing
34
+ `skill-activation-prompt`. If found, the hook exits silently.
35
+
36
+ ## Default Rules
37
+
38
+ | Rule ID | Description | Trigger Condition | Suggestion |
39
+ |---------|-------------|-------------------|------------|
40
+ | `post-session-diagnostic` | Suggest diagnostic review | >2 unmatched queries in current session | `selftune last` |
41
+ | `grading-threshold-breach` | Suggest evolution | Session pass rate < 0.6 (60%) | `selftune evolve` |
42
+ | `stale-evolution` | Suggest evolution | >7 days since last evolution AND pending false negatives exist | `selftune evolve` |
43
+ | `regression-detected` | Suggest rollback | Watch snapshot shows `regression_detected: true` | `selftune rollback` |
44
+
45
+ ### Rule Details
46
+
47
+ **post-session-diagnostic**: Compares query count against skill usage count
48
+ for the current session. If the difference exceeds 2, unmatched queries
49
+ likely indicate gaps in skill coverage.
50
+
51
+ **grading-threshold-breach**: Reads grading result files from
52
+ `~/.selftune/grading/result-*.json`. If the current session's pass rate
53
+ is below 0.6, the skill description may need evolution.
54
+
55
+ **stale-evolution**: Reads the evolution audit log to find the last
56
+ evolution timestamp. If older than 7 days, checks
57
+ `~/.selftune/false-negatives/pending.json` for pending false negatives.
58
+ Both conditions must be true.
59
+
60
+ **regression-detected**: Reads the latest monitoring snapshot from
61
+ `~/.selftune/monitoring/latest-snapshot.json`. If `regression_detected`
62
+ is true, suggests rollback with the skill name if available.
63
+
64
+ ## Session State Tracking
65
+
66
+ Each rule fires at most once per session. After a suggestion is shown,
67
+ the rule ID is recorded in session state to prevent repeated nags.
68
+
69
+ Session state is stored at `~/.selftune/session-state-<session_id>.json`:
70
+
71
+ ```json
72
+ {
73
+ "session_id": "abc-123",
74
+ "suggestions_shown": ["post-session-diagnostic", "grading-threshold-breach"],
75
+ "updated_at": "2026-03-02T10:00:00Z"
76
+ }
77
+ ```
78
+
79
+ State is keyed by `session_id`. If the session ID changes (new session),
80
+ state resets automatically.
81
+
82
+ ## Customizing Rules
83
+
84
+ Rules are defined in `cli/selftune/activation-rules.ts` as the
85
+ `DEFAULT_RULES` array. To customize rule behavior, edit that TypeScript
86
+ file directly. There is no runtime JSON config — the hook imports
87
+ `DEFAULT_RULES` at evaluation time.
88
+
89
+ Each rule conforms to the `ActivationRule` interface:
90
+
91
+ ```typescript
92
+ interface ActivationRule {
93
+ id: string;
94
+ description: string;
95
+ evaluate(ctx: ActivationContext): string | null;
96
+ }
97
+ ```
98
+
99
+ The `ActivationContext` provides paths to all log files and the selftune
100
+ config directory. `evaluate` returns a suggestion string when the rule fires, or
101
+ `null` to skip.
102
+
103
+ ## Disabling Auto-Activation
104
+
105
+ Remove the `auto-activate.ts` hook entry from `~/.claude/settings.json`.
106
+ The hook is registered under `UserPromptSubmit`:
107
+
108
+ ```json
109
+ {
110
+ "hooks": {
111
+ "UserPromptSubmit": [
112
+ {
113
+ "command": "bun run /path/to/cli/selftune/hooks/auto-activate.ts"
114
+ }
115
+ ]
116
+ }
117
+ }
118
+ ```
119
+
120
+ Delete the entry to disable all auto-activation suggestions. (JSON has no comment syntax, so the entry cannot simply be commented out.)
121
+
122
+ ## Common Patterns
123
+
124
+ **"Stop suggesting commands"**
125
+ > Remove the auto-activate hook from settings (see Disabling above).
126
+ > Or wait -- each rule only fires once per session.
127
+
128
+ **"Why am I seeing selftune suggestions?"**
129
+ > The auto-activate hook detected an actionable condition. Check which
130
+ > rule fired (the suggestion includes the command) and follow the advice.
131
+
132
+ **"Suggestions aren't appearing"**
133
+ > Run `selftune doctor` to verify the hook is installed. Check that
134
+ > `UserPromptSubmit` includes the auto-activate hook in settings.
135
+
136
+ **"PAI is installed but I still see suggestions"**
137
+ > Verify PAI's `skill-activation-prompt` hook is in settings. The
138
+ > coexistence check scans for that specific command string.
139
+
140
+ **"I want custom activation logic"**
141
+ > Create rules conforming to the `ActivationRule` interface. Rules must
142
+ > be pure filesystem readers -- no network, no heavy imports. Add them
143
+ > to the rules array in `activation-rules.ts` or reference a custom
144
+ > rules file.
@@ -0,0 +1,118 @@
1
+ # Badge Command
2
+
3
+ Generate skill health badges for embedding in READMEs and documentation.
4
+
5
+ ## Usage
6
+
7
+ ```bash
8
+ selftune badge --skill <name> [--format svg|markdown|url] [--output <path>]
9
+ ```
10
+
11
+ ## Options
12
+
13
+ | Option | Required | Default | Description |
14
+ |--------|----------|---------|-------------|
15
+ | `--skill` | Yes | -- | Skill name to generate badge for |
16
+ | `--format` | No | `svg` | Output format: `svg`, `markdown`, or `url` |
17
+ | `--output` | No | stdout | Write output to file |
18
+ | `--help` | No | -- | Show usage information |
19
+
20
+ ## Examples
21
+
22
+ ### Generate SVG badge
23
+ ```bash
24
+ selftune badge --skill my-skill --format svg > badge.svg
25
+ ```
26
+
27
+ ### Get markdown for README
28
+ ```bash
29
+ selftune badge --skill my-skill --format markdown
30
+ ```
31
+ Output: `![Skill Health: my-skill](https://img.shields.io/badge/Skill%20Health-87%25%20%E2%86%91-4c1)`
32
+
33
+ ### Get shields.io URL
34
+ ```bash
35
+ selftune badge --skill my-skill --format url
36
+ ```
37
+
38
+ ### Write badge to file
39
+ ```bash
40
+ selftune badge --skill my-skill --output badge.svg
41
+ ```
42
+
43
+ ## Badge Branding
44
+
45
+ SVG badges (both `--format svg` and dashboard routes) include the selftune logo as an inline 14px icon in the label section. The logo is embedded as a base64 data URI — no external requests needed.
46
+
47
+ ```
48
+ [ 🔵 Skill Health (gray) ] [ 85% ↑ (green) ]
49
+ ^14px logo + 3px gap
50
+ ```
51
+
52
+ Markdown and URL formats use shields.io, which renders its own badge — the logo only appears in locally-generated SVGs.
53
+
54
+ ## Badge Colors
55
+
56
+ | Pass Rate | Color | Hex |
57
+ |-----------|-------|-----|
58
+ | > 80% | Green | `#4c1` |
59
+ | 60-80% | Yellow | `#dfb317` |
60
+ | < 60% | Red | `#e05d44` |
61
+ | No data | Gray | `#9f9f9f` |
62
+
63
+ ## Embedding in README
64
+
65
+ Add to your skill's README.md:
66
+ ```markdown
67
+ ![Skill Health: my-skill](https://img.shields.io/badge/Skill%20Health-87%25%20%E2%86%91-4c1)
68
+ ```
69
+
70
+ Or use the generated SVG directly for offline rendering.
71
+
72
+ ## Dashboard Routes (Phase 2)
73
+
74
+ The local dashboard server exposes badge and report routes:
75
+
76
+ ### GET /badge/:skillName
77
+
78
+ Returns a live SVG badge computed from local telemetry logs.
79
+
80
+ ```
81
+ http://localhost:<port>/badge/my-skill
82
+ ```
83
+
84
+ - Returns `image/svg+xml` with `Cache-Control: no-cache, no-store`
85
+ - Returns a gray "not found" badge (not JSON 404) for unknown skills
86
+
87
+ ### GET /report/:skillName
88
+
89
+ Returns an HTML report page with pass rate, trend, session count, and embed code.
90
+
91
+ ```
92
+ http://localhost:<port>/report/my-skill
93
+ ```
94
+
95
+ ## Hosted Service (Phase 3)
96
+
97
+ The hosted badge service at `badge.selftune.dev` aggregates community contributions and serves badges publicly.
98
+
99
+ ### Endpoints
100
+
101
+ | Route | Method | Description |
102
+ |-------|--------|-------------|
103
+ | `/badge/:skill` | GET | SVG badge from aggregated community data |
104
+ | `/badge/:org/:skill` | GET | Organization-scoped SVG badge |
105
+
106
+ ### Embedding from hosted service
107
+
108
+ ```markdown
109
+ ![Skill Health: my-skill](https://badge.selftune.dev/badge/my-skill)
110
+ ```
111
+
112
+ ### Contributing data
113
+
114
+ ```bash
115
+ selftune contribute --submit --skill my-skill
116
+ ```
117
+
118
+ Use `--endpoint` to target a custom service URL, with `--github` as the fallback.
@@ -0,0 +1,121 @@
1
+ # selftune Baseline Workflow
2
+
3
+ Measure whether a skill adds value over a no-skill baseline. Runs trigger
4
+ checks with and without the skill description to compute lift — the
5
+ improvement in pass rate that the skill provides.
6
+
7
+ ## Default Command
8
+
9
+ ```bash
10
+ selftune baseline --skill <name> --skill-path <path> [options]
11
+ ```
12
+
13
+ ## Options
14
+
15
+ | Flag | Description | Default |
16
+ |------|-------------|---------|
17
+ | `--skill <name>` | Skill name | Required |
18
+ | `--skill-path <path>` | Path to the skill's SKILL.md | Required |
19
+ | `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
20
+ | `--agent <name>` | Agent CLI to use | Auto-detected |
21
+
22
+ ## Output Format
23
+
24
+ ```json
25
+ {
26
+ "skill_name": "Research",
27
+ "eval_set_size": 25,
28
+ "baseline_pass_rate": 0.32,
29
+ "with_skill_pass_rate": 0.88,
30
+ "lift": 0.56,
31
+ "adds_value": true,
32
+ "measured_at": "2026-03-04T12:00:00.000Z"
33
+ }
34
+ ```
35
+
36
+ ## How It Works
37
+
38
+ 1. Loads the eval set (from `--eval-set` or auto-generated from logs)
39
+ 2. Reads the skill's current description from SKILL.md
40
+ 3. Runs trigger checks against an **empty description** (no-skill baseline)
41
+ 4. Runs trigger checks against the **actual description** (with-skill)
42
+ 5. Computes pass rates for both conditions
43
+ 6. Calculates `lift = with_skill_pass_rate - baseline_pass_rate`
44
+ 7. Sets `adds_value = lift >= 0.05`
45
+
46
+ ## Integration with Evolve
47
+
48
+ The `selftune evolve` command supports a `--with-baseline` flag:
49
+
50
+ ```bash
51
+ selftune evolve --skill Research --skill-path /path/SKILL.md --with-baseline
52
+ ```
53
+
54
+ When enabled, the evolve command measures baseline lift before deploying.
55
+ If the skill doesn't add at least 0.05 lift (5 percentage points of pass rate) over the no-skill baseline, the evolution is
56
+ skipped — the skill needs fundamental rework, not description tweaks.
57
+
58
+ ## Steps
59
+
60
+ ### 0. Pre-Flight Configuration
61
+
62
+ Before running baseline measurement, present configuration options to the user.
63
+ If the user says "use defaults" or similar, skip to step 1 with recommended defaults.
64
+
65
+ Present these options:
66
+
67
+ ```
68
+ selftune baseline — Pre-Flight Configuration
69
+
70
+ 1. Eval Set Source
71
+ a) Auto-generate from logs (recommended if logs exist)
72
+ b) Use existing eval set file — provide path
73
+ c) Generate synthetic evals first (for new skills with no data)
74
+
75
+ 2. Agent CLI
76
+ a) Auto-detect (recommended)
77
+ b) Specify: claude / codex / opencode
78
+
79
+ → Reply with your choices or "use defaults" for recommended settings.
80
+ ```
81
+
82
+ After the user responds, show a confirmation summary:
83
+
84
+ ```
85
+ Configuration Summary:
86
+ Eval source: auto-generate from logs
87
+ Agent: auto-detect
88
+
89
+ Proceeding...
90
+ ```
91
+
92
+ ### 1. Run Baseline Measurement
93
+
94
+ ```bash
95
+ selftune baseline --skill Research --skill-path ~/.claude/skills/Research/SKILL.md
96
+ ```
97
+
98
+ ### 2. Interpret Results
99
+
100
+ | Lift | Interpretation | Action |
101
+ |------|---------------|--------|
102
+ | >= 0.20 | Strong value | Skill is working well |
103
+ | 0.05 to < 0.20 | Moderate value | Consider evolving to improve |
104
+ | 0 to < 0.05 | Minimal value | Skill may need rework, not just evolution |
105
+ | < 0 | Negative value | Skill is hurting — investigate or disable |
106
+
107
+ ### 3. Use as Evolution Gate
108
+
109
+ Add `--with-baseline` to evolve commands to prevent wasting evolution
110
+ cycles on skills that don't add value.
111
+
112
+ ## Common Patterns
113
+
114
+ **"Does the Research skill add value?"**
115
+ > `selftune baseline --skill Research --skill-path ~/.claude/skills/Research/SKILL.md`
116
+
117
+ **"Only evolve if the skill is actually useful"**
118
+ > `selftune evolve --skill Research --skill-path /path/SKILL.md --with-baseline`
119
+
120
+ **"Check baseline with a custom eval set"**
121
+ > `selftune baseline --skill pptx --skill-path /path/SKILL.md --eval-set evals-pptx.json`