selftune 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +38 -1
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +31 -12
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +479 -104
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +20 -3
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +145 -19
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/codex-rollout.ts +1 -1
  51. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  52. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  53. package/cli/selftune/init.ts +168 -5
  54. package/cli/selftune/last.ts +2 -2
  55. package/cli/selftune/memory/writer.ts +447 -0
  56. package/cli/selftune/monitoring/watch.ts +25 -2
  57. package/cli/selftune/status.ts +18 -15
  58. package/cli/selftune/types.ts +377 -5
  59. package/cli/selftune/utils/frontmatter.ts +217 -0
  60. package/cli/selftune/utils/llm-call.ts +29 -3
  61. package/cli/selftune/utils/transcript.ts +35 -0
  62. package/cli/selftune/utils/trigger-check.ts +89 -0
  63. package/cli/selftune/utils/tui.ts +156 -0
  64. package/dashboard/index.html +585 -19
  65. package/package.json +17 -6
  66. package/skill/SKILL.md +127 -10
  67. package/skill/Workflows/AutoActivation.md +144 -0
  68. package/skill/Workflows/Badge.md +118 -0
  69. package/skill/Workflows/Baseline.md +121 -0
  70. package/skill/Workflows/Composability.md +100 -0
  71. package/skill/Workflows/Contribute.md +91 -0
  72. package/skill/Workflows/Cron.md +155 -0
  73. package/skill/Workflows/Dashboard.md +203 -0
  74. package/skill/Workflows/Doctor.md +37 -1
  75. package/skill/Workflows/Evals.md +73 -5
  76. package/skill/Workflows/EvolutionMemory.md +152 -0
  77. package/skill/Workflows/Evolve.md +111 -6
  78. package/skill/Workflows/EvolveBody.md +159 -0
  79. package/skill/Workflows/ImportSkillsBench.md +111 -0
  80. package/skill/Workflows/Ingest.md +129 -15
  81. package/skill/Workflows/Initialize.md +58 -3
  82. package/skill/Workflows/Replay.md +70 -0
  83. package/skill/Workflows/Rollback.md +20 -1
  84. package/skill/Workflows/UnitTest.md +138 -0
  85. package/skill/Workflows/Watch.md +22 -0
  86. package/skill/settings_snippet.json +23 -0
  87. package/templates/activation-rules-default.json +27 -0
  88. package/templates/multi-skill-settings.json +64 -0
  89. package/templates/single-skill-settings.json +58 -0
package/CHANGELOG.md CHANGED
@@ -5,13 +5,50 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/).
7
7
 
8
- ## [0.6.0] - 2026-03-01
8
+ ## [Unreleased]
9
+
10
+ ## [0.2.0] — 2026-03-08
11
+
12
+ ### Added
13
+
14
+ - **Full skill body evolution** — Teacher-student model for evolving routing tables and complete skill bodies with 3-gate validation (structural, trigger, quality)
15
+ - **Synthetic eval generation** — `selftune evals --synthetic --skill <name> --skill-path <path>` generates eval sets from SKILL.md via LLM without needing real session logs. Solves cold-start for new skills.
16
+ - **Batch trigger validation** — `validateProposalBatched()` batches 10 queries per LLM call (configurable via `TRIGGER_CHECK_BATCH_SIZE`). ~10x faster evolution loops. Sequential `validateProposalSequential()` kept for backward compat.
17
+ - **Cheap-loop evolution mode** — `selftune evolve --cheap-loop` uses haiku for proposal generation and validation, sonnet only for the final deployment gate. New `--gate-model` and `--proposal-model` flags for manual per-stage control.
18
+ - **Validation model selection** — `--validation-model` flag on `evolve` and `evolve-body` commands (default: `haiku`).
19
+ - **Proposal model selection** — `--proposal-model` flag on `evolve`, passed through to `generateProposal()` and `generateMultipleProposals()`.
20
+ - **Gate validation dependency injection** — `gateValidateProposal` added to `EvolveDeps` for testability.
21
+ - **Auto-activation system** — `auto-activate.ts` UserPromptSubmit hook detects when selftune should run and outputs formatted suggestions; session state tracking prevents repeated nags; PAI coexistence support
22
+ - **Skill change guard** — `skill-change-guard.ts` PreToolUse hook detects Write/Edit to SKILL.md files and suggests running `selftune watch`
23
+ - **Evolution memory** — 3-file persistence system at `~/.selftune/memory/` (context.md, plan.md, decisions.md) survives context resets; auto-maintained by evolve, rollback, and watch commands
24
+ - **Specialized agents** — 4 purpose-built Claude Code agents: diagnosis-analyst, pattern-analyst, evolution-reviewer, integration-guide
25
+ - **Enforcement guardrails** — `evolution-guard.ts` PreToolUse hook blocks SKILL.md edits on actively monitored skills unless `selftune watch` has been run recently
26
+ - **Integration guide** — Comprehensive `docs/integration-guide.md` with project-type patterns (single-skill, multi-skill, monorepo, Codex-only, OpenCode-only, mixed)
27
+ - **Settings templates** — `templates/single-skill-settings.json`, `templates/multi-skill-settings.json`, `templates/activation-rules-default.json`
28
+ - **Enhanced init** — `selftune init` now detects workspace structure (skill count, monorepo layout) and suggests appropriate template
29
+ - **Dashboard server** — `selftune dashboard --serve` launches live Bun.serve server with SSE auto-refresh, action buttons (watch/evolve/rollback), and evolution timeline
30
+ - **Activation rules engine** — Configurable trigger rules for auto-activation (grading thresholds, stale evolutions, regression detection)
31
+ - **Sandbox test harness** (`tests/sandbox/run-sandbox.ts`): Exercises all CLI commands and hooks against fixture data in an isolated `/tmp` environment. Runs in ~400ms with 10/10 tests passing.
32
+ - **Devcontainer-based LLM testing** (`.devcontainer/` + `tests/sandbox/docker/`): Based on the official Claude Code devcontainer reference. Uses `claude -p` with `--dangerously-skip-permissions` for unattended LLM-dependent testing (grade, evolve, watch). No API key required — uses existing Claude subscription.
33
+ - **Realistic test fixtures**: 3 skills from skills.sh (find-skills, frontend-design, ai-image-generation) with 15 sessions, 30 queries, 7 skill usage records, and evolution audit history.
34
+ - **Hook integration tests**: All 3 Claude Code hooks (prompt-log, skill-eval, session-stop) tested via stdin payload injection.
35
+
36
+ ### Changed
37
+
38
+ - `validateProposal()` now delegates to `validateProposalBatched()` by default (was sequential).
39
+ - `hooks-to-evals.ts` `cliMain()` is now async to support synthetic generation.
40
+ - `EvolveOptions` extended with `validationModel`, `cheapLoop`, `gateModel`, `proposalModel`.
41
+ - `EvolveResult` extended with `gateValidation`.
42
+
43
+ ## [0.1.4] - 2026-03-01
9
44
 
10
45
  ### Added
11
46
 
12
47
  - `selftune status` — CLI skill health summary with pass rates, trends, and system health
13
48
  - `selftune last` — Quick insight from the most recent session
14
49
  - `selftune dashboard` — Skill-health-centric HTML dashboard with grid view and drill-down
50
+ - `selftune replay` — Claude Code transcript replay for retroactive log backfill
51
+ - `selftune contribute` — Opt-in anonymized data export for community contribution
15
52
  - CI/CD workflows: publish, auto-bump, CodeQL, scorecard
16
53
  - FOSS governance: LICENSE (MIT), CODE_OF_CONDUCT, CONTRIBUTING, SECURITY
17
54
  - npm package configuration with CJS bin entry point
package/README.md CHANGED
@@ -1,316 +1,156 @@
1
+ <div align="center">
2
+
3
+ <img src="assets/logo.svg" alt="selftune logo" width="80" />
4
+
5
+ # selftune
6
+
7
+ **Self-improving skills for AI agents.**
8
+
1
9
  [![CI](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml/badge.svg)](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
2
10
  [![CodeQL](https://github.com/WellDunDun/selftune/actions/workflows/codeql.yml/badge.svg)](https://github.com/WellDunDun/selftune/actions/workflows/codeql.yml)
3
11
  [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/WellDunDun/selftune/badge)](https://securityscorecards.dev/viewer/?uri=github.com/WellDunDun/selftune)
4
12
  [![npm version](https://img.shields.io/npm/v/selftune)](https://www.npmjs.com/package/selftune)
5
13
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
- [![TypeScript](https://img.shields.io/badge/TypeScript-5.0-blue.svg)](https://www.typescriptlang.org/)
7
- [![Zero Dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)](https://www.npmjs.com/package/selftune?activeTab=dependencies)
8
- [![Bun](https://img.shields.io/badge/runtime-bun%20%7C%20node-black)](https://bun.sh)
9
-
10
- # selftune — Skill Observability & Continuous Improvement CLI
11
-
12
- [![npm version](https://img.shields.io/npm/v/selftune)](https://www.npmjs.com/package/selftune)
13
- [![CI](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml/badge.svg)](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
14
- [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
14
+ [![TypeScript](https://img.shields.io/badge/TypeScript-blue.svg)](https://www.typescriptlang.org/)
15
15
  [![Zero Dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)](https://www.npmjs.com/package/selftune?activeTab=dependencies)
16
16
  [![Bun](https://img.shields.io/badge/runtime-bun%20%7C%20node-black)](https://bun.sh)
17
17
 
18
- Observe real sessions, detect missed triggers, grade execution quality, and automatically evolve skill descriptions toward the language real users actually use.
19
-
20
- Works with **Claude Code**, **Codex**, and **OpenCode**.
21
-
22
- ```
23
- Observe → Detect → Diagnose → Propose → Validate → Deploy → Watch → Repeat
24
- ```
25
-
26
- ---
27
-
28
- ## Install
29
-
30
- ```bash
31
- npx selftune@latest doctor
32
- ```
33
-
34
- Or install globally:
35
-
36
- ```bash
37
- npm install -g selftune
38
- selftune doctor
39
- ```
40
-
41
- Requires [Bun](https://bun.sh) or Node.js 18+ with [tsx](https://github.com/privatenumber/tsx).
42
-
43
- ---
44
-
45
- ## Why
18
+ Your agent skills learn how you work. Detect what's broken. Fix it automatically.
46
19
 
47
- Agent skills are static, but users are not. When a skill undertriggers — when someone says "make me a slide deck" and the pptx skill doesn't fire — that failure is invisible. The user concludes "AI doesn't follow directions" rather than recognizing the skill description doesn't match how real people talk.
20
+ **[Install](#install)** · **[Use Cases](#built-for-how-you-actually-work)** · **[How It Works](#how-it-works)** · **[Commands](#commands)** · **[Platforms](#platforms)** · **[Docs](docs/integration-guide.md)**
48
21
 
49
- selftune closes this feedback loop.
22
+ </div>
50
23
 
51
24
  ---
52
25
 
53
- ## What It Does
26
+ Your skills don't understand how you talk. You say "make me a slide deck" and nothing happens — no error, no log, no signal. selftune watches your real sessions, learns how you actually speak, and rewrites skill descriptions to match. Automatically.
54
27
 
55
- | Capability | Description |
56
- |---|---|
57
- | **Session telemetry** | Captures per-session process metrics across all three platforms |
58
- | **False negative detection** | Surfaces queries where a skill should have fired but didn't |
59
- | **Eval set generation** | Converts hook logs into trigger eval sets with real usage as ground truth |
60
- | **Session grading** | 3-tier evaluation (Trigger / Process / Quality) using the agent you already have |
61
- | **Skill evolution** | Proposes improved descriptions, validates them, deploys with audit trail |
62
- | **Post-deploy monitoring** | Watches evolved skills for regressions, auto-rollback on pass rate drops |
63
-
64
- ---
28
+ Works with **Claude Code**, **Codex**, **OpenCode**, and **OpenClaw**. Zero runtime dependencies.
65
29
 
66
- ## Setup
67
-
68
- ### 1. Add the skill
30
+ ## Install
69
31
 
70
32
  ```bash
71
33
  npx skills add WellDunDun/selftune
72
34
  ```
73
35
 
74
- ### 2. Initialize
75
-
76
- Tell your agent: **"initialize selftune"**
77
-
78
- The agent will install the CLI (`npm install -g selftune`) if needed, run `selftune init` to bootstrap config, install hooks, and verify with `selftune doctor`.
79
-
80
- ---
81
-
82
- ## Development
83
-
84
- For contributors running from source.
85
-
86
- ### 1. Initialize
87
-
88
- ```bash
89
- npx selftune@latest init
90
- ```
91
-
92
- The `init` command auto-detects your agent environment (Claude Code, Codex, or OpenCode), resolves the CLI path, determines the LLM mode, and writes config to `~/.selftune/config.json`. All subsequent commands read from this config.
93
-
94
- Use `--agent claude_code|codex|opencode` to override detection, `--llm-mode agent|api` to override LLM mode, or `--force` to reinitialize.
95
-
96
- ### 4. Install hooks (Claude Code)
36
+ Then tell your agent: **"initialize selftune"**
97
37
 
98
- If `init` reports hooks are not installed, merge the entries from `skill/settings_snippet.json` into `~/.claude/settings.json`. Derive hook script paths from the `cli_path` field in `~/.selftune/config.json` — the hooks directory is at `dirname(cli_path)/hooks/`.
38
+ Two minutes. No API keys. No external services. No configuration ceremony. Uses your existing agent subscription. Within minutes you'll see which skills are undertriggering.
99
39
 
100
- ### 5. Verify setup
40
+ **CLI only** (no skill, just the CLI):
101
41
 
102
42
  ```bash
103
- selftune doctor
104
- ```
105
-
106
- Doctor checks log file health, hook installation, schema validity, and config status.
107
-
108
- ### Platform-Specific Notes
109
-
110
- **Claude Code** — Hooks capture telemetry automatically after installation. Zero configuration once hooks are in `settings.json`.
111
-
112
- **Codex** — Use the wrapper for real-time capture or the batch ingestor for historical logs:
113
- ```bash
114
- selftune wrap-codex -- <your codex args>
115
- selftune ingest-codex
43
+ npx selftune@latest doctor
116
44
  ```
117
45
 
118
- **OpenCode** — Backfill historical sessions from SQLite:
119
- ```bash
120
- selftune ingest-opencode
121
- ```
46
+ ## Before / After
122
47
 
123
- All platforms write to the same shared JSONL log schema at `~/.claude/`.
48
+ <p align="center">
49
+ <img src="./assets/BeforeAfter.gif" alt="Before: 47% pass rate → After: 89% pass rate" width="800">
50
+ </p>
124
51
 
125
- ---
52
+ selftune learned that real users say "slides", "deck", "presentation for Monday" — none of which matched the original skill description. It rewrote the description to match how people actually talk. Validated against the eval set. Deployed with a backup. Done.
126
53
 
127
- ## Commands
54
+ ## Built for How You Actually Work
128
55
 
129
- ```
130
- selftune <command> [options]
131
- ```
56
+ **I write and use my own skills** — You built skills for your workflow but your descriptions don't match how you actually talk. selftune learns your language from real sessions and evolves descriptions to match — no more manual tuning. `selftune status` · `selftune evolve` · `selftune baseline`
132
57
 
133
- | Command | Purpose |
134
- |---|---|
135
- | `init` | Auto-detect agent environment, write `~/.selftune/config.json` |
136
- | `grade --skill <name>` | Grade a session (3-tier: trigger, process, quality) |
137
- | `evals --skill <name>` | Generate eval set from real usage logs |
138
- | `evals --list-skills` | Show logged skills and query counts |
139
- | `evolve --skill <name> --skill-path <path>` | Analyze failures, propose and deploy improved description |
140
- | `rollback --skill <name> --skill-path <path>` | Restore pre-evolution description |
141
- | `watch --skill <name> --skill-path <path>` | Monitor post-deploy pass rates, detect regressions |
142
- | `status` | Show skill health summary (pass rates, trends, missed queries) |
143
- | `last` | Show quick insight from the most recent session |
144
- | `doctor` | Health checks on logs, hooks, config, and schema |
145
- | `dashboard` | Open skill-health-centric HTML dashboard in browser |
146
- | `ingest-codex` | Batch ingest Codex rollout logs |
147
- | `ingest-opencode` | Backfill historical OpenCode sessions from SQLite |
148
- | `wrap-codex -- <args>` | Real-time Codex wrapper with telemetry |
149
-
150
- No separate API key required — grading and evolution use whatever agent CLI you already have installed (Claude Code, Codex, or OpenCode).
151
-
152
- See `skill/Workflows/` for detailed step-by-step guides for each command.
58
+ **I publish skills others install** — Your skill works for you, but every user talks differently. selftune ships skills that get better for every user automatically — adapting descriptions to how each person actually works. `selftune status` · `selftune evals` · `selftune badge`
153
59
 
154
- ---
60
+ **I manage an agent setup with many skills** — You have 15+ skills installed. Some work. Some don't. Some conflict. selftune gives you a health dashboard and automatically improves the skills that aren't keeping up with how your team works. `selftune dashboard` · `selftune composability` · `selftune doctor`
155
61
 
156
62
  ## How It Works
157
63
 
158
- ### Telemetry Capture
64
+ <p align="center">
65
+ <img src="./assets/FeedbackLoop.gif" alt="Observe → Detect → Evolve → Watch" width="800">
66
+ </p>
159
67
 
160
- ```
161
- Claude Code (hooks): OpenCode (hooks):
162
- UserPromptSubmit → prompt-log.ts message.* → opencode-prompt-log.ts
163
- PostToolUse → skill-eval.ts tool.execute.after → opencode-skill-eval.ts
164
- Stop → session-stop.ts session.idle → opencode-session-stop.ts
165
- │ │
166
- └──────────┬─────────────────────────┘
167
-
168
- Shared JSONL Log Schema (~/.claude/)
169
- ├── all_queries_log.jsonl
170
- ├── skill_usage_log.jsonl
171
- └── session_telemetry_log.jsonl
172
-
173
- Codex (wrapper/ingestor — hooks not yet available):
174
- codex-wrapper.ts (real-time tee of JSONL stream)
175
- codex-rollout.ts (batch ingest from rollout logs)
176
-
177
- └──→ Same shared JSONL schema
178
- ```
68
+ A continuous feedback loop that makes your skills learn and adapt. Automatically.
179
69
 
180
- ### Eval & Grading
70
+ **Observe** — Hooks capture every user query and which skills fired. On Claude Code, hooks install automatically. Use `selftune replay` to backfill existing transcripts. This is how your skills start learning.
181
71
 
182
- ```
183
- selftune evals cross-references the two query logs:
184
- Positives = skill_usage_log entries for target skill
185
- Negatives = all_queries_log entries NOT in positives
186
-
187
- selftune grade reads:
188
- session_telemetry_log → process metrics (tool calls, errors, turns)
189
- transcript JSONL → what actually happened
190
- expectations → what should have happened
191
- ```
72
+ **Detect** — selftune finds the gap between how you talk and how your skills are described. You say "make me a slide deck" and your pptx skill stays silent — selftune catches that mismatch.
192
73
 
193
- ### Evolution Loop
74
+ **Evolve** — Rewrites skill descriptions — and full skill bodies — to match how you actually work. Batched validation with per-stage model control (`--cheap-loop` uses haiku for the loop, sonnet for the gate). Teacher-student body evolution with 3-gate validation. Baseline comparison gates on measurable lift. Automatic backup.
194
75
 
195
- ```
196
- selftune evolve:
197
- 1. Load eval set (or generate from logs)
198
- 2. Extract failure patterns (missed queries grouped by invocation type)
199
- 3. Generate improved description via LLM
200
- 4. Validate against eval set (must improve, <5% regression)
201
- 5. Deploy updated SKILL.md + PR + audit trail
202
-
203
- selftune watch:
204
- Monitor pass rate over sliding window of recent sessions
205
- Alert (or auto-rollback) on regression > threshold
206
- ```
76
+ **Watch** — After deploying changes, selftune monitors skill trigger rates. If anything regresses, it rolls back automatically. Your skills keep improving without you touching them.
207
77
 
208
- ---
78
+ ## What's New in v0.2.0
209
79
 
210
- ## Architecture
80
+ - **Full skill body evolution** — Beyond descriptions: evolve routing tables and entire skill bodies using teacher-student model with structural, trigger, and quality gates
81
+ - **Synthetic eval generation** — `selftune evals --synthetic` generates eval sets from SKILL.md via LLM, no session logs needed. Solves cold-start: new skills get evals immediately.
82
+ - **Cheap-loop evolution** — `selftune evolve --cheap-loop` uses haiku for proposal generation and validation, sonnet only for the final deployment gate. ~80% cost reduction.
83
+ - **Batch trigger validation** — Validation now batches 10 queries per LLM call instead of one-per-query. ~10x faster evolution loops.
84
+ - **Per-stage model control** — `--validation-model`, `--proposal-model`, and `--gate-model` flags give fine-grained control over which model runs each evolution stage.
85
+ - **Auto-activation system** — Hooks detect when selftune should run and suggest actions
86
+ - **Enforcement guardrails** — Blocks SKILL.md edits on monitored skills unless `selftune watch` has been run
87
+ - **Live dashboard server** — `selftune dashboard --serve` with SSE auto-refresh and action buttons
88
+ - **Evolution memory** — Persists context, plans, and decisions across context resets
89
+ - **4 specialized agents** — Diagnosis analyst, pattern analyst, evolution reviewer, integration guide
90
+ - **Sandbox test harness** — Comprehensive automated test coverage, including devcontainer-based LLM testing
211
91
 
212
- ```
213
- cli/selftune/
214
- ├── index.ts CLI entry point (command router)
215
- ├── init.ts Agent detection, config bootstrap
216
- ├── types.ts, constants.ts Shared interfaces and constants
217
- ├── observability.ts Health checks (doctor command)
218
- ├── status.ts Skill health summary (status command)
219
- ├── last.ts Last session insight (last command)
220
- ├── dashboard.ts HTML dashboard builder (dashboard command)
221
- ├── utils/ JSONL, transcript parsing, LLM calls, schema validation
222
- ├── hooks/ Claude Code + OpenCode telemetry capture
223
- ├── ingestors/ Codex adapters + OpenCode backfill
224
- ├── eval/ False negative detection, eval set generation
225
- ├── grading/ 3-tier session grading (agent or API mode)
226
- ├── evolution/ Failure extraction, proposal, validation, deploy, rollback
227
- └── monitoring/ Post-deploy regression detection
228
-
229
- dashboard/
230
- └── index.html Skill-health-centric HTML dashboard template
231
-
232
- skill/
233
- ├── SKILL.md Routing table (~120 lines)
234
- ├── settings_snippet.json Claude Code hook config template
235
- ├── references/ Domain knowledge (logs, grading methodology, taxonomy)
236
- └── Workflows/ Step-by-step guides (1 per command)
237
- ```
238
-
239
- Dependencies flow forward only: `shared → hooks/ingestors → eval → grading → evolution → monitoring`. Enforced by `lint-architecture.ts`.
240
-
241
- Config persists at `~/.selftune/config.json` (written by `init`, read by all commands via skill workflows).
242
-
243
- See [ARCHITECTURE.md](ARCHITECTURE.md) for the full domain map and module rules.
244
-
245
- ---
246
-
247
- ## Log Schema
248
-
249
- Three append-only JSONL files at `~/.claude/`:
250
-
251
- | File | Record type | Key fields |
252
- |---|---|---|
253
- | `all_queries_log.jsonl` | `QueryLogRecord` | `timestamp`, `session_id`, `query`, `source?` |
254
- | `skill_usage_log.jsonl` | `SkillUsageRecord` | `timestamp`, `session_id`, `skill_name`, `query`, `triggered` |
255
- | `session_telemetry_log.jsonl` | `SessionTelemetryRecord` | `timestamp`, `session_id`, `tool_calls`, `bash_commands`, `skills_triggered`, `errors_encountered` |
256
- | `evolution_audit_log.jsonl` | `EvolutionAuditEntry` | `timestamp`, `proposal_id`, `action`, `details`, `eval_snapshot?` |
257
-
258
- The `source` field identifies the platform: `claude_code`, `codex`, or `opencode`.
259
-
260
- ---
261
-
262
- ## Development
92
+ ## Commands
263
93
 
264
- ```bash
265
- make check # lint + architecture lint + all tests
266
- make lint # biome check + architecture lint
267
- make test # bun test
268
- ```
94
+ | Command | What it does |
95
+ |---|---|
96
+ | `selftune status` | See which skills are undertriggering and why |
97
+ | `selftune evals --skill <name>` | Generate eval sets from real session data (`--synthetic` for cold-start) |
98
+ | `selftune evolve --skill <name>` | Propose, validate, and deploy improved descriptions (`--cheap-loop`, `--with-baseline`) |
99
+ | `selftune evolve-body --skill <name>` | Evolve full skill body or routing table (teacher-student, 3-gate validation) |
100
+ | `selftune baseline --skill <name>` | Measure skill value vs no-skill baseline |
101
+ | `selftune unit-test --skill <name>` | Run or generate skill-level unit tests |
102
+ | `selftune composability --skill <name>` | Detect conflicts between co-occurring skills |
103
+ | `selftune import-skillsbench` | Import external eval corpus from [SkillsBench](https://github.com/benchflow-ai/skillsbench) |
104
+ | `selftune badge --skill <name>` | Generate skill health badge SVG |
105
+ | `selftune watch --skill <name>` | Monitor after deploy. Auto-rollback on regression. |
106
+ | `selftune dashboard` | Open the visual skill health dashboard |
107
+ | `selftune replay` | Backfill data from existing Claude Code transcripts |
108
+ | `selftune doctor` | Health check: logs, hooks, config, permissions |
109
+
110
+ Full command reference: `selftune --help`
111
+
112
+ ## Why Not Just Rewrite Skills Manually?
113
+
114
+ | Approach | Problem |
115
+ |---|---|
116
+ | Rewrite the description yourself | No data on how users actually talk. No validation. No regression detection. |
117
+ | Add "ALWAYS invoke when..." directives | Brittle. One agent rewrite away from breaking. |
118
+ | Force-load skills on every prompt | Doesn't fix the description. Expensive band-aid. |
119
+ | **selftune** | Learns from real usage, rewrites descriptions to match how you work, validates against eval sets, auto-rollbacks on regressions. |
269
120
 
270
- Zero runtime dependencies. Uses Bun built-ins only.
121
+ ## Different Layer, Different Problem
271
122
 
272
- ---
123
+ LLM observability tools trace API calls. Infrastructure tools monitor servers. Neither knows whether the right skill fired for the right person. selftune does — and fixes it automatically.
273
124
 
274
- ## Tips
125
+ selftune is complementary to these tools, not competitive. They trace what happens inside the LLM. selftune makes sure the right skill is called in the first place.
275
126
 
276
- Run `selftune init` first — everything else reads from the config it writes.
277
- - Let logs accumulate over several days before running evals — more diverse real queries = more reliable signal.
278
- - All hooks are silent (exit 0) and take <50ms. Negligible overhead.
279
- - Logs are append-only JSONL. Safe to delete to start fresh, or archive old files.
280
- - Use `--max 75` to increase eval set size once you have enough data.
281
- - Use `--seed 123` for a different random sample of negatives.
282
- - Use `--dry-run` with `evolve` to preview proposals without deploying.
283
- - The `doctor` command checks log health, hook presence, config status, and schema validity.
127
+ | Dimension | selftune | Langfuse | LangSmith | OpenLIT |
128
+ |-----------|----------|----------|-----------|---------|
129
+ | **Layer** | Skill-specific | LLM call | Agent trace | Infrastructure |
130
+ | **Detects** | Missed triggers, false negatives, skill conflicts | Token usage, latency | Chain failures | System metrics |
131
+ | **Improves** | Descriptions, body, and routing automatically | — | — | — |
132
+ | **Setup** | Zero deps, zero API keys | Self-host or cloud | Cloud required | Helm chart |
133
+ | **Price** | Free (MIT) | Freemium | Paid | Free |
134
+ | **Unique** | Self-improving skills + auto-rollback | Prompt management | Evaluations | Dashboards |
284
135
 
285
- ---
286
-
287
- ## Contributing
136
+ ## Platforms
288
137
 
289
- See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, architecture rules, and PR guidelines.
138
+ **Claude Code** — Hooks install automatically. `selftune replay` backfills existing transcripts.
290
139
 
291
- Please follow our [Code of Conduct](CODE_OF_CONDUCT.md).
140
+ **Codex** — `selftune wrap-codex -- <args>` or `selftune ingest-codex`
292
141
 
293
- ---
142
+ **OpenCode** — `selftune ingest-opencode`
294
143
 
295
- ## Security
144
+ **OpenClaw** — `selftune ingest-openclaw` + `selftune cron setup` for autonomous evolution
296
145
 
297
- To report a vulnerability, see [SECURITY.md](SECURITY.md).
146
+ Requires [Bun](https://bun.sh) or Node.js 18+. No extra API keys.
298
147
 
299
148
  ---
300
149
 
301
- ## Sponsor
150
+ <div align="center">
302
151
 
303
- If selftune saves you time, consider [sponsoring the project](https://github.com/sponsors/WellDunDun).
304
-
305
- ---
152
+ [Architecture](ARCHITECTURE.md) · [Contributing](CONTRIBUTING.md) · [Security](SECURITY.md) · [Integration Guide](docs/integration-guide.md) · [Sponsor](https://github.com/sponsors/WellDunDun)
306
153
 
307
- ## Milestones
154
+ MIT licensed. Free forever. Works with Claude Code, Codex, OpenCode, and OpenClaw.
308
155
 
309
- | Version | Scope | Status |
310
- |---|---|---|
311
- | v0.1 | Hooks, ingestors, shared schema, eval generation | Done |
312
- | v0.2 | Session grading, grader skill | Done |
313
- | v0.3 | Evolution loop (propose, validate, deploy, rollback) | Done |
314
- | v0.4 | Post-deploy monitoring, regression detection | Done |
315
- | v0.5 | Agent-first skill restructure, `init` command, config bootstrap | Done |
316
- | v0.6 | Three-layer observability: `status`, `last`, redesigned dashboard | Done |
156
+ </div>
Binary file
Binary file
@@ -0,0 +1,9 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="250" height="250" viewBox="0 0 250 250" fill="none">
2
+ <path d="M 190.16,31.49 C 187.91,29.88 184.51,32.19 185.88,35.16 C 186.31,36.11 187.08,36.54 187.71,37.01 C 218.75,59.86 237.63,92.71 237.63,128.82 C 237.63,175.99 205.12,218.56 153.82,234.69 C 149.89,235.93 150.91,241.71 154.91,240.66 C 205.98,226.96 243.01,181.94 243,128.45 C 242.99,90.87 223.47,56.18 190.16,31.49 Z" fill="#E8DED0"/>
3
+ <path d="M 125.19,243.91 C 138.08,243.91 147.18,236.44 151.21,225.01 C 193.72,217.79 226.98,184.02 226.98,140.81 C 226.98,121.17 219.82,103.78 209.93,87.04 C 191.42,55.45 165.15,34.72 117.71,28.65 C 112.91,28.04 113.77,34.35 117.19,34.82 C 161.67,39.33 185.84,56.71 203.76,86.42 C 213.87,103.68 220.68,119.61 220.68,140.81 C 220.68,179.96 190.81,211.95 148.71,219.16 C 147.11,219.47 146.27,220.32 145.92,221.8 C 142.95,231.11 135.72,238.02 125.19,237.66 C 64.48,237.66 11.67,191.61 11.67,127.51 C 11.67,79.61 44.82,36.38 93.89,27.77 L 94.11,27.73 L 94.38,26.64 C 97.04,16.61 104.57,11.82 114.19,11.82 C 134.12,13.36 152.91,18.15 170.48,26.08 C 171.92,26.78 173.81,27.09 174.76,25.59 C 176.05,23.72 175.31,21.07 173.01,20.34 C 154.78,11.96 137.21,7.17 114.47,6 H 113.52 C 101.91,6 93.46,12.16 89.49,21.78 C 42.36,31.26 6.17,74.76 6.17,128.08 C 6.17,190.05 57.92,243.91 125.19,243.91 Z" fill="#E8DED0"/>
4
+ <path d="M 93.67,40.64 C 100.51,52.07 109.54,51.33 114.05,52.17 C 128.72,53.91 141.48,55.78 157.38,62.16 C 162.72,64.47 162.29,58.19 159.18,57.01 C 145.11,51.33 132.48,49.79 111.31,47.48 C 101.83,46.29 95.45,41.18 93.75,32.81 C 55.21,39.46 22.06,72.17 22.06,112.48 C 22.06,131.98 30.36,149.82 43.26,164.49 C 46.23,167.59 50.19,164.13 48.32,161.02 C 36.21,145.54 28.42,129.78 28.42,112.4 C 28.42,79.11 54.91,48.36 89.91,40.36 C 90.76,40.15 91.04,39.87 91.62,40.01 C 92.62,40.01 93.04,39.65 93.67,40.64 Z" fill="#E8DED0"/>
5
+ <path d="M 152.72,82.77 C 126.61,82.77 113.07,99.44 103.01,119.33 C 100.56,123.36 103.74,125.03 105.61,123.92 C 107.15,123.22 107.89,121.05 108.73,119.61 C 118.22,102.16 130.33,88.56 152.72,88.56 C 181.62,88.56 201.91,116.01 201.91,147.31 C 201.91,175.12 183.47,199.96 152.51,205.75 C 151.84,205.96 151.63,206.03 151.56,205.54 C 147.74,195.37 139.36,188.15 128.07,186.48 C 113.2,184.24 101.23,182.36 83.8,176.81 C 79.3,175.48 77.91,182.36 82.41,183.09 C 97.21,187.46 108.09,189.47 126.25,192.65 C 136.78,194.31 145.41,201.71 147.11,210.95 C 147.74,213.05 149.13,213.41 150.15,213.26 C 183.75,208.61 208.26,180.93 208.26,147.24 C 208.26,115.06 186.94,82.77 152.72,82.77 Z" fill="#E8DED0"/>
6
+ <path d="M 129.77,105.21 C 122.93,112.05 118.97,122.73 113.77,130.41 C 111.31,133.45 114.56,136.63 117.46,134.46 C 123.75,126.23 127.43,115.62 135.15,108.71 C 138.22,105.81 134.73,101.09 129.77,105.21 Z" fill="#E8DED0"/>
7
+ <path d="M 136.78,120.31 C 127.71,136.71 120.12,154.91 93.74,154.91 C 66.07,154.91 47.76,128.53 47.76,104.78 C 47.76,84.47 58.57,66.08 77.66,56.25 C 82.23,54.21 79.85,47.76 75.34,49.93 C 54.77,59.72 42.01,80.11 42.01,104.71 C 42.01,131.77 61.86,161.31 93.67,161.31 C 114.77,161.31 128.91,147.24 139.86,124.06 C 142.76,120.45 139.15,117.73 136.78,120.31 Z" fill="#E8DED0"/>
8
+ <path d="M 30.73,154.7 C 27.76,152.97 23.87,155.93 25.41,158.76 C 41.73,188.36 68.94,199.79 105.75,206.41 C 112.25,207.66 122.07,208.75 123.46,209.03 C 128.07,209.95 128.07,220.18 121.78,220.18 C 107.64,218.94 92.06,215.98 76.23,211.33 C 72.13,210.24 71.04,216.69 75.27,217.64 C 90.41,222.22 103.95,224.74 120.47,226.54 C 133.73,226.54 136.56,209.03 126.03,203.38 C 123.75,202.13 122.73,202.56 112.04,200.76 C 78.09,195.04 54.06,188.98 32.12,155.65 C 31.77,155.23 31.28,154.91 30.73,154.7 Z" fill="#E8DED0"/>
9
+ </svg>
@@ -0,0 +1,20 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="138" height="20" role="img" aria-label="Skill Health: no data">
2
+ <linearGradient id="b" x2="0" y2="100%">
3
+ <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
4
+ <stop offset="1" stop-opacity=".1"/>
5
+ </linearGradient>
6
+ <clipPath id="a">
7
+ <rect width="138" height="20" rx="3" fill="#fff"/>
8
+ </clipPath>
9
+ <g clip-path="url(#a)">
10
+ <rect width="78" height="20" fill="#555"/>
11
+ <rect x="79" width="59" height="20" fill="#9f9f9f"/>
12
+ <rect width="138" height="20" fill="url(#b)"/>
13
+ </g>
14
+ <g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" font-size="11">
15
+ <text x="39" y="15" fill="#010101" fill-opacity=".3">Skill Health</text>
16
+ <text x="39" y="14">Skill Health</text>
17
+ <text x="108.5" y="15" fill="#010101" fill-opacity=".3">no data</text>
18
+ <text x="108.5" y="14">no data</text>
19
+ </g>
20
+ </svg>