selftune 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +38 -1
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +31 -12
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +479 -104
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +20 -3
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +145 -19
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/codex-rollout.ts +1 -1
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/init.ts +168 -5
- package/cli/selftune/last.ts +2 -2
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +18 -15
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +585 -19
- package/package.json +17 -6
- package/skill/SKILL.md +127 -10
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +73 -5
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +129 -15
- package/skill/Workflows/Initialize.md +58 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
package/CHANGELOG.md
CHANGED
|
@@ -5,13 +5,50 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/).
|
|
7
7
|
|
|
8
|
-
## [
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.2.0] - 2026-03-08
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **Full skill body evolution** — Teacher-student model for evolving routing tables and complete skill bodies with 3-gate validation (structural, trigger, quality)
|
|
15
|
+
- **Synthetic eval generation** — `selftune evals --synthetic --skill <name> --skill-path <path>` generates eval sets from SKILL.md via LLM without needing real session logs. Solves cold-start for new skills.
|
|
16
|
+
- **Batch trigger validation** — `validateProposalBatched()` batches 10 queries per LLM call (configurable via `TRIGGER_CHECK_BATCH_SIZE`). ~10x faster evolution loops. The sequential `validateProposalSequential()` is kept for backward compatibility.
|
|
17
|
+
- **Cheap-loop evolution mode** — `selftune evolve --cheap-loop` uses haiku for proposal generation and validation, sonnet only for the final deployment gate. New `--gate-model` and `--proposal-model` flags for manual per-stage control.
|
|
18
|
+
- **Validation model selection** — `--validation-model` flag on `evolve` and `evolve-body` commands (default: `haiku`).
|
|
19
|
+
- **Proposal model selection** — `--proposal-model` flag on `evolve`, passed through to `generateProposal()` and `generateMultipleProposals()`.
|
|
20
|
+
- **Gate validation dependency injection** — `gateValidateProposal` added to `EvolveDeps` for testability.
|
|
21
|
+
- **Auto-activation system** — `auto-activate.ts` UserPromptSubmit hook detects when selftune should run and outputs formatted suggestions; session state tracking prevents repeated nags; PAI coexistence support
|
|
22
|
+
- **Skill change guard** — `skill-change-guard.ts` PreToolUse hook detects Write/Edit to SKILL.md files and suggests running `selftune watch`
|
|
23
|
+
- **Evolution memory** — 3-file persistence system at `~/.selftune/memory/` (context.md, plan.md, decisions.md) survives context resets; auto-maintained by evolve, rollback, and watch commands
|
|
24
|
+
- **Specialized agents** — 4 purpose-built Claude Code agents: diagnosis-analyst, pattern-analyst, evolution-reviewer, integration-guide
|
|
25
|
+
- **Enforcement guardrails** — `evolution-guard.ts` PreToolUse hook blocks SKILL.md edits on actively monitored skills unless `selftune watch` has been run recently
|
|
26
|
+
- **Integration guide** — Comprehensive `docs/integration-guide.md` with project-type patterns (single-skill, multi-skill, monorepo, Codex-only, OpenCode-only, mixed)
|
|
27
|
+
- **Settings templates** — `templates/single-skill-settings.json`, `templates/multi-skill-settings.json`, `templates/activation-rules-default.json`
|
|
28
|
+
- **Enhanced init** — `selftune init` now detects workspace structure (skill count, monorepo layout) and suggests appropriate template
|
|
29
|
+
- **Dashboard server** — `selftune dashboard --serve` launches live Bun.serve server with SSE auto-refresh, action buttons (watch/evolve/rollback), and evolution timeline
|
|
30
|
+
- **Activation rules engine** — Configurable trigger rules for auto-activation (grading thresholds, stale evolutions, regression detection)
|
|
31
|
+
- **Sandbox test harness** (`tests/sandbox/run-sandbox.ts`): Exercises all CLI commands and hooks against fixture data in an isolated `/tmp` environment. Runs in ~400ms with 10/10 tests passing.
|
|
32
|
+
- **Devcontainer-based LLM testing** (`.devcontainer/` + `tests/sandbox/docker/`): Based on the official Claude Code devcontainer reference. Uses `claude -p` with `--dangerously-skip-permissions` for unattended LLM-dependent testing (grade, evolve, watch). No API key required — uses existing Claude subscription.
|
|
33
|
+
- **Realistic test fixtures**: 3 skills from skills.sh (find-skills, frontend-design, ai-image-generation) with 15 sessions, 30 queries, 7 skill usage records, and evolution audit history.
|
|
34
|
+
- **Hook integration tests**: All 3 Claude Code hooks (prompt-log, skill-eval, session-stop) tested via stdin payload injection.
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
|
|
38
|
+
- `validateProposal()` now delegates to `validateProposalBatched()` by default (was sequential).
|
|
39
|
+
- `hooks-to-evals.ts` `cliMain()` is now async to support synthetic generation.
|
|
40
|
+
- `EvolveOptions` extended with `validationModel`, `cheapLoop`, `gateModel`, `proposalModel`.
|
|
41
|
+
- `EvolveResult` extended with `gateValidation`.
|
|
42
|
+
|
|
43
|
+
## [0.1.4] - 2026-03-01
|
|
9
44
|
|
|
10
45
|
### Added
|
|
11
46
|
|
|
12
47
|
- `selftune status` — CLI skill health summary with pass rates, trends, and system health
|
|
13
48
|
- `selftune last` — Quick insight from the most recent session
|
|
14
49
|
- `selftune dashboard` — Skill-health-centric HTML dashboard with grid view and drill-down
|
|
50
|
+
- `selftune replay` — Claude Code transcript replay for retroactive log backfill
|
|
51
|
+
- `selftune contribute` — Opt-in anonymized data export for community contribution
|
|
15
52
|
- CI/CD workflows: publish, auto-bump, CodeQL, scorecard
|
|
16
53
|
- FOSS governance: LICENSE (MIT), CODE_OF_CONDUCT, CONTRIBUTING, SECURITY
|
|
17
54
|
- npm package configuration with CJS bin entry point
|
package/README.md
CHANGED
|
@@ -1,316 +1,156 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="assets/logo.svg" alt="selftune logo" width="80" />
|
|
4
|
+
|
|
5
|
+
# selftune
|
|
6
|
+
|
|
7
|
+
**Self-improving skills for AI agents.**
|
|
8
|
+
|
|
1
9
|
[](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
|
|
2
10
|
[](https://github.com/WellDunDun/selftune/actions/workflows/codeql.yml)
|
|
3
11
|
[](https://securityscorecards.dev/viewer/?uri=github.com/WellDunDun/selftune)
|
|
4
12
|
[](https://www.npmjs.com/package/selftune)
|
|
5
13
|
[](LICENSE)
|
|
6
|
-
[](https://www.npmjs.com/package/selftune?activeTab=dependencies)
|
|
8
|
-
[](https://bun.sh)
|
|
9
|
-
|
|
10
|
-
# selftune — Skill Observability & Continuous Improvement CLI
|
|
11
|
-
|
|
12
|
-
[](https://www.npmjs.com/package/selftune)
|
|
13
|
-
[](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
|
|
14
|
-
[](LICENSE)
|
|
14
|
+
[](https://www.typescriptlang.org/)
|
|
15
15
|
[](https://www.npmjs.com/package/selftune?activeTab=dependencies)
|
|
16
16
|
[](https://bun.sh)
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
Works with **Claude Code**, **Codex**, and **OpenCode**.
|
|
21
|
-
|
|
22
|
-
```
|
|
23
|
-
Observe → Detect → Diagnose → Propose → Validate → Deploy → Watch → Repeat
|
|
24
|
-
```
|
|
25
|
-
|
|
26
|
-
---
|
|
27
|
-
|
|
28
|
-
## Install
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
npx selftune@latest doctor
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
Or install globally:
|
|
35
|
-
|
|
36
|
-
```bash
|
|
37
|
-
npm install -g selftune
|
|
38
|
-
selftune doctor
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
Requires [Bun](https://bun.sh) or Node.js 18+ with [tsx](https://github.com/privatenumber/tsx).
|
|
42
|
-
|
|
43
|
-
---
|
|
44
|
-
|
|
45
|
-
## Why
|
|
18
|
+
Your agent skills learn how you work. Detect what's broken. Fix it automatically.
|
|
46
19
|
|
|
47
|
-
|
|
20
|
+
**[Install](#install)** · **[Use Cases](#built-for-how-you-actually-work)** · **[How It Works](#how-it-works)** · **[Commands](#commands)** · **[Platforms](#platforms)** · **[Docs](docs/integration-guide.md)**
|
|
48
21
|
|
|
49
|
-
|
|
22
|
+
</div>
|
|
50
23
|
|
|
51
24
|
---
|
|
52
25
|
|
|
53
|
-
|
|
26
|
+
Your skills don't understand how you talk. You say "make me a slide deck" and nothing happens — no error, no log, no signal. selftune watches your real sessions, learns how you actually speak, and rewrites skill descriptions to match. Automatically.
|
|
54
27
|
|
|
55
|
-
|
|
56
|
-
|---|---|
|
|
57
|
-
| **Session telemetry** | Captures per-session process metrics across all three platforms |
|
|
58
|
-
| **False negative detection** | Surfaces queries where a skill should have fired but didn't |
|
|
59
|
-
| **Eval set generation** | Converts hook logs into trigger eval sets with real usage as ground truth |
|
|
60
|
-
| **Session grading** | 3-tier evaluation (Trigger / Process / Quality) using the agent you already have |
|
|
61
|
-
| **Skill evolution** | Proposes improved descriptions, validates them, deploys with audit trail |
|
|
62
|
-
| **Post-deploy monitoring** | Watches evolved skills for regressions, auto-rollback on pass rate drops |
|
|
63
|
-
|
|
64
|
-
---
|
|
28
|
+
Works with **Claude Code**, **Codex**, **OpenCode**, and **OpenClaw**. Zero runtime dependencies.
|
|
65
29
|
|
|
66
|
-
##
|
|
67
|
-
|
|
68
|
-
### 1. Add the skill
|
|
30
|
+
## Install
|
|
69
31
|
|
|
70
32
|
```bash
|
|
71
33
|
npx skills add WellDunDun/selftune
|
|
72
34
|
```
|
|
73
35
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
Tell your agent: **"initialize selftune"**
|
|
77
|
-
|
|
78
|
-
The agent will install the CLI (`npm install -g selftune`) if needed, run `selftune init` to bootstrap config, install hooks, and verify with `selftune doctor`.
|
|
79
|
-
|
|
80
|
-
---
|
|
81
|
-
|
|
82
|
-
## Development
|
|
83
|
-
|
|
84
|
-
For contributors running from source.
|
|
85
|
-
|
|
86
|
-
### 1. Initialize
|
|
87
|
-
|
|
88
|
-
```bash
|
|
89
|
-
npx selftune@latest init
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
The `init` command auto-detects your agent environment (Claude Code, Codex, or OpenCode), resolves the CLI path, determines the LLM mode, and writes config to `~/.selftune/config.json`. All subsequent commands read from this config.
|
|
93
|
-
|
|
94
|
-
Use `--agent claude_code|codex|opencode` to override detection, `--llm-mode agent|api` to override LLM mode, or `--force` to reinitialize.
|
|
95
|
-
|
|
96
|
-
### 4. Install hooks (Claude Code)
|
|
36
|
+
Then tell your agent: **"initialize selftune"**
|
|
97
37
|
|
|
98
|
-
|
|
38
|
+
Two minutes. No API keys. No external services. No configuration ceremony. Uses your existing agent subscription. Within minutes you'll see which skills are undertriggering.
|
|
99
39
|
|
|
100
|
-
|
|
40
|
+
**CLI only** (no skill, just the CLI):
|
|
101
41
|
|
|
102
42
|
```bash
|
|
103
|
-
selftune doctor
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
Doctor checks log file health, hook installation, schema validity, and config status.
|
|
107
|
-
|
|
108
|
-
### Platform-Specific Notes
|
|
109
|
-
|
|
110
|
-
**Claude Code** — Hooks capture telemetry automatically after installation. Zero configuration once hooks are in `settings.json`.
|
|
111
|
-
|
|
112
|
-
**Codex** — Use the wrapper for real-time capture or the batch ingestor for historical logs:
|
|
113
|
-
```bash
|
|
114
|
-
selftune wrap-codex -- <your codex args>
|
|
115
|
-
selftune ingest-codex
|
|
43
|
+
npx selftune@latest doctor
|
|
116
44
|
```
|
|
117
45
|
|
|
118
|
-
|
|
119
|
-
```bash
|
|
120
|
-
selftune ingest-opencode
|
|
121
|
-
```
|
|
46
|
+
## Before / After
|
|
122
47
|
|
|
123
|
-
|
|
48
|
+
<p align="center">
|
|
49
|
+
<img src="./assets/BeforeAfter.gif" alt="Before: 47% pass rate → After: 89% pass rate" width="800">
|
|
50
|
+
</p>
|
|
124
51
|
|
|
125
|
-
|
|
52
|
+
selftune learned that real users say "slides", "deck", "presentation for Monday" — none of which matched the original skill description. It rewrote the description to match how people actually talk. Validated against the eval set. Deployed with a backup. Done.
|
|
126
53
|
|
|
127
|
-
##
|
|
54
|
+
## Built for How You Actually Work
|
|
128
55
|
|
|
129
|
-
|
|
130
|
-
selftune <command> [options]
|
|
131
|
-
```
|
|
56
|
+
**I write and use my own skills** — You built skills for your workflow but your descriptions don't match how you actually talk. selftune learns your language from real sessions and evolves descriptions to match — no more manual tuning. `selftune status` · `selftune evolve` · `selftune baseline`
|
|
132
57
|
|
|
133
|
-
|
|
134
|
-
|---|---|
|
|
135
|
-
| `init` | Auto-detect agent environment, write `~/.selftune/config.json` |
|
|
136
|
-
| `grade --skill <name>` | Grade a session (3-tier: trigger, process, quality) |
|
|
137
|
-
| `evals --skill <name>` | Generate eval set from real usage logs |
|
|
138
|
-
| `evals --list-skills` | Show logged skills and query counts |
|
|
139
|
-
| `evolve --skill <name> --skill-path <path>` | Analyze failures, propose and deploy improved description |
|
|
140
|
-
| `rollback --skill <name> --skill-path <path>` | Restore pre-evolution description |
|
|
141
|
-
| `watch --skill <name> --skill-path <path>` | Monitor post-deploy pass rates, detect regressions |
|
|
142
|
-
| `status` | Show skill health summary (pass rates, trends, missed queries) |
|
|
143
|
-
| `last` | Show quick insight from the most recent session |
|
|
144
|
-
| `doctor` | Health checks on logs, hooks, config, and schema |
|
|
145
|
-
| `dashboard` | Open skill-health-centric HTML dashboard in browser |
|
|
146
|
-
| `ingest-codex` | Batch ingest Codex rollout logs |
|
|
147
|
-
| `ingest-opencode` | Backfill historical OpenCode sessions from SQLite |
|
|
148
|
-
| `wrap-codex -- <args>` | Real-time Codex wrapper with telemetry |
|
|
149
|
-
|
|
150
|
-
No separate API key required — grading and evolution use whatever agent CLI you already have installed (Claude Code, Codex, or OpenCode).
|
|
151
|
-
|
|
152
|
-
See `skill/Workflows/` for detailed step-by-step guides for each command.
|
|
58
|
+
**I publish skills others install** — Your skill works for you, but every user talks differently. selftune ships skills that get better for every user automatically — adapting descriptions to how each person actually works. `selftune status` · `selftune evals` · `selftune badge`
|
|
153
59
|
|
|
154
|
-
|
|
60
|
+
**I manage an agent setup with many skills** — You have 15+ skills installed. Some work. Some don't. Some conflict. selftune gives you a health dashboard and automatically improves the skills that aren't keeping up with how your team works. `selftune dashboard` · `selftune composability` · `selftune doctor`
|
|
155
61
|
|
|
156
62
|
## How It Works
|
|
157
63
|
|
|
158
|
-
|
|
64
|
+
<p align="center">
|
|
65
|
+
<img src="./assets/FeedbackLoop.gif" alt="Observe → Detect → Evolve → Watch" width="800">
|
|
66
|
+
</p>
|
|
159
67
|
|
|
160
|
-
|
|
161
|
-
Claude Code (hooks): OpenCode (hooks):
|
|
162
|
-
UserPromptSubmit → prompt-log.ts message.* → opencode-prompt-log.ts
|
|
163
|
-
PostToolUse → skill-eval.ts tool.execute.after → opencode-skill-eval.ts
|
|
164
|
-
Stop → session-stop.ts session.idle → opencode-session-stop.ts
|
|
165
|
-
│ │
|
|
166
|
-
└──────────┬─────────────────────────┘
|
|
167
|
-
▼
|
|
168
|
-
Shared JSONL Log Schema (~/.claude/)
|
|
169
|
-
├── all_queries_log.jsonl
|
|
170
|
-
├── skill_usage_log.jsonl
|
|
171
|
-
└── session_telemetry_log.jsonl
|
|
172
|
-
|
|
173
|
-
Codex (wrapper/ingestor — hooks not yet available):
|
|
174
|
-
codex-wrapper.ts (real-time tee of JSONL stream)
|
|
175
|
-
codex-rollout.ts (batch ingest from rollout logs)
|
|
176
|
-
│
|
|
177
|
-
└──→ Same shared JSONL schema
|
|
178
|
-
```
|
|
68
|
+
A continuous feedback loop that makes your skills learn and adapt. Automatically.
|
|
179
69
|
|
|
180
|
-
|
|
70
|
+
**Observe** — Hooks capture every user query and which skills fired. On Claude Code, hooks install automatically. Use `selftune replay` to backfill existing transcripts. This is how your skills start learning.
|
|
181
71
|
|
|
182
|
-
|
|
183
|
-
selftune evals cross-references the two query logs:
|
|
184
|
-
Positives = skill_usage_log entries for target skill
|
|
185
|
-
Negatives = all_queries_log entries NOT in positives
|
|
186
|
-
|
|
187
|
-
selftune grade reads:
|
|
188
|
-
session_telemetry_log → process metrics (tool calls, errors, turns)
|
|
189
|
-
transcript JSONL → what actually happened
|
|
190
|
-
expectations → what should have happened
|
|
191
|
-
```
|
|
72
|
+
**Detect** — selftune finds the gap between how you talk and how your skills are described. You say "make me a slide deck" and your pptx skill stays silent — selftune catches that mismatch.
|
|
192
73
|
|
|
193
|
-
|
|
74
|
+
**Evolve** — Rewrites skill descriptions — and full skill bodies — to match how you actually work. Batched validation with per-stage model control (`--cheap-loop` uses haiku for the loop, sonnet for the gate). Teacher-student body evolution with 3-gate validation. Baseline comparison gates on measurable lift. Automatic backup.
|
|
194
75
|
|
|
195
|
-
|
|
196
|
-
selftune evolve:
|
|
197
|
-
1. Load eval set (or generate from logs)
|
|
198
|
-
2. Extract failure patterns (missed queries grouped by invocation type)
|
|
199
|
-
3. Generate improved description via LLM
|
|
200
|
-
4. Validate against eval set (must improve, <5% regression)
|
|
201
|
-
5. Deploy updated SKILL.md + PR + audit trail
|
|
202
|
-
|
|
203
|
-
selftune watch:
|
|
204
|
-
Monitor pass rate over sliding window of recent sessions
|
|
205
|
-
Alert (or auto-rollback) on regression > threshold
|
|
206
|
-
```
|
|
76
|
+
**Watch** — After deploying changes, selftune monitors skill trigger rates. If anything regresses, it rolls back automatically. Your skills keep improving without you touching them.
|
|
207
77
|
|
|
208
|
-
|
|
78
|
+
## What's New in v0.2.0
|
|
209
79
|
|
|
210
|
-
|
|
80
|
+
- **Full skill body evolution** — Beyond descriptions: evolve routing tables and entire skill bodies using teacher-student model with structural, trigger, and quality gates
|
|
81
|
+
- **Synthetic eval generation** — `selftune evals --synthetic` generates eval sets from SKILL.md via LLM, no session logs needed. Solves cold-start: new skills get evals immediately.
|
|
82
|
+
- **Cheap-loop evolution** — `selftune evolve --cheap-loop` uses haiku for proposal generation and validation, sonnet only for the final deployment gate. ~80% cost reduction.
|
|
83
|
+
- **Batch trigger validation** — Validation now batches 10 queries per LLM call instead of one-per-query. ~10x faster evolution loops.
|
|
84
|
+
- **Per-stage model control** — `--validation-model`, `--proposal-model`, and `--gate-model` flags give fine-grained control over which model runs each evolution stage.
|
|
85
|
+
- **Auto-activation system** — Hooks detect when selftune should run and suggest actions
|
|
86
|
+
- **Enforcement guardrails** — Blocks SKILL.md edits on monitored skills unless `selftune watch` has been run
|
|
87
|
+
- **Live dashboard server** — `selftune dashboard --serve` with SSE auto-refresh and action buttons
|
|
88
|
+
- **Evolution memory** — Persists context, plans, and decisions across context resets
|
|
89
|
+
- **4 specialized agents** — Diagnosis analyst, pattern analyst, evolution reviewer, integration guide
|
|
90
|
+
- **Sandbox test harness** — Comprehensive automated test coverage, including devcontainer-based LLM testing
|
|
211
91
|
|
|
212
|
-
|
|
213
|
-
cli/selftune/
|
|
214
|
-
├── index.ts CLI entry point (command router)
|
|
215
|
-
├── init.ts Agent detection, config bootstrap
|
|
216
|
-
├── types.ts, constants.ts Shared interfaces and constants
|
|
217
|
-
├── observability.ts Health checks (doctor command)
|
|
218
|
-
├── status.ts Skill health summary (status command)
|
|
219
|
-
├── last.ts Last session insight (last command)
|
|
220
|
-
├── dashboard.ts HTML dashboard builder (dashboard command)
|
|
221
|
-
├── utils/ JSONL, transcript parsing, LLM calls, schema validation
|
|
222
|
-
├── hooks/ Claude Code + OpenCode telemetry capture
|
|
223
|
-
├── ingestors/ Codex adapters + OpenCode backfill
|
|
224
|
-
├── eval/ False negative detection, eval set generation
|
|
225
|
-
├── grading/ 3-tier session grading (agent or API mode)
|
|
226
|
-
├── evolution/ Failure extraction, proposal, validation, deploy, rollback
|
|
227
|
-
└── monitoring/ Post-deploy regression detection
|
|
228
|
-
|
|
229
|
-
dashboard/
|
|
230
|
-
└── index.html Skill-health-centric HTML dashboard template
|
|
231
|
-
|
|
232
|
-
skill/
|
|
233
|
-
├── SKILL.md Routing table (~120 lines)
|
|
234
|
-
├── settings_snippet.json Claude Code hook config template
|
|
235
|
-
├── references/ Domain knowledge (logs, grading methodology, taxonomy)
|
|
236
|
-
└── Workflows/ Step-by-step guides (1 per command)
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
Dependencies flow forward only: `shared → hooks/ingestors → eval → grading → evolution → monitoring`. Enforced by `lint-architecture.ts`.
|
|
240
|
-
|
|
241
|
-
Config persists at `~/.selftune/config.json` (written by `init`, read by all commands via skill workflows).
|
|
242
|
-
|
|
243
|
-
See [ARCHITECTURE.md](ARCHITECTURE.md) for the full domain map and module rules.
|
|
244
|
-
|
|
245
|
-
---
|
|
246
|
-
|
|
247
|
-
## Log Schema
|
|
248
|
-
|
|
249
|
-
Three append-only JSONL files at `~/.claude/`:
|
|
250
|
-
|
|
251
|
-
| File | Record type | Key fields |
|
|
252
|
-
|---|---|---|
|
|
253
|
-
| `all_queries_log.jsonl` | `QueryLogRecord` | `timestamp`, `session_id`, `query`, `source?` |
|
|
254
|
-
| `skill_usage_log.jsonl` | `SkillUsageRecord` | `timestamp`, `session_id`, `skill_name`, `query`, `triggered` |
|
|
255
|
-
| `session_telemetry_log.jsonl` | `SessionTelemetryRecord` | `timestamp`, `session_id`, `tool_calls`, `bash_commands`, `skills_triggered`, `errors_encountered` |
|
|
256
|
-
| `evolution_audit_log.jsonl` | `EvolutionAuditEntry` | `timestamp`, `proposal_id`, `action`, `details`, `eval_snapshot?` |
|
|
257
|
-
|
|
258
|
-
The `source` field identifies the platform: `claude_code`, `codex`, or `opencode`.
|
|
259
|
-
|
|
260
|
-
---
|
|
261
|
-
|
|
262
|
-
## Development
|
|
92
|
+
## Commands
|
|
263
93
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
94
|
+
| Command | What it does |
|
|
95
|
+
|---|---|
|
|
96
|
+
| `selftune status` | See which skills are undertriggering and why |
|
|
97
|
+
| `selftune evals --skill <name>` | Generate eval sets from real session data (`--synthetic` for cold-start) |
|
|
98
|
+
| `selftune evolve --skill <name>` | Propose, validate, and deploy improved descriptions (`--cheap-loop`, `--with-baseline`) |
|
|
99
|
+
| `selftune evolve-body --skill <name>` | Evolve full skill body or routing table (teacher-student, 3-gate validation) |
|
|
100
|
+
| `selftune baseline --skill <name>` | Measure skill value vs no-skill baseline |
|
|
101
|
+
| `selftune unit-test --skill <name>` | Run or generate skill-level unit tests |
|
|
102
|
+
| `selftune composability --skill <name>` | Detect conflicts between co-occurring skills |
|
|
103
|
+
| `selftune import-skillsbench` | Import external eval corpus from [SkillsBench](https://github.com/benchflow-ai/skillsbench) |
|
|
104
|
+
| `selftune badge --skill <name>` | Generate skill health badge SVG |
|
|
105
|
+
| `selftune watch --skill <name>` | Monitor after deploy. Auto-rollback on regression. |
|
|
106
|
+
| `selftune dashboard` | Open the visual skill health dashboard |
|
|
107
|
+
| `selftune replay` | Backfill data from existing Claude Code transcripts |
|
|
108
|
+
| `selftune doctor` | Health check: logs, hooks, config, permissions |
|
|
109
|
+
|
|
110
|
+
Full command reference: `selftune --help`
|
|
111
|
+
|
|
112
|
+
## Why Not Just Rewrite Skills Manually?
|
|
113
|
+
|
|
114
|
+
| Approach | Problem |
|
|
115
|
+
|---|---|
|
|
116
|
+
| Rewrite the description yourself | No data on how users actually talk. No validation. No regression detection. |
|
|
117
|
+
| Add "ALWAYS invoke when..." directives | Brittle. One agent rewrite away from breaking. |
|
|
118
|
+
| Force-load skills on every prompt | Doesn't fix the description. Expensive band-aid. |
|
|
119
|
+
| **selftune** | Learns from real usage, rewrites descriptions to match how you work, validates against eval sets, and rolls back automatically on regressions. |
|
|
269
120
|
|
|
270
|
-
|
|
121
|
+
## Different Layer, Different Problem
|
|
271
122
|
|
|
272
|
-
|
|
123
|
+
LLM observability tools trace API calls. Infrastructure tools monitor servers. Neither knows whether the right skill fired for the right person. selftune does — and fixes it automatically.
|
|
273
124
|
|
|
274
|
-
|
|
125
|
+
selftune is complementary to these tools, not competitive. They trace what happens inside the LLM. selftune makes sure the right skill is called in the first place.
|
|
275
126
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
-
|
|
282
|
-
|
|
283
|
-
|
|
127
|
+
| Dimension | selftune | Langfuse | LangSmith | OpenLIT |
|
|
128
|
+
|-----------|----------|----------|-----------|---------|
|
|
129
|
+
| **Layer** | Skill-specific | LLM call | Agent trace | Infrastructure |
|
|
130
|
+
| **Detects** | Missed triggers, false negatives, skill conflicts | Token usage, latency | Chain failures | System metrics |
|
|
131
|
+
| **Improves** | Descriptions, body, and routing automatically | — | — | — |
|
|
132
|
+
| **Setup** | Zero deps, zero API keys | Self-host or cloud | Cloud required | Helm chart |
|
|
133
|
+
| **Price** | Free (MIT) | Freemium | Paid | Free |
|
|
134
|
+
| **Unique** | Self-improving skills + auto-rollback | Prompt management | Evaluations | Dashboards |
|
|
284
135
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
## Contributing
|
|
136
|
+
## Platforms
|
|
288
137
|
|
|
289
|
-
|
|
138
|
+
**Claude Code** — Hooks install automatically. `selftune replay` backfills existing transcripts.
|
|
290
139
|
|
|
291
|
-
|
|
140
|
+
**Codex** — `selftune wrap-codex -- <args>` or `selftune ingest-codex`
|
|
292
141
|
|
|
293
|
-
|
|
142
|
+
**OpenCode** — `selftune ingest-opencode`
|
|
294
143
|
|
|
295
|
-
|
|
144
|
+
**OpenClaw** — `selftune ingest-openclaw` + `selftune cron setup` for autonomous evolution
|
|
296
145
|
|
|
297
|
-
|
|
146
|
+
Requires [Bun](https://bun.sh) or Node.js 18+. No extra API keys.
|
|
298
147
|
|
|
299
148
|
---
|
|
300
149
|
|
|
301
|
-
|
|
150
|
+
<div align="center">
|
|
302
151
|
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
---
|
|
152
|
+
[Architecture](ARCHITECTURE.md) · [Contributing](CONTRIBUTING.md) · [Security](SECURITY.md) · [Integration Guide](docs/integration-guide.md) · [Sponsor](https://github.com/sponsors/WellDunDun)
|
|
306
153
|
|
|
307
|
-
|
|
154
|
+
MIT licensed. Free forever. Works with Claude Code, Codex, OpenCode, and OpenClaw.
|
|
308
155
|
|
|
309
|
-
|
|
310
|
-
|---|---|---|
|
|
311
|
-
| v0.1 | Hooks, ingestors, shared schema, eval generation | Done |
|
|
312
|
-
| v0.2 | Session grading, grader skill | Done |
|
|
313
|
-
| v0.3 | Evolution loop (propose, validate, deploy, rollback) | Done |
|
|
314
|
-
| v0.4 | Post-deploy monitoring, regression detection | Done |
|
|
315
|
-
| v0.5 | Agent-first skill restructure, `init` command, config bootstrap | Done |
|
|
316
|
-
| v0.6 | Three-layer observability: `status`, `last`, redesigned dashboard | Done |
|
|
156
|
+
</div>
|
|
Binary file
|
|
Binary file
|
package/assets/logo.svg
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="250" height="250" viewBox="0 0 250 250" fill="none">
|
|
2
|
+
<path d="M 190.16,31.49 C 187.91,29.88 184.51,32.19 185.88,35.16 C 186.31,36.11 187.08,36.54 187.71,37.01 C 218.75,59.86 237.63,92.71 237.63,128.82 C 237.63,175.99 205.12,218.56 153.82,234.69 C 149.89,235.93 150.91,241.71 154.91,240.66 C 205.98,226.96 243.01,181.94 243,128.45 C 242.99,90.87 223.47,56.18 190.16,31.49 Z" fill="#E8DED0"/>
|
|
3
|
+
<path d="M 125.19,243.91 C 138.08,243.91 147.18,236.44 151.21,225.01 C 193.72,217.79 226.98,184.02 226.98,140.81 C 226.98,121.17 219.82,103.78 209.93,87.04 C 191.42,55.45 165.15,34.72 117.71,28.65 C 112.91,28.04 113.77,34.35 117.19,34.82 C 161.67,39.33 185.84,56.71 203.76,86.42 C 213.87,103.68 220.68,119.61 220.68,140.81 C 220.68,179.96 190.81,211.95 148.71,219.16 C 147.11,219.47 146.27,220.32 145.92,221.8 C 142.95,231.11 135.72,238.02 125.19,237.66 C 64.48,237.66 11.67,191.61 11.67,127.51 C 11.67,79.61 44.82,36.38 93.89,27.77 L 94.11,27.73 L 94.38,26.64 C 97.04,16.61 104.57,11.82 114.19,11.82 C 134.12,13.36 152.91,18.15 170.48,26.08 C 171.92,26.78 173.81,27.09 174.76,25.59 C 176.05,23.72 175.31,21.07 173.01,20.34 C 154.78,11.96 137.21,7.17 114.47,6 H 113.52 C 101.91,6 93.46,12.16 89.49,21.78 C 42.36,31.26 6.17,74.76 6.17,128.08 C 6.17,190.05 57.92,243.91 125.19,243.91 Z" fill="#E8DED0"/>
|
|
4
|
+
<path d="M 93.67,40.64 C 100.51,52.07 109.54,51.33 114.05,52.17 C 128.72,53.91 141.48,55.78 157.38,62.16 C 162.72,64.47 162.29,58.19 159.18,57.01 C 145.11,51.33 132.48,49.79 111.31,47.48 C 101.83,46.29 95.45,41.18 93.75,32.81 C 55.21,39.46 22.06,72.17 22.06,112.48 C 22.06,131.98 30.36,149.82 43.26,164.49 C 46.23,167.59 50.19,164.13 48.32,161.02 C 36.21,145.54 28.42,129.78 28.42,112.4 C 28.42,79.11 54.91,48.36 89.91,40.36 C 90.76,40.15 91.04,39.87 91.62,40.01 C 92.62,40.01 93.04,39.65 93.67,40.64 Z" fill="#E8DED0"/>
|
|
5
|
+
<path d="M 152.72,82.77 C 126.61,82.77 113.07,99.44 103.01,119.33 C 100.56,123.36 103.74,125.03 105.61,123.92 C 107.15,123.22 107.89,121.05 108.73,119.61 C 118.22,102.16 130.33,88.56 152.72,88.56 C 181.62,88.56 201.91,116.01 201.91,147.31 C 201.91,175.12 183.47,199.96 152.51,205.75 C 151.84,205.96 151.63,206.03 151.56,205.54 C 147.74,195.37 139.36,188.15 128.07,186.48 C 113.2,184.24 101.23,182.36 83.8,176.81 C 79.3,175.48 77.91,182.36 82.41,183.09 C 97.21,187.46 108.09,189.47 126.25,192.65 C 136.78,194.31 145.41,201.71 147.11,210.95 C 147.74,213.05 149.13,213.41 150.15,213.26 C 183.75,208.61 208.26,180.93 208.26,147.24 C 208.26,115.06 186.94,82.77 152.72,82.77 Z" fill="#E8DED0"/>
|
|
6
|
+
<path d="M 129.77,105.21 C 122.93,112.05 118.97,122.73 113.77,130.41 C 111.31,133.45 114.56,136.63 117.46,134.46 C 123.75,126.23 127.43,115.62 135.15,108.71 C 138.22,105.81 134.73,101.09 129.77,105.21 Z" fill="#E8DED0"/>
|
|
7
|
+
<path d="M 136.78,120.31 C 127.71,136.71 120.12,154.91 93.74,154.91 C 66.07,154.91 47.76,128.53 47.76,104.78 C 47.76,84.47 58.57,66.08 77.66,56.25 C 82.23,54.21 79.85,47.76 75.34,49.93 C 54.77,59.72 42.01,80.11 42.01,104.71 C 42.01,131.77 61.86,161.31 93.67,161.31 C 114.77,161.31 128.91,147.24 139.86,124.06 C 142.76,120.45 139.15,117.73 136.78,120.31 Z" fill="#E8DED0"/>
|
|
8
|
+
<path d="M 30.73,154.7 C 27.76,152.97 23.87,155.93 25.41,158.76 C 41.73,188.36 68.94,199.79 105.75,206.41 C 112.25,207.66 122.07,208.75 123.46,209.03 C 128.07,209.95 128.07,220.18 121.78,220.18 C 107.64,218.94 92.06,215.98 76.23,211.33 C 72.13,210.24 71.04,216.69 75.27,217.64 C 90.41,222.22 103.95,224.74 120.47,226.54 C 133.73,226.54 136.56,209.03 126.03,203.38 C 123.75,202.13 122.73,202.56 112.04,200.76 C 78.09,195.04 54.06,188.98 32.12,155.65 C 31.77,155.23 31.28,154.91 30.73,154.7 Z" fill="#E8DED0"/>
|
|
9
|
+
</svg>
|
|
package/assets/skill-health-badge.svg
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="138" height="20" role="img" aria-label="Skill Health: no data">
|
|
2
|
+
<linearGradient id="b" x2="0" y2="100%">
|
|
3
|
+
<stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
|
|
4
|
+
<stop offset="1" stop-opacity=".1"/>
|
|
5
|
+
</linearGradient>
|
|
6
|
+
<clipPath id="a">
|
|
7
|
+
<rect width="138" height="20" rx="3" fill="#fff"/>
|
|
8
|
+
</clipPath>
|
|
9
|
+
<g clip-path="url(#a)">
|
|
10
|
+
<rect width="78" height="20" fill="#555"/>
|
|
11
|
+
<rect x="79" width="59" height="20" fill="#9f9f9f"/>
|
|
12
|
+
<rect width="138" height="20" fill="url(#b)"/>
|
|
13
|
+
</g>
|
|
14
|
+
<g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" font-size="11">
|
|
15
|
+
<text x="39" y="15" fill="#010101" fill-opacity=".3">Skill Health</text>
|
|
16
|
+
<text x="39" y="14">Skill Health</text>
|
|
17
|
+
<text x="108.5" y="15" fill="#010101" fill-opacity=".3">no data</text>
|
|
18
|
+
<text x="108.5" y="14">no data</text>
|
|
19
|
+
</g>
|
|
20
|
+
</svg>
|