npm - selftune - Versions diffs - 0.1.2 → 0.2.0 - Mend

selftune 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

package/.claude/agents/diagnosis-analyst.md +146 -0
package/.claude/agents/evolution-reviewer.md +167 -0
package/.claude/agents/integration-guide.md +200 -0
package/.claude/agents/pattern-analyst.md +147 -0
package/CHANGELOG.md +38 -1
package/README.md +96 -256
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +103 -0
package/cli/selftune/constants.ts +75 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-server.ts +582 -0
package/cli/selftune/dashboard.ts +31 -12
package/cli/selftune/eval/baseline.ts +247 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +68 -2
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evolve-body.ts +492 -0
package/cli/selftune/evolution/evolve.ts +479 -104
package/cli/selftune/evolution/extract-patterns.ts +32 -1
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +20 -3
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/grade-session.ts +145 -19
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/index.ts +88 -0
package/cli/selftune/ingestors/claude-replay.ts +351 -0
package/cli/selftune/ingestors/codex-rollout.ts +1 -1
package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
package/cli/selftune/init.ts +168 -5
package/cli/selftune/last.ts +2 -2
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +25 -2
package/cli/selftune/status.ts +18 -15
package/cli/selftune/types.ts +377 -5
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/llm-call.ts +29 -3
package/cli/selftune/utils/transcript.ts +35 -0
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/dashboard/index.html +585 -19
package/package.json +17 -6
package/skill/SKILL.md +127 -10
package/skill/Workflows/AutoActivation.md +144 -0
package/skill/Workflows/Badge.md +118 -0
package/skill/Workflows/Baseline.md +121 -0
package/skill/Workflows/Composability.md +100 -0
package/skill/Workflows/Contribute.md +91 -0
package/skill/Workflows/Cron.md +155 -0
package/skill/Workflows/Dashboard.md +203 -0
package/skill/Workflows/Doctor.md +37 -1
package/skill/Workflows/Evals.md +73 -5
package/skill/Workflows/EvolutionMemory.md +152 -0
package/skill/Workflows/Evolve.md +111 -6
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/ImportSkillsBench.md +111 -0
package/skill/Workflows/Ingest.md +129 -15
package/skill/Workflows/Initialize.md +58 -3
package/skill/Workflows/Replay.md +70 -0
package/skill/Workflows/Rollback.md +20 -1
package/skill/Workflows/UnitTest.md +138 -0
package/skill/Workflows/Watch.md +22 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0

package/CHANGELOG.md CHANGED Viewed

@@ -5,13 +5,50 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/),
 and this project adheres to [Semantic Versioning](https://semver.org/).
-## [0.6.0] - 2026-03-01
+## [Unreleased]
+## [0.2.0] - 2026-03-08
+### Added
+- **Full skill body evolution** — Teacher-student model for evolving routing tables and complete skill bodies with 3-gate validation (structural, trigger, quality)
+- **Synthetic eval generation** — `selftune evals --synthetic --skill <name> --skill-path <path>` generates eval sets from SKILL.md via LLM without needing real session logs. Solves cold-start for new skills.
+- **Batch trigger validation** — `validateProposalBatched()` batches 10 queries per LLM call (configurable via `TRIGGER_CHECK_BATCH_SIZE`). ~10x faster evolution loops. The sequential `validateProposalSequential()` is kept for backward compatibility.
+- **Cheap-loop evolution mode** — `selftune evolve --cheap-loop` uses haiku for proposal generation and validation, sonnet only for the final deployment gate. New `--gate-model` and `--proposal-model` flags for manual per-stage control.
+- **Validation model selection** — `--validation-model` flag on `evolve` and `evolve-body` commands (default: `haiku`).
+- **Proposal model selection** — `--proposal-model` flag on `evolve`, passed through to `generateProposal()` and `generateMultipleProposals()`.
+- **Gate validation dependency injection** — `gateValidateProposal` added to `EvolveDeps` for testability.
+- **Auto-activation system** — `auto-activate.ts` UserPromptSubmit hook detects when selftune should run and outputs formatted suggestions; session state tracking prevents repeated nags; PAI coexistence support
+- **Skill change guard** — `skill-change-guard.ts` PreToolUse hook detects Write/Edit to SKILL.md files and suggests running `selftune watch`
+- **Evolution memory** — 3-file persistence system at `~/.selftune/memory/` (context.md, plan.md, decisions.md) survives context resets; auto-maintained by evolve, rollback, and watch commands
+- **Specialized agents** — 4 purpose-built Claude Code agents: diagnosis-analyst, pattern-analyst, evolution-reviewer, integration-guide
+- **Enforcement guardrails** — `evolution-guard.ts` PreToolUse hook blocks SKILL.md edits on actively monitored skills unless `selftune watch` has been run recently
+- **Integration guide** — Comprehensive `docs/integration-guide.md` with project-type patterns (single-skill, multi-skill, monorepo, Codex-only, OpenCode-only, mixed)
+- **Settings templates** — `templates/single-skill-settings.json`, `templates/multi-skill-settings.json`, `templates/activation-rules-default.json`
+- **Enhanced init** — `selftune init` now detects workspace structure (skill count, monorepo layout) and suggests appropriate template
+- **Dashboard server** — `selftune dashboard --serve` launches live Bun.serve server with SSE auto-refresh, action buttons (watch/evolve/rollback), and evolution timeline
+- **Activation rules engine** — Configurable trigger rules for auto-activation (grading thresholds, stale evolutions, regression detection)
+- **Sandbox test harness** (`tests/sandbox/run-sandbox.ts`): Exercises all CLI commands and hooks against fixture data in an isolated `/tmp` environment. Runs in ~400ms with 10/10 tests passing.
+- **Devcontainer-based LLM testing** (`.devcontainer/` + `tests/sandbox/docker/`): Based on the official Claude Code devcontainer reference. Uses `claude -p` with `--dangerously-skip-permissions` for unattended LLM-dependent testing (grade, evolve, watch). No API key required — uses existing Claude subscription.
+- **Realistic test fixtures**: 3 skills from skills.sh (find-skills, frontend-design, ai-image-generation) with 15 sessions, 30 queries, 7 skill usage records, and evolution audit history.
+- **Hook integration tests**: All 3 Claude Code hooks (prompt-log, skill-eval, session-stop) tested via stdin payload injection.
+### Changed
+- `validateProposal()` now delegates to `validateProposalBatched()` by default (was sequential).
+- `hooks-to-evals.ts` `cliMain()` is now async to support synthetic generation.
+- `EvolveOptions` extended with `validationModel`, `cheapLoop`, `gateModel`, `proposalModel`.
+- `EvolveResult` extended with `gateValidation`.
+## [0.1.4] - 2026-03-01
 ### Added
 - `selftune status` — CLI skill health summary with pass rates, trends, and system health
 - `selftune last` — Quick insight from the most recent session
 - `selftune dashboard` — Skill-health-centric HTML dashboard with grid view and drill-down
+- `selftune replay` — Claude Code transcript replay for retroactive log backfill
+- `selftune contribute` — Opt-in anonymized data export for community contribution
 - CI/CD workflows: publish, auto-bump, CodeQL, scorecard
 - FOSS governance: LICENSE (MIT), CODE_OF_CONDUCT, CONTRIBUTING, SECURITY
 - npm package configuration with CJS bin entry point

package/README.md CHANGED Viewed

@@ -1,316 +1,156 @@
+<div align="center">
+<img src="assets/logo.svg" alt="selftune logo" width="80" />
+# selftune
+**Self-improving skills for AI agents.**
 [![CI](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml/badge.svg)](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
 [![CodeQL](https://github.com/WellDunDun/selftune/actions/workflows/codeql.yml/badge.svg)](https://github.com/WellDunDun/selftune/actions/workflows/codeql.yml)
 [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/WellDunDun/selftune/badge)](https://securityscorecards.dev/viewer/?uri=github.com/WellDunDun/selftune)
 [![npm version](https://img.shields.io/npm/v/selftune)](https://www.npmjs.com/package/selftune)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
-[![TypeScript](https://img.shields.io/badge/TypeScript-5.0-blue.svg)](https://www.typescriptlang.org/)
-[![Zero Dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)](https://www.npmjs.com/package/selftune?activeTab=dependencies)
-[![Bun](https://img.shields.io/badge/runtime-bun%20%7C%20node-black)](https://bun.sh)
-# selftune — Skill Observability & Continuous Improvement CLI
-[![npm version](https://img.shields.io/npm/v/selftune)](https://www.npmjs.com/package/selftune)
-[![CI](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml/badge.svg)](https://github.com/WellDunDun/selftune/actions/workflows/ci.yml)
-[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+[![TypeScript](https://img.shields.io/badge/TypeScript-blue.svg)](https://www.typescriptlang.org/)
 [![Zero Dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)](https://www.npmjs.com/package/selftune?activeTab=dependencies)
 [![Bun](https://img.shields.io/badge/runtime-bun%20%7C%20node-black)](https://bun.sh)
-Observe real sessions, detect missed triggers, grade execution quality, and automatically evolve skill descriptions toward the language real users actually use.
-Works with **Claude Code**, **Codex**, and **OpenCode**.
-```
-Observe → Detect → Diagnose → Propose → Validate → Deploy → Watch → Repeat
-```
----
-## Install
-```bash
-npx selftune@latest doctor
-```
-Or install globally:
-```bash
-npm install -g selftune
-selftune doctor
-```
-Requires [Bun](https://bun.sh) or Node.js 18+ with [tsx](https://github.com/privatenumber/tsx).
----
-## Why
+Your agent skills learn how you work. Detect what's broken. Fix it automatically.
-Agent skills are static, but users are not. When a skill undertriggers — when someone says "make me a slide deck" and the pptx skill doesn't fire — that failure is invisible. The user concludes "AI doesn't follow directions" rather than recognizing the skill description doesn't match how real people talk.
+**[Install](#install)** · **[Use Cases](#built-for-how-you-actually-work)** · **[How It Works](#how-it-works)** · **[Commands](#commands)** · **[Platforms](#platforms)** · **[Docs](docs/integration-guide.md)**
-selftune closes this feedback loop.
+</div>
 ---
-## What It Does
+Your skills don't understand how you talk. You say "make me a slide deck" and nothing happens — no error, no log, no signal. selftune watches your real sessions, learns how you actually speak, and rewrites skill descriptions to match. Automatically.
-| Capability | Description |
-|---|---|
-| **Session telemetry** | Captures per-session process metrics across all three platforms |
-| **False negative detection** | Surfaces queries where a skill should have fired but didn't |
-| **Eval set generation** | Converts hook logs into trigger eval sets with real usage as ground truth |
-| **Session grading** | 3-tier evaluation (Trigger / Process / Quality) using the agent you already have |
-| **Skill evolution** | Proposes improved descriptions, validates them, deploys with audit trail |
-| **Post-deploy monitoring** | Watches evolved skills for regressions, auto-rollback on pass rate drops |
----
+Works with **Claude Code**, **Codex**, **OpenCode**, and **OpenClaw**. Zero runtime dependencies.
-## Setup
-### 1. Add the skill
+## Install
 ```bash
 npx skills add WellDunDun/selftune
 ```
-### 2. Initialize
-Tell your agent: **"initialize selftune"**
-The agent will install the CLI (`npm install -g selftune`) if needed, run `selftune init` to bootstrap config, install hooks, and verify with `selftune doctor`.
----
-## Development
-For contributors running from source.
-### 1. Initialize
-```bash
-npx selftune@latest init
-```
-The `init` command auto-detects your agent environment (Claude Code, Codex, or OpenCode), resolves the CLI path, determines the LLM mode, and writes config to `~/.selftune/config.json`. All subsequent commands read from this config.
-Use `--agent claude_code|codex|opencode` to override detection, `--llm-mode agent|api` to override LLM mode, or `--force` to reinitialize.
-### 4. Install hooks (Claude Code)
+Then tell your agent: **"initialize selftune"**
-If `init` reports hooks are not installed, merge the entries from `skill/settings_snippet.json` into `~/.claude/settings.json`. Derive hook script paths from the `cli_path` field in `~/.selftune/config.json` — the hooks directory is at `dirname(cli_path)/hooks/`.
+Two minutes. No API keys. No external services. No configuration ceremony. Uses your existing agent subscription. Within minutes you'll see which skills are undertriggering.
-### 5. Verify setup
+**CLI only** (no skill, just the CLI):
 ```bash
-selftune doctor
-```
-Doctor checks log file health, hook installation, schema validity, and config status.
-### Platform-Specific Notes
-**Claude Code** — Hooks capture telemetry automatically after installation. Zero configuration once hooks are in `settings.json`.
-**Codex** — Use the wrapper for real-time capture or the batch ingestor for historical logs:
-```bash
-selftune wrap-codex -- <your codex args>
-selftune ingest-codex
+npx selftune@latest doctor
 ```
-**OpenCode** — Backfill historical sessions from SQLite:
-```bash
-selftune ingest-opencode
-```
+## Before / After
-All platforms write to the same shared JSONL log schema at `~/.claude/`.
+<p align="center">
+  <img src="./assets/BeforeAfter.gif" alt="Before: 47% pass rate → After: 89% pass rate" width="800">
+</p>
----
+selftune learned that real users say "slides", "deck", "presentation for Monday" — none of which matched the original skill description. It rewrote the description to match how people actually talk. Validated against the eval set. Deployed with a backup. Done.
-## Commands
+## Built for How You Actually Work
-```
-selftune <command> [options]
-```
+**I write and use my own skills** — You built skills for your workflow but your descriptions don't match how you actually talk. selftune learns your language from real sessions and evolves descriptions to match — no more manual tuning. `selftune status` · `selftune evolve` · `selftune baseline`
-| Command | Purpose |
-|---|---|
-| `init` | Auto-detect agent environment, write `~/.selftune/config.json` |
-| `grade --skill <name>` | Grade a session (3-tier: trigger, process, quality) |
-| `evals --skill <name>` | Generate eval set from real usage logs |
-| `evals --list-skills` | Show logged skills and query counts |
-| `evolve --skill <name> --skill-path <path>` | Analyze failures, propose and deploy improved description |
-| `rollback --skill <name> --skill-path <path>` | Restore pre-evolution description |
-| `watch --skill <name> --skill-path <path>` | Monitor post-deploy pass rates, detect regressions |
-| `status` | Show skill health summary (pass rates, trends, missed queries) |
-| `last` | Show quick insight from the most recent session |
-| `doctor` | Health checks on logs, hooks, config, and schema |
-| `dashboard` | Open skill-health-centric HTML dashboard in browser |
-| `ingest-codex` | Batch ingest Codex rollout logs |
-| `ingest-opencode` | Backfill historical OpenCode sessions from SQLite |
-| `wrap-codex -- <args>` | Real-time Codex wrapper with telemetry |
-No separate API key required — grading and evolution use whatever agent CLI you already have installed (Claude Code, Codex, or OpenCode).
-See `skill/Workflows/` for detailed step-by-step guides for each command.
+**I publish skills others install** — Your skill works for you, but every user talks differently. selftune ships skills that get better for every user automatically — adapting descriptions to how each person actually works. `selftune status` · `selftune evals` · `selftune badge`
----
+**I manage an agent setup with many skills** — You have 15+ skills installed. Some work. Some don't. Some conflict. selftune gives you a health dashboard and automatically improves the skills that aren't keeping up with how your team works. `selftune dashboard` · `selftune composability` · `selftune doctor`
 ## How It Works
-### Telemetry Capture
+<p align="center">
+  <img src="./assets/FeedbackLoop.gif" alt="Observe → Detect → Evolve → Watch" width="800">
+</p>
-```
-Claude Code (hooks):                 OpenCode (hooks):
-  UserPromptSubmit → prompt-log.ts     message.*        → opencode-prompt-log.ts
-  PostToolUse      → skill-eval.ts     tool.execute.after → opencode-skill-eval.ts
-  Stop             → session-stop.ts   session.idle     → opencode-session-stop.ts
-          │                                    │
-          └──────────┬─────────────────────────┘
-                     ▼
-          Shared JSONL Log Schema (~/.claude/)
-            ├── all_queries_log.jsonl
-            ├── skill_usage_log.jsonl
-            └── session_telemetry_log.jsonl
-Codex (wrapper/ingestor — hooks not yet available):
-  codex-wrapper.ts  (real-time tee of JSONL stream)
-  codex-rollout.ts  (batch ingest from rollout logs)
-          │
-          └──→ Same shared JSONL schema
-```
+A continuous feedback loop that makes your skills learn and adapt. Automatically.
-### Eval & Grading
+**Observe** — Hooks capture every user query and which skills fired. On Claude Code, hooks install automatically. Use `selftune replay` to backfill existing transcripts. This is how your skills start learning.
-```
-selftune evals cross-references the two query logs:
-  Positives  = skill_usage_log entries for target skill
-  Negatives  = all_queries_log entries NOT in positives
-selftune grade reads:
-  session_telemetry_log → process metrics (tool calls, errors, turns)
-  transcript JSONL       → what actually happened
-  expectations           → what should have happened
-```
+**Detect** — selftune finds the gap between how you talk and how your skills are described. You say "make me a slide deck" and your pptx skill stays silent — selftune catches that mismatch.
-### Evolution Loop
+**Evolve** — Rewrites skill descriptions — and full skill bodies — to match how you actually work. Batched validation with per-stage model control (`--cheap-loop` uses haiku for the loop, sonnet for the gate). Teacher-student body evolution with 3-gate validation. Baseline comparison gates on measurable lift. Automatic backup.
-```
-selftune evolve:
-  1. Load eval set (or generate from logs)
-  2. Extract failure patterns (missed queries grouped by invocation type)
-  3. Generate improved description via LLM
-  4. Validate against eval set (must improve, <5% regression)
-  5. Deploy updated SKILL.md + PR + audit trail
-selftune watch:
-  Monitor pass rate over sliding window of recent sessions
-  Alert (or auto-rollback) on regression > threshold
-```
+**Watch** — After deploying changes, selftune monitors skill trigger rates. If anything regresses, it rolls back automatically. Your skills keep improving without you touching them.
----
+## What's New in v0.2.0
-## Architecture
+- **Full skill body evolution** — Beyond descriptions: evolve routing tables and entire skill bodies using teacher-student model with structural, trigger, and quality gates
+- **Synthetic eval generation** — `selftune evals --synthetic` generates eval sets from SKILL.md via LLM, no session logs needed. Solves cold-start: new skills get evals immediately.
+- **Cheap-loop evolution** — `selftune evolve --cheap-loop` uses haiku for proposal generation and validation, sonnet only for the final deployment gate. ~80% cost reduction.
+- **Batch trigger validation** — Validation now batches 10 queries per LLM call instead of one-per-query. ~10x faster evolution loops.
+- **Per-stage model control** — `--validation-model`, `--proposal-model`, and `--gate-model` flags give fine-grained control over which model runs each evolution stage.
+- **Auto-activation system** — Hooks detect when selftune should run and suggest actions
+- **Enforcement guardrails** — Blocks SKILL.md edits on monitored skills unless `selftune watch` has been run
+- **Live dashboard server** — `selftune dashboard --serve` with SSE auto-refresh and action buttons
+- **Evolution memory** — Persists context, plans, and decisions across context resets
+- **4 specialized agents** — Diagnosis analyst, pattern analyst, evolution reviewer, integration guide
+- **Sandbox test harness** — Comprehensive automated test coverage, including devcontainer-based LLM testing
-```
-cli/selftune/
-├── index.ts                     CLI entry point (command router)
-├── init.ts                      Agent detection, config bootstrap
-├── types.ts, constants.ts       Shared interfaces and constants
-├── observability.ts             Health checks (doctor command)
-├── status.ts                    Skill health summary (status command)
-├── last.ts                      Last session insight (last command)
-├── dashboard.ts                 HTML dashboard builder (dashboard command)
-├── utils/                       JSONL, transcript parsing, LLM calls, schema validation
-├── hooks/                       Claude Code + OpenCode telemetry capture
-├── ingestors/                   Codex adapters + OpenCode backfill
-├── eval/                        False negative detection, eval set generation
-├── grading/                     3-tier session grading (agent or API mode)
-├── evolution/                   Failure extraction, proposal, validation, deploy, rollback
-└── monitoring/                  Post-deploy regression detection
-dashboard/
-└── index.html                   Skill-health-centric HTML dashboard template
-skill/
-├── SKILL.md                     Routing table (~120 lines)
-├── settings_snippet.json        Claude Code hook config template
-├── references/                  Domain knowledge (logs, grading methodology, taxonomy)
-└── Workflows/                   Step-by-step guides (1 per command)
-```
-Dependencies flow forward only: `shared → hooks/ingestors → eval → grading → evolution → monitoring`. Enforced by `lint-architecture.ts`.
-Config persists at `~/.selftune/config.json` (written by `init`, read by all commands via skill workflows).
-See [ARCHITECTURE.md](ARCHITECTURE.md) for the full domain map and module rules.
----
-## Log Schema
-Three append-only JSONL files at `~/.claude/`:
-| File | Record type | Key fields |
-|---|---|---|
-| `all_queries_log.jsonl` | `QueryLogRecord` | `timestamp`, `session_id`, `query`, `source?` |
-| `skill_usage_log.jsonl` | `SkillUsageRecord` | `timestamp`, `session_id`, `skill_name`, `query`, `triggered` |
-| `session_telemetry_log.jsonl` | `SessionTelemetryRecord` | `timestamp`, `session_id`, `tool_calls`, `bash_commands`, `skills_triggered`, `errors_encountered` |
-| `evolution_audit_log.jsonl` | `EvolutionAuditEntry` | `timestamp`, `proposal_id`, `action`, `details`, `eval_snapshot?` |
-The `source` field identifies the platform: `claude_code`, `codex`, or `opencode`.
----
-## Development
+## Commands
-```bash
-make check    # lint + architecture lint + all tests
-make lint     # biome check + architecture lint
-make test     # bun test
-```
+| Command | What it does |
+|---|---|
+| `selftune status` | See which skills are undertriggering and why |
+| `selftune evals --skill <name>` | Generate eval sets from real session data (`--synthetic` for cold-start) |
+| `selftune evolve --skill <name>` | Propose, validate, and deploy improved descriptions (`--cheap-loop`, `--with-baseline`) |
+| `selftune evolve-body --skill <name>` | Evolve full skill body or routing table (teacher-student, 3-gate validation) |
+| `selftune baseline --skill <name>` | Measure skill value vs no-skill baseline |
+| `selftune unit-test --skill <name>` | Run or generate skill-level unit tests |
+| `selftune composability --skill <name>` | Detect conflicts between co-occurring skills |
+| `selftune import-skillsbench` | Import external eval corpus from [SkillsBench](https://github.com/benchflow-ai/skillsbench) |
+| `selftune badge --skill <name>` | Generate skill health badge SVG |
+| `selftune watch --skill <name>` | Monitor after deploy. Auto-rollback on regression. |
+| `selftune dashboard` | Open the visual skill health dashboard |
+| `selftune replay` | Backfill data from existing Claude Code transcripts |
+| `selftune doctor` | Health check: logs, hooks, config, permissions |
+Full command reference: `selftune --help`
+## Why Not Just Rewrite Skills Manually?
+| Approach | Problem |
+|---|---|
+| Rewrite the description yourself | No data on how users actually talk. No validation. No regression detection. |
+| Add "ALWAYS invoke when..." directives | Brittle. One agent rewrite away from breaking. |
+| Force-load skills on every prompt | Doesn't fix the description. Expensive band-aid. |
+| **selftune** | Learns from real usage, rewrites descriptions to match how you work, validates against eval sets, automatically rolls back on regressions. |
-Zero runtime dependencies. Uses Bun built-ins only.
+## Different Layer, Different Problem
----
+LLM observability tools trace API calls. Infrastructure tools monitor servers. Neither knows whether the right skill fired for the right person. selftune does — and fixes it automatically.
-## Tips
+selftune is complementary to these tools, not competitive. They trace what happens inside the LLM. selftune makes sure the right skill is called in the first place.
-- Run `selftune init` first — everything else reads from the config it writes.
-- Let logs accumulate over several days before running evals — more diverse real queries = more reliable signal.
-- All hooks are silent (exit 0) and take <50ms. Negligible overhead.
-- Logs are append-only JSONL. Safe to delete to start fresh, or archive old files.
-- Use `--max 75` to increase eval set size once you have enough data.
-- Use `--seed 123` for a different random sample of negatives.
-- Use `--dry-run` with `evolve` to preview proposals without deploying.
-- The `doctor` command checks log health, hook presence, config status, and schema validity.
+| Dimension | selftune | Langfuse | LangSmith | OpenLIT |
+|-----------|----------|----------|-----------|---------|
+| **Layer** | Skill-specific | LLM call | Agent trace | Infrastructure |
+| **Detects** | Missed triggers, false negatives, skill conflicts | Token usage, latency | Chain failures | System metrics |
+| **Improves** | Descriptions, body, and routing automatically | — | — | — |
+| **Setup** | Zero deps, zero API keys | Self-host or cloud | Cloud required | Helm chart |
+| **Price** | Free (MIT) | Freemium | Paid | Free |
+| **Unique** | Self-improving skills + auto-rollback | Prompt management | Evaluations | Dashboards |
----
-## Contributing
+## Platforms
-See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, architecture rules, and PR guidelines.
+**Claude Code** — Hooks install automatically. `selftune replay` backfills existing transcripts.
-Please follow our [Code of Conduct](CODE_OF_CONDUCT.md).
+**Codex** — `selftune wrap-codex -- <args>` or `selftune ingest-codex`
----
+**OpenCode** — `selftune ingest-opencode`
-## Security
+**OpenClaw** — `selftune ingest-openclaw` + `selftune cron setup` for autonomous evolution
-To report a vulnerability, see [SECURITY.md](SECURITY.md).
+Requires [Bun](https://bun.sh) or Node.js 18+. No extra API keys.
 ---
-## Sponsor
+<div align="center">
-If selftune saves you time, consider [sponsoring the project](https://github.com/sponsors/WellDunDun).
----
+[Architecture](ARCHITECTURE.md) · [Contributing](CONTRIBUTING.md) · [Security](SECURITY.md) · [Integration Guide](docs/integration-guide.md) · [Sponsor](https://github.com/sponsors/WellDunDun)
-## Milestones
+MIT licensed. Free forever. Works with Claude Code, Codex, OpenCode, and OpenClaw.
-| Version | Scope | Status |
-|---|---|---|
-| v0.1 | Hooks, ingestors, shared schema, eval generation | Done |
-| v0.2 | Session grading, grader skill | Done |
-| v0.3 | Evolution loop (propose, validate, deploy, rollback) | Done |
-| v0.4 | Post-deploy monitoring, regression detection | Done |
-| v0.5 | Agent-first skill restructure, `init` command, config bootstrap | Done |
-| v0.6 | Three-layer observability: `status`, `last`, redesigned dashboard | Done |
+</div>

package/assets/BeforeAfter.gif ADDED Viewed

Binary file

package/assets/FeedbackLoop.gif ADDED Viewed

Binary file

package/assets/logo.svg ADDED Viewed

@@ -0,0 +1,9 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="250" height="250" viewBox="0 0 250 250" fill="none">
+<path d="M 190.16,31.49 C 187.91,29.88 184.51,32.19 185.88,35.16 C 186.31,36.11 187.08,36.54 187.71,37.01 C 218.75,59.86 237.63,92.71 237.63,128.82 C 237.63,175.99 205.12,218.56 153.82,234.69 C 149.89,235.93 150.91,241.71 154.91,240.66 C 205.98,226.96 243.01,181.94 243,128.45 C 242.99,90.87 223.47,56.18 190.16,31.49 Z" fill="#E8DED0"/>
+<path d="M 125.19,243.91 C 138.08,243.91 147.18,236.44 151.21,225.01 C 193.72,217.79 226.98,184.02 226.98,140.81 C 226.98,121.17 219.82,103.78 209.93,87.04 C 191.42,55.45 165.15,34.72 117.71,28.65 C 112.91,28.04 113.77,34.35 117.19,34.82 C 161.67,39.33 185.84,56.71 203.76,86.42 C 213.87,103.68 220.68,119.61 220.68,140.81 C 220.68,179.96 190.81,211.95 148.71,219.16 C 147.11,219.47 146.27,220.32 145.92,221.8 C 142.95,231.11 135.72,238.02 125.19,237.66 C 64.48,237.66 11.67,191.61 11.67,127.51 C 11.67,79.61 44.82,36.38 93.89,27.77 L 94.11,27.73 L 94.38,26.64 C 97.04,16.61 104.57,11.82 114.19,11.82 C 134.12,13.36 152.91,18.15 170.48,26.08 C 171.92,26.78 173.81,27.09 174.76,25.59 C 176.05,23.72 175.31,21.07 173.01,20.34 C 154.78,11.96 137.21,7.17 114.47,6 H 113.52 C 101.91,6 93.46,12.16 89.49,21.78 C 42.36,31.26 6.17,74.76 6.17,128.08 C 6.17,190.05 57.92,243.91 125.19,243.91 Z" fill="#E8DED0"/>
+<path d="M 93.67,40.64 C 100.51,52.07 109.54,51.33 114.05,52.17 C 128.72,53.91 141.48,55.78 157.38,62.16 C 162.72,64.47 162.29,58.19 159.18,57.01 C 145.11,51.33 132.48,49.79 111.31,47.48 C 101.83,46.29 95.45,41.18 93.75,32.81 C 55.21,39.46 22.06,72.17 22.06,112.48 C 22.06,131.98 30.36,149.82 43.26,164.49 C 46.23,167.59 50.19,164.13 48.32,161.02 C 36.21,145.54 28.42,129.78 28.42,112.4 C 28.42,79.11 54.91,48.36 89.91,40.36 C 90.76,40.15 91.04,39.87 91.62,40.01 C 92.62,40.01 93.04,39.65 93.67,40.64 Z" fill="#E8DED0"/>
+<path d="M 152.72,82.77 C 126.61,82.77 113.07,99.44 103.01,119.33 C 100.56,123.36 103.74,125.03 105.61,123.92 C 107.15,123.22 107.89,121.05 108.73,119.61 C 118.22,102.16 130.33,88.56 152.72,88.56 C 181.62,88.56 201.91,116.01 201.91,147.31 C 201.91,175.12 183.47,199.96 152.51,205.75 C 151.84,205.96 151.63,206.03 151.56,205.54 C 147.74,195.37 139.36,188.15 128.07,186.48 C 113.2,184.24 101.23,182.36 83.8,176.81 C 79.3,175.48 77.91,182.36 82.41,183.09 C 97.21,187.46 108.09,189.47 126.25,192.65 C 136.78,194.31 145.41,201.71 147.11,210.95 C 147.74,213.05 149.13,213.41 150.15,213.26 C 183.75,208.61 208.26,180.93 208.26,147.24 C 208.26,115.06 186.94,82.77 152.72,82.77 Z" fill="#E8DED0"/>
+<path d="M 129.77,105.21 C 122.93,112.05 118.97,122.73 113.77,130.41 C 111.31,133.45 114.56,136.63 117.46,134.46 C 123.75,126.23 127.43,115.62 135.15,108.71 C 138.22,105.81 134.73,101.09 129.77,105.21 Z" fill="#E8DED0"/>
+<path d="M 136.78,120.31 C 127.71,136.71 120.12,154.91 93.74,154.91 C 66.07,154.91 47.76,128.53 47.76,104.78 C 47.76,84.47 58.57,66.08 77.66,56.25 C 82.23,54.21 79.85,47.76 75.34,49.93 C 54.77,59.72 42.01,80.11 42.01,104.71 C 42.01,131.77 61.86,161.31 93.67,161.31 C 114.77,161.31 128.91,147.24 139.86,124.06 C 142.76,120.45 139.15,117.73 136.78,120.31 Z" fill="#E8DED0"/>
+<path d="M 30.73,154.7 C 27.76,152.97 23.87,155.93 25.41,158.76 C 41.73,188.36 68.94,199.79 105.75,206.41 C 112.25,207.66 122.07,208.75 123.46,209.03 C 128.07,209.95 128.07,220.18 121.78,220.18 C 107.64,218.94 92.06,215.98 76.23,211.33 C 72.13,210.24 71.04,216.69 75.27,217.64 C 90.41,222.22 103.95,224.74 120.47,226.54 C 133.73,226.54 136.56,209.03 126.03,203.38 C 123.75,202.13 122.73,202.56 112.04,200.76 C 78.09,195.04 54.06,188.98 32.12,155.65 C 31.77,155.23 31.28,154.91 30.73,154.7 Z" fill="#E8DED0"/>
+</svg>

package/assets/skill-health-badge.svg ADDED Viewed

@@ -0,0 +1,20 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="138" height="20" role="img" aria-label="Skill Health: no data">
+  <linearGradient id="b" x2="0" y2="100%">
+    <stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
+    <stop offset="1" stop-opacity=".1"/>
+  </linearGradient>
+  <clipPath id="a">
+    <rect width="138" height="20" rx="3" fill="#fff"/>
+  </clipPath>
+  <g clip-path="url(#a)">
+    <rect width="78" height="20" fill="#555"/>
+    <rect x="79" width="59" height="20" fill="#9f9f9f"/>
+    <rect width="138" height="20" fill="url(#b)"/>
+  </g>
+  <g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" font-size="11">
+    <text x="39" y="15" fill="#010101" fill-opacity=".3">Skill Health</text>
+    <text x="39" y="14">Skill Health</text>
+    <text x="108.5" y="15" fill="#010101" fill-opacity=".3">no data</text>
+    <text x="108.5" y="14">no data</text>
+  </g>
+</svg>