npm - selftune - Versions diffs - 0.1.4 → 0.2.0 - Mend

selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86)

package/.claude/agents/diagnosis-analyst.md +146 -0
package/.claude/agents/evolution-reviewer.md +167 -0
package/.claude/agents/integration-guide.md +200 -0
package/.claude/agents/pattern-analyst.md +147 -0
package/CHANGELOG.md +37 -0
package/README.md +96 -256
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +103 -0
package/cli/selftune/constants.ts +75 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-server.ts +582 -0
package/cli/selftune/dashboard.ts +25 -3
package/cli/selftune/eval/baseline.ts +247 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +68 -2
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evolve-body.ts +492 -0
package/cli/selftune/evolution/evolve.ts +466 -103
package/cli/selftune/evolution/extract-patterns.ts +32 -1
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +19 -2
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/grade-session.ts +138 -18
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/index.ts +88 -0
package/cli/selftune/ingestors/claude-replay.ts +351 -0
package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
package/cli/selftune/init.ts +150 -3
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +25 -2
package/cli/selftune/status.ts +17 -13
package/cli/selftune/types.ts +377 -5
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/llm-call.ts +29 -3
package/cli/selftune/utils/transcript.ts +35 -0
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/dashboard/index.html +569 -8
package/package.json +8 -4
package/skill/SKILL.md +124 -8
package/skill/Workflows/AutoActivation.md +144 -0
package/skill/Workflows/Badge.md +118 -0
package/skill/Workflows/Baseline.md +121 -0
package/skill/Workflows/Composability.md +100 -0
package/skill/Workflows/Contribute.md +91 -0
package/skill/Workflows/Cron.md +155 -0
package/skill/Workflows/Dashboard.md +203 -0
package/skill/Workflows/Doctor.md +37 -1
package/skill/Workflows/Evals.md +69 -1
package/skill/Workflows/EvolutionMemory.md +152 -0
package/skill/Workflows/Evolve.md +111 -6
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/ImportSkillsBench.md +111 -0
package/skill/Workflows/Ingest.md +117 -3
package/skill/Workflows/Initialize.md +57 -3
package/skill/Workflows/Replay.md +70 -0
package/skill/Workflows/Rollback.md +20 -1
package/skill/Workflows/UnitTest.md +138 -0
package/skill/Workflows/Watch.md +22 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "selftune",
-  "version": "0.1.4",
-  "description": "Skill observability and continuous improvement CLI for agent platforms",
+  "version": "0.2.0",
+  "description": "Self-improving skills CLI for AI agents",
   "type": "module",
   "license": "MIT",
   "author": "Daniel Petro",
@@ -20,7 +20,7 @@
   "keywords": [
     "selftune",
     "skill",
-    "observability",
+    "self-improving",
     "claude-code",
     "codex",
     "opencode",
@@ -37,9 +37,12 @@
     "selftune": "bin/selftune.cjs"
   },
   "files": [
+    "assets/",
     "bin/",
     "cli/selftune/",
     "dashboard/",
+    "templates/",
+    ".claude/agents/",
     "skill/",
     "README.md",
     "CHANGELOG.md"
@@ -49,7 +52,8 @@
     "lint:fix": "bunx biome check --write .",
     "lint:arch": "bun run lint-architecture.ts",
     "test": "bun test",
-    "check": "bun run lint && bun run lint:arch && bun test"
+    "check": "bun run lint && bun run lint:arch && bun test",
+    "start": "bun run cli/selftune/index.ts --help"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.4",

package/skill/SKILL.md CHANGED Viewed

@@ -1,11 +1,17 @@
 ---
 name: selftune
 description: >
-  Skill observability and continuous improvement. Use when the user wants to:
+  Self-improving skills toolkit. Use when the user wants to:
   grade a session, generate evals, check undertriggering, evolve a skill
-  description, rollback an evolution, monitor post-deploy performance, check
-  skill health status, view last session insight, open the dashboard, run
-  health checks, or ingest sessions from Codex/OpenCode.
+  description or full body, evolve routing tables, rollback an evolution,
+  monitor post-deploy performance, check skill health status, view last
+  session insight, open the dashboard, serve the live dashboard, run health
+  checks, manage activation rules, ingest sessions from Codex/OpenCode/OpenClaw,
+  replay Claude Code transcripts, contribute anonymized data to the community,
+  set up autonomous cron jobs, manage evolution memory, configure auto-activation
+  suggestions, diagnose underperforming skills, analyze cross-skill patterns,
+  review evolution proposals, measure baseline lift, run skill unit tests,
+  analyze skill composability, or import SkillsBench evaluation corpora.
 ---
 # selftune
@@ -40,10 +46,22 @@ selftune watch    --skill <name> --skill-path <path> [--auto-rollback]
 selftune status
 selftune last
 selftune doctor
-selftune dashboard [--export] [--out FILE]
+selftune dashboard [--export] [--out FILE] [--serve]
 selftune ingest-codex
 selftune ingest-opencode
+selftune ingest-openclaw [--agents-dir PATH] [--since DATE] [--dry-run] [--force] [--verbose]
 selftune wrap-codex -- <codex args>
+selftune replay     [--since DATE] [--dry-run] [--force] [--verbose]
+selftune contribute [--skill NAME] [--preview] [--sanitize LEVEL] [--submit]
+selftune cron setup [--dry-run] [--tz <timezone>]
+selftune cron list
+selftune cron remove [--dry-run]
+selftune dashboard --serve [--port <port>]
+selftune evolve-body --skill <name> --skill-path <path> --target <routing_table|full_body> [--dry-run]
+selftune baseline   --skill <name> --skill-path <path> [--eval-set <path>] [--agent <name>]
+selftune unit-test  --skill <name> --tests <path> [--run-agent] [--generate]
+selftune composability --skill <name> [--window N] [--telemetry-log <path>]
+selftune import-skillsbench --dir <path> --skill <name> --output <path> [--match-strategy exact|fuzzy]
 ```
 ## Workflow Routing
@@ -56,11 +74,59 @@ selftune wrap-codex -- <codex args>
 | rollback, undo, restore, revert evolution | Rollback | Workflows/Rollback.md |
 | watch, monitor, regression, post-deploy, performing | Watch | Workflows/Watch.md |
 | doctor, health, hooks, broken, diagnose | Doctor | Workflows/Doctor.md |
-| ingest, import, codex logs, opencode, wrap codex | Ingest | Workflows/Ingest.md |
+| ingest, import, codex logs, opencode, openclaw, wrap codex | Ingest | Workflows/Ingest.md |
+| replay, backfill, claude transcripts, historical sessions | Replay | Workflows/Replay.md |
+| contribute, share, community, export data, anonymized | Contribute | Workflows/Contribute.md |
 | init, setup, bootstrap, first time | Initialize | Workflows/Initialize.md |
+| cron, schedule, autonomous, automate evolution | Cron | Workflows/Cron.md |
+| auto-activate, suggestions, activation rules, nag, why suggest | AutoActivation | Workflows/AutoActivation.md |
+| dashboard, visual, open dashboard, skill grid, serve dashboard, live dashboard | Dashboard | Workflows/Dashboard.md |
+| evolution memory, context memory, session continuity, what happened last | EvolutionMemory | Workflows/EvolutionMemory.md |
+| evolve body, evolve routing, full body evolution, rewrite skill, teacher student | EvolveBody | Workflows/EvolveBody.md |
+| baseline, baseline lift, adds value, skill value, no-skill comparison | Baseline | Workflows/Baseline.md |
+| unit test, skill test, test skill, generate tests, run tests, assertions | UnitTest | Workflows/UnitTest.md |
+| composability, co-occurrence, skill conflicts, skills together, conflict score | Composability | Workflows/Composability.md |
+| import skillsbench, skillsbench, external evals, benchmark tasks, import corpus | ImportSkillsBench | Workflows/ImportSkillsBench.md |
 | status, health summary, skill health, pass rates, how are skills | Status | *(direct command — no workflow file)* |
 | last, last session, recent session, what happened | Last | *(direct command — no workflow file)* |
-| dashboard, visual, open dashboard, skill grid | Dashboard | *(direct command — no workflow file)* |
+## Interactive Configuration
+Before running mutating workflows (evolve, evolve-body, evals, baseline), present
+a pre-flight configuration prompt to the user. This gives them control over
+execution mode, model selection, and key parameters.
+### Pre-Flight Pattern
+Each mutating workflow has a **Pre-Flight Configuration** step. Follow this pattern:
+1. Present a summary of what the command will do
+2. Show numbered options with `(recommended)` markers for suggested defaults
+3. Ask the user to pick options or say "use defaults" / "go with defaults"
+4. Show a confirmation summary of selected options before executing
+### Model Tier Reference
+When presenting model choices, use this table:
+| Tier | Model | Speed | Cost | Quality | Best for |
+|------|-------|-------|------|---------|----------|
+| Fast | `haiku` | ~2s/call | $ | Good | Iteration loops, bulk validation |
+| Balanced | `sonnet` | ~5s/call | $$ | Great | Single-pass proposals, gate checks |
+| Best | `opus` | ~10s/call | $$$ | Excellent | High-stakes final validation |
+### Quick Path
+If the user says "use defaults", "just do it", or similar — skip the pre-flight
+and run with recommended defaults. The pre-flight is for users who want control,
+not a mandatory gate.
+### Workflows That Skip Pre-Flight
+These read-only or simple workflows run immediately without prompting:
+`status`, `last`, `doctor`, `dashboard`, `watch`, `rollback`, `grade`,
+`ingest-*`, `replay`, `contribute`, `cron`, `composability`, `unit-test`,
+`import-skillsbench`.
 ## The Feedback Loop
@@ -94,7 +160,30 @@ Observe --> Detect --> Diagnose --> Propose --> Validate --> Deploy --> Watch
 | `Workflows/Rollback.md` | Undo an evolution, restore previous description |
 | `Workflows/Watch.md` | Post-deploy regression monitoring |
 | `Workflows/Doctor.md` | Health checks on logs, hooks, schema |
-| `Workflows/Ingest.md` | Import sessions from Codex and OpenCode |
+| `Workflows/Ingest.md` | Import sessions from Codex, OpenCode, and OpenClaw |
+| `Workflows/Replay.md` | Backfill logs from Claude Code transcripts |
+| `Workflows/Contribute.md` | Export anonymized data for community contribution |
+| `Workflows/Cron.md` | Manage OpenClaw cron jobs for autonomous evolution |
+| `Workflows/AutoActivation.md` | Auto-activation hook behavior and rules |
+| `Workflows/Dashboard.md` | Dashboard modes: static, export, live server |
+| `Workflows/EvolutionMemory.md` | Evolution memory system for session continuity |
+| `Workflows/EvolveBody.md` | Full body and routing table evolution |
+| `Workflows/Baseline.md` | No-skill baseline comparison and lift measurement |
+| `Workflows/UnitTest.md` | Skill-level unit test runner and generator |
+| `Workflows/Composability.md` | Multi-skill co-occurrence conflict analysis |
+| `Workflows/ImportSkillsBench.md` | SkillsBench task corpus importer |
+## Specialized Agents
+selftune provides focused agents for deeper analysis. These live in
+`.claude/agents/` and can be spawned as subagents for specialized tasks.
+| Trigger keywords | Agent | Purpose |
+|------------------|-------|---------|
+| diagnose, root cause, why failing, skill failure, debug performance | diagnosis-analyst | Deep-dive analysis of underperforming skills |
+| patterns, conflicts, cross-skill, overlap, trigger conflicts, optimize skills | pattern-analyst | Cross-skill pattern analysis and conflict detection |
+| review evolution, check proposal, safe to deploy, approve evolution | evolution-reviewer | Safety gate review of pending evolution proposals |
+| set up selftune, integrate, configure project, install selftune | integration-guide | Guided interactive setup for specific project types |
 ## Examples
@@ -110,7 +199,34 @@ Observe --> Detect --> Diagnose --> Propose --> Validate --> Deploy --> Watch
 - "How are my skills performing?"
 - "What happened in my last session?"
 - "Open the selftune dashboard"
+- "Serve the dashboard at http://localhost:3141"
 - "Show skill health status"
+- "Replay my Claude Code transcripts"
+- "Backfill logs from historical sessions"
+- "Contribute my selftune data to the community"
+- "Share anonymized skill data"
+- "Set up cron jobs for autonomous evolution"
+- "Schedule selftune to run automatically"
+- "Ingest my OpenClaw sessions"
+- "Why is selftune suggesting things?"
+- "Customize activation rules"
+- "Start the live dashboard"
+- "Serve the dashboard on port 8080"
+- "What happened in the last evolution?"
+- "Read the evolution memory"
+- "Why is this skill underperforming?"
+- "Are there conflicts between my skills?"
+- "Review this evolution before deploying"
+- "Set up selftune for my project"
+- "Evolve the full body of the Research skill"
+- "Rewrite the routing table for pptx"
+- "Does this skill add value over no-skill baseline?"
+- "Measure baseline lift for the Research skill"
+- "Generate unit tests for the pptx skill"
+- "Run skill unit tests"
+- "Which skills conflict with each other?"
+- "Analyze composability for the Research skill"
+- "Import SkillsBench tasks for my skill"
 ## Negative Examples

package/skill/Workflows/AutoActivation.md ADDED Viewed

@@ -0,0 +1,144 @@
+# selftune Auto-Activation Workflow
+Automatically suggests selftune commands during a session based on
+activation rules. Runs as a `UserPromptSubmit` hook, evaluates rules
+against session context, and outputs advisory suggestions to stderr.
+## How It Works
+The `hooks/auto-activate.ts` script runs on every `UserPromptSubmit` event.
+It reads session telemetry, query logs, and evolution audit data, then
+evaluates a set of activation rules against the current context. When a
+rule fires, the suggestion is written to stderr (shown to Claude as a
+system message). The hook always exits 0 -- suggestions are advisory and
+never block the user.
+Flow:
+1. Claude Code triggers `UserPromptSubmit` hook
+2. Hook receives `{ session_id }` payload on stdin
+3. Checks PAI coexistence (see below)
+4. Loads default activation rules
+5. Evaluates each rule against session context
+6. Outputs suggestions to stderr (if any)
+7. Exits 0
+## PAI Coexistence
+If PAI's `skill-activation-prompt` hook is detected in
+`~/.claude/settings.json`, selftune skips all suggestions. PAI handles
+skill-level activation; selftune handles observability. This prevents
+duplicate or conflicting suggestions.
+Detection scans all hook entries in settings for any command containing
+`skill-activation-prompt`. If found, the hook exits silently.
+## Default Rules
+| Rule ID | Description | Trigger Condition | Suggestion |
+|---------|-------------|-------------------|------------|
+| `post-session-diagnostic` | Suggest diagnostic review | >2 unmatched queries in current session | `selftune last` |
+| `grading-threshold-breach` | Suggest evolution | Session pass rate < 0.6 (60%) | `selftune evolve` |
+| `stale-evolution` | Suggest evolution | >7 days since last evolution AND pending false negatives exist | `selftune evolve` |
+| `regression-detected` | Suggest rollback | Watch snapshot shows `regression_detected: true` | `selftune rollback` |
+### Rule Details
+**post-session-diagnostic**: Compares query count against skill usage count
+for the current session. If the difference exceeds 2, unmatched queries
+likely indicate gaps in skill coverage.
+**grading-threshold-breach**: Reads grading result files from
+`~/.selftune/grading/result-*.json`. If the current session's pass rate
+is below 0.6, the skill description may need evolution.
+**stale-evolution**: Reads the evolution audit log to find the last
+evolution timestamp. If older than 7 days, checks
+`~/.selftune/false-negatives/pending.json` for pending false negatives.
+Both conditions must be true.
+**regression-detected**: Reads the latest monitoring snapshot from
+`~/.selftune/monitoring/latest-snapshot.json`. If `regression_detected`
+is true, suggests rollback with the skill name if available.
+## Session State Tracking
+Each rule fires at most once per session. After a suggestion is shown,
+the rule ID is recorded in session state to prevent repeated nags.
+Session state is stored at `~/.selftune/session-state-<session_id>.json`:
+```json
+{
+  "session_id": "abc-123",
+  "suggestions_shown": ["post-session-diagnostic", "grading-threshold-breach"],
+  "updated_at": "2026-03-02T10:00:00Z"
+}
+```
+State is keyed by `session_id`. If the session ID changes (new session),
+state resets automatically.
+## Customizing Rules
+Rules are defined in `cli/selftune/activation-rules.ts` as the
+`DEFAULT_RULES` array. To customize rule behavior, edit that TypeScript
+file directly. There is no runtime JSON config — the hook imports
+`DEFAULT_RULES` at evaluation time.
+Each rule conforms to the `ActivationRule` interface:
+```typescript
+interface ActivationRule {
+  id: string;
+  description: string;
+  evaluate(ctx: ActivationContext): string | null;
+}
+```
+The `ActivationContext` provides paths to all log files and the selftune
+config directory. Return a suggestion string when the rule fires, or
+`null` to skip.
+## Disabling Auto-Activation
+Remove the `auto-activate.ts` hook entry from `~/.claude/settings.json`.
+The hook is registered under `UserPromptSubmit`:
+```json
+{
+  "hooks": {
+    "UserPromptSubmit": [
+      {
+        "command": "bun run /path/to/cli/selftune/hooks/auto-activate.ts"
+      }
+    ]
+  }
+}
+```
+Delete the entry to disable all auto-activation suggestions. (JSON has no comment syntax, so the entry cannot simply be commented out.)
+## Common Patterns
+**"Stop suggesting commands"**
+> Remove the auto-activate hook from settings (see Disabling above).
+> Or wait -- each rule only fires once per session.
+**"Why am I seeing selftune suggestions?"**
+> The auto-activate hook detected an actionable condition. Check which
+> rule fired (the suggestion includes the command) and follow the advice.
+**"Suggestions aren't appearing"**
+> Run `selftune doctor` to verify the hook is installed. Check that
+> `UserPromptSubmit` includes the auto-activate hook in settings.
+**"PAI is installed but I still see suggestions"**
+> Verify PAI's `skill-activation-prompt` hook is in settings. The
+> coexistence check scans for that specific command string.
+**"I want custom activation logic"**
+> Create rules conforming to the `ActivationRule` interface. Rules must
+> be pure filesystem readers -- no network, no heavy imports. Add them
+> to the `DEFAULT_RULES` array in `activation-rules.ts`; as noted under
+> Customizing Rules, the hook does not load rules from a runtime config file.

package/skill/Workflows/Badge.md ADDED Viewed

@@ -0,0 +1,118 @@
+# Badge Command
+Generate skill health badges for embedding in READMEs and documentation.
+## Usage
+```bash
+selftune badge --skill <name> [--format svg|markdown|url] [--output <path>]
+```
+## Options
+| Option | Required | Default | Description |
+|--------|----------|---------|-------------|
+| `--skill` | Yes | -- | Skill name to generate badge for |
+| `--format` | No | `svg` | Output format: `svg`, `markdown`, or `url` |
+| `--output` | No | stdout | Write output to file |
+| `--help` | No | -- | Show usage information |
+## Examples
+### Generate SVG badge
+```bash
+selftune badge --skill my-skill --format svg > badge.svg
+```
+### Get markdown for README
+```bash
+selftune badge --skill my-skill --format markdown
+```
+Output: `![Skill Health: my-skill](https://img.shields.io/badge/Skill%20Health-87%25%20%E2%86%91-4c1)`
+### Get shields.io URL
+```bash
+selftune badge --skill my-skill --format url
+```
+### Write badge to file
+```bash
+selftune badge --skill my-skill --output badge.svg
+```
+## Badge Branding
+SVG badges (both `--format svg` and dashboard routes) include the selftune logo as an inline 14px icon in the label section. The logo is embedded as a base64 data URI — no external requests needed.
+```
+[ 🔵 Skill Health (gray) ] [ 85% ↑ (green) ]
+  ^14px logo + 3px gap
+```
+Markdown and URL formats use shields.io, which renders its own badge — the logo only appears in locally-generated SVGs.
+## Badge Colors
+| Pass Rate | Color | Hex |
+|-----------|-------|-----|
+| > 80% | Green | `#4c1` |
+| 60-80% | Yellow | `#dfb317` |
+| < 60% | Red | `#e05d44` |
+| No data | Gray | `#9f9f9f` |
+## Embedding in README
+Add to your skill's README.md:
+```markdown
+![Skill Health: my-skill](https://img.shields.io/badge/Skill%20Health-87%25%20%E2%86%91-4c1)
+```
+Or use the generated SVG directly for offline rendering.
+## Dashboard Routes (Phase 2)
+The local dashboard server exposes badge and report routes:
+### GET /badge/:skillName
+Returns a live SVG badge computed from local telemetry logs.
+```
+http://localhost:<port>/badge/my-skill
+```
+- Returns `image/svg+xml` with `Cache-Control: no-cache, no-store`
+- Returns a gray "not found" badge (not JSON 404) for unknown skills
+### GET /report/:skillName
+Returns an HTML report page with pass rate, trend, session count, and embed code.
+```
+http://localhost:<port>/report/my-skill
+```
+## Hosted Service (Phase 3)
+The hosted badge service at `badge.selftune.dev` aggregates community contributions and serves badges publicly.
+### Endpoints
+| Route | Method | Description |
+|-------|--------|-------------|
+| `/badge/:skill` | GET | SVG badge from aggregated community data |
+| `/badge/:org/:skill` | GET | Organization-scoped SVG badge |
+### Embedding from hosted service
+```markdown
+![Skill Health: my-skill](https://badge.selftune.dev/badge/my-skill)
+```
+### Contributing data
+```bash
+selftune contribute --submit --skill my-skill
+```
+The command accepts `--endpoint` to target a custom service URL, with `--github` as a fallback.

package/skill/Workflows/Baseline.md ADDED Viewed

@@ -0,0 +1,121 @@
+# selftune Baseline Workflow
+Measure whether a skill adds value over a no-skill baseline. Runs trigger
+checks with and without the skill description to compute lift — the
+improvement in pass rate that the skill provides.
+## Default Command
+```bash
+selftune baseline --skill <name> --skill-path <path> [options]
+```
+## Options
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--skill <name>` | Skill name | Required |
+| `--skill-path <path>` | Path to the skill's SKILL.md | Required |
+| `--eval-set <path>` | Pre-built eval set JSON | Auto-generated from logs |
+| `--agent <name>` | Agent CLI to use | Auto-detected |
+## Output Format
+```json
+{
+  "skill_name": "Research",
+  "eval_set_size": 25,
+  "baseline_pass_rate": 0.32,
+  "with_skill_pass_rate": 0.88,
+  "lift": 0.56,
+  "adds_value": true,
+  "measured_at": "2026-03-04T12:00:00.000Z"
+}
+```
+## How It Works
+1. Loads the eval set (from `--eval-set` or auto-generated from logs)
+2. Reads the skill's current description from SKILL.md
+3. Runs trigger checks against an **empty description** (no-skill baseline)
+4. Runs trigger checks against the **actual description** (with-skill)
+5. Computes pass rates for both conditions
+6. Calculates `lift = with_skill_pass_rate - baseline_pass_rate`
+7. Sets `adds_value = lift >= 0.05`
+## Integration with Evolve
+The `selftune evolve` command supports a `--with-baseline` flag:
+```bash
+selftune evolve --skill Research --skill-path /path/SKILL.md --with-baseline
+```
+When enabled, the evolve command measures baseline lift before deploying.
+If the skill doesn't add at least 0.05 lift (5 percentage points of pass rate) over no-skill, the evolution is
+skipped — the skill needs fundamental rework, not description tweaks.
+## Steps
+### 0. Pre-Flight Configuration
+Before running baseline measurement, present configuration options to the user.
+If the user says "use defaults" or similar, skip to step 1 with recommended defaults.
+Present these options:
+```
+selftune baseline — Pre-Flight Configuration
+1. Eval Set Source
+   a) Auto-generate from logs (recommended if logs exist)
+   b) Use existing eval set file — provide path
+   c) Generate synthetic evals first (for new skills with no data)
+2. Agent CLI
+   a) Auto-detect (recommended)
+   b) Specify: claude / codex / opencode
+→ Reply with your choices or "use defaults" for recommended settings.
+```
+After the user responds, show a confirmation summary:
+```
+Configuration Summary:
+  Eval source:   auto-generate from logs
+  Agent:         auto-detect
+Proceeding...
+```
+### 1. Run Baseline Measurement
+```bash
+selftune baseline --skill Research --skill-path ~/.claude/skills/Research/SKILL.md
+```
+### 2. Interpret Results
+| Lift | Interpretation | Action |
+|------|---------------|--------|
+| >= 0.20 | Strong value | Skill is working well |
+| 0.05 to < 0.20 | Moderate value | Consider evolving to improve |
+| 0 to < 0.05 | Minimal value | Skill may need rework, not just evolution |
+| < 0 | Negative value | Skill is hurting — investigate or disable |
+### 3. Use as Evolution Gate
+Add `--with-baseline` to evolve commands to prevent wasting evolution
+cycles on skills that don't add value.
+## Common Patterns
+**"Does the Research skill add value?"**
+> `selftune baseline --skill Research --skill-path ~/.claude/skills/Research/SKILL.md`
+**"Only evolve if the skill is actually useful"**
+> `selftune evolve --skill Research --skill-path /path/SKILL.md --with-baseline`
+**"Check baseline with a custom eval set"**
+> `selftune baseline --skill pptx --skill-path /path/SKILL.md --eval-set evals-pptx.json`