npm - selftune - Versions diffs - 0.2.6 → 0.2.9 - Mend

selftune 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (119) hide show

package/README.md +1 -0
package/apps/local-dashboard/dist/assets/index-Bs3Y4ixf.css +1 -0
package/apps/local-dashboard/dist/assets/index-C4UYGWKr.js +15 -0
package/apps/local-dashboard/dist/assets/vendor-react-BQH_6WrG.js +60 -0
package/apps/local-dashboard/dist/assets/{vendor-table-B7VF2Ipl.js → vendor-table-dK1QMLq9.js} +1 -1
package/apps/local-dashboard/dist/assets/{vendor-ui-r2k_Ku_V.js → vendor-ui-CO2mrx6e.js} +60 -65
package/apps/local-dashboard/dist/index.html +5 -5
package/cli/selftune/activation-rules.ts +57 -18
package/cli/selftune/agent-guidance.ts +96 -0
package/cli/selftune/alpha-identity.ts +156 -0
package/cli/selftune/alpha-upload/build-payloads.ts +151 -0
package/cli/selftune/alpha-upload/client.ts +113 -0
package/cli/selftune/alpha-upload/flush.ts +191 -0
package/cli/selftune/alpha-upload/index.ts +194 -0
package/cli/selftune/alpha-upload/queue.ts +252 -0
package/cli/selftune/alpha-upload/stage-canonical.ts +251 -0
package/cli/selftune/alpha-upload-contract.ts +52 -0
package/cli/selftune/auth/device-code.ts +110 -0
package/cli/selftune/auto-update.ts +130 -0
package/cli/selftune/badge/badge.ts +19 -9
package/cli/selftune/canonical-export.ts +16 -3
package/cli/selftune/constants.ts +28 -8
package/cli/selftune/contribute/bundle.ts +33 -5
package/cli/selftune/dashboard-contract.ts +32 -1
package/cli/selftune/dashboard-server.ts +215 -693
package/cli/selftune/dashboard.ts +1 -1
package/cli/selftune/eval/baseline.ts +11 -7
package/cli/selftune/eval/hooks-to-evals.ts +39 -15
package/cli/selftune/eval/synthetic-evals.ts +54 -1
package/cli/selftune/evolution/audit.ts +24 -19
package/cli/selftune/evolution/constitutional.ts +176 -0
package/cli/selftune/evolution/evidence.ts +18 -13
package/cli/selftune/evolution/evolve-body.ts +104 -7
package/cli/selftune/evolution/evolve.ts +195 -22
package/cli/selftune/evolution/propose-body.ts +18 -1
package/cli/selftune/evolution/propose-description.ts +27 -2
package/cli/selftune/evolution/rollback.ts +11 -15
package/cli/selftune/export.ts +84 -0
package/cli/selftune/grading/auto-grade.ts +14 -4
package/cli/selftune/grading/grade-session.ts +17 -6
package/cli/selftune/hooks/auto-activate.ts +5 -0
package/cli/selftune/hooks/evolution-guard.ts +25 -11
package/cli/selftune/hooks/prompt-log.ts +23 -9
package/cli/selftune/hooks/session-stop.ts +78 -15
package/cli/selftune/hooks/skill-eval.ts +189 -10
package/cli/selftune/index.ts +274 -2
package/cli/selftune/ingestors/claude-replay.ts +48 -21
package/cli/selftune/init.ts +260 -49
package/cli/selftune/last.ts +7 -7
package/cli/selftune/localdb/db.ts +90 -10
package/cli/selftune/localdb/direct-write.ts +573 -0
package/cli/selftune/localdb/materialize.ts +296 -42
package/cli/selftune/localdb/queries.ts +482 -32
package/cli/selftune/localdb/schema.ts +153 -1
package/cli/selftune/monitoring/watch.ts +27 -8
package/cli/selftune/normalization.ts +88 -15
package/cli/selftune/observability.ts +257 -5
package/cli/selftune/orchestrate.ts +176 -53
package/cli/selftune/quickstart.ts +34 -10
package/cli/selftune/repair/skill-usage.ts +15 -2
package/cli/selftune/routes/actions.ts +77 -0
package/cli/selftune/routes/badge.ts +66 -0
package/cli/selftune/routes/doctor.ts +12 -0
package/cli/selftune/routes/index.ts +14 -0
package/cli/selftune/routes/orchestrate-runs.ts +13 -0
package/cli/selftune/routes/overview.ts +14 -0
package/cli/selftune/routes/report.ts +293 -0
package/cli/selftune/routes/skill-report.ts +230 -0
package/cli/selftune/status.ts +203 -7
package/cli/selftune/sync.ts +14 -1
package/cli/selftune/types.ts +52 -2
package/cli/selftune/utils/jsonl.ts +58 -1
package/cli/selftune/utils/selftune-meta.ts +38 -0
package/cli/selftune/utils/skill-log.ts +30 -4
package/cli/selftune/utils/transcript.ts +15 -0
package/cli/selftune/workflows/workflows.ts +7 -6
package/package.json +11 -6
package/packages/telemetry-contract/fixtures/complete-push.ts +184 -0
package/packages/telemetry-contract/fixtures/evidence-only-push.ts +58 -0
package/packages/telemetry-contract/fixtures/golden.json +1 -0
package/packages/telemetry-contract/fixtures/index.ts +4 -0
package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +40 -0
package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +79 -0
package/packages/telemetry-contract/package.json +6 -1
package/packages/telemetry-contract/src/schemas.ts +196 -0
package/packages/telemetry-contract/src/types.ts +3 -1
package/packages/telemetry-contract/src/validators.ts +3 -1
package/packages/telemetry-contract/tests/compatibility.test.ts +144 -0
package/packages/ui/package.json +4 -0
package/packages/ui/src/components/ActivityTimeline.tsx +61 -29
package/packages/ui/src/components/section-cards.tsx +31 -14
package/packages/ui/src/types.ts +1 -0
package/skill/SKILL.md +214 -174
package/skill/Workflows/AlphaUpload.md +45 -0
package/skill/Workflows/Baseline.md +18 -12
package/skill/Workflows/Composability.md +3 -3
package/skill/Workflows/Dashboard.md +39 -91
package/skill/Workflows/Doctor.md +93 -66
package/skill/Workflows/Evals.md +49 -40
package/skill/Workflows/Evolve.md +76 -28
package/skill/Workflows/EvolveBody.md +37 -38
package/skill/Workflows/Initialize.md +145 -26
package/skill/Workflows/Orchestrate.md +11 -2
package/skill/Workflows/Sync.md +23 -0
package/skill/Workflows/Watch.md +2 -5
package/skill/agents/diagnosis-analyst.md +163 -0
package/skill/agents/evolution-reviewer.md +149 -0
package/skill/agents/integration-guide.md +154 -0
package/skill/agents/pattern-analyst.md +149 -0
package/skill/assets/multi-skill-settings.json +1 -1
package/skill/assets/single-skill-settings.json +1 -1
package/skill/references/interactive-config.md +39 -0
package/skill/references/invocation-taxonomy.md +34 -0
package/skill/references/logs.md +15 -1
package/skill/references/setup-patterns.md +3 -3
package/skill/settings_snippet.json +1 -1
package/apps/local-dashboard/dist/assets/index-C75H1Q3n.css +0 -1
package/apps/local-dashboard/dist/assets/index-axE4kz3Q.js +0 -15
package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +0 -60

package/skill/Workflows/Dashboard.md CHANGED Viewed

@@ -1,8 +1,8 @@
 # selftune Dashboard Workflow
 Visual dashboard for selftune telemetry, skill performance, evolution
-audit, and monitoring data. Supports static HTML export, file output,
-and a live server with polling-based auto-refresh and action buttons.
+audit, and monitoring data. Starts a local SPA server with SSE-based
+real-time updates and action buttons.
 ## Default Command
@@ -10,58 +10,23 @@ and a live server with polling-based auto-refresh and action buttons.
 selftune dashboard
 ```
-Opens a standalone HTML dashboard in the default browser with embedded
-data from all selftune log files.
+Starts a Bun HTTP server with a React SPA dashboard and opens it in the
+default browser. The dashboard reads SQLite directly and uses WAL-based
+invalidation to push live updates via Server-Sent Events (SSE).
+TanStack Query polling (60s) acts as a fallback. Action buttons trigger
+selftune commands directly from the dashboard. Use `selftune export` to
+generate JSONL from SQLite for debugging or offline analysis.
 ## Options
 | Flag | Description | Default |
 |------|-------------|---------|
-| `--export` | Export data-embedded HTML to stdout | Off |
-| `--out FILE` | Write data-embedded HTML to FILE | None |
-| `--serve` | Start live dashboard server | Off |
-| `--port <port>` | Custom port for live server (requires `--serve`) | 3141 |
+| `--port <port>` | Custom port for the server | 3141 |
+| `--no-open` | Start server without opening browser | Off |
+| `--serve` | *(Deprecated)* Alias for default behavior | — |
-## Modes
-### Static (Default)
-Builds an HTML file with all telemetry data embedded as JSON, saves it
-to `~/.selftune/dashboard.html`, and opens it in the default browser.
-The data is a point-in-time snapshot -- refresh by re-running the command.
-```bash
-selftune dashboard
-```
-### Export
-Writes the same data-embedded HTML to stdout. Useful for piping to other
-tools or capturing output programmatically.
-```bash
-selftune dashboard --export > dashboard.html
-```
-### File
-Writes the data-embedded HTML to a specific file path.
-```bash
-selftune dashboard --out /tmp/report.html
-```
-### Live Server
-Starts a Bun HTTP server with a React SPA dashboard. The SPA uses
-TanStack Query polling to auto-refresh data (overview every 15s,
-orchestrate runs every 30s, doctor every 30s) and provides action
-buttons to trigger selftune commands.
-```bash
-selftune dashboard --serve
-selftune dashboard --serve --port 8080
-```
+Note: `--export` and `--out` were removed. If either flag is passed, the
+CLI reports an error and suggests `selftune dashboard` instead.
 ## Live Server
@@ -79,23 +44,23 @@ override.
 | `GET` | `/api/v2/skills/:name` | SQLite-backed per-skill report |
 | `GET` | `/api/v2/orchestrate-runs` | Recent orchestrate run reports |
 | `GET` | `/api/v2/doctor` | System health diagnostics (config, logs, hooks, evolution) |
+| `GET` | `/api/v2/events` | SSE stream for live dashboard updates |
 | `GET` | `/api/health` | Dashboard server health probe |
 | `POST` | `/api/actions/watch` | Trigger `selftune watch` for a skill |
 | `POST` | `/api/actions/evolve` | Trigger `selftune evolve` for a skill |
 | `POST` | `/api/actions/rollback` | Trigger `selftune evolve rollback` for a skill |
-### Auto-Refresh
+### Live Updates (SSE)
-The dashboard SPA uses TanStack Query with `refetchInterval` to poll
-the v2 API endpoints automatically:
+The dashboard connects to `/api/v2/events` via Server-Sent Events.
+The server watches the SQLite WAL file for changes and broadcasts an
+`update` event when new data is written. The SPA invalidates all cached
+queries, triggering immediate refetches (~1s latency).
-- `/api/v2/overview` — every 15 seconds
-- `/api/v2/orchestrate-runs` — every 30 seconds
-- `/api/v2/doctor` — every 30 seconds
-- `/api/v2/skills/:name` — every 30 seconds (when viewing a skill)
+TanStack Query polling (60s) acts as a fallback safety net in case the
+SSE connection drops. Data also refreshes on window focus.
-Data also refreshes on window focus. No SSE or websocket connection
-is required.
+See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboard-sse.md) for the full design.
 ### Action Endpoints
@@ -147,45 +112,32 @@ The dashboard displays data from these sources:
 | Data | Source | Description |
 |------|--------|-------------|
-| Telemetry | `session_telemetry_log.jsonl` | Session-level telemetry records |
-| Skills | `skill_usage_log.jsonl` | Skill activation and usage events |
-| Queries | `all_queries_log.jsonl` | All user queries across sessions |
-| Evolution | `evolution_audit_log.jsonl` | Evolution audit trail (create, deploy, rollback) |
+| Telemetry | SQLite (`~/.selftune/selftune.db`) | Session-level telemetry records |
+| Skills | SQLite (`~/.selftune/selftune.db`) | Skill activation and usage events |
+| Queries | SQLite (`~/.selftune/selftune.db`) | All user queries across sessions |
+| Evolution | SQLite (`~/.selftune/selftune.db`) | Evolution audit trail (create, deploy, rollback) |
 | Decisions | `~/.selftune/memory/` | Evolution decision records |
 | Snapshots | Computed | Per-skill monitoring snapshots (pass rate, regression status) |
 | Unmatched | Computed | Queries that did not trigger any skill |
 | Pending | Computed | Evolution proposals not yet deployed, rejected, or rolled back |
-If no log data is found, the static modes exit with an error message
-listing the checked file paths.
+If no log data is found, the server reports an error listing the
+checked file paths.
 ## Steps
-### 1. Choose Mode
-| Goal | Command |
-|------|---------|
-| Quick visual check | `selftune dashboard` |
-| Save report to file | `selftune dashboard --out report.html` |
-| Pipe to another tool | `selftune dashboard --export` |
-| Live monitoring | `selftune dashboard --serve` |
-### 2. Run Command
+### 1. Run Dashboard
 ```bash
-# Static (opens browser)
 selftune dashboard
-# Live server
-selftune dashboard --serve
+selftune dashboard --port 8080
+selftune dashboard --no-open
 ```
-### 3. Interact with Dashboard
+### 2. Interact with Dashboard
-- **Static mode**: View the snapshot. Re-run to refresh.
-- **Live mode**: Data refreshes automatically via polling (15-30s intervals).
-  Use action buttons to trigger watch, evolve, or rollback directly from
-  the dashboard.
+Data refreshes in real time via SSE (~1s latency). Use action buttons
+to trigger watch, evolve, or rollback directly from the dashboard.
 ## Common Patterns
@@ -194,12 +146,8 @@ selftune dashboard --serve
 > Report to the user that the dashboard is open.
 **User wants live monitoring**
-> Run `selftune dashboard --serve`. Inform the user that data refreshes
-> automatically every 15-30 seconds via polling.
-**User wants a shareable report**
-> Run `selftune dashboard --out report.html`. Report the file path to the
-> user. The HTML file is self-contained with all data embedded.
+> Run `selftune dashboard`. The server provides real-time updates via SSE
+> (~1 second latency).
 **Dashboard shows no data**
 > Run `selftune doctor` to verify hooks are installed. If hooks are missing,
@@ -207,8 +155,8 @@ selftune dashboard --serve
 > have run, inform the user that sessions must generate telemetry first.
 **User wants a different port**
-> Run `selftune dashboard --serve --port <port>`. Port must be 1-65535.
+> Run `selftune dashboard --port <port>`. Port must be 1-65535.
 **User wants to trigger actions from the dashboard**
-> Run `selftune dashboard --serve` for live mode. The dashboard provides
-> action buttons for watch, evolve, and rollback per skill via POST endpoints.
+> Run `selftune dashboard`. The dashboard provides action buttons for
+> watch, evolve, and rollback per skill via POST endpoints.

package/skill/Workflows/Doctor.md CHANGED Viewed

@@ -17,34 +17,57 @@ None. Doctor runs all checks unconditionally.
 ```json
 {
-  "healthy": true,
+  "command": "doctor",
+  "timestamp": "2026-02-28T10:00:00Z",
   "checks": [
     {
-      "name": "session_telemetry_log exists",
+      "name": "config",
+      "path": "/Users/you/.selftune/config.json",
       "status": "pass",
-      "detail": "Found 142 entries"
+      "message": "Valid config with agent_type and llm_mode"
     },
     {
-      "name": "skill_usage_log parseable",
+      "name": "log_session_telemetry",
+      "path": "/Users/you/.claude/session_telemetry_log.jsonl",
       "status": "pass",
-      "detail": "All 89 entries valid JSON"
+      "message": "Found 142 entries"
     },
     {
-      "name": "hooks installed",
+      "name": "hook_settings",
+      "path": "/Users/you/.claude/settings.json",
       "status": "fail",
-      "detail": "PostToolUse hook not found in ~/.claude/settings.json"
+      "message": "PostToolUse hook not found in ~/.claude/settings.json"
+    },
+    {
+      "name": "dashboard_freshness_mode",
+      "status": "pass",
+      "message": "Dashboard reads SQLite and watches WAL for live updates"
     }
   ],
   "summary": {
-    "passed": 5,
-    "failed": 1,
-    "total": 6
-  }
+    "pass": 9,
+    "fail": 1,
+    "warn": 0,
+    "total": 10
+  },
+  "healthy": false
 }
 ```
 The process exits with code 0 if `healthy: true`, code 1 otherwise.
+Failed or warning checks may include a machine-readable `guidance` object:
+```json
+{
+  "code": "config_missing",
+  "message": "selftune is not initialized yet.",
+  "next_command": "selftune init",
+  "suggested_commands": ["selftune doctor"],
+  "blocking": true
+}
+```
 ## Parsing Instructions
 ### Check Overall Health
@@ -57,69 +80,64 @@ The process exits with code 0 if `healthy: true`, code 1 otherwise.
 ### Find Failed Checks
 ```bash
-# Parse: .checks[] | select(.status == "fail") | { name, detail }
+# Parse: .checks[] | select(.status == "fail") | { name, message }
 ```
 ### Get Summary Counts
 ```bash
-# Parse: .summary.passed, .summary.failed, .summary.total
+# Parse: .summary.pass, .summary.fail, .summary.warn, .summary.total
 ```
 ## Health Checks
-Doctor validates these areas:
+Doctor validates these baseline areas (10 checks total), and adds alpha cloud-link
+or queue checks when alpha is configured:
-### Log File Checks
+### Config Check
-| Check | What it validates |
-|-------|-------------------|
-| Log files exist | `session_telemetry_log.jsonl`, `skill_usage_log.jsonl`, `all_queries_log.jsonl` exist in `~/.claude/` |
-| Logs are parseable | Every line in each log file is valid JSON |
-| Schema conformance | Required fields present per log type (see `references/logs.md`) |
+| Check name | What it validates |
+|------------|-------------------|
+| `config` | `~/.selftune/config.json` exists, is valid JSON, contains `agent_type` and `llm_mode` fields |
-### Hook Checks
+### Log Checks (4 checks)
-| Check | What it validates |
-|-------|-------------------|
-| Hooks installed | `UserPromptSubmit`, `PreToolUse`, `PostToolUse`, and `Stop` hooks are configured in `~/.claude/settings.json` |
-| Hook scripts exist | The script files referenced by hooks exist on disk |
-| Auto-activate hook | `hooks/auto-activate.ts` is registered in `UserPromptSubmit` and the file is executable |
-| Evolution guard hook | `hooks/evolution-guard.ts` is registered in `PreToolUse` and the file exists |
+| Check name | What it validates |
+|------------|-------------------|
+| `log_session_telemetry` | `session_telemetry_log.jsonl` exists and is parseable |
+| `log_skill_usage` | `skill_usage_log.jsonl` exists and is parseable |
+| `log_all_queries` | `all_queries_log.jsonl` exists and is parseable |
+| `log_evolution_audit` | `evolution_audit_log.jsonl` exists and is parseable |
-### Memory Checks
+### Hook Check
-| Check | What it validates |
-|-------|-------------------|
-| Memory directory exists | `~/.selftune/memory/` directory is present |
-| Memory files valid | `context.md`, `decisions.md`, `plan.md` exist and are non-empty (if previously written) |
+| Check name | What it validates |
+|------------|-------------------|
+| `hook_settings` | `~/.claude/settings.json` has selftune hooks configured |
-### Activation Rules Checks
+### Evolution Check
-| Check | What it validates |
-|-------|-------------------|
-| Rules file exists | `~/.selftune/activation-rules.json` is present |
-| Rules file valid | The file contains valid JSON conforming to the activation rules schema |
+| Check name | What it validates |
+|------------|-------------------|
+| `evolution_audit` | Evolution audit log entries have valid structure |
-### Agent Checks
+### Integrity Check
-| Check | What it validates |
-|-------|-------------------|
-| Optional agent directory exists | If `.claude/agents/` is present, it is readable |
-| Optional agent files present | If the repo bundles helper agents, the expected files are present |
+| Check name | What it validates |
+|------------|-------------------|
+| `dashboard_freshness_mode` | Warns when the dashboard still relies on legacy JSONL watcher invalidation instead of SQLite WAL live refresh |
-### Dashboard Checks (optional)
+### Skill Version Sync Check
-| Check | What it validates |
-|-------|-------------------|
-| Dashboard server accessible | `dashboard-server.ts` exists in the CLI directory |
+| Check name | What it validates |
+|------------|-------------------|
+| `skill_version_sync` | SKILL.md frontmatter version matches package.json version |
-### Evolution Audit Checks
+### Version Check
-| Check | What it validates |
-|-------|-------------------|
-| Audit log integrity | `evolution_audit_log.jsonl` entries have required fields (`timestamp`, `proposal_id`, `action`) |
-| Valid action values | All entries use known action types: `created`, `validated`, `deployed`, `rolled_back` |
+| Check name | What it validates |
+|------------|-------------------|
+| `version_up_to_date` | Installed version matches latest on npm registry |
 ## Steps
@@ -139,18 +157,13 @@ For each failed check, take the appropriate action:
 | Failed check | Fix |
 |-------------|-----|
-| Log files missing | Run a session to generate initial log entries. Check hook installation. |
-| Logs not parseable | Inspect the corrupted log file. Remove or fix invalid lines. |
-| Hooks not installed | Merge `skill/settings_snippet.json` into `~/.claude/settings.json`. Update paths. |
-| Hook scripts missing | Verify the selftune repo path. Re-run `init` if the repo was moved. |
-| Auto-activate missing | Add `hooks/auto-activate.ts` to `UserPromptSubmit` in settings. |
-| Evolution guard missing | Add `hooks/evolution-guard.ts` to `PreToolUse` in settings. |
-| Memory directory missing | Run `mkdir -p ~/.selftune/memory`. |
-| Memory files invalid | Delete and let the memory writer recreate them on next evolve/watch. |
-| Activation rules missing | Copy `assets/activation-rules-default.json` to `~/.selftune/activation-rules.json`. |
-| Activation rules invalid | Validate JSON syntax. Re-copy from template if corrupted. |
-| Agent files missing | If your repo uses optional helper agents, restore them in `.claude/agents/`. Otherwise ignore this advisory. |
-| Audit log invalid | Remove corrupted entries. Future operations will append clean entries. |
+| `config` | Run `selftune init` (or `selftune init --force` to regenerate). |
+| `log_*` | Run a session to generate initial log entries. Check hook installation with `selftune init`. |
+| `hook_settings` | Run `selftune init` to install hooks into `~/.claude/settings.json`. |
+| `evolution_audit` | Remove corrupted entries. Future operations will append clean entries. |
+| `dashboard_freshness_mode` | This is an operator warning, not a broken install. Expect possible freshness gaps for SQLite-only writes, and export data before any destructive recovery. |
+| `skill_version_sync` | Run `bun run sync-version` to stamp SKILL.md from package.json. |
+| `version_up_to_date` | Run `npm install -g selftune` to update. |
 ### 4. Re-run Doctor
@@ -159,14 +172,28 @@ After fixes, run doctor again to verify all checks pass.
 ## Subagent Escalation
 If doctor reveals persistent issues with a specific skill — especially
-recurring failures that basic fixes do not resolve — spawn the
-`diagnosis-analyst` agent as a subagent for root cause analysis.
+recurring failures that basic fixes do not resolve — read
+`skill/agents/diagnosis-analyst.md` and spawn a subagent with those instructions
+for root cause analysis.
+### Alpha Upload Not Active
+**Symptoms:** `selftune status` shows alpha upload as "not enrolled" or "enrolled (missing credential)"
+**Diagnostic steps:**
+1. Check `selftune status` — look at "Alpha Upload" and "Cloud link" lines
+2. If `doctor` includes a `cloud_link` or alpha queue warning, prefer `.checks[].guidance.next_command`
+3. If "not enrolled" or "not linked": run `selftune init --alpha --alpha-email <email>` (opens browser for device-code auth)
+4. If "enrolled (missing credential)": re-run `selftune init --alpha --alpha-email <email> --force` (re-authenticates via browser)
+5. If "api_key has invalid format": re-run `selftune init --alpha --alpha-email <email> --force` to re-authenticate
+**Resolution:** Follow the setup sequence in Initialize workflow → Alpha Enrollment section.
 ## Common Patterns
 **User reports something seems broken**
 > Run `selftune doctor`. Parse the JSON output for failed checks. Report
-> each failure's `name` and `detail` to the user with the recommended fix.
+> each failure's `name` and `message` to the user with the recommended fix.
 **User asks if hooks are working**
 > Run `selftune doctor`. Parse `.checks[]` for hook-related entries. If

package/skill/Workflows/Evals.md CHANGED Viewed

@@ -26,9 +26,14 @@ selftune eval generate --skill <name> [options]
 | `--skill <name>` | Skill to generate evals for | Required (unless `--list-skills`) |
 | `--list-skills` | List all logged skills with query counts | Off |
 | `--stats` | Show aggregate telemetry stats for the skill | Off |
-| `--max <n>` | Maximum eval entries to generate | 50 |
-| `--seed <n>` | Random seed for negative sampling | Random |
-| `--out <path>` | Output file path | `evals-<skill>.json` |
+| `--max <n>` | Maximum eval entries per side | 50 |
+| `--seed <n>` | Seed for deterministic shuffling | 42 |
+| `--output <path>` / `--out <path>` | Output file path | `{skillName}_trigger_eval.json` |
+| `--no-negatives` | Exclude negative examples from output | Off |
+| `--no-taxonomy` | Skip invocation_type classification | Off |
+| `--skill-log <path>` | Path to skill_usage_log.jsonl | Default log path |
+| `--query-log <path>` | Path to all_queries_log.jsonl | Default log path |
+| `--telemetry-log <path>` | Path to session_telemetry_log.jsonl | Default log path |
 | `--synthetic` | Generate evals from SKILL.md via LLM (no logs needed) | Off |
 | `--skill-path <path>` | Path to SKILL.md (required with `--synthetic`) | — |
 | `--model <model>` | LLM model to use for synthetic generation | Agent default |
@@ -40,24 +45,20 @@ selftune eval generate --skill <name> [options]
 ```json
 [
   {
-    "id": 1,
     "query": "Make me a slide deck for the Q3 board meeting",
-    "expected": true,
-    "invocation_type": "contextual",
-    "skill_name": "pptx",
-    "source_session": "abc123"
+    "should_trigger": true,
+    "invocation_type": "contextual"
   },
   {
-    "id": 2,
     "query": "What format should I use for a presentation?",
-    "expected": false,
-    "invocation_type": "negative",
-    "skill_name": "pptx",
-    "source_session": null
+    "should_trigger": false
   }
 ]
 ```
+Each entry has `query` (string, max 500 chars), `should_trigger` (boolean),
+and optional `invocation_type` (omitted when `--no-taxonomy` is set).
 ### List Skills
 ```json
@@ -93,14 +94,14 @@ selftune eval generate --skill <name> [options]
 ### Find Missed Queries (False Negatives)
 ```bash
-# Parse: .[] | select(.expected == true and .invocation_type != "explicit")
+# Parse: .[] | select(.should_trigger == true and .invocation_type != "explicit")
 # These are queries that should trigger but might be missed
 ```
 ### Get Negative Examples
 ```bash
-# Parse: .[] | select(.expected == false)
+# Parse: .[] | select(.should_trigger == false)
 ```
 ## Sub-Workflows
@@ -126,10 +127,16 @@ selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/ppt
 The command:
 1. Reads the SKILL.md file content
-2. Sends it to an LLM with a prompt requesting realistic test queries
-3. Parses the response into eval entries with invocation type annotations
-4. Classifies each positive query using the deterministic `classifyInvocation()` heuristic
-5. Writes the eval set to the output file
+2. Loads real user queries from the database (if available) as few-shot-style examples so that synthetic queries match real phrasing patterns
+3. Sends skill content and real examples to an LLM with a prompt requesting realistic test queries
+4. Parses the response into eval entries with invocation type annotations
+5. Classifies each positive query using the deterministic `classifyInvocation()` heuristic
+6. Writes the eval set to the output file
+**Note:** When real query data exists in the database, synthetic generation
+automatically includes high-confidence positive triggers and general queries as
+phrasing references. This produces more natural-sounding eval queries. If no
+database is available, generation proceeds without real examples (fail-open).
 Use `--model` to override the default LLM model:
@@ -144,7 +151,7 @@ Cross-reference `skill_usage_log.jsonl` (positive triggers) against
 an eval set annotated with invocation types.
 ```bash
-selftune eval generate --skill pptx --max 50 --out evals-pptx.json
+selftune eval generate --skill pptx --max 50 --output evals-pptx.json
 ```
 The command:
@@ -168,32 +175,34 @@ selftune eval generate --skill pptx --stats
 ### 0. Pre-Flight Configuration
-Before generating evals, present numbered configuration options to the user inline in your response, then wait for the user's answer before proceeding.
+Before generating evals, use the `AskUserQuestion` tool to present structured configuration options.
-If the user responds with "use defaults", "just do it", or similar shorthand, skip to step 1 using the recommended defaults.
+If the user responds with "use defaults" or similar shorthand, skip to step 1 using the recommended defaults. If the user cancels, stop -- do not proceed with defaults.
 For `--list-skills` or `--stats` requests, skip pre-flight entirely — these are read-only operations.
-Present the following options inline in your response:
-1. **Generation Mode**
-   - a) Log-based — build evals from real usage logs (recommended if logs exist)
-   - b) Synthetic — generate evals from SKILL.md via LLM (for new skills with no data)
-2. **Skill Path** (synthetic mode only)
-   - Provide absolute or relative path to the target SKILL.md
-   - Example: `./skills/pptx/SKILL.md`
-3. **Max Entries:** 50 (default — how many eval entries to generate)
+Use `AskUserQuestion` with these questions:
-4. **Model** (synthetic mode only)
-   - a) Fast (haiku) — quick generation
-   - b) Balanced (sonnet) — better query diversity (recommended)
-   - c) Best (opus) — highest quality synthetic queries
-5. **Output Path:** `evals-<skill>.json` (default)
+```json
+{
+  "questions": [
+    {
+      "question": "Generation Mode",
+      "options": ["Log-based — build from real usage logs (recommended if logs exist)", "Synthetic — generate from SKILL.md via LLM (for new skills)"]
+    },
+    {
+      "question": "Model (for synthetic mode)",
+      "options": ["Fast (haiku) — quick generation", "Balanced (sonnet) — better diversity (recommended)", "Best (opus) — highest quality"]
+    },
+    {
+      "question": "Max Entries",
+      "options": ["50 (default)", "25 (quick)", "100 (comprehensive)"]
+    }
+  ]
+}
+```
-Ask: "Reply with your choices or 'use defaults' for recommended settings."
+If `AskUserQuestion` is not available, fall back to presenting these as inline numbered options.
 After the user responds, parse their selections and map each choice to the corresponding CLI flags: