selftune 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/apps/local-dashboard/dist/assets/index-Bs3Y4ixf.css +1 -0
- package/apps/local-dashboard/dist/assets/index-C4UYGWKr.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-BQH_6WrG.js +60 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-B7VF2Ipl.js → vendor-table-dK1QMLq9.js} +1 -1
- package/apps/local-dashboard/dist/assets/{vendor-ui-r2k_Ku_V.js → vendor-ui-CO2mrx6e.js} +60 -65
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/activation-rules.ts +57 -18
- package/cli/selftune/agent-guidance.ts +96 -0
- package/cli/selftune/alpha-identity.ts +156 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +151 -0
- package/cli/selftune/alpha-upload/client.ts +113 -0
- package/cli/selftune/alpha-upload/flush.ts +191 -0
- package/cli/selftune/alpha-upload/index.ts +194 -0
- package/cli/selftune/alpha-upload/queue.ts +252 -0
- package/cli/selftune/alpha-upload/stage-canonical.ts +251 -0
- package/cli/selftune/alpha-upload-contract.ts +52 -0
- package/cli/selftune/auth/device-code.ts +110 -0
- package/cli/selftune/auto-update.ts +130 -0
- package/cli/selftune/badge/badge.ts +19 -9
- package/cli/selftune/canonical-export.ts +16 -3
- package/cli/selftune/constants.ts +28 -8
- package/cli/selftune/contribute/bundle.ts +33 -5
- package/cli/selftune/dashboard-contract.ts +32 -1
- package/cli/selftune/dashboard-server.ts +215 -693
- package/cli/selftune/dashboard.ts +1 -1
- package/cli/selftune/eval/baseline.ts +11 -7
- package/cli/selftune/eval/hooks-to-evals.ts +39 -15
- package/cli/selftune/eval/synthetic-evals.ts +54 -1
- package/cli/selftune/evolution/audit.ts +24 -19
- package/cli/selftune/evolution/constitutional.ts +176 -0
- package/cli/selftune/evolution/evidence.ts +18 -13
- package/cli/selftune/evolution/evolve-body.ts +104 -7
- package/cli/selftune/evolution/evolve.ts +195 -22
- package/cli/selftune/evolution/propose-body.ts +18 -1
- package/cli/selftune/evolution/propose-description.ts +27 -2
- package/cli/selftune/evolution/rollback.ts +11 -15
- package/cli/selftune/export.ts +84 -0
- package/cli/selftune/grading/auto-grade.ts +14 -4
- package/cli/selftune/grading/grade-session.ts +17 -6
- package/cli/selftune/hooks/auto-activate.ts +5 -0
- package/cli/selftune/hooks/evolution-guard.ts +25 -11
- package/cli/selftune/hooks/prompt-log.ts +23 -9
- package/cli/selftune/hooks/session-stop.ts +78 -15
- package/cli/selftune/hooks/skill-eval.ts +189 -10
- package/cli/selftune/index.ts +274 -2
- package/cli/selftune/ingestors/claude-replay.ts +48 -21
- package/cli/selftune/init.ts +260 -49
- package/cli/selftune/last.ts +7 -7
- package/cli/selftune/localdb/db.ts +90 -10
- package/cli/selftune/localdb/direct-write.ts +573 -0
- package/cli/selftune/localdb/materialize.ts +296 -42
- package/cli/selftune/localdb/queries.ts +482 -32
- package/cli/selftune/localdb/schema.ts +153 -1
- package/cli/selftune/monitoring/watch.ts +27 -8
- package/cli/selftune/normalization.ts +88 -15
- package/cli/selftune/observability.ts +257 -5
- package/cli/selftune/orchestrate.ts +176 -53
- package/cli/selftune/quickstart.ts +34 -10
- package/cli/selftune/repair/skill-usage.ts +15 -2
- package/cli/selftune/routes/actions.ts +77 -0
- package/cli/selftune/routes/badge.ts +66 -0
- package/cli/selftune/routes/doctor.ts +12 -0
- package/cli/selftune/routes/index.ts +14 -0
- package/cli/selftune/routes/orchestrate-runs.ts +13 -0
- package/cli/selftune/routes/overview.ts +14 -0
- package/cli/selftune/routes/report.ts +293 -0
- package/cli/selftune/routes/skill-report.ts +230 -0
- package/cli/selftune/status.ts +203 -7
- package/cli/selftune/sync.ts +14 -1
- package/cli/selftune/types.ts +52 -2
- package/cli/selftune/utils/jsonl.ts +58 -1
- package/cli/selftune/utils/selftune-meta.ts +38 -0
- package/cli/selftune/utils/skill-log.ts +30 -4
- package/cli/selftune/utils/transcript.ts +15 -0
- package/cli/selftune/workflows/workflows.ts +7 -6
- package/package.json +11 -6
- package/packages/telemetry-contract/fixtures/complete-push.ts +184 -0
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +58 -0
- package/packages/telemetry-contract/fixtures/golden.json +1 -0
- package/packages/telemetry-contract/fixtures/index.ts +4 -0
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +40 -0
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +79 -0
- package/packages/telemetry-contract/package.json +6 -1
- package/packages/telemetry-contract/src/schemas.ts +196 -0
- package/packages/telemetry-contract/src/types.ts +3 -1
- package/packages/telemetry-contract/src/validators.ts +3 -1
- package/packages/telemetry-contract/tests/compatibility.test.ts +144 -0
- package/packages/ui/package.json +4 -0
- package/packages/ui/src/components/ActivityTimeline.tsx +61 -29
- package/packages/ui/src/components/section-cards.tsx +31 -14
- package/packages/ui/src/types.ts +1 -0
- package/skill/SKILL.md +214 -174
- package/skill/Workflows/AlphaUpload.md +45 -0
- package/skill/Workflows/Baseline.md +18 -12
- package/skill/Workflows/Composability.md +3 -3
- package/skill/Workflows/Dashboard.md +39 -91
- package/skill/Workflows/Doctor.md +93 -66
- package/skill/Workflows/Evals.md +49 -40
- package/skill/Workflows/Evolve.md +76 -28
- package/skill/Workflows/EvolveBody.md +37 -38
- package/skill/Workflows/Initialize.md +145 -26
- package/skill/Workflows/Orchestrate.md +11 -2
- package/skill/Workflows/Sync.md +23 -0
- package/skill/Workflows/Watch.md +2 -5
- package/skill/agents/diagnosis-analyst.md +163 -0
- package/skill/agents/evolution-reviewer.md +149 -0
- package/skill/agents/integration-guide.md +154 -0
- package/skill/agents/pattern-analyst.md +149 -0
- package/skill/assets/multi-skill-settings.json +1 -1
- package/skill/assets/single-skill-settings.json +1 -1
- package/skill/references/interactive-config.md +39 -0
- package/skill/references/invocation-taxonomy.md +34 -0
- package/skill/references/logs.md +15 -1
- package/skill/references/setup-patterns.md +3 -3
- package/skill/settings_snippet.json +1 -1
- package/apps/local-dashboard/dist/assets/index-C75H1Q3n.css +0 -1
- package/apps/local-dashboard/dist/assets/index-axE4kz3Q.js +0 -15
- package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +0 -60
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# selftune Dashboard Workflow
|
|
2
2
|
|
|
3
3
|
Visual dashboard for selftune telemetry, skill performance, evolution
|
|
4
|
-
audit, and monitoring data.
|
|
5
|
-
|
|
4
|
+
audit, and monitoring data. Starts a local SPA server with SSE-based
|
|
5
|
+
real-time updates and action buttons.
|
|
6
6
|
|
|
7
7
|
## Default Command
|
|
8
8
|
|
|
@@ -10,58 +10,23 @@ and a live server with polling-based auto-refresh and action buttons.
|
|
|
10
10
|
selftune dashboard
|
|
11
11
|
```
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
Starts a Bun HTTP server with a React SPA dashboard and opens it in the
|
|
14
|
+
default browser. The dashboard reads SQLite directly and uses WAL-based
|
|
15
|
+
invalidation to push live updates via Server-Sent Events (SSE).
|
|
16
|
+
TanStack Query polling (60s) acts as a fallback. Action buttons trigger
|
|
17
|
+
selftune commands directly from the dashboard. Use `selftune export` to
|
|
18
|
+
generate JSONL from SQLite for debugging or offline analysis.
|
|
15
19
|
|
|
16
20
|
## Options
|
|
17
21
|
|
|
18
22
|
| Flag | Description | Default |
|
|
19
23
|
|------|-------------|---------|
|
|
20
|
-
| `--
|
|
21
|
-
| `--
|
|
22
|
-
| `--serve` |
|
|
23
|
-
| `--port <port>` | Custom port for live server (requires `--serve`) | 3141 |
|
|
24
|
+
| `--port <port>` | Custom port for the server | 3141 |
|
|
25
|
+
| `--no-open` | Start server without opening browser | Off |
|
|
26
|
+
| `--serve` | *(Deprecated)* Alias for default behavior | — |
|
|
24
27
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
### Static (Default)
|
|
28
|
-
|
|
29
|
-
Builds an HTML file with all telemetry data embedded as JSON, saves it
|
|
30
|
-
to `~/.selftune/dashboard.html`, and opens it in the default browser.
|
|
31
|
-
The data is a point-in-time snapshot -- refresh by re-running the command.
|
|
32
|
-
|
|
33
|
-
```bash
|
|
34
|
-
selftune dashboard
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
### Export
|
|
38
|
-
|
|
39
|
-
Writes the same data-embedded HTML to stdout. Useful for piping to other
|
|
40
|
-
tools or capturing output programmatically.
|
|
41
|
-
|
|
42
|
-
```bash
|
|
43
|
-
selftune dashboard --export > dashboard.html
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
### File
|
|
47
|
-
|
|
48
|
-
Writes the data-embedded HTML to a specific file path.
|
|
49
|
-
|
|
50
|
-
```bash
|
|
51
|
-
selftune dashboard --out /tmp/report.html
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
### Live Server
|
|
55
|
-
|
|
56
|
-
Starts a Bun HTTP server with a React SPA dashboard. The SPA uses
|
|
57
|
-
TanStack Query polling to auto-refresh data (overview every 15s,
|
|
58
|
-
orchestrate runs every 30s, doctor every 30s) and provides action
|
|
59
|
-
buttons to trigger selftune commands.
|
|
60
|
-
|
|
61
|
-
```bash
|
|
62
|
-
selftune dashboard --serve
|
|
63
|
-
selftune dashboard --serve --port 8080
|
|
64
|
-
```
|
|
28
|
+
Note: `--export` and `--out` were removed. The CLI now exits with an error if either flag is passed,
|
|
29
|
+
suggesting `selftune dashboard` instead.
|
|
65
30
|
|
|
66
31
|
## Live Server
|
|
67
32
|
|
|
@@ -79,23 +44,23 @@ override.
|
|
|
79
44
|
| `GET` | `/api/v2/skills/:name` | SQLite-backed per-skill report |
|
|
80
45
|
| `GET` | `/api/v2/orchestrate-runs` | Recent orchestrate run reports |
|
|
81
46
|
| `GET` | `/api/v2/doctor` | System health diagnostics (config, logs, hooks, evolution) |
|
|
47
|
+
| `GET` | `/api/v2/events` | SSE stream for live dashboard updates |
|
|
82
48
|
| `GET` | `/api/health` | Dashboard server health probe |
|
|
83
49
|
| `POST` | `/api/actions/watch` | Trigger `selftune watch` for a skill |
|
|
84
50
|
| `POST` | `/api/actions/evolve` | Trigger `selftune evolve` for a skill |
|
|
85
51
|
| `POST` | `/api/actions/rollback` | Trigger `selftune evolve rollback` for a skill |
|
|
86
52
|
|
|
87
|
-
###
|
|
53
|
+
### Live Updates (SSE)
|
|
88
54
|
|
|
89
|
-
The dashboard
|
|
90
|
-
the
|
|
55
|
+
The dashboard connects to `/api/v2/events` via Server-Sent Events.
|
|
56
|
+
The server watches the SQLite WAL file for changes and broadcasts an
|
|
57
|
+
`update` event when new data is written. On each event, the SPA invalidates all cached
|
|
58
|
+
queries, triggering immediate refetches (~1s latency).
|
|
91
59
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
- `/api/v2/doctor` — every 30 seconds
|
|
95
|
-
- `/api/v2/skills/:name` — every 30 seconds (when viewing a skill)
|
|
60
|
+
TanStack Query polling (60s) acts as a fallback safety net in case the
|
|
61
|
+
SSE connection drops. Data also refreshes on window focus.
|
|
96
62
|
|
|
97
|
-
|
|
98
|
-
is required.
|
|
63
|
+
See [docs/design-docs/live-dashboard-sse.md](../../docs/design-docs/live-dashboard-sse.md) for the full design.
|
|
99
64
|
|
|
100
65
|
### Action Endpoints
|
|
101
66
|
|
|
@@ -147,45 +112,32 @@ The dashboard displays data from these sources:
|
|
|
147
112
|
|
|
148
113
|
| Data | Source | Description |
|
|
149
114
|
|------|--------|-------------|
|
|
150
|
-
| Telemetry |
|
|
151
|
-
| Skills |
|
|
152
|
-
| Queries |
|
|
153
|
-
| Evolution |
|
|
115
|
+
| Telemetry | SQLite (`~/.selftune/selftune.db`) | Session-level telemetry records |
|
|
116
|
+
| Skills | SQLite (`~/.selftune/selftune.db`) | Skill activation and usage events |
|
|
117
|
+
| Queries | SQLite (`~/.selftune/selftune.db`) | All user queries across sessions |
|
|
118
|
+
| Evolution | SQLite (`~/.selftune/selftune.db`) | Evolution audit trail (create, deploy, rollback) |
|
|
154
119
|
| Decisions | `~/.selftune/memory/` | Evolution decision records |
|
|
155
120
|
| Snapshots | Computed | Per-skill monitoring snapshots (pass rate, regression status) |
|
|
156
121
|
| Unmatched | Computed | Queries that did not trigger any skill |
|
|
157
122
|
| Pending | Computed | Evolution proposals not yet deployed, rejected, or rolled back |
|
|
158
123
|
|
|
159
|
-
If no log data is found, the
|
|
160
|
-
|
|
124
|
+
If no log data is found, the server reports an error listing the
|
|
125
|
+
checked file paths.
|
|
161
126
|
|
|
162
127
|
## Steps
|
|
163
128
|
|
|
164
|
-
### 1.
|
|
165
|
-
|
|
166
|
-
| Goal | Command |
|
|
167
|
-
|------|---------|
|
|
168
|
-
| Quick visual check | `selftune dashboard` |
|
|
169
|
-
| Save report to file | `selftune dashboard --out report.html` |
|
|
170
|
-
| Pipe to another tool | `selftune dashboard --export` |
|
|
171
|
-
| Live monitoring | `selftune dashboard --serve` |
|
|
172
|
-
|
|
173
|
-
### 2. Run Command
|
|
129
|
+
### 1. Run Dashboard
|
|
174
130
|
|
|
175
131
|
```bash
|
|
176
|
-
# Static (opens browser)
|
|
177
132
|
selftune dashboard
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
selftune dashboard --serve
|
|
133
|
+
selftune dashboard --port 8080
|
|
134
|
+
selftune dashboard --no-open
|
|
181
135
|
```
|
|
182
136
|
|
|
183
|
-
###
|
|
137
|
+
### 2. Interact with Dashboard
|
|
184
138
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
Use action buttons to trigger watch, evolve, or rollback directly from
|
|
188
|
-
the dashboard.
|
|
139
|
+
Data refreshes in real time via SSE (~1s latency). Use action buttons
|
|
140
|
+
to trigger watch, evolve, or rollback directly from the dashboard.
|
|
189
141
|
|
|
190
142
|
## Common Patterns
|
|
191
143
|
|
|
@@ -194,12 +146,8 @@ selftune dashboard --serve
|
|
|
194
146
|
> Report to the user that the dashboard is open.
|
|
195
147
|
|
|
196
148
|
**User wants live monitoring**
|
|
197
|
-
> Run `selftune dashboard
|
|
198
|
-
>
|
|
199
|
-
|
|
200
|
-
**User wants a shareable report**
|
|
201
|
-
> Run `selftune dashboard --out report.html`. Report the file path to the
|
|
202
|
-
> user. The HTML file is self-contained with all data embedded.
|
|
149
|
+
> Run `selftune dashboard`. The server provides real-time updates via SSE
|
|
150
|
+
> (~1 second latency).
|
|
203
151
|
|
|
204
152
|
**Dashboard shows no data**
|
|
205
153
|
> Run `selftune doctor` to verify hooks are installed. If hooks are missing,
|
|
@@ -207,8 +155,8 @@ selftune dashboard --serve
|
|
|
207
155
|
> have run, inform the user that sessions must generate telemetry first.
|
|
208
156
|
|
|
209
157
|
**User wants a different port**
|
|
210
|
-
> Run `selftune dashboard --
|
|
158
|
+
> Run `selftune dashboard --port <port>`. The port must be in the range 1–65535.
|
|
211
159
|
|
|
212
160
|
**User wants to trigger actions from the dashboard**
|
|
213
|
-
> Run `selftune dashboard
|
|
214
|
-
>
|
|
161
|
+
> Run `selftune dashboard`. The dashboard provides action buttons for
|
|
162
|
+
> watch, evolve, and rollback per skill via POST endpoints.
|
|
@@ -17,34 +17,57 @@ None. Doctor runs all checks unconditionally.
|
|
|
17
17
|
|
|
18
18
|
```json
|
|
19
19
|
{
|
|
20
|
-
"
|
|
20
|
+
"command": "doctor",
|
|
21
|
+
"timestamp": "2026-02-28T10:00:00Z",
|
|
21
22
|
"checks": [
|
|
22
23
|
{
|
|
23
|
-
"name": "
|
|
24
|
+
"name": "config",
|
|
25
|
+
"path": "/Users/you/.selftune/config.json",
|
|
24
26
|
"status": "pass",
|
|
25
|
-
"
|
|
27
|
+
"message": "Valid config with agent_type and llm_mode"
|
|
26
28
|
},
|
|
27
29
|
{
|
|
28
|
-
"name": "
|
|
30
|
+
"name": "log_session_telemetry",
|
|
31
|
+
"path": "/Users/you/.claude/session_telemetry_log.jsonl",
|
|
29
32
|
"status": "pass",
|
|
30
|
-
"
|
|
33
|
+
"message": "Found 142 entries"
|
|
31
34
|
},
|
|
32
35
|
{
|
|
33
|
-
"name": "
|
|
36
|
+
"name": "hook_settings",
|
|
37
|
+
"path": "/Users/you/.claude/settings.json",
|
|
34
38
|
"status": "fail",
|
|
35
|
-
"
|
|
39
|
+
"message": "PostToolUse hook not found in ~/.claude/settings.json"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"name": "dashboard_freshness_mode",
|
|
43
|
+
"status": "pass",
|
|
44
|
+
"message": "Dashboard reads SQLite and watches WAL for live updates"
|
|
36
45
|
}
|
|
37
46
|
],
|
|
38
47
|
"summary": {
|
|
39
|
-
"
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
|
|
48
|
+
"pass": 9,
|
|
49
|
+
"fail": 1,
|
|
50
|
+
"warn": 0,
|
|
51
|
+
"total": 10
|
|
52
|
+
},
|
|
53
|
+
"healthy": false
|
|
43
54
|
}
|
|
44
55
|
```
|
|
45
56
|
|
|
46
57
|
The process exits with code 0 if `healthy: true`, code 1 otherwise.
|
|
47
58
|
|
|
59
|
+
Failed or warning checks may include a machine-readable `guidance` object:
|
|
60
|
+
|
|
61
|
+
```json
|
|
62
|
+
{
|
|
63
|
+
"code": "config_missing",
|
|
64
|
+
"message": "selftune is not initialized yet.",
|
|
65
|
+
"next_command": "selftune init",
|
|
66
|
+
"suggested_commands": ["selftune doctor"],
|
|
67
|
+
"blocking": true
|
|
68
|
+
}
|
|
69
|
+
```
|
|
70
|
+
|
|
48
71
|
## Parsing Instructions
|
|
49
72
|
|
|
50
73
|
### Check Overall Health
|
|
@@ -57,69 +80,64 @@ The process exits with code 0 if `healthy: true`, code 1 otherwise.
|
|
|
57
80
|
### Find Failed Checks
|
|
58
81
|
|
|
59
82
|
```bash
|
|
60
|
-
# Parse: .checks[] | select(.status == "fail") | { name,
|
|
83
|
+
# Parse: .checks[] | select(.status == "fail") | { name, message }
|
|
61
84
|
```
|
|
62
85
|
|
|
63
86
|
### Get Summary Counts
|
|
64
87
|
|
|
65
88
|
```bash
|
|
66
|
-
# Parse: .summary.
|
|
89
|
+
# Parse: .summary.pass, .summary.fail, .summary.warn, .summary.total
|
|
67
90
|
```
|
|
68
91
|
|
|
69
92
|
## Health Checks
|
|
70
93
|
|
|
71
|
-
Doctor validates these areas
|
|
94
|
+
Doctor validates these baseline areas (10 checks total), and adds alpha cloud-link
|
|
95
|
+
or queue checks when alpha is configured:
|
|
72
96
|
|
|
73
|
-
###
|
|
97
|
+
### Config Check
|
|
74
98
|
|
|
75
|
-
| Check | What it validates |
|
|
76
|
-
|
|
77
|
-
|
|
|
78
|
-
| Logs are parseable | Every line in each log file is valid JSON |
|
|
79
|
-
| Schema conformance | Required fields present per log type (see `references/logs.md`) |
|
|
99
|
+
| Check name | What it validates |
|
|
100
|
+
|------------|-------------------|
|
|
101
|
+
| `config` | `~/.selftune/config.json` exists, is valid JSON, contains `agent_type` and `llm_mode` fields |
|
|
80
102
|
|
|
81
|
-
###
|
|
103
|
+
### Log Checks (4 checks)
|
|
82
104
|
|
|
83
|
-
| Check | What it validates |
|
|
84
|
-
|
|
85
|
-
|
|
|
86
|
-
|
|
|
87
|
-
|
|
|
88
|
-
|
|
|
105
|
+
| Check name | What it validates |
|
|
106
|
+
|------------|-------------------|
|
|
107
|
+
| `log_session_telemetry` | `session_telemetry_log.jsonl` exists and is parseable |
|
|
108
|
+
| `log_skill_usage` | `skill_usage_log.jsonl` exists and is parseable |
|
|
109
|
+
| `log_all_queries` | `all_queries_log.jsonl` exists and is parseable |
|
|
110
|
+
| `log_evolution_audit` | `evolution_audit_log.jsonl` exists and is parseable |
|
|
89
111
|
|
|
90
|
-
###
|
|
112
|
+
### Hook Check
|
|
91
113
|
|
|
92
|
-
| Check | What it validates |
|
|
93
|
-
|
|
94
|
-
|
|
|
95
|
-
| Memory files valid | `context.md`, `decisions.md`, `plan.md` exist and are non-empty (if previously written) |
|
|
114
|
+
| Check name | What it validates |
|
|
115
|
+
|------------|-------------------|
|
|
116
|
+
| `hook_settings` | `~/.claude/settings.json` has selftune hooks configured |
|
|
96
117
|
|
|
97
|
-
###
|
|
118
|
+
### Evolution Check
|
|
98
119
|
|
|
99
|
-
| Check | What it validates |
|
|
100
|
-
|
|
101
|
-
|
|
|
102
|
-
| Rules file valid | The file contains valid JSON conforming to the activation rules schema |
|
|
120
|
+
| Check name | What it validates |
|
|
121
|
+
|------------|-------------------|
|
|
122
|
+
| `evolution_audit` | Evolution audit log entries have valid structure |
|
|
103
123
|
|
|
104
|
-
###
|
|
124
|
+
### Integrity Check
|
|
105
125
|
|
|
106
|
-
| Check | What it validates |
|
|
107
|
-
|
|
108
|
-
|
|
|
109
|
-
| Optional agent files present | If the repo bundles helper agents, the expected files are present |
|
|
126
|
+
| Check name | What it validates |
|
|
127
|
+
|------------|-------------------|
|
|
128
|
+
| `dashboard_freshness_mode` | Warns when the dashboard still relies on legacy JSONL watcher invalidation instead of SQLite WAL live refresh |
|
|
110
129
|
|
|
111
|
-
###
|
|
130
|
+
### Skill Version Sync Check
|
|
112
131
|
|
|
113
|
-
| Check | What it validates |
|
|
114
|
-
|
|
115
|
-
|
|
|
132
|
+
| Check name | What it validates |
|
|
133
|
+
|------------|-------------------|
|
|
134
|
+
| `skill_version_sync` | SKILL.md frontmatter version matches package.json version |
|
|
116
135
|
|
|
117
|
-
###
|
|
136
|
+
### Version Check
|
|
118
137
|
|
|
119
|
-
| Check | What it validates |
|
|
120
|
-
|
|
121
|
-
|
|
|
122
|
-
| Valid action values | All entries use known action types: `created`, `validated`, `deployed`, `rolled_back` |
|
|
138
|
+
| Check name | What it validates |
|
|
139
|
+
|------------|-------------------|
|
|
140
|
+
| `version_up_to_date` | Installed version matches latest on npm registry |
|
|
123
141
|
|
|
124
142
|
## Steps
|
|
125
143
|
|
|
@@ -139,18 +157,13 @@ For each failed check, take the appropriate action:
|
|
|
139
157
|
|
|
140
158
|
| Failed check | Fix |
|
|
141
159
|
|-------------|-----|
|
|
142
|
-
|
|
|
143
|
-
|
|
|
144
|
-
|
|
|
145
|
-
|
|
|
146
|
-
|
|
|
147
|
-
|
|
|
148
|
-
|
|
|
149
|
-
| Memory files invalid | Delete and let the memory writer recreate them on next evolve/watch. |
|
|
150
|
-
| Activation rules missing | Copy `assets/activation-rules-default.json` to `~/.selftune/activation-rules.json`. |
|
|
151
|
-
| Activation rules invalid | Validate JSON syntax. Re-copy from template if corrupted. |
|
|
152
|
-
| Agent files missing | If your repo uses optional helper agents, restore them in `.claude/agents/`. Otherwise ignore this advisory. |
|
|
153
|
-
| Audit log invalid | Remove corrupted entries. Future operations will append clean entries. |
|
|
160
|
+
| `config` | Run `selftune init` (or `selftune init --force` to regenerate). |
|
|
161
|
+
| `log_*` | Run a session to generate initial log entries. Check hook installation with `selftune init`. |
|
|
162
|
+
| `hook_settings` | Run `selftune init` to install hooks into `~/.claude/settings.json`. |
|
|
163
|
+
| `evolution_audit` | Remove corrupted entries. Future operations will append clean entries. |
|
|
164
|
+
| `dashboard_freshness_mode` | This is an operator warning, not a broken install. Expect possible freshness gaps for SQLite-only writes, and export your data (`selftune export`) before attempting any destructive recovery. |
|
|
165
|
+
| `skill_version_sync` | Run `bun run sync-version` to stamp SKILL.md from package.json. |
|
|
166
|
+
| `version_up_to_date` | Run `npm install -g selftune` to update. |
|
|
154
167
|
|
|
155
168
|
### 4. Re-run Doctor
|
|
156
169
|
|
|
@@ -159,14 +172,28 @@ After fixes, run doctor again to verify all checks pass.
|
|
|
159
172
|
## Subagent Escalation
|
|
160
173
|
|
|
161
174
|
If doctor reveals persistent issues with a specific skill — especially
|
|
162
|
-
recurring failures that basic fixes do not resolve —
|
|
163
|
-
`diagnosis-analyst`
|
|
175
|
+
recurring failures that basic fixes do not resolve — read
|
|
176
|
+
`skill/agents/diagnosis-analyst.md` and spawn a subagent with those instructions
|
|
177
|
+
for root cause analysis.
|
|
178
|
+
|
|
179
|
+
### Alpha Upload Not Active
|
|
180
|
+
|
|
181
|
+
**Symptoms:** `selftune status` shows alpha upload as "not enrolled" or "enrolled (missing credential)".
|
|
182
|
+
|
|
183
|
+
**Diagnostic steps:**
|
|
184
|
+
1. Check `selftune status` — look at "Alpha Upload" and "Cloud link" lines
|
|
185
|
+
2. If `doctor` includes a `cloud_link` or alpha queue warning, prefer the command given in that check's `.checks[].guidance.next_command` over the manual steps below
|
|
186
|
+
3. If "not enrolled" or "not linked": run `selftune init --alpha --alpha-email <email>` (opens browser for device-code auth)
|
|
187
|
+
4. If "enrolled (missing credential)": re-run `selftune init --alpha --alpha-email <email> --force` (re-authenticates via browser)
|
|
188
|
+
5. If "api_key has invalid format": re-run `selftune init` with `--alpha --force` to re-authenticate
|
|
189
|
+
|
|
190
|
+
**Resolution:** Follow the setup sequence in the "Alpha Enrollment" section of the Initialize workflow.
|
|
164
191
|
|
|
165
192
|
## Common Patterns
|
|
166
193
|
|
|
167
194
|
**User reports something seems broken**
|
|
168
195
|
> Run `selftune doctor`. Parse the JSON output for failed checks. Report
|
|
169
|
-
> each failure's `name` and `
|
|
196
|
+
> each failure's `name` and `message` to the user with the recommended fix.
|
|
170
197
|
|
|
171
198
|
**User asks if hooks are working**
|
|
172
199
|
> Run `selftune doctor`. Parse `.checks[]` for hook-related entries. If
|
package/skill/Workflows/Evals.md
CHANGED
|
@@ -26,9 +26,14 @@ selftune eval generate --skill <name> [options]
|
|
|
26
26
|
| `--skill <name>` | Skill to generate evals for | Required (unless `--list-skills`) |
|
|
27
27
|
| `--list-skills` | List all logged skills with query counts | Off |
|
|
28
28
|
| `--stats` | Show aggregate telemetry stats for the skill | Off |
|
|
29
|
-
| `--max <n>` | Maximum eval entries
|
|
30
|
-
| `--seed <n>` |
|
|
31
|
-
| `--out <path>` | Output file path | `
|
|
29
|
+
| `--max <n>` | Maximum eval entries per side | 50 |
|
|
30
|
+
| `--seed <n>` | Seed for deterministic shuffling | 42 |
|
|
31
|
+
| `--output <path>` / `--out <path>` | Output file path | `{skillName}_trigger_eval.json` |
|
|
32
|
+
| `--no-negatives` | Exclude negative examples from output | Off |
|
|
33
|
+
| `--no-taxonomy` | Skip invocation_type classification | Off |
|
|
34
|
+
| `--skill-log <path>` | Path to skill_usage_log.jsonl | Default log path |
|
|
35
|
+
| `--query-log <path>` | Path to all_queries_log.jsonl | Default log path |
|
|
36
|
+
| `--telemetry-log <path>` | Path to session_telemetry_log.jsonl | Default log path |
|
|
32
37
|
| `--synthetic` | Generate evals from SKILL.md via LLM (no logs needed) | Off |
|
|
33
38
|
| `--skill-path <path>` | Path to SKILL.md (required with `--synthetic`) | — |
|
|
34
39
|
| `--model <model>` | LLM model to use for synthetic generation | Agent default |
|
|
@@ -40,24 +45,20 @@ selftune eval generate --skill <name> [options]
|
|
|
40
45
|
```json
|
|
41
46
|
[
|
|
42
47
|
{
|
|
43
|
-
"id": 1,
|
|
44
48
|
"query": "Make me a slide deck for the Q3 board meeting",
|
|
45
|
-
"
|
|
46
|
-
"invocation_type": "contextual"
|
|
47
|
-
"skill_name": "pptx",
|
|
48
|
-
"source_session": "abc123"
|
|
49
|
+
"should_trigger": true,
|
|
50
|
+
"invocation_type": "contextual"
|
|
49
51
|
},
|
|
50
52
|
{
|
|
51
|
-
"id": 2,
|
|
52
53
|
"query": "What format should I use for a presentation?",
|
|
53
|
-
"
|
|
54
|
-
"invocation_type": "negative",
|
|
55
|
-
"skill_name": "pptx",
|
|
56
|
-
"source_session": null
|
|
54
|
+
"should_trigger": false
|
|
57
55
|
}
|
|
58
56
|
]
|
|
59
57
|
```
|
|
60
58
|
|
|
59
|
+
Each entry has `query` (string, max 500 chars), `should_trigger` (boolean),
|
|
60
|
+
and optional `invocation_type` (omitted when `--no-taxonomy` is set).
|
|
61
|
+
|
|
61
62
|
### List Skills
|
|
62
63
|
|
|
63
64
|
```json
|
|
@@ -93,14 +94,14 @@ selftune eval generate --skill <name> [options]
|
|
|
93
94
|
### Find Missed Queries (False Negatives)
|
|
94
95
|
|
|
95
96
|
```bash
|
|
96
|
-
# Parse: .[] | select(.
|
|
97
|
+
# Parse: .[] | select(.should_trigger == true and .invocation_type != "explicit")
|
|
97
98
|
# These are queries that should trigger but might be missed
|
|
98
99
|
```
|
|
99
100
|
|
|
100
101
|
### Get Negative Examples
|
|
101
102
|
|
|
102
103
|
```bash
|
|
103
|
-
# Parse: .[] | select(.
|
|
104
|
+
# Parse: .[] | select(.should_trigger == false)
|
|
104
105
|
```
|
|
105
106
|
|
|
106
107
|
## Sub-Workflows
|
|
@@ -126,10 +127,16 @@ selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/ppt
|
|
|
126
127
|
|
|
127
128
|
The command:
|
|
128
129
|
1. Reads the SKILL.md file content
|
|
129
|
-
2.
|
|
130
|
-
3.
|
|
131
|
-
4.
|
|
132
|
-
5.
|
|
130
|
+
2. Loads real user queries from the database (if available) as few-shot-style examples so that synthetic queries match real phrasing patterns
|
|
131
|
+
3. Sends skill content and real examples to an LLM with a prompt requesting realistic test queries
|
|
132
|
+
4. Parses the response into eval entries with invocation type annotations
|
|
133
|
+
5. Classifies each positive query using the deterministic `classifyInvocation()` heuristic
|
|
134
|
+
6. Writes the eval set to the output file
|
|
135
|
+
|
|
136
|
+
**Note:** When real query data exists in the database, synthetic generation
|
|
137
|
+
automatically includes high-confidence positive triggers and general queries as
|
|
138
|
+
phrasing references. This produces more natural-sounding eval queries. If no
|
|
139
|
+
database is available, generation proceeds without real examples (fail-open).
|
|
133
140
|
|
|
134
141
|
Use `--model` to override the default LLM model:
|
|
135
142
|
|
|
@@ -144,7 +151,7 @@ Cross-reference `skill_usage_log.jsonl` (positive triggers) against
|
|
|
144
151
|
an eval set annotated with invocation types.
|
|
145
152
|
|
|
146
153
|
```bash
|
|
147
|
-
selftune eval generate --skill pptx --max 50 --
|
|
154
|
+
selftune eval generate --skill pptx --max 50 --output evals-pptx.json
|
|
148
155
|
```
|
|
149
156
|
|
|
150
157
|
The command:
|
|
@@ -168,32 +175,34 @@ selftune eval generate --skill pptx --stats
|
|
|
168
175
|
|
|
169
176
|
### 0. Pre-Flight Configuration
|
|
170
177
|
|
|
171
|
-
Before generating evals,
|
|
178
|
+
Before generating evals, use the `AskUserQuestion` tool to present structured configuration options.
|
|
172
179
|
|
|
173
|
-
If the user responds with "use defaults"
|
|
180
|
+
If the user responds with "use defaults" or similar shorthand, skip to step 1 using the recommended defaults. If the user cancels, stop — do not proceed with defaults.
|
|
174
181
|
|
|
175
182
|
For `--list-skills` or `--stats` requests, skip pre-flight entirely — these are read-only operations.
|
|
176
183
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
1. **Generation Mode**
|
|
180
|
-
- a) Log-based — build evals from real usage logs (recommended if logs exist)
|
|
181
|
-
- b) Synthetic — generate evals from SKILL.md via LLM (for new skills with no data)
|
|
182
|
-
|
|
183
|
-
2. **Skill Path** (synthetic mode only)
|
|
184
|
-
- Provide absolute or relative path to the target SKILL.md
|
|
185
|
-
- Example: `./skills/pptx/SKILL.md`
|
|
186
|
-
|
|
187
|
-
3. **Max Entries:** 50 (default — how many eval entries to generate)
|
|
184
|
+
Use `AskUserQuestion` with these questions:
|
|
188
185
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
186
|
+
```json
|
|
187
|
+
{
|
|
188
|
+
"questions": [
|
|
189
|
+
{
|
|
190
|
+
"question": "Generation Mode",
|
|
191
|
+
"options": ["Log-based — build from real usage logs (recommended if logs exist)", "Synthetic — generate from SKILL.md via LLM (for new skills)"]
|
|
192
|
+
},
|
|
193
|
+
{
|
|
194
|
+
"question": "Model (for synthetic mode)",
|
|
195
|
+
"options": ["Fast (haiku) — quick generation", "Balanced (sonnet) — better diversity (recommended)", "Best (opus) — highest quality"]
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"question": "Max Entries",
|
|
199
|
+
"options": ["50 (default)", "25 (quick)", "100 (comprehensive)"]
|
|
200
|
+
}
|
|
201
|
+
]
|
|
202
|
+
}
|
|
203
|
+
```
|
|
195
204
|
|
|
196
|
-
|
|
205
|
+
If `AskUserQuestion` is not available, fall back to presenting these as inline numbered options.
|
|
197
206
|
|
|
198
207
|
After the user responds, parse their selections and map each choice to the corresponding CLI flags:
|
|
199
208
|
|