selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -1,19 +1,69 @@
|
|
|
1
1
|
# selftune Ingest Workflow
|
|
2
2
|
|
|
3
|
-
Import sessions from
|
|
4
|
-
|
|
5
|
-
`ingest-
|
|
3
|
+
Import sessions from agent platforms into the shared selftune log format.
|
|
4
|
+
Covers five sub-commands: `replay`, `ingest-codex`, `ingest-opencode`,
|
|
5
|
+
`ingest-openclaw`, and `wrap-codex`.
|
|
6
6
|
|
|
7
7
|
## When to Use Each
|
|
8
8
|
|
|
9
9
|
| Sub-command | Platform | Mode | When |
|
|
10
10
|
|-------------|----------|------|------|
|
|
11
|
+
| `replay` | Claude Code | Batch | Backfill logs from existing Claude Code transcripts |
|
|
11
12
|
| `ingest-codex` | Codex | Batch | Import existing Codex rollout logs |
|
|
12
13
|
| `ingest-opencode` | OpenCode | Batch | Import existing OpenCode sessions |
|
|
14
|
+
| `ingest-openclaw` | OpenClaw | Batch | Import existing OpenClaw agent sessions |
|
|
13
15
|
| `wrap-codex` | Codex | Real-time | Wrap `codex exec` to capture telemetry live |
|
|
14
16
|
|
|
15
17
|
---
|
|
16
18
|
|
|
19
|
+
## replay
|
|
20
|
+
|
|
21
|
+
Batch ingest existing Claude Code session transcripts into the shared JSONL schema.
|
|
22
|
+
|
|
23
|
+
### Default Command
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
selftune replay
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Options
|
|
30
|
+
|
|
31
|
+
| Flag | Description |
|
|
32
|
+
|------|-------------|
|
|
33
|
+
| `--since <date>` | Only ingest sessions modified after this date (e.g., `2026-01-01`) |
|
|
34
|
+
| `--dry-run` | Show what would be ingested without writing to logs |
|
|
35
|
+
| `--force` | Re-ingest all sessions, ignoring the marker file |
|
|
36
|
+
| `--verbose` | Show per-file progress during ingestion |
|
|
37
|
+
| `--projects-dir <path>` | Override default `~/.claude/projects/` directory |
|
|
38
|
+
|
|
39
|
+
### Source
|
|
40
|
+
|
|
41
|
+
Reads from `~/.claude/projects/<hash>/<session-id>.jsonl`. These are the
|
|
42
|
+
transcript files Claude Code automatically saves for every session.
|
|
43
|
+
|
|
44
|
+
### Output
|
|
45
|
+
|
|
46
|
+
Writes to:
|
|
47
|
+
- `~/.claude/all_queries_log.jsonl` -- extracted user queries (one per query, not just last)
|
|
48
|
+
- `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "claude_code_replay"`
|
|
49
|
+
- `~/.claude/skill_usage_log.jsonl` -- skill triggers with `source: "claude_code_replay"`
|
|
50
|
+
|
|
51
|
+
### Steps
|
|
52
|
+
|
|
53
|
+
1. Run `selftune replay --dry-run` to preview what would be ingested
|
|
54
|
+
2. Run `selftune replay` to ingest all sessions
|
|
55
|
+
3. Run `selftune doctor` to confirm logs are healthy
|
|
56
|
+
4. Run `selftune evals --list-skills` to see if the ingested sessions appear
|
|
57
|
+
|
|
58
|
+
### Notes
|
|
59
|
+
|
|
60
|
+
- Idempotent: uses a marker file (`~/.claude/claude_code_ingested_sessions.json`) to track
|
|
61
|
+
which transcripts have already been ingested. Safe to run repeatedly.
|
|
62
|
+
- Extracts ALL user queries per session, not just the last one.
|
|
63
|
+
- Filters out system messages, short queries (<4 chars), and queries matching `SKIP_PREFIXES`.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
17
67
|
## ingest-codex
|
|
18
68
|
|
|
19
69
|
Batch ingest Codex rollout logs into the shared JSONL schema.
|
|
@@ -84,6 +134,58 @@ Writes to:
|
|
|
84
134
|
|
|
85
135
|
---
|
|
86
136
|
|
|
137
|
+
## ingest-openclaw
|
|
138
|
+
|
|
139
|
+
Batch ingest OpenClaw agent session histories into the shared JSONL schema.
|
|
140
|
+
Supports multiple agents and auto-discovers session files across all agent directories.
|
|
141
|
+
|
|
142
|
+
### Default Command
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
selftune ingest-openclaw
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Options
|
|
149
|
+
|
|
150
|
+
| Flag | Description |
|
|
151
|
+
|------|-------------|
|
|
152
|
+
| `--agents-dir <path>` | Override default `~/.openclaw/agents/` directory |
|
|
153
|
+
| `--since <date>` | Only ingest sessions modified after this date (e.g., `2026-01-01`) |
|
|
154
|
+
| `--dry-run` | Show what would be ingested without writing to logs |
|
|
155
|
+
| `--force` | Re-ingest all sessions, ignoring the marker file |
|
|
156
|
+
| `--verbose` / `-v` | Show per-session progress during ingestion |
|
|
157
|
+
|
|
158
|
+
### Source
|
|
159
|
+
|
|
160
|
+
Reads from `~/.openclaw/agents/<agentId>/sessions/*.jsonl`. Each JSONL file contains:
|
|
161
|
+
- Line 1 (session header): `{"type":"session","version":5,"id":"<uuid>","timestamp":"<iso>","cwd":"<path>"}`
|
|
162
|
+
- Line 2+ (messages): `{"role":"user|assistant|toolResult","content":[...],"timestamp":<ms>}`
|
|
163
|
+
|
|
164
|
+
### Output
|
|
165
|
+
|
|
166
|
+
Writes to:
|
|
167
|
+
- `~/.claude/all_queries_log.jsonl` -- extracted user queries
|
|
168
|
+
- `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "openclaw"`
|
|
169
|
+
- `~/.claude/skill_usage_log.jsonl` -- skill triggers with `source: "openclaw"`
|
|
170
|
+
|
|
171
|
+
### Steps
|
|
172
|
+
|
|
173
|
+
1. Run `selftune ingest-openclaw --dry-run` to preview what would be ingested
|
|
174
|
+
2. Run `selftune ingest-openclaw` to ingest all sessions
|
|
175
|
+
3. Run `selftune doctor` to confirm logs are healthy
|
|
176
|
+
4. Run `selftune evals --list-skills` to see if the ingested sessions appear
|
|
177
|
+
|
|
178
|
+
### Notes
|
|
179
|
+
|
|
180
|
+
- Idempotent: uses a marker file to track which sessions have already been ingested.
|
|
181
|
+
Safe to run repeatedly. Use `--force` to re-ingest everything.
|
|
182
|
+
- Skill detection heuristic: identifies skills by checking for `SKILL.md` file reads in
|
|
183
|
+
tool calls and by matching known skill names in assistant text content.
|
|
184
|
+
- Multi-agent support: scans all agent directories under the agents root, ingesting
|
|
185
|
+
sessions from every agent found.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
87
189
|
## wrap-codex
|
|
88
190
|
|
|
89
191
|
Wrap `codex exec` with real-time telemetry capture. Drop-in replacement
|
|
@@ -123,12 +225,24 @@ stream for telemetry; it does not modify Codex behavior.
|
|
|
123
225
|
|
|
124
226
|
## Common Patterns
|
|
125
227
|
|
|
228
|
+
**"Backfill Claude Code sessions"**
|
|
229
|
+
> Run `selftune replay`. No options needed. Reads from `~/.claude/projects/`.
|
|
230
|
+
|
|
231
|
+
**"Replay only recent Claude Code sessions"**
|
|
232
|
+
> Run `selftune replay --since 2026-02-01` with an appropriate date.
|
|
233
|
+
|
|
126
234
|
**"Ingest codex logs"**
|
|
127
235
|
> Run `selftune ingest-codex`. No options needed. Reads from `$CODEX_HOME/sessions/`.
|
|
128
236
|
|
|
129
237
|
**"Import opencode sessions"**
|
|
130
238
|
> Run `selftune ingest-opencode`. Reads from the SQLite database automatically.
|
|
131
239
|
|
|
240
|
+
**"Ingest OpenClaw sessions"**
|
|
241
|
+
> Run `selftune ingest-openclaw`. Reads from `~/.openclaw/agents/` automatically.
|
|
242
|
+
|
|
243
|
+
**"Import only recent OpenClaw sessions"**
|
|
244
|
+
> Run `selftune ingest-openclaw --since 2026-02-01` with an appropriate date.
|
|
245
|
+
|
|
132
246
|
**"Run codex through selftune"**
|
|
133
247
|
> Use `selftune wrap-codex -- <codex args>` instead of `codex exec <args>` directly.
|
|
134
248
|
|
|
@@ -69,7 +69,7 @@ cat ~/.selftune/config.json 2>/dev/null
|
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
If the file exists and is valid JSON, selftune is already initialized.
|
|
72
|
-
Skip to Step 5 (verify with doctor) unless the user wants to reinitialize.
|
|
72
|
+
Skip to Step 8 (verify with doctor) unless the user wants to reinitialize.
|
|
73
73
|
|
|
74
74
|
### 3. Run Init
|
|
75
75
|
|
|
@@ -80,12 +80,15 @@ selftune init
|
|
|
80
80
|
### 4. Install Hooks (Claude Code)
|
|
81
81
|
|
|
82
82
|
If `init` reports hooks are not installed, merge the entries from
|
|
83
|
-
`skill/settings_snippet.json` into `~/.claude/settings.json`.
|
|
83
|
+
`skill/settings_snippet.json` into `~/.claude/settings.json`. Six hooks
|
|
84
84
|
are required:
|
|
85
85
|
|
|
86
86
|
| Hook | Script | Purpose |
|
|
87
87
|
|------|--------|---------|
|
|
88
88
|
| `UserPromptSubmit` | `hooks/prompt-log.ts` | Log every user query |
|
|
89
|
+
| `UserPromptSubmit` | `hooks/auto-activate.ts` | Suggest skills before prompt processing |
|
|
90
|
+
| `PreToolUse` (Write/Edit) | `hooks/skill-change-guard.ts` | Detect uncontrolled skill edits |
|
|
91
|
+
| `PreToolUse` (Write/Edit) | `hooks/evolution-guard.ts` | Block SKILL.md edits on monitored skills |
|
|
89
92
|
| `PostToolUse` (Read) | `hooks/skill-eval.ts` | Track skill triggers |
|
|
90
93
|
| `Stop` | `hooks/session-stop.ts` | Capture session telemetry |
|
|
91
94
|
|
|
@@ -100,7 +103,48 @@ The hooks directory is at `dirname(cli_path)/hooks/`.
|
|
|
100
103
|
- Use `selftune ingest-opencode` to import sessions from the SQLite database
|
|
101
104
|
- See `Workflows/Ingest.md` for details
|
|
102
105
|
|
|
103
|
-
### 5. Verify with Doctor
|
|
106
|
+
### 5. Initialize Memory Directory
|
|
107
|
+
|
|
108
|
+
Create the memory directory if it does not exist:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
mkdir -p ~/.selftune/memory
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
The memory system stores three files at `~/.selftune/memory/`:
|
|
115
|
+
- `context.md` -- active evolution state and session context
|
|
116
|
+
- `decisions.md` -- evolution decisions and rollback history
|
|
117
|
+
- `plan.md` -- current priorities and evolution strategy
|
|
118
|
+
|
|
119
|
+
These files are created automatically by the memory writer during evolve,
|
|
120
|
+
watch, and rollback workflows. The directory just needs to exist.
|
|
121
|
+
|
|
122
|
+
### 6. Set Up Activation Rules
|
|
123
|
+
|
|
124
|
+
Copy the default activation rules template:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
cp templates/activation-rules-default.json ~/.selftune/activation-rules.json
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The activation rules file configures auto-activation behavior -- which skills
|
|
131
|
+
get suggested and under what conditions. Edit `~/.selftune/activation-rules.json`
|
|
132
|
+
to customize thresholds and skill mappings for your project.
|
|
133
|
+
|
|
134
|
+
### 7. Verify Agent Availability
|
|
135
|
+
|
|
136
|
+
Check that the specialized agent files are present:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
ls .claude/agents/
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Expected agents: `diagnosis-analyst.md`, `pattern-analyst.md`,
|
|
143
|
+
`evolution-reviewer.md`, `integration-guide.md`. These are used by evolve
|
|
144
|
+
and doctor workflows for deeper analysis. If missing, copy them from the
|
|
145
|
+
selftune repository's `.claude/agents/` directory.
|
|
146
|
+
|
|
147
|
+
### 8. Verify with Doctor
|
|
104
148
|
|
|
105
149
|
```bash
|
|
106
150
|
selftune doctor
|
|
@@ -109,6 +153,16 @@ selftune doctor
|
|
|
109
153
|
Parse the JSON output. All checks should pass. If any fail, address the
|
|
110
154
|
reported issues before proceeding.
|
|
111
155
|
|
|
156
|
+
## Integration Guide
|
|
157
|
+
|
|
158
|
+
For project-type-specific setup (single-skill, multi-skill, monorepo, Codex,
|
|
159
|
+
OpenCode, mixed agents), see [docs/integration-guide.md](../../docs/integration-guide.md).
|
|
160
|
+
|
|
161
|
+
Templates for each project type are in the `templates/` directory:
|
|
162
|
+
- `templates/single-skill-settings.json` — hooks for single-skill projects
|
|
163
|
+
- `templates/multi-skill-settings.json` — hooks for multi-skill projects with activation rules
|
|
164
|
+
- `templates/activation-rules-default.json` — default auto-activation rule configuration
|
|
165
|
+
|
|
112
166
|
## Common Patterns
|
|
113
167
|
|
|
114
168
|
**"Initialize selftune"**
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# selftune Replay Workflow
|
|
2
|
+
|
|
3
|
+
Backfill the shared JSONL logs from existing Claude Code conversation
|
|
4
|
+
transcripts. Useful for bootstrapping selftune with historical session data.
|
|
5
|
+
|
|
6
|
+
## When to Use
|
|
7
|
+
|
|
8
|
+
- New selftune installation with months of Claude Code history
|
|
9
|
+
- After re-initializing logs and wanting to recover data
|
|
10
|
+
- To populate eval data without waiting for new sessions
|
|
11
|
+
|
|
12
|
+
## Key Difference from Hooks
|
|
13
|
+
|
|
14
|
+
Real-time hooks capture only the **last** user query per session. Replay
|
|
15
|
+
extracts **all** user queries, writing one `QueryLogRecord` per message.
|
|
16
|
+
This produces much richer eval data from historical sessions.
|
|
17
|
+
|
|
18
|
+
## Default Command
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
selftune replay
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Options
|
|
25
|
+
|
|
26
|
+
| Flag | Description |
|
|
27
|
+
|------|-------------|
|
|
28
|
+
| `--since <date>` | Only include transcripts modified after this date |
|
|
29
|
+
| `--dry-run` | Preview what would be ingested without writing |
|
|
30
|
+
| `--force` | Re-ingest all transcripts (ignore marker file) |
|
|
31
|
+
| `--verbose` | Show detailed progress per file |
|
|
32
|
+
| `--projects-dir <path>` | Override default `~/.claude/projects/` path |
|
|
33
|
+
|
|
34
|
+
## Source
|
|
35
|
+
|
|
36
|
+
Reads Claude Code transcripts from `~/.claude/projects/<hash>/<session>.jsonl`.
|
|
37
|
+
Each transcript is a JSONL file containing user and assistant messages.
|
|
38
|
+
|
|
39
|
+
## Output
|
|
40
|
+
|
|
41
|
+
Writes to:
|
|
42
|
+
- `~/.claude/all_queries_log.jsonl` -- one record per user query (all messages, not just last)
|
|
43
|
+
- `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "claude_code_replay"`
|
|
44
|
+
- `~/.claude/skill_usage_log.jsonl` -- skill triggers detected in transcripts
|
|
45
|
+
|
|
46
|
+
## Idempotency
|
|
47
|
+
|
|
48
|
+
Uses a marker file at `~/.claude/claude_code_ingested_sessions.json` to track
|
|
49
|
+
which transcripts have already been ingested. Use `--force` to re-ingest all.
|
|
50
|
+
|
|
51
|
+
## Steps
|
|
52
|
+
|
|
53
|
+
1. Run `selftune replay --dry-run` to preview what would be ingested
|
|
54
|
+
2. Run `selftune replay` to perform the ingestion
|
|
55
|
+
3. Run `selftune doctor` to verify logs are healthy
|
|
56
|
+
4. Run `selftune evals --list-skills` to see if replayed sessions appear
|
|
57
|
+
|
|
58
|
+
## Common Patterns
|
|
59
|
+
|
|
60
|
+
**"Backfill my logs"**
|
|
61
|
+
> Run `selftune replay`. No options needed.
|
|
62
|
+
|
|
63
|
+
**"Only replay recent sessions"**
|
|
64
|
+
> Run `selftune replay --since 2026-02-01`
|
|
65
|
+
|
|
66
|
+
**"Re-ingest everything"**
|
|
67
|
+
> Run `selftune replay --force`
|
|
68
|
+
|
|
69
|
+
**"How do I know it worked?"**
|
|
70
|
+
> Run `selftune doctor` after replay, then check that the log file line counts have increased.
|
|
@@ -75,6 +75,16 @@ Manual restoration from version control is required.
|
|
|
75
75
|
|
|
76
76
|
## Steps
|
|
77
77
|
|
|
78
|
+
### 0. Read Evolution Context
|
|
79
|
+
|
|
80
|
+
Before starting, read `~/.selftune/memory/context.md` for session context:
|
|
81
|
+
- Active evolutions and their current status
|
|
82
|
+
- Previous rollback history
|
|
83
|
+
- Last update timestamp
|
|
84
|
+
|
|
85
|
+
This provides continuity across context resets. If the file doesn't exist,
|
|
86
|
+
proceed normally — it will be created after the first rollback.
|
|
87
|
+
|
|
78
88
|
### 1. Find the Last Evolution
|
|
79
89
|
|
|
80
90
|
Read `~/.claude/evolution_audit_log.jsonl` and find the most recent
|
|
@@ -101,7 +111,16 @@ After rollback, verify the SKILL.md content is restored:
|
|
|
101
111
|
- Check the audit log for the `rolled_back` entry
|
|
102
112
|
- Optionally re-run evals to confirm the original pass rate
|
|
103
113
|
|
|
104
|
-
### 4. Post-Rollback Audit
|
|
114
|
+
### 4. Update Memory
|
|
115
|
+
|
|
116
|
+
After rollback completes, the memory writer updates:
|
|
117
|
+
- `~/.selftune/memory/decisions.md` -- records the rollback decision and reason
|
|
118
|
+
- `~/.selftune/memory/context.md` -- clears the active evolution state and notes the rollback
|
|
119
|
+
|
|
120
|
+
This ensures future evolve and watch workflows have context about why the
|
|
121
|
+
rollback occurred, even across context window resets.
|
|
122
|
+
|
|
123
|
+
### 5. Post-Rollback Audit
|
|
105
124
|
|
|
106
125
|
The rollback is logged. Future `evolve` runs will see the rollback in the
|
|
107
126
|
audit trail and can use it to avoid repeating failed evolution patterns.
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# selftune Unit Test Workflow
|
|
2
|
+
|
|
3
|
+
Run or generate unit tests for individual skills. Tests verify trigger
|
|
4
|
+
accuracy, output content, and tool usage with deterministic assertions.
|
|
5
|
+
|
|
6
|
+
## Default Command
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
selftune unit-test --skill <name> [--tests <path>] [options]
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Options
|
|
13
|
+
|
|
14
|
+
| Flag | Description | Default |
|
|
15
|
+
|------|-------------|---------|
|
|
16
|
+
| `--skill <name>` | Skill name | Required |
|
|
17
|
+
| `--tests <path>` | Path to unit test JSON file | `~/.selftune/unit-tests/<skill>.json` |
|
|
18
|
+
| `--run-agent` | Run agent-based assertions (not just trigger checks) | Off |
|
|
19
|
+
| `--generate` | Generate tests from skill content instead of running | Off |
|
|
20
|
+
| `--skill-path <path>` | Path to SKILL.md (required for `--generate`) | None |
|
|
21
|
+
| `--eval-set <path>` | Eval set for failure context (used with `--generate`) | None |
|
|
22
|
+
| `--model <flag>` | Model flag for LLM calls | Agent default |
|
|
23
|
+
|
|
24
|
+
## Test Format
|
|
25
|
+
|
|
26
|
+
Tests are stored as JSON arrays in `~/.selftune/unit-tests/<skill>.json`:
|
|
27
|
+
|
|
28
|
+
```json
|
|
29
|
+
[
|
|
30
|
+
{
|
|
31
|
+
"test_id": "research-trigger-1",
|
|
32
|
+
"skill_name": "Research",
|
|
33
|
+
"description": "Should trigger on explicit research request",
|
|
34
|
+
"query": "Research the latest trends in AI safety",
|
|
35
|
+
"expected_trigger": true,
|
|
36
|
+
"assertions": [
|
|
37
|
+
{
|
|
38
|
+
"type": "trigger_check",
|
|
39
|
+
"value": "true",
|
|
40
|
+
"description": "Skill should trigger for this query"
|
|
41
|
+
}
|
|
42
|
+
],
|
|
43
|
+
"tags": ["explicit", "core"],
|
|
44
|
+
"source": "manual"
|
|
45
|
+
}
|
|
46
|
+
]
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Assertion Types
|
|
50
|
+
|
|
51
|
+
| Type | What it checks | Requires agent? |
|
|
52
|
+
|------|---------------|-----------------|
|
|
53
|
+
| `trigger_check` | Query triggers the skill description | No (LLM only) |
|
|
54
|
+
| `output_contains` | Agent output contains expected text | Yes |
|
|
55
|
+
| `output_matches_regex` | Agent output matches regex pattern | Yes |
|
|
56
|
+
| `tool_called` | Agent used a specific tool | Yes |
|
|
57
|
+
|
|
58
|
+
Trigger check assertions are cheap (single LLM call). Agent-based assertions
|
|
59
|
+
require `--run-agent` and run the query through the full agent.
|
|
60
|
+
|
|
61
|
+
## Output Format
|
|
62
|
+
|
|
63
|
+
```json
|
|
64
|
+
{
|
|
65
|
+
"skill_name": "Research",
|
|
66
|
+
"total": 10,
|
|
67
|
+
"passed": 8,
|
|
68
|
+
"failed": 2,
|
|
69
|
+
"pass_rate": 0.80,
|
|
70
|
+
"results": [
|
|
71
|
+
{
|
|
72
|
+
"test_id": "research-trigger-1",
|
|
73
|
+
"overall_passed": true,
|
|
74
|
+
"trigger_passed": true,
|
|
75
|
+
"assertion_results": [
|
|
76
|
+
{ "type": "trigger_check", "value": "true", "passed": true, "evidence": "LLM responded YES" }
|
|
77
|
+
],
|
|
78
|
+
"duration_ms": 450
|
|
79
|
+
}
|
|
80
|
+
],
|
|
81
|
+
"ran_at": "2026-03-04T12:00:00.000Z"
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Steps
|
|
86
|
+
|
|
87
|
+
### 1. Generate Tests (First Time)
|
|
88
|
+
|
|
89
|
+
For a new skill, generate initial tests from the skill content:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
selftune unit-test --skill Research --generate --skill-path ~/.claude/skills/Research/SKILL.md
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
This uses an LLM to create test cases covering:
|
|
96
|
+
- Explicit trigger queries
|
|
97
|
+
- Implicit trigger queries
|
|
98
|
+
- Contextual trigger queries
|
|
99
|
+
- Negative examples (should NOT trigger)
|
|
100
|
+
|
|
101
|
+
Tests are saved to `~/.selftune/unit-tests/Research.json`.
|
|
102
|
+
|
|
103
|
+
### 2. Run Tests
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
selftune unit-test --skill Research --tests ~/.selftune/unit-tests/Research.json
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
By default, only `trigger_check` assertions run (fast, no agent needed).
|
|
110
|
+
Add `--run-agent` for full agent-based assertions.
|
|
111
|
+
|
|
112
|
+
### 3. Review Results
|
|
113
|
+
|
|
114
|
+
Check `pass_rate` and investigate failures:
|
|
115
|
+
- Failed trigger checks → description needs improvement
|
|
116
|
+
- Failed output assertions → skill workflow needs fixes
|
|
117
|
+
- Failed tool assertions → skill routing is broken
|
|
118
|
+
|
|
119
|
+
### 4. Iterate
|
|
120
|
+
|
|
121
|
+
After evolving a skill, re-run unit tests to verify improvements:
|
|
122
|
+
1. Evolve: `selftune evolve --skill Research --skill-path /path/SKILL.md`
|
|
123
|
+
2. Test: `selftune unit-test --skill Research`
|
|
124
|
+
3. Check that the pass rate improved
|
|
125
|
+
|
|
126
|
+
## Common Patterns
|
|
127
|
+
|
|
128
|
+
**"Generate tests for the pptx skill"**
|
|
129
|
+
> `selftune unit-test --skill pptx --generate --skill-path /path/SKILL.md`
|
|
130
|
+
|
|
131
|
+
**"Run existing tests"**
|
|
132
|
+
> `selftune unit-test --skill pptx --tests ~/.selftune/unit-tests/pptx.json`
|
|
133
|
+
|
|
134
|
+
**"Run full agent tests"**
|
|
135
|
+
> `selftune unit-test --skill pptx --tests /path/tests.json --run-agent`
|
|
136
|
+
|
|
137
|
+
**"Test after evolution"**
|
|
138
|
+
> Run `selftune unit-test` after each `selftune evolve` to verify improvements.
|
package/skill/Workflows/Watch.md
CHANGED
|
@@ -65,6 +65,21 @@ selftune watch --skill <name> --skill-path <path> [options]
|
|
|
65
65
|
|
|
66
66
|
## Steps
|
|
67
67
|
|
|
68
|
+
### 0. Read Evolution Context
|
|
69
|
+
|
|
70
|
+
Before starting, read `~/.selftune/memory/context.md` for session context:
|
|
71
|
+
- Active evolutions and their current status
|
|
72
|
+
- Known issues and regression history
|
|
73
|
+
- Last update timestamp
|
|
74
|
+
|
|
75
|
+
This provides continuity across context resets. If the file doesn't exist,
|
|
76
|
+
proceed normally -- it will be created after the first watch.
|
|
77
|
+
|
|
78
|
+
The evolution-guard hook prevents conflicting SKILL.md edits while watch is
|
|
79
|
+
evaluating the skill. The auto-activation system uses watch results to
|
|
80
|
+
adjust suggestion confidence -- skills showing regressions get flagged for
|
|
81
|
+
attention in subsequent prompts.
|
|
82
|
+
|
|
68
83
|
### 1. Run Watch
|
|
69
84
|
|
|
70
85
|
```bash
|
|
@@ -100,6 +115,13 @@ Summarize the snapshot for the user:
|
|
|
100
115
|
- Whether regression was detected
|
|
101
116
|
- Recommended action
|
|
102
117
|
|
|
118
|
+
### 5. Update Memory
|
|
119
|
+
|
|
120
|
+
After watch completes, the memory writer updates
|
|
121
|
+
`~/.selftune/memory/context.md` with the current regression status,
|
|
122
|
+
pass rates, and recommended next action. This ensures continuity if the
|
|
123
|
+
context window resets before the user acts on the results.
|
|
124
|
+
|
|
103
125
|
## Common Patterns
|
|
104
126
|
|
|
105
127
|
**"Is the skill performing well after the change?"**
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"_readme": "Merge the 'hooks' block below into your ~/.claude/settings.json",
|
|
3
3
|
"_readme2": "Replace /PATH/TO/ with the actual directory where you saved the scripts",
|
|
4
|
+
"_readme3": "This is the comprehensive template. Simpler templates are in templates/",
|
|
4
5
|
|
|
5
6
|
"hooks": {
|
|
6
7
|
"UserPromptSubmit": [
|
|
@@ -10,6 +11,28 @@
|
|
|
10
11
|
"type": "command",
|
|
11
12
|
"command": "bun run /PATH/TO/cli/selftune/hooks/prompt-log.ts",
|
|
12
13
|
"timeout": 5
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"type": "command",
|
|
17
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/auto-activate.ts",
|
|
18
|
+
"timeout": 5
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
],
|
|
23
|
+
"PreToolUse": [
|
|
24
|
+
{
|
|
25
|
+
"matcher": "Write|Edit",
|
|
26
|
+
"hooks": [
|
|
27
|
+
{
|
|
28
|
+
"type": "command",
|
|
29
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/skill-change-guard.ts",
|
|
30
|
+
"timeout": 5
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"type": "command",
|
|
34
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/evolution-guard.ts",
|
|
35
|
+
"timeout": 5
|
|
13
36
|
}
|
|
14
37
|
]
|
|
15
38
|
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_readme": "Default activation rules for selftune auto-activation. Copy to ~/.selftune/activation-rules.json to customize.",
|
|
3
|
+
"_docs": "See docs/integration-guide.md for details on each rule.",
|
|
4
|
+
|
|
5
|
+
"rules": [
|
|
6
|
+
{
|
|
7
|
+
"id": "post-session-diagnostic",
|
|
8
|
+
"enabled": true,
|
|
9
|
+
"description": "Suggest `selftune last` when session has >2 unmatched queries"
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"id": "grading-threshold-breach",
|
|
13
|
+
"enabled": true,
|
|
14
|
+
"description": "Suggest `selftune evolve` when session pass rate < 60%"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "stale-evolution",
|
|
18
|
+
"enabled": true,
|
|
19
|
+
"description": "Suggest `selftune evolve` when no evolution in >7 days and pending false negatives exist"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"id": "regression-detected",
|
|
23
|
+
"enabled": true,
|
|
24
|
+
"description": "Suggest `selftune rollback` when monitoring detects a regression"
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_readme": "Settings template for multi-skill selftune projects. Merge into ~/.claude/settings.json.",
|
|
3
|
+
"_usage": "Replace /PATH/TO with the absolute path to your selftune installation.",
|
|
4
|
+
"_note": "Multi-skill projects use activation rules to route queries to the correct skill. See templates/activation-rules-default.json.",
|
|
5
|
+
|
|
6
|
+
"hooks": {
|
|
7
|
+
"UserPromptSubmit": [
|
|
8
|
+
{
|
|
9
|
+
"hooks": [
|
|
10
|
+
{
|
|
11
|
+
"type": "command",
|
|
12
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/prompt-log.ts",
|
|
13
|
+
"timeout": 5
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"type": "command",
|
|
17
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/auto-activate.ts",
|
|
18
|
+
"timeout": 5
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
],
|
|
23
|
+
"PreToolUse": [
|
|
24
|
+
{
|
|
25
|
+
"matcher": "Write|Edit",
|
|
26
|
+
"hooks": [
|
|
27
|
+
{
|
|
28
|
+
"type": "command",
|
|
29
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/skill-change-guard.ts",
|
|
30
|
+
"timeout": 5
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"type": "command",
|
|
34
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/evolution-guard.ts",
|
|
35
|
+
"timeout": 5
|
|
36
|
+
}
|
|
37
|
+
]
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"PostToolUse": [
|
|
41
|
+
{
|
|
42
|
+
"matcher": "Read",
|
|
43
|
+
"hooks": [
|
|
44
|
+
{
|
|
45
|
+
"type": "command",
|
|
46
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/skill-eval.ts",
|
|
47
|
+
"timeout": 5
|
|
48
|
+
}
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
],
|
|
52
|
+
"Stop": [
|
|
53
|
+
{
|
|
54
|
+
"hooks": [
|
|
55
|
+
{
|
|
56
|
+
"type": "command",
|
|
57
|
+
"command": "bun run /PATH/TO/cli/selftune/hooks/session-stop.ts",
|
|
58
|
+
"timeout": 15
|
|
59
|
+
}
|
|
60
|
+
]
|
|
61
|
+
}
|
|
62
|
+
]
|
|
63
|
+
}
|
|
64
|
+
}
|