selftune 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +38 -1
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +31 -12
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +479 -104
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +20 -3
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +145 -19
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/codex-rollout.ts +1 -1
  51. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  52. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  53. package/cli/selftune/init.ts +168 -5
  54. package/cli/selftune/last.ts +2 -2
  55. package/cli/selftune/memory/writer.ts +447 -0
  56. package/cli/selftune/monitoring/watch.ts +25 -2
  57. package/cli/selftune/status.ts +18 -15
  58. package/cli/selftune/types.ts +377 -5
  59. package/cli/selftune/utils/frontmatter.ts +217 -0
  60. package/cli/selftune/utils/llm-call.ts +29 -3
  61. package/cli/selftune/utils/transcript.ts +35 -0
  62. package/cli/selftune/utils/trigger-check.ts +89 -0
  63. package/cli/selftune/utils/tui.ts +156 -0
  64. package/dashboard/index.html +585 -19
  65. package/package.json +17 -6
  66. package/skill/SKILL.md +127 -10
  67. package/skill/Workflows/AutoActivation.md +144 -0
  68. package/skill/Workflows/Badge.md +118 -0
  69. package/skill/Workflows/Baseline.md +121 -0
  70. package/skill/Workflows/Composability.md +100 -0
  71. package/skill/Workflows/Contribute.md +91 -0
  72. package/skill/Workflows/Cron.md +155 -0
  73. package/skill/Workflows/Dashboard.md +203 -0
  74. package/skill/Workflows/Doctor.md +37 -1
  75. package/skill/Workflows/Evals.md +73 -5
  76. package/skill/Workflows/EvolutionMemory.md +152 -0
  77. package/skill/Workflows/Evolve.md +111 -6
  78. package/skill/Workflows/EvolveBody.md +159 -0
  79. package/skill/Workflows/ImportSkillsBench.md +111 -0
  80. package/skill/Workflows/Ingest.md +129 -15
  81. package/skill/Workflows/Initialize.md +58 -3
  82. package/skill/Workflows/Replay.md +70 -0
  83. package/skill/Workflows/Rollback.md +20 -1
  84. package/skill/Workflows/UnitTest.md +138 -0
  85. package/skill/Workflows/Watch.md +22 -0
  86. package/skill/settings_snippet.json +23 -0
  87. package/templates/activation-rules-default.json +27 -0
  88. package/templates/multi-skill-settings.json +64 -0
  89. package/templates/single-skill-settings.json +58 -0
@@ -0,0 +1,111 @@
1
+ # selftune Import SkillsBench Workflow
2
+
3
+ Import evaluation tasks from the SkillsBench corpus (87 real-world agent
4
+ benchmarks) and convert them to selftune eval entries. This enriches
5
+ your skill's eval set with externally validated test cases.
6
+
7
+ ## Default Command
8
+
9
+ ```bash
10
+ selftune import-skillsbench --dir <path> --skill <name> --output <path> [options]
11
+ ```
12
+
13
+ ## Options
14
+
15
+ | Flag | Description | Default |
16
+ |------|-------------|---------|
17
+ | `--dir <path>` | Path to SkillsBench tasks directory | Required |
18
+ | `--skill <name>` | Target skill to match tasks against | Required |
19
+ | `--output <path>` | Output eval set JSON file | Required |
20
+ | `--match-strategy <type>` | Matching strategy: `exact` or `fuzzy` | `exact` |
21
+
22
+ ## Match Strategies
23
+
24
+ ### `exact`
25
+
26
+ Matches tasks where `expected_skill` in `task.toml` exactly matches the
27
+ target skill name. Precise but may miss relevant tasks.
28
+
29
+ ### `fuzzy`
30
+
31
+ Uses keyword overlap between the task's category/tags and the skill name.
32
+ Casts a wider net but may include marginally relevant tasks. Review the
33
+ output and remove false matches.
34
+
35
+ ## SkillsBench Directory Structure
36
+
37
+ The importer expects this layout:
38
+
39
+ ```
40
+ tasks/
41
+ ├── task-001/
42
+ │ ├── instruction.md # Task description (used as query)
43
+ │ └── task.toml # Metadata (difficulty, category, tags, expected_skill)
44
+ ├── task-002/
45
+ │ ├── instruction.md
46
+ │ └── task.toml
47
+ └── ...
48
+ ```
49
+
50
+ ### `task.toml` Format
51
+
52
+ ```toml
53
+ difficulty = "medium"
54
+ category = "research"
55
+ tags = ["web-search", "analysis", "summarization"]
56
+ expected_skill = "Research"
57
+ expected_tools = ["WebSearch", "Read"]
58
+ ```
59
+
60
+ All fields are optional. Tasks without `task.toml` use default values.
61
+
62
+ ## Output Format
63
+
64
+ Standard selftune eval entries:
65
+
66
+ ```json
67
+ [
68
+ {
69
+ "id": 1,
70
+ "query": "Find and summarize the latest papers on transformer architectures",
71
+ "expected": true,
72
+ "invocation_type": "implicit",
73
+ "skill_name": "Research",
74
+ "source_session": null,
75
+ "source": "skillsbench"
76
+ }
77
+ ]
78
+ ```
79
+
80
+ ## Steps
81
+
82
+ ### 1. Obtain SkillsBench Corpus
83
+
84
+ Clone or download the SkillsBench repository containing the task directory.
85
+
86
+ ### 2. Import Tasks
87
+
88
+ ```bash
89
+ selftune import-skillsbench --dir /path/to/skillsbench/tasks --skill Research --output evals-bench.json
90
+ ```
91
+
92
+ ### 3. Review Output
93
+
94
+ Inspect the generated eval entries. Remove any that don't match your skill's
95
+ intended scope. Adjust match strategy if needed.
96
+
97
+ ### 4. Merge with Existing Evals
98
+
99
+ Combine imported entries with your existing eval set for a richer validation
100
+ corpus. Use the merged set with `selftune evolve --eval-set merged-evals.json`.
101
+
102
+ ## Common Patterns
103
+
104
+ **"Import SkillsBench tasks for Research"**
105
+ > `selftune import-skillsbench --dir /path/tasks --skill Research --output bench-evals.json`
106
+
107
+ **"Use fuzzy matching for broader coverage"**
108
+ > `selftune import-skillsbench --dir /path/tasks --skill pptx --output bench-evals.json --match-strategy fuzzy`
109
+
110
+ **"Enrich my eval set with external benchmarks"**
111
+ > Import with `selftune import-skillsbench`, then pass the output to `selftune evolve --eval-set`.
@@ -1,19 +1,69 @@
1
1
  # selftune Ingest Workflow
2
2
 
3
- Import sessions from non-Claude-Code agent platforms into the shared
4
- selftune log format. Covers three sub-commands: `ingest-codex`,
5
- `ingest-opencode`, and `wrap-codex`.
3
+ Import sessions from agent platforms into the shared selftune log format.
4
+ Covers five sub-commands: `replay`, `ingest-codex`, `ingest-opencode`,
5
+ `ingest-openclaw`, and `wrap-codex`.
6
6
 
7
7
  ## When to Use Each
8
8
 
9
9
  | Sub-command | Platform | Mode | When |
10
10
  |-------------|----------|------|------|
11
+ | `replay` | Claude Code | Batch | Backfill logs from existing Claude Code transcripts |
11
12
  | `ingest-codex` | Codex | Batch | Import existing Codex rollout logs |
12
13
  | `ingest-opencode` | OpenCode | Batch | Import existing OpenCode sessions |
14
+ | `ingest-openclaw` | OpenClaw | Batch | Import existing OpenClaw agent sessions |
13
15
  | `wrap-codex` | Codex | Real-time | Wrap `codex exec` to capture telemetry live |
14
16
 
15
17
  ---
16
18
 
19
+ ## replay
20
+
21
+ Batch ingest existing Claude Code session transcripts into the shared JSONL schema.
22
+
23
+ ### Default Command
24
+
25
+ ```bash
26
+ selftune replay
27
+ ```
28
+
29
+ ### Options
30
+
31
+ | Flag | Description |
32
+ |------|-------------|
33
+ | `--since <date>` | Only ingest sessions modified after this date (e.g., `2026-01-01`) |
34
+ | `--dry-run` | Show what would be ingested without writing to logs |
35
+ | `--force` | Re-ingest all sessions, ignoring the marker file |
36
+ | `--verbose` | Show per-file progress during ingestion |
37
+ | `--projects-dir <path>` | Override default `~/.claude/projects/` directory |
38
+
39
+ ### Source
40
+
41
+ Reads from `~/.claude/projects/<hash>/<session-id>.jsonl`. These are the
42
+ transcript files Claude Code automatically saves for every session.
43
+
44
+ ### Output
45
+
46
+ Writes to:
47
+ - `~/.claude/all_queries_log.jsonl` -- extracted user queries (one record per user query, not just the last)
48
+ - `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "claude_code_replay"`
49
+ - `~/.claude/skill_usage_log.jsonl` -- skill triggers with `source: "claude_code_replay"`
50
+
51
+ ### Steps
52
+
53
+ 1. Run `selftune replay --dry-run` to preview what would be ingested
54
+ 2. Run `selftune replay` to ingest all sessions
55
+ 3. Run `selftune doctor` to confirm logs are healthy
56
+ 4. Run `selftune evals --list-skills` to see if the ingested sessions appear
57
+
58
+ ### Notes
59
+
60
+ - Idempotent: uses a marker file (`~/.claude/claude_code_ingested_sessions.json`) to track
61
+ which transcripts have already been ingested. Safe to run repeatedly.
62
+ - Extracts ALL user queries per session, not just the last one.
63
+ - Filters out system messages, short queries (<4 chars), and queries matching `SKIP_PREFIXES`.
64
+
65
+ ---
66
+
17
67
  ## ingest-codex
18
68
 
19
69
  Batch ingest Codex rollout logs into the shared JSONL schema.
@@ -42,9 +92,9 @@ Writes to:
42
92
  ### Steps
43
93
 
44
94
  1. Verify `$CODEX_HOME/sessions/` directory exists and contains session files
45
- 2. Run `ingest-codex`
95
+ 2. Run `selftune ingest-codex`
46
96
  3. Verify entries were written by checking log file line counts
47
- 4. Run `doctor` to confirm logs are healthy
97
+ 4. Run `selftune doctor` to confirm logs are healthy
48
98
 
49
99
  ---
50
100
 
@@ -78,9 +128,61 @@ Writes to:
78
128
  ### Steps
79
129
 
80
130
  1. Verify the OpenCode database exists at the expected path
81
- 2. Run `ingest-opencode`
131
+ 2. Run `selftune ingest-opencode`
82
132
  3. Verify entries were written by checking log file line counts
83
- 4. Run `doctor` to confirm logs are healthy
133
+ 4. Run `selftune doctor` to confirm logs are healthy
134
+
135
+ ---
136
+
137
+ ## ingest-openclaw
138
+
139
+ Batch ingest OpenClaw agent session histories into the shared JSONL schema.
140
+ Supports multiple agents and auto-discovers session files across all agent directories.
141
+
142
+ ### Default Command
143
+
144
+ ```bash
145
+ selftune ingest-openclaw
146
+ ```
147
+
148
+ ### Options
149
+
150
+ | Flag | Description |
151
+ |------|-------------|
152
+ | `--agents-dir <path>` | Override default `~/.openclaw/agents/` directory |
153
+ | `--since <date>` | Only ingest sessions modified after this date (e.g., `2026-01-01`) |
154
+ | `--dry-run` | Show what would be ingested without writing to logs |
155
+ | `--force` | Re-ingest all sessions, ignoring the marker file |
156
+ | `--verbose` / `-v` | Show per-session progress during ingestion |
157
+
158
+ ### Source
159
+
160
+ Reads from `~/.openclaw/agents/<agentId>/sessions/*.jsonl`. Each JSONL file contains:
161
+ - Line 1 (session header): `{"type":"session","version":5,"id":"<uuid>","timestamp":"<iso>","cwd":"<path>"}`
162
+ - Line 2+ (messages): `{"role":"user|assistant|toolResult","content":[...],"timestamp":<ms>}`
163
+
164
+ ### Output
165
+
166
+ Writes to:
167
+ - `~/.claude/all_queries_log.jsonl` -- extracted user queries
168
+ - `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "openclaw"`
169
+ - `~/.claude/skill_usage_log.jsonl` -- skill triggers with `source: "openclaw"`
170
+
171
+ ### Steps
172
+
173
+ 1. Run `selftune ingest-openclaw --dry-run` to preview what would be ingested
174
+ 2. Run `selftune ingest-openclaw` to ingest all sessions
175
+ 3. Run `selftune doctor` to confirm logs are healthy
176
+ 4. Run `selftune evals --list-skills` to see if the ingested sessions appear
177
+
178
+ ### Notes
179
+
180
+ - Idempotent: uses a marker file to track which sessions have already been ingested.
181
+ Safe to run repeatedly. Use `--force` to re-ingest everything.
182
+ - Skill detection heuristic: identifies skills by checking for `SKILL.md` file reads in
183
+ tool calls and by matching known skill names in assistant text content.
184
+ - Multi-agent support: scans all agent directories under the agents root, ingesting
185
+ sessions from every agent found.
84
186
 
85
187
  ---
86
188
 
@@ -117,25 +219,37 @@ stream for telemetry; it does not modify Codex behavior.
117
219
  1. Build the wrap-codex command with the desired Codex arguments
118
220
  2. Run the command (replaces `codex exec` in your workflow)
119
221
  3. Session telemetry is captured automatically
120
- 4. Verify with `doctor` after first use
222
+ 4. Verify with `selftune doctor` after first use
121
223
 
122
224
  ---
123
225
 
124
226
  ## Common Patterns
125
227
 
228
+ **"Backfill Claude Code sessions"**
229
+ > Run `selftune replay`. No options needed. Reads from `~/.claude/projects/`.
230
+
231
+ **"Replay only recent Claude Code sessions"**
232
+ > Run `selftune replay --since 2026-02-01` with an appropriate date.
233
+
126
234
  **"Ingest codex logs"**
127
- > Run `ingest-codex`. No options needed. Reads from `$CODEX_HOME/sessions/`.
235
+ > Run `selftune ingest-codex`. No options needed. Reads from `$CODEX_HOME/sessions/`.
128
236
 
129
237
  **"Import opencode sessions"**
130
- > Run `ingest-opencode`. Reads from the SQLite database automatically.
238
+ > Run `selftune ingest-opencode`. Reads from the SQLite database automatically.
239
+
240
+ **"Ingest OpenClaw sessions"**
241
+ > Run `selftune ingest-openclaw`. Reads from `~/.openclaw/agents/` automatically.
242
+
243
+ **"Import only recent OpenClaw sessions"**
244
+ > Run `selftune ingest-openclaw --since 2026-02-01` with an appropriate date.
131
245
 
132
246
  **"Run codex through selftune"**
133
- > Use `wrap-codex -- <codex args>` instead of `codex exec <args>` directly.
247
+ > Use `selftune wrap-codex -- <codex args>` instead of `codex exec <args>` directly.
134
248
 
135
249
  **"Batch ingest vs real-time"**
136
- > Use `ingest-codex` or `ingest-opencode` for historical sessions.
137
- > Use `wrap-codex` for ongoing sessions. Both produce the same log format.
250
+ > Use `selftune ingest-codex` or `selftune ingest-opencode` for historical sessions.
251
+ > Use `selftune wrap-codex` for ongoing sessions. Both produce the same log format.
138
252
 
139
253
  **"How do I know it worked?"**
140
- > Run `doctor` after ingestion. Check that log files exist and are parseable.
141
- > Run `evals --list-skills` to see if the ingested sessions appear.
254
+ > Run `selftune doctor` after ingestion. Check that log files exist and are parseable.
255
+ > Run `selftune evals --list-skills` to see if the ingested sessions appear.
@@ -19,6 +19,7 @@ selftune init [--agent <type>] [--cli-path <path>] [--force]
19
19
  | Flag | Description | Default |
20
20
  |------|-------------|---------|
21
21
  | `--agent <type>` | Agent platform: `claude`, `codex`, `opencode` | Auto-detected |
22
+ | `--cli-path <path>` | Override auto-detected CLI entry-point path | Auto-detected |
22
23
  | `--force` | Reinitialize even if config already exists | Off |
23
24
 
24
25
  ## Output Format
@@ -68,7 +69,7 @@ cat ~/.selftune/config.json 2>/dev/null
68
69
  ```
69
70
 
70
71
  If the file exists and is valid JSON, selftune is already initialized.
71
- Skip to Step 5 (verify with doctor) unless the user wants to reinitialize.
72
+ Skip to Step 8 (verify with doctor) unless the user wants to reinitialize.
72
73
 
73
74
  ### 3. Run Init
74
75
 
@@ -79,12 +80,15 @@ selftune init
79
80
  ### 4. Install Hooks (Claude Code)
80
81
 
81
82
  If `init` reports hooks are not installed, merge the entries from
82
- `skill/settings_snippet.json` into `~/.claude/settings.json`. Three hooks
83
+ `skill/settings_snippet.json` into `~/.claude/settings.json`. Six hooks
83
84
  are required:
84
85
 
85
86
  | Hook | Script | Purpose |
86
87
  |------|--------|---------|
87
88
  | `UserPromptSubmit` | `hooks/prompt-log.ts` | Log every user query |
89
+ | `UserPromptSubmit` | `hooks/auto-activate.ts` | Suggest skills before prompt processing |
90
+ | `PreToolUse` (Write/Edit) | `hooks/skill-change-guard.ts` | Detect uncontrolled skill edits |
91
+ | `PreToolUse` (Write/Edit) | `hooks/evolution-guard.ts` | Block SKILL.md edits on monitored skills |
88
92
  | `PostToolUse` (Read) | `hooks/skill-eval.ts` | Track skill triggers |
89
93
  | `Stop` | `hooks/session-stop.ts` | Capture session telemetry |
90
94
 
@@ -99,7 +103,48 @@ The hooks directory is at `dirname(cli_path)/hooks/`.
99
103
  - Use `selftune ingest-opencode` to import sessions from the SQLite database
100
104
  - See `Workflows/Ingest.md` for details
101
105
 
102
- ### 5. Verify with Doctor
106
+ ### 5. Initialize Memory Directory
107
+
108
+ Create the memory directory if it does not exist:
109
+
110
+ ```bash
111
+ mkdir -p ~/.selftune/memory
112
+ ```
113
+
114
+ The memory system stores three files at `~/.selftune/memory/`:
115
+ - `context.md` -- active evolution state and session context
116
+ - `decisions.md` -- evolution decisions and rollback history
117
+ - `plan.md` -- current priorities and evolution strategy
118
+
119
+ These files are created automatically by the memory writer during evolve,
120
+ watch, and rollback workflows. The directory just needs to exist.
121
+
122
+ ### 6. Set Up Activation Rules
123
+
124
+ Copy the default activation rules template:
125
+
126
+ ```bash
127
+ cp templates/activation-rules-default.json ~/.selftune/activation-rules.json
128
+ ```
129
+
130
+ The activation rules file configures auto-activation behavior -- which skills
131
+ get suggested and under what conditions. Edit `~/.selftune/activation-rules.json`
132
+ to customize thresholds and skill mappings for your project.
133
+
134
+ ### 7. Verify Agent Availability
135
+
136
+ Check that the specialized agent files are present:
137
+
138
+ ```bash
139
+ ls .claude/agents/
140
+ ```
141
+
142
+ Expected agents: `diagnosis-analyst.md`, `pattern-analyst.md`,
143
+ `evolution-reviewer.md`, `integration-guide.md`. These are used by evolve
144
+ and doctor workflows for deeper analysis. If missing, copy them from the
145
+ selftune repository's `.claude/agents/` directory.
146
+
147
+ ### 8. Verify with Doctor
103
148
 
104
149
  ```bash
105
150
  selftune doctor
@@ -108,6 +153,16 @@ selftune doctor
108
153
  Parse the JSON output. All checks should pass. If any fail, address the
109
154
  reported issues before proceeding.
110
155
 
156
+ ## Integration Guide
157
+
158
+ For project-type-specific setup (single-skill, multi-skill, monorepo, Codex,
159
+ OpenCode, mixed agents), see [docs/integration-guide.md](../../docs/integration-guide.md).
160
+
161
+ Templates for each project type are in the `templates/` directory:
162
+ - `templates/single-skill-settings.json` — hooks for single-skill projects
163
+ - `templates/multi-skill-settings.json` — hooks for multi-skill projects with activation rules
164
+ - `templates/activation-rules-default.json` — default auto-activation rule configuration
165
+
111
166
  ## Common Patterns
112
167
 
113
168
  **"Initialize selftune"**
@@ -0,0 +1,70 @@
1
+ # selftune Replay Workflow
2
+
3
+ Backfill the shared JSONL logs from existing Claude Code conversation
4
+ transcripts. Useful for bootstrapping selftune with historical session data.
5
+
6
+ ## When to Use
7
+
8
+ - New selftune installation with months of Claude Code history
9
+ - After re-initializing logs and wanting to recover data
10
+ - To populate eval data without waiting for new sessions
11
+
12
+ ## Key Difference from Hooks
13
+
14
+ Real-time hooks capture only the **last** user query per session. Replay
15
+ extracts **all** user queries, writing one `QueryLogRecord` per message.
16
+ This produces much richer eval data from historical sessions.
17
+
18
+ ## Default Command
19
+
20
+ ```bash
21
+ selftune replay
22
+ ```
23
+
24
+ ## Options
25
+
26
+ | Flag | Description |
27
+ |------|-------------|
28
+ | `--since <date>` | Only include transcripts modified after this date |
29
+ | `--dry-run` | Preview what would be ingested without writing |
30
+ | `--force` | Re-ingest all transcripts (ignore marker file) |
31
+ | `--verbose` | Show detailed progress per file |
32
+ | `--projects-dir <path>` | Override default `~/.claude/projects/` path |
33
+
34
+ ## Source
35
+
36
+ Reads Claude Code transcripts from `~/.claude/projects/<hash>/<session>.jsonl`.
37
+ Each transcript is a JSONL file containing user and assistant messages.
38
+
39
+ ## Output
40
+
41
+ Writes to:
42
+ - `~/.claude/all_queries_log.jsonl` -- one record per user query (all messages, not just the last)
43
+ - `~/.claude/session_telemetry_log.jsonl` -- per-session metrics with `source: "claude_code_replay"`
44
+ - `~/.claude/skill_usage_log.jsonl` -- skill triggers detected in transcripts
45
+
46
+ ## Idempotency
47
+
48
+ Uses a marker file at `~/.claude/claude_code_ingested_sessions.json` to track
49
+ which transcripts have already been ingested. Use `--force` to re-ingest all.
50
+
51
+ ## Steps
52
+
53
+ 1. Run `selftune replay --dry-run` to preview what would be ingested
54
+ 2. Run `selftune replay` to perform the ingestion
55
+ 3. Run `selftune doctor` to verify logs are healthy
56
+ 4. Run `selftune evals --list-skills` to see if replayed sessions appear
57
+
58
+ ## Common Patterns
59
+
60
+ **"Backfill my logs"**
61
+ > Run `selftune replay`. No options needed.
62
+
63
+ **"Only replay recent sessions"**
64
+ > Run `selftune replay --since 2026-02-01`
65
+
66
+ **"Re-ingest everything"**
67
+ > Run `selftune replay --force`
68
+
69
+ **"How do I know it worked?"**
70
+ > Run `selftune doctor` after replay. Check log file line counts increased.
@@ -75,6 +75,16 @@ Manual restoration from version control is required.
75
75
 
76
76
  ## Steps
77
77
 
78
+ ### 0. Read Evolution Context
79
+
80
+ Before starting, read `~/.selftune/memory/context.md` for session context:
81
+ - Active evolutions and their current status
82
+ - Previous rollback history
83
+ - Last update timestamp
84
+
85
+ This provides continuity across context resets. If the file doesn't exist,
86
+ proceed normally — it will be created after the first rollback.
87
+
78
88
  ### 1. Find the Last Evolution
79
89
 
80
90
  Read `~/.claude/evolution_audit_log.jsonl` and find the most recent
@@ -101,7 +111,16 @@ After rollback, verify the SKILL.md content is restored:
101
111
  - Check the audit log for the `rolled_back` entry
102
112
  - Optionally re-run evals to confirm the original pass rate
103
113
 
104
- ### 4. Post-Rollback Audit
114
+ ### 4. Update Memory
115
+
116
+ After rollback completes, the memory writer updates:
117
+ - `~/.selftune/memory/decisions.md` -- records the rollback decision and reason
118
+ - `~/.selftune/memory/context.md` -- clears the active evolution state and notes the rollback
119
+
120
+ This ensures future evolve and watch workflows have context about why the
121
+ rollback occurred, even across context window resets.
122
+
123
+ ### 5. Post-Rollback Audit
105
124
 
106
125
  The rollback is logged. Future `evolve` runs will see the rollback in the
107
126
  audit trail and can use it to avoid repeating failed evolution patterns.
@@ -0,0 +1,138 @@
1
+ # selftune Unit Test Workflow
2
+
3
+ Run or generate unit tests for individual skills. Tests verify trigger
4
+ accuracy, output content, and tool usage with deterministic assertions.
5
+
6
+ ## Default Command
7
+
8
+ ```bash
9
+ selftune unit-test --skill <name> --tests <path> [options]
10
+ ```
11
+
12
+ ## Options
13
+
14
+ | Flag | Description | Default |
15
+ |------|-------------|---------|
16
+ | `--skill <name>` | Skill name | Required |
17
+ | `--tests <path>` | Path to unit test JSON file | `~/.selftune/unit-tests/<skill>.json` |
18
+ | `--run-agent` | Run agent-based assertions (not just trigger checks) | Off |
19
+ | `--generate` | Generate tests from skill content instead of running | Off |
20
+ | `--skill-path <path>` | Path to SKILL.md (required for `--generate`) | None |
21
+ | `--eval-set <path>` | Eval set for failure context (used with `--generate`) | None |
22
+ | `--model <flag>` | Model flag for LLM calls | Agent default |
23
+
24
+ ## Test Format
25
+
26
+ Tests are stored as JSON arrays in `~/.selftune/unit-tests/<skill>.json`:
27
+
28
+ ```json
29
+ [
30
+ {
31
+ "test_id": "research-trigger-1",
32
+ "skill_name": "Research",
33
+ "description": "Should trigger on explicit research request",
34
+ "query": "Research the latest trends in AI safety",
35
+ "expected_trigger": true,
36
+ "assertions": [
37
+ {
38
+ "type": "trigger_check",
39
+ "value": "true",
40
+ "description": "Skill should trigger for this query"
41
+ }
42
+ ],
43
+ "tags": ["explicit", "core"],
44
+ "source": "manual"
45
+ }
46
+ ]
47
+ ```
48
+
49
+ ## Assertion Types
50
+
51
+ | Type | What it checks | Requires agent? |
52
+ |------|---------------|-----------------|
53
+ | `trigger_check` | Query triggers the skill description | No (LLM only) |
54
+ | `output_contains` | Agent output contains expected text | Yes |
55
+ | `output_matches_regex` | Agent output matches regex pattern | Yes |
56
+ | `tool_called` | Agent used a specific tool | Yes |
57
+
58
+ Trigger check assertions are cheap (single LLM call). Agent-based assertions
59
+ require `--run-agent` and run the query through the full agent.
60
+
61
+ ## Output Format
62
+
63
+ ```json
64
+ {
65
+ "skill_name": "Research",
66
+ "total": 10,
67
+ "passed": 8,
68
+ "failed": 2,
69
+ "pass_rate": 0.80,
70
+ "results": [
71
+ {
72
+ "test_id": "research-trigger-1",
73
+ "overall_passed": true,
74
+ "trigger_passed": true,
75
+ "assertion_results": [
76
+ { "type": "trigger_check", "value": "true", "passed": true, "evidence": "LLM responded YES" }
77
+ ],
78
+ "duration_ms": 450
79
+ }
80
+ ],
81
+ "ran_at": "2026-03-04T12:00:00.000Z"
82
+ }
83
+ ```
84
+
85
+ ## Steps
86
+
87
+ ### 1. Generate Tests (First Time)
88
+
89
+ For a new skill, generate initial tests from the skill content:
90
+
91
+ ```bash
92
+ selftune unit-test --skill Research --generate --skill-path ~/.claude/skills/Research/SKILL.md
93
+ ```
94
+
95
+ This uses an LLM to create test cases covering:
96
+ - Explicit trigger queries
97
+ - Implicit trigger queries
98
+ - Contextual trigger queries
99
+ - Negative examples (should NOT trigger)
100
+
101
+ Tests are saved to `~/.selftune/unit-tests/Research.json`.
102
+
103
+ ### 2. Run Tests
104
+
105
+ ```bash
106
+ selftune unit-test --skill Research --tests ~/.selftune/unit-tests/Research.json
107
+ ```
108
+
109
+ By default, only `trigger_check` assertions run (fast, no agent needed).
110
+ Add `--run-agent` for full agent-based assertions.
111
+
112
+ ### 3. Review Results
113
+
114
+ Check `pass_rate` and investigate failures:
115
+ - Failed trigger checks → description needs improvement
116
+ - Failed output assertions → skill workflow needs fixes
117
+ - Failed tool assertions → skill routing is broken
118
+
119
+ ### 4. Iterate
120
+
121
+ After evolving a skill, re-run unit tests to verify improvements:
122
+ 1. Evolve: `selftune evolve --skill Research --skill-path /path/SKILL.md`
123
+ 2. Test: `selftune unit-test --skill Research`
124
+ 3. Check pass rate improved
125
+
126
+ ## Common Patterns
127
+
128
+ **"Generate tests for the pptx skill"**
129
+ > `selftune unit-test --skill pptx --generate --skill-path /path/SKILL.md`
130
+
131
+ **"Run existing tests"**
132
+ > `selftune unit-test --skill pptx --tests ~/.selftune/unit-tests/pptx.json`
133
+
134
+ **"Run full agent tests"**
135
+ > `selftune unit-test --skill pptx --tests /path/tests.json --run-agent`
136
+
137
+ **"Test after evolution"**
138
+ > Run `selftune unit-test` after each `selftune evolve` to verify improvements.
@@ -65,6 +65,21 @@ selftune watch --skill <name> --skill-path <path> [options]
65
65
 
66
66
  ## Steps
67
67
 
68
+ ### 0. Read Evolution Context
69
+
70
+ Before starting, read `~/.selftune/memory/context.md` for session context:
71
+ - Active evolutions and their current status
72
+ - Known issues and regression history
73
+ - Last update timestamp
74
+
75
+ This provides continuity across context resets. If the file doesn't exist,
76
+ proceed normally -- it will be created after the first watch.
77
+
78
+ The evolution-guard hook prevents conflicting SKILL.md edits while watch is
79
+ evaluating the skill. The auto-activation system uses watch results to
80
+ adjust suggestion confidence -- skills showing regressions get flagged for
81
+ attention in subsequent prompts.
82
+
68
83
  ### 1. Run Watch
69
84
 
70
85
  ```bash
@@ -100,6 +115,13 @@ Summarize the snapshot for the user:
100
115
  - Whether regression was detected
101
116
  - Recommended action
102
117
 
118
+ ### 5. Update Memory
119
+
120
+ After watch completes, the memory writer updates
121
+ `~/.selftune/memory/context.md` with the current regression status,
122
+ pass rates, and recommended next action. This ensures continuity if the
123
+ context window resets before the user acts on the results.
124
+
103
125
  ## Common Patterns
104
126
 
105
127
  **"Is the skill performing well after the change?"**