selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +259 -0
  3. package/bin/selftune.cjs +29 -0
  4. package/cli/selftune/constants.ts +71 -0
  5. package/cli/selftune/eval/hooks-to-evals.ts +422 -0
  6. package/cli/selftune/evolution/audit.ts +44 -0
  7. package/cli/selftune/evolution/deploy-proposal.ts +244 -0
  8. package/cli/selftune/evolution/evolve.ts +406 -0
  9. package/cli/selftune/evolution/extract-patterns.ts +145 -0
  10. package/cli/selftune/evolution/propose-description.ts +146 -0
  11. package/cli/selftune/evolution/rollback.ts +242 -0
  12. package/cli/selftune/evolution/stopping-criteria.ts +69 -0
  13. package/cli/selftune/evolution/validate-proposal.ts +137 -0
  14. package/cli/selftune/grading/grade-session.ts +459 -0
  15. package/cli/selftune/hooks/prompt-log.ts +52 -0
  16. package/cli/selftune/hooks/session-stop.ts +54 -0
  17. package/cli/selftune/hooks/skill-eval.ts +73 -0
  18. package/cli/selftune/index.ts +104 -0
  19. package/cli/selftune/ingestors/codex-rollout.ts +416 -0
  20. package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
  21. package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
  22. package/cli/selftune/init.ts +297 -0
  23. package/cli/selftune/monitoring/watch.ts +328 -0
  24. package/cli/selftune/observability.ts +255 -0
  25. package/cli/selftune/types.ts +255 -0
  26. package/cli/selftune/utils/jsonl.ts +75 -0
  27. package/cli/selftune/utils/llm-call.ts +192 -0
  28. package/cli/selftune/utils/logging.ts +40 -0
  29. package/cli/selftune/utils/schema-validator.ts +47 -0
  30. package/cli/selftune/utils/seeded-random.ts +31 -0
  31. package/cli/selftune/utils/transcript.ts +260 -0
  32. package/package.json +29 -0
  33. package/skill/SKILL.md +120 -0
  34. package/skill/Workflows/Doctor.md +145 -0
  35. package/skill/Workflows/Evals.md +193 -0
  36. package/skill/Workflows/Evolve.md +159 -0
  37. package/skill/Workflows/Grade.md +157 -0
  38. package/skill/Workflows/Ingest.md +159 -0
  39. package/skill/Workflows/Initialize.md +125 -0
  40. package/skill/Workflows/Rollback.md +131 -0
  41. package/skill/Workflows/Watch.md +128 -0
  42. package/skill/references/grading-methodology.md +176 -0
  43. package/skill/references/invocation-taxonomy.md +144 -0
  44. package/skill/references/logs.md +168 -0
  45. package/skill/settings_snippet.json +41 -0
@@ -0,0 +1,176 @@
1
+ # Grading Methodology Reference
2
+
3
+ How selftune evaluates skill sessions. Used by the `grade` command and
4
+ referenced by evolution workflows to understand quality signals.
5
+
6
+ ---
7
+
8
+ ## 3-Tier Grading Model
9
+
10
+ Every session is graded across three tiers, each answering a different question:
11
+
12
+ | Tier | Question | Example expectation |
13
+ |------|----------|---------------------|
14
+ | **Trigger** | Did the skill fire at all? | `skills_triggered` contains the skill name |
15
+ | **Process** | Did the agent follow the right steps? | SKILL.md was read before main work started |
16
+ | **Quality** | Was the output actually good? | Output file has correct content and structure |
17
+
18
+ A session can pass Trigger but fail Process (skill fired, but steps were wrong),
19
+ or pass Process but fail Quality (steps were right, but output was bad).
20
+
21
+ ---
22
+
23
+ ## Expectation Derivation
24
+
25
+ When the user does not supply explicit expectations, derive reasonable defaults.
26
+ Always include at least one Process and one Quality expectation.
27
+
28
+ ### Default Expectations
29
+
30
+ 1. **SKILL.md was read before main work started** (Process)
31
+ - Evidence: a `Read` tool call with a path ending in `SKILL.md` appears before
32
+ any `Write`, `Edit`, or significant `Bash` command.
33
+
34
+ 2. **No more than 1 error encountered** (Quality)
35
+ - Evidence: `errors_encountered` field in session telemetry is 0 or 1.
36
+
37
+ 3. **Expected output type exists** (Quality)
38
+ - Evidence: the file, command output, or artifact the user asked for is present.
39
+
40
+ 4. **No thrashing** (Process)
41
+ - Evidence: no single bash command or tool call is repeated more than 3 times
42
+ consecutively in the transcript.
43
+
44
+ 5. **Skill steps followed in order** (Process)
45
+ - Evidence: the sequence of tool calls matches the step order in SKILL.md.
46
+
47
+ ---
48
+
49
+ ## Evidence Standards
50
+
51
+ ### What counts as evidence
52
+
53
+ - A specific tool call from the transcript (e.g., `[TOOL:Read] /path/to/SKILL.md`)
54
+ - A bash command and its output (e.g., `Bash output: 'presentation.pptx created'`)
55
+ - A telemetry field value (e.g., `errors_encountered: 0`)
56
+ - A transcript line number and content
57
+
58
+ ### Strictness rules
59
+
60
+ - **A file existing is NOT evidence it has correct content.** Verify content claims
61
+ separately from existence claims.
62
+ - **Absence of evidence IS evidence of absence** for process steps. If the transcript
63
+ does not show SKILL.md being read, the expectation fails.
64
+ - **Cite specific evidence.** Never mark PASS without pointing to a transcript line,
65
+ tool call, or telemetry field.
66
+
67
+ ---
68
+
69
+ ## Claims Extraction
70
+
71
+ After grading explicit expectations, extract 2-4 implicit claims from the transcript.
72
+ Each claim falls into one of three types:
73
+
74
+ | Type | What it captures | Example |
75
+ |------|------------------|---------|
76
+ | **Factual** | A verifiable statement the agent made | "The agent said 12 slides were created" |
77
+ | **Process** | An observed behavior pattern | "The agent read SKILL.md before making any file changes" |
78
+ | **Quality** | An output characteristic | "The output file was named correctly" |
79
+
80
+ For each claim:
81
+ 1. State the claim clearly
82
+ 2. Classify its type
83
+ 3. Mark `verified: true` or `verified: false`
84
+ 4. Cite evidence (or note its absence)
85
+
86
+ ---
87
+
88
+ ## Eval Feedback and Eval Gap Flagging
89
+
90
+ After grading, review each PASSED expectation and ask:
91
+
92
+ > "Would this expectation also pass if the agent produced wrong output?"
93
+
94
+ If yes, flag it in `eval_feedback.suggestions` with a reason. This drives
95
+ eval set improvement over time.
96
+
97
+ ### When to flag
98
+
99
+ - An expectation checks file existence but not content
100
+ - An expectation checks command success but not output correctness
101
+ - An expectation is too generic to catch quality regressions
102
+
103
+ ### When NOT to flag
104
+
105
+ - The expectation is already specific enough
106
+ - The gap is trivial or not worth the eval set complexity
107
+
108
+ Only raise things worth improving. The goal is actionable feedback, not exhaustive nitpicking.
109
+
110
+ ---
111
+
112
+ ## grading.json Schema
113
+
114
+ ```json
115
+ {
116
+ "session_id": "abc123",
117
+ "skill_name": "pptx",
118
+ "transcript_path": "/home/user/.claude/projects/.../abc123.jsonl",
119
+ "graded_at": "2026-02-28T12:00:00Z",
120
+ "expectations": [
121
+ {
122
+ "text": "SKILL.md was read before any file was created",
123
+ "passed": true,
124
+ "evidence": "Transcript line 3: [TOOL:Read] /path/to/SKILL.md"
125
+ },
126
+ {
127
+ "text": "Output file has correct slide count",
128
+ "passed": false,
129
+ "evidence": "Expected 12 slides, found 8 in bash output"
130
+ }
131
+ ],
132
+ "summary": {
133
+ "passed": 1,
134
+ "failed": 1,
135
+ "total": 2,
136
+ "pass_rate": 0.5
137
+ },
138
+ "execution_metrics": {
139
+ "tool_calls": { "Read": 2, "Write": 1, "Bash": 3 },
140
+ "total_tool_calls": 6,
141
+ "total_steps": 4,
142
+ "bash_commands_run": 3,
143
+ "errors_encountered": 0,
144
+ "skills_triggered": ["pptx"],
145
+ "transcript_chars": 4200
146
+ },
147
+ "claims": [
148
+ {
149
+ "claim": "Output was a .pptx file",
150
+ "type": "factual",
151
+ "verified": true,
152
+ "evidence": "Bash output: 'presentation.pptx created'"
153
+ }
154
+ ],
155
+ "eval_feedback": {
156
+ "suggestions": [
157
+ { "reason": "No expectation checks slide content" }
158
+ ],
159
+ "overall": "Process coverage good; add output quality assertions."
160
+ }
161
+ }
162
+ ```
163
+
164
+ ### Field descriptions
165
+
166
+ | Field | Type | Description |
167
+ |-------|------|-------------|
168
+ | `session_id` | string | From session telemetry |
169
+ | `skill_name` | string | The skill being graded |
170
+ | `transcript_path` | string | Path to the session transcript JSONL |
171
+ | `graded_at` | string | ISO 8601 timestamp of grading |
172
+ | `expectations[]` | array | Each expectation with verdict and evidence |
173
+ | `summary` | object | Aggregate pass/fail counts and rate |
174
+ | `execution_metrics` | object | Raw metrics from session telemetry |
175
+ | `claims[]` | array | Implicit claims extracted from transcript |
176
+ | `eval_feedback` | object | Suggestions for improving the eval set |
@@ -0,0 +1,144 @@
1
+ # Invocation Taxonomy Reference
2
+
3
+ How selftune classifies the ways users trigger (or should trigger) a skill.
4
+ Used by the `evals` command and referenced by evolution workflows to understand
5
+ coverage gaps.
6
+
7
+ ---
8
+
9
+ ## The 4 Invocation Types
10
+
11
+ Every query in an eval set is annotated with one of four invocation types.
12
+ Three are positive (should trigger the skill), one is negative (should not).
13
+
14
+ ### Explicit
15
+
16
+ The user names the skill directly.
17
+
18
+ > "Use the pptx skill to make slides"
19
+ > "Run the selftune grade command"
20
+ > "Open the reins audit tool"
21
+
22
+ **What it means:** The user knows the skill exists and asks for it by name.
23
+ This is the easiest type to catch. If a skill misses explicit invocations,
24
+ something is fundamentally broken.
25
+
26
+ ### Implicit
27
+
28
+ The user describes the task without naming the skill.
29
+
30
+ > "Make me a slide deck"
31
+ > "Grade my last session"
32
+ > "Score this project's readiness"
33
+
34
+ **What it means:** The user knows what they want but not which skill does it.
35
+ The skill description's trigger phrases must cover these natural-language
36
+ variations. Missing implicit invocations means the description is too narrow.
37
+
38
+ ### Contextual
39
+
40
+ The user describes the task with domain-specific noise and context.
41
+
42
+ > "I need slides for the Q3 board meeting with revenue charts"
43
+ > "After that deploy, check if the skill is still working"
44
+ > "The last codex run felt off, can you evaluate it"
45
+
46
+ **What it means:** The user is thinking about their domain, not about skills.
47
+ The query contains the intent buried in context. Missing contextual invocations
48
+ means the skill description lacks real-world vocabulary.
49
+
50
+ ### Negative
51
+
52
+ The query should NOT trigger the skill.
53
+
54
+ > "What format should I use for a presentation?"
55
+ > "Explain what eval means in machine learning"
56
+ > "How do I write a grading rubric for my class"
57
+
58
+ **What it means:** The query contains keywords that might confuse a matcher
59
+ (e.g., "presentation", "eval", "grading") but the intent does not match
60
+ the skill's purpose. Negative examples prevent false positives.
61
+
62
+ ---
63
+
64
+ ## What "Healthy" Looks Like
65
+
66
+ A healthy skill catches all three positive invocation types:
67
+
68
+ | Type | Healthy | Unhealthy |
69
+ |------|---------|-----------|
70
+ | Explicit | Catches all | Misses some (broken) |
71
+ | Implicit | Catches most | Only catches explicit (too rigid) |
72
+ | Contextual | Catches many | Only catches explicit + some implicit (needs evolution) |
73
+ | Negative | Rejects all | False positives on keyword overlap |
74
+
75
+ ### The Coverage Spectrum
76
+
77
+ ```
78
+ Explicit only --> Skill is too rigid, users must babysit
79
+ + Implicit --> Skill works for informed users
80
+ + Contextual --> Skill works naturally in real workflows
81
+ - Negative clean --> No false positives
82
+ ```
83
+
84
+ A skill that only catches explicit invocations is forcing users to know its
85
+ name and syntax. That defeats the purpose of skill-based routing.
86
+
87
+ ---
88
+
89
+ ## Connection to Evolution
90
+
91
+ The invocation taxonomy directly drives the evolution feedback loop:
92
+
93
+ ### Missed Implicit = Undertriggering
94
+
95
+ When `evals` shows implicit queries that don't trigger the skill, the
96
+ description is too narrow. The `evolve` command will:
97
+ 1. Extract the missed implicit patterns
98
+ 2. Propose description changes that cover them
99
+ 3. Validate that existing triggers still work
100
+
101
+ ### Missed Contextual = Under-evolved
102
+
103
+ When implicit queries trigger but contextual ones don't, the skill needs
104
+ richer vocabulary. Evolution should add domain-specific language to the
105
+ description's trigger phrases.
106
+
107
+ ### False-Positive Negatives = Overtriggering
108
+
109
+ When negative queries trigger the skill, the description is too broad.
110
+ Evolution should tighten the scope or add "Don't Use When" clauses.
111
+
112
+ ### The Evolution Priority
113
+
114
+ Fix in this order:
115
+ 1. **Missed explicit** -- broken, fix immediately
116
+ 2. **Missed implicit** -- undertriggering, evolve next
117
+ 3. **Missed contextual** -- under-evolved, evolve when implicit is clean
118
+ 4. **False-positive negatives** -- overtriggering, tighten after broadening
119
+
120
+ ---
121
+
122
+ ## Eval Set Structure
123
+
124
+ Each entry in a generated eval set looks like:
125
+
126
+ ```json
127
+ {
128
+ "id": 1,
129
+ "query": "Make me a slide deck for the Q3 board meeting",
130
+ "expected": true,
131
+ "invocation_type": "contextual",
132
+ "skill_name": "pptx",
133
+ "source_session": "abc123"
134
+ }
135
+ ```
136
+
137
+ | Field | Description |
138
+ |-------|-------------|
139
+ | `id` | Sequential identifier |
140
+ | `query` | The user's original query text |
141
+ | `expected` | `true` = should trigger, `false` = should not |
142
+ | `invocation_type` | One of: `explicit`, `implicit`, `contextual`, `negative` |
143
+ | `skill_name` | The skill this eval targets |
144
+ | `source_session` | Session ID the query came from (if positive) |
@@ -0,0 +1,168 @@
1
+ # Log Format Reference
2
+
3
+ selftune writes to four log files. This reference describes each format
4
+ in detail for the skill to use when parsing sessions and audit trails.
5
+
6
+ ---
7
+
8
+ ## ~/.claude/session_telemetry_log.jsonl
9
+
10
+ One JSON record per line. Each record is one completed agent session.
11
+
12
+ ```json
13
+ {
14
+ "timestamp": "2026-02-28T10:00:00+00:00",
15
+ "session_id": "abc123",
16
+ "source": "claude_code",
17
+ "cwd": "/home/user/projects/myapp",
18
+ "transcript_path": "/home/user/.claude/projects/.../abc123.jsonl",
19
+ "last_user_query": "Make me a slide deck for the board meeting",
20
+ "tool_calls": {
21
+ "Read": 2,
22
+ "Write": 1,
23
+ "Bash": 3,
24
+ "Edit": 0
25
+ },
26
+ "total_tool_calls": 6,
27
+ "bash_commands": [
28
+ "pip install python-pptx --break-system-packages",
29
+ "python3 /tmp/create_pptx.py"
30
+ ],
31
+ "skills_triggered": ["pptx"],
32
+ "assistant_turns": 5,
33
+ "errors_encountered": 0,
34
+ "transcript_chars": 4200
35
+ }
36
+ ```
37
+
38
+ **source values:**
39
+ - `claude_code` — written by session-stop.ts (Stop hook)
40
+ - `codex` — written by ingestors/codex-wrapper.ts
41
+ - `codex_rollout` — written by ingestors/codex-rollout.ts
42
+ - `opencode` — written by ingestors/opencode-ingest.ts
43
+ - `opencode_json` — legacy OpenCode JSON files
44
+
45
+ ---
46
+
47
+ ## ~/.claude/skill_usage_log.jsonl
48
+
49
+ One record per skill trigger event. Populated by skill-eval.ts (PostToolUse hook).
50
+
51
+ ```json
52
+ {
53
+ "timestamp": "2026-02-28T10:00:00+00:00",
54
+ "session_id": "abc123",
55
+ "skill_name": "pptx",
56
+ "skill_path": "/mnt/skills/public/pptx/SKILL.md",
57
+ "query": "Make me a slide deck for the board meeting",
58
+ "triggered": true,
59
+ "source": "claude_code"
60
+ }
61
+ ```
62
+
63
+ ---
64
+
65
+ ## ~/.claude/all_queries_log.jsonl
66
+
67
+ Every user query, whether or not it triggered a skill. Populated by prompt-log.ts (UserPromptSubmit hook).
68
+
69
+ ```json
70
+ {
71
+ "timestamp": "2026-02-28T10:00:00+00:00",
72
+ "session_id": "abc123",
73
+ "query": "Make me a slide deck for the board meeting",
74
+ "source": "claude_code"
75
+ }
76
+ ```
77
+
78
+ ---
79
+
80
+ ## ~/.claude/evolution_audit_log.jsonl
81
+
82
+ One record per evolution action. Written by the evolution and rollback modules.
83
+
84
+ ```json
85
+ {
86
+ "timestamp": "2026-02-28T12:00:00+00:00",
87
+ "proposal_id": "evolve-pptx-1709125200000",
88
+ "action": "created",
89
+ "details": "original_description: Create PowerPoint presentations from user-provided content...",
90
+ "eval_snapshot": {
91
+ "total": 50,
92
+ "passed": 35,
93
+ "failed": 15,
94
+ "pass_rate": 0.70
95
+ }
96
+ }
97
+ ```
98
+
99
+ **action values:**
100
+ - `created` — New evolution proposal generated. `details` starts with `original_description:` prefix preserving the pre-evolution SKILL.md content.
101
+ - `validated` — Proposal tested against eval set. `eval_snapshot` contains before/after pass rates.
102
+ - `deployed` — Updated SKILL.md written to disk. `eval_snapshot` contains final pass rates.
103
+ - `rolled_back` — SKILL.md restored to pre-evolution state (from `.bak` file or audit trail).
104
+
105
+ **Required fields:** `timestamp`, `proposal_id`, `action`
106
+
107
+ **Optional fields:** `details`, `eval_snapshot`
108
+
109
+ ---
110
+
111
+ ## Claude Code Transcript Format (~/.claude/projects/.../session.jsonl)
112
+
113
+ One JSON object per line. Two observed variants:
114
+
115
+ **Variant A (nested, current):**
116
+ ```json
117
+ {"type": "user", "message": {"role": "user", "content": [{"type": "text", "text": "..."}]}}
118
+ {"type": "assistant", "message": {"role": "assistant", "content": [
119
+ {"type": "text", "text": "I'll read the skill first."},
120
+ {"type": "tool_use", "name": "Read", "input": {"file_path": "/path/to/SKILL.md"}}
121
+ ]}}
122
+ ```
123
+
124
+ **Variant B (flat, older):**
125
+ ```json
126
+ {"role": "user", "content": "..."}
127
+ {"role": "assistant", "content": [{"type": "tool_use", "name": "Bash", "input": {"command": "..."}}]}
128
+ ```
129
+
130
+ Tool use always appears in assistant content blocks as `{"type": "tool_use", "name": "ToolName", "input": {...}}`.
131
+
132
+ Skill reads appear as `Read` tool calls where `input.file_path` ends in `SKILL.md`.
133
+
134
+ ---
135
+
136
+ ## Codex Rollout Format ($CODEX_HOME/sessions/YYYY/MM/DD/rollout-*.jsonl)
137
+
138
+ ```json
139
+ {"type": "thread.started", "thread_id": "..."}
140
+ {"type": "turn.started"}
141
+ {"type": "item.completed", "item": {"id": "i0", "item_type": "reasoning", "text": "I should use the setup-demo-app skill"}}
142
+ {"type": "item.completed", "item": {"id": "i1", "item_type": "command_execution", "command": "npm install", "exit_code": 0}}
143
+ {"type": "item.completed", "item": {"id": "i2", "item_type": "file_change", "changes": [{"path": "..."}]}}
144
+ {"type": "item.completed", "item": {"id": "i3", "item_type": "agent_message", "text": "Done!"}}
145
+ {"type": "turn.completed", "usage": {"input_tokens": 1200, "output_tokens": 450}}
146
+ ```
147
+
148
+ Item types: `reasoning`, `command_execution`, `file_change`, `agent_message`,
149
+ `mcp_tool_call`, `web_search`, `todo_list`, `error`
150
+
151
+ ---
152
+
153
+ ## OpenCode Message Format (in SQLite message.content column)
154
+
155
+ Content is a JSON string containing an array of blocks. Anthropic format:
156
+
157
+ ```json
158
+ [
159
+ {"type": "text", "text": "I'll create the presentation."},
160
+ {"type": "tool_use", "name": "Bash", "input": {"command": "pip install python-pptx"}},
161
+ {"type": "tool_use", "name": "Read", "input": {"file_path": "/skills/pptx/SKILL.md"}}
162
+ ]
163
+ ```
164
+
165
+ Tool results appear in subsequent user messages:
166
+ ```json
167
+ [{"type": "tool_result", "tool_use_id": "...", "content": "OK", "is_error": false}]
168
+ ```
@@ -0,0 +1,41 @@
1
+ {
2
+ "_readme": "Merge the 'hooks' block below into your ~/.claude/settings.json",
3
+ "_readme2": "Replace /PATH/TO/ with the actual directory where you saved the scripts",
4
+
5
+ "hooks": {
6
+ "UserPromptSubmit": [
7
+ {
8
+ "hooks": [
9
+ {
10
+ "type": "command",
11
+ "command": "bun run /PATH/TO/cli/selftune/hooks/prompt-log.ts",
12
+ "timeout": 5
13
+ }
14
+ ]
15
+ }
16
+ ],
17
+ "PostToolUse": [
18
+ {
19
+ "matcher": "Read",
20
+ "hooks": [
21
+ {
22
+ "type": "command",
23
+ "command": "bun run /PATH/TO/cli/selftune/hooks/skill-eval.ts",
24
+ "timeout": 5
25
+ }
26
+ ]
27
+ }
28
+ ],
29
+ "Stop": [
30
+ {
31
+ "hooks": [
32
+ {
33
+ "type": "command",
34
+ "command": "bun run /PATH/TO/cli/selftune/hooks/session-stop.ts",
35
+ "timeout": 15
36
+ }
37
+ ]
38
+ }
39
+ ]
40
+ }
41
+ }