selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +259 -0
  3. package/bin/selftune.cjs +29 -0
  4. package/cli/selftune/constants.ts +71 -0
  5. package/cli/selftune/eval/hooks-to-evals.ts +422 -0
  6. package/cli/selftune/evolution/audit.ts +44 -0
  7. package/cli/selftune/evolution/deploy-proposal.ts +244 -0
  8. package/cli/selftune/evolution/evolve.ts +406 -0
  9. package/cli/selftune/evolution/extract-patterns.ts +145 -0
  10. package/cli/selftune/evolution/propose-description.ts +146 -0
  11. package/cli/selftune/evolution/rollback.ts +242 -0
  12. package/cli/selftune/evolution/stopping-criteria.ts +69 -0
  13. package/cli/selftune/evolution/validate-proposal.ts +137 -0
  14. package/cli/selftune/grading/grade-session.ts +459 -0
  15. package/cli/selftune/hooks/prompt-log.ts +52 -0
  16. package/cli/selftune/hooks/session-stop.ts +54 -0
  17. package/cli/selftune/hooks/skill-eval.ts +73 -0
  18. package/cli/selftune/index.ts +104 -0
  19. package/cli/selftune/ingestors/codex-rollout.ts +416 -0
  20. package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
  21. package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
  22. package/cli/selftune/init.ts +297 -0
  23. package/cli/selftune/monitoring/watch.ts +328 -0
  24. package/cli/selftune/observability.ts +255 -0
  25. package/cli/selftune/types.ts +255 -0
  26. package/cli/selftune/utils/jsonl.ts +75 -0
  27. package/cli/selftune/utils/llm-call.ts +192 -0
  28. package/cli/selftune/utils/logging.ts +40 -0
  29. package/cli/selftune/utils/schema-validator.ts +47 -0
  30. package/cli/selftune/utils/seeded-random.ts +31 -0
  31. package/cli/selftune/utils/transcript.ts +260 -0
  32. package/package.json +29 -0
  33. package/skill/SKILL.md +120 -0
  34. package/skill/Workflows/Doctor.md +145 -0
  35. package/skill/Workflows/Evals.md +193 -0
  36. package/skill/Workflows/Evolve.md +159 -0
  37. package/skill/Workflows/Grade.md +157 -0
  38. package/skill/Workflows/Ingest.md +159 -0
  39. package/skill/Workflows/Initialize.md +125 -0
  40. package/skill/Workflows/Rollback.md +131 -0
  41. package/skill/Workflows/Watch.md +128 -0
  42. package/skill/references/grading-methodology.md +176 -0
  43. package/skill/references/invocation-taxonomy.md +144 -0
  44. package/skill/references/logs.md +168 -0
  45. package/skill/settings_snippet.json +41 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,23 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/).
7
+
8
+ ## [0.1.0] - 2026-02-28
9
+
10
+ ### Added
11
+
12
+ - CLI entry point with 10 commands: `init`, `evals`, `grade`, `evolve`, `rollback`, `watch`, `doctor`, `ingest-codex`, `ingest-opencode`, `wrap-codex`
13
+ - Agent auto-detection for Claude Code, Codex, and OpenCode
14
+ - Telemetry hooks for Claude Code (`prompt-log`, `skill-eval`, `session-stop`)
15
+ - Codex wrapper and batch ingestor for rollout logs
16
+ - OpenCode session backfill from SQLite
17
+ - False negative detection and eval set generation from real usage logs
18
+ - 3-tier session grading (Trigger / Process / Quality)
19
+ - Skill evolution loop: extract patterns, propose description, validate, deploy
20
+ - Post-deploy monitoring with sliding window regression detection and auto-rollback
21
+ - Health check system (`doctor` command)
22
+ - Architecture enforcement via custom lint rules
23
+ - Comprehensive test suite (27 test files)
package/README.md ADDED
@@ -0,0 +1,259 @@
1
+ # selftune — Skill Observability & Continuous Improvement CLI
2
+
3
+ [![npm version](https://img.shields.io/npm/v/selftune)](https://www.npmjs.com/package/selftune)
4
+ [![CI](https://github.com/WellDunDun/douala/actions/workflows/ci.yml/badge.svg)](https://github.com/WellDunDun/douala/actions/workflows/ci.yml)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
6
+ [![Zero Dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)]()
7
+ [![Bun](https://img.shields.io/badge/runtime-bun%20%7C%20node-black)](https://bun.sh)
8
+
9
+ Observe real sessions, detect missed triggers, grade execution quality, and automatically evolve skill descriptions toward the language real users actually use.
10
+
11
+ Works with **Claude Code**, **Codex**, and **OpenCode**.
12
+
13
+ ```
14
+ Observe → Detect → Diagnose → Propose → Validate → Deploy → Watch → Repeat
15
+ ```
16
+
17
+ ---
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ npx selftune@latest doctor
23
+ ```
24
+
25
+ Or install globally:
26
+
27
+ ```bash
28
+ npm install -g selftune
29
+ selftune doctor
30
+ ```
31
+
32
+ Requires [Bun](https://bun.sh) or Node.js 18+ with [tsx](https://github.com/privatenumber/tsx).
33
+
34
+ ---
35
+
36
+ ## Why
37
+
38
+ Agent skills are static, but users are not. When a skill undertriggers — when someone says "make me a slide deck" and the pptx skill doesn't fire — that failure is invisible. The user concludes "AI doesn't follow directions" rather than recognizing the skill description doesn't match how real people talk.
39
+
40
+ selftune closes this feedback loop.
41
+
42
+ ---
43
+
44
+ ## What It Does
45
+
46
+ | Capability | Description |
47
+ |---|---|
48
+ | **Session telemetry** | Captures per-session process metrics across all three platforms |
49
+ | **False negative detection** | Surfaces queries where a skill should have fired but didn't |
50
+ | **Eval set generation** | Converts hook logs into trigger eval sets with real usage as ground truth |
51
+ | **Session grading** | 3-tier evaluation (Trigger / Process / Quality) using the agent you already have |
52
+ | **Skill evolution** | Proposes improved descriptions, validates them, deploys with audit trail |
53
+ | **Post-deploy monitoring** | Watches evolved skills for regressions, auto-rollback on pass rate drops |
54
+
55
+ ---
56
+
57
+ ## Quick Start
58
+
59
+ ### 1. Initialize
60
+
61
+ ```bash
62
+ npx selftune@latest init
63
+ ```
64
+
65
+ The `init` command auto-detects your agent environment (Claude Code, Codex, or OpenCode), resolves the CLI path, determines the LLM mode, and writes config to `~/.selftune/config.json`. All subsequent commands read from this config.
66
+
67
+ Use `--agent claude_code|codex|opencode` to override detection, `--llm-mode agent|api` to override LLM mode, or `--force` to reinitialize.
68
+
69
+ ### 2. Install hooks (Claude Code)
70
+
71
+ If `init` reports hooks are not installed, merge the entries from `skill/settings_snippet.json` into `~/.claude/settings.json`. Replace `/PATH/TO/` with the absolute path to this repository.
72
+
73
+ ### 3. Verify setup
74
+
75
+ ```bash
76
+ selftune doctor
77
+ ```
78
+
79
+ Doctor checks log file health, hook installation, schema validity, and config status.
80
+
81
+ ### Platform-Specific Notes
82
+
83
+ **Claude Code** — Hooks capture telemetry automatically after installation. Zero configuration once hooks are in `settings.json`.
84
+
85
+ **Codex** — Use the wrapper for real-time capture or the batch ingestor for historical logs:
86
+ ```bash
87
+ selftune wrap-codex -- <your codex args>
88
+ selftune ingest-codex
89
+ ```
90
+
91
+ **OpenCode** — Backfill historical sessions from SQLite:
92
+ ```bash
93
+ selftune ingest-opencode
94
+ ```
95
+
96
+ All platforms write to the same shared JSONL log schema at `~/.claude/`.
97
+
98
+ ---
99
+
100
+ ## Commands
101
+
102
+ ```
103
+ selftune <command> [options]
104
+ ```
105
+
106
+ | Command | Purpose |
107
+ |---|---|
108
+ | `init` | Auto-detect agent environment, write `~/.selftune/config.json` |
109
+ | `grade --skill <name>` | Grade a session (3-tier: trigger, process, quality) |
110
+ | `evals --skill <name>` | Generate eval set from real usage logs |
111
+ | `evals --list-skills` | Show logged skills and query counts |
112
+ | `evolve --skill <name> --skill-path <path>` | Analyze failures, propose and deploy improved description |
113
+ | `rollback --skill <name> --skill-path <path>` | Restore pre-evolution description |
114
+ | `watch --skill <name> --skill-path <path>` | Monitor post-deploy pass rates, detect regressions |
115
+ | `doctor` | Health checks on logs, hooks, config, and schema |
116
+ | `ingest-codex` | Batch ingest Codex rollout logs |
117
+ | `ingest-opencode` | Backfill historical OpenCode sessions from SQLite |
118
+ | `wrap-codex -- <args>` | Real-time Codex wrapper with telemetry |
119
+
120
+ No separate API key required — grading and evolution use whatever agent CLI you already have installed. Set `ANTHROPIC_API_KEY` to use the API directly instead.
121
+
122
+ See `skill/Workflows/` for detailed step-by-step guides for each command.
123
+
124
+ ---
125
+
126
+ ## How It Works
127
+
128
+ ### Telemetry Capture
129
+
130
+ ```
131
+ Claude Code (hooks): OpenCode (hooks):
132
+ UserPromptSubmit → prompt-log.ts message.* → opencode-prompt-log.ts
133
+ PostToolUse → skill-eval.ts tool.execute.after → opencode-skill-eval.ts
134
+ Stop → session-stop.ts session.idle → opencode-session-stop.ts
135
+ │ │
136
+ └──────────┬─────────────────────────┘
137
+
138
+ Shared JSONL Log Schema (~/.claude/)
139
+ ├── all_queries_log.jsonl
140
+ ├── skill_usage_log.jsonl
141
+ └── session_telemetry_log.jsonl
142
+
143
+ Codex (wrapper/ingestor — hooks not yet available):
144
+ codex-wrapper.ts (real-time tee of JSONL stream)
145
+ codex-rollout.ts (batch ingest from rollout logs)
146
+
147
+ └──→ Same shared JSONL schema
148
+ ```
149
+
150
+ ### Eval & Grading
151
+
152
+ ```
153
+ selftune evals cross-references the two query logs:
154
+ Positives = skill_usage_log entries for target skill
155
+ Negatives = all_queries_log entries NOT in positives
156
+
157
+ selftune grade reads:
158
+ session_telemetry_log → process metrics (tool calls, errors, turns)
159
+ transcript JSONL → what actually happened
160
+ expectations → what should have happened
161
+ ```
162
+
163
+ ### Evolution Loop
164
+
165
+ ```
166
+ selftune evolve:
167
+ 1. Load eval set (or generate from logs)
168
+ 2. Extract failure patterns (missed queries grouped by invocation type)
169
+ 3. Generate improved description via LLM
170
+ 4. Validate against eval set (must improve, <5% regression)
171
+ 5. Deploy updated SKILL.md + PR + audit trail
172
+
173
+ selftune watch:
174
+ Monitor pass rate over sliding window of recent sessions
175
+ Alert (or auto-rollback) on regression > threshold
176
+ ```
177
+
178
+ ---
179
+
180
+ ## Architecture
181
+
182
+ ```
183
+ cli/selftune/
184
+ ├── index.ts CLI entry point (command router)
185
+ ├── init.ts Agent detection, config bootstrap
186
+ ├── types.ts, constants.ts Shared interfaces and constants
187
+ ├── observability.ts Health checks (doctor command)
188
+ ├── utils/ JSONL, transcript parsing, LLM calls, schema validation
189
+ ├── hooks/ Claude Code + OpenCode telemetry capture
190
+ ├── ingestors/ Codex adapters + OpenCode backfill
191
+ ├── eval/ False negative detection, eval set generation
192
+ ├── grading/ 3-tier session grading (agent or API mode)
193
+ ├── evolution/ Failure extraction, proposal, validation, deploy, rollback
194
+ └── monitoring/ Post-deploy regression detection
195
+
196
+ skill/
197
+ ├── SKILL.md Routing table (~120 lines)
198
+ ├── settings_snippet.json Claude Code hook config template
199
+ ├── references/ Domain knowledge (logs, grading methodology, taxonomy)
200
+ └── Workflows/ Step-by-step guides (1 per command)
201
+ ```
202
+
203
+ Dependencies flow forward only: `shared → hooks/ingestors → eval → grading → evolution → monitoring`. Enforced by `lint-architecture.ts`.
204
+
205
+ Config persists at `~/.selftune/config.json` (written by `init`, read by all commands via skill workflows).
206
+
207
+ See [ARCHITECTURE.md](ARCHITECTURE.md) for the full domain map and module rules.
208
+
209
+ ---
210
+
211
+ ## Log Schema
212
+
213
+ Four append-only JSONL files at `~/.claude/`:
214
+
215
+ | File | Record type | Key fields |
216
+ |---|---|---|
217
+ | `all_queries_log.jsonl` | `QueryLogRecord` | `timestamp`, `session_id`, `query`, `source?` |
218
+ | `skill_usage_log.jsonl` | `SkillUsageRecord` | `timestamp`, `session_id`, `skill_name`, `query`, `triggered` |
219
+ | `session_telemetry_log.jsonl` | `SessionTelemetryRecord` | `timestamp`, `session_id`, `tool_calls`, `bash_commands`, `skills_triggered`, `errors_encountered` |
220
+ | `evolution_audit_log.jsonl` | `EvolutionAuditEntry` | `timestamp`, `proposal_id`, `action`, `details`, `eval_snapshot?` |
221
+
222
+ The `source` field identifies the platform: `claude_code`, `codex`, or `opencode`.
223
+
224
+ ---
225
+
226
+ ## Development
227
+
228
+ ```bash
229
+ make check # lint + architecture lint + all tests
230
+ make lint # biome check + architecture lint
231
+ make test # bun test
232
+ ```
233
+
234
+ Zero runtime dependencies. Uses Bun built-ins only.
235
+
236
+ ---
237
+
238
+ ## Tips
239
+
240
+ - Run `selftune init` first — everything else reads from the config it writes.
241
+ - Let logs accumulate over several days before running evals — more diverse real queries = more reliable signal.
242
+ - All hooks are silent (exit 0) and take <50ms. Negligible overhead.
243
+ - Logs are append-only JSONL. Safe to delete to start fresh, or archive old files.
244
+ - Use `--max 75` to increase eval set size once you have enough data.
245
+ - Use `--seed 123` for a different random sample of negatives.
246
+ - Use `--dry-run` with `evolve` to preview proposals without deploying.
247
+ - The `doctor` command checks log health, hook presence, config status, and schema validity.
248
+
249
+ ---
250
+
251
+ ## Milestones
252
+
253
+ | Version | Scope | Status |
254
+ |---|---|---|
255
+ | v0.1 | Hooks, ingestors, shared schema, eval generation | Done |
256
+ | v0.2 | Session grading, grader skill | Done |
257
+ | v0.3 | Evolution loop (propose, validate, deploy, rollback) | Done |
258
+ | v0.4 | Post-deploy monitoring, regression detection | Done |
259
+ | v0.5 | Agent-first skill restructure, `init` command, config bootstrap | Done |
package/bin/selftune.cjs ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { execFileSync } = require("child_process");
4
+ const { join } = require("path");
5
+
6
// Absolute path of the TypeScript CLI entry point, resolved relative to this script.
const entrypoint = join(__dirname, "..", "cli", "selftune", "index.ts");

// Arguments given by the user, forwarded unchanged to the CLI.
const forwardedArgs = process.argv.slice(2);

// Candidate TypeScript runtimes, tried in order: bun first, then tsx through npx.
const runners = [
  ["bun", [entrypoint, ...forwardedArgs]],
  ["npx", ["tsx", entrypoint, ...forwardedArgs]],
];

for (const [cmd, args] of runners) {
  try {
    // stdio is inherited so the CLI talks directly to the user's terminal.
    execFileSync(cmd, args, { stdio: "inherit" });
    process.exit(0);
  } catch (e) {
    // The runner started and exited non-zero: propagate its exit code.
    // The check must be "is a number", not "is not undefined": `status` is
    // null when the child was killed by a signal, and may also be null when
    // the runner could not be spawned at all (e.g. ENOENT because bun is not
    // installed). `process.exit(null)` would then exit 0 without ever trying
    // the next runner.
    if (typeof e.status === "number") {
      process.exit(e.status);
    }
    // The runner started but was terminated by a signal: report failure
    // instead of re-running the same command under the next runner.
    if (e.signal) {
      process.exit(1);
    }
    // Otherwise the runner could not be launched; try the next candidate.
  }
}

// No candidate runtime could be launched: emit a machine-readable error.
console.error(
  JSON.stringify({
    error: "No TypeScript runtime found. Install bun (https://bun.sh) or tsx (npx tsx).",
  })
);
process.exit(1);
package/cli/selftune/constants.ts ADDED
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Shared constants for selftune.
3
+ */
4
+
5
+ import { homedir } from "node:os";
6
+ import { join } from "node:path";
7
+
8
/** Directory that holds selftune's own configuration (`.selftune` in the user's home). */
export const SELFTUNE_CONFIG_DIR = join(homedir(), ".selftune");

/** Path of the config file inside `SELFTUNE_CONFIG_DIR`. */
export const SELFTUNE_CONFIG_PATH = join(SELFTUNE_CONFIG_DIR, "config.json");

/** Directory that holds the JSONL log files (`.claude` in the user's home). */
export const LOG_DIR = join(homedir(), ".claude");

/** Resolves a log file name to its path inside `LOG_DIR`. */
const logPath = (fileName: string): string => join(LOG_DIR, fileName);

export const TELEMETRY_LOG = logPath("session_telemetry_log.jsonl");
export const SKILL_LOG = logPath("skill_usage_log.jsonl");
export const QUERY_LOG = logPath("all_queries_log.jsonl");
export const EVOLUTION_AUDIT_LOG = logPath("evolution_audit_log.jsonl");
17
+
18
/** Names of the tools Claude Code uses, in their canonical spelling. */
const claudeCodeToolNames: string[] = [
  "Read",
  "Write",
  "Edit",
  "MultiEdit",
  "Bash",
  "Glob",
  "Grep",
  "WebFetch",
  "WebSearch",
  "Task",
  "TodoRead",
  "TodoWrite",
];

/** Set of tool names Claude Code uses, for membership checks. */
export const KNOWN_TOOLS = new Set(claudeCodeToolNames);
33
+
34
/**
 * Leading markers of automated or tool-injected content, as opposed to
 * real user prompts.
 */
export const SKIP_PREFIXES = [
  "<tool_result",
  "<function_result",
  "[Automated",
  "[System",
] as const;
36
+
37
/**
 * Generic queries used as fallback negatives to pad an eval set when too few
 * real negatives are available.
 */
export const GENERIC_NEGATIVES = [
  // General knowledge and small talk
  "What time is it?",
  "Tell me a joke",
  "Summarize this paragraph",
  "What is the capital of France?",
  // Programming help and miscellaneous requests
  "Help me debug this Python error",
  "Write a haiku about autumn",
  "Explain what recursion means",
  "How do I reverse a string in JavaScript?",
  "What is 42 times 17?",
  "Translate 'hello' to Spanish",
  // Code review and explanation
  "Can you review this code?",
  "What does this error mean?",
  "Help me write a commit message",
  "Explain this function to me",
  "How do I center a div in CSS?",
] as const;
55
+
56
/** Field names each log type must carry, keyed by log type. */
const requiredFieldNames: Record<string, string[]> = {
  session_telemetry: ["timestamp", "session_id", "source"],
  skill_usage: ["timestamp", "session_id", "skill_name"],
  all_queries: ["timestamp", "session_id", "query"],
  evolution_audit: ["timestamp", "proposal_id", "action"],
};

/** Required fields per log type, as sets (for schema validation). */
export const REQUIRED_FIELDS: Record<string, Set<string>> = Object.fromEntries(
  Object.entries(requiredFieldNames).map(([logType, names]) => [logType, new Set(names)]),
);
63
+
64
/** Agent CLI names to probe, listed in detection order. */
export const AGENT_CANDIDATES = [
  "claude",
  "codex",
  "opencode",
] as const;

/** Anthropic messages endpoint used when grading directly through the API. */
export const API_URL = "https://api.anthropic.com/v1/messages";

/** Model identifier used by default for direct API grading. */
export const MODEL = "claude-sonnet-4-20250514";