npm - selftune - Versions diffs - 0.1.0 - Mend

selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +23 -0
package/README.md +259 -0
package/bin/selftune.cjs +29 -0
package/cli/selftune/constants.ts +71 -0
package/cli/selftune/eval/hooks-to-evals.ts +422 -0
package/cli/selftune/evolution/audit.ts +44 -0
package/cli/selftune/evolution/deploy-proposal.ts +244 -0
package/cli/selftune/evolution/evolve.ts +406 -0
package/cli/selftune/evolution/extract-patterns.ts +145 -0
package/cli/selftune/evolution/propose-description.ts +146 -0
package/cli/selftune/evolution/rollback.ts +242 -0
package/cli/selftune/evolution/stopping-criteria.ts +69 -0
package/cli/selftune/evolution/validate-proposal.ts +137 -0
package/cli/selftune/grading/grade-session.ts +459 -0
package/cli/selftune/hooks/prompt-log.ts +52 -0
package/cli/selftune/hooks/session-stop.ts +54 -0
package/cli/selftune/hooks/skill-eval.ts +73 -0
package/cli/selftune/index.ts +104 -0
package/cli/selftune/ingestors/codex-rollout.ts +416 -0
package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
package/cli/selftune/init.ts +297 -0
package/cli/selftune/monitoring/watch.ts +328 -0
package/cli/selftune/observability.ts +255 -0
package/cli/selftune/types.ts +255 -0
package/cli/selftune/utils/jsonl.ts +75 -0
package/cli/selftune/utils/llm-call.ts +192 -0
package/cli/selftune/utils/logging.ts +40 -0
package/cli/selftune/utils/schema-validator.ts +47 -0
package/cli/selftune/utils/seeded-random.ts +31 -0
package/cli/selftune/utils/transcript.ts +260 -0
package/package.json +29 -0
package/skill/SKILL.md +120 -0
package/skill/Workflows/Doctor.md +145 -0
package/skill/Workflows/Evals.md +193 -0
package/skill/Workflows/Evolve.md +159 -0
package/skill/Workflows/Grade.md +157 -0
package/skill/Workflows/Ingest.md +159 -0
package/skill/Workflows/Initialize.md +125 -0
package/skill/Workflows/Rollback.md +131 -0
package/skill/Workflows/Watch.md +128 -0
package/skill/references/grading-methodology.md +176 -0
package/skill/references/invocation-taxonomy.md +144 -0
package/skill/references/logs.md +168 -0
package/skill/settings_snippet.json +41 -0

package/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,23 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/),
+and this project adheres to [Semantic Versioning](https://semver.org/).
+## [0.1.0] - 2026-02-28
+### Added
+- CLI entry point with 10 commands: `init`, `evals`, `grade`, `evolve`, `rollback`, `watch`, `doctor`, `ingest-codex`, `ingest-opencode`, `wrap-codex`
+- Agent auto-detection for Claude Code, Codex, and OpenCode
+- Telemetry hooks for Claude Code (`prompt-log`, `skill-eval`, `session-stop`)
+- Codex wrapper and batch ingestor for rollout logs
+- OpenCode session backfill from SQLite
+- False negative detection and eval set generation from real usage logs
+- 3-tier session grading (Trigger / Process / Quality)
+- Skill evolution loop: extract patterns, propose description, validate, deploy
+- Post-deploy monitoring with sliding window regression detection and auto-rollback
+- Health check system (`doctor` command)
+- Architecture enforcement via custom lint rules
+- Comprehensive test suite (27 test files)

package/README.md ADDED Viewed

@@ -0,0 +1,259 @@
+# selftune — Skill Observability & Continuous Improvement CLI
+[![npm version](https://img.shields.io/npm/v/selftune)](https://www.npmjs.com/package/selftune)
+[![CI](https://github.com/WellDunDun/douala/actions/workflows/ci.yml/badge.svg)](https://github.com/WellDunDun/douala/actions/workflows/ci.yml)
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+![Zero Dependencies](https://img.shields.io/badge/dependencies-0-brightgreen)
+[![Bun](https://img.shields.io/badge/runtime-bun%20%7C%20node-black)](https://bun.sh)
+Observe real sessions, detect missed triggers, grade execution quality, and automatically evolve skill descriptions toward the language real users actually use.
+Works with **Claude Code**, **Codex**, and **OpenCode**.
+```
+Observe → Detect → Diagnose → Propose → Validate → Deploy → Watch → Repeat
+```
+---
+## Install
+```bash
+npx selftune@latest doctor
+```
+Or install globally:
+```bash
+npm install -g selftune
+selftune doctor
+```
+Requires [Bun](https://bun.sh) or Node.js 18+ with [tsx](https://github.com/privatenumber/tsx).
+---
+## Why
+Agent skills are static, but users are not. When a skill undertriggers — when someone says "make me a slide deck" and the pptx skill doesn't fire — that failure is invisible. The user concludes "AI doesn't follow directions" rather than recognizing the skill description doesn't match how real people talk.
+selftune closes this feedback loop.
+---
+## What It Does
+| Capability | Description |
+|---|---|
+| **Session telemetry** | Captures per-session process metrics across all three platforms |
+| **False negative detection** | Surfaces queries where a skill should have fired but didn't |
+| **Eval set generation** | Converts hook logs into trigger eval sets with real usage as ground truth |
+| **Session grading** | 3-tier evaluation (Trigger / Process / Quality) using the agent you already have |
+| **Skill evolution** | Proposes improved descriptions, validates them, deploys with audit trail |
+| **Post-deploy monitoring** | Watches evolved skills for regressions, auto-rollback on pass rate drops |
+---
+## Quick Start
+### 1. Initialize
+```bash
+npx selftune@latest init
+```
+The `init` command auto-detects your agent environment (Claude Code, Codex, or OpenCode), resolves the CLI path, determines the LLM mode, and writes config to `~/.selftune/config.json`. All subsequent commands read from this config.
+Use `--agent claude_code|codex|opencode` to override detection, `--llm-mode agent|api` to override LLM mode, or `--force` to reinitialize.
+### 2. Install hooks (Claude Code)
+If `init` reports hooks are not installed, merge the entries from `skill/settings_snippet.json` into `~/.claude/settings.json`. Replace `/PATH/TO/` with the absolute path to this repository.
+### 3. Verify setup
+```bash
+selftune doctor
+```
+Doctor checks log file health, hook installation, schema validity, and config status.
+### Platform-Specific Notes
+**Claude Code** — Hooks capture telemetry automatically after installation. Zero configuration once hooks are in `settings.json`.
+**Codex** — Use the wrapper for real-time capture or the batch ingestor for historical logs:
+```bash
+selftune wrap-codex -- <your codex args>
+selftune ingest-codex
+```
+**OpenCode** — Backfill historical sessions from SQLite:
+```bash
+selftune ingest-opencode
+```
+All platforms write their logs to `~/.claude/` using the same shared JSONL schema.
+---
+## Commands
+```
+selftune <command> [options]
+```
+| Command | Purpose |
+|---|---|
+| `init` | Auto-detect agent environment, write `~/.selftune/config.json` |
+| `grade --skill <name>` | Grade a session (3-tier: trigger, process, quality) |
+| `evals --skill <name>` | Generate eval set from real usage logs |
+| `evals --list-skills` | Show logged skills and query counts |
+| `evolve --skill <name> --skill-path <path>` | Analyze failures, propose and deploy improved description |
+| `rollback --skill <name> --skill-path <path>` | Restore pre-evolution description |
+| `watch --skill <name> --skill-path <path>` | Monitor post-deploy pass rates, detect regressions |
+| `doctor` | Health checks on logs, hooks, config, and schema |
+| `ingest-codex` | Batch ingest Codex rollout logs |
+| `ingest-opencode` | Backfill historical OpenCode sessions from SQLite |
+| `wrap-codex -- <args>` | Real-time Codex wrapper with telemetry |
+No separate API key required — grading and evolution use whatever agent CLI you already have installed. Set `ANTHROPIC_API_KEY` to use the API directly instead.
+See `skill/Workflows/` for detailed step-by-step guides for each command.
+---
+## How It Works
+### Telemetry Capture
+```
+Claude Code (hooks):                 OpenCode (hooks):
+  UserPromptSubmit → prompt-log.ts     message.*        → opencode-prompt-log.ts
+  PostToolUse      → skill-eval.ts     tool.execute.after → opencode-skill-eval.ts
+  Stop             → session-stop.ts   session.idle     → opencode-session-stop.ts
+          │                                    │
+          └──────────┬─────────────────────────┘
+                     ▼
+          Shared JSONL Log Schema (~/.claude/)
+            ├── all_queries_log.jsonl
+            ├── skill_usage_log.jsonl
+            └── session_telemetry_log.jsonl
+Codex (wrapper/ingestor — hooks not yet available):
+  codex-wrapper.ts  (real-time tee of JSONL stream)
+  codex-rollout.ts  (batch ingest from rollout logs)
+          │
+          └──→ Same shared JSONL schema
+```
+### Eval & Grading
+```
+selftune evals cross-references the two query logs:
+  Positives  = skill_usage_log entries for target skill
+  Negatives  = all_queries_log entries NOT in positives
+selftune grade reads:
+  session_telemetry_log → process metrics (tool calls, errors, turns)
+  transcript JSONL       → what actually happened
+  expectations           → what should have happened
+```
+### Evolution Loop
+```
+selftune evolve:
+  1. Load eval set (or generate from logs)
+  2. Extract failure patterns (missed queries grouped by invocation type)
+  3. Generate improved description via LLM
+  4. Validate against eval set (must improve, <5% regression)
+  5. Deploy updated SKILL.md + PR + audit trail
+selftune watch:
+  Monitor pass rate over sliding window of recent sessions
+  Alert (or auto-rollback) on regression > threshold
+```
+---
+## Architecture
+```
+cli/selftune/
+├── index.ts                     CLI entry point (command router)
+├── init.ts                      Agent detection, config bootstrap
+├── types.ts, constants.ts       Shared interfaces and constants
+├── observability.ts             Health checks (doctor command)
+├── utils/                       JSONL, transcript parsing, LLM calls, schema validation
+├── hooks/                       Claude Code + OpenCode telemetry capture
+├── ingestors/                   Codex adapters + OpenCode backfill
+├── eval/                        False negative detection, eval set generation
+├── grading/                     3-tier session grading (agent or API mode)
+├── evolution/                   Failure extraction, proposal, validation, deploy, rollback
+└── monitoring/                  Post-deploy regression detection
+skill/
+├── SKILL.md                     Routing table (~120 lines)
+├── settings_snippet.json        Claude Code hook config template
+├── references/                  Domain knowledge (logs, grading methodology, taxonomy)
+└── Workflows/                   Step-by-step guides (1 per command)
+```
+Dependencies flow forward only: `shared → hooks/ingestors → eval → grading → evolution → monitoring`. Enforced by `lint-architecture.ts`.
+Config persists at `~/.selftune/config.json` (written by `init`, read by all commands via skill workflows).
+See [ARCHITECTURE.md](ARCHITECTURE.md) for the full domain map and module rules.
+---
+## Log Schema
+Four append-only JSONL files at `~/.claude/`:
+| File | Record type | Key fields |
+|---|---|---|
+| `all_queries_log.jsonl` | `QueryLogRecord` | `timestamp`, `session_id`, `query`, `source?` |
+| `skill_usage_log.jsonl` | `SkillUsageRecord` | `timestamp`, `session_id`, `skill_name`, `query`, `triggered` |
+| `session_telemetry_log.jsonl` | `SessionTelemetryRecord` | `timestamp`, `session_id`, `tool_calls`, `bash_commands`, `skills_triggered`, `errors_encountered` |
+| `evolution_audit_log.jsonl` | `EvolutionAuditEntry` | `timestamp`, `proposal_id`, `action`, `details`, `eval_snapshot?` |
+The `source` field identifies the platform: `claude_code`, `codex`, or `opencode`.
+---
+## Development
+```bash
+make check    # lint + architecture lint + all tests
+make lint     # biome check + architecture lint
+make test     # bun test
+```
+Zero runtime dependencies. Uses Bun built-ins only.
+---
+## Tips
+- Run `selftune init` first — everything else reads from the config it writes.
+- Let logs accumulate over several days before running evals — more diverse real queries = more reliable signal.
+- All hooks are silent (exit 0) and take <50ms. Negligible overhead.
+- Logs are append-only JSONL. Safe to delete to start fresh, or archive old files.
+- Use `--max 75` to increase eval set size once you have enough data.
+- Use `--seed 123` for a different random sample of negatives.
+- Use `--dry-run` with `evolve` to preview proposals without deploying.
+- The `doctor` command checks log health, hook presence, config status, and schema validity.
+---
+## Milestones
+| Version | Scope | Status |
+|---|---|---|
+| v0.1 | Hooks, ingestors, shared schema, eval generation | Done |
+| v0.2 | Session grading, grader skill | Done |
+| v0.3 | Evolution loop (propose, validate, deploy, rollback) | Done |
+| v0.4 | Post-deploy monitoring, regression detection | Done |
+| v0.5 | Agent-first skill restructure, `init` command, config bootstrap | Done |

package/bin/selftune.cjs ADDED Viewed

@@ -0,0 +1,29 @@
+#!/usr/bin/env node
+const { execFileSync } = require("child_process");
+const { join } = require("path");
+const entrypoint = join(__dirname, "..", "cli", "selftune", "index.ts");
+const runners = [
+  ["bun", [entrypoint, ...process.argv.slice(2)]],
+  ["npx", ["tsx", entrypoint, ...process.argv.slice(2)]],
+];
+for (const [cmd, args] of runners) {
+  try {
+    execFileSync(cmd, args, { stdio: "inherit" });
+    process.exit(0);
+  } catch (e) {
+    if (e.status !== undefined) {
+      process.exit(e.status);
+    }
+  }
+}
+console.error(
+  JSON.stringify({
+    error: "No TypeScript runtime found. Install bun (https://bun.sh) or tsx (npx tsx).",
+  })
+);
+process.exit(1);

package/cli/selftune/constants.ts ADDED Viewed

@@ -0,0 +1,71 @@
+/**
+ * Shared constants for selftune.
+ */
+import { homedir } from "node:os";
+import { join } from "node:path";
+/** Directory `.selftune` under the user's home directory, holding selftune's own config. */
+export const SELFTUNE_CONFIG_DIR = join(homedir(), ".selftune");
+/** Path of `config.json` inside SELFTUNE_CONFIG_DIR. */
+export const SELFTUNE_CONFIG_PATH = join(SELFTUNE_CONFIG_DIR, "config.json");
+/** Directory `.claude` under the user's home directory; all JSONL log paths below are inside it. */
+export const LOG_DIR = join(homedir(), ".claude");
+/** Path of the session telemetry log (`session_telemetry_log.jsonl`). */
+export const TELEMETRY_LOG = join(LOG_DIR, "session_telemetry_log.jsonl");
+/** Path of the skill usage log (`skill_usage_log.jsonl`). */
+export const SKILL_LOG = join(LOG_DIR, "skill_usage_log.jsonl");
+/** Path of the log of all queries (`all_queries_log.jsonl`). */
+export const QUERY_LOG = join(LOG_DIR, "all_queries_log.jsonl");
+/** Path of the evolution audit log (`evolution_audit_log.jsonl`). */
+export const EVOLUTION_AUDIT_LOG = join(LOG_DIR, "evolution_audit_log.jsonl");
+/** Tool names Claude Code uses. */
+export const KNOWN_TOOLS = new Set([
+  "Read",
+  "Write",
+  "Edit",
+  "MultiEdit",
+  "Bash",
+  "Glob",
+  "Grep",
+  "WebFetch",
+  "WebSearch",
+  "Task",
+  "TodoRead",
+  "TodoWrite",
+]);
+/** Prefixes indicating automated/tool-injected content, not real user prompts. */
+export const SKIP_PREFIXES = ["<tool_result", "<function_result", "[Automated", "[System"] as const;
+/** Fallback negatives for padding eval sets when real negatives are sparse. */
+export const GENERIC_NEGATIVES = [
+  "What time is it?",
+  "Tell me a joke",
+  "Summarize this paragraph",
+  "What is the capital of France?",
+  "Help me debug this Python error",
+  "Write a haiku about autumn",
+  "Explain what recursion means",
+  "How do I reverse a string in JavaScript?",
+  "What is 42 times 17?",
+  "Translate 'hello' to Spanish",
+  "Can you review this code?",
+  "What does this error mean?",
+  "Help me write a commit message",
+  "Explain this function to me",
+  "How do I center a div in CSS?",
+] as const;
+/**
+ * Required fields per log type (for schema validation).
+ * Keys are log type names; each value is the set of field names that must be
+ * present on a record of that type.
+ */
+export const REQUIRED_FIELDS: Record<string, Set<string>> = {
+  session_telemetry: new Set(["timestamp", "session_id", "source"]),
+  skill_usage: new Set(["timestamp", "session_id", "skill_name"]),
+  all_queries: new Set(["timestamp", "session_id", "query"]),
+  evolution_audit: new Set(["timestamp", "proposal_id", "action"]),
+};
+/** Agent CLI candidates in detection order. */
+export const AGENT_CANDIDATES = ["claude", "codex", "opencode"] as const;
+/** Anthropic API URL for direct grading. */
+export const API_URL = "https://api.anthropic.com/v1/messages";
+/** Default model for direct API grading. */
+export const MODEL = "claude-sonnet-4-20250514";