npm - selftune - Versions diffs - 0.1.0 - Mend

selftune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +23 -0
package/README.md +259 -0
package/bin/selftune.cjs +29 -0
package/cli/selftune/constants.ts +71 -0
package/cli/selftune/eval/hooks-to-evals.ts +422 -0
package/cli/selftune/evolution/audit.ts +44 -0
package/cli/selftune/evolution/deploy-proposal.ts +244 -0
package/cli/selftune/evolution/evolve.ts +406 -0
package/cli/selftune/evolution/extract-patterns.ts +145 -0
package/cli/selftune/evolution/propose-description.ts +146 -0
package/cli/selftune/evolution/rollback.ts +242 -0
package/cli/selftune/evolution/stopping-criteria.ts +69 -0
package/cli/selftune/evolution/validate-proposal.ts +137 -0
package/cli/selftune/grading/grade-session.ts +459 -0
package/cli/selftune/hooks/prompt-log.ts +52 -0
package/cli/selftune/hooks/session-stop.ts +54 -0
package/cli/selftune/hooks/skill-eval.ts +73 -0
package/cli/selftune/index.ts +104 -0
package/cli/selftune/ingestors/codex-rollout.ts +416 -0
package/cli/selftune/ingestors/codex-wrapper.ts +332 -0
package/cli/selftune/ingestors/opencode-ingest.ts +565 -0
package/cli/selftune/init.ts +297 -0
package/cli/selftune/monitoring/watch.ts +328 -0
package/cli/selftune/observability.ts +255 -0
package/cli/selftune/types.ts +255 -0
package/cli/selftune/utils/jsonl.ts +75 -0
package/cli/selftune/utils/llm-call.ts +192 -0
package/cli/selftune/utils/logging.ts +40 -0
package/cli/selftune/utils/schema-validator.ts +47 -0
package/cli/selftune/utils/seeded-random.ts +31 -0
package/cli/selftune/utils/transcript.ts +260 -0
package/package.json +29 -0
package/skill/SKILL.md +120 -0
package/skill/Workflows/Doctor.md +145 -0
package/skill/Workflows/Evals.md +193 -0
package/skill/Workflows/Evolve.md +159 -0
package/skill/Workflows/Grade.md +157 -0
package/skill/Workflows/Ingest.md +159 -0
package/skill/Workflows/Initialize.md +125 -0
package/skill/Workflows/Rollback.md +131 -0
package/skill/Workflows/Watch.md +128 -0
package/skill/references/grading-methodology.md +176 -0
package/skill/references/invocation-taxonomy.md +144 -0
package/skill/references/logs.md +168 -0
package/skill/settings_snippet.json +41 -0

package/cli/selftune/hooks/skill-eval.ts ADDED Viewed

@@ -0,0 +1,73 @@
+#!/usr/bin/env bun
+/**
+ * Claude Code PostToolUse hook: skill-eval.ts
+ *
+ * Fires whenever Claude reads a file. If that file is a SKILL.md, this hook:
+ *   1. Finds the triggering user query from the transcript JSONL
+ *   2. Appends a usage record to ~/.claude/skill_usage_log.jsonl
+ *
+ * This builds a real-usage eval dataset over time, seeding the
+ * `should_trigger: true` half of trigger evals.
+ */
+import { basename, dirname } from "node:path";
+import { SKILL_LOG } from "../constants.js";
+import type { PostToolUsePayload, SkillUsageRecord } from "../types.js";
+import { appendJsonl } from "../utils/jsonl.js";
+import { getLastUserMessage } from "../utils/transcript.js";
/**
 * Extract the skill folder name from a file path ending in SKILL.md.
 *
 * The file-name comparison is case-insensitive. The skill name is the name of
 * the directory that directly contains the file.
 *
 * @param filePath - Path of the file that was read.
 * @returns The containing folder's name; "unknown" when the path carries no
 *   usable folder name (e.g. "/SKILL.md", "SKILL.md", "./SKILL.md",
 *   "../SKILL.md"); or null when the file is not a SKILL.md at all.
 */
export function extractSkillName(filePath: string): string | null {
  if (basename(filePath).toUpperCase() !== "SKILL.MD") return null;
  const folder = basename(dirname(filePath));
  // dirname() yields "." for a bare file name and ".." survives for
  // "../SKILL.md"; neither identifies a skill folder, so treat them like "".
  if (folder === "" || folder === "." || folder === "..") return "unknown";
  return folder;
}
/**
 * Handle a single PostToolUse payload. Exported so it can be driven directly
 * in tests instead of through stdin.
 *
 * @param payload - Hook payload; only `Read` calls whose `file_path` names a
 *   SKILL.md file produce a record.
 * @param logPath - JSONL file the usage record is appended to
 *   (defaults to SKILL_LOG).
 * @returns The record that was appended, or null when the payload was skipped.
 */
export function processToolUse(
  payload: PostToolUsePayload,
  logPath: string = SKILL_LOG,
): SkillUsageRecord | null {
  // Anything other than a Read call is irrelevant to skill tracking.
  if (payload.tool_name !== "Read") {
    return null;
  }

  // A missing or non-string file_path collapses to "", which is never a skill file.
  const candidate = payload.tool_input?.file_path;
  const filePath = typeof candidate === "string" ? candidate : "";

  const skillName = extractSkillName(filePath);
  if (skillName === null) {
    return null;
  }

  const lastQuery = getLastUserMessage(payload.transcript_path ?? "");
  const record: SkillUsageRecord = {
    timestamp: new Date().toISOString(),
    session_id: payload.session_id ?? "unknown",
    skill_name: skillName,
    skill_path: filePath,
    query: lastQuery ?? "(query not found)",
    triggered: true,
    source: "claude_code",
  };

  appendJsonl(logPath, record);
  return record;
}
// --- stdin main: runs only on direct execution, never when imported ---
if (import.meta.main) {
  try {
    // The hook payload arrives as a single JSON document on stdin.
    const rawPayload = await Bun.stdin.text();
    const payload: PostToolUsePayload = JSON.parse(rawPayload);
    processToolUse(payload);
  } catch {
    // silent — hooks must never block Claude
  }
  // Always report success, whether or not a record was written.
  process.exit(0);
}

package/cli/selftune/index.ts ADDED Viewed

@@ -0,0 +1,104 @@
+#!/usr/bin/env bun
+/**
+ * selftune CLI entry point.
+ *
+ * Usage:
+ *   selftune init [options]           — Initialize agent identity and config
+ *   selftune evals [options]          — Generate eval sets from hook logs
+ *   selftune grade [options]          — Grade a skill session
+ *   selftune ingest-codex [options]   — Ingest Codex rollout logs
+ *   selftune ingest-opencode [options] — Ingest OpenCode sessions
+ *   selftune wrap-codex [options]     — Wrap codex exec with telemetry
+ *   selftune evolve [options]         — Evolve a skill description via failure patterns
+ *   selftune rollback [options]       — Rollback a skill to its pre-evolution state
+ *   selftune watch [options]          — Monitor post-deploy skill health
+ *   selftune doctor                   — Run health checks
+ */
const command = process.argv[2];

// No subcommand, or an explicit help flag: print top-level usage and stop.
const HELP_FLAGS = new Set(["--help", "-h"]);
if (!command || HELP_FLAGS.has(command)) {
  console.log(`selftune — Skill observability and continuous improvement
Usage:
  selftune <command> [options]
Commands:
  init               Initialize agent identity and config
  evals              Generate eval sets from hook logs
  grade              Grade a skill session
  ingest-codex       Ingest Codex rollout logs
  ingest-opencode    Ingest OpenCode sessions
  wrap-codex         Wrap codex exec with telemetry
  evolve             Evolve a skill description via failure patterns
  rollback           Rollback a skill to its pre-evolution state
  watch              Monitor post-deploy skill health
  doctor             Run health checks
Run 'selftune <command> --help' for command-specific options.`);
  process.exit(0);
}

// Remove the subcommand name from argv so each subcommand parses only its own
// options.
process.argv = [...process.argv.slice(0, 2), ...process.argv.slice(3)];

// Dispatch table: subcommand name -> loader.
// Modules are imported dynamically so only the requested one is loaded, and
// each exposes an entry point that is called explicitly because
// import.meta.main is false for dynamically imported modules.
type SubcommandRunner = () => Promise<void>;
const subcommands = new Map<string, SubcommandRunner>([
  [
    "init",
    async () => {
      const { cliMain } = await import("./init.js");
      await cliMain();
    },
  ],
  [
    "evals",
    async () => {
      const { cliMain } = await import("./eval/hooks-to-evals.js");
      cliMain();
    },
  ],
  [
    "grade",
    async () => {
      const { cliMain } = await import("./grading/grade-session.js");
      await cliMain();
    },
  ],
  [
    "ingest-codex",
    async () => {
      const { cliMain } = await import("./ingestors/codex-rollout.js");
      cliMain();
    },
  ],
  [
    "ingest-opencode",
    async () => {
      const { cliMain } = await import("./ingestors/opencode-ingest.js");
      cliMain();
    },
  ],
  [
    "wrap-codex",
    async () => {
      const { cliMain } = await import("./ingestors/codex-wrapper.js");
      await cliMain();
    },
  ],
  [
    "evolve",
    async () => {
      const { cliMain } = await import("./evolution/evolve.js");
      await cliMain();
    },
  ],
  [
    "rollback",
    async () => {
      const { cliMain } = await import("./evolution/rollback.js");
      await cliMain();
    },
  ],
  [
    "watch",
    async () => {
      const { cliMain } = await import("./monitoring/watch.js");
      await cliMain();
    },
  ],
  [
    "doctor",
    async () => {
      // doctor prints its report as JSON and maps health onto the exit code.
      const { doctor } = await import("./observability.js");
      const result = doctor();
      console.log(JSON.stringify(result, null, 2));
      process.exit(result.healthy ? 0 : 1);
    },
  ],
]);

const runSubcommand = subcommands.get(command);
if (runSubcommand === undefined) {
  console.error(`Unknown command: ${command}\nRun 'selftune --help' for available commands.`);
  process.exit(1);
}
await runSubcommand();

package/cli/selftune/ingestors/codex-rollout.ts ADDED Viewed

@@ -0,0 +1,416 @@
+#!/usr/bin/env bun
+/**
+ * Codex rollout ingestor: codex-rollout.ts
+ *
+ * Retroactively ingests Codex's auto-written rollout logs into our shared
+ * skill eval log format.
+ *
+ * Codex CLI saves every session to:
+ *   $CODEX_HOME/sessions/YYYY/MM/DD/rollout-<thread_id>.jsonl
+ *
+ * This script scans those files and populates:
+ *   ~/.claude/all_queries_log.jsonl
+ *   ~/.claude/session_telemetry_log.jsonl
+ *   ~/.claude/skill_usage_log.jsonl
+ *
+ * Usage:
+ *   bun codex-rollout.ts
+ *   bun codex-rollout.ts --since 2026-01-01
+ *   bun codex-rollout.ts --codex-home /custom/path
+ *   bun codex-rollout.ts --dry-run
+ *   bun codex-rollout.ts --force
+ */
+import { existsSync, readFileSync, readdirSync, statSync } from "node:fs";
+import { homedir } from "node:os";
+import { basename, join } from "node:path";
+import { parseArgs } from "node:util";
+import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
+import type { QueryLogRecord, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
+import { appendJsonl, loadMarker, saveMarker } from "../utils/jsonl.js";
/** Marker file under ~/.claude that records which rollout files were already ingested. */
const MARKER_FILE = join(homedir(), ".claude", "codex_ingested_rollouts.json");

/** Codex home directory: $CODEX_HOME when set, otherwise ~/.codex. */
const DEFAULT_CODEX_HOME = process.env.CODEX_HOME ?? join(homedir(), ".codex");

/** Skill directories to scan: the current working directory's first, then the user's. */
const CODEX_SKILLS_DIRS = [process.cwd(), homedir()].map((base) =>
  join(base, ".codex", "skills"),
);
/**
 * Return skill names from Codex skill directories.
 *
 * A skill is any sub-directory that directly contains a SKILL.md file; the
 * sub-directory's name is used as the skill name.
 *
 * @param dirs - Directories to scan (defaults to CODEX_SKILLS_DIRS). Paths
 *   that are missing, are not directories, or cannot be read are skipped.
 * @returns The set of skill names found across all scanned directories.
 */
export function findSkillNames(dirs: string[] = CODEX_SKILLS_DIRS): Set<string> {
  const names = new Set<string>();
  for (const dir of dirs) {
    let entries: string[];
    try {
      entries = readdirSync(dir);
    } catch {
      // missing path, regular file, or permission error — nothing to scan here
      continue;
    }
    for (const entry of entries) {
      const skillDir = join(dir, entry);
      try {
        // statSync follows symlinks, so symlinked skill folders are included.
        if (statSync(skillDir).isDirectory() && existsSync(join(skillDir, "SKILL.md"))) {
          names.add(entry);
        }
      } catch {
        // skip entries that can't be stat'd (broken symlinks, permission errors, etc.)
      }
    }
  }
  return names;
}
/** List a directory's entries in sorted order; an unreadable path yields []. */
function listSortedEntries(dir: string): string[] {
  try {
    return readdirSync(dir).sort();
  } catch {
    return [];
  }
}

/** Sub-directories of `parent` whose names parse as integers, in name order. */
function listNumericSubdirs(parent: string): Array<{ path: string; value: number }> {
  const found: Array<{ path: string; value: number }> = [];
  for (const entry of listSortedEntries(parent)) {
    const path = join(parent, entry);
    try {
      if (!statSync(path).isDirectory()) continue;
    } catch {
      // broken symlink, permission error, etc.
      continue;
    }
    const value = Number.parseInt(entry, 10);
    if (!Number.isNaN(value)) found.push({ path, value });
  }
  return found;
}

/**
 * Find all rollout-*.jsonl files under codexHome/sessions/YYYY/MM/DD/.
 *
 * Year, month and day directories are visited in sorted name order, and the
 * files inside each day directory are sorted as well. Entries that are not
 * directories, or whose names do not parse as integers, are ignored.
 *
 * @param codexHome - Codex home directory containing `sessions/`.
 * @param since - When given, only files from that date onward are returned.
 *   A day directory names a calendar date with no timezone, so it is kept
 *   when its midnight in either UTC or local time is not before `since`.
 *   This makes both `new Date("2026-01-01")` (parsed as UTC) and
 *   `new Date(2026, 0, 1)` (local time) include 2026/01/01 in every timezone.
 * @returns Full paths of the matching rollout files; [] when `sessions/`
 *   does not exist.
 */
export function findRolloutFiles(codexHome: string, since?: Date): string[] {
  const sessionsDir = join(codexHome, "sessions");
  if (!existsSync(sessionsDir)) return [];

  const cutoff = since?.getTime();
  const files: string[] = [];

  for (const year of listNumericSubdirs(sessionsDir)) {
    for (const month of listNumericSubdirs(year.path)) {
      for (const day of listNumericSubdirs(month.path)) {
        if (cutoff !== undefined) {
          const utcMidnight = Date.UTC(year.value, month.value - 1, day.value);
          const localMidnight = new Date(year.value, month.value - 1, day.value).getTime();
          if (Math.max(utcMidnight, localMidnight) < cutoff) continue;
        }
        for (const file of listSortedEntries(day.path)) {
          if (file.startsWith("rollout-") && file.endsWith(".jsonl")) {
            files.push(join(day.path, file));
          }
        }
      }
    }
  }
  return files;
}
/** Summary of one Codex rollout file, in the shape produced by parseRolloutFile. */
export interface ParsedRollout {
  /** ISO-8601 timestamp taken from the YYYY/MM/DD path segments, or the current time when they cannot be parsed. */
  timestamp: string;
  /** Rollout file name without the ".jsonl" extension and the "rollout-" prefix. */
  session_id: string;
  /** Origin tag; parseRolloutFile sets it to "codex_rollout". */
  source: string;
  /** Path of the rollout file that was parsed. */
  rollout_path: string;
  /** First user prompt found in the rollout ("" when none was found). */
  query: string;
  /** Number of completed items per item type. */
  tool_calls: Record<string, number>;
  /** Sum of all values in `tool_calls`. */
  total_tool_calls: number;
  /** Trimmed command strings of completed command executions. */
  bash_commands: string[];
  /** Known skill names that appeared in the text or command of a completed item. */
  skills_triggered: string[];
  /** Number of "turn.started" events. */
  assistant_turns: number;
  /** Count of failed turns, error events and commands that reported a non-zero exit code. */
  errors_encountered: number;
  /** Input tokens summed over "turn.completed" usage blocks. */
  input_tokens: number;
  /** Output tokens summed over "turn.completed" usage blocks. */
  output_tokens: number;
  /** Total length of the file's non-empty lines after trimming. */
  transcript_chars: number;
  /** Working directory; parseRolloutFile leaves this as "". */
  cwd: string;
  /** Same path as `rollout_path`. */
  transcript_path: string;
  /** Same value as `query`. */
  last_user_query: string;
}
/** Item types that are counted in `tool_calls` when an item completes. */
const COUNTED_ITEM_TYPES = new Set([
  "command_execution",
  "file_change",
  "mcp_tool_call",
  "web_search",
  "reasoning",
]);

/** True when `value` is a JSON object (non-null, not an array). */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Return `value` when it is a string, otherwise "". */
function stringField(value: unknown): string {
  return typeof value === "string" ? value : "";
}

/** Return `value` when it is a number, otherwise 0. */
function numberField(value: unknown): number {
  return typeof value === "number" ? value : 0;
}

/**
 * Derive an ISO timestamp from a `.../YYYY/MM/DD/<file>` path.
 * Both "/" and "\" are accepted as separators. Falls back to the current time
 * when the three segments are missing, non-numeric or out of Date's range.
 */
function timestampFromRolloutPath(path: string): string {
  const parts = path.split(/[\\/]/);
  const segment = (fromEnd: number): number =>
    Number.parseInt(parts[parts.length - fromEnd] ?? "", 10);
  const millis = Date.UTC(segment(4), segment(3) - 1, segment(2));
  // Date.UTC returns NaN for NaN inputs and for out-of-range dates.
  return Number.isNaN(millis) ? new Date().toISOString() : new Date(millis).toISOString();
}

/**
 * Parse a Codex rollout JSONL file.
 *
 * Each non-empty line is parsed as one JSON event. Lines that are not valid
 * JSON, or that are valid JSON but not an object, are skipped, and fields of
 * an unexpected type are treated as absent, so malformed input never throws.
 *
 * @param path - Rollout file to read; its name supplies the session id and
 *   its YYYY/MM/DD parent directories supply the timestamp.
 * @param skillNames - Known skill names to look for in the text and command
 *   of completed items.
 * @returns The parsed summary, or null if the file cannot be read or has no
 *   non-empty lines.
 */
export function parseRolloutFile(path: string, skillNames: Set<string>): ParsedRollout | null {
  let content: string;
  try {
    content = readFileSync(path, "utf-8");
  } catch {
    return null;
  }

  const lines = content
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l.length > 0);
  if (lines.length === 0) return null;

  const threadId = basename(path, ".jsonl").replace("rollout-", "");
  let prompt = "";
  const toolCalls: Record<string, number> = {};
  const bashCommands: string[] = [];
  const skillsTriggered: string[] = [];
  let errors = 0;
  let turns = 0;
  let inputTokens = 0;
  let outputTokens = 0;

  for (const line of lines) {
    let event: unknown;
    try {
      event = JSON.parse(line);
    } catch {
      continue;
    }
    // `null`, numbers, strings and arrays are valid JSON but carry no event.
    if (!isJsonObject(event)) continue;

    switch (stringField(event.type)) {
      case "turn.started":
        turns += 1;
        break;
      case "turn.completed": {
        const usage = isJsonObject(event.usage) ? event.usage : {};
        inputTokens += numberField(usage.input_tokens);
        outputTokens += numberField(usage.output_tokens);
        if (!prompt) prompt = stringField(event.user_message);
        break;
      }
      case "turn.failed":
      case "error":
        errors += 1;
        break;
      case "item.completed": {
        // Only completed items are counted; started/updated items are ignored.
        const item = isJsonObject(event.item) ? event.item : {};
        const itemType =
          typeof item.item_type === "string" ? item.item_type : stringField(item.type);
        if (COUNTED_ITEM_TYPES.has(itemType)) {
          toolCalls[itemType] = (toolCalls[itemType] ?? 0) + 1;
        }
        if (itemType === "command_execution") {
          const cmd = stringField(item.command).trim();
          if (cmd) bashCommands.push(cmd);
          // A reported exit code other than 0 counts as an error.
          if (item.exit_code !== undefined && item.exit_code !== 0) errors += 1;
        }
        // Detect skill names mentioned in the item's text or command.
        const textContent = stringField(item.text) + stringField(item.command);
        for (const skillName of skillNames) {
          if (textContent.includes(skillName) && !skillsTriggered.includes(skillName)) {
            skillsTriggered.push(skillName);
          }
        }
        break;
      }
      default:
        break;
    }

    // Some rollout formats embed the original prompt
    if (!prompt) {
      const embedded = stringField(event.prompt);
      if (embedded) prompt = embedded;
    }
  }

  return {
    timestamp: timestampFromRolloutPath(path),
    session_id: threadId,
    source: "codex_rollout",
    rollout_path: path,
    query: prompt,
    tool_calls: toolCalls,
    total_tool_calls: Object.values(toolCalls).reduce((a, b) => a + b, 0),
    bash_commands: bashCommands,
    skills_triggered: skillsTriggered,
    assistant_turns: turns,
    errors_encountered: errors,
    input_tokens: inputTokens,
    output_tokens: outputTokens,
    transcript_chars: lines.reduce((sum, l) => sum + l.length, 0),
    cwd: "",
    transcript_path: path,
    last_user_query: prompt,
  };
}
/**
 * Append one parsed rollout to the shared JSONL logs.
 *
 * In dry-run mode nothing is written: a one-line summary is printed, followed
 * by the first 80 characters of the query when there is one. Otherwise:
 *   - a query record is appended only when the prompt has at least 4 characters;
 *   - exactly one session telemetry record is appended;
 *   - one skill usage record is appended per triggered skill.
 *
 * @param parsed - Rollout summary to ingest.
 * @param dryRun - When true, only print what would be ingested.
 * @param queryLogPath - Query log file (defaults to QUERY_LOG).
 * @param telemetryLogPath - Session telemetry log file (defaults to TELEMETRY_LOG).
 * @param skillLogPath - Skill usage log file (defaults to SKILL_LOG).
 * @returns Always true.
 */
export function ingestFile(
  parsed: ParsedRollout,
  dryRun = false,
  queryLogPath: string = QUERY_LOG,
  telemetryLogPath: string = TELEMETRY_LOG,
  skillLogPath: string = SKILL_LOG,
): boolean {
  const sessionId = parsed.session_id;
  const userQuery = parsed.query;
  const triggeredSkills = parsed.skills_triggered;

  if (dryRun) {
    const summary = [
      `  [DRY RUN] Would ingest: session=${sessionId.slice(0, 12)}... `,
      `turns=${parsed.assistant_turns} `,
      `commands=${parsed.bash_commands.length} `,
      `skills=${JSON.stringify(triggeredSkills)}`,
    ].join("");
    console.log(summary);
    if (userQuery) {
      console.log(`           query: ${userQuery.slice(0, 80)}`);
    }
    return true;
  }

  // all_queries log: very short or missing prompts are not recorded.
  if (userQuery && userQuery.length >= 4) {
    const queryRecord: QueryLogRecord = {
      timestamp: parsed.timestamp,
      session_id: sessionId,
      query: userQuery,
      source: "codex_rollout",
    };
    appendJsonl(queryLogPath, queryRecord, "all_queries");
  }

  // Session telemetry: fields are listed one by one rather than spreading
  // `parsed`, so only SessionTelemetryRecord fields are written.
  const telemetry: SessionTelemetryRecord = {
    timestamp: parsed.timestamp,
    session_id: sessionId,
    cwd: parsed.cwd,
    transcript_path: parsed.transcript_path,
    tool_calls: parsed.tool_calls,
    total_tool_calls: parsed.total_tool_calls,
    bash_commands: parsed.bash_commands,
    skills_triggered: triggeredSkills,
    assistant_turns: parsed.assistant_turns,
    errors_encountered: parsed.errors_encountered,
    transcript_chars: parsed.transcript_chars,
    last_user_query: parsed.last_user_query,
    source: parsed.source,
    input_tokens: parsed.input_tokens,
    output_tokens: parsed.output_tokens,
    rollout_path: parsed.rollout_path,
  };
  appendJsonl(telemetryLogPath, telemetry, "session_telemetry");

  // Skill usage: one record per skill detected in the session.
  triggeredSkills.forEach((skillName) => {
    const skillRecord: SkillUsageRecord = {
      timestamp: parsed.timestamp,
      session_id: sessionId,
      skill_name: skillName,
      skill_path: `(codex:${skillName})`,
      query: userQuery,
      triggered: true,
      source: "codex_rollout",
    };
    appendJsonl(skillLogPath, skillRecord, "skill_usage");
  });

  return true;
}
// --- CLI main ---
/**
 * Command-line entry point: find rollout files, ingest the ones not yet
 * recorded in the marker file, then update the marker.
 *
 * Options (parsed from process.argv):
 *   --codex-home <dir>  Codex home directory (defaults to DEFAULT_CODEX_HOME)
 *   --since <date>      Only consider sessions from this date onward
 *   --dry-run           Print what would be ingested; write nothing
 *   --force             Re-ingest files even if the marker already lists them
 *   --verbose, -v       Log each file as it is processed or skipped
 *
 * Exits with status 1 on an invalid --since value and with status 0 when no
 * rollout files are found.
 */
export function cliMain(): void {
  const { values } = parseArgs({
    options: {
      "codex-home": { type: "string", default: DEFAULT_CODEX_HOME },
      since: { type: "string" },
      "dry-run": { type: "boolean", default: false },
      force: { type: "boolean", default: false },
      verbose: { type: "boolean", short: "v", default: false },
    },
    strict: true,
  });
  const codexHome = values["codex-home"] ?? DEFAULT_CODEX_HOME;
  const dryRun = values["dry-run"] ?? false;

  let since: Date | undefined;
  if (values.since) {
    since = new Date(values.since);
    if (Number.isNaN(since.getTime())) {
      console.error(
        `Error: Invalid --since date: "${values.since}". Use a valid date format (e.g., 2026-01-01).`,
      );
      process.exit(1);
    }
  }

  const rolloutFiles = findRolloutFiles(codexHome, since);
  if (rolloutFiles.length === 0) {
    console.log(`No rollout files found under ${codexHome}/sessions/`);
    console.log("Make sure CODEX_HOME is correct and you've run some `codex exec` sessions.");
    process.exit(0);
  }

  // What the marker already lists. --force only bypasses the "already
  // ingested" filter; the existing entries must still be written back,
  // otherwise `--force --since <date>` would forget every older file and the
  // next normal run would ingest those files a second time.
  let recordedIngested: Set<string>;
  if (values.force) {
    try {
      recordedIngested = loadMarker(MARKER_FILE);
    } catch {
      // With --force an unreadable marker is not fatal; start from scratch.
      recordedIngested = new Set<string>();
    }
  } else {
    recordedIngested = loadMarker(MARKER_FILE);
  }
  const alreadyIngested = values.force ? new Set<string>() : recordedIngested;

  const skillNames = findSkillNames();
  const newIngested = new Set<string>();
  const pending = rolloutFiles.filter((f) => !alreadyIngested.has(f));
  console.log(`Found ${rolloutFiles.length} rollout files, ${pending.length} not yet ingested.`);
  if (since) {
    console.log(`  Filtering to sessions from ${values.since} onward.`);
  }

  let ingestedCount = 0;
  let skippedCount = 0;
  try {
    for (const rolloutFile of pending) {
      const parsed = parseRolloutFile(rolloutFile, skillNames);
      if (parsed === null) {
        if (values.verbose) {
          console.log(`  SKIP (empty/unparseable): ${basename(rolloutFile)}`);
        }
        skippedCount += 1;
        continue;
      }
      if (values.verbose || dryRun) {
        console.log(`  ${dryRun ? "[DRY] " : ""}Ingesting: ${basename(rolloutFile)}`);
      }
      ingestFile(parsed, dryRun);
      newIngested.add(rolloutFile);
      ingestedCount += 1;
    }
  } finally {
    // Persist progress even if a later file fails, so files whose records
    // were already appended are not ingested again on the next run.
    if (!dryRun) {
      saveMarker(MARKER_FILE, new Set([...recordedIngested, ...newIngested]));
    }
  }

  console.log(`\nDone. Ingested ${ingestedCount} sessions, skipped ${skippedCount}.`);
  if (newIngested.size > 0 && !dryRun) {
    console.log(`Marker updated: ${MARKER_FILE}`);
  }
}
// Run only when this file is executed directly; the selftune router imports
// the module and calls cliMain() itself.
if (import.meta.main) {
  cliMain();
}