npm - selftune - Versions diffs - 0.1.4 → 0.2.0 - Mend

selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

package/.claude/agents/diagnosis-analyst.md +146 -0
package/.claude/agents/evolution-reviewer.md +167 -0
package/.claude/agents/integration-guide.md +200 -0
package/.claude/agents/pattern-analyst.md +147 -0
package/CHANGELOG.md +37 -0
package/README.md +96 -256
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +103 -0
package/cli/selftune/constants.ts +75 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-server.ts +582 -0
package/cli/selftune/dashboard.ts +25 -3
package/cli/selftune/eval/baseline.ts +247 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +68 -2
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evolve-body.ts +492 -0
package/cli/selftune/evolution/evolve.ts +466 -103
package/cli/selftune/evolution/extract-patterns.ts +32 -1
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +19 -2
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/grade-session.ts +138 -18
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/index.ts +88 -0
package/cli/selftune/ingestors/claude-replay.ts +351 -0
package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
package/cli/selftune/init.ts +150 -3
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +25 -2
package/cli/selftune/status.ts +17 -13
package/cli/selftune/types.ts +377 -5
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/llm-call.ts +29 -3
package/cli/selftune/utils/transcript.ts +35 -0
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/dashboard/index.html +569 -8
package/package.json +8 -4
package/skill/SKILL.md +124 -8
package/skill/Workflows/AutoActivation.md +144 -0
package/skill/Workflows/Badge.md +118 -0
package/skill/Workflows/Baseline.md +121 -0
package/skill/Workflows/Composability.md +100 -0
package/skill/Workflows/Contribute.md +91 -0
package/skill/Workflows/Cron.md +155 -0
package/skill/Workflows/Dashboard.md +203 -0
package/skill/Workflows/Doctor.md +37 -1
package/skill/Workflows/Evals.md +69 -1
package/skill/Workflows/EvolutionMemory.md +152 -0
package/skill/Workflows/Evolve.md +111 -6
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/ImportSkillsBench.md +111 -0
package/skill/Workflows/Ingest.md +117 -3
package/skill/Workflows/Initialize.md +57 -3
package/skill/Workflows/Replay.md +70 -0
package/skill/Workflows/Rollback.md +20 -1
package/skill/Workflows/UnitTest.md +138 -0
package/skill/Workflows/Watch.md +22 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0

package/cli/selftune/eval/baseline.ts ADDED Viewed

@@ -0,0 +1,247 @@
+/**
+ * baseline.ts
+ *
+ * Measures the value a skill adds over a no-skill baseline.
+ *
+ * Runs trigger checks against an EMPTY string description (no-skill baseline)
+ * and against the current description (with-skill), then computes lift.
+ * A skill "adds value" when lift >= 0.05 (5 percentage points).
+ */
+import { parseArgs } from "node:util";
+import type { BaselineResult, EvalEntry } from "../types.js";
+import { callLlm } from "../utils/llm-call.js";
+import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+/** Inputs for one baseline-vs-skill measurement run (see measureBaseline). */
+export interface BaselineOptions {
+  /** Eval entries to check; each one is checked once without and once with the description. */
+  evalSet: EvalEntry[];
+  /** Description text used to build the with-skill trigger-check prompt. */
+  skillDescription: string;
+  /** Skill name copied into the measurement and into every per-entry result. */
+  skillName: string;
+  /** Agent identifier forwarded to callLlm. */
+  agent: string;
+  /** Optional model flag forwarded to callLlm unchanged. */
+  modelFlag?: string;
+}
+/** Aggregate result returned by measureBaseline. */
+export interface BaselineMeasurement {
+  /** Name of the skill that was measured. */
+  skill_name: string;
+  /** Fraction of eval entries that passed with an empty description. */
+  baseline_pass_rate: number;
+  /** Fraction of eval entries that passed with the skill description. */
+  with_skill_pass_rate: number;
+  /** with_skill_pass_rate minus baseline_pass_rate. */
+  lift: number;
+  /** True when lift reaches the lift threshold (0.05). */
+  adds_value: boolean;
+  /** Two results per eval entry: the baseline check first, then the with-skill check. */
+  per_entry: BaselineResult[];
+  /** ISO-8601 timestamp taken when the measurement object was built. */
+  measured_at: string;
+}
+/**
+ * Injectable dependencies for measureBaseline(). When omitted, the real
+ * module imports are used. Pass overrides in tests to avoid real LLM calls.
+ */
+export interface BaselineDeps {
+  /** Replacement for the imported callLlm; must accept the same arguments. */
+  callLlm?: typeof callLlm;
+}
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+/** Minimum lift (with-skill pass rate minus baseline pass rate) for adds_value to be true. */
+const LIFT_THRESHOLD = 0.05;
+/** System prompt sent with every trigger-check LLM call in this module. */
+const SYSTEM_PROMPT = "You are an evaluation assistant. Answer only YES or NO.";
+// ---------------------------------------------------------------------------
+// Core measurement
+// ---------------------------------------------------------------------------
+/**
+ * Measure baseline vs. with-skill trigger accuracy across an eval set.
+ *
+ * For every eval entry two trigger checks are issued, in order: one built
+ * from an empty description (baseline) and one built from `skillDescription`.
+ * A check passes when the parsed trigger decision agrees with
+ * `entry.should_trigger`. The LLM calls are awaited sequentially and are not
+ * wrapped in try/catch, so a rejected call rejects the returned promise.
+ *
+ * @param options - Eval set, skill description/name, agent and optional model flag.
+ * @param _deps - Optional overrides (e.g. a stubbed `callLlm`) for tests.
+ * @returns Pass rates, lift, the adds_value verdict and two per-entry results
+ *   (baseline first, then with-skill) for each eval entry. An empty eval set
+ *   yields zero rates and `adds_value: false` without calling the LLM.
+ */
+export async function measureBaseline(
+  options: BaselineOptions,
+  _deps: BaselineDeps = {},
+): Promise<BaselineMeasurement> {
+  const { evalSet, skillDescription, skillName, agent, modelFlag } = options;
+  const _callLlm = _deps.callLlm ?? callLlm;
+  if (evalSet.length === 0) {
+    // Nothing to measure; also avoids dividing by zero below.
+    return {
+      skill_name: skillName,
+      baseline_pass_rate: 0,
+      with_skill_pass_rate: 0,
+      lift: 0,
+      adds_value: false,
+      per_entry: [],
+      measured_at: new Date().toISOString(),
+    };
+  }
+  const perEntry: BaselineResult[] = [];
+  let baselinePassed = 0;
+  let withSkillPassed = 0;
+  for (const entry of evalSet) {
+    // --- Baseline check (empty description) ---
+    const baselinePrompt = buildTriggerCheckPrompt("", entry.query);
+    const baselineRaw = await _callLlm(SYSTEM_PROMPT, baselinePrompt, agent, modelFlag);
+    const baselineTriggered = parseTriggerResponse(baselineRaw);
+    const baselinePass =
+      (entry.should_trigger && baselineTriggered) || (!entry.should_trigger && !baselineTriggered);
+    if (baselinePass) baselinePassed++;
+    perEntry.push({
+      skill_name: skillName,
+      query: entry.query,
+      with_skill: false,
+      triggered: baselineTriggered,
+      pass: baselinePass,
+      measured_at: new Date().toISOString(),
+    });
+    // --- With-skill check (actual description) ---
+    const withSkillPrompt = buildTriggerCheckPrompt(skillDescription, entry.query);
+    const withSkillRaw = await _callLlm(SYSTEM_PROMPT, withSkillPrompt, agent, modelFlag);
+    const withSkillTriggered = parseTriggerResponse(withSkillRaw);
+    const withSkillPass =
+      (entry.should_trigger && withSkillTriggered) ||
+      (!entry.should_trigger && !withSkillTriggered);
+    if (withSkillPass) withSkillPassed++;
+    perEntry.push({
+      skill_name: skillName,
+      query: entry.query,
+      with_skill: true,
+      triggered: withSkillTriggered,
+      pass: withSkillPass,
+      measured_at: new Date().toISOString(),
+    });
+  }
+  const total = evalSet.length;
+  const baselinePassRate = baselinePassed / total;
+  const withSkillPassRate = withSkillPassed / total;
+  // Derive lift from the integer pass counts instead of subtracting the two
+  // rates: 0.15 - 0.1 evaluates to 0.04999999999999999 in floating point, which
+  // would wrongly fail `>= LIFT_THRESHOLD` for a lift of exactly 5 points.
+  const lift = (withSkillPassed - baselinePassed) / total;
+  return {
+    skill_name: skillName,
+    baseline_pass_rate: baselinePassRate,
+    with_skill_pass_rate: withSkillPassRate,
+    lift,
+    adds_value: lift >= LIFT_THRESHOLD,
+    per_entry: perEntry,
+    measured_at: new Date().toISOString(),
+  };
+}
+// ---------------------------------------------------------------------------
+// CLI entry point
+// ---------------------------------------------------------------------------
+/**
+ * CLI entry point for `selftune baseline`.
+ *
+ * Parses flags, reads the file at --skill-path as the skill description,
+ * loads the eval set (from --eval-set, or built from the skill/query logs
+ * when the flag is omitted), resolves the agent CLI, runs measureBaseline and
+ * prints the result as JSON.
+ *
+ * Exit codes: 0 when the skill adds value (or --help was requested); 1 when
+ * it does not, or on any usage/lookup error.
+ */
+export async function cliMain(): Promise<void> {
+  const { values } = parseArgs({
+    options: {
+      skill: { type: "string" },
+      "skill-path": { type: "string" },
+      "eval-set": { type: "string" },
+      agent: { type: "string" },
+      help: { type: "boolean", default: false },
+    },
+    strict: true,
+  });
+  if (values.help) {
+    console.log(`selftune baseline — Measure skill value vs. no-skill baseline
+Usage:
+  selftune baseline --skill <name> --skill-path <path> [options]
+Options:
+  --skill         Skill name (required)
+  --skill-path    Path to SKILL.md (required)
+  --eval-set      Path to eval set JSON (optional, builds from logs if omitted)
+  --agent         Agent CLI to use (claude, codex, opencode)
+  --help          Show this help message`);
+    process.exit(0);
+  }
+  if (!values.skill || !values["skill-path"]) {
+    console.error("[ERROR] --skill and --skill-path are required");
+    process.exit(1);
+  }
+  const { existsSync, readFileSync } = await import("node:fs");
+  // Read skill description
+  const skillPath = values["skill-path"];
+  if (!existsSync(skillPath)) {
+    console.error(`[ERROR] SKILL.md not found at ${skillPath}`);
+    process.exit(1);
+  }
+  const skillDescription = readFileSync(skillPath, "utf-8");
+  // Load eval set
+  let evalSet: EvalEntry[];
+  const evalSetPath = values["eval-set"];
+  if (evalSetPath) {
+    // An explicitly requested eval set must exist. Silently falling back to
+    // log-derived evals would measure against data the caller did not ask for.
+    if (!existsSync(evalSetPath)) {
+      console.error(`[ERROR] Eval set not found at ${evalSetPath}`);
+      process.exit(1);
+    }
+    const parsed: unknown = JSON.parse(readFileSync(evalSetPath, "utf-8"));
+    if (!Array.isArray(parsed)) {
+      console.error(`[ERROR] Eval set at ${evalSetPath} must be a JSON array`);
+      process.exit(1);
+    }
+    evalSet = parsed as EvalEntry[];
+  } else {
+    // Build from logs
+    const { QUERY_LOG, SKILL_LOG } = await import("../constants.js");
+    const { readJsonl } = await import("../utils/jsonl.js");
+    const { buildEvalSet } = await import("./hooks-to-evals.js");
+    const skillRecords = readJsonl(SKILL_LOG);
+    const queryRecords = readJsonl(QUERY_LOG);
+    evalSet = buildEvalSet(skillRecords, queryRecords, values.skill);
+  }
+  // Detect agent
+  const { detectAgent } = await import("../utils/llm-call.js");
+  const requestedAgent = values.agent;
+  if (requestedAgent && !Bun.which(requestedAgent)) {
+    console.error(
+      JSON.stringify({
+        level: "error",
+        code: "agent_not_in_path",
+        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
+        action: "Install it or omit --agent to use auto-detection.",
+      }),
+    );
+    process.exit(1);
+  }
+  const agent = requestedAgent ?? detectAgent();
+  if (!agent) {
+    console.error(
+      JSON.stringify({
+        level: "error",
+        code: "agent_not_found",
+        message: "No agent CLI (claude/codex/opencode) found in PATH.",
+        action: "Install Claude Code, Codex, or OpenCode.",
+      }),
+    );
+    process.exit(1);
+  }
+  const result = await measureBaseline({
+    evalSet,
+    skillDescription,
+    skillName: values.skill,
+    agent,
+  });
+  console.log(JSON.stringify(result, null, 2));
+  // The exit code doubles as the verdict so scripts can branch on it.
+  process.exit(result.adds_value ? 0 : 1);
+}
+// Run the CLI only when import.meta.main is truthy (presumably when this file
+// is the program entry point rather than an import — confirm against the runtime).
+if (import.meta.main) {
+  cliMain().catch((err) => {
+    // Emit failures as a single JSON line on stderr, then exit non-zero.
+    console.error(
+      JSON.stringify({
+        level: "fatal",
+        message: err instanceof Error ? err.message : String(err),
+        stack: err instanceof Error ? err.stack : undefined,
+      }),
+    );
+    process.exit(1);
+  });
+}

package/cli/selftune/eval/composability.ts ADDED Viewed

@@ -0,0 +1,117 @@
+/**
+ * composability.ts
+ *
+ * Analyzes co-occurrence patterns between skills in session telemetry
+ * to detect composability conflicts. A conflict is flagged when two
+ * skills used together produce more errors than either skill used alone.
+ *
+ * Pure function -- no I/O. CLI wrapper handles reading JSONL.
+ */
+import type { ComposabilityReport, CoOccurrencePair, SessionTelemetryRecord } from "../types.js";
+/**
+ * Clamp a number between min and max.
+ */
+function clamp(value: number, min: number, max: number): number {
+  return Math.max(min, Math.min(max, value));
+}
+/**
+ * Mean of errors_encountered across the given sessions (a missing count is
+ * treated as 0). Returns `fallback` when there are no sessions.
+ */
+function averageErrors(records: SessionTelemetryRecord[], fallback = 0): number {
+  if (records.length === 0) return fallback;
+  const total = records.reduce((sum, r) => sum + (r.errors_encountered ?? 0), 0);
+  return total / records.length;
+}
+/**
+ * Analyze composability of a target skill against all co-occurring skills.
+ *
+ * For each skill that appears in a session together with `skillName`, the
+ * average error count of those shared sessions is compared with a baseline:
+ * the larger of (a) the average when `skillName` ran alone and (b) the
+ * average when the other skill ran without `skillName`. When (b) has no
+ * sessions it falls back to (a). The score is
+ * `clamp((together - baseline) / (baseline + 1), 0, 1)` and a conflict is
+ * flagged when it exceeds 0.3.
+ *
+ * @param skillName - The skill to analyze
+ * @param telemetry - All session telemetry records; records without a
+ *   `skills_triggered` array are ignored. The input array is not mutated.
+ * @param window    - Optional: only consider the N most recent sessions
+ *   (ordered by comparing timestamp strings)
+ * @returns ComposabilityReport with co-occurrence pairs (sorted by count,
+ *   descending) and conflict detection. `total_sessions_analyzed` counts the
+ *   sessions in which `skillName` was triggered.
+ */
+export function analyzeComposability(
+  skillName: string,
+  telemetry: SessionTelemetryRecord[],
+  window?: number,
+): ComposabilityReport {
+  // filter() returns a fresh array, so the in-place sort below never touches `telemetry`.
+  let sessions = telemetry.filter((r) => r && Array.isArray(r.skills_triggered));
+  if (window && window > 0) {
+    // Newest first, then keep the first N.
+    sessions = sessions
+      .sort((a, b) => (b.timestamp ?? "").localeCompare(a.timestamp ?? ""))
+      .slice(0, window);
+  }
+  // Sessions where the target skill was triggered
+  const skillSessions = sessions.filter((r) => r.skills_triggered.includes(skillName));
+  // Sessions where the target skill was triggered ALONE (no other skills)
+  const aloneSessions = skillSessions.filter((r) => r.skills_triggered.length === 1);
+  const errorsAlone = averageErrors(aloneSessions);
+  // Find all co-occurring skills
+  const coSkills = new Set<string>();
+  for (const r of skillSessions) {
+    for (const s of r.skills_triggered) {
+      if (s !== skillName) coSkills.add(s);
+    }
+  }
+  // For each co-occurring skill, compute conflict score
+  const pairs: CoOccurrencePair[] = [];
+  for (const coSkill of coSkills) {
+    // Sessions where BOTH skills are triggered together
+    const togetherSessions = skillSessions.filter((r) => r.skills_triggered.includes(coSkill));
+    const errorsTogether = averageErrors(togetherSessions);
+    // Baseline should consider BOTH skills alone to avoid false positives
+    const coSkillAloneSessions = sessions.filter(
+      (r) => r.skills_triggered.includes(coSkill) && !r.skills_triggered.includes(skillName),
+    );
+    const errorsCoSkillAlone = averageErrors(coSkillAloneSessions, errorsAlone);
+    const baselineAlone = Math.max(errorsAlone, errorsCoSkillAlone);
+    // conflict_score = clamp((errors_together - baseline) / (baseline + 1), 0, 1)
+    const conflictScore = clamp((errorsTogether - baselineAlone) / (baselineAlone + 1), 0, 1);
+    const conflictDetected = conflictScore > 0.3;
+    const pair: CoOccurrencePair = {
+      skill_a: skillName,
+      skill_b: coSkill,
+      co_occurrence_count: togetherSessions.length,
+      conflict_detected: conflictDetected,
+    };
+    if (conflictDetected) {
+      // Report the baseline the score was actually computed from; printing
+      // errorsAlone here could contradict the score when the co-skill's own
+      // alone average is the larger of the two.
+      pair.conflict_reason = `conflict_score=${conflictScore.toFixed(3)} (avg errors together=${errorsTogether.toFixed(1)} vs alone=${baselineAlone.toFixed(1)})`;
+    }
+    pairs.push(pair);
+  }
+  // Sort by co-occurrence count descending for readability
+  pairs.sort((a, b) => b.co_occurrence_count - a.co_occurrence_count);
+  return {
+    pairs,
+    total_sessions_analyzed: skillSessions.length,
+    conflict_count: pairs.filter((p) => p.conflict_detected).length,
+    generated_at: new Date().toISOString(),
+  };
+}

package/cli/selftune/eval/generate-unit-tests.ts ADDED Viewed

@@ -0,0 +1,143 @@
+/**
+ * Skill unit test generator.
+ *
+ * Uses an LLM to generate unit test cases from skill content and eval failures.
+ * Tests are output as SkillUnitTest[] JSON arrays.
+ */
+import type { EvalEntry, SkillUnitTest } from "../types.js";
+// Note: we don't use stripMarkdownFences from llm-call.ts because it
+// assumes JSON objects (looks for `{`), but we return JSON arrays.
+/**
+ * Strip markdown fences and surrounding chatter, returning the JSON array text.
+ *
+ * Handles a reply that is entirely a fenced block, a fenced block with prose
+ * before or after it, and a bare array with leading or trailing text. When no
+ * `[` is present the (fence-stripped) text is returned unchanged so the
+ * caller's JSON.parse reports the failure.
+ *
+ * @param raw - Raw LLM reply.
+ * @returns The text from the first `[` through the last `]` (or to the end
+ *   when there is no closing bracket after it).
+ */
+function stripArrayFences(raw: string): string {
+  let text = raw.trim();
+  // Prefer a fence that wraps the whole reply; otherwise accept the first
+  // fenced block anywhere in it. `\r?\n` tolerates CRLF line endings.
+  const fenceMatch =
+    text.match(/^```\w*\r?\n([\s\S]*?)\r?\n```$/) ?? text.match(/```\w*\r?\n([\s\S]*?)\r?\n```/);
+  if (fenceMatch) {
+    text = fenceMatch[1].trim();
+  }
+  // Drop any preamble before the array...
+  const start = text.indexOf("[");
+  if (start < 0) {
+    return text;
+  }
+  // ...and any trailing chatter after it, which would otherwise break JSON.parse.
+  const end = text.lastIndexOf("]");
+  return end > start ? text.slice(start, end + 1) : text.slice(start);
+}
+// ---------------------------------------------------------------------------
+// LLM caller type (injectable for testing)
+// ---------------------------------------------------------------------------
+/** Async function that takes a system prompt and a user prompt and resolves with the LLM's reply text. */
+export type LlmCaller = (systemPrompt: string, userPrompt: string) => Promise<string>;
+// ---------------------------------------------------------------------------
+// Prompt building
+// ---------------------------------------------------------------------------
+/**
+ * System prompt passed to the LLM caller by generateUnitTests. It spells out
+ * the test-case object shape and asks for a bare JSON array as the reply.
+ */
+const SYSTEM_PROMPT = `You are a test engineer generating skill unit tests.
+Given a skill name, its content/description, and optionally some eval failures,
+generate unit test cases as a JSON array of objects.
+Each test object must have:
+- id: unique string (e.g. "gen-1", "gen-2")
+- skill_name: the skill name provided
+- query: a user query that would test this skill
+- assertions: array of assertion objects, each with:
+  - type: one of "contains", "not_contains", "regex", "tool_called", "tool_not_called", "json_path"
+  - value: the value to check for
+  - description: (optional) human-readable description of what this checks
+- tags: (optional) array of tag strings like ["generated", "smoke"]
+Focus on:
+1. Covering different invocation patterns (explicit, implicit, contextual)
+2. Testing edge cases from eval failures if provided
+3. Verifying expected tools are called
+4. Checking output contains expected content
+Respond with ONLY a JSON array. No explanation.`;
+/**
+ * Assemble the user prompt sent alongside the system prompt for test generation.
+ *
+ * The prompt has three sections joined by newlines: the skill name and
+ * content, an optional list of eval failures (omitted when the list is
+ * empty), and a pretty-printed example test case followed by the request.
+ *
+ * @param skillName - Skill name, also used in the example test case.
+ * @param skillContent - Skill content inserted verbatim.
+ * @param evalFailures - Eval entries that failed trigger checks.
+ * @returns The complete prompt text.
+ */
+export function buildGenerationPrompt(
+  skillName: string,
+  skillContent: string,
+  evalFailures: EvalEntry[],
+): string {
+  const header: string[] = [`Skill name: ${skillName}`, "", "Skill content:", skillContent, ""];
+  const failureSection: string[] =
+    evalFailures.length > 0
+      ? [
+          "Eval failures (queries that failed trigger checks):",
+          ...evalFailures.map(
+            (failure) =>
+              `  - query: "${failure.query}" (should_trigger=${failure.should_trigger}, type=${failure.invocation_type ?? "unknown"})`,
+          ),
+          "",
+        ]
+      : [];
+  // Key order matters here: it determines the layout of the serialized example.
+  const exampleCase = {
+    id: "example-1",
+    skill_name: skillName,
+    query: "example query for this skill",
+    assertions: [
+      {
+        type: "contains",
+        value: "expected output",
+        description: "checks for expected content",
+      },
+      { type: "tool_called", value: "Write", description: "verifies Write tool was used" },
+    ],
+    tags: ["generated"],
+  };
+  const footer: string[] = [
+    "Example test case format:",
+    JSON.stringify([exampleCase], null, 2),
+    "",
+    "Generate 5-10 diverse test cases covering the skill's functionality.",
+  ];
+  return [...header, ...failureSection, ...footer].join("\n");
+}
+// ---------------------------------------------------------------------------
+// Generate unit tests
+// ---------------------------------------------------------------------------
+/**
+ * Generate unit tests for a skill using an LLM.
+ *
+ * Builds the generation prompt, calls `llmCaller`, strips fences from the
+ * reply and parses it as JSON. Entries that are not plain objects are skipped
+ * with a warning, so one malformed element does not discard the whole batch.
+ * Each kept test gets `skill_name` filled in with `skillName` when the LLM
+ * left it empty.
+ *
+ * @param skillName - Skill the tests are generated for.
+ * @param skillContent - Skill content given to the LLM.
+ * @param evalFailures - Eval entries that failed trigger checks (may be empty).
+ * @param llmCaller - Function that performs the LLM call.
+ * @returns The generated tests; an empty array when the call fails, the reply
+ *   is not valid JSON, or the reply is not an array. Never rejects.
+ */
+export async function generateUnitTests(
+  skillName: string,
+  skillContent: string,
+  evalFailures: EvalEntry[],
+  llmCaller: LlmCaller,
+): Promise<SkillUnitTest[]> {
+  try {
+    const userPrompt = buildGenerationPrompt(skillName, skillContent, evalFailures);
+    const raw = await llmCaller(SYSTEM_PROMPT, userPrompt);
+    const cleaned = stripArrayFences(raw);
+    const parsed: unknown = JSON.parse(cleaned);
+    if (!Array.isArray(parsed)) {
+      console.warn("[WARN] LLM did not return a JSON array for unit test generation");
+      return [];
+    }
+    const tests: SkillUnitTest[] = [];
+    for (const item of parsed as unknown[]) {
+      // A `null` entry would throw on property access and a primitive would
+      // spread into an empty, meaningless test; drop both and keep the rest.
+      if (item === null || typeof item !== "object" || Array.isArray(item)) {
+        console.warn("[WARN] Skipping non-object entry in generated unit tests");
+        continue;
+      }
+      const test = item as SkillUnitTest;
+      // Ensure skill_name is set on each test
+      tests.push({ ...test, skill_name: test.skill_name || skillName });
+    }
+    return tests;
+  } catch (err) {
+    // Best-effort by design: generation failures are reported, not thrown.
+    console.warn("[WARN] Failed to generate unit tests:", err);
+    return [];
+  }
+}

package/cli/selftune/eval/hooks-to-evals.ts CHANGED Viewed

@@ -26,7 +26,9 @@ import type {
   SkillUsageRecord,
 } from "../types.js";
 import { readJsonl } from "../utils/jsonl.js";
+import { detectAgent } from "../utils/llm-call.js";
 import { seededShuffle } from "../utils/seeded-random.js";
+import { generateSyntheticEvals } from "./synthetic-evals.js";
 // ---------------------------------------------------------------------------
 // Query truncation
@@ -359,7 +361,7 @@ export function printEvalStats(
 // CLI entry point
 // ---------------------------------------------------------------------------
-export function cliMain(): void {
+export async function cliMain(): Promise<void> {
   const { values } = parseArgs({
     options: {
       skill: { type: "string" },
@@ -373,10 +375,71 @@ export function cliMain(): void {
       "skill-log": { type: "string", default: SKILL_LOG },
       "query-log": { type: "string", default: QUERY_LOG },
       "telemetry-log": { type: "string", default: TELEMETRY_LOG },
+      synthetic: { type: "boolean", default: false },
+      "skill-path": { type: "string" },
+      model: { type: "string" },
     },
     strict: true,
   });
+  // --- Synthetic mode: generate evals from SKILL.md via LLM ---
+  if (values.synthetic) {
+    if (!values.skill) {
+      console.error("[ERROR] --skill required with --synthetic");
+      process.exit(1);
+    }
+    if (!values["skill-path"]) {
+      console.error("[ERROR] --skill-path required with --synthetic");
+      process.exit(1);
+    }
+    const agent = detectAgent();
+    if (!agent) {
+      console.error("[ERROR] No agent CLI found (claude/codex/opencode). Install one first.");
+      process.exit(1);
+    }
+    const maxPerSide = Number.parseInt(values.max ?? "50", 10);
+    const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
+    console.log(`Generating synthetic evals for skill '${values.skill}'...`);
+    const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
+      maxPositives: effectiveMax,
+      maxNegatives: effectiveMax,
+      modelFlag: values.model,
+    });
+    const outputPath = values.output ?? `${values.skill}_trigger_eval.json`;
+    writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
+    const pos = evalSet.filter((e) => e.should_trigger);
+    const neg = evalSet.filter((e) => !e.should_trigger);
+    console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
+    console.log(`  Positives (should_trigger=true) : ${pos.length}`);
+    console.log(`  Negatives (should_trigger=false): ${neg.length}`);
+    if (pos.length > 0) {
+      const types = new Map<string, number>();
+      for (const e of pos) {
+        const t = e.invocation_type ?? "?";
+        types.set(t, (types.get(t) ?? 0) + 1);
+      }
+      console.log("\n  Positive invocation types:");
+      for (const [t, c] of [...types.entries()].sort()) {
+        console.log(`    ${t.padEnd(15)}  ${c}`);
+      }
+    }
+    console.log("\nNext steps:");
+    console.log("  bun run cli/selftune/eval/run-eval.ts \\");
+    console.log(`    --eval-set ${outputPath} \\`);
+    console.log(`    --skill-path ${values["skill-path"]} \\`);
+    console.log("    --runs-per-query 3 --verbose");
+    return;
+  }
+  // --- Log-based mode (original behavior) ---
   const skillRecords = readJsonl<SkillUsageRecord>(values["skill-log"] ?? SKILL_LOG);
   const queryRecords = readJsonl<QueryLogRecord>(values["query-log"] ?? QUERY_LOG);
   const telemetryRecords = readJsonl<SessionTelemetryRecord>(
@@ -418,5 +481,8 @@ export function cliMain(): void {
 }
 if (import.meta.main) {
-  cliMain();
+  cliMain().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
 }