npm - selftune - Versions diffs - 0.2.23 → 0.2.25 - Mend

selftune 0.2.23 → 0.2.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (219) hide show

package/cli/selftune/eval/hooks-to-evals.ts CHANGED Viewed

@@ -23,6 +23,7 @@
 import { writeFileSync } from "node:fs";
 import { parseArgs } from "node:util";
+import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
 import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
 import { getDb } from "../localdb/db.js";
 import {
@@ -32,27 +33,31 @@ import {
 } from "../localdb/queries.js";
 import type {
   EvalEntry,
-  InvocationType,
+  EvalSourceStats,
   QueryLogRecord,
   SessionTelemetryRecord,
   SkillUsageRecord,
 } from "../types.js";
 import { CLIError, handleCLIError } from "../utils/cli-error.js";
-import { detectAgent } from "../utils/llm-call.js";
+import { detectLlmAgent } from "../utils/llm-call.js";
 import {
   filterActionableQueryRecords,
   filterActionableSkillUsageRecords,
 } from "../utils/query-filter.js";
 import { seededShuffle } from "../utils/seeded-random.js";
 import {
-  escapeRegExp,
   findInstalledSkillNames,
   findInstalledSkillPath,
   findRepositoryClaudeSkillDirs,
   findRepositorySkillDirs,
 } from "../utils/skill-discovery.js";
 import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
+import { readJsonl } from "../utils/jsonl.js";
+import { classifyInvocation } from "./invocation-classifier.js";
 import { generateSyntheticEvals } from "./synthetic-evals.js";
+import { writeCanonicalEvalSet } from "../testing-readiness.js";
+export { classifyInvocation } from "./invocation-classifier.js";
 // ---------------------------------------------------------------------------
 // Query truncation
@@ -64,69 +69,6 @@ function truncateQuery(query: string): string {
   return query.length > MAX_QUERY_LENGTH ? query.slice(0, MAX_QUERY_LENGTH) : query;
 }
-// ---------------------------------------------------------------------------
-// Invocation taxonomy classifier
-// ---------------------------------------------------------------------------
-export function classifyInvocation(query: string, skillName: string): InvocationType {
-  const qLower = query.toLowerCase();
-  const skillLower = skillName.toLowerCase();
-  // --- Explicit checks ---
-  // Explicit: mentions skill name or $skill syntax
-  if (
-    qLower.includes(`$${skillLower}`) ||
-    query.includes(`$${skillName}`) ||
-    qLower.includes(skillLower)
-  ) {
-    return "explicit";
-  }
-  // Handle hyphenated skill names: check if all parts appear
-  if (skillLower.includes("-")) {
-    const parts = skillLower.split("-");
-    if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) {
-      return "explicit";
-    }
-  }
-  // Convert skill-name to camelCase and check
-  const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
-  if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
-    return "explicit";
-  }
-  // --- Contextual checks ---
-  const wordCount = query.split(/\s+/).length;
-  const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query);
-  // Temporal references suggest domain context
-  const hasTemporalRef =
-    /\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
-      query,
-    );
-  // Filenames suggest contextual usage
-  const hasFilename = /\b\w+\.\w{2,4}\b/.test(query);
-  // Email addresses suggest contextual usage
-  const hasEmail = /\b\S+@\S+\.\S+\b/.test(query);
-  if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
-    return "contextual";
-  }
-  // Borderline: 10-15 words with domain signals (multi-digit numbers, uppercase acronyms)
-  const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query);
-  if (wordCount >= 10 && hasDomainSignal) {
-    return "contextual";
-  }
-  return "implicit";
-}
 // ---------------------------------------------------------------------------
 // Build eval set
 // ---------------------------------------------------------------------------
@@ -144,6 +86,7 @@ export function buildEvalSet(
   const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
   const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
   const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
+  const buildTimestamp = new Date().toISOString();
   // Build set of positive query texts (for exclusion from negatives)
   const positiveQueries = new Set<string>();
@@ -166,7 +109,12 @@ export function buildEvalSet(
     const q = (r.query ?? "").trim();
     if (!q || q === "(query not found)" || seen.has(q)) continue;
     seen.add(q);
-    const entry: EvalEntry = { query: truncateQuery(q), should_trigger: true };
+    const entry: EvalEntry = {
+      query: truncateQuery(q),
+      should_trigger: true,
+      source: "log",
+      created_at: buildTimestamp,
+    };
     if (annotateTaxonomy) {
       entry.invocation_type = classifyInvocation(q, skillName);
     }
@@ -189,7 +137,12 @@ export function buildEvalSet(
     const shuffledNeg = seededShuffle(negCandidates, effectiveSeed).slice(0, effectiveMaxPerSide);
     negatives = shuffledNeg.map((q) => {
-      const entry: EvalEntry = { query: truncateQuery(q), should_trigger: false };
+      const entry: EvalEntry = {
+        query: truncateQuery(q),
+        should_trigger: false,
+        source: "log",
+        created_at: buildTimestamp,
+      };
       if (annotateTaxonomy) {
         entry.invocation_type = "negative";
       }
@@ -202,7 +155,12 @@ export function buildEvalSet(
       const fallbacks: EvalEntry[] = [];
       for (const q of GENERIC_NEGATIVES) {
         if (negSeen.has(q) || positiveQueries.has(q)) continue;
-        const entry: EvalEntry = { query: q, should_trigger: false };
+        const entry: EvalEntry = {
+          query: q,
+          should_trigger: false,
+          source: "log",
+          created_at: buildTimestamp,
+        };
         if (annotateTaxonomy) {
           entry.invocation_type = "negative";
         }
@@ -215,6 +173,116 @@ export function buildEvalSet(
   return [...shuffledPositives, ...negatives];
 }
+// ---------------------------------------------------------------------------
+// Normalized Levenshtein distance
+// ---------------------------------------------------------------------------
+function levenshteinDistance(a: string, b: string): number { // Edit distance between a and b; each insertion, deletion or substitution costs 1.
+  const la = a.length;
+  const lb = b.length;
+  if (la === 0) return lb; // Empty a: the distance is the length of b.
+  if (lb === 0) return la; // Empty b: the distance is the length of a.
+  // Two-row optimization: both rows hold lb + 1 cells, so memory is O(lb); the inputs are not swapped to put the shorter string in b.
+  let prev = Array.from<number>({ length: lb + 1 });
+  let curr = Array.from<number>({ length: lb + 1 });
+  for (let j = 0; j <= lb; j++) prev[j] = j; // Row 0: distance from the empty prefix of a to each prefix of b.
+  for (let i = 1; i <= la; i++) {
+    curr[0] = i; // Distance from the first i characters of a to the empty prefix of b.
+    for (let j = 1; j <= lb; j++) {
+      const cost = a[i - 1] === b[j - 1] ? 0 : 1; // String indexing compares UTF-16 code units.
+      curr[j] = Math.min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost); // Deletion, insertion, substitution (or match when cost is 0).
+    }
+    [prev, curr] = [curr, prev]; // The row just filled becomes prev; the old row is reused as scratch space.
+  }
+  return prev[lb]; // After the final swap, prev holds the last computed row.
+}
+function normalizedLevenshtein(a: string, b: string): number { // levenshteinDistance(a, b) divided by the length of the longer input.
+  const maxLen = Math.max(a.length, b.length);
+  if (maxLen === 0) return 0; // Both strings are empty: return 0 rather than dividing by zero.
+  return levenshteinDistance(a, b) / maxLen;
+}
+// ---------------------------------------------------------------------------
+// Blend eval sets (log + synthetic)
+// ---------------------------------------------------------------------------
+/**
+ * Blend log-based and synthetic eval entries.
+ *
+ * Policy:
+ *   - Keep ALL log-based entries, in input order, at the front of the result
+ *     (their `source` field is not touched here)
+ *   - Append synthetic entries, in input order, that are not near-duplicates of
+ *     any log entry; no other gap or coverage analysis is done in this function
+ *   - Deduplicate: drop synthetic if normalizedLevenshtein(synthetic, anyLog) < 0.3,
+ *     comparing lowercased, trimmed query text
+ *   - Mark surviving synthetic entries as source: "blended" (shallow copies, so the
+ *     input entries are not mutated)
+ *   - Cap total at 2x the log-based count
+ *
+ * @param logEntries - Entries built from logs; all of them are kept.
+ * @param syntheticEntries - Candidate synthetic entries to append.
+ * @returns A new array. It is empty when `logEntries` is empty, because the cap is then 0.
+ */
+export function blendEvalSets(logEntries: EvalEntry[], syntheticEntries: EvalEntry[]): EvalEntry[] {
+  const result: EvalEntry[] = [...logEntries];
+  const logCount = logEntries.length;
+  const cap = logCount * 2;
+  if (logCount === 0 || syntheticEntries.length === 0) { // Nothing to blend: no logs yields [] (cap is 0); no synthetics yields a copy of the log entries.
+    return result.slice(0, cap);
+  }
+  // Normalize log queries for comparison
+  const logQueries = logEntries.map((e) => e.query.toLowerCase().trim());
+  // Filter synthetic entries: drop those too similar to any log entry
+  const candidates: EvalEntry[] = [];
+  for (const synth of syntheticEntries) {
+    const synthNorm = synth.query.toLowerCase().trim();
+    let tooSimilar = false;
+    for (const logQ of logQueries) {
+      // Length pre-filter: skip Levenshtein if lengths differ by >70% of the longer string. The edit distance is at least the length difference, so such a pair cannot score below 0.3.
+      const maxLen = Math.max(synthNorm.length, logQ.length);
+      if (maxLen > 0 && Math.abs(synthNorm.length - logQ.length) / maxLen > 0.7) continue; // maxLen > 0 also guards the division when both strings are empty.
+      if (normalizedLevenshtein(synthNorm, logQ) < 0.3) {
+        tooSimilar = true;
+        break; // One near-duplicate is enough to reject this synthetic entry.
+      }
+    }
+    if (!tooSimilar) {
+      candidates.push({ ...synth, source: "blended" });
+    }
+  }
+  // Add candidates up to the cap
+  const slotsAvailable = cap - result.length; // Equals logCount here, since result currently holds exactly the log entries.
+  result.push(...candidates.slice(0, slotsAvailable));
+  return result;
+}
+// ---------------------------------------------------------------------------
+// Eval source stats
+// ---------------------------------------------------------------------------
+export function computeEvalSourceStats(entries: EvalEntry[]): EvalSourceStats { // Counts entries per `source` and records the smallest and largest `created_at` strings.
+  const stats: EvalSourceStats = { total: entries.length, synthetic: 0, log: 0, blended: 0 };
+  const timestamps: string[] = [];
+  for (const entry of entries) {
+    if (entry.source === "synthetic") stats.synthetic++;
+    else if (entry.source === "log") stats.log++;
+    else if (entry.source === "blended") stats.blended++; // Entries with any other or a missing source count only toward `total`.
+    if (entry.created_at) timestamps.push(entry.created_at); // Missing or empty timestamps are ignored.
+  }
+  if (timestamps.length > 0) { // `oldest` and `newest` stay unset when no entry carries a `created_at`.
+    timestamps.sort(); // Default string ordering; NOTE(review): chronological only if every timestamp uses the same sortable format (presumably ISO 8601 from toISOString) - confirm against producers.
+    stats.oldest = timestamps[0];
+    stats.newest = timestamps[timestamps.length - 1];
+  }
+  return stats;
+}
 // ---------------------------------------------------------------------------
 // Installed skill discovery / readiness
 // ---------------------------------------------------------------------------
@@ -505,33 +573,40 @@ export async function cliMain(): Promise<void> {
       "telemetry-log": { type: "string", default: TELEMETRY_LOG },
       synthetic: { type: "boolean", default: false },
       "auto-synthetic": { type: "boolean", default: false },
+      blend: { type: "boolean", default: false },
       "skill-path": { type: "string" },
       model: { type: "string" },
+      help: { type: "boolean", default: false },
     },
     strict: true,
   });
+  if (values.help) {
+    console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.evalGenerate));
+    process.exit(0);
+  }
   // --- Synthetic mode: generate evals from SKILL.md via LLM ---
   if (values.synthetic) {
     if (!values.skill) {
       throw new CLIError(
         "--skill required with --synthetic",
         "MISSING_FLAG",
-        "selftune evals --synthetic --skill <name> --skill-path <path>",
+        "selftune eval generate --synthetic --skill <name> --skill-path <path>",
       );
     }
     if (!values["skill-path"]) {
       throw new CLIError(
         "--skill-path required with --synthetic",
         "MISSING_FLAG",
-        "selftune evals --synthetic --skill <name> --skill-path <path>",
+        "selftune eval generate --synthetic --skill <name> --skill-path <path>",
       );
     }
-    const agent = detectAgent();
+    const agent = detectLlmAgent();
     if (!agent) {
       throw new CLIError(
-        "No agent CLI found (claude/codex/opencode)",
+        "No agent CLI found (claude/codex/opencode/pi)",
         "AGENT_NOT_FOUND",
         "Install one of the supported agent CLIs",
       );
@@ -549,11 +624,13 @@ export async function cliMain(): Promise<void> {
     const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
     writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
+    const canonicalPath = writeCanonicalEvalSet(values.skill, evalSet);
     const pos = evalSet.filter((e) => e.should_trigger);
     const neg = evalSet.filter((e) => !e.should_trigger);
     console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
+    console.log(`Canonical eval copy: ${canonicalPath}`);
     console.log(`  Positives (should_trigger=true) : ${pos.length}`);
     console.log(`  Negatives (should_trigger=false): ${neg.length}`);
@@ -582,10 +659,23 @@ export async function cliMain(): Promise<void> {
   let queryRecords: QueryLogRecord[];
   let telemetryRecords: SessionTelemetryRecord[];
-  const db = getDb();
-  skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
-  queryRecords = queryQueryLog(db) as QueryLogRecord[];
-  telemetryRecords = querySessionTelemetry(db) as SessionTelemetryRecord[];
+  const skillLogPath = values["skill-log"] ?? SKILL_LOG;
+  const queryLogPath = values["query-log"] ?? QUERY_LOG;
+  const telemetryLogPath = values["telemetry-log"] ?? TELEMETRY_LOG;
+  const hasCustomSkillLog = skillLogPath !== SKILL_LOG;
+  const hasCustomQueryLog = queryLogPath !== QUERY_LOG;
+  const hasCustomTelemetryLog = telemetryLogPath !== TELEMETRY_LOG;
+  const db = hasCustomSkillLog && hasCustomQueryLog && hasCustomTelemetryLog ? undefined : getDb();
+  skillRecords = hasCustomSkillLog
+    ? readJsonl<SkillUsageRecord>(skillLogPath)
+    : (querySkillUsageRecords(db!) as SkillUsageRecord[]);
+  queryRecords = hasCustomQueryLog
+    ? readJsonl<QueryLogRecord>(queryLogPath)
+    : (queryQueryLog(db!) as QueryLogRecord[]);
+  telemetryRecords = hasCustomTelemetryLog
+    ? readJsonl<SessionTelemetryRecord>(telemetryLogPath)
+    : (querySessionTelemetry(db!) as SessionTelemetryRecord[]);
   if (values["list-skills"]) {
     listSkills(skillRecords, queryRecords, telemetryRecords);
@@ -596,7 +686,7 @@ export async function cliMain(): Promise<void> {
     throw new CLIError(
       "--skill required (or use --list-skills)",
       "MISSING_FLAG",
-      "selftune evals --skill <name> or selftune evals --list-skills",
+      "selftune eval generate --skill <name> or selftune eval generate --list-skills",
     );
   }
@@ -632,10 +722,10 @@ export async function cliMain(): Promise<void> {
       );
     }
-    const agent = detectAgent();
+    const agent = detectLlmAgent();
     if (!agent) {
       throw new CLIError(
-        "No agent CLI found (claude/codex/opencode)",
+        "No agent CLI found (claude/codex/opencode/pi)",
         "AGENT_NOT_FOUND",
         "Install one of the supported agent CLIs",
       );
@@ -652,10 +742,12 @@ export async function cliMain(): Promise<void> {
     });
     const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
     writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
+    const canonicalPath = writeCanonicalEvalSet(values.skill, syntheticEvalSet);
     const pos = syntheticEvalSet.filter((e) => e.should_trigger);
     const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
     console.log(`Wrote ${syntheticEvalSet.length} synthetic eval entries to ${outputPath}`);
+    console.log(`Canonical eval copy: ${canonicalPath}`);
     console.log(`  Positives (should_trigger=true) : ${pos.length}`);
     console.log(`  Negatives (should_trigger=false): ${neg.length}`);
     console.log("\nNext steps:");
@@ -666,9 +758,63 @@ export async function cliMain(): Promise<void> {
     return;
   }
+  // --- Blend mode: merge log-based evals with synthetic gap-fillers ---
+  let finalEvalSet = evalSet;
+  if (values.blend) {
+    const skillPath = values["skill-path"] ?? detectedSkillPath;
+    if (!skillPath) {
+      throw new CLIError(
+        `--blend requires a resolvable SKILL.md path. Use --skill-path or install the skill locally.`,
+        "MISSING_FLAG",
+        `selftune eval generate --skill ${values.skill} --blend --skill-path /path/to/SKILL.md`,
+      );
+    }
+    const agent = detectLlmAgent();
+    if (!agent) {
+      throw new CLIError(
+        "No agent CLI found (claude/codex/opencode/pi)",
+        "AGENT_NOT_FOUND",
+        "Install one of the supported agent CLIs",
+      );
+    }
+    // Fail fast before expensive LLM calls — blending with zero logs always produces []
+    if (evalSet.length === 0) {
+      throw new CLIError(
+        `--blend requires log-based eval entries to blend with synthetic entries. No log data found for skill "${values.skill}".`,
+        "BLEND_NO_LOGS",
+        `Use --synthetic instead for cold-start skills, or run selftune sync first to ingest session data.`,
+      );
+    }
+    const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
+    console.log(`Generating synthetic evals for blending with '${values.skill}'...`);
+    const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
+      maxPositives: effectiveMax,
+      maxNegatives: effectiveMax,
+      modelFlag: values.model,
+    });
+    finalEvalSet = blendEvalSets(evalSet, syntheticEvalSet);
+    const stats = computeEvalSourceStats(finalEvalSet);
+    console.log(
+      `Blended: ${stats.log} log + ${stats.blended} synthetic gap-fillers = ${stats.total} total`,
+    );
+  }
   const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
-  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
-  printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
+  writeFileSync(outputPath, JSON.stringify(finalEvalSet, null, 2), "utf-8");
+  const canonicalPath = writeCanonicalEvalSet(values.skill, finalEvalSet);
+  printEvalStats(
+    finalEvalSet,
+    values.skill,
+    outputPath,
+    skillRecords,
+    queryRecords,
+    annotateTaxonomy,
+  );
+  console.log(`Canonical eval copy: ${canonicalPath}`);
   if (positiveCount === 0 && detectedSkillPath) {
     printSyntheticFallbackHint(values.skill, detectedSkillPath);
   }

package/cli/selftune/eval/import-skillsbench.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  *   <dir>/tasks/<task-id>/task.toml        — metadata (difficulty, category, tags, etc.)
  */
-import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
+import { existsSync, readdirSync, readFileSync, type Dirent, writeFileSync } from "node:fs";
 import { join } from "node:path";
 import { parseArgs } from "node:util";
@@ -72,7 +72,7 @@ export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
   const tasks: SkillsBenchTask[] = [];
-  let entries: ReturnType<typeof readdirSync>;
+  let entries: Dirent[];
   try {
     entries = readdirSync(tasksDir, { withFileTypes: true });
   } catch {

package/cli/selftune/eval/invocation-classifier.ts ADDED Viewed

@@ -0,0 +1,56 @@
+import type { InvocationType } from "../types.js";
+import { escapeRegExp } from "../utils/skill-discovery.js";
+/**
+ * Classify how directly a user query invokes a skill.
+ *
+ * Kept separate from eval generation so synthetic evals can reuse the
+ * classifier without creating an import cycle with hooks-to-evals.
+ *
+ * Checks run in this order and the first match wins:
+ *   1. "explicit"   - the query contains the skill name (case-insensitive), or every
+ *                     hyphen-separated part of the name as a whole word, or the
+ *                     name with each hyphen before a letter dropped.
+ *   2. "contextual" - more than 15 whitespace-separated tokens, or any of: a
+ *                     capitalized word, a temporal reference, a filename-like
+ *                     token, an email-like token.
+ *   3. "contextual" - at least 10 tokens plus a multi-digit number or a run of two
+ *                     or more uppercase letters.
+ *   4. "implicit"   - everything else.
+ *
+ * @param query - Raw user query text.
+ * @param skillName - Skill name to look for; may be hyphenated.
+ * @returns The invocation type chosen by the checks above.
+ */
+export function classifyInvocation(query: string, skillName: string): InvocationType {
+  const qLower = query.toLowerCase();
+  const skillLower = skillName.toLowerCase();
+  // Explicit: mentions skill name or $skill syntax.
+  if (
+    qLower.includes(`$${skillLower}`) ||
+    query.includes(`$${skillName}`) ||
+    qLower.includes(skillLower) // Plain substring match; it also covers both `$` forms above.
+  ) {
+    return "explicit";
+  }
+  // Handle hyphenated skill names: check if all parts appear.
+  if (skillLower.includes("-")) {
+    const parts = skillLower.split("-");
+    if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) { // Each part must appear as a whole word, in any order.
+      return "explicit";
+    }
+  }
+  // Convert skill-name to camelCase and check.
+  const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
+  if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) { // Lowercased on both sides, so this effectively looks for the name with its hyphens dropped.
+    return "explicit";
+  }
+  const wordCount = query.split(/\s+/).length; // Whitespace-separated token count; leading or trailing whitespace adds empty tokens, and "" counts as 1.
+  const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query); // NOTE(review): matches any capitalized word of 3+ letters, including a sentence-initial one such as "Please".
+  const hasTemporalRef =
+    /\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
+      query,
+    ); // NOTE(review): saturday and sunday are not listed, and "may" also matches the ordinary verb.
+  const hasFilename = /\b\w+\.\w{2,4}\b/.test(query); // word.ext where ext is 2-4 word characters.
+  const hasEmail = /\b\S+@\S+\.\S+\b/.test(query); // Loose something@something.something shape, not full email validation.
+  if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
+    return "contextual";
+  }
+  const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query); // A standalone number of 2+ digits, or two or more consecutive capitals.
+  if (wordCount >= 10 && hasDomainSignal) { // Given the check above, wordCount is 10-15 here.
+    return "contextual";
+  }
+  return "implicit";
+}

package/cli/selftune/eval/synthetic-evals.ts CHANGED Viewed

@@ -8,10 +8,10 @@
 import { readFileSync } from "node:fs";
-import type { EvalEntry, InvocationType } from "../types.js";
+import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
 import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
 import { findInstalledSkillNames } from "../utils/skill-discovery.js";
-import { classifyInvocation } from "./hooks-to-evals.js";
+import { classifyInvocation } from "./invocation-classifier.js";
 // ---------------------------------------------------------------------------
 // Types
@@ -414,6 +414,8 @@ export function parseSyntheticResponse(raw: string, skillName: string): EvalEntr
       query,
       should_trigger: entry.should_trigger,
       invocation_type: invocationType,
+      source: "synthetic",
+      created_at: new Date().toISOString(),
     });
   }
@@ -449,7 +451,7 @@ export async function generateSyntheticEvals(
     const db = getDb();
     // Positives: high-confidence triggered records for this skill
-    const skillRecords = querySkillUsageRecords(db);
+    const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
     const positive = skillRecords
       .filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
       .map((r) => r.query)

package/cli/selftune/eval/unit-test-cli.ts CHANGED Viewed

@@ -19,8 +19,9 @@ import { parseArgs } from "node:util";
 import { SELFTUNE_CONFIG_DIR } from "../constants.js";
 import type { EvalEntry } from "../types.js";
+import { writeUnitTestRunResult } from "../testing-readiness.js";
 import { CLIError } from "../utils/cli-error.js";
-import { callLlm, detectAgent } from "../utils/llm-call.js";
+import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
 import { generateUnitTests } from "./generate-unit-tests.js";
 import type { AgentRunner } from "./unit-test.js";
 import { loadUnitTests, runUnitTestSuite } from "./unit-test.js";
@@ -58,10 +59,10 @@ export async function cliMain(): Promise<void> {
   // --generate: create tests from skill content
   if (values.generate) {
-    const agent = detectAgent();
+    const agent = detectLlmAgent();
     if (!agent) {
       throw new CLIError(
-        "No agent CLI found (claude/codex/opencode). Cannot generate tests",
+        "No agent CLI found (claude/codex/opencode/pi). Cannot generate tests",
         "AGENT_NOT_FOUND",
         "Install one of the supported agent CLIs",
       );
@@ -118,7 +119,7 @@ export async function cliMain(): Promise<void> {
   let agentRunner: AgentRunner;
   if (values["run-agent"]) {
-    const agent = detectAgent();
+    const agent = detectLlmAgent();
     if (!agent) {
       throw new CLIError(
         "No agent CLI found. Cannot run agent-based tests",
@@ -137,11 +138,13 @@ export async function cliMain(): Promise<void> {
   }
   const suite = await runUnitTestSuite(tests, skillName, agentRunner);
+  const resultPath = writeUnitTestRunResult(skillName, suite);
   // Print results
   console.log(`\nResults for '${suite.skill_name}':`);
   console.log(`  Total: ${suite.total}  Passed: ${suite.passed}  Failed: ${suite.failed}`);
   console.log(`  Pass rate: ${(suite.pass_rate * 100).toFixed(1)}%`);
+  console.log(`  Stored: ${resultPath}`);
   if (suite.failed > 0) {
     console.log("\nFailed tests:");