npm - selftune - Versions diffs - 0.2.14 → 0.2.15 - Mend

selftune 0.2.14 → 0.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +16 -0
package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +12 -0
package/apps/local-dashboard/dist/index.html +2 -2
package/cli/selftune/analytics.ts +13 -11
package/cli/selftune/badge/badge.ts +13 -9
package/cli/selftune/canonical-export.ts +6 -6
package/cli/selftune/contribute/contribute.ts +2 -1
package/cli/selftune/cron/setup.ts +3 -1
package/cli/selftune/dashboard-contract.ts +10 -0
package/cli/selftune/dashboard.ts +10 -5
package/cli/selftune/eval/baseline.ts +20 -30
package/cli/selftune/eval/hooks-to-evals.ts +22 -12
package/cli/selftune/eval/import-skillsbench.ts +21 -8
package/cli/selftune/eval/unit-test-cli.ts +22 -11
package/cli/selftune/evolution/description-quality.ts +224 -0
package/cli/selftune/evolution/evolve-body.ts +17 -10
package/cli/selftune/evolution/evolve.ts +70 -57
package/cli/selftune/evolution/rollback.ts +7 -6
package/cli/selftune/grading/auto-grade.ts +24 -22
package/cli/selftune/grading/grade-session.ts +21 -17
package/cli/selftune/hooks/auto-activate.ts +12 -3
package/cli/selftune/hooks/prompt-log.ts +7 -1
package/cli/selftune/index.ts +66 -69
package/cli/selftune/ingestors/claude-replay.ts +29 -14
package/cli/selftune/ingestors/codex-rollout.ts +6 -1
package/cli/selftune/init.ts +14 -9
package/cli/selftune/monitoring/watch.ts +32 -16
package/cli/selftune/orchestrate.ts +18 -17
package/cli/selftune/routes/skill-report.ts +17 -0
package/cli/selftune/schedule.ts +23 -9
package/cli/selftune/sync.ts +7 -3
package/cli/selftune/types.ts +44 -10
package/cli/selftune/utils/cli-error.ts +102 -0
package/cli/selftune/workflows/workflows.ts +23 -17
package/package.json +1 -1
package/skill/SKILL.md +1 -1
package/skill/Workflows/Evolve.md +4 -0
package/skill/Workflows/Initialize.md +8 -8
package/skill/settings_snippet.json +29 -6
package/apps/local-dashboard/dist/assets/index-DIrdlu2_.js +0 -16
package/apps/local-dashboard/dist/assets/vendor-ui-7xD7fNEU.js +0 -12

package/cli/selftune/evolution/description-quality.ts ADDED

@@ -0,0 +1,224 @@
+/**
+ * description-quality.ts
+ *
+ * Pure, deterministic scoring function that evaluates the quality of a skill
+ * description for routing accuracy. No LLM calls — heuristic-only.
+ *
+ * Inspired by OpenAI's finding that "writing better skill descriptions improved
+ * routing accuracy more than any change to the underlying skill logic itself."
+ */
+import type { DescriptionQualityScore } from "../types.js";
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+/** Optimal description length range (characters). */
+const MIN_LENGTH = 40;
+const MAX_LENGTH = 500;
+const IDEAL_MIN = 80;
+const IDEAL_MAX = 300;
+/** Words that indicate trigger context — the description says *when* the skill fires. */
+const TRIGGER_CONTEXT_WORDS = [
+  "when",
+  "if",
+  "after",
+  "before",
+  "during",
+  "while",
+  "upon",
+  "whenever",
+  "use when",
+  "trigger",
+  "activate",
+];
+/** Vague words that weaken routing precision. */
+const VAGUE_WORDS = [
+  "various",
+  "general",
+  "misc",
+  "miscellaneous",
+  "stuff",
+  "things",
+  "etc",
+  "and more",
+  "and so on",
+  "other",
+  "multiple",
+  "several",
+  "many",
+  "some",
+  "certain",
+  "related",
+];
+/** Common filler phrases that add no routing signal. */
+const FILLER_PHRASES = [
+  "this skill",
+  "a tool for",
+  "a tool that",
+  "helps with",
+  "is used for",
+  "can be used",
+  "is designed to",
+];
+/** Action verbs that signal concrete behavior. */
+const ACTION_VERBS = [
+  "run",
+  "execute",
+  "analyze",
+  "generate",
+  "create",
+  "deploy",
+  "validate",
+  "check",
+  "build",
+  "test",
+  "scan",
+  "extract",
+  "transform",
+  "monitor",
+  "grade",
+  "evolve",
+  "sync",
+  "watch",
+  "review",
+  "audit",
+  "parse",
+  "format",
+  "search",
+  "fetch",
+  "publish",
+  "install",
+  "configure",
+  "diagnose",
+  "debug",
+  "fix",
+  "optimize",
+  "measure",
+];
+// ---------------------------------------------------------------------------
+// Pre-compiled word-boundary patterns
+// ---------------------------------------------------------------------------
+/** Compile a word list into pre-built RegExp patterns at module load time. */
+function compileWordPatterns(words: string[]): RegExp[] {
+  return words.map((w) => new RegExp(`\\b${w.replace(/\s+/g, "\\s+")}\\b`, "i"));
+}
+const TRIGGER_PATTERNS = compileWordPatterns(TRIGGER_CONTEXT_WORDS);
+const VAGUE_PATTERNS = compileWordPatterns(VAGUE_WORDS);
+const ACTION_PATTERNS = compileWordPatterns(ACTION_VERBS);
+/** Count how many pre-compiled patterns match in a string. */
+function countWordMatches(text: string, patterns: RegExp[]): number {
+  let count = 0;
+  for (const p of patterns) {
+    if (p.test(text)) count++;
+  }
+  return count;
+}
+// ---------------------------------------------------------------------------
+// Criterion scorers
+// ---------------------------------------------------------------------------
+/** Score description length: 1.0 for ideal range, graded falloff outside. */
+export function scoreLengthCriterion(description: string): number {
+  const len = description.length;
+  if (len < MIN_LENGTH) return len / MIN_LENGTH;
+  if (len >= IDEAL_MIN && len <= IDEAL_MAX) return 1.0;
+  if (len < IDEAL_MIN) return 0.7 + 0.3 * ((len - MIN_LENGTH) / (IDEAL_MIN - MIN_LENGTH));
+  if (len <= MAX_LENGTH) return 0.7 + 0.3 * ((MAX_LENGTH - len) / (MAX_LENGTH - IDEAL_MAX));
+  return Math.max(0.3, 0.7 - 0.4 * ((len - MAX_LENGTH) / MAX_LENGTH));
+}
+/** Score presence of trigger context words (when/if/before/after etc). */
+export function scoreTriggerContextCriterion(description: string): number {
+  const matches = countWordMatches(description.toLowerCase(), TRIGGER_PATTERNS);
+  if (matches === 0) return 0.0;
+  if (matches === 1) return 0.7;
+  return Math.min(1.0, 0.7 + 0.15 * (matches - 1));
+}
+/** Score absence of vague words (lower is worse). */
+export function scoreVaguenessCriterion(description: string): number {
+  const matches = countWordMatches(description.toLowerCase(), VAGUE_PATTERNS);
+  if (matches === 0) return 1.0;
+  if (matches === 1) return 0.6;
+  return Math.max(0.1, 0.6 - 0.15 * (matches - 1));
+}
+/** Score whether description specifies at least one concrete action or domain. */
+export function scoreSpecificityCriterion(description: string): number {
+  const lower = description.toLowerCase();
+  const hasAction = ACTION_PATTERNS.some((p) => p.test(lower));
+  const fillerCount = FILLER_PHRASES.filter((f) => lower.includes(f)).length;
+  const words = description.split(/\s+/).length;
+  const fillerRatio = fillerCount > 0 ? fillerCount / Math.max(1, words / 10) : 0;
+  if (!hasAction) return 0.2;
+  return Math.max(0.3, 1.0 - fillerRatio * 0.3);
+}
+/** Score whether description is not just the skill name restated. */
+export function scoreNotJustNameCriterion(description: string, skillName?: string): number {
+  if (!skillName) return 1.0;
+  const descNorm = description
+    .toLowerCase()
+    .trim()
+    .replace(/[^a-z0-9\s]/g, "");
+  const nameNorm = skillName
+    .toLowerCase()
+    .trim()
+    .replace(/[^a-z0-9\s]/g, "");
+  const nameFromKebab = skillName.replace(/[-_]/g, " ").toLowerCase().trim();
+  if (descNorm === nameNorm || descNorm === nameFromKebab) return 0.0;
+  if (descNorm.length < nameNorm.length + 10) return 0.3;
+  return 1.0;
+}
+// ---------------------------------------------------------------------------
+// Main scoring function
+// ---------------------------------------------------------------------------
+/** Criterion weights — trigger context is weighted highest per OpenAI's finding. */
+const WEIGHTS = {
+  length: 0.15,
+  trigger_context: 0.3,
+  vagueness: 0.2,
+  specificity: 0.2,
+  not_just_name: 0.15,
+} as const;
+/**
+ * Score a skill description on heuristic quality criteria.
+ * Returns a 0.0-1.0 composite score with per-criterion breakdown.
+ * Pure function — no I/O, no LLM calls.
+ */
+export function scoreDescription(description: string, skillName?: string): DescriptionQualityScore {
+  const criteria = {
+    length: scoreLengthCriterion(description),
+    trigger_context: scoreTriggerContextCriterion(description),
+    vagueness: scoreVaguenessCriterion(description),
+    specificity: scoreSpecificityCriterion(description),
+    not_just_name: scoreNotJustNameCriterion(description, skillName),
+  };
+  const composite = (Object.keys(WEIGHTS) as (keyof typeof WEIGHTS)[]).reduce(
+    (sum, key) => sum + criteria[key] * WEIGHTS[key],
+    0,
+  );
+  return {
+    composite: +composite.toFixed(3),
+    criteria,
+  };
+}

package/cli/selftune/evolution/evolve-body.ts CHANGED

@@ -25,6 +25,7 @@ import type {
   QueryLogRecord,
   SkillUsageRecord,
 } from "../types.js";
+import { CLIError, handleCLIError } from "../utils/cli-error.js";
 import type { EffortLevel, SubagentCallOptions } from "../utils/llm-call.js";
 import { callViaSubagent } from "../utils/llm-call.js";
 import { appendAuditEntry } from "./audit.js";
@@ -710,8 +711,11 @@ Options:
   }
   if (!values.skill || !values["skill-path"]) {
-    console.error("[ERROR] --skill and --skill-path are required");
-    process.exit(1);
+    throw new CLIError(
+      "--skill and --skill-path are required",
+      "MISSING_FLAG",
+      "selftune evolve body --skill <name> --skill-path <path>",
+    );
   }
   const { detectAgent } = await import("../utils/llm-call.js");
@@ -719,15 +723,21 @@ Options:
   const studentAgent = values["student-agent"] ?? teacherAgent;
   if (!teacherAgent) {
-    console.error("[ERROR] No agent CLI found. Install Claude Code, Codex, or OpenCode.");
-    process.exit(1);
+    throw new CLIError(
+      "No agent CLI found. Install Claude Code, Codex, or OpenCode.",
+      "AGENT_NOT_FOUND",
+      "Install Claude Code, Codex, or OpenCode.",
+    );
   }
   // Parse target
   const targetStr = values.target ?? "body";
   if (targetStr !== "body" && targetStr !== "routing") {
-    console.error("[ERROR] --target must be 'body' or 'routing'");
-    process.exit(1);
+    throw new CLIError(
+      "--target must be 'body' or 'routing'",
+      "INVALID_FLAG",
+      "Use --target body or --target routing",
+    );
   }
   // Parse few-shot examples
@@ -763,8 +773,5 @@ Options:
 }
 if (import.meta.main) {
-  cliMain().catch((err) => {
-    console.error(`[FATAL] ${err}`);
-    process.exit(1);
-  });
+  cliMain().catch(handleCLIError);
 }

package/cli/selftune/evolution/evolve.ts CHANGED

@@ -36,10 +36,12 @@ import type {
   SessionTelemetryRecord,
   SkillUsageRecord,
 } from "../types.js";
+import { CLIError, handleCLIError } from "../utils/cli-error.js";
 import { parseFrontmatter, replaceDescription } from "../utils/frontmatter.js";
 import { createEvolveTUI } from "../utils/tui.js";
 import { appendAuditEntry } from "./audit.js";
 import { checkConstitution } from "./constitutional.js";
+import { scoreDescription } from "./description-quality.js";
 import { appendEvidenceEntry } from "./evidence.js";
 import { extractFailurePatterns } from "./extract-patterns.js";
 import {
@@ -94,6 +96,8 @@ export interface EvolveResult {
   baselineResult?: BaselineMeasurement;
   gateValidation?: ValidationResult;
   sync_result?: SyncResult;
+  descriptionQualityBefore?: number;
+  descriptionQualityAfter?: number;
 }
 /**
@@ -247,16 +251,26 @@ export async function evolve(
     );
   /** Stamp every return with pipeline stats so callers always get them. */
-  const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => ({
-    ...r,
-    llmCallCount,
-    elapsedMs: Date.now() - pipelineStart,
-    ...(syncResult ? { sync_result: syncResult } : {}),
-  });
+  const withStats = (r: Omit<EvolveResult, "llmCallCount" | "elapsedMs">): EvolveResult => {
+    const descQualityAfterScore = r.proposal
+      ? scoreDescription(r.proposal.proposed_description, options.skillName).composite
+      : undefined;
+    return {
+      ...r,
+      llmCallCount,
+      elapsedMs: Date.now() - pipelineStart,
+      ...(syncResult ? { sync_result: syncResult } : {}),
+      ...(descQualityBeforeScore != null
+        ? { descriptionQualityBefore: descQualityBeforeScore }
+        : {}),
+      ...(descQualityAfterScore != null ? { descriptionQualityAfter: descQualityAfterScore } : {}),
+    };
+  };
-  // Hoisted so catch block can preserve partial results on error
+  // Hoisted so catch block and withStats can preserve partial results on error
   let lastProposal: EvolutionProposal | null = null;
   let lastValidation: ValidationResult | null = null;
+  let descQualityBeforeScore: number | undefined;
   try {
     // -----------------------------------------------------------------------
@@ -281,7 +295,11 @@ export async function evolve(
     const versionTag = skillVersion ? `, v${skillVersion}` : "";
     const createdAuditDetails = (message: string) =>
       `original_description:${rawContent}\n${message}`;
-    tui.done(`Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag})`);
+    const descQualityBefore = scoreDescription(currentDescription, skillName);
+    descQualityBeforeScore = descQualityBefore.composite;
+    tui.done(
+      `Loaded SKILL.md (desc: ${currentDescription.length} chars${versionTag}, quality: ${descQualityBefore.composite})`,
+    );
     if (options.syncFirst) {
       tui.step(`Syncing source-truth telemetry${options.syncForce ? " (force)" : ""}...`);
@@ -1111,38 +1129,36 @@ Options:
   }
   if (!values.skill || !values["skill-path"]) {
-    console.error("[ERROR] --skill and --skill-path are required");
-    process.exit(1);
+    throw new CLIError(
+      "--skill and --skill-path are required",
+      "MISSING_FLAG",
+      "selftune evolve --skill <name> --skill-path <path>",
+    );
   }
   if ((values["sync-force"] ?? false) && !(values["sync-first"] ?? false)) {
-    console.error("[ERROR] --sync-force requires --sync-first");
-    process.exit(1);
+    throw new CLIError(
+      "--sync-force requires --sync-first",
+      "INVALID_FLAG",
+      "Add --sync-first when using --sync-force",
+    );
   }
   const { detectAgent } = await import("../utils/llm-call.js");
   const requestedAgent = values.agent;
   if (requestedAgent && !Bun.which(requestedAgent)) {
-    console.error(
-      JSON.stringify({
-        level: "error",
-        code: "agent_not_in_path",
-        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
-        action: "Install it or omit --agent to use auto-detection.",
-      }),
+    throw new CLIError(
+      `Agent CLI '${requestedAgent}' not found in PATH.`,
+      "AGENT_NOT_FOUND",
+      "Install it or omit --agent to use auto-detection.",
     );
-    process.exit(1);
   }
   const agent = requestedAgent ?? detectAgent();
   if (!agent) {
-    console.error(
-      JSON.stringify({
-        level: "error",
-        code: "agent_not_found",
-        message: "No agent CLI (claude/codex/opencode) found in PATH.",
-        action: "Install Claude Code, Codex, or OpenCode.",
-      }),
+    throw new CLIError(
+      "No agent CLI (claude/codex/opencode) found in PATH.",
+      "AGENT_NOT_FOUND",
+      "Install Claude Code, Codex, or OpenCode.",
     );
-    process.exit(1);
   }
   // -------------------------------------------------------------------------
@@ -1150,20 +1166,27 @@ Options:
   // -------------------------------------------------------------------------
   const skillPath = values["skill-path"];
   if (!skillPath) {
-    console.error("[ERROR] --skill-path is required.");
-    process.exit(1);
+    throw new CLIError(
+      "--skill-path is required.",
+      "MISSING_FLAG",
+      "selftune evolve --skill <name> --skill-path <path>",
+    );
   }
   if (!existsSync(skillPath)) {
-    console.error(`[ERROR] SKILL.md not found at: ${skillPath}`);
-    console.error("  Verify the --skill-path argument points to an existing SKILL.md file.");
-    process.exit(1);
+    throw new CLIError(
+      `SKILL.md not found at: ${skillPath}`,
+      "FILE_NOT_FOUND",
+      "Verify the --skill-path argument points to an existing SKILL.md file.",
+    );
   }
   const evalSetPath = values["eval-set"];
   if (evalSetPath && !existsSync(evalSetPath)) {
-    console.error(`[ERROR] Eval set file not found at: ${evalSetPath}`);
-    console.error("  Verify the --eval-set argument points to an existing JSON file.");
-    process.exit(1);
+    throw new CLIError(
+      `Eval set file not found at: ${evalSetPath}`,
+      "FILE_NOT_FOUND",
+      "Verify the --eval-set argument points to an existing JSON file.",
+    );
   }
   // If no eval-set provided, check that log files exist for auto-generation
@@ -1172,12 +1195,11 @@ Options:
     const hasSkillLog = querySkillUsageRecords(dbCheck).length > 0;
     const hasQueryLog = existsSync(QUERY_LOG);
     if (!hasSkillLog && !hasQueryLog) {
-      console.error("[ERROR] No eval set provided and no telemetry logs found.");
-      console.error(
-        "  Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
+      throw new CLIError(
+        `No eval set provided and no telemetry logs found. Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`,
+        "MISSING_DATA",
+        "Either pass --eval-set <path> or generate logs first by using selftune-enabled skills.",
       );
-      console.error(`  Expected logs at: ${SKILL_LOG} and ${QUERY_LOG}`);
-      process.exit(1);
     }
   }
@@ -1244,6 +1266,12 @@ Options:
       rationale: result.proposal?.rationale ?? "",
       ...(result.skillVersion ? { version: result.skillVersion } : {}),
       dashboard_url: `http://localhost:3141/report/${encodeURIComponent(values.skill)}`,
+      ...(result.descriptionQualityBefore != null
+        ? { description_quality_before: result.descriptionQualityBefore }
+        : {}),
+      ...(result.descriptionQualityAfter != null
+        ? { description_quality_after: result.descriptionQualityAfter }
+        : {}),
     };
     console.log(JSON.stringify(summary, null, 2));
   }
@@ -1276,20 +1304,5 @@ Options:
 }
 if (import.meta.main) {
-  cliMain().catch((err) => {
-    const message = err instanceof Error ? err.message : String(err);
-    const stack = err instanceof Error ? err.stack : undefined;
-    console.error(`[FATAL] ${message}`);
-    if (stack && process.env.SELFTUNE_VERBOSE === "1") {
-      console.error(stack);
-    }
-    console.error(
-      "\nTroubleshooting:\n" +
-        "  - Verify --skill-path points to a valid SKILL.md file\n" +
-        "  - Ensure eval data exists (run `selftune eval generate` first) or pass --eval-set\n" +
-        "  - Check that ANTHROPIC_API_KEY is set if using Claude\n" +
-        "  - Re-run with --verbose for full diagnostic output",
-    );
-    process.exit(1);
-  });
+  cliMain().catch(handleCLIError);
 }

package/cli/selftune/evolution/rollback.ts CHANGED

@@ -13,6 +13,7 @@ import { parseArgs } from "node:util";
 import { updateContextAfterRollback } from "../memory/writer.js";
 import type { EvolutionAuditEntry } from "../types.js";
+import { CLIError, handleCLIError } from "../utils/cli-error.js";
 import { replaceDescription } from "../utils/frontmatter.js";
 import { appendAuditEntry, getLastDeployedProposal, readAuditTrail } from "./audit.js";
@@ -233,8 +234,11 @@ Options:
   }
   if (!values.skill || !values["skill-path"]) {
-    console.error("[ERROR] --skill and --skill-path are required");
-    process.exit(1);
+    throw new CLIError(
+      "--skill and --skill-path are required",
+      "MISSING_FLAG",
+      "selftune evolve rollback --skill <name> --skill-path <path>",
+    );
   }
   const result = await rollback({
@@ -248,8 +252,5 @@ Options:
 }
 if (import.meta.main) {
-  cliMain().catch((err) => {
-    console.error(`[FATAL] ${err}`);
-    process.exit(1);
-  });
+  cliMain().catch(handleCLIError);
 }

package/cli/selftune/grading/auto-grade.ts CHANGED

@@ -17,6 +17,7 @@ import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
 import { getDb } from "../localdb/db.js";
 import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
 import type { GradingResult, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
+import { CLIError, handleCLIError } from "../utils/cli-error.js";
 import { detectAgent as _detectAgent } from "../utils/llm-call.js";
 import { readExcerpt } from "../utils/transcript.js";
 import {
@@ -62,8 +63,7 @@ Options:
   const skill = values.skill;
   if (!skill) {
-    console.error("[ERROR] --skill is required");
-    process.exit(1);
+    throw new CLIError("--skill is required", "MISSING_FLAG", "selftune auto-grade --skill <name>");
   }
   // --- Determine agent ---
@@ -71,10 +71,11 @@ Options:
   const validAgents = [...AGENT_CANDIDATES];
   if (values.agent) {
     if (!validAgents.includes(values.agent)) {
-      console.error(
-        `[ERROR] Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
+      throw new CLIError(
+        `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
+        "INVALID_FLAG",
+        `selftune auto-grade --skill <name> --agent ${validAgents[0]}`,
       );
-      process.exit(1);
     }
     agent = values.agent;
   } else {
@@ -82,11 +83,11 @@ Options:
   }
   if (!agent) {
-    console.error(
-      `[ERROR] No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH.\n` +
-        "Install one of the supported agent CLIs.",
+    throw new CLIError(
+      `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
+      "AGENT_NOT_FOUND",
+      "Install one of the supported agent CLIs",
     );
-    process.exit(1);
   }
   console.error(`[INFO] Auto-grade via agent: ${agent}`);
@@ -104,21 +105,22 @@ Options:
     sessionId = values["session-id"];
     const resolved = resolveSessionById(telRecords, sessionId);
     if (!resolved) {
-      console.error(
-        `[ERROR] Session '${sessionId}' not found in telemetry or recoverable transcript data. ` +
-          "Check the session ID or omit --session-id to auto-select the latest matching session.",
+      throw new CLIError(
+        `Session '${sessionId}' not found in telemetry or recoverable transcript data`,
+        "MISSING_DATA",
+        "Check the session ID or omit --session-id to auto-select the latest matching session",
       );
-      process.exit(1);
     }
     telemetry = resolved.telemetry;
     transcriptPath = resolved.transcriptPath;
   } else {
     const resolved = resolveLatestSessionForSkill(telRecords, skillUsageRecords, skill);
     if (!resolved) {
-      console.error(
-        `[ERROR] No session found for skill '${skill}'. Run the skill first, or pass --session-id.`,
+      throw new CLIError(
+        `No session found for skill '${skill}'`,
+        "MISSING_DATA",
+        "Run the skill first, or pass --session-id",
       );
-      process.exit(1);
     }
     telemetry = resolved.telemetry;
     sessionId = resolved.sessionId ?? "unknown";
@@ -159,8 +161,11 @@ Options:
       agent,
     });
   } catch (err) {
-    console.error(`[ERROR] ${err instanceof Error ? err.message : String(err)}`);
-    process.exit(1);
+    throw new CLIError(
+      `Grading failed: ${err instanceof Error ? err.message : String(err)}`,
+      "OPERATION_FAILED",
+      "Check agent availability and try again",
+    );
   }
   const outputPath = values.output ?? buildDefaultGradingOutputPath(sessionId);
@@ -193,8 +198,5 @@ Options:
 // Guard: only run when invoked directly
 if (import.meta.main) {
-  cliMain().catch((err) => {
-    console.error(`[FATAL] ${err}`);
-    process.exit(1);
-  });
+  cliMain().catch(handleCLIError);
 }