npm - selftune - Versions diffs - 0.1.4 → 0.2.1 - Mend

selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153) hide show

package/.claude/agents/diagnosis-analyst.md +156 -0
package/.claude/agents/evolution-reviewer.md +180 -0
package/.claude/agents/integration-guide.md +212 -0
package/.claude/agents/pattern-analyst.md +160 -0
package/CHANGELOG.md +46 -1
package/README.md +105 -257
package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
package/apps/local-dashboard/dist/favicon.png +0 -0
package/apps/local-dashboard/dist/index.html +17 -0
package/apps/local-dashboard/dist/logo.png +0 -0
package/apps/local-dashboard/dist/logo.svg +9 -0
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +99 -0
package/cli/selftune/canonical-export.ts +183 -0
package/cli/selftune/constants.ts +103 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-contract.ts +202 -0
package/cli/selftune/dashboard-server.ts +1049 -0
package/cli/selftune/dashboard.ts +43 -156
package/cli/selftune/eval/baseline.ts +248 -0
package/cli/selftune/eval/composability-v2.ts +273 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +101 -16
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evidence.ts +26 -0
package/cli/selftune/evolution/evolve-body.ts +586 -0
package/cli/selftune/evolution/evolve.ts +825 -116
package/cli/selftune/evolution/extract-patterns.ts +105 -16
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +21 -4
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/auto-grade.ts +200 -0
package/cli/selftune/grading/grade-session.ts +513 -42
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/grading/results.ts +42 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/prompt-log.ts +172 -2
package/cli/selftune/hooks/session-stop.ts +123 -3
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/hooks/skill-eval.ts +119 -3
package/cli/selftune/index.ts +415 -48
package/cli/selftune/ingestors/claude-replay.ts +377 -0
package/cli/selftune/ingestors/codex-rollout.ts +345 -46
package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
package/cli/selftune/init.ts +376 -16
package/cli/selftune/last.ts +14 -5
package/cli/selftune/localdb/db.ts +63 -0
package/cli/selftune/localdb/materialize.ts +428 -0
package/cli/selftune/localdb/queries.ts +376 -0
package/cli/selftune/localdb/schema.ts +204 -0
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +90 -16
package/cli/selftune/normalization.ts +682 -0
package/cli/selftune/observability.ts +19 -44
package/cli/selftune/orchestrate.ts +1073 -0
package/cli/selftune/quickstart.ts +203 -0
package/cli/selftune/repair/skill-usage.ts +576 -0
package/cli/selftune/schedule.ts +561 -0
package/cli/selftune/status.ts +59 -33
package/cli/selftune/sync.ts +627 -0
package/cli/selftune/types.ts +525 -5
package/cli/selftune/utils/canonical-log.ts +45 -0
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/hooks.ts +41 -0
package/cli/selftune/utils/html.ts +27 -0
package/cli/selftune/utils/llm-call.ts +103 -19
package/cli/selftune/utils/math.ts +10 -0
package/cli/selftune/utils/query-filter.ts +139 -0
package/cli/selftune/utils/skill-discovery.ts +340 -0
package/cli/selftune/utils/skill-log.ts +68 -0
package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
package/cli/selftune/utils/transcript.ts +307 -26
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/cli/selftune/workflows/discover.ts +254 -0
package/cli/selftune/workflows/skill-md-writer.ts +288 -0
package/cli/selftune/workflows/workflows.ts +188 -0
package/package.json +28 -11
package/packages/telemetry-contract/README.md +11 -0
package/packages/telemetry-contract/fixtures/golden.json +87 -0
package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
package/packages/telemetry-contract/index.ts +1 -0
package/packages/telemetry-contract/package.json +19 -0
package/packages/telemetry-contract/src/index.ts +2 -0
package/packages/telemetry-contract/src/types.ts +163 -0
package/packages/telemetry-contract/src/validators.ts +109 -0
package/skill/SKILL.md +180 -33
package/skill/Workflows/AutoActivation.md +145 -0
package/skill/Workflows/Badge.md +124 -0
package/skill/Workflows/Baseline.md +144 -0
package/skill/Workflows/Composability.md +107 -0
package/skill/Workflows/Contribute.md +94 -0
package/skill/Workflows/Cron.md +132 -0
package/skill/Workflows/Dashboard.md +214 -0
package/skill/Workflows/Doctor.md +63 -14
package/skill/Workflows/Evals.md +110 -18
package/skill/Workflows/EvolutionMemory.md +154 -0
package/skill/Workflows/Evolve.md +181 -21
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/Grade.md +36 -31
package/skill/Workflows/ImportSkillsBench.md +117 -0
package/skill/Workflows/Ingest.md +142 -21
package/skill/Workflows/Initialize.md +91 -23
package/skill/Workflows/Orchestrate.md +139 -0
package/skill/Workflows/Replay.md +91 -0
package/skill/Workflows/Rollback.md +23 -4
package/skill/Workflows/Schedule.md +61 -0
package/skill/Workflows/Sync.md +88 -0
package/skill/Workflows/UnitTest.md +150 -0
package/skill/Workflows/Watch.md +33 -1
package/skill/Workflows/Workflows.md +129 -0
package/skill/assets/activation-rules-default.json +26 -0
package/skill/assets/multi-skill-settings.json +63 -0
package/skill/assets/single-skill-settings.json +57 -0
package/skill/references/invocation-taxonomy.md +2 -2
package/skill/references/logs.md +164 -2
package/skill/references/setup-patterns.md +65 -0
package/skill/references/version-history.md +40 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0
package/dashboard/index.html +0 -1119

package/cli/selftune/dashboard.ts CHANGED Viewed

@@ -1,124 +1,12 @@
 /**
- * selftune dashboard — Exports JSONL data into a standalone HTML viewer.
+ * selftune dashboard — Start the local React SPA dashboard server.
  *
  * Usage:
- *   selftune dashboard              — Open dashboard in default browser
- *   selftune dashboard --export     — Export data-embedded HTML to stdout
- *   selftune dashboard --out FILE   — Write data-embedded HTML to FILE
+ *   selftune dashboard              — Start server on port 3141 and open browser
+ *   selftune dashboard --port 8080  — Start on custom port
+ *   selftune dashboard --serve      — Deprecated alias for the default behavior
  */
-import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
-import { homedir } from "node:os";
-import { dirname, join, resolve } from "node:path";
-import { EVOLUTION_AUDIT_LOG, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "./constants.js";
-import { getLastDeployedProposal, readAuditTrail } from "./evolution/audit.js";
-import { computeMonitoringSnapshot } from "./monitoring/watch.js";
-import type {
-  EvolutionAuditEntry,
-  QueryLogRecord,
-  SessionTelemetryRecord,
-  SkillUsageRecord,
-} from "./types.js";
-import { readJsonl } from "./utils/jsonl.js";
-function findViewerHTML(): string {
-  // Try relative to this module first (works for both dev and installed)
-  const candidates = [
-    join(dirname(import.meta.dir), "..", "dashboard", "index.html"),
-    join(dirname(import.meta.dir), "dashboard", "index.html"),
-    resolve("dashboard", "index.html"),
-  ];
-  for (const c of candidates) {
-    if (existsSync(c)) return c;
-  }
-  throw new Error("Could not find dashboard/index.html. Ensure it exists in the selftune repo.");
-}
-function buildEmbeddedHTML(): string {
-  const template = readFileSync(findViewerHTML(), "utf-8");
-  const telemetry = readJsonl<SessionTelemetryRecord>(TELEMETRY_LOG);
-  const skills = readJsonl<SkillUsageRecord>(SKILL_LOG);
-  const queries = readJsonl<QueryLogRecord>(QUERY_LOG);
-  const evolution = readJsonl<EvolutionAuditEntry>(EVOLUTION_AUDIT_LOG);
-  const totalRecords = telemetry.length + skills.length + queries.length + evolution.length;
-  if (totalRecords === 0) {
-    console.error("No log data found. Run some sessions first.");
-    console.error(`  Checked: ${TELEMETRY_LOG}`);
-    console.error(`           ${SKILL_LOG}`);
-    console.error(`           ${QUERY_LOG}`);
-    console.error(`           ${EVOLUTION_AUDIT_LOG}`);
-    process.exit(1);
-  }
-  // Compute per-skill monitoring snapshots
-  const skillNames = [...new Set(skills.map((r) => r.skill_name))];
-  const snapshots: Record<string, ReturnType<typeof computeMonitoringSnapshot>> = {};
-  for (const name of skillNames) {
-    const lastDeployed = getLastDeployedProposal(name);
-    const baselinePassRate = lastDeployed?.eval_snapshot?.pass_rate ?? 0.5;
-    snapshots[name] = computeMonitoringSnapshot(
-      name,
-      telemetry,
-      skills,
-      queries,
-      telemetry.length,
-      baselinePassRate,
-    );
-  }
-  // Compute unmatched queries
-  const triggeredQueries = new Set(
-    skills.filter((r) => r.triggered).map((r) => r.query.toLowerCase().trim()),
-  );
-  const unmatched = queries
-    .filter((q) => !triggeredQueries.has(q.query.toLowerCase().trim()))
-    .map((q) => ({
-      timestamp: q.timestamp,
-      session_id: q.session_id,
-      query: q.query,
-    }));
-  // Compute pending proposals
-  const auditTrail = readAuditTrail();
-  const proposalStatus: Record<string, string[]> = {};
-  for (const e of auditTrail) {
-    if (!proposalStatus[e.proposal_id]) proposalStatus[e.proposal_id] = [];
-    proposalStatus[e.proposal_id].push(e.action);
-  }
-  // Deduplicate by proposal_id: one entry per pending proposal
-  const terminalActions = new Set(["deployed", "rejected", "rolled_back"]);
-  const seenProposals = new Set<string>();
-  const pendingProposals = auditTrail.filter((e) => {
-    if (e.action !== "created" && e.action !== "validated") return false;
-    if (seenProposals.has(e.proposal_id)) return false;
-    const actions = proposalStatus[e.proposal_id] || [];
-    const isPending = !actions.some((a: string) => terminalActions.has(a));
-    if (isPending) seenProposals.add(e.proposal_id);
-    return isPending;
-  });
-  const data = {
-    telemetry,
-    skills,
-    queries,
-    evolution,
-    computed: {
-      snapshots,
-      unmatched,
-      pendingProposals,
-    },
-  };
-  // Inject embedded data right before </body>
-  // Escape </script> sequences to prevent XSS via embedded JSON
-  const safeJson = JSON.stringify(data).replace(/<\/script>/gi, "<\\/script>");
-  const dataScript = `<script id="embedded-data" type="application/json">${safeJson}</script>`;
-  return template.replace("</body>", `${dataScript}\n</body>`);
-}
 export async function cliMain(): Promise<void> {
   const args = process.argv.slice(2);
@@ -126,51 +14,50 @@ export async function cliMain(): Promise<void> {
     console.log(`selftune dashboard — Visual data dashboard
 Usage:
-  selftune dashboard              Open dashboard in default browser
-  selftune dashboard --export     Export data-embedded HTML to stdout
-  selftune dashboard --out FILE   Write data-embedded HTML to FILE`);
+  selftune dashboard                      Start dashboard server (port 3141)
+  selftune dashboard --port 8080          Start on custom port
+  selftune dashboard --serve              Deprecated alias for default behavior
+  selftune dashboard --no-open            Start server without opening browser`);
     process.exit(0);
   }
-  if (args.includes("--export")) {
-    process.stdout.write(buildEmbeddedHTML());
-    return;
+  if (args.includes("--export") || args.includes("--out")) {
+    console.error("Legacy dashboard export was removed.");
+    console.error(
+      "Use `selftune dashboard` to run the SPA locally, then share a route or screenshot instead.",
+    );
+    process.exit(1);
   }
-  const outIdx = args.indexOf("--out");
-  if (outIdx !== -1) {
-    const outPath = args[outIdx + 1];
-    if (!outPath) {
-      console.error("--out requires a file path argument");
+  const portIdx = args.indexOf("--port");
+  let port: number | undefined;
+  if (portIdx !== -1) {
+    const parsed = Number.parseInt(args[portIdx + 1], 10);
+    if (!Number.isInteger(parsed) || parsed < 1 || parsed > 65535) {
+      console.error(`Invalid port "${args[portIdx + 1]}": must be an integer between 1 and 65535.`);
       process.exit(1);
     }
-    const html = buildEmbeddedHTML();
-    writeFileSync(outPath, html, "utf-8");
-    console.log(`Dashboard written to ${outPath}`);
-    return;
-  }
-  // Default: write to temp file and open in browser
-  const tmpDir = join(homedir(), ".selftune");
-  if (!existsSync(tmpDir)) {
-    mkdirSync(tmpDir, { recursive: true });
-  }
-  const tmpPath = join(tmpDir, "dashboard.html");
-  const html = buildEmbeddedHTML();
-  writeFileSync(tmpPath, html, "utf-8");
-  console.log(`Dashboard saved to ${tmpPath}`);
-  console.log("Opening in browser...");
-  try {
-    const platform = process.platform;
-    const cmd = platform === "darwin" ? "open" : platform === "linux" ? "xdg-open" : null;
-    if (!cmd) throw new Error("Unsupported platform");
-    const proc = Bun.spawn([cmd, tmpPath], { stdio: ["ignore", "ignore", "ignore"] });
-    await proc.exited;
-    if (proc.exitCode !== 0) throw new Error(`Failed to launch ${cmd}`);
-  } catch {
-    console.log(`Open manually: file://${tmpPath}`);
-  }
-  process.exit(0);
+    port = parsed;
+  }
+  if (args.includes("--serve")) {
+    console.warn("`selftune dashboard --serve` is deprecated; use `selftune dashboard` instead.");
+  }
+  const openBrowser = !args.includes("--no-open");
+  const { startDashboardServer } = await import("./dashboard-server.js");
+  const { stop } = await startDashboardServer({ port, openBrowser });
+  await new Promise<void>((resolve) => {
+    let closed = false;
+    const keepAlive = setInterval(() => {}, 1 << 30);
+    const shutdown = () => {
+      if (closed) return;
+      closed = true;
+      clearInterval(keepAlive);
+      stop();
+      resolve();
+    };
+    process.on("SIGINT", shutdown);
+    process.on("SIGTERM", shutdown);
+  });
 }

package/cli/selftune/eval/baseline.ts ADDED Viewed

@@ -0,0 +1,248 @@
+/**
+ * baseline.ts
+ *
+ * Measures the value a skill adds over a no-skill baseline.
+ *
+ * Runs trigger checks against an EMPTY string description (no-skill baseline)
+ * and against the current description (with-skill), then computes lift.
+ * A skill "adds value" when lift >= 0.05 (5 percentage points).
+ */
+import { parseArgs } from "node:util";
+import type { BaselineResult, EvalEntry } from "../types.js";
+import { callLlm } from "../utils/llm-call.js";
+import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+/** Inputs for measureBaseline(). */
+export interface BaselineOptions {
+  /** Eval entries to check; each one is run once with an empty description and once with `skillDescription`. */
+  evalSet: EvalEntry[];
+  /** Description text used to build the with-skill trigger-check prompt. */
+  skillDescription: string;
+  /** Skill name copied into the measurement and into every per-entry result. */
+  skillName: string;
+  /** Agent identifier forwarded to the LLM call. */
+  agent: string;
+  /** Optional model flag forwarded to the LLM call. */
+  modelFlag?: string;
+}
+/** Aggregate result returned by measureBaseline(). */
+export interface BaselineMeasurement {
+  /** Name of the skill that was measured. */
+  skill_name: string;
+  /** Fraction of eval entries that passed when checked against an empty description. */
+  baseline_pass_rate: number;
+  /** Fraction of eval entries that passed when checked against the skill description. */
+  with_skill_pass_rate: number;
+  /** with_skill_pass_rate minus baseline_pass_rate. */
+  lift: number;
+  /** True when lift is at least LIFT_THRESHOLD. */
+  adds_value: boolean;
+  /** Individual check results: two per eval entry (baseline first, then with-skill). */
+  per_entry: BaselineResult[];
+  /** ISO-8601 timestamp (Date#toISOString) taken when the measurement object was built. */
+  measured_at: string;
+}
+/**
+ * Injectable dependencies for measureBaseline(). When omitted, the real
+ * module imports are used. Pass overrides in tests to avoid real LLM calls.
+ */
+export interface BaselineDeps {
+  /** Replacement for the imported callLlm; measureBaseline() falls back to the real one when this is undefined. */
+  callLlm?: typeof callLlm;
+}
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+/** Minimum lift (0.05 = 5 percentage points) for a skill to be reported as adding value. */
+const LIFT_THRESHOLD = 0.05;
+/** System prompt sent with every trigger check, for both the baseline and the with-skill run. */
+const SYSTEM_PROMPT = "You are an evaluation assistant. Answer only YES or NO.";
+// ---------------------------------------------------------------------------
+// Core measurement
+// ---------------------------------------------------------------------------
+/**
+ * Measure baseline vs. with-skill trigger accuracy across an eval set.
+ *
+ * Every entry is checked twice, sequentially: first against an empty
+ * description (the no-skill baseline), then against `skillDescription`.
+ * A check passes when the parsed trigger decision agrees with the entry's
+ * `should_trigger`.
+ *
+ * @param options - Eval set, skill description and name, plus the agent and
+ *   optional model flag that are forwarded to the LLM call.
+ * @param _deps - Optional overrides; `callLlm` replaces the real LLM call.
+ * @returns Pass rates for both conditions, their difference (`lift`), the
+ *   `adds_value` flag (lift >= LIFT_THRESHOLD), and two `per_entry` rows per
+ *   eval entry (baseline first, then with-skill). An empty eval set yields
+ *   zeroed rates, `adds_value: false`, and makes no LLM calls.
+ */
+export async function measureBaseline(
+  options: BaselineOptions,
+  _deps: BaselineDeps = {},
+): Promise<BaselineMeasurement> {
+  const { evalSet, skillDescription, skillName, agent, modelFlag } = options;
+  const _callLlm = _deps.callLlm ?? callLlm;
+  if (evalSet.length === 0) {
+    // Nothing to measure: return early so the rate computation never divides by zero.
+    return {
+      skill_name: skillName,
+      baseline_pass_rate: 0,
+      with_skill_pass_rate: 0,
+      lift: 0,
+      adds_value: false,
+      per_entry: [],
+      measured_at: new Date().toISOString(),
+    };
+  }
+  const perEntry: BaselineResult[] = [];
+  // Run one trigger check, append its row to perEntry, and report whether it passed.
+  const runCheck = async (
+    entry: EvalEntry,
+    description: string,
+    withSkill: boolean,
+  ): Promise<boolean> => {
+    const prompt = buildTriggerCheckPrompt(description, entry.query);
+    const raw = await _callLlm(SYSTEM_PROMPT, prompt, agent, modelFlag);
+    const triggered = parseTriggerResponse(raw);
+    const pass = (entry.should_trigger && triggered) || (!entry.should_trigger && !triggered);
+    perEntry.push({
+      skill_name: skillName,
+      query: entry.query,
+      with_skill: withSkill,
+      triggered,
+      pass,
+      measured_at: new Date().toISOString(),
+    });
+    return pass;
+  };
+  let baselinePassed = 0;
+  let withSkillPassed = 0;
+  for (const entry of evalSet) {
+    // Baseline check (empty description) runs first so per_entry keeps its original ordering.
+    if (await runCheck(entry, "", false)) baselinePassed++;
+    // With-skill check (actual description).
+    if (await runCheck(entry, skillDescription, true)) withSkillPassed++;
+  }
+  const total = evalSet.length;
+  const baselinePassRate = baselinePassed / total;
+  const withSkillPassRate = withSkillPassed / total;
+  // Derive lift from the integer pass counts instead of subtracting the two
+  // rates: 0.35 - 0.3 evaluates to 0.04999999999999999, which would wrongly
+  // fall below the threshold for an exact 5-point improvement (6/20 -> 7/20).
+  const lift = (withSkillPassed - baselinePassed) / total;
+  return {
+    skill_name: skillName,
+    baseline_pass_rate: baselinePassRate,
+    with_skill_pass_rate: withSkillPassRate,
+    lift,
+    adds_value: lift >= LIFT_THRESHOLD,
+    per_entry: perEntry,
+    measured_at: new Date().toISOString(),
+  };
+}
+// ---------------------------------------------------------------------------
+// CLI entry point
+// ---------------------------------------------------------------------------
+/**
+ * CLI entry point for the baseline measurement.
+ *
+ * Parses `--skill`, `--skill-path`, `--eval-set`, `--agent` and `--help`,
+ * reads the file at `--skill-path` (its full contents are used as the skill
+ * description), loads the eval set from `--eval-set` or builds it from the
+ * logs when the flag is omitted, resolves the agent CLI, runs
+ * measureBaseline(), and prints the result as pretty-printed JSON.
+ *
+ * Exit codes: 0 when the skill adds value (or after `--help`); 1 when it does
+ * not, or on any usage/lookup error.
+ */
+export async function cliMain(): Promise<void> {
+  const { values } = parseArgs({
+    options: {
+      skill: { type: "string" },
+      "skill-path": { type: "string" },
+      "eval-set": { type: "string" },
+      agent: { type: "string" },
+      help: { type: "boolean", default: false },
+    },
+    strict: true,
+  });
+  if (values.help) {
+    console.log(`selftune grade baseline — Measure skill value vs. no-skill baseline
+Usage:
+  selftune grade baseline --skill <name> --skill-path <path> [options]
+Options:
+  --skill         Skill name (required)
+  --skill-path    Path to SKILL.md (required)
+  --eval-set      Path to eval set JSON (optional, builds from logs if omitted)
+  --agent         Agent CLI to use (claude, codex, opencode)
+  --help          Show this help message`);
+    process.exit(0);
+  }
+  if (!values.skill || !values["skill-path"]) {
+    console.error("[ERROR] --skill and --skill-path are required");
+    process.exit(1);
+  }
+  const { existsSync, readFileSync } = await import("node:fs");
+  // Read skill description
+  const skillPath = values["skill-path"];
+  if (!existsSync(skillPath)) {
+    console.error(`[ERROR] SKILL.md not found at ${skillPath}`);
+    process.exit(1);
+  }
+  const skillDescription = readFileSync(skillPath, "utf-8");
+  // Load eval set
+  let evalSet: EvalEntry[];
+  const evalSetPath = values["eval-set"];
+  if (evalSetPath) {
+    // An explicitly requested eval set must exist; silently falling back to
+    // log-derived data would measure something other than what was asked for.
+    if (!existsSync(evalSetPath)) {
+      console.error(`[ERROR] Eval set not found at ${evalSetPath}`);
+      process.exit(1);
+    }
+    const parsed: unknown = JSON.parse(readFileSync(evalSetPath, "utf-8"));
+    // measureBaseline() iterates the eval set, so reject non-array payloads up front.
+    if (!Array.isArray(parsed)) {
+      console.error(`[ERROR] Eval set at ${evalSetPath} must be a JSON array`);
+      process.exit(1);
+    }
+    evalSet = parsed as EvalEntry[];
+  } else {
+    // Build from logs
+    const { QUERY_LOG } = await import("../constants.js");
+    const { readJsonl } = await import("../utils/jsonl.js");
+    const { readEffectiveSkillUsageRecords } = await import("../utils/skill-log.js");
+    const { buildEvalSet } = await import("./hooks-to-evals.js");
+    const skillRecords = readEffectiveSkillUsageRecords();
+    const queryRecords = readJsonl(QUERY_LOG);
+    evalSet = buildEvalSet(skillRecords, queryRecords, values.skill);
+  }
+  // Detect agent
+  const { detectAgent } = await import("../utils/llm-call.js");
+  const requestedAgent = values.agent;
+  if (requestedAgent && !Bun.which(requestedAgent)) {
+    console.error(
+      JSON.stringify({
+        level: "error",
+        code: "agent_not_in_path",
+        message: `Agent CLI '${requestedAgent}' not found in PATH.`,
+        action: "Install it or omit --agent to use auto-detection.",
+      }),
+    );
+    process.exit(1);
+  }
+  const agent = requestedAgent ?? detectAgent();
+  if (!agent) {
+    console.error(
+      JSON.stringify({
+        level: "error",
+        code: "agent_not_found",
+        message: "No agent CLI (claude/codex/opencode) found in PATH.",
+        action: "Install Claude Code, Codex, or OpenCode.",
+      }),
+    );
+    process.exit(1);
+  }
+  const result = await measureBaseline({
+    evalSet,
+    skillDescription,
+    skillName: values.skill,
+    agent,
+  });
+  console.log(JSON.stringify(result, null, 2));
+  // The exit status mirrors the verdict so scripts can gate on it.
+  process.exit(result.adds_value ? 0 : 1);
+}
+// Run the CLI only when this module is the entry point; any rejection from
+// cliMain() is reported as a single JSON line on stderr and exits with status 1.
+if (import.meta.main) {
+  cliMain().catch((failure: unknown) => {
+    let message: string;
+    let stack: string | undefined;
+    if (failure instanceof Error) {
+      message = failure.message;
+      stack = failure.stack;
+    } else {
+      message = String(failure);
+      stack = undefined;
+    }
+    const report = { level: "fatal", message, stack };
+    console.error(JSON.stringify(report));
+    process.exit(1);
+  });
+}