npm - selftune - Versions diffs - 0.2.29 → 0.2.31 - Mend

selftune 0.2.29 → 0.2.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +15 -0
package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
package/apps/local-dashboard/dist/index.html +3 -3
package/cli/selftune/auto-update.ts +40 -8
package/cli/selftune/command-surface.ts +1 -1
package/cli/selftune/constants.ts +5 -0
package/cli/selftune/dashboard-action-events.ts +117 -0
package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
package/cli/selftune/dashboard-action-result.ts +90 -0
package/cli/selftune/dashboard-action-stream.ts +252 -0
package/cli/selftune/dashboard-contract.ts +81 -1
package/cli/selftune/dashboard-server.ts +133 -16
package/cli/selftune/eval/hooks-to-evals.ts +157 -0
package/cli/selftune/eval/synthetic-evals.ts +33 -2
package/cli/selftune/eval/unit-test-cli.ts +53 -5
package/cli/selftune/evolution/validate-host-replay.ts +191 -14
package/cli/selftune/index.ts +4 -0
package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
package/cli/selftune/localdb/schema.ts +34 -0
package/cli/selftune/registry/github-install.ts +256 -0
package/cli/selftune/registry/index.ts +1 -1
package/cli/selftune/registry/install.ts +58 -7
package/cli/selftune/routes/actions.ts +273 -42
package/cli/selftune/testing-readiness.ts +203 -10
package/cli/selftune/utils/llm-call.ts +90 -1
package/package.json +1 -1
package/packages/dashboard-core/src/routes/manifest.ts +2 -2
package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
package/packages/ui/src/primitives/button.tsx +5 -0
package/skill/SKILL.md +1 -1
package/skill/workflows/Dashboard.md +50 -23
package/skill/workflows/Registry.md +19 -13
package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1

package/cli/selftune/dashboard-server.ts CHANGED

@@ -19,12 +19,14 @@
  */
 import type { Database } from "bun:sqlite";
-import { existsSync, readFileSync, unwatchFile, watchFile } from "node:fs";
+import { existsSync, readFileSync, statSync, unwatchFile, watchFile } from "node:fs";
 import { dirname, extname, isAbsolute, join, relative, resolve } from "node:path";
 import type { BadgeFormat } from "./badge/badge-data.js";
-import { LOG_DIR, SELFTUNE_CONFIG_DIR } from "./constants.js";
+import { getCachedUpdateStatus } from "./auto-update.js";
+import { DASHBOARD_ACTION_STREAM_LOG, LOG_DIR, SELFTUNE_CONFIG_DIR } from "./constants.js";
 import type {
+  DashboardActionEvent,
   HealthResponse,
   OverviewResponse,
   SkillReportResponse,
@@ -53,6 +55,7 @@ import {
 import type { StatusResult } from "./status.js";
 import { computeStatus } from "./status.js";
 import type { EvolutionAuditEntry, EvolutionEvidenceEntry } from "./types.js";
+import { readJsonlFrom } from "./utils/jsonl.js";
 export interface DashboardServerOptions {
   port?: number;
@@ -72,6 +75,13 @@ interface DashboardSocketData {
   upstreamUrl?: string;
 }
+interface ActionEventHistoryEntry {
+  eventId: string;
+  updatedAt: number;
+  finished: boolean;
+  events: DashboardActionEvent[];
+}
 /** Read selftune version from package.json (fresh on each call to pick up auto-updates). */
 const VERSION_PKG_PATH = join(import.meta.dir, "..", "..", "package.json");
 function getSelftuneVersion(): string {
@@ -189,7 +199,10 @@ async function serveSpaShell(spaDir: string | null): Promise<Response> {
   if (!spaDir) {
     return new Response("Dashboard build not found. Run `bun run build:dashboard` first.", {
       status: 503,
-      headers: { "Content-Type": "text/plain; charset=utf-8", ...corsHeaders() },
+      headers: {
+        "Content-Type": "text/plain; charset=utf-8",
+        ...corsHeaders(),
+      },
     });
   }
@@ -260,9 +273,11 @@ function withCors(response: Response): Response {
   });
 }
-export async function startDashboardServer(
-  options?: DashboardServerOptions,
-): Promise<{ server: ReturnType<typeof Bun.serve>; stop: () => void; port: number }> {
+export async function startDashboardServer(options?: DashboardServerOptions): Promise<{
+  server: ReturnType<typeof Bun.serve>;
+  stop: () => void;
+  port: number;
+}> {
   const port = options?.port ?? 3141;
   const hostname = options?.host ?? "localhost";
   const openBrowser = options?.openBrowser ?? true;
@@ -321,12 +336,60 @@ export async function startDashboardServer(
   // -- SSE (Server-Sent Events) live update layer -----------------------------
   const sseClients = new Set<ReadableStreamDefaultController>();
+  const actionEventHistory = new Map<string, ActionEventHistoryEntry>();
+  const MAX_ACTION_HISTORY_RUNS = 24;
+  const MAX_ACTION_HISTORY_EVENTS_PER_RUN = 320;
+  function trimActionEventHistory(): void {
+    if (actionEventHistory.size <= MAX_ACTION_HISTORY_RUNS) return;
+    const staleEntries = [...actionEventHistory.values()].sort((left, right) => {
+      if (left.finished !== right.finished) {
+        return left.finished ? -1 : 1;
+      }
+      return left.updatedAt - right.updatedAt;
+    });
-  function broadcastSSE(eventType: string): void {
-    const payload = `event: ${eventType}\ndata: ${JSON.stringify({ type: eventType, ts: Date.now() })}\n\n`;
+    while (actionEventHistory.size > MAX_ACTION_HISTORY_RUNS) {
+      const next = staleEntries.shift();
+      if (!next) break;
+      actionEventHistory.delete(next.eventId);
+    }
+  }
+  function rememberActionEvent(event: DashboardActionEvent): void {
+    const existing = actionEventHistory.get(event.event_id);
+    if (existing) {
+      existing.updatedAt = event.ts;
+      existing.finished = event.stage === "finished" ? true : existing.finished;
+      existing.events.push(event);
+      existing.events = existing.events.slice(-MAX_ACTION_HISTORY_EVENTS_PER_RUN);
+      return;
+    }
+    actionEventHistory.set(event.event_id, {
+      eventId: event.event_id,
+      updatedAt: event.ts,
+      finished: event.stage === "finished",
+      events: [event],
+    });
+    trimActionEventHistory();
+  }
+  function recentActionEventsForBackfill(): DashboardActionEvent[] {
+    return [...actionEventHistory.values()]
+      .sort((left, right) => left.updatedAt - right.updatedAt)
+      .flatMap((entry) => entry.events);
+  }
+  function broadcastSSE(eventType: string, payload: Record<string, unknown>): void {
+    if (eventType === "action") {
+      rememberActionEvent(payload as DashboardActionEvent);
+    }
+    const message = `event: ${eventType}\ndata: ${JSON.stringify(payload)}\n\n`;
     for (const controller of sseClients) {
       try {
-        controller.enqueue(new TextEncoder().encode(payload));
+        controller.enqueue(new TextEncoder().encode(message));
       } catch {
         sseClients.delete(controller);
       }
@@ -347,9 +410,16 @@ export async function startDashboardServer(
   // -- SQLite WAL watcher for push-based updates ------------------------------
   const walPath = `${DB_PATH}-wal`;
   let walWatcherActive = false;
+  const actionStreamPath =
+    process.env.SELFTUNE_DASHBOARD_ACTION_STREAM_LOG || DASHBOARD_ACTION_STREAM_LOG;
+  let actionStreamWatcherActive = false;
+  let actionStreamOffset = existsSync(actionStreamPath) ? statSync(actionStreamPath).size : 0;
   let fsDebounceTimer: ReturnType<typeof setTimeout> | null = null;
+  let actionStreamDebounceTimer: ReturnType<typeof setTimeout> | null = null;
   const FS_DEBOUNCE_MS = 500;
+  const ACTION_STREAM_DEBOUNCE_MS = 100;
+  const ACTION_STREAM_POLL_MS = 250;
   const proxiedSpaSockets = new Map<unknown, WebSocket>();
   function onWALChange(): void {
@@ -357,15 +427,36 @@ export async function startDashboardServer(
     fsDebounceTimer = setTimeout(() => {
       fsDebounceTimer = null;
       refreshV2DataImmediate();
-      broadcastSSE("update");
+      broadcastSSE("update", { type: "update", ts: Date.now() });
     }, FS_DEBOUNCE_MS);
   }
   watchFile(walPath, { interval: 500 }, onWALChange);
   walWatcherActive = true;
+  function flushActionStream(): void {
+    if (actionStreamDebounceTimer) return;
+    actionStreamDebounceTimer = setTimeout(() => {
+      actionStreamDebounceTimer = null;
+      const { records, newOffset } = readJsonlFrom<DashboardActionEvent>(
+        actionStreamPath,
+        actionStreamOffset,
+      );
+      actionStreamOffset = newOffset;
+      for (const record of records) {
+        broadcastSSE("action", record);
+      }
+    }, ACTION_STREAM_DEBOUNCE_MS);
+  }
+  const actionStreamPoller = setInterval(() => {
+    flushActionStream();
+  }, ACTION_STREAM_POLL_MS);
+  actionStreamWatcherActive = true;
   function getWatcherMode(): HealthResponse["watcher_mode"] {
-    return walWatcherActive ? "wal" : "none";
+    if (walWatcherActive && actionStreamWatcherActive) return "wal";
+    return walWatcherActive || actionStreamWatcherActive ? "wal" : "none";
   }
   let cachedStatusResult: StatusResult | null = null;
@@ -454,10 +545,15 @@ export async function startDashboardServer(
       // ---- GET /api/health ----
       if (url.pathname === "/api/health" && req.method === "GET") {
+        const updateStatus = getCachedUpdateStatus();
         const healthResponse: HealthResponse = {
           ok: true,
           service: "selftune-dashboard",
           version: getSelftuneVersion(),
+          latest_version: updateStatus.latestVersion,
+          update_available: updateStatus.updateAvailable,
+          auto_update_supported: updateStatus.autoUpdateSupported,
+          update_hint: updateStatus.updateHint,
           pid: process.pid,
           spa: Boolean(spaDir || spaProxyUrl),
           spa_mode: spaMode,
@@ -503,6 +599,11 @@ export async function startDashboardServer(
           start(controller) {
             sseClients.add(controller);
             controller.enqueue(new TextEncoder().encode(": connected\n\n"));
+            for (const event of recentActionEventsForBackfill()) {
+              controller.enqueue(
+                new TextEncoder().encode(`event: action\ndata: ${JSON.stringify(event)}\n\n`),
+              );
+            }
           },
           cancel(controller) {
             sseClients.delete(controller);
@@ -533,7 +634,10 @@ export async function startDashboardServer(
             `Dashboard SPA proxy unavailable at ${spaProxyUrl.toString()}: ${message}`,
             {
               status: 502,
-              headers: { "Content-Type": "text/plain; charset=utf-8", ...corsHeaders() },
+              headers: {
+                "Content-Type": "text/plain; charset=utf-8",
+                ...corsHeaders(),
+              },
             },
           );
         }
@@ -544,7 +648,10 @@ export async function startDashboardServer(
         const filePath = resolve(spaDir, `.${url.pathname}`);
         const rel = relative(spaDir, filePath);
         if (rel.startsWith("..") || isAbsolute(rel)) {
-          return new Response("Not Found", { status: 404, headers: corsHeaders() });
+          return new Response("Not Found", {
+            status: 404,
+            headers: corsHeaders(),
+          });
         }
         const bunFile = Bun.file(filePath);
         if (await bunFile.exists()) {
@@ -558,7 +665,10 @@ export async function startDashboardServer(
             },
           });
         }
-        return new Response("Not Found", { status: 404, headers: corsHeaders() });
+        return new Response("Not Found", {
+          status: 404,
+          headers: corsHeaders(),
+        });
       }
       // ---- GET / ---- Serve SPA shell
@@ -597,7 +707,10 @@ export async function startDashboardServer(
             { status: 400, headers: corsHeaders() },
           );
         }
-        return withCors(await handleAction(action, body, executeAction));
+        const emitActionEvent = (event: DashboardActionEvent) => {
+          broadcastSSE("action", event);
+        };
+        return withCors(await handleAction(action, body, executeAction, emitActionEvent));
       }
       // ---- GET /badge/:skillName ----
@@ -634,7 +747,9 @@ export async function startDashboardServer(
       // ---- GET /api/v2/overview ----
       if (url.pathname === "/api/v2/overview" && req.method === "GET") {
         if (getOverviewResponse) {
-          return Response.json(getOverviewResponse(), { headers: corsHeaders() });
+          return Response.json(getOverviewResponse(), {
+            headers: corsHeaders(),
+          });
         }
         if (!db) {
           return Response.json(
@@ -737,6 +852,7 @@ export async function startDashboardServer(
   const shutdownHandler = () => {
     unwatchFile(walPath, onWALChange);
     clearInterval(sseKeepaliveTimer);
+    clearInterval(actionStreamPoller);
     for (const c of sseClients) {
       try {
         c.close();
@@ -754,6 +870,7 @@ export async function startDashboardServer(
     }
     proxiedSpaSockets.clear();
     if (fsDebounceTimer) clearTimeout(fsDebounceTimer);
+    if (actionStreamDebounceTimer) clearTimeout(actionStreamDebounceTimer);
     closeSingleton();
     server.stop();
   };

package/cli/selftune/eval/hooks-to-evals.ts CHANGED

@@ -25,6 +25,10 @@ import { parseArgs } from "node:util";
 import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
 import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
+import {
+  createDashboardLlmObserver,
+  emitDashboardStepProgress,
+} from "../dashboard-action-instrumentation.js";
 import { getDb } from "../localdb/db.js";
 import {
   queryQueryLog,
@@ -615,16 +619,49 @@ export async function cliMain(): Promise<void> {
     const maxPerSide = Number.parseInt(values.max ?? "50", 10);
     const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
+    emitDashboardStepProgress({
+      current: 1,
+      total: 4,
+      status: "started",
+      phase: "load_skill",
+      label: "Load skill content",
+    });
     console.log(`Generating synthetic evals for skill '${values.skill}'...`);
     const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
       maxPositives: effectiveMax,
       maxNegatives: effectiveMax,
       modelFlag: values.model,
+      llmObserverFactory: createDashboardLlmObserver,
+    });
+    emitDashboardStepProgress({
+      current: 1,
+      total: 4,
+      status: "finished",
+      phase: "load_skill",
+      label: "Load skill content",
+      passed: true,
+      evidence: values["skill-path"],
     });
     const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
+    emitDashboardStepProgress({
+      current: 4,
+      total: 4,
+      status: "started",
+      phase: "write_eval_set",
+      label: "Write eval set",
+    });
     writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
     const canonicalPath = writeCanonicalEvalSet(values.skill, evalSet);
+    emitDashboardStepProgress({
+      current: 4,
+      total: 4,
+      status: "finished",
+      phase: "write_eval_set",
+      label: "Write eval set",
+      passed: true,
+      evidence: outputPath,
+    });
     const pos = evalSet.filter((e) => e.should_trigger);
     const neg = evalSet.filter((e) => !e.should_trigger);
@@ -666,6 +703,13 @@ export async function cliMain(): Promise<void> {
   const hasCustomQueryLog = queryLogPath !== QUERY_LOG;
   const hasCustomTelemetryLog = telemetryLogPath !== TELEMETRY_LOG;
+  emitDashboardStepProgress({
+    current: 1,
+    total: values.blend ? 5 : 3,
+    status: "started",
+    phase: "load_records",
+    label: "Load telemetry and query records",
+  });
   const db = hasCustomSkillLog && hasCustomQueryLog && hasCustomTelemetryLog ? undefined : getDb();
   skillRecords = hasCustomSkillLog
     ? readJsonl<SkillUsageRecord>(skillLogPath)
@@ -676,6 +720,15 @@ export async function cliMain(): Promise<void> {
   telemetryRecords = hasCustomTelemetryLog
     ? readJsonl<SessionTelemetryRecord>(telemetryLogPath)
     : (querySessionTelemetry(db!) as SessionTelemetryRecord[]);
+  emitDashboardStepProgress({
+    current: 1,
+    total: values.blend ? 5 : 3,
+    status: "finished",
+    phase: "load_records",
+    label: "Load telemetry and query records",
+    passed: true,
+    evidence: `${skillRecords.length} skill rows · ${queryRecords.length} query rows`,
+  });
   if (values["list-skills"]) {
     listSkills(skillRecords, queryRecords, telemetryRecords);
@@ -701,6 +754,13 @@ export async function cliMain(): Promise<void> {
   const searchDirs = getEvalSkillSearchDirs();
   const detectedSkillPath = findInstalledSkillPath(values.skill, searchDirs);
+  emitDashboardStepProgress({
+    current: 2,
+    total: values.blend ? 5 : 3,
+    status: "started",
+    phase: "build_eval_set",
+    label: "Build eval set",
+  });
   const evalSet = buildEvalSet(
     skillRecords,
     queryRecords,
@@ -710,6 +770,15 @@ export async function cliMain(): Promise<void> {
     seed,
     annotateTaxonomy,
   );
+  emitDashboardStepProgress({
+    current: 2,
+    total: values.blend ? 5 : 3,
+    status: "finished",
+    phase: "build_eval_set",
+    label: "Build eval set",
+    passed: true,
+    evidence: `${evalSet.length} entries`,
+  });
   const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
   if (positiveCount === 0 && values["auto-synthetic"]) {
@@ -731,6 +800,13 @@ export async function cliMain(): Promise<void> {
       );
     }
+    emitDashboardStepProgress({
+      current: 1,
+      total: 4,
+      status: "started",
+      phase: "load_skill",
+      label: "Load skill content",
+    });
     console.log(
       `No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
     );
@@ -739,10 +815,36 @@ export async function cliMain(): Promise<void> {
       maxPositives: effectiveMax,
       maxNegatives: effectiveMax,
       modelFlag: values.model,
+      llmObserverFactory: createDashboardLlmObserver,
+    });
+    emitDashboardStepProgress({
+      current: 1,
+      total: 4,
+      status: "finished",
+      phase: "load_skill",
+      label: "Load skill content",
+      passed: true,
+      evidence: skillPath,
     });
     const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
+    emitDashboardStepProgress({
+      current: 4,
+      total: 4,
+      status: "started",
+      phase: "write_eval_set",
+      label: "Write eval set",
+    });
     writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
     const canonicalPath = writeCanonicalEvalSet(values.skill, syntheticEvalSet);
+    emitDashboardStepProgress({
+      current: 4,
+      total: 4,
+      status: "finished",
+      phase: "write_eval_set",
+      label: "Write eval set",
+      passed: true,
+      evidence: outputPath,
+    });
     const pos = syntheticEvalSet.filter((e) => e.should_trigger);
     const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
@@ -789,23 +891,78 @@ export async function cliMain(): Promise<void> {
     }
     const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
+    emitDashboardStepProgress({
+      current: 1,
+      total: 5,
+      status: "started",
+      phase: "build_log_eval_set",
+      label: "Build log eval set",
+    });
+    emitDashboardStepProgress({
+      current: 1,
+      total: 5,
+      status: "finished",
+      phase: "build_log_eval_set",
+      label: "Build log eval set",
+      passed: true,
+      evidence: `${evalSet.length} entries`,
+    });
     console.log(`Generating synthetic evals for blending with '${values.skill}'...`);
     const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
       maxPositives: effectiveMax,
       maxNegatives: effectiveMax,
       modelFlag: values.model,
+      llmObserverFactory: ({ current, total, phase, label }) =>
+        createDashboardLlmObserver({
+          current: current + 1,
+          total: total + 1,
+          phase,
+          label,
+        }),
     });
+    emitDashboardStepProgress({
+      current: 4,
+      total: 5,
+      status: "started",
+      phase: "blend_eval_sets",
+      label: "Blend log and synthetic evals",
+    });
     finalEvalSet = blendEvalSets(evalSet, syntheticEvalSet);
     const stats = computeEvalSourceStats(finalEvalSet);
+    emitDashboardStepProgress({
+      current: 4,
+      total: 5,
+      status: "finished",
+      phase: "blend_eval_sets",
+      label: "Blend log and synthetic evals",
+      passed: true,
+      evidence: `${stats.total} total entries`,
+    });
     console.log(
       `Blended: ${stats.log} log + ${stats.blended} synthetic gap-fillers = ${stats.total} total`,
     );
   }
   const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
+  emitDashboardStepProgress({
+    current: values.blend ? 5 : 3,
+    total: values.blend ? 5 : 3,
+    status: "started",
+    phase: "write_eval_set",
+    label: "Write eval set",
+  });
   writeFileSync(outputPath, JSON.stringify(finalEvalSet, null, 2), "utf-8");
   const canonicalPath = writeCanonicalEvalSet(values.skill, finalEvalSet);
+  emitDashboardStepProgress({
+    current: values.blend ? 5 : 3,
+    total: values.blend ? 5 : 3,
+    status: "finished",
+    phase: "write_eval_set",
+    label: "Write eval set",
+    passed: true,
+    evidence: outputPath,
+  });
   printEvalStats(
     finalEvalSet,
     values.skill,

package/cli/selftune/eval/synthetic-evals.ts CHANGED

@@ -10,6 +10,7 @@ import { readFileSync } from "node:fs";
 import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
 import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
+import type { LlmCallObserver } from "../utils/llm-call.js";
 import { findInstalledSkillNames } from "../utils/skill-discovery.js";
 import { classifyInvocation } from "./invocation-classifier.js";
@@ -21,6 +22,12 @@ export interface SyntheticEvalOptions {
   maxPositives?: number;
   maxNegatives?: number;
   modelFlag?: string;
+  llmObserverFactory?: (step: {
+    current: number;
+    total: number;
+    phase: string;
+    label: string;
+  }) => LlmCallObserver | undefined;
 }
 interface RawSyntheticEntry {
@@ -484,7 +491,19 @@ export async function generateSyntheticEvals(
     siblingSkills,
   );
-  const raw = await callLlm(system, user, agent, options.modelFlag);
+  const raw = await callLlm(
+    system,
+    user,
+    agent,
+    options.modelFlag,
+    undefined,
+    options.llmObserverFactory?.({
+      current: 2,
+      total: 4,
+      phase: "draft_eval_set",
+      label: "Draft synthetic eval set",
+    }),
+  );
   const firstPass = dedupeEvalEntries(parseSyntheticResponse(raw, skillName));
   try {
@@ -496,7 +515,19 @@ export async function generateSyntheticEvals(
       maxNegatives,
       siblingSkills,
     );
-    const refinedRaw = await callLlm(refinement.system, refinement.user, agent, options.modelFlag);
+    const refinedRaw = await callLlm(
+      refinement.system,
+      refinement.user,
+      agent,
+      options.modelFlag,
+      undefined,
+      options.llmObserverFactory?.({
+        current: 3,
+        total: 4,
+        phase: "refine_eval_set",
+        label: "Refine synthetic eval set",
+      }),
+    );
     const refined = dedupeEvalEntries(parseSyntheticResponse(refinedRaw, skillName));
     const selected = selectBalancedEvalEntries(refined, maxPositives, maxNegatives, siblingSkills);
     if (

package/cli/selftune/eval/unit-test-cli.ts CHANGED

@@ -13,13 +13,17 @@
  *   --model <m>       Model flag for LLM calls
  */
-import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { existsSync, mkdirSync, readFileSync } from "node:fs";
 import { join } from "node:path";
 import { parseArgs } from "node:util";
 import { SELFTUNE_CONFIG_DIR } from "../constants.js";
+import {
+  createDashboardLlmObserver,
+  emitDashboardStepProgress,
+} from "../dashboard-action-instrumentation.js";
 import type { EvalEntry } from "../types.js";
-import { writeUnitTestRunResult } from "../testing-readiness.js";
+import { writeCanonicalUnitTests, writeUnitTestRunResult } from "../testing-readiness.js";
 import { CLIError } from "../utils/cli-error.js";
 import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
 import { generateUnitTests } from "./generate-unit-tests.js";
@@ -69,6 +73,13 @@ export async function cliMain(): Promise<void> {
     }
     let skillContent = `Skill: ${skillName}`;
+    emitDashboardStepProgress({
+      current: 1,
+      total: 3,
+      status: "started",
+      phase: "load_generation_inputs",
+      label: "Load skill and failure context",
+    });
     if (values["skill-path"] && existsSync(values["skill-path"])) {
       skillContent = readFileSync(values["skill-path"], "utf-8");
     } else if (values["skill-path"]) {
@@ -85,10 +96,31 @@ export async function cliMain(): Promise<void> {
         console.warn("[WARN] Failed to parse eval set. Proceeding without failure context.");
       }
     }
+    emitDashboardStepProgress({
+      current: 1,
+      total: 3,
+      status: "finished",
+      phase: "load_generation_inputs",
+      label: "Load skill and failure context",
+      passed: true,
+      evidence: `${evalFailures.length} eval failures`,
+    });
     const modelFlag = values.model;
     const llmCaller = (systemPrompt: string, userPrompt: string) =>
-      callLlm(systemPrompt, userPrompt, agent, modelFlag);
+      callLlm(
+        systemPrompt,
+        userPrompt,
+        agent,
+        modelFlag,
+        undefined,
+        createDashboardLlmObserver({
+          current: 2,
+          total: 3,
+          phase: "generate_tests",
+          label: "Generate unit tests",
+        }),
+      );
     console.log(`Generating unit tests for skill '${skillName}'...`);
     const tests = await generateUnitTests(skillName, skillContent, evalFailures, llmCaller);
@@ -98,9 +130,25 @@ export async function cliMain(): Promise<void> {
     }
     // Ensure output directory exists
+    emitDashboardStepProgress({
+      current: 3,
+      total: 3,
+      status: "started",
+      phase: "write_tests",
+      label: "Write generated tests",
+    });
     mkdirSync(unitTestDir, { recursive: true });
-    writeFileSync(testsPath, JSON.stringify(tests, null, 2), "utf-8");
-    console.log(`Generated ${tests.length} unit tests -> ${testsPath}`);
+    const storedPath = writeCanonicalUnitTests(skillName, tests, testsPath);
+    emitDashboardStepProgress({
+      current: 3,
+      total: 3,
+      status: "finished",
+      phase: "write_tests",
+      label: "Write generated tests",
+      passed: true,
+      evidence: storedPath,
+    });
+    console.log(`Generated ${tests.length} unit tests -> ${storedPath}`);
     return;
   }