selftune 0.2.29 → 0.2.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/apps/local-dashboard/dist/assets/index-B7v_o1WC.js +15 -0
  2. package/apps/local-dashboard/dist/assets/index-CrO77SVi.css +1 -0
  3. package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
  4. package/apps/local-dashboard/dist/index.html +3 -3
  5. package/cli/selftune/auto-update.ts +40 -8
  6. package/cli/selftune/command-surface.ts +1 -1
  7. package/cli/selftune/constants.ts +5 -0
  8. package/cli/selftune/dashboard-action-events.ts +117 -0
  9. package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
  10. package/cli/selftune/dashboard-action-result.ts +90 -0
  11. package/cli/selftune/dashboard-action-stream.ts +252 -0
  12. package/cli/selftune/dashboard-contract.ts +81 -1
  13. package/cli/selftune/dashboard-server.ts +133 -16
  14. package/cli/selftune/eval/hooks-to-evals.ts +157 -0
  15. package/cli/selftune/eval/synthetic-evals.ts +33 -2
  16. package/cli/selftune/eval/unit-test-cli.ts +53 -5
  17. package/cli/selftune/evolution/validate-host-replay.ts +191 -14
  18. package/cli/selftune/index.ts +4 -0
  19. package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
  20. package/cli/selftune/localdb/schema.ts +34 -0
  21. package/cli/selftune/registry/github-install.ts +256 -0
  22. package/cli/selftune/registry/index.ts +1 -1
  23. package/cli/selftune/registry/install.ts +58 -7
  24. package/cli/selftune/routes/actions.ts +273 -42
  25. package/cli/selftune/testing-readiness.ts +203 -10
  26. package/cli/selftune/utils/llm-call.ts +90 -1
  27. package/package.json +1 -1
  28. package/packages/dashboard-core/src/routes/manifest.ts +2 -2
  29. package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
  30. package/packages/ui/src/components/SkillReportPanels.tsx +7 -7
  31. package/packages/ui/src/primitives/button.tsx +5 -0
  32. package/skill/SKILL.md +1 -1
  33. package/skill/workflows/Dashboard.md +50 -23
  34. package/skill/workflows/Registry.md +19 -13
  35. package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
  36. package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
  37. package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1
@@ -19,12 +19,14 @@
19
19
  */
20
20
 
21
21
  import type { Database } from "bun:sqlite";
22
- import { existsSync, readFileSync, unwatchFile, watchFile } from "node:fs";
22
+ import { existsSync, readFileSync, statSync, unwatchFile, watchFile } from "node:fs";
23
23
  import { dirname, extname, isAbsolute, join, relative, resolve } from "node:path";
24
24
 
25
25
  import type { BadgeFormat } from "./badge/badge-data.js";
26
- import { LOG_DIR, SELFTUNE_CONFIG_DIR } from "./constants.js";
26
+ import { getCachedUpdateStatus } from "./auto-update.js";
27
+ import { DASHBOARD_ACTION_STREAM_LOG, LOG_DIR, SELFTUNE_CONFIG_DIR } from "./constants.js";
27
28
  import type {
29
+ DashboardActionEvent,
28
30
  HealthResponse,
29
31
  OverviewResponse,
30
32
  SkillReportResponse,
@@ -53,6 +55,7 @@ import {
53
55
  import type { StatusResult } from "./status.js";
54
56
  import { computeStatus } from "./status.js";
55
57
  import type { EvolutionAuditEntry, EvolutionEvidenceEntry } from "./types.js";
58
+ import { readJsonlFrom } from "./utils/jsonl.js";
56
59
 
57
60
  export interface DashboardServerOptions {
58
61
  port?: number;
@@ -72,6 +75,13 @@ interface DashboardSocketData {
72
75
  upstreamUrl?: string;
73
76
  }
74
77
 
78
/**
 * In-memory history of the dashboard action events recorded for one run.
 * Entries are stored keyed by the run's event id and replayed to SSE
 * clients when they connect.
 */
interface ActionEventHistoryEntry {
  /** Identifier shared by every event of the run (the event's `event_id`). */
  eventId: string;
  /** `ts` of the most recently recorded event for this run. */
  updatedAt: number;
  /** Set once an event whose stage is "finished" has been recorded. */
  finished: boolean;
  /** Recorded events for the run, in arrival order. */
  events: DashboardActionEvent[];
}
84
+
75
85
  /** Read selftune version from package.json (fresh on each call to pick up auto-updates). */
76
86
  const VERSION_PKG_PATH = join(import.meta.dir, "..", "..", "package.json");
77
87
  function getSelftuneVersion(): string {
@@ -189,7 +199,10 @@ async function serveSpaShell(spaDir: string | null): Promise<Response> {
189
199
  if (!spaDir) {
190
200
  return new Response("Dashboard build not found. Run `bun run build:dashboard` first.", {
191
201
  status: 503,
192
- headers: { "Content-Type": "text/plain; charset=utf-8", ...corsHeaders() },
202
+ headers: {
203
+ "Content-Type": "text/plain; charset=utf-8",
204
+ ...corsHeaders(),
205
+ },
193
206
  });
194
207
  }
195
208
 
@@ -260,9 +273,11 @@ function withCors(response: Response): Response {
260
273
  });
261
274
  }
262
275
 
263
- export async function startDashboardServer(
264
- options?: DashboardServerOptions,
265
- ): Promise<{ server: ReturnType<typeof Bun.serve>; stop: () => void; port: number }> {
276
+ export async function startDashboardServer(options?: DashboardServerOptions): Promise<{
277
+ server: ReturnType<typeof Bun.serve>;
278
+ stop: () => void;
279
+ port: number;
280
+ }> {
266
281
  const port = options?.port ?? 3141;
267
282
  const hostname = options?.host ?? "localhost";
268
283
  const openBrowser = options?.openBrowser ?? true;
@@ -321,12 +336,60 @@ export async function startDashboardServer(
321
336
 
322
337
  // -- SSE (Server-Sent Events) live update layer -----------------------------
323
338
  const sseClients = new Set<ReadableStreamDefaultController>();
339
+ const actionEventHistory = new Map<string, ActionEventHistoryEntry>();
340
+ const MAX_ACTION_HISTORY_RUNS = 24;
341
+ const MAX_ACTION_HISTORY_EVENTS_PER_RUN = 320;
342
+
343
/**
 * Evict whole runs from `actionEventHistory` until no more than
 * MAX_ACTION_HISTORY_RUNS remain.
 *
 * Finished runs are evicted before unfinished ones; inside each group the
 * run with the smallest `updatedAt` goes first.
 */
function trimActionEventHistory(): void {
  if (actionEventHistory.size <= MAX_ACTION_HISTORY_RUNS) return;

  const evictionOrder = Array.from(actionEventHistory.values());
  evictionOrder.sort((a, b) => {
    if (a.finished === b.finished) return a.updatedAt - b.updatedAt;
    return a.finished ? -1 : 1;
  });

  for (const candidate of evictionOrder) {
    // Stop as soon as the history is back within its budget.
    if (actionEventHistory.size <= MAX_ACTION_HISTORY_RUNS) break;
    actionEventHistory.delete(candidate.eventId);
  }
}
359
+
360
/**
 * Record an action event in the per-run history used for SSE backfill.
 *
 * The first event of a run creates a new history entry and triggers a trim
 * of old runs. Later events are appended to the existing entry, which keeps
 * only the newest MAX_ACTION_HISTORY_EVENTS_PER_RUN events.
 *
 * @param event - The action event to remember; grouped by `event.event_id`.
 */
function rememberActionEvent(event: DashboardActionEvent): void {
  const isFinished = event.stage === "finished";
  const entry = actionEventHistory.get(event.event_id);

  if (!entry) {
    actionEventHistory.set(event.event_id, {
      eventId: event.event_id,
      updatedAt: event.ts,
      finished: isFinished,
      events: [event],
    });
    trimActionEventHistory();
    return;
  }

  entry.updatedAt = event.ts;
  // A run stays finished once any "finished" event has been seen.
  if (isFinished) entry.finished = true;
  entry.events.push(event);
  entry.events = entry.events.slice(-MAX_ACTION_HISTORY_EVENTS_PER_RUN);
}
378
+
379
/**
 * Collect every remembered action event into one flat list for replay.
 *
 * Runs are ordered by ascending `updatedAt`; events inside a run keep the
 * order in which they were recorded.
 *
 * @returns A new array containing the events of all remembered runs.
 */
function recentActionEventsForBackfill(): DashboardActionEvent[] {
  const runsOldestFirst = Array.from(actionEventHistory.values()).sort(
    (a, b) => a.updatedAt - b.updatedAt,
  );

  const backfill: DashboardActionEvent[] = [];
  for (const run of runsOldestFirst) {
    backfill.push(...run.events);
  }
  return backfill;
}
384
+
385
+ function broadcastSSE(eventType: string, payload: Record<string, unknown>): void {
386
+ if (eventType === "action") {
387
+ rememberActionEvent(payload as DashboardActionEvent);
388
+ }
389
+ const message = `event: ${eventType}\ndata: ${JSON.stringify(payload)}\n\n`;
327
390
  for (const controller of sseClients) {
328
391
  try {
329
- controller.enqueue(new TextEncoder().encode(payload));
392
+ controller.enqueue(new TextEncoder().encode(message));
330
393
  } catch {
331
394
  sseClients.delete(controller);
332
395
  }
@@ -347,9 +410,16 @@ export async function startDashboardServer(
347
410
  // -- SQLite WAL watcher for push-based updates ------------------------------
348
411
  const walPath = `${DB_PATH}-wal`;
349
412
  let walWatcherActive = false;
413
+ const actionStreamPath =
414
+ process.env.SELFTUNE_DASHBOARD_ACTION_STREAM_LOG || DASHBOARD_ACTION_STREAM_LOG;
415
+ let actionStreamWatcherActive = false;
416
+ let actionStreamOffset = existsSync(actionStreamPath) ? statSync(actionStreamPath).size : 0;
350
417
 
351
418
  let fsDebounceTimer: ReturnType<typeof setTimeout> | null = null;
419
+ let actionStreamDebounceTimer: ReturnType<typeof setTimeout> | null = null;
352
420
  const FS_DEBOUNCE_MS = 500;
421
+ const ACTION_STREAM_DEBOUNCE_MS = 100;
422
+ const ACTION_STREAM_POLL_MS = 250;
353
423
  const proxiedSpaSockets = new Map<unknown, WebSocket>();
354
424
 
355
425
  function onWALChange(): void {
@@ -357,15 +427,36 @@ export async function startDashboardServer(
357
427
  fsDebounceTimer = setTimeout(() => {
358
428
  fsDebounceTimer = null;
359
429
  refreshV2DataImmediate();
360
- broadcastSSE("update");
430
+ broadcastSSE("update", { type: "update", ts: Date.now() });
361
431
  }, FS_DEBOUNCE_MS);
362
432
  }
363
433
 
364
434
  watchFile(walPath, { interval: 500 }, onWALChange);
365
435
  walWatcherActive = true;
366
436
 
437
/**
 * Schedule a debounced read of the action stream log and broadcast every
 * newly appended record to SSE clients as an "action" event.
 *
 * Calls made while a read is already scheduled are ignored. When the timer
 * fires, the log is read from `actionStreamOffset` and the offset is moved
 * to the value returned by `readJsonlFrom`.
 *
 * Before reading, the current file size is compared with the saved offset:
 * - if the file is now smaller than the offset (truncated, rotated or
 *   removed), the offset is reset to 0 so new records are not skipped;
 * - if the size equals the offset, nothing was appended and the read is
 *   skipped entirely.
 */
function flushActionStream(): void {
  if (actionStreamDebounceTimer) return;
  actionStreamDebounceTimer = setTimeout(() => {
    actionStreamDebounceTimer = null;

    let currentSize: number | null;
    try {
      currentSize = existsSync(actionStreamPath) ? statSync(actionStreamPath).size : 0;
    } catch {
      // The file can vanish between the existence check and the stat, or be
      // unreadable; fall through and let readJsonlFrom deal with it as before.
      currentSize = null;
    }

    if (currentSize !== null) {
      // The saved offset points past the end of the file: start over.
      if (currentSize < actionStreamOffset) actionStreamOffset = 0;
      // Nothing new since the last read.
      if (currentSize === actionStreamOffset) return;
    }

    const { records, newOffset } = readJsonlFrom<DashboardActionEvent>(
      actionStreamPath,
      actionStreamOffset,
    );
    actionStreamOffset = newOffset;
    for (const record of records) {
      broadcastSSE("action", record);
    }
  }, ACTION_STREAM_DEBOUNCE_MS);
}
451
+
452
+ const actionStreamPoller = setInterval(() => {
453
+ flushActionStream();
454
+ }, ACTION_STREAM_POLL_MS);
455
+ actionStreamWatcherActive = true;
456
+
367
457
  function getWatcherMode(): HealthResponse["watcher_mode"] {
368
- return walWatcherActive ? "wal" : "none";
458
+ if (walWatcherActive && actionStreamWatcherActive) return "wal";
459
+ return walWatcherActive || actionStreamWatcherActive ? "wal" : "none";
369
460
  }
370
461
 
371
462
  let cachedStatusResult: StatusResult | null = null;
@@ -454,10 +545,15 @@ export async function startDashboardServer(
454
545
 
455
546
  // ---- GET /api/health ----
456
547
  if (url.pathname === "/api/health" && req.method === "GET") {
548
+ const updateStatus = getCachedUpdateStatus();
457
549
  const healthResponse: HealthResponse = {
458
550
  ok: true,
459
551
  service: "selftune-dashboard",
460
552
  version: getSelftuneVersion(),
553
+ latest_version: updateStatus.latestVersion,
554
+ update_available: updateStatus.updateAvailable,
555
+ auto_update_supported: updateStatus.autoUpdateSupported,
556
+ update_hint: updateStatus.updateHint,
461
557
  pid: process.pid,
462
558
  spa: Boolean(spaDir || spaProxyUrl),
463
559
  spa_mode: spaMode,
@@ -503,6 +599,11 @@ export async function startDashboardServer(
503
599
  start(controller) {
504
600
  sseClients.add(controller);
505
601
  controller.enqueue(new TextEncoder().encode(": connected\n\n"));
602
+ for (const event of recentActionEventsForBackfill()) {
603
+ controller.enqueue(
604
+ new TextEncoder().encode(`event: action\ndata: ${JSON.stringify(event)}\n\n`),
605
+ );
606
+ }
506
607
  },
507
608
  cancel(controller) {
508
609
  sseClients.delete(controller);
@@ -533,7 +634,10 @@ export async function startDashboardServer(
533
634
  `Dashboard SPA proxy unavailable at ${spaProxyUrl.toString()}: ${message}`,
534
635
  {
535
636
  status: 502,
536
- headers: { "Content-Type": "text/plain; charset=utf-8", ...corsHeaders() },
637
+ headers: {
638
+ "Content-Type": "text/plain; charset=utf-8",
639
+ ...corsHeaders(),
640
+ },
537
641
  },
538
642
  );
539
643
  }
@@ -544,7 +648,10 @@ export async function startDashboardServer(
544
648
  const filePath = resolve(spaDir, `.${url.pathname}`);
545
649
  const rel = relative(spaDir, filePath);
546
650
  if (rel.startsWith("..") || isAbsolute(rel)) {
547
- return new Response("Not Found", { status: 404, headers: corsHeaders() });
651
+ return new Response("Not Found", {
652
+ status: 404,
653
+ headers: corsHeaders(),
654
+ });
548
655
  }
549
656
  const bunFile = Bun.file(filePath);
550
657
  if (await bunFile.exists()) {
@@ -558,7 +665,10 @@ export async function startDashboardServer(
558
665
  },
559
666
  });
560
667
  }
561
- return new Response("Not Found", { status: 404, headers: corsHeaders() });
668
+ return new Response("Not Found", {
669
+ status: 404,
670
+ headers: corsHeaders(),
671
+ });
562
672
  }
563
673
 
564
674
  // ---- GET / ---- Serve SPA shell
@@ -597,7 +707,10 @@ export async function startDashboardServer(
597
707
  { status: 400, headers: corsHeaders() },
598
708
  );
599
709
  }
600
- return withCors(await handleAction(action, body, executeAction));
710
+ const emitActionEvent = (event: DashboardActionEvent) => {
711
+ broadcastSSE("action", event);
712
+ };
713
+ return withCors(await handleAction(action, body, executeAction, emitActionEvent));
601
714
  }
602
715
 
603
716
  // ---- GET /badge/:skillName ----
@@ -634,7 +747,9 @@ export async function startDashboardServer(
634
747
  // ---- GET /api/v2/overview ----
635
748
  if (url.pathname === "/api/v2/overview" && req.method === "GET") {
636
749
  if (getOverviewResponse) {
637
- return Response.json(getOverviewResponse(), { headers: corsHeaders() });
750
+ return Response.json(getOverviewResponse(), {
751
+ headers: corsHeaders(),
752
+ });
638
753
  }
639
754
  if (!db) {
640
755
  return Response.json(
@@ -737,6 +852,7 @@ export async function startDashboardServer(
737
852
  const shutdownHandler = () => {
738
853
  unwatchFile(walPath, onWALChange);
739
854
  clearInterval(sseKeepaliveTimer);
855
+ clearInterval(actionStreamPoller);
740
856
  for (const c of sseClients) {
741
857
  try {
742
858
  c.close();
@@ -754,6 +870,7 @@ export async function startDashboardServer(
754
870
  }
755
871
  proxiedSpaSockets.clear();
756
872
  if (fsDebounceTimer) clearTimeout(fsDebounceTimer);
873
+ if (actionStreamDebounceTimer) clearTimeout(actionStreamDebounceTimer);
757
874
  closeSingleton();
758
875
  server.stop();
759
876
  };
@@ -25,6 +25,10 @@ import { parseArgs } from "node:util";
25
25
 
26
26
  import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
27
27
  import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
28
+ import {
29
+ createDashboardLlmObserver,
30
+ emitDashboardStepProgress,
31
+ } from "../dashboard-action-instrumentation.js";
28
32
  import { getDb } from "../localdb/db.js";
29
33
  import {
30
34
  queryQueryLog,
@@ -615,16 +619,49 @@ export async function cliMain(): Promise<void> {
615
619
  const maxPerSide = Number.parseInt(values.max ?? "50", 10);
616
620
  const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
617
621
 
622
+ emitDashboardStepProgress({
623
+ current: 1,
624
+ total: 4,
625
+ status: "started",
626
+ phase: "load_skill",
627
+ label: "Load skill content",
628
+ });
618
629
  console.log(`Generating synthetic evals for skill '${values.skill}'...`);
619
630
  const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
620
631
  maxPositives: effectiveMax,
621
632
  maxNegatives: effectiveMax,
622
633
  modelFlag: values.model,
634
+ llmObserverFactory: createDashboardLlmObserver,
635
+ });
636
+ emitDashboardStepProgress({
637
+ current: 1,
638
+ total: 4,
639
+ status: "finished",
640
+ phase: "load_skill",
641
+ label: "Load skill content",
642
+ passed: true,
643
+ evidence: values["skill-path"],
623
644
  });
624
645
 
625
646
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
647
+ emitDashboardStepProgress({
648
+ current: 4,
649
+ total: 4,
650
+ status: "started",
651
+ phase: "write_eval_set",
652
+ label: "Write eval set",
653
+ });
626
654
  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
627
655
  const canonicalPath = writeCanonicalEvalSet(values.skill, evalSet);
656
+ emitDashboardStepProgress({
657
+ current: 4,
658
+ total: 4,
659
+ status: "finished",
660
+ phase: "write_eval_set",
661
+ label: "Write eval set",
662
+ passed: true,
663
+ evidence: outputPath,
664
+ });
628
665
 
629
666
  const pos = evalSet.filter((e) => e.should_trigger);
630
667
  const neg = evalSet.filter((e) => !e.should_trigger);
@@ -666,6 +703,13 @@ export async function cliMain(): Promise<void> {
666
703
  const hasCustomQueryLog = queryLogPath !== QUERY_LOG;
667
704
  const hasCustomTelemetryLog = telemetryLogPath !== TELEMETRY_LOG;
668
705
 
706
+ emitDashboardStepProgress({
707
+ current: 1,
708
+ total: values.blend ? 5 : 3,
709
+ status: "started",
710
+ phase: "load_records",
711
+ label: "Load telemetry and query records",
712
+ });
669
713
  const db = hasCustomSkillLog && hasCustomQueryLog && hasCustomTelemetryLog ? undefined : getDb();
670
714
  skillRecords = hasCustomSkillLog
671
715
  ? readJsonl<SkillUsageRecord>(skillLogPath)
@@ -676,6 +720,15 @@ export async function cliMain(): Promise<void> {
676
720
  telemetryRecords = hasCustomTelemetryLog
677
721
  ? readJsonl<SessionTelemetryRecord>(telemetryLogPath)
678
722
  : (querySessionTelemetry(db!) as SessionTelemetryRecord[]);
723
+ emitDashboardStepProgress({
724
+ current: 1,
725
+ total: values.blend ? 5 : 3,
726
+ status: "finished",
727
+ phase: "load_records",
728
+ label: "Load telemetry and query records",
729
+ passed: true,
730
+ evidence: `${skillRecords.length} skill rows · ${queryRecords.length} query rows`,
731
+ });
679
732
 
680
733
  if (values["list-skills"]) {
681
734
  listSkills(skillRecords, queryRecords, telemetryRecords);
@@ -701,6 +754,13 @@ export async function cliMain(): Promise<void> {
701
754
  const searchDirs = getEvalSkillSearchDirs();
702
755
  const detectedSkillPath = findInstalledSkillPath(values.skill, searchDirs);
703
756
 
757
+ emitDashboardStepProgress({
758
+ current: 2,
759
+ total: values.blend ? 5 : 3,
760
+ status: "started",
761
+ phase: "build_eval_set",
762
+ label: "Build eval set",
763
+ });
704
764
  const evalSet = buildEvalSet(
705
765
  skillRecords,
706
766
  queryRecords,
@@ -710,6 +770,15 @@ export async function cliMain(): Promise<void> {
710
770
  seed,
711
771
  annotateTaxonomy,
712
772
  );
773
+ emitDashboardStepProgress({
774
+ current: 2,
775
+ total: values.blend ? 5 : 3,
776
+ status: "finished",
777
+ phase: "build_eval_set",
778
+ label: "Build eval set",
779
+ passed: true,
780
+ evidence: `${evalSet.length} entries`,
781
+ });
713
782
 
714
783
  const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
715
784
  if (positiveCount === 0 && values["auto-synthetic"]) {
@@ -731,6 +800,13 @@ export async function cliMain(): Promise<void> {
731
800
  );
732
801
  }
733
802
 
803
+ emitDashboardStepProgress({
804
+ current: 1,
805
+ total: 4,
806
+ status: "started",
807
+ phase: "load_skill",
808
+ label: "Load skill content",
809
+ });
734
810
  console.log(
735
811
  `No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
736
812
  );
@@ -739,10 +815,36 @@ export async function cliMain(): Promise<void> {
739
815
  maxPositives: effectiveMax,
740
816
  maxNegatives: effectiveMax,
741
817
  modelFlag: values.model,
818
+ llmObserverFactory: createDashboardLlmObserver,
819
+ });
820
+ emitDashboardStepProgress({
821
+ current: 1,
822
+ total: 4,
823
+ status: "finished",
824
+ phase: "load_skill",
825
+ label: "Load skill content",
826
+ passed: true,
827
+ evidence: skillPath,
742
828
  });
743
829
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
830
+ emitDashboardStepProgress({
831
+ current: 4,
832
+ total: 4,
833
+ status: "started",
834
+ phase: "write_eval_set",
835
+ label: "Write eval set",
836
+ });
744
837
  writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
745
838
  const canonicalPath = writeCanonicalEvalSet(values.skill, syntheticEvalSet);
839
+ emitDashboardStepProgress({
840
+ current: 4,
841
+ total: 4,
842
+ status: "finished",
843
+ phase: "write_eval_set",
844
+ label: "Write eval set",
845
+ passed: true,
846
+ evidence: outputPath,
847
+ });
746
848
  const pos = syntheticEvalSet.filter((e) => e.should_trigger);
747
849
  const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
748
850
 
@@ -789,23 +891,78 @@ export async function cliMain(): Promise<void> {
789
891
  }
790
892
 
791
893
  const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
894
+ emitDashboardStepProgress({
895
+ current: 1,
896
+ total: 5,
897
+ status: "started",
898
+ phase: "build_log_eval_set",
899
+ label: "Build log eval set",
900
+ });
901
+ emitDashboardStepProgress({
902
+ current: 1,
903
+ total: 5,
904
+ status: "finished",
905
+ phase: "build_log_eval_set",
906
+ label: "Build log eval set",
907
+ passed: true,
908
+ evidence: `${evalSet.length} entries`,
909
+ });
792
910
  console.log(`Generating synthetic evals for blending with '${values.skill}'...`);
793
911
  const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
794
912
  maxPositives: effectiveMax,
795
913
  maxNegatives: effectiveMax,
796
914
  modelFlag: values.model,
915
+ llmObserverFactory: ({ current, total, phase, label }) =>
916
+ createDashboardLlmObserver({
917
+ current: current + 1,
918
+ total: total + 1,
919
+ phase,
920
+ label,
921
+ }),
797
922
  });
798
923
 
924
+ emitDashboardStepProgress({
925
+ current: 4,
926
+ total: 5,
927
+ status: "started",
928
+ phase: "blend_eval_sets",
929
+ label: "Blend log and synthetic evals",
930
+ });
799
931
  finalEvalSet = blendEvalSets(evalSet, syntheticEvalSet);
800
932
  const stats = computeEvalSourceStats(finalEvalSet);
933
+ emitDashboardStepProgress({
934
+ current: 4,
935
+ total: 5,
936
+ status: "finished",
937
+ phase: "blend_eval_sets",
938
+ label: "Blend log and synthetic evals",
939
+ passed: true,
940
+ evidence: `${stats.total} total entries`,
941
+ });
801
942
  console.log(
802
943
  `Blended: ${stats.log} log + ${stats.blended} synthetic gap-fillers = ${stats.total} total`,
803
944
  );
804
945
  }
805
946
 
806
947
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
948
+ emitDashboardStepProgress({
949
+ current: values.blend ? 5 : 3,
950
+ total: values.blend ? 5 : 3,
951
+ status: "started",
952
+ phase: "write_eval_set",
953
+ label: "Write eval set",
954
+ });
807
955
  writeFileSync(outputPath, JSON.stringify(finalEvalSet, null, 2), "utf-8");
808
956
  const canonicalPath = writeCanonicalEvalSet(values.skill, finalEvalSet);
957
+ emitDashboardStepProgress({
958
+ current: values.blend ? 5 : 3,
959
+ total: values.blend ? 5 : 3,
960
+ status: "finished",
961
+ phase: "write_eval_set",
962
+ label: "Write eval set",
963
+ passed: true,
964
+ evidence: outputPath,
965
+ });
809
966
  printEvalStats(
810
967
  finalEvalSet,
811
968
  values.skill,
@@ -10,6 +10,7 @@ import { readFileSync } from "node:fs";
10
10
 
11
11
  import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
12
12
  import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
13
+ import type { LlmCallObserver } from "../utils/llm-call.js";
13
14
  import { findInstalledSkillNames } from "../utils/skill-discovery.js";
14
15
  import { classifyInvocation } from "./invocation-classifier.js";
15
16
 
@@ -21,6 +22,12 @@ export interface SyntheticEvalOptions {
21
22
  maxPositives?: number;
22
23
  maxNegatives?: number;
23
24
  modelFlag?: string;
25
+ llmObserverFactory?: (step: {
26
+ current: number;
27
+ total: number;
28
+ phase: string;
29
+ label: string;
30
+ }) => LlmCallObserver | undefined;
24
31
  }
25
32
 
26
33
  interface RawSyntheticEntry {
@@ -484,7 +491,19 @@ export async function generateSyntheticEvals(
484
491
  siblingSkills,
485
492
  );
486
493
 
487
- const raw = await callLlm(system, user, agent, options.modelFlag);
494
+ const raw = await callLlm(
495
+ system,
496
+ user,
497
+ agent,
498
+ options.modelFlag,
499
+ undefined,
500
+ options.llmObserverFactory?.({
501
+ current: 2,
502
+ total: 4,
503
+ phase: "draft_eval_set",
504
+ label: "Draft synthetic eval set",
505
+ }),
506
+ );
488
507
  const firstPass = dedupeEvalEntries(parseSyntheticResponse(raw, skillName));
489
508
 
490
509
  try {
@@ -496,7 +515,19 @@ export async function generateSyntheticEvals(
496
515
  maxNegatives,
497
516
  siblingSkills,
498
517
  );
499
- const refinedRaw = await callLlm(refinement.system, refinement.user, agent, options.modelFlag);
518
+ const refinedRaw = await callLlm(
519
+ refinement.system,
520
+ refinement.user,
521
+ agent,
522
+ options.modelFlag,
523
+ undefined,
524
+ options.llmObserverFactory?.({
525
+ current: 3,
526
+ total: 4,
527
+ phase: "refine_eval_set",
528
+ label: "Refine synthetic eval set",
529
+ }),
530
+ );
500
531
  const refined = dedupeEvalEntries(parseSyntheticResponse(refinedRaw, skillName));
501
532
  const selected = selectBalancedEvalEntries(refined, maxPositives, maxNegatives, siblingSkills);
502
533
  if (
@@ -13,13 +13,17 @@
13
13
  * --model <m> Model flag for LLM calls
14
14
  */
15
15
 
16
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
16
+ import { existsSync, mkdirSync, readFileSync } from "node:fs";
17
17
  import { join } from "node:path";
18
18
  import { parseArgs } from "node:util";
19
19
 
20
20
  import { SELFTUNE_CONFIG_DIR } from "../constants.js";
21
+ import {
22
+ createDashboardLlmObserver,
23
+ emitDashboardStepProgress,
24
+ } from "../dashboard-action-instrumentation.js";
21
25
  import type { EvalEntry } from "../types.js";
22
- import { writeUnitTestRunResult } from "../testing-readiness.js";
26
+ import { writeCanonicalUnitTests, writeUnitTestRunResult } from "../testing-readiness.js";
23
27
  import { CLIError } from "../utils/cli-error.js";
24
28
  import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
25
29
  import { generateUnitTests } from "./generate-unit-tests.js";
@@ -69,6 +73,13 @@ export async function cliMain(): Promise<void> {
69
73
  }
70
74
 
71
75
  let skillContent = `Skill: ${skillName}`;
76
+ emitDashboardStepProgress({
77
+ current: 1,
78
+ total: 3,
79
+ status: "started",
80
+ phase: "load_generation_inputs",
81
+ label: "Load skill and failure context",
82
+ });
72
83
  if (values["skill-path"] && existsSync(values["skill-path"])) {
73
84
  skillContent = readFileSync(values["skill-path"], "utf-8");
74
85
  } else if (values["skill-path"]) {
@@ -85,10 +96,31 @@ export async function cliMain(): Promise<void> {
85
96
  console.warn("[WARN] Failed to parse eval set. Proceeding without failure context.");
86
97
  }
87
98
  }
99
+ emitDashboardStepProgress({
100
+ current: 1,
101
+ total: 3,
102
+ status: "finished",
103
+ phase: "load_generation_inputs",
104
+ label: "Load skill and failure context",
105
+ passed: true,
106
+ evidence: `${evalFailures.length} eval failures`,
107
+ });
88
108
 
89
109
  const modelFlag = values.model;
90
110
  const llmCaller = (systemPrompt: string, userPrompt: string) =>
91
- callLlm(systemPrompt, userPrompt, agent, modelFlag);
111
+ callLlm(
112
+ systemPrompt,
113
+ userPrompt,
114
+ agent,
115
+ modelFlag,
116
+ undefined,
117
+ createDashboardLlmObserver({
118
+ current: 2,
119
+ total: 3,
120
+ phase: "generate_tests",
121
+ label: "Generate unit tests",
122
+ }),
123
+ );
92
124
 
93
125
  console.log(`Generating unit tests for skill '${skillName}'...`);
94
126
  const tests = await generateUnitTests(skillName, skillContent, evalFailures, llmCaller);
@@ -98,9 +130,25 @@ export async function cliMain(): Promise<void> {
98
130
  }
99
131
 
100
132
  // Ensure output directory exists
133
+ emitDashboardStepProgress({
134
+ current: 3,
135
+ total: 3,
136
+ status: "started",
137
+ phase: "write_tests",
138
+ label: "Write generated tests",
139
+ });
101
140
  mkdirSync(unitTestDir, { recursive: true });
102
- writeFileSync(testsPath, JSON.stringify(tests, null, 2), "utf-8");
103
- console.log(`Generated ${tests.length} unit tests -> ${testsPath}`);
141
+ const storedPath = writeCanonicalUnitTests(skillName, tests, testsPath);
142
+ emitDashboardStepProgress({
143
+ current: 3,
144
+ total: 3,
145
+ status: "finished",
146
+ phase: "write_tests",
147
+ label: "Write generated tests",
148
+ passed: true,
149
+ evidence: storedPath,
150
+ });
151
+ console.log(`Generated ${tests.length} unit tests -> ${storedPath}`);
104
152
  return;
105
153
  }
106
154