selftune 0.2.29 → 0.2.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/auto-update.ts +40 -8
- package/cli/selftune/command-surface.ts +1 -1
- package/cli/selftune/constants.ts +5 -0
- package/cli/selftune/dashboard-action-events.ts +117 -0
- package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
- package/cli/selftune/dashboard-action-result.ts +90 -0
- package/cli/selftune/dashboard-action-stream.ts +252 -0
- package/cli/selftune/dashboard-contract.ts +81 -1
- package/cli/selftune/dashboard-server.ts +133 -16
- package/cli/selftune/eval/hooks-to-evals.ts +157 -0
- package/cli/selftune/eval/synthetic-evals.ts +33 -2
- package/cli/selftune/eval/unit-test-cli.ts +53 -5
- package/cli/selftune/evolution/validate-host-replay.ts +191 -14
- package/cli/selftune/index.ts +4 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
- package/cli/selftune/localdb/schema.ts +34 -0
- package/cli/selftune/routes/actions.ts +273 -42
- package/cli/selftune/testing-readiness.ts +203 -10
- package/cli/selftune/utils/llm-call.ts +90 -1
- package/package.json +1 -1
- package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
- package/skill/SKILL.md +1 -1
- package/skill/workflows/Dashboard.md +50 -23
- package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
- package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1
|
@@ -19,12 +19,14 @@
|
|
|
19
19
|
*/
|
|
20
20
|
|
|
21
21
|
import type { Database } from "bun:sqlite";
|
|
22
|
-
import { existsSync, readFileSync, unwatchFile, watchFile } from "node:fs";
|
|
22
|
+
import { existsSync, readFileSync, statSync, unwatchFile, watchFile } from "node:fs";
|
|
23
23
|
import { dirname, extname, isAbsolute, join, relative, resolve } from "node:path";
|
|
24
24
|
|
|
25
25
|
import type { BadgeFormat } from "./badge/badge-data.js";
|
|
26
|
-
import {
|
|
26
|
+
import { getCachedUpdateStatus } from "./auto-update.js";
|
|
27
|
+
import { DASHBOARD_ACTION_STREAM_LOG, LOG_DIR, SELFTUNE_CONFIG_DIR } from "./constants.js";
|
|
27
28
|
import type {
|
|
29
|
+
DashboardActionEvent,
|
|
28
30
|
HealthResponse,
|
|
29
31
|
OverviewResponse,
|
|
30
32
|
SkillReportResponse,
|
|
@@ -53,6 +55,7 @@ import {
|
|
|
53
55
|
import type { StatusResult } from "./status.js";
|
|
54
56
|
import { computeStatus } from "./status.js";
|
|
55
57
|
import type { EvolutionAuditEntry, EvolutionEvidenceEntry } from "./types.js";
|
|
58
|
+
import { readJsonlFrom } from "./utils/jsonl.js";
|
|
56
59
|
|
|
57
60
|
export interface DashboardServerOptions {
|
|
58
61
|
port?: number;
|
|
@@ -72,6 +75,13 @@ interface DashboardSocketData {
|
|
|
72
75
|
upstreamUrl?: string;
|
|
73
76
|
}
|
|
74
77
|
|
|
78
|
+
/**
 * In-memory record of the action events remembered for a single run,
 * stored in `actionEventHistory` so newly connected SSE clients can be
 * backfilled.
 */
interface ActionEventHistoryEntry {
  /** The `event_id` shared by the events of this run; also used as the map key. */
  eventId: string;
  /** `ts` of the most recently remembered event for this run. */
  updatedAt: number;
  /** Set once an event whose `stage` is "finished" has been remembered. */
  finished: boolean;
  /** Remembered events for this run, in the order they were broadcast. */
  events: DashboardActionEvent[];
}
|
|
84
|
+
|
|
75
85
|
/** Read selftune version from package.json (fresh on each call to pick up auto-updates). */
|
|
76
86
|
const VERSION_PKG_PATH = join(import.meta.dir, "..", "..", "package.json");
|
|
77
87
|
function getSelftuneVersion(): string {
|
|
@@ -189,7 +199,10 @@ async function serveSpaShell(spaDir: string | null): Promise<Response> {
|
|
|
189
199
|
if (!spaDir) {
|
|
190
200
|
return new Response("Dashboard build not found. Run `bun run build:dashboard` first.", {
|
|
191
201
|
status: 503,
|
|
192
|
-
headers: {
|
|
202
|
+
headers: {
|
|
203
|
+
"Content-Type": "text/plain; charset=utf-8",
|
|
204
|
+
...corsHeaders(),
|
|
205
|
+
},
|
|
193
206
|
});
|
|
194
207
|
}
|
|
195
208
|
|
|
@@ -260,9 +273,11 @@ function withCors(response: Response): Response {
|
|
|
260
273
|
});
|
|
261
274
|
}
|
|
262
275
|
|
|
263
|
-
export async function startDashboardServer(
|
|
264
|
-
|
|
265
|
-
|
|
276
|
+
export async function startDashboardServer(options?: DashboardServerOptions): Promise<{
|
|
277
|
+
server: ReturnType<typeof Bun.serve>;
|
|
278
|
+
stop: () => void;
|
|
279
|
+
port: number;
|
|
280
|
+
}> {
|
|
266
281
|
const port = options?.port ?? 3141;
|
|
267
282
|
const hostname = options?.host ?? "localhost";
|
|
268
283
|
const openBrowser = options?.openBrowser ?? true;
|
|
@@ -321,12 +336,60 @@ export async function startDashboardServer(
|
|
|
321
336
|
|
|
322
337
|
// -- SSE (Server-Sent Events) live update layer -----------------------------
|
|
323
338
|
const sseClients = new Set<ReadableStreamDefaultController>();
|
|
339
|
+
const actionEventHistory = new Map<string, ActionEventHistoryEntry>();
|
|
340
|
+
const MAX_ACTION_HISTORY_RUNS = 24;
|
|
341
|
+
const MAX_ACTION_HISTORY_EVENTS_PER_RUN = 320;
|
|
342
|
+
|
|
343
|
+
/**
 * Evict remembered runs until at most MAX_ACTION_HISTORY_RUNS remain.
 *
 * Finished runs are evicted before unfinished ones; within each group the
 * run with the smallest `updatedAt` goes first.
 */
function trimActionEventHistory(): void {
  if (actionEventHistory.size <= MAX_ACTION_HISTORY_RUNS) return;

  const evictionOrder = Array.from(actionEventHistory.values());
  evictionOrder.sort((a, b) => {
    if (a.finished === b.finished) {
      return a.updatedAt - b.updatedAt;
    }
    return a.finished ? -1 : 1;
  });

  for (const candidate of evictionOrder) {
    // Stop as soon as the history is back within its cap.
    if (actionEventHistory.size <= MAX_ACTION_HISTORY_RUNS) break;
    actionEventHistory.delete(candidate.eventId);
  }
}
|
|
359
|
+
|
|
360
|
+
/**
 * Record an action event in the per-run history used for SSE backfill.
 *
 * The first event seen for an `event_id` creates a new history entry and
 * triggers a trim of the overall history. Later events are appended to the
 * existing entry, which keeps only its last
 * MAX_ACTION_HISTORY_EVENTS_PER_RUN events.
 *
 * @param event - The action event that is being broadcast.
 */
function rememberActionEvent(event: DashboardActionEvent): void {
  const run = actionEventHistory.get(event.event_id);

  if (!run) {
    actionEventHistory.set(event.event_id, {
      eventId: event.event_id,
      updatedAt: event.ts,
      finished: event.stage === "finished",
      events: [event],
    });
    // Only a brand-new run can push the map past its cap.
    trimActionEventHistory();
    return;
  }

  run.updatedAt = event.ts;
  // Once a run is marked finished it stays finished.
  if (event.stage === "finished") {
    run.finished = true;
  }
  run.events.push(event);
  run.events = run.events.slice(-MAX_ACTION_HISTORY_EVENTS_PER_RUN);
}
|
|
378
|
+
|
|
379
|
+
/**
 * Flatten the remembered action history into one list for replay.
 *
 * Runs are ordered by ascending `updatedAt`; each run's events keep their
 * stored order.
 *
 * @returns Every remembered action event, grouped by run.
 */
function recentActionEventsForBackfill(): DashboardActionEvent[] {
  const runsOldestFirst = Array.from(actionEventHistory.values());
  runsOldestFirst.sort((a, b) => a.updatedAt - b.updatedAt);

  const backfill: DashboardActionEvent[] = [];
  for (const run of runsOldestFirst) {
    backfill.push(...run.events);
  }
  return backfill;
}
|
|
384
|
+
|
|
385
|
+
function broadcastSSE(eventType: string, payload: Record<string, unknown>): void {
|
|
386
|
+
if (eventType === "action") {
|
|
387
|
+
rememberActionEvent(payload as DashboardActionEvent);
|
|
388
|
+
}
|
|
389
|
+
const message = `event: ${eventType}\ndata: ${JSON.stringify(payload)}\n\n`;
|
|
327
390
|
for (const controller of sseClients) {
|
|
328
391
|
try {
|
|
329
|
-
controller.enqueue(new TextEncoder().encode(
|
|
392
|
+
controller.enqueue(new TextEncoder().encode(message));
|
|
330
393
|
} catch {
|
|
331
394
|
sseClients.delete(controller);
|
|
332
395
|
}
|
|
@@ -347,9 +410,16 @@ export async function startDashboardServer(
|
|
|
347
410
|
// -- SQLite WAL watcher for push-based updates ------------------------------
|
|
348
411
|
const walPath = `${DB_PATH}-wal`;
|
|
349
412
|
let walWatcherActive = false;
|
|
413
|
+
const actionStreamPath =
|
|
414
|
+
process.env.SELFTUNE_DASHBOARD_ACTION_STREAM_LOG || DASHBOARD_ACTION_STREAM_LOG;
|
|
415
|
+
let actionStreamWatcherActive = false;
|
|
416
|
+
let actionStreamOffset = existsSync(actionStreamPath) ? statSync(actionStreamPath).size : 0;
|
|
350
417
|
|
|
351
418
|
let fsDebounceTimer: ReturnType<typeof setTimeout> | null = null;
|
|
419
|
+
let actionStreamDebounceTimer: ReturnType<typeof setTimeout> | null = null;
|
|
352
420
|
const FS_DEBOUNCE_MS = 500;
|
|
421
|
+
const ACTION_STREAM_DEBOUNCE_MS = 100;
|
|
422
|
+
const ACTION_STREAM_POLL_MS = 250;
|
|
353
423
|
const proxiedSpaSockets = new Map<unknown, WebSocket>();
|
|
354
424
|
|
|
355
425
|
function onWALChange(): void {
|
|
@@ -357,15 +427,36 @@ export async function startDashboardServer(
|
|
|
357
427
|
fsDebounceTimer = setTimeout(() => {
|
|
358
428
|
fsDebounceTimer = null;
|
|
359
429
|
refreshV2DataImmediate();
|
|
360
|
-
broadcastSSE("update");
|
|
430
|
+
broadcastSSE("update", { type: "update", ts: Date.now() });
|
|
361
431
|
}, FS_DEBOUNCE_MS);
|
|
362
432
|
}
|
|
363
433
|
|
|
364
434
|
watchFile(walPath, { interval: 500 }, onWALChange);
|
|
365
435
|
walWatcherActive = true;
|
|
366
436
|
|
|
437
|
+
/**
 * Schedule a read of the action stream log and broadcast any new records.
 *
 * At most one read is pending at a time: calls made while a timer is
 * outstanding return immediately. When the timer fires, records are read
 * from `actionStreamOffset` onward, the offset advances to the value
 * returned by `readJsonlFrom`, and each record is broadcast as an
 * "action" SSE event.
 */
function flushActionStream(): void {
  if (actionStreamDebounceTimer) return;

  const drainNewRecords = (): void => {
    // Clear the handle first so the next call can schedule another read.
    actionStreamDebounceTimer = null;
    const chunk = readJsonlFrom<DashboardActionEvent>(actionStreamPath, actionStreamOffset);
    actionStreamOffset = chunk.newOffset;
    for (const actionEvent of chunk.records) {
      broadcastSSE("action", actionEvent);
    }
  };

  actionStreamDebounceTimer = setTimeout(drainNewRecords, ACTION_STREAM_DEBOUNCE_MS);
}
|
|
451
|
+
|
|
452
|
+
const actionStreamPoller = setInterval(() => {
|
|
453
|
+
flushActionStream();
|
|
454
|
+
}, ACTION_STREAM_POLL_MS);
|
|
455
|
+
actionStreamWatcherActive = true;
|
|
456
|
+
|
|
367
457
|
/**
 * Report the watcher mode as a `HealthResponse["watcher_mode"]` value.
 *
 * @returns "wal" when the WAL watcher or the action-stream poller is
 *   active, otherwise "none".
 */
function getWatcherMode(): HealthResponse["watcher_mode"] {
  // The previous separate "both active" check returned the same value as
  // this expression, so one condition covers every case.
  return walWatcherActive || actionStreamWatcherActive ? "wal" : "none";
}
|
|
370
461
|
|
|
371
462
|
let cachedStatusResult: StatusResult | null = null;
|
|
@@ -454,10 +545,15 @@ export async function startDashboardServer(
|
|
|
454
545
|
|
|
455
546
|
// ---- GET /api/health ----
|
|
456
547
|
if (url.pathname === "/api/health" && req.method === "GET") {
|
|
548
|
+
const updateStatus = getCachedUpdateStatus();
|
|
457
549
|
const healthResponse: HealthResponse = {
|
|
458
550
|
ok: true,
|
|
459
551
|
service: "selftune-dashboard",
|
|
460
552
|
version: getSelftuneVersion(),
|
|
553
|
+
latest_version: updateStatus.latestVersion,
|
|
554
|
+
update_available: updateStatus.updateAvailable,
|
|
555
|
+
auto_update_supported: updateStatus.autoUpdateSupported,
|
|
556
|
+
update_hint: updateStatus.updateHint,
|
|
461
557
|
pid: process.pid,
|
|
462
558
|
spa: Boolean(spaDir || spaProxyUrl),
|
|
463
559
|
spa_mode: spaMode,
|
|
@@ -503,6 +599,11 @@ export async function startDashboardServer(
|
|
|
503
599
|
start(controller) {
|
|
504
600
|
sseClients.add(controller);
|
|
505
601
|
controller.enqueue(new TextEncoder().encode(": connected\n\n"));
|
|
602
|
+
for (const event of recentActionEventsForBackfill()) {
|
|
603
|
+
controller.enqueue(
|
|
604
|
+
new TextEncoder().encode(`event: action\ndata: ${JSON.stringify(event)}\n\n`),
|
|
605
|
+
);
|
|
606
|
+
}
|
|
506
607
|
},
|
|
507
608
|
cancel(controller) {
|
|
508
609
|
sseClients.delete(controller);
|
|
@@ -533,7 +634,10 @@ export async function startDashboardServer(
|
|
|
533
634
|
`Dashboard SPA proxy unavailable at ${spaProxyUrl.toString()}: ${message}`,
|
|
534
635
|
{
|
|
535
636
|
status: 502,
|
|
536
|
-
headers: {
|
|
637
|
+
headers: {
|
|
638
|
+
"Content-Type": "text/plain; charset=utf-8",
|
|
639
|
+
...corsHeaders(),
|
|
640
|
+
},
|
|
537
641
|
},
|
|
538
642
|
);
|
|
539
643
|
}
|
|
@@ -544,7 +648,10 @@ export async function startDashboardServer(
|
|
|
544
648
|
const filePath = resolve(spaDir, `.${url.pathname}`);
|
|
545
649
|
const rel = relative(spaDir, filePath);
|
|
546
650
|
if (rel.startsWith("..") || isAbsolute(rel)) {
|
|
547
|
-
return new Response("Not Found", {
|
|
651
|
+
return new Response("Not Found", {
|
|
652
|
+
status: 404,
|
|
653
|
+
headers: corsHeaders(),
|
|
654
|
+
});
|
|
548
655
|
}
|
|
549
656
|
const bunFile = Bun.file(filePath);
|
|
550
657
|
if (await bunFile.exists()) {
|
|
@@ -558,7 +665,10 @@ export async function startDashboardServer(
|
|
|
558
665
|
},
|
|
559
666
|
});
|
|
560
667
|
}
|
|
561
|
-
return new Response("Not Found", {
|
|
668
|
+
return new Response("Not Found", {
|
|
669
|
+
status: 404,
|
|
670
|
+
headers: corsHeaders(),
|
|
671
|
+
});
|
|
562
672
|
}
|
|
563
673
|
|
|
564
674
|
// ---- GET / ---- Serve SPA shell
|
|
@@ -597,7 +707,10 @@ export async function startDashboardServer(
|
|
|
597
707
|
{ status: 400, headers: corsHeaders() },
|
|
598
708
|
);
|
|
599
709
|
}
|
|
600
|
-
|
|
710
|
+
const emitActionEvent = (event: DashboardActionEvent) => {
|
|
711
|
+
broadcastSSE("action", event);
|
|
712
|
+
};
|
|
713
|
+
return withCors(await handleAction(action, body, executeAction, emitActionEvent));
|
|
601
714
|
}
|
|
602
715
|
|
|
603
716
|
// ---- GET /badge/:skillName ----
|
|
@@ -634,7 +747,9 @@ export async function startDashboardServer(
|
|
|
634
747
|
// ---- GET /api/v2/overview ----
|
|
635
748
|
if (url.pathname === "/api/v2/overview" && req.method === "GET") {
|
|
636
749
|
if (getOverviewResponse) {
|
|
637
|
-
return Response.json(getOverviewResponse(), {
|
|
750
|
+
return Response.json(getOverviewResponse(), {
|
|
751
|
+
headers: corsHeaders(),
|
|
752
|
+
});
|
|
638
753
|
}
|
|
639
754
|
if (!db) {
|
|
640
755
|
return Response.json(
|
|
@@ -737,6 +852,7 @@ export async function startDashboardServer(
|
|
|
737
852
|
const shutdownHandler = () => {
|
|
738
853
|
unwatchFile(walPath, onWALChange);
|
|
739
854
|
clearInterval(sseKeepaliveTimer);
|
|
855
|
+
clearInterval(actionStreamPoller);
|
|
740
856
|
for (const c of sseClients) {
|
|
741
857
|
try {
|
|
742
858
|
c.close();
|
|
@@ -754,6 +870,7 @@ export async function startDashboardServer(
|
|
|
754
870
|
}
|
|
755
871
|
proxiedSpaSockets.clear();
|
|
756
872
|
if (fsDebounceTimer) clearTimeout(fsDebounceTimer);
|
|
873
|
+
if (actionStreamDebounceTimer) clearTimeout(actionStreamDebounceTimer);
|
|
757
874
|
closeSingleton();
|
|
758
875
|
server.stop();
|
|
759
876
|
};
|
|
@@ -25,6 +25,10 @@ import { parseArgs } from "node:util";
|
|
|
25
25
|
|
|
26
26
|
import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
|
|
27
27
|
import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
|
|
28
|
+
import {
|
|
29
|
+
createDashboardLlmObserver,
|
|
30
|
+
emitDashboardStepProgress,
|
|
31
|
+
} from "../dashboard-action-instrumentation.js";
|
|
28
32
|
import { getDb } from "../localdb/db.js";
|
|
29
33
|
import {
|
|
30
34
|
queryQueryLog,
|
|
@@ -615,16 +619,49 @@ export async function cliMain(): Promise<void> {
|
|
|
615
619
|
const maxPerSide = Number.parseInt(values.max ?? "50", 10);
|
|
616
620
|
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
617
621
|
|
|
622
|
+
emitDashboardStepProgress({
|
|
623
|
+
current: 1,
|
|
624
|
+
total: 4,
|
|
625
|
+
status: "started",
|
|
626
|
+
phase: "load_skill",
|
|
627
|
+
label: "Load skill content",
|
|
628
|
+
});
|
|
618
629
|
console.log(`Generating synthetic evals for skill '${values.skill}'...`);
|
|
619
630
|
const evalSet = await generateSyntheticEvals(values["skill-path"], values.skill, agent, {
|
|
620
631
|
maxPositives: effectiveMax,
|
|
621
632
|
maxNegatives: effectiveMax,
|
|
622
633
|
modelFlag: values.model,
|
|
634
|
+
llmObserverFactory: createDashboardLlmObserver,
|
|
635
|
+
});
|
|
636
|
+
emitDashboardStepProgress({
|
|
637
|
+
current: 1,
|
|
638
|
+
total: 4,
|
|
639
|
+
status: "finished",
|
|
640
|
+
phase: "load_skill",
|
|
641
|
+
label: "Load skill content",
|
|
642
|
+
passed: true,
|
|
643
|
+
evidence: values["skill-path"],
|
|
623
644
|
});
|
|
624
645
|
|
|
625
646
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
647
|
+
emitDashboardStepProgress({
|
|
648
|
+
current: 4,
|
|
649
|
+
total: 4,
|
|
650
|
+
status: "started",
|
|
651
|
+
phase: "write_eval_set",
|
|
652
|
+
label: "Write eval set",
|
|
653
|
+
});
|
|
626
654
|
writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
|
|
627
655
|
const canonicalPath = writeCanonicalEvalSet(values.skill, evalSet);
|
|
656
|
+
emitDashboardStepProgress({
|
|
657
|
+
current: 4,
|
|
658
|
+
total: 4,
|
|
659
|
+
status: "finished",
|
|
660
|
+
phase: "write_eval_set",
|
|
661
|
+
label: "Write eval set",
|
|
662
|
+
passed: true,
|
|
663
|
+
evidence: outputPath,
|
|
664
|
+
});
|
|
628
665
|
|
|
629
666
|
const pos = evalSet.filter((e) => e.should_trigger);
|
|
630
667
|
const neg = evalSet.filter((e) => !e.should_trigger);
|
|
@@ -666,6 +703,13 @@ export async function cliMain(): Promise<void> {
|
|
|
666
703
|
const hasCustomQueryLog = queryLogPath !== QUERY_LOG;
|
|
667
704
|
const hasCustomTelemetryLog = telemetryLogPath !== TELEMETRY_LOG;
|
|
668
705
|
|
|
706
|
+
emitDashboardStepProgress({
|
|
707
|
+
current: 1,
|
|
708
|
+
total: values.blend ? 5 : 3,
|
|
709
|
+
status: "started",
|
|
710
|
+
phase: "load_records",
|
|
711
|
+
label: "Load telemetry and query records",
|
|
712
|
+
});
|
|
669
713
|
const db = hasCustomSkillLog && hasCustomQueryLog && hasCustomTelemetryLog ? undefined : getDb();
|
|
670
714
|
skillRecords = hasCustomSkillLog
|
|
671
715
|
? readJsonl<SkillUsageRecord>(skillLogPath)
|
|
@@ -676,6 +720,15 @@ export async function cliMain(): Promise<void> {
|
|
|
676
720
|
telemetryRecords = hasCustomTelemetryLog
|
|
677
721
|
? readJsonl<SessionTelemetryRecord>(telemetryLogPath)
|
|
678
722
|
: (querySessionTelemetry(db!) as SessionTelemetryRecord[]);
|
|
723
|
+
emitDashboardStepProgress({
|
|
724
|
+
current: 1,
|
|
725
|
+
total: values.blend ? 5 : 3,
|
|
726
|
+
status: "finished",
|
|
727
|
+
phase: "load_records",
|
|
728
|
+
label: "Load telemetry and query records",
|
|
729
|
+
passed: true,
|
|
730
|
+
evidence: `${skillRecords.length} skill rows · ${queryRecords.length} query rows`,
|
|
731
|
+
});
|
|
679
732
|
|
|
680
733
|
if (values["list-skills"]) {
|
|
681
734
|
listSkills(skillRecords, queryRecords, telemetryRecords);
|
|
@@ -701,6 +754,13 @@ export async function cliMain(): Promise<void> {
|
|
|
701
754
|
const searchDirs = getEvalSkillSearchDirs();
|
|
702
755
|
const detectedSkillPath = findInstalledSkillPath(values.skill, searchDirs);
|
|
703
756
|
|
|
757
|
+
emitDashboardStepProgress({
|
|
758
|
+
current: 2,
|
|
759
|
+
total: values.blend ? 5 : 3,
|
|
760
|
+
status: "started",
|
|
761
|
+
phase: "build_eval_set",
|
|
762
|
+
label: "Build eval set",
|
|
763
|
+
});
|
|
704
764
|
const evalSet = buildEvalSet(
|
|
705
765
|
skillRecords,
|
|
706
766
|
queryRecords,
|
|
@@ -710,6 +770,15 @@ export async function cliMain(): Promise<void> {
|
|
|
710
770
|
seed,
|
|
711
771
|
annotateTaxonomy,
|
|
712
772
|
);
|
|
773
|
+
emitDashboardStepProgress({
|
|
774
|
+
current: 2,
|
|
775
|
+
total: values.blend ? 5 : 3,
|
|
776
|
+
status: "finished",
|
|
777
|
+
phase: "build_eval_set",
|
|
778
|
+
label: "Build eval set",
|
|
779
|
+
passed: true,
|
|
780
|
+
evidence: `${evalSet.length} entries`,
|
|
781
|
+
});
|
|
713
782
|
|
|
714
783
|
const positiveCount = evalSet.filter((entry) => entry.should_trigger).length;
|
|
715
784
|
if (positiveCount === 0 && values["auto-synthetic"]) {
|
|
@@ -731,6 +800,13 @@ export async function cliMain(): Promise<void> {
|
|
|
731
800
|
);
|
|
732
801
|
}
|
|
733
802
|
|
|
803
|
+
emitDashboardStepProgress({
|
|
804
|
+
current: 1,
|
|
805
|
+
total: 4,
|
|
806
|
+
status: "started",
|
|
807
|
+
phase: "load_skill",
|
|
808
|
+
label: "Load skill content",
|
|
809
|
+
});
|
|
734
810
|
console.log(
|
|
735
811
|
`No trusted triggers found for '${values.skill}'. Falling back to synthetic cold-start eval generation...`,
|
|
736
812
|
);
|
|
@@ -739,10 +815,36 @@ export async function cliMain(): Promise<void> {
|
|
|
739
815
|
maxPositives: effectiveMax,
|
|
740
816
|
maxNegatives: effectiveMax,
|
|
741
817
|
modelFlag: values.model,
|
|
818
|
+
llmObserverFactory: createDashboardLlmObserver,
|
|
819
|
+
});
|
|
820
|
+
emitDashboardStepProgress({
|
|
821
|
+
current: 1,
|
|
822
|
+
total: 4,
|
|
823
|
+
status: "finished",
|
|
824
|
+
phase: "load_skill",
|
|
825
|
+
label: "Load skill content",
|
|
826
|
+
passed: true,
|
|
827
|
+
evidence: skillPath,
|
|
742
828
|
});
|
|
743
829
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
830
|
+
emitDashboardStepProgress({
|
|
831
|
+
current: 4,
|
|
832
|
+
total: 4,
|
|
833
|
+
status: "started",
|
|
834
|
+
phase: "write_eval_set",
|
|
835
|
+
label: "Write eval set",
|
|
836
|
+
});
|
|
744
837
|
writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
|
|
745
838
|
const canonicalPath = writeCanonicalEvalSet(values.skill, syntheticEvalSet);
|
|
839
|
+
emitDashboardStepProgress({
|
|
840
|
+
current: 4,
|
|
841
|
+
total: 4,
|
|
842
|
+
status: "finished",
|
|
843
|
+
phase: "write_eval_set",
|
|
844
|
+
label: "Write eval set",
|
|
845
|
+
passed: true,
|
|
846
|
+
evidence: outputPath,
|
|
847
|
+
});
|
|
746
848
|
const pos = syntheticEvalSet.filter((e) => e.should_trigger);
|
|
747
849
|
const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
|
|
748
850
|
|
|
@@ -789,23 +891,78 @@ export async function cliMain(): Promise<void> {
|
|
|
789
891
|
}
|
|
790
892
|
|
|
791
893
|
const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
|
|
894
|
+
emitDashboardStepProgress({
|
|
895
|
+
current: 1,
|
|
896
|
+
total: 5,
|
|
897
|
+
status: "started",
|
|
898
|
+
phase: "build_log_eval_set",
|
|
899
|
+
label: "Build log eval set",
|
|
900
|
+
});
|
|
901
|
+
emitDashboardStepProgress({
|
|
902
|
+
current: 1,
|
|
903
|
+
total: 5,
|
|
904
|
+
status: "finished",
|
|
905
|
+
phase: "build_log_eval_set",
|
|
906
|
+
label: "Build log eval set",
|
|
907
|
+
passed: true,
|
|
908
|
+
evidence: `${evalSet.length} entries`,
|
|
909
|
+
});
|
|
792
910
|
console.log(`Generating synthetic evals for blending with '${values.skill}'...`);
|
|
793
911
|
const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
|
|
794
912
|
maxPositives: effectiveMax,
|
|
795
913
|
maxNegatives: effectiveMax,
|
|
796
914
|
modelFlag: values.model,
|
|
915
|
+
llmObserverFactory: ({ current, total, phase, label }) =>
|
|
916
|
+
createDashboardLlmObserver({
|
|
917
|
+
current: current + 1,
|
|
918
|
+
total: total + 1,
|
|
919
|
+
phase,
|
|
920
|
+
label,
|
|
921
|
+
}),
|
|
797
922
|
});
|
|
798
923
|
|
|
924
|
+
emitDashboardStepProgress({
|
|
925
|
+
current: 4,
|
|
926
|
+
total: 5,
|
|
927
|
+
status: "started",
|
|
928
|
+
phase: "blend_eval_sets",
|
|
929
|
+
label: "Blend log and synthetic evals",
|
|
930
|
+
});
|
|
799
931
|
finalEvalSet = blendEvalSets(evalSet, syntheticEvalSet);
|
|
800
932
|
const stats = computeEvalSourceStats(finalEvalSet);
|
|
933
|
+
emitDashboardStepProgress({
|
|
934
|
+
current: 4,
|
|
935
|
+
total: 5,
|
|
936
|
+
status: "finished",
|
|
937
|
+
phase: "blend_eval_sets",
|
|
938
|
+
label: "Blend log and synthetic evals",
|
|
939
|
+
passed: true,
|
|
940
|
+
evidence: `${stats.total} total entries`,
|
|
941
|
+
});
|
|
801
942
|
console.log(
|
|
802
943
|
`Blended: ${stats.log} log + ${stats.blended} synthetic gap-fillers = ${stats.total} total`,
|
|
803
944
|
);
|
|
804
945
|
}
|
|
805
946
|
|
|
806
947
|
const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
|
|
948
|
+
emitDashboardStepProgress({
|
|
949
|
+
current: values.blend ? 5 : 3,
|
|
950
|
+
total: values.blend ? 5 : 3,
|
|
951
|
+
status: "started",
|
|
952
|
+
phase: "write_eval_set",
|
|
953
|
+
label: "Write eval set",
|
|
954
|
+
});
|
|
807
955
|
writeFileSync(outputPath, JSON.stringify(finalEvalSet, null, 2), "utf-8");
|
|
808
956
|
const canonicalPath = writeCanonicalEvalSet(values.skill, finalEvalSet);
|
|
957
|
+
emitDashboardStepProgress({
|
|
958
|
+
current: values.blend ? 5 : 3,
|
|
959
|
+
total: values.blend ? 5 : 3,
|
|
960
|
+
status: "finished",
|
|
961
|
+
phase: "write_eval_set",
|
|
962
|
+
label: "Write eval set",
|
|
963
|
+
passed: true,
|
|
964
|
+
evidence: outputPath,
|
|
965
|
+
});
|
|
809
966
|
printEvalStats(
|
|
810
967
|
finalEvalSet,
|
|
811
968
|
values.skill,
|
|
@@ -10,6 +10,7 @@ import { readFileSync } from "node:fs";
|
|
|
10
10
|
|
|
11
11
|
import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
|
|
12
12
|
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
13
|
+
import type { LlmCallObserver } from "../utils/llm-call.js";
|
|
13
14
|
import { findInstalledSkillNames } from "../utils/skill-discovery.js";
|
|
14
15
|
import { classifyInvocation } from "./invocation-classifier.js";
|
|
15
16
|
|
|
@@ -21,6 +22,12 @@ export interface SyntheticEvalOptions {
|
|
|
21
22
|
maxPositives?: number;
|
|
22
23
|
maxNegatives?: number;
|
|
23
24
|
modelFlag?: string;
|
|
25
|
+
llmObserverFactory?: (step: {
|
|
26
|
+
current: number;
|
|
27
|
+
total: number;
|
|
28
|
+
phase: string;
|
|
29
|
+
label: string;
|
|
30
|
+
}) => LlmCallObserver | undefined;
|
|
24
31
|
}
|
|
25
32
|
|
|
26
33
|
interface RawSyntheticEntry {
|
|
@@ -484,7 +491,19 @@ export async function generateSyntheticEvals(
|
|
|
484
491
|
siblingSkills,
|
|
485
492
|
);
|
|
486
493
|
|
|
487
|
-
const raw = await callLlm(
|
|
494
|
+
const raw = await callLlm(
|
|
495
|
+
system,
|
|
496
|
+
user,
|
|
497
|
+
agent,
|
|
498
|
+
options.modelFlag,
|
|
499
|
+
undefined,
|
|
500
|
+
options.llmObserverFactory?.({
|
|
501
|
+
current: 2,
|
|
502
|
+
total: 4,
|
|
503
|
+
phase: "draft_eval_set",
|
|
504
|
+
label: "Draft synthetic eval set",
|
|
505
|
+
}),
|
|
506
|
+
);
|
|
488
507
|
const firstPass = dedupeEvalEntries(parseSyntheticResponse(raw, skillName));
|
|
489
508
|
|
|
490
509
|
try {
|
|
@@ -496,7 +515,19 @@ export async function generateSyntheticEvals(
|
|
|
496
515
|
maxNegatives,
|
|
497
516
|
siblingSkills,
|
|
498
517
|
);
|
|
499
|
-
const refinedRaw = await callLlm(
|
|
518
|
+
const refinedRaw = await callLlm(
|
|
519
|
+
refinement.system,
|
|
520
|
+
refinement.user,
|
|
521
|
+
agent,
|
|
522
|
+
options.modelFlag,
|
|
523
|
+
undefined,
|
|
524
|
+
options.llmObserverFactory?.({
|
|
525
|
+
current: 3,
|
|
526
|
+
total: 4,
|
|
527
|
+
phase: "refine_eval_set",
|
|
528
|
+
label: "Refine synthetic eval set",
|
|
529
|
+
}),
|
|
530
|
+
);
|
|
500
531
|
const refined = dedupeEvalEntries(parseSyntheticResponse(refinedRaw, skillName));
|
|
501
532
|
const selected = selectBalancedEvalEntries(refined, maxPositives, maxNegatives, siblingSkills);
|
|
502
533
|
if (
|
|
@@ -13,13 +13,17 @@
|
|
|
13
13
|
* --model <m> Model flag for LLM calls
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
|
-
import { existsSync, mkdirSync, readFileSync
|
|
16
|
+
import { existsSync, mkdirSync, readFileSync } from "node:fs";
|
|
17
17
|
import { join } from "node:path";
|
|
18
18
|
import { parseArgs } from "node:util";
|
|
19
19
|
|
|
20
20
|
import { SELFTUNE_CONFIG_DIR } from "../constants.js";
|
|
21
|
+
import {
|
|
22
|
+
createDashboardLlmObserver,
|
|
23
|
+
emitDashboardStepProgress,
|
|
24
|
+
} from "../dashboard-action-instrumentation.js";
|
|
21
25
|
import type { EvalEntry } from "../types.js";
|
|
22
|
-
import { writeUnitTestRunResult } from "../testing-readiness.js";
|
|
26
|
+
import { writeCanonicalUnitTests, writeUnitTestRunResult } from "../testing-readiness.js";
|
|
23
27
|
import { CLIError } from "../utils/cli-error.js";
|
|
24
28
|
import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
|
|
25
29
|
import { generateUnitTests } from "./generate-unit-tests.js";
|
|
@@ -69,6 +73,13 @@ export async function cliMain(): Promise<void> {
|
|
|
69
73
|
}
|
|
70
74
|
|
|
71
75
|
let skillContent = `Skill: ${skillName}`;
|
|
76
|
+
emitDashboardStepProgress({
|
|
77
|
+
current: 1,
|
|
78
|
+
total: 3,
|
|
79
|
+
status: "started",
|
|
80
|
+
phase: "load_generation_inputs",
|
|
81
|
+
label: "Load skill and failure context",
|
|
82
|
+
});
|
|
72
83
|
if (values["skill-path"] && existsSync(values["skill-path"])) {
|
|
73
84
|
skillContent = readFileSync(values["skill-path"], "utf-8");
|
|
74
85
|
} else if (values["skill-path"]) {
|
|
@@ -85,10 +96,31 @@ export async function cliMain(): Promise<void> {
|
|
|
85
96
|
console.warn("[WARN] Failed to parse eval set. Proceeding without failure context.");
|
|
86
97
|
}
|
|
87
98
|
}
|
|
99
|
+
emitDashboardStepProgress({
|
|
100
|
+
current: 1,
|
|
101
|
+
total: 3,
|
|
102
|
+
status: "finished",
|
|
103
|
+
phase: "load_generation_inputs",
|
|
104
|
+
label: "Load skill and failure context",
|
|
105
|
+
passed: true,
|
|
106
|
+
evidence: `${evalFailures.length} eval failures`,
|
|
107
|
+
});
|
|
88
108
|
|
|
89
109
|
const modelFlag = values.model;
|
|
90
110
|
const llmCaller = (systemPrompt: string, userPrompt: string) =>
|
|
91
|
-
callLlm(
|
|
111
|
+
callLlm(
|
|
112
|
+
systemPrompt,
|
|
113
|
+
userPrompt,
|
|
114
|
+
agent,
|
|
115
|
+
modelFlag,
|
|
116
|
+
undefined,
|
|
117
|
+
createDashboardLlmObserver({
|
|
118
|
+
current: 2,
|
|
119
|
+
total: 3,
|
|
120
|
+
phase: "generate_tests",
|
|
121
|
+
label: "Generate unit tests",
|
|
122
|
+
}),
|
|
123
|
+
);
|
|
92
124
|
|
|
93
125
|
console.log(`Generating unit tests for skill '${skillName}'...`);
|
|
94
126
|
const tests = await generateUnitTests(skillName, skillContent, evalFailures, llmCaller);
|
|
@@ -98,9 +130,25 @@ export async function cliMain(): Promise<void> {
|
|
|
98
130
|
}
|
|
99
131
|
|
|
100
132
|
// Ensure output directory exists
|
|
133
|
+
emitDashboardStepProgress({
|
|
134
|
+
current: 3,
|
|
135
|
+
total: 3,
|
|
136
|
+
status: "started",
|
|
137
|
+
phase: "write_tests",
|
|
138
|
+
label: "Write generated tests",
|
|
139
|
+
});
|
|
101
140
|
mkdirSync(unitTestDir, { recursive: true });
|
|
102
|
-
|
|
103
|
-
|
|
141
|
+
const storedPath = writeCanonicalUnitTests(skillName, tests, testsPath);
|
|
142
|
+
emitDashboardStepProgress({
|
|
143
|
+
current: 3,
|
|
144
|
+
total: 3,
|
|
145
|
+
status: "finished",
|
|
146
|
+
phase: "write_tests",
|
|
147
|
+
label: "Write generated tests",
|
|
148
|
+
passed: true,
|
|
149
|
+
evidence: storedPath,
|
|
150
|
+
});
|
|
151
|
+
console.log(`Generated ${tests.length} unit tests -> ${storedPath}`);
|
|
104
152
|
return;
|
|
105
153
|
}
|
|
106
154
|
|