selftune 0.2.29 → 0.2.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/index-BcXquWFB.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Coq42hE4.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-B0H8s1mP.js +1 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/auto-update.ts +40 -8
- package/cli/selftune/command-surface.ts +1 -1
- package/cli/selftune/constants.ts +5 -0
- package/cli/selftune/dashboard-action-events.ts +117 -0
- package/cli/selftune/dashboard-action-instrumentation.ts +103 -0
- package/cli/selftune/dashboard-action-result.ts +90 -0
- package/cli/selftune/dashboard-action-stream.ts +252 -0
- package/cli/selftune/dashboard-contract.ts +81 -1
- package/cli/selftune/dashboard-server.ts +133 -16
- package/cli/selftune/eval/hooks-to-evals.ts +157 -0
- package/cli/selftune/eval/synthetic-evals.ts +33 -2
- package/cli/selftune/eval/unit-test-cli.ts +53 -5
- package/cli/selftune/evolution/validate-host-replay.ts +191 -14
- package/cli/selftune/index.ts +4 -0
- package/cli/selftune/ingestors/opencode-ingest.ts +117 -8
- package/cli/selftune/localdb/schema.ts +34 -0
- package/cli/selftune/routes/actions.ts +273 -42
- package/cli/selftune/testing-readiness.ts +203 -10
- package/cli/selftune/utils/llm-call.ts +90 -1
- package/package.json +1 -1
- package/packages/ui/src/components/EvolutionTimeline.tsx +1 -1
- package/skill/SKILL.md +1 -1
- package/skill/workflows/Dashboard.md +50 -23
- package/apps/local-dashboard/dist/assets/index-BcvtYmmL.js +0 -15
- package/apps/local-dashboard/dist/assets/index-BpRIxnpS.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-DqH_uxum.js +0 -1
|
@@ -4,44 +4,235 @@
|
|
|
4
4
|
* Triggers selftune CLI commands as child processes and returns the result.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
+
import { randomUUID } from "node:crypto";
|
|
7
8
|
import { join } from "node:path";
|
|
8
9
|
|
|
10
|
+
import {
|
|
11
|
+
dashboardActionContextEnv,
|
|
12
|
+
type DashboardActionContext,
|
|
13
|
+
} from "../dashboard-action-events.js";
|
|
14
|
+
import { resolveDashboardActionOutcome } from "../dashboard-action-result.js";
|
|
15
|
+
import type { DashboardActionEvent, DashboardActionName } from "../dashboard-contract.js";
|
|
16
|
+
import { getCanonicalEvalSetPath, getUnitTestPath } from "../testing-readiness.js";
|
|
9
17
|
import { saveWatchedSkills } from "../watchlist.js";
|
|
10
18
|
|
|
19
|
+
/**
 * Optional per-invocation hooks handed to an {@link ActionRunner}.
 */
export interface ActionExecutionHooks {
  /** Dashboard action context; `runAction` converts it into child-process environment variables. */
  actionContext?: DashboardActionContext;
  /** Called with each decoded chunk of the child's stdout as it arrives. */
  onStdout?: (chunk: string) => void;
  /** Called with each decoded chunk of the child's stderr as it arrives. */
  onStderr?: (chunk: string) => void;
}

/**
 * Executes a selftune CLI command and resolves with its outcome.
 *
 * `exitCode` is `null` when the runner could not obtain one (for example
 * when spawning the process failed).
 */
export type ActionRunner = (
  command: string,
  args: string[],
  hooks?: ActionExecutionHooks,
) => Promise<{
  success: boolean;
  output: string;
  error: string | null;
  exitCode: number | null;
}>;

/** Sink for lifecycle and output events produced while an action runs. */
export type ActionEventEmitter = (event: DashboardActionEvent) => void;
|
|
37
|
+
|
|
38
|
+
/**
 * Drains a byte stream into a single UTF-8 decoded string.
 *
 * Decoding is done in streaming mode so multi-byte sequences that straddle
 * chunk boundaries are reassembled correctly. Every non-empty decoded piece
 * is appended to the result and forwarded to `onChunk`.
 *
 * @param stream  Stream to read; `null`/`undefined` yields an empty string.
 * @param onChunk Optional observer invoked with each non-empty decoded piece.
 * @returns The full decoded output.
 */
async function readProcessStream(
  stream: ReadableStream<Uint8Array> | null | undefined,
  onChunk?: (chunk: string) => void,
): Promise<string> {
  if (!stream) return "";

  const reader = stream.getReader();
  const decoder = new TextDecoder();
  let output = "";

  // Single place that accumulates and notifies, shared by body and tail.
  const emit = (text: string): void => {
    if (!text) return;
    output += text;
    onChunk?.(text);
  };

  try {
    while (true) {
      const { value, done } = await reader.read();
      if (done) break;
      emit(decoder.decode(value, { stream: true }));
    }
    // Flush any bytes the decoder was still buffering.
    emit(decoder.decode());
  } finally {
    // Always give the lock back so the stream is not left locked when a
    // read fails or the observer throws.
    reader.releaseLock();
  }

  return output;
}
|
|
15
64
|
|
|
16
65
|
/**
 * Runs a selftune CLI command in a child `bun` process and reports the result.
 *
 * The child inherits the current environment plus
 * `SELFTUNE_SKIP_AUTO_UPDATE=1`, `SELFTUNE_DASHBOARD_STREAM_DISABLE=1` and
 * whatever `dashboardActionContextEnv` derives from `hooks.actionContext`.
 * stdout/stderr are streamed to the hooks while being collected.
 *
 * For `evolve --dry-run` the success/error pair comes from
 * `resolveDashboardActionOutcome`; for every other command a zero exit code
 * means success and the error is stderr (or `Exit code N` when stderr is empty).
 *
 * @returns The outcome; if anything throws, `success` is false, `output` is
 *          empty, `error` is the thrown message and `exitCode` is `null`.
 */
export async function runAction(
  command: string,
  args: string[],
  hooks?: ActionExecutionHooks,
): Promise<{
  success: boolean;
  output: string;
  error: string | null;
  exitCode: number | null;
}> {
  try {
    const entryPoint = join(import.meta.dir, "..", "index.ts");
    const childEnv = {
      ...process.env,
      SELFTUNE_SKIP_AUTO_UPDATE: "1",
      SELFTUNE_DASHBOARD_STREAM_DISABLE: "1",
      ...dashboardActionContextEnv(hooks?.actionContext ?? null),
    };
    const child = Bun.spawn(["bun", "run", entryPoint, command, ...args], {
      stdout: "pipe",
      stderr: "pipe",
      env: childEnv,
    });

    // Drain both pipes concurrently with waiting for the process to exit.
    const [exitCode, stdout, stderr] = await Promise.all([
      child.exited,
      readProcessStream(child.stdout, hooks?.onStdout),
      readProcessStream(child.stderr, hooks?.onStderr),
    ]);

    let success: boolean;
    let error: string | null;
    if (command === "evolve" && args.includes("--dry-run")) {
      const outcome = resolveDashboardActionOutcome({
        action: "replay-dry-run",
        stdout,
        stderr,
        exitCode,
      });
      success = outcome.success;
      error = outcome.error;
    } else {
      success = exitCode === 0;
      error = success ? null : stderr || `Exit code ${exitCode}`;
    }

    return { success, output: stdout, error, exitCode };
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return { success: false, output: "", error: message, exitCode: null };
  }
}
|
|
117
|
+
|
|
118
|
+
/**
 * Extracts the mandatory `skill` and `skillPath` fields from a request body.
 *
 * Both fields must be non-empty strings. The body comes from an HTTP request,
 * so values of any other type (numbers, arrays, objects, ...) are rejected
 * rather than blindly cast and forwarded into a child-process argv.
 *
 * @param body Parsed JSON request body.
 * @returns The validated pair, or a 400 `Response` describing the problem.
 */
function requireSkillInput(
  body: Record<string, unknown>,
): { skill: string; skillPath: string } | Response {
  const { skill, skillPath } = body;
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === "string" && value.length > 0;

  if (!isNonEmptyString(skill) || !isNonEmptyString(skillPath)) {
    return Response.json(
      { success: false, error: "Missing required fields: skill, skillPath" },
      { status: 400 },
    );
  }
  return { skill, skillPath };
}
|
|
131
|
+
|
|
132
|
+
/**
 * Translates a dashboard action plus its request body into the CLI command
 * and argument list that should be executed.
 *
 * `skill` and `skillPath` are validated first via `requireSkillInput`; any
 * failure there is returned unchanged. For `rollback`, an optional
 * `proposalId` is appended as `--proposal-id`; it must be a string when
 * present, otherwise a 400 response is returned instead of forwarding a
 * non-string value into the child-process argv.
 *
 * @param action Dashboard action to run.
 * @param body   Parsed JSON request body.
 * @returns The command description, or a 400 `Response` for invalid input or
 *          an unknown action.
 */
function buildActionExecution(
  action: DashboardActionName,
  body: Record<string, unknown>,
): { command: string; args: string[]; skill: string; skillPath: string } | Response {
  const skillInput = requireSkillInput(body);
  if (skillInput instanceof Response) return skillInput;
  const { skill, skillPath } = skillInput;

  // Flags shared by every action.
  const skillArgs = ["--skill", skill, "--skill-path", skillPath];

  switch (action) {
    case "generate-evals": {
      const args = ["generate", ...skillArgs, "--output", getCanonicalEvalSetPath(skill)];
      if (body.autoSynthetic === true) {
        args.push("--auto-synthetic");
      }
      return { command: "eval", args, skill, skillPath };
    }

    case "generate-unit-tests":
      return {
        command: "eval",
        args: [
          "unit-test",
          "--skill",
          skill,
          "--generate",
          "--skill-path",
          skillPath,
          "--tests",
          getUnitTestPath(skill),
        ],
        skill,
        skillPath,
      };

    case "replay-dry-run":
      return {
        command: "evolve",
        args: [...skillArgs, "--dry-run", "--validation-mode", "replay", "--sync-first"],
        skill,
        skillPath,
      };

    case "measure-baseline":
      return { command: "grade", args: ["baseline", ...skillArgs], skill, skillPath };

    case "deploy-candidate":
      return { command: "evolve", args: [...skillArgs, "--sync-first"], skill, skillPath };

    case "watch":
      return { command: "watch", args: [...skillArgs, "--sync-first"], skill, skillPath };

    case "rollback": {
      const { proposalId } = body;
      if (proposalId != null && typeof proposalId !== "string") {
        return Response.json(
          { success: false, error: "Invalid field: proposalId must be a string" },
          { status: 400 },
        );
      }
      const args = ["rollback", ...skillArgs];
      if (proposalId) {
        args.push("--proposal-id", proposalId);
      }
      return { command: "evolve", args, skill, skillPath };
    }

    default:
      // Callers cast arbitrary strings to DashboardActionName, so this is reachable.
      return Response.json({ success: false, error: `Unknown action: ${action}` }, { status: 400 });
  }
}
|
|
40
230
|
|
|
41
231
|
export async function handleAction(
|
|
42
232
|
action: string,
|
|
43
233
|
body: Record<string, unknown>,
|
|
44
234
|
executeAction: ActionRunner = runAction,
|
|
235
|
+
emitEvent?: ActionEventEmitter,
|
|
45
236
|
): Promise<Response> {
|
|
46
237
|
if (action === "watchlist") {
|
|
47
238
|
const skills = body.skills;
|
|
@@ -62,7 +253,11 @@ export async function handleAction(
|
|
|
62
253
|
}
|
|
63
254
|
try {
|
|
64
255
|
const saved = saveWatchedSkills(skills);
|
|
65
|
-
return Response.json({
|
|
256
|
+
return Response.json({
|
|
257
|
+
success: true,
|
|
258
|
+
watched_skills: saved,
|
|
259
|
+
error: null,
|
|
260
|
+
});
|
|
66
261
|
} catch (error: unknown) {
|
|
67
262
|
const message = error instanceof Error ? error.message : String(error);
|
|
68
263
|
return Response.json(
|
|
@@ -75,37 +270,73 @@ export async function handleAction(
|
|
|
75
270
|
}
|
|
76
271
|
}
|
|
77
272
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
return Response.json(
|
|
83
|
-
{ success: false, error: "Missing required fields: skill, skillPath" },
|
|
84
|
-
{ status: 400 },
|
|
85
|
-
);
|
|
86
|
-
}
|
|
87
|
-
const args = ["--skill", skill, "--skill-path", skillPath, "--sync-first"];
|
|
88
|
-
const result = await executeAction(action, args);
|
|
89
|
-
return Response.json(result);
|
|
273
|
+
const normalizedAction = action === "evolve" ? "deploy-candidate" : action;
|
|
274
|
+
const executable = buildActionExecution(normalizedAction as DashboardActionName, body);
|
|
275
|
+
if (executable instanceof Response) {
|
|
276
|
+
return executable;
|
|
90
277
|
}
|
|
91
278
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
}
|
|
102
|
-
const args = ["--skill", skill, "--skill-path", skillPath];
|
|
103
|
-
if (proposalId) {
|
|
104
|
-
args.push("--proposal-id", proposalId);
|
|
105
|
-
}
|
|
106
|
-
const result = await executeAction(action, args);
|
|
107
|
-
return Response.json(result);
|
|
108
|
-
}
|
|
279
|
+
const eventId = randomUUID();
|
|
280
|
+
emitEvent?.({
|
|
281
|
+
event_id: eventId,
|
|
282
|
+
action: normalizedAction as DashboardActionName,
|
|
283
|
+
stage: "started",
|
|
284
|
+
skill_name: executable.skill,
|
|
285
|
+
skill_path: executable.skillPath,
|
|
286
|
+
ts: Date.now(),
|
|
287
|
+
});
|
|
109
288
|
|
|
110
|
-
|
|
289
|
+
const result = await executeAction(executable.command, executable.args, {
|
|
290
|
+
actionContext: {
|
|
291
|
+
eventId,
|
|
292
|
+
action: normalizedAction as DashboardActionName,
|
|
293
|
+
skillName: executable.skill,
|
|
294
|
+
skillPath: executable.skillPath,
|
|
295
|
+
},
|
|
296
|
+
onStdout(chunk) {
|
|
297
|
+
emitEvent?.({
|
|
298
|
+
event_id: eventId,
|
|
299
|
+
action: normalizedAction as DashboardActionName,
|
|
300
|
+
stage: "stdout",
|
|
301
|
+
skill_name: executable.skill,
|
|
302
|
+
skill_path: executable.skillPath,
|
|
303
|
+
ts: Date.now(),
|
|
304
|
+
chunk,
|
|
305
|
+
});
|
|
306
|
+
},
|
|
307
|
+
onStderr(chunk) {
|
|
308
|
+
emitEvent?.({
|
|
309
|
+
event_id: eventId,
|
|
310
|
+
action: normalizedAction as DashboardActionName,
|
|
311
|
+
stage: "stderr",
|
|
312
|
+
skill_name: executable.skill,
|
|
313
|
+
skill_path: executable.skillPath,
|
|
314
|
+
ts: Date.now(),
|
|
315
|
+
chunk,
|
|
316
|
+
});
|
|
317
|
+
},
|
|
318
|
+
});
|
|
319
|
+
|
|
320
|
+
emitEvent?.({
|
|
321
|
+
event_id: eventId,
|
|
322
|
+
action: normalizedAction as DashboardActionName,
|
|
323
|
+
stage: "finished",
|
|
324
|
+
skill_name: executable.skill,
|
|
325
|
+
skill_path: executable.skillPath,
|
|
326
|
+
ts: Date.now(),
|
|
327
|
+
success: result.success,
|
|
328
|
+
exit_code: result.exitCode,
|
|
329
|
+
error: result.error,
|
|
330
|
+
summary:
|
|
331
|
+
executable.command === "evolve" && executable.args.includes("--dry-run")
|
|
332
|
+
? resolveDashboardActionOutcome({
|
|
333
|
+
action: "replay-dry-run",
|
|
334
|
+
stdout: result.output,
|
|
335
|
+
stderr: result.error,
|
|
336
|
+
exitCode: result.exitCode ?? 0,
|
|
337
|
+
}).summary
|
|
338
|
+
: null,
|
|
339
|
+
});
|
|
340
|
+
|
|
341
|
+
return Response.json(result);
|
|
111
342
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import type { Database } from "bun:sqlite";
|
|
2
2
|
|
|
3
3
|
import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs";
|
|
4
|
-
import { join } from "node:path";
|
|
4
|
+
import { dirname, join } from "node:path";
|
|
5
5
|
|
|
6
6
|
import { SELFTUNE_CONFIG_DIR } from "./constants.js";
|
|
7
7
|
import type {
|
|
@@ -11,7 +11,8 @@ import type {
|
|
|
11
11
|
SkillEvalReadiness,
|
|
12
12
|
SkillTestingReadiness,
|
|
13
13
|
} from "./dashboard-contract.js";
|
|
14
|
-
import
|
|
14
|
+
import { getDb } from "./localdb/db.js";
|
|
15
|
+
import type { EvalEntry, SkillUnitTest, UnitTestSuiteResult } from "./types.js";
|
|
15
16
|
import { queryEvolutionEvidence } from "./localdb/queries/evolution.js";
|
|
16
17
|
import { queryTrustedSkillObservationRows } from "./localdb/queries/trust.js";
|
|
17
18
|
import {
|
|
@@ -27,6 +28,7 @@ interface TrustedSkillObservationSummary {
|
|
|
27
28
|
}
|
|
28
29
|
|
|
29
30
|
interface TestingReadinessContext {
|
|
31
|
+
db: Database;
|
|
30
32
|
knownSkills: Set<string>;
|
|
31
33
|
searchDirs: string[];
|
|
32
34
|
trustedRowsBySkill: Map<string, TrustedSkillObservationSummary[]>;
|
|
@@ -64,14 +66,188 @@ export function getUnitTestResultPath(skillName: string): string {
|
|
|
64
66
|
return join(getUnitTestDir(), `${skillName}.last-run.json`);
|
|
65
67
|
}
|
|
66
68
|
|
|
69
|
+
/**
 * Best-effort accessor for the local database.
 *
 * @returns The handle from `getDb()`, or `null` if `getDb()` throws.
 */
function getOptionalDb(): Database | null {
  let handle: Database | null = null;
  try {
    handle = getDb();
  } catch {
    // Deliberately ignored: callers treat a missing database as "no DB".
  }
  return handle;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Parses a JSON string that is expected to hold an array.
 *
 * @param value Raw JSON text; may be `null`, `undefined` or empty.
 * @returns The parsed array, or `[]` when the input is missing, is not valid
 *          JSON, or decodes to something other than an array.
 */
function parseJsonArray(value: string | null | undefined): unknown[] {
  if (value == null || value === "") return [];

  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    return [];
  }
  return Array.isArray(decoded) ? decoded : [];
}
|
|
86
|
+
|
|
87
|
+
/**
 * Inserts or replaces the stored canonical eval set for a skill.
 *
 * The row is keyed by `skill_name`; on conflict `stored_at` and
 * `eval_set_json` are overwritten. `stored_at` is the current time in ISO form.
 */
function upsertCanonicalEvalSet(db: Database, skillName: string, evalSet: EvalEntry[]): void {
  const storedAt = new Date().toISOString();
  const evalSetJson = JSON.stringify(evalSet);
  db.run(
    `INSERT INTO canonical_eval_sets (skill_name, stored_at, eval_set_json)
     VALUES (?, ?, ?)
     ON CONFLICT(skill_name) DO UPDATE SET
       stored_at = excluded.stored_at,
       eval_set_json = excluded.eval_set_json`,
    [skillName, storedAt, evalSetJson],
  );
}
|
|
97
|
+
|
|
98
|
+
/**
 * Inserts or replaces the stored unit-test definitions for a skill.
 *
 * The row is keyed by `skill_name`; on conflict `stored_at` and `tests_json`
 * are overwritten. `stored_at` is the current time in ISO form.
 */
function upsertUnitTestFile(db: Database, skillName: string, tests: SkillUnitTest[]): void {
  const storedAt = new Date().toISOString();
  const testsJson = JSON.stringify(tests);
  db.run(
    `INSERT INTO unit_test_files (skill_name, stored_at, tests_json)
     VALUES (?, ?, ?)
     ON CONFLICT(skill_name) DO UPDATE SET
       stored_at = excluded.stored_at,
       tests_json = excluded.tests_json`,
    [skillName, storedAt, testsJson],
  );
}
|
|
108
|
+
|
|
109
|
+
/**
 * Inserts or replaces the stored unit-test run result for a skill.
 *
 * The summary columns are copied from `suite`, and the whole suite is also
 * stored as JSON in `result_json`. The row is keyed by `skill_name`.
 */
function upsertUnitTestRunResult(
  db: Database,
  skillName: string,
  suite: UnitTestSuiteResult,
): void {
  const { run_at, total, passed, failed, pass_rate } = suite;
  const resultJson = JSON.stringify(suite);
  db.run(
    `INSERT INTO unit_test_run_results
       (skill_name, run_at, total, passed, failed, pass_rate, result_json)
     VALUES (?, ?, ?, ?, ?, ?, ?)
     ON CONFLICT(skill_name) DO UPDATE SET
       run_at = excluded.run_at,
       total = excluded.total,
       passed = excluded.passed,
       failed = excluded.failed,
       pass_rate = excluded.pass_rate,
       result_json = excluded.result_json`,
    [skillName, run_at, total, passed, failed, pass_rate, resultJson],
  );
}
|
|
136
|
+
|
|
137
|
+
/**
 * Loads the stored canonical eval set for a skill.
 *
 * @returns `null` when no row exists; otherwise the decoded entries (an empty
 *          array if the stored JSON is not a valid array) and the row's
 *          `stored_at` value.
 */
function readCanonicalEvalSetFromDb(
  db: Database,
  skillName: string,
): {
  entries: EvalEntry[];
  storedAt: string | null;
} | null {
  const lookup = db.query(
    `SELECT eval_set_json, stored_at
     FROM canonical_eval_sets
     WHERE skill_name = ?`,
  );
  const stored = lookup.get(skillName) as { eval_set_json: string; stored_at: string } | null;
  if (!stored) return null;

  const entries = parseJsonArray(stored.eval_set_json) as EvalEntry[];
  const storedAt = stored.stored_at ?? null;
  return { entries, storedAt };
}
|
|
157
|
+
|
|
158
|
+
/**
 * Loads the stored unit-test definitions for a skill.
 *
 * @returns `null` when no row exists; otherwise the decoded tests (an empty
 *          array if the stored JSON is not a valid array) and the row's
 *          `stored_at` value.
 */
function readUnitTestsFromDb(
  db: Database,
  skillName: string,
): {
  tests: SkillUnitTest[];
  storedAt: string | null;
} | null {
  const lookup = db.query(
    `SELECT tests_json, stored_at
     FROM unit_test_files
     WHERE skill_name = ?`,
  );
  const stored = lookup.get(skillName) as { tests_json: string; stored_at: string } | null;
  if (!stored) return null;

  const tests = parseJsonArray(stored.tests_json) as SkillUnitTest[];
  const storedAt = stored.stored_at ?? null;
  return { tests, storedAt };
}
|
|
178
|
+
|
|
179
|
+
/**
 * Loads the stored unit-test run result for a skill.
 *
 * The stored JSON is accepted only if it decodes to an object whose
 * `skill_name` and `run_at` are strings and whose `total`, `passed`, `failed`
 * and `pass_rate` are numbers.
 *
 * @returns The decoded suite, or `null` when the row is missing, empty,
 *          unparsable, or fails the shape check.
 */
function readUnitTestRunResultFromDb(db: Database, skillName: string): UnitTestSuiteResult | null {
  const lookup = db.query(
    `SELECT result_json
     FROM unit_test_run_results
     WHERE skill_name = ?`,
  );
  const stored = lookup.get(skillName) as { result_json: string } | null;
  if (!stored?.result_json) return null;

  let candidate: Partial<UnitTestSuiteResult>;
  try {
    candidate = JSON.parse(stored.result_json) as Partial<UnitTestSuiteResult>;
  } catch {
    return null;
  }
  if (typeof candidate !== "object" || candidate == null) return null;

  const stringsOk =
    typeof candidate.skill_name === "string" && typeof candidate.run_at === "string";
  const numbersOk = [candidate.total, candidate.passed, candidate.failed, candidate.pass_rate].every(
    (field) => typeof field === "number",
  );
  return stringsOk && numbersOk ? (candidate as UnitTestSuiteResult) : null;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Collects the distinct non-empty `skill_name` values stored in a table.
 *
 * SQL identifiers cannot be bound as parameters, so `tableName` is
 * interpolated into the statement. It is therefore restricted to a plain
 * identifier to make sure no caller can ever smuggle SQL through it.
 *
 * @param db        Database to query.
 * @param tableName Table to read; must match `[A-Za-z_][A-Za-z0-9_]*`.
 * @returns The set of skill names found.
 * @throws Error when `tableName` is not a plain identifier.
 */
function listStoredSkillNames(db: Database, tableName: string): Set<string> {
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(tableName)) {
    throw new Error(`Invalid table name: ${tableName}`);
  }

  const rows = db.query(`SELECT skill_name FROM ${tableName}`).all() as Array<{
    skill_name: string;
  }>;

  const names = new Set<string>();
  for (const { skill_name } of rows) {
    // Skip NULL / empty names, matching the truthiness filter callers rely on.
    if (skill_name) names.add(skill_name);
  }
  return names;
}
|
|
214
|
+
|
|
67
215
|
/**
 * Persists a skill's canonical eval set.
 *
 * When a database handle is available the set is upserted there first; it is
 * then always written as pretty-printed JSON to the canonical eval-set path,
 * creating the eval-set directory if needed.
 *
 * @returns The path of the JSON file that was written.
 */
export function writeCanonicalEvalSet(skillName: string, evalSet: EvalEntry[]): string {
  const targetPath = getCanonicalEvalSetPath(skillName);

  const database = getOptionalDb();
  if (database) {
    upsertCanonicalEvalSet(database, skillName, evalSet);
  }

  mkdirSync(getEvalSetDir(), { recursive: true });
  const serialized = JSON.stringify(evalSet, null, 2);
  writeFileSync(targetPath, serialized, "utf-8");
  return targetPath;
}
|
|
73
225
|
|
|
226
|
+
/**
 * Persists a skill's unit-test definitions.
 *
 * When a database handle is available the tests are upserted there first.
 * They are then always written as pretty-printed JSON to the canonical
 * unit-test path. If `outputPath` is given and differs from the canonical
 * path, a second copy is written there (its parent directory is created).
 *
 * @returns `outputPath` when a separate copy was written, otherwise the
 *          canonical path.
 */
export function writeCanonicalUnitTests(
  skillName: string,
  tests: SkillUnitTest[],
  outputPath?: string,
): string {
  const canonicalPath = getUnitTestPath(skillName);

  const database = getOptionalDb();
  if (database) {
    upsertUnitTestFile(database, skillName, tests);
  }

  mkdirSync(getUnitTestDir(), { recursive: true });
  const serialized = JSON.stringify(tests, null, 2);
  writeFileSync(canonicalPath, serialized, "utf-8");

  // No extra destination requested (or it is the canonical file itself).
  if (!outputPath || outputPath === canonicalPath) {
    return canonicalPath;
  }

  mkdirSync(dirname(outputPath), { recursive: true });
  writeFileSync(outputPath, serialized, "utf-8");
  return outputPath;
}
|
|
245
|
+
|
|
74
246
|
export function writeUnitTestRunResult(skillName: string, suite: UnitTestSuiteResult): string {
|
|
247
|
+
const db = getOptionalDb();
|
|
248
|
+
if (db) {
|
|
249
|
+
upsertUnitTestRunResult(db, skillName, suite);
|
|
250
|
+
}
|
|
75
251
|
mkdirSync(getUnitTestDir(), { recursive: true });
|
|
76
252
|
const path = getUnitTestResultPath(skillName);
|
|
77
253
|
writeFileSync(path, JSON.stringify(suite, null, 2), "utf-8");
|
|
@@ -188,14 +364,14 @@ function summarizeReadiness(
|
|
|
188
364
|
switch (nextStep) {
|
|
189
365
|
case "generate_evals":
|
|
190
366
|
if (evalReadiness === "log_ready") {
|
|
191
|
-
return "Trusted telemetry exists, but no canonical eval set is
|
|
367
|
+
return "Trusted telemetry exists, but no canonical eval set is stored yet.";
|
|
192
368
|
}
|
|
193
369
|
if (evalReadiness === "cold_start_ready") {
|
|
194
370
|
return "Installed locally but still cold-start. Generate synthetic evals before you evolve it.";
|
|
195
371
|
}
|
|
196
372
|
return "Telemetry exists, but selftune cannot resolve a local SKILL.md yet. Point it at the skill and generate evals.";
|
|
197
373
|
case "run_unit_tests":
|
|
198
|
-
return `Eval coverage is present (${evalSetEntries} entries), but no unit
|
|
374
|
+
return `Eval coverage is present (${evalSetEntries} entries), but no unit tests are stored yet.`;
|
|
199
375
|
case "run_replay_dry_run": {
|
|
200
376
|
const passRateText =
|
|
201
377
|
unitTestPassRate != null
|
|
@@ -331,6 +507,9 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
|
|
|
331
507
|
if (!entry.endsWith(".json")) return null;
|
|
332
508
|
return entry.slice(0, -".json".length);
|
|
333
509
|
});
|
|
510
|
+
const storedEvalNames = listStoredSkillNames(db, "canonical_eval_sets");
|
|
511
|
+
const storedUnitTestNames = listStoredSkillNames(db, "unit_test_files");
|
|
512
|
+
const storedUnitTestRunNames = listStoredSkillNames(db, "unit_test_run_results");
|
|
334
513
|
|
|
335
514
|
const evidenceRows = queryEvolutionEvidence(db);
|
|
336
515
|
const evalEvidenceBySkill = new Map<string, { count: number; latestAt: string | null }>();
|
|
@@ -445,6 +624,9 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
|
|
|
445
624
|
...unitTestNames,
|
|
446
625
|
...unitTestResultNames,
|
|
447
626
|
...canonicalEvalNames,
|
|
627
|
+
...storedEvalNames,
|
|
628
|
+
...storedUnitTestNames,
|
|
629
|
+
...storedUnitTestRunNames,
|
|
448
630
|
...evalEvidenceBySkill.keys(),
|
|
449
631
|
...replayBySkill.keys(),
|
|
450
632
|
...baselineBySkill.keys(),
|
|
@@ -452,6 +634,7 @@ function buildTestingReadinessContext(db: Database, searchDirs: string[]): Testi
|
|
|
452
634
|
]);
|
|
453
635
|
|
|
454
636
|
return {
|
|
637
|
+
db,
|
|
455
638
|
knownSkills,
|
|
456
639
|
searchDirs,
|
|
457
640
|
trustedRowsBySkill,
|
|
@@ -480,16 +663,26 @@ function buildSkillTestingReadinessRow(
|
|
|
480
663
|
const evalReadiness = deriveEvalReadiness(skillPath, trustedTriggerCount);
|
|
481
664
|
|
|
482
665
|
const canonicalEvalPath = getCanonicalEvalSetPath(skillName);
|
|
483
|
-
const
|
|
484
|
-
const
|
|
666
|
+
const storedEvalSet = readCanonicalEvalSetFromDb(context.db, skillName);
|
|
667
|
+
const canonicalEvalEntries =
|
|
668
|
+
storedEvalSet?.entries ?? (readJsonArrayFile(canonicalEvalPath) as EvalEntry[]);
|
|
669
|
+
const canonicalEvalStat =
|
|
670
|
+
!storedEvalSet && existsSync(canonicalEvalPath) ? statSync(canonicalEvalPath) : null;
|
|
485
671
|
const evidenceEval = context.evalEvidenceBySkill.get(skillName) ?? { count: 0, latestAt: null };
|
|
486
672
|
const evalSetEntries =
|
|
487
673
|
canonicalEvalEntries.length > 0 ? canonicalEvalEntries.length : evidenceEval.count;
|
|
488
|
-
const latestEvalAt =
|
|
674
|
+
const latestEvalAt =
|
|
675
|
+
storedEvalSet?.storedAt ??
|
|
676
|
+
canonicalEvalStat?.mtime.toISOString?.() ??
|
|
677
|
+
evidenceEval.latestAt ??
|
|
678
|
+
null;
|
|
489
679
|
|
|
490
680
|
const unitTestPath = getUnitTestPath(skillName);
|
|
491
|
-
const
|
|
492
|
-
const
|
|
681
|
+
const storedUnitTests = readUnitTestsFromDb(context.db, skillName);
|
|
682
|
+
const unitTestCases = storedUnitTests?.tests.length ?? readJsonArrayFile(unitTestPath).length;
|
|
683
|
+
const unitTestResult =
|
|
684
|
+
readUnitTestRunResultFromDb(context.db, skillName) ??
|
|
685
|
+
readUnitTestResult(getUnitTestResultPath(skillName));
|
|
493
686
|
|
|
494
687
|
const replay = context.replayBySkill.get(skillName) ?? {
|
|
495
688
|
check_count: 0,
|