selftune 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/{index-DnhnXQm6.js → index-D8O-RG1I.js} +2 -2
- package/apps/local-dashboard/dist/index.html +1 -1
- package/cli/selftune/dashboard-contract.ts +4 -0
- package/cli/selftune/eval/family-overlap.ts +320 -1
- package/cli/selftune/evolution/evidence.ts +5 -0
- package/cli/selftune/evolution/evolve-body.ts +86 -2
- package/cli/selftune/evolution/evolve.ts +58 -1
- package/cli/selftune/evolution/validate-body.ts +10 -0
- package/cli/selftune/evolution/validate-host-replay.ts +624 -0
- package/cli/selftune/evolution/validate-proposal.ts +10 -0
- package/cli/selftune/evolution/validate-routing.ts +112 -5
- package/cli/selftune/localdb/direct-write.ts +8 -3
- package/cli/selftune/localdb/materialize.ts +7 -2
- package/cli/selftune/localdb/queries.ts +11 -1
- package/cli/selftune/localdb/schema.ts +10 -1
- package/cli/selftune/routes/skill-report.ts +6 -1
- package/cli/selftune/types.ts +54 -0
- package/cli/selftune/utils/text-similarity.ts +73 -0
- package/package.json +1 -1
- package/packages/ui/src/components/EvidenceViewer.tsx +85 -2
- package/packages/ui/src/components/EvolutionTimeline.tsx +23 -1
- package/packages/ui/src/types.ts +4 -0
- package/skill/Workflows/Composability.md +15 -1
- package/skill/Workflows/Evolve.md +39 -0
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
2
|
|
|
3
|
+
import { readFileSync } from "node:fs";
|
|
3
4
|
import { parseArgs } from "node:util";
|
|
4
5
|
|
|
5
6
|
import { getDb } from "../localdb/db.js";
|
|
6
7
|
import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
|
|
7
8
|
import type {
|
|
9
|
+
SkillFamilyColdStartPair,
|
|
10
|
+
SkillFamilyColdStartSuspicion,
|
|
8
11
|
QueryLogRecord,
|
|
9
12
|
SkillFamilyOverlapMember,
|
|
10
13
|
SkillFamilyOverlapPair,
|
|
@@ -13,17 +16,54 @@ import type {
|
|
|
13
16
|
SkillUsageRecord,
|
|
14
17
|
} from "../types.js";
|
|
15
18
|
import { CLIError } from "../utils/cli-error.js";
|
|
19
|
+
import { parseFrontmatter } from "../utils/frontmatter.js";
|
|
16
20
|
import {
|
|
17
21
|
findInstalledSkillNames,
|
|
18
22
|
findInstalledSkillPath,
|
|
19
23
|
findRepositoryClaudeSkillDirs,
|
|
20
24
|
findRepositorySkillDirs,
|
|
21
25
|
} from "../utils/skill-discovery.js";
|
|
26
|
+
import {
|
|
27
|
+
buildStopwordSet,
|
|
28
|
+
extractWhenToUseLines,
|
|
29
|
+
jaccardSimilarity,
|
|
30
|
+
tokenizeText,
|
|
31
|
+
} from "../utils/text-similarity.js";
|
|
22
32
|
import { buildEvalSet } from "./hooks-to-evals.js";
|
|
23
33
|
|
|
24
34
|
// Defaults for the overlap analysis options.
// NOTE(review): the code that consumes these three defaults is outside the visible hunks — confirm there.
const DEFAULT_MIN_OVERLAP = 0.3;
const DEFAULT_MIN_SHARED = 2;
const DEFAULT_MAX_SHARED = 10;

// --- Cold-start (installed skill surface) heuristics ---

/** Description-token Jaccard similarity at or above which the description counts as a suspicion signal. */
const DESCRIPTION_SIMILARITY_THRESHOLD = 0.18;
/** "When to Use"-token Jaccard similarity at or above which the trigger text counts as a suspicion signal. */
const WHEN_TO_USE_SIMILARITY_THRESHOLD = 0.18;
/** Per-line Jaccard overlap against the sibling surface required to reuse a line as a confusion probe. */
const CONFUSION_QUERY_LINE_OVERLAP_THRESHOLD = 0.12;
/** With a shared command surface, the stronger text similarity must reach this value to score "high". */
const COMMAND_AUGMENTED_HIGH_SIMILARITY_THRESHOLD = 0.22;
/** The stronger text similarity must reach this value for a pair to score at least "low". */
const LOW_SUSPICION_SIMILARITY_THRESHOLD = 0.28;
/** Upper bound on the shared terms reported for one pair. */
const SHARED_TERM_LIMIT = 6;
/** Upper bound on the suspicious pairs included in the cold-start report. */
const STATIC_PAIR_LIMIT = 10;
|
|
44
|
+
|
|
45
|
+
/**
 * Stopword set handed to `tokenizeText` whenever skill descriptions and
 * "When to Use" lines are tokenized in this module.
 * NOTE(review): whether `buildStopwordSet` merges these words with a built-in
 * base list is not visible here — confirm in `utils/text-similarity`.
 */
const STOPWORDS = buildStopwordSet([
  "between", "by", "can", "change", "content",
  "decision", "decisions", "do", "get", "help",
  "i", "if", "my", "state", "their",
  "users", "want", "wants", "you", "your",
]);
|
|
27
67
|
|
|
28
68
|
interface FamilyOverlapOptions {
|
|
29
69
|
familyPrefix?: string;
|
|
@@ -34,6 +74,15 @@ interface FamilyOverlapOptions {
|
|
|
34
74
|
searchDirs?: string[];
|
|
35
75
|
}
|
|
36
76
|
|
|
77
|
+
/**
 * Static, telemetry-free view of one installed skill: the text and command
 * signals that can be compared between siblings before any usage data exists.
 */
interface InstalledSkillSurface {
  /** Name of the skill this surface was loaded for. */
  skillName: string;
  /** Path of the skill file on disk; left out when the skill could not be located. */
  skillPath?: string;
  /** Tokens taken from the skill's description. */
  descriptionTokens: Set<string>;
  /** Tokens taken from the skill's "When to Use" lines. */
  whenToUseTokens: Set<string>;
  /** The "When to Use" lines themselves, kept for building confusion probes. */
  whenToUseLines: string[];
  /** Command prefixes found in the skill body's fenced code blocks. */
  commandSurfaces: string[];
}
|
|
85
|
+
|
|
37
86
|
function getEvalSkillSearchDirs(): string[] {
|
|
38
87
|
const cwd = process.cwd();
|
|
39
88
|
const homeDir = process.env.HOME ?? "";
|
|
@@ -119,6 +168,266 @@ function scoreConsolidationPressure(overlapPct: number): "low" | "medium" | "hig
|
|
|
119
168
|
return "low";
|
|
120
169
|
}
|
|
121
170
|
|
|
171
|
+
/**
 * Report the vocabulary two skills have in common.
 *
 * A token qualifies when it appears in either token set of the left skill and
 * in either token set of the right skill. The result is de-duplicated, ordered
 * with `localeCompare`, and capped at `SHARED_TERM_LIMIT` entries.
 *
 * @param leftDescription - Description tokens of the first skill.
 * @param leftWhenToUse - "When to Use" tokens of the first skill.
 * @param rightDescription - Description tokens of the second skill.
 * @param rightWhenToUse - "When to Use" tokens of the second skill.
 * @returns The sorted, capped list of shared tokens.
 */
function sharedTerms(
  leftDescription: Set<string>,
  leftWhenToUse: Set<string>,
  rightDescription: Set<string>,
  rightWhenToUse: Set<string>,
): string[] {
  const appearsOnRight = (term: string): boolean =>
    rightDescription.has(term) || rightWhenToUse.has(term);

  // Description tokens are visited before "When to Use" tokens, and the Set
  // collapses tokens that occur in both left-hand sets.
  const overlap = new Set<string>(
    [...leftDescription, ...leftWhenToUse].filter(appearsOnRight),
  );

  const ordered = Array.from(overlap);
  ordered.sort((first, second) => first.localeCompare(second));
  return ordered.slice(0, SHARED_TERM_LIMIT);
}
|
|
186
|
+
|
|
187
|
+
/**
 * Build up to four synthetic probe queries that could plausibly route to
 * either sibling, for testing the boundary between two skills.
 *
 * Candidates are gathered in this order and de-duplicated:
 *   1. `left` "When to Use" lines, 2. `right` "When to Use" lines,
 *   3. one templated probe per shared command surface.
 * A "When to Use" line qualifies when its own tokens overlap the sibling's
 * combined surface by at least `CONFUSION_QUERY_LINE_OVERLAP_THRESHOLD`, or
 * when the two combined surfaces as a whole reach
 * `WHEN_TO_USE_SIMILARITY_THRESHOLD`.
 *
 * @param left - Surface of the first skill.
 * @param right - Surface of the second skill.
 * @param sharedCommandSurfaces - Command prefixes present in both skills.
 * @returns At most four probe strings, in insertion order.
 */
function buildSyntheticSiblingConfusionQueries(
  left: InstalledSkillSurface,
  right: InstalledSkillSurface,
  sharedCommandSurfaces: string[],
): string[] {
  const maxQueries = 4;
  const leftSurfaceTokens = new Set([...left.descriptionTokens, ...left.whenToUseTokens]);
  const rightSurfaceTokens = new Set([...right.descriptionTokens, ...right.whenToUseTokens]);
  const candidates = new Set<string>();

  const collect = (
    lines: string[],
    sourceTokens: Set<string>,
    compareTokens: Set<string>,
  ): void => {
    if (lines.length === 0) return;
    // The surface-vs-surface score does not depend on the individual line, so
    // it is computed once per direction instead of once per line.
    // Assumes `jaccardSimilarity` / `tokenizeText` are pure — TODO confirm.
    const surfacesOverlap =
      jaccardSimilarity(sourceTokens, compareTokens) >= WHEN_TO_USE_SIMILARITY_THRESHOLD;

    for (const line of lines) {
      // Only the first `maxQueries` insertions are ever returned, so further
      // tokenizing cannot change the result once the cap is reached.
      if (candidates.size >= maxQueries) return;
      const trimmed = line.trim();
      if (!trimmed) continue;
      if (
        surfacesOverlap ||
        jaccardSimilarity(tokenizeText(trimmed, STOPWORDS), compareTokens) >=
          CONFUSION_QUERY_LINE_OVERLAP_THRESHOLD
      ) {
        candidates.add(trimmed);
      }
    }
  };

  collect(left.whenToUseLines, leftSurfaceTokens, rightSurfaceTokens);
  collect(right.whenToUseLines, rightSurfaceTokens, leftSurfaceTokens);

  for (const command of sharedCommandSurfaces) {
    if (candidates.size >= maxQueries) break;
    candidates.add(`${command} for a sibling-family request`);
  }

  return [...candidates].slice(0, maxQueries);
}
|
|
221
|
+
|
|
222
|
+
/**
 * Collect the distinct `<binary> <subcommand>` prefixes used inside the fenced
 * code blocks of a skill body.
 *
 * Within each fenced block, a line contributes its first two whitespace-separated
 * tokens unless it is blank, starts with `#` or `>`, has fewer than two tokens,
 * or its second token starts with `-` (a flag rather than a subcommand).
 *
 * @param body - Markdown body of a skill file.
 * @returns The unique command prefixes, ordered with `localeCompare`.
 */
function extractCommandSurfaces(body: string): string[] {
  // Accept any info string after the opening fence (e.g. ```sh title="demo"),
  // not only a bare language tag. If the opener is rejected, the scanner would
  // pair the closing fence with the next opener and read prose as commands.
  // Backticks are excluded from the info string, as CommonMark requires.
  const fencePattern = /```[^\n`]*\n([\s\S]*?)```/g;
  const commands = new Set<string>();

  for (const match of body.matchAll(fencePattern)) {
    const block = match[1] ?? "";
    for (const line of block.split("\n")) {
      // Strip a leading "$ " shell prompt so the prompt is not mistaken for the
      // binary, which would make unrelated skills appear to share "$ <cmd>".
      const trimmed = line.trim().replace(/^\$\s+/, "");
      if (!trimmed || trimmed.startsWith("#") || trimmed.startsWith(">")) continue;
      const tokens = trimmed.split(/\s+/).filter(Boolean);
      if (tokens.length < 2 || tokens[1]?.startsWith("-")) continue;
      commands.add(`${tokens[0]} ${tokens[1]}`);
    }
  }

  return [...commands].sort((a, b) => a.localeCompare(b));
}
|
|
237
|
+
|
|
238
|
+
/**
 * Load the static surface (description tokens, "When to Use" lines and tokens,
 * command prefixes) of an installed skill.
 *
 * Never throws for a missing or unreadable skill: when the skill cannot be
 * located, or reading/parsing its file fails, an empty surface is returned.
 * The empty surface carries `skillPath` only when a path was resolved.
 *
 * @param skillName - Skill to look up.
 * @param searchDirs - Directories passed to `findInstalledSkillPath`.
 * @returns The populated surface, or an empty one on any failure.
 */
function loadInstalledSkillSurface(skillName: string, searchDirs: string[]): InstalledSkillSurface {
  const resolvedPath = findInstalledSkillPath(skillName, searchDirs);

  // Single source for the degraded result; each call yields fresh containers.
  const blankSurface = (): InstalledSkillSurface => ({
    skillName,
    ...(resolvedPath ? { skillPath: resolvedPath } : {}),
    descriptionTokens: new Set<string>(),
    whenToUseTokens: new Set<string>(),
    whenToUseLines: [],
    commandSurfaces: [],
  });

  if (!resolvedPath) return blankSurface();

  try {
    const { description, body } = parseFrontmatter(readFileSync(resolvedPath, "utf8"));
    const whenToUseLines = extractWhenToUseLines(body);
    const descriptionTokens = tokenizeText(description, STOPWORDS);
    const whenToUseTokens = tokenizeText(whenToUseLines.join(" "), STOPWORDS);
    const commandSurfaces = extractCommandSurfaces(body);
    return {
      skillName,
      skillPath: resolvedPath,
      descriptionTokens,
      whenToUseTokens,
      whenToUseLines,
      commandSurfaces,
    };
  } catch {
    // Deliberately quiet: mixed registries are expected to contain missing or
    // malformed skill files, and callers are meant to degrade to an empty surface.
    return blankSurface();
  }
}
|
|
275
|
+
|
|
276
|
+
/**
 * Grade how suspicious a sibling pair looks from static signals alone.
 *
 * Three signals are considered: description similarity reaching
 * `DESCRIPTION_SIMILARITY_THRESHOLD`, "When to Use" similarity reaching
 * `WHEN_TO_USE_SIMILARITY_THRESHOLD`, and at least one shared command surface.
 *
 * - "high": all three signals fire, or commands are shared and the stronger
 *   text similarity reaches `COMMAND_AUGMENTED_HIGH_SIMILARITY_THRESHOLD`.
 * - "medium": at least two signals fire.
 * - "low": the stronger text similarity reaches `LOW_SUSPICION_SIMILARITY_THRESHOLD`.
 * - `null`: none of the above.
 *
 * @param descriptionSimilarity - Similarity score of the two descriptions.
 * @param whenToUseSimilarity - Similarity score of the two "When to Use" token sets.
 * @param sharedCommandSurfaces - Command prefixes present in both skills.
 * @returns The suspicion level, or `null` when the pair is not suspicious.
 */
function scoreStaticSuspicion(
  descriptionSimilarity: number,
  whenToUseSimilarity: number,
  sharedCommandSurfaces: string[],
): "low" | "medium" | "high" | null {
  const hasSharedCommands = sharedCommandSurfaces.length > 0;
  const strongestTextSimilarity = Math.max(descriptionSimilarity, whenToUseSimilarity);

  const firedSignals = [
    descriptionSimilarity >= DESCRIPTION_SIMILARITY_THRESHOLD,
    whenToUseSimilarity >= WHEN_TO_USE_SIMILARITY_THRESHOLD,
    hasSharedCommands,
  ].filter(Boolean).length;

  if (firedSignals >= 3) return "high";
  if (
    hasSharedCommands &&
    strongestTextSimilarity >= COMMAND_AUGMENTED_HIGH_SIMILARITY_THRESHOLD
  ) {
    return "high";
  }
  if (firedSignals >= 2) return "medium";
  return strongestTextSimilarity >= LOW_SUSPICION_SIMILARITY_THRESHOLD ? "low" : null;
}
|
|
300
|
+
|
|
301
|
+
/**
 * Compare the installed surfaces of every skill pair in a family and summarize
 * which pairs look like overlapping entry points, without using telemetry.
 *
 * @param skills - Skill names in the family.
 * @param searchDirs - Directories used to locate each skill file.
 * @param readySkillCount - Count of ready skills supplied by the caller; the
 *   family is only flagged as a candidate while this is below 2.
 * @returns The cold-start report, or `undefined` when fewer than two skills
 *   have a usable surface (a resolved path plus at least one token or command).
 */
function analyzeColdStartSuspicion(
  skills: string[],
  searchDirs: string[],
  readySkillCount: number,
): SkillFamilyColdStartSuspicion | undefined {
  const usableSurfaces = skills
    .map((skillName) => loadInstalledSkillSurface(skillName, searchDirs))
    .filter((surface) => {
      if (!surface.skillPath) return false;
      return (
        surface.descriptionTokens.size > 0 ||
        surface.whenToUseTokens.size > 0 ||
        surface.commandSurfaces.length > 0
      );
    });
  if (usableSurfaces.length < 2) return undefined;

  const pairs: SkillFamilyColdStartPair[] = [];
  let analyzedPairs = 0;

  // Visit each unordered pair exactly once.
  for (const [leftIndex, left] of usableSurfaces.entries()) {
    for (const right of usableSurfaces.slice(leftIndex + 1)) {
      analyzedPairs += 1;

      const descriptionSimilarity = jaccardSimilarity(
        left.descriptionTokens,
        right.descriptionTokens,
      );
      const whenToUseSimilarity = jaccardSimilarity(left.whenToUseTokens, right.whenToUseTokens);
      const rightCommands = new Set(right.commandSurfaces);
      const sharedCommandSurfaces = left.commandSurfaces.filter((command) =>
        rightCommands.has(command),
      );

      const suspicionLevel = scoreStaticSuspicion(
        descriptionSimilarity,
        whenToUseSimilarity,
        sharedCommandSurfaces,
      );
      if (!suspicionLevel) continue;

      pairs.push({
        skill_a: left.skillName,
        skill_b: right.skillName,
        description_similarity: descriptionSimilarity,
        when_to_use_similarity: whenToUseSimilarity,
        shared_command_surfaces: sharedCommandSurfaces,
        shared_terms: sharedTerms(
          left.descriptionTokens,
          left.whenToUseTokens,
          right.descriptionTokens,
          right.whenToUseTokens,
        ),
        synthetic_confusion_queries: buildSyntheticSiblingConfusionQueries(
          left,
          right,
          sharedCommandSurfaces,
        ),
        suspicion_level: suspicionLevel,
      });
    }
  }

  // Order: "high" first, then "medium", then everything else; ties fall back to
  // "When to Use" similarity and then description similarity, both descending.
  const levelRank = (level: SkillFamilyColdStartPair["suspicion_level"]): number => {
    if (level === "high") return 2;
    if (level === "medium") return 1;
    return 0;
  };
  pairs.sort(
    (a, b) =>
      levelRank(b.suspicion_level) - levelRank(a.suspicion_level) ||
      b.when_to_use_similarity - a.when_to_use_similarity ||
      b.description_similarity - a.description_similarity,
  );

  const suspiciousPairCount = pairs.length;

  // Command overlap is a yes/no signal, yet it carries the same weight as each
  // Jaccard score: shared command surfaces are a high-precision cold-start hint
  // even when the description text is sparse or noisy.
  let similarityTotal = 0;
  for (const pair of pairs) {
    const commandScore = pair.shared_command_surfaces.length > 0 ? 1 : 0;
    similarityTotal +=
      (pair.description_similarity + pair.when_to_use_similarity + commandScore) / 3;
  }
  const averageStaticSimilarity =
    suspiciousPairCount > 0 ? similarityTotal / suspiciousPairCount : 0;

  // NOTE(review): the required pair count is keyed on `skills.length`, not on the
  // number of usable surfaces — confirm that is intended when some skills have
  // no installed surface.
  const requiredPairs = skills.length >= 3 ? 2 : 1;
  const candidate =
    suspiciousPairCount > 0 && readySkillCount < 2 && suspiciousPairCount >= requiredPairs;

  const rationale: string[] = [];
  if (suspiciousPairCount === 0) {
    rationale.push(
      "Installed skill surfaces do not show meaningful overlap yet. Keep gathering cold-start evals and real usage before making a packaging call.",
    );
  } else {
    const hasSharedCommands = pairs.some((pair) => pair.shared_command_surfaces.length > 0);
    const hasWhenToUseOverlap = pairs.some(
      (pair) => pair.when_to_use_similarity >= WHEN_TO_USE_SIMILARITY_THRESHOLD,
    );
    const hasProbes = pairs.some((pair) => pair.synthetic_confusion_queries.length > 0);

    rationale.push(
      `${suspiciousPairCount} sibling pair${suspiciousPairCount === 1 ? "" : "s"} show overlapping installed skill surfaces before trusted telemetry is available.`,
    );
    if (hasSharedCommands) {
      rationale.push(
        "Shared command surfaces suggest some siblings may be thin wrappers around the same backend or query path.",
      );
    }
    if (hasWhenToUseOverlap) {
      rationale.push(
        "Overlapping `When to Use` language suggests sibling boundaries may already be competing on intent before enough telemetry exists to confirm it.",
      );
    }
    if (hasProbes) {
      rationale.push(
        "Synthetic sibling-confusion probes are available for suspicious pairs, so you can test the family boundary before real telemetry converges.",
      );
    }
    if (candidate) {
      rationale.push(
        "Treat this as architecture suspicion, not proof. Run cold-start evals and gather trusted usage before consolidating the family.",
      );
    }
  }

  // Counts and the average cover every suspicious pair; only the listed pairs
  // are truncated to STATIC_PAIR_LIMIT.
  return {
    candidate,
    analyzed_pairs: analyzedPairs,
    suspicious_pair_count: suspiciousPairCount,
    average_static_similarity: averageStaticSimilarity,
    pairs: pairs.slice(0, STATIC_PAIR_LIMIT),
    rationale,
  };
}
|
|
430
|
+
|
|
122
431
|
function buildRefactorProposal(
|
|
123
432
|
skills: string[],
|
|
124
433
|
familyPrefix: string | undefined,
|
|
@@ -214,6 +523,7 @@ export function analyzeSkillFamilyOverlap(
|
|
|
214
523
|
const readySkillCount = members.filter(
|
|
215
524
|
(member) => member.positive_query_count >= minSharedQueries,
|
|
216
525
|
).length;
|
|
526
|
+
const coldStartSuspicion = analyzeColdStartSuspicion(skills, searchDirs, readySkillCount);
|
|
217
527
|
const consolidationCandidate =
|
|
218
528
|
readySkillCount >= 2 &&
|
|
219
529
|
skills.length >= 3 &&
|
|
@@ -239,6 +549,12 @@ export function analyzeSkillFamilyOverlap(
|
|
|
239
549
|
);
|
|
240
550
|
}
|
|
241
551
|
|
|
552
|
+
if (readySkillCount < 2 && coldStartSuspicion?.candidate) {
|
|
553
|
+
rationale.push(
|
|
554
|
+
"Installed skill surfaces already suggest an architecture suspicion: some siblings look like overlapping entry points to the same underlying workflow family.",
|
|
555
|
+
);
|
|
556
|
+
}
|
|
557
|
+
|
|
242
558
|
if (consolidationCandidate) {
|
|
243
559
|
rationale.push(
|
|
244
560
|
"This family looks like a packaging problem, not just a wording problem. Test a parent skill with internal workflows before continuing standalone description optimization.",
|
|
@@ -250,6 +566,7 @@ export function analyzeSkillFamilyOverlap(
|
|
|
250
566
|
analyzed_skills: skills,
|
|
251
567
|
members,
|
|
252
568
|
pairs,
|
|
569
|
+
cold_start_suspicion: coldStartSuspicion,
|
|
253
570
|
total_pairs_analyzed: totalPairsAnalyzed,
|
|
254
571
|
overlap_count: overlapCount,
|
|
255
572
|
overlap_density: overlapDensity,
|
|
@@ -257,7 +574,9 @@ export function analyzeSkillFamilyOverlap(
|
|
|
257
574
|
consolidation_candidate: consolidationCandidate,
|
|
258
575
|
recommendation:
|
|
259
576
|
readySkillCount < 2
|
|
260
|
-
?
|
|
577
|
+
? coldStartSuspicion?.candidate
|
|
578
|
+
? "Trusted telemetry is still sparse, but installed skill surfaces suggest this family may want a parent skill. Treat this as cold-start architecture suspicion, then confirm with cold-start evals plus real usage."
|
|
579
|
+
: "Insufficient trusted telemetry to make a family-packaging call yet. Use cold-start evals plus a few days of real usage before deciding whether to consolidate."
|
|
261
580
|
: consolidationCandidate
|
|
262
581
|
? `Consider consolidating this family under a parent skill like \`${parentSkillName}\`.`
|
|
263
582
|
: "Keep the skills separate for now and continue improving boundaries at the description/workflow level.",
|
|
@@ -29,3 +29,8 @@ export function readEvidenceTrail(skillName?: string, _logPath?: string): Evolut
|
|
|
29
29
|
const db = getDb();
|
|
30
30
|
return queryEvolutionEvidence(db, skillName) as EvolutionEvidenceEntry[];
|
|
31
31
|
}
|
|
32
|
+
|
|
33
|
+
/**
 * Compose the stable evidence key that links an audit entry to its validation
 * artifacts.
 *
 * @param proposalId - Identifier of the proposal the evidence belongs to.
 * @param stage - Stage label embedded in the key (callers pass e.g. "validated").
 * @returns The key in the form `evolution_evidence:<proposalId>:<stage>`.
 */
export function buildValidationEvidenceRef(proposalId: string, stage: string): string {
  const prefix = "evolution_evidence:";
  return prefix + proposalId + ":" + stage;
}
|
|
@@ -23,6 +23,7 @@ import type {
|
|
|
23
23
|
FailurePattern,
|
|
24
24
|
GradingResult,
|
|
25
25
|
QueryLogRecord,
|
|
26
|
+
RoutingReplayFixture,
|
|
26
27
|
SkillUsageRecord,
|
|
27
28
|
} from "../types.js";
|
|
28
29
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
@@ -31,12 +32,16 @@ import { callViaSubagent } from "../utils/llm-call.js";
|
|
|
31
32
|
import { appendAuditEntry } from "./audit.js";
|
|
32
33
|
import { checkConstitutionSizeOnly } from "./constitutional.js";
|
|
33
34
|
import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
34
|
-
import { appendEvidenceEntry } from "./evidence.js";
|
|
35
|
+
import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
|
|
35
36
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
36
37
|
import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
|
|
37
38
|
import { generateRoutingProposal } from "./propose-routing.js";
|
|
38
39
|
import { refineBodyProposal } from "./refine-body.js";
|
|
39
40
|
import { validateBodyProposal } from "./validate-body.js";
|
|
41
|
+
import {
|
|
42
|
+
buildRoutingReplayFixture,
|
|
43
|
+
runClaudeRuntimeReplayFixture,
|
|
44
|
+
} from "./validate-host-replay.js";
|
|
40
45
|
import { validateRoutingProposal } from "./validate-routing.js";
|
|
41
46
|
|
|
42
47
|
// ---------------------------------------------------------------------------
|
|
@@ -106,6 +111,10 @@ function createAuditEntry(
|
|
|
106
111
|
action: EvolutionAuditEntry["action"],
|
|
107
112
|
details: string,
|
|
108
113
|
skillName?: string,
|
|
114
|
+
provenance?: Pick<
|
|
115
|
+
EvolutionAuditEntry,
|
|
116
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
117
|
+
>,
|
|
109
118
|
): EvolutionAuditEntry {
|
|
110
119
|
return {
|
|
111
120
|
timestamp: new Date().toISOString(),
|
|
@@ -113,6 +122,14 @@ function createAuditEntry(
|
|
|
113
122
|
skill_name: skillName,
|
|
114
123
|
action,
|
|
115
124
|
details,
|
|
125
|
+
...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
|
|
126
|
+
...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
|
|
127
|
+
...(provenance?.validation_fixture_id
|
|
128
|
+
? { validation_fixture_id: provenance.validation_fixture_id }
|
|
129
|
+
: {}),
|
|
130
|
+
...(provenance?.validation_evidence_ref
|
|
131
|
+
? { validation_evidence_ref: provenance.validation_evidence_ref }
|
|
132
|
+
: {}),
|
|
116
133
|
};
|
|
117
134
|
}
|
|
118
135
|
|
|
@@ -181,8 +198,12 @@ export async function evolveBody(
|
|
|
181
198
|
proposalId: string,
|
|
182
199
|
action: EvolutionAuditEntry["action"],
|
|
183
200
|
details: string,
|
|
201
|
+
provenance?: Pick<
|
|
202
|
+
EvolutionAuditEntry,
|
|
203
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
204
|
+
>,
|
|
184
205
|
): void {
|
|
185
|
-
const entry = createAuditEntry(proposalId, action, details, skillName);
|
|
206
|
+
const entry = createAuditEntry(proposalId, action, details, skillName, provenance);
|
|
186
207
|
auditEntries.push(entry);
|
|
187
208
|
try {
|
|
188
209
|
_appendAuditEntry(entry);
|
|
@@ -443,11 +464,37 @@ export async function evolveBody(
|
|
|
443
464
|
const validationModelFlag = options.validationModel ?? studentModel;
|
|
444
465
|
let validation: BodyValidationResult;
|
|
445
466
|
if (target === "routing") {
|
|
467
|
+
const replayFixture = buildRoutingReplayFixture({
|
|
468
|
+
skillName,
|
|
469
|
+
skillPath,
|
|
470
|
+
platform: studentAgent === "codex" ? "codex" : "claude_code",
|
|
471
|
+
});
|
|
472
|
+
const replayRunner =
|
|
473
|
+
replayFixture.platform === "claude_code" && studentAgent === "claude"
|
|
474
|
+
? async ({
|
|
475
|
+
routing,
|
|
476
|
+
evalSet,
|
|
477
|
+
fixture,
|
|
478
|
+
}: {
|
|
479
|
+
routing: string;
|
|
480
|
+
evalSet: EvalEntry[];
|
|
481
|
+
fixture: RoutingReplayFixture;
|
|
482
|
+
}) =>
|
|
483
|
+
await runClaudeRuntimeReplayFixture({
|
|
484
|
+
routing,
|
|
485
|
+
evalSet,
|
|
486
|
+
fixture,
|
|
487
|
+
})
|
|
488
|
+
: undefined;
|
|
446
489
|
validation = await _validateRoutingProposal(
|
|
447
490
|
proposal,
|
|
448
491
|
evalSet,
|
|
449
492
|
studentAgent,
|
|
450
493
|
validationModelFlag,
|
|
494
|
+
{
|
|
495
|
+
replayFixture,
|
|
496
|
+
...(replayRunner ? { replayRunner } : {}),
|
|
497
|
+
},
|
|
451
498
|
);
|
|
452
499
|
} else {
|
|
453
500
|
validation = await _validateBodyProposal(
|
|
@@ -458,11 +505,18 @@ export async function evolveBody(
|
|
|
458
505
|
);
|
|
459
506
|
}
|
|
460
507
|
lastValidation = validation;
|
|
508
|
+
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
461
509
|
|
|
462
510
|
recordAudit(
|
|
463
511
|
proposal.proposal_id,
|
|
464
512
|
"validated",
|
|
465
513
|
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
|
|
514
|
+
{
|
|
515
|
+
validation_mode: validation.validation_mode,
|
|
516
|
+
validation_agent: validation.validation_agent,
|
|
517
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
518
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
519
|
+
},
|
|
466
520
|
);
|
|
467
521
|
recordEvidence({
|
|
468
522
|
timestamp: new Date().toISOString(),
|
|
@@ -480,6 +534,12 @@ export async function evolveBody(
|
|
|
480
534
|
gates_total: validation.gates_total,
|
|
481
535
|
gate_results: validation.gate_results,
|
|
482
536
|
regressions: validation.regressions,
|
|
537
|
+
before_pass_rate: validation.before_pass_rate,
|
|
538
|
+
after_pass_rate: validation.after_pass_rate,
|
|
539
|
+
validation_mode: validation.validation_mode,
|
|
540
|
+
validation_agent: validation.validation_agent,
|
|
541
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
542
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
483
543
|
},
|
|
484
544
|
});
|
|
485
545
|
|
|
@@ -491,6 +551,12 @@ export async function evolveBody(
|
|
|
491
551
|
proposal.proposal_id,
|
|
492
552
|
"rejected",
|
|
493
553
|
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
|
|
554
|
+
{
|
|
555
|
+
validation_mode: validation.validation_mode,
|
|
556
|
+
validation_agent: validation.validation_agent,
|
|
557
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
558
|
+
validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
|
|
559
|
+
},
|
|
494
560
|
);
|
|
495
561
|
recordEvidence({
|
|
496
562
|
timestamp: new Date().toISOString(),
|
|
@@ -508,6 +574,12 @@ export async function evolveBody(
|
|
|
508
574
|
gates_total: validation.gates_total,
|
|
509
575
|
gate_results: validation.gate_results,
|
|
510
576
|
regressions: validation.regressions,
|
|
577
|
+
before_pass_rate: validation.before_pass_rate,
|
|
578
|
+
after_pass_rate: validation.after_pass_rate,
|
|
579
|
+
validation_mode: validation.validation_mode,
|
|
580
|
+
validation_agent: validation.validation_agent,
|
|
581
|
+
validation_fixture_id: validation.validation_fixture_id,
|
|
582
|
+
validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
|
|
511
583
|
},
|
|
512
584
|
});
|
|
513
585
|
|
|
@@ -607,6 +679,12 @@ export async function evolveBody(
|
|
|
607
679
|
lastProposal.proposal_id,
|
|
608
680
|
"deployed",
|
|
609
681
|
`Deployed ${target} proposal for ${skillName}`,
|
|
682
|
+
{
|
|
683
|
+
validation_mode: lastValidation.validation_mode,
|
|
684
|
+
validation_agent: lastValidation.validation_agent,
|
|
685
|
+
validation_fixture_id: lastValidation.validation_fixture_id,
|
|
686
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
687
|
+
},
|
|
610
688
|
);
|
|
611
689
|
recordEvidence({
|
|
612
690
|
timestamp: new Date().toISOString(),
|
|
@@ -624,6 +702,12 @@ export async function evolveBody(
|
|
|
624
702
|
gates_total: lastValidation.gates_total,
|
|
625
703
|
gate_results: lastValidation.gate_results,
|
|
626
704
|
regressions: lastValidation.regressions,
|
|
705
|
+
before_pass_rate: lastValidation.before_pass_rate,
|
|
706
|
+
after_pass_rate: lastValidation.after_pass_rate,
|
|
707
|
+
validation_mode: lastValidation.validation_mode,
|
|
708
|
+
validation_agent: lastValidation.validation_agent,
|
|
709
|
+
validation_fixture_id: lastValidation.validation_fixture_id,
|
|
710
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
627
711
|
},
|
|
628
712
|
});
|
|
629
713
|
|