selftune 0.2.19 → 0.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/local-dashboard/dist/assets/{index-DnhnXQm6.js → index-D8O-RG1I.js} +2 -2
- package/apps/local-dashboard/dist/index.html +1 -1
- package/cli/selftune/dashboard-contract.ts +4 -0
- package/cli/selftune/eval/family-overlap.ts +320 -1
- package/cli/selftune/evolution/evidence.ts +5 -0
- package/cli/selftune/evolution/evolve-body.ts +62 -2
- package/cli/selftune/evolution/evolve.ts +58 -1
- package/cli/selftune/evolution/validate-body.ts +10 -0
- package/cli/selftune/evolution/validate-host-replay.ts +236 -0
- package/cli/selftune/evolution/validate-proposal.ts +10 -0
- package/cli/selftune/evolution/validate-routing.ts +112 -5
- package/cli/selftune/localdb/direct-write.ts +8 -3
- package/cli/selftune/localdb/materialize.ts +7 -2
- package/cli/selftune/localdb/queries.ts +11 -1
- package/cli/selftune/localdb/schema.ts +10 -1
- package/cli/selftune/routes/skill-report.ts +6 -1
- package/cli/selftune/types.ts +54 -0
- package/cli/selftune/utils/text-similarity.ts +73 -0
- package/package.json +1 -1
- package/packages/ui/src/components/EvidenceViewer.tsx +85 -2
- package/packages/ui/src/components/EvolutionTimeline.tsx +23 -1
- package/packages/ui/src/types.ts +4 -0
- package/skill/Workflows/Composability.md +15 -1
- package/skill/Workflows/Evolve.md +23 -0
|
@@ -43,7 +43,7 @@ import { createEvolveTUI } from "../utils/tui.js";
|
|
|
43
43
|
import { appendAuditEntry } from "./audit.js";
|
|
44
44
|
import { checkConstitution } from "./constitutional.js";
|
|
45
45
|
import { scoreDescription } from "./description-quality.js";
|
|
46
|
-
import { appendEvidenceEntry } from "./evidence.js";
|
|
46
|
+
import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
|
|
47
47
|
import { extractFailurePatterns } from "./extract-patterns.js";
|
|
48
48
|
import {
|
|
49
49
|
computeInvocationScores,
|
|
@@ -139,6 +139,10 @@ function createAuditEntry(
|
|
|
139
139
|
evalSnapshot?: EvalPassRate,
|
|
140
140
|
skillName?: string,
|
|
141
141
|
iterationsUsed?: number,
|
|
142
|
+
provenance?: Pick<
|
|
143
|
+
EvolutionAuditEntry,
|
|
144
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
145
|
+
>,
|
|
142
146
|
): EvolutionAuditEntry {
|
|
143
147
|
return {
|
|
144
148
|
timestamp: new Date().toISOString(),
|
|
@@ -148,6 +152,14 @@ function createAuditEntry(
|
|
|
148
152
|
...(skillName ? { skill_name: skillName } : {}),
|
|
149
153
|
...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
|
|
150
154
|
...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
|
|
155
|
+
...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
|
|
156
|
+
...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
|
|
157
|
+
...(provenance?.validation_fixture_id
|
|
158
|
+
? { validation_fixture_id: provenance.validation_fixture_id }
|
|
159
|
+
: {}),
|
|
160
|
+
...(provenance?.validation_evidence_ref
|
|
161
|
+
? { validation_evidence_ref: provenance.validation_evidence_ref }
|
|
162
|
+
: {}),
|
|
151
163
|
};
|
|
152
164
|
}
|
|
153
165
|
|
|
@@ -289,6 +301,10 @@ export async function evolve(
|
|
|
289
301
|
details: string,
|
|
290
302
|
evalSnapshot?: EvalPassRate,
|
|
291
303
|
iterationsUsed?: number,
|
|
304
|
+
provenance?: Pick<
|
|
305
|
+
EvolutionAuditEntry,
|
|
306
|
+
"validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
|
|
307
|
+
>,
|
|
292
308
|
): void {
|
|
293
309
|
const entry = createAuditEntry(
|
|
294
310
|
proposalId,
|
|
@@ -297,6 +313,7 @@ export async function evolve(
|
|
|
297
313
|
evalSnapshot,
|
|
298
314
|
skillName,
|
|
299
315
|
iterationsUsed,
|
|
316
|
+
provenance,
|
|
300
317
|
);
|
|
301
318
|
auditEntries.push(entry);
|
|
302
319
|
try {
|
|
@@ -637,10 +654,18 @@ export async function evolve(
|
|
|
637
654
|
options.validationModel,
|
|
638
655
|
);
|
|
639
656
|
llmCallCount += countValidationLlmCalls(evalSet.length);
|
|
657
|
+
const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
640
658
|
recordAudit(
|
|
641
659
|
proposal.proposal_id,
|
|
642
660
|
"validated",
|
|
643
661
|
`Pareto validation: improved=${validation.improved}`,
|
|
662
|
+
undefined,
|
|
663
|
+
undefined,
|
|
664
|
+
{
|
|
665
|
+
validation_mode: validation.validation_mode,
|
|
666
|
+
validation_agent: validation.validation_agent,
|
|
667
|
+
validation_evidence_ref: evidenceRef,
|
|
668
|
+
},
|
|
644
669
|
);
|
|
645
670
|
recordEvidence({
|
|
646
671
|
timestamp: new Date().toISOString(),
|
|
@@ -660,6 +685,9 @@ export async function evolve(
|
|
|
660
685
|
regressions: validation.regressions,
|
|
661
686
|
new_passes: validation.new_passes,
|
|
662
687
|
per_entry_results: validation.per_entry_results,
|
|
688
|
+
validation_mode: validation.validation_mode,
|
|
689
|
+
validation_agent: validation.validation_agent,
|
|
690
|
+
validation_evidence_ref: evidenceRef,
|
|
663
691
|
},
|
|
664
692
|
});
|
|
665
693
|
|
|
@@ -866,11 +894,18 @@ export async function evolve(
|
|
|
866
894
|
failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
|
|
867
895
|
pass_rate: validation.after_pass_rate,
|
|
868
896
|
};
|
|
897
|
+
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
869
898
|
recordAudit(
|
|
870
899
|
proposal.proposal_id,
|
|
871
900
|
"validated",
|
|
872
901
|
`Validation complete: improved=${validation.improved}`,
|
|
873
902
|
evalSnapshot,
|
|
903
|
+
undefined,
|
|
904
|
+
{
|
|
905
|
+
validation_mode: validation.validation_mode,
|
|
906
|
+
validation_agent: validation.validation_agent,
|
|
907
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
908
|
+
},
|
|
874
909
|
);
|
|
875
910
|
recordEvidence({
|
|
876
911
|
timestamp: new Date().toISOString(),
|
|
@@ -890,6 +925,9 @@ export async function evolve(
|
|
|
890
925
|
regressions: validation.regressions,
|
|
891
926
|
new_passes: validation.new_passes,
|
|
892
927
|
per_entry_results: validation.per_entry_results,
|
|
928
|
+
validation_mode: validation.validation_mode,
|
|
929
|
+
validation_agent: validation.validation_agent,
|
|
930
|
+
validation_evidence_ref: validatedEvidenceRef,
|
|
893
931
|
},
|
|
894
932
|
});
|
|
895
933
|
|
|
@@ -906,10 +944,18 @@ export async function evolve(
|
|
|
906
944
|
|
|
907
945
|
if (!validation.improved) {
|
|
908
946
|
feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
|
|
947
|
+
const rejectedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "rejected");
|
|
909
948
|
recordAudit(
|
|
910
949
|
proposal.proposal_id,
|
|
911
950
|
"rejected",
|
|
912
951
|
`Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
|
|
952
|
+
undefined,
|
|
953
|
+
undefined,
|
|
954
|
+
{
|
|
955
|
+
validation_mode: validation.validation_mode,
|
|
956
|
+
validation_agent: validation.validation_agent,
|
|
957
|
+
validation_evidence_ref: rejectedEvidenceRef,
|
|
958
|
+
},
|
|
913
959
|
);
|
|
914
960
|
recordEvidence({
|
|
915
961
|
timestamp: new Date().toISOString(),
|
|
@@ -929,6 +975,9 @@ export async function evolve(
|
|
|
929
975
|
regressions: validation.regressions,
|
|
930
976
|
new_passes: validation.new_passes,
|
|
931
977
|
per_entry_results: validation.per_entry_results,
|
|
978
|
+
validation_mode: validation.validation_mode,
|
|
979
|
+
validation_agent: validation.validation_agent,
|
|
980
|
+
validation_evidence_ref: rejectedEvidenceRef,
|
|
932
981
|
},
|
|
933
982
|
});
|
|
934
983
|
|
|
@@ -1138,6 +1187,11 @@ export async function evolve(
|
|
|
1138
1187
|
pass_rate: lastValidation.after_pass_rate,
|
|
1139
1188
|
},
|
|
1140
1189
|
iterationsCompleted,
|
|
1190
|
+
{
|
|
1191
|
+
validation_mode: lastValidation.validation_mode,
|
|
1192
|
+
validation_agent: lastValidation.validation_agent,
|
|
1193
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
1194
|
+
},
|
|
1141
1195
|
);
|
|
1142
1196
|
recordEvidence({
|
|
1143
1197
|
timestamp: new Date().toISOString(),
|
|
@@ -1157,6 +1211,9 @@ export async function evolve(
|
|
|
1157
1211
|
regressions: lastValidation.regressions,
|
|
1158
1212
|
new_passes: lastValidation.new_passes,
|
|
1159
1213
|
per_entry_results: lastValidation.per_entry_results,
|
|
1214
|
+
validation_mode: lastValidation.validation_mode,
|
|
1215
|
+
validation_agent: lastValidation.validation_agent,
|
|
1216
|
+
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
1160
1217
|
},
|
|
1161
1218
|
});
|
|
1162
1219
|
}
|
|
@@ -209,6 +209,8 @@ export async function validateBodyProposal(
|
|
|
209
209
|
gate_results: gateResults,
|
|
210
210
|
improved: false,
|
|
211
211
|
regressions: [],
|
|
212
|
+
validation_mode: "structural_guard",
|
|
213
|
+
validation_agent: agent,
|
|
212
214
|
};
|
|
213
215
|
}
|
|
214
216
|
|
|
@@ -250,5 +252,13 @@ export async function validateBodyProposal(
|
|
|
250
252
|
gate_results: gateResults,
|
|
251
253
|
improved: gatesPassed === 3,
|
|
252
254
|
regressions: accuracy.regressions,
|
|
255
|
+
validation_mode: "llm_judge",
|
|
256
|
+
validation_agent: agent,
|
|
257
|
+
...(evalSet.length > 0
|
|
258
|
+
? {
|
|
259
|
+
before_pass_rate: accuracy.before_pass_rate,
|
|
260
|
+
after_pass_rate: accuracy.after_pass_rate,
|
|
261
|
+
}
|
|
262
|
+
: {}),
|
|
253
263
|
};
|
|
254
264
|
}
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
|
|
2
|
+
import { basename, dirname, join } from "node:path";
|
|
3
|
+
|
|
4
|
+
import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
|
|
5
|
+
import { parseFrontmatter } from "../utils/frontmatter.js";
|
|
6
|
+
import { containsWholeSkillMention } from "../utils/skill-discovery.js";
|
|
7
|
+
import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
|
|
8
|
+
import {
|
|
9
|
+
extractWhenToUseLines,
|
|
10
|
+
jaccardSimilarity,
|
|
11
|
+
tokenizeText,
|
|
12
|
+
} from "../utils/text-similarity.js";
|
|
13
|
+
|
|
14
|
+
/**
 * Tokenized view of one skill's SKILL.md as consumed by the replay scorer.
 * Instances are produced by `loadReplaySkillSurface`.
 */
interface ReplaySkillSurface {
  /** Trimmed frontmatter name, or the skill directory name when that is empty. */
  skillName: string;
  /** Tokens derived from the frontmatter description. */
  descriptionTokens: Set<string>;
  /** Tokens derived from the "when to use" lines extracted from the body. */
  whenToUseTokens: Set<string>;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Score floor for host replay. A query whose best target score (routing
 * trigger phrases or skill surface, whichever is higher) falls below this
 * value is reported as not triggered.
 *
 * The value is a tuning trade-off: high enough to drop weak, accidental
 * overlaps, low enough that short routing phrases and sparse skill surfaces
 * can still match.
 */
const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
|
|
26
|
+
|
|
27
|
+
/**
 * Canonicalize a path through `realpathSync`, best-effort.
 *
 * @param path - Path to canonicalize.
 * @returns The resolved real path, or `path` unchanged when resolution throws
 *   (for example because the path does not exist).
 */
function resolveReplayPath(path: string): string {
  let resolved = path;
  try {
    resolved = realpathSync(path);
  } catch {
    // Resolution failed; fall through and hand back the caller's path as-is.
  }
  return resolved;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Collect the SKILL.md files of sibling skills that live next to the target
 * skill in the same registry directory.
 *
 * The registry is taken to be the grandparent directory of the (resolved)
 * target SKILL.md. Every other directory entry that contains a `SKILL.md`
 * contributes its resolved path.
 *
 * Entries whose resolved SKILL.md is the target itself (e.g. a symlinked alias
 * of the target directory) are skipped, and duplicate resolved paths are
 * collapsed. Without this, the target would compete against a copy of itself
 * and lose every tie on surface score.
 *
 * @param targetSkillPath - Path to the target skill's SKILL.md.
 * @returns Resolved competitor SKILL.md paths, sorted with `localeCompare`.
 *   Empty when the registry directory cannot be read.
 */
function listCompetingSkillPaths(targetSkillPath: string): string[] {
  const normalizedTargetPath = resolveReplayPath(targetSkillPath);
  const targetSkillDir = dirname(normalizedTargetPath);
  const registryDir = dirname(targetSkillDir);
  const targetDirName = basename(targetSkillDir);
  // A Set de-duplicates registry entries that resolve to the same file.
  const competingPaths = new Set<string>();

  try {
    for (const entry of readdirSync(registryDir)) {
      if (entry === targetDirName) continue;
      const candidateDir = join(registryDir, entry);
      try {
        if (!statSync(candidateDir).isDirectory()) continue;
      } catch {
        // Broken symlink or entry that vanished mid-scan: not a candidate.
        continue;
      }

      const candidateSkillPath = join(candidateDir, "SKILL.md");
      if (!existsSync(candidateSkillPath)) continue;

      const resolvedCandidate = resolveReplayPath(candidateSkillPath);
      // An alias of the target is not a competitor.
      if (resolvedCandidate === normalizedTargetPath) continue;
      competingPaths.add(resolvedCandidate);
    }
  } catch {
    // Ignore unreadable registries and treat the fixture as target-only.
  }

  return Array.from(competingPaths).sort((a, b) => a.localeCompare(b));
}
|
|
62
|
+
|
|
63
|
+
/**
 * Assemble a routing replay fixture for one skill.
 *
 * The skill path is canonicalized first. Sibling skills in the same registry
 * directory become the competing paths.
 *
 * @param options.skillName - Name recorded as the fixture's target skill.
 * @param options.skillPath - Path to the target SKILL.md.
 * @param options.platform - Host platform; `"claude_code"` when omitted.
 * @param options.fixtureId - Explicit id; otherwise `auto-<platform>-<skillName>`.
 * @param options.workspaceRoot - Explicit workspace root; otherwise looked up
 *   with `findGitRepositoryRoot` starting from the registry directory.
 * @returns The fixture. `workspace_root` is present only when a truthy root
 *   was supplied or found.
 */
export function buildRoutingReplayFixture(options: {
  skillName: string;
  skillPath: string;
  platform?: RoutingReplayFixture["platform"];
  fixtureId?: string;
  workspaceRoot?: string;
}): RoutingReplayFixture {
  const { skillName, skillPath, fixtureId } = options;
  const targetSkillPath = resolveReplayPath(skillPath);
  // Only search for a git root when the caller did not pin one.
  const workspaceRoot =
    options.workspaceRoot ?? findGitRepositoryRoot(dirname(dirname(targetSkillPath)));
  const platform = options.platform ?? "claude_code";
  const resolvedFixtureId = fixtureId ?? `auto-${platform}-${skillName}`;
  const workspaceFields = workspaceRoot ? { workspace_root: workspaceRoot } : {};

  return {
    fixture_id: resolvedFixtureId,
    platform,
    target_skill_name: skillName,
    target_skill_path: targetSkillPath,
    competing_skill_paths: listCompetingSkillPaths(targetSkillPath),
    ...workspaceFields,
  };
}
|
|
84
|
+
|
|
85
|
+
/**
 * Read a SKILL.md and turn it into the token sets used for replay scoring.
 *
 * @param skillPath - Path to the skill's SKILL.md.
 * @returns The skill surface. If reading or parsing throws at any point, the
 *   result is named after the skill's directory (or `"unknown-skill"`) and
 *   both token sets are empty.
 */
function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
  const directoryName = basename(dirname(skillPath));
  const fallbackName = directoryName || "unknown-skill";

  try {
    const parsed = parseFrontmatter(readFileSync(skillPath, "utf8"));
    const trimmedName = parsed.name.trim();
    const descriptionTokens = tokenizeText(parsed.description);
    const whenToUseText = extractWhenToUseLines(parsed.body).join(" ");
    return {
      skillName: trimmedName || fallbackName,
      descriptionTokens,
      whenToUseTokens: tokenizeText(whenToUseText),
    };
  } catch {
    // Unreadable or unparsable skill: keep it in the fixture with no tokens.
    return {
      skillName: fallbackName,
      descriptionTokens: new Set<string>(),
      whenToUseTokens: new Set<string>(),
    };
  }
}
|
|
103
|
+
|
|
104
|
+
/**
 * Pull trigger phrases out of the first column of a markdown routing table.
 *
 * Blank lines are ignored. Data rows are the pipe-delimited rows after the
 * table's delimiter row (`|---|---|`). When no delimiter row is recognised,
 * the first two non-blank lines are assumed to be header + delimiter, which is
 * the plain-table layout. Locating the delimiter row keeps a heading or
 * preamble above the table from shifting the header/delimiter rows into the
 * data, where they would otherwise be harvested as phrases such as `---`.
 *
 * Each trigger cell is split on `,`, `/`, and the word ` or `; one surrounding
 * quote or backtick is stripped from each end; fragments shorter than three
 * characters are dropped.
 *
 * @param routing - Markdown text containing the routing table.
 * @returns Trigger phrases in table order; empty when fewer than three
 *   non-blank lines are present.
 */
function extractRoutingTriggerPhrases(routing: string): string[] {
  const lines = routing
    .trim()
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean);
  if (lines.length < 3) return [];

  // Delimiter row: every cell is dashes with optional alignment colons.
  const isDelimiterRow = (row: string): boolean => /^\|(?:\s*:?-+:?\s*\|)+$/.test(row);

  // A delimiter at index 1 is the plain layout; never start earlier than index 2
  // so plain tables behave exactly as before.
  const delimiterIndex = lines.findIndex(isDelimiterRow);
  const firstDataIndex = delimiterIndex >= 1 ? delimiterIndex + 1 : 2;

  const phrases: string[] = [];
  for (const row of lines.slice(firstDataIndex)) {
    if (!row.startsWith("|") || !row.endsWith("|")) continue;
    if (isDelimiterRow(row)) continue;
    const cells = row.split("|").map((cell) => cell.trim());
    // cells[0] is the empty string before the leading pipe.
    const triggerCell = cells[1];
    if (!triggerCell) continue;
    for (const part of triggerCell.split(/,|\/| or /i)) {
      const phrase = part.trim().replace(/^["'`]|["'`]$/g, "");
      if (phrase.length >= 3) phrases.push(phrase);
    }
  }
  return phrases;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Score a query against routing trigger phrases and keep the best score.
 *
 * A phrase that occurs in the query as a case-insensitive substring scores 1.
 * Any other phrase scores the Jaccard similarity between the query's tokens
 * and the phrase's tokens.
 *
 * @param query - User query text.
 * @param triggerPhrases - Phrases extracted from the routing table.
 * @returns The highest per-phrase score, or 0 when there are no phrases.
 */
function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
  const loweredQuery = query.toLowerCase();
  const queryTokens = tokenizeText(query);

  return triggerPhrases.reduce((bestSoFar, phrase) => {
    const phraseScore = loweredQuery.includes(phrase.toLowerCase())
      ? 1
      : jaccardSimilarity(queryTokens, tokenizeText(phrase));
    return Math.max(bestSoFar, phraseScore);
  }, 0);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Score a query against one skill surface.
 *
 * @param query - User query text.
 * @param surface - Tokenized skill surface to compare against.
 * @returns The larger of the Jaccard similarities between the query tokens and
 *   (a) the description tokens, (b) the when-to-use tokens.
 */
function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
  const queryTokens = tokenizeText(query);
  const descriptionScore = jaccardSimilarity(queryTokens, surface.descriptionTokens);
  const whenToUseScore = jaccardSimilarity(queryTokens, surface.whenToUseTokens);
  return Math.max(descriptionScore, whenToUseScore);
}
|
|
148
|
+
|
|
149
|
+
/**
 * Decide whether a query would route to the target skill in the replay fixture.
 *
 * Checks run in this order, and the first one that applies wins:
 *  1. The query mentions the target skill by name: triggered.
 *  2. The query mentions a competing skill by name: not triggered.
 *  3. The target score (best of routing trigger phrases and target surface)
 *     is below `HOST_REPLAY_MATCH_THRESHOLD`: not triggered.
 *  4. The best competing surface scores at least as high as the target
 *     (ties go to the competitor): not triggered.
 *  5. Otherwise: triggered. The evidence says whether routing language or the
 *     skill surface carried the match.
 *
 * @param query - Raw eval query; trimmed before use.
 * @param routing - Routing table markdown being evaluated.
 * @param targetSurface - Surface of the skill under evaluation.
 * @param competingSurfaces - Surfaces of the sibling skills in the fixture.
 * @returns The trigger decision and a human-readable reason for it.
 */
function evaluateReplayTrigger(
  query: string,
  routing: string,
  targetSurface: ReplaySkillSurface,
  competingSurfaces: ReplaySkillSurface[],
): { triggered: boolean; evidence: string } {
  const trimmedQuery = query.trim();

  // Explicit name mentions short-circuit all scoring; the target is checked first.
  if (containsWholeSkillMention(trimmedQuery, targetSurface.skillName)) {
    return {
      triggered: true,
      evidence: `explicit target mention: ${targetSurface.skillName}`,
    };
  }

  const mentionedCompetitor = competingSurfaces.find((surface) =>
    containsWholeSkillMention(trimmedQuery, surface.skillName),
  );
  if (mentionedCompetitor) {
    return {
      triggered: false,
      evidence: `explicit competing skill mention: ${mentionedCompetitor.skillName}`,
    };
  }

  const triggerScore = scoreQueryAgainstTriggerPhrases(
    trimmedQuery,
    extractRoutingTriggerPhrases(routing),
  );
  const targetSurfaceScore = scoreQueryAgainstSkillSurface(trimmedQuery, targetSurface);
  const targetScore = Math.max(triggerScore, targetSurfaceScore);

  // Highest-scoring competitor; undefined when the fixture has no competitors.
  const rankedCompetitors = competingSurfaces
    .map((surface) => ({
      skillName: surface.skillName,
      score: scoreQueryAgainstSkillSurface(trimmedQuery, surface),
    }))
    .sort((a, b) => b.score - a.score);
  const bestCompetitor = rankedCompetitors[0];

  if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
    return {
      triggered: false,
      evidence: "target routing and skill surface did not clear replay threshold",
    };
  }

  if (bestCompetitor && bestCompetitor.score >= targetScore) {
    return {
      triggered: false,
      evidence: `competing skill surface scored higher: ${bestCompetitor.skillName}`,
    };
  }

  let evidence = "query aligned with target skill surface in replay fixture";
  if (triggerScore >= targetSurfaceScore) {
    evidence =
      triggerScore === 1
        ? "query matched a routing trigger phrase exactly"
        : "query aligned with routing trigger language";
  }
  return { triggered: true, evidence };
}
|
|
212
|
+
|
|
213
|
+
/**
 * Replay every eval entry against a routing table using a fixture.
 *
 * The target and competing skill surfaces are loaded once from the fixture's
 * paths, then each query is evaluated against them.
 *
 * @param options.routing - Routing table markdown to evaluate.
 * @param options.evalSet - Eval entries (query plus expected trigger outcome).
 * @param options.fixture - Fixture naming the target and competing SKILL.md paths.
 * @returns One result per eval entry, in input order. `passed` is true when
 *   the replay decision equals the entry's `should_trigger`.
 */
export function runHostReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
}): RoutingReplayEntryResult[] {
  const { routing, evalSet, fixture } = options;
  const targetSurface = loadReplaySkillSurface(fixture.target_skill_path);
  const competingSurfaces = fixture.competing_skill_paths.map((skillPath) =>
    loadReplaySkillSurface(skillPath),
  );

  const results: RoutingReplayEntryResult[] = [];
  for (const entry of evalSet) {
    const { triggered, evidence } = evaluateReplayTrigger(
      entry.query,
      routing,
      targetSurface,
      competingSurfaces,
    );
    results.push({
      query: entry.query,
      should_trigger: entry.should_trigger,
      triggered,
      passed: triggered === entry.should_trigger,
      evidence,
    });
  }
  return results;
}
|
|
@@ -40,6 +40,8 @@ export interface ValidationResult {
|
|
|
40
40
|
net_change: number; // after - before pass rate
|
|
41
41
|
by_invocation_type?: InvocationTypeScores;
|
|
42
42
|
per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
|
|
43
|
+
validation_mode?: "llm_judge";
|
|
44
|
+
validation_agent?: string;
|
|
43
45
|
}
|
|
44
46
|
|
|
45
47
|
// ---------------------------------------------------------------------------
|
|
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
|
|
|
63
65
|
regressions: [],
|
|
64
66
|
new_passes: [],
|
|
65
67
|
net_change: 0,
|
|
68
|
+
validation_mode: "llm_judge",
|
|
69
|
+
validation_agent: agent,
|
|
66
70
|
};
|
|
67
71
|
}
|
|
68
72
|
|
|
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
|
|
|
174
178
|
net_change: netChange,
|
|
175
179
|
by_invocation_type: invocationScores,
|
|
176
180
|
per_entry_results: perEntryResults,
|
|
181
|
+
validation_mode: "llm_judge",
|
|
182
|
+
validation_agent: agent,
|
|
177
183
|
};
|
|
178
184
|
}
|
|
179
185
|
|
|
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
|
|
|
220
226
|
regressions: [],
|
|
221
227
|
new_passes: [],
|
|
222
228
|
net_change: 0,
|
|
229
|
+
validation_mode: "llm_judge",
|
|
230
|
+
validation_agent: agent,
|
|
223
231
|
};
|
|
224
232
|
}
|
|
225
233
|
|
|
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
|
|
|
342
350
|
net_change: netChange,
|
|
343
351
|
by_invocation_type: invocationScores,
|
|
344
352
|
per_entry_results: perEntryResults,
|
|
353
|
+
validation_mode: "llm_judge",
|
|
354
|
+
validation_agent: agent,
|
|
345
355
|
};
|
|
346
356
|
}
|
|
347
357
|
|
|
@@ -5,9 +5,43 @@
|
|
|
5
5
|
* and running trigger accuracy checks against an eval set.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import type {
|
|
8
|
+
import type {
|
|
9
|
+
BodyEvolutionProposal,
|
|
10
|
+
BodyValidationResult,
|
|
11
|
+
EvalEntry,
|
|
12
|
+
RoutingReplayEntryResult,
|
|
13
|
+
RoutingReplayFixture,
|
|
14
|
+
ValidationMode,
|
|
15
|
+
} from "../types.js";
|
|
9
16
|
import { callLlm } from "../utils/llm-call.js";
|
|
10
17
|
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
18
|
+
import { runHostReplayFixture } from "./validate-host-replay.js";
|
|
19
|
+
|
|
20
|
+
/** Arguments handed to a {@link RoutingReplayRunner} for one replay pass. */
export interface RoutingReplayRunnerInput {
  /** Routing table text to evaluate (the original or the proposed routing). */
  routing: string;
  /** Eval entries to replay. */
  evalSet: EvalEntry[];
  /** Agent identifier passed through from the validation call. */
  agent: string;
  /** Fixture describing the target skill and its competitors. */
  fixture: RoutingReplayFixture;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Pluggable asynchronous replay implementation. Given routing text, an eval
 * set, an agent, and a fixture, it resolves to per-entry replay results.
 */
export type RoutingReplayRunner = (
  input: RoutingReplayRunnerInput,
) => Promise<RoutingReplayEntryResult[]>;
|
|
30
|
+
|
|
31
|
+
/** Optional replay configuration for routing validation. */
export interface RoutingValidationOptions {
  /** Fixture to replay against. When absent, replay validation is not used. */
  replayFixture?: RoutingReplayFixture;
  /** Custom replay implementation; only consulted when `replayFixture` is also set. */
  replayRunner?: RoutingReplayRunner;
}
|
|
35
|
+
|
|
36
|
+
/** Outcome of comparing trigger accuracy before and after a routing change. */
export interface RoutingTriggerAccuracyResult {
  /** Pass rate of the original routing over the eval set. */
  before_pass_rate: number;
  /** Pass rate of the proposed routing over the eval set. */
  after_pass_rate: number;
  /** Whether the proposed routing passed more entries than the original. */
  improved: boolean;
  /** Which validation strategy produced these numbers. */
  validation_mode: ValidationMode;
  /** Agent identifier the validation was run for. */
  validation_agent: string;
  /** Id of the replay fixture, when a fixture was used. */
  validation_fixture_id?: string;
  /** Per-entry replay results, when available. */
  per_entry_results?: RoutingReplayEntryResult[];
}
|
|
11
45
|
|
|
12
46
|
// ---------------------------------------------------------------------------
|
|
13
47
|
// Structural validation
|
|
@@ -77,9 +111,70 @@ export async function validateRoutingTriggerAccuracy(
|
|
|
77
111
|
evalSet: EvalEntry[],
|
|
78
112
|
agent: string,
|
|
79
113
|
modelFlag?: string,
|
|
80
|
-
|
|
114
|
+
options: RoutingValidationOptions = {},
|
|
115
|
+
): Promise<RoutingTriggerAccuracyResult> {
|
|
81
116
|
if (evalSet.length === 0) {
|
|
82
|
-
return {
|
|
117
|
+
return {
|
|
118
|
+
before_pass_rate: 0,
|
|
119
|
+
after_pass_rate: 0,
|
|
120
|
+
improved: false,
|
|
121
|
+
validation_mode: "structural_guard",
|
|
122
|
+
validation_agent: agent,
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
if (options.replayFixture && options.replayRunner) {
|
|
127
|
+
const beforeResults = await options.replayRunner({
|
|
128
|
+
routing: originalRouting,
|
|
129
|
+
evalSet,
|
|
130
|
+
agent,
|
|
131
|
+
fixture: options.replayFixture,
|
|
132
|
+
});
|
|
133
|
+
const afterResults = await options.replayRunner({
|
|
134
|
+
routing: proposedRouting,
|
|
135
|
+
evalSet,
|
|
136
|
+
agent,
|
|
137
|
+
fixture: options.replayFixture,
|
|
138
|
+
});
|
|
139
|
+
const beforePassed = beforeResults.filter((result) => result.passed).length;
|
|
140
|
+
const afterPassed = afterResults.filter((result) => result.passed).length;
|
|
141
|
+
const total = evalSet.length;
|
|
142
|
+
|
|
143
|
+
return {
|
|
144
|
+
before_pass_rate: beforePassed / total,
|
|
145
|
+
after_pass_rate: afterPassed / total,
|
|
146
|
+
improved: afterPassed > beforePassed,
|
|
147
|
+
validation_mode: "host_replay",
|
|
148
|
+
validation_agent: agent,
|
|
149
|
+
validation_fixture_id: options.replayFixture.fixture_id,
|
|
150
|
+
per_entry_results: afterResults,
|
|
151
|
+
};
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
if (options.replayFixture) {
|
|
155
|
+
const beforeResults = runHostReplayFixture({
|
|
156
|
+
routing: originalRouting,
|
|
157
|
+
evalSet,
|
|
158
|
+
fixture: options.replayFixture,
|
|
159
|
+
});
|
|
160
|
+
const afterResults = runHostReplayFixture({
|
|
161
|
+
routing: proposedRouting,
|
|
162
|
+
evalSet,
|
|
163
|
+
fixture: options.replayFixture,
|
|
164
|
+
});
|
|
165
|
+
const beforePassed = beforeResults.filter((result) => result.passed).length;
|
|
166
|
+
const afterPassed = afterResults.filter((result) => result.passed).length;
|
|
167
|
+
const total = evalSet.length;
|
|
168
|
+
|
|
169
|
+
return {
|
|
170
|
+
before_pass_rate: beforePassed / total,
|
|
171
|
+
after_pass_rate: afterPassed / total,
|
|
172
|
+
improved: afterPassed > beforePassed,
|
|
173
|
+
validation_mode: "host_replay",
|
|
174
|
+
validation_agent: agent,
|
|
175
|
+
validation_fixture_id: options.replayFixture.fixture_id,
|
|
176
|
+
per_entry_results: afterResults,
|
|
177
|
+
};
|
|
83
178
|
}
|
|
84
179
|
|
|
85
180
|
const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
|
|
@@ -113,6 +208,8 @@ export async function validateRoutingTriggerAccuracy(
|
|
|
113
208
|
before_pass_rate: beforePassRate,
|
|
114
209
|
after_pass_rate: afterPassRate,
|
|
115
210
|
improved: afterPassRate > beforePassRate,
|
|
211
|
+
validation_mode: "llm_judge",
|
|
212
|
+
validation_agent: agent,
|
|
116
213
|
};
|
|
117
214
|
}
|
|
118
215
|
|
|
@@ -126,6 +223,7 @@ export async function validateRoutingProposal(
|
|
|
126
223
|
evalSet: EvalEntry[],
|
|
127
224
|
agent: string,
|
|
128
225
|
modelFlag?: string,
|
|
226
|
+
options: RoutingValidationOptions = {},
|
|
129
227
|
): Promise<BodyValidationResult> {
|
|
130
228
|
const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
|
|
131
229
|
|
|
@@ -145,6 +243,8 @@ export async function validateRoutingProposal(
|
|
|
145
243
|
gate_results: gateResults,
|
|
146
244
|
improved: false,
|
|
147
245
|
regressions: [],
|
|
246
|
+
validation_mode: "structural_guard",
|
|
247
|
+
validation_agent: agent,
|
|
148
248
|
};
|
|
149
249
|
}
|
|
150
250
|
|
|
@@ -155,13 +255,14 @@ export async function validateRoutingProposal(
|
|
|
155
255
|
evalSet,
|
|
156
256
|
agent,
|
|
157
257
|
modelFlag,
|
|
258
|
+
options,
|
|
158
259
|
);
|
|
159
260
|
gateResults.push({
|
|
160
261
|
gate: "trigger_accuracy",
|
|
161
262
|
passed: accuracy.improved,
|
|
162
263
|
reason: accuracy.improved
|
|
163
|
-
? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
164
|
-
: `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
264
|
+
? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
265
|
+
: `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
165
266
|
});
|
|
166
267
|
|
|
167
268
|
const gatesPassed = gateResults.filter((g) => g.passed).length;
|
|
@@ -173,5 +274,11 @@ export async function validateRoutingProposal(
|
|
|
173
274
|
gate_results: gateResults,
|
|
174
275
|
improved: gatesPassed === 2,
|
|
175
276
|
regressions: [],
|
|
277
|
+
validation_mode: accuracy.validation_mode,
|
|
278
|
+
validation_agent: accuracy.validation_agent,
|
|
279
|
+
validation_fixture_id: accuracy.validation_fixture_id,
|
|
280
|
+
before_pass_rate: accuracy.before_pass_rate,
|
|
281
|
+
after_pass_rate: accuracy.after_pass_rate,
|
|
282
|
+
per_entry_results: accuracy.per_entry_results,
|
|
176
283
|
};
|
|
177
284
|
}
|
|
@@ -285,11 +285,12 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean {
|
|
|
285
285
|
return safeWrite("evolution-audit", (db) => {
|
|
286
286
|
getStmt(
|
|
287
287
|
db,
|
|
288
|
-
"evolution-audit-
|
|
288
|
+
"evolution-audit-v3",
|
|
289
289
|
`
|
|
290
290
|
INSERT OR IGNORE INTO evolution_audit
|
|
291
|
-
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used
|
|
292
|
-
|
|
291
|
+
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used,
|
|
292
|
+
validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref)
|
|
293
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
293
294
|
`,
|
|
294
295
|
).run(
|
|
295
296
|
record.timestamp,
|
|
@@ -299,6 +300,10 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean {
|
|
|
299
300
|
record.details,
|
|
300
301
|
record.eval_snapshot ? JSON.stringify(record.eval_snapshot) : null,
|
|
301
302
|
record.iterations_used ?? null,
|
|
303
|
+
record.validation_mode ?? null,
|
|
304
|
+
record.validation_agent ?? null,
|
|
305
|
+
record.validation_fixture_id ?? null,
|
|
306
|
+
record.validation_evidence_ref ?? null,
|
|
302
307
|
);
|
|
303
308
|
});
|
|
304
309
|
}
|
|
@@ -600,8 +600,9 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num
|
|
|
600
600
|
// (idx_evo_audit_dedup defined in schema.ts).
|
|
601
601
|
const stmt = db.prepare(`
|
|
602
602
|
INSERT OR IGNORE INTO evolution_audit
|
|
603
|
-
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used
|
|
604
|
-
|
|
603
|
+
(timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used,
|
|
604
|
+
validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref)
|
|
605
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
605
606
|
`);
|
|
606
607
|
|
|
607
608
|
let count = 0;
|
|
@@ -614,6 +615,10 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num
|
|
|
614
615
|
r.details,
|
|
615
616
|
r.eval_snapshot ? JSON.stringify(r.eval_snapshot) : null,
|
|
616
617
|
r.iterations_used ?? null,
|
|
618
|
+
r.validation_mode ?? null,
|
|
619
|
+
r.validation_agent ?? null,
|
|
620
|
+
r.validation_fixture_id ?? null,
|
|
621
|
+
r.validation_evidence_ref ?? null,
|
|
617
622
|
);
|
|
618
623
|
count++;
|
|
619
624
|
}
|