selftune 0.2.19 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,7 +43,7 @@ import { createEvolveTUI } from "../utils/tui.js";
43
43
  import { appendAuditEntry } from "./audit.js";
44
44
  import { checkConstitution } from "./constitutional.js";
45
45
  import { scoreDescription } from "./description-quality.js";
46
- import { appendEvidenceEntry } from "./evidence.js";
46
+ import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
47
47
  import { extractFailurePatterns } from "./extract-patterns.js";
48
48
  import {
49
49
  computeInvocationScores,
@@ -139,6 +139,10 @@ function createAuditEntry(
139
139
  evalSnapshot?: EvalPassRate,
140
140
  skillName?: string,
141
141
  iterationsUsed?: number,
142
+ provenance?: Pick<
143
+ EvolutionAuditEntry,
144
+ "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
145
+ >,
142
146
  ): EvolutionAuditEntry {
143
147
  return {
144
148
  timestamp: new Date().toISOString(),
@@ -148,6 +152,14 @@ function createAuditEntry(
148
152
  ...(skillName ? { skill_name: skillName } : {}),
149
153
  ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
150
154
  ...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
155
+ ...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
156
+ ...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
157
+ ...(provenance?.validation_fixture_id
158
+ ? { validation_fixture_id: provenance.validation_fixture_id }
159
+ : {}),
160
+ ...(provenance?.validation_evidence_ref
161
+ ? { validation_evidence_ref: provenance.validation_evidence_ref }
162
+ : {}),
151
163
  };
152
164
  }
153
165
 
@@ -289,6 +301,10 @@ export async function evolve(
289
301
  details: string,
290
302
  evalSnapshot?: EvalPassRate,
291
303
  iterationsUsed?: number,
304
+ provenance?: Pick<
305
+ EvolutionAuditEntry,
306
+ "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
307
+ >,
292
308
  ): void {
293
309
  const entry = createAuditEntry(
294
310
  proposalId,
@@ -297,6 +313,7 @@ export async function evolve(
297
313
  evalSnapshot,
298
314
  skillName,
299
315
  iterationsUsed,
316
+ provenance,
300
317
  );
301
318
  auditEntries.push(entry);
302
319
  try {
@@ -637,10 +654,18 @@ export async function evolve(
637
654
  options.validationModel,
638
655
  );
639
656
  llmCallCount += countValidationLlmCalls(evalSet.length);
657
+ const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
640
658
  recordAudit(
641
659
  proposal.proposal_id,
642
660
  "validated",
643
661
  `Pareto validation: improved=${validation.improved}`,
662
+ undefined,
663
+ undefined,
664
+ {
665
+ validation_mode: validation.validation_mode,
666
+ validation_agent: validation.validation_agent,
667
+ validation_evidence_ref: evidenceRef,
668
+ },
644
669
  );
645
670
  recordEvidence({
646
671
  timestamp: new Date().toISOString(),
@@ -660,6 +685,9 @@ export async function evolve(
660
685
  regressions: validation.regressions,
661
686
  new_passes: validation.new_passes,
662
687
  per_entry_results: validation.per_entry_results,
688
+ validation_mode: validation.validation_mode,
689
+ validation_agent: validation.validation_agent,
690
+ validation_evidence_ref: evidenceRef,
663
691
  },
664
692
  });
665
693
 
@@ -866,11 +894,18 @@ export async function evolve(
866
894
  failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
867
895
  pass_rate: validation.after_pass_rate,
868
896
  };
897
+ const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
869
898
  recordAudit(
870
899
  proposal.proposal_id,
871
900
  "validated",
872
901
  `Validation complete: improved=${validation.improved}`,
873
902
  evalSnapshot,
903
+ undefined,
904
+ {
905
+ validation_mode: validation.validation_mode,
906
+ validation_agent: validation.validation_agent,
907
+ validation_evidence_ref: validatedEvidenceRef,
908
+ },
874
909
  );
875
910
  recordEvidence({
876
911
  timestamp: new Date().toISOString(),
@@ -890,6 +925,9 @@ export async function evolve(
890
925
  regressions: validation.regressions,
891
926
  new_passes: validation.new_passes,
892
927
  per_entry_results: validation.per_entry_results,
928
+ validation_mode: validation.validation_mode,
929
+ validation_agent: validation.validation_agent,
930
+ validation_evidence_ref: validatedEvidenceRef,
893
931
  },
894
932
  });
895
933
 
@@ -906,10 +944,18 @@ export async function evolve(
906
944
 
907
945
  if (!validation.improved) {
908
946
  feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
947
+ const rejectedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "rejected");
909
948
  recordAudit(
910
949
  proposal.proposal_id,
911
950
  "rejected",
912
951
  `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
952
+ undefined,
953
+ undefined,
954
+ {
955
+ validation_mode: validation.validation_mode,
956
+ validation_agent: validation.validation_agent,
957
+ validation_evidence_ref: rejectedEvidenceRef,
958
+ },
913
959
  );
914
960
  recordEvidence({
915
961
  timestamp: new Date().toISOString(),
@@ -929,6 +975,9 @@ export async function evolve(
929
975
  regressions: validation.regressions,
930
976
  new_passes: validation.new_passes,
931
977
  per_entry_results: validation.per_entry_results,
978
+ validation_mode: validation.validation_mode,
979
+ validation_agent: validation.validation_agent,
980
+ validation_evidence_ref: rejectedEvidenceRef,
932
981
  },
933
982
  });
934
983
 
@@ -1138,6 +1187,11 @@ export async function evolve(
1138
1187
  pass_rate: lastValidation.after_pass_rate,
1139
1188
  },
1140
1189
  iterationsCompleted,
1190
+ {
1191
+ validation_mode: lastValidation.validation_mode,
1192
+ validation_agent: lastValidation.validation_agent,
1193
+ validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
1194
+ },
1141
1195
  );
1142
1196
  recordEvidence({
1143
1197
  timestamp: new Date().toISOString(),
@@ -1157,6 +1211,9 @@ export async function evolve(
1157
1211
  regressions: lastValidation.regressions,
1158
1212
  new_passes: lastValidation.new_passes,
1159
1213
  per_entry_results: lastValidation.per_entry_results,
1214
+ validation_mode: lastValidation.validation_mode,
1215
+ validation_agent: lastValidation.validation_agent,
1216
+ validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
1160
1217
  },
1161
1218
  });
1162
1219
  }
@@ -209,6 +209,8 @@ export async function validateBodyProposal(
209
209
  gate_results: gateResults,
210
210
  improved: false,
211
211
  regressions: [],
212
+ validation_mode: "structural_guard",
213
+ validation_agent: agent,
212
214
  };
213
215
  }
214
216
 
@@ -250,5 +252,13 @@ export async function validateBodyProposal(
250
252
  gate_results: gateResults,
251
253
  improved: gatesPassed === 3,
252
254
  regressions: accuracy.regressions,
255
+ validation_mode: "llm_judge",
256
+ validation_agent: agent,
257
+ ...(evalSet.length > 0
258
+ ? {
259
+ before_pass_rate: accuracy.before_pass_rate,
260
+ after_pass_rate: accuracy.after_pass_rate,
261
+ }
262
+ : {}),
253
263
  };
254
264
  }
@@ -0,0 +1,236 @@
1
+ import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
2
+ import { basename, dirname, join } from "node:path";
3
+
4
+ import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
5
+ import { parseFrontmatter } from "../utils/frontmatter.js";
6
+ import { containsWholeSkillMention } from "../utils/skill-discovery.js";
7
+ import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
8
+ import {
9
+ extractWhenToUseLines,
10
+ jaccardSimilarity,
11
+ tokenizeText,
12
+ } from "../utils/text-similarity.js";
13
+
14
/**
 * Tokenized view of a single skill file, used to score eval queries against
 * the target skill and its competitors during host replay.
 */
interface ReplaySkillSurface {
  /** Skill name checked against queries for explicit whole-name mentions. */
  skillName: string;
  /** Token set derived from the skill's frontmatter description. */
  descriptionTokens: Set<string>;
  /** Token set derived from the skill body's "when to use" lines. */
  whenToUseTokens: Set<string>;
}
19
+
20
+ /**
21
+ * Minimum score needed before replay treats routing text or skill-surface overlap
22
+ * as a real match. Tuned to suppress weak false positives without killing recall
23
+ * for short routing phrases and sparse skill surfaces.
24
+ */
25
+ const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
26
+
27
/**
 * Canonicalize a filesystem path for replay comparisons.
 *
 * @param path - Path to resolve.
 * @returns The result of `realpathSync(path)`, or `path` unchanged when
 *   resolution throws (for example because the path does not exist).
 */
function resolveReplayPath(path: string): string {
  let resolved = path;
  try {
    resolved = realpathSync(path);
  } catch {
    // Best effort: keep the caller's path when it cannot be resolved.
  }
  return resolved;
}
34
+
35
/**
 * List the `SKILL.md` files of sibling skills that live next to the target
 * skill in the same registry directory.
 *
 * The registry directory is the parent of the directory containing the target
 * skill file. Every other sub-directory that contains a `SKILL.md` counts as a
 * competitor.
 *
 * @param targetSkillPath - Path to the target skill's `SKILL.md`.
 * @returns Resolved competitor paths sorted with `localeCompare`; an empty
 *   array when the registry directory cannot be read.
 */
function listCompetingSkillPaths(targetSkillPath: string): string[] {
  const targetSkillDir = dirname(resolveReplayPath(targetSkillPath));
  const registryDir = dirname(targetSkillDir);
  const ownDirName = basename(targetSkillDir);

  // Entries that cannot be stat'ed are treated as "not a directory" and skipped.
  const isDirectory = (candidate: string): boolean => {
    try {
      return statSync(candidate).isDirectory();
    } catch {
      return false;
    }
  };

  let registryEntries: string[];
  try {
    registryEntries = readdirSync(registryDir);
  } catch {
    // Ignore unreadable registries and treat the fixture as target-only.
    return [];
  }

  return registryEntries
    .filter((entry) => entry !== ownDirName)
    .map((entry) => join(registryDir, entry))
    .filter((candidateDir) => isDirectory(candidateDir))
    .map((candidateDir) => join(candidateDir, "SKILL.md"))
    .filter((candidateSkillPath) => existsSync(candidateSkillPath))
    .map((candidateSkillPath) => resolveReplayPath(candidateSkillPath))
    .sort((a, b) => a.localeCompare(b));
}
62
+
63
/**
 * Build a routing replay fixture for a skill from its on-disk location.
 *
 * @param options.skillName - Name recorded as the fixture's target skill.
 * @param options.skillPath - Path to the target skill file; resolved before use.
 * @param options.platform - Platform label; defaults to `"claude_code"`.
 * @param options.fixtureId - Explicit fixture id; defaults to
 *   `auto-<platform>-<skillName>`.
 * @param options.workspaceRoot - Explicit workspace root; when omitted it is
 *   looked up with `findGitRepositoryRoot` starting two directories above the
 *   resolved skill path.
 * @returns The fixture. `workspace_root` is only present when a root was
 *   supplied or found.
 */
export function buildRoutingReplayFixture(options: {
  skillName: string;
  skillPath: string;
  platform?: RoutingReplayFixture["platform"];
  fixtureId?: string;
  workspaceRoot?: string;
}): RoutingReplayFixture {
  const targetSkillPath = resolveReplayPath(options.skillPath);
  const registryDir = dirname(dirname(targetSkillPath));
  const workspaceRoot = options.workspaceRoot ?? findGitRepositoryRoot(registryDir);
  const platform = options.platform ?? "claude_code";
  const fixtureId = options.fixtureId ?? `auto-${platform}-${options.skillName}`;
  const competingSkillPaths = listCompetingSkillPaths(targetSkillPath);
  // Omit the key entirely (rather than setting it to undefined) when unknown.
  const workspaceFields = workspaceRoot ? { workspace_root: workspaceRoot } : {};

  return {
    fixture_id: fixtureId,
    platform,
    target_skill_name: options.skillName,
    target_skill_path: targetSkillPath,
    competing_skill_paths: competingSkillPaths,
    ...workspaceFields,
  };
}
84
+
85
/**
 * Read a skill file and tokenize the parts used for replay scoring.
 *
 * @param skillPath - Path to the skill file.
 * @returns The parsed surface. If reading or parsing throws, a surface with
 *   the fallback name and empty token sets is returned instead. The fallback
 *   name is the parent directory name, or `"unknown-skill"` when that is empty.
 */
function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
  const fallbackName = basename(dirname(skillPath)) || "unknown-skill";

  try {
    const frontmatter = parseFrontmatter(readFileSync(skillPath, "utf8"));
    const declaredName = frontmatter.name.trim();
    const descriptionTokens = tokenizeText(frontmatter.description);
    const whenToUseText = extractWhenToUseLines(frontmatter.body).join(" ");
    return {
      skillName: declaredName || fallbackName,
      descriptionTokens,
      whenToUseTokens: tokenizeText(whenToUseText),
    };
  } catch {
    // Unreadable or unparsable skill files still take part, with no tokens.
    return {
      skillName: fallbackName,
      descriptionTokens: new Set<string>(),
      whenToUseTokens: new Set<string>(),
    };
  }
}
103
+
104
/**
 * Extract candidate trigger phrases from a markdown routing table.
 *
 * The first column of every data row is split on `,`, `/`, and the word
 * ` or `. Each part is trimmed, has one leading and one trailing quote or
 * backtick stripped, and is kept when at least three characters remain.
 *
 * Data rows are found structurally rather than by position:
 * - rows are read after the first separator row (`|---|---|`), so a heading
 *   or intro line above the table no longer causes the separator to be
 *   parsed as a `"---"` trigger phrase;
 * - separator rows, and rows immediately followed by a separator (headers of
 *   later tables), are never treated as data;
 * - when no separator row exists, the first two lines are skipped, as before.
 *
 * @param routing - Raw routing section text.
 * @returns Trigger phrases in table order; empty when the text has fewer than
 *   three non-empty lines.
 */
function extractRoutingTriggerPhrases(routing: string): string[] {
  const lines = routing
    .trim()
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean);
  // A usable table needs at least a header, a separator, and one data row.
  if (lines.length < 3) return [];

  const isTableRow = (row: string): boolean => row.startsWith("|") && row.endsWith("|");
  const isSeparatorRow = (row: string): boolean => {
    if (!isTableRow(row)) return false;
    const cells = row.split("|").slice(1, -1);
    return cells.length > 0 && cells.every((cell) => /^:?-+:?$/.test(cell.trim()));
  };

  const separatorIndex = lines.findIndex(isSeparatorRow);
  const firstBodyIndex = separatorIndex === -1 ? 2 : separatorIndex + 1;

  const phrases: string[] = [];
  for (let i = firstBodyIndex; i < lines.length; i++) {
    const row = lines[i];
    if (row === undefined || !isTableRow(row) || isSeparatorRow(row)) continue;

    // A row directly above a separator is the header of a following table.
    const next = lines[i + 1];
    if (next !== undefined && isSeparatorRow(next)) continue;

    const triggerCell = row.split("|")[1]?.trim();
    if (!triggerCell) continue;

    for (const part of triggerCell.split(/,|\/| or /i)) {
      const phrase = part.trim().replace(/^["'`]|["'`]$/g, "");
      if (phrase.length >= 3) phrases.push(phrase);
    }
  }
  return phrases;
}
125
+
126
/**
 * Score a query against routing trigger phrases and return the best score.
 *
 * A phrase scores 1 when it appears, case-insensitively, as a substring of
 * the query. Otherwise it scores the Jaccard similarity between the query's
 * tokens and the phrase's tokens.
 *
 * NOTE(review): the substring check is not word-bounded, so a short phrase can
 * match inside a longer word. Confirm this is intended.
 *
 * @param query - Query text.
 * @param triggerPhrases - Phrases extracted from the routing table.
 * @returns The highest per-phrase score, or 0 when there are no phrases.
 */
function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
  const loweredQuery = query.toLowerCase();
  const queryTokens = tokenizeText(query);

  return triggerPhrases.reduce((bestSoFar, phrase) => {
    const phraseScore = loweredQuery.includes(phrase.toLowerCase())
      ? 1
      : jaccardSimilarity(queryTokens, tokenizeText(phrase));
    return Math.max(bestSoFar, phraseScore);
  }, 0);
}
140
+
141
/**
 * Score a query against a skill surface.
 *
 * @param query - Query text.
 * @param surface - Tokenized skill surface to compare against.
 * @returns The larger of the Jaccard similarities between the query tokens and
 *   the surface's description tokens and "when to use" tokens.
 */
function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
  const queryTokens = tokenizeText(query);
  const descriptionScore = jaccardSimilarity(queryTokens, surface.descriptionTokens);
  const whenToUseScore = jaccardSimilarity(queryTokens, surface.whenToUseTokens);
  return Math.max(descriptionScore, whenToUseScore);
}
148
+
149
/**
 * Decide whether a query would trigger the target skill in the replay fixture.
 *
 * Checks run in this order, and the first that applies wins:
 * 1. The query mentions the target skill by name: triggered.
 * 2. The query mentions a competing skill by name: not triggered.
 * 3. The target score (best of routing-phrase score and skill-surface score)
 *    is below `HOST_REPLAY_MATCH_THRESHOLD`: not triggered.
 * 4. The best competing surface scores at least as high as the target (ties
 *    go to the competitor): not triggered.
 * 5. Otherwise triggered, with evidence naming whichever target signal scored
 *    higher.
 *
 * @param query - Eval query; trimmed before scoring.
 * @param routing - Routing table text used to extract trigger phrases.
 * @param targetSurface - Surface of the skill being validated.
 * @param competingSurfaces - Surfaces of the sibling skills.
 * @returns The trigger decision and a short human-readable reason.
 */
function evaluateReplayTrigger(
  query: string,
  routing: string,
  targetSurface: ReplaySkillSurface,
  competingSurfaces: ReplaySkillSurface[],
): { triggered: boolean; evidence: string } {
  const hit = (evidence: string): { triggered: boolean; evidence: string } => ({
    triggered: true,
    evidence,
  });
  const miss = (evidence: string): { triggered: boolean; evidence: string } => ({
    triggered: false,
    evidence,
  });

  const trimmedQuery = query.trim();

  // Explicit name mentions short-circuit all similarity scoring.
  if (containsWholeSkillMention(trimmedQuery, targetSurface.skillName)) {
    return hit(`explicit target mention: ${targetSurface.skillName}`);
  }
  const mentionedCompetitor = competingSurfaces.find((surface) =>
    containsWholeSkillMention(trimmedQuery, surface.skillName),
  );
  if (mentionedCompetitor) {
    return miss(`explicit competing skill mention: ${mentionedCompetitor.skillName}`);
  }

  const triggerScore = scoreQueryAgainstTriggerPhrases(
    trimmedQuery,
    extractRoutingTriggerPhrases(routing),
  );
  const targetSurfaceScore = scoreQueryAgainstSkillSurface(trimmedQuery, targetSurface);
  const targetScore = Math.max(triggerScore, targetSurfaceScore);

  // Highest-scoring competitor; undefined when there are no competitors.
  const rankedCompetitors = competingSurfaces
    .map((surface) => ({
      skillName: surface.skillName,
      score: scoreQueryAgainstSkillSurface(trimmedQuery, surface),
    }))
    .sort((a, b) => b.score - a.score);
  const bestCompetitor = rankedCompetitors[0];

  if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
    return miss("target routing and skill surface did not clear replay threshold");
  }
  if (bestCompetitor && bestCompetitor.score >= targetScore) {
    return miss(`competing skill surface scored higher: ${bestCompetitor.skillName}`);
  }

  if (triggerScore < targetSurfaceScore) {
    return hit("query aligned with target skill surface in replay fixture");
  }
  return hit(
    triggerScore === 1
      ? "query matched a routing trigger phrase exactly"
      : "query aligned with routing trigger language",
  );
}
212
+
213
/**
 * Replay every eval entry against a routing table using the fixture's skills.
 *
 * The target and competing skill files are loaded once, then each entry is
 * evaluated independently.
 *
 * @param options.routing - Routing table text to evaluate.
 * @param options.evalSet - Entries with a query and the expected trigger outcome.
 * @param options.fixture - Fixture naming the target and competing skill files.
 * @returns One result per entry, in input order. `passed` is true when the
 *   replayed decision equals the entry's `should_trigger`.
 */
export function runHostReplayFixture(options: {
  routing: string;
  evalSet: EvalEntry[];
  fixture: RoutingReplayFixture;
}): RoutingReplayEntryResult[] {
  const { routing, evalSet, fixture } = options;
  const targetSurface = loadReplaySkillSurface(fixture.target_skill_path);
  const competingSurfaces = fixture.competing_skill_paths.map((skillPath) =>
    loadReplaySkillSurface(skillPath),
  );

  const results: RoutingReplayEntryResult[] = [];
  for (const entry of evalSet) {
    const { triggered, evidence } = evaluateReplayTrigger(
      entry.query,
      routing,
      targetSurface,
      competingSurfaces,
    );
    results.push({
      query: entry.query,
      should_trigger: entry.should_trigger,
      triggered,
      passed: triggered === entry.should_trigger,
      evidence,
    });
  }
  return results;
}
@@ -40,6 +40,8 @@ export interface ValidationResult {
40
40
  net_change: number; // after - before pass rate
41
41
  by_invocation_type?: InvocationTypeScores;
42
42
  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
43
+ validation_mode?: "llm_judge";
44
+ validation_agent?: string;
43
45
  }
44
46
 
45
47
  // ---------------------------------------------------------------------------
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
63
65
  regressions: [],
64
66
  new_passes: [],
65
67
  net_change: 0,
68
+ validation_mode: "llm_judge",
69
+ validation_agent: agent,
66
70
  };
67
71
  }
68
72
 
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
174
178
  net_change: netChange,
175
179
  by_invocation_type: invocationScores,
176
180
  per_entry_results: perEntryResults,
181
+ validation_mode: "llm_judge",
182
+ validation_agent: agent,
177
183
  };
178
184
  }
179
185
 
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
220
226
  regressions: [],
221
227
  new_passes: [],
222
228
  net_change: 0,
229
+ validation_mode: "llm_judge",
230
+ validation_agent: agent,
223
231
  };
224
232
  }
225
233
 
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
342
350
  net_change: netChange,
343
351
  by_invocation_type: invocationScores,
344
352
  per_entry_results: perEntryResults,
353
+ validation_mode: "llm_judge",
354
+ validation_agent: agent,
345
355
  };
346
356
  }
347
357
 
@@ -5,9 +5,43 @@
5
5
  * and running trigger accuracy checks against an eval set.
6
6
  */
7
7
 
8
- import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
8
+ import type {
9
+ BodyEvolutionProposal,
10
+ BodyValidationResult,
11
+ EvalEntry,
12
+ RoutingReplayEntryResult,
13
+ RoutingReplayFixture,
14
+ ValidationMode,
15
+ } from "../types.js";
9
16
  import { callLlm } from "../utils/llm-call.js";
10
17
  import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
18
+ import { runHostReplayFixture } from "./validate-host-replay.js";
19
+
20
/** Arguments handed to a {@link RoutingReplayRunner} for one replay pass. */
export interface RoutingReplayRunnerInput {
  /** Routing text to replay (the original or the proposed version). */
  routing: string;
  /** Eval entries to replay against the routing text. */
  evalSet: EvalEntry[];
  /** Agent identifier forwarded from the validation call. */
  agent: string;
  /** Fixture describing the target skill and its competitors. */
  fixture: RoutingReplayFixture;
}
26
+
27
/**
 * Pluggable replay implementation: takes one replay input and resolves to the
 * per-entry replay results.
 */
export type RoutingReplayRunner = (
  input: RoutingReplayRunnerInput,
) => Promise<RoutingReplayEntryResult[]>;
30
+
31
/** Optional replay configuration for routing validation. */
export interface RoutingValidationOptions {
  /** Fixture to replay against; when absent, replay validation is not used. */
  replayFixture?: RoutingReplayFixture;
  /** Custom runner used together with `replayFixture`. */
  replayRunner?: RoutingReplayRunner;
}
35
+
36
/** Outcome of comparing trigger accuracy for original vs. proposed routing. */
export interface RoutingTriggerAccuracyResult {
  /** Pass rate (0 to 1) measured with the original routing. */
  before_pass_rate: number;
  /** Pass rate (0 to 1) measured with the proposed routing. */
  after_pass_rate: number;
  /** Whether the proposed routing did better than the original. */
  improved: boolean;
  /** Which validation strategy produced these numbers. */
  validation_mode: ValidationMode;
  /** Agent identifier the validation was run for. */
  validation_agent: string;
  /** Id of the replay fixture, when replay validation was used. */
  validation_fixture_id?: string;
  /** Per-entry replay results for the proposed routing, when available. */
  per_entry_results?: RoutingReplayEntryResult[];
}
11
45
 
12
46
  // ---------------------------------------------------------------------------
13
47
  // Structural validation
@@ -77,9 +111,70 @@ export async function validateRoutingTriggerAccuracy(
77
111
  evalSet: EvalEntry[],
78
112
  agent: string,
79
113
  modelFlag?: string,
80
- ): Promise<{ before_pass_rate: number; after_pass_rate: number; improved: boolean }> {
114
+ options: RoutingValidationOptions = {},
115
+ ): Promise<RoutingTriggerAccuracyResult> {
81
116
  if (evalSet.length === 0) {
82
- return { before_pass_rate: 0, after_pass_rate: 0, improved: false };
117
+ return {
118
+ before_pass_rate: 0,
119
+ after_pass_rate: 0,
120
+ improved: false,
121
+ validation_mode: "structural_guard",
122
+ validation_agent: agent,
123
+ };
124
+ }
125
+
126
+ if (options.replayFixture && options.replayRunner) {
127
+ const beforeResults = await options.replayRunner({
128
+ routing: originalRouting,
129
+ evalSet,
130
+ agent,
131
+ fixture: options.replayFixture,
132
+ });
133
+ const afterResults = await options.replayRunner({
134
+ routing: proposedRouting,
135
+ evalSet,
136
+ agent,
137
+ fixture: options.replayFixture,
138
+ });
139
+ const beforePassed = beforeResults.filter((result) => result.passed).length;
140
+ const afterPassed = afterResults.filter((result) => result.passed).length;
141
+ const total = evalSet.length;
142
+
143
+ return {
144
+ before_pass_rate: beforePassed / total,
145
+ after_pass_rate: afterPassed / total,
146
+ improved: afterPassed > beforePassed,
147
+ validation_mode: "host_replay",
148
+ validation_agent: agent,
149
+ validation_fixture_id: options.replayFixture.fixture_id,
150
+ per_entry_results: afterResults,
151
+ };
152
+ }
153
+
154
+ if (options.replayFixture) {
155
+ const beforeResults = runHostReplayFixture({
156
+ routing: originalRouting,
157
+ evalSet,
158
+ fixture: options.replayFixture,
159
+ });
160
+ const afterResults = runHostReplayFixture({
161
+ routing: proposedRouting,
162
+ evalSet,
163
+ fixture: options.replayFixture,
164
+ });
165
+ const beforePassed = beforeResults.filter((result) => result.passed).length;
166
+ const afterPassed = afterResults.filter((result) => result.passed).length;
167
+ const total = evalSet.length;
168
+
169
+ return {
170
+ before_pass_rate: beforePassed / total,
171
+ after_pass_rate: afterPassed / total,
172
+ improved: afterPassed > beforePassed,
173
+ validation_mode: "host_replay",
174
+ validation_agent: agent,
175
+ validation_fixture_id: options.replayFixture.fixture_id,
176
+ per_entry_results: afterResults,
177
+ };
83
178
  }
84
179
 
85
180
  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
@@ -113,6 +208,8 @@ export async function validateRoutingTriggerAccuracy(
113
208
  before_pass_rate: beforePassRate,
114
209
  after_pass_rate: afterPassRate,
115
210
  improved: afterPassRate > beforePassRate,
211
+ validation_mode: "llm_judge",
212
+ validation_agent: agent,
116
213
  };
117
214
  }
118
215
 
@@ -126,6 +223,7 @@ export async function validateRoutingProposal(
126
223
  evalSet: EvalEntry[],
127
224
  agent: string,
128
225
  modelFlag?: string,
226
+ options: RoutingValidationOptions = {},
129
227
  ): Promise<BodyValidationResult> {
130
228
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
131
229
 
@@ -145,6 +243,8 @@ export async function validateRoutingProposal(
145
243
  gate_results: gateResults,
146
244
  improved: false,
147
245
  regressions: [],
246
+ validation_mode: "structural_guard",
247
+ validation_agent: agent,
148
248
  };
149
249
  }
150
250
 
@@ -155,13 +255,14 @@ export async function validateRoutingProposal(
155
255
  evalSet,
156
256
  agent,
157
257
  modelFlag,
258
+ options,
158
259
  );
159
260
  gateResults.push({
160
261
  gate: "trigger_accuracy",
161
262
  passed: accuracy.improved,
162
263
  reason: accuracy.improved
163
- ? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
164
- : `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
264
+ ? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
265
+ : `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
165
266
  });
166
267
 
167
268
  const gatesPassed = gateResults.filter((g) => g.passed).length;
@@ -173,5 +274,11 @@ export async function validateRoutingProposal(
173
274
  gate_results: gateResults,
174
275
  improved: gatesPassed === 2,
175
276
  regressions: [],
277
+ validation_mode: accuracy.validation_mode,
278
+ validation_agent: accuracy.validation_agent,
279
+ validation_fixture_id: accuracy.validation_fixture_id,
280
+ before_pass_rate: accuracy.before_pass_rate,
281
+ after_pass_rate: accuracy.after_pass_rate,
282
+ per_entry_results: accuracy.per_entry_results,
176
283
  };
177
284
  }
@@ -285,11 +285,12 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean {
285
285
  return safeWrite("evolution-audit", (db) => {
286
286
  getStmt(
287
287
  db,
288
- "evolution-audit-v2",
288
+ "evolution-audit-v3",
289
289
  `
290
290
  INSERT OR IGNORE INTO evolution_audit
291
- (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used)
292
- VALUES (?, ?, ?, ?, ?, ?, ?)
291
+ (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used,
292
+ validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref)
293
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
293
294
  `,
294
295
  ).run(
295
296
  record.timestamp,
@@ -299,6 +300,10 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean {
299
300
  record.details,
300
301
  record.eval_snapshot ? JSON.stringify(record.eval_snapshot) : null,
301
302
  record.iterations_used ?? null,
303
+ record.validation_mode ?? null,
304
+ record.validation_agent ?? null,
305
+ record.validation_fixture_id ?? null,
306
+ record.validation_evidence_ref ?? null,
302
307
  );
303
308
  });
304
309
  }
@@ -600,8 +600,9 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num
600
600
  // (idx_evo_audit_dedup defined in schema.ts).
601
601
  const stmt = db.prepare(`
602
602
  INSERT OR IGNORE INTO evolution_audit
603
- (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used)
604
- VALUES (?, ?, ?, ?, ?, ?, ?)
603
+ (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used,
604
+ validation_mode, validation_agent, validation_fixture_id, validation_evidence_ref)
605
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
605
606
  `);
606
607
 
607
608
  let count = 0;
@@ -614,6 +615,10 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num
614
615
  r.details,
615
616
  r.eval_snapshot ? JSON.stringify(r.eval_snapshot) : null,
616
617
  r.iterations_used ?? null,
618
+ r.validation_mode ?? null,
619
+ r.validation_agent ?? null,
620
+ r.validation_fixture_id ?? null,
621
+ r.validation_evidence_ref ?? null,
617
622
  );
618
623
  count++;
619
624
  }