selftune 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.claude/agents/diagnosis-analyst.md +146 -0
  2. package/.claude/agents/evolution-reviewer.md +167 -0
  3. package/.claude/agents/integration-guide.md +200 -0
  4. package/.claude/agents/pattern-analyst.md +147 -0
  5. package/CHANGELOG.md +37 -0
  6. package/README.md +96 -256
  7. package/assets/BeforeAfter.gif +0 -0
  8. package/assets/FeedbackLoop.gif +0 -0
  9. package/assets/logo.svg +9 -0
  10. package/assets/skill-health-badge.svg +20 -0
  11. package/cli/selftune/activation-rules.ts +171 -0
  12. package/cli/selftune/badge/badge-data.ts +108 -0
  13. package/cli/selftune/badge/badge-svg.ts +212 -0
  14. package/cli/selftune/badge/badge.ts +103 -0
  15. package/cli/selftune/constants.ts +75 -1
  16. package/cli/selftune/contribute/bundle.ts +314 -0
  17. package/cli/selftune/contribute/contribute.ts +214 -0
  18. package/cli/selftune/contribute/sanitize.ts +162 -0
  19. package/cli/selftune/cron/setup.ts +266 -0
  20. package/cli/selftune/dashboard-server.ts +582 -0
  21. package/cli/selftune/dashboard.ts +25 -3
  22. package/cli/selftune/eval/baseline.ts +247 -0
  23. package/cli/selftune/eval/composability.ts +117 -0
  24. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  25. package/cli/selftune/eval/hooks-to-evals.ts +68 -2
  26. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  27. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  28. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  29. package/cli/selftune/eval/unit-test.ts +196 -0
  30. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  31. package/cli/selftune/evolution/evolve-body.ts +492 -0
  32. package/cli/selftune/evolution/evolve.ts +466 -103
  33. package/cli/selftune/evolution/extract-patterns.ts +32 -1
  34. package/cli/selftune/evolution/pareto.ts +314 -0
  35. package/cli/selftune/evolution/propose-body.ts +171 -0
  36. package/cli/selftune/evolution/propose-description.ts +100 -2
  37. package/cli/selftune/evolution/propose-routing.ts +166 -0
  38. package/cli/selftune/evolution/refine-body.ts +141 -0
  39. package/cli/selftune/evolution/rollback.ts +19 -2
  40. package/cli/selftune/evolution/validate-body.ts +254 -0
  41. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  42. package/cli/selftune/evolution/validate-routing.ts +177 -0
  43. package/cli/selftune/grading/grade-session.ts +138 -18
  44. package/cli/selftune/grading/pre-gates.ts +104 -0
  45. package/cli/selftune/hooks/auto-activate.ts +185 -0
  46. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  47. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  48. package/cli/selftune/index.ts +88 -0
  49. package/cli/selftune/ingestors/claude-replay.ts +351 -0
  50. package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
  51. package/cli/selftune/init.ts +150 -3
  52. package/cli/selftune/memory/writer.ts +447 -0
  53. package/cli/selftune/monitoring/watch.ts +25 -2
  54. package/cli/selftune/status.ts +17 -13
  55. package/cli/selftune/types.ts +377 -5
  56. package/cli/selftune/utils/frontmatter.ts +217 -0
  57. package/cli/selftune/utils/llm-call.ts +29 -3
  58. package/cli/selftune/utils/transcript.ts +35 -0
  59. package/cli/selftune/utils/trigger-check.ts +89 -0
  60. package/cli/selftune/utils/tui.ts +156 -0
  61. package/dashboard/index.html +569 -8
  62. package/package.json +8 -4
  63. package/skill/SKILL.md +124 -8
  64. package/skill/Workflows/AutoActivation.md +144 -0
  65. package/skill/Workflows/Badge.md +118 -0
  66. package/skill/Workflows/Baseline.md +121 -0
  67. package/skill/Workflows/Composability.md +100 -0
  68. package/skill/Workflows/Contribute.md +91 -0
  69. package/skill/Workflows/Cron.md +155 -0
  70. package/skill/Workflows/Dashboard.md +203 -0
  71. package/skill/Workflows/Doctor.md +37 -1
  72. package/skill/Workflows/Evals.md +69 -1
  73. package/skill/Workflows/EvolutionMemory.md +152 -0
  74. package/skill/Workflows/Evolve.md +111 -6
  75. package/skill/Workflows/EvolveBody.md +159 -0
  76. package/skill/Workflows/ImportSkillsBench.md +111 -0
  77. package/skill/Workflows/Ingest.md +117 -3
  78. package/skill/Workflows/Initialize.md +57 -3
  79. package/skill/Workflows/Replay.md +70 -0
  80. package/skill/Workflows/Rollback.md +20 -1
  81. package/skill/Workflows/UnitTest.md +138 -0
  82. package/skill/Workflows/Watch.md +22 -0
  83. package/skill/settings_snippet.json +23 -0
  84. package/templates/activation-rules-default.json +27 -0
  85. package/templates/multi-skill-settings.json +64 -0
  86. package/templates/single-skill-settings.json +58 -0
@@ -6,8 +6,25 @@
6
6
  * to determine whether the proposal is an improvement.
7
7
  */
8
8
 
9
- import type { EvalEntry, EvolutionProposal } from "../types.js";
9
+ import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
10
10
  import { callLlm } from "../utils/llm-call.js";
11
+ import {
12
+ buildBatchTriggerCheckPrompt,
13
+ buildTriggerCheckPrompt,
14
+ parseBatchTriggerResponse,
15
+ parseTriggerResponse,
16
+ } from "../utils/trigger-check.js";
17
+
18
+ // Re-export so existing consumers don't break
19
+ export { buildTriggerCheckPrompt, parseTriggerResponse };
20
+
21
+ /** Number of eval queries to batch into a single LLM call.
22
+ * Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
23
+ * Haiku handles 50+ YES/NO checks in a single call easily. */
24
+ export const TRIGGER_CHECK_BATCH_SIZE = 50;
25
+
26
+ /** Number of times to run each batch and majority-vote to reduce LLM variance. */
27
+ export const VALIDATION_RUNS = 3;
11
28
 
12
29
  // ---------------------------------------------------------------------------
13
30
  // Types
@@ -21,47 +38,20 @@ export interface ValidationResult {
21
38
  regressions: EvalEntry[]; // passed before, fail after
22
39
  new_passes: EvalEntry[]; // failed before, pass after
23
40
  net_change: number; // after - before pass rate
24
- }
25
-
26
- // ---------------------------------------------------------------------------
27
- // Prompt building
28
- // ---------------------------------------------------------------------------
29
-
30
- /** Build the trigger check prompt for the LLM. */
31
- export function buildTriggerCheckPrompt(description: string, query: string): string {
32
- return [
33
- "Given this skill description, would the following user query trigger this skill?",
34
- "Respond YES or NO only.",
35
- "",
36
- "Skill description:",
37
- description,
38
- "",
39
- "User query:",
40
- query,
41
- ].join("\n");
42
- }
43
-
44
- // ---------------------------------------------------------------------------
45
- // Response parsing
46
- // ---------------------------------------------------------------------------
47
-
48
- /** Parse YES/NO from LLM response. */
49
- export function parseTriggerResponse(response: string): boolean {
50
- const normalized = response.trim().toUpperCase();
51
- if (normalized.startsWith("YES")) return true;
52
- if (normalized.startsWith("NO")) return false;
53
- return false; // conservative default
41
+ by_invocation_type?: InvocationTypeScores;
42
+ per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
54
43
  }
55
44
 
56
45
  // ---------------------------------------------------------------------------
57
46
  // Proposal validation
58
47
  // ---------------------------------------------------------------------------
59
48
 
60
- /** Validate a proposal by running trigger checks against the eval set. */
61
- export async function validateProposal(
49
+ /** Validate a proposal sequentially (two LLM calls per query: one against the original description, one against the proposed). Kept for backward compat. */
50
+ export async function validateProposalSequential(
62
51
  proposal: EvolutionProposal,
63
52
  evalSet: EvalEntry[],
64
53
  agent: string,
54
+ modelFlag?: string,
65
55
  ): Promise<ValidationResult> {
66
56
  if (evalSet.length === 0) {
67
57
  return {
@@ -78,20 +68,22 @@ export async function validateProposal(
78
68
  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
79
69
  const regressions: EvalEntry[] = [];
80
70
  const newPasses: EvalEntry[] = [];
71
+ const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
72
+ [];
81
73
  let beforePassed = 0;
82
74
  let afterPassed = 0;
83
75
 
84
76
  for (const entry of evalSet) {
85
77
  // Check with original description
86
78
  const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
87
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent);
79
+ const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
88
80
  const beforeTriggered = parseTriggerResponse(beforeRaw);
89
81
  const beforePass =
90
82
  (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
91
83
 
92
84
  // Check with proposed description
93
85
  const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
94
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent);
86
+ const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
95
87
  const afterTriggered = parseTriggerResponse(afterRaw);
96
88
  const afterPass =
97
89
  (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
@@ -108,6 +100,8 @@ export async function validateProposal(
108
100
  if (!beforePass && afterPass) {
109
101
  newPasses.push(entry);
110
102
  }
103
+
104
+ perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
111
105
  }
112
106
 
113
107
  const total = evalSet.length;
@@ -124,6 +118,51 @@ export async function validateProposal(
124
118
  regressions.length < total * 0.05 &&
125
119
  (netChange >= 0.1 || newPasses.length >= 2);
126
120
 
121
+ // Compute per-invocation-type scores (initialize all required keys)
122
+ const byInvocationType: Record<string, { passed: number; total: number }> = {
123
+ explicit: { passed: 0, total: 0 },
124
+ implicit: { passed: 0, total: 0 },
125
+ contextual: { passed: 0, total: 0 },
126
+ negative: { passed: 0, total: 0 },
127
+ };
128
+ for (const r of perEntryResults) {
129
+ const type = r.entry.invocation_type ?? "implicit";
130
+ if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
131
+ byInvocationType[type].total++;
132
+ if (r.after_pass) byInvocationType[type].passed++;
133
+ }
134
+
135
+ const invocationScores: InvocationTypeScores = {
136
+ explicit: {
137
+ ...byInvocationType.explicit,
138
+ pass_rate:
139
+ byInvocationType.explicit.total > 0
140
+ ? byInvocationType.explicit.passed / byInvocationType.explicit.total
141
+ : 0,
142
+ },
143
+ implicit: {
144
+ ...byInvocationType.implicit,
145
+ pass_rate:
146
+ byInvocationType.implicit.total > 0
147
+ ? byInvocationType.implicit.passed / byInvocationType.implicit.total
148
+ : 0,
149
+ },
150
+ contextual: {
151
+ ...byInvocationType.contextual,
152
+ pass_rate:
153
+ byInvocationType.contextual.total > 0
154
+ ? byInvocationType.contextual.passed / byInvocationType.contextual.total
155
+ : 0,
156
+ },
157
+ negative: {
158
+ ...byInvocationType.negative,
159
+ pass_rate:
160
+ byInvocationType.negative.total > 0
161
+ ? byInvocationType.negative.passed / byInvocationType.negative.total
162
+ : 0,
163
+ },
164
+ };
165
+
127
166
  return {
128
167
  proposal_id: proposal.proposal_id,
129
168
  before_pass_rate: beforePassRate,
@@ -132,5 +171,188 @@ export async function validateProposal(
132
171
  regressions,
133
172
  new_passes: newPasses,
134
173
  net_change: netChange,
174
+ by_invocation_type: invocationScores,
175
+ per_entry_results: perEntryResults,
135
176
  };
136
177
  }
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Batched proposal validation
181
+ // ---------------------------------------------------------------------------
182
+
183
/**
 * Split `arr` into consecutive groups of at most `size` elements.
 *
 * The final group may be shorter than `size`, and an empty input yields an
 * empty result. The input array is not modified.
 *
 * @param arr - Items to partition, in order.
 * @param size - Maximum group length; must be a positive integer.
 * @returns The groups, in the original order.
 * @throws RangeError if `size` is not a positive integer. A step of 0 (or a
 *   negative step) would otherwise never advance the loop.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const chunks: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    chunks.push(arr.slice(i, i + size));
  }
  return chunks;
}
191
+
192
/**
 * Majority vote for a single query position across repeated runs.
 *
 * @param runs - One boolean array per run; `runs[r][index]` is run r's
 *   answer (true = YES) for the query at `index`.
 * @param index - Position of the query within each run's array.
 * @returns `true` only when strictly more than half of the runs answered YES
 *   at `index`. Ties, an empty `runs`, and runs that have no entry at `index`
 *   all resolve to `false`.
 */
function majorityVote(runs: boolean[][], index: number): boolean {
  const yesCount = runs.filter((run) => run[index]).length;
  return yesCount > runs.length / 2;
}
200
+
201
/** True when the observed trigger decision matches what the eval entry expects. */
function triggerMatchesExpectation(entry: EvalEntry, triggered: boolean): boolean {
  return (entry.should_trigger && triggered) || (!entry.should_trigger && !triggered);
}

/**
 * Tally post-proposal (`after_pass`) results per invocation type.
 *
 * Entries without an `invocation_type` are counted as "implicit". Types other
 * than the four reported keys are ignored, since the result has no slot for
 * them. A type with no entries gets a `pass_rate` of 0.
 */
function scoreAfterPassByInvocationType(
  results: Array<{ entry: EvalEntry; after_pass: boolean }>,
): InvocationTypeScores {
  type Tally = { passed: number; total: number };
  const tallies = {
    explicit: { passed: 0, total: 0 },
    implicit: { passed: 0, total: 0 },
    contextual: { passed: 0, total: 0 },
    negative: { passed: 0, total: 0 },
  };
  // A Map lookup avoids accidentally matching Object.prototype members.
  const known = new Map<string, Tally>(Object.entries(tallies));

  for (const { entry, after_pass } of results) {
    const tally = known.get(entry.invocation_type ?? "implicit");
    if (!tally) continue;
    tally.total++;
    if (after_pass) tally.passed++;
  }

  const withRate = (t: Tally) => ({
    ...t,
    pass_rate: t.total > 0 ? t.passed / t.total : 0,
  });

  return {
    explicit: withRate(tallies.explicit),
    implicit: withRate(tallies.implicit),
    contextual: withRate(tallies.contextual),
    negative: withRate(tallies.negative),
  };
}

/**
 * Validate a proposal by batching trigger checks.
 *
 * The eval set is split into batches of `TRIGGER_CHECK_BATCH_SIZE` queries.
 * For each batch, the "before" (original description) and "after" (proposed
 * description) prompts are each sent `VALIDATION_RUNS` times concurrently, and
 * every query's YES/NO answer is decided by majority vote across those runs.
 * That is 2 * VALIDATION_RUNS LLM calls per batch instead of 2 per entry.
 *
 * A proposal counts as improved when the after pass rate is strictly higher,
 * regressions are fewer than 5% of the eval set, and either the net change is
 * at least 0.1 or at least two entries newly pass.
 *
 * @param proposal - Supplies `proposal_id`, `original_description` and
 *   `proposed_description`.
 * @param evalSet - Entries to check. An empty set returns zeroed rates with
 *   `improved: false` and no per-type or per-entry data.
 * @param agent - Passed through to `callLlm`.
 * @param modelFlag - Optional; passed through to `callLlm`.
 * @returns Aggregate pass rates, regressions, new passes, per-invocation-type
 *   scores (computed from the "after" results) and per-entry outcomes.
 */
export async function validateProposalBatched(
  proposal: EvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<ValidationResult> {
  if (evalSet.length === 0) {
    return {
      proposal_id: proposal.proposal_id,
      before_pass_rate: 0,
      after_pass_rate: 0,
      improved: false,
      regressions: [],
      new_passes: [],
      net_change: 0,
    };
  }

  const systemPrompt =
    "You are an evaluation assistant. For each numbered query, respond with the number followed by YES or NO.";

  const regressions: EvalEntry[] = [];
  const newPasses: EvalEntry[] = [];
  const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
    [];
  let beforePassed = 0;
  let afterPassed = 0;

  for (const batch of chunk(evalSet, TRIGGER_CHECK_BATCH_SIZE)) {
    const queries = batch.map((e) => e.query);

    const beforePrompt = buildBatchTriggerCheckPrompt(proposal.original_description, queries);
    const afterPrompt = buildBatchTriggerCheckPrompt(proposal.proposed_description, queries);

    // Start every run of both prompts up front so they execute concurrently.
    const beforeCalls: Promise<string>[] = [];
    const afterCalls: Promise<string>[] = [];
    for (let r = 0; r < VALIDATION_RUNS; r++) {
      beforeCalls.push(callLlm(systemPrompt, beforePrompt, agent, modelFlag));
      afterCalls.push(callLlm(systemPrompt, afterPrompt, agent, modelFlag));
    }
    const [beforeRaw, afterRaw] = await Promise.all([
      Promise.all(beforeCalls),
      Promise.all(afterCalls),
    ]);

    // One boolean array per run, indexed by query position within the batch.
    const beforeRuns = beforeRaw.map((raw) => parseBatchTriggerResponse(raw, queries.length));
    const afterRuns = afterRaw.map((raw) => parseBatchTriggerResponse(raw, queries.length));

    for (const [i, entry] of batch.entries()) {
      const beforePass = triggerMatchesExpectation(entry, majorityVote(beforeRuns, i));
      const afterPass = triggerMatchesExpectation(entry, majorityVote(afterRuns, i));

      if (beforePass) beforePassed++;
      if (afterPass) afterPassed++;

      if (beforePass && !afterPass) regressions.push(entry);
      if (!beforePass && afterPass) newPasses.push(entry);

      perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
    }
  }

  const total = evalSet.length;
  const beforePassRate = beforePassed / total;
  const afterPassRate = afterPassed / total;
  const netChange = afterPassRate - beforePassRate;

  const improved =
    afterPassRate > beforePassRate &&
    regressions.length < total * 0.05 &&
    (netChange >= 0.1 || newPasses.length >= 2);

  return {
    proposal_id: proposal.proposal_id,
    before_pass_rate: beforePassRate,
    after_pass_rate: afterPassRate,
    improved,
    regressions,
    new_passes: newPasses,
    net_change: netChange,
    by_invocation_type: scoreAfterPassByInvocationType(perEntryResults),
    per_entry_results: perEntryResults,
  };
}
345
+
346
+ // ---------------------------------------------------------------------------
347
+ // Default export — batched is the default
348
+ // ---------------------------------------------------------------------------
349
+
350
/**
 * Validate a proposal by running trigger checks against the eval set.
 *
 * This is the default entry point. It forwards its arguments unchanged to
 * `validateProposalBatched` and returns that function's result.
 *
 * @param proposal - The evolution proposal to validate.
 * @param evalSet - Eval entries to check the proposal against.
 * @param agent - Forwarded to the batched validator.
 * @param modelFlag - Optional; forwarded to the batched validator.
 */
export async function validateProposal(
  proposal: EvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<ValidationResult> {
  return validateProposalBatched(proposal, evalSet, agent, modelFlag);
}
@@ -0,0 +1,177 @@
1
+ /**
2
+ * validate-routing.ts
3
+ *
4
+ * Validates a routing table evolution proposal by checking structural validity
5
+ * and running trigger accuracy checks against an eval set.
6
+ */
7
+
8
+ import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
9
+ import { callLlm } from "../utils/llm-call.js";
10
+ import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Structural validation
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * Check that a routing table is a markdown table with `| Trigger | Workflow |`
 * columns.
 *
 * Blank lines are ignored and each remaining line is trimmed. The checks run
 * in order and the first failure is reported:
 *  1. at least two non-blank lines;
 *  2. the header row starts and ends with `|`;
 *  3. the header mentions both "trigger" and "workflow" (case-insensitive);
 *  4. the second row is a separator: it contains `---` and consists only of
 *     `|`, `:`, `-` and whitespace, so a data row that merely contains `---`
 *     in its text is not mistaken for one;
 *  5. at least one data row follows;
 *  6. every data row starts and ends with `|`.
 *
 * @param routing - Raw routing table text.
 * @returns `valid` plus a human-readable `reason` for the outcome.
 */
export function validateRoutingStructure(routing: string): { valid: boolean; reason: string } {
  const lines = routing
    .split("\n")
    .map((l) => l.trim())
    .filter((l) => l.length > 0);

  const fail = (reason: string) => ({ valid: false, reason });
  const isPipeRow = (row: string): boolean => row.startsWith("|") && row.endsWith("|");

  if (lines.length < 2) {
    return fail("Routing table must have at least a header and one data row");
  }

  const [headerLine, separatorLine, ...dataRows] = lines;

  if (!isPipeRow(headerLine)) {
    return fail("Header row must be a markdown table row starting and ending with |");
  }

  const headerLower = headerLine.toLowerCase();
  if (!headerLower.includes("trigger") || !headerLower.includes("workflow")) {
    return fail("Header must contain 'Trigger' and 'Workflow' columns");
  }

  // A separator row carries only table punctuation, never cell text.
  if (!separatorLine.includes("---") || !/^[|:\-\s]+$/.test(separatorLine)) {
    return fail("Second row must be a markdown table separator (contains ---)");
  }

  if (dataRows.length === 0) {
    return fail("Routing table must have at least one data row");
  }

  const badRow = dataRows.findIndex((row) => !isPipeRow(row));
  if (badRow !== -1) {
    // Data rows are reported 1-based.
    return fail(`Data row ${badRow + 1} is not a valid markdown table row`);
  }

  return { valid: true, reason: "Valid markdown routing table" };
}
65
+
66
+ // ---------------------------------------------------------------------------
67
+ // Trigger accuracy validation
68
+ // ---------------------------------------------------------------------------
69
+
70
/**
 * Run before/after trigger checks on the eval set using the routing content.
 *
 * For every entry, the LLM is asked whether the query would trigger, once with
 * the original routing text and once with the proposed text. The two calls for
 * an entry are independent, so they are issued concurrently; entries are still
 * processed one at a time. An entry passes when the YES/NO answer matches its
 * `should_trigger` expectation.
 *
 * @param originalRouting - Current routing text (the "before" side).
 * @param proposedRouting - Proposed routing text (the "after" side).
 * @param evalSet - Entries to check. An empty set returns zero rates and
 *   `improved: false` without calling the LLM.
 * @param agent - Passed through to `callLlm`.
 * @param modelFlag - Optional; passed through to `callLlm`.
 * @returns Pass rates in [0, 1] for both sides; `improved` is true only when
 *   the after rate is strictly greater than the before rate.
 */
export async function validateRoutingTriggerAccuracy(
  originalRouting: string,
  proposedRouting: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{ before_pass_rate: number; after_pass_rate: number; improved: boolean }> {
  if (evalSet.length === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false };
  }

  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
  const passes = (entry: EvalEntry, triggered: boolean): boolean =>
    (entry.should_trigger && triggered) || (!entry.should_trigger && !triggered);

  let beforePassed = 0;
  let afterPassed = 0;

  for (const entry of evalSet) {
    const beforePrompt = buildTriggerCheckPrompt(originalRouting, entry.query);
    const afterPrompt = buildTriggerCheckPrompt(proposedRouting, entry.query);

    const [beforeRaw, afterRaw] = await Promise.all([
      callLlm(systemPrompt, beforePrompt, agent, modelFlag),
      callLlm(systemPrompt, afterPrompt, agent, modelFlag),
    ]);

    if (passes(entry, parseTriggerResponse(beforeRaw))) beforePassed++;
    if (passes(entry, parseTriggerResponse(afterRaw))) afterPassed++;
  }

  const total = evalSet.length;
  const beforePassRate = beforePassed / total;
  const afterPassRate = afterPassed / total;

  return {
    before_pass_rate: beforePassRate,
    after_pass_rate: afterPassRate,
    improved: afterPassRate > beforePassRate,
  };
}
118
+
119
+ // ---------------------------------------------------------------------------
120
+ // Full routing validation
121
+ // ---------------------------------------------------------------------------
122
+
123
/**
 * Validate a routing table proposal through two gates.
 *
 * Gate 1 ("structural") checks `proposal.proposed_body` with
 * `validateRoutingStructure`. If it fails, the function returns immediately
 * with `gates_passed: 0` and only that gate in `gate_results`; no LLM calls
 * are made.
 *
 * Gate 2 ("trigger_accuracy") compares `original_body` with `proposed_body`
 * using `validateRoutingTriggerAccuracy` and passes only when that reports an
 * improvement.
 *
 * @param proposal - Supplies `proposal_id`, `original_body`, `proposed_body`.
 * @param evalSet - Eval entries forwarded to the accuracy gate.
 * @param agent - Forwarded to the accuracy gate.
 * @param modelFlag - Optional; forwarded to the accuracy gate.
 * @returns Gate-by-gate results; `improved` is true only when both gates pass.
 *   `regressions` is always an empty array here.
 */
export async function validateRoutingProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<BodyValidationResult> {
  const GATES_TOTAL = 2;
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];

  // Gate 1: structural validation
  const structural = validateRoutingStructure(proposal.proposed_body);
  gateResults.push({
    gate: "structural",
    passed: structural.valid,
    reason: structural.reason,
  });

  if (!structural.valid) {
    return {
      proposal_id: proposal.proposal_id,
      gates_passed: 0,
      gates_total: GATES_TOTAL,
      gate_results: gateResults,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: trigger accuracy
  const accuracy = await validateRoutingTriggerAccuracy(
    proposal.original_body,
    proposal.proposed_body,
    evalSet,
    agent,
    modelFlag,
  );
  const asPercent = (rate: number): string => (rate * 100).toFixed(1);
  const verdict = accuracy.improved ? "Improved" : "Not improved";
  gateResults.push({
    gate: "trigger_accuracy",
    passed: accuracy.improved,
    reason: `${verdict}: ${asPercent(accuracy.before_pass_rate)}% -> ${asPercent(accuracy.after_pass_rate)}%`,
  });

  const gatesPassed = gateResults.filter((g) => g.passed).length;

  return {
    proposal_id: proposal.proposal_id,
    gates_passed: gatesPassed,
    gates_total: GATES_TOTAL,
    gate_results: gateResults,
    improved: gatesPassed === GATES_TOTAL,
    regressions: [],
  };
}