selftune 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/diagnosis-analyst.md +146 -0
- package/.claude/agents/evolution-reviewer.md +167 -0
- package/.claude/agents/integration-guide.md +200 -0
- package/.claude/agents/pattern-analyst.md +147 -0
- package/CHANGELOG.md +37 -0
- package/README.md +96 -256
- package/assets/BeforeAfter.gif +0 -0
- package/assets/FeedbackLoop.gif +0 -0
- package/assets/logo.svg +9 -0
- package/assets/skill-health-badge.svg +20 -0
- package/cli/selftune/activation-rules.ts +171 -0
- package/cli/selftune/badge/badge-data.ts +108 -0
- package/cli/selftune/badge/badge-svg.ts +212 -0
- package/cli/selftune/badge/badge.ts +103 -0
- package/cli/selftune/constants.ts +75 -1
- package/cli/selftune/contribute/bundle.ts +314 -0
- package/cli/selftune/contribute/contribute.ts +214 -0
- package/cli/selftune/contribute/sanitize.ts +162 -0
- package/cli/selftune/cron/setup.ts +266 -0
- package/cli/selftune/dashboard-server.ts +582 -0
- package/cli/selftune/dashboard.ts +25 -3
- package/cli/selftune/eval/baseline.ts +247 -0
- package/cli/selftune/eval/composability.ts +117 -0
- package/cli/selftune/eval/generate-unit-tests.ts +143 -0
- package/cli/selftune/eval/hooks-to-evals.ts +68 -2
- package/cli/selftune/eval/import-skillsbench.ts +221 -0
- package/cli/selftune/eval/synthetic-evals.ts +172 -0
- package/cli/selftune/eval/unit-test-cli.ts +152 -0
- package/cli/selftune/eval/unit-test.ts +196 -0
- package/cli/selftune/evolution/deploy-proposal.ts +142 -1
- package/cli/selftune/evolution/evolve-body.ts +492 -0
- package/cli/selftune/evolution/evolve.ts +466 -103
- package/cli/selftune/evolution/extract-patterns.ts +32 -1
- package/cli/selftune/evolution/pareto.ts +314 -0
- package/cli/selftune/evolution/propose-body.ts +171 -0
- package/cli/selftune/evolution/propose-description.ts +100 -2
- package/cli/selftune/evolution/propose-routing.ts +166 -0
- package/cli/selftune/evolution/refine-body.ts +141 -0
- package/cli/selftune/evolution/rollback.ts +19 -2
- package/cli/selftune/evolution/validate-body.ts +254 -0
- package/cli/selftune/evolution/validate-proposal.ts +257 -35
- package/cli/selftune/evolution/validate-routing.ts +177 -0
- package/cli/selftune/grading/grade-session.ts +138 -18
- package/cli/selftune/grading/pre-gates.ts +104 -0
- package/cli/selftune/hooks/auto-activate.ts +185 -0
- package/cli/selftune/hooks/evolution-guard.ts +165 -0
- package/cli/selftune/hooks/skill-change-guard.ts +112 -0
- package/cli/selftune/index.ts +88 -0
- package/cli/selftune/ingestors/claude-replay.ts +351 -0
- package/cli/selftune/ingestors/openclaw-ingest.ts +440 -0
- package/cli/selftune/init.ts +150 -3
- package/cli/selftune/memory/writer.ts +447 -0
- package/cli/selftune/monitoring/watch.ts +25 -2
- package/cli/selftune/status.ts +17 -13
- package/cli/selftune/types.ts +377 -5
- package/cli/selftune/utils/frontmatter.ts +217 -0
- package/cli/selftune/utils/llm-call.ts +29 -3
- package/cli/selftune/utils/transcript.ts +35 -0
- package/cli/selftune/utils/trigger-check.ts +89 -0
- package/cli/selftune/utils/tui.ts +156 -0
- package/dashboard/index.html +569 -8
- package/package.json +8 -4
- package/skill/SKILL.md +124 -8
- package/skill/Workflows/AutoActivation.md +144 -0
- package/skill/Workflows/Badge.md +118 -0
- package/skill/Workflows/Baseline.md +121 -0
- package/skill/Workflows/Composability.md +100 -0
- package/skill/Workflows/Contribute.md +91 -0
- package/skill/Workflows/Cron.md +155 -0
- package/skill/Workflows/Dashboard.md +203 -0
- package/skill/Workflows/Doctor.md +37 -1
- package/skill/Workflows/Evals.md +69 -1
- package/skill/Workflows/EvolutionMemory.md +152 -0
- package/skill/Workflows/Evolve.md +111 -6
- package/skill/Workflows/EvolveBody.md +159 -0
- package/skill/Workflows/ImportSkillsBench.md +111 -0
- package/skill/Workflows/Ingest.md +117 -3
- package/skill/Workflows/Initialize.md +57 -3
- package/skill/Workflows/Replay.md +70 -0
- package/skill/Workflows/Rollback.md +20 -1
- package/skill/Workflows/UnitTest.md +138 -0
- package/skill/Workflows/Watch.md +22 -0
- package/skill/settings_snippet.json +23 -0
- package/templates/activation-rules-default.json +27 -0
- package/templates/multi-skill-settings.json +64 -0
- package/templates/single-skill-settings.json +58 -0
|
@@ -6,8 +6,25 @@
|
|
|
6
6
|
* to determine whether the proposal is an improvement.
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import type { EvalEntry, EvolutionProposal } from "../types.js";
|
|
9
|
+
import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
|
|
10
10
|
import { callLlm } from "../utils/llm-call.js";
|
|
11
|
+
import {
|
|
12
|
+
buildBatchTriggerCheckPrompt,
|
|
13
|
+
buildTriggerCheckPrompt,
|
|
14
|
+
parseBatchTriggerResponse,
|
|
15
|
+
parseTriggerResponse,
|
|
16
|
+
} from "../utils/trigger-check.js";
|
|
17
|
+
|
|
18
|
+
// Re-export so existing consumers don't break
|
|
19
|
+
export { buildTriggerCheckPrompt, parseTriggerResponse };
|
|
20
|
+
|
|
21
|
+
/** Number of eval queries to batch into a single LLM call.
|
|
22
|
+
* Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
|
|
23
|
+
* Haiku handles 50+ YES/NO checks in a single call easily. */
|
|
24
|
+
export const TRIGGER_CHECK_BATCH_SIZE = 50;
|
|
25
|
+
|
|
26
|
+
/** Number of times to run each batch and majority-vote to reduce LLM variance. */
|
|
27
|
+
export const VALIDATION_RUNS = 3;
|
|
11
28
|
|
|
12
29
|
// ---------------------------------------------------------------------------
|
|
13
30
|
// Types
|
|
@@ -21,47 +38,20 @@ export interface ValidationResult {
|
|
|
21
38
|
regressions: EvalEntry[]; // passed before, fail after
|
|
22
39
|
new_passes: EvalEntry[]; // failed before, pass after
|
|
23
40
|
net_change: number; // after - before pass rate
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
// ---------------------------------------------------------------------------
|
|
27
|
-
// Prompt building
|
|
28
|
-
// ---------------------------------------------------------------------------
|
|
29
|
-
|
|
30
|
-
/** Build the trigger check prompt for the LLM. */
|
|
31
|
-
export function buildTriggerCheckPrompt(description: string, query: string): string {
|
|
32
|
-
return [
|
|
33
|
-
"Given this skill description, would the following user query trigger this skill?",
|
|
34
|
-
"Respond YES or NO only.",
|
|
35
|
-
"",
|
|
36
|
-
"Skill description:",
|
|
37
|
-
description,
|
|
38
|
-
"",
|
|
39
|
-
"User query:",
|
|
40
|
-
query,
|
|
41
|
-
].join("\n");
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
// ---------------------------------------------------------------------------
|
|
45
|
-
// Response parsing
|
|
46
|
-
// ---------------------------------------------------------------------------
|
|
47
|
-
|
|
48
|
-
/** Parse YES/NO from LLM response. */
|
|
49
|
-
export function parseTriggerResponse(response: string): boolean {
|
|
50
|
-
const normalized = response.trim().toUpperCase();
|
|
51
|
-
if (normalized.startsWith("YES")) return true;
|
|
52
|
-
if (normalized.startsWith("NO")) return false;
|
|
53
|
-
return false; // conservative default
|
|
41
|
+
by_invocation_type?: InvocationTypeScores;
|
|
42
|
+
per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
|
|
54
43
|
}
|
|
55
44
|
|
|
56
45
|
// ---------------------------------------------------------------------------
|
|
57
46
|
// Proposal validation
|
|
58
47
|
// ---------------------------------------------------------------------------
|
|
59
48
|
|
|
60
|
-
/** Validate a proposal
|
|
61
|
-
export async function
|
|
49
|
+
/** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
|
|
50
|
+
export async function validateProposalSequential(
|
|
62
51
|
proposal: EvolutionProposal,
|
|
63
52
|
evalSet: EvalEntry[],
|
|
64
53
|
agent: string,
|
|
54
|
+
modelFlag?: string,
|
|
65
55
|
): Promise<ValidationResult> {
|
|
66
56
|
if (evalSet.length === 0) {
|
|
67
57
|
return {
|
|
@@ -78,20 +68,22 @@ export async function validateProposal(
|
|
|
78
68
|
const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
|
|
79
69
|
const regressions: EvalEntry[] = [];
|
|
80
70
|
const newPasses: EvalEntry[] = [];
|
|
71
|
+
const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
|
|
72
|
+
[];
|
|
81
73
|
let beforePassed = 0;
|
|
82
74
|
let afterPassed = 0;
|
|
83
75
|
|
|
84
76
|
for (const entry of evalSet) {
|
|
85
77
|
// Check with original description
|
|
86
78
|
const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
|
|
87
|
-
const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent);
|
|
79
|
+
const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
|
|
88
80
|
const beforeTriggered = parseTriggerResponse(beforeRaw);
|
|
89
81
|
const beforePass =
|
|
90
82
|
(entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
|
|
91
83
|
|
|
92
84
|
// Check with proposed description
|
|
93
85
|
const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
|
|
94
|
-
const afterRaw = await callLlm(systemPrompt, afterPrompt, agent);
|
|
86
|
+
const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
|
|
95
87
|
const afterTriggered = parseTriggerResponse(afterRaw);
|
|
96
88
|
const afterPass =
|
|
97
89
|
(entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
|
|
@@ -108,6 +100,8 @@ export async function validateProposal(
|
|
|
108
100
|
if (!beforePass && afterPass) {
|
|
109
101
|
newPasses.push(entry);
|
|
110
102
|
}
|
|
103
|
+
|
|
104
|
+
perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
|
|
111
105
|
}
|
|
112
106
|
|
|
113
107
|
const total = evalSet.length;
|
|
@@ -124,6 +118,51 @@ export async function validateProposal(
|
|
|
124
118
|
regressions.length < total * 0.05 &&
|
|
125
119
|
(netChange >= 0.1 || newPasses.length >= 2);
|
|
126
120
|
|
|
121
|
+
// Compute per-invocation-type scores (initialize all required keys)
|
|
122
|
+
const byInvocationType: Record<string, { passed: number; total: number }> = {
|
|
123
|
+
explicit: { passed: 0, total: 0 },
|
|
124
|
+
implicit: { passed: 0, total: 0 },
|
|
125
|
+
contextual: { passed: 0, total: 0 },
|
|
126
|
+
negative: { passed: 0, total: 0 },
|
|
127
|
+
};
|
|
128
|
+
for (const r of perEntryResults) {
|
|
129
|
+
const type = r.entry.invocation_type ?? "implicit";
|
|
130
|
+
if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
|
|
131
|
+
byInvocationType[type].total++;
|
|
132
|
+
if (r.after_pass) byInvocationType[type].passed++;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
const invocationScores: InvocationTypeScores = {
|
|
136
|
+
explicit: {
|
|
137
|
+
...byInvocationType.explicit,
|
|
138
|
+
pass_rate:
|
|
139
|
+
byInvocationType.explicit.total > 0
|
|
140
|
+
? byInvocationType.explicit.passed / byInvocationType.explicit.total
|
|
141
|
+
: 0,
|
|
142
|
+
},
|
|
143
|
+
implicit: {
|
|
144
|
+
...byInvocationType.implicit,
|
|
145
|
+
pass_rate:
|
|
146
|
+
byInvocationType.implicit.total > 0
|
|
147
|
+
? byInvocationType.implicit.passed / byInvocationType.implicit.total
|
|
148
|
+
: 0,
|
|
149
|
+
},
|
|
150
|
+
contextual: {
|
|
151
|
+
...byInvocationType.contextual,
|
|
152
|
+
pass_rate:
|
|
153
|
+
byInvocationType.contextual.total > 0
|
|
154
|
+
? byInvocationType.contextual.passed / byInvocationType.contextual.total
|
|
155
|
+
: 0,
|
|
156
|
+
},
|
|
157
|
+
negative: {
|
|
158
|
+
...byInvocationType.negative,
|
|
159
|
+
pass_rate:
|
|
160
|
+
byInvocationType.negative.total > 0
|
|
161
|
+
? byInvocationType.negative.passed / byInvocationType.negative.total
|
|
162
|
+
: 0,
|
|
163
|
+
},
|
|
164
|
+
};
|
|
165
|
+
|
|
127
166
|
return {
|
|
128
167
|
proposal_id: proposal.proposal_id,
|
|
129
168
|
before_pass_rate: beforePassRate,
|
|
@@ -132,5 +171,188 @@ export async function validateProposal(
|
|
|
132
171
|
regressions,
|
|
133
172
|
new_passes: newPasses,
|
|
134
173
|
net_change: netChange,
|
|
174
|
+
by_invocation_type: invocationScores,
|
|
175
|
+
per_entry_results: perEntryResults,
|
|
135
176
|
};
|
|
136
177
|
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// Batched proposal validation
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
 * Split `arr` into consecutive groups of at most `size` elements.
 *
 * The final group holds the remainder and may be shorter than `size`.
 * An empty input yields an empty list of groups. The input is not mutated.
 *
 * @param arr - Items to partition, in order.
 * @param size - Maximum group length; must be a positive integer.
 * @returns The groups, preserving the original element order.
 * @throws RangeError if `size` is not a positive integer. A zero or negative
 *   step would never advance the cursor (infinite loop), and a fractional
 *   step would produce overlapping slices.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const chunks: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    chunks.push(arr.slice(i, i + size));
  }
  return chunks;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Majority vote for one position across several runs of boolean results.
 *
 * Returns true only when strictly more than half of the runs hold a truthy
 * value at `index`. Ties, missing positions and an empty `runs` list all
 * resolve to false.
 */
function majorityVote(runs: boolean[][], index: number): boolean {
  const yesVotes = runs.filter((votes) => votes[index]).length;
  return yesVotes > runs.length / 2;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Validate a proposal by batching trigger checks.
 *
 * The eval set is split into groups of TRIGGER_CHECK_BATCH_SIZE queries. For
 * every group, the "before" (original description) and "after" (proposed
 * description) prompts are each sent VALIDATION_RUNS times, all concurrently,
 * so one group costs 2 * VALIDATION_RUNS LLM calls. Each query's YES/NO answer
 * is then decided by majority vote across the runs.
 *
 * An entry passes when the voted trigger decision matches `should_trigger`.
 * The proposal counts as improved when the after pass rate is higher, fewer
 * than 5% of entries regressed, and either the pass rate rose by at least 0.1
 * or at least two entries newly pass.
 *
 * An empty eval set short-circuits to an all-zero, not-improved result without
 * the per-invocation-type or per-entry fields.
 */
export async function validateProposalBatched(
  proposal: EvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<ValidationResult> {
  const total = evalSet.length;
  if (total === 0) {
    return {
      proposal_id: proposal.proposal_id,
      before_pass_rate: 0,
      after_pass_rate: 0,
      improved: false,
      regressions: [],
      new_passes: [],
      net_change: 0,
    };
  }

  const systemPrompt =
    "You are an evaluation assistant. For each numbered query, respond with the number followed by YES or NO.";

  type EntryOutcome = { entry: EvalEntry; before_pass: boolean; after_pass: boolean };
  type Tally = { passed: number; total: number };

  const perEntryResults: EntryOutcome[] = [];

  for (const batch of chunk(evalSet, TRIGGER_CHECK_BATCH_SIZE)) {
    const queries = batch.map((item) => item.query);
    const prompts = [
      buildBatchTriggerCheckPrompt(proposal.original_description, queries),
      buildBatchTriggerCheckPrompt(proposal.proposed_description, queries),
    ];

    // Launch every run up front; responses alternate before/after per run.
    const pending: Promise<string>[] = [];
    for (let run = 0; run < VALIDATION_RUNS; run++) {
      for (const prompt of prompts) {
        pending.push(callLlm(systemPrompt, prompt, agent, modelFlag));
      }
    }
    const responses = await Promise.all(pending);

    // Even positions are "before" answers, odd positions are "after" answers.
    const beforeVotes: boolean[][] = [];
    const afterVotes: boolean[][] = [];
    responses.forEach((raw, position) => {
      const parsed = parseBatchTriggerResponse(raw, queries.length);
      (position % 2 === 0 ? beforeVotes : afterVotes).push(parsed);
    });

    batch.forEach((entry, i) => {
      const beforeTriggered = majorityVote(beforeVotes, i);
      const afterTriggered = majorityVote(afterVotes, i);
      perEntryResults.push({
        entry,
        before_pass: entry.should_trigger ? beforeTriggered : !beforeTriggered,
        after_pass: entry.should_trigger ? afterTriggered : !afterTriggered,
      });
    });
  }

  // Aggregate counters are derived from the per-entry outcomes.
  const beforePassed = perEntryResults.filter((r) => r.before_pass).length;
  const afterPassed = perEntryResults.filter((r) => r.after_pass).length;
  const regressions = perEntryResults
    .filter((r) => r.before_pass && !r.after_pass)
    .map((r) => r.entry);
  const newPasses = perEntryResults
    .filter((r) => !r.before_pass && r.after_pass)
    .map((r) => r.entry);

  const beforePassRate = beforePassed / total;
  const afterPassRate = afterPassed / total;
  const netChange = afterPassRate - beforePassRate;

  const improved =
    afterPassRate > beforePassRate &&
    regressions.length < total * 0.05 &&
    (netChange >= 0.1 || newPasses.length >= 2);

  // Per-invocation-type tallies of "after" passes; the four reported keys are
  // seeded so they exist even when no entry has that type.
  const tallies: Record<string, Tally> = {
    explicit: { passed: 0, total: 0 },
    implicit: { passed: 0, total: 0 },
    contextual: { passed: 0, total: 0 },
    negative: { passed: 0, total: 0 },
  };
  for (const outcome of perEntryResults) {
    // Entries without an invocation type are counted as "implicit".
    const key = outcome.entry.invocation_type ?? "implicit";
    if (!tallies[key]) tallies[key] = { passed: 0, total: 0 };
    tallies[key].total++;
    if (outcome.after_pass) tallies[key].passed++;
  }

  // Attach a pass rate, reporting 0 for a type with no entries.
  const withRate = (tally: Tally) => ({
    ...tally,
    pass_rate: tally.total > 0 ? tally.passed / tally.total : 0,
  });

  const invocationScores: InvocationTypeScores = {
    explicit: withRate(tallies.explicit),
    implicit: withRate(tallies.implicit),
    contextual: withRate(tallies.contextual),
    negative: withRate(tallies.negative),
  };

  return {
    proposal_id: proposal.proposal_id,
    before_pass_rate: beforePassRate,
    after_pass_rate: afterPassRate,
    improved,
    regressions,
    new_passes: newPasses,
    net_change: netChange,
    by_invocation_type: invocationScores,
    per_entry_results: perEntryResults,
  };
}
|
|
345
|
+
|
|
346
|
+
// ---------------------------------------------------------------------------
|
|
347
|
+
// Default export — batched is the default
|
|
348
|
+
// ---------------------------------------------------------------------------
|
|
349
|
+
|
|
350
|
+
/**
 * Default entry point for proposal validation.
 *
 * Delegates to the batched implementation with the arguments unchanged and
 * resolves to its result.
 */
export async function validateProposal(
  proposal: EvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<ValidationResult> {
  const outcome = await validateProposalBatched(proposal, evalSet, agent, modelFlag);
  return outcome;
}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* validate-routing.ts
|
|
3
|
+
*
|
|
4
|
+
* Validates a routing table evolution proposal by checking structural validity
|
|
5
|
+
* and running trigger accuracy checks against an eval set.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
|
|
9
|
+
import { callLlm } from "../utils/llm-call.js";
|
|
10
|
+
import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Structural validation
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
 * Check that a routing table is valid markdown table syntax with
 * `| Trigger | Workflow |` columns.
 *
 * Blank lines are ignored. The checks run in order and the first failure is
 * reported:
 *  1. at least two non-blank lines;
 *  2. the header row starts and ends with `|`;
 *  3. the header mentions both "trigger" and "workflow" (case-insensitive);
 *  4. the second row is a delimiter row: it contains `---` and every cell is
 *     made only of dashes with optional alignment colons;
 *  5. at least one data row follows;
 *  6. every data row starts and ends with `|`.
 *
 * @param routing - Raw markdown text of the routing table.
 * @returns `valid` plus a human-readable `reason` for the verdict.
 */
export function validateRoutingStructure(routing: string): { valid: boolean; reason: string } {
  const lines = routing
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  const isPipeRow = (row: string): boolean => row.startsWith("|") && row.endsWith("|");

  if (lines.length < 2) {
    return { valid: false, reason: "Routing table must have at least a header and one data row" };
  }

  // Check header row contains Trigger and Workflow columns
  const headerLine = lines[0];
  if (!isPipeRow(headerLine)) {
    return {
      valid: false,
      reason: "Header row must be a markdown table row starting and ending with |",
    };
  }

  const headerLower = headerLine.toLowerCase();
  if (!headerLower.includes("trigger") || !headerLower.includes("workflow")) {
    return { valid: false, reason: "Header must contain 'Trigger' and 'Workflow' columns" };
  }

  // Check separator row (line 2). Merely containing "---" is not enough: a
  // data row such as "| use --- here | X |" would otherwise be mistaken for
  // the delimiter, so every cell must consist of dashes (optionally wrapped
  // in alignment colons).
  const separatorLine = lines[1];
  const separatorCells = separatorLine
    .replace(/^\|/, "")
    .replace(/\|$/, "")
    .split("|")
    .map((cell) => cell.trim());
  const isSeparator =
    separatorLine.includes("---") && separatorCells.every((cell) => /^:?-+:?$/.test(cell));
  if (!isSeparator) {
    return { valid: false, reason: "Second row must be a markdown table separator (contains ---)" };
  }

  // Check at least one data row
  if (lines.length < 3) {
    return { valid: false, reason: "Routing table must have at least one data row" };
  }

  // Check data rows are pipe-delimited (row numbers in messages are 1-based)
  for (let i = 2; i < lines.length; i++) {
    if (!isPipeRow(lines[i])) {
      return { valid: false, reason: `Data row ${i - 1} is not a valid markdown table row` };
    }
  }

  return { valid: true, reason: "Valid markdown routing table" };
}
|
|
65
|
+
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Trigger accuracy validation
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
/**
 * Measure trigger accuracy of the original and proposed routing content.
 *
 * For each eval entry, the LLM is asked (one call at a time, original first,
 * then proposed) whether the query would trigger given that routing text. An
 * entry passes when the parsed answer matches `should_trigger`.
 *
 * @returns Pass rates for both routings; `improved` is true only when the
 *   proposed rate is strictly higher. An empty eval set yields zero rates
 *   and `improved: false`.
 */
export async function validateRoutingTriggerAccuracy(
  originalRouting: string,
  proposedRouting: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{ before_pass_rate: number; after_pass_rate: number; improved: boolean }> {
  const total = evalSet.length;
  if (total === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false };
  }

  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";

  // Ask the LLM about one routing text and compare against the expectation.
  const entryPasses = async (routing: string, entry: EvalEntry): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(routing, entry.query);
    const raw = await callLlm(systemPrompt, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(raw);
    return entry.should_trigger ? triggered : !triggered;
  };

  const passed = { before: 0, after: 0 };
  for (const entry of evalSet) {
    const passedBefore = await entryPasses(originalRouting, entry);
    const passedAfter = await entryPasses(proposedRouting, entry);
    if (passedBefore) passed.before += 1;
    if (passedAfter) passed.after += 1;
  }

  const beforeRate = passed.before / total;
  const afterRate = passed.after / total;

  return {
    before_pass_rate: beforeRate,
    after_pass_rate: afterRate,
    improved: afterRate > beforeRate,
  };
}
|
|
118
|
+
|
|
119
|
+
// ---------------------------------------------------------------------------
|
|
120
|
+
// Full routing validation
|
|
121
|
+
// ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
/**
 * Run a routing table proposal through two gates and report the outcome.
 *
 * Gate 1 ("structural") checks the proposed body's markdown table structure;
 * if it fails, the result is returned immediately with zero gates passed and
 * no LLM calls. Gate 2 ("trigger_accuracy") passes when the proposed body
 * scores a strictly higher pass rate than the original on the eval set.
 * `improved` is true only when both gates pass. `regressions` is always empty.
 */
export async function validateRoutingProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<BodyValidationResult> {
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];

  // Gate 1: Structural validation
  const { valid: structureOk, reason: structureReason } = validateRoutingStructure(
    proposal.proposed_body,
  );
  gateResults.push({ gate: "structural", passed: structureOk, reason: structureReason });

  if (!structureOk) {
    return {
      proposal_id: proposal.proposal_id,
      gates_passed: 0,
      gates_total: 2,
      gate_results: gateResults,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: Trigger accuracy
  const accuracy = await validateRoutingTriggerAccuracy(
    proposal.original_body,
    proposal.proposed_body,
    evalSet,
    agent,
    modelFlag,
  );
  const beforePct = (accuracy.before_pass_rate * 100).toFixed(1);
  const afterPct = (accuracy.after_pass_rate * 100).toFixed(1);
  gateResults.push({
    gate: "trigger_accuracy",
    passed: accuracy.improved,
    reason: accuracy.improved
      ? `Improved: ${beforePct}% -> ${afterPct}%`
      : `Not improved: ${beforePct}% -> ${afterPct}%`,
  });

  const gatesPassed = gateResults.reduce((count, gate) => (gate.passed ? count + 1 : count), 0);

  return {
    proposal_id: proposal.proposal_id,
    gates_passed: gatesPassed,
    gates_total: 2,
    gate_results: gateResults,
    improved: gatesPassed === 2,
    regressions: [],
  };
}
|