selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/.claude/agents/diagnosis-analyst.md +156 -0
  2. package/.claude/agents/evolution-reviewer.md +180 -0
  3. package/.claude/agents/integration-guide.md +212 -0
  4. package/.claude/agents/pattern-analyst.md +160 -0
  5. package/CHANGELOG.md +46 -1
  6. package/README.md +105 -257
  7. package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
  8. package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
  9. package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
  10. package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
  11. package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
  12. package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
  13. package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
  14. package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
  15. package/apps/local-dashboard/dist/favicon.png +0 -0
  16. package/apps/local-dashboard/dist/index.html +17 -0
  17. package/apps/local-dashboard/dist/logo.png +0 -0
  18. package/apps/local-dashboard/dist/logo.svg +9 -0
  19. package/assets/BeforeAfter.gif +0 -0
  20. package/assets/FeedbackLoop.gif +0 -0
  21. package/assets/logo.svg +9 -0
  22. package/assets/skill-health-badge.svg +20 -0
  23. package/cli/selftune/activation-rules.ts +171 -0
  24. package/cli/selftune/badge/badge-data.ts +108 -0
  25. package/cli/selftune/badge/badge-svg.ts +212 -0
  26. package/cli/selftune/badge/badge.ts +99 -0
  27. package/cli/selftune/canonical-export.ts +183 -0
  28. package/cli/selftune/constants.ts +103 -1
  29. package/cli/selftune/contribute/bundle.ts +314 -0
  30. package/cli/selftune/contribute/contribute.ts +214 -0
  31. package/cli/selftune/contribute/sanitize.ts +162 -0
  32. package/cli/selftune/cron/setup.ts +266 -0
  33. package/cli/selftune/dashboard-contract.ts +202 -0
  34. package/cli/selftune/dashboard-server.ts +1049 -0
  35. package/cli/selftune/dashboard.ts +43 -156
  36. package/cli/selftune/eval/baseline.ts +248 -0
  37. package/cli/selftune/eval/composability-v2.ts +273 -0
  38. package/cli/selftune/eval/composability.ts +117 -0
  39. package/cli/selftune/eval/generate-unit-tests.ts +143 -0
  40. package/cli/selftune/eval/hooks-to-evals.ts +101 -16
  41. package/cli/selftune/eval/import-skillsbench.ts +221 -0
  42. package/cli/selftune/eval/synthetic-evals.ts +172 -0
  43. package/cli/selftune/eval/unit-test-cli.ts +152 -0
  44. package/cli/selftune/eval/unit-test.ts +196 -0
  45. package/cli/selftune/evolution/deploy-proposal.ts +142 -1
  46. package/cli/selftune/evolution/evidence.ts +26 -0
  47. package/cli/selftune/evolution/evolve-body.ts +586 -0
  48. package/cli/selftune/evolution/evolve.ts +825 -116
  49. package/cli/selftune/evolution/extract-patterns.ts +105 -16
  50. package/cli/selftune/evolution/pareto.ts +314 -0
  51. package/cli/selftune/evolution/propose-body.ts +171 -0
  52. package/cli/selftune/evolution/propose-description.ts +100 -2
  53. package/cli/selftune/evolution/propose-routing.ts +166 -0
  54. package/cli/selftune/evolution/refine-body.ts +141 -0
  55. package/cli/selftune/evolution/rollback.ts +21 -4
  56. package/cli/selftune/evolution/validate-body.ts +254 -0
  57. package/cli/selftune/evolution/validate-proposal.ts +257 -35
  58. package/cli/selftune/evolution/validate-routing.ts +177 -0
  59. package/cli/selftune/grading/auto-grade.ts +200 -0
  60. package/cli/selftune/grading/grade-session.ts +513 -42
  61. package/cli/selftune/grading/pre-gates.ts +104 -0
  62. package/cli/selftune/grading/results.ts +42 -0
  63. package/cli/selftune/hooks/auto-activate.ts +185 -0
  64. package/cli/selftune/hooks/evolution-guard.ts +165 -0
  65. package/cli/selftune/hooks/prompt-log.ts +172 -2
  66. package/cli/selftune/hooks/session-stop.ts +123 -3
  67. package/cli/selftune/hooks/skill-change-guard.ts +112 -0
  68. package/cli/selftune/hooks/skill-eval.ts +119 -3
  69. package/cli/selftune/index.ts +415 -48
  70. package/cli/selftune/ingestors/claude-replay.ts +377 -0
  71. package/cli/selftune/ingestors/codex-rollout.ts +345 -46
  72. package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
  73. package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
  74. package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
  75. package/cli/selftune/init.ts +376 -16
  76. package/cli/selftune/last.ts +14 -5
  77. package/cli/selftune/localdb/db.ts +63 -0
  78. package/cli/selftune/localdb/materialize.ts +428 -0
  79. package/cli/selftune/localdb/queries.ts +376 -0
  80. package/cli/selftune/localdb/schema.ts +204 -0
  81. package/cli/selftune/memory/writer.ts +447 -0
  82. package/cli/selftune/monitoring/watch.ts +90 -16
  83. package/cli/selftune/normalization.ts +682 -0
  84. package/cli/selftune/observability.ts +19 -44
  85. package/cli/selftune/orchestrate.ts +1073 -0
  86. package/cli/selftune/quickstart.ts +203 -0
  87. package/cli/selftune/repair/skill-usage.ts +576 -0
  88. package/cli/selftune/schedule.ts +561 -0
  89. package/cli/selftune/status.ts +59 -33
  90. package/cli/selftune/sync.ts +627 -0
  91. package/cli/selftune/types.ts +525 -5
  92. package/cli/selftune/utils/canonical-log.ts +45 -0
  93. package/cli/selftune/utils/frontmatter.ts +217 -0
  94. package/cli/selftune/utils/hooks.ts +41 -0
  95. package/cli/selftune/utils/html.ts +27 -0
  96. package/cli/selftune/utils/llm-call.ts +103 -19
  97. package/cli/selftune/utils/math.ts +10 -0
  98. package/cli/selftune/utils/query-filter.ts +139 -0
  99. package/cli/selftune/utils/skill-discovery.ts +340 -0
  100. package/cli/selftune/utils/skill-log.ts +68 -0
  101. package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
  102. package/cli/selftune/utils/transcript.ts +307 -26
  103. package/cli/selftune/utils/trigger-check.ts +89 -0
  104. package/cli/selftune/utils/tui.ts +156 -0
  105. package/cli/selftune/workflows/discover.ts +254 -0
  106. package/cli/selftune/workflows/skill-md-writer.ts +288 -0
  107. package/cli/selftune/workflows/workflows.ts +188 -0
  108. package/package.json +28 -11
  109. package/packages/telemetry-contract/README.md +11 -0
  110. package/packages/telemetry-contract/fixtures/golden.json +87 -0
  111. package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
  112. package/packages/telemetry-contract/index.ts +1 -0
  113. package/packages/telemetry-contract/package.json +19 -0
  114. package/packages/telemetry-contract/src/index.ts +2 -0
  115. package/packages/telemetry-contract/src/types.ts +163 -0
  116. package/packages/telemetry-contract/src/validators.ts +109 -0
  117. package/skill/SKILL.md +180 -33
  118. package/skill/Workflows/AutoActivation.md +145 -0
  119. package/skill/Workflows/Badge.md +124 -0
  120. package/skill/Workflows/Baseline.md +144 -0
  121. package/skill/Workflows/Composability.md +107 -0
  122. package/skill/Workflows/Contribute.md +94 -0
  123. package/skill/Workflows/Cron.md +132 -0
  124. package/skill/Workflows/Dashboard.md +214 -0
  125. package/skill/Workflows/Doctor.md +63 -14
  126. package/skill/Workflows/Evals.md +110 -18
  127. package/skill/Workflows/EvolutionMemory.md +154 -0
  128. package/skill/Workflows/Evolve.md +181 -21
  129. package/skill/Workflows/EvolveBody.md +159 -0
  130. package/skill/Workflows/Grade.md +36 -31
  131. package/skill/Workflows/ImportSkillsBench.md +117 -0
  132. package/skill/Workflows/Ingest.md +142 -21
  133. package/skill/Workflows/Initialize.md +91 -23
  134. package/skill/Workflows/Orchestrate.md +139 -0
  135. package/skill/Workflows/Replay.md +91 -0
  136. package/skill/Workflows/Rollback.md +23 -4
  137. package/skill/Workflows/Schedule.md +61 -0
  138. package/skill/Workflows/Sync.md +88 -0
  139. package/skill/Workflows/UnitTest.md +150 -0
  140. package/skill/Workflows/Watch.md +33 -1
  141. package/skill/Workflows/Workflows.md +129 -0
  142. package/skill/assets/activation-rules-default.json +26 -0
  143. package/skill/assets/multi-skill-settings.json +63 -0
  144. package/skill/assets/single-skill-settings.json +57 -0
  145. package/skill/references/invocation-taxonomy.md +2 -2
  146. package/skill/references/logs.md +164 -2
  147. package/skill/references/setup-patterns.md +65 -0
  148. package/skill/references/version-history.md +40 -0
  149. package/skill/settings_snippet.json +23 -0
  150. package/templates/activation-rules-default.json +27 -0
  151. package/templates/multi-skill-settings.json +64 -0
  152. package/templates/single-skill-settings.json +58 -0
  153. package/dashboard/index.html +0 -1119
@@ -0,0 +1,254 @@
1
+ /**
2
+ * validate-body.ts
3
+ *
4
+ * 3-gate validation for full body evolution proposals:
5
+ * Gate 1 (structural): Pure code — body is non-empty and keeps a ## Workflow Routing section containing a markdown table
6
+ * Gate 2 (trigger accuracy): Student model YES/NO per eval entry
7
+ * Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
8
+ */
9
+
10
+ import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
11
+ import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
12
+ import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
13
+
14
+ // ---------------------------------------------------------------------------
15
+ // Gate 1: Structural validation (pure code, no LLM)
16
+ // ---------------------------------------------------------------------------
17
+
18
/**
 * Gate 1: check that a proposed body preserves the required structural elements.
 *
 * Checks, in order:
 * - the body is non-empty (ignoring whitespace)
 * - it contains a level-2 `## Workflow Routing` heading at the start of a line
 * - that section (up to the next `## ` heading or end of text) holds a markdown
 *   table with a header row, a separator row, and at least one data row
 *
 * @param proposedBody - Full text of the proposed skill body.
 * @returns `valid` plus a human-readable `reason` describing the outcome.
 */
export function validateBodyStructure(proposedBody: string): { valid: boolean; reason: string } {
  if (!proposedBody || proposedBody.trim().length === 0) {
    return { valid: false, reason: "Proposed body is empty" };
  }

  // Anchor to the start of a line (allowing markdown's up-to-3-space indent) so that
  // "### Workflow Routing" or a mention of the heading inside prose does not count.
  const heading = /^ {0,3}## Workflow Routing\b/m.exec(proposedBody);
  if (!heading) {
    return { valid: false, reason: "Missing required '## Workflow Routing' section" };
  }

  // The routing section runs from the heading to the next level-2 heading (or EOF).
  const afterRouting = proposedBody.slice(heading.index + heading[0].length);
  const nextSection = /\n## /.exec(afterRouting);
  const routingContent = nextSection ? afterRouting.slice(0, nextSection.index) : afterRouting;

  // Candidate table rows: lines that both start and end with a pipe.
  const tableLines = routingContent
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.startsWith("|") && line.endsWith("|"));

  // A separator row looks like |---|:---:|---| (dashes with optional alignment colons).
  const separatorRow = /^\|(?:\s*:?-+:?\s*\|)+$/;
  const separatorIdx = tableLines.findIndex((line) => separatorRow.test(line));
  const hasHeader = separatorIdx >= 1;
  const hasDataRow = separatorIdx >= 0 && separatorIdx < tableLines.length - 1;

  if (!hasHeader || !hasDataRow) {
    return {
      valid: false,
      reason:
        "Workflow Routing section lacks a valid markdown table (need header + separator + rows)",
    };
  }

  return { valid: true, reason: "Structural validation passed" };
}
58
+
59
+ // ---------------------------------------------------------------------------
60
+ // Gate 2: Trigger accuracy (student model YES/NO)
61
+ // ---------------------------------------------------------------------------
62
+
63
/**
 * Gate 2: score the original and proposed bodies against the eval set.
 *
 * For every eval entry the model is asked twice (original body first, then the
 * proposed body) whether the query would trigger the skill. An entry "passes"
 * when the parsed answer matches `entry.should_trigger`.
 *
 * @param originalBody - Body text currently in use.
 * @param proposedBody - Candidate replacement body.
 * @param evalSet - Entries providing `query` and `should_trigger`.
 * @param agent - Passed through to `callLlm`.
 * @param modelFlag - Optional value passed through to `callLlm`.
 * @returns Pass rates before/after, whether the after rate is strictly higher,
 *   and the queries that passed before but fail after. An empty eval set yields
 *   zero rates and `improved: false`.
 */
export async function validateBodyTriggerAccuracy(
  originalBody: string,
  proposedBody: string,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<{
  before_pass_rate: number;
  after_pass_rate: number;
  improved: boolean;
  regressions: string[];
}> {
  if (evalSet.length === 0) {
    return { before_pass_rate: 0, after_pass_rate: 0, improved: false, regressions: [] };
  }

  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";

  // Ask the model about one body/entry pair and compare the answer with the expectation.
  const passesWith = async (body: string, entry: EvalEntry): Promise<boolean> => {
    const prompt = buildTriggerCheckPrompt(body, entry.query);
    const raw = await callLlm(systemPrompt, prompt, agent, modelFlag);
    const triggered = parseTriggerResponse(raw);
    return entry.should_trigger ? triggered : !triggered;
  };

  const tally = { before: 0, after: 0 };
  const regressions: string[] = [];

  for (const entry of evalSet) {
    // Calls stay strictly sequential: original body first, then the proposal.
    const passedBefore = await passesWith(originalBody, entry);
    const passedAfter = await passesWith(proposedBody, entry);

    if (passedBefore) tally.before += 1;
    if (passedAfter) tally.after += 1;

    // A regression is an entry the original handled correctly but the proposal does not.
    if (passedBefore && !passedAfter) {
      regressions.push(entry.query);
    }
  }

  const total = evalSet.length;
  const before_pass_rate = tally.before / total;
  const after_pass_rate = tally.after / total;

  return {
    before_pass_rate,
    after_pass_rate,
    improved: after_pass_rate > before_pass_rate,
    regressions,
  };
}
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // Gate 3: Quality assessment (student model 0.0-1.0)
126
+ // ---------------------------------------------------------------------------
127
+
128
/**
 * System prompt for Gate 3. Instructs the model to rate a skill body and reply
 * with a JSON object holding `score` and `reason`.
 */
const QUALITY_ASSESSMENT_SYSTEM = `You are a skill document quality assessor for an AI agent system.

Rate the quality of the provided skill document body on these dimensions:
- Clarity: Is the description clear and unambiguous?
- Completeness: Does it cover the expected use cases?
- Structure: Is it well-organized with proper sections?
- Routing accuracy: Does the routing table seem comprehensive?

Output ONLY valid JSON with exactly these fields:
- "score" (number): Overall quality score 0.0-1.0
- "reason" (string): Brief explanation of the score

Do NOT include any text outside the JSON object.`;

/**
 * Gate 3: ask the model to rate a proposed body.
 *
 * The response is passed through `stripMarkdownFences` and parsed as JSON.
 * Anything unusable degrades to a neutral result instead of throwing:
 * - unparseable or non-object response -> score 0.5 with an explanatory reason
 * - non-numeric `score` -> 0.5; numeric scores are clamped into [0, 1]
 * - non-string `reason` -> "No reason provided"
 *
 * @param proposedBody - Candidate body text to be rated.
 * @param skillName - Skill name included in the prompt.
 * @param agent - Passed through to `callLlm`.
 * @param modelFlag - Optional value passed through to `callLlm`.
 * @returns The clamped score and the model's reason.
 */
export async function assessBodyQuality(
  proposedBody: string,
  skillName: string,
  agent: string,
  modelFlag?: string,
): Promise<{ score: number; reason: string }> {
  const NEUTRAL_SCORE = 0.5;
  const neutral = (reason: string): { score: number; reason: string } => ({
    score: NEUTRAL_SCORE,
    reason,
  });

  const userPrompt = `Skill Name: ${skillName}

Proposed Skill Body:
${proposedBody}

Rate the quality of this skill document body. Output ONLY a JSON object with "score" (0.0-1.0) and "reason" fields.`;

  const rawResponse = await callLlm(QUALITY_ASSESSMENT_SYSTEM, userPrompt, agent, modelFlag);
  const cleaned = stripMarkdownFences(rawResponse);

  let parsed: unknown;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    // Malformed JSON: fall back to the neutral score rather than failing the run.
    return neutral("Failed to parse quality assessment response");
  }

  if (parsed === null || typeof parsed !== "object") {
    return neutral("Quality assessment response is not a JSON object");
  }

  const { score: rawScore, reason: rawReason } = parsed as Record<string, unknown>;

  const score =
    typeof rawScore === "number" ? Math.max(0.0, Math.min(1.0, rawScore)) : NEUTRAL_SCORE;
  const reason = typeof rawReason === "string" ? rawReason : "No reason provided";

  return { score, reason };
}
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Full 3-gate body validation
181
+ // ---------------------------------------------------------------------------
182
+
183
/** Default minimum quality score (inclusive) required to pass Gate 3. */
const QUALITY_THRESHOLD = 0.6;

/**
 * Run a body proposal through all three gates and summarise the outcome.
 *
 * Gate 1 (structural) short-circuits: when it fails, no model calls are made and
 * the result reports zero gates passed. Otherwise Gate 2 (trigger accuracy) and
 * Gate 3 (quality) are both evaluated, in that order, regardless of Gate 2's result.
 *
 * @param proposal - Supplies `proposal_id`, `skill_name`, `original_body`, `proposed_body`.
 * @param evalSet - Eval entries used by Gate 2.
 * @param agent - Passed through to the model-backed gates.
 * @param modelFlag - Optional value passed through to the model-backed gates.
 * @param qualityThreshold - Minimum Gate 3 score (inclusive); defaults to `QUALITY_THRESHOLD`.
 * @returns Per-gate results; `improved` is true only when all three gates pass.
 */
export async function validateBodyProposal(
  proposal: BodyEvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
  qualityThreshold = QUALITY_THRESHOLD,
): Promise<BodyValidationResult> {
  const GATES_TOTAL = 3;
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
  const record = (gate: string, passed: boolean, reason: string): void => {
    gateResults.push({ gate, passed, reason });
  };
  const asPercent = (rate: number): string => (rate * 100).toFixed(1);

  // Gate 1: structural validation (pure code)
  const { valid: structureOk, reason: structureReason } = validateBodyStructure(
    proposal.proposed_body,
  );
  record("structural", structureOk, structureReason);

  if (!structureOk) {
    // A structurally broken body is rejected before spending any model calls.
    return {
      proposal_id: proposal.proposal_id,
      gates_passed: 0,
      gates_total: GATES_TOTAL,
      gate_results: gateResults,
      improved: false,
      regressions: [],
    };
  }

  // Gate 2: trigger accuracy (student model)
  const accuracy = await validateBodyTriggerAccuracy(
    proposal.original_body,
    proposal.proposed_body,
    evalSet,
    agent,
    modelFlag,
  );
  const verdict = accuracy.improved ? "Improved" : "Not improved";
  record(
    "trigger_accuracy",
    accuracy.improved,
    `${verdict}: ${asPercent(accuracy.before_pass_rate)}% -> ${asPercent(accuracy.after_pass_rate)}%`,
  );

  // Gate 3: quality assessment (student model)
  const quality = await assessBodyQuality(
    proposal.proposed_body,
    proposal.skill_name,
    agent,
    modelFlag,
  );
  record(
    "quality",
    quality.score >= qualityThreshold,
    `Quality score: ${quality.score.toFixed(2)} (threshold: ${qualityThreshold}) — ${quality.reason}`,
  );

  const gatesPassed = gateResults.reduce((count, g) => (g.passed ? count + 1 : count), 0);

  return {
    proposal_id: proposal.proposal_id,
    gates_passed: gatesPassed,
    gates_total: GATES_TOTAL,
    gate_results: gateResults,
    improved: gatesPassed === GATES_TOTAL,
    regressions: accuracy.regressions,
  };
}
@@ -6,8 +6,25 @@
6
6
  * to determine whether the proposal is an improvement.
7
7
  */
8
8
 
9
- import type { EvalEntry, EvolutionProposal } from "../types.js";
9
+ import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
10
10
  import { callLlm } from "../utils/llm-call.js";
11
+ import {
12
+ buildBatchTriggerCheckPrompt,
13
+ buildTriggerCheckPrompt,
14
+ parseBatchTriggerResponse,
15
+ parseTriggerResponse,
16
+ } from "../utils/trigger-check.js";
17
+
18
+ // Re-export so existing consumers don't break
19
+ export { buildTriggerCheckPrompt, parseTriggerResponse };
20
+
21
/**
 * Number of eval queries to batch into a single LLM call.
 *
 * Higher = fewer `claude -p` spawns = much faster (each spawn has ~30-60s overhead).
 * Haiku handles 50+ YES/NO checks in a single call easily.
 */
export const TRIGGER_CHECK_BATCH_SIZE = 50;

/**
 * Number of times each batch is run; the per-query answers are then
 * majority-voted to reduce LLM variance.
 */
export const VALIDATION_RUNS = 3;
11
28
 
12
29
  // ---------------------------------------------------------------------------
13
30
  // Types
@@ -21,47 +38,20 @@ export interface ValidationResult {
21
38
  regressions: EvalEntry[]; // passed before, fail after
22
39
  new_passes: EvalEntry[]; // failed before, pass after
23
40
  net_change: number; // after - before pass rate
24
- }
25
-
26
- // ---------------------------------------------------------------------------
27
- // Prompt building
28
- // ---------------------------------------------------------------------------
29
-
30
- /** Build the trigger check prompt for the LLM. */
31
- export function buildTriggerCheckPrompt(description: string, query: string): string {
32
- return [
33
- "Given this skill description, would the following user query trigger this skill?",
34
- "Respond YES or NO only.",
35
- "",
36
- "Skill description:",
37
- description,
38
- "",
39
- "User query:",
40
- query,
41
- ].join("\n");
42
- }
43
-
44
- // ---------------------------------------------------------------------------
45
- // Response parsing
46
- // ---------------------------------------------------------------------------
47
-
48
- /** Parse YES/NO from LLM response. */
49
- export function parseTriggerResponse(response: string): boolean {
50
- const normalized = response.trim().toUpperCase();
51
- if (normalized.startsWith("YES")) return true;
52
- if (normalized.startsWith("NO")) return false;
53
- return false; // conservative default
41
+ by_invocation_type?: InvocationTypeScores;
42
+ per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
54
43
  }
55
44
 
56
45
  // ---------------------------------------------------------------------------
57
46
  // Proposal validation
58
47
  // ---------------------------------------------------------------------------
59
48
 
60
- /** Validate a proposal by running trigger checks against the eval set. */
61
- export async function validateProposal(
49
+ /** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
50
+ export async function validateProposalSequential(
62
51
  proposal: EvolutionProposal,
63
52
  evalSet: EvalEntry[],
64
53
  agent: string,
54
+ modelFlag?: string,
65
55
  ): Promise<ValidationResult> {
66
56
  if (evalSet.length === 0) {
67
57
  return {
@@ -78,20 +68,22 @@ export async function validateProposal(
78
68
  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
79
69
  const regressions: EvalEntry[] = [];
80
70
  const newPasses: EvalEntry[] = [];
71
+ const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
72
+ [];
81
73
  let beforePassed = 0;
82
74
  let afterPassed = 0;
83
75
 
84
76
  for (const entry of evalSet) {
85
77
  // Check with original description
86
78
  const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
87
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent);
79
+ const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
88
80
  const beforeTriggered = parseTriggerResponse(beforeRaw);
89
81
  const beforePass =
90
82
  (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
91
83
 
92
84
  // Check with proposed description
93
85
  const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
94
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent);
86
+ const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
95
87
  const afterTriggered = parseTriggerResponse(afterRaw);
96
88
  const afterPass =
97
89
  (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
@@ -108,6 +100,8 @@ export async function validateProposal(
108
100
  if (!beforePass && afterPass) {
109
101
  newPasses.push(entry);
110
102
  }
103
+
104
+ perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
111
105
  }
112
106
 
113
107
  const total = evalSet.length;
@@ -124,6 +118,51 @@ export async function validateProposal(
124
118
  regressions.length < total * 0.05 &&
125
119
  (netChange >= 0.1 || newPasses.length >= 2);
126
120
 
121
+ // Compute per-invocation-type scores (initialize all required keys)
122
+ const byInvocationType: Record<string, { passed: number; total: number }> = {
123
+ explicit: { passed: 0, total: 0 },
124
+ implicit: { passed: 0, total: 0 },
125
+ contextual: { passed: 0, total: 0 },
126
+ negative: { passed: 0, total: 0 },
127
+ };
128
+ for (const r of perEntryResults) {
129
+ const type = r.entry.invocation_type ?? "implicit";
130
+ if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
131
+ byInvocationType[type].total++;
132
+ if (r.after_pass) byInvocationType[type].passed++;
133
+ }
134
+
135
+ const invocationScores: InvocationTypeScores = {
136
+ explicit: {
137
+ ...byInvocationType.explicit,
138
+ pass_rate:
139
+ byInvocationType.explicit.total > 0
140
+ ? byInvocationType.explicit.passed / byInvocationType.explicit.total
141
+ : 0,
142
+ },
143
+ implicit: {
144
+ ...byInvocationType.implicit,
145
+ pass_rate:
146
+ byInvocationType.implicit.total > 0
147
+ ? byInvocationType.implicit.passed / byInvocationType.implicit.total
148
+ : 0,
149
+ },
150
+ contextual: {
151
+ ...byInvocationType.contextual,
152
+ pass_rate:
153
+ byInvocationType.contextual.total > 0
154
+ ? byInvocationType.contextual.passed / byInvocationType.contextual.total
155
+ : 0,
156
+ },
157
+ negative: {
158
+ ...byInvocationType.negative,
159
+ pass_rate:
160
+ byInvocationType.negative.total > 0
161
+ ? byInvocationType.negative.passed / byInvocationType.negative.total
162
+ : 0,
163
+ },
164
+ };
165
+
127
166
  return {
128
167
  proposal_id: proposal.proposal_id,
129
168
  before_pass_rate: beforePassRate,
@@ -132,5 +171,188 @@ export async function validateProposal(
132
171
  regressions,
133
172
  new_passes: newPasses,
134
173
  net_change: netChange,
174
+ by_invocation_type: invocationScores,
175
+ per_entry_results: perEntryResults,
135
176
  };
136
177
  }
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Batched proposal validation
181
+ // ---------------------------------------------------------------------------
182
+
183
/**
 * Split an array into consecutive groups of at most `size` elements.
 *
 * The last group may be shorter; an empty input yields an empty result.
 * The input array is not modified.
 *
 * @param arr - Items to split.
 * @param size - Maximum group length; must be a positive integer.
 * @returns The groups, in input order.
 * @throws RangeError when `size` is not a positive integer. A zero or negative
 *   size would never advance the loop, and a fractional size would produce
 *   overlapping groups.
 */
function chunk<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const chunks: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    chunks.push(arr.slice(i, i + size));
  }
  return chunks;
}
191
+
192
/**
 * Majority vote for one position across several runs.
 *
 * @param runs - One boolean array per run.
 * @param index - Position to vote on within each run.
 * @returns True when strictly more than half of the runs hold a truthy value at
 *   `index`. Ties, missing positions, and an empty `runs` list yield false.
 */
function majorityVote(runs: boolean[][], index: number): boolean {
  const yesVotes = runs.filter((run) => run[index]).length;
  return yesVotes * 2 > runs.length;
}
200
+
201
/**
 * Validate a proposal using batched trigger checks.
 *
 * The eval set is split into batches of `TRIGGER_CHECK_BATCH_SIZE`. For each
 * batch, the "before" (original description) and "after" (proposed description)
 * prompts are each sent `VALIDATION_RUNS` times concurrently, and the per-query
 * answers are majority-voted. An entry passes when the voted answer matches
 * `entry.should_trigger`.
 *
 * `improved` requires all of: a strictly higher after pass rate, regressions
 * below 5% of the eval set, and either a net gain of at least 0.1 or at least
 * two new passes.
 *
 * @param proposal - Supplies `proposal_id`, `original_description`, `proposed_description`.
 * @param evalSet - Entries to check; an empty set returns zeroed results without
 *   the per-type and per-entry breakdowns.
 * @param agent - Passed through to `callLlm`.
 * @param modelFlag - Optional value passed through to `callLlm`.
 * @returns Aggregate pass rates, regressions, new passes, per-invocation-type
 *   scores (based on the "after" result), and per-entry results.
 */
export async function validateProposalBatched(
  proposal: EvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<ValidationResult> {
  if (evalSet.length === 0) {
    return {
      proposal_id: proposal.proposal_id,
      before_pass_rate: 0,
      after_pass_rate: 0,
      improved: false,
      regressions: [],
      new_passes: [],
      net_change: 0,
    };
  }

  const systemPrompt =
    "You are an evaluation assistant. For each numbered query, respond with the number followed by YES or NO.";

  const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
    [];

  for (const batch of chunk(evalSet, TRIGGER_CHECK_BATCH_SIZE)) {
    const queries = batch.map((e) => e.query);
    const beforePrompt = buildBatchTriggerCheckPrompt(proposal.original_description, queries);
    const afterPrompt = buildBatchTriggerCheckPrompt(proposal.proposed_description, queries);

    // Launch every run up front; calls are interleaved before/after, so even slots
    // of the response list hold "before" answers and odd slots hold "after" answers.
    const pending: Promise<string>[] = [];
    for (let run = 0; run < VALIDATION_RUNS; run++) {
      pending.push(
        callLlm(systemPrompt, beforePrompt, agent, modelFlag),
        callLlm(systemPrompt, afterPrompt, agent, modelFlag),
      );
    }
    const responses = await Promise.all(pending);

    const beforeRuns: boolean[][] = [];
    const afterRuns: boolean[][] = [];
    responses.forEach((raw, slot) => {
      const answers = parseBatchTriggerResponse(raw, queries.length);
      (slot % 2 === 0 ? beforeRuns : afterRuns).push(answers);
    });

    batch.forEach((entry, i) => {
      const beforeTriggered = majorityVote(beforeRuns, i);
      const afterTriggered = majorityVote(afterRuns, i);
      perEntryResults.push({
        entry,
        before_pass: entry.should_trigger ? beforeTriggered : !beforeTriggered,
        after_pass: entry.should_trigger ? afterTriggered : !afterTriggered,
      });
    });
  }

  // Aggregate counters are derived from the per-entry outcomes.
  const beforePassed = perEntryResults.filter((r) => r.before_pass).length;
  const afterPassed = perEntryResults.filter((r) => r.after_pass).length;
  const regressions: EvalEntry[] = perEntryResults
    .filter((r) => r.before_pass && !r.after_pass)
    .map((r) => r.entry);
  const newPasses: EvalEntry[] = perEntryResults
    .filter((r) => !r.before_pass && r.after_pass)
    .map((r) => r.entry);

  const total = evalSet.length;
  const beforePassRate = beforePassed / total;
  const afterPassRate = afterPassed / total;
  const netChange = afterPassRate - beforePassRate;

  const improved =
    afterPassRate > beforePassRate &&
    regressions.length < total * 0.05 &&
    (netChange >= 0.1 || newPasses.length >= 2);

  // Per-invocation-type tallies; the four reported keys are always present.
  const byInvocationType: Record<string, { passed: number; total: number }> = {
    explicit: { passed: 0, total: 0 },
    implicit: { passed: 0, total: 0 },
    contextual: { passed: 0, total: 0 },
    negative: { passed: 0, total: 0 },
  };
  for (const r of perEntryResults) {
    // Entries without an invocation type are counted as "implicit".
    const type = r.entry.invocation_type ?? "implicit";
    if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
    byInvocationType[type].total++;
    if (r.after_pass) byInvocationType[type].passed++;
  }

  // Attach a pass rate to a tally; an empty bucket reports 0 rather than NaN.
  const withRate = (tally: { passed: number; total: number }) => ({
    ...tally,
    pass_rate: tally.total > 0 ? tally.passed / tally.total : 0,
  });

  const invocationScores: InvocationTypeScores = {
    explicit: withRate(byInvocationType.explicit),
    implicit: withRate(byInvocationType.implicit),
    contextual: withRate(byInvocationType.contextual),
    negative: withRate(byInvocationType.negative),
  };

  return {
    proposal_id: proposal.proposal_id,
    before_pass_rate: beforePassRate,
    after_pass_rate: afterPassRate,
    improved,
    regressions,
    new_passes: newPasses,
    net_change: netChange,
    by_invocation_type: invocationScores,
    per_entry_results: perEntryResults,
  };
}
345
+
346
+ // ---------------------------------------------------------------------------
347
+ // Default export — batched is the default
348
+ // ---------------------------------------------------------------------------
349
+
350
/**
 * Default proposal validator: delegates to `validateProposalBatched`.
 *
 * @param proposal - Proposal to validate.
 * @param evalSet - Eval entries to check the proposal against.
 * @param agent - Passed through unchanged.
 * @param modelFlag - Optional value passed through unchanged.
 * @returns Whatever `validateProposalBatched` resolves to.
 */
export async function validateProposal(
  proposal: EvolutionProposal,
  evalSet: EvalEntry[],
  agent: string,
  modelFlag?: string,
): Promise<ValidationResult> {
  const result = await validateProposalBatched(proposal, evalSet, agent, modelFlag);
  return result;
}