npm - selftune - Versions diffs - 0.1.4 → 0.2.1 - Mend

selftune 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153) hide show

package/.claude/agents/diagnosis-analyst.md +156 -0
package/.claude/agents/evolution-reviewer.md +180 -0
package/.claude/agents/integration-guide.md +212 -0
package/.claude/agents/pattern-analyst.md +160 -0
package/CHANGELOG.md +46 -1
package/README.md +105 -257
package/apps/local-dashboard/dist/assets/geist-cyrillic-wght-normal-CHSlOQsW.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-ext-wght-normal-DMtmJ5ZE.woff2 +0 -0
package/apps/local-dashboard/dist/assets/geist-latin-wght-normal-Dm3htQBi.woff2 +0 -0
package/apps/local-dashboard/dist/assets/index-C4EOTFZ2.js +15 -0
package/apps/local-dashboard/dist/assets/index-bl-Webyd.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-react-U7zYD9Rg.js +60 -0
package/apps/local-dashboard/dist/assets/vendor-table-B7VF2Ipl.js +26 -0
package/apps/local-dashboard/dist/assets/vendor-ui-D7_zX_qy.js +346 -0
package/apps/local-dashboard/dist/favicon.png +0 -0
package/apps/local-dashboard/dist/index.html +17 -0
package/apps/local-dashboard/dist/logo.png +0 -0
package/apps/local-dashboard/dist/logo.svg +9 -0
package/assets/BeforeAfter.gif +0 -0
package/assets/FeedbackLoop.gif +0 -0
package/assets/logo.svg +9 -0
package/assets/skill-health-badge.svg +20 -0
package/cli/selftune/activation-rules.ts +171 -0
package/cli/selftune/badge/badge-data.ts +108 -0
package/cli/selftune/badge/badge-svg.ts +212 -0
package/cli/selftune/badge/badge.ts +99 -0
package/cli/selftune/canonical-export.ts +183 -0
package/cli/selftune/constants.ts +103 -1
package/cli/selftune/contribute/bundle.ts +314 -0
package/cli/selftune/contribute/contribute.ts +214 -0
package/cli/selftune/contribute/sanitize.ts +162 -0
package/cli/selftune/cron/setup.ts +266 -0
package/cli/selftune/dashboard-contract.ts +202 -0
package/cli/selftune/dashboard-server.ts +1049 -0
package/cli/selftune/dashboard.ts +43 -156
package/cli/selftune/eval/baseline.ts +248 -0
package/cli/selftune/eval/composability-v2.ts +273 -0
package/cli/selftune/eval/composability.ts +117 -0
package/cli/selftune/eval/generate-unit-tests.ts +143 -0
package/cli/selftune/eval/hooks-to-evals.ts +101 -16
package/cli/selftune/eval/import-skillsbench.ts +221 -0
package/cli/selftune/eval/synthetic-evals.ts +172 -0
package/cli/selftune/eval/unit-test-cli.ts +152 -0
package/cli/selftune/eval/unit-test.ts +196 -0
package/cli/selftune/evolution/deploy-proposal.ts +142 -1
package/cli/selftune/evolution/evidence.ts +26 -0
package/cli/selftune/evolution/evolve-body.ts +586 -0
package/cli/selftune/evolution/evolve.ts +825 -116
package/cli/selftune/evolution/extract-patterns.ts +105 -16
package/cli/selftune/evolution/pareto.ts +314 -0
package/cli/selftune/evolution/propose-body.ts +171 -0
package/cli/selftune/evolution/propose-description.ts +100 -2
package/cli/selftune/evolution/propose-routing.ts +166 -0
package/cli/selftune/evolution/refine-body.ts +141 -0
package/cli/selftune/evolution/rollback.ts +21 -4
package/cli/selftune/evolution/validate-body.ts +254 -0
package/cli/selftune/evolution/validate-proposal.ts +257 -35
package/cli/selftune/evolution/validate-routing.ts +177 -0
package/cli/selftune/grading/auto-grade.ts +200 -0
package/cli/selftune/grading/grade-session.ts +513 -42
package/cli/selftune/grading/pre-gates.ts +104 -0
package/cli/selftune/grading/results.ts +42 -0
package/cli/selftune/hooks/auto-activate.ts +185 -0
package/cli/selftune/hooks/evolution-guard.ts +165 -0
package/cli/selftune/hooks/prompt-log.ts +172 -2
package/cli/selftune/hooks/session-stop.ts +123 -3
package/cli/selftune/hooks/skill-change-guard.ts +112 -0
package/cli/selftune/hooks/skill-eval.ts +119 -3
package/cli/selftune/index.ts +415 -48
package/cli/selftune/ingestors/claude-replay.ts +377 -0
package/cli/selftune/ingestors/codex-rollout.ts +345 -46
package/cli/selftune/ingestors/codex-wrapper.ts +207 -39
package/cli/selftune/ingestors/openclaw-ingest.ts +573 -0
package/cli/selftune/ingestors/opencode-ingest.ts +193 -17
package/cli/selftune/init.ts +376 -16
package/cli/selftune/last.ts +14 -5
package/cli/selftune/localdb/db.ts +63 -0
package/cli/selftune/localdb/materialize.ts +428 -0
package/cli/selftune/localdb/queries.ts +376 -0
package/cli/selftune/localdb/schema.ts +204 -0
package/cli/selftune/memory/writer.ts +447 -0
package/cli/selftune/monitoring/watch.ts +90 -16
package/cli/selftune/normalization.ts +682 -0
package/cli/selftune/observability.ts +19 -44
package/cli/selftune/orchestrate.ts +1073 -0
package/cli/selftune/quickstart.ts +203 -0
package/cli/selftune/repair/skill-usage.ts +576 -0
package/cli/selftune/schedule.ts +561 -0
package/cli/selftune/status.ts +59 -33
package/cli/selftune/sync.ts +627 -0
package/cli/selftune/types.ts +525 -5
package/cli/selftune/utils/canonical-log.ts +45 -0
package/cli/selftune/utils/frontmatter.ts +217 -0
package/cli/selftune/utils/hooks.ts +41 -0
package/cli/selftune/utils/html.ts +27 -0
package/cli/selftune/utils/llm-call.ts +103 -19
package/cli/selftune/utils/math.ts +10 -0
package/cli/selftune/utils/query-filter.ts +139 -0
package/cli/selftune/utils/skill-discovery.ts +340 -0
package/cli/selftune/utils/skill-log.ts +68 -0
package/cli/selftune/utils/skill-usage-confidence.ts +18 -0
package/cli/selftune/utils/transcript.ts +307 -26
package/cli/selftune/utils/trigger-check.ts +89 -0
package/cli/selftune/utils/tui.ts +156 -0
package/cli/selftune/workflows/discover.ts +254 -0
package/cli/selftune/workflows/skill-md-writer.ts +288 -0
package/cli/selftune/workflows/workflows.ts +188 -0
package/package.json +28 -11
package/packages/telemetry-contract/README.md +11 -0
package/packages/telemetry-contract/fixtures/golden.json +87 -0
package/packages/telemetry-contract/fixtures/golden.test.ts +42 -0
package/packages/telemetry-contract/index.ts +1 -0
package/packages/telemetry-contract/package.json +19 -0
package/packages/telemetry-contract/src/index.ts +2 -0
package/packages/telemetry-contract/src/types.ts +163 -0
package/packages/telemetry-contract/src/validators.ts +109 -0
package/skill/SKILL.md +180 -33
package/skill/Workflows/AutoActivation.md +145 -0
package/skill/Workflows/Badge.md +124 -0
package/skill/Workflows/Baseline.md +144 -0
package/skill/Workflows/Composability.md +107 -0
package/skill/Workflows/Contribute.md +94 -0
package/skill/Workflows/Cron.md +132 -0
package/skill/Workflows/Dashboard.md +214 -0
package/skill/Workflows/Doctor.md +63 -14
package/skill/Workflows/Evals.md +110 -18
package/skill/Workflows/EvolutionMemory.md +154 -0
package/skill/Workflows/Evolve.md +181 -21
package/skill/Workflows/EvolveBody.md +159 -0
package/skill/Workflows/Grade.md +36 -31
package/skill/Workflows/ImportSkillsBench.md +117 -0
package/skill/Workflows/Ingest.md +142 -21
package/skill/Workflows/Initialize.md +91 -23
package/skill/Workflows/Orchestrate.md +139 -0
package/skill/Workflows/Replay.md +91 -0
package/skill/Workflows/Rollback.md +23 -4
package/skill/Workflows/Schedule.md +61 -0
package/skill/Workflows/Sync.md +88 -0
package/skill/Workflows/UnitTest.md +150 -0
package/skill/Workflows/Watch.md +33 -1
package/skill/Workflows/Workflows.md +129 -0
package/skill/assets/activation-rules-default.json +26 -0
package/skill/assets/multi-skill-settings.json +63 -0
package/skill/assets/single-skill-settings.json +57 -0
package/skill/references/invocation-taxonomy.md +2 -2
package/skill/references/logs.md +164 -2
package/skill/references/setup-patterns.md +65 -0
package/skill/references/version-history.md +40 -0
package/skill/settings_snippet.json +23 -0
package/templates/activation-rules-default.json +27 -0
package/templates/multi-skill-settings.json +64 -0
package/templates/single-skill-settings.json +58 -0
package/dashboard/index.html +0 -1119

package/cli/selftune/evolution/validate-body.ts ADDED Viewed

@@ -0,0 +1,254 @@
+/**
+ * validate-body.ts
+ *
+ * 3-gate validation for full body evolution proposals:
+ *   Gate 1 (structural): Pure code — body is non-empty and keeps a ## Workflow Routing section containing a markdown table
+ *   Gate 2 (trigger accuracy): Student model YES/NO per eval entry
+ *   Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
+ */
+import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
+import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
+import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
+// ---------------------------------------------------------------------------
+// Gate 1: Structural validation (pure code, no LLM)
+// ---------------------------------------------------------------------------
+/**
+ * Gate 1: check that a proposed body preserves the required structural elements.
+ *
+ * The body passes only when:
+ *  - it is non-empty after trimming whitespace
+ *  - it contains a level-2 `## Workflow Routing` heading at the start of a line
+ *  - that section (up to the next `## ` heading or EOF) holds a markdown table
+ *    made of a header row, a `|---|` separator row and at least one data row
+ *
+ * @param proposedBody - Full text of the proposed skill body.
+ * @returns `valid` plus a human-readable `reason` for the verdict.
+ */
+export function validateBodyStructure(proposedBody: string): { valid: boolean; reason: string } {
+  if (!proposedBody || proposedBody.trim().length === 0) {
+    return { valid: false, reason: "Proposed body is empty" };
+  }
+  // Anchor the heading to a line start (allowing markdown's up-to-3-space indent) so that
+  // "### Workflow Routing" or a prose mention of the heading does not count as the section.
+  const heading = /^ {0,3}## Workflow Routing\b.*$/m.exec(proposedBody);
+  if (!heading) {
+    return { valid: false, reason: "Missing required '## Workflow Routing' section" };
+  }
+  // The section runs from the end of the heading line to the next ## heading (or EOF).
+  const afterRouting = proposedBody.slice(heading.index + heading[0].length);
+  const nextSectionIdx = afterRouting.search(/\n## /);
+  const routingContent =
+    nextSectionIdx === -1 ? afterRouting : afterRouting.slice(0, nextSectionIdx);
+  // Keep only pipe-delimited rows; trimming also drops a trailing \r from CRLF input.
+  const tableLines = routingContent
+    .split("\n")
+    .map((line) => line.trim())
+    .filter((line) => line.startsWith("|") && line.endsWith("|"));
+  // A real table needs a header before the separator and at least one row after it.
+  const separatorIdx = tableLines.findIndex((line) => /^\|(\s*:?-+:?\s*\|)+$/.test(line));
+  const hasHeader = separatorIdx >= 1;
+  const hasDataRow = separatorIdx !== -1 && tableLines.length > separatorIdx + 1;
+  if (!hasHeader || !hasDataRow) {
+    return {
+      valid: false,
+      reason:
+        "Workflow Routing section lacks a valid markdown table (need header + separator + rows)",
+    };
+  }
+  return { valid: true, reason: "Structural validation passed" };
+}
+// ---------------------------------------------------------------------------
+// Gate 2: Trigger accuracy (student model YES/NO)
+// ---------------------------------------------------------------------------
+/**
+ * Gate 2: compare trigger accuracy of the original and proposed bodies on the eval set.
+ *
+ * For every entry the model is asked, first with the original body and then with the
+ * proposed one, whether the query would trigger. An answer passes when it matches
+ * `entry.should_trigger`. Calls are awaited one at a time, in that order.
+ *
+ * @returns Pass rates before/after, `improved` (after strictly greater than before) and
+ *   the queries that passed with the original body but fail with the proposed one.
+ *   An empty eval set yields zero rates and `improved: false`.
+ */
+export async function validateBodyTriggerAccuracy(
+  originalBody: string,
+  proposedBody: string,
+  evalSet: EvalEntry[],
+  agent: string,
+  modelFlag?: string,
+): Promise<{
+  before_pass_rate: number;
+  after_pass_rate: number;
+  improved: boolean;
+  regressions: string[];
+}> {
+  const total = evalSet.length;
+  if (total === 0) {
+    return { before_pass_rate: 0, after_pass_rate: 0, improved: false, regressions: [] };
+  }
+  const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
+  // Ask the model about one body/entry pair and compare the answer with the expectation.
+  const passesWith = async (body: string, entry: EvalEntry): Promise<boolean> => {
+    const prompt = buildTriggerCheckPrompt(body, entry.query);
+    const raw = await callLlm(systemPrompt, prompt, agent, modelFlag);
+    const triggered = parseTriggerResponse(raw);
+    return entry.should_trigger ? triggered : !triggered;
+  };
+  const tally = { before: 0, after: 0 };
+  const regressions: string[] = [];
+  for (const entry of evalSet) {
+    const passedBefore = await passesWith(originalBody, entry);
+    const passedAfter = await passesWith(proposedBody, entry);
+    tally.before += passedBefore ? 1 : 0;
+    tally.after += passedAfter ? 1 : 0;
+    // A regression is an entry the original body handled correctly but the proposal does not.
+    if (passedBefore && !passedAfter) {
+      regressions.push(entry.query);
+    }
+  }
+  const beforeRate = tally.before / total;
+  const afterRate = tally.after / total;
+  return {
+    before_pass_rate: beforeRate,
+    after_pass_rate: afterRate,
+    improved: afterRate > beforeRate,
+    regressions,
+  };
+}
+// ---------------------------------------------------------------------------
+// Gate 3: Quality assessment (student model 0.0-1.0)
+// ---------------------------------------------------------------------------
+/** System prompt instructing the student model to answer with a JSON `{ score, reason }` object. */
+const QUALITY_ASSESSMENT_SYSTEM = `You are a skill document quality assessor for an AI agent system.
+Rate the quality of the provided skill document body on these dimensions:
+- Clarity: Is the description clear and unambiguous?
+- Completeness: Does it cover the expected use cases?
+- Structure: Is it well-organized with proper sections?
+- Routing accuracy: Does the routing table seem comprehensive?
+Output ONLY valid JSON with exactly these fields:
+  - "score" (number): Overall quality score 0.0-1.0
+  - "reason" (string): Brief explanation of the score
+Do NOT include any text outside the JSON object.`;
+/**
+ * Gate 3: ask the student model to rate a proposed body.
+ *
+ * The response is stripped of markdown fences and parsed as JSON. Anything that cannot be
+ * interpreted falls back to a score of 0.5, and numeric scores are clamped into [0, 1].
+ *
+ * @returns The (clamped or fallback) score together with the model's reason, or a
+ *   fallback reason when none was supplied.
+ */
+export async function assessBodyQuality(
+  proposedBody: string,
+  skillName: string,
+  agent: string,
+  modelFlag?: string,
+): Promise<{ score: number; reason: string }> {
+  const FALLBACK_SCORE = 0.5;
+  const userPrompt = `Skill Name: ${skillName}
+Proposed Skill Body:
+${proposedBody}
+Rate the quality of this skill document body. Output ONLY a JSON object with "score" (0.0-1.0) and "reason" fields.`;
+  const rawResponse = await callLlm(QUALITY_ASSESSMENT_SYSTEM, userPrompt, agent, modelFlag);
+  const jsonText = stripMarkdownFences(rawResponse);
+  let payload: unknown;
+  try {
+    payload = JSON.parse(jsonText);
+  } catch {
+    // Unparseable output: fall back to the neutral default instead of failing the run.
+    return { score: FALLBACK_SCORE, reason: "Failed to parse quality assessment response" };
+  }
+  if (payload === null || typeof payload !== "object") {
+    return { score: FALLBACK_SCORE, reason: "Quality assessment response is not a JSON object" };
+  }
+  const { score: rawScore, reason: rawReason } = payload as Record<string, unknown>;
+  let score = FALLBACK_SCORE;
+  if (typeof rawScore === "number") {
+    // Clamp out-of-range scores into [0, 1].
+    score = Math.max(0.0, Math.min(1.0, rawScore));
+  }
+  const reason = typeof rawReason === "string" ? rawReason : "No reason provided";
+  return { score, reason };
+}
+// ---------------------------------------------------------------------------
+// Full 3-gate body validation
+// ---------------------------------------------------------------------------
+/** Default minimum Gate 3 score; a proposal passes the gate when its score is at least this. */
+const QUALITY_THRESHOLD = 0.6;
+/**
+ * Run a body proposal through the structural, trigger-accuracy and quality gates.
+ *
+ * A structural failure short-circuits: no model calls are made and the result reports
+ * zero gates passed. Otherwise gates 2 and 3 both run (in that order) and the proposal
+ * counts as improved only when all three gates pass.
+ *
+ * @param qualityThreshold - Minimum quality score for Gate 3 (defaults to QUALITY_THRESHOLD).
+ * @returns Per-gate outcomes, the pass count and the Gate 2 regressions.
+ */
+export async function validateBodyProposal(
+  proposal: BodyEvolutionProposal,
+  evalSet: EvalEntry[],
+  agent: string,
+  modelFlag?: string,
+  qualityThreshold = QUALITY_THRESHOLD,
+): Promise<BodyValidationResult> {
+  const GATES_TOTAL = 3;
+  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
+  // Gate 1: structural validation (pure code)
+  const { valid: structureOk, reason: structureReason } = validateBodyStructure(
+    proposal.proposed_body,
+  );
+  gateResults.push({ gate: "structural", passed: structureOk, reason: structureReason });
+  if (!structureOk) {
+    return {
+      proposal_id: proposal.proposal_id,
+      gates_passed: 0,
+      gates_total: GATES_TOTAL,
+      gate_results: gateResults,
+      improved: false,
+      regressions: [],
+    };
+  }
+  // Gate 2: trigger accuracy (student model)
+  const accuracy = await validateBodyTriggerAccuracy(
+    proposal.original_body,
+    proposal.proposed_body,
+    evalSet,
+    agent,
+    modelFlag,
+  );
+  const asPercent = (rate: number): string => (rate * 100).toFixed(1);
+  const verdict = accuracy.improved ? "Improved" : "Not improved";
+  gateResults.push({
+    gate: "trigger_accuracy",
+    passed: accuracy.improved,
+    reason: `${verdict}: ${asPercent(accuracy.before_pass_rate)}% -> ${asPercent(accuracy.after_pass_rate)}%`,
+  });
+  // Gate 3: quality assessment (student model)
+  const quality = await assessBodyQuality(
+    proposal.proposed_body,
+    proposal.skill_name,
+    agent,
+    modelFlag,
+  );
+  gateResults.push({
+    gate: "quality",
+    passed: quality.score >= qualityThreshold,
+    reason: `Quality score: ${quality.score.toFixed(2)} (threshold: ${qualityThreshold}) — ${quality.reason}`,
+  });
+  const gatesPassed = gateResults.reduce((count, gate) => (gate.passed ? count + 1 : count), 0);
+  return {
+    proposal_id: proposal.proposal_id,
+    gates_passed: gatesPassed,
+    gates_total: GATES_TOTAL,
+    gate_results: gateResults,
+    improved: gatesPassed === GATES_TOTAL,
+    regressions: accuracy.regressions,
+  };
+}

package/cli/selftune/evolution/validate-proposal.ts CHANGED Viewed

@@ -6,8 +6,25 @@
  * to determine whether the proposal is an improvement.
  */
-import type { EvalEntry, EvolutionProposal } from "../types.js";
+import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
 import { callLlm } from "../utils/llm-call.js";
+import {
+  buildBatchTriggerCheckPrompt,
+  buildTriggerCheckPrompt,
+  parseBatchTriggerResponse,
+  parseTriggerResponse,
+} from "../utils/trigger-check.js";
+// Re-export so existing consumers don't break
+export { buildTriggerCheckPrompt, parseTriggerResponse };
+/**
+ * Number of eval queries to batch into a single LLM call.
+ * Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
+ * Haiku handles 50+ YES/NO checks in a single call easily.
+ */
+export const TRIGGER_CHECK_BATCH_SIZE = 50;
+/**
+ * Number of times to run each batch and majority-vote to reduce LLM variance.
+ * Batched validation issues one "before" and one "after" call per run, i.e.
+ * 2 × VALIDATION_RUNS LLM calls per batch. A tied vote counts as NO.
+ */
+export const VALIDATION_RUNS = 3;
 // ---------------------------------------------------------------------------
 // Types
@@ -21,47 +38,20 @@ export interface ValidationResult {
   regressions: EvalEntry[]; // passed before, fail after
   new_passes: EvalEntry[]; // failed before, pass after
   net_change: number; // after - before pass rate
-}
-// ---------------------------------------------------------------------------
-// Prompt building
-// ---------------------------------------------------------------------------
-/** Build the trigger check prompt for the LLM. */
-export function buildTriggerCheckPrompt(description: string, query: string): string {
-  return [
-    "Given this skill description, would the following user query trigger this skill?",
-    "Respond YES or NO only.",
-    "",
-    "Skill description:",
-    description,
-    "",
-    "User query:",
-    query,
-  ].join("\n");
-}
-// ---------------------------------------------------------------------------
-// Response parsing
-// ---------------------------------------------------------------------------
-/** Parse YES/NO from LLM response. */
-export function parseTriggerResponse(response: string): boolean {
-  const normalized = response.trim().toUpperCase();
-  if (normalized.startsWith("YES")) return true;
-  if (normalized.startsWith("NO")) return false;
-  return false; // conservative default
+  by_invocation_type?: InvocationTypeScores;
+  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
 }
 // ---------------------------------------------------------------------------
 // Proposal validation
 // ---------------------------------------------------------------------------
-/** Validate a proposal by running trigger checks against the eval set. */
-export async function validateProposal(
+/** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
+export async function validateProposalSequential(
   proposal: EvolutionProposal,
   evalSet: EvalEntry[],
   agent: string,
+  modelFlag?: string,
 ): Promise<ValidationResult> {
   if (evalSet.length === 0) {
     return {
@@ -78,20 +68,22 @@ export async function validateProposal(
   const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
   const regressions: EvalEntry[] = [];
   const newPasses: EvalEntry[] = [];
+  const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
+    [];
   let beforePassed = 0;
   let afterPassed = 0;
   for (const entry of evalSet) {
     // Check with original description
     const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
-    const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent);
+    const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
     const beforeTriggered = parseTriggerResponse(beforeRaw);
     const beforePass =
       (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
     // Check with proposed description
     const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
-    const afterRaw = await callLlm(systemPrompt, afterPrompt, agent);
+    const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
     const afterTriggered = parseTriggerResponse(afterRaw);
     const afterPass =
       (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
@@ -108,6 +100,8 @@ export async function validateProposal(
     if (!beforePass && afterPass) {
       newPasses.push(entry);
     }
+    perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
   }
   const total = evalSet.length;
@@ -124,6 +118,51 @@ export async function validateProposal(
     regressions.length < total * 0.05 &&
     (netChange >= 0.1 || newPasses.length >= 2);
+  // Compute per-invocation-type scores (initialize all required keys)
+  const byInvocationType: Record<string, { passed: number; total: number }> = {
+    explicit: { passed: 0, total: 0 },
+    implicit: { passed: 0, total: 0 },
+    contextual: { passed: 0, total: 0 },
+    negative: { passed: 0, total: 0 },
+  };
+  for (const r of perEntryResults) {
+    const type = r.entry.invocation_type ?? "implicit";
+    if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
+    byInvocationType[type].total++;
+    if (r.after_pass) byInvocationType[type].passed++;
+  }
+  const invocationScores: InvocationTypeScores = {
+    explicit: {
+      ...byInvocationType.explicit,
+      pass_rate:
+        byInvocationType.explicit.total > 0
+          ? byInvocationType.explicit.passed / byInvocationType.explicit.total
+          : 0,
+    },
+    implicit: {
+      ...byInvocationType.implicit,
+      pass_rate:
+        byInvocationType.implicit.total > 0
+          ? byInvocationType.implicit.passed / byInvocationType.implicit.total
+          : 0,
+    },
+    contextual: {
+      ...byInvocationType.contextual,
+      pass_rate:
+        byInvocationType.contextual.total > 0
+          ? byInvocationType.contextual.passed / byInvocationType.contextual.total
+          : 0,
+    },
+    negative: {
+      ...byInvocationType.negative,
+      pass_rate:
+        byInvocationType.negative.total > 0
+          ? byInvocationType.negative.passed / byInvocationType.negative.total
+          : 0,
+    },
+  };
   return {
     proposal_id: proposal.proposal_id,
     before_pass_rate: beforePassRate,
@@ -132,5 +171,188 @@ export async function validateProposal(
     regressions,
     new_passes: newPasses,
     net_change: netChange,
+    by_invocation_type: invocationScores,
+    per_entry_results: perEntryResults,
   };
 }
+// ---------------------------------------------------------------------------
+// Batched proposal validation
+// ---------------------------------------------------------------------------
+/**
+ * Split an array into consecutive groups of at most `size` elements.
+ *
+ * The last group holds the remainder; an empty input yields an empty list.
+ *
+ * @throws RangeError if `size` is not a positive integer — a zero or negative size
+ *   would never advance the loop, and a fractional one would produce malformed groups.
+ */
+function chunk<T>(arr: T[], size: number): T[][] {
+  if (!Number.isInteger(size) || size < 1) {
+    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
+  }
+  const chunks: T[][] = [];
+  for (let i = 0; i < arr.length; i += size) {
+    chunks.push(arr.slice(i, i + size));
+  }
+  return chunks;
+}
+/**
+ * Strict-majority vote for one query position across repeated runs.
+ *
+ * Returns true only when more than half of the runs answered YES at `index`;
+ * ties, missing answers and an empty run list all resolve to false.
+ */
+function majorityVote(runs: boolean[][], index: number): boolean {
+  const yesVotes = runs.filter((answers) => answers[index]).length;
+  return yesVotes * 2 > runs.length;
+}
+/**
+ * Aggregate after-proposal pass rates per invocation type.
+ *
+ * Entries without an `invocation_type` are counted as "implicit". Types outside the four
+ * reported keys are tallied but not included in the returned scores. A type with no
+ * entries gets a pass rate of 0.
+ */
+function summarizeInvocationScores(
+  results: Array<{ entry: EvalEntry; after_pass: boolean }>,
+): InvocationTypeScores {
+  const tallies: Record<string, { passed: number; total: number }> = {
+    explicit: { passed: 0, total: 0 },
+    implicit: { passed: 0, total: 0 },
+    contextual: { passed: 0, total: 0 },
+    negative: { passed: 0, total: 0 },
+  };
+  for (const { entry, after_pass } of results) {
+    const type = entry.invocation_type ?? "implicit";
+    if (!tallies[type]) tallies[type] = { passed: 0, total: 0 };
+    tallies[type].total++;
+    if (after_pass) tallies[type].passed++;
+  }
+  const withRate = (tally: { passed: number; total: number }) => ({
+    ...tally,
+    pass_rate: tally.total > 0 ? tally.passed / tally.total : 0,
+  });
+  return {
+    explicit: withRate(tallies.explicit),
+    implicit: withRate(tallies.implicit),
+    contextual: withRate(tallies.contextual),
+    negative: withRate(tallies.negative),
+  };
+}
+/**
+ * Validate a proposal by batching trigger checks.
+ *
+ * The eval set is split into batches of TRIGGER_CHECK_BATCH_SIZE queries. For each batch
+ * the "before" (original description) and "after" (proposed description) prompts are each
+ * sent VALIDATION_RUNS times in parallel — 2 × VALIDATION_RUNS calls per batch instead of
+ * 2 calls per entry — and every query's answer is decided by majority vote across runs.
+ *
+ * `improved` requires a higher pass rate, regressions below 5% of the eval set, and either
+ * a net gain of at least 0.1 or at least two newly passing entries.
+ *
+ * @returns Pass rates, regressions / new passes, per-entry outcomes and per-invocation-type
+ *   scores. An empty eval set yields zero rates and `improved: false`.
+ */
+export async function validateProposalBatched(
+  proposal: EvolutionProposal,
+  evalSet: EvalEntry[],
+  agent: string,
+  modelFlag?: string,
+): Promise<ValidationResult> {
+  if (evalSet.length === 0) {
+    return {
+      proposal_id: proposal.proposal_id,
+      before_pass_rate: 0,
+      after_pass_rate: 0,
+      improved: false,
+      regressions: [],
+      new_passes: [],
+      net_change: 0,
+    };
+  }
+  const systemPrompt =
+    "You are an evaluation assistant. For each numbered query, respond with the number followed by YES or NO.";
+  const regressions: EvalEntry[] = [];
+  const newPasses: EvalEntry[] = [];
+  const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
+    [];
+  let beforePassed = 0;
+  let afterPassed = 0;
+  for (const batch of chunk(evalSet, TRIGGER_CHECK_BATCH_SIZE)) {
+    const queries = batch.map((e) => e.query);
+    const beforePrompt = buildBatchTriggerCheckPrompt(proposal.original_description, queries);
+    const afterPrompt = buildBatchTriggerCheckPrompt(proposal.proposed_description, queries);
+    // Launch the before/after pair of every run at once; each run resolves to
+    // [beforeRaw, afterRaw], so no index arithmetic is needed to pair them up again.
+    const rawRuns = await Promise.all(
+      Array.from({ length: VALIDATION_RUNS }, () =>
+        Promise.all([
+          callLlm(systemPrompt, beforePrompt, agent, modelFlag),
+          callLlm(systemPrompt, afterPrompt, agent, modelFlag),
+        ]),
+      ),
+    );
+    const beforeRuns = rawRuns.map(([beforeRaw]) =>
+      parseBatchTriggerResponse(beforeRaw, queries.length),
+    );
+    const afterRuns = rawRuns.map(([, afterRaw]) =>
+      parseBatchTriggerResponse(afterRaw, queries.length),
+    );
+    for (const [i, entry] of batch.entries()) {
+      // Majority vote across runs reduces run-to-run LLM variance.
+      const beforeTriggered = majorityVote(beforeRuns, i);
+      const afterTriggered = majorityVote(afterRuns, i);
+      const beforePass =
+        (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
+      const afterPass =
+        (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
+      if (beforePass) beforePassed++;
+      if (afterPass) afterPassed++;
+      if (beforePass && !afterPass) regressions.push(entry);
+      if (!beforePass && afterPass) newPasses.push(entry);
+      perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
+    }
+  }
+  const total = evalSet.length;
+  const beforePassRate = beforePassed / total;
+  const afterPassRate = afterPassed / total;
+  const netChange = afterPassRate - beforePassRate;
+  const improved =
+    afterPassRate > beforePassRate &&
+    regressions.length < total * 0.05 &&
+    (netChange >= 0.1 || newPasses.length >= 2);
+  return {
+    proposal_id: proposal.proposal_id,
+    before_pass_rate: beforePassRate,
+    after_pass_rate: afterPassRate,
+    improved,
+    regressions,
+    new_passes: newPasses,
+    net_change: netChange,
+    by_invocation_type: summarizeInvocationScores(perEntryResults),
+    per_entry_results: perEntryResults,
+  };
+}
+// ---------------------------------------------------------------------------
+// Default entry point — delegates to the batched implementation
+// ---------------------------------------------------------------------------
+/**
+ * Validate a proposal by running trigger checks against the eval set.
+ *
+ * Thin wrapper that forwards every argument unchanged to validateProposalBatched
+ * and returns its result.
+ */
+export async function validateProposal(
+  proposal: EvolutionProposal,
+  evalSet: EvalEntry[],
+  agent: string,
+  modelFlag?: string,
+): Promise<ValidationResult> {
+  return validateProposalBatched(proposal, evalSet, agent, modelFlag);
+}