npm - selftune - Versions diffs - 0.2.18 → 0.2.20 - Mend

selftune 0.2.18 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

package/README.md +9 -4
package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +60 -0
package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
package/apps/local-dashboard/dist/index.html +5 -5
package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
package/cli/selftune/constants.ts +10 -0
package/cli/selftune/contribute/contribute.ts +30 -2
package/cli/selftune/contribution-config.ts +249 -0
package/cli/selftune/contribution-relay.ts +177 -0
package/cli/selftune/contribution-signals.ts +219 -0
package/cli/selftune/contribution-staging.ts +147 -0
package/cli/selftune/contributions.ts +532 -0
package/cli/selftune/creator-contributions.ts +333 -0
package/cli/selftune/dashboard-contract.ts +209 -1
package/cli/selftune/dashboard-server.ts +45 -11
package/cli/selftune/eval/family-overlap.ts +714 -0
package/cli/selftune/eval/hooks-to-evals.ts +182 -28
package/cli/selftune/eval/synthetic-evals.ts +298 -11
package/cli/selftune/evolution/evidence.ts +5 -0
package/cli/selftune/evolution/evolve-body.ts +62 -2
package/cli/selftune/evolution/evolve.ts +58 -1
package/cli/selftune/evolution/validate-body.ts +10 -0
package/cli/selftune/evolution/validate-host-replay.ts +236 -0
package/cli/selftune/evolution/validate-proposal.ts +10 -0
package/cli/selftune/evolution/validate-routing.ts +112 -5
package/cli/selftune/export.ts +2 -2
package/cli/selftune/index.ts +41 -5
package/cli/selftune/ingestors/codex-rollout.ts +31 -35
package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
package/cli/selftune/localdb/db.ts +2 -2
package/cli/selftune/localdb/direct-write.ts +8 -3
package/cli/selftune/localdb/materialize.ts +7 -2
package/cli/selftune/localdb/queries.ts +712 -31
package/cli/selftune/localdb/schema.ts +30 -1
package/cli/selftune/recover.ts +153 -0
package/cli/selftune/repair/skill-usage.ts +363 -4
package/cli/selftune/routes/actions.ts +35 -1
package/cli/selftune/routes/analytics.ts +14 -0
package/cli/selftune/routes/index.ts +1 -0
package/cli/selftune/routes/overview.ts +112 -4
package/cli/selftune/routes/skill-report.ts +575 -11
package/cli/selftune/status.ts +81 -2
package/cli/selftune/sync.ts +56 -2
package/cli/selftune/trust-model.ts +66 -0
package/cli/selftune/types.ts +103 -0
package/cli/selftune/utils/skill-detection.ts +43 -0
package/cli/selftune/utils/text-similarity.ts +73 -0
package/cli/selftune/watchlist.ts +65 -0
package/package.json +1 -1
package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
package/packages/ui/src/components/EvidenceViewer.tsx +419 -145
package/packages/ui/src/components/EvolutionTimeline.tsx +81 -29
package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
package/packages/ui/src/components/section-cards.tsx +12 -9
package/packages/ui/src/primitives/card.tsx +1 -1
package/packages/ui/src/types.ts +4 -0
package/skill/SKILL.md +11 -1
package/skill/Workflows/AlphaUpload.md +4 -0
package/skill/Workflows/Composability.md +78 -0
package/skill/Workflows/Contribute.md +6 -3
package/skill/Workflows/Contributions.md +97 -0
package/skill/Workflows/CreatorContributions.md +74 -0
package/skill/Workflows/Dashboard.md +31 -0
package/skill/Workflows/Evals.md +57 -8
package/skill/Workflows/Evolve.md +23 -0
package/skill/Workflows/Ingest.md +7 -0
package/skill/Workflows/Initialize.md +20 -1
package/skill/Workflows/Recover.md +84 -0
package/skill/Workflows/RepairSkillUsage.md +12 -4
package/skill/Workflows/Sync.md +18 -12
package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12

package/cli/selftune/evolution/evolve-body.ts CHANGED Viewed

@@ -31,12 +31,13 @@ import { callViaSubagent } from "../utils/llm-call.js";
 import { appendAuditEntry } from "./audit.js";
 import { checkConstitutionSizeOnly } from "./constitutional.js";
 import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
-import { appendEvidenceEntry } from "./evidence.js";
+import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
 import { extractFailurePatterns } from "./extract-patterns.js";
 import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
 import { generateRoutingProposal } from "./propose-routing.js";
 import { refineBodyProposal } from "./refine-body.js";
 import { validateBodyProposal } from "./validate-body.js";
+import { buildRoutingReplayFixture } from "./validate-host-replay.js";
 import { validateRoutingProposal } from "./validate-routing.js";
 // ---------------------------------------------------------------------------
@@ -106,6 +107,10 @@ function createAuditEntry(
   action: EvolutionAuditEntry["action"],
   details: string,
   skillName?: string,
+  provenance?: Pick<
+    EvolutionAuditEntry,
+    "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
+  >,
 ): EvolutionAuditEntry {
   return {
     timestamp: new Date().toISOString(),
@@ -113,6 +118,14 @@ function createAuditEntry(
     skill_name: skillName,
     action,
     details,
+    ...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
+    ...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
+    ...(provenance?.validation_fixture_id
+      ? { validation_fixture_id: provenance.validation_fixture_id }
+      : {}),
+    ...(provenance?.validation_evidence_ref
+      ? { validation_evidence_ref: provenance.validation_evidence_ref }
+      : {}),
   };
 }
@@ -181,8 +194,12 @@ export async function evolveBody(
     proposalId: string,
     action: EvolutionAuditEntry["action"],
     details: string,
+    provenance?: Pick<
+      EvolutionAuditEntry,
+      "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
+    >,
   ): void {
-    const entry = createAuditEntry(proposalId, action, details, skillName);
+    const entry = createAuditEntry(proposalId, action, details, skillName, provenance);
     auditEntries.push(entry);
     try {
       _appendAuditEntry(entry);
@@ -443,11 +460,17 @@ export async function evolveBody(
       const validationModelFlag = options.validationModel ?? studentModel;
       let validation: BodyValidationResult;
       if (target === "routing") {
+        const replayFixture = buildRoutingReplayFixture({
+          skillName,
+          skillPath,
+          platform: studentAgent === "codex" ? "codex" : "claude_code",
+        });
         validation = await _validateRoutingProposal(
           proposal,
           evalSet,
           studentAgent,
           validationModelFlag,
+          { replayFixture },
         );
       } else {
         validation = await _validateBodyProposal(
@@ -458,11 +481,18 @@ export async function evolveBody(
         );
       }
       lastValidation = validation;
+      const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
       recordAudit(
         proposal.proposal_id,
         "validated",
         `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
+        {
+          validation_mode: validation.validation_mode,
+          validation_agent: validation.validation_agent,
+          validation_fixture_id: validation.validation_fixture_id,
+          validation_evidence_ref: validatedEvidenceRef,
+        },
       );
       recordEvidence({
         timestamp: new Date().toISOString(),
@@ -480,6 +510,12 @@ export async function evolveBody(
           gates_total: validation.gates_total,
           gate_results: validation.gate_results,
           regressions: validation.regressions,
+          before_pass_rate: validation.before_pass_rate,
+          after_pass_rate: validation.after_pass_rate,
+          validation_mode: validation.validation_mode,
+          validation_agent: validation.validation_agent,
+          validation_fixture_id: validation.validation_fixture_id,
+          validation_evidence_ref: validatedEvidenceRef,
         },
       });
@@ -491,6 +527,12 @@ export async function evolveBody(
         proposal.proposal_id,
         "rejected",
         `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
+        {
+          validation_mode: validation.validation_mode,
+          validation_agent: validation.validation_agent,
+          validation_fixture_id: validation.validation_fixture_id,
+          validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
+        },
       );
       recordEvidence({
         timestamp: new Date().toISOString(),
@@ -508,6 +550,12 @@ export async function evolveBody(
           gates_total: validation.gates_total,
           gate_results: validation.gate_results,
           regressions: validation.regressions,
+          before_pass_rate: validation.before_pass_rate,
+          after_pass_rate: validation.after_pass_rate,
+          validation_mode: validation.validation_mode,
+          validation_agent: validation.validation_agent,
+          validation_fixture_id: validation.validation_fixture_id,
+          validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
         },
       });
@@ -607,6 +655,12 @@ export async function evolveBody(
         lastProposal.proposal_id,
         "deployed",
         `Deployed ${target} proposal for ${skillName}`,
+        {
+          validation_mode: lastValidation.validation_mode,
+          validation_agent: lastValidation.validation_agent,
+          validation_fixture_id: lastValidation.validation_fixture_id,
+          validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
+        },
       );
       recordEvidence({
         timestamp: new Date().toISOString(),
@@ -624,6 +678,12 @@ export async function evolveBody(
           gates_total: lastValidation.gates_total,
           gate_results: lastValidation.gate_results,
           regressions: lastValidation.regressions,
+          before_pass_rate: lastValidation.before_pass_rate,
+          after_pass_rate: lastValidation.after_pass_rate,
+          validation_mode: lastValidation.validation_mode,
+          validation_agent: lastValidation.validation_agent,
+          validation_fixture_id: lastValidation.validation_fixture_id,
+          validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
         },
       });

package/cli/selftune/evolution/evolve.ts CHANGED Viewed

@@ -43,7 +43,7 @@ import { createEvolveTUI } from "../utils/tui.js";
 import { appendAuditEntry } from "./audit.js";
 import { checkConstitution } from "./constitutional.js";
 import { scoreDescription } from "./description-quality.js";
-import { appendEvidenceEntry } from "./evidence.js";
+import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
 import { extractFailurePatterns } from "./extract-patterns.js";
 import {
   computeInvocationScores,
@@ -139,6 +139,10 @@ function createAuditEntry(
   evalSnapshot?: EvalPassRate,
   skillName?: string,
   iterationsUsed?: number,
+  provenance?: Pick<
+    EvolutionAuditEntry,
+    "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
+  >,
 ): EvolutionAuditEntry {
   return {
     timestamp: new Date().toISOString(),
@@ -148,6 +152,14 @@ function createAuditEntry(
     ...(skillName ? { skill_name: skillName } : {}),
     ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
     ...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
+    ...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
+    ...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
+    ...(provenance?.validation_fixture_id
+      ? { validation_fixture_id: provenance.validation_fixture_id }
+      : {}),
+    ...(provenance?.validation_evidence_ref
+      ? { validation_evidence_ref: provenance.validation_evidence_ref }
+      : {}),
   };
 }
@@ -289,6 +301,10 @@ export async function evolve(
     details: string,
     evalSnapshot?: EvalPassRate,
     iterationsUsed?: number,
+    provenance?: Pick<
+      EvolutionAuditEntry,
+      "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
+    >,
   ): void {
     const entry = createAuditEntry(
       proposalId,
@@ -297,6 +313,7 @@ export async function evolve(
       evalSnapshot,
       skillName,
       iterationsUsed,
+      provenance,
     );
     auditEntries.push(entry);
     try {
@@ -637,10 +654,18 @@ export async function evolve(
           options.validationModel,
         );
         llmCallCount += countValidationLlmCalls(evalSet.length);
+        const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
         recordAudit(
           proposal.proposal_id,
           "validated",
           `Pareto validation: improved=${validation.improved}`,
+          undefined,
+          undefined,
+          {
+            validation_mode: validation.validation_mode,
+            validation_agent: validation.validation_agent,
+            validation_evidence_ref: evidenceRef,
+          },
         );
         recordEvidence({
           timestamp: new Date().toISOString(),
@@ -660,6 +685,9 @@ export async function evolve(
             regressions: validation.regressions,
             new_passes: validation.new_passes,
             per_entry_results: validation.per_entry_results,
+            validation_mode: validation.validation_mode,
+            validation_agent: validation.validation_agent,
+            validation_evidence_ref: evidenceRef,
           },
         });
@@ -866,11 +894,18 @@ export async function evolve(
           failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
           pass_rate: validation.after_pass_rate,
         };
+        const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
         recordAudit(
           proposal.proposal_id,
           "validated",
           `Validation complete: improved=${validation.improved}`,
           evalSnapshot,
+          undefined,
+          {
+            validation_mode: validation.validation_mode,
+            validation_agent: validation.validation_agent,
+            validation_evidence_ref: validatedEvidenceRef,
+          },
         );
         recordEvidence({
           timestamp: new Date().toISOString(),
@@ -890,6 +925,9 @@ export async function evolve(
             regressions: validation.regressions,
             new_passes: validation.new_passes,
             per_entry_results: validation.per_entry_results,
+            validation_mode: validation.validation_mode,
+            validation_agent: validation.validation_agent,
+            validation_evidence_ref: validatedEvidenceRef,
           },
         });
@@ -906,10 +944,18 @@ export async function evolve(
         if (!validation.improved) {
           feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
+          const rejectedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "rejected");
           recordAudit(
             proposal.proposal_id,
             "rejected",
             `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
+            undefined,
+            undefined,
+            {
+              validation_mode: validation.validation_mode,
+              validation_agent: validation.validation_agent,
+              validation_evidence_ref: rejectedEvidenceRef,
+            },
           );
           recordEvidence({
             timestamp: new Date().toISOString(),
@@ -929,6 +975,9 @@ export async function evolve(
               regressions: validation.regressions,
               new_passes: validation.new_passes,
               per_entry_results: validation.per_entry_results,
+              validation_mode: validation.validation_mode,
+              validation_agent: validation.validation_agent,
+              validation_evidence_ref: rejectedEvidenceRef,
             },
           });
@@ -1138,6 +1187,11 @@ export async function evolve(
           pass_rate: lastValidation.after_pass_rate,
         },
         iterationsCompleted,
+        {
+          validation_mode: lastValidation.validation_mode,
+          validation_agent: lastValidation.validation_agent,
+          validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
+        },
       );
       recordEvidence({
         timestamp: new Date().toISOString(),
@@ -1157,6 +1211,9 @@ export async function evolve(
           regressions: lastValidation.regressions,
           new_passes: lastValidation.new_passes,
           per_entry_results: lastValidation.per_entry_results,
+          validation_mode: lastValidation.validation_mode,
+          validation_agent: lastValidation.validation_agent,
+          validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
         },
       });
     }

package/cli/selftune/evolution/validate-body.ts CHANGED Viewed

@@ -209,6 +209,8 @@ export async function validateBodyProposal(
       gate_results: gateResults,
       improved: false,
       regressions: [],
+      validation_mode: "structural_guard",
+      validation_agent: agent,
     };
   }
@@ -250,5 +252,13 @@ export async function validateBodyProposal(
     gate_results: gateResults,
     improved: gatesPassed === 3,
     regressions: accuracy.regressions,
+    validation_mode: "llm_judge",
+    validation_agent: agent,
+    ...(evalSet.length > 0
+      ? {
+          before_pass_rate: accuracy.before_pass_rate,
+          after_pass_rate: accuracy.after_pass_rate,
+        }
+      : {}),
   };
 }

package/cli/selftune/evolution/validate-host-replay.ts ADDED Viewed

@@ -0,0 +1,236 @@
+import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
+import { basename, dirname, join } from "node:path";
+import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
+import { parseFrontmatter } from "../utils/frontmatter.js";
+import { containsWholeSkillMention } from "../utils/skill-discovery.js";
+import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
+import {
+  extractWhenToUseLines,
+  jaccardSimilarity,
+  tokenizeText,
+} from "../utils/text-similarity.js";
+/**
+ * Tokenized view of one SKILL.md file, as produced by loadReplaySkillSurface.
+ */
+interface ReplaySkillSurface {
+  /** Frontmatter name, or the skill directory name when the name is blank or the file cannot be loaded. */
+  skillName: string;
+  /** Tokens of the frontmatter description; an empty set when the file cannot be loaded. */
+  descriptionTokens: Set<string>;
+  /** Tokens of the lines extractWhenToUseLines returns for the body; an empty set when the file cannot be loaded. */
+  whenToUseTokens: Set<string>;
+}
+/**
+ * Minimum score needed before replay treats routing text or skill-surface overlap
+ * as a real match. Tuned to suppress weak false positives without killing recall
+ * for short routing phrases and sparse skill surfaces.
+ *
+ * evaluateReplayTrigger compares this against the larger of the trigger-phrase
+ * score and the target skill-surface score; anything strictly below it is
+ * reported as not triggered.
+ */
+const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
+/**
+ * Canonicalize a path with realpathSync, tolerating failure.
+ *
+ * @param path - Filesystem path to canonicalize.
+ * @returns The value returned by realpathSync, or `path` unchanged when realpathSync throws.
+ */
+function resolveReplayPath(path: string): string {
+  try {
+    return realpathSync(path);
+  } catch {
+    // Best-effort: hand the caller's path back instead of propagating the error.
+    return path;
+  }
+}
+/**
+ * List the SKILL.md files of sibling skills that sit next to the target skill.
+ *
+ * The directory containing `targetSkillPath` is treated as the target skill
+ * directory, and its parent as the registry. Every other registry entry that is
+ * a directory and contains a `SKILL.md` file contributes one path.
+ *
+ * @param targetSkillPath - Path to the target skill's SKILL.md file.
+ * @returns Canonicalized sibling SKILL.md paths, sorted with localeCompare.
+ *   Empty when the registry directory cannot be read.
+ */
+function listCompetingSkillPaths(targetSkillPath: string): string[] {
+  const normalizedTargetPath = resolveReplayPath(targetSkillPath);
+  const targetSkillDir = dirname(normalizedTargetPath);
+  const registryDir = dirname(targetSkillDir);
+  const targetDirName = basename(targetSkillDir);
+  const competingPaths: string[] = [];
+  try {
+    for (const entry of readdirSync(registryDir)) {
+      // The target's own directory is not a competitor.
+      if (entry === targetDirName) continue;
+      const candidateDir = join(registryDir, entry);
+      try {
+        if (!statSync(candidateDir).isDirectory()) continue;
+      } catch {
+        // Entries that cannot be stat'ed are skipped rather than aborting the scan.
+        continue;
+      }
+      const candidateSkillPath = join(candidateDir, "SKILL.md");
+      if (!existsSync(candidateSkillPath)) continue;
+      competingPaths.push(resolveReplayPath(candidateSkillPath));
+    }
+  } catch {
+    // Ignore unreadable registries and treat the fixture as target-only.
+  }
+  // NOTE(review): localeCompare with no explicit locale may order differently
+  // across runtimes/locales — confirm whether fixture order must be stable.
+  return competingPaths.sort((a, b) => a.localeCompare(b));
+}
+/**
+ * Build a routing replay fixture for one skill from its location on disk.
+ *
+ * @param options.skillName - Name recorded as `target_skill_name` and used in the default fixture id.
+ * @param options.skillPath - Path to the target SKILL.md; canonicalized via resolveReplayPath.
+ * @param options.platform - Platform recorded on the fixture; defaults to "claude_code".
+ * @param options.fixtureId - Explicit fixture id; defaults to `auto-<platform>-<skillName>`.
+ * @param options.workspaceRoot - Explicit workspace root; when omitted it is looked up with
+ *   findGitRepositoryRoot, starting from the directory that contains the skill directory.
+ * @returns The fixture, with competing skills discovered by listCompetingSkillPaths.
+ *   `workspace_root` is only present when a truthy root was supplied or found.
+ */
+export function buildRoutingReplayFixture(options: {
+  skillName: string;
+  skillPath: string;
+  platform?: RoutingReplayFixture["platform"];
+  fixtureId?: string;
+  workspaceRoot?: string;
+}): RoutingReplayFixture {
+  const targetSkillPath = resolveReplayPath(options.skillPath);
+  const workspaceRoot =
+    options.workspaceRoot ?? findGitRepositoryRoot(dirname(dirname(targetSkillPath)));
+  const platform = options.platform ?? "claude_code";
+  return {
+    fixture_id: options.fixtureId ?? `auto-${platform}-${options.skillName}`,
+    platform,
+    target_skill_name: options.skillName,
+    target_skill_path: targetSkillPath,
+    competing_skill_paths: listCompetingSkillPaths(targetSkillPath),
+    // Omit the key entirely rather than emitting an empty/undefined workspace_root.
+    ...(workspaceRoot ? { workspace_root: workspaceRoot } : {}),
+  };
+}
+/**
+ * Read a SKILL.md file and tokenize the parts used for replay scoring.
+ *
+ * @param skillPath - Path to the SKILL.md file.
+ * @returns The skill's name plus token sets for its frontmatter description and
+ *   its "when to use" lines. If anything in the read/parse/tokenize path throws,
+ *   the result carries only the fallback name and two empty token sets.
+ */
+function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
+  // Directory name of the skill, used when the frontmatter name is blank or unavailable.
+  const fallbackName = basename(dirname(skillPath)) || "unknown-skill";
+  try {
+    const raw = readFileSync(skillPath, "utf8");
+    const parsed = parseFrontmatter(raw);
+    return {
+      skillName: parsed.name.trim() || fallbackName,
+      descriptionTokens: tokenizeText(parsed.description),
+      whenToUseTokens: tokenizeText(extractWhenToUseLines(parsed.body).join(" ")),
+    };
+  } catch {
+    // NOTE(review): a throw on any single field (e.g. a missing name) discards the
+    // whole surface, including a description that did parse — confirm this is intended.
+    return {
+      skillName: fallbackName,
+      descriptionTokens: new Set<string>(),
+      whenToUseTokens: new Set<string>(),
+    };
+  }
+}
+/**
+ * Pull candidate trigger phrases out of the first column of a pipe-delimited
+ * routing table.
+ *
+ * @param routing - Routing text; blank lines are dropped before processing.
+ * @returns Phrases of at least 3 characters, in row order. Empty when fewer than
+ *   three non-blank lines are present.
+ */
+function extractRoutingTriggerPhrases(routing: string): string[] {
+  const lines = routing
+    .trim()
+    .split("\n")
+    .map((line) => line.trim())
+    .filter(Boolean);
+  if (lines.length < 3) return [];
+  const phrases: string[] = [];
+  // The first two non-blank lines are skipped.
+  // NOTE(review): presumably a markdown table header and separator row — confirm
+  // routing text never carries prose before the table.
+  for (const row of lines.slice(2)) {
+    if (!row.startsWith("|") || !row.endsWith("|")) continue;
+    const cells = row.split("|").map((cell) => cell.trim());
+    // cells[0] is the empty string before the leading pipe, so cells[1] is the first column.
+    const triggerCell = cells[1];
+    if (!triggerCell) continue;
+    // A cell may list alternatives separated by commas, slashes, or the word "or".
+    for (const part of triggerCell.split(/,|\/| or /i)) {
+      // Strip a single wrapping quote or backtick from each end.
+      const phrase = part.trim().replace(/^["'`]|["'`]$/g, "");
+      if (phrase.length >= 3) phrases.push(phrase);
+    }
+  }
+  return phrases;
+}
+/**
+ * Score a query against a list of trigger phrases and keep the best result.
+ *
+ * @param query - User query text.
+ * @param triggerPhrases - Phrases to compare against.
+ * @returns 0 when there are no phrases; otherwise the maximum over all phrases of
+ *   1 (the lowercased query contains the lowercased phrase) or the jaccardSimilarity
+ *   of the query tokens and the phrase tokens.
+ */
+function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
+  const normalizedQuery = query.toLowerCase();
+  const queryTokens = tokenizeText(query);
+  let best = 0;
+  for (const phrase of triggerPhrases) {
+    const normalizedPhrase = phrase.toLowerCase();
+    // NOTE(review): plain substring containment is not word-bounded, so a short
+    // phrase can match inside a longer word — confirm this is intended.
+    if (normalizedQuery.includes(normalizedPhrase)) {
+      best = Math.max(best, 1);
+      continue;
+    }
+    best = Math.max(best, jaccardSimilarity(queryTokens, tokenizeText(phrase)));
+  }
+  return best;
+}
+/**
+ * Score a query against a skill's tokenized surface.
+ *
+ * @param query - User query text.
+ * @param surface - Tokenized description and "when to use" text of a skill.
+ * @returns The larger of the two jaccardSimilarity values: query tokens versus
+ *   description tokens, and query tokens versus "when to use" tokens.
+ */
+function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
+  const queryTokens = tokenizeText(query);
+  return Math.max(
+    jaccardSimilarity(queryTokens, surface.descriptionTokens),
+    jaccardSimilarity(queryTokens, surface.whenToUseTokens),
+  );
+}
+/**
+ * Decide whether a query would trigger the target skill under replay.
+ *
+ * Checks run in this order, and the first one that applies wins:
+ *   1. The query mentions the target skill by name -> triggered.
+ *   2. The query mentions a competing skill by name -> not triggered.
+ *   3. The target score (the larger of trigger-phrase score and target surface
+ *      score) is below HOST_REPLAY_MATCH_THRESHOLD -> not triggered.
+ *   4. The best competing surface scores at least as high as the target
+ *      (ties go to the competitor) -> not triggered.
+ *   5. Otherwise triggered; the evidence string says whether routing trigger
+ *      language or the skill surface was the stronger signal.
+ *
+ * @param query - User query text; trimmed before use.
+ * @param routing - Routing text from which trigger phrases are extracted.
+ * @param targetSurface - Tokenized surface of the target skill.
+ * @param competingSurfaces - Tokenized surfaces of the competing skills.
+ * @returns The trigger decision and a human-readable reason for it.
+ */
+function evaluateReplayTrigger(
+  query: string,
+  routing: string,
+  targetSurface: ReplaySkillSurface,
+  competingSurfaces: ReplaySkillSurface[],
+): { triggered: boolean; evidence: string } {
+  const normalizedQuery = query.trim();
+  if (containsWholeSkillMention(normalizedQuery, targetSurface.skillName)) {
+    return {
+      triggered: true,
+      evidence: `explicit target mention: ${targetSurface.skillName}`,
+    };
+  }
+  for (const competingSurface of competingSurfaces) {
+    if (containsWholeSkillMention(normalizedQuery, competingSurface.skillName)) {
+      return {
+        triggered: false,
+        evidence: `explicit competing skill mention: ${competingSurface.skillName}`,
+      };
+    }
+  }
+  const triggerPhrases = extractRoutingTriggerPhrases(routing);
+  const triggerScore = scoreQueryAgainstTriggerPhrases(normalizedQuery, triggerPhrases);
+  const targetSurfaceScore = scoreQueryAgainstSkillSurface(normalizedQuery, targetSurface);
+  const targetScore = Math.max(triggerScore, targetSurfaceScore);
+  // Competitors are scored on their skill surface only; routing text is not applied to them here.
+  // With no competitors, indexing [0] on the empty array leaves bestCompetitor undefined.
+  const bestCompetitor = competingSurfaces
+    .map((surface) => ({
+      skillName: surface.skillName,
+      score: scoreQueryAgainstSkillSurface(normalizedQuery, surface),
+    }))
+    .sort((a, b) => b.score - a.score)[0];
+  if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
+    return {
+      triggered: false,
+      evidence: "target routing and skill surface did not clear replay threshold",
+    };
+  }
+  // `>=` means an exact tie is resolved in the competitor's favour.
+  if (bestCompetitor && bestCompetitor.score >= targetScore) {
+    return {
+      triggered: false,
+      evidence: `competing skill surface scored higher: ${bestCompetitor.skillName}`,
+    };
+  }
+  if (triggerScore >= targetSurfaceScore) {
+    return {
+      triggered: true,
+      evidence:
+        triggerScore === 1
+          ? "query matched a routing trigger phrase exactly"
+          : "query aligned with routing trigger language",
+    };
+  }
+  return {
+    triggered: true,
+    evidence: "query aligned with target skill surface in replay fixture",
+  };
+}
+/**
+ * Replay an eval set against a routing fixture and report per-entry results.
+ *
+ * @param options.routing - Routing text evaluated for every entry.
+ * @param options.evalSet - Entries supplying `query` and the expected `should_trigger`.
+ * @param options.fixture - Fixture naming the target SKILL.md and the competing SKILL.md paths.
+ * @returns One result per eval entry, in input order. `passed` is true when the
+ *   replay decision equals the entry's `should_trigger`.
+ */
+export function runHostReplayFixture(options: {
+  routing: string;
+  evalSet: EvalEntry[];
+  fixture: RoutingReplayFixture;
+}): RoutingReplayEntryResult[] {
+  // Skill surfaces are loaded once up front and reused for every entry.
+  const targetSurface = loadReplaySkillSurface(options.fixture.target_skill_path);
+  const competingSurfaces = options.fixture.competing_skill_paths.map(loadReplaySkillSurface);
+  return options.evalSet.map((entry) => {
+    const evaluated = evaluateReplayTrigger(
+      entry.query,
+      options.routing,
+      targetSurface,
+      competingSurfaces,
+    );
+    return {
+      query: entry.query,
+      should_trigger: entry.should_trigger,
+      triggered: evaluated.triggered,
+      passed: evaluated.triggered === entry.should_trigger,
+      evidence: evaluated.evidence,
+    };
+  });
+}

package/cli/selftune/evolution/validate-proposal.ts CHANGED Viewed

@@ -40,6 +40,8 @@ export interface ValidationResult {
   net_change: number; // after - before pass rate
   by_invocation_type?: InvocationTypeScores;
   per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
+  validation_mode?: "llm_judge";
+  validation_agent?: string;
 }
 // ---------------------------------------------------------------------------
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
       regressions: [],
       new_passes: [],
       net_change: 0,
+      validation_mode: "llm_judge",
+      validation_agent: agent,
     };
   }
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
     net_change: netChange,
     by_invocation_type: invocationScores,
     per_entry_results: perEntryResults,
+    validation_mode: "llm_judge",
+    validation_agent: agent,
   };
 }
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
       regressions: [],
       new_passes: [],
       net_change: 0,
+      validation_mode: "llm_judge",
+      validation_agent: agent,
     };
   }
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
     net_change: netChange,
     by_invocation_type: invocationScores,
     per_entry_results: perEntryResults,
+    validation_mode: "llm_judge",
+    validation_agent: agent,
   };
 }