selftune 0.2.18 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/README.md +9 -4
  2. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +60 -0
  3. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +1 -0
  4. package/apps/local-dashboard/dist/assets/vendor-table-BIiI3YhS.js +1 -0
  5. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +12 -0
  6. package/apps/local-dashboard/dist/index.html +5 -5
  7. package/cli/selftune/alpha-upload/stage-canonical.ts +7 -6
  8. package/cli/selftune/constants.ts +10 -0
  9. package/cli/selftune/contribute/contribute.ts +30 -2
  10. package/cli/selftune/contribution-config.ts +249 -0
  11. package/cli/selftune/contribution-relay.ts +177 -0
  12. package/cli/selftune/contribution-signals.ts +219 -0
  13. package/cli/selftune/contribution-staging.ts +147 -0
  14. package/cli/selftune/contributions.ts +532 -0
  15. package/cli/selftune/creator-contributions.ts +333 -0
  16. package/cli/selftune/dashboard-contract.ts +209 -1
  17. package/cli/selftune/dashboard-server.ts +45 -11
  18. package/cli/selftune/eval/family-overlap.ts +714 -0
  19. package/cli/selftune/eval/hooks-to-evals.ts +182 -28
  20. package/cli/selftune/eval/synthetic-evals.ts +298 -11
  21. package/cli/selftune/evolution/evidence.ts +5 -0
  22. package/cli/selftune/evolution/evolve-body.ts +62 -2
  23. package/cli/selftune/evolution/evolve.ts +58 -1
  24. package/cli/selftune/evolution/validate-body.ts +10 -0
  25. package/cli/selftune/evolution/validate-host-replay.ts +236 -0
  26. package/cli/selftune/evolution/validate-proposal.ts +10 -0
  27. package/cli/selftune/evolution/validate-routing.ts +112 -5
  28. package/cli/selftune/export.ts +2 -2
  29. package/cli/selftune/index.ts +41 -5
  30. package/cli/selftune/ingestors/codex-rollout.ts +31 -35
  31. package/cli/selftune/ingestors/codex-wrapper.ts +32 -24
  32. package/cli/selftune/localdb/db.ts +2 -2
  33. package/cli/selftune/localdb/direct-write.ts +8 -3
  34. package/cli/selftune/localdb/materialize.ts +7 -2
  35. package/cli/selftune/localdb/queries.ts +712 -31
  36. package/cli/selftune/localdb/schema.ts +30 -1
  37. package/cli/selftune/recover.ts +153 -0
  38. package/cli/selftune/repair/skill-usage.ts +363 -4
  39. package/cli/selftune/routes/actions.ts +35 -1
  40. package/cli/selftune/routes/analytics.ts +14 -0
  41. package/cli/selftune/routes/index.ts +1 -0
  42. package/cli/selftune/routes/overview.ts +112 -4
  43. package/cli/selftune/routes/skill-report.ts +575 -11
  44. package/cli/selftune/status.ts +81 -2
  45. package/cli/selftune/sync.ts +56 -2
  46. package/cli/selftune/trust-model.ts +66 -0
  47. package/cli/selftune/types.ts +103 -0
  48. package/cli/selftune/utils/skill-detection.ts +43 -0
  49. package/cli/selftune/utils/text-similarity.ts +73 -0
  50. package/cli/selftune/watchlist.ts +65 -0
  51. package/package.json +1 -1
  52. package/packages/ui/src/components/ActivityTimeline.tsx +165 -150
  53. package/packages/ui/src/components/EvidenceViewer.tsx +419 -145
  54. package/packages/ui/src/components/EvolutionTimeline.tsx +81 -29
  55. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +33 -16
  56. package/packages/ui/src/components/RecentActivityFeed.tsx +72 -41
  57. package/packages/ui/src/components/section-cards.tsx +12 -9
  58. package/packages/ui/src/primitives/card.tsx +1 -1
  59. package/packages/ui/src/types.ts +4 -0
  60. package/skill/SKILL.md +11 -1
  61. package/skill/Workflows/AlphaUpload.md +4 -0
  62. package/skill/Workflows/Composability.md +78 -0
  63. package/skill/Workflows/Contribute.md +6 -3
  64. package/skill/Workflows/Contributions.md +97 -0
  65. package/skill/Workflows/CreatorContributions.md +74 -0
  66. package/skill/Workflows/Dashboard.md +31 -0
  67. package/skill/Workflows/Evals.md +57 -8
  68. package/skill/Workflows/Evolve.md +23 -0
  69. package/skill/Workflows/Ingest.md +7 -0
  70. package/skill/Workflows/Initialize.md +20 -1
  71. package/skill/Workflows/Recover.md +84 -0
  72. package/skill/Workflows/RepairSkillUsage.md +12 -4
  73. package/skill/Workflows/Sync.md +18 -12
  74. package/apps/local-dashboard/dist/assets/index-BMIS6uUh.css +0 -2
  75. package/apps/local-dashboard/dist/assets/index-DOu3iLD9.js +0 -16
  76. package/apps/local-dashboard/dist/assets/vendor-table-pHbDxq36.js +0 -8
  77. package/apps/local-dashboard/dist/assets/vendor-ui-DIwlrGlb.js +0 -12
@@ -31,12 +31,13 @@ import { callViaSubagent } from "../utils/llm-call.js";
31
31
  import { appendAuditEntry } from "./audit.js";
32
32
  import { checkConstitutionSizeOnly } from "./constitutional.js";
33
33
  import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
34
- import { appendEvidenceEntry } from "./evidence.js";
34
+ import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
35
35
  import { extractFailurePatterns } from "./extract-patterns.js";
36
36
  import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
37
37
  import { generateRoutingProposal } from "./propose-routing.js";
38
38
  import { refineBodyProposal } from "./refine-body.js";
39
39
  import { validateBodyProposal } from "./validate-body.js";
40
+ import { buildRoutingReplayFixture } from "./validate-host-replay.js";
40
41
  import { validateRoutingProposal } from "./validate-routing.js";
41
42
 
42
43
  // ---------------------------------------------------------------------------
@@ -106,6 +107,10 @@ function createAuditEntry(
106
107
  action: EvolutionAuditEntry["action"],
107
108
  details: string,
108
109
  skillName?: string,
110
+ provenance?: Pick<
111
+ EvolutionAuditEntry,
112
+ "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
113
+ >,
109
114
  ): EvolutionAuditEntry {
110
115
  return {
111
116
  timestamp: new Date().toISOString(),
@@ -113,6 +118,14 @@ function createAuditEntry(
113
118
  skill_name: skillName,
114
119
  action,
115
120
  details,
121
+ ...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
122
+ ...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
123
+ ...(provenance?.validation_fixture_id
124
+ ? { validation_fixture_id: provenance.validation_fixture_id }
125
+ : {}),
126
+ ...(provenance?.validation_evidence_ref
127
+ ? { validation_evidence_ref: provenance.validation_evidence_ref }
128
+ : {}),
116
129
  };
117
130
  }
118
131
 
@@ -181,8 +194,12 @@ export async function evolveBody(
181
194
  proposalId: string,
182
195
  action: EvolutionAuditEntry["action"],
183
196
  details: string,
197
+ provenance?: Pick<
198
+ EvolutionAuditEntry,
199
+ "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
200
+ >,
184
201
  ): void {
185
- const entry = createAuditEntry(proposalId, action, details, skillName);
202
+ const entry = createAuditEntry(proposalId, action, details, skillName, provenance);
186
203
  auditEntries.push(entry);
187
204
  try {
188
205
  _appendAuditEntry(entry);
@@ -443,11 +460,17 @@ export async function evolveBody(
443
460
  const validationModelFlag = options.validationModel ?? studentModel;
444
461
  let validation: BodyValidationResult;
445
462
  if (target === "routing") {
463
+ const replayFixture = buildRoutingReplayFixture({
464
+ skillName,
465
+ skillPath,
466
+ platform: studentAgent === "codex" ? "codex" : "claude_code",
467
+ });
446
468
  validation = await _validateRoutingProposal(
447
469
  proposal,
448
470
  evalSet,
449
471
  studentAgent,
450
472
  validationModelFlag,
473
+ { replayFixture },
451
474
  );
452
475
  } else {
453
476
  validation = await _validateBodyProposal(
@@ -458,11 +481,18 @@ export async function evolveBody(
458
481
  );
459
482
  }
460
483
  lastValidation = validation;
484
+ const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
461
485
 
462
486
  recordAudit(
463
487
  proposal.proposal_id,
464
488
  "validated",
465
489
  `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
490
+ {
491
+ validation_mode: validation.validation_mode,
492
+ validation_agent: validation.validation_agent,
493
+ validation_fixture_id: validation.validation_fixture_id,
494
+ validation_evidence_ref: validatedEvidenceRef,
495
+ },
466
496
  );
467
497
  recordEvidence({
468
498
  timestamp: new Date().toISOString(),
@@ -480,6 +510,12 @@ export async function evolveBody(
480
510
  gates_total: validation.gates_total,
481
511
  gate_results: validation.gate_results,
482
512
  regressions: validation.regressions,
513
+ before_pass_rate: validation.before_pass_rate,
514
+ after_pass_rate: validation.after_pass_rate,
515
+ validation_mode: validation.validation_mode,
516
+ validation_agent: validation.validation_agent,
517
+ validation_fixture_id: validation.validation_fixture_id,
518
+ validation_evidence_ref: validatedEvidenceRef,
483
519
  },
484
520
  });
485
521
 
@@ -491,6 +527,12 @@ export async function evolveBody(
491
527
  proposal.proposal_id,
492
528
  "rejected",
493
529
  `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
530
+ {
531
+ validation_mode: validation.validation_mode,
532
+ validation_agent: validation.validation_agent,
533
+ validation_fixture_id: validation.validation_fixture_id,
534
+ validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
535
+ },
494
536
  );
495
537
  recordEvidence({
496
538
  timestamp: new Date().toISOString(),
@@ -508,6 +550,12 @@ export async function evolveBody(
508
550
  gates_total: validation.gates_total,
509
551
  gate_results: validation.gate_results,
510
552
  regressions: validation.regressions,
553
+ before_pass_rate: validation.before_pass_rate,
554
+ after_pass_rate: validation.after_pass_rate,
555
+ validation_mode: validation.validation_mode,
556
+ validation_agent: validation.validation_agent,
557
+ validation_fixture_id: validation.validation_fixture_id,
558
+ validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
511
559
  },
512
560
  });
513
561
 
@@ -607,6 +655,12 @@ export async function evolveBody(
607
655
  lastProposal.proposal_id,
608
656
  "deployed",
609
657
  `Deployed ${target} proposal for ${skillName}`,
658
+ {
659
+ validation_mode: lastValidation.validation_mode,
660
+ validation_agent: lastValidation.validation_agent,
661
+ validation_fixture_id: lastValidation.validation_fixture_id,
662
+ validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
663
+ },
610
664
  );
611
665
  recordEvidence({
612
666
  timestamp: new Date().toISOString(),
@@ -624,6 +678,12 @@ export async function evolveBody(
624
678
  gates_total: lastValidation.gates_total,
625
679
  gate_results: lastValidation.gate_results,
626
680
  regressions: lastValidation.regressions,
681
+ before_pass_rate: lastValidation.before_pass_rate,
682
+ after_pass_rate: lastValidation.after_pass_rate,
683
+ validation_mode: lastValidation.validation_mode,
684
+ validation_agent: lastValidation.validation_agent,
685
+ validation_fixture_id: lastValidation.validation_fixture_id,
686
+ validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
627
687
  },
628
688
  });
629
689
 
@@ -43,7 +43,7 @@ import { createEvolveTUI } from "../utils/tui.js";
43
43
  import { appendAuditEntry } from "./audit.js";
44
44
  import { checkConstitution } from "./constitutional.js";
45
45
  import { scoreDescription } from "./description-quality.js";
46
- import { appendEvidenceEntry } from "./evidence.js";
46
+ import { appendEvidenceEntry, buildValidationEvidenceRef } from "./evidence.js";
47
47
  import { extractFailurePatterns } from "./extract-patterns.js";
48
48
  import {
49
49
  computeInvocationScores,
@@ -139,6 +139,10 @@ function createAuditEntry(
139
139
  evalSnapshot?: EvalPassRate,
140
140
  skillName?: string,
141
141
  iterationsUsed?: number,
142
+ provenance?: Pick<
143
+ EvolutionAuditEntry,
144
+ "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
145
+ >,
142
146
  ): EvolutionAuditEntry {
143
147
  return {
144
148
  timestamp: new Date().toISOString(),
@@ -148,6 +152,14 @@ function createAuditEntry(
148
152
  ...(skillName ? { skill_name: skillName } : {}),
149
153
  ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}),
150
154
  ...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}),
155
+ ...(provenance?.validation_mode ? { validation_mode: provenance.validation_mode } : {}),
156
+ ...(provenance?.validation_agent ? { validation_agent: provenance.validation_agent } : {}),
157
+ ...(provenance?.validation_fixture_id
158
+ ? { validation_fixture_id: provenance.validation_fixture_id }
159
+ : {}),
160
+ ...(provenance?.validation_evidence_ref
161
+ ? { validation_evidence_ref: provenance.validation_evidence_ref }
162
+ : {}),
151
163
  };
152
164
  }
153
165
 
@@ -289,6 +301,10 @@ export async function evolve(
289
301
  details: string,
290
302
  evalSnapshot?: EvalPassRate,
291
303
  iterationsUsed?: number,
304
+ provenance?: Pick<
305
+ EvolutionAuditEntry,
306
+ "validation_mode" | "validation_agent" | "validation_fixture_id" | "validation_evidence_ref"
307
+ >,
292
308
  ): void {
293
309
  const entry = createAuditEntry(
294
310
  proposalId,
@@ -297,6 +313,7 @@ export async function evolve(
297
313
  evalSnapshot,
298
314
  skillName,
299
315
  iterationsUsed,
316
+ provenance,
300
317
  );
301
318
  auditEntries.push(entry);
302
319
  try {
@@ -637,10 +654,18 @@ export async function evolve(
637
654
  options.validationModel,
638
655
  );
639
656
  llmCallCount += countValidationLlmCalls(evalSet.length);
657
+ const evidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
640
658
  recordAudit(
641
659
  proposal.proposal_id,
642
660
  "validated",
643
661
  `Pareto validation: improved=${validation.improved}`,
662
+ undefined,
663
+ undefined,
664
+ {
665
+ validation_mode: validation.validation_mode,
666
+ validation_agent: validation.validation_agent,
667
+ validation_evidence_ref: evidenceRef,
668
+ },
644
669
  );
645
670
  recordEvidence({
646
671
  timestamp: new Date().toISOString(),
@@ -660,6 +685,9 @@ export async function evolve(
660
685
  regressions: validation.regressions,
661
686
  new_passes: validation.new_passes,
662
687
  per_entry_results: validation.per_entry_results,
688
+ validation_mode: validation.validation_mode,
689
+ validation_agent: validation.validation_agent,
690
+ validation_evidence_ref: evidenceRef,
663
691
  },
664
692
  });
665
693
 
@@ -866,11 +894,18 @@ export async function evolve(
866
894
  failed: evalSet.length - Math.round(validation.after_pass_rate * evalSet.length),
867
895
  pass_rate: validation.after_pass_rate,
868
896
  };
897
+ const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
869
898
  recordAudit(
870
899
  proposal.proposal_id,
871
900
  "validated",
872
901
  `Validation complete: improved=${validation.improved}`,
873
902
  evalSnapshot,
903
+ undefined,
904
+ {
905
+ validation_mode: validation.validation_mode,
906
+ validation_agent: validation.validation_agent,
907
+ validation_evidence_ref: validatedEvidenceRef,
908
+ },
874
909
  );
875
910
  recordEvidence({
876
911
  timestamp: new Date().toISOString(),
@@ -890,6 +925,9 @@ export async function evolve(
890
925
  regressions: validation.regressions,
891
926
  new_passes: validation.new_passes,
892
927
  per_entry_results: validation.per_entry_results,
928
+ validation_mode: validation.validation_mode,
929
+ validation_agent: validation.validation_agent,
930
+ validation_evidence_ref: validatedEvidenceRef,
893
931
  },
894
932
  });
895
933
 
@@ -906,10 +944,18 @@ export async function evolve(
906
944
 
907
945
  if (!validation.improved) {
908
946
  feedbackReason = `Validation failed: net_change=${validation.net_change.toFixed(3)}, improved=false`;
947
+ const rejectedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "rejected");
909
948
  recordAudit(
910
949
  proposal.proposal_id,
911
950
  "rejected",
912
951
  `Validation failed: net_change=${validation.net_change.toFixed(3)} (stopping: ${stopping.reason})`,
952
+ undefined,
953
+ undefined,
954
+ {
955
+ validation_mode: validation.validation_mode,
956
+ validation_agent: validation.validation_agent,
957
+ validation_evidence_ref: rejectedEvidenceRef,
958
+ },
913
959
  );
914
960
  recordEvidence({
915
961
  timestamp: new Date().toISOString(),
@@ -929,6 +975,9 @@ export async function evolve(
929
975
  regressions: validation.regressions,
930
976
  new_passes: validation.new_passes,
931
977
  per_entry_results: validation.per_entry_results,
978
+ validation_mode: validation.validation_mode,
979
+ validation_agent: validation.validation_agent,
980
+ validation_evidence_ref: rejectedEvidenceRef,
932
981
  },
933
982
  });
934
983
 
@@ -1138,6 +1187,11 @@ export async function evolve(
1138
1187
  pass_rate: lastValidation.after_pass_rate,
1139
1188
  },
1140
1189
  iterationsCompleted,
1190
+ {
1191
+ validation_mode: lastValidation.validation_mode,
1192
+ validation_agent: lastValidation.validation_agent,
1193
+ validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
1194
+ },
1141
1195
  );
1142
1196
  recordEvidence({
1143
1197
  timestamp: new Date().toISOString(),
@@ -1157,6 +1211,9 @@ export async function evolve(
1157
1211
  regressions: lastValidation.regressions,
1158
1212
  new_passes: lastValidation.new_passes,
1159
1213
  per_entry_results: lastValidation.per_entry_results,
1214
+ validation_mode: lastValidation.validation_mode,
1215
+ validation_agent: lastValidation.validation_agent,
1216
+ validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
1160
1217
  },
1161
1218
  });
1162
1219
  }
@@ -209,6 +209,8 @@ export async function validateBodyProposal(
209
209
  gate_results: gateResults,
210
210
  improved: false,
211
211
  regressions: [],
212
+ validation_mode: "structural_guard",
213
+ validation_agent: agent,
212
214
  };
213
215
  }
214
216
 
@@ -250,5 +252,13 @@ export async function validateBodyProposal(
250
252
  gate_results: gateResults,
251
253
  improved: gatesPassed === 3,
252
254
  regressions: accuracy.regressions,
255
+ validation_mode: "llm_judge",
256
+ validation_agent: agent,
257
+ ...(evalSet.length > 0
258
+ ? {
259
+ before_pass_rate: accuracy.before_pass_rate,
260
+ after_pass_rate: accuracy.after_pass_rate,
261
+ }
262
+ : {}),
253
263
  };
254
264
  }
@@ -0,0 +1,236 @@
1
+ import { existsSync, readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
2
+ import { basename, dirname, join } from "node:path";
3
+
4
+ import type { EvalEntry, RoutingReplayEntryResult, RoutingReplayFixture } from "../types.js";
5
+ import { parseFrontmatter } from "../utils/frontmatter.js";
6
+ import { containsWholeSkillMention } from "../utils/skill-discovery.js";
7
+ import { findGitRepositoryRoot } from "../utils/skill-discovery.js";
8
+ import {
9
+ extractWhenToUseLines,
10
+ jaccardSimilarity,
11
+ tokenizeText,
12
+ } from "../utils/text-similarity.js";
13
+
14
+ interface ReplaySkillSurface {
15
+ skillName: string;
16
+ descriptionTokens: Set<string>;
17
+ whenToUseTokens: Set<string>;
18
+ }
19
+
20
+ /**
21
+ * Minimum score needed before replay treats routing text or skill-surface overlap
22
+ * as a real match. Tuned to suppress weak false positives without killing recall
23
+ * for short routing phrases and sparse skill surfaces.
24
+ */
25
+ const HOST_REPLAY_MATCH_THRESHOLD = 0.18;
26
+
27
+ function resolveReplayPath(path: string): string {
28
+ try {
29
+ return realpathSync(path);
30
+ } catch {
31
+ return path;
32
+ }
33
+ }
34
+
35
+ function listCompetingSkillPaths(targetSkillPath: string): string[] {
36
+ const normalizedTargetPath = resolveReplayPath(targetSkillPath);
37
+ const targetSkillDir = dirname(normalizedTargetPath);
38
+ const registryDir = dirname(targetSkillDir);
39
+ const targetDirName = basename(targetSkillDir);
40
+ const competingPaths: string[] = [];
41
+
42
+ try {
43
+ for (const entry of readdirSync(registryDir)) {
44
+ if (entry === targetDirName) continue;
45
+ const candidateDir = join(registryDir, entry);
46
+ try {
47
+ if (!statSync(candidateDir).isDirectory()) continue;
48
+ } catch {
49
+ continue;
50
+ }
51
+
52
+ const candidateSkillPath = join(candidateDir, "SKILL.md");
53
+ if (!existsSync(candidateSkillPath)) continue;
54
+ competingPaths.push(resolveReplayPath(candidateSkillPath));
55
+ }
56
+ } catch {
57
+ // Ignore unreadable registries and treat the fixture as target-only.
58
+ }
59
+
60
+ return competingPaths.sort((a, b) => a.localeCompare(b));
61
+ }
62
+
63
+ export function buildRoutingReplayFixture(options: {
64
+ skillName: string;
65
+ skillPath: string;
66
+ platform?: RoutingReplayFixture["platform"];
67
+ fixtureId?: string;
68
+ workspaceRoot?: string;
69
+ }): RoutingReplayFixture {
70
+ const targetSkillPath = resolveReplayPath(options.skillPath);
71
+ const workspaceRoot =
72
+ options.workspaceRoot ?? findGitRepositoryRoot(dirname(dirname(targetSkillPath)));
73
+ const platform = options.platform ?? "claude_code";
74
+
75
+ return {
76
+ fixture_id: options.fixtureId ?? `auto-${platform}-${options.skillName}`,
77
+ platform,
78
+ target_skill_name: options.skillName,
79
+ target_skill_path: targetSkillPath,
80
+ competing_skill_paths: listCompetingSkillPaths(targetSkillPath),
81
+ ...(workspaceRoot ? { workspace_root: workspaceRoot } : {}),
82
+ };
83
+ }
84
+
85
+ function loadReplaySkillSurface(skillPath: string): ReplaySkillSurface {
86
+ const fallbackName = basename(dirname(skillPath)) || "unknown-skill";
87
+ try {
88
+ const raw = readFileSync(skillPath, "utf8");
89
+ const parsed = parseFrontmatter(raw);
90
+ return {
91
+ skillName: parsed.name.trim() || fallbackName,
92
+ descriptionTokens: tokenizeText(parsed.description),
93
+ whenToUseTokens: tokenizeText(extractWhenToUseLines(parsed.body).join(" ")),
94
+ };
95
+ } catch {
96
+ return {
97
+ skillName: fallbackName,
98
+ descriptionTokens: new Set<string>(),
99
+ whenToUseTokens: new Set<string>(),
100
+ };
101
+ }
102
+ }
103
+
104
+ function extractRoutingTriggerPhrases(routing: string): string[] {
105
+ const lines = routing
106
+ .trim()
107
+ .split("\n")
108
+ .map((line) => line.trim())
109
+ .filter(Boolean);
110
+ if (lines.length < 3) return [];
111
+
112
+ const phrases: string[] = [];
113
+ for (const row of lines.slice(2)) {
114
+ if (!row.startsWith("|") || !row.endsWith("|")) continue;
115
+ const cells = row.split("|").map((cell) => cell.trim());
116
+ const triggerCell = cells[1];
117
+ if (!triggerCell) continue;
118
+ for (const part of triggerCell.split(/,|\/| or /i)) {
119
+ const phrase = part.trim().replace(/^["'`]|["'`]$/g, "");
120
+ if (phrase.length >= 3) phrases.push(phrase);
121
+ }
122
+ }
123
+ return phrases;
124
+ }
125
+
126
+ function scoreQueryAgainstTriggerPhrases(query: string, triggerPhrases: string[]): number {
127
+ const normalizedQuery = query.toLowerCase();
128
+ const queryTokens = tokenizeText(query);
129
+ let best = 0;
130
+ for (const phrase of triggerPhrases) {
131
+ const normalizedPhrase = phrase.toLowerCase();
132
+ if (normalizedQuery.includes(normalizedPhrase)) {
133
+ best = Math.max(best, 1);
134
+ continue;
135
+ }
136
+ best = Math.max(best, jaccardSimilarity(queryTokens, tokenizeText(phrase)));
137
+ }
138
+ return best;
139
+ }
140
+
141
+ function scoreQueryAgainstSkillSurface(query: string, surface: ReplaySkillSurface): number {
142
+ const queryTokens = tokenizeText(query);
143
+ return Math.max(
144
+ jaccardSimilarity(queryTokens, surface.descriptionTokens),
145
+ jaccardSimilarity(queryTokens, surface.whenToUseTokens),
146
+ );
147
+ }
148
+
149
+ function evaluateReplayTrigger(
150
+ query: string,
151
+ routing: string,
152
+ targetSurface: ReplaySkillSurface,
153
+ competingSurfaces: ReplaySkillSurface[],
154
+ ): { triggered: boolean; evidence: string } {
155
+ const normalizedQuery = query.trim();
156
+ if (containsWholeSkillMention(normalizedQuery, targetSurface.skillName)) {
157
+ return {
158
+ triggered: true,
159
+ evidence: `explicit target mention: ${targetSurface.skillName}`,
160
+ };
161
+ }
162
+
163
+ for (const competingSurface of competingSurfaces) {
164
+ if (containsWholeSkillMention(normalizedQuery, competingSurface.skillName)) {
165
+ return {
166
+ triggered: false,
167
+ evidence: `explicit competing skill mention: ${competingSurface.skillName}`,
168
+ };
169
+ }
170
+ }
171
+
172
+ const triggerPhrases = extractRoutingTriggerPhrases(routing);
173
+ const triggerScore = scoreQueryAgainstTriggerPhrases(normalizedQuery, triggerPhrases);
174
+ const targetSurfaceScore = scoreQueryAgainstSkillSurface(normalizedQuery, targetSurface);
175
+ const targetScore = Math.max(triggerScore, targetSurfaceScore);
176
+ const bestCompetitor = competingSurfaces
177
+ .map((surface) => ({
178
+ skillName: surface.skillName,
179
+ score: scoreQueryAgainstSkillSurface(normalizedQuery, surface),
180
+ }))
181
+ .sort((a, b) => b.score - a.score)[0];
182
+
183
+ if (targetScore < HOST_REPLAY_MATCH_THRESHOLD) {
184
+ return {
185
+ triggered: false,
186
+ evidence: "target routing and skill surface did not clear replay threshold",
187
+ };
188
+ }
189
+
190
+ if (bestCompetitor && bestCompetitor.score >= targetScore) {
191
+ return {
192
+ triggered: false,
193
+ evidence: `competing skill surface scored higher: ${bestCompetitor.skillName}`,
194
+ };
195
+ }
196
+
197
+ if (triggerScore >= targetSurfaceScore) {
198
+ return {
199
+ triggered: true,
200
+ evidence:
201
+ triggerScore === 1
202
+ ? "query matched a routing trigger phrase exactly"
203
+ : "query aligned with routing trigger language",
204
+ };
205
+ }
206
+
207
+ return {
208
+ triggered: true,
209
+ evidence: "query aligned with target skill surface in replay fixture",
210
+ };
211
+ }
212
+
213
+ export function runHostReplayFixture(options: {
214
+ routing: string;
215
+ evalSet: EvalEntry[];
216
+ fixture: RoutingReplayFixture;
217
+ }): RoutingReplayEntryResult[] {
218
+ const targetSurface = loadReplaySkillSurface(options.fixture.target_skill_path);
219
+ const competingSurfaces = options.fixture.competing_skill_paths.map(loadReplaySkillSurface);
220
+
221
+ return options.evalSet.map((entry) => {
222
+ const evaluated = evaluateReplayTrigger(
223
+ entry.query,
224
+ options.routing,
225
+ targetSurface,
226
+ competingSurfaces,
227
+ );
228
+ return {
229
+ query: entry.query,
230
+ should_trigger: entry.should_trigger,
231
+ triggered: evaluated.triggered,
232
+ passed: evaluated.triggered === entry.should_trigger,
233
+ evidence: evaluated.evidence,
234
+ };
235
+ });
236
+ }
@@ -40,6 +40,8 @@ export interface ValidationResult {
40
40
  net_change: number; // after - before pass rate
41
41
  by_invocation_type?: InvocationTypeScores;
42
42
  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
43
+ validation_mode?: "llm_judge";
44
+ validation_agent?: string;
43
45
  }
44
46
 
45
47
  // ---------------------------------------------------------------------------
@@ -63,6 +65,8 @@ export async function validateProposalSequential(
63
65
  regressions: [],
64
66
  new_passes: [],
65
67
  net_change: 0,
68
+ validation_mode: "llm_judge",
69
+ validation_agent: agent,
66
70
  };
67
71
  }
68
72
 
@@ -174,6 +178,8 @@ export async function validateProposalSequential(
174
178
  net_change: netChange,
175
179
  by_invocation_type: invocationScores,
176
180
  per_entry_results: perEntryResults,
181
+ validation_mode: "llm_judge",
182
+ validation_agent: agent,
177
183
  };
178
184
  }
179
185
 
@@ -220,6 +226,8 @@ export async function validateProposalBatched(
220
226
  regressions: [],
221
227
  new_passes: [],
222
228
  net_change: 0,
229
+ validation_mode: "llm_judge",
230
+ validation_agent: agent,
223
231
  };
224
232
  }
225
233
 
@@ -342,6 +350,8 @@ export async function validateProposalBatched(
342
350
  net_change: netChange,
343
351
  by_invocation_type: invocationScores,
344
352
  per_entry_results: perEntryResults,
353
+ validation_mode: "llm_judge",
354
+ validation_agent: agent,
345
355
  };
346
356
  }
347
357