selftune 0.2.22 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270)
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +95 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/adapters/pi/hook.ts +273 -0
  12. package/cli/selftune/adapters/pi/install.ts +207 -0
  13. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  14. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  15. package/cli/selftune/auto-update.ts +200 -8
  16. package/cli/selftune/canonical-export.ts +55 -25
  17. package/cli/selftune/command-surface.ts +397 -0
  18. package/cli/selftune/constants.ts +10 -1
  19. package/cli/selftune/contribute/contribute.ts +64 -13
  20. package/cli/selftune/contribution-config.ts +57 -3
  21. package/cli/selftune/contribution-preferences.ts +117 -0
  22. package/cli/selftune/contribution-signals.ts +8 -4
  23. package/cli/selftune/contribution-staging.ts +13 -2
  24. package/cli/selftune/contributions.ts +55 -121
  25. package/cli/selftune/creator-contributions.ts +29 -10
  26. package/cli/selftune/cron/setup.ts +7 -3
  27. package/cli/selftune/dashboard-contract.ts +87 -0
  28. package/cli/selftune/dashboard-server.ts +168 -17
  29. package/cli/selftune/dashboard.ts +350 -17
  30. package/cli/selftune/eval/baseline.ts +21 -5
  31. package/cli/selftune/eval/execution-eval.ts +170 -0
  32. package/cli/selftune/eval/family-overlap.ts +2 -2
  33. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  34. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  35. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  36. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  37. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  38. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  39. package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
  40. package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
  41. package/cli/selftune/evolution/evidence.ts +2 -6
  42. package/cli/selftune/evolution/evolve-body.ts +152 -38
  43. package/cli/selftune/evolution/evolve.ts +244 -52
  44. package/cli/selftune/evolution/rollback.ts +0 -1
  45. package/cli/selftune/evolution/validate-body.ts +111 -49
  46. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  47. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  48. package/cli/selftune/evolution/validate-routing.ts +51 -108
  49. package/cli/selftune/evolution/validation-contract.ts +91 -0
  50. package/cli/selftune/grading/auto-grade.ts +11 -7
  51. package/cli/selftune/grading/grade-session.ts +10 -16
  52. package/cli/selftune/hooks/skill-eval.ts +2 -1
  53. package/cli/selftune/hooks-shared/types.ts +1 -0
  54. package/cli/selftune/index.ts +58 -15
  55. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  56. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  57. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  58. package/cli/selftune/ingestors/pi-ingest.ts +727 -0
  59. package/cli/selftune/init.ts +38 -4
  60. package/cli/selftune/localdb/direct-write.ts +120 -1
  61. package/cli/selftune/localdb/materialize.ts +6 -7
  62. package/cli/selftune/localdb/queries/cron.ts +34 -0
  63. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  64. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  65. package/cli/selftune/localdb/queries/execution.ts +133 -0
  66. package/cli/selftune/localdb/queries/json.ts +18 -0
  67. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  68. package/cli/selftune/localdb/queries/raw.ts +95 -0
  69. package/cli/selftune/localdb/queries/staging.ts +270 -0
  70. package/cli/selftune/localdb/queries/trust.ts +392 -0
  71. package/cli/selftune/localdb/queries.ts +60 -2162
  72. package/cli/selftune/localdb/schema.ts +59 -0
  73. package/cli/selftune/monitoring/watch.ts +96 -29
  74. package/cli/selftune/normalization.ts +3 -0
  75. package/cli/selftune/observability.ts +12 -3
  76. package/cli/selftune/orchestrate/cli.ts +161 -0
  77. package/cli/selftune/orchestrate/execute.ts +295 -0
  78. package/cli/selftune/orchestrate/finalize.ts +157 -0
  79. package/cli/selftune/orchestrate/locks.ts +40 -0
  80. package/cli/selftune/orchestrate/plan.ts +131 -0
  81. package/cli/selftune/orchestrate/post-run.ts +59 -0
  82. package/cli/selftune/orchestrate/prepare.ts +334 -0
  83. package/cli/selftune/orchestrate/report.ts +182 -0
  84. package/cli/selftune/orchestrate/runtime.ts +120 -0
  85. package/cli/selftune/orchestrate/signals.ts +48 -0
  86. package/cli/selftune/orchestrate.ts +162 -1142
  87. package/cli/selftune/registry/client.ts +74 -0
  88. package/cli/selftune/registry/history.ts +54 -0
  89. package/cli/selftune/registry/index.ts +90 -0
  90. package/cli/selftune/registry/install.ts +141 -0
  91. package/cli/selftune/registry/list.ts +44 -0
  92. package/cli/selftune/registry/push.ts +171 -0
  93. package/cli/selftune/registry/rollback.ts +49 -0
  94. package/cli/selftune/registry/status.ts +62 -0
  95. package/cli/selftune/registry/sync.ts +125 -0
  96. package/cli/selftune/repair/skill-usage.ts +9 -3
  97. package/cli/selftune/routes/overview.ts +5 -2
  98. package/cli/selftune/routes/skill-report.ts +15 -2
  99. package/cli/selftune/schedule.ts +5 -5
  100. package/cli/selftune/status.ts +70 -2
  101. package/cli/selftune/sync.ts +127 -23
  102. package/cli/selftune/testing-readiness.ts +597 -0
  103. package/cli/selftune/types.ts +46 -5
  104. package/cli/selftune/uninstall.ts +2 -1
  105. package/cli/selftune/utils/canonical-log.ts +1 -9
  106. package/cli/selftune/utils/cli-error.ts +9 -0
  107. package/cli/selftune/utils/jsonl.ts +1 -30
  108. package/cli/selftune/utils/llm-call.ts +126 -6
  109. package/cli/selftune/utils/skill-discovery.ts +24 -0
  110. package/cli/selftune/workflows/proposals.ts +184 -0
  111. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  112. package/cli/selftune/workflows/workflows.ts +100 -26
  113. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  114. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  115. package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
  116. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  117. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  118. package/node_modules/@selftune/telemetry-contract/package.json +1 -1
  119. package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
  120. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
  121. package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
  122. package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
  123. package/package.json +25 -9
  124. package/packages/dashboard-core/AGENTS.md +18 -0
  125. package/packages/dashboard-core/README.md +30 -0
  126. package/packages/dashboard-core/index.ts +3 -0
  127. package/packages/dashboard-core/package.json +39 -0
  128. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  129. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  130. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  131. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  132. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  133. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  134. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  135. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  136. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  137. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  138. package/packages/dashboard-core/src/gates/index.ts +3 -0
  139. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  140. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  141. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  142. package/packages/dashboard-core/src/host/index.ts +3 -0
  143. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  144. package/packages/dashboard-core/src/models/index.ts +4 -0
  145. package/packages/dashboard-core/src/models/overview.ts +98 -0
  146. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  147. package/packages/dashboard-core/src/models/skills.ts +34 -0
  148. package/packages/dashboard-core/src/routes/index.ts +2 -0
  149. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  150. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  151. package/packages/dashboard-core/src/routes/types.ts +39 -0
  152. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  153. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  154. package/packages/dashboard-core/src/screens/index.ts +37 -0
  155. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  156. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  157. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  158. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  159. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  160. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  161. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  162. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  163. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  164. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  165. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  166. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  167. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  168. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  169. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  170. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  171. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  172. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  173. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  174. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  175. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  176. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  177. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  178. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  179. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  180. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  181. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  182. package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
  183. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  184. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  185. package/packages/telemetry-contract/package.json +1 -1
  186. package/packages/telemetry-contract/src/index.ts +1 -0
  187. package/packages/telemetry-contract/src/schemas.ts +63 -5
  188. package/packages/telemetry-contract/src/types.ts +97 -7
  189. package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
  190. package/packages/ui/AGENTS.md +16 -0
  191. package/packages/ui/README.md +1 -1
  192. package/packages/ui/package.json +1 -1
  193. package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
  194. package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
  195. package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
  196. package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
  197. package/packages/ui/src/components/InfoTip.tsx +1 -2
  198. package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
  199. package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
  200. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
  201. package/packages/ui/src/components/OverviewPanels.tsx +693 -0
  202. package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
  203. package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
  204. package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
  205. package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
  206. package/packages/ui/src/components/index.ts +56 -1
  207. package/packages/ui/src/components/section-cards.tsx +18 -35
  208. package/packages/ui/src/components/skill-health-grid.tsx +47 -37
  209. package/packages/ui/src/lib/constants.tsx +0 -1
  210. package/packages/ui/src/primitives/card.tsx +1 -1
  211. package/packages/ui/src/primitives/checkbox.tsx +1 -1
  212. package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
  213. package/packages/ui/src/primitives/select.tsx +2 -2
  214. package/packages/ui/src/primitives/tabs.tsx +7 -6
  215. package/packages/ui/src/types.ts +182 -4
  216. package/skill/SKILL.md +130 -318
  217. package/skill/agents/diagnosis-analyst.md +3 -3
  218. package/skill/agents/evolution-reviewer.md +3 -3
  219. package/skill/agents/integration-guide.md +3 -3
  220. package/skill/agents/pattern-analyst.md +2 -2
  221. package/skill/references/cli-quick-reference.md +89 -0
  222. package/skill/references/creator-playbook.md +131 -0
  223. package/skill/references/examples.md +48 -0
  224. package/skill/references/troubleshooting.md +47 -0
  225. package/skill/references/version-history.md +1 -1
  226. package/skill/selftune.contribute.json +11 -0
  227. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  228. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  229. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  230. package/skill/workflows/CreateTestDeploy.md +170 -0
  231. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  232. package/skill/{Workflows → workflows}/Cron.md +1 -1
  233. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  234. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  235. package/skill/{Workflows → workflows}/Evals.md +67 -2
  236. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  237. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  238. package/skill/{Workflows → workflows}/Grade.md +1 -1
  239. package/skill/{Workflows → workflows}/Ingest.md +60 -2
  240. package/skill/{Workflows → workflows}/Initialize.md +16 -9
  241. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  242. package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
  243. package/skill/workflows/Registry.md +99 -0
  244. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  245. package/skill/workflows/SignalsDashboard.md +87 -0
  246. package/skill/{Workflows → workflows}/Sync.md +3 -1
  247. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  248. package/skill/{Workflows → workflows}/Watch.md +42 -2
  249. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  250. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
  251. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
  252. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  253. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
  254. package/cli/selftune/utils/html.ts +0 -27
  255. package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
  256. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  257. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  258. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  259. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  260. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  261. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  262. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  263. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  264. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  265. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  266. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  267. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  268. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  269. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  270. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -3,13 +3,37 @@
3
3
  *
4
4
  * 3-gate validation for full body evolution proposals:
5
5
  * Gate 1 (structural): Pure code — YAML frontmatter, # Title, ## Workflow Routing preserved
6
- * Gate 2 (trigger accuracy): Student model YES/NO per eval entry
6
+ * Gate 2 (trigger accuracy): Replay-backed or student model YES/NO per eval entry
7
7
  * Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
8
+ *
9
+ * Gate 2 now supports replay-backed validation (via replay engine) in addition
10
+ * to LLM-judge-based checking. When real host/runtime replay is available and
11
+ * succeeds, the replay path is preferred. Falls back to LLM judge otherwise.
8
12
  */
9
13
 
10
- import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
14
+ import type {
15
+ BodyEvolutionProposal,
16
+ BodyValidationResult,
17
+ EvalEntry,
18
+ RoutingReplayEntryResult,
19
+ ValidationGate,
20
+ ValidationMode,
21
+ } from "../types.js";
11
22
  import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
12
- import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
23
+ import { runJudgeValidation } from "./engines/judge-engine.js";
24
+ import type { ReplayValidationOptions } from "./engines/replay-engine.js";
25
+ import { runValidationContract, type ValidationStrategy } from "./validation-contract.js";
26
+
27
+ // ---------------------------------------------------------------------------
28
+ // Types
29
+ // ---------------------------------------------------------------------------
30
+
31
+ export interface BodyValidationOptions {
32
+ /** Replay options for Gate 2 trigger accuracy. */
33
+ replay?: ReplayValidationOptions;
34
+ mode?: ValidationStrategy;
35
+ onReplayFallback?: (reason?: string) => void;
36
+ }
13
37
 
14
38
  // ---------------------------------------------------------------------------
15
39
  // Gate 1: Structural validation (pure code, no LLM)
@@ -57,12 +81,15 @@ export function validateBodyStructure(proposedBody: string): { valid: boolean; r
57
81
  }
58
82
 
59
83
  // ---------------------------------------------------------------------------
60
- // Gate 2: Trigger accuracy (student model YES/NO)
84
+ // Gate 2: Trigger accuracy (replay-backed or student model YES/NO)
61
85
  // ---------------------------------------------------------------------------
62
86
 
63
87
  /**
64
88
  * Run trigger checks on the eval set using the proposed body content.
65
89
  * Returns before/after pass rates.
90
+ *
91
+ * When replay options are provided, attempts host/runtime replay first.
92
+ * Falls back to LLM judge when replay is unavailable or no options given.
66
93
  */
67
94
  export async function validateBodyTriggerAccuracy(
68
95
  originalBody: string,
@@ -70,55 +97,84 @@ export async function validateBodyTriggerAccuracy(
70
97
  evalSet: EvalEntry[],
71
98
  agent: string,
72
99
  modelFlag?: string,
100
+ options?: BodyValidationOptions,
73
101
  ): Promise<{
74
102
  before_pass_rate: number;
75
103
  after_pass_rate: number;
76
104
  improved: boolean;
77
105
  regressions: string[];
106
+ validation_mode: ValidationMode;
107
+ validation_agent?: string;
108
+ validation_fixture_id?: string;
109
+ validation_fallback_reason?: string;
110
+ per_entry_results?: import("../types.js").RoutingReplayEntryResult[];
111
+ before_entry_results?: import("../types.js").RoutingReplayEntryResult[];
78
112
  }> {
79
113
  if (evalSet.length === 0) {
80
- return { before_pass_rate: 0, after_pass_rate: 0, improved: false, regressions: [] };
81
- }
82
-
83
- const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
84
- let beforePassed = 0;
85
- let afterPassed = 0;
86
- const regressions: string[] = [];
87
-
88
- for (const entry of evalSet) {
89
- // Check with original body
90
- const beforePrompt = buildTriggerCheckPrompt(originalBody, entry.query);
91
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
92
- const beforeTriggered = parseTriggerResponse(beforeRaw);
93
- const beforePass =
94
- (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
95
-
96
- // Check with proposed body
97
- const afterPrompt = buildTriggerCheckPrompt(proposedBody, entry.query);
98
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
99
- const afterTriggered = parseTriggerResponse(afterRaw);
100
- const afterPass =
101
- (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
102
-
103
- if (beforePass) beforePassed++;
104
- if (afterPass) afterPassed++;
105
-
106
- // Track regressions
107
- if (beforePass && !afterPass) {
108
- regressions.push(entry.query);
109
- }
114
+ return {
115
+ before_pass_rate: 0,
116
+ after_pass_rate: 0,
117
+ improved: false,
118
+ regressions: [],
119
+ validation_mode: "llm_judge",
120
+ validation_agent: agent,
121
+ };
110
122
  }
111
123
 
112
- const total = evalSet.length;
113
- const beforePassRate = beforePassed / total;
114
- const afterPassRate = afterPassed / total;
124
+ const { result, fallbackReason } = await runValidationContract<{
125
+ before_pass_rate: number;
126
+ after_pass_rate: number;
127
+ improved: boolean;
128
+ regressions: string[];
129
+ validation_mode: ValidationMode;
130
+ validation_agent?: string;
131
+ validation_fixture_id?: string;
132
+ validation_fallback_reason?: string;
133
+ per_entry_results?: RoutingReplayEntryResult[];
134
+ before_entry_results?: RoutingReplayEntryResult[];
135
+ }>({
136
+ mode: options?.mode ?? "auto",
137
+ originalContent: originalBody,
138
+ proposedContent: proposedBody,
139
+ evalSet,
140
+ agent,
141
+ replayOptions: options?.replay,
142
+ runJudge: async () => {
143
+ const judgeResult = await runJudgeValidation(
144
+ originalBody,
145
+ proposedBody,
146
+ evalSet,
147
+ agent,
148
+ modelFlag,
149
+ );
150
+
151
+ return {
152
+ result: {
153
+ before_pass_rate: judgeResult.before_pass_rate,
154
+ after_pass_rate: judgeResult.after_pass_rate,
155
+ improved: judgeResult.improved,
156
+ regressions: judgeResult.regressions,
157
+ validation_mode: judgeResult.validation_mode,
158
+ validation_agent: judgeResult.validation_agent,
159
+ },
160
+ modeUsed: judgeResult.validation_mode,
161
+ };
162
+ },
163
+ onReplayFallback: options?.onReplayFallback,
164
+ adaptReplayResult: (replayResult) => ({
165
+ before_pass_rate: replayResult.before_pass_rate,
166
+ after_pass_rate: replayResult.after_pass_rate,
167
+ improved: replayResult.improved,
168
+ regressions: [],
169
+ validation_mode: replayResult.validation_mode,
170
+ validation_agent: replayResult.validation_agent,
171
+ validation_fixture_id: replayResult.validation_fixture_id,
172
+ per_entry_results: replayResult.per_entry_results,
173
+ before_entry_results: replayResult.before_entry_results,
174
+ }),
175
+ });
115
176
 
116
- return {
117
- before_pass_rate: beforePassRate,
118
- after_pass_rate: afterPassRate,
119
- improved: afterPassRate > beforePassRate,
120
- regressions,
121
- };
177
+ return fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result;
122
178
  }
123
179
 
124
180
  // ---------------------------------------------------------------------------
@@ -190,8 +246,9 @@ export async function validateBodyProposal(
190
246
  agent: string,
191
247
  modelFlag?: string,
192
248
  qualityThreshold = QUALITY_THRESHOLD,
249
+ options?: BodyValidationOptions,
193
250
  ): Promise<BodyValidationResult> {
194
- const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
251
+ const gateResults: Array<{ gate: ValidationGate; passed: boolean; reason: string }> = [];
195
252
 
196
253
  // Gate 1: Structural validation (pure code)
197
254
  const structural = validateBodyStructure(proposal.proposed_body);
@@ -214,20 +271,21 @@ export async function validateBodyProposal(
214
271
  };
215
272
  }
216
273
 
217
- // Gate 2: Trigger accuracy (student model)
274
+ // Gate 2: Trigger accuracy (replay-backed or student model)
218
275
  const accuracy = await validateBodyTriggerAccuracy(
219
276
  proposal.original_body,
220
277
  proposal.proposed_body,
221
278
  evalSet,
222
279
  agent,
223
280
  modelFlag,
281
+ options,
224
282
  );
225
283
  gateResults.push({
226
284
  gate: "trigger_accuracy",
227
285
  passed: accuracy.improved,
228
286
  reason: accuracy.improved
229
- ? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
230
- : `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
287
+ ? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
288
+ : `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
231
289
  });
232
290
 
233
291
  // Gate 3: Quality assessment (student model)
@@ -252,13 +310,17 @@ export async function validateBodyProposal(
252
310
  gate_results: gateResults,
253
311
  improved: gatesPassed === 3,
254
312
  regressions: accuracy.regressions,
255
- validation_mode: "llm_judge",
256
- validation_agent: agent,
313
+ validation_mode: accuracy.validation_mode,
314
+ validation_agent: accuracy.validation_agent ?? agent,
315
+ validation_fallback_reason: accuracy.validation_fallback_reason,
316
+ validation_fixture_id: accuracy.validation_fixture_id,
257
317
  ...(evalSet.length > 0
258
318
  ? {
259
319
  before_pass_rate: accuracy.before_pass_rate,
260
320
  after_pass_rate: accuracy.after_pass_rate,
261
321
  }
262
322
  : {}),
323
+ per_entry_results: accuracy.per_entry_results,
324
+ before_entry_results: accuracy.before_entry_results,
263
325
  };
264
326
  }