selftune 0.2.21 → 0.2.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/README.md +15 -8
  2. package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +1 -0
  3. package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +59 -0
  4. package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +12 -0
  5. package/apps/local-dashboard/dist/index.html +3 -3
  6. package/cli/selftune/adapters/cline/hook.ts +167 -0
  7. package/cli/selftune/adapters/cline/install.ts +197 -0
  8. package/cli/selftune/adapters/codex/hook.ts +296 -0
  9. package/cli/selftune/adapters/codex/install.ts +289 -0
  10. package/cli/selftune/adapters/opencode/hook.ts +222 -0
  11. package/cli/selftune/adapters/opencode/install.ts +543 -0
  12. package/cli/selftune/adapters/pi/hook.ts +273 -0
  13. package/cli/selftune/adapters/pi/install.ts +207 -0
  14. package/cli/selftune/constants.ts +10 -1
  15. package/cli/selftune/dashboard-contract.ts +14 -0
  16. package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
  17. package/cli/selftune/evolution/engines/replay-engine.ts +158 -0
  18. package/cli/selftune/evolution/evidence.ts +2 -6
  19. package/cli/selftune/evolution/evolve-body.ts +73 -20
  20. package/cli/selftune/evolution/validate-body.ts +78 -42
  21. package/cli/selftune/evolution/validate-routing.ts +45 -104
  22. package/cli/selftune/hooks/auto-activate.ts +43 -37
  23. package/cli/selftune/hooks/skill-eval.ts +2 -1
  24. package/cli/selftune/hooks-shared/git-metadata.ts +149 -0
  25. package/cli/selftune/hooks-shared/hook-output.ts +105 -0
  26. package/cli/selftune/hooks-shared/normalize.ts +196 -0
  27. package/cli/selftune/hooks-shared/session-state.ts +76 -0
  28. package/cli/selftune/hooks-shared/skill-paths.ts +50 -0
  29. package/cli/selftune/hooks-shared/stdin-dispatch.ts +59 -0
  30. package/cli/selftune/hooks-shared/types.ts +91 -0
  31. package/cli/selftune/index.ts +76 -6
  32. package/cli/selftune/ingestors/pi-ingest.ts +726 -0
  33. package/cli/selftune/init.ts +11 -1
  34. package/cli/selftune/localdb/direct-write.ts +85 -0
  35. package/cli/selftune/localdb/materialize.ts +6 -7
  36. package/cli/selftune/localdb/queries.ts +126 -0
  37. package/cli/selftune/localdb/schema.ts +38 -0
  38. package/cli/selftune/observability.ts +8 -1
  39. package/cli/selftune/orchestrate.ts +43 -0
  40. package/cli/selftune/registry/client.ts +74 -0
  41. package/cli/selftune/registry/history.ts +54 -0
  42. package/cli/selftune/registry/index.ts +90 -0
  43. package/cli/selftune/registry/install.ts +141 -0
  44. package/cli/selftune/registry/list.ts +44 -0
  45. package/cli/selftune/registry/push.ts +171 -0
  46. package/cli/selftune/registry/rollback.ts +49 -0
  47. package/cli/selftune/registry/status.ts +62 -0
  48. package/cli/selftune/registry/sync.ts +125 -0
  49. package/cli/selftune/repair/skill-usage.ts +4 -1
  50. package/cli/selftune/status.ts +31 -0
  51. package/cli/selftune/sync.ts +127 -23
  52. package/cli/selftune/types.ts +2 -1
  53. package/cli/selftune/utils/jsonl.ts +1 -30
  54. package/cli/selftune/utils/llm-call.ts +99 -34
  55. package/cli/selftune/utils/skill-discovery.ts +22 -0
  56. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  57. package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
  58. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  59. package/node_modules/@selftune/telemetry-contract/package.json +1 -1
  60. package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
  61. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +22 -4
  62. package/node_modules/@selftune/telemetry-contract/src/types.ts +1 -12
  63. package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
  64. package/package.json +1 -1
  65. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  66. package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
  67. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  68. package/packages/telemetry-contract/package.json +1 -1
  69. package/packages/telemetry-contract/src/index.ts +1 -0
  70. package/packages/telemetry-contract/src/schemas.ts +22 -4
  71. package/packages/telemetry-contract/src/types.ts +1 -12
  72. package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
  73. package/packages/ui/AGENTS.md +16 -0
  74. package/packages/ui/README.md +1 -1
  75. package/packages/ui/package.json +1 -1
  76. package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
  77. package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
  78. package/packages/ui/src/components/EvidenceViewer.tsx +153 -443
  79. package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
  80. package/packages/ui/src/components/InfoTip.tsx +1 -2
  81. package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
  82. package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
  83. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
  84. package/packages/ui/src/components/OverviewPanels.tsx +652 -0
  85. package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
  86. package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
  87. package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
  88. package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
  89. package/packages/ui/src/components/index.ts +56 -1
  90. package/packages/ui/src/components/section-cards.tsx +18 -35
  91. package/packages/ui/src/components/skill-health-grid.tsx +47 -37
  92. package/packages/ui/src/lib/constants.tsx +0 -1
  93. package/packages/ui/src/primitives/card.tsx +1 -1
  94. package/packages/ui/src/primitives/checkbox.tsx +1 -1
  95. package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
  96. package/packages/ui/src/primitives/select.tsx +2 -2
  97. package/packages/ui/src/types.ts +172 -4
  98. package/skill/SKILL.md +26 -2
  99. package/skill/Workflows/Ingest.md +60 -2
  100. package/skill/Workflows/Initialize.md +54 -9
  101. package/skill/Workflows/PlatformHooks.md +109 -0
  102. package/skill/Workflows/Registry.md +99 -0
  103. package/skill/Workflows/Sync.md +3 -1
  104. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
  105. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
  106. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
  107. package/cli/selftune/utils/html.ts +0 -27
  108. package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
@@ -0,0 +1,158 @@
1
+ /**
2
+ * replay-engine.ts
3
+ *
4
+ * Cohesive module for all replay-based validation logic:
5
+ * - Host/runtime replay (PRIMARY path — real agent routing decisions)
6
+ * - Fixture-backed replay (FALLBACK — surface similarity matching)
7
+ * - Custom replay runner support
8
+ *
9
+ * Host/runtime replay is preferred because it captures actual agent routing
10
+ * behavior. Fixture-backed replay is used as a fallback when no replay runner
11
+ * is provided or when the replay runner throws.
12
+ *
13
+ * Extracted from validate-routing.ts and validate-body.ts to isolate
14
+ * replay-specific concerns from judge-specific concerns.
15
+ */
16
+
17
+ import type {
18
+ EvalEntry,
19
+ RoutingReplayEntryResult,
20
+ RoutingReplayFixture,
21
+ ValidationMode,
22
+ } from "../../types.js";
23
+ import { runHostReplayFixture } from "../validate-host-replay.js";
24
+
25
+ // ---------------------------------------------------------------------------
26
+ // Types
27
+ // ---------------------------------------------------------------------------
28
+
/**
 * Argument bundle handed to a replay runner for one replay pass.
 */
export interface ReplayRunnerInput {
  /** Content under test for this pass (the original or the proposed text). */
  routing: string;
  /** Fixture the replay is executed against. */
  fixture: RoutingReplayFixture;
  /** Agent identifier forwarded from the validation call. */
  agent: string;
  /** Eval entries to replay. */
  evalSet: Array<EvalEntry>;
}
35
+
/**
 * Asynchronous replay function: receives one {@link ReplayRunnerInput} and
 * resolves to the per-entry replay results for that input.
 */
export type ReplayRunner = (
  input: ReplayRunnerInput,
) => Promise<Array<RoutingReplayEntryResult>>;
37
+
/**
 * Optional inputs that enable replay-backed validation.
 */
export interface ReplayValidationOptions {
  /** Host/runtime replay runner — the PRIMARY validation path when provided. */
  replayRunner?: ReplayRunner;
  /** Fixture to replay against; without it no replay path is available. */
  replayFixture?: RoutingReplayFixture;
}
43
+
/**
 * Outcome of a replay-backed before/after comparison.
 */
export interface ReplayValidationResult {
  /** Replay path that produced this result. */
  validation_mode: ValidationMode;
  /** Agent identifier the validation was run for. */
  validation_agent: string;
  /** Identifier of the fixture that was replayed. */
  validation_fixture_id?: string;
  /** Pass rate measured with the original content. */
  before_pass_rate: number;
  /** Pass rate measured with the proposed content. */
  after_pass_rate: number;
  /** Whether the proposed content passed more entries than the original. */
  improved: boolean;
  /** Before-phase per-entry results for structured persistence. */
  before_entry_results?: Array<RoutingReplayEntryResult>;
  /** After-phase per-entry results. */
  per_entry_results?: Array<RoutingReplayEntryResult>;
}
55
+
56
+ // ---------------------------------------------------------------------------
57
+ // Internal helpers
58
+ // ---------------------------------------------------------------------------
59
+
/**
 * Summarize a before/after pair of replay runs into a validation result.
 *
 * @param beforeResults - Per-entry results obtained with the original content.
 * @param afterResults - Per-entry results obtained with the proposed content.
 * @param total - Denominator used for both pass rates.
 * @param mode - Validation mode recorded on the result.
 * @param agent - Agent identifier recorded on the result.
 * @param fixtureId - Fixture identifier recorded on the result.
 * @returns Pass rates, the improvement flag, and both per-entry result lists.
 */
function computeReplayResult(
  beforeResults: RoutingReplayEntryResult[],
  afterResults: RoutingReplayEntryResult[],
  total: number,
  mode: ValidationMode,
  agent: string,
  fixtureId: string,
): ReplayValidationResult {
  const countPassed = (results: RoutingReplayEntryResult[]): number =>
    results.filter((result) => result.passed).length;

  // A non-positive (or NaN) denominator would produce NaN/Infinity rates that
  // leak into reports and persistence; report 0 instead.
  const toRate = (passed: number): number => (total > 0 ? passed / total : 0);

  const beforePassed = countPassed(beforeResults);
  const afterPassed = countPassed(afterResults);

  return {
    before_pass_rate: toRate(beforePassed),
    after_pass_rate: toRate(afterPassed),
    // Both rates share one denominator, so comparing raw counts is equivalent.
    improved: afterPassed > beforePassed,
    validation_mode: mode,
    validation_agent: agent,
    validation_fixture_id: fixtureId,
    per_entry_results: afterResults,
    before_entry_results: beforeResults,
  };
}
82
+
83
+ // ---------------------------------------------------------------------------
84
+ // Replay validation engine
85
+ // ---------------------------------------------------------------------------
86
+
/**
 * Validate a proposal by replaying the eval set against both the original and
 * the proposed content.
 *
 * Resolution order:
 * 1. Empty eval set or no `replayFixture` → `null` (no replay path available).
 * 2. `replayRunner` provided → host/runtime replay, reported as `"host_replay"`.
 * 3. No runner, or the host replay attempt throws → fixture-backed replay via
 *    `runHostReplayFixture`, reported as `"fixture_replay"`.
 *
 * @param originalContent - Content replayed for the "before" measurement.
 * @param proposedContent - Content replayed for the "after" measurement.
 * @param evalSet - Eval entries to replay; its length is the pass-rate denominator.
 * @param agent - Agent identifier passed to the runner and recorded on the result.
 * @param options - Replay fixture and optional host replay runner.
 * @returns The replay comparison, or `null` when no replay path is available.
 */
export async function runReplayValidation(
  originalContent: string,
  proposedContent: string,
  evalSet: EvalEntry[],
  agent: string,
  options: ReplayValidationOptions = {},
): Promise<ReplayValidationResult | null> {
  if (evalSet.length === 0) {
    return null;
  }
  if (!options.replayFixture) {
    return null;
  }

  const total = evalSet.length;
  const fixture = options.replayFixture;

  // PRIMARY path: host/runtime replay, attempted only when a runner exists.
  if (options.replayRunner) {
    const originalInput = { routing: originalContent, evalSet, agent, fixture };
    const proposedInput = { routing: proposedContent, evalSet, agent, fixture };
    try {
      // Before-pass first, then after-pass; the two runs are sequential.
      const hostBefore = await options.replayRunner(originalInput);
      const hostAfter = await options.replayRunner(proposedInput);
      const mode = "host_replay";
      return computeReplayResult(hostBefore, hostAfter, total, mode, agent, fixture.fixture_id);
    } catch {
      // Host replay failed — continue with the fixture-based fallback below.
    }
  }

  // FALLBACK path: fixture-backed replay (surface similarity matching).
  const fixtureBefore = runHostReplayFixture({ routing: originalContent, evalSet, fixture });
  const fixtureAfter = runHostReplayFixture({ routing: proposedContent, evalSet, fixture });
  const fallbackMode = "fixture_replay";
  return computeReplayResult(
    fixtureBefore,
    fixtureAfter,
    total,
    fallbackMode,
    agent,
    fixture.fixture_id,
  );
}
@@ -12,11 +12,7 @@ import { queryEvolutionEvidence } from "../localdb/queries.js";
12
12
  import type { EvolutionEvidenceEntry } from "../types.js";
13
13
 
14
14
  /** Append a structured evidence artifact to the evolution evidence log (SQLite). */
15
- export function appendEvidenceEntry(
16
- entry: EvolutionEvidenceEntry,
17
- /** @deprecated Unused; retained for API compatibility during migration */
18
- _logPath?: string,
19
- ): void {
15
+ export function appendEvidenceEntry(entry: EvolutionEvidenceEntry): void {
20
16
  writeEvolutionEvidenceToDb(entry);
21
17
  }
22
18
 
@@ -25,7 +21,7 @@ export function appendEvidenceEntry(
25
21
  *
26
22
  * @param skillName - Optional skill name to filter by
27
23
  */
28
- export function readEvidenceTrail(skillName?: string, _logPath?: string): EvolutionEvidenceEntry[] {
24
+ export function readEvidenceTrail(skillName?: string): EvolutionEvidenceEntry[] {
29
25
  const db = getDb();
30
26
  return queryEvolutionEvidence(db, skillName) as EvolutionEvidenceEntry[];
31
27
  }
@@ -12,6 +12,10 @@ import { parseArgs } from "node:util";
12
12
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
13
13
  import { readGradingResultsForSkill } from "../grading/results.js";
14
14
  import { getDb } from "../localdb/db.js";
15
+ import {
16
+ type ReplayEntryResultInput,
17
+ writeReplayEntryResultsToDb,
18
+ } from "../localdb/direct-write.js";
15
19
  import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
16
20
  import type {
17
21
  BodyEvolutionProposal,
@@ -37,6 +41,7 @@ import { extractFailurePatterns } from "./extract-patterns.js";
37
41
  import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
38
42
  import { generateRoutingProposal } from "./propose-routing.js";
39
43
  import { refineBodyProposal } from "./refine-body.js";
44
+ import type { BodyValidationOptions } from "./validate-body.js";
40
45
  import { validateBodyProposal } from "./validate-body.js";
41
46
  import {
42
47
  buildRoutingReplayFixture,
@@ -463,29 +468,32 @@ export async function evolveBody(
463
468
  // Validate (validationModel overrides studentModel for validation calls)
464
469
  const validationModelFlag = options.validationModel ?? studentModel;
465
470
  let validation: BodyValidationResult;
466
- if (target === "routing") {
467
- const replayFixture = buildRoutingReplayFixture({
468
- skillName,
469
- skillPath,
470
- platform: studentAgent === "codex" ? "codex" : "claude_code",
471
- });
472
- const replayRunner =
473
- replayFixture.platform === "claude_code" && studentAgent === "claude"
474
- ? async ({
471
+
472
+ // Build replay fixture + runner for ALL targets (not just routing)
473
+ const replayFixture = buildRoutingReplayFixture({
474
+ skillName,
475
+ skillPath,
476
+ platform: studentAgent === "codex" ? "codex" : "claude_code",
477
+ });
478
+ const replayRunner =
479
+ replayFixture.platform === "claude_code" && studentAgent === "claude"
480
+ ? async ({
481
+ routing,
482
+ evalSet,
483
+ fixture,
484
+ }: {
485
+ routing: string;
486
+ evalSet: EvalEntry[];
487
+ fixture: RoutingReplayFixture;
488
+ }) =>
489
+ await runClaudeRuntimeReplayFixture({
475
490
  routing,
476
491
  evalSet,
477
492
  fixture,
478
- }: {
479
- routing: string;
480
- evalSet: EvalEntry[];
481
- fixture: RoutingReplayFixture;
482
- }) =>
483
- await runClaudeRuntimeReplayFixture({
484
- routing,
485
- evalSet,
486
- fixture,
487
- })
488
- : undefined;
493
+ })
494
+ : undefined;
495
+
496
+ if (target === "routing") {
489
497
  validation = await _validateRoutingProposal(
490
498
  proposal,
491
499
  evalSet,
@@ -497,11 +505,16 @@ export async function evolveBody(
497
505
  },
498
506
  );
499
507
  } else {
508
+ const bodyReplayOptions: BodyValidationOptions = replayRunner
509
+ ? { replay: { replayFixture, replayRunner } }
510
+ : {};
500
511
  validation = await _validateBodyProposal(
501
512
  proposal,
502
513
  evalSet,
503
514
  studentAgent,
504
515
  validationModelFlag,
516
+ undefined,
517
+ bodyReplayOptions,
505
518
  );
506
519
  }
507
520
  lastValidation = validation;
@@ -543,6 +556,46 @@ export async function evolveBody(
543
556
  },
544
557
  });
545
558
 
559
+ // Persist per-entry replay results to SQLite
560
+ try {
561
+ const entryResults: ReplayEntryResultInput[] = [];
562
+ if (validation.before_entry_results) {
563
+ for (const r of validation.before_entry_results) {
564
+ entryResults.push({
565
+ proposal_id: proposal.proposal_id,
566
+ skill_name: skillName,
567
+ validation_mode: validation.validation_mode ?? "llm_judge",
568
+ phase: "before",
569
+ query: r.query,
570
+ should_trigger: r.should_trigger,
571
+ triggered: r.triggered,
572
+ passed: r.passed,
573
+ evidence: r.evidence,
574
+ });
575
+ }
576
+ }
577
+ if (validation.per_entry_results) {
578
+ for (const r of validation.per_entry_results) {
579
+ entryResults.push({
580
+ proposal_id: proposal.proposal_id,
581
+ skill_name: skillName,
582
+ validation_mode: validation.validation_mode ?? "llm_judge",
583
+ phase: "after",
584
+ query: r.query,
585
+ should_trigger: r.should_trigger,
586
+ triggered: r.triggered,
587
+ passed: r.passed,
588
+ evidence: r.evidence,
589
+ });
590
+ }
591
+ }
592
+ if (entryResults.length > 0) {
593
+ writeReplayEntryResultsToDb(entryResults);
594
+ }
595
+ } catch {
596
+ // Fail-open: replay entry persistence is non-blocking
597
+ }
598
+
546
599
  if (validation.improved) {
547
600
  break;
548
601
  }
@@ -3,13 +3,32 @@
3
3
  *
4
4
  * 3-gate validation for full body evolution proposals:
5
5
  * Gate 1 (structural): Pure code — YAML frontmatter, # Title, ## Workflow Routing preserved
6
- * Gate 2 (trigger accuracy): Student model YES/NO per eval entry
6
+ * Gate 2 (trigger accuracy): Replay-backed or student model YES/NO per eval entry
7
7
  * Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
8
+ *
9
+ * Gate 2 now supports replay-backed validation (via replay engine) in addition
10
+ * to LLM-judge-based checking. When replay options are provided and succeed,
11
+ * the replay path is preferred. Falls back to LLM judge otherwise.
8
12
  */
9
13
 
10
- import type { BodyEvolutionProposal, BodyValidationResult, EvalEntry } from "../types.js";
14
+ import type {
15
+ BodyEvolutionProposal,
16
+ BodyValidationResult,
17
+ EvalEntry,
18
+ ValidationMode,
19
+ } from "../types.js";
11
20
  import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
12
- import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
21
+ import { runJudgeValidation } from "./engines/judge-engine.js";
22
+ import { runReplayValidation, type ReplayValidationOptions } from "./engines/replay-engine.js";
23
+
24
+ // ---------------------------------------------------------------------------
25
+ // Types
26
+ // ---------------------------------------------------------------------------
27
+
28
+ export interface BodyValidationOptions {
29
+ /** Replay options for Gate 2 trigger accuracy. */
30
+ replay?: ReplayValidationOptions;
31
+ }
13
32
 
14
33
  // ---------------------------------------------------------------------------
15
34
  // Gate 1: Structural validation (pure code, no LLM)
@@ -57,12 +76,15 @@ export function validateBodyStructure(proposedBody: string): { valid: boolean; r
57
76
  }
58
77
 
59
78
  // ---------------------------------------------------------------------------
60
- // Gate 2: Trigger accuracy (student model YES/NO)
79
+ // Gate 2: Trigger accuracy (replay-backed or student model YES/NO)
61
80
  // ---------------------------------------------------------------------------
62
81
 
63
82
  /**
64
83
  * Run trigger checks on the eval set using the proposed body content.
65
84
  * Returns before/after pass rates.
85
+ *
86
+ * When replay options are provided, attempts replay-backed validation first.
87
+ * Falls back to LLM judge when replay is unavailable or no options given.
66
88
  */
67
89
  export async function validateBodyTriggerAccuracy(
68
90
  originalBody: string,
@@ -70,54 +92,64 @@ export async function validateBodyTriggerAccuracy(
70
92
  evalSet: EvalEntry[],
71
93
  agent: string,
72
94
  modelFlag?: string,
95
+ options?: BodyValidationOptions,
73
96
  ): Promise<{
74
97
  before_pass_rate: number;
75
98
  after_pass_rate: number;
76
99
  improved: boolean;
77
100
  regressions: string[];
101
+ validation_mode: ValidationMode;
102
+ per_entry_results?: import("../types.js").RoutingReplayEntryResult[];
103
+ before_entry_results?: import("../types.js").RoutingReplayEntryResult[];
78
104
  }> {
79
105
  if (evalSet.length === 0) {
80
- return { before_pass_rate: 0, after_pass_rate: 0, improved: false, regressions: [] };
106
+ return {
107
+ before_pass_rate: 0,
108
+ after_pass_rate: 0,
109
+ improved: false,
110
+ regressions: [],
111
+ validation_mode: "llm_judge",
112
+ };
81
113
  }
82
114
 
83
- const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
84
- let beforePassed = 0;
85
- let afterPassed = 0;
86
- const regressions: string[] = [];
87
-
88
- for (const entry of evalSet) {
89
- // Check with original body
90
- const beforePrompt = buildTriggerCheckPrompt(originalBody, entry.query);
91
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
92
- const beforeTriggered = parseTriggerResponse(beforeRaw);
93
- const beforePass =
94
- (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
95
-
96
- // Check with proposed body
97
- const afterPrompt = buildTriggerCheckPrompt(proposedBody, entry.query);
98
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
99
- const afterTriggered = parseTriggerResponse(afterRaw);
100
- const afterPass =
101
- (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
102
-
103
- if (beforePass) beforePassed++;
104
- if (afterPass) afterPassed++;
105
-
106
- // Track regressions
107
- if (beforePass && !afterPass) {
108
- regressions.push(entry.query);
115
+ // Try replay-backed validation when options are provided
116
+ if (options?.replay) {
117
+ const replayResult = await runReplayValidation(
118
+ originalBody,
119
+ proposedBody,
120
+ evalSet,
121
+ agent,
122
+ options.replay,
123
+ );
124
+
125
+ if (replayResult) {
126
+ return {
127
+ before_pass_rate: replayResult.before_pass_rate,
128
+ after_pass_rate: replayResult.after_pass_rate,
129
+ improved: replayResult.improved,
130
+ regressions: [],
131
+ validation_mode: replayResult.validation_mode,
132
+ per_entry_results: replayResult.per_entry_results,
133
+ before_entry_results: replayResult.before_entry_results,
134
+ };
109
135
  }
110
136
  }
111
137
 
112
- const total = evalSet.length;
113
- const beforePassRate = beforePassed / total;
114
- const afterPassRate = afterPassed / total;
138
+ // Fall back to LLM judge
139
+ const judgeResult = await runJudgeValidation(
140
+ originalBody,
141
+ proposedBody,
142
+ evalSet,
143
+ agent,
144
+ modelFlag,
145
+ );
115
146
 
116
147
  return {
117
- before_pass_rate: beforePassRate,
118
- after_pass_rate: afterPassRate,
119
- improved: afterPassRate > beforePassRate,
120
- regressions,
148
+ before_pass_rate: judgeResult.before_pass_rate,
149
+ after_pass_rate: judgeResult.after_pass_rate,
150
+ improved: judgeResult.improved,
151
+ regressions: judgeResult.regressions,
152
+ validation_mode: judgeResult.validation_mode,
121
153
  };
122
154
  }
123
155
 
@@ -190,6 +222,7 @@ export async function validateBodyProposal(
190
222
  agent: string,
191
223
  modelFlag?: string,
192
224
  qualityThreshold = QUALITY_THRESHOLD,
225
+ options?: BodyValidationOptions,
193
226
  ): Promise<BodyValidationResult> {
194
227
  const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
195
228
 
@@ -214,20 +247,21 @@ export async function validateBodyProposal(
214
247
  };
215
248
  }
216
249
 
217
- // Gate 2: Trigger accuracy (student model)
250
+ // Gate 2: Trigger accuracy (replay-backed or student model)
218
251
  const accuracy = await validateBodyTriggerAccuracy(
219
252
  proposal.original_body,
220
253
  proposal.proposed_body,
221
254
  evalSet,
222
255
  agent,
223
256
  modelFlag,
257
+ options,
224
258
  );
225
259
  gateResults.push({
226
260
  gate: "trigger_accuracy",
227
261
  passed: accuracy.improved,
228
262
  reason: accuracy.improved
229
- ? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
230
- : `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
263
+ ? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
264
+ : `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
231
265
  });
232
266
 
233
267
  // Gate 3: Quality assessment (student model)
@@ -252,7 +286,7 @@ export async function validateBodyProposal(
252
286
  gate_results: gateResults,
253
287
  improved: gatesPassed === 3,
254
288
  regressions: accuracy.regressions,
255
- validation_mode: "llm_judge",
289
+ validation_mode: accuracy.validation_mode,
256
290
  validation_agent: agent,
257
291
  ...(evalSet.length > 0
258
292
  ? {
@@ -260,5 +294,7 @@ export async function validateBodyProposal(
260
294
  after_pass_rate: accuracy.after_pass_rate,
261
295
  }
262
296
  : {}),
297
+ per_entry_results: accuracy.per_entry_results,
298
+ before_entry_results: accuracy.before_entry_results,
263
299
  };
264
300
  }