selftune 0.2.21 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -8
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +1 -0
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +59 -0
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +12 -0
- package/apps/local-dashboard/dist/index.html +3 -3
- package/cli/selftune/adapters/cline/hook.ts +167 -0
- package/cli/selftune/adapters/cline/install.ts +197 -0
- package/cli/selftune/adapters/codex/hook.ts +296 -0
- package/cli/selftune/adapters/codex/install.ts +289 -0
- package/cli/selftune/adapters/opencode/hook.ts +222 -0
- package/cli/selftune/adapters/opencode/install.ts +543 -0
- package/cli/selftune/adapters/pi/hook.ts +273 -0
- package/cli/selftune/adapters/pi/install.ts +207 -0
- package/cli/selftune/constants.ts +10 -1
- package/cli/selftune/dashboard-contract.ts +14 -0
- package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +158 -0
- package/cli/selftune/evolution/evidence.ts +2 -6
- package/cli/selftune/evolution/evolve-body.ts +73 -20
- package/cli/selftune/evolution/validate-body.ts +78 -42
- package/cli/selftune/evolution/validate-routing.ts +45 -104
- package/cli/selftune/hooks/auto-activate.ts +43 -37
- package/cli/selftune/hooks/skill-eval.ts +2 -1
- package/cli/selftune/hooks-shared/git-metadata.ts +149 -0
- package/cli/selftune/hooks-shared/hook-output.ts +105 -0
- package/cli/selftune/hooks-shared/normalize.ts +196 -0
- package/cli/selftune/hooks-shared/session-state.ts +76 -0
- package/cli/selftune/hooks-shared/skill-paths.ts +50 -0
- package/cli/selftune/hooks-shared/stdin-dispatch.ts +59 -0
- package/cli/selftune/hooks-shared/types.ts +91 -0
- package/cli/selftune/index.ts +76 -6
- package/cli/selftune/ingestors/pi-ingest.ts +726 -0
- package/cli/selftune/init.ts +11 -1
- package/cli/selftune/localdb/direct-write.ts +85 -0
- package/cli/selftune/localdb/materialize.ts +6 -7
- package/cli/selftune/localdb/queries.ts +126 -0
- package/cli/selftune/localdb/schema.ts +38 -0
- package/cli/selftune/observability.ts +8 -1
- package/cli/selftune/orchestrate.ts +43 -0
- package/cli/selftune/registry/client.ts +74 -0
- package/cli/selftune/registry/history.ts +54 -0
- package/cli/selftune/registry/index.ts +90 -0
- package/cli/selftune/registry/install.ts +141 -0
- package/cli/selftune/registry/list.ts +44 -0
- package/cli/selftune/registry/push.ts +171 -0
- package/cli/selftune/registry/rollback.ts +49 -0
- package/cli/selftune/registry/status.ts +62 -0
- package/cli/selftune/registry/sync.ts +125 -0
- package/cli/selftune/repair/skill-usage.ts +4 -1
- package/cli/selftune/status.ts +31 -0
- package/cli/selftune/sync.ts +127 -23
- package/cli/selftune/types.ts +2 -1
- package/cli/selftune/utils/jsonl.ts +1 -30
- package/cli/selftune/utils/llm-call.ts +99 -34
- package/cli/selftune/utils/skill-discovery.ts +22 -0
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/package.json +1 -1
- package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +22 -4
- package/node_modules/@selftune/telemetry-contract/src/types.ts +1 -12
- package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/package.json +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/package.json +1 -1
- package/packages/telemetry-contract/src/index.ts +1 -0
- package/packages/telemetry-contract/src/schemas.ts +22 -4
- package/packages/telemetry-contract/src/types.ts +1 -12
- package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/packages/ui/AGENTS.md +16 -0
- package/packages/ui/README.md +1 -1
- package/packages/ui/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
- package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
- package/packages/ui/src/components/EvidenceViewer.tsx +153 -443
- package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
- package/packages/ui/src/components/InfoTip.tsx +1 -2
- package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
- package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
- package/packages/ui/src/components/OverviewPanels.tsx +652 -0
- package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
- package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
- package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
- package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
- package/packages/ui/src/components/index.ts +56 -1
- package/packages/ui/src/components/section-cards.tsx +18 -35
- package/packages/ui/src/components/skill-health-grid.tsx +47 -37
- package/packages/ui/src/lib/constants.tsx +0 -1
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/packages/ui/src/primitives/checkbox.tsx +1 -1
- package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
- package/packages/ui/src/primitives/select.tsx +2 -2
- package/packages/ui/src/types.ts +172 -4
- package/skill/SKILL.md +26 -2
- package/skill/Workflows/Ingest.md +60 -2
- package/skill/Workflows/Initialize.md +54 -9
- package/skill/Workflows/PlatformHooks.md +109 -0
- package/skill/Workflows/Registry.md +99 -0
- package/skill/Workflows/Sync.md +3 -1
- package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
- package/cli/selftune/utils/html.ts +0 -27
- package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* replay-engine.ts
|
|
3
|
+
*
|
|
4
|
+
* Cohesive module for all replay-based validation logic:
|
|
5
|
+
* - Host/runtime replay (PRIMARY path — real agent routing decisions)
|
|
6
|
+
* - Fixture-backed replay (FALLBACK — surface similarity matching)
|
|
7
|
+
* - Custom replay runner support
|
|
8
|
+
*
|
|
9
|
+
* Host/runtime replay is preferred because it captures actual agent routing
|
|
10
|
+
* behavior. Fixture-backed replay is used as a fallback when no replayRunner is
|
|
11
|
+
* provided or when the replayRunner throws an error.
|
|
12
|
+
*
|
|
13
|
+
* Extracted from validate-routing.ts and validate-body.ts to isolate
|
|
14
|
+
* replay-specific concerns from judge-specific concerns.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import type {
|
|
18
|
+
EvalEntry,
|
|
19
|
+
RoutingReplayEntryResult,
|
|
20
|
+
RoutingReplayFixture,
|
|
21
|
+
ValidationMode,
|
|
22
|
+
} from "../../types.js";
|
|
23
|
+
import { runHostReplayFixture } from "../validate-host-replay.js";
|
|
24
|
+
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Types
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
/**
 * Argument object handed to a {@link ReplayRunner} for one replay pass.
 * runReplayValidation builds one of these for the original content and one
 * for the proposed content.
 */
export interface ReplayRunnerInput {
  /** Content to replay: either the original or the proposed text. */
  routing: string;
  /** Eval entries to replay; forwarded unchanged from the caller. */
  evalSet: EvalEntry[];
  /** Agent name forwarded unchanged from the caller. */
  agent: string;
  /** Replay fixture taken from ReplayValidationOptions.replayFixture. */
  fixture: RoutingReplayFixture;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Async host/runtime replay function. It resolves to per-entry results for
 * the given input; if it rejects, runReplayValidation falls back to the
 * fixture-backed path.
 */
export type ReplayRunner = (input: ReplayRunnerInput) => Promise<RoutingReplayEntryResult[]>;
|
|
37
|
+
|
|
38
|
+
/** Options accepted by runReplayValidation. */
export interface ReplayValidationOptions {
  /**
   * Fixture used by both the host and the fixture-backed replay paths.
   * When omitted, runReplayValidation returns null.
   */
  replayFixture?: RoutingReplayFixture;
  /** Host/runtime replay runner — PRIMARY validation path when provided. */
  replayRunner?: ReplayRunner;
}
|
|
43
|
+
|
|
44
|
+
/** Summary of one before/after replay comparison, as built by computeReplayResult. */
export interface ReplayValidationResult {
  /** Passed "before" entries divided by the eval-set size. */
  before_pass_rate: number;
  /** Passed "after" entries divided by the eval-set size. */
  after_pass_rate: number;
  /** True only when strictly more entries passed after than before. */
  improved: boolean;
  /** Replay path that produced the result; this module sets "host_replay" or "fixture_replay". */
  validation_mode: ValidationMode;
  /** Agent name passed to runReplayValidation. */
  validation_agent: string;
  /** `fixture_id` of the replay fixture that was used. */
  validation_fixture_id?: string;
  /** After-phase per-entry results. */
  per_entry_results?: RoutingReplayEntryResult[];
  /** Before-phase per-entry results for structured persistence. */
  before_entry_results?: RoutingReplayEntryResult[];
}
|
|
55
|
+
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
// Internal helpers
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
/**
 * Build the replay summary from the before/after per-entry results.
 *
 * Pass rates use `total` as the denominator rather than the number of results
 * returned, so a replay that yields fewer results than `total` scores lower.
 * A non-positive `total` yields pass rates of 0 instead of NaN/Infinity.
 *
 * @param beforeResults - Per-entry results for the original content.
 * @param afterResults - Per-entry results for the proposed content.
 * @param total - Pass-rate denominator (the eval-set size at the call sites in this file).
 * @param mode - Replay path that produced the results.
 * @param agent - Agent name recorded in the result.
 * @param fixtureId - Identifier of the fixture that was replayed.
 * @returns The summary, including both per-entry result lists.
 */
function computeReplayResult(
  beforeResults: RoutingReplayEntryResult[],
  afterResults: RoutingReplayEntryResult[],
  total: number,
  mode: ValidationMode,
  agent: string,
  fixtureId: string,
): ReplayValidationResult {
  const beforePassed = beforeResults.filter((result) => result.passed).length;
  const afterPassed = afterResults.filter((result) => result.passed).length;

  // Guard the division so an empty eval set cannot leak NaN into persisted rates.
  const toRate = (passedCount: number): number => (total > 0 ? passedCount / total : 0);

  return {
    before_pass_rate: toRate(beforePassed),
    after_pass_rate: toRate(afterPassed),
    // Strict comparison: an unchanged pass count is not an improvement.
    improved: afterPassed > beforePassed,
    validation_mode: mode,
    validation_agent: agent,
    validation_fixture_id: fixtureId,
    per_entry_results: afterResults,
    before_entry_results: beforeResults,
  };
}
|
|
82
|
+
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Replay validation engine
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
/**
 * Validate a proposed change by replaying the eval set against both the
 * original and the proposed content and comparing how many entries pass.
 *
 * Path selection:
 *   1. Host/runtime replay through `options.replayRunner`, when one is given.
 *   2. Fixture-backed replay through `runHostReplayFixture`, when no runner is
 *      given or the runner throws.
 *
 * @param originalContent - Content replayed for the "before" measurement.
 * @param proposedContent - Content replayed for the "after" measurement.
 * @param evalSet - Entries to replay; its length is the pass-rate denominator.
 * @param agent - Agent name forwarded to the runner and recorded in the result.
 * @param options - Replay fixture and optional host runner.
 * @returns The replay summary, or null when the eval set is empty or no
 *   fixture was provided.
 */
export async function runReplayValidation(
  originalContent: string,
  proposedContent: string,
  evalSet: EvalEntry[],
  agent: string,
  options: ReplayValidationOptions = {},
): Promise<ReplayValidationResult | null> {
  // Nothing to replay.
  if (evalSet.length === 0) {
    return null;
  }

  // Both replay paths need a fixture.
  const fixture = options.replayFixture;
  if (!fixture) {
    return null;
  }

  const entryCount = evalSet.length;

  if (options.replayRunner) {
    try {
      // Before and after run one at a time, original content first.
      const hostBefore = await options.replayRunner({
        routing: originalContent,
        evalSet,
        agent,
        fixture,
      });
      const hostAfter = await options.replayRunner({
        routing: proposedContent,
        evalSet,
        agent,
        fixture,
      });

      return computeReplayResult(
        hostBefore,
        hostAfter,
        entryCount,
        "host_replay",
        agent,
        fixture.fixture_id,
      );
    } catch {
      // Deliberately ignored: a failed host replay degrades to the fixture path below.
    }
  }

  // Fixture-backed replay (surface similarity matching).
  const fixtureBefore = runHostReplayFixture({
    routing: originalContent,
    evalSet,
    fixture,
  });
  const fixtureAfter = runHostReplayFixture({
    routing: proposedContent,
    evalSet,
    fixture,
  });

  return computeReplayResult(
    fixtureBefore,
    fixtureAfter,
    entryCount,
    "fixture_replay",
    agent,
    fixture.fixture_id,
  );
}
|
|
@@ -12,11 +12,7 @@ import { queryEvolutionEvidence } from "../localdb/queries.js";
|
|
|
12
12
|
import type { EvolutionEvidenceEntry } from "../types.js";
|
|
13
13
|
|
|
14
14
|
/** Append a structured evidence artifact to the evolution evidence log (SQLite). */
|
|
15
|
-
export function appendEvidenceEntry(
|
|
16
|
-
entry: EvolutionEvidenceEntry,
|
|
17
|
-
/** @deprecated Unused; retained for API compatibility during migration */
|
|
18
|
-
_logPath?: string,
|
|
19
|
-
): void {
|
|
15
|
+
export function appendEvidenceEntry(entry: EvolutionEvidenceEntry): void {
  // Thin wrapper: the entry is handed straight to the database writer.
  writeEvolutionEvidenceToDb(entry);
}
|
|
22
18
|
|
|
@@ -25,7 +21,7 @@ export function appendEvidenceEntry(
|
|
|
25
21
|
*
|
|
26
22
|
* @param skillName - Optional skill name to filter by
|
|
27
23
|
*/
|
|
28
|
-
export function readEvidenceTrail(skillName?: string
|
|
24
|
+
export function readEvidenceTrail(skillName?: string): EvolutionEvidenceEntry[] {
  // Rows come straight from the evidence query and are cast, not validated.
  const evidenceRows = queryEvolutionEvidence(getDb(), skillName);
  return evidenceRows as EvolutionEvidenceEntry[];
}
|
|
@@ -12,6 +12,10 @@ import { parseArgs } from "node:util";
|
|
|
12
12
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
13
13
|
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
14
14
|
import { getDb } from "../localdb/db.js";
|
|
15
|
+
import {
|
|
16
|
+
type ReplayEntryResultInput,
|
|
17
|
+
writeReplayEntryResultsToDb,
|
|
18
|
+
} from "../localdb/direct-write.js";
|
|
15
19
|
import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
|
|
16
20
|
import type {
|
|
17
21
|
BodyEvolutionProposal,
|
|
@@ -37,6 +41,7 @@ import { extractFailurePatterns } from "./extract-patterns.js";
|
|
|
37
41
|
import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
|
|
38
42
|
import { generateRoutingProposal } from "./propose-routing.js";
|
|
39
43
|
import { refineBodyProposal } from "./refine-body.js";
|
|
44
|
+
import type { BodyValidationOptions } from "./validate-body.js";
|
|
40
45
|
import { validateBodyProposal } from "./validate-body.js";
|
|
41
46
|
import {
|
|
42
47
|
buildRoutingReplayFixture,
|
|
@@ -463,29 +468,32 @@ export async function evolveBody(
|
|
|
463
468
|
// Validate (validationModel overrides studentModel for validation calls)
|
|
464
469
|
const validationModelFlag = options.validationModel ?? studentModel;
|
|
465
470
|
let validation: BodyValidationResult;
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
471
|
+
|
|
472
|
+
// Build replay fixture + runner for ALL targets (not just routing)
|
|
473
|
+
const replayFixture = buildRoutingReplayFixture({
|
|
474
|
+
skillName,
|
|
475
|
+
skillPath,
|
|
476
|
+
platform: studentAgent === "codex" ? "codex" : "claude_code",
|
|
477
|
+
});
|
|
478
|
+
const replayRunner =
|
|
479
|
+
replayFixture.platform === "claude_code" && studentAgent === "claude"
|
|
480
|
+
? async ({
|
|
481
|
+
routing,
|
|
482
|
+
evalSet,
|
|
483
|
+
fixture,
|
|
484
|
+
}: {
|
|
485
|
+
routing: string;
|
|
486
|
+
evalSet: EvalEntry[];
|
|
487
|
+
fixture: RoutingReplayFixture;
|
|
488
|
+
}) =>
|
|
489
|
+
await runClaudeRuntimeReplayFixture({
|
|
475
490
|
routing,
|
|
476
491
|
evalSet,
|
|
477
492
|
fixture,
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
}) =>
|
|
483
|
-
await runClaudeRuntimeReplayFixture({
|
|
484
|
-
routing,
|
|
485
|
-
evalSet,
|
|
486
|
-
fixture,
|
|
487
|
-
})
|
|
488
|
-
: undefined;
|
|
493
|
+
})
|
|
494
|
+
: undefined;
|
|
495
|
+
|
|
496
|
+
if (target === "routing") {
|
|
489
497
|
validation = await _validateRoutingProposal(
|
|
490
498
|
proposal,
|
|
491
499
|
evalSet,
|
|
@@ -497,11 +505,16 @@ export async function evolveBody(
|
|
|
497
505
|
},
|
|
498
506
|
);
|
|
499
507
|
} else {
|
|
508
|
+
const bodyReplayOptions: BodyValidationOptions = replayRunner
|
|
509
|
+
? { replay: { replayFixture, replayRunner } }
|
|
510
|
+
: {};
|
|
500
511
|
validation = await _validateBodyProposal(
|
|
501
512
|
proposal,
|
|
502
513
|
evalSet,
|
|
503
514
|
studentAgent,
|
|
504
515
|
validationModelFlag,
|
|
516
|
+
undefined,
|
|
517
|
+
bodyReplayOptions,
|
|
505
518
|
);
|
|
506
519
|
}
|
|
507
520
|
lastValidation = validation;
|
|
@@ -543,6 +556,46 @@ export async function evolveBody(
|
|
|
543
556
|
},
|
|
544
557
|
});
|
|
545
558
|
|
|
559
|
+
// Persist per-entry replay results to SQLite
|
|
560
|
+
try {
|
|
561
|
+
const entryResults: ReplayEntryResultInput[] = [];
|
|
562
|
+
if (validation.before_entry_results) {
|
|
563
|
+
for (const r of validation.before_entry_results) {
|
|
564
|
+
entryResults.push({
|
|
565
|
+
proposal_id: proposal.proposal_id,
|
|
566
|
+
skill_name: skillName,
|
|
567
|
+
validation_mode: validation.validation_mode ?? "llm_judge",
|
|
568
|
+
phase: "before",
|
|
569
|
+
query: r.query,
|
|
570
|
+
should_trigger: r.should_trigger,
|
|
571
|
+
triggered: r.triggered,
|
|
572
|
+
passed: r.passed,
|
|
573
|
+
evidence: r.evidence,
|
|
574
|
+
});
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
if (validation.per_entry_results) {
|
|
578
|
+
for (const r of validation.per_entry_results) {
|
|
579
|
+
entryResults.push({
|
|
580
|
+
proposal_id: proposal.proposal_id,
|
|
581
|
+
skill_name: skillName,
|
|
582
|
+
validation_mode: validation.validation_mode ?? "llm_judge",
|
|
583
|
+
phase: "after",
|
|
584
|
+
query: r.query,
|
|
585
|
+
should_trigger: r.should_trigger,
|
|
586
|
+
triggered: r.triggered,
|
|
587
|
+
passed: r.passed,
|
|
588
|
+
evidence: r.evidence,
|
|
589
|
+
});
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
if (entryResults.length > 0) {
|
|
593
|
+
writeReplayEntryResultsToDb(entryResults);
|
|
594
|
+
}
|
|
595
|
+
} catch {
|
|
596
|
+
// Fail-open: replay entry persistence is non-blocking
|
|
597
|
+
}
|
|
598
|
+
|
|
546
599
|
if (validation.improved) {
|
|
547
600
|
break;
|
|
548
601
|
}
|
|
@@ -3,13 +3,32 @@
|
|
|
3
3
|
*
|
|
4
4
|
* 3-gate validation for full body evolution proposals:
|
|
5
5
|
* Gate 1 (structural): Pure code — YAML frontmatter, # Title, ## Workflow Routing preserved
|
|
6
|
-
* Gate 2 (trigger accuracy):
|
|
6
|
+
* Gate 2 (trigger accuracy): Replay-backed or student model YES/NO per eval entry
|
|
7
7
|
* Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
|
|
8
|
+
*
|
|
9
|
+
* Gate 2 now supports replay-backed validation (via replay engine) in addition
|
|
10
|
+
* to LLM-judge-based checking. When replay options are provided and succeed,
|
|
11
|
+
* the replay path is preferred. Falls back to LLM judge otherwise.
|
|
8
12
|
*/
|
|
9
13
|
|
|
10
|
-
import type {
|
|
14
|
+
import type {
|
|
15
|
+
BodyEvolutionProposal,
|
|
16
|
+
BodyValidationResult,
|
|
17
|
+
EvalEntry,
|
|
18
|
+
ValidationMode,
|
|
19
|
+
} from "../types.js";
|
|
11
20
|
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
12
|
-
import {
|
|
21
|
+
import { runJudgeValidation } from "./engines/judge-engine.js";
|
|
22
|
+
import { runReplayValidation, type ReplayValidationOptions } from "./engines/replay-engine.js";
|
|
23
|
+
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Types
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
/** Optional settings for body validation. */
export interface BodyValidationOptions {
  /**
   * Replay options for Gate 2 trigger accuracy. When set, they are passed to
   * runReplayValidation; the LLM judge is used if that returns null.
   */
  replay?: ReplayValidationOptions;
}
|
|
13
32
|
|
|
14
33
|
// ---------------------------------------------------------------------------
|
|
15
34
|
// Gate 1: Structural validation (pure code, no LLM)
|
|
@@ -57,12 +76,15 @@ export function validateBodyStructure(proposedBody: string): { valid: boolean; r
|
|
|
57
76
|
}
|
|
58
77
|
|
|
59
78
|
// ---------------------------------------------------------------------------
|
|
60
|
-
// Gate 2: Trigger accuracy (student model YES/NO)
|
|
79
|
+
// Gate 2: Trigger accuracy (replay-backed or student model YES/NO)
|
|
61
80
|
// ---------------------------------------------------------------------------
|
|
62
81
|
|
|
63
82
|
/**
|
|
64
83
|
* Run trigger checks on the eval set using the proposed body content.
|
|
65
84
|
* Returns before/after pass rates.
|
|
85
|
+
*
|
|
86
|
+
* When replay options are provided, attempts replay-backed validation first.
|
|
87
|
+
* Falls back to LLM judge when replay is unavailable or no options given.
|
|
66
88
|
*/
|
|
67
89
|
export async function validateBodyTriggerAccuracy(
|
|
68
90
|
originalBody: string,
|
|
@@ -70,54 +92,64 @@ export async function validateBodyTriggerAccuracy(
|
|
|
70
92
|
evalSet: EvalEntry[],
|
|
71
93
|
agent: string,
|
|
72
94
|
modelFlag?: string,
|
|
95
|
+
options?: BodyValidationOptions,
|
|
73
96
|
): Promise<{
|
|
74
97
|
before_pass_rate: number;
|
|
75
98
|
after_pass_rate: number;
|
|
76
99
|
improved: boolean;
|
|
77
100
|
regressions: string[];
|
|
101
|
+
validation_mode: ValidationMode;
|
|
102
|
+
per_entry_results?: import("../types.js").RoutingReplayEntryResult[];
|
|
103
|
+
before_entry_results?: import("../types.js").RoutingReplayEntryResult[];
|
|
78
104
|
}> {
|
|
79
105
|
if (evalSet.length === 0) {
|
|
80
|
-
return {
|
|
106
|
+
return {
|
|
107
|
+
before_pass_rate: 0,
|
|
108
|
+
after_pass_rate: 0,
|
|
109
|
+
improved: false,
|
|
110
|
+
regressions: [],
|
|
111
|
+
validation_mode: "llm_judge",
|
|
112
|
+
};
|
|
81
113
|
}
|
|
82
114
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
if (beforePass) beforePassed++;
|
|
104
|
-
if (afterPass) afterPassed++;
|
|
105
|
-
|
|
106
|
-
// Track regressions
|
|
107
|
-
if (beforePass && !afterPass) {
|
|
108
|
-
regressions.push(entry.query);
|
|
115
|
+
// Try replay-backed validation when options are provided
|
|
116
|
+
if (options?.replay) {
|
|
117
|
+
const replayResult = await runReplayValidation(
|
|
118
|
+
originalBody,
|
|
119
|
+
proposedBody,
|
|
120
|
+
evalSet,
|
|
121
|
+
agent,
|
|
122
|
+
options.replay,
|
|
123
|
+
);
|
|
124
|
+
|
|
125
|
+
if (replayResult) {
|
|
126
|
+
return {
|
|
127
|
+
before_pass_rate: replayResult.before_pass_rate,
|
|
128
|
+
after_pass_rate: replayResult.after_pass_rate,
|
|
129
|
+
improved: replayResult.improved,
|
|
130
|
+
regressions: [],
|
|
131
|
+
validation_mode: replayResult.validation_mode,
|
|
132
|
+
per_entry_results: replayResult.per_entry_results,
|
|
133
|
+
before_entry_results: replayResult.before_entry_results,
|
|
134
|
+
};
|
|
109
135
|
}
|
|
110
136
|
}
|
|
111
137
|
|
|
112
|
-
|
|
113
|
-
const
|
|
114
|
-
|
|
138
|
+
// Fall back to LLM judge
|
|
139
|
+
const judgeResult = await runJudgeValidation(
|
|
140
|
+
originalBody,
|
|
141
|
+
proposedBody,
|
|
142
|
+
evalSet,
|
|
143
|
+
agent,
|
|
144
|
+
modelFlag,
|
|
145
|
+
);
|
|
115
146
|
|
|
116
147
|
return {
|
|
117
|
-
before_pass_rate:
|
|
118
|
-
after_pass_rate:
|
|
119
|
-
improved:
|
|
120
|
-
regressions,
|
|
148
|
+
before_pass_rate: judgeResult.before_pass_rate,
|
|
149
|
+
after_pass_rate: judgeResult.after_pass_rate,
|
|
150
|
+
improved: judgeResult.improved,
|
|
151
|
+
regressions: judgeResult.regressions,
|
|
152
|
+
validation_mode: judgeResult.validation_mode,
|
|
121
153
|
};
|
|
122
154
|
}
|
|
123
155
|
|
|
@@ -190,6 +222,7 @@ export async function validateBodyProposal(
|
|
|
190
222
|
agent: string,
|
|
191
223
|
modelFlag?: string,
|
|
192
224
|
qualityThreshold = QUALITY_THRESHOLD,
|
|
225
|
+
options?: BodyValidationOptions,
|
|
193
226
|
): Promise<BodyValidationResult> {
|
|
194
227
|
const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
|
|
195
228
|
|
|
@@ -214,20 +247,21 @@ export async function validateBodyProposal(
|
|
|
214
247
|
};
|
|
215
248
|
}
|
|
216
249
|
|
|
217
|
-
// Gate 2: Trigger accuracy (student model)
|
|
250
|
+
// Gate 2: Trigger accuracy (replay-backed or student model)
|
|
218
251
|
const accuracy = await validateBodyTriggerAccuracy(
|
|
219
252
|
proposal.original_body,
|
|
220
253
|
proposal.proposed_body,
|
|
221
254
|
evalSet,
|
|
222
255
|
agent,
|
|
223
256
|
modelFlag,
|
|
257
|
+
options,
|
|
224
258
|
);
|
|
225
259
|
gateResults.push({
|
|
226
260
|
gate: "trigger_accuracy",
|
|
227
261
|
passed: accuracy.improved,
|
|
228
262
|
reason: accuracy.improved
|
|
229
|
-
? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
230
|
-
: `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
263
|
+
? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
264
|
+
: `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
231
265
|
});
|
|
232
266
|
|
|
233
267
|
// Gate 3: Quality assessment (student model)
|
|
@@ -252,7 +286,7 @@ export async function validateBodyProposal(
|
|
|
252
286
|
gate_results: gateResults,
|
|
253
287
|
improved: gatesPassed === 3,
|
|
254
288
|
regressions: accuracy.regressions,
|
|
255
|
-
validation_mode:
|
|
289
|
+
validation_mode: accuracy.validation_mode,
|
|
256
290
|
validation_agent: agent,
|
|
257
291
|
...(evalSet.length > 0
|
|
258
292
|
? {
|
|
@@ -260,5 +294,7 @@ export async function validateBodyProposal(
|
|
|
260
294
|
after_pass_rate: accuracy.after_pass_rate,
|
|
261
295
|
}
|
|
262
296
|
: {}),
|
|
297
|
+
per_entry_results: accuracy.per_entry_results,
|
|
298
|
+
before_entry_results: accuracy.before_entry_results,
|
|
263
299
|
};
|
|
264
300
|
}
|