selftune 0.2.22 → 0.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +95 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/adapters/pi/hook.ts +273 -0
- package/cli/selftune/adapters/pi/install.ts +207 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/constants.ts +10 -1
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +87 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
- package/cli/selftune/evolution/evidence.ts +2 -6
- package/cli/selftune/evolution/evolve-body.ts +152 -38
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +111 -49
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +51 -108
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/hooks/skill-eval.ts +2 -1
- package/cli/selftune/hooks-shared/types.ts +1 -0
- package/cli/selftune/index.ts +58 -15
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +727 -0
- package/cli/selftune/init.ts +38 -4
- package/cli/selftune/localdb/direct-write.ts +120 -1
- package/cli/selftune/localdb/materialize.ts +6 -7
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2162
- package/cli/selftune/localdb/schema.ts +59 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +12 -3
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +162 -1142
- package/cli/selftune/registry/client.ts +74 -0
- package/cli/selftune/registry/history.ts +54 -0
- package/cli/selftune/registry/index.ts +90 -0
- package/cli/selftune/registry/install.ts +141 -0
- package/cli/selftune/registry/list.ts +44 -0
- package/cli/selftune/registry/push.ts +171 -0
- package/cli/selftune/registry/rollback.ts +49 -0
- package/cli/selftune/registry/status.ts +62 -0
- package/cli/selftune/registry/sync.ts +125 -0
- package/cli/selftune/repair/skill-usage.ts +9 -3
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +70 -2
- package/cli/selftune/sync.ts +127 -23
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +46 -5
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/jsonl.ts +1 -30
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +24 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
- package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
- package/node_modules/@selftune/telemetry-contract/package.json +1 -1
- package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
- package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
- package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
- package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
- package/packages/telemetry-contract/package.json +1 -1
- package/packages/telemetry-contract/src/index.ts +1 -0
- package/packages/telemetry-contract/src/schemas.ts +63 -5
- package/packages/telemetry-contract/src/types.ts +97 -7
- package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/packages/ui/AGENTS.md +16 -0
- package/packages/ui/README.md +1 -1
- package/packages/ui/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
- package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
- package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
- package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
- package/packages/ui/src/components/InfoTip.tsx +1 -2
- package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
- package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
- package/packages/ui/src/components/OverviewPanels.tsx +693 -0
- package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
- package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
- package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
- package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
- package/packages/ui/src/components/index.ts +56 -1
- package/packages/ui/src/components/section-cards.tsx +18 -35
- package/packages/ui/src/components/skill-health-grid.tsx +47 -37
- package/packages/ui/src/lib/constants.tsx +0 -1
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/packages/ui/src/primitives/checkbox.tsx +1 -1
- package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
- package/packages/ui/src/primitives/select.tsx +2 -2
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +182 -4
- package/skill/SKILL.md +130 -318
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Ingest.md +60 -2
- package/skill/{Workflows → workflows}/Initialize.md +16 -9
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
- package/skill/workflows/Registry.md +99 -0
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/Sync.md +3 -1
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
- package/cli/selftune/utils/html.ts +0 -27
- package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -3,13 +3,37 @@
|
|
|
3
3
|
*
|
|
4
4
|
* 3-gate validation for full body evolution proposals:
|
|
5
5
|
* Gate 1 (structural): Pure code — YAML frontmatter, # Title, ## Workflow Routing preserved
|
|
6
|
-
* Gate 2 (trigger accuracy):
|
|
6
|
+
* Gate 2 (trigger accuracy): Replay-backed or student model YES/NO per eval entry
|
|
7
7
|
* Gate 3 (quality): Student model rates body clarity/completeness 0.0-1.0
|
|
8
|
+
*
|
|
9
|
+
* Gate 2 now supports replay-backed validation (via replay engine) in addition
|
|
10
|
+
* to LLM-judge-based checking. When real host/runtime replay is available and
|
|
11
|
+
* succeeds, the replay path is preferred. Falls back to LLM judge otherwise.
|
|
8
12
|
*/
|
|
9
13
|
|
|
10
|
-
import type {
|
|
14
|
+
import type {
|
|
15
|
+
BodyEvolutionProposal,
|
|
16
|
+
BodyValidationResult,
|
|
17
|
+
EvalEntry,
|
|
18
|
+
RoutingReplayEntryResult,
|
|
19
|
+
ValidationGate,
|
|
20
|
+
ValidationMode,
|
|
21
|
+
} from "../types.js";
|
|
11
22
|
import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
|
|
12
|
-
import {
|
|
23
|
+
import { runJudgeValidation } from "./engines/judge-engine.js";
|
|
24
|
+
import type { ReplayValidationOptions } from "./engines/replay-engine.js";
|
|
25
|
+
import { runValidationContract, type ValidationStrategy } from "./validation-contract.js";
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Types
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
export interface BodyValidationOptions {
|
|
32
|
+
/** Replay options for Gate 2 trigger accuracy. */
|
|
33
|
+
replay?: ReplayValidationOptions;
|
|
34
|
+
mode?: ValidationStrategy;
|
|
35
|
+
onReplayFallback?: (reason?: string) => void;
|
|
36
|
+
}
|
|
13
37
|
|
|
14
38
|
// ---------------------------------------------------------------------------
|
|
15
39
|
// Gate 1: Structural validation (pure code, no LLM)
|
|
@@ -57,12 +81,15 @@ export function validateBodyStructure(proposedBody: string): { valid: boolean; r
|
|
|
57
81
|
}
|
|
58
82
|
|
|
59
83
|
// ---------------------------------------------------------------------------
|
|
60
|
-
// Gate 2: Trigger accuracy (student model YES/NO)
|
|
84
|
+
// Gate 2: Trigger accuracy (replay-backed or student model YES/NO)
|
|
61
85
|
// ---------------------------------------------------------------------------
|
|
62
86
|
|
|
63
87
|
/**
|
|
64
88
|
* Run trigger checks on the eval set using the proposed body content.
|
|
65
89
|
* Returns before/after pass rates.
|
|
90
|
+
*
|
|
91
|
+
* When replay options are provided, attempts host/runtime replay first.
|
|
92
|
+
* Falls back to LLM judge when replay is unavailable or no options given.
|
|
66
93
|
*/
|
|
67
94
|
export async function validateBodyTriggerAccuracy(
|
|
68
95
|
originalBody: string,
|
|
@@ -70,55 +97,84 @@ export async function validateBodyTriggerAccuracy(
|
|
|
70
97
|
evalSet: EvalEntry[],
|
|
71
98
|
agent: string,
|
|
72
99
|
modelFlag?: string,
|
|
100
|
+
options?: BodyValidationOptions,
|
|
73
101
|
): Promise<{
|
|
74
102
|
before_pass_rate: number;
|
|
75
103
|
after_pass_rate: number;
|
|
76
104
|
improved: boolean;
|
|
77
105
|
regressions: string[];
|
|
106
|
+
validation_mode: ValidationMode;
|
|
107
|
+
validation_agent?: string;
|
|
108
|
+
validation_fixture_id?: string;
|
|
109
|
+
validation_fallback_reason?: string;
|
|
110
|
+
per_entry_results?: import("../types.js").RoutingReplayEntryResult[];
|
|
111
|
+
before_entry_results?: import("../types.js").RoutingReplayEntryResult[];
|
|
78
112
|
}> {
|
|
79
113
|
if (evalSet.length === 0) {
|
|
80
|
-
return {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
for (const entry of evalSet) {
|
|
89
|
-
// Check with original body
|
|
90
|
-
const beforePrompt = buildTriggerCheckPrompt(originalBody, entry.query);
|
|
91
|
-
const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
|
|
92
|
-
const beforeTriggered = parseTriggerResponse(beforeRaw);
|
|
93
|
-
const beforePass =
|
|
94
|
-
(entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
|
|
95
|
-
|
|
96
|
-
// Check with proposed body
|
|
97
|
-
const afterPrompt = buildTriggerCheckPrompt(proposedBody, entry.query);
|
|
98
|
-
const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
|
|
99
|
-
const afterTriggered = parseTriggerResponse(afterRaw);
|
|
100
|
-
const afterPass =
|
|
101
|
-
(entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
|
|
102
|
-
|
|
103
|
-
if (beforePass) beforePassed++;
|
|
104
|
-
if (afterPass) afterPassed++;
|
|
105
|
-
|
|
106
|
-
// Track regressions
|
|
107
|
-
if (beforePass && !afterPass) {
|
|
108
|
-
regressions.push(entry.query);
|
|
109
|
-
}
|
|
114
|
+
return {
|
|
115
|
+
before_pass_rate: 0,
|
|
116
|
+
after_pass_rate: 0,
|
|
117
|
+
improved: false,
|
|
118
|
+
regressions: [],
|
|
119
|
+
validation_mode: "llm_judge",
|
|
120
|
+
validation_agent: agent,
|
|
121
|
+
};
|
|
110
122
|
}
|
|
111
123
|
|
|
112
|
-
const
|
|
113
|
-
|
|
114
|
-
|
|
124
|
+
const { result, fallbackReason } = await runValidationContract<{
|
|
125
|
+
before_pass_rate: number;
|
|
126
|
+
after_pass_rate: number;
|
|
127
|
+
improved: boolean;
|
|
128
|
+
regressions: string[];
|
|
129
|
+
validation_mode: ValidationMode;
|
|
130
|
+
validation_agent?: string;
|
|
131
|
+
validation_fixture_id?: string;
|
|
132
|
+
validation_fallback_reason?: string;
|
|
133
|
+
per_entry_results?: RoutingReplayEntryResult[];
|
|
134
|
+
before_entry_results?: RoutingReplayEntryResult[];
|
|
135
|
+
}>({
|
|
136
|
+
mode: options?.mode ?? "auto",
|
|
137
|
+
originalContent: originalBody,
|
|
138
|
+
proposedContent: proposedBody,
|
|
139
|
+
evalSet,
|
|
140
|
+
agent,
|
|
141
|
+
replayOptions: options?.replay,
|
|
142
|
+
runJudge: async () => {
|
|
143
|
+
const judgeResult = await runJudgeValidation(
|
|
144
|
+
originalBody,
|
|
145
|
+
proposedBody,
|
|
146
|
+
evalSet,
|
|
147
|
+
agent,
|
|
148
|
+
modelFlag,
|
|
149
|
+
);
|
|
150
|
+
|
|
151
|
+
return {
|
|
152
|
+
result: {
|
|
153
|
+
before_pass_rate: judgeResult.before_pass_rate,
|
|
154
|
+
after_pass_rate: judgeResult.after_pass_rate,
|
|
155
|
+
improved: judgeResult.improved,
|
|
156
|
+
regressions: judgeResult.regressions,
|
|
157
|
+
validation_mode: judgeResult.validation_mode,
|
|
158
|
+
validation_agent: judgeResult.validation_agent,
|
|
159
|
+
},
|
|
160
|
+
modeUsed: judgeResult.validation_mode,
|
|
161
|
+
};
|
|
162
|
+
},
|
|
163
|
+
onReplayFallback: options?.onReplayFallback,
|
|
164
|
+
adaptReplayResult: (replayResult) => ({
|
|
165
|
+
before_pass_rate: replayResult.before_pass_rate,
|
|
166
|
+
after_pass_rate: replayResult.after_pass_rate,
|
|
167
|
+
improved: replayResult.improved,
|
|
168
|
+
regressions: [],
|
|
169
|
+
validation_mode: replayResult.validation_mode,
|
|
170
|
+
validation_agent: replayResult.validation_agent,
|
|
171
|
+
validation_fixture_id: replayResult.validation_fixture_id,
|
|
172
|
+
per_entry_results: replayResult.per_entry_results,
|
|
173
|
+
before_entry_results: replayResult.before_entry_results,
|
|
174
|
+
}),
|
|
175
|
+
});
|
|
115
176
|
|
|
116
|
-
return {
|
|
117
|
-
before_pass_rate: beforePassRate,
|
|
118
|
-
after_pass_rate: afterPassRate,
|
|
119
|
-
improved: afterPassRate > beforePassRate,
|
|
120
|
-
regressions,
|
|
121
|
-
};
|
|
177
|
+
return fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result;
|
|
122
178
|
}
|
|
123
179
|
|
|
124
180
|
// ---------------------------------------------------------------------------
|
|
@@ -190,8 +246,9 @@ export async function validateBodyProposal(
|
|
|
190
246
|
agent: string,
|
|
191
247
|
modelFlag?: string,
|
|
192
248
|
qualityThreshold = QUALITY_THRESHOLD,
|
|
249
|
+
options?: BodyValidationOptions,
|
|
193
250
|
): Promise<BodyValidationResult> {
|
|
194
|
-
const gateResults: Array<{ gate:
|
|
251
|
+
const gateResults: Array<{ gate: ValidationGate; passed: boolean; reason: string }> = [];
|
|
195
252
|
|
|
196
253
|
// Gate 1: Structural validation (pure code)
|
|
197
254
|
const structural = validateBodyStructure(proposal.proposed_body);
|
|
@@ -214,20 +271,21 @@ export async function validateBodyProposal(
|
|
|
214
271
|
};
|
|
215
272
|
}
|
|
216
273
|
|
|
217
|
-
// Gate 2: Trigger accuracy (student model)
|
|
274
|
+
// Gate 2: Trigger accuracy (replay-backed or student model)
|
|
218
275
|
const accuracy = await validateBodyTriggerAccuracy(
|
|
219
276
|
proposal.original_body,
|
|
220
277
|
proposal.proposed_body,
|
|
221
278
|
evalSet,
|
|
222
279
|
agent,
|
|
223
280
|
modelFlag,
|
|
281
|
+
options,
|
|
224
282
|
);
|
|
225
283
|
gateResults.push({
|
|
226
284
|
gate: "trigger_accuracy",
|
|
227
285
|
passed: accuracy.improved,
|
|
228
286
|
reason: accuracy.improved
|
|
229
|
-
? `Improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
230
|
-
: `Not improved: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
287
|
+
? `Improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`
|
|
288
|
+
: `Not improved via ${accuracy.validation_mode}: ${(accuracy.before_pass_rate * 100).toFixed(1)}% -> ${(accuracy.after_pass_rate * 100).toFixed(1)}%`,
|
|
231
289
|
});
|
|
232
290
|
|
|
233
291
|
// Gate 3: Quality assessment (student model)
|
|
@@ -252,13 +310,17 @@ export async function validateBodyProposal(
|
|
|
252
310
|
gate_results: gateResults,
|
|
253
311
|
improved: gatesPassed === 3,
|
|
254
312
|
regressions: accuracy.regressions,
|
|
255
|
-
validation_mode:
|
|
256
|
-
validation_agent: agent,
|
|
313
|
+
validation_mode: accuracy.validation_mode,
|
|
314
|
+
validation_agent: accuracy.validation_agent ?? agent,
|
|
315
|
+
validation_fallback_reason: accuracy.validation_fallback_reason,
|
|
316
|
+
validation_fixture_id: accuracy.validation_fixture_id,
|
|
257
317
|
...(evalSet.length > 0
|
|
258
318
|
? {
|
|
259
319
|
before_pass_rate: accuracy.before_pass_rate,
|
|
260
320
|
after_pass_rate: accuracy.after_pass_rate,
|
|
261
321
|
}
|
|
262
322
|
: {}),
|
|
323
|
+
per_entry_results: accuracy.per_entry_results,
|
|
324
|
+
before_entry_results: accuracy.before_entry_results,
|
|
263
325
|
};
|
|
264
326
|
}
|