selftune 0.2.22 → 0.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +95 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/adapters/pi/hook.ts +273 -0
- package/cli/selftune/adapters/pi/install.ts +207 -0
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/constants.ts +10 -1
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +87 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
- package/cli/selftune/evolution/evidence.ts +2 -6
- package/cli/selftune/evolution/evolve-body.ts +152 -38
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +111 -49
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +51 -108
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/hooks/skill-eval.ts +2 -1
- package/cli/selftune/hooks-shared/types.ts +1 -0
- package/cli/selftune/index.ts +58 -15
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +727 -0
- package/cli/selftune/init.ts +38 -4
- package/cli/selftune/localdb/direct-write.ts +120 -1
- package/cli/selftune/localdb/materialize.ts +6 -7
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2162
- package/cli/selftune/localdb/schema.ts +59 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +12 -3
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +162 -1142
- package/cli/selftune/registry/client.ts +74 -0
- package/cli/selftune/registry/history.ts +54 -0
- package/cli/selftune/registry/index.ts +90 -0
- package/cli/selftune/registry/install.ts +141 -0
- package/cli/selftune/registry/list.ts +44 -0
- package/cli/selftune/registry/push.ts +171 -0
- package/cli/selftune/registry/rollback.ts +49 -0
- package/cli/selftune/registry/status.ts +62 -0
- package/cli/selftune/registry/sync.ts +125 -0
- package/cli/selftune/repair/skill-usage.ts +9 -3
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +70 -2
- package/cli/selftune/sync.ts +127 -23
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +46 -5
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/jsonl.ts +1 -30
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +24 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
- package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
- package/node_modules/@selftune/telemetry-contract/package.json +1 -1
- package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
- package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
- package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
- package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
- package/packages/telemetry-contract/package.json +1 -1
- package/packages/telemetry-contract/src/index.ts +1 -0
- package/packages/telemetry-contract/src/schemas.ts +63 -5
- package/packages/telemetry-contract/src/types.ts +97 -7
- package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
- package/packages/ui/AGENTS.md +16 -0
- package/packages/ui/README.md +1 -1
- package/packages/ui/package.json +1 -1
- package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
- package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
- package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
- package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
- package/packages/ui/src/components/InfoTip.tsx +1 -2
- package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
- package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
- package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
- package/packages/ui/src/components/OverviewPanels.tsx +693 -0
- package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
- package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
- package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
- package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
- package/packages/ui/src/components/index.ts +56 -1
- package/packages/ui/src/components/section-cards.tsx +18 -35
- package/packages/ui/src/components/skill-health-grid.tsx +47 -37
- package/packages/ui/src/lib/constants.tsx +0 -1
- package/packages/ui/src/primitives/card.tsx +1 -1
- package/packages/ui/src/primitives/checkbox.tsx +1 -1
- package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
- package/packages/ui/src/primitives/select.tsx +2 -2
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +182 -4
- package/skill/SKILL.md +130 -318
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Ingest.md +60 -2
- package/skill/{Workflows → workflows}/Initialize.md +16 -9
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
- package/skill/workflows/Registry.md +99 -0
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/Sync.md +3 -1
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
- package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
- package/cli/selftune/utils/html.ts +0 -27
- package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -12,6 +12,10 @@ import { parseArgs } from "node:util";
|
|
|
12
12
|
import { buildEvalSet } from "../eval/hooks-to-evals.js";
|
|
13
13
|
import { readGradingResultsForSkill } from "../grading/results.js";
|
|
14
14
|
import { getDb } from "../localdb/db.js";
|
|
15
|
+
import {
|
|
16
|
+
type ReplayEntryResultInput,
|
|
17
|
+
writeReplayEntryResultsToDb,
|
|
18
|
+
} from "../localdb/direct-write.js";
|
|
15
19
|
import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
|
|
16
20
|
import type {
|
|
17
21
|
BodyEvolutionProposal,
|
|
@@ -23,12 +27,11 @@ import type {
|
|
|
23
27
|
FailurePattern,
|
|
24
28
|
GradingResult,
|
|
25
29
|
QueryLogRecord,
|
|
26
|
-
RoutingReplayFixture,
|
|
27
30
|
SkillUsageRecord,
|
|
28
31
|
} from "../types.js";
|
|
29
32
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
30
33
|
import type { EffortLevel, SubagentCallOptions } from "../utils/llm-call.js";
|
|
31
|
-
import { callViaSubagent } from "../utils/llm-call.js";
|
|
34
|
+
import { callViaSubagent, detectLlmAgent } from "../utils/llm-call.js";
|
|
32
35
|
import { appendAuditEntry } from "./audit.js";
|
|
33
36
|
import { checkConstitutionSizeOnly } from "./constitutional.js";
|
|
34
37
|
import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
@@ -37,12 +40,11 @@ import { extractFailurePatterns } from "./extract-patterns.js";
|
|
|
37
40
|
import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
|
|
38
41
|
import { generateRoutingProposal } from "./propose-routing.js";
|
|
39
42
|
import { refineBodyProposal } from "./refine-body.js";
|
|
43
|
+
import type { BodyValidationOptions } from "./validate-body.js";
|
|
40
44
|
import { validateBodyProposal } from "./validate-body.js";
|
|
41
|
-
import {
|
|
42
|
-
buildRoutingReplayFixture,
|
|
43
|
-
runClaudeRuntimeReplayFixture,
|
|
44
|
-
} from "./validate-host-replay.js";
|
|
45
|
+
import { buildRuntimeReplayValidationOptions } from "./validate-host-replay.js";
|
|
45
46
|
import { validateRoutingProposal } from "./validate-routing.js";
|
|
47
|
+
import { DEFAULT_VALIDATION_STRATEGY, type ValidationStrategy } from "./validation-contract.js";
|
|
46
48
|
|
|
47
49
|
// ---------------------------------------------------------------------------
|
|
48
50
|
// Types
|
|
@@ -64,6 +66,7 @@ export interface EvolveBodyOptions {
|
|
|
64
66
|
fewShotExamples?: string[];
|
|
65
67
|
gradingResults?: GradingResult[];
|
|
66
68
|
validationModel?: string;
|
|
69
|
+
validationMode?: ValidationStrategy;
|
|
67
70
|
teacherEffort?: EffortLevel;
|
|
68
71
|
/** Run evolution-reviewer subagent as Gate 4 before deployment. */
|
|
69
72
|
useReviewer?: boolean;
|
|
@@ -171,6 +174,7 @@ export async function evolveBody(
|
|
|
171
174
|
const teacherModel = options.teacherModel ?? DEFAULT_TEACHER_MODEL;
|
|
172
175
|
const studentModel = options.studentModel ?? DEFAULT_STUDENT_MODEL;
|
|
173
176
|
const teacherEffort = options.teacherEffort ?? DEFAULT_TEACHER_EFFORT;
|
|
177
|
+
const effectiveValidationMode = options.validationMode ?? DEFAULT_VALIDATION_STRATEGY;
|
|
174
178
|
|
|
175
179
|
// Resolve injectable dependencies
|
|
176
180
|
const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
|
|
@@ -463,54 +467,92 @@ export async function evolveBody(
|
|
|
463
467
|
// Validate (validationModel overrides studentModel for validation calls)
|
|
464
468
|
const validationModelFlag = options.validationModel ?? studentModel;
|
|
465
469
|
let validation: BodyValidationResult;
|
|
470
|
+
let replayFallbackReason: string | undefined;
|
|
471
|
+
|
|
472
|
+
// Build replay fixture + runner for targets that can use runtime replay.
|
|
473
|
+
const replayOptions = buildRuntimeReplayValidationOptions({
|
|
474
|
+
skillName,
|
|
475
|
+
skillPath,
|
|
476
|
+
agent: studentAgent,
|
|
477
|
+
contentTarget: target === "body" ? "body" : "routing",
|
|
478
|
+
});
|
|
479
|
+
const replayFixture = replayOptions?.replayFixture;
|
|
480
|
+
const replayRunner = replayOptions?.replayRunner;
|
|
481
|
+
|
|
466
482
|
if (target === "routing") {
|
|
467
|
-
const replayFixture = buildRoutingReplayFixture({
|
|
468
|
-
skillName,
|
|
469
|
-
skillPath,
|
|
470
|
-
platform: studentAgent === "codex" ? "codex" : "claude_code",
|
|
471
|
-
});
|
|
472
|
-
const replayRunner =
|
|
473
|
-
replayFixture.platform === "claude_code" && studentAgent === "claude"
|
|
474
|
-
? async ({
|
|
475
|
-
routing,
|
|
476
|
-
evalSet,
|
|
477
|
-
fixture,
|
|
478
|
-
}: {
|
|
479
|
-
routing: string;
|
|
480
|
-
evalSet: EvalEntry[];
|
|
481
|
-
fixture: RoutingReplayFixture;
|
|
482
|
-
}) =>
|
|
483
|
-
await runClaudeRuntimeReplayFixture({
|
|
484
|
-
routing,
|
|
485
|
-
evalSet,
|
|
486
|
-
fixture,
|
|
487
|
-
})
|
|
488
|
-
: undefined;
|
|
489
483
|
validation = await _validateRoutingProposal(
|
|
490
484
|
proposal,
|
|
491
485
|
evalSet,
|
|
492
486
|
studentAgent,
|
|
493
487
|
validationModelFlag,
|
|
494
488
|
{
|
|
495
|
-
replayFixture,
|
|
489
|
+
...(replayFixture ? { replayFixture } : {}),
|
|
496
490
|
...(replayRunner ? { replayRunner } : {}),
|
|
491
|
+
mode: effectiveValidationMode,
|
|
492
|
+
onReplayFallback: (reason) => {
|
|
493
|
+
replayFallbackReason = reason;
|
|
494
|
+
if (reason) {
|
|
495
|
+
console.error(
|
|
496
|
+
`[evolve-body] Replay not available (${reason}), falling back to LLM judge validation.`,
|
|
497
|
+
);
|
|
498
|
+
return;
|
|
499
|
+
}
|
|
500
|
+
console.error(
|
|
501
|
+
"[evolve-body] Replay not available, falling back to LLM judge validation.",
|
|
502
|
+
);
|
|
503
|
+
},
|
|
497
504
|
},
|
|
498
505
|
);
|
|
499
506
|
} else {
|
|
507
|
+
const bodyReplayOptions: BodyValidationOptions = {
|
|
508
|
+
...(replayFixture
|
|
509
|
+
? {
|
|
510
|
+
replay: {
|
|
511
|
+
replayFixture,
|
|
512
|
+
...(replayRunner ? { replayRunner } : {}),
|
|
513
|
+
},
|
|
514
|
+
}
|
|
515
|
+
: {}),
|
|
516
|
+
mode: effectiveValidationMode,
|
|
517
|
+
onReplayFallback: (reason) => {
|
|
518
|
+
replayFallbackReason = reason;
|
|
519
|
+
if (reason) {
|
|
520
|
+
console.error(
|
|
521
|
+
`[evolve-body] Replay not available (${reason}), falling back to LLM judge validation.`,
|
|
522
|
+
);
|
|
523
|
+
return;
|
|
524
|
+
}
|
|
525
|
+
console.error(
|
|
526
|
+
"[evolve-body] Replay not available, falling back to LLM judge validation.",
|
|
527
|
+
);
|
|
528
|
+
},
|
|
529
|
+
};
|
|
500
530
|
validation = await _validateBodyProposal(
|
|
501
531
|
proposal,
|
|
502
532
|
evalSet,
|
|
503
533
|
studentAgent,
|
|
504
534
|
validationModelFlag,
|
|
535
|
+
undefined,
|
|
536
|
+
bodyReplayOptions,
|
|
505
537
|
);
|
|
506
538
|
}
|
|
539
|
+
if (replayFallbackReason && !validation.validation_fallback_reason) {
|
|
540
|
+
validation = {
|
|
541
|
+
...validation,
|
|
542
|
+
validation_fallback_reason: replayFallbackReason,
|
|
543
|
+
};
|
|
544
|
+
}
|
|
507
545
|
lastValidation = validation;
|
|
508
546
|
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
509
547
|
|
|
510
548
|
recordAudit(
|
|
511
549
|
proposal.proposal_id,
|
|
512
550
|
"validated",
|
|
513
|
-
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
|
|
551
|
+
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed${
|
|
552
|
+
validation.validation_fallback_reason
|
|
553
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
554
|
+
: ""
|
|
555
|
+
}`,
|
|
514
556
|
{
|
|
515
557
|
validation_mode: validation.validation_mode,
|
|
516
558
|
validation_agent: validation.validation_agent,
|
|
@@ -527,7 +569,11 @@ export async function evolveBody(
|
|
|
527
569
|
stage: "validated",
|
|
528
570
|
rationale: proposal.rationale,
|
|
529
571
|
confidence: proposal.confidence,
|
|
530
|
-
details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
|
|
572
|
+
details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed${
|
|
573
|
+
validation.validation_fallback_reason
|
|
574
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
575
|
+
: ""
|
|
576
|
+
}`,
|
|
531
577
|
validation: {
|
|
532
578
|
improved: validation.improved,
|
|
533
579
|
gates_passed: validation.gates_passed,
|
|
@@ -539,10 +585,51 @@ export async function evolveBody(
|
|
|
539
585
|
validation_mode: validation.validation_mode,
|
|
540
586
|
validation_agent: validation.validation_agent,
|
|
541
587
|
validation_fixture_id: validation.validation_fixture_id,
|
|
588
|
+
validation_fallback_reason: validation.validation_fallback_reason,
|
|
542
589
|
validation_evidence_ref: validatedEvidenceRef,
|
|
543
590
|
},
|
|
544
591
|
});
|
|
545
592
|
|
|
593
|
+
// Persist per-entry replay results to SQLite
|
|
594
|
+
try {
|
|
595
|
+
const entryResults: ReplayEntryResultInput[] = [];
|
|
596
|
+
if (validation.before_entry_results) {
|
|
597
|
+
for (const r of validation.before_entry_results) {
|
|
598
|
+
entryResults.push({
|
|
599
|
+
proposal_id: proposal.proposal_id,
|
|
600
|
+
skill_name: skillName,
|
|
601
|
+
validation_mode: validation.validation_mode ?? "llm_judge",
|
|
602
|
+
phase: "before",
|
|
603
|
+
query: r.query,
|
|
604
|
+
should_trigger: r.should_trigger,
|
|
605
|
+
triggered: r.triggered,
|
|
606
|
+
passed: r.passed,
|
|
607
|
+
evidence: r.evidence,
|
|
608
|
+
});
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
if (validation.per_entry_results) {
|
|
612
|
+
for (const r of validation.per_entry_results) {
|
|
613
|
+
entryResults.push({
|
|
614
|
+
proposal_id: proposal.proposal_id,
|
|
615
|
+
skill_name: skillName,
|
|
616
|
+
validation_mode: validation.validation_mode ?? "llm_judge",
|
|
617
|
+
phase: "after",
|
|
618
|
+
query: r.query,
|
|
619
|
+
should_trigger: r.should_trigger,
|
|
620
|
+
triggered: r.triggered,
|
|
621
|
+
passed: r.passed,
|
|
622
|
+
evidence: r.evidence,
|
|
623
|
+
});
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
if (entryResults.length > 0) {
|
|
627
|
+
writeReplayEntryResultsToDb(entryResults);
|
|
628
|
+
}
|
|
629
|
+
} catch {
|
|
630
|
+
// Fail-open: replay entry persistence is non-blocking
|
|
631
|
+
}
|
|
632
|
+
|
|
546
633
|
if (validation.improved) {
|
|
547
634
|
break;
|
|
548
635
|
}
|
|
@@ -550,7 +637,11 @@ export async function evolveBody(
|
|
|
550
637
|
recordAudit(
|
|
551
638
|
proposal.proposal_id,
|
|
552
639
|
"rejected",
|
|
553
|
-
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
|
|
640
|
+
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates${
|
|
641
|
+
validation.validation_fallback_reason
|
|
642
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
643
|
+
: ""
|
|
644
|
+
}`,
|
|
554
645
|
{
|
|
555
646
|
validation_mode: validation.validation_mode,
|
|
556
647
|
validation_agent: validation.validation_agent,
|
|
@@ -567,7 +658,11 @@ export async function evolveBody(
|
|
|
567
658
|
stage: "rejected",
|
|
568
659
|
rationale: proposal.rationale,
|
|
569
660
|
confidence: proposal.confidence,
|
|
570
|
-
details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
|
|
661
|
+
details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates${
|
|
662
|
+
validation.validation_fallback_reason
|
|
663
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
664
|
+
: ""
|
|
665
|
+
}`,
|
|
571
666
|
validation: {
|
|
572
667
|
improved: validation.improved,
|
|
573
668
|
gates_passed: validation.gates_passed,
|
|
@@ -579,6 +674,7 @@ export async function evolveBody(
|
|
|
579
674
|
validation_mode: validation.validation_mode,
|
|
580
675
|
validation_agent: validation.validation_agent,
|
|
581
676
|
validation_fixture_id: validation.validation_fixture_id,
|
|
677
|
+
validation_fallback_reason: validation.validation_fallback_reason,
|
|
582
678
|
validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
|
|
583
679
|
},
|
|
584
680
|
});
|
|
@@ -678,7 +774,11 @@ export async function evolveBody(
|
|
|
678
774
|
recordAudit(
|
|
679
775
|
lastProposal.proposal_id,
|
|
680
776
|
"deployed",
|
|
681
|
-
`Deployed ${target} proposal for ${skillName}`,
|
|
777
|
+
`Deployed ${target} proposal for ${skillName}${
|
|
778
|
+
lastValidation.validation_fallback_reason
|
|
779
|
+
? ` (replay fallback: ${lastValidation.validation_fallback_reason})`
|
|
780
|
+
: ""
|
|
781
|
+
}`,
|
|
682
782
|
{
|
|
683
783
|
validation_mode: lastValidation.validation_mode,
|
|
684
784
|
validation_agent: lastValidation.validation_agent,
|
|
@@ -707,6 +807,7 @@ export async function evolveBody(
|
|
|
707
807
|
validation_mode: lastValidation.validation_mode,
|
|
708
808
|
validation_agent: lastValidation.validation_agent,
|
|
709
809
|
validation_fixture_id: lastValidation.validation_fixture_id,
|
|
810
|
+
validation_fallback_reason: lastValidation.validation_fallback_reason,
|
|
710
811
|
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
711
812
|
},
|
|
712
813
|
});
|
|
@@ -760,6 +861,7 @@ export async function cliMain(): Promise<void> {
|
|
|
760
861
|
"task-description": { type: "string" },
|
|
761
862
|
"few-shot": { type: "string" },
|
|
762
863
|
"validation-model": { type: "string" },
|
|
864
|
+
"validation-mode": { type: "string", default: DEFAULT_VALIDATION_STRATEGY },
|
|
763
865
|
"teacher-effort": { type: "string", default: "high" },
|
|
764
866
|
review: { type: "boolean", default: false },
|
|
765
867
|
help: { type: "boolean", default: false },
|
|
@@ -788,6 +890,7 @@ Options:
|
|
|
788
890
|
--task-description Optional task description context
|
|
789
891
|
--few-shot Comma-separated paths to example skill files
|
|
790
892
|
--validation-model Model for trigger-check validation calls (overrides --student-model for validation)
|
|
893
|
+
--validation-mode Validation strategy: auto, replay, or judge (default: auto)
|
|
791
894
|
--teacher-effort Effort level for teacher LLM: low, medium, high, max (default: high)
|
|
792
895
|
--review Run evolution-reviewer subagent before deployment (Gate 4)
|
|
793
896
|
--help Show this help message`);
|
|
@@ -802,15 +905,24 @@ Options:
|
|
|
802
905
|
);
|
|
803
906
|
}
|
|
804
907
|
|
|
805
|
-
|
|
806
|
-
|
|
908
|
+
if (
|
|
909
|
+
values["validation-mode"] &&
|
|
910
|
+
!["auto", "replay", "judge"].includes(values["validation-mode"])
|
|
911
|
+
) {
|
|
912
|
+
throw new CLIError(
|
|
913
|
+
`Invalid --validation-mode value: ${values["validation-mode"]}`,
|
|
914
|
+
"INVALID_FLAG",
|
|
915
|
+
"Use one of: auto, replay, judge",
|
|
916
|
+
);
|
|
917
|
+
}
|
|
918
|
+
const teacherAgent = values["teacher-agent"] ?? detectLlmAgent() ?? "";
|
|
807
919
|
const studentAgent = values["student-agent"] ?? teacherAgent;
|
|
808
920
|
|
|
809
921
|
if (!teacherAgent) {
|
|
810
922
|
throw new CLIError(
|
|
811
|
-
"No agent CLI found. Install Claude Code, Codex, or
|
|
923
|
+
"No agent CLI found. Install Claude Code, Codex, OpenCode, or Pi.",
|
|
812
924
|
"AGENT_NOT_FOUND",
|
|
813
|
-
"Install Claude Code, Codex, or
|
|
925
|
+
"Install Claude Code, Codex, OpenCode, or Pi.",
|
|
814
926
|
);
|
|
815
927
|
}
|
|
816
928
|
|
|
@@ -848,6 +960,8 @@ Options:
|
|
|
848
960
|
fewShotExamples,
|
|
849
961
|
gradingResults,
|
|
850
962
|
validationModel: values["validation-model"],
|
|
963
|
+
validationMode:
|
|
964
|
+
(values["validation-mode"] as ValidationStrategy | undefined) ?? DEFAULT_VALIDATION_STRATEGY,
|
|
851
965
|
teacherEffort: (values["teacher-effort"] as EffortLevel) ?? "high",
|
|
852
966
|
useReviewer: values.review ?? false,
|
|
853
967
|
});
|