selftune 0.2.23 → 0.2.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +93 -15
- package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
- package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
- package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
- package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
- package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
- package/apps/local-dashboard/dist/index.html +5 -5
- package/cli/selftune/adapters/codex/install.ts +310 -78
- package/cli/selftune/adapters/opencode/install.ts +3 -4
- package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
- package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
- package/cli/selftune/auto-update.ts +200 -8
- package/cli/selftune/canonical-export.ts +55 -25
- package/cli/selftune/command-surface.ts +397 -0
- package/cli/selftune/contribute/contribute.ts +64 -13
- package/cli/selftune/contribution-config.ts +57 -3
- package/cli/selftune/contribution-preferences.ts +117 -0
- package/cli/selftune/contribution-signals.ts +8 -4
- package/cli/selftune/contribution-staging.ts +13 -2
- package/cli/selftune/contributions.ts +55 -121
- package/cli/selftune/creator-contributions.ts +29 -10
- package/cli/selftune/cron/setup.ts +7 -3
- package/cli/selftune/dashboard-contract.ts +73 -0
- package/cli/selftune/dashboard-server.ts +168 -17
- package/cli/selftune/dashboard.ts +350 -17
- package/cli/selftune/eval/baseline.ts +21 -5
- package/cli/selftune/eval/execution-eval.ts +170 -0
- package/cli/selftune/eval/family-overlap.ts +2 -2
- package/cli/selftune/eval/hooks-to-evals.ts +228 -82
- package/cli/selftune/eval/import-skillsbench.ts +2 -2
- package/cli/selftune/eval/invocation-classifier.ts +56 -0
- package/cli/selftune/eval/synthetic-evals.ts +5 -3
- package/cli/selftune/eval/unit-test-cli.ts +7 -4
- package/cli/selftune/evolution/apply-proposal.ts +295 -0
- package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
- package/cli/selftune/evolution/evolve-body.ts +100 -39
- package/cli/selftune/evolution/evolve.ts +244 -52
- package/cli/selftune/evolution/rollback.ts +0 -1
- package/cli/selftune/evolution/validate-body.ts +68 -42
- package/cli/selftune/evolution/validate-host-replay.ts +510 -60
- package/cli/selftune/evolution/validate-proposal.ts +11 -150
- package/cli/selftune/evolution/validate-routing.ts +43 -41
- package/cli/selftune/evolution/validation-contract.ts +91 -0
- package/cli/selftune/grading/auto-grade.ts +11 -7
- package/cli/selftune/grading/grade-session.ts +10 -16
- package/cli/selftune/index.ts +35 -10
- package/cli/selftune/ingestors/claude-replay.ts +15 -10
- package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
- package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
- package/cli/selftune/ingestors/pi-ingest.ts +3 -2
- package/cli/selftune/init.ts +27 -3
- package/cli/selftune/localdb/direct-write.ts +35 -1
- package/cli/selftune/localdb/queries/cron.ts +34 -0
- package/cli/selftune/localdb/queries/dashboard.ts +834 -0
- package/cli/selftune/localdb/queries/evolution.ts +158 -0
- package/cli/selftune/localdb/queries/execution.ts +133 -0
- package/cli/selftune/localdb/queries/json.ts +18 -0
- package/cli/selftune/localdb/queries/monitoring.ts +263 -0
- package/cli/selftune/localdb/queries/raw.ts +95 -0
- package/cli/selftune/localdb/queries/staging.ts +270 -0
- package/cli/selftune/localdb/queries/trust.ts +392 -0
- package/cli/selftune/localdb/queries.ts +60 -2288
- package/cli/selftune/localdb/schema.ts +21 -0
- package/cli/selftune/monitoring/watch.ts +96 -29
- package/cli/selftune/normalization.ts +3 -0
- package/cli/selftune/observability.ts +4 -2
- package/cli/selftune/orchestrate/cli.ts +161 -0
- package/cli/selftune/orchestrate/execute.ts +295 -0
- package/cli/selftune/orchestrate/finalize.ts +157 -0
- package/cli/selftune/orchestrate/locks.ts +40 -0
- package/cli/selftune/orchestrate/plan.ts +131 -0
- package/cli/selftune/orchestrate/post-run.ts +59 -0
- package/cli/selftune/orchestrate/prepare.ts +334 -0
- package/cli/selftune/orchestrate/report.ts +182 -0
- package/cli/selftune/orchestrate/runtime.ts +120 -0
- package/cli/selftune/orchestrate/signals.ts +48 -0
- package/cli/selftune/orchestrate.ts +150 -1173
- package/cli/selftune/repair/skill-usage.ts +5 -2
- package/cli/selftune/routes/overview.ts +5 -2
- package/cli/selftune/routes/skill-report.ts +15 -2
- package/cli/selftune/schedule.ts +5 -5
- package/cli/selftune/status.ts +39 -2
- package/cli/selftune/testing-readiness.ts +597 -0
- package/cli/selftune/types.ts +44 -4
- package/cli/selftune/uninstall.ts +2 -1
- package/cli/selftune/utils/canonical-log.ts +1 -9
- package/cli/selftune/utils/cli-error.ts +9 -0
- package/cli/selftune/utils/llm-call.ts +126 -6
- package/cli/selftune/utils/skill-discovery.ts +2 -0
- package/cli/selftune/workflows/proposals.ts +184 -0
- package/cli/selftune/workflows/skill-scaffold.ts +241 -0
- package/cli/selftune/workflows/workflows.ts +100 -26
- package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
- package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
- package/package.json +25 -9
- package/packages/dashboard-core/AGENTS.md +18 -0
- package/packages/dashboard-core/README.md +30 -0
- package/packages/dashboard-core/index.ts +3 -0
- package/packages/dashboard-core/package.json +39 -0
- package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
- package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
- package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
- package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
- package/packages/dashboard-core/src/chrome/index.ts +14 -0
- package/packages/dashboard-core/src/chrome/types.ts +81 -0
- package/packages/dashboard-core/src/chrome/utils.ts +23 -0
- package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
- package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
- package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
- package/packages/dashboard-core/src/gates/index.ts +3 -0
- package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
- package/packages/dashboard-core/src/host/adapter.ts +47 -0
- package/packages/dashboard-core/src/host/capabilities.ts +55 -0
- package/packages/dashboard-core/src/host/index.ts +3 -0
- package/packages/dashboard-core/src/models/analytics.ts +39 -0
- package/packages/dashboard-core/src/models/index.ts +4 -0
- package/packages/dashboard-core/src/models/overview.ts +98 -0
- package/packages/dashboard-core/src/models/runtime.ts +7 -0
- package/packages/dashboard-core/src/models/skills.ts +34 -0
- package/packages/dashboard-core/src/routes/index.ts +2 -0
- package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
- package/packages/dashboard-core/src/routes/manifest.ts +451 -0
- package/packages/dashboard-core/src/routes/types.ts +39 -0
- package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
- package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
- package/packages/dashboard-core/src/screens/index.ts +37 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
- package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
- package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
- package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
- package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
- package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
- package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
- package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
- package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
- package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
- package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
- package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
- package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
- package/packages/telemetry-contract/src/schemas.ts +41 -1
- package/packages/telemetry-contract/src/types.ts +103 -2
- package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
- package/packages/ui/src/components/OverviewPanels.tsx +67 -26
- package/packages/ui/src/primitives/tabs.tsx +7 -6
- package/packages/ui/src/types.ts +10 -0
- package/skill/SKILL.md +130 -332
- package/skill/agents/diagnosis-analyst.md +3 -3
- package/skill/agents/evolution-reviewer.md +3 -3
- package/skill/agents/integration-guide.md +3 -3
- package/skill/agents/pattern-analyst.md +2 -2
- package/skill/references/cli-quick-reference.md +89 -0
- package/skill/references/creator-playbook.md +131 -0
- package/skill/references/examples.md +48 -0
- package/skill/references/troubleshooting.md +47 -0
- package/skill/references/version-history.md +1 -1
- package/skill/selftune.contribute.json +11 -0
- package/skill/{Workflows → workflows}/Baseline.md +20 -1
- package/skill/{Workflows → workflows}/Contribute.md +23 -10
- package/skill/{Workflows → workflows}/Contributions.md +13 -5
- package/skill/workflows/CreateTestDeploy.md +170 -0
- package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
- package/skill/{Workflows → workflows}/Cron.md +1 -1
- package/skill/{Workflows → workflows}/Dashboard.md +20 -0
- package/skill/{Workflows → workflows}/Doctor.md +1 -1
- package/skill/{Workflows → workflows}/Evals.md +67 -2
- package/skill/{Workflows → workflows}/Evolve.md +119 -30
- package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
- package/skill/{Workflows → workflows}/Grade.md +1 -1
- package/skill/{Workflows → workflows}/Initialize.md +8 -4
- package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
- package/skill/{Workflows → workflows}/Schedule.md +3 -3
- package/skill/workflows/SignalsDashboard.md +87 -0
- package/skill/{Workflows → workflows}/UnitTest.md +19 -0
- package/skill/{Workflows → workflows}/Watch.md +42 -2
- package/skill/{Workflows → workflows}/Workflows.md +39 -2
- package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
- package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
- package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
- package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
- /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
- /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
- /package/skill/{Workflows → workflows}/Badge.md +0 -0
- /package/skill/{Workflows → workflows}/Composability.md +0 -0
- /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
- /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
- /package/skill/{Workflows → workflows}/Hook.md +0 -0
- /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
- /package/skill/{Workflows → workflows}/Ingest.md +0 -0
- /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
- /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
- /package/skill/{Workflows → workflows}/Recover.md +0 -0
- /package/skill/{Workflows → workflows}/Registry.md +0 -0
- /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
- /package/skill/{Workflows → workflows}/Replay.md +0 -0
- /package/skill/{Workflows → workflows}/Rollback.md +0 -0
- /package/skill/{Workflows → workflows}/Sync.md +0 -0
- /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
- /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
|
@@ -27,12 +27,11 @@ import type {
|
|
|
27
27
|
FailurePattern,
|
|
28
28
|
GradingResult,
|
|
29
29
|
QueryLogRecord,
|
|
30
|
-
RoutingReplayFixture,
|
|
31
30
|
SkillUsageRecord,
|
|
32
31
|
} from "../types.js";
|
|
33
32
|
import { CLIError, handleCLIError } from "../utils/cli-error.js";
|
|
34
33
|
import type { EffortLevel, SubagentCallOptions } from "../utils/llm-call.js";
|
|
35
|
-
import { callViaSubagent } from "../utils/llm-call.js";
|
|
34
|
+
import { callViaSubagent, detectLlmAgent } from "../utils/llm-call.js";
|
|
36
35
|
import { appendAuditEntry } from "./audit.js";
|
|
37
36
|
import { checkConstitutionSizeOnly } from "./constitutional.js";
|
|
38
37
|
import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
|
|
@@ -43,11 +42,9 @@ import { generateRoutingProposal } from "./propose-routing.js";
|
|
|
43
42
|
import { refineBodyProposal } from "./refine-body.js";
|
|
44
43
|
import type { BodyValidationOptions } from "./validate-body.js";
|
|
45
44
|
import { validateBodyProposal } from "./validate-body.js";
|
|
46
|
-
import {
|
|
47
|
-
buildRoutingReplayFixture,
|
|
48
|
-
runClaudeRuntimeReplayFixture,
|
|
49
|
-
} from "./validate-host-replay.js";
|
|
45
|
+
import { buildRuntimeReplayValidationOptions } from "./validate-host-replay.js";
|
|
50
46
|
import { validateRoutingProposal } from "./validate-routing.js";
|
|
47
|
+
import { DEFAULT_VALIDATION_STRATEGY, type ValidationStrategy } from "./validation-contract.js";
|
|
51
48
|
|
|
52
49
|
// ---------------------------------------------------------------------------
|
|
53
50
|
// Types
|
|
@@ -69,6 +66,7 @@ export interface EvolveBodyOptions {
|
|
|
69
66
|
fewShotExamples?: string[];
|
|
70
67
|
gradingResults?: GradingResult[];
|
|
71
68
|
validationModel?: string;
|
|
69
|
+
validationMode?: ValidationStrategy;
|
|
72
70
|
teacherEffort?: EffortLevel;
|
|
73
71
|
/** Run evolution-reviewer subagent as Gate 4 before deployment. */
|
|
74
72
|
useReviewer?: boolean;
|
|
@@ -176,6 +174,7 @@ export async function evolveBody(
|
|
|
176
174
|
const teacherModel = options.teacherModel ?? DEFAULT_TEACHER_MODEL;
|
|
177
175
|
const studentModel = options.studentModel ?? DEFAULT_STUDENT_MODEL;
|
|
178
176
|
const teacherEffort = options.teacherEffort ?? DEFAULT_TEACHER_EFFORT;
|
|
177
|
+
const effectiveValidationMode = options.validationMode ?? DEFAULT_VALIDATION_STRATEGY;
|
|
179
178
|
|
|
180
179
|
// Resolve injectable dependencies
|
|
181
180
|
const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
|
|
@@ -468,30 +467,17 @@ export async function evolveBody(
|
|
|
468
467
|
// Validate (validationModel overrides studentModel for validation calls)
|
|
469
468
|
const validationModelFlag = options.validationModel ?? studentModel;
|
|
470
469
|
let validation: BodyValidationResult;
|
|
470
|
+
let replayFallbackReason: string | undefined;
|
|
471
471
|
|
|
472
|
-
// Build replay fixture + runner for
|
|
473
|
-
const
|
|
472
|
+
// Build replay fixture + runner for targets that can use runtime replay.
|
|
473
|
+
const replayOptions = buildRuntimeReplayValidationOptions({
|
|
474
474
|
skillName,
|
|
475
475
|
skillPath,
|
|
476
|
-
|
|
476
|
+
agent: studentAgent,
|
|
477
|
+
contentTarget: target === "body" ? "body" : "routing",
|
|
477
478
|
});
|
|
478
|
-
const
|
|
479
|
-
|
|
480
|
-
? async ({
|
|
481
|
-
routing,
|
|
482
|
-
evalSet,
|
|
483
|
-
fixture,
|
|
484
|
-
}: {
|
|
485
|
-
routing: string;
|
|
486
|
-
evalSet: EvalEntry[];
|
|
487
|
-
fixture: RoutingReplayFixture;
|
|
488
|
-
}) =>
|
|
489
|
-
await runClaudeRuntimeReplayFixture({
|
|
490
|
-
routing,
|
|
491
|
-
evalSet,
|
|
492
|
-
fixture,
|
|
493
|
-
})
|
|
494
|
-
: undefined;
|
|
479
|
+
const replayFixture = replayOptions?.replayFixture;
|
|
480
|
+
const replayRunner = replayOptions?.replayRunner;
|
|
495
481
|
|
|
496
482
|
if (target === "routing") {
|
|
497
483
|
validation = await _validateRoutingProposal(
|
|
@@ -500,14 +486,47 @@ export async function evolveBody(
|
|
|
500
486
|
studentAgent,
|
|
501
487
|
validationModelFlag,
|
|
502
488
|
{
|
|
503
|
-
replayFixture,
|
|
489
|
+
...(replayFixture ? { replayFixture } : {}),
|
|
504
490
|
...(replayRunner ? { replayRunner } : {}),
|
|
491
|
+
mode: effectiveValidationMode,
|
|
492
|
+
onReplayFallback: (reason) => {
|
|
493
|
+
replayFallbackReason = reason;
|
|
494
|
+
if (reason) {
|
|
495
|
+
console.error(
|
|
496
|
+
`[evolve-body] Replay not available (${reason}), falling back to LLM judge validation.`,
|
|
497
|
+
);
|
|
498
|
+
return;
|
|
499
|
+
}
|
|
500
|
+
console.error(
|
|
501
|
+
"[evolve-body] Replay not available, falling back to LLM judge validation.",
|
|
502
|
+
);
|
|
503
|
+
},
|
|
505
504
|
},
|
|
506
505
|
);
|
|
507
506
|
} else {
|
|
508
|
-
const bodyReplayOptions: BodyValidationOptions =
|
|
509
|
-
|
|
510
|
-
|
|
507
|
+
const bodyReplayOptions: BodyValidationOptions = {
|
|
508
|
+
...(replayFixture
|
|
509
|
+
? {
|
|
510
|
+
replay: {
|
|
511
|
+
replayFixture,
|
|
512
|
+
...(replayRunner ? { replayRunner } : {}),
|
|
513
|
+
},
|
|
514
|
+
}
|
|
515
|
+
: {}),
|
|
516
|
+
mode: effectiveValidationMode,
|
|
517
|
+
onReplayFallback: (reason) => {
|
|
518
|
+
replayFallbackReason = reason;
|
|
519
|
+
if (reason) {
|
|
520
|
+
console.error(
|
|
521
|
+
`[evolve-body] Replay not available (${reason}), falling back to LLM judge validation.`,
|
|
522
|
+
);
|
|
523
|
+
return;
|
|
524
|
+
}
|
|
525
|
+
console.error(
|
|
526
|
+
"[evolve-body] Replay not available, falling back to LLM judge validation.",
|
|
527
|
+
);
|
|
528
|
+
},
|
|
529
|
+
};
|
|
511
530
|
validation = await _validateBodyProposal(
|
|
512
531
|
proposal,
|
|
513
532
|
evalSet,
|
|
@@ -517,13 +536,23 @@ export async function evolveBody(
|
|
|
517
536
|
bodyReplayOptions,
|
|
518
537
|
);
|
|
519
538
|
}
|
|
539
|
+
if (replayFallbackReason && !validation.validation_fallback_reason) {
|
|
540
|
+
validation = {
|
|
541
|
+
...validation,
|
|
542
|
+
validation_fallback_reason: replayFallbackReason,
|
|
543
|
+
};
|
|
544
|
+
}
|
|
520
545
|
lastValidation = validation;
|
|
521
546
|
const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
|
|
522
547
|
|
|
523
548
|
recordAudit(
|
|
524
549
|
proposal.proposal_id,
|
|
525
550
|
"validated",
|
|
526
|
-
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed
|
|
551
|
+
`Validation: ${validation.gates_passed}/${validation.gates_total} gates passed${
|
|
552
|
+
validation.validation_fallback_reason
|
|
553
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
554
|
+
: ""
|
|
555
|
+
}`,
|
|
527
556
|
{
|
|
528
557
|
validation_mode: validation.validation_mode,
|
|
529
558
|
validation_agent: validation.validation_agent,
|
|
@@ -540,7 +569,11 @@ export async function evolveBody(
|
|
|
540
569
|
stage: "validated",
|
|
541
570
|
rationale: proposal.rationale,
|
|
542
571
|
confidence: proposal.confidence,
|
|
543
|
-
details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed
|
|
572
|
+
details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed${
|
|
573
|
+
validation.validation_fallback_reason
|
|
574
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
575
|
+
: ""
|
|
576
|
+
}`,
|
|
544
577
|
validation: {
|
|
545
578
|
improved: validation.improved,
|
|
546
579
|
gates_passed: validation.gates_passed,
|
|
@@ -552,6 +585,7 @@ export async function evolveBody(
|
|
|
552
585
|
validation_mode: validation.validation_mode,
|
|
553
586
|
validation_agent: validation.validation_agent,
|
|
554
587
|
validation_fixture_id: validation.validation_fixture_id,
|
|
588
|
+
validation_fallback_reason: validation.validation_fallback_reason,
|
|
555
589
|
validation_evidence_ref: validatedEvidenceRef,
|
|
556
590
|
},
|
|
557
591
|
});
|
|
@@ -603,7 +637,11 @@ export async function evolveBody(
|
|
|
603
637
|
recordAudit(
|
|
604
638
|
proposal.proposal_id,
|
|
605
639
|
"rejected",
|
|
606
|
-
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates
|
|
640
|
+
`Validation failed: ${validation.gates_passed}/${validation.gates_total} gates${
|
|
641
|
+
validation.validation_fallback_reason
|
|
642
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
643
|
+
: ""
|
|
644
|
+
}`,
|
|
607
645
|
{
|
|
608
646
|
validation_mode: validation.validation_mode,
|
|
609
647
|
validation_agent: validation.validation_agent,
|
|
@@ -620,7 +658,11 @@ export async function evolveBody(
|
|
|
620
658
|
stage: "rejected",
|
|
621
659
|
rationale: proposal.rationale,
|
|
622
660
|
confidence: proposal.confidence,
|
|
623
|
-
details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates
|
|
661
|
+
details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates${
|
|
662
|
+
validation.validation_fallback_reason
|
|
663
|
+
? ` (replay fallback: ${validation.validation_fallback_reason})`
|
|
664
|
+
: ""
|
|
665
|
+
}`,
|
|
624
666
|
validation: {
|
|
625
667
|
improved: validation.improved,
|
|
626
668
|
gates_passed: validation.gates_passed,
|
|
@@ -632,6 +674,7 @@ export async function evolveBody(
|
|
|
632
674
|
validation_mode: validation.validation_mode,
|
|
633
675
|
validation_agent: validation.validation_agent,
|
|
634
676
|
validation_fixture_id: validation.validation_fixture_id,
|
|
677
|
+
validation_fallback_reason: validation.validation_fallback_reason,
|
|
635
678
|
validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
|
|
636
679
|
},
|
|
637
680
|
});
|
|
@@ -731,7 +774,11 @@ export async function evolveBody(
|
|
|
731
774
|
recordAudit(
|
|
732
775
|
lastProposal.proposal_id,
|
|
733
776
|
"deployed",
|
|
734
|
-
`Deployed ${target} proposal for ${skillName}
|
|
777
|
+
`Deployed ${target} proposal for ${skillName}${
|
|
778
|
+
lastValidation.validation_fallback_reason
|
|
779
|
+
? ` (replay fallback: ${lastValidation.validation_fallback_reason})`
|
|
780
|
+
: ""
|
|
781
|
+
}`,
|
|
735
782
|
{
|
|
736
783
|
validation_mode: lastValidation.validation_mode,
|
|
737
784
|
validation_agent: lastValidation.validation_agent,
|
|
@@ -760,6 +807,7 @@ export async function evolveBody(
|
|
|
760
807
|
validation_mode: lastValidation.validation_mode,
|
|
761
808
|
validation_agent: lastValidation.validation_agent,
|
|
762
809
|
validation_fixture_id: lastValidation.validation_fixture_id,
|
|
810
|
+
validation_fallback_reason: lastValidation.validation_fallback_reason,
|
|
763
811
|
validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
|
|
764
812
|
},
|
|
765
813
|
});
|
|
@@ -813,6 +861,7 @@ export async function cliMain(): Promise<void> {
|
|
|
813
861
|
"task-description": { type: "string" },
|
|
814
862
|
"few-shot": { type: "string" },
|
|
815
863
|
"validation-model": { type: "string" },
|
|
864
|
+
"validation-mode": { type: "string", default: DEFAULT_VALIDATION_STRATEGY },
|
|
816
865
|
"teacher-effort": { type: "string", default: "high" },
|
|
817
866
|
review: { type: "boolean", default: false },
|
|
818
867
|
help: { type: "boolean", default: false },
|
|
@@ -841,6 +890,7 @@ Options:
|
|
|
841
890
|
--task-description Optional task description context
|
|
842
891
|
--few-shot Comma-separated paths to example skill files
|
|
843
892
|
--validation-model Model for trigger-check validation calls (overrides --student-model for validation)
|
|
893
|
+
--validation-mode Validation strategy: auto, replay, or judge (default: auto)
|
|
844
894
|
--teacher-effort Effort level for teacher LLM: low, medium, high, max (default: high)
|
|
845
895
|
--review Run evolution-reviewer subagent before deployment (Gate 4)
|
|
846
896
|
--help Show this help message`);
|
|
@@ -855,15 +905,24 @@ Options:
|
|
|
855
905
|
);
|
|
856
906
|
}
|
|
857
907
|
|
|
858
|
-
|
|
859
|
-
|
|
908
|
+
if (
|
|
909
|
+
values["validation-mode"] &&
|
|
910
|
+
!["auto", "replay", "judge"].includes(values["validation-mode"])
|
|
911
|
+
) {
|
|
912
|
+
throw new CLIError(
|
|
913
|
+
`Invalid --validation-mode value: ${values["validation-mode"]}`,
|
|
914
|
+
"INVALID_FLAG",
|
|
915
|
+
"Use one of: auto, replay, judge",
|
|
916
|
+
);
|
|
917
|
+
}
|
|
918
|
+
const teacherAgent = values["teacher-agent"] ?? detectLlmAgent() ?? "";
|
|
860
919
|
const studentAgent = values["student-agent"] ?? teacherAgent;
|
|
861
920
|
|
|
862
921
|
if (!teacherAgent) {
|
|
863
922
|
throw new CLIError(
|
|
864
|
-
"No agent CLI found. Install Claude Code, Codex, or
|
|
923
|
+
"No agent CLI found. Install Claude Code, Codex, OpenCode, or Pi.",
|
|
865
924
|
"AGENT_NOT_FOUND",
|
|
866
|
-
"Install Claude Code, Codex, or
|
|
925
|
+
"Install Claude Code, Codex, OpenCode, or Pi.",
|
|
867
926
|
);
|
|
868
927
|
}
|
|
869
928
|
|
|
@@ -901,6 +960,8 @@ Options:
|
|
|
901
960
|
fewShotExamples,
|
|
902
961
|
gradingResults,
|
|
903
962
|
validationModel: values["validation-model"],
|
|
963
|
+
validationMode:
|
|
964
|
+
(values["validation-mode"] as ValidationStrategy | undefined) ?? DEFAULT_VALIDATION_STRATEGY,
|
|
904
965
|
teacherEffort: (values["teacher-effort"] as EffortLevel) ?? "high",
|
|
905
966
|
useReviewer: values.review ?? false,
|
|
906
967
|
});
|