selftune 0.2.22 → 0.2.24

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (270)
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +95 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/adapters/pi/hook.ts +273 -0
  12. package/cli/selftune/adapters/pi/install.ts +207 -0
  13. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  14. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  15. package/cli/selftune/auto-update.ts +200 -8
  16. package/cli/selftune/canonical-export.ts +55 -25
  17. package/cli/selftune/command-surface.ts +397 -0
  18. package/cli/selftune/constants.ts +10 -1
  19. package/cli/selftune/contribute/contribute.ts +64 -13
  20. package/cli/selftune/contribution-config.ts +57 -3
  21. package/cli/selftune/contribution-preferences.ts +117 -0
  22. package/cli/selftune/contribution-signals.ts +8 -4
  23. package/cli/selftune/contribution-staging.ts +13 -2
  24. package/cli/selftune/contributions.ts +55 -121
  25. package/cli/selftune/creator-contributions.ts +29 -10
  26. package/cli/selftune/cron/setup.ts +7 -3
  27. package/cli/selftune/dashboard-contract.ts +87 -0
  28. package/cli/selftune/dashboard-server.ts +168 -17
  29. package/cli/selftune/dashboard.ts +350 -17
  30. package/cli/selftune/eval/baseline.ts +21 -5
  31. package/cli/selftune/eval/execution-eval.ts +170 -0
  32. package/cli/selftune/eval/family-overlap.ts +2 -2
  33. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  34. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  35. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  36. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  37. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  38. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  39. package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
  40. package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
  41. package/cli/selftune/evolution/evidence.ts +2 -6
  42. package/cli/selftune/evolution/evolve-body.ts +152 -38
  43. package/cli/selftune/evolution/evolve.ts +244 -52
  44. package/cli/selftune/evolution/rollback.ts +0 -1
  45. package/cli/selftune/evolution/validate-body.ts +111 -49
  46. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  47. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  48. package/cli/selftune/evolution/validate-routing.ts +51 -108
  49. package/cli/selftune/evolution/validation-contract.ts +91 -0
  50. package/cli/selftune/grading/auto-grade.ts +11 -7
  51. package/cli/selftune/grading/grade-session.ts +10 -16
  52. package/cli/selftune/hooks/skill-eval.ts +2 -1
  53. package/cli/selftune/hooks-shared/types.ts +1 -0
  54. package/cli/selftune/index.ts +58 -15
  55. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  56. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  57. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  58. package/cli/selftune/ingestors/pi-ingest.ts +727 -0
  59. package/cli/selftune/init.ts +38 -4
  60. package/cli/selftune/localdb/direct-write.ts +120 -1
  61. package/cli/selftune/localdb/materialize.ts +6 -7
  62. package/cli/selftune/localdb/queries/cron.ts +34 -0
  63. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  64. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  65. package/cli/selftune/localdb/queries/execution.ts +133 -0
  66. package/cli/selftune/localdb/queries/json.ts +18 -0
  67. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  68. package/cli/selftune/localdb/queries/raw.ts +95 -0
  69. package/cli/selftune/localdb/queries/staging.ts +270 -0
  70. package/cli/selftune/localdb/queries/trust.ts +392 -0
  71. package/cli/selftune/localdb/queries.ts +60 -2162
  72. package/cli/selftune/localdb/schema.ts +59 -0
  73. package/cli/selftune/monitoring/watch.ts +96 -29
  74. package/cli/selftune/normalization.ts +3 -0
  75. package/cli/selftune/observability.ts +12 -3
  76. package/cli/selftune/orchestrate/cli.ts +161 -0
  77. package/cli/selftune/orchestrate/execute.ts +295 -0
  78. package/cli/selftune/orchestrate/finalize.ts +157 -0
  79. package/cli/selftune/orchestrate/locks.ts +40 -0
  80. package/cli/selftune/orchestrate/plan.ts +131 -0
  81. package/cli/selftune/orchestrate/post-run.ts +59 -0
  82. package/cli/selftune/orchestrate/prepare.ts +334 -0
  83. package/cli/selftune/orchestrate/report.ts +182 -0
  84. package/cli/selftune/orchestrate/runtime.ts +120 -0
  85. package/cli/selftune/orchestrate/signals.ts +48 -0
  86. package/cli/selftune/orchestrate.ts +162 -1142
  87. package/cli/selftune/registry/client.ts +74 -0
  88. package/cli/selftune/registry/history.ts +54 -0
  89. package/cli/selftune/registry/index.ts +90 -0
  90. package/cli/selftune/registry/install.ts +141 -0
  91. package/cli/selftune/registry/list.ts +44 -0
  92. package/cli/selftune/registry/push.ts +171 -0
  93. package/cli/selftune/registry/rollback.ts +49 -0
  94. package/cli/selftune/registry/status.ts +62 -0
  95. package/cli/selftune/registry/sync.ts +125 -0
  96. package/cli/selftune/repair/skill-usage.ts +9 -3
  97. package/cli/selftune/routes/overview.ts +5 -2
  98. package/cli/selftune/routes/skill-report.ts +15 -2
  99. package/cli/selftune/schedule.ts +5 -5
  100. package/cli/selftune/status.ts +70 -2
  101. package/cli/selftune/sync.ts +127 -23
  102. package/cli/selftune/testing-readiness.ts +597 -0
  103. package/cli/selftune/types.ts +46 -5
  104. package/cli/selftune/uninstall.ts +2 -1
  105. package/cli/selftune/utils/canonical-log.ts +1 -9
  106. package/cli/selftune/utils/cli-error.ts +9 -0
  107. package/cli/selftune/utils/jsonl.ts +1 -30
  108. package/cli/selftune/utils/llm-call.ts +126 -6
  109. package/cli/selftune/utils/skill-discovery.ts +24 -0
  110. package/cli/selftune/workflows/proposals.ts +184 -0
  111. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  112. package/cli/selftune/workflows/workflows.ts +100 -26
  113. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  114. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  115. package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
  116. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  117. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  118. package/node_modules/@selftune/telemetry-contract/package.json +1 -1
  119. package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
  120. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
  121. package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
  122. package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
  123. package/package.json +25 -9
  124. package/packages/dashboard-core/AGENTS.md +18 -0
  125. package/packages/dashboard-core/README.md +30 -0
  126. package/packages/dashboard-core/index.ts +3 -0
  127. package/packages/dashboard-core/package.json +39 -0
  128. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  129. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  130. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  131. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  132. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  133. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  134. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  135. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  136. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  137. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  138. package/packages/dashboard-core/src/gates/index.ts +3 -0
  139. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  140. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  141. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  142. package/packages/dashboard-core/src/host/index.ts +3 -0
  143. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  144. package/packages/dashboard-core/src/models/index.ts +4 -0
  145. package/packages/dashboard-core/src/models/overview.ts +98 -0
  146. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  147. package/packages/dashboard-core/src/models/skills.ts +34 -0
  148. package/packages/dashboard-core/src/routes/index.ts +2 -0
  149. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  150. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  151. package/packages/dashboard-core/src/routes/types.ts +39 -0
  152. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  153. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  154. package/packages/dashboard-core/src/screens/index.ts +37 -0
  155. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  156. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  157. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  158. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  159. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  160. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  161. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  162. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  163. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  164. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  165. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  166. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  167. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  168. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  169. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  170. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  171. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  172. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  173. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  174. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  175. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  176. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  177. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  178. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  179. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  180. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  181. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  182. package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
  183. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  184. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  185. package/packages/telemetry-contract/package.json +1 -1
  186. package/packages/telemetry-contract/src/index.ts +1 -0
  187. package/packages/telemetry-contract/src/schemas.ts +63 -5
  188. package/packages/telemetry-contract/src/types.ts +97 -7
  189. package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
  190. package/packages/ui/AGENTS.md +16 -0
  191. package/packages/ui/README.md +1 -1
  192. package/packages/ui/package.json +1 -1
  193. package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
  194. package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
  195. package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
  196. package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
  197. package/packages/ui/src/components/InfoTip.tsx +1 -2
  198. package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
  199. package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
  200. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
  201. package/packages/ui/src/components/OverviewPanels.tsx +693 -0
  202. package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
  203. package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
  204. package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
  205. package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
  206. package/packages/ui/src/components/index.ts +56 -1
  207. package/packages/ui/src/components/section-cards.tsx +18 -35
  208. package/packages/ui/src/components/skill-health-grid.tsx +47 -37
  209. package/packages/ui/src/lib/constants.tsx +0 -1
  210. package/packages/ui/src/primitives/card.tsx +1 -1
  211. package/packages/ui/src/primitives/checkbox.tsx +1 -1
  212. package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
  213. package/packages/ui/src/primitives/select.tsx +2 -2
  214. package/packages/ui/src/primitives/tabs.tsx +7 -6
  215. package/packages/ui/src/types.ts +182 -4
  216. package/skill/SKILL.md +130 -318
  217. package/skill/agents/diagnosis-analyst.md +3 -3
  218. package/skill/agents/evolution-reviewer.md +3 -3
  219. package/skill/agents/integration-guide.md +3 -3
  220. package/skill/agents/pattern-analyst.md +2 -2
  221. package/skill/references/cli-quick-reference.md +89 -0
  222. package/skill/references/creator-playbook.md +131 -0
  223. package/skill/references/examples.md +48 -0
  224. package/skill/references/troubleshooting.md +47 -0
  225. package/skill/references/version-history.md +1 -1
  226. package/skill/selftune.contribute.json +11 -0
  227. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  228. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  229. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  230. package/skill/workflows/CreateTestDeploy.md +170 -0
  231. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  232. package/skill/{Workflows → workflows}/Cron.md +1 -1
  233. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  234. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  235. package/skill/{Workflows → workflows}/Evals.md +67 -2
  236. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  237. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  238. package/skill/{Workflows → workflows}/Grade.md +1 -1
  239. package/skill/{Workflows → workflows}/Ingest.md +60 -2
  240. package/skill/{Workflows → workflows}/Initialize.md +16 -9
  241. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  242. package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
  243. package/skill/workflows/Registry.md +99 -0
  244. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  245. package/skill/workflows/SignalsDashboard.md +87 -0
  246. package/skill/{Workflows → workflows}/Sync.md +3 -1
  247. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  248. package/skill/{Workflows → workflows}/Watch.md +42 -2
  249. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  250. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
  251. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
  252. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  253. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
  254. package/cli/selftune/utils/html.ts +0 -27
  255. package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
  256. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  257. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  258. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  259. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  260. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  261. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  262. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  263. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  264. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  265. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  266. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  267. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  268. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  269. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  270. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -12,6 +12,10 @@ import { parseArgs } from "node:util";
12
12
  import { buildEvalSet } from "../eval/hooks-to-evals.js";
13
13
  import { readGradingResultsForSkill } from "../grading/results.js";
14
14
  import { getDb } from "../localdb/db.js";
15
+ import {
16
+ type ReplayEntryResultInput,
17
+ writeReplayEntryResultsToDb,
18
+ } from "../localdb/direct-write.js";
15
19
  import { queryQueryLog, querySkillUsageRecords } from "../localdb/queries.js";
16
20
  import type {
17
21
  BodyEvolutionProposal,
@@ -23,12 +27,11 @@ import type {
23
27
  FailurePattern,
24
28
  GradingResult,
25
29
  QueryLogRecord,
26
- RoutingReplayFixture,
27
30
  SkillUsageRecord,
28
31
  } from "../types.js";
29
32
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
30
33
  import type { EffortLevel, SubagentCallOptions } from "../utils/llm-call.js";
31
- import { callViaSubagent } from "../utils/llm-call.js";
34
+ import { callViaSubagent, detectLlmAgent } from "../utils/llm-call.js";
32
35
  import { appendAuditEntry } from "./audit.js";
33
36
  import { checkConstitutionSizeOnly } from "./constitutional.js";
34
37
  import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js";
@@ -37,12 +40,11 @@ import { extractFailurePatterns } from "./extract-patterns.js";
37
40
  import { type ExecutionContext, generateBodyProposal } from "./propose-body.js";
38
41
  import { generateRoutingProposal } from "./propose-routing.js";
39
42
  import { refineBodyProposal } from "./refine-body.js";
43
+ import type { BodyValidationOptions } from "./validate-body.js";
40
44
  import { validateBodyProposal } from "./validate-body.js";
41
- import {
42
- buildRoutingReplayFixture,
43
- runClaudeRuntimeReplayFixture,
44
- } from "./validate-host-replay.js";
45
+ import { buildRuntimeReplayValidationOptions } from "./validate-host-replay.js";
45
46
  import { validateRoutingProposal } from "./validate-routing.js";
47
+ import { DEFAULT_VALIDATION_STRATEGY, type ValidationStrategy } from "./validation-contract.js";
46
48
 
47
49
  // ---------------------------------------------------------------------------
48
50
  // Types
@@ -64,6 +66,7 @@ export interface EvolveBodyOptions {
64
66
  fewShotExamples?: string[];
65
67
  gradingResults?: GradingResult[];
66
68
  validationModel?: string;
69
+ validationMode?: ValidationStrategy;
67
70
  teacherEffort?: EffortLevel;
68
71
  /** Run evolution-reviewer subagent as Gate 4 before deployment. */
69
72
  useReviewer?: boolean;
@@ -171,6 +174,7 @@ export async function evolveBody(
171
174
  const teacherModel = options.teacherModel ?? DEFAULT_TEACHER_MODEL;
172
175
  const studentModel = options.studentModel ?? DEFAULT_STUDENT_MODEL;
173
176
  const teacherEffort = options.teacherEffort ?? DEFAULT_TEACHER_EFFORT;
177
+ const effectiveValidationMode = options.validationMode ?? DEFAULT_VALIDATION_STRATEGY;
174
178
 
175
179
  // Resolve injectable dependencies
176
180
  const _extractFailurePatterns = _deps.extractFailurePatterns ?? extractFailurePatterns;
@@ -463,54 +467,92 @@ export async function evolveBody(
463
467
  // Validate (validationModel overrides studentModel for validation calls)
464
468
  const validationModelFlag = options.validationModel ?? studentModel;
465
469
  let validation: BodyValidationResult;
470
+ let replayFallbackReason: string | undefined;
471
+
472
+ // Build replay fixture + runner for targets that can use runtime replay.
473
+ const replayOptions = buildRuntimeReplayValidationOptions({
474
+ skillName,
475
+ skillPath,
476
+ agent: studentAgent,
477
+ contentTarget: target === "body" ? "body" : "routing",
478
+ });
479
+ const replayFixture = replayOptions?.replayFixture;
480
+ const replayRunner = replayOptions?.replayRunner;
481
+
466
482
  if (target === "routing") {
467
- const replayFixture = buildRoutingReplayFixture({
468
- skillName,
469
- skillPath,
470
- platform: studentAgent === "codex" ? "codex" : "claude_code",
471
- });
472
- const replayRunner =
473
- replayFixture.platform === "claude_code" && studentAgent === "claude"
474
- ? async ({
475
- routing,
476
- evalSet,
477
- fixture,
478
- }: {
479
- routing: string;
480
- evalSet: EvalEntry[];
481
- fixture: RoutingReplayFixture;
482
- }) =>
483
- await runClaudeRuntimeReplayFixture({
484
- routing,
485
- evalSet,
486
- fixture,
487
- })
488
- : undefined;
489
483
  validation = await _validateRoutingProposal(
490
484
  proposal,
491
485
  evalSet,
492
486
  studentAgent,
493
487
  validationModelFlag,
494
488
  {
495
- replayFixture,
489
+ ...(replayFixture ? { replayFixture } : {}),
496
490
  ...(replayRunner ? { replayRunner } : {}),
491
+ mode: effectiveValidationMode,
492
+ onReplayFallback: (reason) => {
493
+ replayFallbackReason = reason;
494
+ if (reason) {
495
+ console.error(
496
+ `[evolve-body] Replay not available (${reason}), falling back to LLM judge validation.`,
497
+ );
498
+ return;
499
+ }
500
+ console.error(
501
+ "[evolve-body] Replay not available, falling back to LLM judge validation.",
502
+ );
503
+ },
497
504
  },
498
505
  );
499
506
  } else {
507
+ const bodyReplayOptions: BodyValidationOptions = {
508
+ ...(replayFixture
509
+ ? {
510
+ replay: {
511
+ replayFixture,
512
+ ...(replayRunner ? { replayRunner } : {}),
513
+ },
514
+ }
515
+ : {}),
516
+ mode: effectiveValidationMode,
517
+ onReplayFallback: (reason) => {
518
+ replayFallbackReason = reason;
519
+ if (reason) {
520
+ console.error(
521
+ `[evolve-body] Replay not available (${reason}), falling back to LLM judge validation.`,
522
+ );
523
+ return;
524
+ }
525
+ console.error(
526
+ "[evolve-body] Replay not available, falling back to LLM judge validation.",
527
+ );
528
+ },
529
+ };
500
530
  validation = await _validateBodyProposal(
501
531
  proposal,
502
532
  evalSet,
503
533
  studentAgent,
504
534
  validationModelFlag,
535
+ undefined,
536
+ bodyReplayOptions,
505
537
  );
506
538
  }
539
+ if (replayFallbackReason && !validation.validation_fallback_reason) {
540
+ validation = {
541
+ ...validation,
542
+ validation_fallback_reason: replayFallbackReason,
543
+ };
544
+ }
507
545
  lastValidation = validation;
508
546
  const validatedEvidenceRef = buildValidationEvidenceRef(proposal.proposal_id, "validated");
509
547
 
510
548
  recordAudit(
511
549
  proposal.proposal_id,
512
550
  "validated",
513
- `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
551
+ `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed${
552
+ validation.validation_fallback_reason
553
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
554
+ : ""
555
+ }`,
514
556
  {
515
557
  validation_mode: validation.validation_mode,
516
558
  validation_agent: validation.validation_agent,
@@ -527,7 +569,11 @@ export async function evolveBody(
527
569
  stage: "validated",
528
570
  rationale: proposal.rationale,
529
571
  confidence: proposal.confidence,
530
- details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed`,
572
+ details: `Validation: ${validation.gates_passed}/${validation.gates_total} gates passed${
573
+ validation.validation_fallback_reason
574
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
575
+ : ""
576
+ }`,
531
577
  validation: {
532
578
  improved: validation.improved,
533
579
  gates_passed: validation.gates_passed,
@@ -539,10 +585,51 @@ export async function evolveBody(
539
585
  validation_mode: validation.validation_mode,
540
586
  validation_agent: validation.validation_agent,
541
587
  validation_fixture_id: validation.validation_fixture_id,
588
+ validation_fallback_reason: validation.validation_fallback_reason,
542
589
  validation_evidence_ref: validatedEvidenceRef,
543
590
  },
544
591
  });
545
592
 
593
+ // Persist per-entry replay results to SQLite
594
+ try {
595
+ const entryResults: ReplayEntryResultInput[] = [];
596
+ if (validation.before_entry_results) {
597
+ for (const r of validation.before_entry_results) {
598
+ entryResults.push({
599
+ proposal_id: proposal.proposal_id,
600
+ skill_name: skillName,
601
+ validation_mode: validation.validation_mode ?? "llm_judge",
602
+ phase: "before",
603
+ query: r.query,
604
+ should_trigger: r.should_trigger,
605
+ triggered: r.triggered,
606
+ passed: r.passed,
607
+ evidence: r.evidence,
608
+ });
609
+ }
610
+ }
611
+ if (validation.per_entry_results) {
612
+ for (const r of validation.per_entry_results) {
613
+ entryResults.push({
614
+ proposal_id: proposal.proposal_id,
615
+ skill_name: skillName,
616
+ validation_mode: validation.validation_mode ?? "llm_judge",
617
+ phase: "after",
618
+ query: r.query,
619
+ should_trigger: r.should_trigger,
620
+ triggered: r.triggered,
621
+ passed: r.passed,
622
+ evidence: r.evidence,
623
+ });
624
+ }
625
+ }
626
+ if (entryResults.length > 0) {
627
+ writeReplayEntryResultsToDb(entryResults);
628
+ }
629
+ } catch {
630
+ // Fail-open: replay entry persistence is non-blocking
631
+ }
632
+
546
633
  if (validation.improved) {
547
634
  break;
548
635
  }
@@ -550,7 +637,11 @@ export async function evolveBody(
550
637
  recordAudit(
551
638
  proposal.proposal_id,
552
639
  "rejected",
553
- `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
640
+ `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates${
641
+ validation.validation_fallback_reason
642
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
643
+ : ""
644
+ }`,
554
645
  {
555
646
  validation_mode: validation.validation_mode,
556
647
  validation_agent: validation.validation_agent,
@@ -567,7 +658,11 @@ export async function evolveBody(
567
658
  stage: "rejected",
568
659
  rationale: proposal.rationale,
569
660
  confidence: proposal.confidence,
570
- details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates`,
661
+ details: `Validation failed: ${validation.gates_passed}/${validation.gates_total} gates${
662
+ validation.validation_fallback_reason
663
+ ? ` (replay fallback: ${validation.validation_fallback_reason})`
664
+ : ""
665
+ }`,
571
666
  validation: {
572
667
  improved: validation.improved,
573
668
  gates_passed: validation.gates_passed,
@@ -579,6 +674,7 @@ export async function evolveBody(
579
674
  validation_mode: validation.validation_mode,
580
675
  validation_agent: validation.validation_agent,
581
676
  validation_fixture_id: validation.validation_fixture_id,
677
+ validation_fallback_reason: validation.validation_fallback_reason,
582
678
  validation_evidence_ref: buildValidationEvidenceRef(proposal.proposal_id, "rejected"),
583
679
  },
584
680
  });
@@ -678,7 +774,11 @@ export async function evolveBody(
678
774
  recordAudit(
679
775
  lastProposal.proposal_id,
680
776
  "deployed",
681
- `Deployed ${target} proposal for ${skillName}`,
777
+ `Deployed ${target} proposal for ${skillName}${
778
+ lastValidation.validation_fallback_reason
779
+ ? ` (replay fallback: ${lastValidation.validation_fallback_reason})`
780
+ : ""
781
+ }`,
682
782
  {
683
783
  validation_mode: lastValidation.validation_mode,
684
784
  validation_agent: lastValidation.validation_agent,
@@ -707,6 +807,7 @@ export async function evolveBody(
707
807
  validation_mode: lastValidation.validation_mode,
708
808
  validation_agent: lastValidation.validation_agent,
709
809
  validation_fixture_id: lastValidation.validation_fixture_id,
810
+ validation_fallback_reason: lastValidation.validation_fallback_reason,
710
811
  validation_evidence_ref: buildValidationEvidenceRef(lastProposal.proposal_id, "deployed"),
711
812
  },
712
813
  });
@@ -760,6 +861,7 @@ export async function cliMain(): Promise<void> {
760
861
  "task-description": { type: "string" },
761
862
  "few-shot": { type: "string" },
762
863
  "validation-model": { type: "string" },
864
+ "validation-mode": { type: "string", default: DEFAULT_VALIDATION_STRATEGY },
763
865
  "teacher-effort": { type: "string", default: "high" },
764
866
  review: { type: "boolean", default: false },
765
867
  help: { type: "boolean", default: false },
@@ -788,6 +890,7 @@ Options:
788
890
  --task-description Optional task description context
789
891
  --few-shot Comma-separated paths to example skill files
790
892
  --validation-model Model for trigger-check validation calls (overrides --student-model for validation)
893
+ --validation-mode Validation strategy: auto, replay, or judge (default: auto)
791
894
  --teacher-effort Effort level for teacher LLM: low, medium, high, max (default: high)
792
895
  --review Run evolution-reviewer subagent before deployment (Gate 4)
793
896
  --help Show this help message`);
@@ -802,15 +905,24 @@ Options:
802
905
  );
803
906
  }
804
907
 
805
- const { detectAgent } = await import("../utils/llm-call.js");
806
- const teacherAgent = values["teacher-agent"] ?? detectAgent() ?? "";
908
+ if (
909
+ values["validation-mode"] &&
910
+ !["auto", "replay", "judge"].includes(values["validation-mode"])
911
+ ) {
912
+ throw new CLIError(
913
+ `Invalid --validation-mode value: ${values["validation-mode"]}`,
914
+ "INVALID_FLAG",
915
+ "Use one of: auto, replay, judge",
916
+ );
917
+ }
918
+ const teacherAgent = values["teacher-agent"] ?? detectLlmAgent() ?? "";
807
919
  const studentAgent = values["student-agent"] ?? teacherAgent;
808
920
 
809
921
  if (!teacherAgent) {
810
922
  throw new CLIError(
811
- "No agent CLI found. Install Claude Code, Codex, or OpenCode.",
923
+ "No agent CLI found. Install Claude Code, Codex, OpenCode, or Pi.",
812
924
  "AGENT_NOT_FOUND",
813
- "Install Claude Code, Codex, or OpenCode.",
925
+ "Install Claude Code, Codex, OpenCode, or Pi.",
814
926
  );
815
927
  }
816
928
 
@@ -848,6 +960,8 @@ Options:
848
960
  fewShotExamples,
849
961
  gradingResults,
850
962
  validationModel: values["validation-model"],
963
+ validationMode:
964
+ (values["validation-mode"] as ValidationStrategy | undefined) ?? DEFAULT_VALIDATION_STRATEGY,
851
965
  teacherEffort: (values["teacher-effort"] as EffortLevel) ?? "high",
852
966
  useReviewer: values.review ?? false,
853
967
  });