selftune 0.2.22 → 0.2.24

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (270) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +95 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/adapters/pi/hook.ts +273 -0
  12. package/cli/selftune/adapters/pi/install.ts +207 -0
  13. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  14. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  15. package/cli/selftune/auto-update.ts +200 -8
  16. package/cli/selftune/canonical-export.ts +55 -25
  17. package/cli/selftune/command-surface.ts +397 -0
  18. package/cli/selftune/constants.ts +10 -1
  19. package/cli/selftune/contribute/contribute.ts +64 -13
  20. package/cli/selftune/contribution-config.ts +57 -3
  21. package/cli/selftune/contribution-preferences.ts +117 -0
  22. package/cli/selftune/contribution-signals.ts +8 -4
  23. package/cli/selftune/contribution-staging.ts +13 -2
  24. package/cli/selftune/contributions.ts +55 -121
  25. package/cli/selftune/creator-contributions.ts +29 -10
  26. package/cli/selftune/cron/setup.ts +7 -3
  27. package/cli/selftune/dashboard-contract.ts +87 -0
  28. package/cli/selftune/dashboard-server.ts +168 -17
  29. package/cli/selftune/dashboard.ts +350 -17
  30. package/cli/selftune/eval/baseline.ts +21 -5
  31. package/cli/selftune/eval/execution-eval.ts +170 -0
  32. package/cli/selftune/eval/family-overlap.ts +2 -2
  33. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  34. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  35. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  36. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  37. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  38. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  39. package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
  40. package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
  41. package/cli/selftune/evolution/evidence.ts +2 -6
  42. package/cli/selftune/evolution/evolve-body.ts +152 -38
  43. package/cli/selftune/evolution/evolve.ts +244 -52
  44. package/cli/selftune/evolution/rollback.ts +0 -1
  45. package/cli/selftune/evolution/validate-body.ts +111 -49
  46. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  47. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  48. package/cli/selftune/evolution/validate-routing.ts +51 -108
  49. package/cli/selftune/evolution/validation-contract.ts +91 -0
  50. package/cli/selftune/grading/auto-grade.ts +11 -7
  51. package/cli/selftune/grading/grade-session.ts +10 -16
  52. package/cli/selftune/hooks/skill-eval.ts +2 -1
  53. package/cli/selftune/hooks-shared/types.ts +1 -0
  54. package/cli/selftune/index.ts +58 -15
  55. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  56. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  57. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  58. package/cli/selftune/ingestors/pi-ingest.ts +727 -0
  59. package/cli/selftune/init.ts +38 -4
  60. package/cli/selftune/localdb/direct-write.ts +120 -1
  61. package/cli/selftune/localdb/materialize.ts +6 -7
  62. package/cli/selftune/localdb/queries/cron.ts +34 -0
  63. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  64. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  65. package/cli/selftune/localdb/queries/execution.ts +133 -0
  66. package/cli/selftune/localdb/queries/json.ts +18 -0
  67. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  68. package/cli/selftune/localdb/queries/raw.ts +95 -0
  69. package/cli/selftune/localdb/queries/staging.ts +270 -0
  70. package/cli/selftune/localdb/queries/trust.ts +392 -0
  71. package/cli/selftune/localdb/queries.ts +60 -2162
  72. package/cli/selftune/localdb/schema.ts +59 -0
  73. package/cli/selftune/monitoring/watch.ts +96 -29
  74. package/cli/selftune/normalization.ts +3 -0
  75. package/cli/selftune/observability.ts +12 -3
  76. package/cli/selftune/orchestrate/cli.ts +161 -0
  77. package/cli/selftune/orchestrate/execute.ts +295 -0
  78. package/cli/selftune/orchestrate/finalize.ts +157 -0
  79. package/cli/selftune/orchestrate/locks.ts +40 -0
  80. package/cli/selftune/orchestrate/plan.ts +131 -0
  81. package/cli/selftune/orchestrate/post-run.ts +59 -0
  82. package/cli/selftune/orchestrate/prepare.ts +334 -0
  83. package/cli/selftune/orchestrate/report.ts +182 -0
  84. package/cli/selftune/orchestrate/runtime.ts +120 -0
  85. package/cli/selftune/orchestrate/signals.ts +48 -0
  86. package/cli/selftune/orchestrate.ts +162 -1142
  87. package/cli/selftune/registry/client.ts +74 -0
  88. package/cli/selftune/registry/history.ts +54 -0
  89. package/cli/selftune/registry/index.ts +90 -0
  90. package/cli/selftune/registry/install.ts +141 -0
  91. package/cli/selftune/registry/list.ts +44 -0
  92. package/cli/selftune/registry/push.ts +171 -0
  93. package/cli/selftune/registry/rollback.ts +49 -0
  94. package/cli/selftune/registry/status.ts +62 -0
  95. package/cli/selftune/registry/sync.ts +125 -0
  96. package/cli/selftune/repair/skill-usage.ts +9 -3
  97. package/cli/selftune/routes/overview.ts +5 -2
  98. package/cli/selftune/routes/skill-report.ts +15 -2
  99. package/cli/selftune/schedule.ts +5 -5
  100. package/cli/selftune/status.ts +70 -2
  101. package/cli/selftune/sync.ts +127 -23
  102. package/cli/selftune/testing-readiness.ts +597 -0
  103. package/cli/selftune/types.ts +46 -5
  104. package/cli/selftune/uninstall.ts +2 -1
  105. package/cli/selftune/utils/canonical-log.ts +1 -9
  106. package/cli/selftune/utils/cli-error.ts +9 -0
  107. package/cli/selftune/utils/jsonl.ts +1 -30
  108. package/cli/selftune/utils/llm-call.ts +126 -6
  109. package/cli/selftune/utils/skill-discovery.ts +24 -0
  110. package/cli/selftune/workflows/proposals.ts +184 -0
  111. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  112. package/cli/selftune/workflows/workflows.ts +100 -26
  113. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  114. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  115. package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
  116. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  117. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  118. package/node_modules/@selftune/telemetry-contract/package.json +1 -1
  119. package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
  120. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
  121. package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
  122. package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
  123. package/package.json +25 -9
  124. package/packages/dashboard-core/AGENTS.md +18 -0
  125. package/packages/dashboard-core/README.md +30 -0
  126. package/packages/dashboard-core/index.ts +3 -0
  127. package/packages/dashboard-core/package.json +39 -0
  128. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  129. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  130. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  131. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  132. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  133. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  134. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  135. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  136. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  137. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  138. package/packages/dashboard-core/src/gates/index.ts +3 -0
  139. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  140. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  141. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  142. package/packages/dashboard-core/src/host/index.ts +3 -0
  143. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  144. package/packages/dashboard-core/src/models/index.ts +4 -0
  145. package/packages/dashboard-core/src/models/overview.ts +98 -0
  146. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  147. package/packages/dashboard-core/src/models/skills.ts +34 -0
  148. package/packages/dashboard-core/src/routes/index.ts +2 -0
  149. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  150. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  151. package/packages/dashboard-core/src/routes/types.ts +39 -0
  152. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  153. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  154. package/packages/dashboard-core/src/screens/index.ts +37 -0
  155. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  156. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  157. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  158. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  159. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  160. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  161. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  162. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  163. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  164. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  165. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  166. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  167. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  168. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  169. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  170. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  171. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  172. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  173. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  174. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  175. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  176. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  177. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  178. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  179. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  180. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  181. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  182. package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
  183. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  184. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  185. package/packages/telemetry-contract/package.json +1 -1
  186. package/packages/telemetry-contract/src/index.ts +1 -0
  187. package/packages/telemetry-contract/src/schemas.ts +63 -5
  188. package/packages/telemetry-contract/src/types.ts +97 -7
  189. package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
  190. package/packages/ui/AGENTS.md +16 -0
  191. package/packages/ui/README.md +1 -1
  192. package/packages/ui/package.json +1 -1
  193. package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
  194. package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
  195. package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
  196. package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
  197. package/packages/ui/src/components/InfoTip.tsx +1 -2
  198. package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
  199. package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
  200. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
  201. package/packages/ui/src/components/OverviewPanels.tsx +693 -0
  202. package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
  203. package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
  204. package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
  205. package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
  206. package/packages/ui/src/components/index.ts +56 -1
  207. package/packages/ui/src/components/section-cards.tsx +18 -35
  208. package/packages/ui/src/components/skill-health-grid.tsx +47 -37
  209. package/packages/ui/src/lib/constants.tsx +0 -1
  210. package/packages/ui/src/primitives/card.tsx +1 -1
  211. package/packages/ui/src/primitives/checkbox.tsx +1 -1
  212. package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
  213. package/packages/ui/src/primitives/select.tsx +2 -2
  214. package/packages/ui/src/primitives/tabs.tsx +7 -6
  215. package/packages/ui/src/types.ts +182 -4
  216. package/skill/SKILL.md +130 -318
  217. package/skill/agents/diagnosis-analyst.md +3 -3
  218. package/skill/agents/evolution-reviewer.md +3 -3
  219. package/skill/agents/integration-guide.md +3 -3
  220. package/skill/agents/pattern-analyst.md +2 -2
  221. package/skill/references/cli-quick-reference.md +89 -0
  222. package/skill/references/creator-playbook.md +131 -0
  223. package/skill/references/examples.md +48 -0
  224. package/skill/references/troubleshooting.md +47 -0
  225. package/skill/references/version-history.md +1 -1
  226. package/skill/selftune.contribute.json +11 -0
  227. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  228. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  229. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  230. package/skill/workflows/CreateTestDeploy.md +170 -0
  231. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  232. package/skill/{Workflows → workflows}/Cron.md +1 -1
  233. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  234. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  235. package/skill/{Workflows → workflows}/Evals.md +67 -2
  236. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  237. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  238. package/skill/{Workflows → workflows}/Grade.md +1 -1
  239. package/skill/{Workflows → workflows}/Ingest.md +60 -2
  240. package/skill/{Workflows → workflows}/Initialize.md +16 -9
  241. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  242. package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
  243. package/skill/workflows/Registry.md +99 -0
  244. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  245. package/skill/workflows/SignalsDashboard.md +87 -0
  246. package/skill/{Workflows → workflows}/Sync.md +3 -1
  247. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  248. package/skill/{Workflows → workflows}/Watch.md +42 -2
  249. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  250. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
  251. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
  252. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  253. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
  254. package/cli/selftune/utils/html.ts +0 -27
  255. package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
  256. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  257. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  258. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  259. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  260. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  261. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  262. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  263. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  264. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  265. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  266. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  267. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  268. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  269. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  270. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -6,17 +6,14 @@
6
6
  * to determine whether the proposal is an improvement.
7
7
  */
8
8
 
9
- import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
9
+ import type {
10
+ EvalEntry,
11
+ EvolutionProposal,
12
+ InvocationTypeScores,
13
+ ValidationMode,
14
+ } from "../types.js";
10
15
  import { callLlm, type EffortLevel } from "../utils/llm-call.js";
11
- import {
12
- buildBatchTriggerCheckPrompt,
13
- buildTriggerCheckPrompt,
14
- parseBatchTriggerResponse,
15
- parseTriggerResponse,
16
- } from "../utils/trigger-check.js";
17
-
18
- // Re-export so existing consumers don't break
19
- export { buildTriggerCheckPrompt, parseTriggerResponse };
16
+ import { buildBatchTriggerCheckPrompt, parseBatchTriggerResponse } from "../utils/trigger-check.js";
20
17
 
21
18
  /** Number of eval queries to batch into a single LLM call.
22
19
  * Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
@@ -40,147 +37,11 @@ export interface ValidationResult {
40
37
  net_change: number; // after - before pass rate
41
38
  by_invocation_type?: InvocationTypeScores;
42
39
  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
43
- validation_mode?: "llm_judge";
40
+ validation_mode?: ValidationMode;
44
41
  validation_agent?: string;
45
- }
46
-
47
- // ---------------------------------------------------------------------------
48
- // Proposal validation
49
- // ---------------------------------------------------------------------------
50
-
51
- /** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
52
- export async function validateProposalSequential(
53
- proposal: EvolutionProposal,
54
- evalSet: EvalEntry[],
55
- agent: string,
56
- modelFlag?: string,
57
- effort?: EffortLevel,
58
- ): Promise<ValidationResult> {
59
- if (evalSet.length === 0) {
60
- return {
61
- proposal_id: proposal.proposal_id,
62
- before_pass_rate: 0,
63
- after_pass_rate: 0,
64
- improved: false,
65
- regressions: [],
66
- new_passes: [],
67
- net_change: 0,
68
- validation_mode: "llm_judge",
69
- validation_agent: agent,
70
- };
71
- }
72
-
73
- const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
74
- const regressions: EvalEntry[] = [];
75
- const newPasses: EvalEntry[] = [];
76
- const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
77
- [];
78
- let beforePassed = 0;
79
- let afterPassed = 0;
80
-
81
- for (const entry of evalSet) {
82
- // Check with original description
83
- const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
84
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort);
85
- const beforeTriggered = parseTriggerResponse(beforeRaw);
86
- const beforePass =
87
- (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
88
-
89
- // Check with proposed description
90
- const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
91
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort);
92
- const afterTriggered = parseTriggerResponse(afterRaw);
93
- const afterPass =
94
- (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
95
-
96
- if (beforePass) beforePassed++;
97
- if (afterPass) afterPassed++;
98
-
99
- // Regression: passed before, fails after
100
- if (beforePass && !afterPass) {
101
- regressions.push(entry);
102
- }
103
-
104
- // New pass: failed before, passes after
105
- if (!beforePass && afterPass) {
106
- newPasses.push(entry);
107
- }
108
-
109
- perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
110
- }
111
-
112
- const total = evalSet.length;
113
- const beforePassRate = beforePassed / total;
114
- const afterPassRate = afterPassed / total;
115
- const netChange = afterPassRate - beforePassRate;
116
-
117
- // A proposal is improved when ALL of:
118
- // - after_pass_rate > before_pass_rate
119
- // - regressions count < 5% of total eval entries
120
- // - Either net improvement >= 0.10 OR new_passes.length >= 2
121
- const improved =
122
- afterPassRate > beforePassRate &&
123
- regressions.length < total * 0.05 &&
124
- (netChange >= 0.1 || newPasses.length >= 2);
125
-
126
- // Compute per-invocation-type scores (initialize all required keys)
127
- const byInvocationType: Record<string, { passed: number; total: number }> = {
128
- explicit: { passed: 0, total: 0 },
129
- implicit: { passed: 0, total: 0 },
130
- contextual: { passed: 0, total: 0 },
131
- negative: { passed: 0, total: 0 },
132
- };
133
- for (const r of perEntryResults) {
134
- const type = r.entry.invocation_type ?? "implicit";
135
- if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
136
- byInvocationType[type].total++;
137
- if (r.after_pass) byInvocationType[type].passed++;
138
- }
139
-
140
- const invocationScores: InvocationTypeScores = {
141
- explicit: {
142
- ...byInvocationType.explicit,
143
- pass_rate:
144
- byInvocationType.explicit.total > 0
145
- ? byInvocationType.explicit.passed / byInvocationType.explicit.total
146
- : 0,
147
- },
148
- implicit: {
149
- ...byInvocationType.implicit,
150
- pass_rate:
151
- byInvocationType.implicit.total > 0
152
- ? byInvocationType.implicit.passed / byInvocationType.implicit.total
153
- : 0,
154
- },
155
- contextual: {
156
- ...byInvocationType.contextual,
157
- pass_rate:
158
- byInvocationType.contextual.total > 0
159
- ? byInvocationType.contextual.passed / byInvocationType.contextual.total
160
- : 0,
161
- },
162
- negative: {
163
- ...byInvocationType.negative,
164
- pass_rate:
165
- byInvocationType.negative.total > 0
166
- ? byInvocationType.negative.passed / byInvocationType.negative.total
167
- : 0,
168
- },
169
- };
170
-
171
- return {
172
- proposal_id: proposal.proposal_id,
173
- before_pass_rate: beforePassRate,
174
- after_pass_rate: afterPassRate,
175
- improved,
176
- regressions,
177
- new_passes: newPasses,
178
- net_change: netChange,
179
- by_invocation_type: invocationScores,
180
- per_entry_results: perEntryResults,
181
- validation_mode: "llm_judge",
182
- validation_agent: agent,
183
- };
42
+ validation_fixture_id?: string;
43
+ validation_fallback_reason?: string;
44
+ before_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
184
45
  }
185
46
 
186
47
  // ---------------------------------------------------------------------------
@@ -3,6 +3,9 @@
3
3
  *
4
4
  * Validates a routing table evolution proposal by checking structural validity
5
5
  * and running trigger accuracy checks against an eval set.
6
+ *
7
+ * Delegates replay-based and judge-based validation to dedicated engines
8
+ * (engines/replay-engine.ts and engines/judge-engine.ts).
6
9
  */
7
10
 
8
11
  import type {
@@ -10,28 +13,12 @@ import type {
10
13
  BodyValidationResult,
11
14
  EvalEntry,
12
15
  RoutingReplayEntryResult,
13
- RoutingReplayFixture,
16
+ ValidationGate,
14
17
  ValidationMode,
15
18
  } from "../types.js";
16
- import { callLlm } from "../utils/llm-call.js";
17
- import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
18
- import { runHostReplayFixture } from "./validate-host-replay.js";
19
-
20
- export interface RoutingReplayRunnerInput {
21
- routing: string;
22
- evalSet: EvalEntry[];
23
- agent: string;
24
- fixture: RoutingReplayFixture;
25
- }
26
-
27
- export type RoutingReplayRunner = (
28
- input: RoutingReplayRunnerInput,
29
- ) => Promise<RoutingReplayEntryResult[]>;
30
-
31
- export interface RoutingValidationOptions {
32
- replayFixture?: RoutingReplayFixture;
33
- replayRunner?: RoutingReplayRunner;
34
- }
19
+ import { runJudgeValidation } from "./engines/judge-engine.js";
20
+ import { type ReplayValidationOptions } from "./engines/replay-engine.js";
21
+ import { runValidationContract, type ValidationStrategy } from "./validation-contract.js";
35
22
 
36
23
  export interface RoutingTriggerAccuracyResult {
37
24
  before_pass_rate: number;
@@ -40,7 +27,14 @@ export interface RoutingTriggerAccuracyResult {
40
27
  validation_mode: ValidationMode;
41
28
  validation_agent: string;
42
29
  validation_fixture_id?: string;
30
+ validation_fallback_reason?: string;
43
31
  per_entry_results?: RoutingReplayEntryResult[];
32
+ before_entry_results?: RoutingReplayEntryResult[];
33
+ }
34
+
35
+ export interface RoutingValidationOptions extends ReplayValidationOptions {
36
+ mode?: ValidationStrategy;
37
+ onReplayFallback?: (reason?: string) => void;
44
38
  }
45
39
 
46
40
  // ---------------------------------------------------------------------------
@@ -104,6 +98,9 @@ export function validateRoutingStructure(routing: string): { valid: boolean; rea
104
98
  /**
105
99
  * Run before/after trigger checks on the eval set using the routing content.
106
100
  * Returns pass rates for comparison.
101
+ *
102
+ * Prefers host/runtime replay when a runtime runner is available,
103
+ * falls back to LLM judge otherwise.
107
104
  */
108
105
  export async function validateRoutingTriggerAccuracy(
109
106
  originalRouting: string,
@@ -123,94 +120,38 @@ export async function validateRoutingTriggerAccuracy(
123
120
  };
124
121
  }
125
122
 
126
- if (options.replayFixture && options.replayRunner) {
127
- const beforeResults = await options.replayRunner({
128
- routing: originalRouting,
129
- evalSet,
130
- agent,
131
- fixture: options.replayFixture,
132
- });
133
- const afterResults = await options.replayRunner({
134
- routing: proposedRouting,
135
- evalSet,
136
- agent,
137
- fixture: options.replayFixture,
138
- });
139
- const beforePassed = beforeResults.filter((result) => result.passed).length;
140
- const afterPassed = afterResults.filter((result) => result.passed).length;
141
- const total = evalSet.length;
142
-
143
- return {
144
- before_pass_rate: beforePassed / total,
145
- after_pass_rate: afterPassed / total,
146
- improved: afterPassed > beforePassed,
147
- validation_mode: "host_replay",
148
- validation_agent: agent,
149
- validation_fixture_id: options.replayFixture.fixture_id,
150
- per_entry_results: afterResults,
151
- };
152
- }
153
-
154
- if (options.replayFixture) {
155
- const beforeResults = runHostReplayFixture({
156
- routing: originalRouting,
157
- evalSet,
158
- fixture: options.replayFixture,
159
- });
160
- const afterResults = runHostReplayFixture({
161
- routing: proposedRouting,
162
- evalSet,
163
- fixture: options.replayFixture,
164
- });
165
- const beforePassed = beforeResults.filter((result) => result.passed).length;
166
- const afterPassed = afterResults.filter((result) => result.passed).length;
167
- const total = evalSet.length;
168
-
169
- return {
170
- before_pass_rate: beforePassed / total,
171
- after_pass_rate: afterPassed / total,
172
- improved: afterPassed > beforePassed,
173
- validation_mode: "host_replay",
174
- validation_agent: agent,
175
- validation_fixture_id: options.replayFixture.fixture_id,
176
- per_entry_results: afterResults,
177
- };
178
- }
179
-
180
- const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
181
- let beforePassed = 0;
182
- let afterPassed = 0;
183
-
184
- for (const entry of evalSet) {
185
- // Check with original routing
186
- const beforePrompt = buildTriggerCheckPrompt(originalRouting, entry.query);
187
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag);
188
- const beforeTriggered = parseTriggerResponse(beforeRaw);
189
- const beforePass =
190
- (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
191
-
192
- // Check with proposed routing
193
- const afterPrompt = buildTriggerCheckPrompt(proposedRouting, entry.query);
194
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag);
195
- const afterTriggered = parseTriggerResponse(afterRaw);
196
- const afterPass =
197
- (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
198
-
199
- if (beforePass) beforePassed++;
200
- if (afterPass) afterPassed++;
201
- }
202
-
203
- const total = evalSet.length;
204
- const beforePassRate = beforePassed / total;
205
- const afterPassRate = afterPassed / total;
123
+ const { result, fallbackReason } = await runValidationContract<RoutingTriggerAccuracyResult>({
124
+ mode: options.mode ?? "auto",
125
+ originalContent: originalRouting,
126
+ proposedContent: proposedRouting,
127
+ evalSet,
128
+ agent,
129
+ replayOptions: options,
130
+ runJudge: async () => {
131
+ const judgeResult = await runJudgeValidation(
132
+ originalRouting,
133
+ proposedRouting,
134
+ evalSet,
135
+ agent,
136
+ modelFlag,
137
+ );
138
+
139
+ return {
140
+ result: {
141
+ before_pass_rate: judgeResult.before_pass_rate,
142
+ after_pass_rate: judgeResult.after_pass_rate,
143
+ improved: judgeResult.improved,
144
+ validation_mode: judgeResult.validation_mode,
145
+ validation_agent: judgeResult.validation_agent,
146
+ },
147
+ modeUsed: judgeResult.validation_mode,
148
+ };
149
+ },
150
+ onReplayFallback: options.onReplayFallback,
151
+ adaptReplayResult: (replayResult) => replayResult,
152
+ });
206
153
 
207
- return {
208
- before_pass_rate: beforePassRate,
209
- after_pass_rate: afterPassRate,
210
- improved: afterPassRate > beforePassRate,
211
- validation_mode: "llm_judge",
212
- validation_agent: agent,
213
- };
154
+ return fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result;
214
155
  }
215
156
 
216
157
  // ---------------------------------------------------------------------------
@@ -225,7 +166,7 @@ export async function validateRoutingProposal(
225
166
  modelFlag?: string,
226
167
  options: RoutingValidationOptions = {},
227
168
  ): Promise<BodyValidationResult> {
228
- const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
169
+ const gateResults: Array<{ gate: ValidationGate; passed: boolean; reason: string }> = [];
229
170
 
230
171
  // Gate 1: Structural validation
231
172
  const structural = validateRoutingStructure(proposal.proposed_body);
@@ -277,8 +218,10 @@ export async function validateRoutingProposal(
277
218
  validation_mode: accuracy.validation_mode,
278
219
  validation_agent: accuracy.validation_agent,
279
220
  validation_fixture_id: accuracy.validation_fixture_id,
221
+ validation_fallback_reason: accuracy.validation_fallback_reason,
280
222
  before_pass_rate: accuracy.before_pass_rate,
281
223
  after_pass_rate: accuracy.after_pass_rate,
282
224
  per_entry_results: accuracy.per_entry_results,
225
+ before_entry_results: accuracy.before_entry_results,
283
226
  };
284
227
  }
@@ -0,0 +1,91 @@
1
+ import type { EvalEntry, ValidationMode } from "../types.js";
2
+ import { CLIError } from "../utils/cli-error.js";
3
+ import {
4
+ runReplayValidation,
5
+ type ReplayValidationOptions,
6
+ type ReplayValidationResult,
7
+ } from "./engines/replay-engine.js";
8
+
9
+ export type ValidationStrategy = "auto" | "replay" | "judge";
10
+
11
+ export const DEFAULT_VALIDATION_STRATEGY: ValidationStrategy = "auto";
12
+
13
+ export interface ValidationExecutionResult<TResult> {
14
+ result: TResult;
15
+ modeUsed: ValidationMode;
16
+ fallbackReason?: string;
17
+ }
18
+
19
+ export interface ValidationContractOptions<TResult> {
20
+ mode?: ValidationStrategy;
21
+ originalContent: string;
22
+ proposedContent: string;
23
+ evalSet: EvalEntry[];
24
+ agent: string;
25
+ replayOptions?: ReplayValidationOptions;
26
+ runJudge: () => Promise<ValidationExecutionResult<TResult>>;
27
+ adaptReplayResult: (replayResult: ReplayValidationResult) => TResult;
28
+ onReplayFallback?: (reason?: string) => void;
29
+ }
30
+
31
+ export function hasReplayValidationPath(
32
+ replayOptions?: ReplayValidationOptions,
33
+ ): replayOptions is ReplayValidationOptions {
34
+ return Boolean(replayOptions?.replayFixture || replayOptions?.replayRunner);
35
+ }
36
+
37
+ export function createReplayUnavailableError(reason?: string): CLIError {
38
+ const message = reason
39
+ ? `Replay validation requested but real host/runtime replay is unavailable: ${reason}`
40
+ : "Replay validation requested but real host/runtime replay is unavailable.";
41
+ return new CLIError(
42
+ message,
43
+ "REPLAY_UNAVAILABLE",
44
+ "Use --validation-mode auto to allow LLM judge fallback, or run selftune on a host/agent with runtime replay support for this skill.",
45
+ );
46
+ }
47
+
48
+ export async function runValidationContract<TResult>(
49
+ options: ValidationContractOptions<TResult>,
50
+ ): Promise<ValidationExecutionResult<TResult>> {
51
+ const mode = options.mode ?? DEFAULT_VALIDATION_STRATEGY;
52
+
53
+ if (mode === "judge") {
54
+ return options.runJudge();
55
+ }
56
+
57
+ if (hasReplayValidationPath(options.replayOptions)) {
58
+ const replayAttempt = await runReplayValidation(
59
+ options.originalContent,
60
+ options.proposedContent,
61
+ options.evalSet,
62
+ options.agent,
63
+ options.replayOptions,
64
+ );
65
+
66
+ if (replayAttempt.result) {
67
+ return {
68
+ result: options.adaptReplayResult(replayAttempt.result),
69
+ modeUsed: replayAttempt.result.validation_mode,
70
+ };
71
+ }
72
+
73
+ if (mode === "replay") {
74
+ throw createReplayUnavailableError(replayAttempt.fallbackReason);
75
+ }
76
+
77
+ options.onReplayFallback?.(replayAttempt.fallbackReason);
78
+ const judgeResult = await options.runJudge();
79
+ return {
80
+ ...judgeResult,
81
+ fallbackReason: replayAttempt.fallbackReason,
82
+ };
83
+ }
84
+
85
+ if (mode === "replay") {
86
+ throw createReplayUnavailableError();
87
+ }
88
+
89
+ options.onReplayFallback?.();
90
+ return options.runJudge();
91
+ }
@@ -13,12 +13,16 @@ import { mkdirSync, writeFileSync } from "node:fs";
13
13
  import { dirname } from "node:path";
14
14
  import { parseArgs } from "node:util";
15
15
 
16
- import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
16
+ import { TELEMETRY_LOG } from "../constants.js";
17
17
  import { getDb } from "../localdb/db.js";
18
18
  import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
19
19
  import type { GradingResult, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
20
20
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
21
- import { detectAgent as _detectAgent } from "../utils/llm-call.js";
21
+ import {
22
+ detectLlmAgent as _detectAgent,
23
+ isLlmBackedAgent,
24
+ LLM_BACKED_AGENT_CANDIDATES,
25
+ } from "../utils/llm-call.js";
22
26
  import { readExcerpt } from "../utils/transcript.js";
23
27
  import {
24
28
  buildDefaultGradingOutputPath,
@@ -55,7 +59,7 @@ Options:
55
59
  --session-id Grade a specific session (auto-detects most recent if omitted)
56
60
  --telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
57
61
  --output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
58
- --agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
62
+ --agent Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
59
63
  --show-transcript Print transcript excerpt before grading
60
64
  -h, --help Show this help message`);
61
65
  process.exit(0);
@@ -68,9 +72,9 @@ Options:
68
72
 
69
73
  // --- Determine agent ---
70
74
  let agent: string | null = null;
71
- const validAgents = [...AGENT_CANDIDATES];
75
+ const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
72
76
  if (values.agent) {
73
- if (!validAgents.includes(values.agent)) {
77
+ if (!isLlmBackedAgent(values.agent)) {
74
78
  throw new CLIError(
75
79
  `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
76
80
  "INVALID_FLAG",
@@ -84,9 +88,9 @@ Options:
84
88
 
85
89
  if (!agent) {
86
90
  throw new CLIError(
87
- `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
91
+ `No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
88
92
  "AGENT_NOT_FOUND",
89
- "Install one of the supported agent CLIs",
93
+ "Install Claude Code, Codex, OpenCode, or Pi",
90
94
  );
91
95
  }
92
96
 
@@ -5,19 +5,14 @@
5
5
  * Rubric-based grader for Claude Code skill sessions.
6
6
  * Migrated from grade_session.py.
7
7
  *
8
- * Grades via an installed agent CLI selected from AGENT_CANDIDATES.
8
+ * Grades via an installed agent CLI selected from the LLM-backed agent set.
9
9
  */
10
10
 
11
11
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
12
12
  import { basename, dirname, join } from "node:path";
13
13
  import { parseArgs } from "node:util";
14
14
 
15
- import {
16
- AGENT_CANDIDATES,
17
- CLAUDE_CODE_PROJECTS_DIR,
18
- SELFTUNE_CONFIG_DIR,
19
- TELEMETRY_LOG,
20
- } from "../constants.js";
15
+ import { CLAUDE_CODE_PROJECTS_DIR, SELFTUNE_CONFIG_DIR, TELEMETRY_LOG } from "../constants.js";
21
16
  import { getDb } from "../localdb/db.js";
22
17
  import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
23
18
  import type {
@@ -31,7 +26,9 @@ import type {
31
26
  } from "../types.js";
32
27
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
33
28
  import {
34
- detectAgent as _detectAgent,
29
+ detectLlmAgent as _detectAgent,
30
+ isLlmBackedAgent,
31
+ LLM_BACKED_AGENT_CANDIDATES,
35
32
  stripMarkdownFences as _stripMarkdownFences,
36
33
  callViaAgent,
37
34
  } from "../utils/llm-call.js";
@@ -42,9 +39,6 @@ import {
42
39
  } from "../utils/transcript.js";
43
40
  import { type PreGateContext, runPreGates } from "./pre-gates.js";
44
41
 
45
- // Re-export for backward compatibility
46
- export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
47
-
48
42
  // ---------------------------------------------------------------------------
49
43
  // Constants
50
44
  // ---------------------------------------------------------------------------
@@ -756,7 +750,7 @@ Options:
756
750
  --transcript Path to transcript file
757
751
  --telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
758
752
  --output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
759
- --agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
753
+ --agent Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
760
754
  --show-transcript Print transcript excerpt before grading
761
755
  -h, --help Show this help message`);
762
756
  process.exit(0);
@@ -769,9 +763,9 @@ Options:
769
763
 
770
764
  // --- Determine agent ---
771
765
  let agent: string | null = null;
772
- const validAgents = [...AGENT_CANDIDATES];
766
+ const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
773
767
  if (values.agent) {
774
- if (!validAgents.includes(values.agent)) {
768
+ if (!isLlmBackedAgent(values.agent)) {
775
769
  throw new CLIError(
776
770
  `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
777
771
  "INVALID_FLAG",
@@ -785,9 +779,9 @@ Options:
785
779
 
786
780
  if (!agent) {
787
781
  throw new CLIError(
788
- `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
782
+ `No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
789
783
  "AGENT_NOT_FOUND",
790
- "Install claude, codex, or opencode CLI, then retry",
784
+ "Install Claude Code, Codex, OpenCode, or Pi, then retry",
791
785
  );
792
786
  }
793
787
 
@@ -25,7 +25,7 @@ import {
25
25
  getLatestPromptIdentity,
26
26
  } from "../normalization.js";
27
27
  import type { PostToolUsePayload, SkillUsageRecord } from "../types.js";
28
- import { classifySkillPath } from "../utils/skill-discovery.js";
28
+ import { classifySkillPath, isTestFixturePath } from "../utils/skill-discovery.js";
29
29
  import { getLastUserMessage } from "../utils/transcript.js";
30
30
 
31
31
  /**
@@ -122,6 +122,7 @@ export async function processToolUse(
122
122
  const skillName = extractSkillName(filePath);
123
123
 
124
124
  if (skillName === null) return null;
125
+ if (isTestFixturePath(filePath)) return null;
125
126
 
126
127
  const transcriptPath = payload.transcript_path ?? "";
127
128
  const sessionId = payload.session_id ?? "unknown";
@@ -83,6 +83,7 @@ export const PLATFORM_EVENT_MAP: Record<HookPlatform, Partial<Record<HookEventTy
83
83
  session_end: "TaskComplete",
84
84
  },
85
85
  pi: {
86
+ prompt_submit: "message",
86
87
  pre_tool_use: "tool_call",
87
88
  post_tool_use: "tool_result",
88
89
  session_end: "session_shutdown",