selftune 0.2.23 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +93 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  12. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  13. package/cli/selftune/auto-update.ts +200 -8
  14. package/cli/selftune/canonical-export.ts +55 -25
  15. package/cli/selftune/command-surface.ts +397 -0
  16. package/cli/selftune/contribute/contribute.ts +64 -13
  17. package/cli/selftune/contribution-config.ts +57 -3
  18. package/cli/selftune/contribution-preferences.ts +117 -0
  19. package/cli/selftune/contribution-signals.ts +8 -4
  20. package/cli/selftune/contribution-staging.ts +13 -2
  21. package/cli/selftune/contributions.ts +55 -121
  22. package/cli/selftune/creator-contributions.ts +29 -10
  23. package/cli/selftune/cron/setup.ts +7 -3
  24. package/cli/selftune/dashboard-contract.ts +73 -0
  25. package/cli/selftune/dashboard-server.ts +168 -17
  26. package/cli/selftune/dashboard.ts +350 -17
  27. package/cli/selftune/eval/baseline.ts +21 -5
  28. package/cli/selftune/eval/execution-eval.ts +170 -0
  29. package/cli/selftune/eval/family-overlap.ts +2 -2
  30. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  31. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  32. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  33. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  34. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  35. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  36. package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
  37. package/cli/selftune/evolution/evolve-body.ts +100 -39
  38. package/cli/selftune/evolution/evolve.ts +244 -52
  39. package/cli/selftune/evolution/rollback.ts +0 -1
  40. package/cli/selftune/evolution/validate-body.ts +68 -42
  41. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  42. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  43. package/cli/selftune/evolution/validate-routing.ts +43 -41
  44. package/cli/selftune/evolution/validation-contract.ts +91 -0
  45. package/cli/selftune/grading/auto-grade.ts +11 -7
  46. package/cli/selftune/grading/grade-session.ts +10 -16
  47. package/cli/selftune/index.ts +35 -10
  48. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  49. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  50. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  51. package/cli/selftune/ingestors/pi-ingest.ts +3 -2
  52. package/cli/selftune/init.ts +27 -3
  53. package/cli/selftune/localdb/direct-write.ts +35 -1
  54. package/cli/selftune/localdb/queries/cron.ts +34 -0
  55. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  56. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  57. package/cli/selftune/localdb/queries/execution.ts +133 -0
  58. package/cli/selftune/localdb/queries/json.ts +18 -0
  59. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  60. package/cli/selftune/localdb/queries/raw.ts +95 -0
  61. package/cli/selftune/localdb/queries/staging.ts +270 -0
  62. package/cli/selftune/localdb/queries/trust.ts +392 -0
  63. package/cli/selftune/localdb/queries.ts +60 -2288
  64. package/cli/selftune/localdb/schema.ts +21 -0
  65. package/cli/selftune/monitoring/watch.ts +96 -29
  66. package/cli/selftune/normalization.ts +3 -0
  67. package/cli/selftune/observability.ts +4 -2
  68. package/cli/selftune/orchestrate/cli.ts +161 -0
  69. package/cli/selftune/orchestrate/execute.ts +295 -0
  70. package/cli/selftune/orchestrate/finalize.ts +157 -0
  71. package/cli/selftune/orchestrate/locks.ts +40 -0
  72. package/cli/selftune/orchestrate/plan.ts +131 -0
  73. package/cli/selftune/orchestrate/post-run.ts +59 -0
  74. package/cli/selftune/orchestrate/prepare.ts +334 -0
  75. package/cli/selftune/orchestrate/report.ts +182 -0
  76. package/cli/selftune/orchestrate/runtime.ts +120 -0
  77. package/cli/selftune/orchestrate/signals.ts +48 -0
  78. package/cli/selftune/orchestrate.ts +150 -1173
  79. package/cli/selftune/repair/skill-usage.ts +5 -2
  80. package/cli/selftune/routes/overview.ts +5 -2
  81. package/cli/selftune/routes/skill-report.ts +15 -2
  82. package/cli/selftune/schedule.ts +5 -5
  83. package/cli/selftune/status.ts +39 -2
  84. package/cli/selftune/testing-readiness.ts +597 -0
  85. package/cli/selftune/types.ts +44 -4
  86. package/cli/selftune/uninstall.ts +2 -1
  87. package/cli/selftune/utils/canonical-log.ts +1 -9
  88. package/cli/selftune/utils/cli-error.ts +9 -0
  89. package/cli/selftune/utils/llm-call.ts +126 -6
  90. package/cli/selftune/utils/skill-discovery.ts +2 -0
  91. package/cli/selftune/workflows/proposals.ts +184 -0
  92. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  93. package/cli/selftune/workflows/workflows.ts +100 -26
  94. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  95. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  96. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  97. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  98. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
  99. package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
  100. package/package.json +25 -9
  101. package/packages/dashboard-core/AGENTS.md +18 -0
  102. package/packages/dashboard-core/README.md +30 -0
  103. package/packages/dashboard-core/index.ts +3 -0
  104. package/packages/dashboard-core/package.json +39 -0
  105. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  106. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  107. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  108. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  109. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  110. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  111. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  112. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  113. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  114. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  115. package/packages/dashboard-core/src/gates/index.ts +3 -0
  116. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  117. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  118. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  119. package/packages/dashboard-core/src/host/index.ts +3 -0
  120. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  121. package/packages/dashboard-core/src/models/index.ts +4 -0
  122. package/packages/dashboard-core/src/models/overview.ts +98 -0
  123. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  124. package/packages/dashboard-core/src/models/skills.ts +34 -0
  125. package/packages/dashboard-core/src/routes/index.ts +2 -0
  126. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  127. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  128. package/packages/dashboard-core/src/routes/types.ts +39 -0
  129. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  130. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  131. package/packages/dashboard-core/src/screens/index.ts +37 -0
  132. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  133. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  134. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  135. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  136. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  137. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  138. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  139. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  140. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  141. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  142. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  143. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  144. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  145. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  146. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  147. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  148. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  149. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  150. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  151. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  152. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  153. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  154. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  155. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  156. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  157. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  158. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  159. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  160. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  161. package/packages/telemetry-contract/src/schemas.ts +41 -1
  162. package/packages/telemetry-contract/src/types.ts +103 -2
  163. package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
  164. package/packages/ui/src/components/OverviewPanels.tsx +67 -26
  165. package/packages/ui/src/primitives/tabs.tsx +7 -6
  166. package/packages/ui/src/types.ts +10 -0
  167. package/skill/SKILL.md +130 -332
  168. package/skill/agents/diagnosis-analyst.md +3 -3
  169. package/skill/agents/evolution-reviewer.md +3 -3
  170. package/skill/agents/integration-guide.md +3 -3
  171. package/skill/agents/pattern-analyst.md +2 -2
  172. package/skill/references/cli-quick-reference.md +89 -0
  173. package/skill/references/creator-playbook.md +131 -0
  174. package/skill/references/examples.md +48 -0
  175. package/skill/references/troubleshooting.md +47 -0
  176. package/skill/references/version-history.md +1 -1
  177. package/skill/selftune.contribute.json +11 -0
  178. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  179. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  180. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  181. package/skill/workflows/CreateTestDeploy.md +170 -0
  182. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  183. package/skill/{Workflows → workflows}/Cron.md +1 -1
  184. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  185. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  186. package/skill/{Workflows → workflows}/Evals.md +67 -2
  187. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  188. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  189. package/skill/{Workflows → workflows}/Grade.md +1 -1
  190. package/skill/{Workflows → workflows}/Initialize.md +8 -4
  191. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  192. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  193. package/skill/workflows/SignalsDashboard.md +87 -0
  194. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  195. package/skill/{Workflows → workflows}/Watch.md +42 -2
  196. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  197. package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
  198. package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
  199. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  200. package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
  201. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  202. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  203. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  204. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  205. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  206. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  207. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  208. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  209. /package/skill/{Workflows → workflows}/Ingest.md +0 -0
  210. /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
  211. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  212. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  213. /package/skill/{Workflows → workflows}/Registry.md +0 -0
  214. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  215. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  216. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  217. /package/skill/{Workflows → workflows}/Sync.md +0 -0
  218. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  219. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -6,17 +6,14 @@
6
6
  * to determine whether the proposal is an improvement.
7
7
  */
8
8
 
9
- import type { EvalEntry, EvolutionProposal, InvocationTypeScores } from "../types.js";
9
+ import type {
10
+ EvalEntry,
11
+ EvolutionProposal,
12
+ InvocationTypeScores,
13
+ ValidationMode,
14
+ } from "../types.js";
10
15
  import { callLlm, type EffortLevel } from "../utils/llm-call.js";
11
- import {
12
- buildBatchTriggerCheckPrompt,
13
- buildTriggerCheckPrompt,
14
- parseBatchTriggerResponse,
15
- parseTriggerResponse,
16
- } from "../utils/trigger-check.js";
17
-
18
- // Re-export so existing consumers don't break
19
- export { buildTriggerCheckPrompt, parseTriggerResponse };
16
+ import { buildBatchTriggerCheckPrompt, parseBatchTriggerResponse } from "../utils/trigger-check.js";
20
17
 
21
18
  /** Number of eval queries to batch into a single LLM call.
22
19
  * Higher = fewer claude -p spawns = much faster (each spawn has ~30-60s overhead).
@@ -40,147 +37,11 @@ export interface ValidationResult {
40
37
  net_change: number; // after - before pass rate
41
38
  by_invocation_type?: InvocationTypeScores;
42
39
  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
43
- validation_mode?: "llm_judge";
40
+ validation_mode?: ValidationMode;
44
41
  validation_agent?: string;
45
- }
46
-
47
- // ---------------------------------------------------------------------------
48
- // Proposal validation
49
- // ---------------------------------------------------------------------------
50
-
51
- /** Validate a proposal sequentially (one LLM call per query). Kept for backward compat. */
52
- export async function validateProposalSequential(
53
- proposal: EvolutionProposal,
54
- evalSet: EvalEntry[],
55
- agent: string,
56
- modelFlag?: string,
57
- effort?: EffortLevel,
58
- ): Promise<ValidationResult> {
59
- if (evalSet.length === 0) {
60
- return {
61
- proposal_id: proposal.proposal_id,
62
- before_pass_rate: 0,
63
- after_pass_rate: 0,
64
- improved: false,
65
- regressions: [],
66
- new_passes: [],
67
- net_change: 0,
68
- validation_mode: "llm_judge",
69
- validation_agent: agent,
70
- };
71
- }
72
-
73
- const systemPrompt = "You are an evaluation assistant. Answer only YES or NO.";
74
- const regressions: EvalEntry[] = [];
75
- const newPasses: EvalEntry[] = [];
76
- const perEntryResults: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }> =
77
- [];
78
- let beforePassed = 0;
79
- let afterPassed = 0;
80
-
81
- for (const entry of evalSet) {
82
- // Check with original description
83
- const beforePrompt = buildTriggerCheckPrompt(proposal.original_description, entry.query);
84
- const beforeRaw = await callLlm(systemPrompt, beforePrompt, agent, modelFlag, effort);
85
- const beforeTriggered = parseTriggerResponse(beforeRaw);
86
- const beforePass =
87
- (entry.should_trigger && beforeTriggered) || (!entry.should_trigger && !beforeTriggered);
88
-
89
- // Check with proposed description
90
- const afterPrompt = buildTriggerCheckPrompt(proposal.proposed_description, entry.query);
91
- const afterRaw = await callLlm(systemPrompt, afterPrompt, agent, modelFlag, effort);
92
- const afterTriggered = parseTriggerResponse(afterRaw);
93
- const afterPass =
94
- (entry.should_trigger && afterTriggered) || (!entry.should_trigger && !afterTriggered);
95
-
96
- if (beforePass) beforePassed++;
97
- if (afterPass) afterPassed++;
98
-
99
- // Regression: passed before, fails after
100
- if (beforePass && !afterPass) {
101
- regressions.push(entry);
102
- }
103
-
104
- // New pass: failed before, passes after
105
- if (!beforePass && afterPass) {
106
- newPasses.push(entry);
107
- }
108
-
109
- perEntryResults.push({ entry, before_pass: beforePass, after_pass: afterPass });
110
- }
111
-
112
- const total = evalSet.length;
113
- const beforePassRate = beforePassed / total;
114
- const afterPassRate = afterPassed / total;
115
- const netChange = afterPassRate - beforePassRate;
116
-
117
- // A proposal is improved when ALL of:
118
- // - after_pass_rate > before_pass_rate
119
- // - regressions count < 5% of total eval entries
120
- // - Either net improvement >= 0.10 OR new_passes.length >= 2
121
- const improved =
122
- afterPassRate > beforePassRate &&
123
- regressions.length < total * 0.05 &&
124
- (netChange >= 0.1 || newPasses.length >= 2);
125
-
126
- // Compute per-invocation-type scores (initialize all required keys)
127
- const byInvocationType: Record<string, { passed: number; total: number }> = {
128
- explicit: { passed: 0, total: 0 },
129
- implicit: { passed: 0, total: 0 },
130
- contextual: { passed: 0, total: 0 },
131
- negative: { passed: 0, total: 0 },
132
- };
133
- for (const r of perEntryResults) {
134
- const type = r.entry.invocation_type ?? "implicit";
135
- if (!byInvocationType[type]) byInvocationType[type] = { passed: 0, total: 0 };
136
- byInvocationType[type].total++;
137
- if (r.after_pass) byInvocationType[type].passed++;
138
- }
139
-
140
- const invocationScores: InvocationTypeScores = {
141
- explicit: {
142
- ...byInvocationType.explicit,
143
- pass_rate:
144
- byInvocationType.explicit.total > 0
145
- ? byInvocationType.explicit.passed / byInvocationType.explicit.total
146
- : 0,
147
- },
148
- implicit: {
149
- ...byInvocationType.implicit,
150
- pass_rate:
151
- byInvocationType.implicit.total > 0
152
- ? byInvocationType.implicit.passed / byInvocationType.implicit.total
153
- : 0,
154
- },
155
- contextual: {
156
- ...byInvocationType.contextual,
157
- pass_rate:
158
- byInvocationType.contextual.total > 0
159
- ? byInvocationType.contextual.passed / byInvocationType.contextual.total
160
- : 0,
161
- },
162
- negative: {
163
- ...byInvocationType.negative,
164
- pass_rate:
165
- byInvocationType.negative.total > 0
166
- ? byInvocationType.negative.passed / byInvocationType.negative.total
167
- : 0,
168
- },
169
- };
170
-
171
- return {
172
- proposal_id: proposal.proposal_id,
173
- before_pass_rate: beforePassRate,
174
- after_pass_rate: afterPassRate,
175
- improved,
176
- regressions,
177
- new_passes: newPasses,
178
- net_change: netChange,
179
- by_invocation_type: invocationScores,
180
- per_entry_results: perEntryResults,
181
- validation_mode: "llm_judge",
182
- validation_agent: agent,
183
- };
42
+ validation_fixture_id?: string;
43
+ validation_fallback_reason?: string;
44
+ before_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
184
45
  }
185
46
 
186
47
  // ---------------------------------------------------------------------------
@@ -13,20 +13,12 @@ import type {
13
13
  BodyValidationResult,
14
14
  EvalEntry,
15
15
  RoutingReplayEntryResult,
16
+ ValidationGate,
16
17
  ValidationMode,
17
18
  } from "../types.js";
18
19
  import { runJudgeValidation } from "./engines/judge-engine.js";
19
- import {
20
- runReplayValidation,
21
- type ReplayRunner,
22
- type ReplayRunnerInput,
23
- type ReplayValidationOptions,
24
- } from "./engines/replay-engine.js";
25
-
26
- // Re-export engine types for backward compatibility
27
- export type { ReplayRunnerInput as RoutingReplayRunnerInput };
28
- export type { ReplayRunner as RoutingReplayRunner };
29
- export type { ReplayValidationOptions as RoutingValidationOptions };
20
+ import { type ReplayValidationOptions } from "./engines/replay-engine.js";
21
+ import { runValidationContract, type ValidationStrategy } from "./validation-contract.js";
30
22
 
31
23
  export interface RoutingTriggerAccuracyResult {
32
24
  before_pass_rate: number;
@@ -35,10 +27,16 @@ export interface RoutingTriggerAccuracyResult {
35
27
  validation_mode: ValidationMode;
36
28
  validation_agent: string;
37
29
  validation_fixture_id?: string;
30
+ validation_fallback_reason?: string;
38
31
  per_entry_results?: RoutingReplayEntryResult[];
39
32
  before_entry_results?: RoutingReplayEntryResult[];
40
33
  }
41
34
 
35
+ export interface RoutingValidationOptions extends ReplayValidationOptions {
36
+ mode?: ValidationStrategy;
37
+ onReplayFallback?: (reason?: string) => void;
38
+ }
39
+
42
40
  // ---------------------------------------------------------------------------
43
41
  // Structural validation
44
42
  // ---------------------------------------------------------------------------
@@ -101,7 +99,7 @@ export function validateRoutingStructure(routing: string): { valid: boolean; rea
101
99
  * Run before/after trigger checks on the eval set using the routing content.
102
100
  * Returns pass rates for comparison.
103
101
  *
104
- * Prefers replay-backed validation when a fixture is available,
102
+ * Prefers host/runtime replay when a runtime runner is available,
105
103
  * falls back to LLM judge otherwise.
106
104
  */
107
105
  export async function validateRoutingTriggerAccuracy(
@@ -110,7 +108,7 @@ export async function validateRoutingTriggerAccuracy(
110
108
  evalSet: EvalEntry[],
111
109
  agent: string,
112
110
  modelFlag?: string,
113
- options: ReplayValidationOptions = {},
111
+ options: RoutingValidationOptions = {},
114
112
  ): Promise<RoutingTriggerAccuracyResult> {
115
113
  if (evalSet.length === 0) {
116
114
  return {
@@ -122,35 +120,38 @@ export async function validateRoutingTriggerAccuracy(
122
120
  };
123
121
  }
124
122
 
125
- // Try replay-backed validation first
126
- const replayResult = await runReplayValidation(
127
- originalRouting,
128
- proposedRouting,
129
- evalSet,
130
- agent,
131
- options,
132
- );
133
-
134
- if (replayResult) {
135
- return replayResult;
136
- }
137
-
138
- // Fall back to LLM judge
139
- const judgeResult = await runJudgeValidation(
140
- originalRouting,
141
- proposedRouting,
123
+ const { result, fallbackReason } = await runValidationContract<RoutingTriggerAccuracyResult>({
124
+ mode: options.mode ?? "auto",
125
+ originalContent: originalRouting,
126
+ proposedContent: proposedRouting,
142
127
  evalSet,
143
128
  agent,
144
- modelFlag,
145
- );
129
+ replayOptions: options,
130
+ runJudge: async () => {
131
+ const judgeResult = await runJudgeValidation(
132
+ originalRouting,
133
+ proposedRouting,
134
+ evalSet,
135
+ agent,
136
+ modelFlag,
137
+ );
138
+
139
+ return {
140
+ result: {
141
+ before_pass_rate: judgeResult.before_pass_rate,
142
+ after_pass_rate: judgeResult.after_pass_rate,
143
+ improved: judgeResult.improved,
144
+ validation_mode: judgeResult.validation_mode,
145
+ validation_agent: judgeResult.validation_agent,
146
+ },
147
+ modeUsed: judgeResult.validation_mode,
148
+ };
149
+ },
150
+ onReplayFallback: options.onReplayFallback,
151
+ adaptReplayResult: (replayResult) => replayResult,
152
+ });
146
153
 
147
- return {
148
- before_pass_rate: judgeResult.before_pass_rate,
149
- after_pass_rate: judgeResult.after_pass_rate,
150
- improved: judgeResult.improved,
151
- validation_mode: judgeResult.validation_mode,
152
- validation_agent: judgeResult.validation_agent,
153
- };
154
+ return fallbackReason ? { ...result, validation_fallback_reason: fallbackReason } : result;
154
155
  }
155
156
 
156
157
  // ---------------------------------------------------------------------------
@@ -163,9 +164,9 @@ export async function validateRoutingProposal(
163
164
  evalSet: EvalEntry[],
164
165
  agent: string,
165
166
  modelFlag?: string,
166
- options: ReplayValidationOptions = {},
167
+ options: RoutingValidationOptions = {},
167
168
  ): Promise<BodyValidationResult> {
168
- const gateResults: Array<{ gate: string; passed: boolean; reason: string }> = [];
169
+ const gateResults: Array<{ gate: ValidationGate; passed: boolean; reason: string }> = [];
169
170
 
170
171
  // Gate 1: Structural validation
171
172
  const structural = validateRoutingStructure(proposal.proposed_body);
@@ -217,6 +218,7 @@ export async function validateRoutingProposal(
217
218
  validation_mode: accuracy.validation_mode,
218
219
  validation_agent: accuracy.validation_agent,
219
220
  validation_fixture_id: accuracy.validation_fixture_id,
221
+ validation_fallback_reason: accuracy.validation_fallback_reason,
220
222
  before_pass_rate: accuracy.before_pass_rate,
221
223
  after_pass_rate: accuracy.after_pass_rate,
222
224
  per_entry_results: accuracy.per_entry_results,
@@ -0,0 +1,91 @@
1
+ import type { EvalEntry, ValidationMode } from "../types.js";
2
+ import { CLIError } from "../utils/cli-error.js";
3
+ import {
4
+ runReplayValidation,
5
+ type ReplayValidationOptions,
6
+ type ReplayValidationResult,
7
+ } from "./engines/replay-engine.js";
8
+
9
+ export type ValidationStrategy = "auto" | "replay" | "judge";
10
+
11
+ export const DEFAULT_VALIDATION_STRATEGY: ValidationStrategy = "auto";
12
+
13
+ export interface ValidationExecutionResult<TResult> {
14
+ result: TResult;
15
+ modeUsed: ValidationMode;
16
+ fallbackReason?: string;
17
+ }
18
+
19
+ export interface ValidationContractOptions<TResult> {
20
+ mode?: ValidationStrategy;
21
+ originalContent: string;
22
+ proposedContent: string;
23
+ evalSet: EvalEntry[];
24
+ agent: string;
25
+ replayOptions?: ReplayValidationOptions;
26
+ runJudge: () => Promise<ValidationExecutionResult<TResult>>;
27
+ adaptReplayResult: (replayResult: ReplayValidationResult) => TResult;
28
+ onReplayFallback?: (reason?: string) => void;
29
+ }
30
+
31
+ export function hasReplayValidationPath(
32
+ replayOptions?: ReplayValidationOptions,
33
+ ): replayOptions is ReplayValidationOptions {
34
+ return Boolean(replayOptions?.replayFixture || replayOptions?.replayRunner);
35
+ }
36
+
37
+ export function createReplayUnavailableError(reason?: string): CLIError {
38
+ const message = reason
39
+ ? `Replay validation requested but real host/runtime replay is unavailable: ${reason}`
40
+ : "Replay validation requested but real host/runtime replay is unavailable.";
41
+ return new CLIError(
42
+ message,
43
+ "REPLAY_UNAVAILABLE",
44
+ "Use --validation-mode auto to allow LLM judge fallback, or run selftune on a host/agent with runtime replay support for this skill.",
45
+ );
46
+ }
47
+
48
+ export async function runValidationContract<TResult>(
49
+ options: ValidationContractOptions<TResult>,
50
+ ): Promise<ValidationExecutionResult<TResult>> {
51
+ const mode = options.mode ?? DEFAULT_VALIDATION_STRATEGY;
52
+
53
+ if (mode === "judge") {
54
+ return options.runJudge();
55
+ }
56
+
57
+ if (hasReplayValidationPath(options.replayOptions)) {
58
+ const replayAttempt = await runReplayValidation(
59
+ options.originalContent,
60
+ options.proposedContent,
61
+ options.evalSet,
62
+ options.agent,
63
+ options.replayOptions,
64
+ );
65
+
66
+ if (replayAttempt.result) {
67
+ return {
68
+ result: options.adaptReplayResult(replayAttempt.result),
69
+ modeUsed: replayAttempt.result.validation_mode,
70
+ };
71
+ }
72
+
73
+ if (mode === "replay") {
74
+ throw createReplayUnavailableError(replayAttempt.fallbackReason);
75
+ }
76
+
77
+ options.onReplayFallback?.(replayAttempt.fallbackReason);
78
+ const judgeResult = await options.runJudge();
79
+ return {
80
+ ...judgeResult,
81
+ fallbackReason: replayAttempt.fallbackReason,
82
+ };
83
+ }
84
+
85
+ if (mode === "replay") {
86
+ throw createReplayUnavailableError();
87
+ }
88
+
89
+ options.onReplayFallback?.();
90
+ return options.runJudge();
91
+ }
@@ -13,12 +13,16 @@ import { mkdirSync, writeFileSync } from "node:fs";
13
13
  import { dirname } from "node:path";
14
14
  import { parseArgs } from "node:util";
15
15
 
16
- import { AGENT_CANDIDATES, TELEMETRY_LOG } from "../constants.js";
16
+ import { TELEMETRY_LOG } from "../constants.js";
17
17
  import { getDb } from "../localdb/db.js";
18
18
  import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
19
19
  import type { GradingResult, SessionTelemetryRecord, SkillUsageRecord } from "../types.js";
20
20
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
21
- import { detectAgent as _detectAgent } from "../utils/llm-call.js";
21
+ import {
22
+ detectLlmAgent as _detectAgent,
23
+ isLlmBackedAgent,
24
+ LLM_BACKED_AGENT_CANDIDATES,
25
+ } from "../utils/llm-call.js";
22
26
  import { readExcerpt } from "../utils/transcript.js";
23
27
  import {
24
28
  buildDefaultGradingOutputPath,
@@ -55,7 +59,7 @@ Options:
55
59
  --session-id Grade a specific session (auto-detects most recent if omitted)
56
60
  --telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
57
61
  --output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
58
- --agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
62
+ --agent Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
59
63
  --show-transcript Print transcript excerpt before grading
60
64
  -h, --help Show this help message`);
61
65
  process.exit(0);
@@ -68,9 +72,9 @@ Options:
68
72
 
69
73
  // --- Determine agent ---
70
74
  let agent: string | null = null;
71
- const validAgents = [...AGENT_CANDIDATES];
75
+ const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
72
76
  if (values.agent) {
73
- if (!validAgents.includes(values.agent)) {
77
+ if (!isLlmBackedAgent(values.agent)) {
74
78
  throw new CLIError(
75
79
  `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
76
80
  "INVALID_FLAG",
@@ -84,9 +88,9 @@ Options:
84
88
 
85
89
  if (!agent) {
86
90
  throw new CLIError(
87
- `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
91
+ `No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
88
92
  "AGENT_NOT_FOUND",
89
- "Install one of the supported agent CLIs",
93
+ "Install Claude Code, Codex, OpenCode, or Pi",
90
94
  );
91
95
  }
92
96
 
@@ -5,19 +5,14 @@
5
5
  * Rubric-based grader for Claude Code skill sessions.
6
6
  * Migrated from grade_session.py.
7
7
  *
8
- * Grades via an installed agent CLI selected from AGENT_CANDIDATES.
8
+ * Grades via an installed agent CLI selected from the LLM-backed agent set.
9
9
  */
10
10
 
11
11
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
12
12
  import { basename, dirname, join } from "node:path";
13
13
  import { parseArgs } from "node:util";
14
14
 
15
- import {
16
- AGENT_CANDIDATES,
17
- CLAUDE_CODE_PROJECTS_DIR,
18
- SELFTUNE_CONFIG_DIR,
19
- TELEMETRY_LOG,
20
- } from "../constants.js";
15
+ import { CLAUDE_CODE_PROJECTS_DIR, SELFTUNE_CONFIG_DIR, TELEMETRY_LOG } from "../constants.js";
21
16
  import { getDb } from "../localdb/db.js";
22
17
  import { querySessionTelemetry, querySkillUsageRecords } from "../localdb/queries.js";
23
18
  import type {
@@ -31,7 +26,9 @@ import type {
31
26
  } from "../types.js";
32
27
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
33
28
  import {
34
- detectAgent as _detectAgent,
29
+ detectLlmAgent as _detectAgent,
30
+ isLlmBackedAgent,
31
+ LLM_BACKED_AGENT_CANDIDATES,
35
32
  stripMarkdownFences as _stripMarkdownFences,
36
33
  callViaAgent,
37
34
  } from "../utils/llm-call.js";
@@ -42,9 +39,6 @@ import {
42
39
  } from "../utils/transcript.js";
43
40
  import { type PreGateContext, runPreGates } from "./pre-gates.js";
44
41
 
45
- // Re-export for backward compatibility
46
- export { detectAgent, stripMarkdownFences } from "../utils/llm-call.js";
47
-
48
42
  // ---------------------------------------------------------------------------
49
43
  // Constants
50
44
  // ---------------------------------------------------------------------------
@@ -756,7 +750,7 @@ Options:
756
750
  --transcript Path to transcript file
757
751
  --telemetry-log Path to telemetry log (default: ~/.claude/session_telemetry_log.jsonl)
758
752
  --output Output path for grading JSON (default: ~/.selftune/grading/result-<session>.json)
759
- --agent Agent CLI to use (${AGENT_CANDIDATES.join(", ")})
753
+ --agent Agent CLI to use (${LLM_BACKED_AGENT_CANDIDATES.join(", ")})
760
754
  --show-transcript Print transcript excerpt before grading
761
755
  -h, --help Show this help message`);
762
756
  process.exit(0);
@@ -769,9 +763,9 @@ Options:
769
763
 
770
764
  // --- Determine agent ---
771
765
  let agent: string | null = null;
772
- const validAgents = [...AGENT_CANDIDATES];
766
+ const validAgents = [...LLM_BACKED_AGENT_CANDIDATES];
773
767
  if (values.agent) {
774
- if (!validAgents.includes(values.agent)) {
768
+ if (!isLlmBackedAgent(values.agent)) {
775
769
  throw new CLIError(
776
770
  `Invalid --agent '${values.agent}'. Expected one of: ${validAgents.join(", ")}`,
777
771
  "INVALID_FLAG",
@@ -785,9 +779,9 @@ Options:
785
779
 
786
780
  if (!agent) {
787
781
  throw new CLIError(
788
- `No supported agent CLI (${AGENT_CANDIDATES.join("/")}) found in PATH`,
782
+ `No supported agent CLI (${LLM_BACKED_AGENT_CANDIDATES.join("/")}) found in PATH`,
789
783
  "AGENT_NOT_FOUND",
790
- "Install claude, codex, or opencode CLI, then retry",
784
+ "Install Claude Code, Codex, OpenCode, or Pi, then retry",
791
785
  );
792
786
  }
793
787
 
@@ -21,7 +21,7 @@
21
21
  * selftune contribute — Export anonymized skill data for community
22
22
  * selftune contributions — Manage creator-directed sharing preferences
23
23
  * selftune creator-contributions — Manage creator-side contribution configs
24
- * selftune workflows — Discover and manage multi-skill workflows
24
+ * selftune workflows — Discover workflows and scaffold workflow skills
25
25
  * selftune quickstart — Guided onboarding: init, ingest, status, and suggestions
26
26
  * selftune repair-skill-usage — Rebuild trustworthy skill usage from transcripts
27
27
  * selftune export — Export SQLite data to JSONL snapshots
@@ -38,6 +38,7 @@
38
38
  */
39
39
 
40
40
  import { CLIError, handleCLIError } from "./utils/cli-error.js";
41
+ import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "./command-surface.js";
41
42
 
42
43
  process.on("uncaughtException", handleCLIError);
43
44
  process.on("unhandledRejection", handleCLIError);
@@ -69,7 +70,7 @@ Commands:
69
70
  contribute Export anonymized skill data for community
70
71
  contributions Manage creator-directed sharing preferences
71
72
  creator-contributions Manage creator-side contribution configs
72
- workflows Discover and manage multi-skill workflows
73
+ workflows Discover workflows and scaffold workflow skills
73
74
  quickstart Guided onboarding: init, ingest, status, and suggestions
74
75
  repair-skill-usage Rebuild trustworthy skill usage from transcripts
75
76
  export Export SQLite data to JSONL snapshots
@@ -227,12 +228,12 @@ Run 'selftune grade <subcommand> --help' for subcommand-specific options.`);
227
228
  case "evolve": {
228
229
  const sub = process.argv[2];
229
230
  if (sub === "--help" || sub === "-h") {
230
- console.log(`selftune evolve — Evolve skill descriptions
231
+ console.log(`${renderCommandHelp(PUBLIC_COMMAND_SURFACES.evolve)}
231
232
 
232
- Usage:
233
- selftune evolve [options] Run description evolution
234
- selftune evolve body [options] Evolve full body or routing table
235
- selftune evolve rollback [options] Rollback a previous evolution
233
+ Subcommands:
234
+ selftune evolve body [options] Evolve full body or routing table
235
+ selftune evolve rollback [options] Rollback a previous evolution
236
+ selftune evolve apply-proposal [options] Apply an approved contributor proposal
236
237
 
237
238
  Run 'selftune evolve <subcommand> --help' for subcommand-specific options.`);
238
239
  process.exit(0);
@@ -255,6 +256,11 @@ Run 'selftune evolve <subcommand> --help' for subcommand-specific options.`);
255
256
  await cliMain();
256
257
  break;
257
258
  }
259
+ case "apply-proposal": {
260
+ const { cliMain } = await import("./evolution/apply-proposal.js");
261
+ await cliMain();
262
+ break;
263
+ }
258
264
  default:
259
265
  throw new CLIError(
260
266
  `Unknown evolve target: ${sub}`,
@@ -281,6 +287,12 @@ Actions:
281
287
  composability Analyze skill co-occurrence conflicts
282
288
  family-overlap Detect sibling-skill overlap and consolidation pressure
283
289
 
290
+ Recommended creator loop:
291
+ 1. selftune eval generate --skill <name>
292
+ 2. selftune eval unit-test --skill <name> --generate --skill-path <path>
293
+ 3. selftune evolve --skill <name> --skill-path <path> --dry-run --validation-mode replay
294
+ 4. selftune grade baseline --skill <name> --skill-path <path>
295
+
284
296
  Run 'selftune eval <action> --help' for action-specific options.`);
285
297
  process.exit(0);
286
298
  }
@@ -337,7 +349,8 @@ Run 'selftune eval <action> --help' for action-specific options.`);
337
349
  "selftune eval composability --skill <name>",
338
350
  );
339
351
  }
340
- const logPath = values["telemetry-log"] ?? TELEMETRY_LOG;
352
+ const logPath =
353
+ typeof values["telemetry-log"] === "string" ? values["telemetry-log"] : TELEMETRY_LOG;
341
354
  let telemetry: unknown[];
342
355
  if (logPath === TELEMETRY_LOG) {
343
356
  try {
@@ -363,7 +376,19 @@ Run 'selftune eval <action> --help' for action-specific options.`);
363
376
  );
364
377
  }
365
378
  const windowSize = rawWindow === undefined ? undefined : Number(rawWindow);
366
- const report = analyzeComposability(values.skill, telemetry, windowSize);
379
+ const skillName = typeof values.skill === "string" ? values.skill : undefined;
380
+ if (!skillName) {
381
+ throw new CLIError(
382
+ "--skill <name> is required.",
383
+ "MISSING_FLAG",
384
+ "selftune eval composability --skill <name>",
385
+ );
386
+ }
387
+ const report = analyzeComposability(
388
+ skillName,
389
+ telemetry as import("./types.js").SessionTelemetryRecord[],
390
+ windowSize,
391
+ );
367
392
  console.log(JSON.stringify(report, null, 2));
368
393
  break;
369
394
  }
@@ -720,7 +745,7 @@ Output:
720
745
  userId: identity.user_id,
721
746
  agentType: readConfiguredAgentType(SELFTUNE_CONFIG_PATH, "unknown"),
722
747
  selftuneVersion: getSelftuneVersion(),
723
- dryRun: values["dry-run"] ?? false,
748
+ dryRun: values["dry-run"] === true,
724
749
  apiKey: identity.api_key,
725
750
  });
726
751