selftune 0.2.22 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +95 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/adapters/pi/hook.ts +273 -0
  12. package/cli/selftune/adapters/pi/install.ts +207 -0
  13. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  14. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  15. package/cli/selftune/auto-update.ts +200 -8
  16. package/cli/selftune/canonical-export.ts +55 -25
  17. package/cli/selftune/command-surface.ts +397 -0
  18. package/cli/selftune/constants.ts +10 -1
  19. package/cli/selftune/contribute/contribute.ts +64 -13
  20. package/cli/selftune/contribution-config.ts +57 -3
  21. package/cli/selftune/contribution-preferences.ts +117 -0
  22. package/cli/selftune/contribution-signals.ts +8 -4
  23. package/cli/selftune/contribution-staging.ts +13 -2
  24. package/cli/selftune/contributions.ts +55 -121
  25. package/cli/selftune/creator-contributions.ts +29 -10
  26. package/cli/selftune/cron/setup.ts +7 -3
  27. package/cli/selftune/dashboard-contract.ts +87 -0
  28. package/cli/selftune/dashboard-server.ts +168 -17
  29. package/cli/selftune/dashboard.ts +350 -17
  30. package/cli/selftune/eval/baseline.ts +21 -5
  31. package/cli/selftune/eval/execution-eval.ts +170 -0
  32. package/cli/selftune/eval/family-overlap.ts +2 -2
  33. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  34. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  35. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  36. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  37. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  38. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  39. package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
  40. package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
  41. package/cli/selftune/evolution/evidence.ts +2 -6
  42. package/cli/selftune/evolution/evolve-body.ts +152 -38
  43. package/cli/selftune/evolution/evolve.ts +244 -52
  44. package/cli/selftune/evolution/rollback.ts +0 -1
  45. package/cli/selftune/evolution/validate-body.ts +111 -49
  46. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  47. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  48. package/cli/selftune/evolution/validate-routing.ts +51 -108
  49. package/cli/selftune/evolution/validation-contract.ts +91 -0
  50. package/cli/selftune/grading/auto-grade.ts +11 -7
  51. package/cli/selftune/grading/grade-session.ts +10 -16
  52. package/cli/selftune/hooks/skill-eval.ts +2 -1
  53. package/cli/selftune/hooks-shared/types.ts +1 -0
  54. package/cli/selftune/index.ts +58 -15
  55. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  56. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  57. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  58. package/cli/selftune/ingestors/pi-ingest.ts +727 -0
  59. package/cli/selftune/init.ts +38 -4
  60. package/cli/selftune/localdb/direct-write.ts +120 -1
  61. package/cli/selftune/localdb/materialize.ts +6 -7
  62. package/cli/selftune/localdb/queries/cron.ts +34 -0
  63. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  64. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  65. package/cli/selftune/localdb/queries/execution.ts +133 -0
  66. package/cli/selftune/localdb/queries/json.ts +18 -0
  67. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  68. package/cli/selftune/localdb/queries/raw.ts +95 -0
  69. package/cli/selftune/localdb/queries/staging.ts +270 -0
  70. package/cli/selftune/localdb/queries/trust.ts +392 -0
  71. package/cli/selftune/localdb/queries.ts +60 -2162
  72. package/cli/selftune/localdb/schema.ts +59 -0
  73. package/cli/selftune/monitoring/watch.ts +96 -29
  74. package/cli/selftune/normalization.ts +3 -0
  75. package/cli/selftune/observability.ts +12 -3
  76. package/cli/selftune/orchestrate/cli.ts +161 -0
  77. package/cli/selftune/orchestrate/execute.ts +295 -0
  78. package/cli/selftune/orchestrate/finalize.ts +157 -0
  79. package/cli/selftune/orchestrate/locks.ts +40 -0
  80. package/cli/selftune/orchestrate/plan.ts +131 -0
  81. package/cli/selftune/orchestrate/post-run.ts +59 -0
  82. package/cli/selftune/orchestrate/prepare.ts +334 -0
  83. package/cli/selftune/orchestrate/report.ts +182 -0
  84. package/cli/selftune/orchestrate/runtime.ts +120 -0
  85. package/cli/selftune/orchestrate/signals.ts +48 -0
  86. package/cli/selftune/orchestrate.ts +162 -1142
  87. package/cli/selftune/registry/client.ts +74 -0
  88. package/cli/selftune/registry/history.ts +54 -0
  89. package/cli/selftune/registry/index.ts +90 -0
  90. package/cli/selftune/registry/install.ts +141 -0
  91. package/cli/selftune/registry/list.ts +44 -0
  92. package/cli/selftune/registry/push.ts +171 -0
  93. package/cli/selftune/registry/rollback.ts +49 -0
  94. package/cli/selftune/registry/status.ts +62 -0
  95. package/cli/selftune/registry/sync.ts +125 -0
  96. package/cli/selftune/repair/skill-usage.ts +9 -3
  97. package/cli/selftune/routes/overview.ts +5 -2
  98. package/cli/selftune/routes/skill-report.ts +15 -2
  99. package/cli/selftune/schedule.ts +5 -5
  100. package/cli/selftune/status.ts +70 -2
  101. package/cli/selftune/sync.ts +127 -23
  102. package/cli/selftune/testing-readiness.ts +597 -0
  103. package/cli/selftune/types.ts +46 -5
  104. package/cli/selftune/uninstall.ts +2 -1
  105. package/cli/selftune/utils/canonical-log.ts +1 -9
  106. package/cli/selftune/utils/cli-error.ts +9 -0
  107. package/cli/selftune/utils/jsonl.ts +1 -30
  108. package/cli/selftune/utils/llm-call.ts +126 -6
  109. package/cli/selftune/utils/skill-discovery.ts +24 -0
  110. package/cli/selftune/workflows/proposals.ts +184 -0
  111. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  112. package/cli/selftune/workflows/workflows.ts +100 -26
  113. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  114. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  115. package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
  116. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  117. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  118. package/node_modules/@selftune/telemetry-contract/package.json +1 -1
  119. package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
  120. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
  121. package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
  122. package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
  123. package/package.json +25 -9
  124. package/packages/dashboard-core/AGENTS.md +18 -0
  125. package/packages/dashboard-core/README.md +30 -0
  126. package/packages/dashboard-core/index.ts +3 -0
  127. package/packages/dashboard-core/package.json +39 -0
  128. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  129. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  130. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  131. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  132. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  133. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  134. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  135. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  136. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  137. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  138. package/packages/dashboard-core/src/gates/index.ts +3 -0
  139. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  140. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  141. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  142. package/packages/dashboard-core/src/host/index.ts +3 -0
  143. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  144. package/packages/dashboard-core/src/models/index.ts +4 -0
  145. package/packages/dashboard-core/src/models/overview.ts +98 -0
  146. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  147. package/packages/dashboard-core/src/models/skills.ts +34 -0
  148. package/packages/dashboard-core/src/routes/index.ts +2 -0
  149. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  150. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  151. package/packages/dashboard-core/src/routes/types.ts +39 -0
  152. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  153. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  154. package/packages/dashboard-core/src/screens/index.ts +37 -0
  155. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  156. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  157. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  158. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  159. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  160. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  161. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  162. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  163. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  164. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  165. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  166. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  167. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  168. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  169. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  170. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  171. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  172. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  173. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  174. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  175. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  176. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  177. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  178. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  179. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  180. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  181. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  182. package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
  183. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  184. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  185. package/packages/telemetry-contract/package.json +1 -1
  186. package/packages/telemetry-contract/src/index.ts +1 -0
  187. package/packages/telemetry-contract/src/schemas.ts +63 -5
  188. package/packages/telemetry-contract/src/types.ts +97 -7
  189. package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
  190. package/packages/ui/AGENTS.md +16 -0
  191. package/packages/ui/README.md +1 -1
  192. package/packages/ui/package.json +1 -1
  193. package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
  194. package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
  195. package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
  196. package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
  197. package/packages/ui/src/components/InfoTip.tsx +1 -2
  198. package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
  199. package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
  200. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
  201. package/packages/ui/src/components/OverviewPanels.tsx +693 -0
  202. package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
  203. package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
  204. package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
  205. package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
  206. package/packages/ui/src/components/index.ts +56 -1
  207. package/packages/ui/src/components/section-cards.tsx +18 -35
  208. package/packages/ui/src/components/skill-health-grid.tsx +47 -37
  209. package/packages/ui/src/lib/constants.tsx +0 -1
  210. package/packages/ui/src/primitives/card.tsx +1 -1
  211. package/packages/ui/src/primitives/checkbox.tsx +1 -1
  212. package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
  213. package/packages/ui/src/primitives/select.tsx +2 -2
  214. package/packages/ui/src/primitives/tabs.tsx +7 -6
  215. package/packages/ui/src/types.ts +182 -4
  216. package/skill/SKILL.md +130 -318
  217. package/skill/agents/diagnosis-analyst.md +3 -3
  218. package/skill/agents/evolution-reviewer.md +3 -3
  219. package/skill/agents/integration-guide.md +3 -3
  220. package/skill/agents/pattern-analyst.md +2 -2
  221. package/skill/references/cli-quick-reference.md +89 -0
  222. package/skill/references/creator-playbook.md +131 -0
  223. package/skill/references/examples.md +48 -0
  224. package/skill/references/troubleshooting.md +47 -0
  225. package/skill/references/version-history.md +1 -1
  226. package/skill/selftune.contribute.json +11 -0
  227. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  228. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  229. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  230. package/skill/workflows/CreateTestDeploy.md +170 -0
  231. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  232. package/skill/{Workflows → workflows}/Cron.md +1 -1
  233. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  234. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  235. package/skill/{Workflows → workflows}/Evals.md +67 -2
  236. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  237. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  238. package/skill/{Workflows → workflows}/Grade.md +1 -1
  239. package/skill/{Workflows → workflows}/Ingest.md +60 -2
  240. package/skill/{Workflows → workflows}/Initialize.md +16 -9
  241. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  242. package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
  243. package/skill/workflows/Registry.md +99 -0
  244. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  245. package/skill/workflows/SignalsDashboard.md +87 -0
  246. package/skill/{Workflows → workflows}/Sync.md +3 -1
  247. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  248. package/skill/{Workflows → workflows}/Watch.md +42 -2
  249. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  250. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
  251. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
  252. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  253. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
  254. package/cli/selftune/utils/html.ts +0 -27
  255. package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
  256. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  257. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  258. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  259. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  260. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  261. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  262. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  263. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  264. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  265. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  266. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  267. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  268. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  269. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  270. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -0,0 +1,597 @@
1
+ import type { Database } from "bun:sqlite";
2
+
3
+ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs";
4
+ import { join } from "node:path";
5
+
6
+ import { SELFTUNE_CONFIG_DIR } from "./constants.js";
7
+ import type {
8
+ CreatorLoopNextStep,
9
+ CreatorTestingOverview,
10
+ DeploymentReadiness,
11
+ SkillEvalReadiness,
12
+ SkillTestingReadiness,
13
+ } from "./dashboard-contract.js";
14
+ import type { EvalEntry, UnitTestSuiteResult } from "./types.js";
15
+ import { queryEvolutionEvidence } from "./localdb/queries/evolution.js";
16
+ import { queryTrustedSkillObservationRows } from "./localdb/queries/trust.js";
17
+ import {
18
+ findInstalledSkillNames,
19
+ findInstalledSkillPath,
20
+ findRepositoryClaudeSkillDirs,
21
+ findRepositorySkillDirs,
22
+ } from "./utils/skill-discovery.js";
23
+
24
/** Trimmed-down view of one trusted skill observation row. */
interface TrustedSkillObservationSummary {
  /** Session the observation was recorded in. */
  session_id: string;
  /** Numeric trigger flag copied from the trusted row — presumably 0/1; confirm against the trust query. */
  triggered: number;
}
28
+
29
/**
 * Pre-computed lookup tables shared while building per-skill readiness rows.
 * Every map is keyed by skill name.
 */
interface TestingReadinessContext {
  /** Skill names to report on; iterated when listing readiness rows. */
  knownSkills: Set<string>;
  /** Directories searched for installed skills. */
  searchDirs: string[];

  /** Trusted observations, grouped per skill. */
  trustedRowsBySkill: Map<string, TrustedSkillObservationSummary[]>;

  /** Size and timestamp of an eval set found in evolution evidence. */
  evalEvidenceBySkill: Map<string, { count: number; latestAt: string | null }>;
  /** Skill path recorded in evolution evidence. */
  fallbackSkillPathBySkill: Map<string, string>;

  /** Replay check total plus the validation mode of the most recent replay group. */
  replayBySkill: Map<string, { check_count: number; latest_validation_mode: string | null }>;
  /** Stored grading baseline for the skill. */
  baselineBySkill: Map<
    string,
    { sample_size: number; pass_rate: number | null; measured_at: string | null }
  >;
  /** Evolution audit action and timestamp for the skill. */
  latestEvolutionBySkill: Map<string, { action: string | null; timestamp: string | null }>;
}
42
+
43
/**
 * Resolves the selftune config directory.
 *
 * @returns The `SELFTUNE_CONFIG_DIR` environment variable when it is set to a
 *   non-empty value, otherwise the built-in `SELFTUNE_CONFIG_DIR` constant.
 */
function getConfigDir(): string {
  const override = process.env.SELFTUNE_CONFIG_DIR;
  // Truthiness check: an empty string falls back to the default as well.
  return override ? override : SELFTUNE_CONFIG_DIR;
}
46
+
47
/** Directory under the config dir where canonical eval sets are stored. */
function getEvalSetDir(): string {
  const configDir = getConfigDir();
  return join(configDir, "eval-sets");
}
50
+
51
/** Directory under the config dir where unit-test files and run results are stored. */
function getUnitTestDir(): string {
  const configDir = getConfigDir();
  return join(configDir, "unit-tests");
}
54
+
55
/**
 * Builds the path of the canonical eval-set file for a skill.
 *
 * @param skillName Skill whose eval set is addressed; used verbatim as the file stem.
 * @returns `<eval-set dir>/<skillName>.json`.
 */
export function getCanonicalEvalSetPath(skillName: string): string {
  const fileName = `${skillName}.json`;
  return join(getEvalSetDir(), fileName);
}
58
+
59
/**
 * Builds the path of the saved unit-test file for a skill.
 *
 * @param skillName Skill whose unit tests are addressed; used verbatim as the file stem.
 * @returns `<unit-test dir>/<skillName>.json`.
 */
export function getUnitTestPath(skillName: string): string {
  const fileName = `${skillName}.json`;
  return join(getUnitTestDir(), fileName);
}
62
+
63
/**
 * Builds the path of the last unit-test run result for a skill.
 *
 * @param skillName Skill whose last run is addressed; used verbatim as the file stem.
 * @returns `<unit-test dir>/<skillName>.last-run.json`.
 */
export function getUnitTestResultPath(skillName: string): string {
  const fileName = `${skillName}.last-run.json`;
  return join(getUnitTestDir(), fileName);
}
66
+
67
/**
 * Persists a skill's canonical eval set as pretty-printed JSON.
 *
 * Creates the eval-set directory (recursively) when it does not exist yet.
 *
 * @param skillName Skill the eval set belongs to.
 * @param evalSet Entries to serialize.
 * @returns The path the eval set was written to.
 */
export function writeCanonicalEvalSet(skillName: string, evalSet: EvalEntry[]): string {
  const evalSetDir = getEvalSetDir();
  mkdirSync(evalSetDir, { recursive: true });

  const targetPath = getCanonicalEvalSetPath(skillName);
  const serialized = JSON.stringify(evalSet, null, 2);
  writeFileSync(targetPath, serialized, "utf-8");
  return targetPath;
}
73
+
74
/**
 * Persists the result of a skill's latest unit-test run as pretty-printed JSON.
 *
 * Creates the unit-test directory (recursively) when it does not exist yet.
 *
 * @param skillName Skill the run belongs to.
 * @param suite Suite result to serialize.
 * @returns The path the result was written to.
 */
export function writeUnitTestRunResult(skillName: string, suite: UnitTestSuiteResult): string {
  const unitTestDir = getUnitTestDir();
  mkdirSync(unitTestDir, { recursive: true });

  const targetPath = getUnitTestResultPath(skillName);
  const serialized = JSON.stringify(suite, null, 2);
  writeFileSync(targetPath, serialized, "utf-8");
  return targetPath;
}
80
+
81
/**
 * Reads a JSON file that is expected to contain an array.
 *
 * @param path File to read.
 * @returns The parsed array, or an empty array when the file is missing,
 *   unreadable, not valid JSON, or holds a non-array value.
 */
function readJsonArrayFile(path: string): unknown[] {
  let decoded: unknown;
  try {
    if (!existsSync(path)) return [];
    decoded = JSON.parse(readFileSync(path, "utf-8"));
  } catch {
    // Any read or parse failure is treated the same as "no entries".
    return [];
  }
  if (Array.isArray(decoded)) return decoded;
  return [];
}
90
+
91
/**
 * Loads a stored unit-test run result and checks its basic shape.
 *
 * @param path File holding the serialized suite result.
 * @returns The parsed result, or `null` when the file is missing, cannot be
 *   read or parsed, or lacks any of the required fields (`skill_name`,
 *   `total`, `passed`, `failed`, `pass_rate`, `run_at`) with the expected
 *   primitive type.
 */
function readUnitTestResult(path: string): UnitTestSuiteResult | null {
  try {
    if (!existsSync(path)) return null;

    const candidate = JSON.parse(readFileSync(path, "utf-8")) as Partial<UnitTestSuiteResult>;
    if (candidate == null || typeof candidate !== "object") return null;

    const hasRequiredShape =
      typeof candidate.skill_name === "string" &&
      typeof candidate.total === "number" &&
      typeof candidate.passed === "number" &&
      typeof candidate.failed === "number" &&
      typeof candidate.pass_rate === "number" &&
      typeof candidate.run_at === "string";

    return hasRequiredShape ? (candidate as UnitTestSuiteResult) : null;
  } catch {
    // Unreadable or malformed files count as "no recorded run".
    return null;
  }
}
111
+
112
/**
 * Lists the directories that are searched for installed skills.
 *
 * Order: repository skill dirs and repository Claude skill dirs discovered
 * from the current working directory, then the per-user `.agents/skills`,
 * `.claude/skills`, and `<codex home>/skills` locations.
 *
 * The home directory is taken from `HOME`, falling back to `USERPROFILE`
 * (so Windows shells without `HOME` still resolve). When no home directory
 * is known, the home-based entries are omitted instead of being emitted as
 * root-anchored paths such as `/.agents/skills`. An empty `CODEX_HOME` is
 * treated as unset for the same reason.
 *
 * @returns Candidate skill directories; entries are not checked for existence.
 */
function getSkillSearchDirs(): string[] {
  const cwd = process.cwd();
  const homeDir = process.env.HOME || process.env.USERPROFILE || "";
  const codexHome = process.env.CODEX_HOME || (homeDir ? join(homeDir, ".codex") : "");

  const dirs = [...findRepositorySkillDirs(cwd), ...findRepositoryClaudeSkillDirs(cwd)];
  if (homeDir) {
    dirs.push(join(homeDir, ".agents", "skills"), join(homeDir, ".claude", "skills"));
  }
  if (codexHome) {
    dirs.push(join(codexHome, "skills"));
  }
  return dirs;
}
124
+
125
/**
 * Collects skill names from the entries of a directory.
 *
 * @param dir Directory whose entry names are inspected.
 * @param matcher Maps an entry name to a skill name, or returns `null` (or
 *   any falsy value) to skip the entry.
 * @returns The set of matched names. Empty when the directory does not
 *   exist; if listing or matching throws, whatever was collected up to that
 *   point is returned.
 */
function scanSkillNamesFromDir(
  dir: string,
  matcher: (entryName: string) => string | null,
): Set<string> {
  const found = new Set<string>();
  if (existsSync(dir)) {
    try {
      readdirSync(dir).forEach((entryName) => {
        const skillName = matcher(entryName);
        if (skillName) found.add(skillName);
      });
    } catch {
      // Best effort: fall through and return what has been gathered so far.
    }
  }
  return found;
}
141
+
142
/**
 * Classifies how eval generation can be sourced for a skill.
 *
 * @param skillPath Resolved skill path, or `null` when none is known.
 * @param trustedTriggerCount Number of trusted triggers observed for the skill.
 * @returns `"log_ready"` when at least one trusted trigger exists,
 *   `"cold_start_ready"` when only a (non-empty) skill path is available,
 *   otherwise `"telemetry_only"`.
 */
function deriveEvalReadiness(
  skillPath: string | null,
  trustedTriggerCount: number,
): SkillEvalReadiness {
  const hasTrustedTriggers = trustedTriggerCount > 0;
  if (hasTrustedTriggers) return "log_ready";
  return skillPath ? "cold_start_ready" : "telemetry_only";
}
150
+
151
/**
 * Picks the value to show for a `--skill-path` argument.
 *
 * @param skillPath Resolved skill path, or `null` when none is known.
 * @param skillName Skill name used to build the placeholder.
 * @returns `skillPath` when it is not null/undefined (an empty string is
 *   returned as-is), otherwise a `/path/to/skills/<name>/SKILL.md` placeholder.
 */
function formatSkillPathArg(skillPath: string | null, skillName: string): string {
  if (skillPath != null) return skillPath;
  return `/path/to/skills/${skillName}/SKILL.md`;
}
154
+
155
/**
 * Builds the CLI command suggested for a skill's next creator-loop step.
 *
 * @param skillName Skill the command targets.
 * @param skillPath Resolved skill path, or `null`; a placeholder path is
 *   substituted when it is missing.
 * @param nextStep Step the skill should take next.
 * @returns The `selftune ...` command line for that step.
 */
function recommendCommand(
  skillName: string,
  skillPath: string | null,
  nextStep: CreatorLoopNextStep,
): string {
  const pathArg = formatSkillPathArg(skillPath, skillName);

  // One entry per step; only eval generation varies with whether a real path is known.
  const commandByStep: Record<CreatorLoopNextStep, string> = {
    generate_evals: skillPath
      ? `selftune eval generate --skill ${skillName} --auto-synthetic --skill-path ${pathArg}`
      : `selftune eval generate --skill ${skillName} --skill-path ${pathArg}`,
    run_unit_tests: `selftune eval unit-test --skill ${skillName} --generate --skill-path ${pathArg}`,
    run_replay_dry_run: `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`,
    measure_baseline: `selftune grade baseline --skill ${skillName} --skill-path ${pathArg}`,
    deploy_candidate: `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`,
    watch_deployment: `selftune watch --skill ${skillName}`,
  };

  return commandByStep[nextStep];
}
178
+
179
/**
 * Produces the human-readable explanation shown next to a skill's next step.
 *
 * @param nextStep Step the skill should take next.
 * @param evalReadiness Eval sourcing state; only consulted for `generate_evals`.
 * @param evalSetEntries Number of saved eval entries.
 * @param unitTestCases Number of saved unit-test cases.
 * @param replayCheckCount Number of recorded replay checks.
 * @param baselineSampleSize Sample size of the latest baseline; mentioned only when positive.
 * @param unitTestPassRate Last unit-test pass rate as a 0..1 fraction, or `null` when unknown.
 * @returns A one- or two-sentence summary for the given step.
 */
function summarizeReadiness(
  nextStep: CreatorLoopNextStep,
  evalReadiness: SkillEvalReadiness,
  evalSetEntries: number,
  unitTestCases: number,
  replayCheckCount: number,
  baselineSampleSize: number,
  unitTestPassRate: number | null,
): string {
  // Shared suffix for the two post-baseline steps.
  const baselineNote =
    baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : "";

  switch (nextStep) {
    case "generate_evals": {
      if (evalReadiness === "log_ready") {
        return "Trusted telemetry exists, but no canonical eval set is saved yet.";
      }
      return evalReadiness === "cold_start_ready"
        ? "Installed locally but still cold-start. Generate synthetic evals before you evolve it."
        : "Telemetry exists, but selftune cannot resolve a local SKILL.md yet. Point it at the skill and generate evals.";
    }
    case "run_unit_tests":
      return `Eval coverage is present (${evalSetEntries} entries), but no unit test file is saved yet.`;
    case "run_replay_dry_run": {
      let passRateNote = "";
      if (unitTestPassRate != null) {
        passRateNote = ` Last unit-test run passed ${Math.round(unitTestPassRate * 100)}%.`;
      }
      return `Unit tests are present (${unitTestCases} cases), but replay-backed dry-run validation has not been recorded yet.${passRateNote}`;
    }
    case "measure_baseline":
      return `Replay-backed validation exists (${replayCheckCount} recorded checks), but no stored no-skill baseline exists yet.`;
    case "deploy_candidate":
      return `Evals, unit tests, replay validation, and a baseline are all present. Ready to run a live evolve and deploy a watched candidate.${baselineNote}`;
    case "watch_deployment":
      return `A candidate has already been deployed for this skill. Keep watching live traffic and baseline lift before making another mutation.${baselineNote}`;
  }
}
214
+
215
/**
 * Ranks a creator-loop step for sorting; earlier steps get lower numbers.
 *
 * @param step Step to rank.
 * @returns 0 for `generate_evals` through 5 for `watch_deployment`.
 */
function nextStepPriority(step: CreatorLoopNextStep): number {
  const rankByStep: Record<CreatorLoopNextStep, number> = {
    generate_evals: 0,
    run_unit_tests: 1,
    run_replay_dry_run: 2,
    measure_baseline: 3,
    deploy_candidate: 4,
    watch_deployment: 5,
  };
  return rankByStep[step];
}
231
+
232
/**
 * Derives the deployment state of a skill from its next step and its latest
 * evolution action.
 *
 * @param nextStep Step the skill should take next.
 * @param latestEvolutionAction Most recent evolution action, or `null` when none is recorded.
 * @returns `"blocked"` until the next step is `deploy_candidate` or
 *   `watch_deployment`; after that `"rolled_back"` when the latest action is
 *   a rollback, `"watching"` when the skill is being watched or the latest
 *   action is `deployed`, and `"ready_to_deploy"` otherwise.
 */
function deriveDeploymentReadiness(
  nextStep: CreatorLoopNextStep,
  latestEvolutionAction: string | null,
): DeploymentReadiness {
  const pastTestLoop = nextStep === "deploy_candidate" || nextStep === "watch_deployment";
  if (!pastTestLoop) return "blocked";

  // A rollback takes precedence over the watching state.
  if (latestEvolutionAction === "rolled_back") return "rolled_back";

  const alreadyLive = nextStep === "watch_deployment" || latestEvolutionAction === "deployed";
  return alreadyLive ? "watching" : "ready_to_deploy";
}
247
+
248
/**
 * Describes a deployment state and the command (if any) to run next.
 *
 * @param deploymentReadiness Deployment state to describe.
 * @param skillName Skill the command targets.
 * @param skillPath Resolved skill path, or `null`; a placeholder path is
 *   substituted when it is missing.
 * @returns A summary sentence plus the suggested command; `command` is `null`
 *   while the skill is still blocked.
 */
function summarizeDeploymentReadiness(
  deploymentReadiness: DeploymentReadiness,
  skillName: string,
  skillPath: string | null,
): { summary: string; command: string | null } {
  const pathArg = formatSkillPathArg(skillPath, skillName);

  const guidanceByState: Record<
    DeploymentReadiness,
    { summary: string; command: string | null }
  > = {
    blocked: {
      summary: "Finish the creator test loop before shipping this skill.",
      command: null,
    },
    ready_to_deploy: {
      summary:
        "Tests and baseline are in place. Run a live evolve so selftune can validate and deploy the strongest candidate.",
      command: `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`,
    },
    watching: {
      summary:
        "A candidate is already deployed. Keep watching live trigger behavior and baseline lift before making another mutation.",
      command: `selftune watch --skill ${skillName}`,
    },
    rolled_back: {
      summary:
        "The last deployment rolled back. Review the failure evidence, rerun a replay dry-run if needed, then redeploy once the candidate is trustworthy again.",
      command: `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`,
    },
  };

  return guidanceByState[deploymentReadiness];
}
280
+
281
/**
 * Builds the testing-readiness rows for every known skill.
 *
 * Skills for which no row can be built are dropped. Rows are ordered by
 * next-step priority (earliest step first), then by trusted session count
 * (highest first), then alphabetically by skill name.
 *
 * @param db Local database the readiness context is queried from.
 * @param searchDirs Directories searched for installed skills; defaults to
 *   the standard skill search directories.
 * @returns The sorted readiness rows.
 */
export function listSkillTestingReadiness(
  db: Database,
  searchDirs: string[] = getSkillSearchDirs(),
): SkillTestingReadiness[] {
  const context = buildTestingReadinessContext(db, searchDirs);

  // Rows are built in alphabetical name order before the final ranking sort.
  const orderedNames = Array.from(context.knownSkills).sort((left, right) =>
    left.localeCompare(right),
  );

  const rows: SkillTestingReadiness[] = [];
  for (const skillName of orderedNames) {
    const row = buildSkillTestingReadinessRow(skillName, context);
    if (row != null) rows.push(row);
  }

  rows.sort((left, right) => {
    const byPriority = nextStepPriority(left.next_step) - nextStepPriority(right.next_step);
    if (byPriority !== 0) return byPriority;

    const byTrustedSessions = right.trusted_session_count - left.trusted_session_count;
    if (byTrustedSessions !== 0) return byTrustedSessions;

    return left.skill_name.localeCompare(right.skill_name);
  });
  return rows;
}
299
+
300
/**
 * Builds the testing-readiness row for a single skill.
 *
 * @param db Local database the readiness context is queried from.
 * @param skillName Skill to report on.
 * @param searchDirs Directories searched for installed skills; defaults to
 *   the standard skill search directories.
 * @returns The readiness row, or `null` when none can be built for the skill.
 */
export function getSkillTestingReadiness(
  db: Database,
  skillName: string,
  searchDirs: string[] = getSkillSearchDirs(),
): SkillTestingReadiness | null {
  const context = buildTestingReadinessContext(db, searchDirs);
  return buildSkillTestingReadinessRow(skillName, context);
}
307
+
308
+ function buildTestingReadinessContext(db: Database, searchDirs: string[]): TestingReadinessContext {
309
+ const trustedRows = queryTrustedSkillObservationRows(db);
310
+ const trustedRowsBySkill = new Map<string, TrustedSkillObservationSummary[]>();
311
+
312
+ for (const row of trustedRows) {
313
+ const existing = trustedRowsBySkill.get(row.skill_name);
314
+ const compact = { session_id: row.session_id, triggered: row.triggered };
315
+ if (existing) existing.push(compact);
316
+ else trustedRowsBySkill.set(row.skill_name, [compact]);
317
+ }
318
+
319
+ const installedNames = findInstalledSkillNames(searchDirs);
320
+ const unitTestDir = getUnitTestDir();
321
+ const evalSetDir = getEvalSetDir();
322
+ const unitTestNames = scanSkillNamesFromDir(unitTestDir, (entry) => {
323
+ if (!entry.endsWith(".json") || entry.endsWith(".last-run.json")) return null;
324
+ return entry.slice(0, -".json".length);
325
+ });
326
+ const unitTestResultNames = scanSkillNamesFromDir(unitTestDir, (entry) => {
327
+ if (!entry.endsWith(".last-run.json")) return null;
328
+ return entry.slice(0, -".last-run.json".length);
329
+ });
330
+ const canonicalEvalNames = scanSkillNamesFromDir(evalSetDir, (entry) => {
331
+ if (!entry.endsWith(".json")) return null;
332
+ return entry.slice(0, -".json".length);
333
+ });
334
+
335
+ const evidenceRows = queryEvolutionEvidence(db);
336
+ const evalEvidenceBySkill = new Map<string, { count: number; latestAt: string | null }>();
337
+ const fallbackSkillPathBySkill = new Map<string, string>();
338
+ for (const row of evidenceRows) {
339
+ if (row.eval_set && row.eval_set.length > 0 && !evalEvidenceBySkill.has(row.skill_name)) {
340
+ evalEvidenceBySkill.set(row.skill_name, {
341
+ count: row.eval_set.length,
342
+ latestAt: row.timestamp,
343
+ });
344
+ }
345
+ if (row.skill_path && !fallbackSkillPathBySkill.has(row.skill_name)) {
346
+ fallbackSkillPathBySkill.set(row.skill_name, row.skill_path);
347
+ }
348
+ }
349
+
350
+ const replayRows = db
351
+ .query(
352
+ `SELECT skill_name, validation_mode, COUNT(*) AS check_count, MAX(id) AS latest_id
353
+ FROM replay_entry_results
354
+ GROUP BY skill_name, validation_mode
355
+ ORDER BY latest_id DESC`,
356
+ )
357
+ .all() as Array<{
358
+ skill_name: string;
359
+ validation_mode: string;
360
+ check_count: number;
361
+ latest_id: number;
362
+ }>;
363
+ const replayBySkill = new Map<
364
+ string,
365
+ { check_count: number; latest_validation_mode: string | null }
366
+ >();
367
+ for (const row of replayRows) {
368
+ const existing = replayBySkill.get(row.skill_name);
369
+ if (existing) {
370
+ existing.check_count += row.check_count;
371
+ continue;
372
+ }
373
+ replayBySkill.set(row.skill_name, {
374
+ check_count: row.check_count,
375
+ latest_validation_mode: row.validation_mode ?? null,
376
+ });
377
+ }
378
+
379
+ const baselineRows = db
380
+ .query(
381
+ `SELECT skill_name, pass_rate, sample_size, measured_at
382
+ FROM grading_baselines
383
+ ORDER BY measured_at DESC`,
384
+ )
385
+ .all() as Array<{
386
+ skill_name: string;
387
+ pass_rate: number;
388
+ sample_size: number;
389
+ measured_at: string;
390
+ }>;
391
+ const baselineBySkill = new Map<
392
+ string,
393
+ { sample_size: number; pass_rate: number | null; measured_at: string | null }
394
+ >();
395
+ for (const row of baselineRows) {
396
+ if (baselineBySkill.has(row.skill_name)) continue;
397
+ baselineBySkill.set(row.skill_name, {
398
+ sample_size: row.sample_size,
399
+ pass_rate: row.pass_rate,
400
+ measured_at: row.measured_at,
401
+ });
402
+ }
403
+
404
+ const latestEvolutionRows = db
405
+ .query(
406
+ `SELECT skill_name, action, timestamp
407
+ FROM evolution_audit
408
+ WHERE skill_name IS NOT NULL
409
+ ORDER BY timestamp DESC`,
410
+ )
411
+ .all() as Array<{
412
+ skill_name: string;
413
+ action: string;
414
+ timestamp: string;
415
+ }>;
416
+ const latestEvolutionBySkill = new Map<
417
+ string,
418
+ { action: string | null; timestamp: string | null }
419
+ >();
420
+ for (const row of latestEvolutionRows) {
421
+ if (latestEvolutionBySkill.has(row.skill_name)) continue;
422
+ latestEvolutionBySkill.set(row.skill_name, {
423
+ action: row.action,
424
+ timestamp: row.timestamp,
425
+ });
426
+ }
427
+
428
+ const latestSkillPathRows = db
429
+ .query(
430
+ `SELECT skill_name, skill_path
431
+ FROM skill_invocations
432
+ WHERE skill_path IS NOT NULL AND skill_path != ''
433
+ ORDER BY occurred_at DESC`,
434
+ )
435
+ .all() as Array<{ skill_name: string; skill_path: string }>;
436
+ for (const row of latestSkillPathRows) {
437
+ if (!fallbackSkillPathBySkill.has(row.skill_name)) {
438
+ fallbackSkillPathBySkill.set(row.skill_name, row.skill_path);
439
+ }
440
+ }
441
+
442
+ const knownSkills = new Set<string>([
443
+ ...trustedRowsBySkill.keys(),
444
+ ...installedNames,
445
+ ...unitTestNames,
446
+ ...unitTestResultNames,
447
+ ...canonicalEvalNames,
448
+ ...evalEvidenceBySkill.keys(),
449
+ ...replayBySkill.keys(),
450
+ ...baselineBySkill.keys(),
451
+ ...fallbackSkillPathBySkill.keys(),
452
+ ]);
453
+
454
+ return {
455
+ knownSkills,
456
+ searchDirs,
457
+ trustedRowsBySkill,
458
+ evalEvidenceBySkill,
459
+ fallbackSkillPathBySkill,
460
+ replayBySkill,
461
+ baselineBySkill,
462
+ latestEvolutionBySkill,
463
+ };
464
+ }
465
+
466
/**
 * Builds the testing-readiness row for a single skill.
 *
 * Combines the precomputed lookups in `context` (trusted rows, eval evidence,
 * replay checks, baselines, latest evolution action, fallback skill paths)
 * with the skill's canonical eval-set file, unit-test file and last unit-test
 * result read from disk, then derives the next creator-loop step from them.
 *
 * @param skillName - Name of the skill to report on.
 * @param context - Shared lookups and search directories for this pass.
 * @returns The readiness row, or `null` when the skill is not in
 *   `context.knownSkills` and no installed path can be found for it.
 */
function buildSkillTestingReadinessRow(
  skillName: string,
  context: TestingReadinessContext,
): SkillTestingReadiness | null {
  const trustRows = context.trustedRowsBySkill.get(skillName) ?? [];
  const trustedTriggerCount = trustRows.filter((row) => row.triggered === 1).length;
  const trustedSessionCount = new Set(trustRows.map((row) => row.session_id)).size;

  const installedSkillPath = findInstalledSkillPath(skillName, context.searchDirs) ?? null;
  if (!context.knownSkills.has(skillName) && installedSkillPath == null) {
    return null;
  }

  // Prefer the installed location; otherwise use the path recorded in the context.
  const skillPath = installedSkillPath ?? context.fallbackSkillPathBySkill.get(skillName) ?? null;
  const evalReadiness = deriveEvalReadiness(skillPath, trustedTriggerCount);

  const canonicalEvalPath = getCanonicalEvalSetPath(skillName);
  const canonicalEvalEntries = readJsonArrayFile(canonicalEvalPath);
  const evidenceEval = context.evalEvidenceBySkill.get(skillName) ?? { count: 0, latestAt: null };
  const evalSetEntries =
    canonicalEvalEntries.length > 0 ? canonicalEvalEntries.length : evidenceEval.count;

  // Stat the file in one guarded call instead of existsSync() followed by
  // statSync(): the file can disappear between those two calls, and the
  // resulting throw would abort the whole row. A missing or unreadable file
  // simply means "no canonical timestamp".
  let canonicalEvalMtime: string | null = null;
  try {
    canonicalEvalMtime = statSync(canonicalEvalPath).mtime.toISOString();
  } catch {
    canonicalEvalMtime = null;
  }
  // NOTE(review): the file's mtime wins even when the file contributed zero
  // entries and the count above came from evidence — confirm this is intended.
  const latestEvalAt = canonicalEvalMtime ?? evidenceEval.latestAt ?? null;

  const unitTestPath = getUnitTestPath(skillName);
  const unitTestCases = readJsonArrayFile(unitTestPath).length;
  const unitTestResult = readUnitTestResult(getUnitTestResultPath(skillName));

  const replay = context.replayBySkill.get(skillName) ?? {
    check_count: 0,
    latest_validation_mode: null,
  };
  const baseline = context.baselineBySkill.get(skillName) ?? {
    sample_size: 0,
    pass_rate: null,
    measured_at: null,
  };
  const latestEvolution = context.latestEvolutionBySkill.get(skillName) ?? {
    action: null,
    timestamp: null,
  };

  // The first missing artifact, checked in this fixed order, decides the next
  // step; with everything present the skill is either already deployed (watch
  // it) or a candidate for deployment.
  let nextStep: CreatorLoopNextStep;
  if (evalSetEntries === 0) {
    nextStep = "generate_evals";
  } else if (unitTestCases === 0) {
    nextStep = "run_unit_tests";
  } else if (replay.check_count === 0) {
    nextStep = "run_replay_dry_run";
  } else if (baseline.sample_size === 0) {
    nextStep = "measure_baseline";
  } else if (latestEvolution.action === "deployed") {
    nextStep = "watch_deployment";
  } else {
    nextStep = "deploy_candidate";
  }

  const deploymentReadiness = deriveDeploymentReadiness(nextStep, latestEvolution.action);
  const deployment = summarizeDeploymentReadiness(deploymentReadiness, skillName, skillPath);
  const recommended_command = recommendCommand(skillName, skillPath, nextStep);
  const summary = summarizeReadiness(
    nextStep,
    evalReadiness,
    evalSetEntries,
    unitTestCases,
    replay.check_count,
    baseline.sample_size,
    unitTestResult?.pass_rate ?? null,
  );

  return {
    skill_name: skillName,
    eval_readiness: evalReadiness,
    next_step: nextStep,
    summary,
    recommended_command,
    skill_path: skillPath,
    trusted_trigger_count: trustedTriggerCount,
    trusted_session_count: trustedSessionCount,
    eval_set_entries: evalSetEntries,
    latest_eval_at: latestEvalAt,
    unit_test_cases: unitTestCases,
    unit_test_pass_rate: unitTestResult?.pass_rate ?? null,
    unit_test_ran_at: unitTestResult?.run_at ?? null,
    replay_check_count: replay.check_count,
    // Only the three recognised validation modes are passed through; any other
    // stored value is reported as null.
    latest_validation_mode:
      replay.latest_validation_mode === "host_replay" ||
      replay.latest_validation_mode === "llm_judge" ||
      replay.latest_validation_mode === "structural_guard"
        ? replay.latest_validation_mode
        : null,
    baseline_sample_size: baseline.sample_size,
    baseline_pass_rate: baseline.pass_rate,
    latest_baseline_at: baseline.measured_at,
    deployment_readiness: deploymentReadiness,
    deployment_summary: deployment.summary,
    deployment_command: deployment.command,
    latest_evolution_action: latestEvolution.action,
    latest_evolution_at: latestEvolution.timestamp,
  } satisfies SkillTestingReadiness;
}
567
+
568
/**
 * Rolls per-skill readiness rows up into a creator-level overview.
 *
 * @param readinessRows - Readiness rows, in the order they should be prioritised.
 * @returns An overview holding:
 *   - `counts`: how many rows sit at each next step,
 *   - `priorities`: the first five rows (input order) whose next step is not
 *     `"watch_deployment"`, reduced to name, step, summary and command,
 *   - `summary`: a one-line sentence built from the counts.
 */
export function buildCreatorTestingOverview(
  readinessRows: SkillTestingReadiness[],
): CreatorTestingOverview {
  const counts = {
    generate_evals: 0,
    run_unit_tests: 0,
    run_replay_dry_run: 0,
    measure_baseline: 0,
    deploy_candidate: 0,
    watch_deployment: 0,
  } satisfies CreatorTestingOverview["counts"];

  // Tally every row under its next step.
  readinessRows.forEach(({ next_step }) => {
    counts[next_step] += 1;
  });

  // Skills already under watch need no action, so they never appear as priorities.
  const actionable = readinessRows.filter(({ next_step }) => next_step !== "watch_deployment");
  const topActionable = actionable.slice(0, 5);
  const priorities = topActionable.map(
    ({ skill_name, next_step, summary, recommended_command }) => ({
      skill_name,
      next_step,
      summary,
      recommended_command,
    }),
  );

  const headline = `${counts.deploy_candidate} ready to deploy, ${counts.watch_deployment} already shipped and under watch, ${counts.generate_evals} still need evals, ${counts.run_unit_tests} need unit tests, ${counts.run_replay_dry_run} need replay dry-runs, ${counts.measure_baseline} need baselines.`;

  return { summary: headline, counts, priorities };
}