selftune 0.2.23 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219):
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +93 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  12. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  13. package/cli/selftune/auto-update.ts +200 -8
  14. package/cli/selftune/canonical-export.ts +55 -25
  15. package/cli/selftune/command-surface.ts +397 -0
  16. package/cli/selftune/contribute/contribute.ts +64 -13
  17. package/cli/selftune/contribution-config.ts +57 -3
  18. package/cli/selftune/contribution-preferences.ts +117 -0
  19. package/cli/selftune/contribution-signals.ts +8 -4
  20. package/cli/selftune/contribution-staging.ts +13 -2
  21. package/cli/selftune/contributions.ts +55 -121
  22. package/cli/selftune/creator-contributions.ts +29 -10
  23. package/cli/selftune/cron/setup.ts +7 -3
  24. package/cli/selftune/dashboard-contract.ts +73 -0
  25. package/cli/selftune/dashboard-server.ts +168 -17
  26. package/cli/selftune/dashboard.ts +350 -17
  27. package/cli/selftune/eval/baseline.ts +21 -5
  28. package/cli/selftune/eval/execution-eval.ts +170 -0
  29. package/cli/selftune/eval/family-overlap.ts +2 -2
  30. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  31. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  32. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  33. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  34. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  35. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  36. package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
  37. package/cli/selftune/evolution/evolve-body.ts +100 -39
  38. package/cli/selftune/evolution/evolve.ts +244 -52
  39. package/cli/selftune/evolution/rollback.ts +0 -1
  40. package/cli/selftune/evolution/validate-body.ts +68 -42
  41. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  42. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  43. package/cli/selftune/evolution/validate-routing.ts +43 -41
  44. package/cli/selftune/evolution/validation-contract.ts +91 -0
  45. package/cli/selftune/grading/auto-grade.ts +11 -7
  46. package/cli/selftune/grading/grade-session.ts +10 -16
  47. package/cli/selftune/index.ts +35 -10
  48. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  49. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  50. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  51. package/cli/selftune/ingestors/pi-ingest.ts +3 -2
  52. package/cli/selftune/init.ts +27 -3
  53. package/cli/selftune/localdb/direct-write.ts +35 -1
  54. package/cli/selftune/localdb/queries/cron.ts +34 -0
  55. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  56. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  57. package/cli/selftune/localdb/queries/execution.ts +133 -0
  58. package/cli/selftune/localdb/queries/json.ts +18 -0
  59. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  60. package/cli/selftune/localdb/queries/raw.ts +95 -0
  61. package/cli/selftune/localdb/queries/staging.ts +270 -0
  62. package/cli/selftune/localdb/queries/trust.ts +392 -0
  63. package/cli/selftune/localdb/queries.ts +60 -2288
  64. package/cli/selftune/localdb/schema.ts +21 -0
  65. package/cli/selftune/monitoring/watch.ts +96 -29
  66. package/cli/selftune/normalization.ts +3 -0
  67. package/cli/selftune/observability.ts +4 -2
  68. package/cli/selftune/orchestrate/cli.ts +161 -0
  69. package/cli/selftune/orchestrate/execute.ts +295 -0
  70. package/cli/selftune/orchestrate/finalize.ts +157 -0
  71. package/cli/selftune/orchestrate/locks.ts +40 -0
  72. package/cli/selftune/orchestrate/plan.ts +131 -0
  73. package/cli/selftune/orchestrate/post-run.ts +59 -0
  74. package/cli/selftune/orchestrate/prepare.ts +334 -0
  75. package/cli/selftune/orchestrate/report.ts +182 -0
  76. package/cli/selftune/orchestrate/runtime.ts +120 -0
  77. package/cli/selftune/orchestrate/signals.ts +48 -0
  78. package/cli/selftune/orchestrate.ts +150 -1173
  79. package/cli/selftune/repair/skill-usage.ts +5 -2
  80. package/cli/selftune/routes/overview.ts +5 -2
  81. package/cli/selftune/routes/skill-report.ts +15 -2
  82. package/cli/selftune/schedule.ts +5 -5
  83. package/cli/selftune/status.ts +39 -2
  84. package/cli/selftune/testing-readiness.ts +597 -0
  85. package/cli/selftune/types.ts +44 -4
  86. package/cli/selftune/uninstall.ts +2 -1
  87. package/cli/selftune/utils/canonical-log.ts +1 -9
  88. package/cli/selftune/utils/cli-error.ts +9 -0
  89. package/cli/selftune/utils/llm-call.ts +126 -6
  90. package/cli/selftune/utils/skill-discovery.ts +2 -0
  91. package/cli/selftune/workflows/proposals.ts +184 -0
  92. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  93. package/cli/selftune/workflows/workflows.ts +100 -26
  94. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  95. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  96. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  97. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  98. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
  99. package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
  100. package/package.json +25 -9
  101. package/packages/dashboard-core/AGENTS.md +18 -0
  102. package/packages/dashboard-core/README.md +30 -0
  103. package/packages/dashboard-core/index.ts +3 -0
  104. package/packages/dashboard-core/package.json +39 -0
  105. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  106. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  107. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  108. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  109. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  110. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  111. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  112. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  113. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  114. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  115. package/packages/dashboard-core/src/gates/index.ts +3 -0
  116. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  117. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  118. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  119. package/packages/dashboard-core/src/host/index.ts +3 -0
  120. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  121. package/packages/dashboard-core/src/models/index.ts +4 -0
  122. package/packages/dashboard-core/src/models/overview.ts +98 -0
  123. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  124. package/packages/dashboard-core/src/models/skills.ts +34 -0
  125. package/packages/dashboard-core/src/routes/index.ts +2 -0
  126. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  127. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  128. package/packages/dashboard-core/src/routes/types.ts +39 -0
  129. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  130. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  131. package/packages/dashboard-core/src/screens/index.ts +37 -0
  132. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  133. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  134. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  135. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  136. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  137. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  138. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  139. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  140. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  141. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  142. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  143. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  144. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  145. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  146. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  147. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  148. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  149. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  150. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  151. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  152. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  153. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  154. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  155. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  156. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  157. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  158. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  159. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  160. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  161. package/packages/telemetry-contract/src/schemas.ts +41 -1
  162. package/packages/telemetry-contract/src/types.ts +103 -2
  163. package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
  164. package/packages/ui/src/components/OverviewPanels.tsx +67 -26
  165. package/packages/ui/src/primitives/tabs.tsx +7 -6
  166. package/packages/ui/src/types.ts +10 -0
  167. package/skill/SKILL.md +130 -332
  168. package/skill/agents/diagnosis-analyst.md +3 -3
  169. package/skill/agents/evolution-reviewer.md +3 -3
  170. package/skill/agents/integration-guide.md +3 -3
  171. package/skill/agents/pattern-analyst.md +2 -2
  172. package/skill/references/cli-quick-reference.md +89 -0
  173. package/skill/references/creator-playbook.md +131 -0
  174. package/skill/references/examples.md +48 -0
  175. package/skill/references/troubleshooting.md +47 -0
  176. package/skill/references/version-history.md +1 -1
  177. package/skill/selftune.contribute.json +11 -0
  178. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  179. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  180. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  181. package/skill/workflows/CreateTestDeploy.md +170 -0
  182. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  183. package/skill/{Workflows → workflows}/Cron.md +1 -1
  184. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  185. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  186. package/skill/{Workflows → workflows}/Evals.md +67 -2
  187. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  188. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  189. package/skill/{Workflows → workflows}/Grade.md +1 -1
  190. package/skill/{Workflows → workflows}/Initialize.md +8 -4
  191. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  192. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  193. package/skill/workflows/SignalsDashboard.md +87 -0
  194. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  195. package/skill/{Workflows → workflows}/Watch.md +42 -2
  196. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  197. package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
  198. package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
  199. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  200. package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
  201. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  202. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  203. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  204. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  205. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  206. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  207. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  208. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  209. /package/skill/{Workflows → workflows}/Ingest.md +0 -0
  210. /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
  211. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  212. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  213. /package/skill/{Workflows → workflows}/Registry.md +0 -0
  214. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  215. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  216. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  217. /package/skill/{Workflows → workflows}/Sync.md +0 -0
  218. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  219. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -0,0 +1,597 @@
1
+ import type { Database } from "bun:sqlite";
2
+
3
+ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from "node:fs";
4
+ import { join } from "node:path";
5
+
6
+ import { SELFTUNE_CONFIG_DIR } from "./constants.js";
7
+ import type {
8
+ CreatorLoopNextStep,
9
+ CreatorTestingOverview,
10
+ DeploymentReadiness,
11
+ SkillEvalReadiness,
12
+ SkillTestingReadiness,
13
+ } from "./dashboard-contract.js";
14
+ import type { EvalEntry, UnitTestSuiteResult } from "./types.js";
15
+ import { queryEvolutionEvidence } from "./localdb/queries/evolution.js";
16
+ import { queryTrustedSkillObservationRows } from "./localdb/queries/trust.js";
17
+ import {
18
+ findInstalledSkillNames,
19
+ findInstalledSkillPath,
20
+ findRepositoryClaudeSkillDirs,
21
+ findRepositorySkillDirs,
22
+ } from "./utils/skill-discovery.js";
23
+
24
/**
 * Compact per-observation record kept for each skill while building the
 * testing-readiness context.
 */
interface TrustedSkillObservationSummary {
  /** Identifier of the session the observation belongs to. */
  session_id: string;
  /** Trigger flag copied from the observation row; `1` is counted as a trigger. */
  triggered: number;
}
28
+
29
/**
 * Pre-computed lookups shared by every per-skill readiness row so the
 * database and the filesystem are only scanned once per listing.
 */
interface TestingReadinessContext {
  /**
   * Skill names collected from trusted observations, installed skills,
   * unit-test and eval-set files, evolution evidence, replay results,
   * baselines, and fallback skill paths.
   */
  knownSkills: Set<string>;
  /** Directories handed to the installed-skill lookups. */
  searchDirs: string[];
  /** Trusted observation summaries grouped by skill name. */
  trustedRowsBySkill: Map<string, TrustedSkillObservationSummary[]>;
  /** Size and timestamp of the first evidence row per skill that carried a non-empty eval set. */
  evalEvidenceBySkill: Map<string, { count: number; latestAt: string | null }>;
  /** Skill path remembered from evolution evidence or skill invocations. */
  fallbackSkillPathBySkill: Map<string, string>;
  /** Total replay result rows per skill plus the validation mode of the newest group. */
  replayBySkill: Map<string, { check_count: number; latest_validation_mode: string | null }>;
  /** Most recently measured grading baseline per skill. */
  baselineBySkill: Map<
    string,
    { sample_size: number; pass_rate: number | null; measured_at: string | null }
  >;
  /** Most recent evolution audit action per skill. */
  latestEvolutionBySkill: Map<string, { action: string | null; timestamp: string | null }>;
}
42
+
43
/**
 * Resolves the selftune config directory.
 *
 * A non-empty `SELFTUNE_CONFIG_DIR` environment variable wins; otherwise the
 * imported `SELFTUNE_CONFIG_DIR` constant is used.
 */
function getConfigDir(): string {
  const override = process.env.SELFTUNE_CONFIG_DIR;
  return override ? override : SELFTUNE_CONFIG_DIR;
}
46
+
47
/** Returns the `eval-sets` directory inside the resolved config directory. */
function getEvalSetDir(): string {
  const configDir = getConfigDir();
  return join(configDir, "eval-sets");
}
50
+
51
/** Returns the `unit-tests` directory inside the resolved config directory. */
function getUnitTestDir(): string {
  const configDir = getConfigDir();
  return join(configDir, "unit-tests");
}
54
+
55
/**
 * Builds the path of the canonical eval-set file for a skill.
 *
 * @param skillName Skill whose eval set is being located.
 * @returns `<eval-sets dir>/<skillName>.json`.
 */
export function getCanonicalEvalSetPath(skillName: string): string {
  const evalSetDir = getEvalSetDir();
  const fileName = `${skillName}.json`;
  return join(evalSetDir, fileName);
}
58
+
59
/**
 * Builds the path of the saved unit-test definition file for a skill.
 *
 * @param skillName Skill whose unit tests are being located.
 * @returns `<unit-tests dir>/<skillName>.json`.
 */
export function getUnitTestPath(skillName: string): string {
  const unitTestDir = getUnitTestDir();
  const fileName = `${skillName}.json`;
  return join(unitTestDir, fileName);
}
62
+
63
/**
 * Builds the path of the last unit-test run result for a skill.
 *
 * @param skillName Skill whose last run is being located.
 * @returns `<unit-tests dir>/<skillName>.last-run.json`.
 */
export function getUnitTestResultPath(skillName: string): string {
  const unitTestDir = getUnitTestDir();
  const fileName = `${skillName}.last-run.json`;
  return join(unitTestDir, fileName);
}
66
+
67
/**
 * Persists an eval set as the canonical eval-set file for a skill, creating
 * the eval-set directory when it is missing.
 *
 * @param skillName Skill the eval set belongs to.
 * @param evalSet Entries to serialize as pretty-printed JSON.
 * @returns The path that was written.
 */
export function writeCanonicalEvalSet(skillName: string, evalSet: EvalEntry[]): string {
  const targetDir = getEvalSetDir();
  mkdirSync(targetDir, { recursive: true });

  const targetPath = getCanonicalEvalSetPath(skillName);
  const serialized = JSON.stringify(evalSet, null, 2);
  writeFileSync(targetPath, serialized, "utf-8");
  return targetPath;
}
73
+
74
/**
 * Persists the result of a unit-test run for a skill, creating the unit-test
 * directory when it is missing.
 *
 * @param skillName Skill the run belongs to.
 * @param suite Suite result to serialize as pretty-printed JSON.
 * @returns The path that was written.
 */
export function writeUnitTestRunResult(skillName: string, suite: UnitTestSuiteResult): string {
  const targetDir = getUnitTestDir();
  mkdirSync(targetDir, { recursive: true });

  const targetPath = getUnitTestResultPath(skillName);
  const serialized = JSON.stringify(suite, null, 2);
  writeFileSync(targetPath, serialized, "utf-8");
  return targetPath;
}
80
+
81
/**
 * Reads a JSON file that is expected to hold an array.
 *
 * @param path File to read.
 * @returns The parsed array, or an empty array when the file is missing,
 *   unreadable, not valid JSON, or not an array.
 */
function readJsonArrayFile(path: string): unknown[] {
  let decoded: unknown = null;
  try {
    if (existsSync(path)) {
      decoded = JSON.parse(readFileSync(path, "utf-8"));
    }
  } catch {
    // Best effort: any read or parse failure is treated as "no entries".
    decoded = null;
  }
  if (Array.isArray(decoded)) return decoded;
  return [];
}
90
+
91
/**
 * Loads a stored unit-test run result and checks its headline fields.
 *
 * The file is accepted only when `skill_name` and `run_at` are strings and
 * `total`, `passed`, `failed`, and `pass_rate` are numbers.
 *
 * @param path Result file to read.
 * @returns The parsed result, or `null` when the file is missing, unreadable,
 *   not valid JSON, or fails the field checks.
 */
function readUnitTestResult(path: string): UnitTestSuiteResult | null {
  try {
    if (!existsSync(path)) return null;

    const candidate = JSON.parse(readFileSync(path, "utf-8")) as Partial<UnitTestSuiteResult>;
    if (candidate == null || typeof candidate !== "object") return null;

    const stringFields = [candidate.skill_name, candidate.run_at];
    const numberFields = [
      candidate.total,
      candidate.passed,
      candidate.failed,
      candidate.pass_rate,
    ];
    const shapeOk =
      stringFields.every((value) => typeof value === "string") &&
      numberFields.every((value) => typeof value === "number");

    return shapeOk ? (candidate as UnitTestSuiteResult) : null;
  } catch {
    // Best effort: an unreadable or malformed result counts as "no result".
    return null;
  }
}
111
+
112
/**
 * Lists the directories that are searched for installed skills.
 *
 * The list starts with the repository-level skill directories discovered from
 * the current working directory, followed by the user-level `.agents`,
 * `.claude`, and Codex skill directories.
 *
 * @returns Candidate skill directories, in lookup order.
 */
function getSkillSearchDirs(): string[] {
  const cwd = process.cwd();
  // HOME is normally unset on Windows, where USERPROFILE carries the home
  // directory. Empty values are treated as unset (as getConfigDir does) so the
  // user-level entries do not silently collapse to root-relative paths.
  const homeDir = process.env.HOME || process.env.USERPROFILE || "";
  const codexHome = process.env.CODEX_HOME || `${homeDir}/.codex`;
  return [
    ...findRepositorySkillDirs(cwd),
    ...findRepositoryClaudeSkillDirs(cwd),
    `${homeDir}/.agents/skills`,
    `${homeDir}/.claude/skills`,
    `${codexHome}/skills`,
  ];
}
124
+
125
/**
 * Collects skill names from the entries of a directory.
 *
 * @param dir Directory whose entry names are inspected.
 * @param matcher Maps an entry name to a skill name, or returns `null` (or an
 *   empty string) to skip the entry.
 * @returns The distinct skill names found. The set is empty when the directory
 *   does not exist, and holds whatever was collected so far when listing or
 *   matching throws.
 */
function scanSkillNamesFromDir(
  dir: string,
  matcher: (entryName: string) => string | null,
): Set<string> {
  const found = new Set<string>();
  if (existsSync(dir)) {
    try {
      readdirSync(dir).forEach((entryName) => {
        const skillName = matcher(entryName);
        if (skillName) found.add(skillName);
      });
    } catch {
      // Best effort: keep the names gathered before the failure.
    }
  }
  return found;
}
141
+
142
/**
 * Classifies how eval generation can be bootstrapped for a skill.
 *
 * @param skillPath Resolved skill path, or `null` when none is known.
 * @param trustedTriggerCount Number of trusted observations that triggered.
 * @returns `"log_ready"` when trusted triggers exist, otherwise
 *   `"cold_start_ready"` when a skill path is known, else `"telemetry_only"`.
 */
function deriveEvalReadiness(
  skillPath: string | null,
  trustedTriggerCount: number,
): SkillEvalReadiness {
  const hasTrustedTriggers = trustedTriggerCount > 0;
  if (!hasTrustedTriggers) {
    return skillPath ? "cold_start_ready" : "telemetry_only";
  }
  return "log_ready";
}
150
+
151
/**
 * Picks the value shown after `--skill-path` in suggested commands.
 *
 * @param skillPath Resolved skill path, or `null` when none is known.
 * @param skillName Skill name used to build the placeholder path.
 * @returns The skill path, or a `/path/to/skills/<name>/SKILL.md` placeholder.
 */
function formatSkillPathArg(skillPath: string | null, skillName: string): string {
  if (skillPath == null) {
    return `/path/to/skills/${skillName}/SKILL.md`;
  }
  return skillPath;
}
154
+
155
/**
 * Produces the CLI command suggested for a skill's next creator-loop step.
 *
 * @param skillName Skill the command targets.
 * @param skillPath Resolved skill path, or `null` to use a placeholder path.
 * @param nextStep Step the skill should take next.
 * @returns The command line to show to the user.
 */
function recommendCommand(
  skillName: string,
  skillPath: string | null,
  nextStep: CreatorLoopNextStep,
): string {
  // A placeholder path keeps the command copyable when no path is known.
  const pathArg = skillPath == null ? `/path/to/skills/${skillName}/SKILL.md` : skillPath;

  switch (nextStep) {
    case "watch_deployment":
      return `selftune watch --skill ${skillName}`;
    case "deploy_candidate":
      return `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`;
    case "measure_baseline":
      return `selftune grade baseline --skill ${skillName} --skill-path ${pathArg}`;
    case "run_replay_dry_run":
      return `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`;
    case "run_unit_tests":
      return `selftune eval unit-test --skill ${skillName} --generate --skill-path ${pathArg}`;
    case "generate_evals": {
      // Synthetic generation is only suggested when a skill path is known.
      if (!skillPath) {
        return `selftune eval generate --skill ${skillName} --skill-path ${pathArg}`;
      }
      return `selftune eval generate --skill ${skillName} --auto-synthetic --skill-path ${pathArg}`;
    }
  }
}
178
+
179
/**
 * Writes the one-paragraph explanation shown next to a skill's next step.
 *
 * @param nextStep Step the skill should take next.
 * @param evalReadiness How eval generation can be bootstrapped.
 * @param evalSetEntries Number of eval entries available.
 * @param unitTestCases Number of saved unit-test cases.
 * @param replayCheckCount Number of recorded replay checks.
 * @param baselineSampleSize Sample size of the latest baseline (0 when none).
 * @param unitTestPassRate Pass rate of the last unit-test run as a fraction,
 *   or `null` when no run is recorded.
 * @returns Human-readable readiness summary.
 */
function summarizeReadiness(
  nextStep: CreatorLoopNextStep,
  evalReadiness: SkillEvalReadiness,
  evalSetEntries: number,
  unitTestCases: number,
  replayCheckCount: number,
  baselineSampleSize: number,
  unitTestPassRate: number | null,
): string {
  // Shared suffix of the two deployment-stage messages.
  const baselineNote =
    baselineSampleSize > 0 ? ` Latest baseline used ${baselineSampleSize} samples.` : "";

  switch (nextStep) {
    case "watch_deployment":
      return `A candidate has already been deployed for this skill. Keep watching live traffic and baseline lift before making another mutation.${baselineNote}`;
    case "deploy_candidate":
      return `Evals, unit tests, replay validation, and a baseline are all present. Ready to run a live evolve and deploy a watched candidate.${baselineNote}`;
    case "measure_baseline":
      return `Replay-backed validation exists (${replayCheckCount} recorded checks), but no stored no-skill baseline exists yet.`;
    case "run_replay_dry_run": {
      let passRateText = "";
      if (unitTestPassRate != null) {
        passRateText = ` Last unit-test run passed ${Math.round(unitTestPassRate * 100)}%.`;
      }
      return `Unit tests are present (${unitTestCases} cases), but replay-backed dry-run validation has not been recorded yet.${passRateText}`;
    }
    case "run_unit_tests":
      return `Eval coverage is present (${evalSetEntries} entries), but no unit test file is saved yet.`;
    case "generate_evals":
      switch (evalReadiness) {
        case "log_ready":
          return "Trusted telemetry exists, but no canonical eval set is saved yet.";
        case "cold_start_ready":
          return "Installed locally but still cold-start. Generate synthetic evals before you evolve it.";
        default:
          return "Telemetry exists, but selftune cannot resolve a local SKILL.md yet. Point it at the skill and generate evals.";
      }
  }
}
214
+
215
/** Sort rank of each creator-loop step; earlier steps get lower numbers. */
const NEXT_STEP_PRIORITY = new Map<CreatorLoopNextStep, number>([
  ["generate_evals", 0],
  ["run_unit_tests", 1],
  ["run_replay_dry_run", 2],
  ["measure_baseline", 3],
  ["deploy_candidate", 4],
  ["watch_deployment", 5],
]);

/**
 * Ranks a creator-loop step so rows can be ordered from least to most advanced.
 *
 * @param step Step to rank.
 * @returns 0 for `generate_evals` up to 5 for `watch_deployment`.
 */
function nextStepPriority(step: CreatorLoopNextStep): number {
  // Every member of the step union is present in the table above.
  return NEXT_STEP_PRIORITY.get(step) as number;
}
231
+
232
/**
 * Derives the deployment state of a skill from its next step and its most
 * recent evolution action.
 *
 * @param nextStep Step the skill should take next.
 * @param latestEvolutionAction Latest recorded evolution action, if any.
 * @returns `"blocked"` before the deploy stage; otherwise `"rolled_back"`,
 *   `"watching"`, or `"ready_to_deploy"`.
 */
function deriveDeploymentReadiness(
  nextStep: CreatorLoopNextStep,
  latestEvolutionAction: string | null,
): DeploymentReadiness {
  const isWatching = nextStep === "watch_deployment";
  const atDeployStage = isWatching || nextStep === "deploy_candidate";

  if (!atDeployStage) return "blocked";
  // A rollback takes precedence over any watch/deploy signal.
  if (latestEvolutionAction === "rolled_back") return "rolled_back";

  return isWatching || latestEvolutionAction === "deployed" ? "watching" : "ready_to_deploy";
}
247
+
248
/**
 * Describes a deployment state and, where one applies, the command to run.
 *
 * @param deploymentReadiness Deployment state to describe.
 * @param skillName Skill the command targets.
 * @param skillPath Resolved skill path, or `null` to use a placeholder path.
 * @returns Summary text plus a suggested command (`null` while blocked).
 */
function summarizeDeploymentReadiness(
  deploymentReadiness: DeploymentReadiness,
  skillName: string,
  skillPath: string | null,
): { summary: string; command: string | null } {
  // A placeholder path keeps the command copyable when no path is known.
  const pathArg = skillPath == null ? `/path/to/skills/${skillName}/SKILL.md` : skillPath;

  switch (deploymentReadiness) {
    case "rolled_back": {
      const summary =
        "The last deployment rolled back. Review the failure evidence, rerun a replay dry-run if needed, then redeploy once the candidate is trustworthy again.";
      const command = `selftune evolve --skill ${skillName} --skill-path ${pathArg} --dry-run --validation-mode replay`;
      return { summary, command };
    }
    case "watching": {
      const summary =
        "A candidate is already deployed. Keep watching live trigger behavior and baseline lift before making another mutation.";
      const command = `selftune watch --skill ${skillName}`;
      return { summary, command };
    }
    case "ready_to_deploy": {
      const summary =
        "Tests and baseline are in place. Run a live evolve so selftune can validate and deploy the strongest candidate.";
      const command = `selftune evolve --skill ${skillName} --skill-path ${pathArg} --with-baseline`;
      return { summary, command };
    }
    case "blocked": {
      const summary = "Finish the creator test loop before shipping this skill.";
      return { summary, command: null };
    }
  }
}
280
+
281
/**
 * Builds the testing-readiness row of every known skill.
 *
 * Rows are ordered by next-step priority (earliest step first), then by
 * trusted session count (highest first), then by skill name.
 *
 * @param db Local selftune database.
 * @param searchDirs Directories searched for installed skills.
 * @returns Readiness rows; skills that produce no row are omitted.
 */
export function listSkillTestingReadiness(
  db: Database,
  searchDirs: string[] = getSkillSearchDirs(),
): SkillTestingReadiness[] {
  const context = buildTestingReadinessContext(db, searchDirs);

  const orderedNames = [...context.knownSkills];
  orderedNames.sort((left, right) => left.localeCompare(right));

  const rows: SkillTestingReadiness[] = [];
  for (const skillName of orderedNames) {
    const row = buildSkillTestingReadinessRow(skillName, context);
    if (row != null) rows.push(row);
  }

  const compareRows = (left: SkillTestingReadiness, right: SkillTestingReadiness): number => {
    const byPriority = nextStepPriority(left.next_step) - nextStepPriority(right.next_step);
    if (byPriority !== 0) return byPriority;

    const byTrustedSessions = right.trusted_session_count - left.trusted_session_count;
    if (byTrustedSessions !== 0) return byTrustedSessions;

    return left.skill_name.localeCompare(right.skill_name);
  };
  rows.sort(compareRows);
  return rows;
}
299
+
300
/**
 * Builds the testing-readiness row of a single skill.
 *
 * @param db Local selftune database.
 * @param skillName Skill to report on.
 * @param searchDirs Directories searched for installed skills.
 * @returns The readiness row, or `null` when no row is produced for the skill.
 */
export function getSkillTestingReadiness(
  db: Database,
  skillName: string,
  searchDirs: string[] = getSkillSearchDirs(),
): SkillTestingReadiness | null {
  const context = buildTestingReadinessContext(db, searchDirs);
  return buildSkillTestingReadinessRow(skillName, context);
}
307
+
308
/**
 * Gathers everything needed to compute readiness rows: trusted observations,
 * installed skills, saved eval/unit-test files, evolution evidence, replay
 * results, baselines, the latest evolution action, and fallback skill paths.
 *
 * For the per-skill "latest" lookups the first row seen for a skill wins, so
 * each lookup depends on the ordering of its source rows.
 *
 * @param db Local selftune database.
 * @param searchDirs Directories searched for installed skills.
 * @returns The assembled lookup context.
 */
function buildTestingReadinessContext(db: Database, searchDirs: string[]): TestingReadinessContext {
  type ReplayRow = {
    skill_name: string;
    validation_mode: string;
    check_count: number;
    latest_id: number;
  };
  type BaselineRow = {
    skill_name: string;
    pass_rate: number;
    sample_size: number;
    measured_at: string;
  };
  type EvolutionRow = {
    skill_name: string;
    action: string;
    timestamp: string;
  };
  type SkillPathRow = { skill_name: string; skill_path: string };

  // Trusted observations, bucketed per skill in query order.
  const trustedRowsBySkill = new Map<string, TrustedSkillObservationSummary[]>();
  for (const observation of queryTrustedSkillObservationRows(db)) {
    const summary = { session_id: observation.session_id, triggered: observation.triggered };
    const bucket = trustedRowsBySkill.get(observation.skill_name);
    if (bucket === undefined) trustedRowsBySkill.set(observation.skill_name, [summary]);
    else bucket.push(summary);
  }

  // Skill names implied by installed skills and by saved files on disk.
  const installedNames = findInstalledSkillNames(searchDirs);
  const unitTestDir = getUnitTestDir();
  const evalSetDir = getEvalSetDir();
  const jsonSuffix = ".json";
  const lastRunSuffix = ".last-run.json";
  const dropSuffix = (entry: string, suffix: string): string => entry.slice(0, -suffix.length);

  const unitTestNames = scanSkillNamesFromDir(unitTestDir, (entry) =>
    // Last-run files live in the same directory and must not count as suites.
    entry.endsWith(jsonSuffix) && !entry.endsWith(lastRunSuffix)
      ? dropSuffix(entry, jsonSuffix)
      : null,
  );
  const unitTestResultNames = scanSkillNamesFromDir(unitTestDir, (entry) =>
    entry.endsWith(lastRunSuffix) ? dropSuffix(entry, lastRunSuffix) : null,
  );
  const canonicalEvalNames = scanSkillNamesFromDir(evalSetDir, (entry) =>
    entry.endsWith(jsonSuffix) ? dropSuffix(entry, jsonSuffix) : null,
  );

  // Evolution evidence: first row with a non-empty eval set and first row with
  // a skill path win. NOTE(review): presumably the query returns newest rows
  // first — confirm against queryEvolutionEvidence.
  const evalEvidenceBySkill = new Map<string, { count: number; latestAt: string | null }>();
  const fallbackSkillPathBySkill = new Map<string, string>();
  for (const evidence of queryEvolutionEvidence(db)) {
    const evalCount = evidence.eval_set ? evidence.eval_set.length : 0;
    if (evalCount > 0 && !evalEvidenceBySkill.has(evidence.skill_name)) {
      evalEvidenceBySkill.set(evidence.skill_name, {
        count: evalCount,
        latestAt: evidence.timestamp,
      });
    }
    if (evidence.skill_path && !fallbackSkillPathBySkill.has(evidence.skill_name)) {
      fallbackSkillPathBySkill.set(evidence.skill_name, evidence.skill_path);
    }
  }

  // Replay results: one row per (skill, validation mode), newest group first.
  const replayRows = db
    .query(
      `SELECT skill_name, validation_mode, COUNT(*) AS check_count, MAX(id) AS latest_id
       FROM replay_entry_results
       GROUP BY skill_name, validation_mode
       ORDER BY latest_id DESC`,
    )
    .all() as ReplayRow[];
  const replayBySkill = new Map<
    string,
    { check_count: number; latest_validation_mode: string | null }
  >();
  for (const group of replayRows) {
    const aggregate = replayBySkill.get(group.skill_name);
    if (aggregate === undefined) {
      // The first group seen carries the highest id, so its mode is the latest.
      replayBySkill.set(group.skill_name, {
        check_count: group.check_count,
        latest_validation_mode: group.validation_mode ?? null,
      });
    } else {
      aggregate.check_count += group.check_count;
    }
  }

  // Baselines: most recently measured row per skill.
  const baselineRows = db
    .query(
      `SELECT skill_name, pass_rate, sample_size, measured_at
       FROM grading_baselines
       ORDER BY measured_at DESC`,
    )
    .all() as BaselineRow[];
  const baselineBySkill = new Map<
    string,
    { sample_size: number; pass_rate: number | null; measured_at: string | null }
  >();
  for (const baseline of baselineRows) {
    if (!baselineBySkill.has(baseline.skill_name)) {
      baselineBySkill.set(baseline.skill_name, {
        sample_size: baseline.sample_size,
        pass_rate: baseline.pass_rate,
        measured_at: baseline.measured_at,
      });
    }
  }

  // Evolution audit: most recent action per skill.
  const latestEvolutionRows = db
    .query(
      `SELECT skill_name, action, timestamp
       FROM evolution_audit
       WHERE skill_name IS NOT NULL
       ORDER BY timestamp DESC`,
    )
    .all() as EvolutionRow[];
  const latestEvolutionBySkill = new Map<
    string,
    { action: string | null; timestamp: string | null }
  >();
  for (const audit of latestEvolutionRows) {
    if (!latestEvolutionBySkill.has(audit.skill_name)) {
      latestEvolutionBySkill.set(audit.skill_name, {
        action: audit.action,
        timestamp: audit.timestamp,
      });
    }
  }

  // Invocation paths only fill gaps left by the evolution evidence.
  const latestSkillPathRows = db
    .query(
      `SELECT skill_name, skill_path
       FROM skill_invocations
       WHERE skill_path IS NOT NULL AND skill_path != ''
       ORDER BY occurred_at DESC`,
    )
    .all() as SkillPathRow[];
  for (const invocation of latestSkillPathRows) {
    if (fallbackSkillPathBySkill.has(invocation.skill_name)) continue;
    fallbackSkillPathBySkill.set(invocation.skill_name, invocation.skill_path);
  }

  // Union of every name source, inserted in a fixed order.
  const nameSources: Array<Iterable<string>> = [
    trustedRowsBySkill.keys(),
    installedNames,
    unitTestNames,
    unitTestResultNames,
    canonicalEvalNames,
    evalEvidenceBySkill.keys(),
    replayBySkill.keys(),
    baselineBySkill.keys(),
    fallbackSkillPathBySkill.keys(),
  ];
  const knownSkills = new Set<string>();
  for (const source of nameSources) {
    for (const name of source) knownSkills.add(name);
  }

  return {
    knownSkills,
    searchDirs,
    trustedRowsBySkill,
    evalEvidenceBySkill,
    fallbackSkillPathBySkill,
    replayBySkill,
    baselineBySkill,
    latestEvolutionBySkill,
  };
}
465
+
466
/**
 * Builds the testing-readiness row for a single skill.
 *
 * Combines the per-skill maps carried on `context` (trusted rows, eval evidence,
 * replay checks, baseline, latest evolution) with files located by skill name
 * (canonical eval set, unit tests, unit-test result) and selects the
 * `CreatorLoopNextStep` for the skill.
 *
 * @param skillName - Skill to summarize.
 * @param context - Lookup maps, the known-skill set, and the skill search directories.
 * @returns The readiness row, or `null` when the skill is not in
 *   `context.knownSkills` and `findInstalledSkillPath` finds no installed copy.
 */
+ function buildSkillTestingReadinessRow(
467
+ skillName: string,
468
+ context: TestingReadinessContext,
469
+ ): SkillTestingReadiness | null {
470
// A skill with no trusted rows falls back to an empty list, so both counts below are 0.
+ const trustRows = context.trustedRowsBySkill.get(skillName) ?? [];
471
+ const trustedTriggerCount = trustRows.filter((row) => row.triggered === 1).length;
472
// Distinct session ids, regardless of whether the row triggered.
+ const trustedSessionCount = new Set(trustRows.map((row) => row.session_id)).size;
473
+
474
+ const installedSkillPath = findInstalledSkillPath(skillName, context.searchDirs) ?? null;
475
// Neither known to the context nor installed on disk: there is nothing to report.
+ if (!context.knownSkills.has(skillName) && installedSkillPath == null) {
476
+ return null;
477
+ }
478
+
479
// Prefer the installed path; otherwise the fallback path held in context; otherwise null.
+ const skillPath = installedSkillPath ?? context.fallbackSkillPathBySkill.get(skillName) ?? null;
480
+ const evalReadiness = deriveEvalReadiness(skillPath, trustedTriggerCount);
481
+
482
+ const canonicalEvalPath = getCanonicalEvalSetPath(skillName);
483
+ const canonicalEvalEntries = readJsonArrayFile(canonicalEvalPath);
484
+ const canonicalEvalStat = existsSync(canonicalEvalPath) ? statSync(canonicalEvalPath) : null;
485
+ const evidenceEval = context.evalEvidenceBySkill.get(skillName) ?? { count: 0, latestAt: null };
486
// Entry count: the canonical eval file wins when it has entries; otherwise the evidence count is used.
+ const evalSetEntries =
487
+ canonicalEvalEntries.length > 0 ? canonicalEvalEntries.length : evidenceEval.count;
488
// NOTE(review): the canonical file's mtime is used whenever the file exists, even if it
// yielded zero entries and the count above came from evidence — confirm this pairing is intended.
+ const latestEvalAt = canonicalEvalStat?.mtime.toISOString?.() ?? evidenceEval.latestAt ?? null;
489
+
490
+ const unitTestPath = getUnitTestPath(skillName);
491
+ const unitTestCases = readJsonArrayFile(unitTestPath).length;
492
+ const unitTestResult = readUnitTestResult(getUnitTestResultPath(skillName));
493
+
494
// Missing replay, baseline, or evolution data defaults to zero counts and null fields.
+ const replay = context.replayBySkill.get(skillName) ?? {
495
+ check_count: 0,
496
+ latest_validation_mode: null,
497
+ };
498
+ const baseline = context.baselineBySkill.get(skillName) ?? {
499
+ sample_size: 0,
500
+ pass_rate: null,
501
+ measured_at: null,
502
+ };
503
+ const latestEvolution = context.latestEvolutionBySkill.get(skillName) ?? {
504
+ action: null,
505
+ timestamp: null,
506
+ };
507
+
508
// The first unmet prerequisite wins, checked in order: evals, unit tests, replay, baseline.
// When all four are present, the latest evolution action decides between watching an
// existing deployment and deploying a candidate.
+ let nextStep: CreatorLoopNextStep;
509
+ if (evalSetEntries === 0) {
510
+ nextStep = "generate_evals";
511
+ } else if (unitTestCases === 0) {
512
+ nextStep = "run_unit_tests";
513
+ } else if (replay.check_count === 0) {
514
+ nextStep = "run_replay_dry_run";
515
+ } else if (baseline.sample_size === 0) {
516
+ nextStep = "measure_baseline";
517
+ } else if (latestEvolution.action === "deployed") {
518
+ nextStep = "watch_deployment";
519
+ } else {
520
+ nextStep = "deploy_candidate";
521
+ }
522
+
523
+ const deploymentReadiness = deriveDeploymentReadiness(nextStep, latestEvolution.action);
524
+ const deployment = summarizeDeploymentReadiness(deploymentReadiness, skillName, skillPath);
525
+ const recommended_command = recommendCommand(skillName, skillPath, nextStep);
526
+ const summary = summarizeReadiness(
527
+ nextStep,
528
+ evalReadiness,
529
+ evalSetEntries,
530
+ unitTestCases,
531
+ replay.check_count,
532
+ baseline.sample_size,
533
+ unitTestResult?.pass_rate ?? null,
534
+ );
535
+
536
+ return {
537
+ skill_name: skillName,
538
+ eval_readiness: evalReadiness,
539
+ next_step: nextStep,
540
+ summary,
541
+ recommended_command,
542
+ skill_path: skillPath,
543
+ trusted_trigger_count: trustedTriggerCount,
544
+ trusted_session_count: trustedSessionCount,
545
+ eval_set_entries: evalSetEntries,
546
+ latest_eval_at: latestEvalAt,
547
+ unit_test_cases: unitTestCases,
548
+ unit_test_pass_rate: unitTestResult?.pass_rate ?? null,
549
+ unit_test_ran_at: unitTestResult?.run_at ?? null,
550
+ replay_check_count: replay.check_count,
551
// Only the three recognized mode strings pass through; any other stored value is reported as null.
+ latest_validation_mode:
552
+ replay.latest_validation_mode === "host_replay" ||
553
+ replay.latest_validation_mode === "llm_judge" ||
554
+ replay.latest_validation_mode === "structural_guard"
555
+ ? replay.latest_validation_mode
556
+ : null,
557
+ baseline_sample_size: baseline.sample_size,
558
+ baseline_pass_rate: baseline.pass_rate,
559
+ latest_baseline_at: baseline.measured_at,
560
+ deployment_readiness: deploymentReadiness,
561
+ deployment_summary: deployment.summary,
562
+ deployment_command: deployment.command,
563
+ latest_evolution_action: latestEvolution.action,
564
+ latest_evolution_at: latestEvolution.timestamp,
565
+ } satisfies SkillTestingReadiness;
566
+ }
567
+
568
/**
 * Aggregates per-skill readiness rows into the creator testing overview.
 *
 * @param readinessRows - Readiness rows to aggregate; their order decides which
 *   rows end up in `priorities`.
 * @returns `counts` (rows per next step), `priorities` (up to five rows, in input
 *   order, whose next step is not "watch_deployment"), and `summary` (a one-sentence
 *   rendering of the counts).
 */
+ export function buildCreatorTestingOverview(
569
+ readinessRows: SkillTestingReadiness[],
570
+ ): CreatorTestingOverview {
571
// One counter per next-step value, all starting at zero.
+ const counts = {
572
+ generate_evals: 0,
573
+ run_unit_tests: 0,
574
+ run_replay_dry_run: 0,
575
+ measure_baseline: 0,
576
+ deploy_candidate: 0,
577
+ watch_deployment: 0,
578
+ } satisfies CreatorTestingOverview["counts"];
579
+
580
// Tally each row under its next step.
+ for (const row of readinessRows) {
581
+ counts[row.next_step]++;
582
+ }
583
+
584
// No sorting happens here: the first five rows not already under watch are taken as given.
+ const priorities = readinessRows
585
+ .filter((row) => row.next_step !== "watch_deployment")
586
+ .slice(0, 5)
587
+ .map((row) => ({
588
+ skill_name: row.skill_name,
589
+ next_step: row.next_step,
590
+ summary: row.summary,
591
+ recommended_command: row.recommended_command,
592
+ }));
593
+
594
+ const summary = `${counts.deploy_candidate} ready to deploy, ${counts.watch_deployment} already shipped and under watch, ${counts.generate_evals} still need evals, ${counts.run_unit_tests} need unit tests, ${counts.run_replay_dry_run} need replay dry-runs, ${counts.measure_baseline} need baselines.`;
595
+
596
+ return { summary, counts, priorities };
597
+ }
@@ -12,6 +12,8 @@ export interface AlphaIdentity {
12
12
  cloud_user_id?: string;
13
13
  /** Cloud-issued org ID. Set during device-code approval. */
14
14
  cloud_org_id?: string;
15
+ /** Optional override for cloud API base URL. */
16
+ cloud_api_url?: string;
15
17
  /** Cached email from cloud account. Not authoritative. */
16
18
  email?: string;
17
19
  /** Cached display name from cloud account. Not authoritative. */
@@ -134,6 +136,7 @@ export type {
134
136
  CanonicalRecordKind,
135
137
  CanonicalSchemaVersion,
136
138
  CanonicalSessionRecord,
139
+ CanonicalSessionRecordBase,
137
140
  CanonicalSkillInvocationRecord,
138
141
  CanonicalSourceSessionKind,
139
142
  } from "@selftune/telemetry-contract/types";
@@ -167,7 +170,7 @@ export interface TranscriptMetrics {
167
170
  total_tool_calls: number;
168
171
  bash_commands: string[];
169
172
  skills_triggered: string[];
170
- skills_invoked: string[];
173
+ skills_invoked?: string[];
171
174
  assistant_turns: number;
172
175
  errors_encountered: number;
173
176
  transcript_chars: number;
@@ -247,6 +250,40 @@ export interface EvalEntry {
247
250
  query: string;
248
251
  should_trigger: boolean;
249
252
  invocation_type?: InvocationType;
253
+ /** Provenance: where this eval entry originated */
254
+ source?: "synthetic" | "log" | "blended";
255
+ /** ISO timestamp when this eval entry was created */
256
+ created_at?: string;
257
+ }
258
+
259
+ /** Experimental execution eval entry — extends trigger evals with assertion-based validation. */
260
// Adds two required fields (`assertions`, `experimental`) and one optional field
// (`requires_workspace`) on top of everything inherited from EvalEntry.
+ export interface ExecutionEvalEntry extends EvalEntry {
261
+ /** Assertions to verify against the execution result */
262
+ assertions: ExecutionAssertion[];
263
+ /** Whether this entry requires a staged workspace */
264
+ requires_workspace?: boolean;
265
+ /** Experimental flag — must be explicitly opted into */
266
// Typed as the literal `true`: omitting the field or setting it to `false` does not type-check.
+ experimental: true;
267
+ }
268
+
269
// A single check attached to an ExecutionEvalEntry. `type` selects the kind of check;
// `target` and `expected` are interpreted according to `type`.
// NOTE(review): how a "custom" assertion is evaluated is not visible in this declaration — confirm against the evaluator.
+ export interface ExecutionAssertion {
270
+ /** What to check: file existence, content match, command output, etc. */
271
+ type: "file_exists" | "file_contains" | "command_output" | "skill_triggered" | "custom";
272
+ /** Target path, command, or skill name depending on type */
273
+ target: string;
274
+ /** Expected value or pattern (regex for content/output checks) */
275
+ expected?: string;
276
+ /** Whether the assertion is negated (must NOT match) */
277
+ negated?: boolean;
278
+ }
279
+
280
// Summary counters for a set of eval entries.
// NOTE(review): `synthetic` / `log` / `blended` presumably count entries by the matching
// EvalEntry `source` value, and `oldest` / `newest` presumably hold `created_at`
// timestamps — confirm against the code that produces this object.
+ export interface EvalSourceStats {
281
+ total: number;
282
+ synthetic: number;
283
+ log: number;
284
+ blended: number;
285
+ oldest?: string;
286
+ newest?: string;
250
287
  }
251
288
 
252
289
  // ---------------------------------------------------------------------------
@@ -414,12 +451,14 @@ export interface EvolutionEvidenceValidation {
414
451
  regressions?: EvalEntry[] | string[];
415
452
  new_passes?: EvalEntry[];
416
453
  per_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
454
+ before_entry_results?: Array<{ entry: EvalEntry; before_pass: boolean; after_pass: boolean }>;
417
455
  gates_passed?: number;
418
456
  gates_total?: number;
419
457
  gate_results?: Array<{ gate: ValidationGate; passed: boolean; reason: string }>;
420
458
  validation_mode?: ValidationMode;
421
459
  validation_agent?: string;
422
460
  validation_fixture_id?: string;
461
+ validation_fallback_reason?: string;
423
462
  validation_evidence_ref?: string;
424
463
  }
425
464
 
@@ -429,7 +468,7 @@ export interface EvolutionEvidenceEntry {
429
468
  skill_name: string;
430
469
  skill_path: string;
431
470
  target: EvolutionTarget;
432
- stage: "created" | "validated" | "deployed" | "rejected" | "rolled_back";
471
+ stage: "proposed" | "created" | "validated" | "deployed" | "rejected" | "rolled_back";
433
472
  rationale?: string;
434
473
  confidence?: number;
435
474
  details?: string;
@@ -677,7 +716,7 @@ export interface ContributionBundle {
677
716
  // ---------------------------------------------------------------------------
678
717
 
679
718
  /** Which part of a skill is being evolved. */
680
- export type EvolutionTarget = "description" | "routing" | "body";
719
+ export type EvolutionTarget = "description" | "routing" | "body" | "new_skill";
681
720
 
682
721
  /** Parsed sections of a SKILL.md file. */
683
722
  export interface SkillSections {
@@ -709,7 +748,7 @@ export type ValidationMode = "structural_guard" | "host_replay" | "llm_judge";
709
748
 
710
749
  export interface RoutingReplayFixture {
711
750
  fixture_id: string;
712
- platform: "claude_code" | "codex";
751
+ platform: "claude_code" | "codex" | "opencode";
713
752
  target_skill_name: string;
714
753
  target_skill_path: string;
715
754
  competing_skill_paths: string[];
@@ -735,6 +774,7 @@ export interface BodyValidationResult {
735
774
  validation_mode?: ValidationMode;
736
775
  validation_agent?: string;
737
776
  validation_fixture_id?: string;
777
+ validation_fallback_reason?: string;
738
778
  before_pass_rate?: number;
739
779
  after_pass_rate?: number;
740
780
  per_entry_results?: RoutingReplayEntryResult[];