selftune 0.2.23 → 0.2.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +93 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dhgv5BQO.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  12. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  13. package/cli/selftune/auto-update.ts +200 -8
  14. package/cli/selftune/canonical-export.ts +55 -25
  15. package/cli/selftune/command-surface.ts +397 -0
  16. package/cli/selftune/contribute/contribute.ts +64 -13
  17. package/cli/selftune/contribution-config.ts +57 -3
  18. package/cli/selftune/contribution-preferences.ts +117 -0
  19. package/cli/selftune/contribution-signals.ts +8 -4
  20. package/cli/selftune/contribution-staging.ts +13 -2
  21. package/cli/selftune/contributions.ts +55 -121
  22. package/cli/selftune/creator-contributions.ts +29 -10
  23. package/cli/selftune/cron/setup.ts +7 -3
  24. package/cli/selftune/dashboard-contract.ts +73 -0
  25. package/cli/selftune/dashboard-server.ts +168 -17
  26. package/cli/selftune/dashboard.ts +350 -17
  27. package/cli/selftune/eval/baseline.ts +21 -5
  28. package/cli/selftune/eval/execution-eval.ts +170 -0
  29. package/cli/selftune/eval/family-overlap.ts +2 -2
  30. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  31. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  32. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  33. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  34. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  35. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  36. package/cli/selftune/evolution/engines/replay-engine.ts +79 -57
  37. package/cli/selftune/evolution/evolve-body.ts +100 -39
  38. package/cli/selftune/evolution/evolve.ts +244 -52
  39. package/cli/selftune/evolution/rollback.ts +0 -1
  40. package/cli/selftune/evolution/validate-body.ts +68 -42
  41. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  42. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  43. package/cli/selftune/evolution/validate-routing.ts +43 -41
  44. package/cli/selftune/evolution/validation-contract.ts +91 -0
  45. package/cli/selftune/grading/auto-grade.ts +11 -7
  46. package/cli/selftune/grading/grade-session.ts +10 -16
  47. package/cli/selftune/index.ts +35 -10
  48. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  49. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  50. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  51. package/cli/selftune/ingestors/pi-ingest.ts +3 -2
  52. package/cli/selftune/init.ts +27 -3
  53. package/cli/selftune/localdb/direct-write.ts +35 -1
  54. package/cli/selftune/localdb/queries/cron.ts +34 -0
  55. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  56. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  57. package/cli/selftune/localdb/queries/execution.ts +133 -0
  58. package/cli/selftune/localdb/queries/json.ts +18 -0
  59. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  60. package/cli/selftune/localdb/queries/raw.ts +95 -0
  61. package/cli/selftune/localdb/queries/staging.ts +270 -0
  62. package/cli/selftune/localdb/queries/trust.ts +392 -0
  63. package/cli/selftune/localdb/queries.ts +60 -2288
  64. package/cli/selftune/localdb/schema.ts +21 -0
  65. package/cli/selftune/monitoring/watch.ts +96 -29
  66. package/cli/selftune/normalization.ts +3 -0
  67. package/cli/selftune/observability.ts +4 -2
  68. package/cli/selftune/orchestrate/cli.ts +161 -0
  69. package/cli/selftune/orchestrate/execute.ts +295 -0
  70. package/cli/selftune/orchestrate/finalize.ts +157 -0
  71. package/cli/selftune/orchestrate/locks.ts +40 -0
  72. package/cli/selftune/orchestrate/plan.ts +131 -0
  73. package/cli/selftune/orchestrate/post-run.ts +59 -0
  74. package/cli/selftune/orchestrate/prepare.ts +334 -0
  75. package/cli/selftune/orchestrate/report.ts +182 -0
  76. package/cli/selftune/orchestrate/runtime.ts +120 -0
  77. package/cli/selftune/orchestrate/signals.ts +48 -0
  78. package/cli/selftune/orchestrate.ts +150 -1173
  79. package/cli/selftune/repair/skill-usage.ts +5 -2
  80. package/cli/selftune/routes/overview.ts +5 -2
  81. package/cli/selftune/routes/skill-report.ts +15 -2
  82. package/cli/selftune/schedule.ts +5 -5
  83. package/cli/selftune/status.ts +39 -2
  84. package/cli/selftune/testing-readiness.ts +597 -0
  85. package/cli/selftune/types.ts +44 -4
  86. package/cli/selftune/uninstall.ts +2 -1
  87. package/cli/selftune/utils/canonical-log.ts +1 -9
  88. package/cli/selftune/utils/cli-error.ts +9 -0
  89. package/cli/selftune/utils/llm-call.ts +126 -6
  90. package/cli/selftune/utils/skill-discovery.ts +2 -0
  91. package/cli/selftune/workflows/proposals.ts +184 -0
  92. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  93. package/cli/selftune/workflows/workflows.ts +100 -26
  94. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  95. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  96. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  97. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  98. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +41 -1
  99. package/node_modules/@selftune/telemetry-contract/src/types.ts +103 -2
  100. package/package.json +25 -9
  101. package/packages/dashboard-core/AGENTS.md +18 -0
  102. package/packages/dashboard-core/README.md +30 -0
  103. package/packages/dashboard-core/index.ts +3 -0
  104. package/packages/dashboard-core/package.json +39 -0
  105. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  106. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  107. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  108. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  109. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  110. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  111. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  112. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  113. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  114. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  115. package/packages/dashboard-core/src/gates/index.ts +3 -0
  116. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  117. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  118. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  119. package/packages/dashboard-core/src/host/index.ts +3 -0
  120. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  121. package/packages/dashboard-core/src/models/index.ts +4 -0
  122. package/packages/dashboard-core/src/models/overview.ts +98 -0
  123. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  124. package/packages/dashboard-core/src/models/skills.ts +34 -0
  125. package/packages/dashboard-core/src/routes/index.ts +2 -0
  126. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  127. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  128. package/packages/dashboard-core/src/routes/types.ts +39 -0
  129. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  130. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  131. package/packages/dashboard-core/src/screens/index.ts +37 -0
  132. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  133. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  134. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  135. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  136. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  137. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  138. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  139. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  140. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  141. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  142. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  143. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  144. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  145. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  146. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  147. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  148. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  149. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  150. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  151. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  152. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  153. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  154. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  155. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  156. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  157. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  158. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +1 -1
  159. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  160. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +1 -1
  161. package/packages/telemetry-contract/src/schemas.ts +41 -1
  162. package/packages/telemetry-contract/src/types.ts +103 -2
  163. package/packages/ui/src/components/EvidenceViewer.tsx +80 -25
  164. package/packages/ui/src/components/OverviewPanels.tsx +67 -26
  165. package/packages/ui/src/primitives/tabs.tsx +7 -6
  166. package/packages/ui/src/types.ts +10 -0
  167. package/skill/SKILL.md +130 -332
  168. package/skill/agents/diagnosis-analyst.md +3 -3
  169. package/skill/agents/evolution-reviewer.md +3 -3
  170. package/skill/agents/integration-guide.md +3 -3
  171. package/skill/agents/pattern-analyst.md +2 -2
  172. package/skill/references/cli-quick-reference.md +89 -0
  173. package/skill/references/creator-playbook.md +131 -0
  174. package/skill/references/examples.md +48 -0
  175. package/skill/references/troubleshooting.md +47 -0
  176. package/skill/references/version-history.md +1 -1
  177. package/skill/selftune.contribute.json +11 -0
  178. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  179. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  180. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  181. package/skill/workflows/CreateTestDeploy.md +170 -0
  182. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  183. package/skill/{Workflows → workflows}/Cron.md +1 -1
  184. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  185. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  186. package/skill/{Workflows → workflows}/Evals.md +67 -2
  187. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  188. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  189. package/skill/{Workflows → workflows}/Grade.md +1 -1
  190. package/skill/{Workflows → workflows}/Initialize.md +8 -4
  191. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  192. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  193. package/skill/workflows/SignalsDashboard.md +87 -0
  194. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  195. package/skill/{Workflows → workflows}/Watch.md +42 -2
  196. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  197. package/apps/local-dashboard/dist/assets/index-CwOtTrUS.css +0 -1
  198. package/apps/local-dashboard/dist/assets/index-f1HQpbeH.js +0 -59
  199. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  200. package/apps/local-dashboard/dist/assets/vendor-ui-jVSaIZey.js +0 -12
  201. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  202. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  203. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  204. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  205. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  206. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  207. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  208. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  209. /package/skill/{Workflows → workflows}/Ingest.md +0 -0
  210. /package/skill/{Workflows → workflows}/PlatformHooks.md +0 -0
  211. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  212. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  213. /package/skill/{Workflows → workflows}/Registry.md +0 -0
  214. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  215. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  216. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  217. /package/skill/{Workflows → workflows}/Sync.md +0 -0
  218. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  219. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -23,6 +23,7 @@
23
23
  import { writeFileSync } from "node:fs";
24
24
  import { parseArgs } from "node:util";
25
25
 
26
+ import { PUBLIC_COMMAND_SURFACES, renderCommandHelp } from "../command-surface.js";
26
27
  import { GENERIC_NEGATIVES, QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js";
27
28
  import { getDb } from "../localdb/db.js";
28
29
  import {
@@ -32,27 +33,31 @@ import {
32
33
  } from "../localdb/queries.js";
33
34
  import type {
34
35
  EvalEntry,
35
- InvocationType,
36
+ EvalSourceStats,
36
37
  QueryLogRecord,
37
38
  SessionTelemetryRecord,
38
39
  SkillUsageRecord,
39
40
  } from "../types.js";
40
41
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
41
- import { detectAgent } from "../utils/llm-call.js";
42
+ import { detectLlmAgent } from "../utils/llm-call.js";
42
43
  import {
43
44
  filterActionableQueryRecords,
44
45
  filterActionableSkillUsageRecords,
45
46
  } from "../utils/query-filter.js";
46
47
  import { seededShuffle } from "../utils/seeded-random.js";
47
48
  import {
48
- escapeRegExp,
49
49
  findInstalledSkillNames,
50
50
  findInstalledSkillPath,
51
51
  findRepositoryClaudeSkillDirs,
52
52
  findRepositorySkillDirs,
53
53
  } from "../utils/skill-discovery.js";
54
54
  import { isHighConfidencePositiveSkillRecord } from "../utils/skill-usage-confidence.js";
55
+ import { readJsonl } from "../utils/jsonl.js";
56
+ import { classifyInvocation } from "./invocation-classifier.js";
55
57
  import { generateSyntheticEvals } from "./synthetic-evals.js";
58
+ import { writeCanonicalEvalSet } from "../testing-readiness.js";
59
+
60
+ export { classifyInvocation } from "./invocation-classifier.js";
56
61
 
57
62
  // ---------------------------------------------------------------------------
58
63
  // Query truncation
@@ -64,69 +69,6 @@ function truncateQuery(query: string): string {
64
69
  return query.length > MAX_QUERY_LENGTH ? query.slice(0, MAX_QUERY_LENGTH) : query;
65
70
  }
66
71
 
67
- // ---------------------------------------------------------------------------
68
- // Invocation taxonomy classifier
69
- // ---------------------------------------------------------------------------
70
-
71
- export function classifyInvocation(query: string, skillName: string): InvocationType {
72
- const qLower = query.toLowerCase();
73
- const skillLower = skillName.toLowerCase();
74
-
75
- // --- Explicit checks ---
76
-
77
- // Explicit: mentions skill name or $skill syntax
78
- if (
79
- qLower.includes(`$${skillLower}`) ||
80
- query.includes(`$${skillName}`) ||
81
- qLower.includes(skillLower)
82
- ) {
83
- return "explicit";
84
- }
85
-
86
- // Handle hyphenated skill names: check if all parts appear
87
- if (skillLower.includes("-")) {
88
- const parts = skillLower.split("-");
89
- if (parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query))) {
90
- return "explicit";
91
- }
92
- }
93
-
94
- // Convert skill-name to camelCase and check
95
- const camelCase = skillLower.replace(/-([a-z])/g, (_, c) => c.toUpperCase());
96
- if (camelCase !== skillLower && qLower.includes(camelCase.toLowerCase())) {
97
- return "explicit";
98
- }
99
-
100
- // --- Contextual checks ---
101
-
102
- const wordCount = query.split(/\s+/).length;
103
- const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query);
104
-
105
- // Temporal references suggest domain context
106
- const hasTemporalRef =
107
- /\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
108
- query,
109
- );
110
-
111
- // Filenames suggest contextual usage
112
- const hasFilename = /\b\w+\.\w{2,4}\b/.test(query);
113
-
114
- // Email addresses suggest contextual usage
115
- const hasEmail = /\b\S+@\S+\.\S+\b/.test(query);
116
-
117
- if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
118
- return "contextual";
119
- }
120
-
121
- // Borderline: 10-15 words with domain signals (multi-digit numbers, uppercase acronyms)
122
- const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query);
123
- if (wordCount >= 10 && hasDomainSignal) {
124
- return "contextual";
125
- }
126
-
127
- return "implicit";
128
- }
129
-
130
72
  // ---------------------------------------------------------------------------
131
73
  // Build eval set
132
74
  // ---------------------------------------------------------------------------
@@ -144,6 +86,7 @@ export function buildEvalSet(
144
86
  const actionableQueryRecords = filterActionableQueryRecords(queryRecords);
145
87
  const effectiveMaxPerSide = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
146
88
  const effectiveSeed = Number.isNaN(seed) ? 42 : seed;
89
+ const buildTimestamp = new Date().toISOString();
147
90
 
148
91
  // Build set of positive query texts (for exclusion from negatives)
149
92
  const positiveQueries = new Set<string>();
@@ -166,7 +109,12 @@ export function buildEvalSet(
166
109
  const q = (r.query ?? "").trim();
167
110
  if (!q || q === "(query not found)" || seen.has(q)) continue;
168
111
  seen.add(q);
169
- const entry: EvalEntry = { query: truncateQuery(q), should_trigger: true };
112
+ const entry: EvalEntry = {
113
+ query: truncateQuery(q),
114
+ should_trigger: true,
115
+ source: "log",
116
+ created_at: buildTimestamp,
117
+ };
170
118
  if (annotateTaxonomy) {
171
119
  entry.invocation_type = classifyInvocation(q, skillName);
172
120
  }
@@ -189,7 +137,12 @@ export function buildEvalSet(
189
137
 
190
138
  const shuffledNeg = seededShuffle(negCandidates, effectiveSeed).slice(0, effectiveMaxPerSide);
191
139
  negatives = shuffledNeg.map((q) => {
192
- const entry: EvalEntry = { query: truncateQuery(q), should_trigger: false };
140
+ const entry: EvalEntry = {
141
+ query: truncateQuery(q),
142
+ should_trigger: false,
143
+ source: "log",
144
+ created_at: buildTimestamp,
145
+ };
193
146
  if (annotateTaxonomy) {
194
147
  entry.invocation_type = "negative";
195
148
  }
@@ -202,7 +155,12 @@ export function buildEvalSet(
202
155
  const fallbacks: EvalEntry[] = [];
203
156
  for (const q of GENERIC_NEGATIVES) {
204
157
  if (negSeen.has(q) || positiveQueries.has(q)) continue;
205
- const entry: EvalEntry = { query: q, should_trigger: false };
158
+ const entry: EvalEntry = {
159
+ query: q,
160
+ should_trigger: false,
161
+ source: "log",
162
+ created_at: buildTimestamp,
163
+ };
206
164
  if (annotateTaxonomy) {
207
165
  entry.invocation_type = "negative";
208
166
  }
@@ -215,6 +173,116 @@ export function buildEvalSet(
215
173
  return [...shuffledPositives, ...negatives];
216
174
  }
217
175
 
176
+ // ---------------------------------------------------------------------------
177
+ // Normalized Levenshtein distance
178
+ // ---------------------------------------------------------------------------
179
+
180
/**
 * Compute the Levenshtein (edit) distance between two strings.
 *
 * The distance is the minimum number of single-position insertions, deletions
 * and substitutions needed to turn one string into the other. Positions are
 * UTF-16 code units (plain string indexing), so a character outside the BMP
 * counts as two positions.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns The edit distance; 0 when the strings are identical, and the length
 *   of the other string when one of them is empty.
 */
function levenshteinDistance(a: string, b: string): number {
  if (a === b) return 0;

  // The distance is symmetric, so walk the rows over the longer string and
  // size the two working rows by the shorter one. This is what actually keeps
  // memory at O(min(la, lb)).
  const [longer, shorter] = a.length >= b.length ? [a, b] : [b, a];
  const rowCount = longer.length;
  const colCount = shorter.length;
  if (colCount === 0) return rowCount;

  // prev holds the distances for the previous prefix of `longer`; row 0 is the
  // cost of building each prefix of `shorter` from the empty string.
  let prev: number[] = Array.from({ length: colCount + 1 }, (_, j) => j);
  let curr: number[] = new Array<number>(colCount + 1).fill(0);

  for (let i = 1; i <= rowCount; i++) {
    curr[0] = i;
    for (let j = 1; j <= colCount; j++) {
      const cost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
      curr[j] = Math.min(
        prev[j] + 1, // deletion
        curr[j - 1] + 1, // insertion
        prev[j - 1] + cost, // substitution (free when the units match)
      );
    }
    // Reuse the two buffers instead of allocating a new row per iteration.
    [prev, curr] = [curr, prev];
  }

  // After the final swap the last computed row lives in `prev`.
  return prev[colCount];
}
203
+
204
/**
 * Edit distance between `a` and `b`, scaled by the length of the longer input.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns `levenshteinDistance(a, b)` divided by the longer length; 0 when
 *   both strings are empty (avoids dividing by zero).
 */
function normalizedLevenshtein(a: string, b: string): number {
  const longest = a.length >= b.length ? a.length : b.length;
  return longest === 0 ? 0 : levenshteinDistance(a, b) / longest;
}
209
+
210
+ // ---------------------------------------------------------------------------
211
+ // Blend eval sets (log + synthetic)
212
+ // ---------------------------------------------------------------------------
213
+
214
/**
 * Blend log-based and synthetic eval entries.
 *
 * Policy:
 * - Keep ALL log-based entries, unchanged and in their original order.
 * - Drop a synthetic entry when its query is a near-duplicate of any log
 *   query: normalizedLevenshtein(synthetic, log) < 0.3, compared after
 *   lowercasing and trimming both queries.
 * - Append surviving synthetic entries in input order, re-tagged as
 *   source: "blended".
 * - Cap the total at 2x the log-based count.
 *
 * Note: no gap analysis happens here. Survivors are taken first-come until the
 * cap is reached, so any prioritisation (boundary cases, underrepresented
 * invocation types) must be reflected in the order of `syntheticEntries`.
 *
 * @param logEntries - Entries built from real logs; always retained.
 * @param syntheticEntries - Candidate entries to fill the remaining slots.
 * @returns A new array (inputs are not mutated). Empty when `logEntries` is
 *   empty, because the cap is then 0.
 */
export function blendEvalSets(logEntries: EvalEntry[], syntheticEntries: EvalEntry[]): EvalEntry[] {
  const nearDuplicateThreshold = 0.3;
  const logCount = logEntries.length;

  // With no log entries the cap (2 * 0) leaves room for nothing.
  if (logCount === 0) return [];

  const result: EvalEntry[] = [...logEntries];
  if (syntheticEntries.length === 0) return result;

  const cap = logCount * 2;

  // Normalize log queries once; every synthetic candidate is compared to all of them.
  const logQueries = logEntries.map((e) => e.query.toLowerCase().trim());

  for (const synth of syntheticEntries) {
    // Anything accepted past the cap would be discarded, so stop before doing
    // more quadratic-time comparisons.
    if (result.length >= cap) break;

    const synthNorm = synth.query.toLowerCase().trim();
    const tooSimilar = logQueries.some((logQ) => {
      // Length pre-filter: the edit distance is at least the length
      // difference, so once the relative difference reaches the threshold the
      // normalized distance cannot fall below it.
      const maxLen = Math.max(synthNorm.length, logQ.length);
      if (maxLen > 0 && Math.abs(synthNorm.length - logQ.length) / maxLen >= nearDuplicateThreshold) {
        return false;
      }
      return normalizedLevenshtein(synthNorm, logQ) < nearDuplicateThreshold;
    });

    if (!tooSimilar) {
      result.push({ ...synth, source: "blended" });
    }
  }

  return result;
}
261
+
262
+ // ---------------------------------------------------------------------------
263
+ // Eval source stats
264
+ // ---------------------------------------------------------------------------
265
+
266
/**
 * Summarize an eval set by entry provenance and creation-time range.
 *
 * Counts entries per `source` ("synthetic", "log", "blended"); entries with
 * any other or missing source are included in `total` only. `oldest` and
 * `newest` are the lexicographically smallest and largest non-empty
 * `created_at` strings, and are left unset when no entry has one.
 *
 * NOTE(review): lexicographic order equals chronological order only when all
 * timestamps share one format (e.g. `Date.prototype.toISOString()` output) —
 * confirm for entries produced outside this module.
 *
 * @param entries - Eval entries to summarize.
 * @returns Per-source counts plus the optional oldest/newest timestamps.
 */
export function computeEvalSourceStats(entries: EvalEntry[]): EvalSourceStats {
  const stats: EvalSourceStats = { total: entries.length, synthetic: 0, log: 0, blended: 0 };
  let oldest: string | undefined;
  let newest: string | undefined;

  for (const entry of entries) {
    switch (entry.source) {
      case "synthetic":
        stats.synthetic++;
        break;
      case "log":
        stats.log++;
        break;
      case "blended":
        stats.blended++;
        break;
      default:
        // Unknown or missing source: counted in `total` only.
        break;
    }

    const createdAt = entry.created_at;
    if (!createdAt) continue;

    // Track min/max in one pass instead of collecting and sorting every
    // timestamp; string `<`/`>` uses the same code-unit order as a default sort.
    if (oldest === undefined || createdAt < oldest) oldest = createdAt;
    if (newest === undefined || createdAt > newest) newest = createdAt;
  }

  // Only attach the range when at least one timestamp was seen, so the keys
  // stay absent otherwise.
  if (oldest !== undefined && newest !== undefined) {
    stats.oldest = oldest;
    stats.newest = newest;
  }

  return stats;
}
285
+
218
286
  // ---------------------------------------------------------------------------
219
287
  // Installed skill discovery / readiness
220
288
  // ---------------------------------------------------------------------------
@@ -505,33 +573,40 @@ export async function cliMain(): Promise<void> {
505
573
  "telemetry-log": { type: "string", default: TELEMETRY_LOG },
506
574
  synthetic: { type: "boolean", default: false },
507
575
  "auto-synthetic": { type: "boolean", default: false },
576
+ blend: { type: "boolean", default: false },
508
577
  "skill-path": { type: "string" },
509
578
  model: { type: "string" },
579
+ help: { type: "boolean", default: false },
510
580
  },
511
581
  strict: true,
512
582
  });
513
583
 
584
+ if (values.help) {
585
+ console.log(renderCommandHelp(PUBLIC_COMMAND_SURFACES.evalGenerate));
586
+ process.exit(0);
587
+ }
588
+
514
589
  // --- Synthetic mode: generate evals from SKILL.md via LLM ---
515
590
  if (values.synthetic) {
516
591
  if (!values.skill) {
517
592
  throw new CLIError(
518
593
  "--skill required with --synthetic",
519
594
  "MISSING_FLAG",
520
- "selftune evals --synthetic --skill <name> --skill-path <path>",
595
+ "selftune eval generate --synthetic --skill <name> --skill-path <path>",
521
596
  );
522
597
  }
523
598
  if (!values["skill-path"]) {
524
599
  throw new CLIError(
525
600
  "--skill-path required with --synthetic",
526
601
  "MISSING_FLAG",
527
- "selftune evals --synthetic --skill <name> --skill-path <path>",
602
+ "selftune eval generate --synthetic --skill <name> --skill-path <path>",
528
603
  );
529
604
  }
530
605
 
531
- const agent = detectAgent();
606
+ const agent = detectLlmAgent();
532
607
  if (!agent) {
533
608
  throw new CLIError(
534
- "No agent CLI found (claude/codex/opencode)",
609
+ "No agent CLI found (claude/codex/opencode/pi)",
535
610
  "AGENT_NOT_FOUND",
536
611
  "Install one of the supported agent CLIs",
537
612
  );
@@ -549,11 +624,13 @@ export async function cliMain(): Promise<void> {
549
624
 
550
625
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
551
626
  writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
627
+ const canonicalPath = writeCanonicalEvalSet(values.skill, evalSet);
552
628
 
553
629
  const pos = evalSet.filter((e) => e.should_trigger);
554
630
  const neg = evalSet.filter((e) => !e.should_trigger);
555
631
 
556
632
  console.log(`Wrote ${evalSet.length} synthetic eval entries to ${outputPath}`);
633
+ console.log(`Canonical eval copy: ${canonicalPath}`);
557
634
  console.log(` Positives (should_trigger=true) : ${pos.length}`);
558
635
  console.log(` Negatives (should_trigger=false): ${neg.length}`);
559
636
 
@@ -582,10 +659,23 @@ export async function cliMain(): Promise<void> {
582
659
  let queryRecords: QueryLogRecord[];
583
660
  let telemetryRecords: SessionTelemetryRecord[];
584
661
 
585
- const db = getDb();
586
- skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
587
- queryRecords = queryQueryLog(db) as QueryLogRecord[];
588
- telemetryRecords = querySessionTelemetry(db) as SessionTelemetryRecord[];
662
+ const skillLogPath = values["skill-log"] ?? SKILL_LOG;
663
+ const queryLogPath = values["query-log"] ?? QUERY_LOG;
664
+ const telemetryLogPath = values["telemetry-log"] ?? TELEMETRY_LOG;
665
+ const hasCustomSkillLog = skillLogPath !== SKILL_LOG;
666
+ const hasCustomQueryLog = queryLogPath !== QUERY_LOG;
667
+ const hasCustomTelemetryLog = telemetryLogPath !== TELEMETRY_LOG;
668
+
669
+ const db = hasCustomSkillLog && hasCustomQueryLog && hasCustomTelemetryLog ? undefined : getDb();
670
+ skillRecords = hasCustomSkillLog
671
+ ? readJsonl<SkillUsageRecord>(skillLogPath)
672
+ : (querySkillUsageRecords(db!) as SkillUsageRecord[]);
673
+ queryRecords = hasCustomQueryLog
674
+ ? readJsonl<QueryLogRecord>(queryLogPath)
675
+ : (queryQueryLog(db!) as QueryLogRecord[]);
676
+ telemetryRecords = hasCustomTelemetryLog
677
+ ? readJsonl<SessionTelemetryRecord>(telemetryLogPath)
678
+ : (querySessionTelemetry(db!) as SessionTelemetryRecord[]);
589
679
 
590
680
  if (values["list-skills"]) {
591
681
  listSkills(skillRecords, queryRecords, telemetryRecords);
@@ -596,7 +686,7 @@ export async function cliMain(): Promise<void> {
596
686
  throw new CLIError(
597
687
  "--skill required (or use --list-skills)",
598
688
  "MISSING_FLAG",
599
- "selftune evals --skill <name> or selftune evals --list-skills",
689
+ "selftune eval generate --skill <name> or selftune eval generate --list-skills",
600
690
  );
601
691
  }
602
692
 
@@ -632,10 +722,10 @@ export async function cliMain(): Promise<void> {
632
722
  );
633
723
  }
634
724
 
635
- const agent = detectAgent();
725
+ const agent = detectLlmAgent();
636
726
  if (!agent) {
637
727
  throw new CLIError(
638
- "No agent CLI found (claude/codex/opencode)",
728
+ "No agent CLI found (claude/codex/opencode/pi)",
639
729
  "AGENT_NOT_FOUND",
640
730
  "Install one of the supported agent CLIs",
641
731
  );
@@ -652,10 +742,12 @@ export async function cliMain(): Promise<void> {
652
742
  });
653
743
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
654
744
  writeFileSync(outputPath, JSON.stringify(syntheticEvalSet, null, 2), "utf-8");
745
+ const canonicalPath = writeCanonicalEvalSet(values.skill, syntheticEvalSet);
655
746
  const pos = syntheticEvalSet.filter((e) => e.should_trigger);
656
747
  const neg = syntheticEvalSet.filter((e) => !e.should_trigger);
657
748
 
658
749
  console.log(`Wrote ${syntheticEvalSet.length} synthetic eval entries to ${outputPath}`);
750
+ console.log(`Canonical eval copy: ${canonicalPath}`);
659
751
  console.log(` Positives (should_trigger=true) : ${pos.length}`);
660
752
  console.log(` Negatives (should_trigger=false): ${neg.length}`);
661
753
  console.log("\nNext steps:");
@@ -666,9 +758,63 @@ export async function cliMain(): Promise<void> {
666
758
  return;
667
759
  }
668
760
 
761
+ // --- Blend mode: merge log-based evals with synthetic gap-fillers ---
762
+ let finalEvalSet = evalSet;
763
+ if (values.blend) {
764
+ const skillPath = values["skill-path"] ?? detectedSkillPath;
765
+ if (!skillPath) {
766
+ throw new CLIError(
767
+ `--blend requires a resolvable SKILL.md path. Use --skill-path or install the skill locally.`,
768
+ "MISSING_FLAG",
769
+ `selftune eval generate --skill ${values.skill} --blend --skill-path /path/to/SKILL.md`,
770
+ );
771
+ }
772
+
773
+ const agent = detectLlmAgent();
774
+ if (!agent) {
775
+ throw new CLIError(
776
+ "No agent CLI found (claude/codex/opencode/pi)",
777
+ "AGENT_NOT_FOUND",
778
+ "Install one of the supported agent CLIs",
779
+ );
780
+ }
781
+
782
+ // Fail fast before expensive LLM calls — blending with zero logs always produces []
783
+ if (evalSet.length === 0) {
784
+ throw new CLIError(
785
+ `--blend requires log-based eval entries to blend with synthetic entries. No log data found for skill "${values.skill}".`,
786
+ "BLEND_NO_LOGS",
787
+ `Use --synthetic instead for cold-start skills, or run selftune sync first to ingest session data.`,
788
+ );
789
+ }
790
+
791
+ const effectiveMax = Number.isNaN(maxPerSide) || maxPerSide <= 0 ? 50 : maxPerSide;
792
+ console.log(`Generating synthetic evals for blending with '${values.skill}'...`);
793
+ const syntheticEvalSet = await generateSyntheticEvals(skillPath, values.skill, agent, {
794
+ maxPositives: effectiveMax,
795
+ maxNegatives: effectiveMax,
796
+ modelFlag: values.model,
797
+ });
798
+
799
+ finalEvalSet = blendEvalSets(evalSet, syntheticEvalSet);
800
+ const stats = computeEvalSourceStats(finalEvalSet);
801
+ console.log(
802
+ `Blended: ${stats.log} log + ${stats.blended} synthetic gap-fillers = ${stats.total} total`,
803
+ );
804
+ }
805
+
669
806
  const outputPath = values.output ?? values.out ?? `${values.skill}_trigger_eval.json`;
670
- writeFileSync(outputPath, JSON.stringify(evalSet, null, 2), "utf-8");
671
- printEvalStats(evalSet, values.skill, outputPath, skillRecords, queryRecords, annotateTaxonomy);
807
+ writeFileSync(outputPath, JSON.stringify(finalEvalSet, null, 2), "utf-8");
808
+ const canonicalPath = writeCanonicalEvalSet(values.skill, finalEvalSet);
809
+ printEvalStats(
810
+ finalEvalSet,
811
+ values.skill,
812
+ outputPath,
813
+ skillRecords,
814
+ queryRecords,
815
+ annotateTaxonomy,
816
+ );
817
+ console.log(`Canonical eval copy: ${canonicalPath}`);
672
818
  if (positiveCount === 0 && detectedSkillPath) {
673
819
  printSyntheticFallbackHint(values.skill, detectedSkillPath);
674
820
  }
@@ -10,7 +10,7 @@
10
10
  * <dir>/tasks/<task-id>/task.toml — metadata (difficulty, category, tags, etc.)
11
11
  */
12
12
 
13
- import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
13
+ import { existsSync, readdirSync, readFileSync, type Dirent, writeFileSync } from "node:fs";
14
14
  import { join } from "node:path";
15
15
  import { parseArgs } from "node:util";
16
16
 
@@ -72,7 +72,7 @@ export function parseSkillsBenchDir(dirPath: string): SkillsBenchTask[] {
72
72
 
73
73
  const tasks: SkillsBenchTask[] = [];
74
74
 
75
- let entries: ReturnType<typeof readdirSync>;
75
+ let entries: Dirent[];
76
76
  try {
77
77
  entries = readdirSync(tasksDir, { withFileTypes: true });
78
78
  } catch {
@@ -0,0 +1,56 @@
1
+ import type { InvocationType } from "../types.js";
2
+ import { escapeRegExp } from "../utils/skill-discovery.js";
3
+
4
/**
 * Classify how directly a user query invokes a skill.
 *
 * Kept separate from eval generation so synthetic evals can reuse the
 * classifier without creating an import cycle with hooks-to-evals.
 *
 * Resolution order:
 *  1. "explicit"   - the query names the skill: a literal mention (which also
 *                    covers `$skill` syntax), every hyphen-separated part of
 *                    the name appearing as a whole word, or the name with its
 *                    hyphens collapsed.
 *  2. "contextual" - the query carries concrete context: more than 15 words,
 *                    a capitalised word, a temporal reference, a filename, an
 *                    email address, or at least 10 words plus a numeric or
 *                    acronym signal.
 *  3. "implicit"   - everything else.
 *
 * @param query - Raw user query text.
 * @param skillName - Skill identifier, e.g. `pptx` or `code-review`.
 * @returns The invocation category for the query.
 */
export function classifyInvocation(query: string, skillName: string): InvocationType {
  const qLower = query.toLowerCase();
  const skillLower = skillName.toLowerCase();

  // An empty/blank skill name would make `includes("")` true for every query,
  // so explicit detection only runs when there is an actual name to look for.
  if (skillLower.trim() !== "") {
    // Explicit: mentions skill name or $skill syntax. A `$name` mention always
    // contains `name`, so the plain substring checks cover both spellings.
    if (query.includes(skillName) || qLower.includes(skillLower)) {
      return "explicit";
    }

    // Handle hyphenated skill names: check if all parts appear as whole words.
    // Empty parts (leading/trailing/double hyphens) are dropped because `\b\b`
    // would match at any word boundary and make the check vacuous.
    if (skillLower.includes("-")) {
      const parts = skillLower.split("-").filter((part) => part.length > 0);
      const allPartsPresent =
        parts.length > 0 &&
        parts.every((part) => new RegExp(`\\b${escapeRegExp(part)}\\b`, "i").test(query));
      if (allPartsPresent) {
        return "explicit";
      }
    }

    // camelCase / run-together form: the comparison is case-insensitive, so it
    // is enough to drop each hyphen that precedes a letter ("code-review" ->
    // "codereview") and look for that in the lower-cased query.
    const collapsed = skillLower.replace(/-(?=[a-z])/g, "");
    if (collapsed !== skillLower && qLower.includes(collapsed)) {
      return "explicit";
    }
  }

  // Count real words only: splitting an empty or whitespace-padded string on
  // /\s+/ yields empty tokens that would otherwise inflate the count.
  const trimmedQuery = query.trim();
  const wordCount = trimmedQuery === "" ? 0 : trimmedQuery.split(/\s+/).length;

  // NOTE(review): this also matches an ordinary capitalised sentence-initial
  // word (e.g. "Create"), not only true proper nouns - confirm that is intended.
  const hasProperNoun = /\b[A-Z][a-z]{2,}\b/.test(query);
  // NOTE(review): case-insensitive, so the verbs "may" and "march" also count
  // as temporal references - confirm that is acceptable.
  const hasTemporalRef =
    /\b(next week|last week|tomorrow|yesterday|Q[1-4]|monday|tuesday|wednesday|thursday|friday|saturday|sunday|january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(
      query,
    );
  const hasFilename = /\b\w+\.\w{2,4}\b/.test(query);
  const hasEmail = /\b\S+@\S+\.\S+\b/.test(query);

  if (wordCount > 15 || hasProperNoun || hasTemporalRef || hasFilename || hasEmail) {
    return "contextual";
  }

  // Medium-length queries count as contextual only when they also carry a
  // domain signal: a number with 2+ digits or a run of 2+ capital letters.
  const hasDomainSignal = /\b\d{2,}\b/.test(query) || /[A-Z]{2,}/.test(query);
  if (wordCount >= 10 && hasDomainSignal) {
    return "contextual";
  }

  return "implicit";
}
@@ -8,10 +8,10 @@
8
8
 
9
9
  import { readFileSync } from "node:fs";
10
10
 
11
- import type { EvalEntry, InvocationType } from "../types.js";
11
+ import type { EvalEntry, InvocationType, SkillUsageRecord } from "../types.js";
12
12
  import { callLlm, stripMarkdownFences } from "../utils/llm-call.js";
13
13
  import { findInstalledSkillNames } from "../utils/skill-discovery.js";
14
- import { classifyInvocation } from "./hooks-to-evals.js";
14
+ import { classifyInvocation } from "./invocation-classifier.js";
15
15
 
16
16
  // ---------------------------------------------------------------------------
17
17
  // Types
@@ -414,6 +414,8 @@ export function parseSyntheticResponse(raw: string, skillName: string): EvalEntr
414
414
  query,
415
415
  should_trigger: entry.should_trigger,
416
416
  invocation_type: invocationType,
417
+ source: "synthetic",
418
+ created_at: new Date().toISOString(),
417
419
  });
418
420
  }
419
421
 
@@ -449,7 +451,7 @@ export async function generateSyntheticEvals(
449
451
  const db = getDb();
450
452
 
451
453
  // Positives: high-confidence triggered records for this skill
452
- const skillRecords = querySkillUsageRecords(db);
454
+ const skillRecords = querySkillUsageRecords(db) as SkillUsageRecord[];
453
455
  const positive = skillRecords
454
456
  .filter((r) => isHighConfidencePositiveSkillRecord(r, skillName))
455
457
  .map((r) => r.query)
@@ -19,8 +19,9 @@ import { parseArgs } from "node:util";
19
19
 
20
20
  import { SELFTUNE_CONFIG_DIR } from "../constants.js";
21
21
  import type { EvalEntry } from "../types.js";
22
+ import { writeUnitTestRunResult } from "../testing-readiness.js";
22
23
  import { CLIError } from "../utils/cli-error.js";
23
- import { callLlm, detectAgent } from "../utils/llm-call.js";
24
+ import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
24
25
  import { generateUnitTests } from "./generate-unit-tests.js";
25
26
  import type { AgentRunner } from "./unit-test.js";
26
27
  import { loadUnitTests, runUnitTestSuite } from "./unit-test.js";
@@ -58,10 +59,10 @@ export async function cliMain(): Promise<void> {
58
59
 
59
60
  // --generate: create tests from skill content
60
61
  if (values.generate) {
61
- const agent = detectAgent();
62
+ const agent = detectLlmAgent();
62
63
  if (!agent) {
63
64
  throw new CLIError(
64
- "No agent CLI found (claude/codex/opencode). Cannot generate tests",
65
+ "No agent CLI found (claude/codex/opencode/pi). Cannot generate tests",
65
66
  "AGENT_NOT_FOUND",
66
67
  "Install one of the supported agent CLIs",
67
68
  );
@@ -118,7 +119,7 @@ export async function cliMain(): Promise<void> {
118
119
  let agentRunner: AgentRunner;
119
120
 
120
121
  if (values["run-agent"]) {
121
- const agent = detectAgent();
122
+ const agent = detectLlmAgent();
122
123
  if (!agent) {
123
124
  throw new CLIError(
124
125
  "No agent CLI found. Cannot run agent-based tests",
@@ -137,11 +138,13 @@ export async function cliMain(): Promise<void> {
137
138
  }
138
139
 
139
140
  const suite = await runUnitTestSuite(tests, skillName, agentRunner);
141
+ const resultPath = writeUnitTestRunResult(skillName, suite);
140
142
 
141
143
  // Print results
142
144
  console.log(`\nResults for '${suite.skill_name}':`);
143
145
  console.log(` Total: ${suite.total} Passed: ${suite.passed} Failed: ${suite.failed}`);
144
146
  console.log(` Pass rate: ${(suite.pass_rate * 100).toFixed(1)}%`);
147
+ console.log(` Stored: ${resultPath}`);
145
148
 
146
149
  if (suite.failed > 0) {
147
150
  console.log("\nFailed tests:");