selftune 0.2.22 → 0.2.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +95 -15
  3. package/apps/local-dashboard/dist/assets/index-DgY2KGP-.css +1 -0
  4. package/apps/local-dashboard/dist/assets/index-Dmx7LPVX.js +15 -0
  5. package/apps/local-dashboard/dist/assets/vendor-react-C5oyHiV1.js +11 -0
  6. package/apps/local-dashboard/dist/assets/{vendor-table-BIiI3YhS.js → vendor-table-Bc_bbKd8.js} +1 -1
  7. package/apps/local-dashboard/dist/assets/vendor-ui-B3BPIYy7.js +1 -0
  8. package/apps/local-dashboard/dist/index.html +5 -5
  9. package/cli/selftune/adapters/codex/install.ts +310 -78
  10. package/cli/selftune/adapters/opencode/install.ts +3 -4
  11. package/cli/selftune/adapters/pi/hook.ts +273 -0
  12. package/cli/selftune/adapters/pi/install.ts +207 -0
  13. package/cli/selftune/alpha-upload/build-payloads.ts +3 -3
  14. package/cli/selftune/alpha-upload/stage-canonical.ts +17 -11
  15. package/cli/selftune/auto-update.ts +200 -8
  16. package/cli/selftune/canonical-export.ts +55 -25
  17. package/cli/selftune/command-surface.ts +397 -0
  18. package/cli/selftune/constants.ts +10 -1
  19. package/cli/selftune/contribute/contribute.ts +64 -13
  20. package/cli/selftune/contribution-config.ts +57 -3
  21. package/cli/selftune/contribution-preferences.ts +117 -0
  22. package/cli/selftune/contribution-signals.ts +8 -4
  23. package/cli/selftune/contribution-staging.ts +13 -2
  24. package/cli/selftune/contributions.ts +55 -121
  25. package/cli/selftune/creator-contributions.ts +29 -10
  26. package/cli/selftune/cron/setup.ts +7 -3
  27. package/cli/selftune/dashboard-contract.ts +87 -0
  28. package/cli/selftune/dashboard-server.ts +168 -17
  29. package/cli/selftune/dashboard.ts +350 -17
  30. package/cli/selftune/eval/baseline.ts +21 -5
  31. package/cli/selftune/eval/execution-eval.ts +170 -0
  32. package/cli/selftune/eval/family-overlap.ts +2 -2
  33. package/cli/selftune/eval/hooks-to-evals.ts +228 -82
  34. package/cli/selftune/eval/import-skillsbench.ts +2 -2
  35. package/cli/selftune/eval/invocation-classifier.ts +56 -0
  36. package/cli/selftune/eval/synthetic-evals.ts +5 -3
  37. package/cli/selftune/eval/unit-test-cli.ts +7 -4
  38. package/cli/selftune/evolution/apply-proposal.ts +295 -0
  39. package/cli/selftune/evolution/engines/judge-engine.ts +96 -0
  40. package/cli/selftune/evolution/engines/replay-engine.ts +180 -0
  41. package/cli/selftune/evolution/evidence.ts +2 -6
  42. package/cli/selftune/evolution/evolve-body.ts +152 -38
  43. package/cli/selftune/evolution/evolve.ts +244 -52
  44. package/cli/selftune/evolution/rollback.ts +0 -1
  45. package/cli/selftune/evolution/validate-body.ts +111 -49
  46. package/cli/selftune/evolution/validate-host-replay.ts +510 -60
  47. package/cli/selftune/evolution/validate-proposal.ts +11 -150
  48. package/cli/selftune/evolution/validate-routing.ts +51 -108
  49. package/cli/selftune/evolution/validation-contract.ts +91 -0
  50. package/cli/selftune/grading/auto-grade.ts +11 -7
  51. package/cli/selftune/grading/grade-session.ts +10 -16
  52. package/cli/selftune/hooks/skill-eval.ts +2 -1
  53. package/cli/selftune/hooks-shared/types.ts +1 -0
  54. package/cli/selftune/index.ts +58 -15
  55. package/cli/selftune/ingestors/claude-replay.ts +15 -10
  56. package/cli/selftune/ingestors/codex-wrapper.ts +3 -3
  57. package/cli/selftune/ingestors/opencode-ingest.ts +2 -2
  58. package/cli/selftune/ingestors/pi-ingest.ts +727 -0
  59. package/cli/selftune/init.ts +38 -4
  60. package/cli/selftune/localdb/direct-write.ts +120 -1
  61. package/cli/selftune/localdb/materialize.ts +6 -7
  62. package/cli/selftune/localdb/queries/cron.ts +34 -0
  63. package/cli/selftune/localdb/queries/dashboard.ts +834 -0
  64. package/cli/selftune/localdb/queries/evolution.ts +158 -0
  65. package/cli/selftune/localdb/queries/execution.ts +133 -0
  66. package/cli/selftune/localdb/queries/json.ts +18 -0
  67. package/cli/selftune/localdb/queries/monitoring.ts +263 -0
  68. package/cli/selftune/localdb/queries/raw.ts +95 -0
  69. package/cli/selftune/localdb/queries/staging.ts +270 -0
  70. package/cli/selftune/localdb/queries/trust.ts +392 -0
  71. package/cli/selftune/localdb/queries.ts +60 -2162
  72. package/cli/selftune/localdb/schema.ts +59 -0
  73. package/cli/selftune/monitoring/watch.ts +96 -29
  74. package/cli/selftune/normalization.ts +3 -0
  75. package/cli/selftune/observability.ts +12 -3
  76. package/cli/selftune/orchestrate/cli.ts +161 -0
  77. package/cli/selftune/orchestrate/execute.ts +295 -0
  78. package/cli/selftune/orchestrate/finalize.ts +157 -0
  79. package/cli/selftune/orchestrate/locks.ts +40 -0
  80. package/cli/selftune/orchestrate/plan.ts +131 -0
  81. package/cli/selftune/orchestrate/post-run.ts +59 -0
  82. package/cli/selftune/orchestrate/prepare.ts +334 -0
  83. package/cli/selftune/orchestrate/report.ts +182 -0
  84. package/cli/selftune/orchestrate/runtime.ts +120 -0
  85. package/cli/selftune/orchestrate/signals.ts +48 -0
  86. package/cli/selftune/orchestrate.ts +162 -1142
  87. package/cli/selftune/registry/client.ts +74 -0
  88. package/cli/selftune/registry/history.ts +54 -0
  89. package/cli/selftune/registry/index.ts +90 -0
  90. package/cli/selftune/registry/install.ts +141 -0
  91. package/cli/selftune/registry/list.ts +44 -0
  92. package/cli/selftune/registry/push.ts +171 -0
  93. package/cli/selftune/registry/rollback.ts +49 -0
  94. package/cli/selftune/registry/status.ts +62 -0
  95. package/cli/selftune/registry/sync.ts +125 -0
  96. package/cli/selftune/repair/skill-usage.ts +9 -3
  97. package/cli/selftune/routes/overview.ts +5 -2
  98. package/cli/selftune/routes/skill-report.ts +15 -2
  99. package/cli/selftune/schedule.ts +5 -5
  100. package/cli/selftune/status.ts +70 -2
  101. package/cli/selftune/sync.ts +127 -23
  102. package/cli/selftune/testing-readiness.ts +597 -0
  103. package/cli/selftune/types.ts +46 -5
  104. package/cli/selftune/uninstall.ts +2 -1
  105. package/cli/selftune/utils/canonical-log.ts +1 -9
  106. package/cli/selftune/utils/cli-error.ts +9 -0
  107. package/cli/selftune/utils/jsonl.ts +1 -30
  108. package/cli/selftune/utils/llm-call.ts +126 -6
  109. package/cli/selftune/utils/skill-discovery.ts +24 -0
  110. package/cli/selftune/workflows/proposals.ts +184 -0
  111. package/cli/selftune/workflows/skill-scaffold.ts +241 -0
  112. package/cli/selftune/workflows/workflows.ts +100 -26
  113. package/node_modules/@selftune/telemetry-contract/fixtures/complete-push.ts +1 -1
  114. package/node_modules/@selftune/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  115. package/node_modules/@selftune/telemetry-contract/fixtures/golden.test.ts +0 -1
  116. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  117. package/node_modules/@selftune/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  118. package/node_modules/@selftune/telemetry-contract/package.json +1 -1
  119. package/node_modules/@selftune/telemetry-contract/src/index.ts +1 -0
  120. package/node_modules/@selftune/telemetry-contract/src/schemas.ts +63 -5
  121. package/node_modules/@selftune/telemetry-contract/src/types.ts +97 -7
  122. package/node_modules/@selftune/telemetry-contract/tests/compatibility.test.ts +0 -1
  123. package/package.json +25 -9
  124. package/packages/dashboard-core/AGENTS.md +18 -0
  125. package/packages/dashboard-core/README.md +30 -0
  126. package/packages/dashboard-core/index.ts +3 -0
  127. package/packages/dashboard-core/package.json +39 -0
  128. package/packages/dashboard-core/src/chrome/DashboardChrome.tsx +74 -0
  129. package/packages/dashboard-core/src/chrome/DashboardHeader.tsx +200 -0
  130. package/packages/dashboard-core/src/chrome/DashboardSidebar.tsx +219 -0
  131. package/packages/dashboard-core/src/chrome/RuntimeBadge.tsx +46 -0
  132. package/packages/dashboard-core/src/chrome/index.ts +14 -0
  133. package/packages/dashboard-core/src/chrome/types.ts +81 -0
  134. package/packages/dashboard-core/src/chrome/utils.ts +23 -0
  135. package/packages/dashboard-core/src/gates/FeatureGate.tsx +11 -0
  136. package/packages/dashboard-core/src/gates/LockedRoute.tsx +29 -0
  137. package/packages/dashboard-core/src/gates/UpgradeCard.tsx +89 -0
  138. package/packages/dashboard-core/src/gates/index.ts +3 -0
  139. package/packages/dashboard-core/src/host/DashboardHostProvider.tsx +62 -0
  140. package/packages/dashboard-core/src/host/adapter.ts +47 -0
  141. package/packages/dashboard-core/src/host/capabilities.ts +55 -0
  142. package/packages/dashboard-core/src/host/index.ts +3 -0
  143. package/packages/dashboard-core/src/models/analytics.ts +39 -0
  144. package/packages/dashboard-core/src/models/index.ts +4 -0
  145. package/packages/dashboard-core/src/models/overview.ts +98 -0
  146. package/packages/dashboard-core/src/models/runtime.ts +7 -0
  147. package/packages/dashboard-core/src/models/skills.ts +34 -0
  148. package/packages/dashboard-core/src/routes/index.ts +2 -0
  149. package/packages/dashboard-core/src/routes/manifest.test.ts +70 -0
  150. package/packages/dashboard-core/src/routes/manifest.ts +451 -0
  151. package/packages/dashboard-core/src/routes/types.ts +39 -0
  152. package/packages/dashboard-core/src/screens/analytics/AnalyticsScreen.tsx +278 -0
  153. package/packages/dashboard-core/src/screens/analytics/index.ts +1 -0
  154. package/packages/dashboard-core/src/screens/index.ts +37 -0
  155. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.test.ts +101 -0
  156. package/packages/dashboard-core/src/screens/overview/OverviewComparisonSurface.tsx +393 -0
  157. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.test.tsx +113 -0
  158. package/packages/dashboard-core/src/screens/overview/OverviewCompositionSurface.tsx +72 -0
  159. package/packages/dashboard-core/src/screens/overview/OverviewCoreSurface.tsx +71 -0
  160. package/packages/dashboard-core/src/screens/overview/OverviewOnboardingBanner.tsx +90 -0
  161. package/packages/dashboard-core/src/screens/overview/OverviewRunSummary.tsx +40 -0
  162. package/packages/dashboard-core/src/screens/overview/index.ts +16 -0
  163. package/packages/dashboard-core/src/screens/overview/types.ts +13 -0
  164. package/packages/dashboard-core/src/screens/skill-report/SkillReportDailyBreakdownSection.tsx +99 -0
  165. package/packages/dashboard-core/src/screens/skill-report/SkillReportDataQualityTabContent.tsx +35 -0
  166. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceRail.tsx +71 -0
  167. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceSection.tsx +63 -0
  168. package/packages/dashboard-core/src/screens/skill-report/SkillReportEvidenceTabContent.tsx +25 -0
  169. package/packages/dashboard-core/src/screens/skill-report/SkillReportInvocationsSection.tsx +24 -0
  170. package/packages/dashboard-core/src/screens/skill-report/SkillReportMissedQueriesSection.tsx +79 -0
  171. package/packages/dashboard-core/src/screens/skill-report/SkillReportScaffold.tsx +150 -0
  172. package/packages/dashboard-core/src/screens/skill-report/SkillReportSections.test.tsx +224 -0
  173. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.test.tsx +76 -0
  174. package/packages/dashboard-core/src/screens/skill-report/SkillReportTabs.tsx +88 -0
  175. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrendSection.tsx +33 -0
  176. package/packages/dashboard-core/src/screens/skill-report/SkillReportTrustBadge.tsx +67 -0
  177. package/packages/dashboard-core/src/screens/skill-report/index.ts +45 -0
  178. package/packages/dashboard-core/src/screens/skills/SkillsLibraryScreen.tsx +162 -0
  179. package/packages/dashboard-core/src/screens/skills/index.ts +6 -0
  180. package/packages/telemetry-contract/fixtures/complete-push.ts +1 -1
  181. package/packages/telemetry-contract/fixtures/evidence-only-push.ts +2 -2
  182. package/packages/telemetry-contract/fixtures/golden.test.ts +0 -1
  183. package/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts +1 -1
  184. package/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts +2 -2
  185. package/packages/telemetry-contract/package.json +1 -1
  186. package/packages/telemetry-contract/src/index.ts +1 -0
  187. package/packages/telemetry-contract/src/schemas.ts +63 -5
  188. package/packages/telemetry-contract/src/types.ts +97 -7
  189. package/packages/telemetry-contract/tests/compatibility.test.ts +0 -1
  190. package/packages/ui/AGENTS.md +16 -0
  191. package/packages/ui/README.md +1 -1
  192. package/packages/ui/package.json +1 -1
  193. package/packages/ui/src/components/ActivityTimeline.tsx +152 -168
  194. package/packages/ui/src/components/AnalyticsCharts.tsx +344 -0
  195. package/packages/ui/src/components/EvidenceViewer.tsx +229 -464
  196. package/packages/ui/src/components/EvolutionTimeline.tsx +34 -87
  197. package/packages/ui/src/components/InfoTip.tsx +1 -2
  198. package/packages/ui/src/components/InvocationsPanel.tsx +413 -0
  199. package/packages/ui/src/components/JobHistoryTimeline.tsx +156 -0
  200. package/packages/ui/src/components/OrchestrateRunsPanel.tsx +18 -36
  201. package/packages/ui/src/components/OverviewPanels.tsx +693 -0
  202. package/packages/ui/src/components/PipelineStatusBar.tsx +65 -0
  203. package/packages/ui/src/components/SkillReportGuide.tsx +215 -0
  204. package/packages/ui/src/components/SkillReportPanels.tsx +919 -0
  205. package/packages/ui/src/components/SkillsLibrary.tsx +437 -0
  206. package/packages/ui/src/components/index.ts +56 -1
  207. package/packages/ui/src/components/section-cards.tsx +18 -35
  208. package/packages/ui/src/components/skill-health-grid.tsx +47 -37
  209. package/packages/ui/src/lib/constants.tsx +0 -1
  210. package/packages/ui/src/primitives/card.tsx +1 -1
  211. package/packages/ui/src/primitives/checkbox.tsx +1 -1
  212. package/packages/ui/src/primitives/dropdown-menu.tsx +2 -2
  213. package/packages/ui/src/primitives/select.tsx +2 -2
  214. package/packages/ui/src/primitives/tabs.tsx +7 -6
  215. package/packages/ui/src/types.ts +182 -4
  216. package/skill/SKILL.md +130 -318
  217. package/skill/agents/diagnosis-analyst.md +3 -3
  218. package/skill/agents/evolution-reviewer.md +3 -3
  219. package/skill/agents/integration-guide.md +3 -3
  220. package/skill/agents/pattern-analyst.md +2 -2
  221. package/skill/references/cli-quick-reference.md +89 -0
  222. package/skill/references/creator-playbook.md +131 -0
  223. package/skill/references/examples.md +48 -0
  224. package/skill/references/troubleshooting.md +47 -0
  225. package/skill/references/version-history.md +1 -1
  226. package/skill/selftune.contribute.json +11 -0
  227. package/skill/{Workflows → workflows}/Baseline.md +20 -1
  228. package/skill/{Workflows → workflows}/Contribute.md +23 -10
  229. package/skill/{Workflows → workflows}/Contributions.md +13 -5
  230. package/skill/workflows/CreateTestDeploy.md +170 -0
  231. package/skill/{Workflows → workflows}/CreatorContributions.md +18 -6
  232. package/skill/{Workflows → workflows}/Cron.md +1 -1
  233. package/skill/{Workflows → workflows}/Dashboard.md +20 -0
  234. package/skill/{Workflows → workflows}/Doctor.md +1 -1
  235. package/skill/{Workflows → workflows}/Evals.md +67 -2
  236. package/skill/{Workflows → workflows}/Evolve.md +119 -30
  237. package/skill/{Workflows → workflows}/EvolveBody.md +41 -1
  238. package/skill/{Workflows → workflows}/Grade.md +1 -1
  239. package/skill/{Workflows → workflows}/Ingest.md +60 -2
  240. package/skill/{Workflows → workflows}/Initialize.md +16 -9
  241. package/skill/{Workflows → workflows}/Orchestrate.md +13 -3
  242. package/skill/{Workflows → workflows}/PlatformHooks.md +19 -3
  243. package/skill/workflows/Registry.md +99 -0
  244. package/skill/{Workflows → workflows}/Schedule.md +3 -3
  245. package/skill/workflows/SignalsDashboard.md +87 -0
  246. package/skill/{Workflows → workflows}/Sync.md +3 -1
  247. package/skill/{Workflows → workflows}/UnitTest.md +19 -0
  248. package/skill/{Workflows → workflows}/Watch.md +42 -2
  249. package/skill/{Workflows → workflows}/Workflows.md +39 -2
  250. package/apps/local-dashboard/dist/assets/index-D8O-RG1I.js +0 -60
  251. package/apps/local-dashboard/dist/assets/index-_EcLywDg.css +0 -1
  252. package/apps/local-dashboard/dist/assets/vendor-react-CKkiCskZ.js +0 -11
  253. package/apps/local-dashboard/dist/assets/vendor-ui-CGEmUayx.js +0 -12
  254. package/cli/selftune/utils/html.ts +0 -27
  255. package/packages/ui/src/components/RecentActivityFeed.tsx +0 -117
  256. /package/skill/{Workflows → workflows}/AlphaUpload.md +0 -0
  257. /package/skill/{Workflows → workflows}/AutoActivation.md +0 -0
  258. /package/skill/{Workflows → workflows}/Badge.md +0 -0
  259. /package/skill/{Workflows → workflows}/Composability.md +0 -0
  260. /package/skill/{Workflows → workflows}/EvolutionMemory.md +0 -0
  261. /package/skill/{Workflows → workflows}/ExportCanonical.md +0 -0
  262. /package/skill/{Workflows → workflows}/Hook.md +0 -0
  263. /package/skill/{Workflows → workflows}/ImportSkillsBench.md +0 -0
  264. /package/skill/{Workflows → workflows}/Quickstart.md +0 -0
  265. /package/skill/{Workflows → workflows}/Recover.md +0 -0
  266. /package/skill/{Workflows → workflows}/RepairSkillUsage.md +0 -0
  267. /package/skill/{Workflows → workflows}/Replay.md +0 -0
  268. /package/skill/{Workflows → workflows}/Rollback.md +0 -0
  269. /package/skill/{Workflows → workflows}/Telemetry.md +0 -0
  270. /package/skill/{Workflows → workflows}/Uninstall.md +0 -0
@@ -10,9 +10,10 @@
10
10
 
11
11
  import { parseArgs } from "node:util";
12
12
 
13
+ import { writeGradingBaseline } from "../localdb/direct-write.js";
13
14
  import type { BaselineResult, EvalEntry } from "../types.js";
14
15
  import { CLIError, handleCLIError } from "../utils/cli-error.js";
15
- import { callLlm } from "../utils/llm-call.js";
16
+ import { callLlm, detectLlmAgent } from "../utils/llm-call.js";
16
17
  import { buildTriggerCheckPrompt, parseTriggerResponse } from "../utils/trigger-check.js";
17
18
 
18
19
  // ---------------------------------------------------------------------------
@@ -208,7 +209,6 @@ Options:
208
209
  }
209
210
 
210
211
  // Detect agent
211
- const { detectAgent } = await import("../utils/llm-call.js");
212
212
  const requestedAgent = values.agent;
213
213
  if (requestedAgent && !Bun.which(requestedAgent)) {
214
214
  throw new CLIError(
@@ -217,12 +217,12 @@ Options:
217
217
  "Install it or omit --agent to use auto-detection",
218
218
  );
219
219
  }
220
- const agent = requestedAgent ?? detectAgent();
220
+ const agent = requestedAgent ?? detectLlmAgent();
221
221
  if (!agent) {
222
222
  throw new CLIError(
223
- "No agent CLI (claude/codex/opencode) found in PATH",
223
+ "No agent CLI (claude/codex/opencode/pi) found in PATH",
224
224
  "AGENT_NOT_FOUND",
225
- "Install Claude Code, Codex, or OpenCode",
225
+ "Install Claude Code, Codex, OpenCode, or Pi",
226
226
  );
227
227
  }
228
228
 
@@ -233,6 +233,22 @@ Options:
233
233
  agent,
234
234
  });
235
235
 
236
+ writeGradingBaseline({
237
+ skill_name: values.skill,
238
+ proposal_id: null,
239
+ measured_at: result.measured_at,
240
+ pass_rate: result.with_skill_pass_rate,
241
+ mean_score: null,
242
+ sample_size: evalSet.length,
243
+ grading_results_json: JSON.stringify({
244
+ baseline_pass_rate: result.baseline_pass_rate,
245
+ with_skill_pass_rate: result.with_skill_pass_rate,
246
+ lift: result.lift,
247
+ adds_value: result.adds_value,
248
+ per_entry: result.per_entry,
249
+ }),
250
+ });
251
+
236
252
  console.log(JSON.stringify(result, null, 2));
237
253
  process.exit(result.adds_value ? 0 : 1);
238
254
  }
@@ -0,0 +1,170 @@
1
+ /**
2
+ * execution-eval.ts
3
+ *
4
+ * Experimental execution eval harness — runs assertion-based evals
5
+ * in a staged skill workspace. Phase 2 of eval system gap closure.
6
+ *
7
+ * Behind experimental flag: must be explicitly opted into.
8
+ */
9
+
10
+ import { copyFileSync, existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
11
+ import { tmpdir } from "node:os";
12
+ import { join } from "node:path";
13
+ import type { ExecutionAssertion, ExecutionEvalEntry } from "../types.js";
14
+
15
+ // ---------------------------------------------------------------------------
16
+ // Result types
17
+ // ---------------------------------------------------------------------------
18
+
19
+ export interface ExecutionEvalResult {
20
+ entry: ExecutionEvalEntry;
21
+ passed: boolean;
22
+ assertion_results: AssertionResult[];
23
+ workspace_path?: string;
24
+ elapsed_ms: number;
25
+ }
26
+
27
+ export interface AssertionResult {
28
+ assertion: ExecutionAssertion;
29
+ passed: boolean;
30
+ actual?: string;
31
+ error?: string;
32
+ }
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Staged workspace management
36
+ // ---------------------------------------------------------------------------
37
+
38
+ /**
39
+ * Create a staged workspace for execution eval isolation.
40
+ * Copies the skill into a temp directory so assertions don't affect the real skill.
41
+ */
42
+ export function createStagedWorkspace(skillPath: string): string {
43
+ const workspace = mkdtempSync(join(tmpdir(), "selftune-exec-eval-"));
44
+ const skillDir = join(workspace, "skill");
45
+ mkdirSync(skillDir, { recursive: true });
46
+
47
+ if (existsSync(skillPath)) {
48
+ copyFileSync(skillPath, join(skillDir, "SKILL.md"));
49
+ }
50
+
51
+ return workspace;
52
+ }
53
+
54
+ /**
55
+ * Clean up a staged workspace.
56
+ */
57
+ export function cleanupStagedWorkspace(workspacePath: string): void {
58
+ try {
59
+ rmSync(workspacePath, { recursive: true, force: true });
60
+ } catch {
61
+ // Best-effort cleanup — non-fatal
62
+ }
63
+ }
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Assertion runner
67
+ // ---------------------------------------------------------------------------
68
+
69
+ /**
70
+ * Run a single execution assertion against a workspace.
71
+ */
72
+ export function runAssertion(
73
+ assertion: ExecutionAssertion,
74
+ workspacePath: string,
75
+ ): AssertionResult {
76
+ try {
77
+ switch (assertion.type) {
78
+ case "file_exists": {
79
+ const target = join(workspacePath, assertion.target);
80
+ const exists = existsSync(target);
81
+ const passed = assertion.negated ? !exists : exists;
82
+ return { assertion, passed, actual: exists ? "exists" : "not found" };
83
+ }
84
+
85
+ case "file_contains": {
86
+ const target = join(workspacePath, assertion.target);
87
+ if (!existsSync(target)) {
88
+ return { assertion, passed: !!assertion.negated, actual: "file not found" };
89
+ }
90
+ const content = readFileSync(target, "utf-8");
91
+ const pattern = new RegExp(assertion.expected ?? "");
92
+ const matches = pattern.test(content);
93
+ const passed = assertion.negated ? !matches : matches;
94
+ return { assertion, passed, actual: matches ? "matched" : "no match" };
95
+ }
96
+
97
+ case "command_output":
98
+ case "skill_triggered":
99
+ case "custom": {
100
+ return {
101
+ assertion,
102
+ passed: false,
103
+ error: `Assertion type "${assertion.type}" not yet implemented`,
104
+ };
105
+ }
106
+
107
+ default:
108
+ return { assertion, passed: false, error: "Unknown assertion type" };
109
+ }
110
+ } catch (err) {
111
+ return { assertion, passed: false, error: String(err) };
112
+ }
113
+ }
114
+
115
+ // ---------------------------------------------------------------------------
116
+ // Execution eval runner
117
+ // ---------------------------------------------------------------------------
118
+
119
+ /**
120
+ * Run all execution evals for a set of entries.
121
+ * Requires experimental opt-in (entries must have `experimental: true`).
122
+ */
123
+ export async function runExecutionEvals(
124
+ entries: ExecutionEvalEntry[],
125
+ skillPath: string,
126
+ options?: { gateDeployment?: boolean },
127
+ ): Promise<{
128
+ results: ExecutionEvalResult[];
129
+ gate_passed: boolean;
130
+ summary: { total: number; passed: number; failed: number };
131
+ }> {
132
+ const results: ExecutionEvalResult[] = [];
133
+
134
+ for (const entry of entries) {
135
+ const start = Date.now();
136
+ let workspace: string | undefined;
137
+
138
+ try {
139
+ if (entry.requires_workspace) {
140
+ workspace = createStagedWorkspace(skillPath);
141
+ }
142
+
143
+ const assertionResults = entry.assertions.map((a) => runAssertion(a, workspace ?? "."));
144
+
145
+ const passed = assertionResults.every((r) => r.passed);
146
+
147
+ results.push({
148
+ entry,
149
+ passed,
150
+ assertion_results: assertionResults,
151
+ workspace_path: workspace,
152
+ elapsed_ms: Date.now() - start,
153
+ });
154
+ } finally {
155
+ if (workspace) {
156
+ cleanupStagedWorkspace(workspace);
157
+ }
158
+ }
159
+ }
160
+
161
+ const passed = results.filter((r) => r.passed).length;
162
+ const failed = results.length - passed;
163
+ const gatePassed = !options?.gateDeployment || failed === 0;
164
+
165
+ return {
166
+ results,
167
+ gate_passed: gatePassed,
168
+ summary: { total: results.length, passed, failed },
169
+ };
170
+ }
@@ -438,7 +438,7 @@ function buildRefactorProposal(
438
438
  return {
439
439
  workflow_name: workflowName,
440
440
  source_skill: skillName,
441
- suggested_path: `Workflows/${workflowName}.md`,
441
+ suggested_path: `workflows/${workflowName}.md`,
442
442
  };
443
443
  });
444
444
 
@@ -453,7 +453,7 @@ function buildRefactorProposal(
453
453
  migration_notes: [
454
454
  `Create a parent skill \`${parentSkillName}\` whose SKILL.md routes into internal workflows instead of exposing each family member as a primary top-level trigger surface.`,
455
455
  "Keep the existing sibling skills as thin compatibility aliases for at least one release cycle while usage shifts to the parent skill.",
456
- "Move execution-specific instructions into internal Workflows/ or references/ files so the parent SKILL.md stays focused on routing and progressive disclosure.",
456
+ "Move execution-specific instructions into internal workflows/ or references/ files so the parent SKILL.md stays focused on routing and progressive disclosure.",
457
457
  "Use the compatibility aliases to measure whether trigger quality improves before removing the old skill entry points.",
458
458
  ],
459
459
  };