switchroom 0.7.15 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. package/README.md +51 -59
  2. package/bin/run-hook.sh +27 -11
  3. package/bin/timezone-hook.sh +9 -7
  4. package/dist/agent-scheduler/index.js +410 -133
  5. package/dist/auth-broker/index.js +13932 -0
  6. package/dist/cli/switchroom.js +26937 -5601
  7. package/dist/host-control/main.js +12702 -0
  8. package/dist/vault/approvals/kernel-server.js +467 -184
  9. package/dist/vault/broker/server.js +1430 -724
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +7 -4
  16. package/profiles/_base/settings.json.hbs +20 -5
  17. package/profiles/_base/start.sh.hbs +16 -3
  18. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  19. package/profiles/_shared/telegram-style.md.hbs +20 -90
  20. package/profiles/_shared/vault-protocol.md.hbs +68 -0
  21. package/profiles/default/CLAUDE.md +50 -96
  22. package/profiles/default/CLAUDE.md.hbs +36 -6
  23. package/profiles/default/workspace/SOUL.md.hbs +12 -5
  24. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  25. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  26. package/skills/buildkite-api/SKILL.md +31 -8
  27. package/skills/buildkite-cli/SKILL.md +27 -9
  28. package/skills/buildkite-migration/SKILL.md +22 -9
  29. package/skills/buildkite-pipelines/SKILL.md +26 -9
  30. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  31. package/skills/buildkite-test-engine/SKILL.md +25 -8
  32. package/skills/docx/SKILL.md +1 -1
  33. package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
  34. package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
  35. package/skills/file-bug/SKILL.md +34 -6
  36. package/skills/humanizer/SKILL.md +15 -0
  37. package/skills/humanizer-calibrate/SKILL.md +7 -1
  38. package/skills/mcp-builder/SKILL.md +1 -1
  39. package/skills/pdf/SKILL.md +1 -1
  40. package/skills/pptx/SKILL.md +1 -1
  41. package/skills/skill-creator/SKILL.md +21 -1
  42. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  43. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  44. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  45. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  46. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  47. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  48. package/skills/switchroom-cli/SKILL.md +63 -64
  49. package/skills/switchroom-health/SKILL.md +23 -10
  50. package/skills/switchroom-install/SKILL.md +3 -3
  51. package/skills/switchroom-manage/SKILL.md +26 -19
  52. package/skills/switchroom-runtime/SKILL.md +191 -0
  53. package/skills/switchroom-status/SKILL.md +27 -2
  54. package/skills/telegram-test-harness/SKILL.md +3 -0
  55. package/skills/token-helpers/SKILL.md +24 -1
  56. package/skills/webapp-testing/SKILL.md +31 -1
  57. package/skills/xlsx/SKILL.md +1 -1
  58. package/telegram-plugin/admin-commands/index.ts +7 -5
  59. package/telegram-plugin/analytics-posthog.ts +191 -0
  60. package/telegram-plugin/bridge/bridge.ts +69 -0
  61. package/telegram-plugin/bridge/ipc-client.ts +4 -1
  62. package/telegram-plugin/dist/bridge/bridge.js +194 -119
  63. package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
  64. package/telegram-plugin/dist/server.js +245 -189
  65. package/telegram-plugin/first-paint.ts +3 -24
  66. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  67. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  68. package/telegram-plugin/gateway/auth-command.ts +794 -0
  69. package/telegram-plugin/gateway/auth-line.ts +123 -0
  70. package/telegram-plugin/gateway/boot-card.ts +169 -40
  71. package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
  72. package/telegram-plugin/gateway/boot-probes.ts +166 -123
  73. package/telegram-plugin/gateway/boot-reason.ts +41 -7
  74. package/telegram-plugin/gateway/boot-version.ts +66 -0
  75. package/telegram-plugin/gateway/gateway.ts +3499 -1885
  76. package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
  77. package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
  78. package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
  79. package/telegram-plugin/gateway/quarantine.ts +69 -0
  80. package/telegram-plugin/gateway/quota-cache.ts +9 -4
  81. package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
  82. package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
  83. package/telegram-plugin/gateway/recent-denials.ts +77 -0
  84. package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
  85. package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
  86. package/telegram-plugin/history.ts +91 -0
  87. package/telegram-plugin/hooks/hooks.json +10 -0
  88. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
  89. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
  90. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
  91. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  92. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  93. package/telegram-plugin/inbound-classifier.ts +50 -0
  94. package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
  95. package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
  96. package/telegram-plugin/package.json +4 -2
  97. package/telegram-plugin/permission-rule.ts +51 -0
  98. package/telegram-plugin/permission-title.ts +56 -0
  99. package/telegram-plugin/quota-check.ts +19 -41
  100. package/telegram-plugin/registry/reaper.ts +223 -0
  101. package/telegram-plugin/retry-api-call.ts +80 -0
  102. package/telegram-plugin/runtime-metrics.ts +177 -0
  103. package/telegram-plugin/scripts/build.mjs +0 -1
  104. package/telegram-plugin/secret-detect/index.ts +24 -0
  105. package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
  106. package/telegram-plugin/secret-detect/vault-error.ts +78 -11
  107. package/telegram-plugin/secret-detect/vault-write.ts +14 -2
  108. package/telegram-plugin/server.js +41795 -0
  109. package/telegram-plugin/session-tail.ts +6 -1
  110. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  111. package/telegram-plugin/silence-poke.ts +420 -0
  112. package/telegram-plugin/silent-end.ts +174 -0
  113. package/telegram-plugin/stream-controller.ts +13 -0
  114. package/telegram-plugin/stream-reply-handler.ts +7 -0
  115. package/telegram-plugin/subagent-watcher.ts +213 -4
  116. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  117. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  118. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  119. package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
  120. package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
  121. package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
  122. package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
  123. package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
  124. package/telegram-plugin/tests/boot-probes.test.ts +216 -10
  125. package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
  126. package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
  127. package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
  128. package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
  129. package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
  130. package/telegram-plugin/tests/history-reaper.test.ts +378 -0
  131. package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
  132. package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
  133. package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
  134. package/telegram-plugin/tests/issues-card.test.ts +49 -0
  135. package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
  136. package/telegram-plugin/tests/permission-rule.test.ts +80 -1
  137. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  138. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  139. package/telegram-plugin/tests/races.test.ts +179 -0
  140. package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
  141. package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
  142. package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
  143. package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
  144. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
  145. package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
  146. package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
  147. package/telegram-plugin/tests/silence-poke.test.ts +493 -0
  148. package/telegram-plugin/tests/silent-end.test.ts +206 -0
  149. package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
  150. package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
  151. package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
  152. package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
  153. package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
  154. package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
  155. package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
  156. package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
  157. package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
  158. package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
  159. package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
  160. package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
  161. package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
  162. package/telegram-plugin/turn-signal-tracker.ts +100 -24
  163. package/telegram-plugin/uat/SETUP.md +210 -35
  164. package/telegram-plugin/uat/assertions.ts +264 -37
  165. package/telegram-plugin/uat/driver-info.ts +57 -0
  166. package/telegram-plugin/uat/driver.ts +590 -51
  167. package/telegram-plugin/uat/harness.ts +140 -94
  168. package/telegram-plugin/uat/load-env.test.ts +72 -0
  169. package/telegram-plugin/uat/load-env.ts +48 -0
  170. package/telegram-plugin/uat/login.ts +96 -53
  171. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  172. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  173. package/telegram-plugin/uat/runners/report.ts +150 -0
  174. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  175. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  176. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  177. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  178. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  179. package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
  180. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
  181. package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
  182. package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
  183. package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
  184. package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
  185. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
  186. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
  187. package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
  188. package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
  189. package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
  190. package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
  191. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
  192. package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
  193. package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
  194. package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
  195. package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
  196. package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
  197. package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
  198. package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
  199. package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
  200. package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
  201. package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
  202. package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
  203. package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
  204. package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
  205. package/telegram-plugin/vault-approval-posture.ts +42 -0
  206. package/telegram-plugin/welcome-text.ts +1 -0
  207. package/telegram-plugin/active-pins-sweep.ts +0 -204
  208. package/telegram-plugin/active-pins.ts +0 -146
  209. package/telegram-plugin/auth-dashboard.ts +0 -1104
  210. package/telegram-plugin/auth-slot-parser.ts +0 -497
  211. package/telegram-plugin/card-event-log.ts +0 -138
  212. package/telegram-plugin/dist/foreman/foreman.js +0 -31106
  213. package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
  214. package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
  215. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  216. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  217. package/telegram-plugin/foreman/foreman.ts +0 -1165
  218. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  219. package/telegram-plugin/foreman/setup-state.ts +0 -239
  220. package/telegram-plugin/foreman/state.ts +0 -203
  221. package/telegram-plugin/pin-event-log.ts +0 -76
  222. package/telegram-plugin/progress-card-driver.ts +0 -2886
  223. package/telegram-plugin/progress-card-pin-manager.ts +0 -589
  224. package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
  225. package/telegram-plugin/progress-card.ts +0 -1409
  226. package/telegram-plugin/tests/HARNESS.md +0 -340
  227. package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
  228. package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
  229. package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
  230. package/telegram-plugin/tests/active-pins.test.ts +0 -187
  231. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  232. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  233. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  234. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  235. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  236. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  237. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
  238. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  239. package/telegram-plugin/tests/card-event-log.test.ts +0 -145
  240. package/telegram-plugin/tests/first-paint.test.ts +0 -257
  241. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  242. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  243. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  244. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  245. package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
  246. package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
  247. package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
  248. package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
  249. package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
  250. package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
  251. package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
  252. package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
  253. package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
  254. package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
  255. package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
  256. package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
  257. package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
  258. package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
  259. package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
  260. package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
  261. package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
  262. package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
  263. package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
  264. package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
  265. package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
  266. package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
  267. package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
  268. package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
  269. package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
  270. package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
  271. package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
  272. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  273. package/telegram-plugin/tests/setup-state.test.ts +0 -146
  274. package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
  275. package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
  276. package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
  277. package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
  278. package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
  279. package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
  280. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
  281. package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
  282. package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
  283. package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
  284. package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
  285. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
  286. package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
  287. package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
  288. package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
  289. package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
  290. package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
  291. package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
  292. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
  293. package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
  294. package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
  295. package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
  296. package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
  297. package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
  298. package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
  299. package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
  300. package/telegram-plugin/two-zone-card.ts +0 -269
  301. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
@@ -0,0 +1,150 @@
1
+ /**
2
+ * Markdown report renderer for the agent-self-sufficiency UAT.
3
+ *
4
+ * Layout decisions:
5
+ *
6
+ * - Per-criterion pass-rate table is the headline — operator reads
7
+ * "did we move the needle" in one glance.
8
+ * - Per-agent + per-shape tables answer "did this regress for one
9
+ * agent" and "did one shape (typo/voice/multi) collapse".
10
+ * - Triage table lists every failure / timeout / error verbatim with
11
+ * the prompt and the reply, so the operator can diff them in the
12
+ * PR without re-running. Cap at 100 rows to keep the PR body
13
+ * digestible — the JSON sidecar (written alongside) has everything.
14
+ */
15
+
16
+ import type { CaseResult } from "./scorer.js";
17
+ import { aggregate } from "./scorer.js";
18
+ import { CRITERIA } from "./paraphrases.js";
19
+
20
+ export interface RenderOptions {
21
+ /** When the run started; rendered in the report header as an ISO-8601 timestamp. */
22
+ startedAt: Date;
23
+ /** Total wall-clock seconds for the run; rendered in the header with one decimal place. */
24
+ durationSeconds: number;
25
+ /** Agents the runner targeted. The per-agent table has one row per entry, in this order. */
26
+ agents: readonly string[];
27
+ /** Cap on triage rows shown in the rendered markdown. Default 100. */
28
+ triageCap?: number;
29
+ }
30
+
31
/**
 * Render the agent-self-sufficiency UAT results as a markdown report.
 *
 * Sections, in order: run header, pass rate by acceptance criterion,
 * pass rate by agent, pass rate by paraphrase shape, and a triage table
 * listing every non-pass case (capped at `opts.triageCap` rows).
 *
 * @param results Every scored case from the run.
 * @param opts Run metadata plus the optional triage row cap.
 * @returns The report as a single newline-joined markdown string.
 */
export function renderMarkdown(
  results: readonly CaseResult[],
  opts: RenderOptions,
): string {
  const agg = aggregate(results);
  const total = results.length;
  const passes = results.filter((r) => r.outcome === "pass").length;
  const passRate = total === 0 ? 0 : (passes / total) * 100;
  const cap = normalizeTriageCap(opts.triageCap);

  const lines: string[] = [];
  lines.push("# Agent self-sufficiency UAT report");
  lines.push("");
  lines.push(`- **Run start:** ${opts.startedAt.toISOString()}`);
  lines.push(`- **Duration:** ${opts.durationSeconds.toFixed(1)}s`);
  lines.push(`- **Agents:** ${opts.agents.join(", ") || "(none)"}`);
  lines.push(`- **Total cases:** ${total}`);
  lines.push(`- **Overall pass rate:** ${passRate.toFixed(1)}% (${passes}/${total})`);
  lines.push("");

  // Per-criterion table. Rows follow the corpus order, and criteria with
  // no results still get a row (all zeros, rate "—").
  lines.push("## Pass rate by acceptance criterion");
  lines.push("");
  lines.push("| Criterion | Description | Pass | Fail | Timeout | Error | Rate |");
  lines.push("|---|---|---:|---:|---:|---:|---:|");
  for (const spec of CRITERIA) {
    lines.push(
      `| \`${spec.id}\` | ${spec.description} | ${tallyCells(agg.byCriterion.get(spec.id))} |`,
    );
  }
  lines.push("");

  // Per-agent table. Rows follow opts.agents, not the agents seen in results.
  lines.push("## Pass rate by agent");
  lines.push("");
  lines.push("| Agent | Pass | Fail | Timeout | Error | Rate |");
  lines.push("|---|---:|---:|---:|---:|---:|");
  for (const agent of opts.agents) {
    lines.push(`| \`${agent}\` | ${tallyCells(agg.byAgent.get(agent))} |`);
  }
  lines.push("");

  // Per-shape table — does the corpus's typo / voice / multi-intent
  // styles regress relative to formal / terse?
  lines.push("## Pass rate by paraphrase shape");
  lines.push("");
  lines.push("| Shape | Pass | Fail | Timeout | Error | Rate |");
  lines.push("|---|---:|---:|---:|---:|---:|");
  for (const shape of ["formal", "terse", "typo", "voice", "multi"] as const) {
    lines.push(`| ${shape} | ${tallyCells(agg.byShape.get(shape))} |`);
  }
  lines.push("");

  // Triage — every non-pass, verbatim.
  const triage = results.filter((r) => r.outcome !== "pass");
  if (triage.length > 0) {
    lines.push("## Triage — failures, timeouts, errors");
    lines.push("");
    lines.push(`${triage.length} non-pass cases (showing up to ${cap}):`);
    lines.push("");
    lines.push("| # | Agent | Criterion | Shape | Outcome | Prompt | Reply (or error) |");
    lines.push("|---:|---|---|---|---|---|---|");
    triage.slice(0, cap).forEach((r, i) => {
      const reply =
        r.outcome === "error"
          ? `_error: ${escapeCell(r.errorMessage ?? "?")}_`
          : r.outcome === "timeout"
            ? `_timeout after ${r.durationMs}ms_`
            : escapeCell(truncate(r.reply, 240));
      lines.push(
        `| ${i + 1} | \`${r.agent}\` | \`${r.criterion}\` | ${r.paraphrase.shape} | ${r.outcome} | ${escapeCell(truncate(r.paraphrase.text, 120))} | ${reply} |`,
      );
    });
    if (triage.length > cap) {
      lines.push("");
      lines.push(`_…and ${triage.length - cap} more. Full results in the JSON sidecar._`);
    }
    lines.push("");
  } else {
    lines.push("## Triage");
    lines.push("");
    lines.push("All cases passed. No triage required.");
    lines.push("");
  }

  return lines.join("\n");
}

/**
 * Resolve the triage row cap to a non-negative whole number.
 *
 * A missing or NaN cap falls back to the default of 100. Negative values
 * clamp to 0 (a negative `slice` end would drop rows from the tail instead
 * of limiting them) and fractional values are floored so the "…and N more"
 * count stays a whole number. `Infinity` passes through as "no cap".
 */
function normalizeTriageCap(requested: number | undefined): number {
  if (requested == null || Number.isNaN(requested)) return 100;
  return Math.max(0, Math.floor(requested));
}

/**
 * Format the shared `pass | fail | timeout | error | rate` cells of a
 * summary-table row. A missing tally renders as all zeros; the rate is
 * "—" when there were no cases, otherwise a whole-number percentage.
 */
function tallyCells(
  row: { pass: number; fail: number; timeout: number; error: number } | undefined,
): string {
  const { pass, fail, timeout, error } = row ?? { pass: 0, fail: 0, timeout: 0, error: 0 };
  const n = pass + fail + timeout + error;
  const rate = n === 0 ? "—" : `${((pass / n) * 100).toFixed(0)}%`;
  return `${pass} | ${fail} | ${timeout} | ${error} | ${rate}`;
}
142
+
143
/**
 * Make arbitrary text safe to embed in a single markdown table cell.
 *
 * - `|` is backslash-escaped so it cannot end the cell.
 * - Every line break (`\r\n`, lone `\r`, or `\n`) becomes one space; a
 *   carriage return left in place would still terminate the table row.
 * - Backticks are swapped for `ʼ` so stray code spans cannot swallow
 *   the rest of the row.
 */
function escapeCell(s: string): string {
  return s
    .replace(/\|/g, "\\|")
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/`/g, "ʼ");
}
146
+
147
/**
 * Shorten `s` to at most `n` UTF-16 code units, marking the cut with "…".
 *
 * Strings that already fit are returned unchanged. A non-positive budget
 * yields the empty string (a negative `slice` end would otherwise keep
 * almost the whole input). If the cut would land between the two halves
 * of a surrogate pair, the dangling high surrogate is dropped so the
 * result never contains a broken character.
 */
function truncate(s: string, n: number): string {
  if (s.length <= n) return s;
  if (n <= 0) return "";
  let head = s.slice(0, n - 1);
  const last = head.charCodeAt(head.length - 1);
  // 0xD800–0xDBFF is the high-surrogate range; NaN (empty head) fails both tests.
  if (last >= 0xd800 && last <= 0xdbff) head = head.slice(0, -1);
  return head + "…";
}
@@ -0,0 +1,50 @@
1
#!/usr/bin/env bash
# Run the agent-self-sufficiency UAT against the live fleet on this host.
#
# Why a wrapper script: the UAT runner needs three secrets out of the
# vault (TELEGRAM_API_ID / API_HASH / DRIVER_SESSION) plus the per-agent
# bot usernames. Pulling them inline here so an operator can run the
# whole suite with a single command:
#
#   UAT_FLEET="agent1:@bot1,agent2:@bot2" \
#     ./telegram-plugin/uat/runners/run-agent-self-sufficiency.sh
#
# The vault prompts for its passphrase interactively (once); the script
# then exports the three secrets only into the bun subprocess, never to
# the surrounding shell.
#
# UAT_FLEET is required; UAT_ADMIN_AGENTS is optional (see the runner's
# --help for the format). Any arguments are passed through to the runner.

set -euo pipefail

cd -- "$(dirname -- "$0")/../../.."   # → repo root

# ── 1. Require an explicit fleet ────────────────────────────────────────
# This script does not discover the fleet itself. Check before touching
# the vault so the operator is not asked for a passphrase only to be
# told to set UAT_FLEET and re-run. Usage goes to stderr; exit code 64
# is kept from the original script.
if [[ -z "${UAT_FLEET:-}" ]]; then
  {
    echo "[uat] UAT_FLEET not set — set it explicitly to:"
    echo "  UAT_FLEET=\"agent1:@bot1,agent2:@bot2,agent3:@bot3\""
    echo "  UAT_ADMIN_AGENTS=\"agent1,agent2\"   # optional"
    echo ""
    echo "  Bot usernames live in BotFather or can be read from each"
    echo "  agent's vault entry. Set them and re-run."
  } >&2
  exit 64
fi

# ── 2. Pull the three UAT secrets from vault ────────────────────────────
# `switchroom vault get` prompts for the passphrase on first call and
# caches the unlocked broker for the session — subsequent gets are
# silent. Secrets travel via the environment, not argv, so they don't
# show up in `ps`. Failed lookups fail loud: `set -e` aborts on a
# non-zero exit from the command substitution, and an empty value is
# rejected explicitly below.
echo "[uat] unlocking vault to read UAT secrets..."
TELEGRAM_API_ID="$(switchroom vault get telegram-uat-api-id)"
TELEGRAM_API_HASH="$(switchroom vault get telegram-uat-api-hash)"
TELEGRAM_UAT_DRIVER_SESSION="$(switchroom vault get telegram-uat-driver-session)"

for secret_name in TELEGRAM_API_ID TELEGRAM_API_HASH TELEGRAM_UAT_DRIVER_SESSION; do
  # ${!secret_name} is bash indirect expansion: the value of the variable
  # whose name is stored in secret_name.
  if [[ -z "${!secret_name}" ]]; then
    echo "[uat] vault returned an empty value for ${secret_name}" >&2
    exit 1
  fi
done
export TELEGRAM_API_ID TELEGRAM_API_HASH TELEGRAM_UAT_DRIVER_SESSION

# ── 3. Run ──────────────────────────────────────────────────────────────
# exec replaces this shell, so the runner's exit status is the script's.
exec bun telegram-plugin/uat/runners/agent-self-sufficiency.ts "$@"
@@ -0,0 +1,196 @@
1
+ /**
2
+ * Tests for the agent-self-sufficiency UAT runner's pure functions.
3
+ * The driver / Telegram orchestration is exercised live via the
4
+ * runner script itself (`agent-self-sufficiency.ts`) — these tests
5
+ * pin the scoring + reporting contracts so a refactor doesn't
6
+ * silently flip "fail" to "pass" or scramble the markdown layout.
7
+ */
8
+
9
+ import { describe, it, expect } from "vitest";
10
+ import { scoreReply, aggregate, type CaseResult } from "./scorer.js";
11
+ import { CRITERIA, patternFor } from "./paraphrases.js";
12
+ import { renderMarkdown } from "./report.js";
13
+
14
/**
 * Look up a criterion spec by id, failing loudly when the corpus no
 * longer contains it.
 *
 * A bare `find(...)!` silences the compiler but still yields `undefined`
 * at runtime if an id is renamed or removed, which would then leak into
 * every test that uses the constant. Throwing here names the missing id.
 *
 * @param id - Criterion id to look up in `CRITERIA`.
 * @returns The matching criterion spec.
 * @throws Error when no criterion with that id exists.
 */
function specFor(
  id: (typeof CRITERIA)[number]["id"],
): (typeof CRITERIA)[number] {
  const spec = CRITERIA.find((c) => c.id === id);
  if (spec === undefined) {
    throw new Error(`criterion ${id} is missing from CRITERIA`);
  }
  return spec;
}

const SPEC_IDENTITY = specFor("2a_what_are_you");
const SPEC_NAME = specFor("2b_your_name");
const SPEC_PEERS = specFor("2c_peers");
const SPEC_CRON = specFor("1b_cron_list");
const SPEC_REFUSAL = specFor("3d_admin_refusal");
19
+
20
// Structural checks on the paraphrase corpus: every criterion must carry
// enough paraphrases and at least one of each paraphrase shape.
describe("CRITERIA corpus shape", () => {
  it("has at least 10 paraphrases per criterion (goal acceptance gate)", () => {
    const minimum = 10;
    CRITERIA.forEach((criterion) => {
      const count = criterion.paraphrases.length;
      expect(count, `criterion ${criterion.id}`).toBeGreaterThanOrEqual(minimum);
    });
  });

  it("covers every paraphrase shape at least once per criterion", () => {
    const requiredShapes = ["formal", "terse", "typo", "voice", "multi"] as const;
    for (const criterion of CRITERIA) {
      // Collect the shapes this criterion actually uses, then check each
      // required shape against that set.
      const present = new Set(criterion.paraphrases.map((p) => p.shape));
      requiredShapes.forEach((shape) => {
        const failureLabel = `${criterion.id} missing shape ${shape}`;
        expect(present.has(shape), failureLabel).toBe(true);
      });
    }
  });
});
39
+
40
// Contract tests for scoreReply / patternFor: each case pins one
// reply → outcome mapping for a specific criterion.
describe("scoreReply", () => {
  /** Build the injection argument for the given agent name. */
  const named = (agentName: string) => ({ agentName });

  it("returns pass when the identity criterion's reply mentions switchroom + claude code", () => {
    const outcome = scoreReply(
      SPEC_IDENTITY,
      "I'm a switchroom agent running Claude Code under the official `claude` CLI.",
      named("x"),
    );
    expect(outcome).toBe("pass");
  });

  it("returns fail when the identity reply is generic 'AI assistant' boilerplate", () => {
    const outcome = scoreReply(
      SPEC_IDENTITY,
      "I'm an AI assistant here to help you with tasks.",
      named("x"),
    );
    expect(outcome).toBe("fail");
  });

  it("returns fail on empty replies regardless of criterion", () => {
    const emptyOutcome = scoreReply(SPEC_PEERS, "", named("x"));
    expect(emptyOutcome).toBe("fail");
    const blankOutcome = scoreReply(SPEC_PEERS, " ", named("x"));
    expect(blankOutcome).toBe("fail");
  });

  it("strips markdown bold/code before matching so formatting doesn't flip outcomes", () => {
    // Matching the raw text would leave the keywords wrapped in ** and
    // backticks; a pass here shows the markers are removed first.
    const formatted = "I'm a **switchroom** agent on `claude code`.";
    const outcome = scoreReply(SPEC_IDENTITY, formatted, named("x"));
    expect(outcome).toBe("pass");
  });

  it("substitutes __INJECTED_AGENT_NAME__ for the per-agent name criterion", () => {
    const matcher = patternFor(SPEC_NAME, named("klanker"));
    expect(matcher.test("my name is klanker")).toBe(true);
    expect(matcher.test("my name is doc")).toBe(false);
  });

  it("scores 2b_your_name pass when the reply contains the agent name", () => {
    const outcome = scoreReply(SPEC_NAME, "My name is klanker.", named("klanker"));
    expect(outcome).toBe("pass");
  });

  it("scores 2b_your_name fail when the reply names a different agent", () => {
    const outcome = scoreReply(SPEC_NAME, "I'm doc.", named("klanker"));
    expect(outcome).toBe("fail");
  });

  it("scores 1b_cron_list pass for honest 'nothing scheduled' replies", () => {
    const outcome = scoreReply(
      SPEC_CRON,
      "Nothing scheduled right now — my cron list is empty.",
      named("x"),
    );
    expect(outcome).toBe("pass");
  });

  it("scores 3d_admin_refusal pass when reply says can't + names admin agent", () => {
    const outcome = scoreReply(
      SPEC_REFUSAL,
      "I can't restart the fleet — ask klanker, they're the admin agent on this instance.",
      named("scribe"),
    );
    expect(outcome).toBe("pass");
  });
});
93
+
94
// Verifies that aggregate() buckets outcome counts independently by
// criterion, by agent, and by paraphrase shape.
describe("aggregate", () => {
  it("counts by criterion / agent / shape", () => {
    type Shape = "formal" | "terse" | "typo" | "voice" | "multi";
    type Result = "pass" | "fail" | "timeout" | "error";

    /** Minimal CaseResult fixture; only the bucketing fields vary. */
    function buildCase(
      agent: string,
      criterion: CaseResult["criterion"],
      shape: Shape,
      outcome: Result,
    ): CaseResult {
      return {
        agent,
        criterion,
        paraphrase: { label: "x", shape, text: "y" },
        outcome,
        reply: "",
        durationMs: 1,
      };
    }

    /** Expected counter row, in pass/fail/timeout/error order. */
    const tally = (pass: number, fail: number, timeout: number, error: number) => ({
      pass,
      fail,
      timeout,
      error,
    });

    const summary = aggregate([
      buildCase("a", "2a_what_are_you", "formal", "pass"),
      buildCase("a", "2a_what_are_you", "typo", "fail"),
      buildCase("b", "2a_what_are_you", "voice", "pass"),
      buildCase("b", "2c_peers", "terse", "timeout"),
    ]);

    expect(summary.byCriterion.get("2a_what_are_you")).toEqual(tally(2, 1, 0, 0));
    expect(summary.byAgent.get("a")).toEqual(tally(1, 1, 0, 0));
    expect(summary.byShape.get("typo")).toEqual(tally(0, 1, 0, 0));
  });
});
136
+
137
// Pins the markdown report layout: headline pass rate, per-criterion
// table, and the triage section shown for non-passing cases.
describe("renderMarkdown", () => {
  it("produces a report with overall pass rate, per-criterion table, and triage when there are failures", () => {
    const passing: CaseResult = {
      agent: "a",
      criterion: "2a_what_are_you",
      paraphrase: { label: "p1", shape: "formal", text: "what are you?" },
      outcome: "pass",
      reply: "I'm a switchroom agent.",
      durationMs: 500,
    };
    const failing: CaseResult = {
      agent: "a",
      criterion: "2a_what_are_you",
      paraphrase: { label: "p2", shape: "typo", text: "wht r u" },
      outcome: "fail",
      reply: "I'm just an AI.",
      durationMs: 800,
    };
    const timedOut: CaseResult = {
      agent: "b",
      criterion: "2c_peers",
      paraphrase: { label: "p3", shape: "voice", text: "who else is here?" },
      outcome: "timeout",
      reply: "",
      durationMs: 60_000,
    };
    const cases: CaseResult[] = [passing, failing, timedOut];

    const report = renderMarkdown(cases, {
      startedAt: new Date("2026-05-14T00:00:00Z"),
      durationSeconds: 90,
      agents: ["a", "b"],
    });

    // The last two snippets are the verbatim prompt and reply that the
    // triage row is expected to carry.
    const expectedSnippets = [
      "# Agent self-sufficiency UAT report",
      "33.3% (1/3)",
      "`2a_what_are_you`",
      "Triage",
      "wht r u",
      "I'm just an AI.",
    ];
    for (const snippet of expectedSnippets) {
      expect(report).toContain(snippet);
    }
    expect(report).toMatch(/timeout after 60000ms/);
  });

  it("renders 'All cases passed' when there are no failures", () => {
    const onlyPass: CaseResult = {
      agent: "a",
      criterion: "2a_what_are_you",
      paraphrase: { label: "p", shape: "formal", text: "what are you?" },
      outcome: "pass",
      reply: "I'm a switchroom agent.",
      durationMs: 500,
    };
    const report = renderMarkdown([onlyPass], {
      startedAt: new Date(),
      durationSeconds: 1,
      agents: ["a"],
    });
    expect(report).toContain("All cases passed");
  });
});
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Heuristic pass/fail scoring for the agent-self-sufficiency UAT.
3
+ *
4
+ * Each result also carries the verbatim reply so the report's triage
5
+ * table can show the operator exactly what the agent said. Scoring is
6
+ * deliberately permissive — we're testing whether the agent
7
+ * understood the *intent* (and reached for the right tool), not
8
+ * whether the reply matches a specific phrasing.
9
+ *
10
+ * Failure modes the runner needs to distinguish from "wrong answer":
11
+ *
12
+ * - timeout: agent never replied within the budget. Could mean
13
+ * the agent is wedged, the bot token's wrong, or
14
+ * Telegram is throttling. Reported separately so the
15
+ * operator doesn't conflate "didn't reply" with
16
+ * "replied wrong".
17
+ * - send_error: driver couldn't even deliver the inbound (bot
18
+ * username missing, mtcute connection died, etc.).
19
+ * These bubble up as `error` results, not `fail`.
20
+ */
21
+
22
+ import type { CriterionSpec, Paraphrase } from "./paraphrases.js";
23
+ import { patternFor } from "./paraphrases.js";
24
+
25
+ export type Outcome = "pass" | "fail" | "timeout" | "error";
26
+
27
/**
 * One scored case: a single paraphrase of a single criterion, as run
 * against one agent.
 */
export interface CaseResult {
  /** Name of the agent this case belongs to; used as the per-agent bucket key. */
  agent: string;
  /** Id of the criterion the paraphrase belongs to. */
  criterion: CriterionSpec["id"];
  /** The paraphrase for this case. */
  paraphrase: Paraphrase;
  /** Final classification of the case. */
  outcome: Outcome;
  /**
   * The reply exactly as received, trimmed, with its markdown left
   * intact so the report shows what the user actually saw. Empty for
   * `timeout` and `error` outcomes.
   */
  reply: string;
  /** Wall-clock milliseconds from sendDM until the first reply, or until the timeout fired. */
  durationMs: number;
  /** Error message, optionally set for `error` outcomes. */
  errorMessage?: string;
}
40
+
41
/**
 * Classify one reply as `"pass"` or `"fail"` for the given criterion.
 *
 * Timeouts and errors never reach this function — the runner assigns
 * those outcomes itself. Criteria whose passPattern contains
 * `__INJECTED_AGENT_NAME__` (for example `2b_your_name`) need the agent
 * name, which the caller supplies through `injection`.
 *
 * @param spec - Criterion whose pattern the reply is matched against.
 * @param reply - Raw reply text, markdown included.
 * @param injection - Values substituted into the criterion's pattern.
 * @returns `"fail"` for a blank reply or a non-matching one, else `"pass"`.
 */
export function scoreReply(
  spec: CriterionSpec,
  reply: string,
  injection: { agentName: string },
): Outcome {
  // A reply that is empty or whitespace-only can never satisfy a criterion.
  if (reply.trim().length === 0) {
    return "fail";
  }
  // Match against lowercased text with markdown markers stripped.
  const haystack = stripMarkdown(reply).toLowerCase();
  const matcher = patternFor(spec, injection);
  if (matcher.test(haystack)) {
    return "pass";
  }
  return "fail";
}
57
+
58
/**
 * Reduce a markdown reply to plain words for pattern matching.
 *
 * - Fenced code blocks (``` … ```) are removed entirely, contents
 *   included, and replaced by a single space.
 * - Inline code, `**bold**`, `__bold__`, `*italic*` and `_italic_` keep
 *   their text and lose only the markers.
 * - Runs of whitespace collapse to one space and the result is trimmed.
 *
 * Underscore markers are only honoured when they do not touch an ASCII
 * letter or digit on the outside. Without that guard, snake_case words
 * such as `build_bot_2` would lose an underscore pair and no longer match
 * a pattern built from the real agent or tool name.
 *
 * @param s - Reply text as received.
 * @returns The text with the markers above removed.
 */
function stripMarkdown(s: string): string {
  return s
    .replace(/```[\s\S]*?```/g, " ")
    .replace(/`([^`]+)`/g, "$1")
    .replace(/\*\*([^*]+)\*\*/g, "$1")
    // Underscore bold: skip delimiters glued to a letter/digit (intraword).
    .replace(/(?<![A-Za-z0-9])__([^_]+)__(?![A-Za-z0-9])/g, "$1")
    .replace(/\*([^*]+)\*/g, "$1")
    // Underscore italic: same intraword guard as above.
    .replace(/(?<![A-Za-z0-9])_([^_]+)_(?![A-Za-z0-9])/g, "$1")
    .replace(/\s+/g, " ")
    .trim();
}
74
+
75
/** Outcome counters for a single bucket (one criterion, agent, or shape). */
type OutcomeTally = { pass: number; fail: number; timeout: number; error: number };

/**
 * Outcome counters grouped three ways. Each map is keyed by the
 * criterion id, agent name, or paraphrase shape respectively.
 */
export interface Aggregate {
  byCriterion: Map<string, OutcomeTally>;
  byAgent: Map<string, OutcomeTally>;
  byShape: Map<string, OutcomeTally>;
}

/**
 * Tally outcomes per criterion, per agent, and per paraphrase shape.
 *
 * Pure function of `results`: the input is not modified and a fresh
 * `Aggregate` is returned on every call. Buckets only exist for keys
 * that occur in the input.
 *
 * @param results - Case results to tally.
 * @returns Counters for every criterion, agent, and shape seen.
 */
export function aggregate(results: readonly CaseResult[]): Aggregate {
  const byCriterion: Aggregate["byCriterion"] = new Map();
  const byAgent: Aggregate["byAgent"] = new Map();
  const byShape: Aggregate["byShape"] = new Map();

  // Increment the counter for `outcome` in the bucket stored under `key`,
  // creating a zeroed bucket the first time the key is seen.
  const tally = (
    buckets: Aggregate["byCriterion"],
    key: string,
    outcome: Outcome,
  ): void => {
    let counts = buckets.get(key);
    if (counts === undefined) {
      counts = { pass: 0, fail: 0, timeout: 0, error: 0 };
      buckets.set(key, counts);
    }
    counts[outcome] += 1;
  };

  results.forEach((result) => {
    tally(byCriterion, result.criterion, result.outcome);
    tally(byAgent, result.agent, result.outcome);
    tally(byShape, result.paraphrase.shape, result.outcome);
  });

  return { byCriterion, byAgent, byShape };
}
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Unit tests for the skill-coverage UAT runner's pure pieces:
3
+ * label extractor + sidecar JSONL reader. Live driver/network paths
4
+ * are validated by operator-driven runs (see runbook).
5
+ */
6
+
7
+ import { describe, it, expect } from "vitest";
8
+ import {
9
+ extractSkillFromLabel,
10
+ readSkillRowsSince,
11
+ } from "./skill-coverage.js";
12
+
13
// Label-parsing contract: a "Running skill <slug>" label yields the
// lowercased slug; anything else yields null.
describe("extractSkillFromLabel", () => {
  it("pulls the slug from the hook's canonical label", () => {
    const slug = extractSkillFromLabel("Running skill switchroom-cli");
    expect(slug).toBe("switchroom-cli");
  });

  it("is case-insensitive on the label but lowercases the slug", () => {
    const slug = extractSkillFromLabel("RUNNING SKILL BUILDKITE-API");
    expect(slug).toBe("buildkite-api");
  });

  it("returns null for non-Skill labels", () => {
    for (const label of ["Reading scaffold.ts", "Replying"]) {
      expect(extractSkillFromLabel(label)).toBeNull();
    }
  });

  it("returns null when the slug is missing or malformed", () => {
    for (const label of ["running skill", "running skill (and)"]) {
      expect(extractSkillFromLabel(label)).toBeNull();
    }
  });
});
36
+
37
// Exercises the sidecar JSONL reader against an in-memory directory:
// file-name filtering, the sinceMs cutoff, the Skill-only filter, and
// tolerance of unreadable files and malformed lines.
describe("readSkillRowsSince", () => {
  /** Serialize rows as newline-terminated JSONL; raw strings pass through untouched. */
  const jsonl = (rows: Array<object | string>): string =>
    rows.map((row) => (typeof row === "string" ? row : JSON.stringify(row))).join("\n") + "\n";

  const fixtures: Record<string, string> = {
    "tool-labels-A.jsonl": jsonl([
      // ts below the cutoff used by the first test: ignored there
      { ts: 100, tool_use_id: "u1", agent_id: "ag", label: "Running skill docx", tool_name: "Skill" },
      // past the cutoff and a Skill row: kept
      { ts: 1500, tool_use_id: "u2", agent_id: "ag", label: "Running skill switchroom-cli", tool_name: "Skill" },
      // past the cutoff but not a Skill row: ignored
      { ts: 1600, tool_use_id: "u3", agent_id: "ag", label: "Reading foo.ts", tool_name: "Read" },
    ]),
    "tool-labels-B.jsonl": jsonl([
      { ts: 2000, tool_use_id: "u4", agent_id: "ag", label: "Running skill buildkite-cli", tool_name: "Skill" },
      // malformed line followed by an empty one: both ignored
      "{not-json",
      "",
    ]),
    "other.jsonl": JSON.stringify({ ts: 2500, tool_name: "Skill", label: "Running skill x" }),
  };

  /** Final path segment, i.e. the fixture key. */
  const baseName = (path: string): string => path.slice(path.lastIndexOf("/") + 1);

  const listFixtures = (_dir: string): string[] => Object.keys(fixtures);
  const readFixture = (path: string): string => {
    const body = fixtures[baseName(path)];
    if (body === undefined) throw new Error("ENOENT");
    return body;
  };

  it("returns only Skill rows from tool-labels-*.jsonl with ts >= sinceMs", () => {
    const rows = readSkillRowsSince("/fake", 1000, listFixtures, readFixture);
    const sortedLabels = rows.map((r) => r.label).sort();
    expect(sortedLabels).toEqual([
      "Running skill buildkite-cli",
      "Running skill switchroom-cli",
    ]);
  });

  it("returns [] when the dir read throws", () => {
    const failingReaddir = (): string[] => {
      throw new Error("EACCES");
    };
    const rows = readSkillRowsSince("/fake", 0, failingReaddir, readFixture);
    expect(rows).toEqual([]);
  });

  it("skips files that fail to read but keeps siblings", () => {
    // Only file A is unreadable; file B must still contribute its row.
    const readExceptA = (path: string): string => {
      if (path.endsWith("tool-labels-A.jsonl")) throw new Error("EACCES");
      return readFixture(path);
    };
    const rows = readSkillRowsSince("/fake", 0, listFixtures, readExceptA);
    expect(rows.map((r) => r.label)).toEqual(["Running skill buildkite-cli"]);
  });

  it("ignores files that don't match the tool-labels-*.jsonl pattern", () => {
    const decoys: Record<string, string> = {
      "other.jsonl": JSON.stringify({ ts: 100, tool_name: "Skill", label: "x" }),
      "tool-labels-A.jsonl": "",
    };
    const rows = readSkillRowsSince(
      "/fake",
      0,
      () => Object.keys(decoys),
      (path) => decoys[baseName(path)]!,
    );
    expect(rows).toEqual([]);
  });
});