switchroom 0.7.15 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. package/README.md +51 -59
  2. package/bin/run-hook.sh +27 -11
  3. package/bin/timezone-hook.sh +9 -7
  4. package/dist/agent-scheduler/index.js +410 -133
  5. package/dist/auth-broker/index.js +13932 -0
  6. package/dist/cli/switchroom.js +26937 -5601
  7. package/dist/host-control/main.js +12702 -0
  8. package/dist/vault/approvals/kernel-server.js +467 -184
  9. package/dist/vault/broker/server.js +1430 -724
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +7 -4
  16. package/profiles/_base/settings.json.hbs +20 -5
  17. package/profiles/_base/start.sh.hbs +16 -3
  18. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  19. package/profiles/_shared/telegram-style.md.hbs +20 -90
  20. package/profiles/_shared/vault-protocol.md.hbs +68 -0
  21. package/profiles/default/CLAUDE.md +50 -96
  22. package/profiles/default/CLAUDE.md.hbs +36 -6
  23. package/profiles/default/workspace/SOUL.md.hbs +12 -5
  24. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  25. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  26. package/skills/buildkite-api/SKILL.md +31 -8
  27. package/skills/buildkite-cli/SKILL.md +27 -9
  28. package/skills/buildkite-migration/SKILL.md +22 -9
  29. package/skills/buildkite-pipelines/SKILL.md +26 -9
  30. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  31. package/skills/buildkite-test-engine/SKILL.md +25 -8
  32. package/skills/docx/SKILL.md +1 -1
  33. package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
  34. package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
  35. package/skills/file-bug/SKILL.md +34 -6
  36. package/skills/humanizer/SKILL.md +15 -0
  37. package/skills/humanizer-calibrate/SKILL.md +7 -1
  38. package/skills/mcp-builder/SKILL.md +1 -1
  39. package/skills/pdf/SKILL.md +1 -1
  40. package/skills/pptx/SKILL.md +1 -1
  41. package/skills/skill-creator/SKILL.md +21 -1
  42. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  43. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  44. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  45. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  46. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  47. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  48. package/skills/switchroom-cli/SKILL.md +63 -64
  49. package/skills/switchroom-health/SKILL.md +23 -10
  50. package/skills/switchroom-install/SKILL.md +3 -3
  51. package/skills/switchroom-manage/SKILL.md +26 -19
  52. package/skills/switchroom-runtime/SKILL.md +191 -0
  53. package/skills/switchroom-status/SKILL.md +27 -2
  54. package/skills/telegram-test-harness/SKILL.md +3 -0
  55. package/skills/token-helpers/SKILL.md +24 -1
  56. package/skills/webapp-testing/SKILL.md +31 -1
  57. package/skills/xlsx/SKILL.md +1 -1
  58. package/telegram-plugin/admin-commands/index.ts +7 -5
  59. package/telegram-plugin/analytics-posthog.ts +191 -0
  60. package/telegram-plugin/bridge/bridge.ts +69 -0
  61. package/telegram-plugin/bridge/ipc-client.ts +4 -1
  62. package/telegram-plugin/dist/bridge/bridge.js +194 -119
  63. package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
  64. package/telegram-plugin/dist/server.js +245 -189
  65. package/telegram-plugin/first-paint.ts +3 -24
  66. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  67. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  68. package/telegram-plugin/gateway/auth-command.ts +794 -0
  69. package/telegram-plugin/gateway/auth-line.ts +123 -0
  70. package/telegram-plugin/gateway/boot-card.ts +169 -40
  71. package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
  72. package/telegram-plugin/gateway/boot-probes.ts +166 -123
  73. package/telegram-plugin/gateway/boot-reason.ts +41 -7
  74. package/telegram-plugin/gateway/boot-version.ts +66 -0
  75. package/telegram-plugin/gateway/gateway.ts +3499 -1885
  76. package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
  77. package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
  78. package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
  79. package/telegram-plugin/gateway/quarantine.ts +69 -0
  80. package/telegram-plugin/gateway/quota-cache.ts +9 -4
  81. package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
  82. package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
  83. package/telegram-plugin/gateway/recent-denials.ts +77 -0
  84. package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
  85. package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
  86. package/telegram-plugin/history.ts +91 -0
  87. package/telegram-plugin/hooks/hooks.json +10 -0
  88. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
  89. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
  90. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
  91. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  92. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  93. package/telegram-plugin/inbound-classifier.ts +50 -0
  94. package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
  95. package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
  96. package/telegram-plugin/package.json +4 -2
  97. package/telegram-plugin/permission-rule.ts +51 -0
  98. package/telegram-plugin/permission-title.ts +56 -0
  99. package/telegram-plugin/quota-check.ts +19 -41
  100. package/telegram-plugin/registry/reaper.ts +223 -0
  101. package/telegram-plugin/retry-api-call.ts +80 -0
  102. package/telegram-plugin/runtime-metrics.ts +177 -0
  103. package/telegram-plugin/scripts/build.mjs +0 -1
  104. package/telegram-plugin/secret-detect/index.ts +24 -0
  105. package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
  106. package/telegram-plugin/secret-detect/vault-error.ts +78 -11
  107. package/telegram-plugin/secret-detect/vault-write.ts +14 -2
  108. package/telegram-plugin/server.js +41795 -0
  109. package/telegram-plugin/session-tail.ts +6 -1
  110. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  111. package/telegram-plugin/silence-poke.ts +420 -0
  112. package/telegram-plugin/silent-end.ts +174 -0
  113. package/telegram-plugin/stream-controller.ts +13 -0
  114. package/telegram-plugin/stream-reply-handler.ts +7 -0
  115. package/telegram-plugin/subagent-watcher.ts +213 -4
  116. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  117. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  118. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  119. package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
  120. package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
  121. package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
  122. package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
  123. package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
  124. package/telegram-plugin/tests/boot-probes.test.ts +216 -10
  125. package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
  126. package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
  127. package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
  128. package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
  129. package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
  130. package/telegram-plugin/tests/history-reaper.test.ts +378 -0
  131. package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
  132. package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
  133. package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
  134. package/telegram-plugin/tests/issues-card.test.ts +49 -0
  135. package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
  136. package/telegram-plugin/tests/permission-rule.test.ts +80 -1
  137. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  138. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  139. package/telegram-plugin/tests/races.test.ts +179 -0
  140. package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
  141. package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
  142. package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
  143. package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
  144. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
  145. package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
  146. package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
  147. package/telegram-plugin/tests/silence-poke.test.ts +493 -0
  148. package/telegram-plugin/tests/silent-end.test.ts +206 -0
  149. package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
  150. package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
  151. package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
  152. package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
  153. package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
  154. package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
  155. package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
  156. package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
  157. package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
  158. package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
  159. package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
  160. package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
  161. package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
  162. package/telegram-plugin/turn-signal-tracker.ts +100 -24
  163. package/telegram-plugin/uat/SETUP.md +210 -35
  164. package/telegram-plugin/uat/assertions.ts +264 -37
  165. package/telegram-plugin/uat/driver-info.ts +57 -0
  166. package/telegram-plugin/uat/driver.ts +590 -51
  167. package/telegram-plugin/uat/harness.ts +140 -94
  168. package/telegram-plugin/uat/load-env.test.ts +72 -0
  169. package/telegram-plugin/uat/load-env.ts +48 -0
  170. package/telegram-plugin/uat/login.ts +96 -53
  171. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  172. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  173. package/telegram-plugin/uat/runners/report.ts +150 -0
  174. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  175. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  176. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  177. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  178. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  179. package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
  180. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
  181. package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
  182. package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
  183. package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
  184. package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
  185. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
  186. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
  187. package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
  188. package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
  189. package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
  190. package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
  191. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
  192. package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
  193. package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
  194. package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
  195. package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
  196. package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
  197. package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
  198. package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
  199. package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
  200. package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
  201. package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
  202. package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
  203. package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
  204. package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
  205. package/telegram-plugin/vault-approval-posture.ts +42 -0
  206. package/telegram-plugin/welcome-text.ts +1 -0
  207. package/telegram-plugin/active-pins-sweep.ts +0 -204
  208. package/telegram-plugin/active-pins.ts +0 -146
  209. package/telegram-plugin/auth-dashboard.ts +0 -1104
  210. package/telegram-plugin/auth-slot-parser.ts +0 -497
  211. package/telegram-plugin/card-event-log.ts +0 -138
  212. package/telegram-plugin/dist/foreman/foreman.js +0 -31106
  213. package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
  214. package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
  215. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  216. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  217. package/telegram-plugin/foreman/foreman.ts +0 -1165
  218. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  219. package/telegram-plugin/foreman/setup-state.ts +0 -239
  220. package/telegram-plugin/foreman/state.ts +0 -203
  221. package/telegram-plugin/pin-event-log.ts +0 -76
  222. package/telegram-plugin/progress-card-driver.ts +0 -2886
  223. package/telegram-plugin/progress-card-pin-manager.ts +0 -589
  224. package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
  225. package/telegram-plugin/progress-card.ts +0 -1409
  226. package/telegram-plugin/tests/HARNESS.md +0 -340
  227. package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
  228. package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
  229. package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
  230. package/telegram-plugin/tests/active-pins.test.ts +0 -187
  231. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  232. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  233. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  234. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  235. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  236. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  237. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
  238. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  239. package/telegram-plugin/tests/card-event-log.test.ts +0 -145
  240. package/telegram-plugin/tests/first-paint.test.ts +0 -257
  241. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  242. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  243. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  244. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  245. package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
  246. package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
  247. package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
  248. package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
  249. package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
  250. package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
  251. package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
  252. package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
  253. package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
  254. package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
  255. package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
  256. package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
  257. package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
  258. package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
  259. package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
  260. package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
  261. package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
  262. package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
  263. package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
  264. package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
  265. package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
  266. package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
  267. package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
  268. package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
  269. package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
  270. package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
  271. package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
  272. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  273. package/telegram-plugin/tests/setup-state.test.ts +0 -146
  274. package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
  275. package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
  276. package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
  277. package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
  278. package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
  279. package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
  280. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
  281. package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
  282. package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
  283. package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
  284. package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
  285. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
  286. package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
  287. package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
  288. package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
  289. package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
  290. package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
  291. package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
  292. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
  293. package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
  294. package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
  295. package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
  296. package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
  297. package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
  298. package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
  299. package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
  300. package/telegram-plugin/two-zone-card.ts +0 -269
  301. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
@@ -0,0 +1,123 @@
1
+ /**
2
+ * UAT scenario — operator pastes a real-shaped secret into the bot's
3
+ * DM; bot detects, deletes the original, posts a redaction card.
4
+ *
5
+ * Part of: secret-redaction bug class reported 2026-05-12 (Bug A —
6
+ * sometimes the original message isn't actually deleted from chat
7
+ * history despite the bot claiming it was).
8
+ *
9
+ * **Skipped by default.** To unskip:
10
+ *
11
+ * 1. Run the standard UAT preflight (uat/SETUP.md §5-6) so the
12
+ * test-harness agent is live and the driver session is auth'd.
13
+ *
14
+ * 2. Verify the test-harness chat has secret-detect enabled. The
15
+ * agent's switchroom.yaml `access.json` must include the driver
16
+ * in `allowFrom` so the driver's paste is treated as a real
17
+ * operator message (not silently ignored). Existing UAT setup
18
+ * already covers this for the smoke scenario.
19
+ *
20
+ * 3. Confirm a vault passphrase is cached in the test-harness chat
21
+ * so the high-confidence-stored branch fires (not the
22
+ * no-passphrase deferred branch). Easiest: send `/vault unlock`
23
+ * + passphrase as the driver once before running this scenario.
24
+ * Without a cached passphrase the assertion changes — the bot
25
+ * posts the "🔒 caught a secret. tap below to unlock the vault
26
+ * and save it" card instead of "🔒 captured N secrets:". Both
27
+ * paths MUST delete the original; the matcher here is loose
28
+ * enough to accept either.
29
+ *
30
+ * 4. Remove the `describe.skip` below.
31
+ *
32
+ * Why skipped: sends a real-shaped (but synthetic) secret-pattern
33
+ * string into Telegram. The pattern doesn't unlock any actual
34
+ * secret, but committing the scenario unskipped would also commit
35
+ * the test fixture into git history where secretlint pre-commit
36
+ * hooks might flag it. Generated at runtime to dodge the scan.
37
+ */
38
+
39
+ import { describe, expect, it } from "vitest";
40
+ import { spinUp } from "../harness.js";
41
+
42
+ describe.skip("uat: secret-redaction deletes the original message (Bug A 2026-05-12)", () => {
43
+ it(
44
+ "paste a real-shaped secret; bot deletes the original from chat history",
45
+ async () => {
46
+ const sc = await spinUp({ agent: "test-harness" });
47
+ try {
48
+ // Build a real-shaped (but synthetic) ANTHROPIC_API_KEY value
49
+ // at runtime so the source file doesn't trip Push Protection.
50
+ // Same idiom as telegram-plugin/tests/secret-detect-secretlint.test.ts:1.
51
+ const fakeApiKey =
52
+ `sk-ant-` + "a1b2c3d4".repeat(4) + "_test_synthetic";
53
+ const inboundText = `set ANTHROPIC_API_KEY=${fakeApiKey}`;
54
+
55
+ // Send the secret-bearing message. Capture the messageId we
56
+ // sent so we can later assert it's gone from history.
57
+ const sent = await sc.sendDM(inboundText);
58
+ const sentMessageId = sent.messageId;
59
+
60
+ // The bot should reply with either:
61
+ // - "🔒 captured N secret(s):" (high-confidence stored
62
+ // path, requires cached passphrase)
63
+ // - "🔒 caught a secret. we deleted it from chat. tap
64
+ // below to unlock the vault..." (deferred path)
65
+ // OR the new fail-loud variant (if delete failed):
66
+ // - "⚠️ Could not auto-delete message containing your ..."
67
+ // The contract this test pins is: ONE of the first two
68
+ // success messages appears AND the original message is
69
+ // actually gone from history.
70
+ const reply = await sc.expectMessage(
71
+ /🔒 (captured|caught)/,
72
+ { from: "bot", timeout: 30_000 },
73
+ );
74
+ expect(reply.text).toMatch(/deleted (it )?from chat|captured/i);
75
+
76
+ // The load-bearing assertion: the original message is
77
+ // unreachable in chat history. driver.getMessage returns
78
+ // null for deleted messages (driver.ts:525-534).
79
+ //
80
+ // Pre-2026-05-12 fix: this would sometimes pass when delete
81
+ // succeeded and silently leave the message behind when it
82
+ // failed (Telegram rate limits, network blip, message was
83
+ // edited mid-delete, etc.) — and the operator would never
84
+ // know.
85
+ //
86
+ // Post-fix: deleteSensitiveMessage either deletes
87
+ // successfully OR posts an in-chat warning "⚠️ Could not
88
+ // auto-delete..." which we'd see as a SECOND bot message.
89
+ // The assertion here is the strict "actually gone" version.
90
+ // chat_id for the driver's view of a DM = the partner's
91
+ // (bot's) user_id.
92
+ const stillThere = await sc.driver.getMessage(sc.botUserId, sentMessageId);
93
+ expect(
94
+ stillThere,
95
+ `original secret-bearing message ${sentMessageId} was NOT deleted — Telegram history still has it`,
96
+ ).toBeNull();
97
+ } finally {
98
+ await sc.tearDown();
99
+ }
100
+ },
101
+ 120_000,
102
+ );
103
+
104
+ it(
105
+ "when delete fails (simulated by editing the message just before delete), the bot posts a warning naming the leaked msg_id",
106
+ async () => {
107
+ // This case is harder to repro without a fault-injection
108
+ // hook — Telegram doesn't let us "make deleteMessage fail
109
+ // deterministically" from the driver side. The contract is
110
+ // pinned by the unit test at
111
+ // telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts
112
+ // (deleteSensitiveMessage helper still logs SECURITY: …
113
+ // FAILED + posts an in-chat warning on its catch path).
114
+ // This UAT slot stays skipped pending a fault-injection
115
+ // affordance in the driver — tracked as a TODO on the
116
+ // harness roadmap.
117
+ const _ = await spinUp({ agent: "test-harness" });
118
+ void _;
119
+ expect(true).toBe(true);
120
+ },
121
+ 60_000,
122
+ );
123
+ });
@@ -0,0 +1,87 @@
1
+ /**
2
+ * UAT scenario — operator chats casually about secrets/tokens
3
+ * (mentioning the words, not pasting actual credentials); bot
4
+ * MUST NOT redact the operator's question.
5
+ *
6
+ * Part of: secret-redaction bug class reported 2026-05-12 (Bug B —
7
+ * false positive on the word "secret"/"token" or on
8
+ * code-shaped-but-placeholder values like `MY_TOKEN=hello`).
9
+ *
10
+ * **Skipped by default.** Unskip after the standard UAT preflight
11
+ * (uat/SETUP.md §5-6). No host-state mutations.
12
+ *
13
+ * The unit-shape contract is pinned in
14
+ * `telegram-plugin/tests/secret-detect-false-positives.test.ts` —
15
+ * which runs every CI cycle. This UAT scenario adds the
16
+ * end-to-end Telegram round-trip so a future regression in the
17
+ * gateway integration (not the detector) would also surface.
18
+ */
19
+
20
+ import { describe, expect, it } from "vitest";
21
+ import { spinUp } from "../harness.js";
22
+
23
+ const CASUAL_MENTIONS = [
24
+ "what's my fatsecret token?",
25
+ "delete that secret you sent earlier",
26
+ "the FATSECRET_TOKEN env var is missing",
27
+ "set MY_TOKEN=hello and try again",
28
+ "I keep forgetting my password again",
29
+ ];
30
+
31
+ describe.skip("uat: secret-redaction does NOT fire on casual mentions (Bug B 2026-05-12)", () => {
32
+ for (const text of CASUAL_MENTIONS) {
33
+ it(
34
+ `does not redact: ${JSON.stringify(text)}`,
35
+ async () => {
36
+ const sc = await spinUp({ agent: "test-harness" });
37
+ try {
38
+ const sent = await sc.sendDM(text);
39
+
40
+ // Wait a short period for any (incorrect) redaction reply
41
+ // to arrive. If the bot is going to fire the redaction
42
+ // pipeline, it does so synchronously in handleInbound —
43
+ // well under 10s.
44
+ //
45
+ // The assertion: we should NOT see a "🔒 captured" or
46
+ // "🔒 caught a secret" reply. If we do, the false
47
+ // positive is back.
48
+ //
49
+ // We tolerate the bot's normal Claude reply (which is
50
+ // unrelated content). Pin only the absence of the
51
+ // redaction marker.
52
+ let sawRedaction = false;
53
+ try {
54
+ await sc.expectMessage(/🔒 (captured|caught)/, {
55
+ from: "bot",
56
+ timeout: 10_000,
57
+ });
58
+ sawRedaction = true;
59
+ } catch {
60
+ // Expected: timeout means no redaction fired.
61
+ }
62
+ expect(
63
+ sawRedaction,
64
+ `false-positive redaction fired on casual chat: ${JSON.stringify(text)}`,
65
+ ).toBe(false);
66
+
67
+ // The original message must remain visible — the
68
+ // operator asked a real question and the bot deleting
69
+ // it would be terrible UX.
70
+ // chat_id for the driver's view of a DM = the partner's
71
+ // (bot's) user_id.
72
+ const stillThere = await sc.driver.getMessage(
73
+ sc.botUserId,
74
+ sent.messageId,
75
+ );
76
+ expect(
77
+ stillThere,
78
+ `the bot deleted the operator's question (false positive on '${text}')`,
79
+ ).not.toBeNull();
80
+ } finally {
81
+ await sc.tearDown();
82
+ }
83
+ },
84
+ 60_000,
85
+ );
86
+ }
87
+ });
@@ -0,0 +1,155 @@
1
+ /**
2
+ * Silence-poke soft-fire end-to-end scenario.
3
+ *
4
+ * Goal context: cause class CC-3 in `docs/status-ask-cause-classes.md`
5
+ * — the L3 safety net. Unit tests (`silence-poke.test.ts`) cover the
6
+ * state machine: tick semantics, ladder thresholds, success measurement.
7
+ * They DO NOT cover the wire path between `consumeArmedPoke()` (in
8
+ * `silence-poke.ts`) and the model actually receiving the
9
+ * `[silence-poke]` system-reminder block on its next tool result.
10
+ *
11
+ * The wire path lives at `gateway.ts:2740`:
12
+ *
13
+ * onToolCall → executeToolCall(...) → consumeArmedPoke() →
14
+ * append `<system-reminder>[silence-poke] ...</system-reminder>`
15
+ * to the tool-result text.
16
+ *
17
+ * If that integration ever breaks — a refactor swaps `executeToolCall`
18
+ * for a path that doesn't call `consumeArmedPoke`, the result-content
19
+ * shape mutation gets dropped, MCP framing changes — the unit tests
20
+ * still pass but the model never sees the nudge, the user goes silent
21
+ * past 75s, and `inbound_status_query` ticks. This UAT closes that
22
+ * regression window end-to-end.
23
+ *
24
+ * ## Strategy
25
+ *
26
+ * Force the agent into a stretch of silent tool churn that exceeds the
27
+ * 75s soft threshold without the model emitting any outbound `reply`.
28
+ * The conversational-pacing prompt instructs the model to soft-commit
29
+ * fast turns, so we have to explicitly suppress that:
30
+ *
31
+ * - Prompt instructs three sequential 30s `sleep` Bash calls, NO
32
+ * mid-turn replies, single final reply when done.
33
+ * - Total silent stretch is ~90s + tool overhead, comfortably past
34
+ * the 75s soft threshold.
35
+ * - If the silence-poke wire works: the model sees the
36
+ * `[silence-poke]` system-reminder appended to the result of the
37
+ * first or second sleep, breaks the no-reply rule, sends a brief
38
+ * update. We observe a reply in the [70s, 200s] window.
39
+ * - If the wire is broken: model never receives the nudge, no
40
+ * reply until the third sleep ends at ~90s+, OR the framework
41
+ * fallback at 300s fires. We catch the latter as a separate
42
+ * failure (the framework fallback is the FLOOR, not the goal).
43
+ *
44
+ * ## Tolerances
45
+ *
46
+ * Real-Telegram UAT against a real Claude model has variability:
47
+ *
48
+ * - Model may insert one soft-commit "on it" reply at start; that
49
+ * resets the silence clock. Three 30s sleeps still pushes the
50
+ * post-commit silence past 75s as long as the commit lands
51
+ * within the first ~10s. We tolerate this.
52
+ * - Model may decline to follow the "no replies" instruction and
53
+ * send updates organically; if the FIRST reply still lands in
54
+ * [70s, 200s], the conversational pacing layer is doing its job
55
+ * and the test passes regardless of whether silence-poke
56
+ * specifically fired.
57
+ * - Window is generous (70-200s) to absorb 5s poll interval,
58
+ * mtcute receive lag, Telegram delivery jitter.
59
+ *
60
+ * ## Failure shapes the assertion catches
61
+ *
62
+ * 1. Wire path broken — first reply lands >200s after sendDM
63
+ * because the framework fallback (300s) is the only thing that
64
+ * eventually breaks the silence.
65
+ * 2. Soft poke armed but not drained — first reply lands at >200s
66
+ * similarly.
67
+ * 3. Model misbehavior — first reply is the FINAL answer (long
68
+ * text after all three sleeps complete at ~90s+); strictly that
69
+ * passes the window check, but the test also warns when the first
69
+ * reply is long (>400 chars) as a sanity floor on "this is
71
+ * actually a poke response, not the final answer." Skip strict
72
+ * length if the prompt happens to be so simple the final
73
+ * answer IS brief.
74
+ *
75
+ * Requires the same env as `smoke-dm-reply.test.ts` (see
76
+ * `uat/SETUP.md` §6). Long-running: outer budget 4 min.
77
+ */
78
+
79
+ import { describe, expect, it } from "vitest";
80
+ import { spinUp } from "../harness.js";
81
+
82
+ const SOFT_WINDOW_MIN_MS = 70_000;
83
+ const SOFT_WINDOW_MAX_MS = 200_000;
84
+
85
+ // Explicit instruction shape. Mirrors the `BG_DISPATCH_PROMPT` pattern
86
+ // in `bg-sub-agent-dispatch-dm.test.ts` — pin the tool + the sequence
87
+ // so behaviour is deterministic enough to test the *infra*, not the
88
+ // model's free-form judgement.
89
+ const SILENT_CHURN_PROMPT =
90
+ "I need you to test something. Run THREE separate Bash tool calls " +
91
+ "in sequence: first `sleep 30`, then `sleep 30`, then `sleep 30`. " +
92
+ "Critical: do NOT send any `reply` or `stream_reply` between or " +
93
+ "during the sleeps — no soft commit, no progress updates, no " +
94
+ "narration. Just the three Bash calls back-to-back. Once all three " +
95
+ "complete, send ONE brief final reply saying 'done' so I know " +
96
+ "you're back.";
97
+
98
+ describe("uat: silence-poke soft fires + reaches the model wire", () => {
99
+ it(
100
+ "agent breaks self-imposed silence in [70s, 200s] window via silence-poke",
101
+ async () => {
102
+ const sc = await spinUp({ agent: "test-harness" });
103
+ try {
104
+ const sendStart = Date.now();
105
+ await sc.sendDM(SILENT_CHURN_PROMPT);
106
+
107
+ // Wait for the FIRST reply. If silence-poke + the wire path
108
+ // are working, this lands between ~75s and ~110s as the
109
+ // model responds to the [silence-poke] system-reminder
110
+ // appended to the first or second sleep's tool result.
111
+ const firstReply = await sc.expectMessage(/\S/, {
112
+ from: "bot",
113
+ timeout: SOFT_WINDOW_MAX_MS + 20_000,
114
+ });
115
+ const elapsed = Date.now() - sendStart;
116
+
117
+ expect(firstReply.text.length).toBeGreaterThan(0);
118
+
119
+ // Primary window assertion.
120
+ expect(
121
+ elapsed,
122
+ `first bot reply lands at ${elapsed}ms (target window ` +
123
+ `[${SOFT_WINDOW_MIN_MS}, ${SOFT_WINDOW_MAX_MS}]). ` +
124
+ `Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
125
+ ).toBeGreaterThanOrEqual(SOFT_WINDOW_MIN_MS);
126
+ expect(
127
+ elapsed,
128
+ `first bot reply lands at ${elapsed}ms — above ${SOFT_WINDOW_MAX_MS}ms ` +
129
+ `ceiling. Either silence-poke wire is broken (poke armed but ` +
130
+ `not drained at gateway.ts:onToolCall) or the framework ` +
131
+ `fallback at 300s was the first thing to break silence. ` +
132
+ `Reply text: ${JSON.stringify(firstReply.text.slice(0, 200))}.`,
133
+ ).toBeLessThanOrEqual(SOFT_WINDOW_MAX_MS);
134
+
135
+ // Sanity floor: the first reply should be brief — proves it's
136
+ // a poke-driven update, not the final "done" answer after all
137
+ // three sleeps finished naturally. ~400 char ceiling allows a
138
+ // verbose model to add a sentence of context. Bump this if it
139
+ // flakes on perfectly valid short answers.
140
+ if (firstReply.text.length > 400) {
141
+ console.warn(
142
+ `[silence-poke] first reply at ${elapsed}ms is ${firstReply.text.length} ` +
143
+ `chars — longer than expected for a poke-driven update. The ` +
144
+ `window assertion still passed, but consider whether the model ` +
145
+ `bypassed the silence stretch (e.g. ran the sleeps in one ` +
146
+ `Bash call, dodging the per-call result poke chokepoint).`,
147
+ );
148
+ }
149
+ } finally {
150
+ await sc.tearDown();
151
+ }
152
+ },
153
+ 240_000,
154
+ );
155
+ });
@@ -0,0 +1,95 @@
1
+ /**
2
+ * Silent-end recovery scenario — the regression PR3 (#1126) introduced
3
+ * and PR1129 fixed.
4
+ *
5
+ * The bug: PR3 deleted the progress-card driver, and with it the
6
+ * `onSilentEnd` callback that wrote
7
+ * $TELEGRAM_STATE_DIR/silent-end-pending.json. The Stop hook
8
+ * (`silent-end-interrupt-stop.mjs`) reads that file to decide whether
9
+ * to block-and-re-prompt. With the writer gone, the hook always read
10
+ * "no silent-end pending" and allowed the stop. The model would
11
+ * produce an answer in its CLI session but never call `reply`, and
12
+ * the user got nothing back.
13
+ *
14
+ * This UAT exercises the outcome side directly: send a DM that
15
+ * SHOULD produce a reply, assert that a reply lands within a budget
16
+ * that covers (a) normal turn latency, (b) one Stop-hook re-prompt
17
+ * cycle (the agent goes silent → hook blocks → re-prompted → calls
18
+ * reply), and (c) worst-case framework fallback at 5 min.
19
+ *
20
+ * Why this scenario specifically:
21
+ * - The bug surfaced as "user gets no reply at all." The most
22
+ * defensible UAT assertion is "after asking, the user gets SOME
23
+ * reply within a reasonable bound." Anything that breaks this
24
+ * contract — silent-end gap, scaffold staleness, hook misconfig,
25
+ * gateway crash — fails this test.
26
+ * - Unlike `smoke-dm-reply.test.ts` (trivial inbound, fast reply),
27
+ * this scenario uses a tool-heavy prompt that pushes the model
28
+ * into the silent-end zone (lots of tool churn, easy to forget to
29
+ * call reply afterward). It's the actual JTBD-failure shape.
30
+ *
31
+ * Budget: 6 min outer, 5 min for the reply itself. Covers the
32
+ * 5-min framework fallback floor.
33
+ */
34
+
35
+ import { describe, it, expect } from "vitest";
36
+ import { spinUp } from "../harness.js";
37
+
38
+ // The prompt pushes the model into a tool-heavy state where it has
39
+ // produced "an answer" internally but hasn't yet realised it must
40
+ // surface that via `reply`. This is the shape of the gymbro
41
+ // regression: the model did the work (cat, pip install, garmin-pull,
42
+ // etc), produced a summary, then ended the turn without `reply`.
43
+ const TOOL_HEAVY_PROMPT = (
44
+ "Pick a directory under /tmp that doesn't exist yet. Create it. "
45
+ + "List its contents (should be empty). Write a small file in it. "
46
+ + "List again. Then report what you did in a one-line reply."
47
+ );
48
+
49
+ describe("uat: silent-end recovery", () => {
50
+ it(
51
+ "user asks → agent always replies (the gymbro regression must not return)",
52
+ async () => {
53
+ const sc = await spinUp({ agent: "test-harness" });
54
+ try {
55
+ const { messageId: inboundId } = await sc.sendDM(TOOL_HEAVY_PROMPT);
56
+ expect(inboundId).toBeGreaterThan(0);
57
+
58
+ // The core assertion: SOMETHING comes back from the bot
59
+ // within 5min. That covers the worst case of the
60
+ // silent-end-recovery ladder:
61
+ // t=0: inbound
62
+ // t<30s: normal reply if all is well
63
+ // t=75s: silence-poke #1 fires (model re-prompted)
64
+ // t=180s: silence-poke #2 fires
65
+ // t=300s: framework fallback ("still working… (no update
66
+ // from agent in 5 min)") fires from the gateway.
67
+ // If we still get nothing by 300s+slack the bug is back.
68
+ const reply = await sc.expectMessage(/\S/, {
69
+ from: "bot",
70
+ timeout: 320_000,
71
+ });
72
+
73
+ expect(reply.text.length).toBeGreaterThan(0);
74
+ expect(reply.senderUserId).toBe(sc.botUserId);
75
+
76
+ // Subtler regression catch: if the reply is the framework
77
+ // fallback wording ("still working… (no update from agent
78
+ // in N min)") that means the silent-end loop fired AND the
79
+ // model didn't recover. Acceptable outcome — the user got
80
+ // something — but a design-health alarm. Log it.
81
+ if (/no update from agent/i.test(reply.text)) {
82
+ console.warn(
83
+ `[silent-end-recovery] reply was the framework fallback — `
84
+ + `model never replied on its own. Reply text: ${JSON.stringify(reply.text.slice(0, 200))}`,
85
+ );
86
+ }
87
+ } finally {
88
+ await sc.tearDown();
89
+ }
90
+ },
91
+ // Outer budget = inner deadline (320s) + spinUp overhead
92
+ // (~12s mtcute connect + DEFAULT_SETTLE_MS) + headroom.
93
+ 360_000,
94
+ );
95
+ });
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Smoke scenario — driver DMs the test bot, bot replies.
3
+ *
4
+ * Part of: https://github.com/switchroom/switchroom/issues/866
5
+ *
6
+ * Runs against real Telegram. Requires:
7
+ * - test-harness agent running (see uat/SETUP.md §5)
8
+ * - TELEGRAM_API_ID / TELEGRAM_API_HASH / TELEGRAM_UAT_DRIVER_SESSION
9
+ * in the env (operator script in SETUP.md §6)
10
+ * - TELEGRAM_TEST_BOT_USERNAME (defaults to `meken_switchroom_test_bot`)
11
+ *
12
+ * Invoke via `bun run test:uat` from `telegram-plugin/`. Default
13
+ * `bun test` / vitest do NOT discover this file — see
14
+ * vitest.config.ts.
15
+ *
16
+ * This is intentionally the simplest possible end-to-end check —
17
+ * just confirms the DM round-trip works. Richer assertions
18
+ * (reactions, progress card, edits) roll in with #866 Phase 2b.
19
+ */
20
+
21
+ import { describe, it, expect } from "vitest";
22
+ import { spinUp } from "../harness.js";
23
+
24
+ const SMOKE_INBOUND = `uat-smoke ${new Date().toISOString()}`;
25
+
26
+ describe("uat: DM round-trip smoke", () => {
27
+ it(
28
+ "driver DMs the test bot and observes a bot reply",
29
+ async () => {
30
+ const sc = await spinUp({ agent: "test-harness" });
31
+
32
+ try {
33
+ await sc.sendDM(SMOKE_INBOUND);
34
+
35
+ // 90s wall-clock budget: tolerates one rate-limit retry on the
36
+ // bot side + a normal Claude turn. If the agent is healthy the
37
+ // reply arrives in <20s.
38
+ const reply = await sc.expectMessage(/.+/, {
39
+ from: "bot",
40
+ timeout: 90_000,
41
+ });
42
+
43
+ expect(reply.text.length).toBeGreaterThan(0);
44
+ expect(reply.senderUserId).toBe(sc.botUserId);
45
+ } finally {
46
+ await sc.tearDown();
47
+ }
48
+ },
49
+ // Per-test budget — must exceed the 90s inner expectMessage
50
+ // deadline plus spinUp overhead (~3s mtcute connect +
51
+ // DEFAULT_SETTLE_MS gap + unpin), so add ~12s of headroom on top
52
+ // for symmetry with progress-card-dm. bun:test's default of 5s
53
+ // would otherwise cut the test off on any turn that takes longer
54
+ // than a few seconds.
55
+ 110_000,
56
+ );
57
+ });
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Issue #1116 — subagent-watcher must not re-fire "✓ Worker done"
3
+ * after the terminal-cleanup grace window elapses.
4
+ *
5
+ * Pre-fix repro (validated by RCA on `clerk` DM, 2026-05-12): once a
6
+ * background sub-agent completed, `cleanupTerminalAgent` ran ~30s
7
+ * later, deleting the agent's filePath from `knownFiles` and its row
8
+ * from `registry`. The JSONL itself stayed on disk, so the next
9
+ * `rescanSubagentDirs` poll rediscovered it, re-registered the agent
10
+ * with `completionNotified=false`, read the terminal `turn_duration`
11
+ * line, and emitted a fresh `✓ Worker done: …` notification. The loop
12
+ * ran indefinitely — operator saw the same 4 sub-agents (30/2/15/105
13
+ * tools) re-announcing completion every ~6 minutes.
14
+ *
15
+ * Post-fix invariant: each completed sub-agent emits exactly ONE
16
+ * `✓ Worker done` notification for the lifetime of the gateway.
17
+ *
18
+ * As a side-benefit, this scenario also catches the original RFC's
19
+ * "raw HTML tags rendered in card text" symptom (Bug C in the RCA):
20
+ * any bot message containing a literal `<b>` / `<i>` / `<code>`
21
+ * substring during the window is flagged. The watcher's own
22
+ * notification path is HTML-correct on `main`, so this assertion is
23
+ * a regression detector — if a future change starts leaking raw
24
+ * tags via a fall-through send site, this scenario goes red.
25
+ *
26
+ * Requires the same env as the other DM scenarios (see SETUP.md §6)
27
+ * and the test-harness override `progress_card.delay_ms: 1000` so a
28
+ * short DM turn actually pins a card (SETUP.md §5).
29
+ *
30
+ * Time budget: the bg sub-agent does two ~10s sleeps (~20s total)
31
+ * + we listen for an extra 75s post-completion (>30s grace +
32
+ * generous rescan slack) to catch a rerun. Plus parent-turn ack
33
+ * latency and Telegram-edit settle. Outer cap 240s.
34
+ */
35
+
36
+ import { describe, expect, it } from "vitest";
37
+ import { spinUp } from "../harness.js";
38
+
39
+ // Same Option-1 explicit-dispatch prompt as bg-sub-agent-dispatch-dm.test.ts
40
+ // — naming the tool + run_in_background flag keeps the model
41
+ // deterministic. The inner sleeps are shorter here (2×10s = ~20s
42
+ // background phase) so the outer budget stays sane: we only need
43
+ // the sub-agent to *complete* once. The duplicate-detection window
44
+ // is what makes the test meaningful, not the bg phase duration.
45
+ const BG_DISPATCH_PROMPT =
46
+ `Use the Agent tool with subagent_type "general-purpose" and ` +
47
+ `run_in_background: true to dispatch a worker with this exact task: ` +
48
+ `"Run \`sleep 10\` via the Bash tool, then \`echo step1\`, then ` +
49
+ `\`sleep 10\` again, then \`echo step2\`, then \`echo done\`. ` +
50
+ `That's two separate Bash sleeps and three echoes." After ` +
51
+ `dispatching, send a brief reply saying you've kicked off the ` +
52
+ `background worker so I can watch the progress card.`;
53
+
54
+ const WORKER_DONE_RE = /✓\s*Worker done/;
55
+ const RAW_HTML_TAG_RE = /<\/?(b|i|code|pre|strong|em)>/i;
56
+
57
+ describe("uat: issue #1116 — subagent-watcher does not re-fire Worker done", () => {
58
+ it(
59
+ "emits exactly one ✓ Worker done per bg sub-agent and no raw HTML leaks",
60
+ async () => {
61
+ const sc = await spinUp({ agent: "test-harness" });
62
+ try {
63
+ await sc.sendDM(BG_DISPATCH_PROMPT);
64
+
65
+ // Wait for the bg sub-agent to complete — the watcher's
66
+ // `✓ Worker done: …` notification is what we're locking
67
+ // behaviour around. Generous timeout: parent ack + bg sleeps
68
+ // + completion plumbing.
69
+ const firstDone = await sc.expectMessage(WORKER_DONE_RE, {
70
+ from: "bot",
71
+ timeout: 120_000,
72
+ });
73
+ expect(firstDone.text).toMatch(WORKER_DONE_RE);
74
+
75
+ // Snapshot bot-side messages observed after the first done.
76
+ // Pre-fix the same notification re-fired every ~30s
77
+ // (TERMINAL_CLEANUP_GRACE_MS + rescan). 75s gives us a
78
+ // comfortable >2 grace windows worth of observation.
79
+ const collected: Array<{ text: string; messageId: number }> = [];
80
+ const observer = sc.driver
81
+ .observeMessages(sc.botUserId)
82
+ [Symbol.asyncIterator]();
83
+ const deadline = Date.now() + 75_000;
84
+ try {
85
+ while (Date.now() < deadline) {
86
+ const remaining = deadline - Date.now();
87
+ if (remaining <= 0) break;
88
+ const winner = await Promise.race([
89
+ observer.next(),
90
+ new Promise<{ value?: undefined; done: true }>((resolve) =>
91
+ setTimeout(() => resolve({ done: true }), remaining),
92
+ ),
93
+ ]);
94
+ if (winner.done) break;
95
+ const msg = winner.value;
96
+ if (!msg) continue;
97
+ // Only count bot-sent messages (filter out anything the
98
+ // driver itself echoed in this window).
99
+ if (msg.fromUserId === sc.driverUserId) continue;
100
+ collected.push({ text: msg.text ?? "", messageId: msg.messageId });
101
+ }
102
+ } finally {
103
+ // Closing the iterator unregisters the mtcute listeners.
104
+ await observer.return?.();
105
+ }
106
+
107
+ // Invariant 1: no DUPLICATE Worker-done with the same shape
108
+ // as the first one. We compare text rather than message_id
109
+ // because the bug emits FRESH messages (not edits), so each
110
+ // re-fire has a new message_id but identical text.
111
+ const reruns = collected.filter((m) => WORKER_DONE_RE.test(m.text));
112
+ expect(
113
+ reruns,
114
+ `Expected zero re-fires of "Worker done" in the ${75}s post-completion window, got ${reruns.length}: ${JSON.stringify(reruns.slice(0, 4).map((r) => r.text.slice(0, 80)))}`,
115
+ ).toHaveLength(0);
116
+
117
+ // Invariant 2: no raw HTML tags in any bot text — including
118
+ // the original `firstDone` notification. Catches Bug C
119
+ // (RCA's third symptom) as a regression detector.
120
+ const allBotTexts = [firstDone.text, ...collected.map((m) => m.text)];
121
+ for (const text of allBotTexts) {
122
+ expect(
123
+ text,
124
+ `Raw HTML tag leaked into bot text: ${text.slice(0, 120)}`,
125
+ ).not.toMatch(RAW_HTML_TAG_RE);
126
+ }
127
+ } finally {
128
+ await sc.tearDown();
129
+ }
130
+ },
131
+ // Outer budget: 120s wait-for-done + 75s observation window +
132
+ // ~12s spinUp settle + slack. Round up.
133
+ 240_000,
134
+ );
135
+ });