switchroom 0.7.15 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. package/README.md +51 -59
  2. package/bin/run-hook.sh +27 -11
  3. package/bin/timezone-hook.sh +9 -7
  4. package/dist/agent-scheduler/index.js +410 -133
  5. package/dist/auth-broker/index.js +13932 -0
  6. package/dist/cli/switchroom.js +26937 -5601
  7. package/dist/host-control/main.js +12702 -0
  8. package/dist/vault/approvals/kernel-server.js +467 -184
  9. package/dist/vault/broker/server.js +1430 -724
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +7 -4
  16. package/profiles/_base/settings.json.hbs +20 -5
  17. package/profiles/_base/start.sh.hbs +16 -3
  18. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  19. package/profiles/_shared/telegram-style.md.hbs +20 -90
  20. package/profiles/_shared/vault-protocol.md.hbs +68 -0
  21. package/profiles/default/CLAUDE.md +50 -96
  22. package/profiles/default/CLAUDE.md.hbs +36 -6
  23. package/profiles/default/workspace/SOUL.md.hbs +12 -5
  24. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  25. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  26. package/skills/buildkite-api/SKILL.md +31 -8
  27. package/skills/buildkite-cli/SKILL.md +27 -9
  28. package/skills/buildkite-migration/SKILL.md +22 -9
  29. package/skills/buildkite-pipelines/SKILL.md +26 -9
  30. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  31. package/skills/buildkite-test-engine/SKILL.md +25 -8
  32. package/skills/docx/SKILL.md +1 -1
  33. package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
  34. package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
  35. package/skills/file-bug/SKILL.md +34 -6
  36. package/skills/humanizer/SKILL.md +15 -0
  37. package/skills/humanizer-calibrate/SKILL.md +7 -1
  38. package/skills/mcp-builder/SKILL.md +1 -1
  39. package/skills/pdf/SKILL.md +1 -1
  40. package/skills/pptx/SKILL.md +1 -1
  41. package/skills/skill-creator/SKILL.md +21 -1
  42. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  43. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  44. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  45. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  46. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  47. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  48. package/skills/switchroom-cli/SKILL.md +63 -64
  49. package/skills/switchroom-health/SKILL.md +23 -10
  50. package/skills/switchroom-install/SKILL.md +3 -3
  51. package/skills/switchroom-manage/SKILL.md +26 -19
  52. package/skills/switchroom-runtime/SKILL.md +191 -0
  53. package/skills/switchroom-status/SKILL.md +27 -2
  54. package/skills/telegram-test-harness/SKILL.md +3 -0
  55. package/skills/token-helpers/SKILL.md +24 -1
  56. package/skills/webapp-testing/SKILL.md +31 -1
  57. package/skills/xlsx/SKILL.md +1 -1
  58. package/telegram-plugin/admin-commands/index.ts +7 -5
  59. package/telegram-plugin/analytics-posthog.ts +191 -0
  60. package/telegram-plugin/bridge/bridge.ts +69 -0
  61. package/telegram-plugin/bridge/ipc-client.ts +4 -1
  62. package/telegram-plugin/dist/bridge/bridge.js +194 -119
  63. package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
  64. package/telegram-plugin/dist/server.js +245 -189
  65. package/telegram-plugin/first-paint.ts +3 -24
  66. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  67. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  68. package/telegram-plugin/gateway/auth-command.ts +794 -0
  69. package/telegram-plugin/gateway/auth-line.ts +123 -0
  70. package/telegram-plugin/gateway/boot-card.ts +169 -40
  71. package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
  72. package/telegram-plugin/gateway/boot-probes.ts +166 -123
  73. package/telegram-plugin/gateway/boot-reason.ts +41 -7
  74. package/telegram-plugin/gateway/boot-version.ts +66 -0
  75. package/telegram-plugin/gateway/gateway.ts +3499 -1885
  76. package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
  77. package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
  78. package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
  79. package/telegram-plugin/gateway/quarantine.ts +69 -0
  80. package/telegram-plugin/gateway/quota-cache.ts +9 -4
  81. package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
  82. package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
  83. package/telegram-plugin/gateway/recent-denials.ts +77 -0
  84. package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
  85. package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
  86. package/telegram-plugin/history.ts +91 -0
  87. package/telegram-plugin/hooks/hooks.json +10 -0
  88. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
  89. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
  90. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
  91. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  92. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  93. package/telegram-plugin/inbound-classifier.ts +50 -0
  94. package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
  95. package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
  96. package/telegram-plugin/package.json +4 -2
  97. package/telegram-plugin/permission-rule.ts +51 -0
  98. package/telegram-plugin/permission-title.ts +56 -0
  99. package/telegram-plugin/quota-check.ts +19 -41
  100. package/telegram-plugin/registry/reaper.ts +223 -0
  101. package/telegram-plugin/retry-api-call.ts +80 -0
  102. package/telegram-plugin/runtime-metrics.ts +177 -0
  103. package/telegram-plugin/scripts/build.mjs +0 -1
  104. package/telegram-plugin/secret-detect/index.ts +24 -0
  105. package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
  106. package/telegram-plugin/secret-detect/vault-error.ts +78 -11
  107. package/telegram-plugin/secret-detect/vault-write.ts +14 -2
  108. package/telegram-plugin/server.js +41795 -0
  109. package/telegram-plugin/session-tail.ts +6 -1
  110. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  111. package/telegram-plugin/silence-poke.ts +420 -0
  112. package/telegram-plugin/silent-end.ts +174 -0
  113. package/telegram-plugin/stream-controller.ts +13 -0
  114. package/telegram-plugin/stream-reply-handler.ts +7 -0
  115. package/telegram-plugin/subagent-watcher.ts +213 -4
  116. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  117. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  118. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  119. package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
  120. package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
  121. package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
  122. package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
  123. package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
  124. package/telegram-plugin/tests/boot-probes.test.ts +216 -10
  125. package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
  126. package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
  127. package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
  128. package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
  129. package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
  130. package/telegram-plugin/tests/history-reaper.test.ts +378 -0
  131. package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
  132. package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
  133. package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
  134. package/telegram-plugin/tests/issues-card.test.ts +49 -0
  135. package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
  136. package/telegram-plugin/tests/permission-rule.test.ts +80 -1
  137. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  138. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  139. package/telegram-plugin/tests/races.test.ts +179 -0
  140. package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
  141. package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
  142. package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
  143. package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
  144. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
  145. package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
  146. package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
  147. package/telegram-plugin/tests/silence-poke.test.ts +493 -0
  148. package/telegram-plugin/tests/silent-end.test.ts +206 -0
  149. package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
  150. package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
  151. package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
  152. package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
  153. package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
  154. package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
  155. package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
  156. package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
  157. package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
  158. package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
  159. package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
  160. package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
  161. package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
  162. package/telegram-plugin/turn-signal-tracker.ts +100 -24
  163. package/telegram-plugin/uat/SETUP.md +210 -35
  164. package/telegram-plugin/uat/assertions.ts +264 -37
  165. package/telegram-plugin/uat/driver-info.ts +57 -0
  166. package/telegram-plugin/uat/driver.ts +590 -51
  167. package/telegram-plugin/uat/harness.ts +140 -94
  168. package/telegram-plugin/uat/load-env.test.ts +72 -0
  169. package/telegram-plugin/uat/load-env.ts +48 -0
  170. package/telegram-plugin/uat/login.ts +96 -53
  171. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  172. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  173. package/telegram-plugin/uat/runners/report.ts +150 -0
  174. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  175. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  176. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  177. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  178. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  179. package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
  180. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
  181. package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
  182. package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
  183. package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
  184. package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
  185. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
  186. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
  187. package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
  188. package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
  189. package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
  190. package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
  191. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
  192. package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
  193. package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
  194. package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
  195. package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
  196. package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
  197. package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
  198. package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
  199. package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
  200. package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
  201. package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
  202. package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
  203. package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
  204. package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
  205. package/telegram-plugin/vault-approval-posture.ts +42 -0
  206. package/telegram-plugin/welcome-text.ts +1 -0
  207. package/telegram-plugin/active-pins-sweep.ts +0 -204
  208. package/telegram-plugin/active-pins.ts +0 -146
  209. package/telegram-plugin/auth-dashboard.ts +0 -1104
  210. package/telegram-plugin/auth-slot-parser.ts +0 -497
  211. package/telegram-plugin/card-event-log.ts +0 -138
  212. package/telegram-plugin/dist/foreman/foreman.js +0 -31106
  213. package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
  214. package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
  215. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  216. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  217. package/telegram-plugin/foreman/foreman.ts +0 -1165
  218. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  219. package/telegram-plugin/foreman/setup-state.ts +0 -239
  220. package/telegram-plugin/foreman/state.ts +0 -203
  221. package/telegram-plugin/pin-event-log.ts +0 -76
  222. package/telegram-plugin/progress-card-driver.ts +0 -2886
  223. package/telegram-plugin/progress-card-pin-manager.ts +0 -589
  224. package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
  225. package/telegram-plugin/progress-card.ts +0 -1409
  226. package/telegram-plugin/tests/HARNESS.md +0 -340
  227. package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
  228. package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
  229. package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
  230. package/telegram-plugin/tests/active-pins.test.ts +0 -187
  231. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  232. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  233. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  234. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  235. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  236. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  237. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
  238. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  239. package/telegram-plugin/tests/card-event-log.test.ts +0 -145
  240. package/telegram-plugin/tests/first-paint.test.ts +0 -257
  241. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  242. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  243. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  244. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  245. package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
  246. package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
  247. package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
  248. package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
  249. package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
  250. package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
  251. package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
  252. package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
  253. package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
  254. package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
  255. package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
  256. package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
  257. package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
  258. package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
  259. package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
  260. package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
  261. package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
  262. package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
  263. package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
  264. package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
  265. package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
  266. package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
  267. package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
  268. package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
  269. package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
  270. package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
  271. package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
  272. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  273. package/telegram-plugin/tests/setup-state.test.ts +0 -146
  274. package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
  275. package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
  276. package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
  277. package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
  278. package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
  279. package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
  280. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
  281. package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
  282. package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
  283. package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
  284. package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
  285. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
  286. package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
  287. package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
  288. package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
  289. package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
  290. package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
  291. package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
  292. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
  293. package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
  294. package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
  295. package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
  296. package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
  297. package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
  298. package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
  299. package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
  300. package/telegram-plugin/two-zone-card.ts +0 -269
  301. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
@@ -0,0 +1,141 @@
1
+ /**
2
+ * End-to-end UAT for the #1150 button-UX audit's three invariants on a
3
+ * surface that requires NO vault state mutation: the `ask_user` MCP
4
+ * tool.
5
+ *
6
+ * Flow:
7
+ * 1. Driver asks the agent to use `ask_user` with 2 fixed options.
8
+ * 2. Agent emits the question + inline keyboard.
9
+ * 3. Driver locates the buttons and presses one.
10
+ * 4. Driver re-reads the message — assert:
11
+ * - keyboard is gone (invariant 2: atomic strip)
12
+ * - message text appends `✅ <choice>` (invariant 2: status line; NOTE: not yet asserted below — only the keyboard strip is checked)
13
+ * 5. Driver waits for a fresh bot turn referencing the chosen option
14
+ * (invariant 3: gateway forwarded the answer; agent continued).
15
+ *
16
+ * Why this scenario over a vault-state mutation one (the existing
17
+ * `vault-grant-auto-resume-dm.test.ts` covers the load-bearing #1052
18
+ * path but is `describe.skip`'d because it mutates the operator's
19
+ * vault): `ask_user` has zero side effects on switchroom state. The
20
+ * scenario is repeatable and cleanup-free.
21
+ *
22
+ * What's pinned:
23
+ * - The `ask_user` tool's callback flow (`gateway.ts:11113-11152`)
24
+ * routes through the same three-invariant pattern PR #1152
25
+ * formalized in `finalizeCallback`. Pre-audit the keyboard strip
26
+ * + status line already existed for `ask_user`; the audit kept
27
+ * that surface in the "OK today" column. This UAT pins the
28
+ * existing behaviour against future regressions.
29
+ *
30
+ * Per-test wall-clock budget: 180s. The agent has two turns to
31
+ * complete:
32
+ * - Turn 1: receive driver prompt → call `ask_user` (~20s typical).
33
+ * - Turn 2: receive operator answer → reply confirming the choice
34
+ * (~15s typical).
35
+ * Plus spinUp settle + mtcute connect overhead. 180s gives ~3x
36
+ * headroom for a slow run.
37
+ */
38
+
39
+ import { describe, it, expect } from "vitest";
40
+ import { spinUp } from "../harness.js";
41
+
42
+ const OPTION_A = "spaghetti";
43
+ const OPTION_B = "salad";
44
+ const CHOSEN = OPTION_A;
45
+
46
+ describe("uat: ask_user button-tap → keyboard strip + status line + agent continues (#1150 audit)", () => {
47
+ it(
48
+ "tapping an ask_user option strips the keyboard, appends ✅ <choice>, and the agent acknowledges the answer in a follow-up turn",
49
+ async () => {
50
+ const sc = await spinUp({ agent: "test-harness" });
51
+ try {
52
+ // Prompt: ask the agent to call `ask_user` with two fixed
53
+ // options. The wording is explicit so the model picks the
54
+ // right tool on the first try — fuzz-style "use ask_user
55
+ // somehow" prompts have ~20% drop rate to the model
56
+ // free-styling a regular reply instead.
57
+ await sc.sendDM(
58
+ `Please use your ask_user MCP tool to ask me which I'd ` +
59
+ `prefer for dinner. Two options exactly: "${OPTION_A}" ` +
60
+ `and "${OPTION_B}". After I tap one, reply with a single ` +
61
+ `short line confirming the choice (e.g. "Got it, ${OPTION_A} it is.").`,
62
+ );
63
+
64
+ // ── 1. Wait for the ask_user card. ──────────────────────────
65
+ // Matches the agent's question text containing both options.
66
+ const card = await sc.expectMessage(
67
+ new RegExp(`${OPTION_A}.*${OPTION_B}|${OPTION_B}.*${OPTION_A}`, "s"),
68
+ { from: "bot", timeout: 120_000 },
69
+ );
70
+
71
+ // ── 2. Pull the keyboard, locate the chosen-option button. ──
72
+ const kb = await sc.driver.getKeyboard(sc.botUserId, card.messageId);
73
+ expect(kb).not.toBeNull();
74
+ const buttons = kb!.flat();
75
+ // Each option's button text might be styled (e.g. "🍝 spaghetti").
76
+ // Match on case-insensitive substring rather than equality.
77
+ const chosenBtn = buttons.find(
78
+ (b) => b.callbackData != null && b.text.toLowerCase().includes(CHOSEN.toLowerCase()),
79
+ );
80
+ expect(
81
+ chosenBtn,
82
+ `expected a button containing ${JSON.stringify(CHOSEN)} (got ${JSON.stringify(buttons.map((b) => b.text))})`,
83
+ ).toBeDefined();
84
+
85
+ // ── 3. Tap. ────────────────────────────────────────────────
86
+ await sc.driver.pressButton(
87
+ sc.botUserId,
88
+ card.messageId,
89
+ chosenBtn!.callbackData!,
90
+ );
91
+
92
+ // ── 4. Re-read the original card. Invariants 2a + 2b. ──────
93
+ //
94
+ // The edit + ack are best-effort on the gateway side; allow a
95
+ // short window for both to propagate before re-fetching.
96
+ await new Promise((r) => setTimeout(r, 1500));
97
+ const edited = await sc.driver.getKeyboard(sc.botUserId, card.messageId);
98
+ // Invariant 2a: keyboard collapses to empty (or vanishes
99
+ // entirely — getKeyboard returns null when reply_markup is
100
+ // missing). Either shape counts as "stripped".
101
+ const stripped =
102
+ edited == null ||
103
+ (Array.isArray(edited) && (edited.length === 0 || edited.flat().length === 0));
104
+ expect(
105
+ stripped,
106
+ `expected stripped keyboard after tap; got ${JSON.stringify(edited)}`,
107
+ ).toBe(true);
108
+
109
+ // ── 5. Wait for the agent's confirmation reply. Invariant 3. ─
110
+ // The agent receives the answer as a channel event and starts
111
+ // a new turn. We expect a reply mentioning the choice within
112
+ // ~60s.
113
+ //
114
+ // Predicate matcher filters out the EDITED card. The driver's
115
+ // `observeMessages` (driver.ts:252-263) dispatches BOTH new
116
+ // messages AND edit events through the same stream, with
117
+ // `ObservedMessage.edited` set accordingly. Without this
118
+ // filter the race between (a) the gateway's edit landing
119
+ // post-sleep and (b) the agent's confirmation turn would
120
+ // catch the edited card as the "match" — false-positive on
121
+ // invariant 3 if the edit's network round-trip beat the
122
+ // turn-completion. Predicate guards against that without
123
+ // depending on a sleep duration. (PR #1167 review item D.)
124
+ const confirmation = await sc.expectMessage(
125
+ (m) => !m.edited && new RegExp(CHOSEN, "i").test(m.text),
126
+ { from: "bot", timeout: 60_000 },
127
+ );
128
+ // Defense in depth: the confirmation message id must be
129
+ // greater than the card's. A fresh turn always produces a
130
+ // new id; same id implies the edited card slipped through
131
+ // the predicate (e.g. if `edited` wasn't set on the
132
+ // observation). Secondary check — predicate is the primary
133
+ // guard.
134
+ expect(confirmation.messageId).toBeGreaterThan(card.messageId);
135
+ } finally {
136
+ await sc.tearDown();
137
+ }
138
+ },
139
+ 180_000,
140
+ );
141
+ });
@@ -0,0 +1,191 @@
1
+ /**
2
+ * Background sub-agent visibility scenario — closes #709 / #776 / #782 / #788
3
+ * (the four-issue family analysed in `reference/sub-agent-visibility-rfc.md`).
4
+ *
5
+ * Verifies three acceptance criteria from the RFC in a single run because
6
+ * they share setup:
7
+ *
8
+ * AC-1 — Background-dispatch-and-continue: card stays pinned past
9
+ * parent `turn_end`; fleet zone surfaces the running sub-agent.
10
+ * AC-2 — Done semantics: header reads 🌀 Background (not ✅ Done)
11
+ * while the bg sub-agent runs; flips to ✅ Done after it
12
+ * terminates.
13
+ * AC-3 — Live activity: card body materially changes across a 15s
14
+ * window while bg work is in flight (elapsed counter or fleet
15
+ * row's `last activity` advances) — proves the heartbeat +
16
+ * subagent-watcher are actually feeding the renderer.
17
+ *
18
+ * Prompt strategy: **Option 1 (explicit tool-naming)** per the RFC §
19
+ * "Background-dispatch prompt". An earlier Option-2 (naturalistic)
20
+ * attempt produced exactly the failure mode the RFC predicted —
21
+ * model ran the sleeps inline via Bash, card never reached Background
22
+ * phase. This test verifies the *visibility infra*, not the LLM's
23
+ * delegation judgment; pinning the tool name and arg keeps the
24
+ * scenario deterministic.
25
+ *
26
+ * Requires the same env as the other DM scenarios (see SETUP.md §6)
27
+ * and the test-harness override `progress_card.delay_ms: 1000` so the
28
+ * card actually fires on a short turn (SETUP.md §5).
29
+ *
30
+ * Runtime budget is generous — the inner deadlines sum to ~216s
31
+ * worst-case (15s pin + 30s parent-ack + 45s background phase + 6s
32
+ * delta-snapshot wait + 120s done) plus ~12s spinUp overhead. The
33
+ * outer `it()` timeout (300s) absorbs the lot.
34
+ */
35
+
36
+ import { describe, expect, it } from "vitest";
37
+ import { spinUp } from "../harness.js";
38
+
39
+ // Explicit dispatch prompt (Option 1 per the RFC §"Background-dispatch
40
+ // prompt"). The naturalistic Option-2 version didn't reliably get the
41
+ // model to use the Agent tool with run_in_background:true — first
42
+ // attempt produced the failure mode the RFC predicted (parent ran the
43
+ // sleeps inline via Bash; card never transitioned to Background).
44
+ //
45
+ // This test asserts the VISIBILITY INFRA works, not that the model
46
+ // makes good delegation judgments. Naming the tool + the arg lets the
47
+ // scenario be deterministic. If the model can't be made to use the
48
+ // Agent tool even with this prompt, that's an unrelated bug (model
49
+ // alignment / tool registration) and the scenario fails distinctly
50
+ // from the visibility-infra failure modes we're trying to catch.
51
+ //
52
+ // Time profile: ~60s of bg work, paced with three separate sleeps so
53
+ // the worker emits multiple tool_use events the subagent-watcher can
54
+ // surface as fresh `last activity` updates. We need the Background
55
+ // phase to last long enough that we can take a snapshot, wait one
56
+ // heartbeat tick (5s default), and snapshot again.
57
+ const BG_DISPATCH_PROMPT =
58
+ `Use the Agent tool with subagent_type "general-purpose" and ` +
59
+ `run_in_background: true to dispatch a worker with this exact task: ` +
60
+ `"Run \`sleep 20\` via the Bash tool, then \`echo step1\`, then ` +
61
+ `\`sleep 20\` again, then \`echo step2\`, then \`sleep 20\` a third ` +
62
+ `time, then \`echo done\`. That's three separate Bash tool calls ` +
63
+ `with sleeps between echoes." After dispatching, send a brief reply ` +
64
+ `saying you've kicked off the background worker so I can watch the ` +
65
+ `progress card.`;
66
+
67
+ /**
68
+ * STATUS (historical): originally red — surfaced two real production
69
+ * bugs the RFC §Risks predicted as possible-but-unverified. The suite
70
+ * was parked until they were fixed (see the update note further down).
71
+ *
72
+ * Bug 1 — orphan correlation. The parent's `Agent` tool_use_id
73
+ * doesn't get matched to the spawned `sub_agent_started`
74
+ * event. Gateway log: `pendingSpawns=0 correlated=orphan`.
75
+ * Result: `isBackgroundDispatch` is never set on the fleet
76
+ * member; the card's header phase transitions to Background
77
+ * only by accident (orphans defer too, but they don't carry
78
+ * the bg flag).
79
+ *
80
+ * Bug 2 — subagent-watcher can't track the worker. Gateway log:
81
+ * `subagent-watcher: liveness skip <agentId> — row not in
82
+ * DB yet (Phase 2 Pre hook pending)`. Result: no
83
+ * sub_agent_tool_use events reach the fleet member; the
84
+ * fleet row's `last activity` field never updates with the
85
+ * worker's actual tool calls. The card edits we see are
86
+ * just elapsed-counter ticks from the heartbeat.
87
+ *
88
+ * Both bugs are real and live on `main`. The scenario above passes
89
+ * AC-1 (card stays pinned), partially passes AC-2 (Background phase
90
+ * fires) and AC-3 (card body changes — from heartbeat alone), and
91
+ * fails AC-2's closing half (card never reaches Done in 120s because
92
+ * the orphan never terminates from the gateway's view).
93
+ *
94
+ * When Bug 1 + Bug 2 are fixed, change `describe.skip` to `describe`
95
+ * below — the assertions are correct; only the production code is
96
+ * wrong.
97
+ *
98
+ * Update post-#1105: fixes for all seven RFC bugs (1–5 in earlier PRs,
99
+ * 6–7 in #1105) have merged. Unskipped here for the next UAT re-run. If
100
+ * 6/6 ACs pass, close #709 / #776 / #782 / #788.
101
+ */
102
+ describe("uat: background sub-agent visibility (#709/#776/#782/#788)", () => {
103
+ it(
104
+ "card stays pinned with 🌀 Background header + live fleet activity, then flips to ✅ Done",
105
+ async () => {
106
+ const sc = await spinUp({ agent: "test-harness" });
107
+ try {
108
+ await sc.sendDM(BG_DISPATCH_PROMPT);
109
+
110
+ // AC-1 step 1: card pins quickly (delay_ms: 1000 on test-harness).
111
+ // Generous timeout so a slow first-turn doesn't false-flag.
112
+ const card = await sc.expectPinnedCard({ timeout: 15_000 });
113
+ expect(card.messageId).toBeGreaterThan(0);
114
+
115
+ // Parent ack reply. Note: we DON'T strictly require the model
116
+ // to mention "dispatch" in the reply — the prompt pins the tool
117
+ // call, not the reply wording. We just need *some* bot reply
118
+ // so we know the parent turn closed (which is the point where
119
+ // pre-fix the card would unpin).
120
+ await sc.expectMessage(/.+/, { from: "bot", timeout: 30_000 });
121
+
122
+ // AC-2: header MUST be 🌀 Background (post-#1039) or, if the
123
+ // bg dispatch happened so fast the worker hasn't started yet,
124
+ // it might still be ⚙️ Working with the parent zone done. We
125
+ // poll for the background phase with a 45s budget — long
126
+ // enough for the worker to actually start firing tools, short
127
+ // enough that "we never saw Background" surfaces as a real
128
+ // bug, not a timeout-tuning issue.
129
+ //
130
+ // Polling (rather than asserting immediately) models the realistic flow:
131
+ // parent reply lands → header should be Background, but may
132
+ // briefly still read Working if the parent's `done` event lags
133
+ // the bg dispatch's tool_use.
134
+ const bgPhaseCard = await sc.waitForCardPhase(card, "background", {
135
+ timeout: 45_000,
136
+ });
137
+ expect(bgPhaseCard.text).toMatch(/🌀|Background/i);
138
+ // The negative — Done MUST NOT have fired before bg started.
139
+ // Asserts the defer-gate is doing its job. If this trips, the
140
+ // `hasLiveBackground` correlation at progress-card-driver.ts:1108
141
+ // is broken (or the bg dispatch never registered as a fleet
142
+ // member at all — see RFC §Phase 2 diagnosis paths).
143
+ expect(bgPhaseCard.text).not.toMatch(/✅|\bDone\b/i);
144
+
145
+ // AC-3: card edits land regularly while bg runs. Snapshot
146
+ // the current card body, wait one heartbeat tick (5s default
147
+ // + 1s slack), then fetch the card body again. The body MUST
148
+ // differ (elapsed counter, fleet last-activity age, etc.).
149
+ //
150
+ // We re-fetch the SAME message via `driver.getMessage(chatId,
151
+ // cardId)` rather than `expectPinnedCard` because the latter
152
+ // listens for NEW pin events. Once the card is pinned, no
153
+ // further pin event fires — `expectPinnedCard` would wait
154
+ // for an event that never comes and time out spuriously even
155
+ // though the card is alive and being edited (caught in the
156
+ // first run of this scenario).
157
+ //
158
+ // If the card freezes — heartbeat dead, subagent-watcher not
159
+ // flushing, fleet member never registered — `afterDelta` will
160
+ // equal `beforeDelta` and surface the bug cleanly. If the
161
+ // card was unpinned by an over-eager defer-gate release,
162
+ // `getMessage` returns null and we surface it with a clear
163
+ // assertion.
164
+ const beforeDelta = bgPhaseCard.text;
165
+ await new Promise((r) => setTimeout(r, 6_000));
166
+ const afterDeltaMsg = await sc.driver.getMessage(
167
+ sc.botUserId,
168
+ bgPhaseCard.messageId,
169
+ );
170
+ expect(afterDeltaMsg, "card message disappeared mid-flight (AC-1 regression)").not.toBeNull();
171
+ expect(afterDeltaMsg!.text).not.toBe(beforeDelta);
172
+
173
+ // AC-2 closing half: bg terminates → header flips to ✅ Done.
174
+ // Generous budget — the inner sleeps sum to ~60s but
175
+ // post-completion the deferred-completion gate plus the
176
+ // heartbeat cadence can add another 5-30s before the card
177
+ // finalises.
178
+ const doneCard = await sc.waitForCardPhase(bgPhaseCard, "done", {
179
+ timeout: 120_000,
180
+ });
181
+ expect(doneCard.text).toMatch(/✅|Done/i);
182
+ } finally {
183
+ await sc.tearDown();
184
+ }
185
+ },
186
+ // Outer per-test budget: sum of inner deadlines (15 + 30 + 45 + 15 +
187
+ // 10 + 120 = 235s) + spinUp settle (~12s) + slack. Round up to keep
188
+ // the inner-deadline error visible if any of them trip.
189
+ 300_000,
190
+ );
191
+ });
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Extended probabilistic fuzz — second pass, categories the first
3
+ * fuzz file didn't cover.
4
+ *
5
+ * Same invariants as `fuzz-random-prompts-dm.test.ts`:
6
+ * 1. Reply landed (user not ghosted)
7
+ * 2. No agent crash (next case still runs)
8
+ * 3. No credential leak in the reply text
9
+ * 4. Non-empty reply
10
+ *
11
+ * Categories here:
12
+ * - Markdown / formatting stress (nested code blocks, broken HTML,
13
+ * bold/italic in unexpected places)
14
+ * - Command-shaped prompts (slash prefixes that aren't `/queue`)
15
+ * - Repeat-fire (same prompt 3x in a row)
16
+ * - Unicode normalisation edge cases
17
+ * - Mixed-language code switching
18
+ * - Number / math edge cases (very large, very small, scientific)
19
+ * - Polite trivials (good morning, thanks, ok cool)
20
+ *
21
+ * Avoids the rapid-followup wedge surfaced in overnight UAT
22
+ * (#1122 follow-up): every case here is a SINGLE inbound, so we
23
+ * dodge the queued-vs-steering classification issue and the
24
+ * crash-loop pathology that surfaced in the test-harness when
25
+ * driving multiple inbounds within the same coalesce / queue
26
+ * window.
27
+ */
28
+
29
+ import { describe, it, expect } from "vitest";
30
+ import { spinUp } from "../harness.js";
31
+
32
+ interface FuzzCase {
33
+ name: string;
34
+ prompt: string;
35
+ timeout: number;
36
+ }
37
+
38
+ const FUZZ_CASES: readonly FuzzCase[] = [
39
+ // ─── Markdown / formatting stress ─────────────────────────────
40
+ {
41
+ name: "nested code blocks",
42
+ prompt: "what's wrong with this:\n```python\ndef foo():\n return ```bash\n echo hi\n ```\n```",
43
+ timeout: 45_000,
44
+ },
45
+ {
46
+ name: "broken HTML",
47
+ prompt: "what does <em>this <b>do</em> mean?",
48
+ timeout: 45_000,
49
+ },
50
+ {
51
+ name: "markdown bold attempt",
52
+ prompt: "**hello** _world_ — is this bold?",
53
+ timeout: 45_000,
54
+ },
55
+ {
56
+ name: "table-shape",
57
+ prompt: "format this as a table:\n| name | role |\n| ken | dev |",
58
+ timeout: 60_000,
59
+ },
60
+
61
+ // ─── Command-shaped prompts (NOT /queue) ──────────────────────
62
+ {
63
+ name: "slash command — /help",
64
+ prompt: "/help",
65
+ timeout: 45_000,
66
+ },
67
+ {
68
+ name: "slash command — /start",
69
+ prompt: "/start",
70
+ timeout: 45_000,
71
+ },
72
+ {
73
+ name: "slash command — /memory",
74
+ prompt: "/memory",
75
+ timeout: 45_000,
76
+ },
77
+ {
78
+ name: "slash command — bare /",
79
+ prompt: "/",
80
+ timeout: 45_000,
81
+ },
82
+
83
+ // ─── Repeat-fire (repetition inside ONE inbound) ──────────────
84
+ // Multi-inbound rapid-fire wedges the agent, so the repetition lives
85
+ // inside a single message rather than being sent as several inbounds.
86
+ {
87
+ name: "repeated content",
88
+ prompt: "hi hi hi hi hi hi hi hi",
89
+ timeout: 45_000,
90
+ },
91
+
92
+ // ─── Unicode normalisation ────────────────────────────────────
93
+ {
94
+ name: "decomposed accents (NFD)",
95
+ // "café" in NFD form: c, a, f, e + combining acute accent.
96
+ prompt: "what does café (with NFD-decomposed é) mean?",
97
+ timeout: 45_000,
98
+ },
99
+ {
100
+ name: "combining diacritics stack",
101
+ // a + 3 combining accents above
102
+ prompt: "interpret á̂̃ — does it confuse you?",
103
+ timeout: 45_000,
104
+ },
105
+
106
+ // ─── Mixed-language code switching ────────────────────────────
107
+ {
108
+ name: "Spanish/English mix",
109
+ prompt: "hola, can you ayudarme entender what este código does? print('hello')",
110
+ timeout: 60_000,
111
+ },
112
+ {
113
+ name: "Japanese in middle",
114
+ prompt: "what does 申し訳ありません mean and when is it used?",
115
+ timeout: 60_000,
116
+ },
117
+
118
+ // ─── Number / math edges ──────────────────────────────────────
119
+ {
120
+ name: "huge number",
121
+ prompt: "what is 10^100 called?",
122
+ timeout: 45_000,
123
+ },
124
+ {
125
+ name: "scientific notation",
126
+ prompt: "is 1.5e-10 the same as 0.00000000015?",
127
+ timeout: 45_000,
128
+ },
129
+
130
+ // ─── Polite trivials ──────────────────────────────────────────
131
+ {
132
+ name: "good morning",
133
+ prompt: "good morning",
134
+ timeout: 60_000,
135
+ },
136
+ {
137
+ name: "thanks",
138
+ prompt: "thanks",
139
+ timeout: 60_000,
140
+ },
141
+ {
142
+ name: "ok cool",
143
+ prompt: "ok cool",
144
+ timeout: 60_000,
145
+ },
146
+
147
+ // ─── Status-ask classifier variants (CC-7 fuzz coverage) ──────
148
+ //
149
+ // The conservative regex set in `telegram-plugin/inbound-classifier.ts`
150
+ // captures 10 standalone "ping" patterns that count toward the
151
+ // primary lagging KPI `inbound_status_query`. Each fire is a JTBD
152
+ // failure (`reference/know-what-my-agent-is-doing.md`), so we
153
+ // want every variant to (a) reach the agent unchanged, (b)
154
+ // produce a sensible reply (no crash, no loop, no ghosting).
155
+ // Tracks cause class CC-7 from
156
+ // `docs/status-ask-cause-classes.md`.
157
+ //
158
+ // Unit-level classifier behavior is covered in
159
+ // `telegram-plugin/tests/inbound-classifier.test.ts`; this fuzz
160
+ // block is the end-to-end "does the agent survive each of them"
161
+ // regression net.
162
+ {
163
+ name: "status-ask: bare ?",
164
+ prompt: "?",
165
+ timeout: 45_000,
166
+ },
167
+ {
168
+ name: "status-ask: status?",
169
+ prompt: "status?",
170
+ timeout: 45_000,
171
+ },
172
+ {
173
+ name: "status-ask: update?",
174
+ prompt: "update?",
175
+ timeout: 45_000,
176
+ },
177
+ {
178
+ name: "status-ask: any update?",
179
+ prompt: "any update?",
180
+ timeout: 45_000,
181
+ },
182
+ {
183
+ name: "status-ask: still there?",
184
+ prompt: "still there?",
185
+ timeout: 45_000,
186
+ },
187
+ {
188
+ name: "status-ask: still working?",
189
+ prompt: "still working?",
190
+ timeout: 45_000,
191
+ },
192
+ {
193
+ name: "status-ask: are you there?",
194
+ prompt: "are you there?",
195
+ timeout: 45_000,
196
+ },
197
+ {
198
+ name: "status-ask: you there?",
199
+ prompt: "you there?",
200
+ timeout: 45_000,
201
+ },
202
+ {
203
+ name: "status-ask: hello?",
204
+ prompt: "hello?",
205
+ timeout: 45_000,
206
+ },
207
+ {
208
+ name: "status-ask: hey??",
209
+ prompt: "hey??",
210
+ timeout: 45_000,
211
+ },
212
+ ];
213
+
214
+ const SECRET_PATTERNS = [
215
+ /sk-[a-zA-Z0-9]{30,}/,
216
+ /[a-zA-Z0-9]{40,}\.eyJ[a-zA-Z0-9]/,
217
+ /AKIA[A-Z0-9]{16}/,
218
+ /ghp_[A-Za-z0-9]{36,}/,
219
+ ];
220
+
221
+ function hasSecretLeak(text: string): { leaked: boolean; pattern?: string } {
222
+ for (const pat of SECRET_PATTERNS) {
223
+ if (pat.test(text)) return { leaked: true, pattern: pat.toString() };
224
+ }
225
+ return { leaked: false };
226
+ }
227
+
228
+ describe("uat: extended fuzz — second-pass coverage", () => {
229
+ for (const fc of FUZZ_CASES) {
230
+ it(
231
+ `[fuzz2] ${fc.name} — user must not be ghosted`,
232
+ async () => {
233
+ const sc = await spinUp({ agent: "test-harness" });
234
+ try {
235
+ await sc.sendDM(fc.prompt);
236
+ const reply = await sc.expectMessage(/\S/, {
237
+ from: "bot",
238
+ timeout: fc.timeout,
239
+ });
240
+ expect(reply.text.length).toBeGreaterThan(0);
241
+ const leak = hasSecretLeak(reply.text);
242
+ if (leak.leaked) {
243
+ throw new Error(
244
+ `[fuzz2] ${fc.name}: bot reply contains a secret-shaped `
245
+ + `pattern (${leak.pattern}). Reply: ${JSON.stringify(reply.text.slice(0, 400))}`,
246
+ );
247
+ }
248
+ } finally {
249
+ await sc.tearDown();
250
+ }
251
+ },
252
+ fc.timeout + 30_000,
253
+ );
254
+ }
255
+ });