switchroom 0.7.15 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301)
  1. package/README.md +51 -59
  2. package/bin/run-hook.sh +27 -11
  3. package/bin/timezone-hook.sh +9 -7
  4. package/dist/agent-scheduler/index.js +410 -133
  5. package/dist/auth-broker/index.js +13932 -0
  6. package/dist/cli/switchroom.js +26937 -5601
  7. package/dist/host-control/main.js +12702 -0
  8. package/dist/vault/approvals/kernel-server.js +467 -184
  9. package/dist/vault/broker/server.js +1430 -724
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +7 -4
  16. package/profiles/_base/settings.json.hbs +20 -5
  17. package/profiles/_base/start.sh.hbs +16 -3
  18. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  19. package/profiles/_shared/telegram-style.md.hbs +20 -90
  20. package/profiles/_shared/vault-protocol.md.hbs +68 -0
  21. package/profiles/default/CLAUDE.md +50 -96
  22. package/profiles/default/CLAUDE.md.hbs +36 -6
  23. package/profiles/default/workspace/SOUL.md.hbs +12 -5
  24. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  25. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  26. package/skills/buildkite-api/SKILL.md +31 -8
  27. package/skills/buildkite-cli/SKILL.md +27 -9
  28. package/skills/buildkite-migration/SKILL.md +22 -9
  29. package/skills/buildkite-pipelines/SKILL.md +26 -9
  30. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  31. package/skills/buildkite-test-engine/SKILL.md +25 -8
  32. package/skills/docx/SKILL.md +1 -1
  33. package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
  34. package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
  35. package/skills/file-bug/SKILL.md +34 -6
  36. package/skills/humanizer/SKILL.md +15 -0
  37. package/skills/humanizer-calibrate/SKILL.md +7 -1
  38. package/skills/mcp-builder/SKILL.md +1 -1
  39. package/skills/pdf/SKILL.md +1 -1
  40. package/skills/pptx/SKILL.md +1 -1
  41. package/skills/skill-creator/SKILL.md +21 -1
  42. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  43. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  44. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  45. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  46. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  47. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  48. package/skills/switchroom-cli/SKILL.md +63 -64
  49. package/skills/switchroom-health/SKILL.md +23 -10
  50. package/skills/switchroom-install/SKILL.md +3 -3
  51. package/skills/switchroom-manage/SKILL.md +26 -19
  52. package/skills/switchroom-runtime/SKILL.md +191 -0
  53. package/skills/switchroom-status/SKILL.md +27 -2
  54. package/skills/telegram-test-harness/SKILL.md +3 -0
  55. package/skills/token-helpers/SKILL.md +24 -1
  56. package/skills/webapp-testing/SKILL.md +31 -1
  57. package/skills/xlsx/SKILL.md +1 -1
  58. package/telegram-plugin/admin-commands/index.ts +7 -5
  59. package/telegram-plugin/analytics-posthog.ts +191 -0
  60. package/telegram-plugin/bridge/bridge.ts +69 -0
  61. package/telegram-plugin/bridge/ipc-client.ts +4 -1
  62. package/telegram-plugin/dist/bridge/bridge.js +194 -119
  63. package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
  64. package/telegram-plugin/dist/server.js +245 -189
  65. package/telegram-plugin/first-paint.ts +3 -24
  66. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  67. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  68. package/telegram-plugin/gateway/auth-command.ts +794 -0
  69. package/telegram-plugin/gateway/auth-line.ts +123 -0
  70. package/telegram-plugin/gateway/boot-card.ts +169 -40
  71. package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
  72. package/telegram-plugin/gateway/boot-probes.ts +166 -123
  73. package/telegram-plugin/gateway/boot-reason.ts +41 -7
  74. package/telegram-plugin/gateway/boot-version.ts +66 -0
  75. package/telegram-plugin/gateway/gateway.ts +3499 -1885
  76. package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
  77. package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
  78. package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
  79. package/telegram-plugin/gateway/quarantine.ts +69 -0
  80. package/telegram-plugin/gateway/quota-cache.ts +9 -4
  81. package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
  82. package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
  83. package/telegram-plugin/gateway/recent-denials.ts +77 -0
  84. package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
  85. package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
  86. package/telegram-plugin/history.ts +91 -0
  87. package/telegram-plugin/hooks/hooks.json +10 -0
  88. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
  89. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
  90. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
  91. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  92. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  93. package/telegram-plugin/inbound-classifier.ts +50 -0
  94. package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
  95. package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
  96. package/telegram-plugin/package.json +4 -2
  97. package/telegram-plugin/permission-rule.ts +51 -0
  98. package/telegram-plugin/permission-title.ts +56 -0
  99. package/telegram-plugin/quota-check.ts +19 -41
  100. package/telegram-plugin/registry/reaper.ts +223 -0
  101. package/telegram-plugin/retry-api-call.ts +80 -0
  102. package/telegram-plugin/runtime-metrics.ts +177 -0
  103. package/telegram-plugin/scripts/build.mjs +0 -1
  104. package/telegram-plugin/secret-detect/index.ts +24 -0
  105. package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
  106. package/telegram-plugin/secret-detect/vault-error.ts +78 -11
  107. package/telegram-plugin/secret-detect/vault-write.ts +14 -2
  108. package/telegram-plugin/server.js +41795 -0
  109. package/telegram-plugin/session-tail.ts +6 -1
  110. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  111. package/telegram-plugin/silence-poke.ts +420 -0
  112. package/telegram-plugin/silent-end.ts +174 -0
  113. package/telegram-plugin/stream-controller.ts +13 -0
  114. package/telegram-plugin/stream-reply-handler.ts +7 -0
  115. package/telegram-plugin/subagent-watcher.ts +213 -4
  116. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  117. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  118. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  119. package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
  120. package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
  121. package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
  122. package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
  123. package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
  124. package/telegram-plugin/tests/boot-probes.test.ts +216 -10
  125. package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
  126. package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
  127. package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
  128. package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
  129. package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
  130. package/telegram-plugin/tests/history-reaper.test.ts +378 -0
  131. package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
  132. package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
  133. package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
  134. package/telegram-plugin/tests/issues-card.test.ts +49 -0
  135. package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
  136. package/telegram-plugin/tests/permission-rule.test.ts +80 -1
  137. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  138. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  139. package/telegram-plugin/tests/races.test.ts +179 -0
  140. package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
  141. package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
  142. package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
  143. package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
  144. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
  145. package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
  146. package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
  147. package/telegram-plugin/tests/silence-poke.test.ts +493 -0
  148. package/telegram-plugin/tests/silent-end.test.ts +206 -0
  149. package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
  150. package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
  151. package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
  152. package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
  153. package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
  154. package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
  155. package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
  156. package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
  157. package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
  158. package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
  159. package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
  160. package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
  161. package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
  162. package/telegram-plugin/turn-signal-tracker.ts +100 -24
  163. package/telegram-plugin/uat/SETUP.md +210 -35
  164. package/telegram-plugin/uat/assertions.ts +264 -37
  165. package/telegram-plugin/uat/driver-info.ts +57 -0
  166. package/telegram-plugin/uat/driver.ts +590 -51
  167. package/telegram-plugin/uat/harness.ts +140 -94
  168. package/telegram-plugin/uat/load-env.test.ts +72 -0
  169. package/telegram-plugin/uat/load-env.ts +48 -0
  170. package/telegram-plugin/uat/login.ts +96 -53
  171. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  172. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  173. package/telegram-plugin/uat/runners/report.ts +150 -0
  174. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  175. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  176. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  177. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  178. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  179. package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
  180. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
  181. package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
  182. package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
  183. package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
  184. package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
  185. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
  186. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
  187. package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
  188. package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
  189. package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
  190. package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
  191. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
  192. package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
  193. package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
  194. package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
  195. package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
  196. package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
  197. package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
  198. package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
  199. package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
  200. package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
  201. package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
  202. package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
  203. package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
  204. package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
  205. package/telegram-plugin/vault-approval-posture.ts +42 -0
  206. package/telegram-plugin/welcome-text.ts +1 -0
  207. package/telegram-plugin/active-pins-sweep.ts +0 -204
  208. package/telegram-plugin/active-pins.ts +0 -146
  209. package/telegram-plugin/auth-dashboard.ts +0 -1104
  210. package/telegram-plugin/auth-slot-parser.ts +0 -497
  211. package/telegram-plugin/card-event-log.ts +0 -138
  212. package/telegram-plugin/dist/foreman/foreman.js +0 -31106
  213. package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
  214. package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
  215. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  216. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  217. package/telegram-plugin/foreman/foreman.ts +0 -1165
  218. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  219. package/telegram-plugin/foreman/setup-state.ts +0 -239
  220. package/telegram-plugin/foreman/state.ts +0 -203
  221. package/telegram-plugin/pin-event-log.ts +0 -76
  222. package/telegram-plugin/progress-card-driver.ts +0 -2886
  223. package/telegram-plugin/progress-card-pin-manager.ts +0 -589
  224. package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
  225. package/telegram-plugin/progress-card.ts +0 -1409
  226. package/telegram-plugin/tests/HARNESS.md +0 -340
  227. package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
  228. package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
  229. package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
  230. package/telegram-plugin/tests/active-pins.test.ts +0 -187
  231. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  232. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  233. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  234. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  235. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  236. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  237. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
  238. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  239. package/telegram-plugin/tests/card-event-log.test.ts +0 -145
  240. package/telegram-plugin/tests/first-paint.test.ts +0 -257
  241. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  242. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  243. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  244. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  245. package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
  246. package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
  247. package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
  248. package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
  249. package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
  250. package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
  251. package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
  252. package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
  253. package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
  254. package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
  255. package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
  256. package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
  257. package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
  258. package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
  259. package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
  260. package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
  261. package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
  262. package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
  263. package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
  264. package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
  265. package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
  266. package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
  267. package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
  268. package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
  269. package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
  270. package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
  271. package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
  272. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  273. package/telegram-plugin/tests/setup-state.test.ts +0 -146
  274. package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
  275. package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
  276. package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
  277. package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
  278. package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
  279. package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
  280. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
  281. package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
  282. package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
  283. package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
  284. package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
  285. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
  286. package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
  287. package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
  288. package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
  289. package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
  290. package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
  291. package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
  292. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
  293. package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
  294. package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
  295. package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
  296. package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
  297. package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
  298. package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
  299. package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
  300. package/telegram-plugin/two-zone-card.ts +0 -269
  301. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * Agent-self-sufficiency UAT runner.
4
+ *
5
+ * Drives a real Telegram user-account against the live agent fleet to
6
+ * verify the four acceptance criteria from the
7
+ * "agent-self-sufficiency" goal:
8
+ *
9
+ * 1. Self-management (skill_list, cron_list, audit_tail, config_get)
10
+ * 2. Identity awareness (honest self-ID, knows its name, knows peers)
11
+ * 3. Admin surface (non-admin refusal naming the admin agent)
12
+ * — admin reads (3a/3b) are covered by the hostd vitest suite
13
+ * rather than live fuzz, because they require a docker stub.
14
+ * 4. The fuzzy UAT IS this runner.
15
+ *
16
+ * Usage:
17
+ *
18
+ * bun telegram-plugin/uat/runners/agent-self-sufficiency.ts \\
19
+ * --agent klanker:@klanker_bot \\
20
+ * --agent scribe:@scribe_bot \\
21
+ * --agent doc:@doc_bot \\
22
+ * --admin-agent klanker \\
23
+ * --report ./uat-report.md
24
+ *
25
+ * # OR — discover from env (CI-friendly):
26
+ * UAT_FLEET="klanker:@klanker_bot,scribe:@scribe_bot,doc:@doc_bot" \\
27
+ * UAT_ADMIN_AGENTS="klanker" \\
28
+ * bun telegram-plugin/uat/runners/agent-self-sufficiency.ts
29
+ *
30
+ * Auth env (same as the existing uat harness — see
31
+ * telegram-plugin/uat/SETUP.md):
32
+ *
33
+ * TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION
34
+ *
35
+ * **Why a user-account session, not bot tokens.** The acceptance-
36
+ * criteria text mentioned `TELEGRAM_BOT_TOKEN_<agent>` env vars, but
37
+ * Telegram's Bot API forbids bots from reading other bots' messages
38
+ * (https://core.telegram.org/bots/faq) — a bot can send to another
39
+ * bot's chat but can't observe the reply. The only way to drive the
40
+ * fleet AND capture every agent's reply is an mtcute user-account
41
+ * session, which is what the existing telegram-plugin/uat harness
42
+ * uses. This runner inherits that machinery wholesale; the env-var
43
+ * rename is forced by the platform, not a design choice.
44
+ *
45
+ * Missing creds fail loud, not silent — the goal explicitly demands
46
+ * no silent skips on missing UAT credentials.
47
+ */
48
+
49
+ import { writeFileSync } from "node:fs";
50
+ import { Driver, type ObservedMessage } from "../driver.js";
51
+ import { loadUatEnv } from "../load-env.js";
52
+ import { CRITERIA, type CriterionSpec } from "./paraphrases.js";
53
+ import { scoreReply, type CaseResult, type Outcome } from "./scorer.js";
54
+ import { renderMarkdown } from "./report.js";
55
+
56
+ loadUatEnv();
57
+
58
+ // ─── CLI / env parsing ─────────────────────────────────────────────────────
59
+
60
/** One agent in the fleet that the runner will message. */
interface AgentTarget {
  /** Agent name; used as the lookup key when admin names are matched. */
  name: string;

  /** Telegram bot username for the agent, as given on the CLI or in UAT_FLEET (e.g. "@klanker_bot"). */
  botUsername: string;

  /** True when the agent was named via `--admin-agent` / UAT_ADMIN_AGENTS. */
  admin: boolean;
}
65
+
66
/** Fully resolved runner configuration (environment defaults plus CLI overrides). */
interface CliConfig {
  /** Agents to exercise, in the order they were registered. */
  agents: AgentTarget[];

  /** Where the Markdown report is written. */
  reportPath: string;

  /** Where the JSON sidecar with all results is written. */
  jsonPath: string;

  /** Per-case reply timeout, ms. Default 60s. */
  replyTimeoutMs: number;

  /**
   * Inter-message settle, ms. Default 4s — keeps us under Telegram's
   * global outbound rate cap and gives the agent time to finish its
   * previous turn before the next inbound.
   */
  settleMs: number;
}
77
+
78
/**
 * Build the runner configuration from environment variables and CLI flags.
 *
 * The `UAT_*` environment variables seed the configuration; flags in `argv`
 * then override scalar settings and add to the agent / admin sets. Empty or
 * whitespace-only environment values are treated as unset.
 *
 * Failure policy (the runner must fail loud rather than run a broken sweep):
 *  - unknown `--flag`, a flag without a value, or a malformed `--agent`
 *    spec terminates via `fail()`;
 *  - a reply timeout that is not an integer >= 1, or a settle that is not
 *    an integer >= 0, terminates via `fail()` — a NaN timeout would
 *    otherwise make every case report `timeout` without waiting at all;
 *  - no agents at all terminates via `fail()`;
 *  - malformed `UAT_FLEET` entries, admin names that match no targeted
 *    agent, and fleets smaller than three agents only produce a warning
 *    on stderr.
 *
 * @param argv Command-line tokens after the script name.
 * @returns The resolved configuration.
 */
function parseCli(argv: readonly string[]): CliConfig {
  /** Read an environment variable, treating unset/empty/blank alike. */
  const env = (key: string): string | undefined => {
    const value = process.env[key]?.trim();
    return value ? value : undefined;
  };

  const warn = (msg: string): void => {
    process.stderr.write(`[uat] WARNING: ${msg}\n`);
  };

  /** Parse `"<name>:@<bot-username>"`; returns undefined when either half is missing. */
  const parseAgentSpec = (spec: string): AgentTarget | undefined => {
    const [name, bot] = spec.split(":").map((s) => s.trim());
    return name && bot ? { name, botUsername: bot, admin: false } : undefined;
  };

  /** Parse a millisecond setting, rejecting NaN and values below `min`. */
  const parseMs = (raw: string, label: string, min: number): number => {
    const n = Number.parseInt(raw, 10);
    if (!Number.isFinite(n) || n < min) {
      return fail(`${label} must be an integer >= ${min}; got "${raw}"`);
    }
    return n;
  };

  const agents = new Map<string, AgentTarget>();
  const adminSet = new Set<string>();
  let reportPath = env("UAT_REPORT") ?? "./uat-agent-self-sufficiency.md";
  let jsonPath = env("UAT_REPORT_JSON") ?? "./uat-agent-self-sufficiency.json";
  // Kept as raw text until all flags are read, so a bad env value that a
  // flag overrides does not abort the run.
  let replyTimeoutRaw = env("UAT_REPLY_TIMEOUT_MS") ?? "60000";
  let settleRaw = env("UAT_SETTLE_MS") ?? "4000";

  for (const tok of (env("UAT_FLEET") ?? "").split(",")) {
    const entry = tok.trim();
    if (!entry) continue; // tolerate trailing / doubled commas
    const target = parseAgentSpec(entry);
    if (target) {
      agents.set(target.name, target);
    } else {
      warn(
        `ignoring malformed UAT_FLEET entry "${entry}" (expected "<name>:@<bot-username>")`,
      );
    }
  }
  for (const tok of (env("UAT_ADMIN_AGENTS") ?? "").split(",")) {
    const name = tok.trim();
    if (name) adminSet.add(name);
  }

  for (let i = 0; i < argv.length; i++) {
    const tok = argv[i];
    if (tok === undefined) continue;
    // Consume the token following the current flag as its value.
    const next = (): string => {
      const v = argv[++i];
      if (!v) return fail(`${tok}: missing value`);
      return v;
    };
    switch (tok) {
      case "--agent": {
        const spec = next();
        const target = parseAgentSpec(spec);
        if (!target) {
          return fail(`--agent expects "<name>:@<bot-username>"; got "${spec}"`);
        }
        agents.set(target.name, target);
        break;
      }
      case "--admin-agent":
        adminSet.add(next().trim());
        break;
      case "--report":
        reportPath = next();
        break;
      case "--json":
        jsonPath = next();
        break;
      case "--reply-timeout-ms":
        replyTimeoutRaw = next();
        break;
      case "--settle-ms":
        settleRaw = next();
        break;
      case "--help":
      case "-h":
        printHelp();
        return process.exit(0);
      default:
        // Bare (non `--`) tokens are ignored, as before.
        if (tok.startsWith("--")) return fail(`unknown flag: ${tok}`);
    }
  }

  const replyTimeoutMs = parseMs(
    replyTimeoutRaw,
    "--reply-timeout-ms / UAT_REPLY_TIMEOUT_MS",
    1,
  );
  const settleMs = parseMs(settleRaw, "--settle-ms / UAT_SETTLE_MS", 0);

  for (const name of adminSet) {
    const target = agents.get(name);
    if (target) {
      target.admin = true;
    } else {
      warn(`admin agent "${name}" is not among the targeted agents; ignoring.`);
    }
  }

  if (agents.size === 0) {
    return fail(
      "no agents to target. Pass --agent <name>:@<bot> at least once, or set UAT_FLEET env",
    );
  }
  if (agents.size < 3) {
    warn(
      `only ${agents.size} agent(s) targeted; goal calls for ≥3 to prove shared infra.`,
    );
  }

  return {
    agents: [...agents.values()],
    reportPath,
    jsonPath,
    replyTimeoutMs,
    settleMs,
  };
}
167
+
168
/**
 * Report a fatal runner error and terminate.
 *
 * Writes the message to stderr with the `[uat]` prefix, then exits the
 * process with status 2. Never returns.
 *
 * @param msg Human-readable reason for aborting.
 */
function fail(msg: string): never {
  const line = `[uat] ${msg}\n`;
  process.stderr.write(line);
  return process.exit(2);
}
172
+
173
/**
 * Print the usage summary to stdout.
 *
 * The text is assembled from explicit lines so column alignment does not
 * depend on source indentation, and it lists every flag `parseCli`
 * accepts, including `-h` / `--help`.
 */
function printHelp(): void {
  const lines = [
    "agent-self-sufficiency UAT runner",
    "",
    "Required env (or fail loud):",
    "  TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_UAT_DRIVER_SESSION",
    "",
    "Flags:",
    "  --agent NAME:@BOT      Add an agent target. Repeatable.",
    "  --admin-agent NAME     Mark NAME as admin: true (skips 3d for that agent).",
    "  --report PATH          Markdown report path. Default ./uat-agent-self-sufficiency.md",
    "  --json PATH            JSON sidecar with all results. Default ./uat-agent-self-sufficiency.json",
    "  --reply-timeout-ms N   Per-case timeout. Default 60000.",
    "  --settle-ms N          Inter-message settle. Default 4000.",
    "  -h, --help             Show this help and exit.",
    "",
    "Env equivalents:",
    '  UAT_FLEET="name1:@bot1,name2:@bot2,..."',
    '  UAT_ADMIN_AGENTS="name1,name2"',
    "  UAT_REPORT, UAT_REPORT_JSON, UAT_REPLY_TIMEOUT_MS, UAT_SETTLE_MS",
  ];
  process.stdout.write(lines.join("\n") + "\n");
}
193
+
194
+ // ─── Driver wrapper: send + observe ─────────────────────────────────────────
195
+
196
/** What a single send-and-observe round produced. */
interface ReplyOutcome {
  /** Trimmed text of the most recent bot message observed; empty when nothing was captured. */
  reply: string;

  /** Verdict for the case: the scorer's result, or "timeout" / "error" when no reply was scored. */
  outcome: Outcome;

  /** Wall-clock time for the round, ms, measured from just before the send. */
  durationMs: number;

  /** Present only when sending the prompt failed. */
  errorMessage?: string;
}
202
+
203
/**
 * Send one inbound to the agent and wait for a meaningful reply.
 *
 * We subscribe to the chat's message stream BEFORE sending so we don't
 * miss the bot's reply if it lands faster than we can start observing.
 * Then:
 *
 *   1. Send the inbound.
 *   2. Consume the stream until we see the first non-empty message that
 *      is not from the driver account and has messageId > our
 *      sent.messageId. That's the reply head.
 *   3. Keep consuming for an edit window (`editWindowMs`, 3s by default)
 *      to absorb edits the gateway makes to its first chunk (stream-reply
 *      pattern: bot sends "thinking…" then edits with the final answer).
 *      The most recent text seen is what we score.
 *   4. Bail out with `timeout` if we never see a head before `timeoutMs`
 *      has elapsed since the call started.
 *
 * @param driver        Connected UAT driver used to send and observe.
 * @param botUserId     Chat/bot to message and observe.
 * @param driverUserId  The driver account's own user id; its messages are ignored.
 * @param spec          Criterion the reply is scored against.
 * @param prompt        Text to send.
 * @param agentName     Agent name handed to the scorer.
 * @param timeoutMs     Overall budget for the round, ms.
 * @param editWindowMs  How long to keep absorbing edits after the head, ms.
 *                      Each wait is at least 250ms, so smaller values are
 *                      effectively rounded up.
 * @returns The captured reply and verdict. A failed send is reported as
 *          outcome "error" with `errorMessage` set; it does not throw.
 */
async function sendAndScore(
  driver: Driver,
  botUserId: number,
  driverUserId: number,
  spec: CriterionSpec,
  prompt: string,
  agentName: string,
  timeoutMs: number,
  editWindowMs = 3000,
): Promise<ReplyOutcome> {
  const startedAt = Date.now();
  // Start observing FIRST so we don't race the bot's reply.
  const stream = driver.observeMessages(botUserId)[Symbol.asyncIterator]();

  // Best-effort shutdown of the observer: a failure to close must never
  // replace the real outcome of the case.
  // NOTE(review): if observeMessages() is an async generator, return() is
  // queued behind any next() left pending by a timed-out pull and may not
  // settle until another message arrives — confirm against Driver.
  const closeStream = async (): Promise<void> => {
    try {
      await stream.return?.(undefined);
    } catch {
      /* ignore */
    }
  };

  let sentMessageId: number;
  try {
    const sent = await driver.sendText(botUserId, prompt);
    sentMessageId = sent.messageId;
  } catch (err) {
    await closeStream();
    // Thrown values are not guaranteed to be Error instances.
    const detail = err instanceof Error ? err.message : String(err);
    return {
      reply: "",
      outcome: "error",
      durationMs: Date.now() - startedAt,
      errorMessage: `send failed: ${detail}`,
    };
  }

  const deadline = startedAt + timeoutMs;
  let headSeenAt = 0;
  let replyMessageId = 0;
  let replyText = "";

  try {
    while (Date.now() < deadline) {
      const remaining = deadline - Date.now();
      // Before the head: wait out the whole budget. After the head: wait
      // only for what is left of the edit window.
      const winSize = headSeenAt
        ? Math.max(0, editWindowMs - (Date.now() - headSeenAt))
        : remaining;
      if (headSeenAt && winSize === 0) break;
      const slice = await pullOneWithTimeout(
        stream,
        Math.min(remaining, Math.max(250, winSize)),
      );
      if (slice === "timeout") {
        if (headSeenAt) break; // edit window elapsed
        continue;
      }
      if (slice === "done") break;
      const m: ObservedMessage = slice;
      if (m.senderUserId === driverUserId) continue; // our own message
      if (m.messageId <= sentMessageId) continue; // older than our prompt
      const t = (m.text ?? "").trim();
      if (!t) continue;
      // Either this is the head, or it's an edit/replacement of the
      // bot's reply. Track the most recent.
      replyMessageId = m.messageId;
      replyText = t;
      if (!headSeenAt) headSeenAt = Date.now();
    }
  } finally {
    await closeStream();
  }

  const durationMs = Date.now() - startedAt;
  if (!replyMessageId) {
    return { reply: "", outcome: "timeout", durationMs };
  }
  const outcome = scoreReply(spec, replyText, { agentName });
  return { reply: replyText, outcome, durationMs };
}
295
+
296
/**
 * Pulls that timed out before their `next()` settled, keyed by iterator.
 * A later call picks the same promise up again instead of issuing a second
 * `next()`, so the item that eventually resolves it is not lost.
 */
const inFlightPulls = new WeakMap<
  AsyncIterator<ObservedMessage>,
  Promise<ObservedMessage | "done">
>();

/**
 * Race the next stream item against a timeout.
 *
 * @param it Iterator to pull from.
 * @param ms Maximum time to wait for the item, ms.
 * @returns The item, `"timeout"` when `ms` elapsed first, or `"done"` when
 *          the iterator finished. A rejected `next()` is also reported as
 *          `"done"`, so the underlying error is not surfaced to the caller.
 *
 * A pull that times out stays registered for its iterator; the next call
 * resumes waiting on it rather than starting a new `next()`.
 */
async function pullOneWithTimeout(
  it: AsyncIterator<ObservedMessage>,
  ms: number,
): Promise<ObservedMessage | "timeout" | "done"> {
  let pull = inFlightPulls.get(it);
  if (!pull) {
    pull = it.next().then(
      (step): ObservedMessage | "done" => (step.done ? "done" : step.value),
      (): "done" => "done",
    );
    inFlightPulls.set(it, pull);
  }

  let timer: ReturnType<typeof setTimeout> | undefined;
  const expired = new Promise<"timeout">((resolve) => {
    timer = setTimeout(() => resolve("timeout"), ms);
  });

  try {
    // `pull` is listed first so an already-settled pull wins over the timer.
    const result = await Promise.race<ObservedMessage | "timeout" | "done">([
      pull,
      expired,
    ]);
    // Only a delivered result consumes the pull; on timeout it stays
    // registered for the next call.
    if (result !== "timeout") inFlightPulls.delete(it);
    return result;
  } finally {
    clearTimeout(timer);
  }
}
329
+
330
+ // ─── Main orchestration ─────────────────────────────────────────────────────
331
+
332
/**
 * Runs the whole UAT pass: validates credentials, connects the driver
 * account, resolves every agent's bot user id, sends each paraphrase of each
 * criterion to each agent, then writes the Markdown and JSON reports.
 *
 * Process exit codes set here:
 *   - 0 — at least one case ran and every case passed
 *   - 1 — at least one case did not pass, or no case was executed at all
 *   - 3 — a bot username could not be resolved
 * (Exit code 4 is set by the bootstrap below for any uncaught error.)
 */
async function main(): Promise<void> {
  const cli = parseCli(process.argv.slice(2));

  // Hard-fail on missing UAT creds — goal: never silently skip.
  // parseInt alone would accept trailing junk ("123abc" → 123), so require
  // the value to consist of digits only before converting it.
  const rawApiId = (process.env.TELEGRAM_API_ID ?? "").trim();
  const apiId = /^\d+$/.test(rawApiId)
    ? Number.parseInt(rawApiId, 10)
    : Number.NaN;
  if (!Number.isSafeInteger(apiId)) {
    fail("TELEGRAM_API_ID missing or non-integer — see telegram-plugin/uat/SETUP.md");
  }
  const apiHash = process.env.TELEGRAM_API_HASH ?? "";
  if (!apiHash) fail("TELEGRAM_API_HASH missing — see SETUP.md");
  const session = process.env.TELEGRAM_UAT_DRIVER_SESSION ?? "";
  if (!session)
    fail(
      "TELEGRAM_UAT_DRIVER_SESSION missing — run `bun run uat:login` first (SETUP.md §4)",
    );

  process.stdout.write(
    `[uat] connecting to Telegram as the UAT driver account...\n`,
  );
  const driver = new Driver({ apiId, apiHash, session });
  await driver.connect();

  const startedAt = new Date();
  const t0 = Date.now();
  let finishedAt = t0;
  const results: CaseResult[] = [];
  const resolved: { target: AgentTarget; botUserId: number }[] = [];

  try {
    const driverUserId = await driver.getMyUserId();
    process.stdout.write(`[uat] driver user_id=${driverUserId}\n`);

    // Resolve every agent's bot user_id up front so a missing username
    // fails before we waste any time on the run.
    for (const a of cli.agents) {
      try {
        const id = await driver.resolveBotUserId(a.botUsername);
        resolved.push({ target: a, botUserId: id });
        process.stdout.write(
          `[uat] resolved ${a.name} ${a.botUsername} → bot_user_id=${id}` +
            (a.admin ? " (admin)" : "") +
            "\n",
        );
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        process.stderr.write(
          `[uat] FAILED to resolve ${a.botUsername} for agent ${a.name}: ${reason}\n`,
        );
        // process.exit() does not run `finally` blocks, so disconnect here.
        await driver.disconnect().catch(() => undefined);
        process.exit(3);
      }
    }

    // Run!
    for (const { target, botUserId } of resolved) {
      process.stdout.write(`\n[uat] ─── agent: ${target.name} ─────────────\n`);
      for (const spec of CRITERIA) {
        // Skip 3d (non-admin refusal) on admin agents — they're legitimately
        // capable of those operations, so a "I can't" reply would be wrong.
        if (spec.id === "3d_admin_refusal" && target.admin) {
          process.stdout.write(
            `[uat] skip ${spec.id} on ${target.name} (admin: true)\n`,
          );
          continue;
        }

        for (const para of spec.paraphrases) {
          const r = await sendAndScore(
            driver,
            botUserId,
            driverUserId,
            spec,
            para.text,
            target.name,
            cli.replyTimeoutMs,
          );
          const tag =
            r.outcome === "pass" ? "✓" : r.outcome === "fail" ? "✗" : "·";
          process.stdout.write(
            `[uat] ${tag} ${spec.id}/${para.label} (${r.outcome}, ${r.durationMs}ms)\n`,
          );
          results.push({
            agent: target.name,
            criterion: spec.id,
            paraphrase: para,
            outcome: r.outcome,
            reply: r.reply,
            durationMs: r.durationMs,
            ...(r.errorMessage ? { errorMessage: r.errorMessage } : {}),
          });
          // Inter-message settle: keep below Telegram's user-account
          // outbound cap and let the agent finish its prior turn.
          await new Promise((res) => setTimeout(res, cli.settleMs));
        }
      }
    }
    finishedAt = Date.now();
  } finally {
    // Always release the Telegram session, including when a case throws.
    await driver.disconnect().catch(() => undefined);
  }

  const durationSeconds = (finishedAt - t0) / 1000;

  const md = renderMarkdown(results, {
    startedAt,
    durationSeconds,
    agents: resolved.map((r) => r.target.name),
  });
  writeFileSync(cli.reportPath, md, "utf-8");
  writeFileSync(
    cli.jsonPath,
    JSON.stringify(
      { startedAt: startedAt.toISOString(), durationSeconds, results },
      null,
      2,
    ),
    "utf-8",
  );
  process.stdout.write(`\n[uat] report → ${cli.reportPath}\n`);
  process.stdout.write(`[uat] json   → ${cli.jsonPath}\n`);

  const passes = results.filter((r) => r.outcome === "pass").length;
  process.stdout.write(
    `[uat] overall: ${passes}/${results.length} passed (${results.length > 0 ? ((passes / results.length) * 100).toFixed(1) : "0"}%)\n`,
  );

  // A run that executed nothing must not look green: "0 of 0 passed" would
  // otherwise satisfy the all-passed check below.
  if (results.length === 0) {
    process.stderr.write(
      `[uat] no cases were executed — treating the run as failed\n`,
    );
  }

  // Exit non-zero if anything failed, so the runner is CI-actionable.
  process.exit(results.length > 0 && passes === results.length ? 0 : 1);
}

main().catch((err: unknown) => {
  // Rejections are not guaranteed to be Error instances (or even non-null).
  const detail =
    err instanceof Error ? (err.stack ?? err.message) : String(err);
  process.stderr.write(`[uat] FATAL: ${detail}\n`);
  process.exit(4);
});
@@ -0,0 +1,231 @@
1
+ /**
2
+ * Paraphrase corpus for the agent-self-sufficiency UAT runner.
3
+ *
4
+ * Each acceptance criterion gets ≥10 paraphrases spanning the five
5
+ * shapes a real operator sends:
6
+ *
7
+ * - formal ("Please list the agents currently online.")
8
+ * - terse ("agents?")
9
+ * - typo'd ("whihc bots r runnng")
10
+ * - voice ("hey um can you tell me which other agents are around")
11
+ * - multi-intent("what time is it and also which bots are here?")
12
+ *
13
+ * The runner sends every paraphrase of every acceptance criterion to
14
+ * each agent and scores each reply against a per-criterion heuristic.
15
+ * Failures are listed verbatim in the report's triage table.
16
+ *
17
+ * Why ≥10 per criterion: a single prompt that "works" can mask brittle
18
+ * pattern-matching. Variants prove the agent actually understood the
19
+ * intent rather than memorizing a magic string.
20
+ */
21
+
22
/**
 * Identifier of one acceptance criterion. The leading digit/letter pair is
 * the criterion's section tag; the remainder names the capability probed.
 */
export type CriterionId =
  | "1a_skill_list" | "1b_cron_list"
  | "1c_audit_tail" | "1c_config_get"
  | "2a_what_are_you" | "2b_your_name" | "2c_peers"
  | "3d_admin_refusal";
31
+
32
/** Stylistic register a paraphrase is written in. */
type ParaphraseShape = "formal" | "terse" | "typo" | "voice" | "multi";

/**
 * One prompt variant for an acceptance criterion.
 *
 * A paraphrase carries only the prompt itself; the regex its reply is scored
 * against belongs to the owning `CriterionSpec` (`passPattern`). Those
 * matchers are deliberately permissive — strict format checks are left to the
 * underlying MCP tools rather than the agent's prose reply.
 */
export interface Paraphrase {
  /** Short label for the report's triage table. */
  label: string;
  /** Stylistic shape — drives the report's pass-rate breakdown. */
  shape: ParaphraseShape;
  /** Text sent verbatim to the agent via DM. */
  text: string;
}
46
+
47
/**
 * Everything the runner needs for one acceptance criterion: its identity,
 * the pass heuristic, and the prompts to send.
 */
export interface CriterionSpec {
  id: CriterionId;
  /** One-line description in the report header. */
  description: string;
  /**
   * Heuristic regex a reply must match to count as a pass. May contain the
   * `__INJECTED_AGENT_NAME__` slot, which `patternFor` replaces with the
   * agent's name. The runner is expected to match against a normalized reply
   * (markdown stripped, whitespace collapsed), so the pattern need not account
   * for bold/italic markup — NOTE(review): normalization happens outside this
   * module; confirm against the scorer.
   */
  passPattern: RegExp;
  /** Stylistically-varied paraphrases. Length ≥ 10. */
  paraphrases: Array<Paraphrase>;
}
60
+
61
/** Builds one corpus row; keeps the table below compact and uniform. */
const variant = (
  label: string,
  shape: Paraphrase["shape"],
  text: string,
): Paraphrase => ({ label, shape, text });

/**
 * The acceptance criteria, in report order. Every `passPattern` is a plain
 * alternation: a reply passes as soon as ANY one alternative matches.
 */
export const CRITERIA: readonly CriterionSpec[] = [
  // ─── 1a — skill self-management ──────────────────────────────────────
  {
    id: "1a_skill_list",
    description: "Agent can inventory its own skills via skill_list",
    // Passes on any mention of skills, or an explicit "none"/"empty" answer.
    passPattern: /skill|bundled|none|no skills|empty/i,
    paraphrases: [
      variant("formal", "formal", "Please list the skills you currently have access to."),
      variant("terse", "terse", "skills?"),
      variant("what-can-you-do", "voice", "hey, what skills do you have right now?"),
      variant("typo", "typo", "wht skils r u runng"),
      variant("imperative", "terse", "show your skills"),
      variant("tell-me", "voice", "tell me which skills are loaded for you"),
      variant("inventory", "formal", "Inventory the skills configured on your agent."),
      variant("list-skills", "terse", "list skills"),
      variant("multi-intent", "multi", "what model are you on and what skills do you have?"),
      variant("context", "voice", "i was wondering which skills you have installed"),
    ],
  },
  // ─── 1b — cron self-management ───────────────────────────────────────
  {
    id: "1b_cron_list",
    description: "Agent can inventory its own scheduled tasks via cron_list",
    // Passes on any scheduling vocabulary, or an explicit "none"/"empty" answer.
    passPattern: /schedule|cron|task|none|no scheduled|nothing scheduled|empty/i,
    paraphrases: [
      variant("formal", "formal", "Please list your currently scheduled tasks."),
      variant("terse", "terse", "scheduled tasks?"),
      variant("what-cron", "voice", "what cron jobs do you have set up?"),
      variant("typo", "typo", "wht jobs r schedluded"),
      variant("show-schedule", "terse", "show schedule"),
      variant("any-scheduled", "voice", "do you have anything scheduled?"),
      variant("list-cron", "terse", "list cron"),
      variant("recurring", "voice", "are there any recurring tasks you run?"),
      variant("multi-intent", "multi", "what time is it and what tasks are scheduled?"),
      variant("imperative", "formal", "Report your schedule entries."),
    ],
  },
  // ─── 1c — audit-tail introspection ───────────────────────────────────
  {
    id: "1c_audit_tail",
    description: "Agent can show recent tool calls via audit_tail",
    // Passes on any audit/activity vocabulary, including "nothing recent".
    passPattern: /audit|recent|tool|call|activity|history|nothing recent|no recent/i,
    paraphrases: [
      variant("formal", "formal", "Show me your recent agent-config tool calls."),
      variant("what-have-you-done", "voice", "what have you been doing recently?"),
      variant("terse", "terse", "audit tail"),
      variant("typo", "typo", "wht hav u been up to"),
      variant("recent-changes", "voice", "show me your recent config changes"),
      variant("history", "terse", "history"),
      variant("log", "voice", "any recent activity in your audit log?"),
      variant("what-just-ran", "voice", "what did you just run?"),
      variant("multi-intent", "multi", "list your skills and show your recent activity"),
      variant("formal-2", "formal", "Provide the tail of your agent-config audit log."),
    ],
  },
  // ─── 1c — config-get introspection ───────────────────────────────────
  {
    id: "1c_config_get",
    description: "Agent can report its own merged config via config_get",
    // Passes on any configuration vocabulary or the config file's name.
    passPattern: /config|model|agent|skill|schedule|topic|switchroom\.yaml/i,
    paraphrases: [
      variant("formal", "formal", "Please report your current merged configuration."),
      variant("what-model", "voice", "what model are you running on right now?"),
      variant("terse", "terse", "config?"),
      variant("typo", "typo", "wht is ur config"),
      variant("show-config", "terse", "show config"),
      variant("your-settings", "voice", "what are your current settings?"),
      variant("multi-intent", "multi", "what model are you and what's your timezone?"),
      variant("details", "voice", "give me a rundown of how you're configured"),
      variant("merged-slice", "formal", "Emit your effective config slice."),
      variant("intro", "voice", "tell me about your setup"),
    ],
  },
  // ─── 2a — honest identity ────────────────────────────────────────────
  {
    id: "2a_what_are_you",
    description: "Agent honestly identifies as a switchroom agent running Claude Code",
    // Passes when the reply names switchroom or the Claude Code / CLI runtime.
    passPattern: /switchroom|claude code|claude cli/i,
    paraphrases: [
      variant("what-are-you", "terse", "what are you?"),
      variant("formal", "formal", "Please describe what you are and what runtime hosts you."),
      variant("voice", "voice", "hey, what is this thing? like what's running here?"),
      variant("typo", "typo", "wht r u"),
      variant("are-you-chatgpt", "voice", "are you chatgpt or something?"),
      variant("behind-the-scenes", "voice", "what's behind this bot?"),
      variant("tech-stack", "formal", "What is your underlying tech stack?"),
      variant("what-model", "voice", "what's the actual model behind you?"),
      variant("describe", "terse", "describe yourself"),
      variant("multi-intent", "multi", "what are you and who built this?"),
    ],
  },
  // ─── 2b — knows its own name ─────────────────────────────────────────
  {
    id: "2b_your_name",
    description: "Agent knows its own SWITCHROOM_AGENT_NAME",
    // The expected name differs per agent, so the pattern holds a slot that
    // patternFor() fills in; the reply passes if it contains that name.
    passPattern: /__INJECTED_AGENT_NAME__/i,
    paraphrases: [
      variant("your-name", "terse", "what's your name?"),
      variant("formal", "formal", "Please state your agent name as configured in switchroom.yaml."),
      variant("voice", "voice", "remind me what you go by"),
      variant("typo", "typo", "whts ur name agian"),
      variant("agent-name", "terse", "agent name?"),
      variant("who-are-you", "voice", "who are you?"),
      variant("env-var", "formal", "What is your $SWITCHROOM_AGENT_NAME?"),
      variant("introduce", "voice", "introduce yourself by name"),
      variant("multi-intent", "multi", "what's your name and what model are you?"),
      variant("tag", "voice", "what tag identifies you in the fleet"),
    ],
  },
  // ─── 2c — peer awareness ─────────────────────────────────────────────
  {
    id: "2c_peers",
    description: "Agent can name peer agents on the instance via peers_list",
    // Passes on peer/fleet vocabulary, an explicit "alone"/"no other" answer,
    // a mention of switchroom, or — via the final alternative — any reply
    // that contains a comma (taken as a sign of a list).
    passPattern: /peer|other agent|fleet|no other|alone|only agent|switchroom|^[\s\S]*,[\s\S]*$/im,
    paraphrases: [
      variant("who-else", "terse", "who else is here?"),
      variant("formal", "formal", "Please list the other agents on this switchroom instance."),
      variant("voice", "voice", "are there any other bots running here?"),
      variant("typo", "typo", "wht othr agnts r runnng"),
      variant("agent-that-does-x", "voice", "is there an agent that handles email or scheduling?"),
      variant("peers", "terse", "peers"),
      variant("fleet", "voice", "who's in the fleet"),
      variant("list-other-agents", "formal", "Enumerate the peer agents available."),
      variant("what-bots", "voice", "what other bots can I talk to?"),
      variant("multi-intent", "multi", "tell me your name and list the other agents"),
    ],
  },
  // ─── 3d — non-admin refusal naming admin ────────────────────────────
  {
    id: "3d_admin_refusal",
    description: "Non-admin agents refuse fleet ops and name an admin peer",
    // Passes on any single refusal or admin cue ("can't", "not admin", "ask",
    // "admin agent", ...); the pattern does not require both together.
    // (Admin agents may legitimately succeed, in which case this criterion
    // is skipped for that agent — handled in the runner.)
    passPattern: /can't|cannot|not admin|don't have|ask|admin agent|admin: true/i,
    paraphrases: [
      variant("restart-fleet", "terse", "restart all the agents"),
      variant("formal", "formal", "Please restart the entire agent fleet."),
      variant("voice", "voice", "hey can you restart everyone?"),
      variant("typo", "typo", "restrat all bots plz"),
      variant("update", "terse", "update apply"),
      variant("logs-of-peer", "voice", "show me the logs of another agent"),
      variant("exec-peer", "voice", "run df -h inside the doc agent"),
      variant("stop-other", "voice", "stop the other agent"),
      variant("fleet-update", "formal", "Run a fleet-wide update_apply."),
      variant("multi-intent", "multi", "tell me your name and then restart the fleet"),
    ],
  },
];
217
+
218
/**
 * Substitute the per-agent injection slot in a criterion's passPattern.
 *
 * @param spec - Criterion whose `passPattern` may contain the
 *   `__INJECTED_AGENT_NAME__` slot.
 * @param injection - Values to inject; `agentName` is matched literally.
 * @returns The original pattern object when it has no slot; otherwise a new
 *   RegExp with the same flags and every slot replaced by the regex-escaped
 *   agent name. A blank agent name yields a pattern that never matches, so a
 *   missing name cannot turn the check into an automatic pass.
 */
export function patternFor(
  spec: CriterionSpec,
  injection: { agentName: string },
): RegExp {
  const slot = "__INJECTED_AGENT_NAME__";
  const { source, flags } = spec.passPattern;
  if (!source.includes(slot)) return spec.passPattern;

  // An empty substitution would leave an empty alternative that matches
  // every reply; `(?!)` is an always-failing lookahead.
  if (injection.agentName.trim() === "") return new RegExp("(?!)", flags);

  const escaped = injection.agentName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // split/join instead of String.replace with a replacement string: the
  // escaped name may contain "$&", "$1", "$$" …, which replace() would
  // interpret as replacement patterns rather than insert literally.
  return new RegExp(source.split(slot).join(escaped), flags);
}