switchroom 0.7.15 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. package/README.md +51 -59
  2. package/bin/run-hook.sh +27 -11
  3. package/bin/timezone-hook.sh +9 -7
  4. package/dist/agent-scheduler/index.js +410 -133
  5. package/dist/auth-broker/index.js +13932 -0
  6. package/dist/cli/switchroom.js +26937 -5601
  7. package/dist/host-control/main.js +12702 -0
  8. package/dist/vault/approvals/kernel-server.js +467 -184
  9. package/dist/vault/broker/server.js +1430 -724
  10. package/examples/minimal.yaml +63 -0
  11. package/examples/personal-google-workspace-mcp/.env.example +34 -0
  12. package/examples/personal-google-workspace-mcp/README.md +194 -0
  13. package/examples/personal-google-workspace-mcp/compose.yaml +66 -0
  14. package/examples/switchroom.yaml +220 -0
  15. package/package.json +7 -4
  16. package/profiles/_base/settings.json.hbs +20 -5
  17. package/profiles/_base/start.sh.hbs +16 -3
  18. package/profiles/_shared/agent-self-service.md.hbs +126 -0
  19. package/profiles/_shared/telegram-style.md.hbs +20 -90
  20. package/profiles/_shared/vault-protocol.md.hbs +68 -0
  21. package/profiles/default/CLAUDE.md +50 -96
  22. package/profiles/default/CLAUDE.md.hbs +36 -6
  23. package/profiles/default/workspace/SOUL.md.hbs +12 -5
  24. package/skills/buildkite-agent-infrastructure/SKILL.md +30 -11
  25. package/skills/buildkite-agent-runtime/SKILL.md +44 -11
  26. package/skills/buildkite-api/SKILL.md +31 -8
  27. package/skills/buildkite-cli/SKILL.md +27 -9
  28. package/skills/buildkite-migration/SKILL.md +22 -9
  29. package/skills/buildkite-pipelines/SKILL.md +26 -9
  30. package/skills/buildkite-secure-delivery/SKILL.md +23 -9
  31. package/skills/buildkite-test-engine/SKILL.md +25 -8
  32. package/skills/docx/SKILL.md +1 -1
  33. package/skills/docx/scripts/office/validators/__pycache__/__init__.cpython-313.pyc +0 -0
  34. package/skills/docx/scripts/office/validators/__pycache__/base.cpython-313.pyc +0 -0
  35. package/skills/file-bug/SKILL.md +34 -6
  36. package/skills/humanizer/SKILL.md +15 -0
  37. package/skills/humanizer-calibrate/SKILL.md +7 -1
  38. package/skills/mcp-builder/SKILL.md +1 -1
  39. package/skills/pdf/SKILL.md +1 -1
  40. package/skills/pptx/SKILL.md +1 -1
  41. package/skills/skill-creator/SKILL.md +21 -1
  42. package/skills/skill-creator/scripts/__pycache__/__init__.cpython-313.pyc +0 -0
  43. package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-313.pyc +0 -0
  44. package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-313.pyc +0 -0
  45. package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-313.pyc +0 -0
  46. package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-313.pyc +0 -0
  47. package/skills/skill-creator/scripts/__pycache__/utils.cpython-313.pyc +0 -0
  48. package/skills/switchroom-cli/SKILL.md +63 -64
  49. package/skills/switchroom-health/SKILL.md +23 -10
  50. package/skills/switchroom-install/SKILL.md +3 -3
  51. package/skills/switchroom-manage/SKILL.md +26 -19
  52. package/skills/switchroom-runtime/SKILL.md +191 -0
  53. package/skills/switchroom-status/SKILL.md +27 -2
  54. package/skills/telegram-test-harness/SKILL.md +3 -0
  55. package/skills/token-helpers/SKILL.md +24 -1
  56. package/skills/webapp-testing/SKILL.md +31 -1
  57. package/skills/xlsx/SKILL.md +1 -1
  58. package/telegram-plugin/admin-commands/index.ts +7 -5
  59. package/telegram-plugin/analytics-posthog.ts +191 -0
  60. package/telegram-plugin/bridge/bridge.ts +69 -0
  61. package/telegram-plugin/bridge/ipc-client.ts +4 -1
  62. package/telegram-plugin/dist/bridge/bridge.js +194 -119
  63. package/telegram-plugin/dist/gateway/gateway.js +23611 -19671
  64. package/telegram-plugin/dist/server.js +245 -189
  65. package/telegram-plugin/first-paint.ts +3 -24
  66. package/telegram-plugin/gateway/auth-add-flow.ts +326 -0
  67. package/telegram-plugin/gateway/auth-broker-client.ts +75 -0
  68. package/telegram-plugin/gateway/auth-command.ts +794 -0
  69. package/telegram-plugin/gateway/auth-line.ts +123 -0
  70. package/telegram-plugin/gateway/boot-card.ts +169 -40
  71. package/telegram-plugin/gateway/boot-issue-cache.ts +308 -0
  72. package/telegram-plugin/gateway/boot-probes.ts +166 -123
  73. package/telegram-plugin/gateway/boot-reason.ts +41 -7
  74. package/telegram-plugin/gateway/boot-version.ts +66 -0
  75. package/telegram-plugin/gateway/gateway.ts +3499 -1885
  76. package/telegram-plugin/gateway/hostd-dispatch.ts +117 -0
  77. package/telegram-plugin/gateway/ipc-protocol.ts +18 -0
  78. package/telegram-plugin/gateway/pending-inbound-buffer.ts +106 -0
  79. package/telegram-plugin/gateway/quarantine.ts +69 -0
  80. package/telegram-plugin/gateway/quota-cache.ts +9 -4
  81. package/telegram-plugin/gateway/reaction-trigger.ts +401 -0
  82. package/telegram-plugin/gateway/recent-denials.test.ts +103 -0
  83. package/telegram-plugin/gateway/recent-denials.ts +77 -0
  84. package/telegram-plugin/gateway/startup-network-retry.ts +109 -31
  85. package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +125 -0
  86. package/telegram-plugin/history.ts +91 -0
  87. package/telegram-plugin/hooks/hooks.json +10 -0
  88. package/telegram-plugin/hooks/sandbox-hint-posttool.mjs +130 -0
  89. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +19 -2
  90. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +22 -2
  91. package/telegram-plugin/hooks/tool-label-pretool.mjs +11 -0
  92. package/telegram-plugin/hooks/wedge-detect-posttool.mjs +303 -0
  93. package/telegram-plugin/inbound-classifier.ts +50 -0
  94. package/telegram-plugin/inline-keyboard-callbacks.ts +136 -0
  95. package/telegram-plugin/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
  96. package/telegram-plugin/package.json +4 -2
  97. package/telegram-plugin/permission-rule.ts +51 -0
  98. package/telegram-plugin/permission-title.ts +56 -0
  99. package/telegram-plugin/quota-check.ts +19 -41
  100. package/telegram-plugin/registry/reaper.ts +223 -0
  101. package/telegram-plugin/retry-api-call.ts +80 -0
  102. package/telegram-plugin/runtime-metrics.ts +177 -0
  103. package/telegram-plugin/scripts/build.mjs +0 -1
  104. package/telegram-plugin/secret-detect/index.ts +24 -0
  105. package/telegram-plugin/secret-detect/vault-error.test.ts +64 -12
  106. package/telegram-plugin/secret-detect/vault-error.ts +78 -11
  107. package/telegram-plugin/secret-detect/vault-write.ts +14 -2
  108. package/telegram-plugin/server.js +41795 -0
  109. package/telegram-plugin/session-tail.ts +6 -1
  110. package/telegram-plugin/shared/bot-runtime.ts +5 -4
  111. package/telegram-plugin/silence-poke.ts +420 -0
  112. package/telegram-plugin/silent-end.ts +174 -0
  113. package/telegram-plugin/stream-controller.ts +13 -0
  114. package/telegram-plugin/stream-reply-handler.ts +7 -0
  115. package/telegram-plugin/subagent-watcher.ts +213 -4
  116. package/telegram-plugin/tests/auth-add-flow.test.ts +559 -0
  117. package/telegram-plugin/tests/auth-code-redact.test.ts +8 -4
  118. package/telegram-plugin/tests/auth-command-vernacular.test.ts +531 -0
  119. package/telegram-plugin/tests/boot-card-issue-dedup.test.ts +247 -0
  120. package/telegram-plugin/tests/boot-card-reason-to-render.test.ts +182 -0
  121. package/telegram-plugin/tests/boot-card-reason.test.ts +65 -2
  122. package/telegram-plugin/tests/boot-card-render.test.ts +146 -0
  123. package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +103 -0
  124. package/telegram-plugin/tests/boot-probes.test.ts +216 -10
  125. package/telegram-plugin/tests/boot-version-string.test.ts +0 -0
  126. package/telegram-plugin/tests/finalize-callback.test.ts +190 -0
  127. package/telegram-plugin/tests/gateway-message-validator.test.ts +26 -0
  128. package/telegram-plugin/tests/gateway-secret-detect.test.ts +12 -3
  129. package/telegram-plugin/tests/gateway-startup-network-retry.test.ts +104 -0
  130. package/telegram-plugin/tests/history-reaper.test.ts +378 -0
  131. package/telegram-plugin/tests/hostd-dispatch.test.ts +129 -0
  132. package/telegram-plugin/tests/inbound-classifier.test.ts +76 -0
  133. package/telegram-plugin/tests/inbound-message-types.test.ts +267 -0
  134. package/telegram-plugin/tests/issues-card.test.ts +49 -0
  135. package/telegram-plugin/tests/pending-inbound-buffer.test.ts +132 -0
  136. package/telegram-plugin/tests/permission-rule.test.ts +80 -1
  137. package/telegram-plugin/tests/permission-title.test.ts +31 -0
  138. package/telegram-plugin/tests/quota-check.test.ts +5 -35
  139. package/telegram-plugin/tests/races.test.ts +179 -0
  140. package/telegram-plugin/tests/reaction-trigger-flow.test.ts +353 -0
  141. package/telegram-plugin/tests/reaction-trigger.test.ts +397 -0
  142. package/telegram-plugin/tests/retry-api-call.test.ts +152 -1
  143. package/telegram-plugin/tests/runtime-metrics.test.ts +145 -0
  144. package/telegram-plugin/tests/sandbox-hint-posttool.test.ts +155 -0
  145. package/telegram-plugin/tests/secret-detect-delete-must-surface-failures.test.ts +133 -0
  146. package/telegram-plugin/tests/secret-detect-false-positives.test.ts +137 -0
  147. package/telegram-plugin/tests/silence-poke.test.ts +493 -0
  148. package/telegram-plugin/tests/silent-end.test.ts +206 -0
  149. package/telegram-plugin/tests/subagent-tracker-hooks.test.ts +107 -0
  150. package/telegram-plugin/tests/subagent-watcher-env-thresholds.test.ts +224 -0
  151. package/telegram-plugin/tests/subagent-watcher-stall-terminal.test.ts +316 -0
  152. package/telegram-plugin/tests/subagent-watcher.test.ts +263 -0
  153. package/telegram-plugin/tests/turn-signal-tracker.test.ts +81 -0
  154. package/telegram-plugin/tests/vault-approval-posture.test.ts +256 -0
  155. package/telegram-plugin/tests/vault-grant-auto-resume.test.ts +73 -0
  156. package/telegram-plugin/tests/vault-grant-inbound-builders.test.ts +226 -0
  157. package/telegram-plugin/tests/vault-grant-union.test.ts +130 -0
  158. package/telegram-plugin/tests/vault-key-regex-allows-slash.test.ts +140 -0
  159. package/telegram-plugin/tests/vault-posture-quarantine.test.ts +104 -0
  160. package/telegram-plugin/tests/vault-request-access-tool.test.ts +114 -0
  161. package/telegram-plugin/tests/vault-request-access-unlock-resume.test.ts +106 -0
  162. package/telegram-plugin/turn-signal-tracker.ts +100 -24
  163. package/telegram-plugin/uat/SETUP.md +210 -35
  164. package/telegram-plugin/uat/assertions.ts +264 -37
  165. package/telegram-plugin/uat/driver-info.ts +57 -0
  166. package/telegram-plugin/uat/driver.ts +590 -51
  167. package/telegram-plugin/uat/harness.ts +140 -94
  168. package/telegram-plugin/uat/load-env.test.ts +72 -0
  169. package/telegram-plugin/uat/load-env.ts +48 -0
  170. package/telegram-plugin/uat/login.ts +96 -53
  171. package/telegram-plugin/uat/runners/agent-self-sufficiency.ts +457 -0
  172. package/telegram-plugin/uat/runners/paraphrases.ts +231 -0
  173. package/telegram-plugin/uat/runners/report.ts +150 -0
  174. package/telegram-plugin/uat/runners/run-agent-self-sufficiency.sh +50 -0
  175. package/telegram-plugin/uat/runners/scorer.test.ts +196 -0
  176. package/telegram-plugin/uat/runners/scorer.ts +106 -0
  177. package/telegram-plugin/uat/runners/skill-coverage.test.ts +100 -0
  178. package/telegram-plugin/uat/runners/skill-coverage.ts +620 -0
  179. package/telegram-plugin/uat/scenarios/ask-user-button-tap-dm.test.ts +141 -0
  180. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +191 -0
  181. package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +255 -0
  182. package/telegram-plugin/uat/scenarios/fuzz-human-style-dm.test.ts +275 -0
  183. package/telegram-plugin/uat/scenarios/fuzz-random-prompts-dm.test.ts +146 -0
  184. package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +486 -0
  185. package/telegram-plugin/uat/scenarios/jtbd-interrupt-marker-dm.test.ts +67 -0
  186. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +100 -0
  187. package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +67 -0
  188. package/telegram-plugin/uat/scenarios/jtbd-status-query-dm.test.ts +49 -0
  189. package/telegram-plugin/uat/scenarios/location-inbound-dm.test.ts +65 -0
  190. package/telegram-plugin/uat/scenarios/midturn-silent-dm.test.ts +175 -0
  191. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +142 -0
  192. package/telegram-plugin/uat/scenarios/reactions-trigger-turn-dm.test.ts +96 -0
  193. package/telegram-plugin/uat/scenarios/secret-redaction-deletes-original-dm.test.ts +123 -0
  194. package/telegram-plugin/uat/scenarios/secret-redaction-no-false-positive-dm.test.ts +87 -0
  195. package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +155 -0
  196. package/telegram-plugin/uat/scenarios/silent-end-recovery-dm.test.ts +95 -0
  197. package/telegram-plugin/uat/scenarios/smoke-dm-reply.test.ts +57 -0
  198. package/telegram-plugin/uat/scenarios/subagent-watcher-no-rerun-dm.test.ts +135 -0
  199. package/telegram-plugin/uat/scenarios/vault-approval-posture-telegram-id-dm.test.ts +191 -0
  200. package/telegram-plugin/uat/scenarios/vault-audit-allow-dm.test.ts +108 -0
  201. package/telegram-plugin/uat/scenarios/vault-grant-auto-resume-dm.test.ts +121 -0
  202. package/telegram-plugin/uat/scenarios/vault-request-access-concurrent-dm.test.ts +161 -0
  203. package/telegram-plugin/uat/scenarios/vault-request-access-end-to-end-dm.test.ts +158 -0
  204. package/telegram-plugin/uat/scenarios/voice-inbound-dm.test.ts +65 -0
  205. package/telegram-plugin/vault-approval-posture.ts +42 -0
  206. package/telegram-plugin/welcome-text.ts +1 -0
  207. package/telegram-plugin/active-pins-sweep.ts +0 -204
  208. package/telegram-plugin/active-pins.ts +0 -146
  209. package/telegram-plugin/auth-dashboard.ts +0 -1104
  210. package/telegram-plugin/auth-slot-parser.ts +0 -497
  211. package/telegram-plugin/card-event-log.ts +0 -138
  212. package/telegram-plugin/dist/foreman/foreman.js +0 -31106
  213. package/telegram-plugin/docs/multi-agent-card-design.md +0 -847
  214. package/telegram-plugin/docs/pinned-progress-card-reliability.md +0 -144
  215. package/telegram-plugin/foreman/foreman-create-flow.ts +0 -202
  216. package/telegram-plugin/foreman/foreman-handlers.ts +0 -493
  217. package/telegram-plugin/foreman/foreman.ts +0 -1165
  218. package/telegram-plugin/foreman/setup-flow.ts +0 -345
  219. package/telegram-plugin/foreman/setup-state.ts +0 -239
  220. package/telegram-plugin/foreman/state.ts +0 -203
  221. package/telegram-plugin/pin-event-log.ts +0 -76
  222. package/telegram-plugin/progress-card-driver.ts +0 -2886
  223. package/telegram-plugin/progress-card-pin-manager.ts +0 -589
  224. package/telegram-plugin/progress-card-pin-watchdog.ts +0 -98
  225. package/telegram-plugin/progress-card.ts +0 -1409
  226. package/telegram-plugin/tests/HARNESS.md +0 -340
  227. package/telegram-plugin/tests/_progress-card-harness.ts +0 -109
  228. package/telegram-plugin/tests/active-pins-boot-reaper.test.ts +0 -211
  229. package/telegram-plugin/tests/active-pins-sweep.test.ts +0 -309
  230. package/telegram-plugin/tests/active-pins.test.ts +0 -187
  231. package/telegram-plugin/tests/auth-account-identity-surface.test.ts +0 -118
  232. package/telegram-plugin/tests/auth-dashboard-edge-cases.test.ts +0 -260
  233. package/telegram-plugin/tests/auth-dashboard-restart-flow.test.ts +0 -140
  234. package/telegram-plugin/tests/auth-dashboard-v3b.test.ts +0 -559
  235. package/telegram-plugin/tests/auth-dashboard.test.ts +0 -1045
  236. package/telegram-plugin/tests/auth-slot-commands.test.ts +0 -640
  237. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +0 -201
  238. package/telegram-plugin/tests/boot-card-account-quota.test.ts +0 -137
  239. package/telegram-plugin/tests/card-event-log.test.ts +0 -145
  240. package/telegram-plugin/tests/first-paint.test.ts +0 -257
  241. package/telegram-plugin/tests/foreman-create-flow.test.ts +0 -359
  242. package/telegram-plugin/tests/foreman-handlers.test.ts +0 -347
  243. package/telegram-plugin/tests/foreman-state.test.ts +0 -164
  244. package/telegram-plugin/tests/foreman-write-ops.test.ts +0 -214
  245. package/telegram-plugin/tests/harness-ordering-invariants.test.ts +0 -243
  246. package/telegram-plugin/tests/pin-event-log.test.ts +0 -124
  247. package/telegram-plugin/tests/progress-card-api-failure-during-deferred.test.ts +0 -73
  248. package/telegram-plugin/tests/progress-card-close-paths-converge.test.ts +0 -272
  249. package/telegram-plugin/tests/progress-card-cross-turn.test.ts +0 -258
  250. package/telegram-plugin/tests/progress-card-delay-842.test.ts +0 -160
  251. package/telegram-plugin/tests/progress-card-dispose-preservepending.test.ts +0 -81
  252. package/telegram-plugin/tests/progress-card-draft-flag.test.ts +0 -80
  253. package/telegram-plugin/tests/progress-card-driver-eviction.test.ts +0 -215
  254. package/telegram-plugin/tests/progress-card-driver-fleet-shadow.test.ts +0 -123
  255. package/telegram-plugin/tests/progress-card-driver-force-complete-parent-done.test.ts +0 -76
  256. package/telegram-plugin/tests/progress-card-edit-timestamps-budget.test.ts +0 -62
  257. package/telegram-plugin/tests/progress-card-memory-bounds.test.ts +0 -84
  258. package/telegram-plugin/tests/progress-card-pin-failure-paths.test.ts +0 -139
  259. package/telegram-plugin/tests/progress-card-pin-manager.test.ts +0 -773
  260. package/telegram-plugin/tests/progress-card-pin-race-fast-turn.test.ts +0 -66
  261. package/telegram-plugin/tests/progress-card-pin-sidecar-partial-write.test.ts +0 -64
  262. package/telegram-plugin/tests/progress-card-pin-watchdog.test.ts +0 -190
  263. package/telegram-plugin/tests/progress-card-sigterm-pin-flush.test.ts +0 -146
  264. package/telegram-plugin/tests/real-gateway-f1-ladder-integrity.test.ts +0 -123
  265. package/telegram-plugin/tests/real-gateway-f2-instant-draft.test.ts +0 -82
  266. package/telegram-plugin/tests/real-gateway-f3-late-card.test.ts +0 -114
  267. package/telegram-plugin/tests/real-gateway-harness.ts +0 -699
  268. package/telegram-plugin/tests/real-gateway-i6-turn-flush-replay-dedup.test.ts +0 -313
  269. package/telegram-plugin/tests/real-gateway-ipc-lifecycle.test.ts +0 -299
  270. package/telegram-plugin/tests/real-gateway-spec.test.ts +0 -487
  271. package/telegram-plugin/tests/real-gateway.smoke.test.ts +0 -101
  272. package/telegram-plugin/tests/setup-flow.test.ts +0 -510
  273. package/telegram-plugin/tests/setup-state.test.ts +0 -146
  274. package/telegram-plugin/tests/sync-chat-running-subagents.test.ts +0 -116
  275. package/telegram-plugin/tests/turn-end-regressions.test.ts +0 -489
  276. package/telegram-plugin/tests/turn-flush-card-takeover.test.ts +0 -218
  277. package/telegram-plugin/tests/turn-flush-prose-recovery.test.ts +0 -78
  278. package/telegram-plugin/tests/two-zone-bg-carry-full-lifecycle.test.ts +0 -131
  279. package/telegram-plugin/tests/two-zone-bg-detection.test.ts +0 -120
  280. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +0 -116
  281. package/telegram-plugin/tests/two-zone-bg-early-turn-end.test.ts +0 -87
  282. package/telegram-plugin/tests/two-zone-bg-survives-next-turn.test.ts +0 -211
  283. package/telegram-plugin/tests/two-zone-card-cap.test.ts +0 -62
  284. package/telegram-plugin/tests/two-zone-card-fleet-row.test.ts +0 -101
  285. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +0 -78
  286. package/telegram-plugin/tests/two-zone-card-html-balance.test.ts +0 -110
  287. package/telegram-plugin/tests/two-zone-card-lifecycle.test.ts +0 -128
  288. package/telegram-plugin/tests/two-zone-card-sanitise.test.ts +0 -58
  289. package/telegram-plugin/tests/two-zone-card-snapshot.test.ts +0 -133
  290. package/telegram-plugin/tests/two-zone-concurrent-turns-isolation.test.ts +0 -155
  291. package/telegram-plugin/tests/two-zone-phasefor-precedence.test.ts +0 -117
  292. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +0 -187
  293. package/telegram-plugin/tests/two-zone-stuck-edit-throttle.test.ts +0 -149
  294. package/telegram-plugin/tests/two-zone-stuck-header-escalation.test.ts +0 -101
  295. package/telegram-plugin/tests/two-zone-stuck-per-member.test.ts +0 -114
  296. package/telegram-plugin/tests/two-zone-stuck-recovery.test.ts +0 -105
  297. package/telegram-plugin/tests/waiting-ux-harness.ts +0 -381
  298. package/telegram-plugin/tests/waiting-ux.e2e.test.ts +0 -233
  299. package/telegram-plugin/turn-flush-prose-recovery.ts +0 -40
  300. package/telegram-plugin/two-zone-card.ts +0 -269
  301. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +0 -61
@@ -1,2886 +0,0 @@
1
- /**
2
- * Driver that owns per-chat progress-card state and controls when to emit
3
- * an `update` call to the outer world (typically a handleStreamReply or a
4
- * test spy).
5
- *
6
- * Cadence rules:
7
- * - Fire IMMEDIATELY on state transitions (tool start, tool end, stage
8
- * change, enqueue). This is the key anti-flicker property — each event
9
- * renders exactly once at the moment of semantic change.
10
- * - Coalesce bursts: if multiple events land within `coalesceMs`, only
11
- * the last render actually fires (a single setTimeout collapses them).
12
- * - Hard floor: never emit faster than `minIntervalMs` to respect
13
- * Telegram's editMessageText rate budget.
14
- *
15
- * Pure in-process state. No IO; the outer `emit` callback does the send.
16
- */
17
-
18
- import type { SessionEvent } from './session-tail.js'
19
- import {
20
- hasAnyRunningSubAgent,
21
- initialState,
22
- reduce,
23
- render,
24
- type ProgressCardState,
25
- type TaskNum,
26
- type SubAgentState,
27
- } from './progress-card.js'
28
- import { isTelegramReplyTool } from './tool-names.js'
29
- import { emitCardEvent } from './card-event-log.js'
30
- import { createHash } from 'crypto'
31
- import {
32
- applyCapped as fleetApplyCapped,
33
- applyToolResult as fleetApplyToolResult,
34
- applyToolUse as fleetApplyToolUse,
35
- applyTurnEnd as fleetApplyTurnEnd,
36
- createFleetMember,
37
- hasLiveBackground,
38
- markStuck as fleetMarkStuck,
39
- roleFromDispatch,
40
- type FleetMember,
41
- } from './fleet-state.js'
42
-
43
- /**
44
- * Classification of a Telegram API error for failure-escalation purposes.
45
- *
46
- * - `permanent_4xx`: 4xx error that won't resolve itself (message deleted,
47
- * bot blocked, etc.). After K consecutive such failures the card is marked
48
- * terminal and all further edits are suppressed.
49
- * - `transient`: network/5xx error — retryable; does NOT count toward the
50
- * permanent-failure threshold.
51
- * - `benign`: "message is not modified" — the edit had no effect because the
52
- * text was already identical. Not a failure at all; counter must not advance.
53
- */
54
- export type ApiFailureKind = 'permanent_4xx' | 'transient' | 'benign'
55
-
56
- /**
57
- * Reason a per-chat card is being closed. Used by the unified
58
- * `closePerChat` helper to drive the small set of behavioural deltas
59
- * between paths (sub-agent force-close, stalled-render flag).
60
- *
61
- * - 'turn-end' : normal completion — no in-flight sub-agents.
62
- * - 'zombie' : abandonment via heartbeat maxIdle ceiling or
63
- * new-enqueue force-close.
64
- * - 'stalled' : Gap-8 deferred-completion timeout expired.
65
- */
66
- export type CloseReason = 'turn-end' | 'zombie' | 'stalled'
67
-
68
- /**
69
- * Failure descriptor reported back to the driver after an async emit fails.
70
- * The outer layer (server.ts) inspects the raw Telegram error and classifies
71
- * it before calling `reportApiFailure`.
72
- */
73
- export interface ApiFailureInfo {
74
- /** HTTP-level error code from Telegram (400, 403, 404, 500, …). */
75
- code: number
76
- /** Telegram's `description` field, e.g. "Forbidden: bot was blocked by the user". */
77
- description: string
78
- kind: ApiFailureKind
79
- }
80
-
81
- export interface ProgressDriverConfig {
82
- /**
83
- * Emit rendered HTML for the given chat+thread. Caller owns the send.
84
- *
85
- * `isFirstEmit` is true exactly once per turn — on the very first flush
86
- * that creates the Telegram message. The caller can use this signal to
87
- * pin the new message: after this call resolves, the message_id will be
88
- * available in the caller's draft-stream handle.
89
- *
90
- * `replyToMessageId` is set only on the first emit (when `isFirstEmit`
91
- * is true) and only when the turn was started with a source message_id
92
- * (via `startTurn({ replyToMessageId })`). The caller should pass this
93
- * as `reply_parameters` on the initial `sendMessage` so the progress
94
- * card is a tappable reply to the user's original message. Edits
95
- * (subsequent emits) must NOT carry reply_parameters — Telegram rejects
96
- * it on editMessageText.
97
- */
98
- emit: (args: {
99
- chatId: string
100
- threadId?: string
101
- /** Unique key for this turn (chatId:threadId:seq). Use for pin/unpin tracking. */
102
- turnKey: string
103
- html: string
104
- done: boolean
105
- /** True only on the first flush for this turn (message creation). */
106
- isFirstEmit: boolean
107
- /**
108
- * Set on the first emit only (isFirstEmit=true) when the turn was
109
- * started via startTurn({ replyToMessageId }). Pass as
110
- * reply_parameters.message_id on the initial sendMessage.
111
- */
112
- replyToMessageId?: number
113
- /**
114
- * Per-agent card identity. Absent for parent-card emits (the
115
- * gateway treats absence as the parent sentinel `__parent__`).
116
- * Retained for caller compatibility post-P4 cutover; the two-zone
117
- * renderer no longer emits per-sub-agent cards.
118
- */
119
- agentId?: string
120
- }) => void
121
- /**
122
- * Optional callback fired once per turn immediately after the final
123
- * render on `turn_end`. Receives a compact, one-line plain-text
124
- * summary suitable for the session-handoff continuity line. The outer
125
- * layer typically pipes this into `writeLastTurnSummary(agentDir, …)`
126
- * so that a session restart can show "↩️ Picked up — <summary>"
127
- * even if the Stop-hook summarizer didn't run.
128
- */
129
- onTurnEnd?: (summary: string) => void
130
- /**
131
- * Fired once per turn when `turn_end` is processed, with full chat
132
- * context. Use this for per-chat post-completion work: unpin the card,
133
- * send a completion summary to the main chat, etc.
134
- *
135
- * Fires BEFORE the per-chat state is deleted, so `summary` is still
136
- * accessible. The caller must NOT re-enter the driver from this callback.
137
- */
138
- onTurnComplete?: (args: {
139
- chatId: string
140
- threadId?: string
141
- /** Unique key for this turn (chatId:threadId:seq). Use for pin/unpin tracking. */
142
- turnKey: string
143
- summary: string
144
- taskIndex: number
145
- taskTotal: number
146
- }) => void
147
- /**
148
- * Fired when a turn ends with no reply sent (silentEnd=true). The outer
149
- * layer can write a state file so the Stop hook can block the session and
150
- * re-prompt the agent. The callback returns `{ suppressed: true }` when the
151
- * retry is allowed (retryCount was 0) — in that case the driver will
152
- * re-render the final card WITHOUT the "🙊 Ended without reply" warning so
153
- * the user doesn't see a false-positive before the retry lands.
154
- *
155
- * On the second silent-end (retryCount exhausted) the callback returns
156
- * `{ suppressed: false }` and the warning card renders as normal.
157
- *
158
- * Not fired for autonomous turns (wasAutonomous=true) — those intentionally
159
- * produce no user-visible reply.
160
- */
161
- onSilentEnd?: (args: {
162
- chatId: string
163
- threadId?: string
164
- turnKey: string
165
- }) => { suppressed: boolean } | void
166
- /** Min ms between edits for a given chat+thread. Default 500. */
167
- minIntervalMs?: number
168
- /** Coalesce window — burst events within this land as one render. Default 400. */
169
- coalesceMs?: number
170
- /** `Date.now` override for tests. */
171
- now?: () => number
172
- /** `setTimeout` override for tests. */
173
- setTimeout?: (fn: () => void, ms: number) => { ref: unknown }
174
- clearTimeout?: (ref: unknown) => void
175
- /** `setInterval` override for tests (used by the heartbeat). */
176
- setInterval?: (fn: () => void, ms: number) => { ref: unknown }
177
- clearInterval?: (ref: unknown) => void
178
- /**
179
- * Heartbeat cadence for the no-events-flowing re-render. When a turn
180
- * has settled into a long-running tool call (e.g. a sub-agent that
181
- * emits no session-JSONL events for minutes), the elapsed-time counter
182
- * in the card header never visibly ticks because no event fires a
183
- * re-render. The heartbeat forces a flush every `heartbeatMs` while
184
- * any chat has a running turn. Default 5000. Set to 0 to disable.
185
- */
186
- heartbeatMs?: number
187
- /**
188
- * Multi-agent rate-limit guardrail (design §4.4). Telegram caps edits
189
- * at ~20/min/chat. With N parallel sub-agents emitting bursty events
190
- * the default 400ms coalesce + 500ms floor can exceed the cap. When
191
- * we observe more than `editBudgetThreshold` edits in the trailing
192
- * 60s for a chat, the coalesce window expands to `editBudgetCoalesceMs`
193
- * until the rate drops back. Heartbeat is also suppressed while the
194
- * budget is hot.
195
- *
196
- * Defaults: threshold=18, coalesce window when hot=3000ms.
197
- */
198
- editBudgetThreshold?: number
199
- editBudgetCoalesceMs?: number
200
- /**
201
- * Zombie-card ceiling. If a chat's `lastEventAt` is older than this
202
- * many ms, the heartbeat loop force-closes the card (flush done,
203
- * onTurnComplete, delete from chats). This is the backstop for cards
204
- * orphaned by a missed `turn_end` line or an enqueue echo-drop that
205
- * routed events to a different card — without it, the heartbeat
206
- * would re-render a stale card forever (50+ minute ghost cards).
207
- *
208
- * Default 30 minutes. Set to 0 to disable entirely (not recommended
209
- * outside tests).
210
- */
211
- maxIdleMs?: number
212
- /**
213
- * Suppress the progress card for fast turns. The first emit is
214
- * deferred by this many ms after startTurn. If `turn_end` arrives
215
- * before the timer fires (and isFirstEmit is still true), no card
216
- * is ever shown — the user only sees the final reply.
217
- *
218
- * The card can be promoted out of suppression early when a sub-agent
219
- * starts (see `promoteOnSubAgent`) — long-running tool work and
220
- * background dispatches stay visible without waiting the full delay.
221
- *
222
- * Default 45000 (45 seconds, #842). Set to 0 to disable.
223
- */
224
- initialDelayMs?: number
225
- /**
226
- * First-render delay (ms) override for explicit background sub-agent
227
- * dispatches (#842). When the agent calls
228
- * `Agent({ run_in_background: true })`, the card is promoted out of
229
- * the suppression window using this delay instead of `initialDelayMs`.
230
- * Default 0 (immediate render — backgrounded work should be visible
231
- * right away).
232
- *
233
- * Implementation: at `tool_use` ingest time the driver detects the
234
- * background flag (existing `cs.backgroundParentToolUseIds` book-
235
- * keeping). If `initialDelayMsBackground` is 0 the card promotes
236
- * immediately via `promoteFirstEmit`. If positive, the deferred timer
237
- * is rescheduled to fire that many ms from turn start (or now, if
238
- * already past) — but only when shorter than what's currently
239
- * scheduled. Never lengthens an in-flight delay.
240
- */
241
- initialDelayMsBackground?: number
242
- /**
243
- * Promote the first emit immediately when a sub-agent transitions to
244
- * running during the suppression window, when the watcher fires
245
- * `onSubAgentStall`, or when `startTurn` carries over running
246
- * sub-agents from a prior turn (#334 carry-over). The card jumps
247
- * straight to visible instead of waiting for `initialDelayMs`.
248
- *
249
- * Fast-turn suppression (`turn_end` before the card has emitted) is
250
- * unchanged — it short-circuits in `flush()` regardless of this flag.
251
- *
252
- * Default true. Set to false to disable promotion entirely (the card
253
- * will only appear after `initialDelayMs` elapses, even when sub-agents
254
- * are dispatched mid-turn).
255
- */
256
- promoteOnSubAgent?: boolean
257
- /**
258
- * Promote the card out of initial-delay suppression once the agent has
259
- * issued this many parent-side tool calls in the suppression window.
260
- * Closes #478 — the user sees no progress card for the first 30s of a
261
- * substantial turn that does parent-side work (Read/Grep/Bash/Edit)
262
- * but never dispatches a sub-agent.
263
- *
264
- * Symmetric to `promoteOnSubAgent`. **Default 0 (disabled, #553 PR 4):**
265
- * under the v2 contract tools alone never trigger the card — only
266
- * sub-agents or `elapsed >= 45s`. Values of 0 or non-finite (Infinity)
267
- * are treated as "never promote on tool count". Set to a positive
268
- * integer (e.g. 3) to opt back in to the pre-v2 behaviour.
269
- *
270
- * Fast-turn suppression in `flush()` is unchanged — if the turn
271
- * ends before promotion, the card still skips the emit.
272
- */
273
- promoteOnParentToolCount?: number
274
- /**
275
- * Time-based first-emit promotion (#553 F3): if the turn has been
276
- * running this long with no tool/sub-agent that already triggered
277
- * promotion, force the card to emit. Without this, single- or two-
278
- * tool turns that take 5–30s never cross any existing promotion
279
- * threshold and the card stays suppressed until `initialDelayMs`,
280
- * at which point fast-turn-suppression cancels it on `turn_end`.
281
- *
282
- * Symmetric to `promoteOnParentToolCount`: pure additive promotion,
283
- * never delays an emit that would otherwise fire. Fast-turn
284
- * suppression in `flush()` is unchanged — sub-`promoteAfterMs` turns
285
- * still skip the card.
286
- *
287
- * **Default 0 (disabled, #553 PR 4).** The PR #570 5s time-promote was
288
- * a stop-gap when `initialDelayMs` defaulted to 30s; with the new
289
- * 45s `initialDelayMs` and the sub-agent promote intact, time-based
290
- * promotion is no longer needed. `ensureTimePromoteScheduled` no-ops
291
- * when this is 0 so the timer never schedules. Set to a positive
292
- * value to opt back in to the pre-v2 behaviour.
293
- */
294
- promoteAfterMs?: number
295
- /**
296
- * Number of consecutive 4xx Telegram API failures on card edits before
297
- * the card is marked terminal and all further edits are suppressed for
298
- * this turn. Transient (5xx/network) errors and "message is not modified"
299
- * do NOT count toward this threshold. A single success resets the counter.
300
- *
301
- * Default 3. Set to 0 to disable the escalation mechanism entirely.
302
- */
303
- maxConsecutive4xx?: number
304
- /**
305
- * Gap 3 (orphan promotion): how long a `PendingAgentSpawn` must be
306
- * outstanding before the heartbeat promotes it to a synthesised
307
- * sub-agent row (state='running'). Gives the sub-agent JSONL watcher a
308
- * chance to deliver the real `sub_agent_started` event first.
309
- *
310
- * Default 5000 (5 seconds). Set to 0 to disable promotion entirely.
311
- */
312
- orphanPromotionMs?: number
313
- /**
314
- * Gap 4 (cold-JSONL detection): when a running sub-agent's last event
315
- * is older than this threshold, the heartbeat synthesises a
316
- * `sub_agent_turn_end` for it so the deferred-completion path can
317
- * proceed (avoids the card staying pinned forever on a dead watcher).
318
- *
319
- * Default 30000 (30 seconds). Set to 0 to disable the synthetic close.
320
- */
321
- coldSubAgentThresholdMs?: number
322
- /**
323
- * Gap 8 (decoupled render and unpin): after `turn_end` arrives while
324
- * sub-agents are still running, this is the maximum ms to wait before
325
- * force-closing the card with a "stalled — forced close" header and
326
- * calling `onTurnComplete`. This is separate from `maxIdleMs` (which
327
- * watches for absence of ALL events) — this timeout starts specifically
328
- * on parent `turn_end` and fires regardless of sub-agent activity.
329
- *
330
- * Default 180000 (3 minutes). Set to 0 to disable.
331
- */
332
- deferredCompletionTimeoutMs?: number
333
- /**
334
- * Fix #314 — elapsed-ticker interval for silent sub-agent gaps.
335
- *
336
- * While at least one sub-agent is in `state='running'`, the parent card
337
- * only re-renders when an event changes the HTML (tool start/end, stage
338
- * change). During silent stretches between tool calls the elapsed counter
339
- * freezes — the diff guard suppresses edits when only the timestamp
340
- * advances. This interval forces a render (bypassing that guard) every N ms
341
- * so the elapsed counter visibly ticks even when the sub-agent is quietly
342
- * thinking or waiting for I/O.
343
- *
344
- * 10 s was chosen as a balance: short enough that the counter advances
345
- * at human-perceptible speed (users notice a 15+ second freeze), long
346
- * enough to stay well under Telegram's ~20 edits/minute budget even when
347
- * multiple cards are active in parallel.
348
- *
349
- * Default 10000. Set to 0 to disable the elapsed-ticker path entirely.
350
- */
351
- subAgentTickIntervalMs?: number
352
- }
353
-
354
/**
 * Issue #399: reconcile the per-chat running-sub-agent registry with a card
 * state transition (`prev` -> `next`).
 *
 * Extracted from the inline block in `ingest` so that every path able to move
 * agents to done/failed without the normal post-reduce step can share it.
 * There are four such call paths:
 *   1. ingest post-reduce (the original call site)
 *   2. cold-jsonl-synth path (Gap-4, heartbeat)
 *   3. closeZombie direct mutation path
 *   4. deferred-completion-timeout force-close (Gap-8, heartbeat)
 *
 * Effects on `chatRunningSubagents` (mutated in place):
 *   - an agent that is `running` in `next` but was absent or not running in
 *     `prev` is registered under `cBaseKey` (the inner map is created lazily);
 *   - an agent that is `done` or `failed` in `next` is removed;
 *   - an agent present in `prev` but missing from `next` is removed.
 *
 * @param prev - Card state before the transition.
 * @param next - Card state after the transition.
 * @param cBaseKey - Registry key for the chat this card belongs to.
 * @param chatRunningSubagents - Registry of still-running sub-agents per chat.
 * @returns `newRunningAppeared` is true when at least one agent was newly
 *   registered as running by this call.
 */
export function syncChatRunningSubagents(
  prev: ProgressCardState,
  next: ProgressCardState,
  cBaseKey: string,
  chatRunningSubagents: Map<string, Map<string, SubAgentState>>,
): { newRunningAppeared: boolean } {
  // Same map instance means the reducer did not touch sub-agents at all.
  if (prev.subAgents === next.subAgents) {
    return { newRunningAppeared: false }
  }

  const unregister = (id: string): void => {
    chatRunningSubagents.get(cBaseKey)?.delete(id)
  }

  let sawNewRunning = false

  for (const [id, agent] of next.subAgents) {
    switch (agent.state) {
      case 'running': {
        // Only a transition INTO running registers; an agent that was already
        // running in `prev` keeps its existing registry entry untouched.
        if (prev.subAgents.get(id)?.state === 'running') break
        let registry = chatRunningSubagents.get(cBaseKey)
        if (registry == null) {
          registry = new Map<string, SubAgentState>()
          chatRunningSubagents.set(cBaseKey, registry)
        }
        registry.set(id, agent)
        sawNewRunning = true
        break
      }
      case 'done':
      case 'failed':
        // Terminal state: the agent no longer needs to be carried over.
        unregister(id)
        break
      default:
        // Any other state leaves the registry as it is.
        break
    }
  }

  // Defensive: agents that vanished from the map entirely are dropped too.
  for (const id of prev.subAgents.keys()) {
    if (!next.subAgents.has(id)) unregister(id)
  }

  return { newRunningAppeared: sawNewRunning }
}
402
-
403
/**
 * Build the compact, single-line summary of a completed turn that is written
 * to the handoff sidecar.
 *
 * Shape: `"<tool-count> tool[s], <duration> — <user-request>"`.
 *   - An empty `items` list renders as `no tools`.
 *   - Durations under a minute render as `<n>s`, longer ones as `m:ss`.
 *   - A missing or blank `userRequest` yields just the stats prefix.
 *   - A missing or non-finite `turnStartedAt` is treated as a zero-length
 *     turn rather than leaking `NaN` into the output.
 *   - Line breaks inside `userRequest` are collapsed to single spaces so the
 *     result is always one line.
 *
 * @param state - Card state of the finished turn; reads `items`,
 *   `turnStartedAt` and `userRequest`.
 * @param now - Current time in ms, on the same clock as `state.turnStartedAt`.
 * @returns The one-line summary string.
 */
export function summariseTurn(state: ProgressCardState, now: number): string {
  const toolCount = state.items.length
  const toolLabel =
    toolCount === 0 ? 'no tools' : toolCount === 1 ? '1 tool' : `${toolCount} tools`

  // Guard against a missing start time: `now - undefined` is NaN, which would
  // otherwise render as "NaNs". Negative spans (clock skew) clamp to zero.
  const elapsedMs = now - state.turnStartedAt
  const durSec = Number.isFinite(elapsedMs)
    ? Math.max(0, Math.floor(elapsedMs / 1000))
    : 0
  const dur =
    durSec >= 60
      ? `${Math.floor(durSec / 60)}:${(durSec % 60).toString().padStart(2, '0')}`
      : `${durSec}s`

  const stats = `${toolLabel}, ${dur}`

  // Fold any line breaks (and the whitespace around them) into one space so a
  // multi-line user message cannot break the one-line contract.
  const req = state.userRequest?.replace(/\s*[\r\n]+\s*/g, ' ').trim()
  return req ? `${stats} — ${req}` : stats
}
421
-
422
- interface PerChatState {
423
- chatId: string
424
- threadId?: string
425
- /** Unique key for this turn: `chatId:threadId:seq`. Used as the chats-map key. */
426
- turnKey: string
427
- /** 1-based index of this card among all cards created for this chat:thread in this session. */
428
- taskIndex: number
429
- /** Total cards created for this chat:thread so far (snapshot at card creation). */
430
- taskTotal: number
431
- state: ProgressCardState
432
- lastEmittedAt: number
433
- lastEmittedHtml: string | null
434
- pendingTimer: unknown
435
- /** True until the very first flush fires for this turn. Cleared after first emit. */
436
- isFirstEmit: boolean
437
- /** Timer for the deferred first emit (initial-delay suppression). */
438
- deferredFirstEmitTimer: unknown
439
- /**
440
- * #842: per-chat first-emit delay budget in ms. Initialised to
441
- * `config.initialDelayMs`; lowered to `config.initialDelayMsBackground`
442
- * the first time the parent dispatches an Agent/Task with
443
- * `run_in_background: true`. Never increases. flush() reads this
444
- * (instead of the closure-level `initialDelayMs`) when scheduling the
445
- * deferred first-emit timer so the background bypass takes effect on
446
- * the next scheduling pass.
447
- */
448
- effectiveInitialDelayMs: number
449
- /**
450
- * F3 fix (#553): timer for the time-based first-emit promotion.
451
- * Scheduled on the first ingest event; fires after `promoteAfterMs`
452
- * to force-promote turns that don't trip parent-tool-count or
453
- * sub-agent thresholds (e.g. one long Bash). Cleared on
454
- * `promoteFirstEmit` or turn end.
455
- */
456
- timePromoteTimer: unknown
457
- /**
458
- * The Telegram message_id of the user's original inbound message that
459
- * triggered this turn. Set via startTurn({ replyToMessageId }). Passed
460
- * as reply_parameters on the FIRST sendMessage only — edits must not
461
- * carry it (Telegram rejects reply_parameters on editMessageText).
462
- */
463
- replyToMessageId?: number
464
- /**
465
- * Wall-clock ms of the last real session event routed to this card.
466
- * Distinct from `lastEmittedAt`: the heartbeat ticks `lastEmittedAt`
467
- * every cycle, but `lastEventAt` only advances when an actual event
468
- * (enqueue, tool_use, tool_result, turn_end, sub_agent_*) lands on
469
- * this chat state. The heartbeat uses it as a zombie ceiling — a
470
- * card whose `lastEventAt` is older than `maxIdleMs` has been
471
- * orphaned (turn_end missed by the session-tail, or an enqueue
472
- * echo-drop routed events to a different card) and is force-closed
473
- * so it can't tick forever.
474
- */
475
- lastEventAt: number
476
- /**
477
- * True once the parent turn has ended (via `turn_end` or
478
- * `forceCompleteTurn`) BUT one or more sub-agents were still running
479
- * at that moment. The card stays alive and keeps ticking so the
480
- * running sub-agents remain visible. When the last running sub-agent
481
- * transitions to done (via `sub_agent_turn_end` or parent's Agent
482
- * `tool_result`), completion callbacks finally fire and the card is
483
- * closed. Guards against duplicate completion firing (both turn_end
484
- * and forceCompleteTurn can legitimately arrive).
485
- */
486
- pendingCompletion: boolean
487
- /**
488
- * Set to true the moment completion callbacks have fired, whether
489
- * immediately (no in-flight sub-agents at turn_end) or deferred
490
- * (after last sub-agent finished). Guards against double-firing if
491
- * multiple completion signals race.
492
- */
493
- completionFired: boolean
494
- /**
495
- * Set to true when an external code path has assumed ownership of
496
- * the pinned card message (e.g. turn-flush rewriting the card with
497
- * the user-facing answer — see #654). Once true, `flush()`
498
- * short-circuits at the top so the driver never edits the card
499
- * again for this turn. The external owner is responsible for
500
- * issuing the final edit/unpin via pinMgr.
501
- */
502
- cardTakenOver: boolean
503
- /**
504
- * Tracks consecutive Telegram 4xx failures on card edits. Once
505
- * `terminal` is true, flush() and the heartbeat tick skip all edits
506
- * for this card (message deleted / bot blocked / stale message_id).
507
- *
508
- * Resets automatically when a fresh turn starts (new PerChatState).
509
- */
510
- apiFailures: {
511
- consecutive4xx: number
512
- lastError: { code: number; description: string; timestamp: number } | null
513
- terminal: boolean
514
- }
515
- /**
516
- * Issue #132: did the agent call `reply` or `stream_reply` (under any
517
- * MCP server-key prefix) at least once during this turn?
518
- *
519
- * Set true on the first matching `tool_use` event observed by `ingest()`.
520
- * When the turn ends with this still false, the card renders the
521
- * "🙊 Ended without reply" silent-end variant instead of "✅ Done" so the
522
- * user can tell the difference between "agent acknowledged with text"
523
- * and "agent ran tools and went mute". Resets implicitly with each new
524
- * `PerChatState` (one per turn).
525
- */
526
- replyToolCalled: boolean
527
- /**
528
- * Issue #137: how many outbound replies actually landed in the chat
529
- * this turn? Bumped by `ProgressDriver.recordOutboundDelivered()` from
530
- * the gateway's executeReply / executeStreamReply success paths.
531
- *
532
- * Combined with `replyToolCalled` at turn-end, this distinguishes:
533
- * - both false → silent-end (#132, "Ended without reply")
534
- * - replyToolCalled only → reply attempted but never delivered
535
- * (#137 — render a degraded variant
536
- * distinct from silent-end so the user
537
- * knows the agent TRIED)
538
- * - delivered>0 → real success
539
- */
540
- outboundDeliveredCount: number
541
- /**
542
- * Issue #259: true when the turn was started by an autonomous wakeup
543
- * sentinel (`<<autonomous-loop>>` or `<<autonomous-loop-dynamic>>`).
544
- * When set, the "🙊 Ended without reply" silent-end warning is
545
- * suppressed — autonomous turns intentionally produce no user-visible
546
- * reply and ending without one is entirely expected.
547
- */
548
- wasAutonomous: boolean
549
- /**
550
- * Set by prepareSilentEndSuppression when onSilentEnd returns
551
- * { suppressed: true }. Causes flush() to render the final card without
552
- * the "🙊 Ended without reply" header so no false-positive appears before
553
- * the retry reply lands.
554
- */
555
- silentEndSuppressed: boolean
556
- /**
557
- * Idempotent guard for prepareSilentEndSuppression — ensures the
558
- * onSilentEnd callback (which writes the Stop-hook state file) only
559
- * fires once per turn even if multiple sites call into the helper.
560
- */
561
- silentEndPrepared: boolean
562
- /**
563
- * Gap 8 (decoupled render and unpin): set to the timestamp when parent
564
- * `turn_end` landed while sub-agents were still running. Used by the
565
- * heartbeat to enforce `deferredCompletionTimeoutMs`. Null until
566
- * parent turn_end with in-flight sub-agents is observed.
567
- */
568
- parentTurnEndAt: number | null
569
- /**
570
- * Gap 8: true once the parent-done render (✅ Done header with sub-agents
571
- * still visible) has been emitted. Prevents re-rendering the ✅ Done
572
- * frame on every sub-agent event while deferred.
573
- */
574
- parentDoneRendered: boolean
575
- /**
576
- * Gap 3 (orphan promotion): set of toolUseIds from `pendingAgentSpawns`
577
- * that have already been promoted to synthetic sub-agent rows. Guards
578
- * against re-promotion on successive heartbeat ticks and against
579
- * double-registration if a real `sub_agent_started` arrives later.
580
- */
581
- promotedSpawnIds: Set<string>
582
- /**
583
- * P0 of #662 — shadow fleet map updated alongside `state.subAgents` at
584
- * every sub_agent_* event. Coexists with the legacy map; P1/P2/P3 build
585
- * the v2 two-zone status card on this without disturbing the existing
586
- * renderer. See fleet-state.ts for the pure transitions.
587
- */
588
- fleet: Map<string, FleetMember>
589
- /**
590
- * P2 of #662 — set of parent toolUseIds whose Agent/Task tool_use was
591
- * dispatched with `input.run_in_background === true`. When the
592
- * matching `sub_agent_started` correlates and writes
593
- * `parentToolUseId` into the freshly-created subagent state, the
594
- * fleet reducer flips that member's `status` from `running` to
595
- * `background`. Entry stays around for the life of the turn so a
596
- * reverse-race adoption (sub_agent_started arriving before tool_use)
597
- * still matches.
598
- */
599
- backgroundParentToolUseIds: Set<string>
600
- /**
601
- * P2 of #662 / fixes #64 — set true when `completeTurnFully` was
602
- * called but at least one fleet member was still in `status:
603
- * 'background'` and not terminal. The chats-map entry is preserved
604
- * (instead of deleted) and the original card stays pinned so updates
605
- * can continue to land. When the last live background member reaches
606
- * a terminal status, `finalizeBackgroundCarryIfReady` triggers the
607
- * deferred completion.
608
- */
609
- backgroundCarry: boolean
610
- }
611
-
612
- export interface ProgressDriver {
613
- /** Feed a session-tail event. Fires emit() as the cadence allows. */
614
- ingest(event: SessionEvent, chatId: string | null, threadId?: string): void
615
- /**
616
- * Stop internal timers and clear driver state. Idempotent.
617
- *
618
- * When called with `{ preservePending: true }`, chats with
619
- * `pendingCompletion === true` are preserved so their heartbeat and
620
- * deferred-completion timeout continue firing after a bridge disconnect.
621
- * Coalesce timers (`pendingTimer`, `deferredFirstEmitTimer`) on those
622
- * preserved chats ARE cleared — they cannot safely emit into a finalized
623
- * draft stream. Chats WITHOUT `pendingCompletion` are fully removed.
624
- * The heartbeat is only stopped if no `pendingCompletion` chats remain.
625
- *
626
- * When called with no args or `{ preservePending: false }`, the existing
627
- * wipe-everything behavior is retained for back-compat.
628
- */
629
- dispose?(opts?: { preservePending?: boolean }): void
630
- /**
631
- * Begin a new turn synchronously — called from the inbound-message
632
- * handler the instant a user's message clears the gate, BEFORE any
633
- * session-tail event arrives. Creates a fresh progress card state; the
634
- * first visible render is gated by `initialDelayMs` (default 45s) so
635
- * turns that finish before the delay produce no card at all and the
636
- * user only sees the final reply.
637
- *
638
- * If a card is already active for this chat, it is force-closed (done=true,
639
- * onTurnComplete fired) before the new card is created. Each call always
640
- * produces an independent card with its own pin lifecycle.
641
- */
642
- startTurn(args: { chatId: string; threadId?: string; userText: string; replyToMessageId?: number }): void
643
- /**
644
- * External completion hook — authoritative turn-finished signal from
645
- * outside the session-tail path. Intended for `stream_reply(done=true)`
646
- * so the final-answer arrival acts with equal authority to a session-tail
647
- * `turn_end` event. Idempotent: first caller wins, subsequent callers
648
- * on the same chat+thread find no active card and no-op.
649
- *
650
- * Closes any active card for (chatId, threadId):
651
- * - cancels the deferred-first-emit timer (fast-turn suppression)
652
- * - synthesizes a `turn_end` through the reducer
653
- * - fires onTurnEnd + onTurnComplete
654
- * - clears chats map + bookkeeping
655
- *
656
- * If the deferred first emit hasn't landed yet (fast turn), `flush` sees
657
- * `forceDone=true` on a still-`isFirstEmit=true` state and suppresses
658
- * the emit entirely — no ghost card. If the card already emitted, the
659
- * normal flush+unpin path runs via onTurnComplete.
660
- */
661
- forceCompleteTurn(args: { chatId: string; threadId?: string }): void
662
- /**
663
- * #654 deterministic double-message fix. Hand off ownership of the
664
- * pinned progress card for an active turn so an external code path
665
- * (specifically the turn-flush backstop in gateway.ts) can rewrite
666
- * the card message with the user-facing answer instead of issuing a
667
- * fresh sendMessage that lands as a second Telegram message.
668
- *
669
- * Effects:
670
- * - cancels the deferred-first-emit timer if pending (no late
671
- * card emission can race the takeover)
672
- * - sets `cardTakenOver = true` — `flush()` short-circuits at the
673
- * top, so no further edits go out from the driver for this turn
674
- * - sets `completionFired = true` — guards against double-firing
675
- * `completeTurnFully` if a deferred-completion path also runs
676
- *
677
- * Returns:
678
- * - `wasEmitted`: true iff the card has already been published to
679
- * Telegram (i.e. the deferred-emit timer fired or pinning has
680
- * occurred). Caller can use this to decide between editMessageText
681
- * vs sendMessage.
682
- * - `turnKey`: the active turn's full key (chatId:threadId?:seq)
683
- * so the caller can look up the pinned messageId via pinMgr.
684
- * Null only when no active card exists for (chatId, threadId).
685
- *
686
- * Idempotent — safe to call multiple times for the same turn; the
687
- * second call returns the same shape with timer-cancellation already
688
- * complete.
689
- */
690
- takeOverCard(args: { chatId: string; threadId?: string }): {
691
- wasEmitted: boolean
692
- turnKey: string | null
693
- }
694
- /** Current state for a chat (for tests / inspection). */
695
- peek(chatId: string, threadId?: string): ProgressCardState | undefined
696
- /**
697
- * P0 of #662 — fetch the shadow fleet map for a chat. Used by tests
698
- * and (eventually) by the v2 renderer. Same lookup semantics as
699
- * `peek`. Returns undefined when no active card exists.
700
- */
701
- peekFleet(chatId: string, threadId?: string): Map<string, FleetMember> | undefined
702
- /**
703
- * P2 of #662 — debug/test hook returning every live PerChatState's
704
- * fleet keyed by turnKey. Used by cross-turn background tests to
705
- * verify routing landed on the originating turn rather than the
706
- * currently-active one. Not part of the production driver contract.
707
- */
708
- peekAllFleets(): Array<{ turnKey: string; chatId: string | null; fleet: Map<string, FleetMember> }>
709
- /**
710
- * True when the driver is still managing an active card for this chat+
711
- * thread — either a normal turn or a deferred-completion turn waiting on
712
- * in-flight sub-agents. Used by the gateway's `closeProgressLane`
713
- * backstop to avoid tearing down the draft stream while the driver is
714
- * still going to emit into it. Without this guard, parent turn_end
715
- * closes the stream, sub-agent tool_use events fire fresh emits, and
716
- * each emit creates a new `sendMessage` on Telegram (= new push
717
- * notification) instead of editing the pinned card.
718
- */
719
- hasActiveCard(chatId: string, threadId?: string): boolean
720
- /**
721
- * Issue #305 Option A — push a sub-agent narrative line into the
722
- * pinned progress card's row body for `agentId` (jsonl_agent_id).
723
- * Replace-on-each-call. Caller (gateway) is responsible for truncating
724
- * `text` to the 200-char card cap before invocation.
725
- *
726
- * Returns:
727
- * - `{ ok: true }` when the narrative was applied + flush triggered.
728
- * - `{ ok: false, reason: 'no_active_card' }` if no card is tracked
729
- * for (chatId, threadId) or its turn already completionFired.
730
- * - `{ ok: false, reason: 'unknown_agent' }` if the card is active
731
- * but does not yet contain a sub-agent for `agentId` (likely a
732
- * race with sub-agent watcher's jsonl_agent_id backfill — caller
733
- * should fall through to the message-send path).
734
- *
735
- * Never throws.
736
- */
737
- recordSubAgentNarrative(args: {
738
- chatId: string
739
- threadId?: string
740
- agentId: string
741
- text: string
742
- }): { ok: true } | { ok: false; reason: 'no_active_card' | 'unknown_agent' }
743
- /**
744
- * Report a Telegram API failure back to the driver after an async emit
745
- * fails. The outer layer (server.ts catch handler) classifies the raw
746
- * error and calls this so the driver can track consecutive 4xx failures
747
- * and mark the card terminal when the threshold is reached.
748
- *
749
- * Rules:
750
- * - `benign` (message is not modified) — ignored; counter unchanged.
751
- * - `transient` (5xx, network) — logged at debug; counter unchanged.
752
- * - `permanent_4xx` — counter incremented; terminal=true after K hits.
753
- *
754
- * Idempotent after terminal=true.
755
- */
756
- reportApiFailure(turnKey: string, failure: ApiFailureInfo): void
757
- /**
758
- * Report a successful Telegram API call for a card. Resets the
759
- * consecutive-4xx counter so a single success after a transient failure
760
- * doesn't leave the counter elevated. Call from the `.then()` handler
761
- * of the async emit in server.ts.
762
- */
763
- reportApiSuccess(turnKey: string): void
764
- /**
765
- * Issue #137: bump the per-turn outbound-delivered counter for the
766
- * card matching (chatId, threadId). Called from the gateway's reply
767
- * success paths (executeReply, executeStreamReply) AFTER the
768
- * `bot.api.sendMessage` resolved. If no card is active for that
769
- * chat+thread, the call is a silent no-op (boot banners and other
770
- * system messages don't tick the counter).
771
- */
772
- recordOutboundDelivered(chatId: string, threadId?: string): void
773
- /**
774
- * Option C — watcher stall callback. Called by the sub-agent watcher
775
- * (via config.onStall) when a running sub-agent's JSONL goes silent for
776
- * longer than `stallThresholdMs`. Updates the sub-agent's `lastEventAt`
777
- * to trigger the elapsed-ticker so the progress card re-renders with a
778
- * visible ⚠️ stall indicator, even when the bridge has disconnected.
779
- *
780
- * No-op if no card is currently tracking this `agentId`.
781
- */
782
- onSubAgentStall(agentId: string, idleMs: number, description: string): void
783
- /**
784
- * Symmetric to `onSubAgentStall`. Fires when the watcher observes
785
- * JSONL activity returning for a previously-stalled sub-agent. Forces
786
- * a re-render so the ⚠ Stalled badge clears immediately, instead of
787
- * waiting on the next heartbeat tick (which the diff-guard might
788
- * suppress if no chat-level state otherwise changed). The render
789
- * itself reads the now-current `sa.lastEventAt` (already bumped by
790
- * the standard event path), so this method is purely a render-trigger.
791
- *
792
- * No-op if no card is currently tracking this `agentId`.
793
- */
794
- onSubAgentUnstall(agentId: string, description: string): void
795
- /**
796
- * Test-only accessor exposing the driver's internal Maps so unit tests
797
- * can assert TTL eviction and outer-base-key cleanup actually drop
798
- * entries. Not part of the supported runtime API — gated behind the
799
- * leading-underscore name.
800
- */
801
- _debugGetMaps?(): {
802
- chats: Map<string, unknown>
803
- seenEnqueueMsgIds: Map<string, number>
804
- pendingSyncEchoes: Map<string, number>
805
- chatRunningSubagents: Map<string, Map<string, unknown>>
806
- baseTurnSeqs: Map<string, number>
807
- editTimestamps: Map<string, number[]>
808
- }
809
- }
810
-
811
- export function createProgressDriver(config: ProgressDriverConfig): ProgressDriver {
812
- const minIntervalMs = config.minIntervalMs ?? 500
813
- const coalesceMs = config.coalesceMs ?? 400
814
- const now = config.now ?? (() => Date.now())
815
- const setT =
816
- config.setTimeout ??
817
- ((fn, ms) => {
818
- const h = setTimeout(fn, ms)
819
- return { ref: h }
820
- })
821
- const clearT =
822
- config.clearTimeout ??
823
- ((ref) => {
824
- const handle = (ref as { ref: ReturnType<typeof setTimeout> }).ref
825
- clearTimeout(handle)
826
- })
827
- const setI =
828
- config.setInterval ??
829
- ((fn, ms) => {
830
- const h = setInterval(fn, ms)
831
- return { ref: h }
832
- })
833
- const clearI =
834
- config.clearInterval ??
835
- ((ref) => {
836
- const handle = (ref as { ref: ReturnType<typeof setInterval> }).ref
837
- clearInterval(handle)
838
- })
839
- const heartbeatMs = config.heartbeatMs ?? 5000
840
- const editBudgetThreshold = config.editBudgetThreshold ?? 18
841
- const editBudgetCoalesceMs = config.editBudgetCoalesceMs ?? 3000
842
- const maxIdleMs = config.maxIdleMs ?? 30 * 60_000
843
- // v2 card-gate (#553 PR 4 / #842): card visibility is `(elapsed >= 45s)
844
- // OR (any sub-agent appeared) OR (explicit background dispatch)`.
845
- // Tools alone never trigger the card.
846
- // - initialDelayMs: 45s (was 60s, #842) — pushes the time-based gate
847
- // to the spec value. The lower threshold means more turns flash a
848
- // card; the explicit-background bypass below offsets that for the
849
- // "fire-and-forget" case where the user always wants to see the
850
- // card immediately.
851
- // - initialDelayMsBackground: 0 (#842) — explicit
852
- // `Agent({run_in_background:true})` dispatches promote the card
853
- // immediately. Lets backgrounded work be visible right away
854
- // without waiting for any other promotion path.
855
- // - promoteOnParentToolCount: 0 (was 3) — disabled. The check below
856
- // treats 0 (and Infinity) as "never promote on tool count".
857
- // - promoteAfterMs: 0 (was 5_000) — disabled. ensureTimePromoteScheduled
858
- // no-ops when this is 0, so the timer never schedules. The PR #570
859
- // time-promote was a stop-gap when initialDelayMs was 30s; with
860
- // initialDelayMs=45s and the sub-agent promote intact, it is no
861
- // longer needed.
862
- // - promoteOnSubAgent: true (unchanged) — sub-agents/background workers
863
- // break the suppression immediately.
864
- const initialDelayMs = config.initialDelayMs ?? 45_000
865
- const initialDelayMsBackground = config.initialDelayMsBackground ?? 0
866
- const promoteOnSubAgent = config.promoteOnSubAgent ?? true
867
- const promoteOnParentToolCount = config.promoteOnParentToolCount ?? 0
868
- const promoteAfterMs = config.promoteAfterMs ?? 0
869
- const maxConsecutive4xx = config.maxConsecutive4xx ?? 3
870
- const orphanPromotionMs = config.orphanPromotionMs ?? 5_000
871
- const coldSubAgentThresholdMs = config.coldSubAgentThresholdMs ?? 30_000
872
- const deferredCompletionTimeoutMs = config.deferredCompletionTimeoutMs ?? 3 * 60_000
873
- const subAgentTickIntervalMs = config.subAgentTickIntervalMs ?? 10_000
874
- // Per-chat sliding 60s window of recent emit timestamps. When the
875
- // window holds at least `editBudgetThreshold` entries we're "hot"
876
- // and coalesce more aggressively.
877
- const editTimestamps = new Map<string, number[]>()
878
- function recordEdit(k: string): void {
879
- const arr = editTimestamps.get(k) ?? []
880
- arr.push(now())
881
- // Drop entries older than 60s.
882
- const cutoff = now() - 60_000
883
- while (arr.length > 0 && arr[0] < cutoff) arr.shift()
884
- editTimestamps.set(k, arr)
885
- }
886
- function isBudgetHot(k: string): boolean {
887
- const arr = editTimestamps.get(k)
888
- if (!arr) return false
889
- const cutoff = now() - 60_000
890
- while (arr.length > 0 && arr[0] < cutoff) arr.shift()
891
- return arr.length >= editBudgetThreshold
892
- }
893
-
894
// Live cards. Entries are removed by turnKey when a turn completes.
const chats = new Map<string, PerChatState>()

// Issue #334: registry of sub-agents that are still running, grouped per
// chat. Outer key = baseKey(chatId, threadId); inner map = agentId →
// SubAgentState. A sub-agent is inserted when it starts and removed when
// it reaches a terminal state (done/failed). When a new turn begins for
// the same chat, whatever is still listed here is cloned into the new
// PerChatState's subAgents, so the new progress card keeps showing
// background sub-agents left running by the previous turn.
const chatRunningSubagents = new Map<string, Map<string, SubAgentState>>()

// Turn sequence counters, keyed by baseKey(chatId, threadId). The stored
// value is the seq most recently handed out for that base (1-based):
// `allocateTurnSlot` reads it, adds one, and stores the result back.
const baseTurnSeqs = new Map<string, number>()

// Base keys of turns that were started via isSync (startTurn). When the
// matching non-sync session-tail echo shows up it is dropped and the
// entry consumed. This avoids orphan cards when a fast turn finishes
// before session-tail fires its enqueue echo: Guard 1 cannot catch that
// case because currentTurnKey is already null, whereas this marker works
// regardless of where the turn is in its lifecycle.
const pendingSyncEchoes = new Map<string, number>()

// MessageId-based dedup of enqueue events. Remembers recently seen
// messageIds so a re-delivered user message (session restart, reconnect,
// JSONL rotation) is dropped even after Guard 2's one-shot marker has
// been consumed. Key = `base:messageId`, value = timestamp; entries
// expire after 60s.
const seenEnqueueMsgIds = new Map<string, number>()
922
-
923
/**
 * Reserve the next turn slot for chatId:threadId.
 *
 * Bumps the per-base counter in `baseTurnSeqs` and returns the unique
 * turnKey (`<base>:<seq>`) together with the 1-based seq, which is
 * reported as both `index` and `total`.
 */
function allocateTurnSlot(chatId: string, threadId?: string): { turnKey: string; index: number; total: number } {
  const base = baseKey(chatId, threadId)
  const lastAllocated = baseTurnSeqs.get(base) ?? 0
  const seq = lastAllocated + 1
  baseTurnSeqs.set(base, seq)
  const turnKey = `${base}:${seq}`
  return { turnKey, index: seq, total: seq }
}
930
-
931
// Routing state for the most recently enqueued turn. Non-enqueue session
// events (tool_use, tool_result, turn_end) arrive from the session-tail
// supervisor with chatIdMaybe=null; these let them reach the right card.
let currentChatId: string | null = null
let currentThreadId: string | undefined
/** Full turn key (chatId:threadId:seq) for the currently active turn. */
let currentTurnKey: string | null = null

// Handle of the running heartbeat interval, or null while stopped.
let heartbeatHandle: { ref: unknown } | null = null

// Throttle state for the inline TTL eviction of `seenEnqueueMsgIds` and
// `pendingSyncEchoes`. Eviction used to run inside the heartbeat tick,
// but the heartbeat stops once `chats.size === 0`, which let both maps
// grow without bound across idle periods. The inline path runs at the
// top of every public ingress (ingest / startTurn) and is limited to
// one sweep per `evictThrottleMs`, keeping the hot path effectively free.
let lastEvictedAt = 0
const evictThrottleMs = 30_000

// Last elapsed-time bucket emitted per turn. Lets the heartbeat coalesce:
// when the HTML is unchanged AND the header's elapsed counter (rounded
// to the heartbeat cadence) would render identically, the edit is
// skipped.
const lastHeartbeatBucket = new Map<string, number>()

// Fix #314: last sub-agent elapsed-tick bucket per turn. Same idea as
// `lastHeartbeatBucket`, but the bucket width is `subAgentTickIntervalMs`.
// When the bucket advances while at least one sub-agent is running, the
// heartbeat forces an emit even if the HTML hash did not change. Buckets
// (rather than raw timestamps) keep the comparison stable when several
// heartbeat ticks fire at the same `now()` value during a fake-clock
// advance in tests.
const lastSubAgentTickBucket = new Map<string, number>()
960
-
961
- /**
962
- * Describes `completeTurnFully` (defined below): fire completion callbacks + delete chatState + tidy bookkeeping.
963
- * Idempotent via `completionFired`. Does not touch the reducer or
964
- * flush — the caller is responsible for putting the state into its
965
- * final shape before invoking this.
966
- *
967
- * Shared by three completion paths:
968
- * - Normal turn_end with no in-flight sub-agents
969
- * - Deferred completion (last sub-agent finishes after parent turn_end)
970
- * - Abandonment (closeZombie for maxIdle / enqueue-force-close)
971
- */
972
/**
 * Decide silent-end suppression BEFORE the final flush.
 *
 * Every site that calls `completeTurnFully` must run this ahead of its
 * outer `flush(cs, true)`, so that render already knows whether the
 * "🙊 Ended without reply" header should be hidden. Setting the flag
 * later and re-flushing would leave a warning-card edit/send already
 * queued to Telegram; in the worst case (first edit finalizes before the
 * second arrives) the user sees the warning AND the corrected card as
 * two separate messages.
 *
 * Runs at most once per card: `silentEndPrepared` stops the
 * `onSilentEnd` callback (which writes a state file the Stop hook
 * reads) from firing again.
 */
function prepareSilentEndSuppression(cs: PerChatState): void {
  if (cs.silentEndPrepared) return
  cs.silentEndPrepared = true

  // #371: when stream_reply(done=true) is the final tool call, the Stop
  // hook can fire before session-tail has observed the matching tool_use
  // event. Looking only at replyToolCalled would then misread the turn as
  // silent, trigger the silent-end retry, and duplicate the reply.
  // outboundDeliveredCount is bumped synchronously by
  // recordOutboundDelivered() inside the stream_reply MCP handler as soon
  // as the API call succeeds, independent of the session-tail event, so
  // consulting it here closes that race.
  const somethingWasSent = cs.replyToolCalled || cs.outboundDeliveredCount !== 0
  const isSilentEnd = !somethingWasSent && !cs.wasAutonomous
  if (!isSilentEnd) return
  if (!config.onSilentEnd) return

  try {
    const outcome = config.onSilentEnd({
      chatId: cs.chatId,
      threadId: cs.threadId,
      turnKey: cs.turnKey,
    })
    if (outcome?.suppressed === true) cs.silentEndSuppressed = true
  } catch {
    /* never let the callback break the completion path */
  }
}
1013
-
1014
/**
 * Mark the parent turn as ended: stamp `parentTurnEndAt` with the
 * current time, then feed a `turn_end` event carrying `durationMs`
 * through the reducer.
 */
function beginTurnEnd(target: PerChatState, durationMs: number): void {
  target.parentTurnEndAt = now()
  const turnEnd = { kind: 'turn_end', durationMs } as const
  target.state = reduce(target.state, turnEnd, now())
}
1018
-
1019
/**
 * Final teardown for a card: fire the completion callbacks, cancel its
 * timers, and drop every piece of per-turn bookkeeping.
 *
 * Runs at most once per card (`completionFired`). It neither dispatches
 * to the reducer nor flushes; the caller must already have put the
 * state into its final shape.
 */
function completeTurnFully(cs: PerChatState): void {
  if (cs.completionFired) return
  cs.completionFired = true

  // Safety net: a caller that skipped prepareSilentEndSuppression before
  // its flush still gets the onSilentEnd callback (and the Stop hook its
  // state file). For callers that did run it, this is a no-op.
  prepareSilentEndSuppression(cs)

  const taskNum = taskNumFor(cs)
  const summary = summariseTurn(cs.state, now())

  if (config.onTurnEnd) {
    try {
      config.onTurnEnd(summary)
    } catch {
      /* never let a summary write break the stream */
    }
  }

  if (config.onTurnComplete) {
    process.stderr.write(`telegram gateway: progress-card: onTurnComplete firing turnKey=${cs.turnKey}\n`)
    emitCardEvent({
      agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
      chatId: cs.chatId ?? '',
      turnKey: cs.turnKey,
      event: 'finalized',
      reason: 'onTurnComplete',
    })
    try {
      config.onTurnComplete({
        chatId: cs.chatId,
        threadId: cs.threadId,
        turnKey: cs.turnKey,
        summary,
        taskIndex: taskNum.index,
        taskTotal: taskNum.total,
      })
    } catch {
      /* never let completion callback break the stream */
    }
  }

  // Cancel whatever timers the card still owns.
  if (cs.pendingTimer != null) {
    clearT(cs.pendingTimer)
    cs.pendingTimer = null
  }
  if (cs.deferredFirstEmitTimer != null) {
    clearT(cs.deferredFirstEmitTimer)
    cs.deferredFirstEmitTimer = null
  }
  if (cs.timePromoteTimer != null) {
    clearT(cs.timePromoteTimer)
    cs.timePromoteTimer = null
  }

  // Forget the turn in every per-turn map.
  const { turnKey } = cs
  chats.delete(turnKey)
  lastHeartbeatBucket.delete(turnKey)
  lastSubAgentTickBucket.delete(turnKey)
  editTimestamps.delete(turnKey)

  // Release the outer base-key entries when no other card shares the
  // base. Every close path ends up here, which keeps
  // `chatRunningSubagents` / `baseTurnSeqs` from growing without bound
  // across idle periods.
  cleanupBaseKeyIfUnused(baseKey(cs.chatId, cs.threadId), parseTurnSeq(turnKey))

  if (currentTurnKey === turnKey) {
    currentChatId = null
    currentThreadId = undefined
    currentTurnKey = null
  }
  if (chats.size === 0) stopHeartbeat()
}
1087
-
1088
/**
 * Post-ingest check for deferred completion. When the turn is waiting in
 * `pendingCompletion` and nothing is in flight any more, close the card.
 * Invoked after every reducer dispatch that could move a sub-agent to
 * done (sub_agent_turn_end, parent Agent tool_result, etc.).
 */
function maybeCompleteDeferredTurn(cs: PerChatState): void {
  if (!cs.pendingCompletion) return

  // Two gates keep the card pinned:
  //  1. ANY running sub-agent, correlated or orphan. Orphans created by
  //     `Agent({run_in_background:true})` only deregister through their
  //     own `sub_agent_turn_end`, and the user should keep seeing that
  //     background work until then. Closes #87. The historical ghost-pin
  //     risk (#31/#43) is bounded by `closeZombie` on new enqueue plus
  //     the `maxIdleMs` heartbeat ceiling.
  //  2. Fleet background members. A background sub-agent that has not
  //     emitted any event yet is missing from state.subAgents but is
  //     present in fleet with status:'background'; without this gate the
  //     deferred completion would fire at once and close the card.
  //     Fixes #713 and #709.
  if (hasAnyRunningSubAgent(cs.state) || hasLiveBackground(cs.fleet)) return

  process.stderr.write(`telegram gateway: progress-card: deferred completion firing turnKey=${cs.turnKey} (last sub-agent finished)\n`)
  emitCardEvent({
    agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
    chatId: cs.chatId ?? '',
    turnKey: cs.turnKey,
    event: 'force-completed',
    reason: 'deferred-completion: last sub-agent finished',
  })

  // Use the unified close path with the turn-end reason so the prelude
  // (silentEnd suppression, final flush, tail cleanup) is the same as at
  // every other completion site.
  closePerChat(cs, 'turn-end')
}
1122
-
1123
/**
 * Unified per-chat close path. Every site that finalises a card goes
 * through here so the prelude (timer cleanup, sub-agent force-close
 * where the reason demands it, silentEnd preparation, final flush) is
 * applied consistently. The cleanup tail (chats.delete, baseKey cleanup,
 * heartbeat-stop-if-last) lives in `completeTurnFully`, which runs at
 * the end of every reason path.
 *
 * Idempotent: once `completionFired` is set the call is a no-op, so a
 * card queued for closing by two paths is finalised (and its final
 * render emitted) only once.
 *
 * Reasons:
 *  - 'turn-end' : normal completion (parent turn_end fired with no
 *                 in-flight sub-agents, or the deferred-completion gate
 *                 cleared). Sub-agents are NOT force-closed because by
 *                 definition none are running.
 *  - 'zombie'   : abandonment (heartbeat maxIdle ceiling, or a new
 *                 enqueue force-closing the previous card). Synthesises
 *                 a turn_end and force-closes running sub-agents because
 *                 we are giving up on them. `pendingSyncEchoes` is left
 *                 alone because the echo for the previous turn may still
 *                 arrive.
 *  - 'stalled'  : Gap-8 deferred-completion timeout expired. Force-closes
 *                 running sub-agents and passes `stalledClose=true` to
 *                 flush so the renderer shows "⚠️ Stalled — forced close".
 *
 * Must not re-enter ingest.
 *
 * @param cs     Card to close.
 * @param reason Why the card is being closed (see list above).
 */
function closePerChat(cs: PerChatState, reason: CloseReason): void {
  // Already finalised by another path: do not emit a second final render
  // (which for 'stalled' would carry a misleading header).
  if (cs.completionFired) return

  // Clear the pending coalesce timer for every reason — the final render
  // is about to be emitted synchronously.
  if (cs.pendingTimer != null) {
    clearT(cs.pendingTimer)
    cs.pendingTimer = null
  }

  if (reason === 'zombie' || reason === 'stalled') {
    // zombie: no turn_end has been seen, so synthesise one now.
    // stalled: the reducer already saw turn_end (parentTurnEndAt is set).
    if (reason === 'zombie') {
      const durationMs = Math.max(0, now() - cs.state.turnStartedAt)
      cs.state = reduce(cs.state, { kind: 'turn_end', durationMs }, now())
    }
    // Explicitly close every running sub-agent so the final render
    // accounts for all work.
    if (hasAnyRunningSubAgent(cs.state)) {
      const prevStateForSync = cs.state
      const closed = new Map(cs.state.subAgents)
      const nowMs = now()
      for (const [k, sa] of closed) {
        if (sa.state === 'running') {
          closed.set(k, { ...sa, state: 'done', finishedAt: nowMs, pendingPreamble: null })
        }
      }
      cs.state = { ...cs.state, subAgents: closed }
      // Issue #399: sync the chat-scoped running-sub-agent registry so
      // stale entries don't carry into the next turn's progress card.
      syncChatRunningSubagents(
        prevStateForSync,
        cs.state,
        baseKey(cs.chatId, cs.threadId),
        chatRunningSubagents,
      )
    }
  }

  // Decide silentEndSuppressed BEFORE the final flush so the rendered
  // card already excludes the "🙊 Ended without reply" header when a
  // retry is queued. Otherwise the flush would queue a warning-card edit
  // and a follow-up correction edit could race with it.
  prepareSilentEndSuppression(cs)
  // zombie passes stalledClose=false — the card was abandoned but did
  // NOT exceed the deferred-completion timeout, so the stalled header
  // would mis-describe the close.
  flush(cs, /*forceDone*/ true, /*stalledClose*/ reason === 'stalled')
  completeTurnFully(cs)
  // Note: zombie deliberately preserves `pendingSyncEchoes` because the
  // echo for the closed turn may still arrive after close. The dedup
  // map's TTL eviction (maybeEvict) reaps it eventually.
}
1200
-
1201
/**
 * Close an abandoned card. Thin, backwards-compatible wrapper over
 * `closePerChat(cs, 'zombie')`, kept so call sites can say "close the
 * zombie" without knowing the close-reason taxonomy.
 */
function closeZombie(cs: PerChatState): void {
  const reason: CloseReason = 'zombie'
  closePerChat(cs, reason)
}
1209
-
1210
/**
 * Sweep expired entries out of the messageId-dedup map and the sync-echo
 * marker map. TTLs are the same as in the (now-removed) heartbeat
 * eviction:
 *   - `seenEnqueueMsgIds`: 60s (matches the dedup window in `ingest`).
 *   - `pendingSyncEchoes`: 30s (matches the consumer in `ingest`).
 * An entry is dropped once its timestamp is at or before `nowMs - ttl`.
 */
function evictStaleDedup(nowMs: number): void {
  const sweep = (entries: Map<string, number>, ttlMs: number): void => {
    const cutoff = nowMs - ttlMs
    for (const [key, stampedAt] of entries) {
      if (stampedAt <= cutoff) entries.delete(key)
    }
  }
  sweep(seenEnqueueMsgIds, 60_000)
  sweep(pendingSyncEchoes, 30_000)
}
1226
-
1227
/**
 * Rate-limited front door for `evictStaleDedup`. When a sweep is not yet
 * due this costs one subtraction and one compare. It is called at the
 * top of every public ingress, so eviction happens whether or not any
 * chats are currently live.
 */
function maybeEvict(nowMs: number): void {
  const sinceLastSweep = nowMs - lastEvictedAt
  if (sinceLastSweep < evictThrottleMs) return
  lastEvictedAt = nowMs
  evictStaleDedup(nowMs)
}
1237
-
1238
/**
 * Best-effort cleanup of the outer base-key maps, called once a chat has
 * been removed from `chats`. Does nothing while any surviving card still
 * shares `base`. Otherwise only drops what is *safe* to drop:
 *
 *   - `chatRunningSubagents[base]`: removed only when the inner map is
 *     missing or empty. Background sub-agents intentionally outlive
 *     their parent turn (cross-turn carry-over for
 *     `Agent({run_in_background:true})`), so a non-empty inner map is
 *     never dropped — that would erase the next turn's seed list.
 *
 *   - `baseTurnSeqs[base]`: removed unless the stored seq is already
 *     greater than `closingTurnSeq`. That situation arises on the
 *     new-enqueue path, which runs `allocateTurnSlot` BEFORE
 *     `closeZombie(old)` and BEFORE the new PerChatState is registered
 *     in `chats`, so the new turn is invisible to the surviving-card
 *     scan. Deleting the counter then would restart numbering at 1 and
 *     cause turnKey collisions with the still-live new turn.
 *
 * @param base           baseKey(chatId, threadId) of the closed card.
 * @param closingTurnSeq Seq parsed from the closed card's turnKey, if any.
 */
function cleanupBaseKeyIfUnused(base: string, closingTurnSeq?: number): void {
  const baseStillLive = [...chats.values()].some(
    (cs) => baseKey(cs.chatId, cs.threadId) === base,
  )
  if (baseStillLive) return

  const runningForBase = chatRunningSubagents.get(base)
  if ((runningForBase?.size ?? 0) === 0) {
    chatRunningSubagents.delete(base)
  }

  const allocatedSeq = baseTurnSeqs.get(base)
  const newerTurnAllocated =
    allocatedSeq != null
    && closingTurnSeq != null
    && allocatedSeq > closingTurnSeq
  if (!newerTurnAllocated) baseTurnSeqs.delete(base)
}
1287
-
1288
/**
 * Parse the trailing `:N` turn sequence from a turnKey.
 *
 * Only a suffix made of plain decimal digits counts. Going through
 * `Number()` alone would accept an empty suffix (`"chat:"` → 0) as well
 * as forms such as `"1e3"`, `"0x10"`, `" 5"` or `"1.5"`, none of which
 * `allocateTurnSlot` ever produces.
 *
 * @param turnKey Key of the form `<base>:<seq>`.
 * @returns The sequence number, or undefined when the key has no `:` or
 *          the suffix is not a safe non-negative integer.
 */
function parseTurnSeq(turnKey: string): number | undefined {
  const idx = turnKey.lastIndexOf(':')
  if (idx < 0) return undefined
  const suffix = turnKey.slice(idx + 1)
  if (!/^\d+$/.test(suffix)) return undefined
  const n = Number(suffix)
  return Number.isSafeInteger(n) ? n : undefined
}
1295
-
1296
/**
 * Start the shared heartbeat interval when it is enabled
 * (`heartbeatMs > 0`), not already running, and at least one card is
 * live.
 *
 * Each tick scans every live card and:
 *   - escalates stuck fleet members (always, before any skip gate);
 *   - queues zombies (idle past `maxIdleMs`), orphan promotions (Gap 3),
 *     cold sub-agents (Gap 4) and stalled cards (Gap 8) for handling
 *     after the scan, so `chats` is never mutated while it is iterated;
 *   - re-renders the card so elapsed times tick, coalescing edits that
 *     would not visibly change anything.
 *
 * The post-scan passes skip any card that an earlier pass in the same
 * tick already completed, so a card that is both cold-synth completed
 * and past the deferred-completion timeout is closed only once.
 *
 * The interval stops itself once no cards remain.
 */
function startHeartbeatIfNeeded(): void {
  if (heartbeatMs <= 0) return
  if (heartbeatHandle != null) return
  if (chats.size === 0) return
  heartbeatHandle = setI(() => {
    // Force a re-render for any chat with an open turn so the header
    // elapsed time and per-item `(dur)` tick visibly — even when no
    // session-JSONL events have arrived for a while (common while a
    // sub-agent is running). Coalesce: only emit if either the rendered
    // HTML changed or the elapsed-time bucket (rounded to the heartbeat
    // period) advanced.
    //
    // Zombie ceiling: cards whose last real event is older than maxIdleMs
    // are collected here and force-closed after the iteration.
    const zombies: PerChatState[] = []
    // Gap 3: cards with a PendingAgentSpawn that needs orphan promotion.
    const orphanPromotions: PerChatState[] = []
    // Gap 4: running sub-agents whose JSONL watcher appears cold.
    const coldSubAgents: Array<{ cs: PerChatState; agentId: string }> = []
    // Gap 8: cards where the deferred-completion timeout has expired.
    const stalledCards: PerChatState[] = []

    for (const [, cs] of chats) {
      // P3 of #662 — per-member stuck escalation runs FIRST, before any
      // skip gate. It only updates the fleet map, so it must happen even
      // when the chat is in the initial-delay window or budget-hot.
      // Running it every tick is cheap: the map is only written when
      // fleetMarkStuck returns a different object.
      {
        const fleet = cs.fleet
        if (fleet.size > 0) {
          const tNow = now()
          for (const [agentId, m] of fleet) {
            const next = fleetMarkStuck(m, tNow, 60_000)
            if (next !== m) fleet.set(agentId, next)
          }
        }
      }

      // Skip only when TRULY done. During the deferred-completion window
      // (parent turn_end fired but sub-agents are still running) the
      // reducer stage is 'done' while the card is still alive; keeping
      // the heartbeat ticking lets per-row elapsed times advance.
      if (cs.state.stage === 'done' && !hasAnyRunningSubAgent(cs.state)) continue
      // Terminal cards: the Telegram message is gone (deleted / bot
      // blocked). No edits should be attempted.
      if (cs.apiFailures.terminal) continue
      // Don't heartbeat a card that's still in the initial delay window.
      if (cs.isFirstEmit && cs.deferredFirstEmitTimer !== DELAY_ELAPSED) continue
      if (maxIdleMs > 0 && now() - cs.lastEventAt > maxIdleMs) {
        zombies.push(cs)
        continue
      }

      // Gap 3 — orphan promotion: a PendingAgentSpawn that has waited
      // longer than orphanPromotionMs without a matching
      // sub_agent_started gets promoted to a synthesised sub-agent row
      // so the work is at least visible on the card.
      if (orphanPromotionMs > 0 && cs.state.pendingAgentSpawns.size > 0) {
        for (const [toolUseId, pending] of cs.state.pendingAgentSpawns) {
          if (!cs.promotedSpawnIds.has(toolUseId) && now() - pending.startedAt >= orphanPromotionMs) {
            orphanPromotions.push(cs)
            break
          }
        }
      }

      // Gap 4 — cold-JSONL detection: a running sub-agent that hasn't
      // emitted an event for coldSubAgentThresholdMs gets a synthesised
      // sub_agent_turn_end so the deferred-completion path can proceed.
      if (coldSubAgentThresholdMs > 0 && cs.pendingCompletion) {
        for (const [agentId, sa] of cs.state.subAgents) {
          if (sa.state === 'running' && sa.lastEventAt != null && now() - sa.lastEventAt >= coldSubAgentThresholdMs) {
            coldSubAgents.push({ cs, agentId })
          }
        }
      }

      // Gap 8 — deferred-completion timeout: the parent turn_end fired
      // but sub-agents never finished within deferredCompletionTimeoutMs;
      // force-close with a "stalled" header.
      if (
        deferredCompletionTimeoutMs > 0
        && cs.parentTurnEndAt != null
        && now() - cs.parentTurnEndAt >= deferredCompletionTimeoutMs
      ) {
        stalledCards.push(cs)
        continue
      }

      // Fix #314 — elapsed-ticker bucket, computed BEFORE the budget-hot
      // skip so the ticker can override it. A bursty sub-agent makes the
      // chat hot, which suppresses the heartbeat, but the user still
      // expects elapsed time to advance; the ticker is a hard floor of
      // one render per `subAgentTickIntervalMs`.
      const subAgentRunning = subAgentTickIntervalMs > 0 && hasAnyRunningSubAgent(cs.state)
      const subAgentBucket = subAgentTickIntervalMs > 0 ? Math.floor(now() / subAgentTickIntervalMs) : 0
      const prevSubAgentBucket = lastSubAgentTickBucket.get(cs.turnKey)
      const elapsedTickDue = subAgentRunning && subAgentBucket !== prevSubAgentBucket

      // Skip the heartbeat while the chat is hot (Design §4.4: "heartbeat
      // respects budget too") — EXCEPT when the elapsed-ticker is due.
      if (isBudgetHot(cs.turnKey) && !elapsedTickDue) continue
      if (elapsedTickDue) {
        lastSubAgentTickBucket.set(cs.turnKey, subAgentBucket)
      }

      const stuckMs = Math.max(0, now() - cs.lastEventAt)
      // Issue #132: silentEnd only matters once the parent turn is done
      // and no sub-agents are running; the renderer applies that gate, so
      // the unconditional flag is safe to pass here.
      // Issue #259: suppressed for autonomous wakeup turns.
      // silentEndSuppressed: set when a retry is queued so the heartbeat
      // renders "✅ Done" instead of "🙊 Ended without reply".
      const silentEnd = !cs.replyToolCalled && !cs.wasAutonomous && !cs.silentEndSuppressed
      // Issue #137: the agent called reply/stream_reply but no outbound
      // was ever recorded as delivered for this card.
      const replyNotDelivered = cs.replyToolCalled && cs.outboundDeliveredCount === 0
      // Gap 8: tell the renderer the parent is done during the
      // deferred-unpin window.
      const parentDone = cs.parentTurnEndAt != null && hasAnyRunningSubAgent(cs.state)
      const html = render(cs.state, now(), undefined, { stuckMs, silentEnd, replyNotDelivered, parentDone }, undefined, cs.fleet)
      const bucket = Math.floor(now() / heartbeatMs)
      const prevBucket = lastHeartbeatBucket.get(cs.turnKey)

      // Fix #314 — when the elapsed-ticker is due, push the emit through
      // even if both the HTML and the heartbeat bucket are unchanged.
      if (html === cs.lastEmittedHtml && bucket === prevBucket && !elapsedTickDue) continue

      lastHeartbeatBucket.set(cs.turnKey, bucket)
      cs.lastEmittedHtml = html
      cs.lastEmittedAt = now()
      recordEdit(cs.turnKey)
      config.emit({
        chatId: cs.chatId,
        threadId: cs.threadId,
        turnKey: cs.turnKey,
        html,
        done: false,
        isFirstEmit: false,
      })
    }

    for (const cs of zombies) closeZombie(cs)

    // Gap 3: promote stale PendingAgentSpawns to synthetic sub-agent rows.
    for (const cs of orphanPromotions) {
      for (const [toolUseId, pending] of cs.state.pendingAgentSpawns) {
        if (cs.promotedSpawnIds.has(toolUseId)) continue
        if (now() - pending.startedAt < orphanPromotionMs) continue
        cs.promotedSpawnIds.add(toolUseId)
        const syntheticId = `orphan-${toolUseId}`
        process.stderr.write(
          `telegram gateway: progress-card: orphan-promotion toolUseId=${toolUseId} syntheticId=${syntheticId} description="${pending.description}" (Gap 3 #313)\n`,
        )
        // Synthesise a sub_agent_started event so the reducer's existing
        // sub_agent_started path handles the promotion.
        cs.state = reduce(cs.state, {
          kind: 'sub_agent_started',
          agentId: syntheticId,
          firstPromptText: pending.promptText,
        }, now())
        cs.lastEventAt = now()
        flush(cs, false)
      }
    }

    // Gap 4: synthesise sub_agent_turn_end for cold-JSONL sub-agents.
    for (const { cs, agentId } of coldSubAgents) {
      // The card may already have been completed earlier in this tick.
      if (cs.completionFired) continue
      process.stderr.write(
        `telegram gateway: progress-card: cold-jsonl-synth-turn-end agentId=${agentId} turnKey=${cs.turnKey} (Gap 4 #313)\n`,
      )
      const prevStateGap4 = cs.state
      cs.state = reduce(cs.state, { kind: 'sub_agent_turn_end', agentId }, now())
      // Issue #399: sync the chat-scoped running-sub-agent registry so
      // the cold-synth terminal transition doesn't leave a stale entry
      // that would carry over into the next turn's progress card.
      syncChatRunningSubagents(
        prevStateGap4,
        cs.state,
        baseKey(cs.chatId, cs.threadId),
        chatRunningSubagents,
      )
      cs.lastEventAt = now()
      maybeCompleteDeferredTurn(cs)
      if (!cs.completionFired) flush(cs, false)
    }

    // Gap 8: force-close cards whose deferred-completion timeout has
    // expired. `closePerChat('stalled')` applies the shared prelude and
    // renders the "⚠️ Stalled — forced close" header via stalledClose=true.
    for (const cs of stalledCards) {
      // A card the cold-JSONL pass just completed normally must not be
      // closed again as stalled.
      if (cs.completionFired) continue
      process.stderr.write(
        `telegram gateway: progress-card: deferred-completion-timeout-expired turnKey=${cs.turnKey} deferredCompletionTimeoutMs=${deferredCompletionTimeoutMs} (Gap 8 #313)\n`,
      )
      closePerChat(cs, 'stalled')
    }

    // Dedup-map TTL eviction lives in `maybeEvict` (called from every
    // public ingress), not here: the heartbeat stops when
    // `chats.size === 0`, which would let `seenEnqueueMsgIds` /
    // `pendingSyncEchoes` grow unbounded across idle periods.
    //
    // If every chat has ended, stop the heartbeat to avoid an always-on
    // timer.
    if (chats.size === 0) stopHeartbeat()
  }, heartbeatMs)
}
1521
-
1522
/** Cancel the heartbeat interval, if one is running, and forget its handle. */
function stopHeartbeat(): void {
  if (heartbeatHandle != null) {
    clearI(heartbeatHandle)
    heartbeatHandle = null
  }
}
1527
-
1528
/**
 * Key identifying a chat (or a thread inside a chat) without any turn
 * seq: `chatId:threadId` when a thread id is given, otherwise just
 * `chatId`. Turn keys use it as their prefix.
 */
function baseKey(chatId: string, threadId?: string): string {
  if (threadId == null) return chatId
  return `${chatId}:${threadId}`
}
1532
-
1533
/**
 * Compute the N/M task counter for a card from the cards that are
 * ACTIVE right now for the same chat:thread, not from the
 * session-cumulative `baseTurnSeqs` counter. With the cumulative counter
 * eleven sequential turns would show "(11/11)", which reads as "task 11
 * of 11" instead of conveying parallel concurrency. The suffix is only
 * meaningful when 2+ cards are active at once; for sequential turns the
 * total is 1.
 *
 * `index` is the card's position among the active cards sharing its
 * base, in `chats` iteration order; it stays 1 if the card itself is
 * not found in `chats`.
 */
function taskNumFor(chatState: PerChatState): TaskNum {
  const base = baseKey(chatState.chatId, chatState.threadId)
  let total = 0
  let index = 1
  for (const cs of chats.values()) {
    if (baseKey(cs.chatId, cs.threadId) !== base) continue
    total += 1
    if (cs.turnKey === chatState.turnKey) index = total
  }
  return { index, total }
}
1557
-
1558
/**
 * Sentinel stored in `deferredFirstEmitTimer` once the initial-delay
 * window is over (either the timer fired or the card was promoted early).
 */
const DELAY_ELAPSED = 'elapsed'

/**
 * Render the card for `chatState` and hand it to `config.emit`, unless the
 * card is suppressed or nothing changed.
 *
 * @param chatState    Per-turn card state to render.
 * @param forceDone    Treat this flush as the terminal one; bypasses the
 *                     "HTML unchanged" short-circuit.
 * @param stalledClose Forwarded to the renderer as the `stalledClose` flag.
 */
function flush(chatState: PerChatState, forceDone: boolean, stalledClose = false): void {
  // Two hard stops:
  //  - the card hit the permanent-failure threshold, so further edits would
  //    only produce log spam (deleted message / blocked bot);
  //  - an external owner took the card over (#654) and the driver must not
  //    edit it again.
  if (chatState.apiFailures.terminal || chatState.cardTakenOver) return

  const effectiveDelayMs = chatState.effectiveInitialDelayMs
  if (
    chatState.isFirstEmit
    && effectiveDelayMs > 0
    && chatState.deferredFirstEmitTimer !== DELAY_ELAPSED
  ) {
    // Still inside the initial-delay window: nothing has been shown yet.
    const turnOver = forceDone || chatState.state.stage === 'done'
    if (turnOver) {
      // The turn finished before the card was ever displayed — never show it.
      if (chatState.deferredFirstEmitTimer != null) {
        clearT(chatState.deferredFirstEmitTimer)
        chatState.deferredFirstEmitTimer = null
      }
      process.stderr.write(`telegram gateway: progress-card: fast-turn suppression turnKey=${chatState.turnKey} (turn ended before initialDelayMs=${effectiveDelayMs}ms)\n`)
      emitCardEvent({
        agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
        chatId: chatState.chatId ?? '',
        turnKey: chatState.turnKey,
        event: 'suppressed',
        reason: `fast-turn: ended before initialDelayMs=${effectiveDelayMs}`,
      })
      return
    }

    // A first-emit timer is already pending; let it fire.
    if (chatState.deferredFirstEmitTimer != null) return

    // Arm the first-emit timer. The budget is read from
    // `effectiveInitialDelayMs` on every flush, so a lowered budget (#842
    // background-dispatch bypass) applies on the very next flush. The delay
    // is measured from turn start, so repeated flushes during the window
    // cannot push the first emit back.
    const capturedTurnKey = chatState.turnKey
    const turnStartedAt = chatState.state.turnStartedAt
    let elapsed = 0
    if (turnStartedAt > 0) {
      elapsed = Math.max(0, now() - turnStartedAt)
    }
    const remaining = Math.max(0, effectiveDelayMs - elapsed)
    process.stderr.write(`telegram gateway: progress-card: scheduled initial-delay timer turnKey=${capturedTurnKey} delay=${remaining}ms budget=${effectiveDelayMs}ms\n`)
    chatState.deferredFirstEmitTimer = setT(() => {
      // The turn may have been removed while the timer was pending.
      if (!chats.has(capturedTurnKey)) return
      chatState.deferredFirstEmitTimer = DELAY_ELAPSED
      process.stderr.write(`telegram gateway: progress-card: initial-delay timer fired turnKey=${capturedTurnKey}\n`)
      flush(chatState, false)
    }, remaining)
    return
  }

  const taskNum = taskNumFor(chatState)
  const stuckMs = Math.max(0, now() - chatState.lastEventAt)

  // Silent-end warning: no reply tool call was made. It is switched off for
  // autonomous wakeup turns (#259), which never reply by design, and when
  // `silentEndSuppressed` has been set so the final card re-renders without
  // the warning.
  const silentEnd = !(
    chatState.replyToolCalled
    || chatState.wasAutonomous
    || chatState.silentEndSuppressed
  )
  const replyNotDelivered =
    chatState.replyToolCalled && chatState.outboundDeliveredCount === 0
  // Gap 8: parent turn_end has landed but sub-agents are still running —
  // the parent header shows Done right away.
  const parentDone =
    chatState.parentTurnEndAt != null && hasAnyRunningSubAgent(chatState.state)

  // The N/M counter is only passed when more than one card is active.
  const shownTaskNum = taskNum.total > 1 ? taskNum : undefined
  const html = render(
    chatState.state,
    now(),
    shownTaskNum,
    { stuckMs, silentEnd, replyNotDelivered, parentDone, stalledClose },
    undefined,
    chatState.fleet,
  )

  // Issue #81 diagnostic: record which checklist branch the renderer takes
  // (narratives are preferred over raw tool-count items). Logged on the
  // first emit and on every forced-done flush.
  if (forceDone || chatState.lastEmittedHtml == null) {
    const s = chatState.state
    let branch = 'empty'
    if (s.narratives.length > 0) {
      branch = 'narratives'
    } else if (s.items.length > 0) {
      branch = 'tool-count-fallback'
    }
    process.stderr.write(
      `progress-card.diag: render branch=${branch} chatId=${chatState.chatId} turnKey=${chatState.turnKey} ` +
        `narratives=${s.narratives.length} items=${s.items.length} latestText_len=${s.latestText?.length ?? 0} ` +
        `subagents=${s.subAgents.size} pendingPreamble=${s.pendingPreamble ? 'yes' : 'no'} forceDone=${forceDone}\n`,
    )
  }

  // Identical HTML is not re-sent unless this is a forced terminal flush.
  if (!forceDone && html === chatState.lastEmittedHtml) return

  chatState.lastEmittedHtml = html
  chatState.lastEmittedAt = now()
  recordEdit(chatState.turnKey)
  const isFirst = chatState.isFirstEmit
  chatState.isFirstEmit = false

  // Notification-spam fix (2026-04-23): `done` is reported only once no
  // sub-agent is still running. The reducer flips stage to 'done' as soon as
  // the parent turn_end lands; reporting done=true on each later sub-agent
  // event made the consumer finalize its draft and send a brand-new message
  // (a new push notification) every time. Orphans are included so done stays
  // suppressed while a background dispatch is active (#87).
  const terminal =
    (forceDone || chatState.state.stage === 'done')
    && !hasAnyRunningSubAgent(chatState.state)

  // The source message id rides along on the first emit only, so the caller
  // can use it as reply_parameters on the initial send; edits must not
  // carry it.
  const replyRef =
    isFirst && chatState.replyToMessageId != null
      ? { replyToMessageId: chatState.replyToMessageId }
      : {}
  config.emit({
    chatId: chatState.chatId,
    threadId: chatState.threadId,
    turnKey: chatState.turnKey,
    html,
    done: terminal,
    isFirstEmit: isFirst,
    ...replyRef,
  })

  // #card-audit-log: structured lifecycle record, grep-able by turnKey.
  emitCardEvent({
    agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
    chatId: chatState.chatId ?? '',
    turnKey: chatState.turnKey,
    event: isFirst ? 'rendered' : (terminal ? 'finalized' : 'edited'),
    reason: terminal ? 'flush-terminal' : (isFirst ? 'flush-first' : 'flush-edit'),
    htmlHash: html.length > 0
      ? createHash('sha1').update(html).digest('hex').slice(0, 12)
      : undefined,
  })
}
1711
-
1712
/**
 * Pull a card out of the initial-delay suppression window ahead of time
 * and render it immediately.
 *
 * Idempotent: returns without doing anything when the card has already
 * emitted, the delay window is already marked elapsed, or the card is in
 * the terminal API-failure state.
 *
 * Otherwise it cancels any pending deferred first-emit timer (so the
 * original delay clock cannot trigger a second emit), cancels the
 * time-based promotion timer, marks the window as elapsed, logs the
 * promotion and calls `flush()` so the card surfaces right away.
 *
 * Callers:
 *   - the sub-agent state diff in `ingest()` when a sub-agent starts
 *     running during the suppression window
 *   - the enqueue branch when carried-over running sub-agents seed the
 *     fresh PerChatState (#334 cross-turn carry-over)
 *   - `onSubAgentStall()` when a stalled sub-agent is reported before the
 *     card has emitted
 *
 * @param cs     Card to promote.
 * @param reason Short tag written to the log line.
 */
function promoteFirstEmit(cs: PerChatState, reason: string): void {
  if (
    !cs.isFirstEmit
    || cs.deferredFirstEmitTimer === DELAY_ELAPSED
    || cs.apiFailures.terminal
  ) {
    return
  }

  const pendingFirstEmit = cs.deferredFirstEmitTimer
  if (pendingFirstEmit != null) {
    clearT(pendingFirstEmit)
  }
  if (cs.timePromoteTimer != null) {
    clearT(cs.timePromoteTimer)
    cs.timePromoteTimer = null
  }

  // From here on flush() skips its suppression branch.
  cs.deferredFirstEmitTimer = DELAY_ELAPSED
  process.stderr.write(
    `telegram gateway: progress-card: promoteFirstEmit turnKey=${cs.turnKey} reason=${reason}\n`,
  )
  flush(cs, false)
}
1748
-
1749
/**
 * F3 fix (#553): arm a one-shot timer that force-promotes the card once
 * `promoteAfterMs` has passed since turn start, unless another promotion
 * path gets there first.
 *
 * Idempotent — returns without scheduling when the card has already
 * emitted, the delay window is already elapsed, the card is in the
 * terminal API-failure state, a promotion timer is already pending, or
 * time-based promotion is disabled (`promoteAfterMs <= 0`).
 * `promoteFirstEmit` clears the timer, so other promotion paths still win
 * when they fire first.
 *
 * Without this timer a long single-tool turn (e.g. one 10s Bash call)
 * crosses no other promotion threshold, and the card stays hidden until
 * `initialDelayMs` runs out.
 *
 * The elapsed time is computed the same way `flush()` computes it for the
 * initial-delay timer: an unset (non-positive) `turnStartedAt` counts as
 * "no time elapsed", and the value is clamped to be non-negative. Without
 * the guard, an unset start time would make `now() - 0` exceed any budget
 * and promote the card immediately.
 *
 * @param cs Card for which the promotion timer should be armed.
 */
function ensureTimePromoteScheduled(cs: PerChatState): void {
  if (!cs.isFirstEmit) return
  if (cs.deferredFirstEmitTimer === DELAY_ELAPSED) return
  if (cs.apiFailures.terminal) return
  if (cs.timePromoteTimer != null) return
  if (promoteAfterMs <= 0) return

  const turnStartedAt = cs.state.turnStartedAt
  const elapsed = turnStartedAt > 0 ? Math.max(0, now() - turnStartedAt) : 0
  const remaining = Math.max(0, promoteAfterMs - elapsed)
  const capturedTurnKey = cs.turnKey

  cs.timePromoteTimer = setT(() => {
    // Look the card up again: the turn may have been removed while the
    // timer was pending.
    const live = chats.get(capturedTurnKey)
    if (live == null) return
    live.timePromoteTimer = null
    // promoteFirstEmit is a no-op if another path promoted the card between
    // scheduling and firing.
    promoteFirstEmit(live, `time_${promoteAfterMs}ms`)
  }, remaining)
}
1779
-
1780
/**
 * True if `a` and `b` differ in any field that reaches the rendered card:
 * stage, userRequest, latestText, narratives, items and sub-agents.
 * Bookkeeping fields that never reach render() (e.g. `thinking`) are
 * deliberately ignored so no Telegram edit is wasted on them.
 *
 * Narratives are part of the comparison because the card's checklist uses
 * them in preference to the raw tool items, so a narrative appearing or
 * disappearing changes the rendered output even when the items are
 * untouched. Only the count is compared.
 * NOTE(review): narrative entries are not compared element-wise because
 * their shape is not visible from this function — confirm whether entries
 * can change in place.
 *
 * @param a Previous card state.
 * @param b Next card state.
 * @returns `true` when a re-render could produce different output.
 */
function visibleDiff(a: ProgressCardState, b: ProgressCardState): boolean {
  if (a.stage !== b.stage) return true
  if (a.userRequest !== b.userRequest) return true
  if (a.latestText !== b.latestText) return true
  if (a.narratives.length !== b.narratives.length) return true

  if (a.items.length !== b.items.length) return true
  for (let i = 0; i < a.items.length; i++) {
    const itemA = a.items[i]
    const itemB = b.items[i]
    if (itemA.state !== itemB.state) return true
    if (itemA.tool !== itemB.tool) return true
    // Multi-agent: the spawnedAgentId attached on correlation decides the
    // glyph shown on the [Main] line.
    if (itemA.spawnedAgentId !== itemB.spawnedAgentId) return true
  }

  // Multi-agent: any change in the set of sub-agents or in a sub-agent's
  // own state is user-visible. One pass over `a` suffices because the
  // sizes are equal at this point.
  if (a.subAgents.size !== b.subAgents.size) return true
  for (const [agentId, sa] of a.subAgents) {
    const sb = b.subAgents.get(agentId)
    if (!sb) return true
    if (sa.state !== sb.state) return true
    if (sa.toolCount !== sb.toolCount) return true
    if (sa.description !== sb.description) return true
    if (sa.parentToolUseId !== sb.parentToolUseId) return true
    if (sa.nestedSpawnCount !== sb.nestedSpawnCount) return true
    if ((sa.currentTool?.toolUseId ?? null) !== (sb.currentTool?.toolUseId ?? null)) return true
    if (sa.currentNarrative !== sb.currentNarrative) return true
  }
  return false
}
1816
-
1817
/**
 * P0 of #662 — keep the shadow fleet map in step with a session event.
 *
 * `cs.fleet` is updated by replacing entries with the FleetMember objects
 * returned by the fleet transition helpers. Event kinds not listed below
 * are ignored.
 *
 *   - tool_use:               remember Agent/Task dispatches flagged
 *                             `run_in_background` (P2 of #662), keyed by
 *                             tool-use id
 *   - sub_agent_started:      create the fleet member (first event wins)
 *   - sub_agent_tool_use / sub_agent_tool_result / sub_agent_turn_end:
 *                             apply the matching transition to a known member
 *   - sub_agent_capped:       mark the member capped and mirror a turn end
 *                             into the legacy reducer state
 *
 * @param cs    Card state whose fleet is updated.
 * @param event The session event being ingested.
 */
function updateFleetForEvent(cs: PerChatState, event: SessionEvent): void {
  if (event.kind === 'tool_use') {
    // Record the background flag under the dispatching tool-use id so the
    // matching sub_agent_started can look it up via its parentToolUseId.
    const spawnsAgent = event.toolName === 'Agent' || event.toolName === 'Task'
    if (spawnsAgent && event.toolUseId && event.input?.run_in_background === true) {
      cs.backgroundParentToolUseIds.add(event.toolUseId)
    }
    return
  }

  if (event.kind === 'sub_agent_started') {
    // Late duplicates keep the original startedAt / originatingTurnKey.
    if (cs.fleet.has(event.agentId)) return
    const role = roleFromDispatch(undefined, event.subagentType, event.firstPromptText)
    // P2: the background status is derived from the parent dispatch. The
    // parentToolUseId is read back from the legacy sub-agent state.
    const legacyEntry = cs.state.subAgents.get(event.agentId)
    const parentToolUseId = legacyEntry?.parentToolUseId ?? null
    const isBackground =
      parentToolUseId != null && cs.backgroundParentToolUseIds.has(parentToolUseId)
    const member = createFleetMember({
      agentId: event.agentId,
      role,
      startedAt: now(),
      originatingTurnKey: currentTurnKey ?? cs.turnKey,
      isBackgroundDispatch: isBackground,
    })
    if (isBackground) {
      cs.fleet.set(event.agentId, { ...member, status: 'background' })
    } else {
      cs.fleet.set(event.agentId, member)
    }
    return
  }

  if (event.kind === 'sub_agent_tool_use') {
    const known = cs.fleet.get(event.agentId)
    if (known != null) {
      cs.fleet.set(event.agentId, fleetApplyToolUse(known, event.toolName, event.input, now()))
    }
    return
  }

  if (event.kind === 'sub_agent_tool_result') {
    const known = cs.fleet.get(event.agentId)
    if (known != null) {
      cs.fleet.set(event.agentId, fleetApplyToolResult(known, event.isError))
    }
    return
  }

  if (event.kind === 'sub_agent_turn_end') {
    const known = cs.fleet.get(event.agentId)
    if (known != null) {
      cs.fleet.set(event.agentId, fleetApplyTurnEnd(known, now()))
    }
    return
  }

  if (event.kind === 'sub_agent_capped') {
    // The sub-agent transcript was truncated mid-flight (tool_uses at or
    // above the threshold with no terminal record). Move the member to
    // `capped` so the card shows a terminal row instead of "running" forever.
    const known = cs.fleet.get(event.agentId)
    if (known != null) {
      cs.fleet.set(event.agentId, fleetApplyCapped(known, now()))
    }
    // Feed a turn end through the legacy reducer as well, so render() sees
    // the agent as done — this runs even when the fleet has no such member.
    cs.state = reduce(cs.state, { kind: 'sub_agent_turn_end', agentId: event.agentId }, now())
    return
  }
}
1896
-
1897
/**
 * Cardinality reconciler between the legacy `state.subAgents` map and the
 * shadow `fleet` map.
 *
 * The legacy map can gain entries through paths the fleet never sees
 * (parent Agent tool_use synthesised correlations, heartbeat orphan
 * promotions, cross-turn carry-over). Each such entry gets a synthetic
 * fleet member so `fleet` stays a superset-or-equal of `subAgents` by key.
 * Fleet entries the legacy map no longer tracks are then removed.
 *
 * NOTE(review): the `sub_agent_started` path in `updateFleetForEvent`
 * additionally overrides `status` to 'background' for background
 * dispatches; this path relies on `createFleetMember` alone — confirm that
 * difference is intended.
 *
 * @param cs Card state whose fleet is reconciled in place.
 */
function reconcileFleetWithSubAgents(cs: PerChatState): void {
  const legacy = cs.state.subAgents

  for (const [agentId, sa] of legacy) {
    if (cs.fleet.has(agentId)) continue
    // P0 follow-up (#662 reviewer items 1+2): reuse the legacy `startedAt`
    // when it is set, so the synthesised entry does not restart the clock
    // and hide a stuck sub-agent. `originatingTurnKey` has no legacy
    // counterpart, so the current/active turn is used.
    let startedAt = sa.startedAt
    if (!(startedAt > 0)) {
      startedAt = now()
    }
    const isBg =
      sa.parentToolUseId != null
      && cs.backgroundParentToolUseIds.has(sa.parentToolUseId)
    const synthetic = createFleetMember({
      agentId,
      role: sa.description ?? 'agent',
      startedAt,
      originatingTurnKey: currentTurnKey ?? cs.turnKey,
      isBackgroundDispatch: isBg,
    })
    cs.fleet.set(agentId, synthetic)
  }

  // Rare: a parent tool_result correlation pruned a sub-agent before any
  // sub_agent_turn_end arrived. The keys are snapshotted first so entries
  // can be deleted while walking them.
  for (const agentId of Array.from(cs.fleet.keys())) {
    if (!legacy.has(agentId)) {
      cs.fleet.delete(agentId)
    }
  }
}
1935
-
1936
- return {
1937
- ingest(event, chatIdMaybe, threadId) {
1938
- // Throttled inline TTL sweep — see `maybeEvict` for rationale.
1939
- maybeEvict(now())
1940
- // An `enqueue` event carries its own chatId (extracted from the XML
1941
- // channel wrapper). Everything else falls back to the caller-provided
1942
- // chatIdMaybe, which the session-tail supervisor tracks.
1943
- let chatId = chatIdMaybe
1944
- if (event.kind === 'enqueue') {
1945
- chatId = event.chatId
1946
- threadId = event.threadId ?? undefined
1947
-
1948
- // Skip enqueue events with no chatId. These come from non-channel
1949
- // turns (e.g. terminal input) forwarded by the bridge's session-tail.
1950
- // Creating a card with chatId=null spams "chat null is not allowlisted"
1951
- // on every emit attempt and produces a ghost card that occupies
1952
- // currentTurnKey, potentially interfering with real card routing.
1953
- if (chatId == null || chatId === '') return
1954
-
1955
- // A session-tail enqueue (isSync not set) arriving while a card is
1956
- // already live for the same chat+thread is an echo of a sync
1957
- // startTurn() call — drop it. startTurn owns the turn lifecycle for
1958
- // non-steering messages; if we fell through we'd orphan the pinned
1959
- // card and spawn a second "Working…" message that takes over all
1960
- // the updates while the original stays stuck at 0ms.
1961
- if (!event.isSync) {
1962
- // Guard 0 (messageId dedup): if we've already seen an enqueue
1963
- // with this messageId for this chat+thread, drop it. Session
1964
- // restarts can produce multiple echoes of the same user message
1965
- // (each restart re-processes the queue, writing a fresh enqueue
1966
- // to a new JSONL). Guard 2 only catches the first; this guard
1967
- // catches all subsequent duplicates by messageId.
1968
- if (event.messageId != null) {
1969
- const base = baseKey(chatId, threadId ?? undefined)
1970
- const dedupKey = `${base}:${event.messageId}`
1971
- const seenAt = seenEnqueueMsgIds.get(dedupKey)
1972
- if (seenAt != null && now() - seenAt < 60_000) {
1973
- return
1974
- }
1975
- seenEnqueueMsgIds.set(dedupKey, now())
1976
- }
1977
-
1978
- // Guard 1: active card exists for this chat+thread.
1979
- // P2 of #662 / fixes #64 — except when the active card is a
1980
- // background-carry state (turn ended, fleet still has live bg
1981
- // members). The new enqueue is a real follow-up turn that must
1982
- // create a fresh PerChatState; the bg carry stays alive in
1983
- // parallel under its own turnKey.
1984
- if (currentTurnKey != null) {
1985
- const existing = chats.get(currentTurnKey)
1986
- if (
1987
- existing != null &&
1988
- existing.chatId === chatId &&
1989
- existing.threadId === threadId &&
1990
- !hasLiveBackground(existing.fleet)
1991
- ) {
1992
- return
1993
- }
1994
- }
1995
- // Guard 2: this enqueue is the session-tail echo of a sync
1996
- // startTurn() call. Drop it and consume the marker. Without
1997
- // this, fast turns that complete before the echo arrives would
1998
- // pass Guard 1 (currentTurnKey already null) and spawn an
1999
- // orphan card.
2000
- const base = baseKey(chatId, threadId ?? undefined)
2001
- const syncStart = pendingSyncEchoes.get(base)
2002
- if (syncStart != null && now() - syncStart < 30_000) {
2003
- pendingSyncEchoes.delete(base)
2004
- return
2005
- }
2006
- }
2007
-
2008
- // Allocate a new turn slot FIRST — this increments baseTurnSeqs so
2009
- // that taskNumFor() on the old card will see the correct total (N+1)
2010
- // when we render its final "done" frame below.
2011
- const slot = allocateTurnSlot(chatId, threadId)
2012
-
2013
- // If an existing card is still active for this chat, force-close it
2014
- // so it gets properly done/unpinned before the new card takes over.
2015
- // Also close ghost cards (chatId is null/empty) — these come from
2016
- // non-channel session-tail events that slipped through before the
2017
- // null guard above was added, or from a race.
2018
- //
2019
- // Route through closeZombie so any still-running sub-agents on
2020
- // the old card are explicitly marked done (abandoned) and the
2021
- // shared completion sequence fires exactly once. This is the
2022
- // correct path for "new turn replacing old" even when the old
2023
- // turn was in pendingCompletion state (background sub-agent
2024
- // hadn't reported done yet).
2025
- // P2 of #662 / fixes #64 — if the in-flight turn has live
2026
- // background fleet members, do NOT closeZombie it. Detach it
2027
- // from currentTurnKey instead so the new turn takes over the
2028
- // active slot while turn A's PerChatState stays alive in `chats`
2029
- // to receive cross-turn sub_agent_* events. Mark it with
2030
- // backgroundCarry so completion fires once the last live bg
2031
- // member reaches terminal status.
2032
- let bgCarryActive = false
2033
- if (currentTurnKey != null) {
2034
- const existing = chats.get(currentTurnKey)
2035
- if (existing != null && (existing.chatId === chatId || !existing.chatId)) {
2036
- if (hasLiveBackground(existing.fleet)) {
2037
- existing.backgroundCarry = true
2038
- bgCarryActive = true
2039
- process.stderr.write(
2040
- `telegram gateway: progress-card: bg-carry preserving turnKey=${existing.turnKey} (live background fleet members) on new enqueue\n`,
2041
- )
2042
- } else {
2043
- closeZombie(existing)
2044
- }
2045
- }
2046
- }
2047
- currentChatId = chatId
2048
- currentThreadId = threadId
2049
- currentTurnKey = slot.turnKey
2050
-
2051
- // Issue #334: seed the new turn's subAgents from any still-running
2052
- // background sub-agents dispatched in a prior turn for this chat.
2053
- const initialTurnState = reduce(initialState(), event, now())
2054
- const cBaseKey = baseKey(chatId, threadId)
2055
- // P2 of #662 — when bg carry is active, the originating PerChatState
2056
- // still owns the running sub-agents. Don't re-seed turn B with them
2057
- // (would duplicate the fleet entries and cause turn B to defer its
2058
- // own completion waiting for sub-agents that don't belong to it).
2059
- const carriedOver = bgCarryActive ? undefined : chatRunningSubagents.get(cBaseKey)
2060
- const seededState: ProgressCardState = (carriedOver != null && carriedOver.size > 0)
2061
- ? {
2062
- ...initialTurnState,
2063
- subAgents: new Map<string, SubAgentState>(
2064
- [...carriedOver.entries()].map(([id, sa]) => [id, { ...sa }]),
2065
- ),
2066
- }
2067
- : initialTurnState
2068
-
2069
- const chatState: PerChatState = {
2070
- chatId,
2071
- threadId,
2072
- turnKey: slot.turnKey,
2073
- taskIndex: slot.index,
2074
- taskTotal: slot.total,
2075
- state: seededState,
2076
- lastEmittedAt: 0,
2077
- lastEmittedHtml: null,
2078
- pendingTimer: null,
2079
- isFirstEmit: true,
2080
- deferredFirstEmitTimer: null,
2081
- effectiveInitialDelayMs: initialDelayMs,
2082
- timePromoteTimer: null,
2083
- lastEventAt: now(),
2084
- pendingCompletion: false,
2085
- completionFired: false,
2086
- cardTakenOver: false,
2087
- apiFailures: { consecutive4xx: 0, lastError: null, terminal: false },
2088
- replyToolCalled: false,
2089
- outboundDeliveredCount: 0,
2090
- wasAutonomous: false,
2091
- silentEndSuppressed: false,
2092
- silentEndPrepared: false,
2093
- parentTurnEndAt: null,
2094
- parentDoneRendered: false,
2095
- promotedSpawnIds: new Set(),
2096
- fleet: new Map<string, FleetMember>(),
2097
- backgroundParentToolUseIds: new Set<string>(),
2098
- backgroundCarry: false,
2099
- }
2100
- chats.set(slot.turnKey, chatState)
2101
- if (event.isSync) {
2102
- pendingSyncEchoes.set(baseKey(chatId, threadId), now())
2103
- }
2104
- startHeartbeatIfNeeded()
2105
- // #334 cross-turn carry-over: a fresh PerChatState seeded with
2106
- // running sub-agents from a prior turn already has visible work
2107
- // to surface. Skip suppression and emit immediately. The diff-
2108
- // based promote in the reducer block above misses this case
2109
- // because the carried-over sub-agents were copied during
2110
- // `initialState()` reduction — there is no prev→next transition
2111
- // for it to detect.
2112
- //
2113
- // Defensive: post-#401, `closeZombie` syncs the chat-scoped
2114
- // registry on every parent-replacement enqueue, so carriedOver
2115
- // is empty in the common path. Keeping the hook means future
2116
- // regressions in the sync path (or a code path that bypasses
2117
- // closeZombie) still produce a visible card instead of a
2118
- // silently-suppressed turn.
2119
- if (promoteOnSubAgent && carriedOver != null && carriedOver.size > 0) {
2120
- promoteFirstEmit(chatState, 'carried_over_subagents')
2121
- } else {
2122
- flush(chatState, /*forceDone*/ false)
2123
- }
2124
- return
2125
- } else if (chatId == null) {
2126
- // Non-enqueue event with no explicit chat: fall back to the
2127
- // most recently enqueued chat for this driver.
2128
- chatId = currentChatId
2129
- threadId = threadId ?? currentThreadId
2130
- }
2131
- if (chatId == null) return
2132
-
2133
- // P2 of #662 / fixes #64 — sub_agent_* events for an agentId whose
2134
- // fleet member lives on a non-current PerChatState (background
2135
- // carry) must route to the originating turn, not currentTurnKey.
2136
- // Without this, a background sub-agent that emits tool_use after
2137
- // its parent turn ended (and a new turn took over) would either
2138
- // be dropped or update the wrong turn's card.
2139
- let chatState: PerChatState | undefined
2140
- if (
2141
- (event.kind === 'sub_agent_tool_use' ||
2142
- event.kind === 'sub_agent_tool_result' ||
2143
- event.kind === 'sub_agent_turn_end' ||
2144
- event.kind === 'sub_agent_capped' ||
2145
- event.kind === 'sub_agent_started') &&
2146
- 'agentId' in event
2147
- ) {
2148
- const agentId = (event as { agentId: string }).agentId
2149
- for (const candidate of chats.values()) {
2150
- if (candidate.chatId !== chatId) continue
2151
- if (candidate.fleet.has(agentId)) {
2152
- chatState = candidate
2153
- break
2154
- }
2155
- }
2156
- }
2157
-
2158
- // Route to the current active turn key. Drop late events for a turn
2159
- // that already ended — without this, a stray tool_result after turn_end
2160
- // would resurrect the card. currentTurnKey is cleared on turn_end.
2161
- if (chatState == null) {
2162
- const k = currentTurnKey
2163
- if (k == null) {
2164
- if (event.kind.startsWith('sub_agent_')) {
2165
- process.stderr.write(
2166
- `telegram gateway: progress-card: late-sub-agent-event-dropped kind=${event.kind} agentId=${'agentId' in event ? (event as { agentId: string }).agentId : 'n/a'} chatId=${chatId}\n`,
2167
- )
2168
- }
2169
- return
2170
- }
2171
- chatState = chats.get(k)
2172
- if (chatState == null) return
2173
- }
2174
-
2175
- const prev = chatState.state
2176
- chatState.state = reduce(chatState.state, event, now())
2177
- chatState.lastEventAt = now()
2178
-
2179
- // P0 of #662 — shadow fleet map. Mirror sub_agent_* events into
2180
- // the parallel FleetMember map using the pure transitions from
2181
- // fleet-state.ts. Legacy state.subAgents is unchanged; P1/P2/P3
2182
- // build on `fleet` without touching the existing renderer.
2183
- updateFleetForEvent(chatState, event)
2184
- // Reconcile shadow with legacy map: any sub-agent that appears in
2185
- // state.subAgents (e.g. via parent-tool-result correlation, the
2186
- // heartbeat orphan-promotion path, or carry-over) but is missing
2187
- // from fleet gets a synthetic FleetMember so the cardinality
2188
- // invariant holds. Conversely, drop fleet entries that legacy
2189
- // dropped (these are already terminal in the watcher's view).
2190
- reconcileFleetWithSubAgents(chatState)
2191
- const stageChanged = chatState.state.stage !== prev.stage
2192
- const visibleChanged = visibleDiff(prev, chatState.state)
2193
-
2194
- // Issue #334/#399: mirror sub-agent state changes into the chat-scoped
2195
- // running-sub-agent registry so new turns can seed from it.
2196
- // We diff prev.subAgents vs chatState.state.subAgents to catch all
2197
- // mutation paths: sub_agent_started, sub_agent_turn_end, and parent
2198
- // tool_result (which can finalize a sub-agent via parentToolUseId).
2199
- // Factored into syncChatRunningSubagents (issue #399) so closeZombie
2200
- // and the heartbeat's cold-jsonl-synth path can call the same logic.
2201
- // Returns `newRunningAppeared` so the caller can promote the card
2202
- // out of initial-delay suppression on a fresh sub-agent transition.
2203
- const { newRunningAppeared: newRunningSubAgentAppeared } = syncChatRunningSubagents(
2204
- prev,
2205
- chatState.state,
2206
- baseKey(chatState.chatId, chatState.threadId),
2207
- chatRunningSubagents,
2208
- )
2209
-
2210
- // Promote the card out of initial-delay suppression as soon as a
2211
- // sub-agent transitions to running. Long-running sub-agent dispatches
2212
- // are exactly the case where the user wants to see what's happening
2213
- // — waiting the full `initialDelayMs` before showing the card means
2214
- // 30s of staring at a frozen draft bubble. Diff-based detection
2215
- // (rather than gating on a specific event kind) catches every path
2216
- // that reaches `running`: real `sub_agent_started`, heartbeat orphan
2217
- // promotion, and parent-tool-result correlation.
2218
- if (
2219
- newRunningSubAgentAppeared
2220
- && promoteOnSubAgent
2221
- && chatState.isFirstEmit
2222
- && chatState.deferredFirstEmitTimer !== DELAY_ELAPSED
2223
- && !chatState.apiFailures.terminal
2224
- ) {
2225
- promoteFirstEmit(chatState, 'sub_agent_started')
2226
- }
2227
-
2228
- // #842: explicit background dispatch bypass. When the parent calls
2229
- // `Agent({ run_in_background: true })`, swap the active delay
2230
- // budget over to `initialDelayMsBackground` instead of the longer
2231
- // `initialDelayMs`. Detection: an Agent/Task tool_use whose
2232
- // `event.input.run_in_background === true` (the same flag
2233
- // `updateFleetForEvent` uses to populate
2234
- // `cs.backgroundParentToolUseIds` for fleet membership).
2235
- //
2236
- // - `initialDelayMsBackground === 0` (default) → promote now.
2237
- // - `initialDelayMsBackground > 0` → set
2238
- // `cs.effectiveInitialDelayMs` so the next flush() schedules
2239
- // (or reschedules) the deferred timer at the lower budget.
2240
- // Never lengthens an existing budget.
2241
- if (
2242
- event.kind === 'tool_use'
2243
- && (event.toolName === 'Agent' || event.toolName === 'Task')
2244
- && event.toolUseId != null
2245
- && event.input?.run_in_background === true
2246
- && chatState.isFirstEmit
2247
- && chatState.deferredFirstEmitTimer !== DELAY_ELAPSED
2248
- && !chatState.apiFailures.terminal
2249
- ) {
2250
- if (initialDelayMsBackground <= 0) {
2251
- promoteFirstEmit(chatState, 'background_dispatch')
2252
- } else if (initialDelayMsBackground < chatState.effectiveInitialDelayMs) {
2253
- chatState.effectiveInitialDelayMs = initialDelayMsBackground
2254
- // If a longer-budget timer is already scheduled, cancel and
2255
- // reschedule against the new budget. Compute the remaining
2256
- // gap from turn start; if we're already past it, promote.
2257
- if (chatState.deferredFirstEmitTimer != null) {
2258
- const elapsed = now() - chatState.state.turnStartedAt
2259
- const remaining = initialDelayMsBackground - elapsed
2260
- clearT(chatState.deferredFirstEmitTimer)
2261
- if (remaining <= 0) {
2262
- chatState.deferredFirstEmitTimer = null
2263
- promoteFirstEmit(chatState, 'background_dispatch_elapsed')
2264
- } else {
2265
- const capturedTurnKey = chatState.turnKey
2266
- process.stderr.write(
2267
- `telegram gateway: progress-card: rescheduled initial-delay timer turnKey=${capturedTurnKey} delay=${remaining}ms reason=background_dispatch\n`,
2268
- )
2269
- chatState.deferredFirstEmitTimer = setT(() => {
2270
- if (!chats.has(capturedTurnKey)) return
2271
- chatState.deferredFirstEmitTimer = DELAY_ELAPSED
2272
- process.stderr.write(
2273
- `telegram gateway: progress-card: initial-delay timer fired turnKey=${capturedTurnKey} reason=background_dispatch\n`,
2274
- )
2275
- flush(chatState, false)
2276
- }, remaining)
2277
- }
2278
- }
2279
- }
2280
- }
2281
-
2282
- // #478 / #553 PR 4: promote the card when the agent has issued
2283
- // enough parent-side tool calls during the suppression window.
2284
- // Disabled by default in v2 (promoteOnParentToolCount=0 / Infinity)
2285
- // — under the v2 contract tools alone never trigger the card. The
2286
- // check is preserved as a config knob for callers that want the
2287
- // old behaviour, but values of 0 or non-finite (Infinity) are
2288
- // treated as "never promote on tool count".
2289
- if (
2290
- promoteOnParentToolCount > 0
2291
- && Number.isFinite(promoteOnParentToolCount)
2292
- && chatState.isFirstEmit
2293
- && chatState.deferredFirstEmitTimer !== DELAY_ELAPSED
2294
- && !chatState.apiFailures.terminal
2295
- && chatState.state.items.length >= promoteOnParentToolCount
2296
- ) {
2297
- promoteFirstEmit(chatState, `parent_tool_count_${chatState.state.items.length}`)
2298
- }
2299
-
2300
- // F3 fix (#553): schedule the time-based promotion timer on
2301
- // every ingest event (idempotent — only the first call schedules;
2302
- // subsequent calls are no-ops). Without this, a long single-tool
2303
- // turn never crossed parent_tool_count or sub_agent thresholds
2304
- // and the card stayed suppressed until initialDelayMs (30s).
2305
- ensureTimePromoteScheduled(chatState)
2306
-
2307
- // Issue #132: track whether the agent has called `reply` or
2308
- // `stream_reply` at least once this turn so the renderer can
2309
- // distinguish "Done with reply" from "Done without reply" at
2310
- // turn_end. Tool-use intent is the right granularity here — if
2311
- // the call landed but failed mid-API, the model sees the error
2312
- // in tool_result and may retry, which still flips this true.
2313
- // Only false → true; never reset mid-turn.
2314
- if (
2315
- !chatState.replyToolCalled
2316
- && event.kind === 'tool_use'
2317
- && isTelegramReplyTool(event.toolName)
2318
- ) {
2319
- chatState.replyToolCalled = true
2320
- }
2321
-
2322
- // Issue #81 diagnostic: when a 'text' event lands, did the reducer
2323
- // recognize it as a narrative step? If narratives.length didn't grow,
2324
- // the card's "human-readable preamble" path can't render and the
2325
- // tool-count fallback wins. The log lets us correlate "user typed
2326
- // status?" telemetry with the missing narrative path.
2327
- //
2328
- // Gated behind PROGRESS_CARD_DIAG=1 because this fires on every
2329
- // assistant text event — a long verbose turn could produce dozens
2330
- // of lines per minute. The render-branch and prose-recovery diags
2331
- // (~2x and ~1x per turn respectively) stay always-on. Flip the env
2332
- // var on a one-off agent restart to capture data, then turn it off.
2333
- if (event.kind === 'text' && process.env.PROGRESS_CARD_DIAG === '1') {
2334
- const before = prev.narratives.length
2335
- const after = chatState.state.narratives.length
2336
- const last = chatState.state.narratives[after - 1]
2337
- const preview = last?.text ? last.text.slice(0, 60).replace(/\n/g, ' ') : ''
2338
- const took = before === after ? 'discarded' : 'captured'
2339
- process.stderr.write(
2340
- `progress-card.diag: text-event ${took} chatId=${chatState.chatId} turnKey=${chatState.turnKey} ` +
2341
- `narratives_before=${before} narratives_after=${after} text_len=${event.text.length} preview=${JSON.stringify(preview)}\n`,
2342
- )
2343
- }
2344
-
2345
- // Cancel any pending coalesce timer — we'll either fire now or
2346
- // reschedule.
2347
- if (chatState.pendingTimer != null) {
2348
- clearT(chatState.pendingTimer)
2349
- chatState.pendingTimer = null
2350
- }
2351
-
2352
- // Fire immediately on terminal state — no coalesce delay when the
2353
- // turn finishes. The user sees the final card the instant turn_end
2354
- // lands. (Note: `enqueue` events are handled upstream by startTurn,
2355
- // not ingested here, so the prior `event.kind === 'enqueue'` check
2356
- // was dead code per the SessionEvent union.)
2357
- if (event.kind === 'turn_end' || stageChanged) {
2358
- if (event.kind === 'turn_end') {
2359
- process.stderr.write(`telegram gateway: progress-card: turn_end flush chatId=${chatState.chatId} threadId=${chatState.threadId ?? '-'} turnKey=${chatState.turnKey}\n`)
2360
- // Only fire silent-end prep when we're actually about to complete —
2361
- // i.e. no sub-agents still running. The sub-agent defer path
2362
- // returns below and prep will run later via maybeCompleteDeferredTurn.
2363
- if (!hasAnyRunningSubAgent(chatState.state)) {
2364
- prepareSilentEndSuppression(chatState)
2365
- }
2366
- }
2367
- if (event.kind === 'turn_end' && hasAnyRunningSubAgent(chatState.state)) {
2368
- // Gap 8: parent turn_end with sub-agents still running — render
2369
- // done=true immediately (card shows ✅ Done) then defer unpin.
2370
- // Set parentTurnEndAt BEFORE flush so flush()'s parentDone
2371
- // computation picks it up on this very call.
2372
- chatState.parentTurnEndAt = now()
2373
- }
2374
- flush(chatState, /*forceDone*/ event.kind === 'turn_end')
2375
- if (event.kind === 'turn_end') {
2376
- // Gate on BOTH the legacy subAgents map AND the fleet's background
2377
- // members. Background sub-agents (dispatched with run_in_background:true)
2378
- // are tagged in cs.fleet with status:'background' by updateFleetForEvent
2379
- // at sub_agent_started time. If the parent turn_end fires before the
2380
- // background sub-agent has produced any events, state.subAgents may
2381
- // still be empty for that agent — hasAnyRunningSubAgent alone would
2382
- // miss it and close the card prematurely. Fixes #713 and #709.
2383
- if (hasAnyRunningSubAgent(chatState.state) || hasLiveBackground(chatState.fleet)) {
2384
- // Parent turn ended but at least one sub-agent is still running.
2385
- // Keep the card alive so the sub-agent work stays visible; defer
2386
- // completion until the last running sub-agent reports done via
2387
- // its own sub_agent_turn_end (or the parent Agent tool_result).
2388
- // Closes #87: orphans from `Agent({run_in_background:true})` now
2389
- // gate the defer too, so background dispatches stay visible past
2390
- // parent turn-end. Safety nets: `closeZombie` on new enqueue +
2391
- // the `maxIdleMs` heartbeat ceiling bound the bad case (orphan
2392
- // never reports done).
2393
- chatState.pendingCompletion = true
2394
- const correlated: string[] = []
2395
- const orphans: string[] = []
2396
- const background: string[] = []
2397
- for (const [k, sa] of chatState.state.subAgents) {
2398
- if (sa.state === 'running') {
2399
- if (sa.parentToolUseId != null) correlated.push(k)
2400
- else orphans.push(k)
2401
- }
2402
- }
2403
- for (const [k, m] of chatState.fleet) {
2404
- if (m.status === 'background' && m.terminalAt == null) background.push(k)
2405
- }
2406
- process.stderr.write(`telegram gateway: progress-card: turn_end deferred turnKey=${chatState.turnKey} reason=in-flight-sub-agents correlated=${correlated.length} orphans=${orphans.length} background=${background.length} correlatedAgentIds=[${correlated.join(',')}] orphanAgentIds=[${orphans.join(',')}] backgroundAgentIds=[${background.join(',')}]\n`)
2407
- emitCardEvent({
2408
- agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
2409
- chatId: chatState.chatId ?? '',
2410
- turnKey: chatState.turnKey,
2411
- event: 'deferred',
2412
- reason: `turn_end: in-flight-sub-agents correlated=${correlated.length} orphans=${orphans.length} background=${background.length}`,
2413
- subagents: [...correlated, ...orphans, ...background],
2414
- })
2415
- return
2416
- }
2417
- closePerChat(chatState, 'turn-end')
2418
- }
2419
- return
2420
- }
2421
-
2422
- // Post-reduce deferred-completion check: if this event transitioned
2423
- // the last in-flight sub-agent to done (sub_agent_turn_end, parent
2424
- // Agent tool_result), fire completion now.
2425
- maybeCompleteDeferredTurn(chatState)
2426
-
2427
- // If this event didn't change anything user-visible (e.g. a
2428
- // `thinking` flag toggle that isn't rendered), don't schedule a
2429
- // flush. Prevents emit noise from events that only mutate internal
2430
- // state, and avoids spurious edits driven by ticking elapsed time
2431
- // in the header.
2432
- if (!visibleChanged) return
2433
-
2434
- // Otherwise: respect the min-interval floor. If we just emitted,
2435
- // defer to at least minIntervalMs after the last emit. Also always
2436
- // coalesce bursts — even a burst that runs past minIntervalMs gets
2437
- // at most one flush per coalesce window.
2438
- //
2439
- // Multi-agent rate-limit: if the chat has emitted >threshold edits
2440
- // in the last 60s, expand the coalesce window to
2441
- // editBudgetCoalesceMs (default 3s) so the Telegram 20/min cap is
2442
- // never exceeded by sub-agent bursts.
2443
- const sinceLast = now() - chatState.lastEmittedAt
2444
- const effectiveCoalesce = isBudgetHot(chatState.turnKey) ? editBudgetCoalesceMs : coalesceMs
2445
- const delay = Math.max(effectiveCoalesce, minIntervalMs - sinceLast, 0)
2446
- const capturedTurnKey = chatState.turnKey
2447
- chatState.pendingTimer = setT(() => {
2448
- // Defensive: if the chat was deleted between schedule and fire
2449
- // (e.g. a turn_end racing with an async boundary added later),
2450
- // don't resurrect it with a stale flush.
2451
- if (!chats.has(capturedTurnKey)) return
2452
- chatState!.pendingTimer = null
2453
- flush(chatState!, /*forceDone*/ false)
2454
- }, delay)
2455
- },
2456
-
2457
/**
 * Begin a turn for `chatId`/`threadId` by synthesizing an `enqueue`
 * event and feeding it through `ingest()`, so the turn follows the
 * same path as an enqueue observed by the session tail.
 *
 * After ingest, the source message id and the autonomous-wakeup flag
 * are stashed on the state that `currentTurnKey` points at, provided
 * that state belongs to this chat.
 */
startTurn({ chatId, threadId, userText, replyToMessageId }) {
  // Build the <channel> envelope; the thread attribute is only present
  // when a thread id was supplied.
  const threadAttr = threadId != null ? ` message_thread_id="${threadId}"` : ''
  const rawContent = `<channel source="switchroom-telegram" chat_id="${chatId}"${threadAttr}>${userText}</channel>`

  this.ingest(
    {
      kind: 'enqueue',
      chatId,
      messageId: null,
      threadId: threadId ?? null,
      rawContent,
      isSync: true,
    },
    chatId,
    threadId,
  )

  // Everything below needs the state that ingest() registered for the
  // current turn; bail out if there is none or it belongs to another chat.
  if (currentTurnKey == null) return
  const cs = chats.get(currentTurnKey)
  if (cs == null || cs.chatId !== chatId) return

  if (replyToMessageId != null) {
    cs.replyToMessageId = replyToMessageId
  }
  // Issue #259: turns whose text starts with the autonomous-loop
  // sentinel are flagged so the "ended without reply" warning can be
  // suppressed for them.
  if (userText.startsWith('<<autonomous-loop')) {
    cs.wasAutonomous = true
  }
},
2495
-
2496
/**
 * Complete the active turn for `chatId`/`threadId` in response to an
 * external completion signal (e.g. `stream_reply` with done=true).
 *
 * Mirrors the `turn_end` handling in `ingest()`: the card is flushed
 * with forceDone=true, and then either
 *  - completion is deferred (`pendingCompletion = true`) while any
 *    sub-agent is still running or any background fleet member is
 *    still live, or
 *  - the per-chat state is closed immediately.
 *
 * No-op when no state matches the chat/thread or when completion has
 * already fired for the matched state.
 */
forceCompleteTurn({ chatId, threadId }) {
  // Prefer the state pinned by currentTurnKey; otherwise fall back to
  // the first state in `chats` that matches chat + thread.
  let target: PerChatState | undefined
  if (currentTurnKey != null) {
    const cs = chats.get(currentTurnKey)
    if (cs != null && cs.chatId === chatId && cs.threadId === threadId) {
      target = cs
    }
  }
  if (target == null) {
    for (const cs of chats.values()) {
      if (cs.chatId === chatId && cs.threadId === threadId) {
        target = cs
        break
      }
    }
  }
  // Nothing in flight for this chat+thread: idempotent no-op.
  if (target == null) return
  if (target.completionFired) return

  process.stderr.write(`telegram gateway: progress-card: forceCompleteTurn turnKey=${target.turnKey} (external completion signal, e.g. stream_reply done=true)\n`)
  emitCardEvent({
    agent: process.env.SWITCHROOM_AGENT_NAME ?? '',
    chatId: target.chatId ?? '',
    turnKey: target.turnKey,
    event: 'force-completed',
    reason: 'external completion signal (stream_reply done=true)',
  })

  const durationMs = Math.max(0, now() - target.state.turnStartedAt)
  beginTurnEnd(target, durationMs)
  target.lastEventAt = now()
  flush(target, /*forceDone*/ true)

  // Gate on BOTH the legacy subAgents map and the fleet's live
  // background members, exactly like the turn_end path in ingest().
  // Checking subAgents alone misses a background dispatch that has not
  // produced events yet and would close the card prematurely.
  if (hasAnyRunningSubAgent(target.state) || hasLiveBackground(target.fleet)) {
    target.pendingCompletion = true
    const correlated: string[] = []
    const orphans: string[] = []
    const background: string[] = []
    for (const [k, sa] of target.state.subAgents) {
      if (sa.state !== 'running') continue
      if (sa.parentToolUseId != null) correlated.push(k)
      else orphans.push(k)
    }
    for (const [k, m] of target.fleet) {
      if (m.status === 'background' && m.terminalAt == null) background.push(k)
    }
    process.stderr.write(`telegram gateway: progress-card: forceCompleteTurn deferred turnKey=${target.turnKey} reason=in-flight-sub-agents correlated=${correlated.length} orphans=${orphans.length} background=${background.length} correlatedAgentIds=[${correlated.join(',')}] orphanAgentIds=[${orphans.join(',')}] backgroundAgentIds=[${background.join(',')}]\n`)
    return
  }
  closePerChat(target, 'turn-end')
},
2554
-
2555
/**
 * Hand ownership of the card for `chatId`/`threadId` to an external
 * owner. Cancels a still-pending deferred-first-emit timer, marks the
 * state as taken over and completed, and reports whether a card had
 * already been emitted.
 *
 * Returns `{ wasEmitted: false, turnKey: null }` when no state matches.
 */
takeOverCard({ chatId, threadId }) {
  const isMatch = (cs: PerChatState): boolean =>
    cs.chatId === chatId && cs.threadId === threadId

  // Same lookup order as forceCompleteTurn: the currentTurnKey-pinned
  // state wins; otherwise take the first matching state in `chats`.
  let target: PerChatState | undefined
  const pinned = currentTurnKey != null ? chats.get(currentTurnKey) : undefined
  if (pinned != null && isMatch(pinned)) {
    target = pinned
  } else {
    for (const cs of chats.values()) {
      if (isMatch(cs)) {
        target = cs
        break
      }
    }
  }
  if (target == null) return { wasEmitted: false, turnKey: null }

  // DELAY_ELAPSED is the "timer already fired" sentinel. Only a real,
  // still-pending timer is cancelled, so the sentinel survives and can
  // be read below.
  const delayElapsed = target.deferredFirstEmitTimer === DELAY_ELAPSED
  if (target.deferredFirstEmitTimer != null && !delayElapsed) {
    clearT(target.deferredFirstEmitTimer)
    target.deferredFirstEmitTimer = null
  }

  // Emitted iff the deferred timer fired or some other emit path
  // already cleared isFirstEmit.
  const wasEmitted = delayElapsed || !target.isFirstEmit

  target.cardTakenOver = true
  target.completionFired = true

  process.stderr.write(
    `telegram gateway: progress-card: takeOverCard turnKey=${target.turnKey} wasEmitted=${wasEmitted}\n`,
  )
  return { wasEmitted, turnKey: target.turnKey }
},
2597
-
2598
/**
 * P2 of #662 — debug/test hook. Produces one entry per live
 * PerChatState, exposing its turnKey, chatId and fleet map, so tests
 * can check which turn a background sub-agent was routed to. Not part
 * of the production driver contract.
 */
peekAllFleets() {
  return Array.from(
    chats.values(),
    (cs): { turnKey: string; chatId: string | null; fleet: Map<string, FleetMember> } => ({
      turnKey: cs.turnKey,
      chatId: cs.chatId,
      fleet: cs.fleet,
    }),
  )
},
2611
-
2612
/**
 * Return the fleet map of the state tracking `chatId`/`threadId`, or
 * `undefined` when no state matches. The state pinned by
 * `currentTurnKey` is preferred over other matches.
 */
peekFleet(chatId, threadId) {
  const isMatch = (cs: PerChatState): boolean =>
    cs.chatId === chatId && cs.threadId === threadId

  const pinned = currentTurnKey != null ? chats.get(currentTurnKey) : undefined
  if (pinned != null && isMatch(pinned)) return pinned.fleet

  for (const cs of chats.values()) {
    if (isMatch(cs)) return cs.fleet
  }
  return undefined
},
2624
-
2625
/**
 * Return the reducer state of the turn tracking `chatId`/`threadId`,
 * or `undefined` when no state matches. The state pinned by
 * `currentTurnKey` is preferred over other matches.
 */
peek(chatId, threadId) {
  const isMatch = (cs: PerChatState): boolean =>
    cs.chatId === chatId && cs.threadId === threadId

  const pinned = currentTurnKey != null ? chats.get(currentTurnKey) : undefined
  if (pinned != null && isMatch(pinned)) return pinned.state

  // Fallback: first state in `chats` whose chatId AND threadId both match.
  for (const cs of chats.values()) {
    if (isMatch(cs)) return cs.state
  }
  return undefined
},
2639
-
2640
/**
 * True when at least one state for `chatId`/`threadId` has not yet
 * fired completion.
 */
hasActiveCard(chatId, threadId) {
  for (const cs of chats.values()) {
    if (cs.completionFired) continue
    if (cs.chatId === chatId && cs.threadId === threadId) return true
  }
  return false
},
2652
-
2653
/**
 * Attach a narrative line to a sub-agent on the active card for
 * `chatId`/`threadId` and re-render.
 *
 * Returns `{ ok: false, reason: 'no_active_card' }` when no
 * not-yet-completed state matches, `{ ok: false, reason: 'unknown_agent' }`
 * when `agentId` is not a key of that state's `subAgents` map, and
 * `{ ok: true }` otherwise.
 */
recordSubAgentNarrative({ chatId, threadId, agentId, text }) {
  // `chats` is keyed by turnKey, so the card has to be found by scanning.
  let active: PerChatState | null = null
  for (const cs of chats.values()) {
    if (cs.completionFired) continue
    if (cs.chatId !== chatId || cs.threadId !== threadId) continue
    active = cs
    break
  }
  if (active == null) return { ok: false, reason: 'no_active_card' }

  if (!active.state.subAgents.has(agentId)) {
    return { ok: false, reason: 'unknown_agent' }
  }

  // Run the narrative through the reducer, then flush unconditionally.
  const event = { kind: 'sub_agent_narrative', agentId, text }
  active.state = reduce(active.state, event, now())
  flush(active, false)
  return { ok: true }
},
2684
-
2685
/**
 * Record the outcome of a failed card API call for `turnKey`.
 *
 * - Unknown turn or already-terminal state: ignored.
 * - `benign` failures: ignored, counter untouched.
 * - `transient` failures: logged only.
 * - Anything else is counted as a permanent 4xx; once the consecutive
 *   count reaches `maxConsecutive4xx` (when that limit is > 0) the
 *   state is marked terminal.
 */
reportApiFailure(turnKey, failure) {
  const cs = chats.get(turnKey)
  if (cs == null || cs.apiFailures.terminal) return

  switch (failure.kind) {
    case 'benign':
      // e.g. "message is not modified" — not a real failure.
      return
    case 'transient':
      process.stderr.write(
        `telegram gateway: progress-card: transient API error turnKey=${turnKey} code=${failure.code} (${failure.description}) — will retry\n`,
      )
      return
    default:
      break
  }

  // permanent_4xx: bump the streak and remember the latest error.
  const failures = cs.apiFailures
  failures.consecutive4xx += 1
  failures.lastError = {
    code: failure.code,
    description: failure.description,
    timestamp: now(),
  }

  const limitReached = maxConsecutive4xx > 0 && failures.consecutive4xx >= maxConsecutive4xx
  if (!limitReached) {
    process.stderr.write(
      `telegram gateway: progress-card: card edit 4xx (${failures.consecutive4xx}/${maxConsecutive4xx})` +
        ` turnKey=${turnKey} code=${failure.code} (${failure.description})\n`,
    )
    return
  }

  failures.terminal = true
  process.stderr.write(
    `telegram gateway: progress-card: card edit giving 4xx, abandoning locally` +
      ` (chat=${cs.chatId}, turnKey=${turnKey}, code=${failure.code}, desc="${failure.description}")\n`,
  )
},
2723
-
2724
/**
 * Record a successful card API call for `turnKey`: a positive
 * consecutive-4xx streak is reset to zero. Unknown turns are ignored.
 */
reportApiSuccess(turnKey) {
  const failures = chats.get(turnKey)?.apiFailures
  if (failures == null) return
  if (!(failures.consecutive4xx > 0)) return
  failures.consecutive4xx = 0
},
2731
-
2732
/**
 * Issue #137: count one delivered outbound message against the first
 * state in `chats` that matches `chatId`/`threadId`. `chats` is keyed
 * by turnKey, hence the linear scan. When nothing matches the call is
 * a silent no-op.
 */
recordOutboundDelivered(chatId, threadId) {
  for (const cs of chats.values()) {
    if (cs.chatId !== chatId || cs.threadId !== threadId) continue
    cs.outboundDeliveredCount += 1
    return
  }
},
2747
-
2748
/**
 * Tear down driver state.
 *
 * - Default: stop the heartbeat, cancel every per-chat timer, drop all
 *   chats, and reset the current-turn pointers.
 * - `preservePending: true`: cancel timers on every chat but keep the
 *   chats whose `pendingCompletion` is set; the heartbeat is stopped
 *   only when no chat survives, and the current-turn pointers are
 *   reset only when they no longer refer to a surviving chat.
 *
 * Both modes clear `pendingSyncEchoes` and `seenEnqueueMsgIds`.
 */
dispose(opts?: { preservePending?: boolean }) {
  // Cancel the coalesce and deferred-first-emit timers of one chat.
  // NOTE(review): the deferred timer slot may hold the DELAY_ELAPSED
  // sentinel; it is passed to clearT and nulled like a real handle.
  const cancelTimers = (cs: PerChatState): void => {
    if (cs.pendingTimer != null) {
      clearT(cs.pendingTimer)
      cs.pendingTimer = null
    }
    if (cs.deferredFirstEmitTimer != null) {
      clearT(cs.deferredFirstEmitTimer)
      cs.deferredFirstEmitTimer = null
    }
  }
  const resetCurrentTurn = (): void => {
    currentChatId = null
    currentThreadId = undefined
    currentTurnKey = null
  }

  if (opts?.preservePending !== true) {
    // Full wipe.
    stopHeartbeat()
    for (const cs of chats.values()) cancelTimers(cs)
    chats.clear()
    resetCurrentTurn()
    pendingSyncEchoes.clear()
    seenEnqueueMsgIds.clear()
    return
  }

  // Selective wipe: chats awaiting deferred completion stay registered.
  let survivors = 0
  for (const [turnKey, cs] of chats) {
    cancelTimers(cs)
    if (cs.pendingCompletion) {
      survivors += 1
    } else {
      chats.delete(turnKey)
    }
  }
  // Keep the heartbeat running while any chat survived.
  if (survivors === 0) {
    stopHeartbeat()
  }
  if (currentTurnKey != null && !chats.has(currentTurnKey)) {
    resetCurrentTurn()
  }
  pendingSyncEchoes.clear()
  seenEnqueueMsgIds.clear()
},
2811
-
2812
/**
 * Watcher callback: sub-agent `agentId` has stalled. Finds the first
 * chat state where that sub-agent is in the `running` state and forces
 * the next heartbeat tick to re-render it. `sa.lastEventAt` and
 * `cs.lastEventAt` are deliberately not modified here.
 */
onSubAgentStall(agentId: string, _idleMs: number, _description: string) {
  for (const cs of chats.values()) {
    const sa = cs.state.subAgents.get(agentId)
    if (sa == null || sa.state !== 'running') continue

    // A card still inside the initial-delay window has nothing on
    // screen yet, so promote it first to make the stall visible.
    const stillSuppressed =
      cs.isFirstEmit
      && cs.deferredFirstEmitTimer !== DELAY_ELAPSED
      && !cs.apiFailures.terminal
    if (promoteOnSubAgent && stillSuppressed) {
      promoteFirstEmit(cs, 'sub_agent_stall')
    }

    // Drop the diff-guard buckets for this turn so the next heartbeat
    // tick is not suppressed as "unchanged".
    lastHeartbeatBucket.delete(cs.turnKey)
    lastSubAgentTickBucket.delete(cs.turnKey)

    // Defensive: make sure a heartbeat is actually running.
    if (chats.size > 0) startHeartbeatIfNeeded()
    break
  }
},
2849
-
2850
/**
 * Watcher callback, symmetric to onSubAgentStall: activity resumed for
 * sub-agent `agentId`. Finds the first chat state where that sub-agent
 * is `running` and clears the diff-guard buckets for its turn so the
 * next heartbeat tick re-renders instead of being suppressed.
 */
onSubAgentUnstall(agentId: string, _description: string) {
  for (const cs of chats.values()) {
    const sa = cs.state.subAgents.get(agentId)
    if (sa == null || sa.state !== 'running') continue

    lastHeartbeatBucket.delete(cs.turnKey)
    lastSubAgentTickBucket.delete(cs.turnKey)
    if (chats.size > 0) startHeartbeatIfNeeded()
    break
  }
},
2869
-
2870
/**
 * Test-only accessor. Hands back the live internal collections (not
 * copies) so tests can assert that eviction and cleanup really drop
 * entries. Not part of the supported API, as the underscore signals.
 */
_debugGetMaps() {
  const internals = {
    chats,
    seenEnqueueMsgIds,
    pendingSyncEchoes,
    chatRunningSubagents,
    baseTurnSeqs,
    editTimestamps,
  }
  return internals
},
2885
- }
2886
- }