switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/dist/agent-scheduler/index.js +122 -88
  2. package/dist/auth-broker/index.js +463 -177
  3. package/dist/cli/autoaccept-poll.js +4842 -35
  4. package/dist/cli/drive-write-pretool.mjs +17 -14
  5. package/dist/cli/notion-write-pretool.mjs +117 -86
  6. package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
  7. package/dist/cli/self-improve-stop.mjs +428 -0
  8. package/dist/cli/skill-validate-pretool.mjs +72 -72
  9. package/dist/cli/switchroom.js +3249 -1241
  10. package/dist/cli/ui/index.html +1 -1
  11. package/dist/host-control/main.js +2833 -355
  12. package/dist/vault/approvals/kernel-server.js +7482 -7439
  13. package/dist/vault/broker/server.js +11315 -11272
  14. package/examples/minimal.yaml +1 -0
  15. package/examples/switchroom.yaml +1 -0
  16. package/package.json +3 -3
  17. package/profiles/_base/start.sh.hbs +88 -1
  18. package/profiles/_shared/execution-discipline.md.hbs +18 -0
  19. package/profiles/default/CLAUDE.md.hbs +3 -22
  20. package/telegram-plugin/.claude-plugin/plugin.json +2 -2
  21. package/telegram-plugin/answer-stream-flag.ts +12 -49
  22. package/telegram-plugin/answer-stream.ts +5 -150
  23. package/telegram-plugin/auth-snapshot-format.ts +280 -48
  24. package/telegram-plugin/auto-fallback-fleet.ts +44 -1
  25. package/telegram-plugin/context-exhaustion.ts +12 -0
  26. package/telegram-plugin/demo-mask.ts +154 -0
  27. package/telegram-plugin/dist/bridge/bridge.js +167 -124
  28. package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
  29. package/telegram-plugin/dist/server.js +215 -172
  30. package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
  31. package/telegram-plugin/draft-stream.ts +47 -410
  32. package/telegram-plugin/final-answer-detect.ts +17 -12
  33. package/telegram-plugin/fleet-fallback-resume.ts +131 -0
  34. package/telegram-plugin/format.ts +56 -19
  35. package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
  36. package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
  37. package/telegram-plugin/gateway/auth-command.ts +70 -14
  38. package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
  39. package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
  40. package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
  41. package/telegram-plugin/gateway/current-turn-map.ts +188 -0
  42. package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
  43. package/telegram-plugin/gateway/effort-command.ts +8 -3
  44. package/telegram-plugin/gateway/emission-authority.ts +369 -0
  45. package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
  46. package/telegram-plugin/gateway/gateway.ts +1837 -291
  47. package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
  48. package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
  49. package/telegram-plugin/gateway/represent-guard.ts +72 -0
  50. package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
  51. package/telegram-plugin/gateway/status-surface-log.ts +14 -3
  52. package/telegram-plugin/history.ts +33 -11
  53. package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
  54. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
  55. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
  56. package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
  57. package/telegram-plugin/issues-card.ts +4 -0
  58. package/telegram-plugin/model-unavailable.ts +124 -0
  59. package/telegram-plugin/narrative-dedup.ts +69 -0
  60. package/telegram-plugin/over-ping-safety-net.ts +70 -4
  61. package/telegram-plugin/package.json +3 -3
  62. package/telegram-plugin/pending-work-progress.ts +12 -0
  63. package/telegram-plugin/permission-rule.ts +32 -5
  64. package/telegram-plugin/permission-title.ts +152 -9
  65. package/telegram-plugin/quota-check.ts +13 -0
  66. package/telegram-plugin/quota-watch.ts +135 -7
  67. package/telegram-plugin/registry/turns-schema.test.ts +24 -0
  68. package/telegram-plugin/registry/turns-schema.ts +9 -0
  69. package/telegram-plugin/runtime-metrics.ts +13 -0
  70. package/telegram-plugin/session-tail.ts +96 -11
  71. package/telegram-plugin/silence-poke.ts +170 -24
  72. package/telegram-plugin/slot-banner-driver.ts +3 -0
  73. package/telegram-plugin/status-no-truncate.ts +44 -0
  74. package/telegram-plugin/status-reactions.ts +20 -3
  75. package/telegram-plugin/stream-controller.ts +4 -23
  76. package/telegram-plugin/stream-reply-handler.ts +6 -24
  77. package/telegram-plugin/streaming-metrics.ts +91 -0
  78. package/telegram-plugin/subagent-watcher.ts +212 -66
  79. package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
  80. package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
  81. package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
  82. package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
  83. package/telegram-plugin/tests/answer-stream.test.ts +2 -411
  84. package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
  85. package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
  86. package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
  87. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
  88. package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
  89. package/telegram-plugin/tests/demo-mask.test.ts +127 -0
  90. package/telegram-plugin/tests/draft-stream.test.ts +0 -827
  91. package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
  92. package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
  93. package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
  94. package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
  95. package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
  96. package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
  97. package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
  98. package/telegram-plugin/tests/feed-survival.test.ts +526 -0
  99. package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
  100. package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
  101. package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
  102. package/telegram-plugin/tests/history.test.ts +60 -0
  103. package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
  104. package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
  105. package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
  106. package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
  107. package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
  108. package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
  109. package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
  110. package/telegram-plugin/tests/permission-rule.test.ts +17 -0
  111. package/telegram-plugin/tests/permission-title.test.ts +206 -17
  112. package/telegram-plugin/tests/quota-watch.test.ts +252 -9
  113. package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
  114. package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
  115. package/telegram-plugin/tests/represent-guard.test.ts +162 -0
  116. package/telegram-plugin/tests/session-tail.test.ts +147 -3
  117. package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
  118. package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
  119. package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
  120. package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
  121. package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
  122. package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
  123. package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
  124. package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
  125. package/telegram-plugin/tests/telegram-format.test.ts +101 -6
  126. package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
  127. package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
  128. package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
  129. package/telegram-plugin/tests/tool-labels.test.ts +67 -0
  130. package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
  131. package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
  132. package/telegram-plugin/tests/welcome-text.test.ts +32 -3
  133. package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
  134. package/telegram-plugin/tool-activity-summary.ts +375 -58
  135. package/telegram-plugin/turn-liveness-floor.ts +240 -0
  136. package/telegram-plugin/uat/assertions.ts +115 -0
  137. package/telegram-plugin/uat/driver.ts +68 -0
  138. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
  139. package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
  140. package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
  141. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
  142. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
  143. package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
  144. package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
  145. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
  146. package/telegram-plugin/welcome-text.ts +13 -1
  147. package/telegram-plugin/worker-activity-feed.ts +157 -82
  148. package/telegram-plugin/draft-transport.ts +0 -122
  149. package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
  150. package/telegram-plugin/tests/draft-transport.test.ts +0 -211
@@ -0,0 +1,396 @@
1
+ /**
2
+ * Foreground activity-feed visibility across the silence-fallback threshold.
3
+ *
4
+ * ## Cause class
5
+ *
6
+ * A foreground turn that does REAL sequential work (several tool calls, each
7
+ * followed by a model-thinking gap) can trip the silence-fallback timer
8
+ * (`SILENCE_FALLBACK_MS`, default 300 000 ms) even while it is visibly
9
+ * progressing. The silence clock is reset by:
10
+ *
11
+ * - A fresh `reply` or `stream_reply` first-emit (any real user-visible send).
12
+ * - `SILENCE_LIVENESS_PRODUCTION` ON (the default): a new tool-activity label
13
+ * appearing on the feed, or an answer-stream draft update.
14
+ *
15
+ * Crucially, every render resets the clock to ZERO — so the failure is NOT
16
+ * cumulative across many short gaps. The feed only darkens on a SINGLE
17
+ * continuous no-render window longer than the threshold. Heartbeat edits keep
18
+ * the message visually advancing but do NOT count as liveness, so they do not
19
+ * reset the clock; only a real tool-label render or an answer-stream draft does.
20
+ *
21
+ * This file is the DETERMINISTIC guard half of the pair. The fleet runs
22
+ * `SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS=1` (set in defaults.env — it is the
23
+ * fleet default, despite a stale gateway.ts comment that still says "OFF, canary
24
+ * on marko"). With the defer ON, a single long IN-FLIGHT tool does NOT trip the
25
+ * base fallback: the defer holds it back while the tool runs (up to the hard
26
+ * ceiling, default 15 min) and the feed heartbeat keeps editing the live
27
+ * message. So the CORRECT behaviour for a long in-flight turn is: the feed stays
28
+ * lit. This guard pins exactly that. A regression that breaks the defer, stops
29
+ * the heartbeat, or nulls `currentTurn` for an in-flight turn darkens the feed,
30
+ * and this test catches it.
31
+ *
32
+ * The workload is one ~33 s no-output command (`timeout 33 tail -f /dev/null`) — well under the
33
+ * default 15 min hard ceiling — so under prod config the feed must stay live
34
+ * across the shrunk 20 s base fallback. A prompt of several FAST steps would not
35
+ * exercise the silence window at all (each tool start re-renders and resets the
36
+ * clock); one long stretch is what holds the clock open.
37
+ *
38
+ * This guard does NOT reproduce #680's exact trigger — silent model thinking
39
+ * BETWEEN tools, with no tool in-flight. The defer does not cover that vector
40
+ * and it cannot be forced deterministically; it lives in the sibling best-effort
41
+ * scenario `jtbd-foreground-feed-thinkgap-dm.test.ts`. Together the pair covers
42
+ * the feed-visibility invariant deterministically (here) and the true #680
43
+ * vector best-effort (there).
44
+ *
45
+ * This scenario shrinks the base fallback to 20 s via
46
+ * `SWITCHROOM_SILENCE_FALLBACK_MS=20000` on the test-harness agent so the
47
+ * window is exercised within a test budget instead of 5 minutes.
48
+ *
49
+ * ## Required env precondition (operator must set on test-harness agent)
50
+ *
51
+ * SWITCHROOM_SILENCE_FALLBACK_MS=20000
52
+ *
53
+ * Set this under the `test-harness` agent's `env:` block in
54
+ * `~/.switchroom/switchroom.yaml`, then restart the agent
55
+ * (`switchroom agent restart test-harness --wait --force`) before running
56
+ * this scenario. The scenario detects whether the threshold is plausibly
57
+ * shrunk by reading `SWITCHROOM_UAT_SILENCE_FALLBACK_MS` from the test
58
+ * env (a parallel knob populated by the UAT `.env` file) and
59
+ * skips with a message if it is not set to ≤ 30 000.
60
+ *
61
+ * Without the skip, an unshrunk threshold would let the scenario run with
62
+ * vacuous timing assertions (the default 300 s threshold far exceeds the
63
+ * test budget) and the test exits before the scenario would have had a
64
+ * chance to catch a regression. The skip keeps the failure signal honest.
65
+ *
66
+ * ## What it asserts (the gap no existing scenario covers)
67
+ *
68
+ * 1. **Feed opened.** An activity-feed message (`→`/`✓` lines) appears in the
69
+ * DM at some point during the turn — the agent started reporting progress.
70
+ *
71
+ * 2. **Feed survived the fallback window.** After the silence-fallback interval
72
+ * has elapsed from the point the feed was first observed, the feed message
73
+ * is still present and carries at least one more edit that arrived AFTER
74
+ * the threshold mark. If `currentTurn` was nulled mid-turn, the gateway
75
+ * stops sending activity-feed edits and the message goes stale or disappears
76
+ * — this assertion catches that.
77
+ *
78
+ * 3. **Final answer arrives.** A substantive reply (≥ 150 chars) eventually
79
+ * lands, confirming the turn completed rather than being wedged.
80
+ *
81
+ * ## Failure shapes
82
+ *
83
+ * (a) Feed never opened — the activity feed did not paint at all. Either the
84
+ * agent never used tools or the very first drainActivitySummary call
85
+ * failed. Distinct from the regression; both are failures.
86
+ *
87
+ * (b) Feed went dark — the feed message was present before the fallback mark
88
+ * but received no fresh edit after the mark. This IS the regression this
89
+ * test exists to catch: `currentTurn` was nulled mid-turn, silencing the
90
+ * live feed while the agent was still working.
91
+ *
92
+ * (c) No final answer — the turn never produced a substantive reply. Possibly
93
+ * the fallback also dropped the answer path (compound regression), or the
94
+ * prompt was too slow for the overall test budget.
95
+ *
96
+ * ## Tolerances
97
+ *
98
+ * The feed edit observation is polled via `driver.getMessage` (the same
99
+ * technique used by `jtbd-worker-activity-feed-dm`). Because mtcute's live
100
+ * `observeMessages` may miss edits that arrive before the observer is attached,
101
+ * we cross-check by comparing the snapshot taken just before the fallback mark
102
+ * against a fresh fetch just after it. A changed body confirms a live edit
103
+ * occurred across the threshold; an UNCHANGED body with no subsequent new edit
104
+ * in the live stream is the regression signal.
105
+ *
106
+ * The prompt is engineered so the model makes ONE long in-flight tool call
107
+ * (`timeout 33 tail -f /dev/null`, ~33 s) that renders nothing while it runs,
108
+ * giving a single no-render stretch that straddles the shrunk 20 s fallback.
109
+ */
110
+
111
+ import { describe, expect, it } from "vitest";
112
+ import { spinUp } from "../harness.js";
113
+ import { isActivityFeedMessage } from "../assertions.js";
114
+ import type { ObservedMessage } from "../driver.js";
115
+
116
+ /**
117
+ * The shrunk fallback threshold the operator must set on the test-harness
118
+ * agent. The test reads a parallel UAT-env knob so we can detect whether
119
+ * the precondition is satisfied without reaching into the agent's process env.
120
+ *
121
+ * Set `SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000` in the repo-root `.env`
122
+ * alongside `SWITCHROOM_SILENCE_FALLBACK_MS=20000` in the agent's env block.
123
+ */
124
+ const PRECONDITION_FALLBACK_MS = Number.parseInt(
125
+ process.env.SWITCHROOM_UAT_SILENCE_FALLBACK_MS ?? "",
126
+ 10,
127
+ );
128
+
129
+ /**
130
+ * How long to wait after the feed first appears (when the "before-mark"
131
+ * body is captured) before fetching the "after-mark" snapshot. The fallback
132
+ * timer must have clearly elapsed by then. Chosen to be
133
+ * safely above the shrunk 20 s fallback while staying comfortably within
134
+ * the test budget.
135
+ */
136
+ const FALLBACK_WINDOW_MS = 25_000;
137
+
138
+ /**
139
+ * How long to poll after the fallback window for a fresh feed edit. Short
140
+ * enough not to waste budget but long enough for one more drainActivitySummary
141
+ * cycle to land (the feed heartbeat fires roughly every 5–8 s).
142
+ */
143
+ const POST_MARK_EDIT_WAIT_MS = 15_000;
144
+
145
+ /**
146
+ * A substantive answer is at least this many characters. Avoids latching
147
+ * onto a brief "on it" ack or a stub.
148
+ */
149
+ const MIN_ANSWER_CHARS = 150;
150
+
151
+ /**
152
+ * Overall test budget. Includes:
153
+ * - spinUp settle: ~8 s
154
+ * - turn onset (first tool + first feed paint): ~20 s
155
+ * - FALLBACK_WINDOW_MS: 25 s
156
+ * - POST_MARK_EDIT_WAIT_MS: 15 s
157
+ * - final-answer wait: ~30 s
158
+ * - headroom: ~20 s
159
+ */
160
+ const OVERALL_BUDGET_MS = 150_000;
161
+
162
+ /**
163
+ * Workload prompt: one ~33 s no-output command, `timeout 33 tail -f /dev/null`.
164
+ * NOTE: standalone `sleep` is blocked by the Claude Code harness ("foreground
165
+ * sleep is blocked"), so this is the hook-safe equivalent of a long in-flight
166
+ * no-op. The tool's label renders once at its start (resetting the clock to
167
+ * zero), then nothing renders for ~33 s while the tool is in-flight. Under prod
168
+ * config (defer ON) the base fallback is HELD during that in-flight stretch, so
169
+ * the feed must stay live and the heartbeat keeps editing it. That is the
170
+ * invariant this guard asserts; a regression that lets `currentTurn` get nulled
171
+ * mid-stretch breaks it.
172
+ *
173
+ * The prompt explicitly asks the model to give the Bash call a short
174
+ * DESCRIPTION. That matters: an empty-label tool is dropped from the activity
175
+ * feed (it never opens), which would fail the test on "feed never appeared"
176
+ * (shape a) for the wrong reason. A labelled tool opens the feed reliably.
177
+ *
178
+ * Why not several fast steps: each fast tool start emits a fresh label that
179
+ * resets the clock, so the silence window never opens at all and the test would
180
+ * pass vacuously (a false green). One long stretch is what holds it open.
181
+ *
182
+ * We do NOT use run_in_background — this must be a FOREGROUND turn so
183
+ * currentTurn stays in place and the silence clock applies to it directly.
184
+ */
185
+ const SEQUENTIAL_WORK_PROMPT =
186
+ "Use the Bash tool to run EXACTLY this command — and give the tool call a " +
187
+ 'short description such as "long-running wait" so it is clearly labelled: ' +
188
+ "`timeout 33 tail -f /dev/null`. It runs for about 33 seconds and then exits " +
189
+ "on its own (a non-zero timeout exit code is expected and fine). Do not run " +
190
+ "any other tool while it is running. After it finishes, reply with a short " +
191
+ "paragraph (a few sentences) telling me it completed and that you waited " +
192
+ "about 33 seconds.";
193
+
194
+ describe("uat: foreground activity-feed visibility across silence-fallback threshold", () => {
195
+ it(
196
+ "feed remains live and receives edits after the shrunk fallback window",
197
+ async () => {
198
+ // Precondition guard: if the operator hasn't shrunk the silence
199
+ // fallback to ≤ 30 000 ms on the test-harness agent (and mirrored
200
+ // it into the UAT env), the timing assertions are vacuous. Skip
201
+ // with a clear message rather than silently producing a false green.
202
+ if (!Number.isFinite(PRECONDITION_FALLBACK_MS) || PRECONDITION_FALLBACK_MS > 30_000) {
203
+ console.warn(
204
+ "[uat/foreground-feed-visibility] SKIPPED — precondition not met.\n" +
205
+ " This scenario requires SWITCHROOM_SILENCE_FALLBACK_MS=20000 set on\n" +
206
+ " the test-harness agent AND SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000\n" +
207
+ " in the repo-root .env. Without it the silence fallback does not fire\n" +
208
+ " within the test window and the regression cannot be detected.\n" +
209
+ " See the header doc comment in this file for setup instructions.",
210
+ );
211
+ return;
212
+ }
213
+
214
+ const sc = await spinUp({ agent: "test-harness" });
215
+ try {
216
+ // Start observing BEFORE sending so no activity-feed messages are missed.
217
+ const iter = sc.driver
218
+ .observeMessages(sc.botUserId)
219
+ [Symbol.asyncIterator]();
220
+
221
+ await sc.sendDM(SEQUENTIAL_WORK_PROMPT);
222
+
223
+ console.log("[foreground-feed] prompt sent; watching for activity-feed message…");
224
+
225
+ // ── Assertion 1: feed opened ─────────────────────────────────────────
226
+ // Drain the live message stream until we see an activity-feed message
227
+ // (lines matching `→ …` or `✓ …`). Give generous budget for the agent to
228
+ // start tools and open the feed.
229
+ let feedMsg: ObservedMessage | null = null;
230
+ const feedDeadline = Date.now() + 90_000;
231
+
232
+ while (Date.now() < feedDeadline) {
233
+ const remaining = feedDeadline - Date.now();
234
+ const next = await Promise.race([
235
+ iter.next(),
236
+ new Promise<{ done: true; value: undefined }>((r) =>
237
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
238
+ ),
239
+ ]);
240
+ if (next.done || next.value == null) break;
241
+ const m = next.value as ObservedMessage;
242
+ // Skip our own echo and worker-feed messages; we want the foreground
243
+ // activity feed.
244
+ if (m.senderUserId === sc.driverUserId) continue;
245
+ if (isActivityFeedMessage(m)) {
246
+ feedMsg = m;
247
+ break;
248
+ }
249
+ }
250
+
251
+ expect(
252
+ feedMsg,
253
+ "Failure shape (a): the foreground activity-feed message never appeared. " +
254
+ "Either the agent did not use tools, the prompt was too fast for the " +
255
+ "feed to paint, or drainActivitySummary failed on every attempt this turn.",
256
+ ).not.toBeNull();
257
+
258
+ // feedMsg is confirmed non-null beyond this point.
259
+ const { messageId: feedId } = feedMsg!;
260
+ console.log(
261
+ `[foreground-feed] feed opened (id=${feedId}): ` +
262
+ JSON.stringify(feedMsg!.text.slice(0, 120)),
263
+ );
264
+
265
+ // ── Snapshot before the fallback mark ───────────────────────────────
266
+ // Record the feed body and clock, then wait FALLBACK_WINDOW_MS so the
267
+ // shrunk base fallback has definitely elapsed. During this wait the
268
+ // single in-flight stretch (`sleep 35`) emits no renders, but the defer
269
+ // (prod default ON) HOLDS the base fallback while the tool is in-flight,
270
+ // so currentTurn survives and the feed heartbeat keeps editing the
271
+ // message. We assert those heartbeat edits keep landing after the mark;
272
+ // if a regression nulls currentTurn mid-stretch, the edits stop and we
273
+ // catch it.
274
+ const beforeMarkText = feedMsg!.text;
275
+ const markAt = Date.now() + FALLBACK_WINDOW_MS;
276
+
277
+ console.log(
278
+ `[foreground-feed] waiting ${FALLBACK_WINDOW_MS}ms for fallback window to elapse…`,
279
+ );
280
+ await new Promise((r) => setTimeout(r, FALLBACK_WINDOW_MS));
281
+
282
+ // ── Assertion 2: feed survived the fallback window ───────────────────
283
+ // Fetch the feed message directly. If currentTurn was nulled mid-turn,
284
+ // the gateway either stopped editing (stale text) or the message may have
285
+ // been deleted by clearActivitySummary (null). Either condition is the
286
+ // regression.
287
+ const afterMark = await sc.driver.getMessage(sc.botUserId, feedId);
288
+
289
+ console.log(
290
+ `[foreground-feed] feed state after ${FALLBACK_WINDOW_MS}ms mark ` +
291
+ `(id=${feedId}): ` +
292
+ JSON.stringify(afterMark?.text?.slice(0, 120) ?? null),
293
+ );
294
+
295
+ expect(
296
+ afterMark,
297
+ "Failure shape (b): the activity-feed message was deleted after the " +
298
+ `silence-fallback window (${FALLBACK_WINDOW_MS}ms). This means ` +
299
+ "currentTurn was nulled mid-turn by the silence-fallback handler, " +
300
+ "which then triggered clearActivitySummary and removed the live feed " +
301
+ "message. The regression: a productive foreground turn went dark " +
302
+ "because a continuous no-render window exceeded the " +
303
+ `shrunk threshold (SWITCHROOM_SILENCE_FALLBACK_MS=${PRECONDITION_FALLBACK_MS}).`,
304
+ ).not.toBeNull();
305
+
306
+ // The body should have changed — i.e. the feed received at least one
307
+ // edit after the fallback mark — proving it is still alive. We check
308
+ // both the polled snapshot and the live-stream edits we may have
309
+ // collected during the wait.
310
+ const bodyChangedAfterMark = afterMark!.text !== beforeMarkText;
311
+
312
+ // Also drain any edits that arrived during the POST_MARK_EDIT_WAIT_MS
313
+ // window to catch the next drainActivitySummary cycle if the polled
314
+ // snapshot was taken slightly before the edit landed.
315
+ let sawFeedEditAfterMark = bodyChangedAfterMark;
316
+ const postMarkDeadline = Date.now() + POST_MARK_EDIT_WAIT_MS;
317
+
318
+ while (!sawFeedEditAfterMark && Date.now() < postMarkDeadline) {
319
+ const remaining = postMarkDeadline - Date.now();
320
+ const next = await Promise.race([
321
+ iter.next(),
322
+ new Promise<{ done: true; value: undefined }>((r) =>
323
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
324
+ ),
325
+ ]);
326
+ if (next.done || next.value == null) break;
327
+ const m = next.value as ObservedMessage;
328
+ if (m.senderUserId === sc.driverUserId) continue;
329
+ // An edit of the feed message that arrived after the mark
330
+ if (m.edited && m.messageId === feedId && m.date.getTime() >= markAt) {
331
+ sawFeedEditAfterMark = true;
332
+ console.log(
333
+ `[foreground-feed] feed edit confirmed after mark (id=${feedId}): ` +
334
+ JSON.stringify(m.text.slice(0, 120)),
335
+ );
336
+ }
337
+ }
338
+
339
+ expect(
340
+ sawFeedEditAfterMark,
341
+ "Failure shape (b): the activity-feed message still exists but received " +
342
+ `no edit after the ${FALLBACK_WINDOW_MS}ms fallback window. This is the ` +
343
+ "feed-went-dark regression: currentTurn was nulled mid-turn so no further " +
344
+ "drainActivitySummary calls fired. The feed body was frozen at " +
345
+ `${JSON.stringify(beforeMarkText.slice(0, 80))} and did not advance.`,
346
+ ).toBe(true);
347
+
348
+ // ── Assertion 3: final answer lands ─────────────────────────────────
349
+ // Collect from the live stream until a substantive bot reply arrives.
350
+ // This confirms the turn completed — the fallback did not also drop
351
+ // the answer path.
352
+ let finalAnswer: ObservedMessage | null = null;
353
+ const answerDeadline = Date.now() + 60_000;
354
+
355
+ while (Date.now() < answerDeadline) {
356
+ const remaining = answerDeadline - Date.now();
357
+ const next = await Promise.race([
358
+ iter.next(),
359
+ new Promise<{ done: true; value: undefined }>((r) =>
360
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
361
+ ),
362
+ ]);
363
+ if (next.done || next.value == null) break;
364
+ const m = next.value as ObservedMessage;
365
+ if (m.senderUserId === sc.driverUserId) continue;
366
+ if (m.edited) continue; // edits are feed updates, not the answer
367
+ if (isActivityFeedMessage(m)) continue; // skip feed-only sends
368
+ if (m.text.trim().length >= MIN_ANSWER_CHARS) {
369
+ finalAnswer = m;
370
+ break;
371
+ }
372
+ }
373
+
374
+ console.log(
375
+ `[foreground-feed] final answer (id=${finalAnswer?.messageId ?? "NONE"}): ` +
376
+ JSON.stringify(finalAnswer?.text?.slice(0, 180) ?? null),
377
+ );
378
+
379
+ expect(
380
+ finalAnswer,
381
+ "Failure shape (c): no final answer arrived after the silence-fallback " +
382
+ `window. The turn did not produce a substantive reply (≥${MIN_ANSWER_CHARS} chars). ` +
383
+ "If the feed-gone-dark assertion also failed, the fallback may have " +
384
+ "suppressed the entire turn's output. If only this assertion failed, " +
385
+ "the turn is still in flight past the test budget — increase the prompt " +
386
+ "timeout or check that the agent is not wedged.",
387
+ ).not.toBeNull();
388
+
389
+ await iter.return?.();
390
+ } finally {
391
+ await sc.tearDown();
392
+ }
393
+ },
394
+ OVERALL_BUDGET_MS,
395
+ );
396
+ });
@@ -0,0 +1,202 @@
1
+ /**
2
+ * Liveness-driven feed open — a thinking-only turn still surfaces a live feed.
3
+ *
4
+ * ## Cause class (the #680 dark-turn, true vector)
5
+ *
6
+ * The activity feed is TOOL-driven: it opens only when a tool emits a non-null
7
+ * label (`drainActivitySummary`). A turn dominated by model thinking, or by
8
+ * suppressed-by-design tools (typing / memory recall / reply), emits no label —
9
+ * so the feed never opens and a long turn reads as pure silence until the 300s
10
+ * silence-poke. Turn #680 was exactly this: 335 s alive, `tools=4`, yet
11
+ * `feedOpened=false / activityMsgId=none` the entire time.
12
+ *
13
+ * The fix (gateway `feedHeartbeatTick`): once a turn has been alive for
14
+ * `FEED_LIVENESS_OPEN_MS` with no labelled tool yet, open a minimal `Working…`
15
+ * feed and let the existing 6 s heartbeat climb its elapsed. The first real
16
+ * tool label takes over and its edit replaces the placeholder; a pure-thinking
17
+ * turn finalizes to `✓ Working…` rather than freezing on the live line.
18
+ *
19
+ * ## Precondition (set on the test-harness agent for this run)
20
+ *
21
+ * SWITCHROOM_FEED_LIVENESS_OPEN_MS=6000 (shrinks the default 12 s so the
22
+ * window is exercised within budget)
23
+ *
24
+ * The scenario does not hard-require it — at the 12 s default the feed still
25
+ * opens, just later; the deadlines below tolerate either.
26
+ *
27
+ * ## What it asserts (asymmetric — non-determinism is handled, not faked)
28
+ *
29
+ * The trigger is a "think, then answer at length, use NO tools" prompt. The
30
+ * model's exact behaviour is not fully forceable, so the branches are:
31
+ *
32
+ * PASS — a `Working…` activity-feed message appeared. ONLY the
33
+ * liveness path produces a bare "Working…" feed (a tool would
34
+ * carry a tool label), so this is positive proof the timer
35
+ * opened the feed on a tool-less turn.
36
+ * INCONCLUSIVE — the agent used a tool anyway (feed opened with a tool label,
37
+ * no "Working…"). The liveness path was not exercised; not a
38
+ * failure of the fix. Warn + pass.
39
+ * INCONCLUSIVE — the turn was too short (answer landed before the threshold).
40
+ * Warn + pass.
41
+ * HARD FAIL — the turn ran clearly longer than the threshold with NO feed
42
+ * of any kind, yet produced an answer. Liveness should have
43
+ * opened a feed and did not — the regression this guard exists
44
+ * to catch.
45
+ * FAIL — no answer at all within budget (wedged).
46
+ */
47
+
48
+ import { describe, expect, it } from "vitest";
49
+ import { spinUp } from "../harness.js";
50
+ import { isActivityFeedMessage } from "../assertions.js";
51
+ import type { ObservedMessage } from "../driver.js";
52
+
53
+ /**
+ * Minimum trimmed length, in characters, for a non-feed bot message to count
+ * as the substantive answer; anything shorter (e.g. a brief ack) is ignored.
+ */
54
+ const MIN_ANSWER_CHARS = 200;
55
+
56
+ /**
+ * Overall per-test timeout in milliseconds, passed as the timeout argument of
+ * `it(...)`. It has to cover the 110 s observation window the test body waits
+ * for, plus harness spin-up and tear-down.
+ */
57
+ const OVERALL_BUDGET_MS = 150_000;
58
+
59
+ /**
60
+ * Workload: think, then answer at length, with NO tools. The long answer
61
+ * generation holds the turn open past the liveness threshold with no tool
62
+ * label, which is exactly the condition that should open the `Working…` feed.
+ * The requested essay (at least 450 words) is far longer than
+ * MIN_ANSWER_CHARS, so a compliant reply always qualifies as the answer.
63
+ */
64
+ const THINKING_WORKLOAD_PROMPT =
65
+ "Do NOT use any tools at all for this — no Bash, no Read, no memory search, " +
66
+ "nothing. Just think carefully and then write me a thorough, detailed " +
67
+ "explanation (at least 450 words) of how the TCP three-way handshake works, " +
68
+ "including SYN, SYN-ACK, ACK, sequence numbers, and what happens if the final " +
69
+ "ACK is lost. Take your time getting it right, then reply with the full essay " +
70
+ "in one message.";
71
+
72
/**
 * UAT scenario: a tool-less, thinking-only turn must still surface a live
 * `Working…` activity feed.
 *
 * Flow: spin up the test-harness agent, send the thinking-only prompt, then
 * observe bot messages for up to 110 s while recording
 *   - the first activity-feed message whose text contains "Working"
 *     (treated as the liveness placeholder),
 *   - the first activity-feed message without it (treated as a tool label),
 *   - the first non-feed, non-edited message of at least MIN_ANSWER_CHARS
 *     characters (the answer).
 *
 * Outcome branches, in priority order:
 *   1. liveness feed seen            → PASS
 *   2. tool-label feed seen          → INCONCLUSIVE (warn), answer still required
 *   3. answer in under 8 s, no feed  → INCONCLUSIVE (warn)
 *   4. no answer                     → FAIL (turn may be wedged)
 *   5. answer but no feed at all     → HARD FAIL (dark-turn regression)
 *
 * Cleanup runs on every exit path: the message stream is closed (best effort,
 * bounded) and the harness is always torn down.
 */
describe("uat: liveness-driven feed open (thinking-only turn stays visible)", () => {
  /** Total time spent observing the bot's messages after the prompt is sent. */
  const OBSERVE_WINDOW_MS = 110_000;

  /**
   * A feed-less turn that answers faster than this is treated as "too short
   * for liveness to have fired".
   * NOTE(review): the file header says the 12 s default threshold is also
   * tolerated, but a feed-less turn that finishes between 8 s and 12 s would
   * hard-fail at that default — confirm the intended cutoff.
   */
  const TOO_SHORT_TURN_MS = 8_000;

  /** Upper bound on how long cleanup waits for the message stream to close. */
  const STREAM_CLOSE_GRACE_MS = 2_000;

  it(
    "opens a 'Working…' feed for a turn that emits no tool label",
    async () => {
      const sc = await spinUp({ agent: "test-harness" });
      try {
        const iter = sc.driver
          .observeMessages(sc.botUserId)
          [Symbol.asyncIterator]();

        try {
          await sc.sendDM(THINKING_WORKLOAD_PROMPT);
          const sentAt = Date.now();
          console.log("[liveness-feed] prompt sent; watching for feed + answer…");

          // Drain the stream until EITHER both a liveness feed and an answer
          // have been seen OR the window expires. Track which kind of feed
          // (liveness vs tool) we saw.
          let livenessFeed: ObservedMessage | null = null;
          let toolFeed: ObservedMessage | null = null;
          let answer: ObservedMessage | null = null;
          let answerAt = 0;

          const deadline = Date.now() + OBSERVE_WINDOW_MS;
          while (Date.now() < deadline) {
            if (livenessFeed && answer) break;
            const remaining = deadline - Date.now();

            // Race the next message against the remaining window. The timer is
            // cleared as soon as the race settles; otherwise every observed
            // message would leave a long-lived timer armed after the test ends.
            let windowTimer: ReturnType<typeof setTimeout> | undefined;
            const windowExpired = new Promise<{ done: true; value: undefined }>(
              (resolve) => {
                windowTimer = setTimeout(
                  () => resolve({ done: true, value: undefined }),
                  Math.max(0, remaining),
                );
              },
            );
            const next = await Promise.race([iter.next(), windowExpired]).finally(
              () => clearTimeout(windowTimer),
            );

            if (next.done || next.value == null) break;
            const m = next.value as ObservedMessage;
            // Ignore the driver's own messages (the prompt echo).
            if (m.senderUserId === sc.driverUserId) continue;

            if (isActivityFeedMessage(m)) {
              // A "Working…" feed body is the liveness placeholder; anything else
              // is a tool-label feed. Only the first of each kind is recorded.
              if (/Working/.test(m.text)) {
                if (!livenessFeed) {
                  livenessFeed = m;
                  console.log(
                    `[liveness-feed] LIVENESS feed opened at +${Date.now() - sentAt}ms: ` +
                      JSON.stringify(m.text.slice(0, 120)),
                  );
                }
              } else if (!toolFeed) {
                toolFeed = m;
                console.log(
                  `[liveness-feed] tool-label feed opened at +${Date.now() - sentAt}ms: ` +
                    JSON.stringify(m.text.slice(0, 120)),
                );
              }
              continue;
            }
            // Edits of non-feed messages are never counted as the answer.
            if (m.edited) continue;
            if (m.text.trim().length >= MIN_ANSWER_CHARS && !answer) {
              answer = m;
              answerAt = Date.now();
              console.log(
                `[liveness-feed] answer landed at +${answerAt - sentAt}ms (len=${m.text.trim().length}).`,
              );
            }
          }

          // With no answer, measure up to "now" so the failure message still
          // reports how long the turn was observed.
          const turnSpanMs = (answerAt || Date.now()) - sentAt;

          // ── Branch resolution ───────────────────────────────────────────────
          if (livenessFeed) {
            // PASS: the liveness timer opened a feed on a tool-less turn.
            expect(
              livenessFeed,
              "liveness feed should be present in the PASS branch",
            ).not.toBeNull();
            // Confirm it carries the in-progress placeholder shape.
            expect(livenessFeed!.text).toMatch(/Working/);
            console.log(
              "[liveness-feed] PASS — liveness-driven feed open confirmed.",
            );
            return;
          }

          if (toolFeed) {
            console.warn(
              "[liveness-feed] INCONCLUSIVE — the agent used a tool, so the feed " +
                "opened via the normal tool-label path and the liveness timer was " +
                "not exercised. Not a failure of the fix.",
            );
            // Still require the turn completed.
            expect(
              answer,
              "even in the tool-feed branch the turn must complete with an answer",
            ).not.toBeNull();
            return;
          }

          // No feed of any kind appeared.
          if (answer && turnSpanMs < TOO_SHORT_TURN_MS) {
            console.warn(
              `[liveness-feed] INCONCLUSIVE — turn completed in ${turnSpanMs}ms, ` +
                "below the liveness threshold; no feed was expected.",
            );
            return;
          }

          // No feed, and the turn ran long enough that liveness SHOULD have opened.
          expect(
            answer,
            "FAIL — no answer arrived within budget; the turn may be wedged.",
          ).not.toBeNull();

          expect(
            livenessFeed,
            "HARD FAIL — the turn ran for " +
              `${turnSpanMs}ms (well past the liveness threshold) with NO activity ` +
              "feed of any kind, yet produced an answer. The liveness timer should " +
              "have opened a 'Working…' feed and did not — this is the #680 " +
              "dark-turn regression.",
          ).not.toBeNull();
        } finally {
          // Close the stream on EVERY exit path (early returns and failed
          // expectations included). The wait is bounded because a next() call may
          // still be in flight after a window timeout; if the stream is an async
          // generator, return() would queue behind it — presumably it is, TODO
          // confirm against the driver. Close errors are logged rather than
          // thrown so they cannot mask the test's own assertion failure.
          let graceTimer: ReturnType<typeof setTimeout> | undefined;
          const graceExpired = new Promise<void>((resolve) => {
            graceTimer = setTimeout(resolve, STREAM_CLOSE_GRACE_MS);
          });
          const closed = (async () => {
            await iter.return?.();
          })().catch((err: unknown) => {
            console.warn("[liveness-feed] closing the message stream failed:", err);
          });
          await Promise.race([closed, graceExpired]);
          clearTimeout(graceTimer);
        }
      } finally {
        await sc.tearDown();
      }
    },
    OVERALL_BUDGET_MS,
  );
});