switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/dist/agent-scheduler/index.js +122 -88
  2. package/dist/auth-broker/index.js +463 -177
  3. package/dist/cli/autoaccept-poll.js +4842 -35
  4. package/dist/cli/drive-write-pretool.mjs +17 -14
  5. package/dist/cli/notion-write-pretool.mjs +117 -86
  6. package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
  7. package/dist/cli/self-improve-stop.mjs +428 -0
  8. package/dist/cli/skill-validate-pretool.mjs +72 -72
  9. package/dist/cli/switchroom.js +3249 -1241
  10. package/dist/cli/ui/index.html +1 -1
  11. package/dist/host-control/main.js +2833 -355
  12. package/dist/vault/approvals/kernel-server.js +7482 -7439
  13. package/dist/vault/broker/server.js +11315 -11272
  14. package/examples/minimal.yaml +1 -0
  15. package/examples/switchroom.yaml +1 -0
  16. package/package.json +3 -3
  17. package/profiles/_base/start.sh.hbs +88 -1
  18. package/profiles/_shared/execution-discipline.md.hbs +18 -0
  19. package/profiles/default/CLAUDE.md.hbs +3 -22
  20. package/telegram-plugin/.claude-plugin/plugin.json +2 -2
  21. package/telegram-plugin/answer-stream-flag.ts +12 -49
  22. package/telegram-plugin/answer-stream.ts +5 -150
  23. package/telegram-plugin/auth-snapshot-format.ts +280 -48
  24. package/telegram-plugin/auto-fallback-fleet.ts +44 -1
  25. package/telegram-plugin/context-exhaustion.ts +12 -0
  26. package/telegram-plugin/demo-mask.ts +154 -0
  27. package/telegram-plugin/dist/bridge/bridge.js +167 -124
  28. package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
  29. package/telegram-plugin/dist/server.js +215 -172
  30. package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
  31. package/telegram-plugin/draft-stream.ts +47 -410
  32. package/telegram-plugin/final-answer-detect.ts +17 -12
  33. package/telegram-plugin/fleet-fallback-resume.ts +131 -0
  34. package/telegram-plugin/format.ts +56 -19
  35. package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
  36. package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
  37. package/telegram-plugin/gateway/auth-command.ts +70 -14
  38. package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
  39. package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
  40. package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
  41. package/telegram-plugin/gateway/current-turn-map.ts +188 -0
  42. package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
  43. package/telegram-plugin/gateway/effort-command.ts +8 -3
  44. package/telegram-plugin/gateway/emission-authority.ts +369 -0
  45. package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
  46. package/telegram-plugin/gateway/gateway.ts +1837 -291
  47. package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
  48. package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
  49. package/telegram-plugin/gateway/represent-guard.ts +72 -0
  50. package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
  51. package/telegram-plugin/gateway/status-surface-log.ts +14 -3
  52. package/telegram-plugin/history.ts +33 -11
  53. package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
  54. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
  55. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
  56. package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
  57. package/telegram-plugin/issues-card.ts +4 -0
  58. package/telegram-plugin/model-unavailable.ts +124 -0
  59. package/telegram-plugin/narrative-dedup.ts +69 -0
  60. package/telegram-plugin/over-ping-safety-net.ts +70 -4
  61. package/telegram-plugin/package.json +3 -3
  62. package/telegram-plugin/pending-work-progress.ts +12 -0
  63. package/telegram-plugin/permission-rule.ts +32 -5
  64. package/telegram-plugin/permission-title.ts +152 -9
  65. package/telegram-plugin/quota-check.ts +13 -0
  66. package/telegram-plugin/quota-watch.ts +135 -7
  67. package/telegram-plugin/registry/turns-schema.test.ts +24 -0
  68. package/telegram-plugin/registry/turns-schema.ts +9 -0
  69. package/telegram-plugin/runtime-metrics.ts +13 -0
  70. package/telegram-plugin/session-tail.ts +96 -11
  71. package/telegram-plugin/silence-poke.ts +170 -24
  72. package/telegram-plugin/slot-banner-driver.ts +3 -0
  73. package/telegram-plugin/status-no-truncate.ts +44 -0
  74. package/telegram-plugin/status-reactions.ts +20 -3
  75. package/telegram-plugin/stream-controller.ts +4 -23
  76. package/telegram-plugin/stream-reply-handler.ts +6 -24
  77. package/telegram-plugin/streaming-metrics.ts +91 -0
  78. package/telegram-plugin/subagent-watcher.ts +212 -66
  79. package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
  80. package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
  81. package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
  82. package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
  83. package/telegram-plugin/tests/answer-stream.test.ts +2 -411
  84. package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
  85. package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
  86. package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
  87. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
  88. package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
  89. package/telegram-plugin/tests/demo-mask.test.ts +127 -0
  90. package/telegram-plugin/tests/draft-stream.test.ts +0 -827
  91. package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
  92. package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
  93. package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
  94. package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
  95. package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
  96. package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
  97. package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
  98. package/telegram-plugin/tests/feed-survival.test.ts +526 -0
  99. package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
  100. package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
  101. package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
  102. package/telegram-plugin/tests/history.test.ts +60 -0
  103. package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
  104. package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
  105. package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
  106. package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
  107. package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
  108. package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
  109. package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
  110. package/telegram-plugin/tests/permission-rule.test.ts +17 -0
  111. package/telegram-plugin/tests/permission-title.test.ts +206 -17
  112. package/telegram-plugin/tests/quota-watch.test.ts +252 -9
  113. package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
  114. package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
  115. package/telegram-plugin/tests/represent-guard.test.ts +162 -0
  116. package/telegram-plugin/tests/session-tail.test.ts +147 -3
  117. package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
  118. package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
  119. package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
  120. package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
  121. package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
  122. package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
  123. package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
  124. package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
  125. package/telegram-plugin/tests/telegram-format.test.ts +101 -6
  126. package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
  127. package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
  128. package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
  129. package/telegram-plugin/tests/tool-labels.test.ts +67 -0
  130. package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
  131. package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
  132. package/telegram-plugin/tests/welcome-text.test.ts +32 -3
  133. package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
  134. package/telegram-plugin/tool-activity-summary.ts +375 -58
  135. package/telegram-plugin/turn-liveness-floor.ts +240 -0
  136. package/telegram-plugin/uat/assertions.ts +115 -0
  137. package/telegram-plugin/uat/driver.ts +68 -0
  138. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
  139. package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
  140. package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
  141. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
  142. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
  143. package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
  144. package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
  145. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
  146. package/telegram-plugin/welcome-text.ts +13 -1
  147. package/telegram-plugin/worker-activity-feed.ts +157 -82
  148. package/telegram-plugin/draft-transport.ts +0 -122
  149. package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
  150. package/telegram-plugin/tests/draft-transport.test.ts +0 -211
@@ -0,0 +1,478 @@
1
+ /**
2
+ * Foreground activity-feed visibility across a between-tools silent-thinking gap.
3
+ *
4
+ * ## Cause class — bug #680, the vector the defer does NOT cover
5
+ *
6
+ * The gateway maintains a live activity-feed Telegram message while a foreground
7
+ * turn is in progress. Its silence-fallback timer (`SILENCE_FALLBACK_MS`, shrunk
8
+ * to 20 000 ms on the test-harness for these scenarios) fires when no liveness
9
+ * signal arrives for longer than the threshold. On fire, `currentTurn` is nulled:
10
+ * the gateway stops sending activity-feed edits and the feed goes dark — even
11
+ * while the agent is still working.
12
+ *
13
+ * The fleet runs `SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS=1` (prod default). That
14
+ * defer HOLDS the fallback while a tool is still in-flight — so a long in-flight
15
+ * tool (e.g. `sleep 35`) does NOT trigger the fallback; the heartbeat keeps the
16
+ * feed lit. The sibling scenario `jtbd-foreground-feed-visibility-dm.test.ts`
17
+ * pins that deterministic, defer-protected case.
18
+ *
19
+ * What the defer does NOT protect is a long SILENT MODEL-THINKING GAP between
20
+ * two tool calls, where:
21
+ * - the prior tool has returned (no tool in-flight),
22
+ * - no answer-stream draft is emitting, and
23
+ * - no new tool-label has rendered yet.
24
+ *
25
+ * In that window the silence clock runs UNPROTECTED. If the model thinks silently
26
+ * for longer than 20 s before issuing the next tool call or starting its answer,
27
+ * the fallback fires, `currentTurn` is nulled, and the feed goes dark — while the
28
+ * agent is still working on the turn. This is the exact #680 regression vector.
29
+ *
30
+ * ## Why this is BEST-EFFORT and non-deterministic
31
+ *
32
+ * A gap long enough to trigger the 20 s fallback CANNOT be forced reliably. The
33
+ * model may:
34
+ * (a) Stream a partial answer-draft, which resets the silence clock — the gap
35
+ * never opens 20 s unprotected regardless of model latency.
36
+ * (b) Think quickly and issue its answer before 20 s elapses.
37
+ * (c) Think slowly and silently for > 20 s, opening the exact vector.
38
+ *
39
+ * Only case (c) lets us observe the bug. Cases (a) and (b) are inconclusive —
40
+ * a lit feed in either case does NOT prove the bug is absent; it just means the
41
+ * trigger condition didn't materialize this run.
42
+ *
43
+ * The workload prompt is engineered to maximize the probability of case (c):
44
+ * 1. One fast tool first (`date` via Bash) so the feed OPENS.
45
+ * 2. An explicit instruction to think silently for at least 30 s without further
46
+ * tools and without sending anything.
47
+ * 3. A genuinely hard open-ended reasoning question (multi-factor design
48
+ * estimation) that rewards sustained internal deliberation.
49
+ *
50
+ * This structural bias raises the chance the model hits > 20 s of unprotected
51
+ * silence, but cannot guarantee it on every run.
52
+ *
53
+ * ## Asymmetric assertion logic
54
+ *
55
+ * The scenario distinguishes five outcomes; Branch 4 additionally runs the Branch 5 check:
56
+ *
57
+ * Branch 1 — FEED NEVER OPENED: no activity-feed message appeared after the
58
+ * first tool call. Cannot observe the #680 vector at all. `console.warn` +
59
+ * return INCONCLUSIVE (no assertion failure).
60
+ *
61
+ * Branch 2 — TURN ENDED BEFORE THE MARK: the final answer arrived BEFORE the
62
+ * fallback-window mark was reached. The dark window cannot have been caused
63
+ * by a mid-think-gap null because the turn was already complete. This is NOT
64
+ * the bug; treat as pass/INCONCLUSIVE. `console.warn` + return.
65
+ *
66
+ * Branch 3 — FEED WENT DARK AND TURN WAS STILL IN-PROGRESS AT THE MARK:
67
+ * the feed existed before the mark; after the mark there were no further feed
68
+ * edits; and the final answer had NOT yet arrived at the mark (the scenario
69
+ * does not wait for it). This is #680 reproduced. HARD FAIL with a precise message naming
70
+ * the dark window and that `currentTurn` was nulled mid-think-gap.
71
+ *
72
+ * Branch 4 — FEED STAYED LIT PAST THE MARK: the feed kept receiving edits after
73
+ * the mark. Ambiguous — either the fix held, OR the model didn't produce a
74
+ * > 20 s silent gap (cases (a)/(b) above). Cannot distinguish without deeper
75
+ * gateway instrumentation. `console.warn` explaining both possibilities and
76
+ * that a lit feed is NOT proof of fix here — that's the sibling guard's job.
77
+ * INCONCLUSIVE for the feed (no assertion failure); continues into the Branch 5 check.
78
+ *
79
+ * Branch 5 — TURN WEDGED (final answer never arrives): distinct from #680's
80
+ * trigger; could be a compound regression or a model hang. HARD FAIL.
81
+ *
82
+ * ## Tolerances and timing
83
+ *
84
+ * - FALLBACK_WINDOW_MS (25 000): wait time from the moment the feed FIRST
85
+ * appears until the mark, when the after-mark snapshot is taken. Chosen to be safely above
86
+ * the shrunk 20 s fallback while leaving room in the budget.
87
+ * - POST_MARK_FEED_WAIT_MS (12 000): how long to drain the live stream after the
88
+ * mark for further feed edits. Short but enough for one heartbeat cycle.
89
+ * - ANSWER_BUDGET_MS (90 000): how long to wait for the final answer after the
90
+ * mark. Generous because the model may still be mid-think at the mark.
91
+ * - OVERALL_BUDGET_MS (150 000): total test timeout including settle, turn
92
+ * onset, the window, and final-answer drain.
93
+ *
94
+ * ## Env precondition (operator must set on test-harness agent)
95
+ *
96
+ * SWITCHROOM_SILENCE_FALLBACK_MS=20000 (on the agent's env: block)
97
+ * SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS=1 (prod default — defer must be ON
98
+ * so we're isolating the between-tools vector, not the in-flight-tool one)
99
+ *
100
+ * Mirror the fallback value into the UAT env:
101
+ * SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000 (in repo-root .env)
102
+ *
103
+ * The scenario reads `SWITCHROOM_UAT_SILENCE_FALLBACK_MS` at runtime to detect
104
+ * whether the precondition is satisfied. If absent or > 30 000, it skips with a
105
+ * clear warning rather than producing a vacuous green or a misleading timeout.
106
+ *
107
+ * ## Cross-reference
108
+ *
109
+ * This is the best-effort, non-deterministic HALF of the feed-visibility pair.
110
+ * The deterministic half (long in-flight tool, defer-protected) is:
111
+ * `jtbd-foreground-feed-visibility-dm.test.ts`
112
+ * Together the two cover the feed-visibility invariant: deterministically (there)
113
+ * and best-effort for the true #680 between-tools vector (here).
114
+ */
115
+
116
+ import { describe, it } from "vitest";
117
+ import { spinUp } from "../harness.js";
118
+ import { isActivityFeedMessage } from "../assertions.js";
119
+ import type { ObservedMessage } from "../driver.js";
120
+
121
+ /**
122
+ * UAT-env mirror of the shrunk fallback on the test-harness agent. The
123
+ * scenario checks this rather than querying the agent's process env
124
+ * (which it cannot reach) — so the operator must set both knobs in sync.
125
+ *
126
+ * Set `SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000` in repo-root .env.
127
+ */
128
+ const PRECONDITION_FALLBACK_MS = Number.parseInt(
129
+ process.env.SWITCHROOM_UAT_SILENCE_FALLBACK_MS ?? "",
130
+ 10,
131
+ );
132
+
133
+ /**
134
+ * Wait this long (ms) after the feed first appears before taking the
135
+ * after-mark snapshot. Must exceed the shrunk 20 s fallback so we
136
+ * are definitively past the point where the bug would have fired.
137
+ */
138
+ const FALLBACK_WINDOW_MS = 25_000;
139
+
140
+ /**
141
+ * After the mark, drain the live stream this long for further feed edits.
142
+ * One full heartbeat cycle (typically 5–8 s) plus slack.
143
+ */
144
+ const POST_MARK_FEED_WAIT_MS = 12_000;
145
+
146
+ /**
147
+ * Budget for the final answer to arrive after the mark. Generous because
148
+ * the model may still be in the middle of its think-gap at the mark.
149
+ */
150
+ const ANSWER_BUDGET_MS = 90_000;
151
+
152
+ /**
153
+ * A reply this many characters or longer is treated as the final answer.
154
+ * Avoids latching onto brief acks or one-liners that are not the deliverable.
155
+ */
156
+ const MIN_ANSWER_CHARS = 150;
157
+
158
+ /**
159
+ * Total test timeout. Sized for the typical path: settle (~8 s) + feed open
160
+ * (~30 s typical; the loop allows up to 90 s) + FALLBACK_WINDOW_MS (25 s) +
161
+ * POST_MARK_FEED_WAIT_MS (12 s) + a typical answer drain. NOTE: the worst-case
162
+ * sum of the waits (90 + 25 + 12 + 90 s) exceeds this, so the test timeout can fire first.
163
+ */
164
+ const OVERALL_BUDGET_MS = 150_000;
165
+
166
+ /**
167
+ * Workload prompt — engineered to maximize probability of a long silent
168
+ * thinking gap BETWEEN the first tool return and the final answer.
169
+ *
170
+ * Step 1: run `date` via Bash so the activity feed OPENS (the first
171
+ * PreToolUse label renders and the feed message is created).
172
+ *
173
+ * Step 2: after `date` returns, the model must think silently for AT LEAST
174
+ * 30 seconds — no further tools, no partial messages, no streaming drafts.
175
+ * The explicit instruction forbids shortcuts. The hard estimation question
176
+ * (multi-factor distributed-systems design) is chosen to reward sustained
177
+ * internal deliberation, making a long silent gap structurally plausible.
178
+ *
179
+ * Step 3: send ONE reply with the complete answer after the silent think.
180
+ *
181
+ * The 30 s instruction slightly exceeds the shrunk 20 s fallback so that
182
+ * IF the model obeys it and DOES stay silent, the threshold will be crossed.
183
+ * The asymmetric assertions (below) handle all cases where the model does not.
184
+ */
185
+ const THINK_GAP_WORKLOAD_PROMPT =
186
+ "First, run the `date` command via the Bash tool. " +
187
+ "Then — WITHOUT using any more tools and WITHOUT sending me anything yet — " +
188
+ "think carefully and silently for at least 30 seconds about the following design " +
189
+ "question: Given a distributed event-driven system where 50 microservices emit " +
190
+ "telemetry at varying rates (10 to 10 000 events/s per service), propose a " +
191
+ "tiered ingestion architecture that keeps end-to-end p99 latency under 200 ms, " +
192
+ "handles a 10x burst without data loss, and costs less than $2 000/month at " +
193
+ "steady state on a major cloud provider. Consider storage, compute, and egress. " +
194
+ "Do NOT stream partial thinking, do NOT send me interim updates, and do NOT call " +
195
+ "any further tools. Only after you have thought it through completely, reply once " +
196
+ "with your full architecture proposal.";
197
+
198
+ describe("uat: foreground activity-feed visibility across between-tools silent-thinking gap (#680)", () => {
199
+ it(
200
+ "BEST-EFFORT: feed does not go dark during a long silent model-think gap between tools",
201
+ async () => {
202
+ // ── Precondition guard ─────────────────────────────────────────────────
203
+ // If the operator has not shrunk the silence fallback to ≤ 30 000 ms on
204
+ // the test-harness agent (and mirrored it into the UAT env), the timing
205
+ // assertions are vacuous — the default 300 s fallback far exceeds the
206
+ // test budget and the scenario exits before it could detect a regression.
207
+ // Skip with a clear warning rather than silently producing a false green.
208
+ if (!Number.isFinite(PRECONDITION_FALLBACK_MS) || PRECONDITION_FALLBACK_MS > 30_000) {
209
+ console.warn(
210
+ "[uat/thinkgap-feed] SKIPPED — precondition not met.\n" +
211
+ " This scenario requires SWITCHROOM_SILENCE_FALLBACK_MS=20000 set on\n" +
212
+ " the test-harness agent AND SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000\n" +
213
+ " in the repo-root .env. Without it the silence fallback does not fire\n" +
214
+ " within the test window and the #680 between-tools vector cannot be\n" +
215
+ " exercised. See the header doc comment for setup instructions.",
216
+ );
217
+ return;
218
+ }
219
+
220
+ const sc = await spinUp({ agent: "test-harness" });
221
+ try {
222
+ // Start observing BEFORE sending so no messages (including the first
223
+ // activity-feed paint) are missed by the live stream.
224
+ const iter = sc.driver
225
+ .observeMessages(sc.botUserId)
226
+ [Symbol.asyncIterator]();
227
+
228
+ await sc.sendDM(THINK_GAP_WORKLOAD_PROMPT);
229
+
230
+ console.log("[thinkgap-feed] prompt sent; watching for activity-feed message…");
231
+
232
+ // ── Step 1: wait for the feed to OPEN ─────────────────────────────────
233
+ // The feed should open when the first tool label renders (the `date` Bash
234
+ // call). Give a generous budget to account for cold-start latency.
235
+ let feedMsg: ObservedMessage | null = null;
236
+ const feedDeadline = Date.now() + 90_000;
237
+
238
+ while (Date.now() < feedDeadline) {
239
+ const remaining = feedDeadline - Date.now();
240
+ const next = await Promise.race([
241
+ iter.next(),
242
+ new Promise<{ done: true; value: undefined }>((r) =>
243
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
244
+ ),
245
+ ]);
246
+ if (next.done || next.value == null) break;
247
+ const m = next.value as ObservedMessage;
248
+ if (m.senderUserId === sc.driverUserId) continue; // skip our own echo
249
+ if (isActivityFeedMessage(m)) {
250
+ feedMsg = m;
251
+ console.log(
252
+ `[thinkgap-feed] feed opened (id=${m.messageId}): ` +
253
+ JSON.stringify(m.text.slice(0, 120)),
254
+ );
255
+ break;
256
+ }
257
+ }
258
+
259
+ // ── Branch 1: FEED NEVER OPENED ─────────────────────────────────────
260
+ // Cannot observe the #680 vector. Inconclusive — do not fail, but warn
261
+ // loudly so the operator knows the test was unable to exercise the path.
262
+ if (feedMsg === null) {
263
+ console.warn(
264
+ "[thinkgap-feed] INCONCLUSIVE — the foreground activity-feed message\n" +
265
+ " never appeared after the first tool call. The #680 between-tools\n" +
266
+ " vector cannot be observed without an open feed.\n" +
267
+ " Possible causes: the agent did not use tools at all, the initial\n" +
268
+ " `date` call was too fast for a feed paint, or drainActivitySummary\n" +
269
+ " failed before the message was sent. Not treated as a test failure\n" +
270
+ " because the absence of the feed is a distinct (pre-vector) issue.",
271
+ );
272
+ await iter.return?.();
273
+ return;
274
+ }
275
+
276
+ const feedId = feedMsg.messageId;
277
+ const beforeMarkText = feedMsg.text;
278
+
279
+ // ── Concurrently drain for two events while the mark elapses ──────────
280
+ //
281
+ // While we wait FALLBACK_WINDOW_MS for the mark to pass, collect:
282
+ // (a) any further feed EDITS (proves the feed stayed live),
283
+ // (b) the FINAL ANSWER (proves the turn ended before the mark —
284
+ // Branch 2, which is NOT the bug).
285
+ //
286
+ // Both are checked after the wait.
287
+ let sawFeedEditDuringWait = false;
288
+ let finalAnswerBeforeMark: ObservedMessage | null = null;
289
+
290
+ const markAt = Date.now() + FALLBACK_WINDOW_MS;
291
+ console.log(
292
+ `[thinkgap-feed] waiting ${FALLBACK_WINDOW_MS}ms for fallback window to elapse…`,
293
+ );
294
+
295
+ // Drain the live stream until the mark elapses.
296
+ while (Date.now() < markAt) {
297
+ const remaining = markAt - Date.now();
298
+ const next = await Promise.race([
299
+ iter.next(),
300
+ new Promise<{ done: true; value: undefined }>((r) =>
301
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
302
+ ),
303
+ ]);
304
+ if (next.done || next.value == null) break;
305
+ const m = next.value as ObservedMessage;
306
+ if (m.senderUserId === sc.driverUserId) continue;
307
+
308
+ // A feed EDIT arriving during the wait means the feed is still live.
309
+ if (m.edited && m.messageId === feedId) {
310
+ sawFeedEditDuringWait = true;
311
+ console.log(
312
+ `[thinkgap-feed] feed edit arrived during wait (id=${feedId}): ` +
313
+ JSON.stringify(m.text.slice(0, 120)),
314
+ );
315
+ }
316
+
317
+ // A substantive non-feed, non-edit message is the final answer.
318
+ if (
319
+ !m.edited &&
320
+ !isActivityFeedMessage(m) &&
321
+ m.text.trim().length >= MIN_ANSWER_CHARS &&
322
+ finalAnswerBeforeMark === null
323
+ ) {
324
+ finalAnswerBeforeMark = m;
325
+ console.log(
326
+ `[thinkgap-feed] final answer arrived BEFORE the mark ` +
327
+ `(id=${m.messageId}): ` +
328
+ JSON.stringify(m.text.slice(0, 120)),
329
+ );
330
+ }
331
+ }
332
+
333
+ // ── Branch 2: TURN ENDED BEFORE THE MARK ───────────────────────────
334
+ // The final answer landed before the FALLBACK_WINDOW_MS mark. The turn
335
+ // was complete before the clock could have fired — any feed darkness
336
+ // observed after this point would just be normal post-turn cleanup.
337
+ // This is NOT the bug; treat as inconclusive/pass.
338
+ if (finalAnswerBeforeMark !== null) {
339
+ console.warn(
340
+ "[thinkgap-feed] INCONCLUSIVE — the final answer arrived BEFORE the\n" +
341
+ ` ${FALLBACK_WINDOW_MS}ms fallback-window mark. The turn completed\n` +
342
+ " before the silence clock could have fired, so any feed darkness after\n" +
343
+ " this point reflects normal post-turn teardown, not the #680 regression.\n" +
344
+ " The model did not produce a long enough silent think-gap to trigger\n" +
345
+ " the fallback mid-turn. Not a failure — run again or increase the\n" +
346
+ " prompt complexity to maximize the silent gap.",
347
+ );
348
+ await iter.return?.();
349
+ return;
350
+ }
351
+
352
+ // The final answer had NOT yet arrived by the mark. Now check whether
353
+ // the feed got further edits AFTER the mark.
354
+
355
+ // ── Poll for post-mark feed edits ────────────────────────────────────
356
+ // Fetch a fresh snapshot of the feed message (to check whether its body
357
+ // changed vs beforeMarkText) and drain POST_MARK_FEED_WAIT_MS of live
358
+ // stream for edit events on the feed message.
359
+ const afterMarkSnapshot = await sc.driver.getMessage(sc.botUserId, feedId);
360
+ const bodyChangedAfterMark =
361
+ afterMarkSnapshot !== null && afterMarkSnapshot.text !== beforeMarkText;
362
+
363
+ let sawFeedEditAfterMark = bodyChangedAfterMark;
364
+ const postMarkDeadline = Date.now() + POST_MARK_FEED_WAIT_MS;
365
+
366
+ while (!sawFeedEditAfterMark && Date.now() < postMarkDeadline) {
367
+ const remaining = postMarkDeadline - Date.now();
368
+ const next = await Promise.race([
369
+ iter.next(),
370
+ new Promise<{ done: true; value: undefined }>((r) =>
371
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
372
+ ),
373
+ ]);
374
+ if (next.done || next.value == null) break;
375
+ const m = next.value as ObservedMessage;
376
+ if (m.senderUserId === sc.driverUserId) continue;
377
+ // An edit of the feed message that arrived after the mark.
378
+ if (m.edited && m.messageId === feedId) {
379
+ sawFeedEditAfterMark = true;
380
+ console.log(
381
+ `[thinkgap-feed] feed edit confirmed after mark (id=${feedId}): ` +
382
+ JSON.stringify(m.text.slice(0, 120)),
383
+ );
384
+ }
385
+ }
386
+
387
+ // ── Branch 4: FEED STAYED LIT ────────────────────────────────────────
388
+ // Either the fix held and the defer (or some other mechanism) prevented
389
+ // the fallback from firing, OR the model never produced a > 20 s silent
390
+ // gap (cases (a)/(b) in the header). We cannot distinguish these without
391
+ // deeper gateway instrumentation. A lit feed is NOT proof the bug is
392
+ // fixed — that proof lives in the sibling deterministic scenario. Warn
393
+ // and return without failing so we don't give a false green signal.
394
+ if (sawFeedEditDuringWait || sawFeedEditAfterMark) {
395
+ console.warn(
396
+ "[thinkgap-feed] INCONCLUSIVE — the feed kept receiving edits during\n" +
397
+ " and/or after the fallback-window mark. Two possible explanations:\n" +
398
+ " (A) The #680 bug is absent (fix holds) — the gateway did not null\n" +
399
+ " currentTurn despite the silent gap.\n" +
400
+ " (B) The model streamed a partial answer-draft or issued the next tool\n" +
401
+ " before 20 s elapsed, resetting the silence clock — the trigger\n" +
402
+ " condition never materialized.\n" +
403
+ " A lit feed is NOT proof-of-fix here; the deterministic guard for\n" +
404
+ " that is jtbd-foreground-feed-visibility-dm.test.ts (in-flight tool\n" +
405
+ " + defer). Treat as best-effort pass.",
406
+ );
407
+ // Fall through to the final-answer drain below so we confirm
408
+ // the turn eventually completed — don't return yet.
409
+ } else {
410
+ // Feed went dark AND the turn was still in progress at the mark.
411
+ // This is the #680 regression vector reproduced.
412
+
413
+ // ── Branch 3: FEED WENT DARK, TURN STILL IN PROGRESS ───────────────
414
+ // Hard fail with a precise description of what happened.
415
+ const darkWindowMs = FALLBACK_WINDOW_MS + POST_MARK_FEED_WAIT_MS;
416
+ throw new Error(
417
+ `[thinkgap-feed] FAIL — bug #680 (between-tools silent-think-gap) reproduced.\n` +
418
+ ` The activity-feed message (id=${feedId}) was present before the\n` +
419
+ ` ${FALLBACK_WINDOW_MS}ms mark and received no further edits in the\n` +
420
+ ` ${POST_MARK_FEED_WAIT_MS}ms window after the mark (total dark window\n` +
421
+ ` ≥ ${darkWindowMs}ms), while the final answer had NOT yet arrived.\n` +
422
+ ` This means currentTurn was nulled mid-think-gap by the silence-\n` +
423
+ ` fallback handler (SWITCHROOM_SILENCE_FALLBACK_MS=${PRECONDITION_FALLBACK_MS}ms)\n` +
424
+ ` after a silent model-thinking gap between the first tool's return\n` +
425
+ ` and the agent's reply, with no tool in-flight and no answer-stream\n` +
426
+ ` draft to reset the clock. The defer (SWITCHROOM_SILENCE_DEFER_\n` +
427
+ ` INFLIGHT_TOOLS=1) does not cover this vector.\n` +
428
+ ` Feed body at dark: ${JSON.stringify(beforeMarkText.slice(0, 100))}`,
429
+ );
430
+ }
431
+
432
+ // ── Branch 5: TURN WEDGED (final answer never arrives) ───────────────
433
+ // Drain for the full answer budget. If the answer lands we're done
434
+ // (success / inconclusive-pass). If it never lands, hard fail.
435
+ let finalAnswer: ObservedMessage | null = finalAnswerBeforeMark;
436
+ const answerDeadline = Date.now() + ANSWER_BUDGET_MS;
437
+
438
+ while (finalAnswer === null && Date.now() < answerDeadline) {
439
+ const remaining = answerDeadline - Date.now();
440
+ const next = await Promise.race([
441
+ iter.next(),
442
+ new Promise<{ done: true; value: undefined }>((r) =>
443
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
444
+ ),
445
+ ]);
446
+ if (next.done || next.value == null) break;
447
+ const m = next.value as ObservedMessage;
448
+ if (m.senderUserId === sc.driverUserId) continue;
449
+ if (m.edited) continue; // edits are feed updates, not the final answer
450
+ if (isActivityFeedMessage(m)) continue;
451
+ if (m.text.trim().length >= MIN_ANSWER_CHARS) {
452
+ finalAnswer = m;
453
+ console.log(
454
+ `[thinkgap-feed] final answer received (id=${m.messageId}): ` +
455
+ JSON.stringify(m.text.slice(0, 180)),
456
+ );
457
+ }
458
+ }
459
+
460
+ if (finalAnswer === null) {
461
+ throw new Error(
462
+ `[thinkgap-feed] FAIL — the turn never produced a substantive reply\n` +
463
+ ` (≥${MIN_ANSWER_CHARS} chars) within the answer budget (${ANSWER_BUDGET_MS}ms).\n` +
464
+ ` The turn appears to have wedged. This may be a compound regression\n` +
465
+ ` where the silence-fallback nulled currentTurn and also suppressed\n` +
466
+ ` the answer path, or the model is hung waiting on something. Check\n` +
467
+ ` the test-harness gateway log for error or timeout signals.`,
468
+ );
469
+ }
470
+
471
+ await iter.return?.();
472
+ } finally {
473
+ await sc.tearDown();
474
+ }
475
+ },
476
+ OVERALL_BUDGET_MS,
477
+ );
478
+ });