switchroom 0.15.45 → 0.16.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +122 -88
- package/dist/auth-broker/index.js +463 -177
- package/dist/cli/autoaccept-poll.js +4842 -35
- package/dist/cli/drive-write-pretool.mjs +17 -14
- package/dist/cli/notion-write-pretool.mjs +117 -86
- package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
- package/dist/cli/self-improve-stop.mjs +428 -0
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +3158 -1178
- package/dist/host-control/main.js +2833 -355
- package/dist/vault/approvals/kernel-server.js +7479 -7439
- package/dist/vault/broker/server.js +11312 -11272
- package/examples/minimal.yaml +1 -0
- package/examples/switchroom.yaml +1 -0
- package/package.json +3 -3
- package/profiles/_base/start.sh.hbs +88 -1
- package/profiles/_shared/execution-discipline.md.hbs +18 -0
- package/profiles/default/CLAUDE.md.hbs +0 -19
- package/telegram-plugin/.claude-plugin/plugin.json +2 -2
- package/telegram-plugin/answer-stream-flag.ts +12 -49
- package/telegram-plugin/answer-stream.ts +5 -150
- package/telegram-plugin/auth-snapshot-format.ts +280 -48
- package/telegram-plugin/auto-fallback-fleet.ts +44 -1
- package/telegram-plugin/context-exhaustion.ts +12 -0
- package/telegram-plugin/demo-mask.ts +154 -0
- package/telegram-plugin/dist/bridge/bridge.js +167 -124
- package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
- package/telegram-plugin/dist/server.js +215 -172
- package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
- package/telegram-plugin/draft-stream.ts +47 -410
- package/telegram-plugin/final-answer-detect.ts +17 -12
- package/telegram-plugin/fleet-fallback-resume.ts +131 -0
- package/telegram-plugin/format.ts +56 -19
- package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
- package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
- package/telegram-plugin/gateway/auth-command.ts +70 -14
- package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
- package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
- package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
- package/telegram-plugin/gateway/current-turn-map.ts +188 -0
- package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
- package/telegram-plugin/gateway/effort-command.ts +8 -3
- package/telegram-plugin/gateway/emission-authority.ts +369 -0
- package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
- package/telegram-plugin/gateway/gateway.ts +1837 -291
- package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
- package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
- package/telegram-plugin/gateway/represent-guard.ts +72 -0
- package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
- package/telegram-plugin/gateway/status-surface-log.ts +14 -3
- package/telegram-plugin/history.ts +33 -11
- package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
- package/telegram-plugin/issues-card.ts +4 -0
- package/telegram-plugin/model-unavailable.ts +124 -0
- package/telegram-plugin/narrative-dedup.ts +69 -0
- package/telegram-plugin/over-ping-safety-net.ts +70 -4
- package/telegram-plugin/package.json +3 -3
- package/telegram-plugin/pending-work-progress.ts +12 -0
- package/telegram-plugin/permission-rule.ts +32 -5
- package/telegram-plugin/permission-title.ts +152 -9
- package/telegram-plugin/quota-check.ts +13 -0
- package/telegram-plugin/quota-watch.ts +135 -7
- package/telegram-plugin/registry/turns-schema.test.ts +24 -0
- package/telegram-plugin/registry/turns-schema.ts +9 -0
- package/telegram-plugin/runtime-metrics.ts +13 -0
- package/telegram-plugin/session-tail.ts +96 -11
- package/telegram-plugin/silence-poke.ts +170 -24
- package/telegram-plugin/slot-banner-driver.ts +3 -0
- package/telegram-plugin/status-no-truncate.ts +44 -0
- package/telegram-plugin/status-reactions.ts +20 -3
- package/telegram-plugin/stream-controller.ts +4 -23
- package/telegram-plugin/stream-reply-handler.ts +6 -24
- package/telegram-plugin/streaming-metrics.ts +91 -0
- package/telegram-plugin/subagent-watcher.ts +212 -66
- package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
- package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
- package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
- package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
- package/telegram-plugin/tests/answer-stream.test.ts +2 -411
- package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
- package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
- package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
- package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
- package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
- package/telegram-plugin/tests/demo-mask.test.ts +127 -0
- package/telegram-plugin/tests/draft-stream.test.ts +0 -827
- package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
- package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
- package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
- package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
- package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
- package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
- package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
- package/telegram-plugin/tests/feed-survival.test.ts +526 -0
- package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
- package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
- package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
- package/telegram-plugin/tests/history.test.ts +60 -0
- package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
- package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
- package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
- package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
- package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
- package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
- package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
- package/telegram-plugin/tests/permission-rule.test.ts +17 -0
- package/telegram-plugin/tests/permission-title.test.ts +206 -17
- package/telegram-plugin/tests/quota-watch.test.ts +252 -9
- package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
- package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
- package/telegram-plugin/tests/represent-guard.test.ts +162 -0
- package/telegram-plugin/tests/session-tail.test.ts +147 -3
- package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
- package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
- package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
- package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
- package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
- package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
- package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
- package/telegram-plugin/tests/telegram-format.test.ts +101 -6
- package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
- package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
- package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
- package/telegram-plugin/tests/tool-labels.test.ts +67 -0
- package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
- package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
- package/telegram-plugin/tests/welcome-text.test.ts +32 -3
- package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
- package/telegram-plugin/tool-activity-summary.ts +375 -58
- package/telegram-plugin/turn-liveness-floor.ts +240 -0
- package/telegram-plugin/uat/assertions.ts +115 -0
- package/telegram-plugin/uat/driver.ts +68 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
- package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
- package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
- package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
- package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
- package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
- package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
- package/telegram-plugin/welcome-text.ts +13 -1
- package/telegram-plugin/worker-activity-feed.ts +157 -82
- package/telegram-plugin/draft-transport.ts +0 -122
- package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
- package/telegram-plugin/tests/draft-transport.test.ts +0 -211
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Foreground activity-feed visibility across a between-tools silent-thinking gap.
|
|
3
|
+
*
|
|
4
|
+
* ## Cause class — bug #680, the vector the defer does NOT cover
|
|
5
|
+
*
|
|
6
|
+
* The gateway maintains a live activity-feed Telegram message while a foreground
|
|
7
|
+
* turn is in progress. Its silence-fallback timer (`SILENCE_FALLBACK_MS`, shrunk
|
|
8
|
+
* to 20 000 ms on the test-harness for these scenarios) fires when no liveness
|
|
9
|
+
* signal arrives for longer than the threshold. On fire, `currentTurn` is nulled:
|
|
10
|
+
* the gateway stops sending activity-feed edits and the feed goes dark — even
|
|
11
|
+
* while the agent is still working.
|
|
12
|
+
*
|
|
13
|
+
* The fleet runs `SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS=1` (prod default). That
|
|
14
|
+
* defer HOLDS the fallback while a tool is still in-flight — so a long in-flight
|
|
15
|
+
* tool (e.g. `sleep 35`) does NOT trigger the fallback; the heartbeat keeps the
|
|
16
|
+
* feed lit. The sibling scenario `jtbd-foreground-feed-visibility-dm.test.ts`
|
|
17
|
+
* pins that deterministic, defer-protected case.
|
|
18
|
+
*
|
|
19
|
+
* What the defer does NOT protect is a long SILENT MODEL-THINKING GAP between
|
|
20
|
+
* two tool calls, where:
|
|
21
|
+
* - the prior tool has returned (no tool in-flight),
|
|
22
|
+
* - no answer-stream draft is emitting, and
|
|
23
|
+
* - no new tool-label has rendered yet.
|
|
24
|
+
*
|
|
25
|
+
* In that window the silence clock runs UNPROTECTED. If the model thinks silently
|
|
26
|
+
* for longer than 20 s before issuing the next tool call or starting its answer,
|
|
27
|
+
* the fallback fires, `currentTurn` is nulled, and the feed goes dark — while the
|
|
28
|
+
* agent is still working on the turn. This is the exact #680 regression vector.
|
|
29
|
+
*
|
|
30
|
+
* ## Why this is BEST-EFFORT and non-deterministic
|
|
31
|
+
*
|
|
32
|
+
* A gap long enough to trigger the 20 s fallback CANNOT be forced reliably. The
|
|
33
|
+
* model may:
|
|
34
|
+
* (a) Stream a partial answer-draft, which resets the silence clock — the gap
|
|
35
|
+
* never opens 20 s unprotected regardless of model latency.
|
|
36
|
+
* (b) Think quickly and issue its answer before 20 s elapses.
|
|
37
|
+
* (c) Think slowly and silently for > 20 s, opening the exact vector.
|
|
38
|
+
*
|
|
39
|
+
* Only case (c) lets us observe the bug. Cases (a) and (b) are inconclusive —
|
|
40
|
+
* a lit feed in either case does NOT prove the bug is absent; it just means the
|
|
41
|
+
* trigger condition didn't materialize this run.
|
|
42
|
+
*
|
|
43
|
+
* The workload prompt is engineered to maximize the probability of case (c):
|
|
44
|
+
* 1. One fast tool first (`date` via Bash) so the feed OPENS.
|
|
45
|
+
* 2. An explicit instruction to think silently for at least 30 s without further
|
|
46
|
+
* tools and without sending anything.
|
|
47
|
+
* 3. A genuinely hard open-ended reasoning question (multi-factor design
|
|
48
|
+
* estimation) that rewards sustained internal deliberation.
|
|
49
|
+
*
|
|
50
|
+
* This structural bias raises the chance the model hits > 20 s of unprotected
|
|
51
|
+
* silence, but cannot guarantee it on every run.
|
|
52
|
+
*
|
|
53
|
+
* ## Asymmetric assertion logic
|
|
54
|
+
*
|
|
55
|
+
* The scenario uses five distinct code paths, exactly one of which fires per run:
|
|
56
|
+
*
|
|
57
|
+
* Branch 1 — FEED NEVER OPENED: no activity-feed message appeared after the
|
|
58
|
+
* first tool call. Cannot observe the #680 vector at all. `console.warn` +
|
|
59
|
+
* return INCONCLUSIVE (no assertion failure).
|
|
60
|
+
*
|
|
61
|
+
* Branch 2 — TURN ENDED BEFORE THE MARK: the final answer arrived BEFORE the
|
|
62
|
+
* fallback-window mark was reached. The dark window cannot have been caused
|
|
63
|
+
* by a mid-think-gap null because the turn was already complete. This is NOT
|
|
64
|
+
* the bug; treat as pass/INCONCLUSIVE. `console.warn` + return.
|
|
65
|
+
*
|
|
66
|
+
* Branch 3 — FEED WENT DARK AND TURN WAS STILL IN-PROGRESS AT THE MARK:
|
|
67
|
+
* the feed existed before the mark; after the mark there were no further feed
|
|
68
|
+
* edits; and the final answer had NOT yet arrived at the mark (but does arrive
|
|
69
|
+
* afterward). This is #680 reproduced. HARD FAIL with a precise message naming
|
|
70
|
+
* the dark window and that `currentTurn` was nulled mid-think-gap.
|
|
71
|
+
*
|
|
72
|
+
* Branch 4 — FEED STAYED LIT PAST THE MARK: the feed kept receiving edits after
|
|
73
|
+
* the mark. Ambiguous — either the fix held, OR the model didn't produce a
|
|
74
|
+
* > 20 s silent gap (cases (a)/(b) above). Cannot distinguish without deeper
|
|
75
|
+
* gateway instrumentation. `console.warn` explaining both possibilities and
|
|
76
|
+
* that a lit feed is NOT proof of fix here — that's the sibling guard's job.
|
|
77
|
+
* Warn INCONCLUSIVE (no assertion failure for the feed itself), then continue to the final-answer drain, where Branch 5 can still fail.
|
|
78
|
+
*
|
|
79
|
+
* Branch 5 — TURN WEDGED (final answer never arrives): distinct from #680's
|
|
80
|
+
* trigger; could be a compound regression or a model hang. HARD FAIL.
|
|
81
|
+
*
|
|
82
|
+
* ## Tolerances and timing
|
|
83
|
+
*
|
|
84
|
+
* - FALLBACK_WINDOW_MS (25 000): wait time from the moment the feed FIRST
|
|
85
|
+
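* appears until the mark, after which the post-mark feed snapshot is fetched (the "before-mark" baseline is the feed text captured when the feed opens). Chosen to be safely above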
|
|
86
|
+
* the shrunk 20 s fallback while leaving room in the budget.
|
|
87
|
+
* - POST_MARK_FEED_WAIT_MS (12 000): how long to drain the live stream after the
|
|
88
|
+
* mark for further feed edits. Short but enough for one heartbeat cycle.
|
|
89
|
+
* - ANSWER_BUDGET_MS (90 000): how long to wait for the final answer after the
|
|
90
|
+
* mark. Generous because the model may still be mid-think at the mark.
|
|
91
|
+
* - OVERALL_BUDGET_MS (150 000): total test timeout including settle, turn
|
|
92
|
+
* onset, the window, and final-answer drain.
|
|
93
|
+
*
|
|
94
|
+
* ## Env precondition (operator must set on test-harness agent)
|
|
95
|
+
*
|
|
96
|
+
* SWITCHROOM_SILENCE_FALLBACK_MS=20000 (on the agent's env: block)
|
|
97
|
+
* SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS=1 (prod default — defer must be ON
|
|
98
|
+
* so we're isolating the between-tools vector, not the in-flight-tool one)
|
|
99
|
+
*
|
|
100
|
+
* Mirror the fallback value into the UAT env:
|
|
101
|
+
* SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000 (in repo-root .env)
|
|
102
|
+
*
|
|
103
|
+
* The scenario reads `SWITCHROOM_UAT_SILENCE_FALLBACK_MS` at runtime to detect
|
|
104
|
+
* whether the precondition is satisfied. If absent or > 30 000, it skips with a
|
|
105
|
+
* clear warning rather than producing a vacuous green or a misleading timeout.
|
|
106
|
+
*
|
|
107
|
+
* ## Cross-reference
|
|
108
|
+
*
|
|
109
|
+
* This is the best-effort, non-deterministic HALF of the feed-visibility pair.
|
|
110
|
+
* The deterministic half (long in-flight tool, defer-protected) is:
|
|
111
|
+
* `jtbd-foreground-feed-visibility-dm.test.ts`
|
|
112
|
+
* Together the two cover the feed-visibility invariant: deterministically (there)
|
|
113
|
+
* and best-effort for the true #680 between-tools vector (here).
|
|
114
|
+
*/
|
|
115
|
+
|
|
116
|
+
import { describe, it } from "vitest";
|
|
117
|
+
import { spinUp } from "../harness.js";
|
|
118
|
+
import { isActivityFeedMessage } from "../assertions.js";
|
|
119
|
+
import type { ObservedMessage } from "../driver.js";
|
|
120
|
+
|
|
121
|
+
/**
 * Strictly parses a millisecond setting read from the environment.
 *
 * Only a plain run of decimal digits is accepted (surrounding whitespace is
 * ignored). Every other input — unset, empty, "20s", "2e4", "20_000", a
 * negative number — yields NaN. `Number.parseInt` alone would silently
 * truncate such values (e.g. "20s" → 20), letting a mistyped setting pass
 * the precondition guard instead of triggering the documented skip.
 *
 * @param raw - Raw environment value, or `undefined` when the variable is unset.
 * @returns The parsed non-negative integer, or NaN when the value is malformed.
 */
function parseFallbackMs(raw: string | undefined): number {
  const trimmed = (raw ?? "").trim();
  return /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN;
}

/**
 * UAT-env mirror of the shrunk silence fallback configured on the
 * test-harness agent. The scenario reads this mirror instead of the agent's
 * own process env (which it cannot reach), so the operator has to keep both
 * knobs in sync.
 *
 * Set `SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000` in repo-root .env.
 *
 * NaN when the variable is unset or malformed; the precondition guard's
 * `Number.isFinite` check then skips the scenario.
 */
const PRECONDITION_FALLBACK_MS = parseFallbackMs(
  process.env.SWITCHROOM_UAT_SILENCE_FALLBACK_MS,
);
|
|
132
|
+
|
|
133
|
+
/**
 * Length (ms) of the fallback window, measured from the first sighting of
 * the activity-feed message. The feed text captured at that first sighting
 * is the "before-mark" baseline; the post-mark snapshot is fetched once this
 * window has elapsed. Kept above the shrunk 20 s silence fallback so a
 * regression would already have fired by the time the mark is reached.
 */
const FALLBACK_WINDOW_MS = 25 * 1_000;

/**
 * How long (ms) to keep draining the live stream after the mark while
 * looking for further edits to the feed message. Sized by the scenario
 * author as one full heartbeat cycle (quoted as typically 5–8 s) plus slack.
 */
const POST_MARK_FEED_WAIT_MS = 12 * 1_000;

/**
 * How long (ms) to wait, after the mark, for the final answer to land.
 * Deliberately generous: the model may still be mid-think when the mark
 * is reached.
 */
const ANSWER_BUDGET_MS = 90 * 1_000;

/**
 * Minimum trimmed length (characters) for a reply to count as the final
 * answer. Shorter replies (brief acks, one-liners) are not treated as the
 * deliverable.
 */
const MIN_ANSWER_CHARS = 150;
|
|
157
|
+
|
|
158
|
+
/**
 * Total test timeout (ms), built as the sum of the phases the scenario
 * walks through plus headroom. The budget is generous because model latency
 * during the silent thinking gap is the whole point.
 *
 * The previous flat value of 150 000 was smaller than the sum of its own
 * documented components (8 + 30 + 25 + 12 + 90 s = 165 s), so the runner's
 * timeout could pre-empt the scenario's own, more precise, failure messages.
 * The file-header "Tolerances and timing" list still quotes the earlier
 * 150 000 figure; this definition is authoritative.
 *
 * The per-phase literals mirror FALLBACK_WINDOW_MS, POST_MARK_FEED_WAIT_MS
 * and ANSWER_BUDGET_MS — keep them in sync when those change.
 *
 * The feed-open phase is budgeted at its typical ~30 s; the scenario's hard
 * feed-open deadline is 90 s, so a feed that opens very late can still run
 * into this timeout.
 */
const OVERALL_BUDGET_MS =
  8_000 + // settle
  30_000 + // turn onset + wait for the feed to open (typical)
  25_000 + // fallback window (FALLBACK_WINDOW_MS)
  12_000 + // post-mark feed drain (POST_MARK_FEED_WAIT_MS)
  90_000 + // final-answer budget (ANSWER_BUDGET_MS)
  15_000; // headroom
|
|
165
|
+
|
|
166
|
+
/**
 * Prompt sent to the agent. It is written to make a long, silent thinking
 * gap between the first tool's return and the final answer as likely as
 * possible:
 *
 *  1. Run `date` through Bash first, so a tool call happens and the
 *     activity feed has a reason to open.
 *  2. Think silently for at least 30 seconds about a hard, multi-factor
 *     ingestion-architecture question, with further tools, interim updates
 *     and partial streaming all explicitly forbidden.
 *  3. Reply exactly once with the complete proposal.
 *
 * The requested 30 s exceeds the shrunk 20 s fallback, so a model that
 * obeys the instruction crosses the threshold. The scenario's branch logic
 * covers the runs where it does not.
 */
const THINK_GAP_WORKLOAD_PROMPT = [
  "First, run the `date` command via the Bash tool. ",
  "Then — WITHOUT using any more tools and WITHOUT sending me anything yet — ",
  "think carefully and silently for at least 30 seconds about the following design ",
  "question: Given a distributed event-driven system where 50 microservices emit ",
  "telemetry at varying rates (10 to 10 000 events/s per service), propose a ",
  "tiered ingestion architecture that keeps end-to-end p99 latency under 200 ms, ",
  "handles a 10x burst without data loss, and costs less than $2 000/month at ",
  "steady state on a major cloud provider. Consider storage, compute, and egress. ",
  "Do NOT stream partial thinking, do NOT send me interim updates, and do NOT call ",
  "any further tools. Only after you have thought it through completely, reply once ",
  "with your full architecture proposal.",
].join("");
|
|
197
|
+
|
|
198
|
+
describe("uat: foreground activity-feed visibility across between-tools silent-thinking gap (#680)", () => {
|
|
199
|
+
it(
|
|
200
|
+
"BEST-EFFORT: feed does not go dark during a long silent model-think gap between tools",
|
|
201
|
+
async () => {
|
|
202
|
+
// ── Precondition guard ─────────────────────────────────────────────────
|
|
203
|
+
// If the operator has not shrunk the silence fallback to ≤ 30 000 ms on
|
|
204
|
+
// the test-harness agent (and mirrored it into the UAT env), the timing
|
|
205
|
+
// assertions are vacuous — the default 300 s fallback far exceeds the
|
|
206
|
+
// test budget and the scenario exits before it could detect a regression.
|
|
207
|
+
// Skip with a clear warning rather than silently producing a false green.
|
|
208
|
+
if (!Number.isFinite(PRECONDITION_FALLBACK_MS) || PRECONDITION_FALLBACK_MS > 30_000) {
|
|
209
|
+
console.warn(
|
|
210
|
+
"[uat/thinkgap-feed] SKIPPED — precondition not met.\n" +
|
|
211
|
+
" This scenario requires SWITCHROOM_SILENCE_FALLBACK_MS=20000 set on\n" +
|
|
212
|
+
" the test-harness agent AND SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000\n" +
|
|
213
|
+
" in the repo-root .env. Without it the silence fallback does not fire\n" +
|
|
214
|
+
" within the test window and the #680 between-tools vector cannot be\n" +
|
|
215
|
+
" exercised. See the header doc comment for setup instructions.",
|
|
216
|
+
);
|
|
217
|
+
return;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
221
|
+
try {
|
|
222
|
+
// Start observing BEFORE sending so no messages (including the first
|
|
223
|
+
// activity-feed paint) are missed by the live stream.
|
|
224
|
+
const iter = sc.driver
|
|
225
|
+
.observeMessages(sc.botUserId)
|
|
226
|
+
[Symbol.asyncIterator]();
|
|
227
|
+
|
|
228
|
+
await sc.sendDM(THINK_GAP_WORKLOAD_PROMPT);
|
|
229
|
+
|
|
230
|
+
console.log("[thinkgap-feed] prompt sent; watching for activity-feed message…");
|
|
231
|
+
|
|
232
|
+
// ── Step 1: wait for the feed to OPEN ─────────────────────────────────
|
|
233
|
+
// The feed should open when the first tool label renders (the `date` Bash
|
|
234
|
+
// call). Give a generous budget to account for cold-start latency.
|
|
235
|
+
let feedMsg: ObservedMessage | null = null;
|
|
236
|
+
const feedDeadline = Date.now() + 90_000;
|
|
237
|
+
|
|
238
|
+
while (Date.now() < feedDeadline) {
|
|
239
|
+
const remaining = feedDeadline - Date.now();
|
|
240
|
+
const next = await Promise.race([
|
|
241
|
+
iter.next(),
|
|
242
|
+
new Promise<{ done: true; value: undefined }>((r) =>
|
|
243
|
+
setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
|
|
244
|
+
),
|
|
245
|
+
]);
|
|
246
|
+
if (next.done || next.value == null) break;
|
|
247
|
+
const m = next.value as ObservedMessage;
|
|
248
|
+
if (m.senderUserId === sc.driverUserId) continue; // skip our own echo
|
|
249
|
+
if (isActivityFeedMessage(m)) {
|
|
250
|
+
feedMsg = m;
|
|
251
|
+
console.log(
|
|
252
|
+
`[thinkgap-feed] feed opened (id=${m.messageId}): ` +
|
|
253
|
+
JSON.stringify(m.text.slice(0, 120)),
|
|
254
|
+
);
|
|
255
|
+
break;
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
// ── Branch 1: FEED NEVER OPENED ─────────────────────────────────────
|
|
260
|
+
// Cannot observe the #680 vector. Inconclusive — do not fail, but warn
|
|
261
|
+
// loudly so the operator knows the test was unable to exercise the path.
|
|
262
|
+
if (feedMsg === null) {
|
|
263
|
+
console.warn(
|
|
264
|
+
"[thinkgap-feed] INCONCLUSIVE — the foreground activity-feed message\n" +
|
|
265
|
+
" never appeared after the first tool call. The #680 between-tools\n" +
|
|
266
|
+
" vector cannot be observed without an open feed.\n" +
|
|
267
|
+
" Possible causes: the agent did not use tools at all, the initial\n" +
|
|
268
|
+
" `date` call was too fast for a feed paint, or drainActivitySummary\n" +
|
|
269
|
+
" failed before the message was sent. Not treated as a test failure\n" +
|
|
270
|
+
" because the absence of the feed is a distinct (pre-vector) issue.",
|
|
271
|
+
);
|
|
272
|
+
await iter.return?.();
|
|
273
|
+
return;
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
const feedId = feedMsg.messageId;
|
|
277
|
+
const beforeMarkText = feedMsg.text;
|
|
278
|
+
|
|
279
|
+
// ── Concurrently drain for two events while the mark elapses ──────────
|
|
280
|
+
//
|
|
281
|
+
// While we wait FALLBACK_WINDOW_MS for the mark to pass, collect:
|
|
282
|
+
// (a) any further feed EDITS (proves the feed stayed live),
|
|
283
|
+
// (b) the FINAL ANSWER (proves the turn ended before the mark —
|
|
284
|
+
// Branch 2, which is NOT the bug).
|
|
285
|
+
//
|
|
286
|
+
// Both are checked after the wait.
|
|
287
|
+
let sawFeedEditDuringWait = false;
|
|
288
|
+
let finalAnswerBeforeMark: ObservedMessage | null = null;
|
|
289
|
+
|
|
290
|
+
const markAt = Date.now() + FALLBACK_WINDOW_MS;
|
|
291
|
+
console.log(
|
|
292
|
+
`[thinkgap-feed] waiting ${FALLBACK_WINDOW_MS}ms for fallback window to elapse…`,
|
|
293
|
+
);
|
|
294
|
+
|
|
295
|
+
// Drain the live stream until the mark elapses.
|
|
296
|
+
while (Date.now() < markAt) {
|
|
297
|
+
const remaining = markAt - Date.now();
|
|
298
|
+
const next = await Promise.race([
|
|
299
|
+
iter.next(),
|
|
300
|
+
new Promise<{ done: true; value: undefined }>((r) =>
|
|
301
|
+
setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
|
|
302
|
+
),
|
|
303
|
+
]);
|
|
304
|
+
if (next.done || next.value == null) break;
|
|
305
|
+
const m = next.value as ObservedMessage;
|
|
306
|
+
if (m.senderUserId === sc.driverUserId) continue;
|
|
307
|
+
|
|
308
|
+
// A feed EDIT arriving during the wait means the feed is still live.
|
|
309
|
+
if (m.edited && m.messageId === feedId) {
|
|
310
|
+
sawFeedEditDuringWait = true;
|
|
311
|
+
console.log(
|
|
312
|
+
`[thinkgap-feed] feed edit arrived during wait (id=${feedId}): ` +
|
|
313
|
+
JSON.stringify(m.text.slice(0, 120)),
|
|
314
|
+
);
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
// A substantive non-feed, non-edit message is the final answer.
|
|
318
|
+
if (
|
|
319
|
+
!m.edited &&
|
|
320
|
+
!isActivityFeedMessage(m) &&
|
|
321
|
+
m.text.trim().length >= MIN_ANSWER_CHARS &&
|
|
322
|
+
finalAnswerBeforeMark === null
|
|
323
|
+
) {
|
|
324
|
+
finalAnswerBeforeMark = m;
|
|
325
|
+
console.log(
|
|
326
|
+
`[thinkgap-feed] final answer arrived BEFORE the mark ` +
|
|
327
|
+
`(id=${m.messageId}): ` +
|
|
328
|
+
JSON.stringify(m.text.slice(0, 120)),
|
|
329
|
+
);
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
// ── Branch 2: TURN ENDED BEFORE THE MARK ───────────────────────────
|
|
334
|
+
// The final answer landed before the FALLBACK_WINDOW_MS mark. The turn
|
|
335
|
+
// was complete before the clock could have fired — any feed darkness
|
|
336
|
+
// observed after this point would just be normal post-turn cleanup.
|
|
337
|
+
// This is NOT the bug; treat as inconclusive/pass.
|
|
338
|
+
if (finalAnswerBeforeMark !== null) {
|
|
339
|
+
console.warn(
|
|
340
|
+
"[thinkgap-feed] INCONCLUSIVE — the final answer arrived BEFORE the\n" +
|
|
341
|
+
` ${FALLBACK_WINDOW_MS}ms fallback-window mark. The turn completed\n` +
|
|
342
|
+
" before the silence clock could have fired, so any feed darkness after\n" +
|
|
343
|
+
" this point reflects normal post-turn teardown, not the #680 regression.\n" +
|
|
344
|
+
" The model did not produce a long enough silent think-gap to trigger\n" +
|
|
345
|
+
" the fallback mid-turn. Not a failure — run again or increase the\n" +
|
|
346
|
+
" prompt complexity to maximize the silent gap.",
|
|
347
|
+
);
|
|
348
|
+
await iter.return?.();
|
|
349
|
+
return;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
// The final answer had NOT yet arrived by the mark. Now check whether
|
|
353
|
+
// the feed got further edits AFTER the mark.
|
|
354
|
+
|
|
355
|
+
// ── Poll for post-mark feed edits ────────────────────────────────────
|
|
356
|
+
// Fetch a fresh snapshot of the feed message (to check whether its body
|
|
357
|
+
// changed vs beforeMarkText) and drain POST_MARK_FEED_WAIT_MS of live
|
|
358
|
+
// stream for edit events on the feed message.
|
|
359
|
+
const afterMarkSnapshot = await sc.driver.getMessage(sc.botUserId, feedId);
|
|
360
|
+
const bodyChangedAfterMark =
|
|
361
|
+
afterMarkSnapshot !== null && afterMarkSnapshot.text !== beforeMarkText;
|
|
362
|
+
|
|
363
|
+
let sawFeedEditAfterMark = bodyChangedAfterMark;
|
|
364
|
+
const postMarkDeadline = Date.now() + POST_MARK_FEED_WAIT_MS;
|
|
365
|
+
|
|
366
|
+
while (!sawFeedEditAfterMark && Date.now() < postMarkDeadline) {
|
|
367
|
+
const remaining = postMarkDeadline - Date.now();
|
|
368
|
+
const next = await Promise.race([
|
|
369
|
+
iter.next(),
|
|
370
|
+
new Promise<{ done: true; value: undefined }>((r) =>
|
|
371
|
+
setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
|
|
372
|
+
),
|
|
373
|
+
]);
|
|
374
|
+
if (next.done || next.value == null) break;
|
|
375
|
+
const m = next.value as ObservedMessage;
|
|
376
|
+
if (m.senderUserId === sc.driverUserId) continue;
|
|
377
|
+
// An edit of the feed message that arrived after the mark.
|
|
378
|
+
if (m.edited && m.messageId === feedId) {
|
|
379
|
+
sawFeedEditAfterMark = true;
|
|
380
|
+
console.log(
|
|
381
|
+
`[thinkgap-feed] feed edit confirmed after mark (id=${feedId}): ` +
|
|
382
|
+
JSON.stringify(m.text.slice(0, 120)),
|
|
383
|
+
);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
// ── Branch 4: FEED STAYED LIT ────────────────────────────────────────
|
|
388
|
+
// Either the fix held and the defer (or some other mechanism) prevented
|
|
389
|
+
// the fallback from firing, OR the model never produced a > 20 s silent
|
|
390
|
+
// gap (cases (a)/(b) in the header). We cannot distinguish these without
|
|
391
|
+
// deeper gateway instrumentation. A lit feed is NOT proof the bug is
|
|
392
|
+
// fixed — that proof lives in the sibling deterministic scenario. Warn
|
|
393
|
+
// and return without failing so we don't give a false green signal.
|
|
394
|
+
if (sawFeedEditDuringWait || sawFeedEditAfterMark) {
|
|
395
|
+
console.warn(
|
|
396
|
+
"[thinkgap-feed] INCONCLUSIVE — the feed kept receiving edits during\n" +
|
|
397
|
+
" and/or after the fallback-window mark. Two possible explanations:\n" +
|
|
398
|
+
" (A) The #680 bug is absent (fix holds) — the gateway did not null\n" +
|
|
399
|
+
" currentTurn despite the silent gap.\n" +
|
|
400
|
+
" (B) The model streamed a partial answer-draft or issued the next tool\n" +
|
|
401
|
+
" before 20 s elapsed, resetting the silence clock — the trigger\n" +
|
|
402
|
+
" condition never materialized.\n" +
|
|
403
|
+
" A lit feed is NOT proof-of-fix here; the deterministic guard for\n" +
|
|
404
|
+
" that is jtbd-foreground-feed-visibility-dm.test.ts (in-flight tool\n" +
|
|
405
|
+
" + defer). Treat as best-effort pass.",
|
|
406
|
+
);
|
|
407
|
+
// Fall through to the final-answer drain below so we confirm
|
|
408
|
+
// the turn eventually completed — don't return yet.
|
|
409
|
+
} else {
|
|
410
|
+
// Feed went dark AND the turn was still in progress at the mark.
|
|
411
|
+
// This is the #680 regression vector reproduced.
|
|
412
|
+
|
|
413
|
+
// ── Branch 3: FEED WENT DARK, TURN STILL IN PROGRESS ───────────────
|
|
414
|
+
// Hard fail with a precise description of what happened.
|
|
415
|
+
const darkWindowMs = FALLBACK_WINDOW_MS + POST_MARK_FEED_WAIT_MS;
|
|
416
|
+
throw new Error(
|
|
417
|
+
`[thinkgap-feed] FAIL — bug #680 (between-tools silent-think-gap) reproduced.\n` +
|
|
418
|
+
` The activity-feed message (id=${feedId}) was present before the\n` +
|
|
419
|
+
` ${FALLBACK_WINDOW_MS}ms mark and received no further edits in the\n` +
|
|
420
|
+
` ${POST_MARK_FEED_WAIT_MS}ms window after the mark (total dark window\n` +
|
|
421
|
+
` ≥ ${darkWindowMs}ms), while the final answer had NOT yet arrived.\n` +
|
|
422
|
+
` This means currentTurn was nulled mid-think-gap by the silence-\n` +
|
|
423
|
+
` fallback handler (SWITCHROOM_SILENCE_FALLBACK_MS=${PRECONDITION_FALLBACK_MS}ms)\n` +
|
|
424
|
+
` after a silent model-thinking gap between the first tool's return\n` +
|
|
425
|
+
` and the agent's reply, with no tool in-flight and no answer-stream\n` +
|
|
426
|
+
` draft to reset the clock. The defer (SWITCHROOM_SILENCE_DEFER_\n` +
|
|
427
|
+
` INFLIGHT_TOOLS=1) does not cover this vector.\n` +
|
|
428
|
+
` Feed body at dark: ${JSON.stringify(beforeMarkText.slice(0, 100))}`,
|
|
429
|
+
);
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
// ── Branch 5: TURN WEDGED (final answer never arrives) ───────────────
|
|
433
|
+
// Drain for the full answer budget. If the answer lands we're done
|
|
434
|
+
// (success / inconclusive-pass). If it never lands, hard fail.
|
|
435
|
+
let finalAnswer: ObservedMessage | null = finalAnswerBeforeMark;
|
|
436
|
+
const answerDeadline = Date.now() + ANSWER_BUDGET_MS;
|
|
437
|
+
|
|
438
|
+
while (finalAnswer === null && Date.now() < answerDeadline) {
|
|
439
|
+
const remaining = answerDeadline - Date.now();
|
|
440
|
+
const next = await Promise.race([
|
|
441
|
+
iter.next(),
|
|
442
|
+
new Promise<{ done: true; value: undefined }>((r) =>
|
|
443
|
+
setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
|
|
444
|
+
),
|
|
445
|
+
]);
|
|
446
|
+
if (next.done || next.value == null) break;
|
|
447
|
+
const m = next.value as ObservedMessage;
|
|
448
|
+
if (m.senderUserId === sc.driverUserId) continue;
|
|
449
|
+
if (m.edited) continue; // edits are feed updates, not the final answer
|
|
450
|
+
if (isActivityFeedMessage(m)) continue;
|
|
451
|
+
if (m.text.trim().length >= MIN_ANSWER_CHARS) {
|
|
452
|
+
finalAnswer = m;
|
|
453
|
+
console.log(
|
|
454
|
+
`[thinkgap-feed] final answer received (id=${m.messageId}): ` +
|
|
455
|
+
JSON.stringify(m.text.slice(0, 180)),
|
|
456
|
+
);
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
if (finalAnswer === null) {
|
|
461
|
+
throw new Error(
|
|
462
|
+
`[thinkgap-feed] FAIL — the turn never produced a substantive reply\n` +
|
|
463
|
+
` (≥${MIN_ANSWER_CHARS} chars) within the answer budget (${ANSWER_BUDGET_MS}ms).\n` +
|
|
464
|
+
` The turn appears to have wedged. This may be a compound regression\n` +
|
|
465
|
+
` where the silence-fallback nulled currentTurn and also suppressed\n` +
|
|
466
|
+
` the answer path, or the model is hung waiting on something. Check\n` +
|
|
467
|
+
` the test-harness gateway log for error or timeout signals.`,
|
|
468
|
+
);
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
await iter.return?.();
|
|
472
|
+
} finally {
|
|
473
|
+
await sc.tearDown();
|
|
474
|
+
}
|
|
475
|
+
},
|
|
476
|
+
OVERALL_BUDGET_MS,
|
|
477
|
+
);
|
|
478
|
+
});
|