switchroom 0.13.12 → 0.13.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
+ /**
2
+ * Cross-turn pending-async progress — UAT regression gate for #1445.
3
+ *
4
+ * Verifies the post-fix behaviour shipped in `pending-work-progress.ts`:
5
+ * when a turn ends with the model having dispatched async background
6
+ * work (here `Bash` with `run_in_background:true`) and the model has
7
+ * stopped speaking, the framework keeps editing the model's last reply
8
+ * *in place* at ~60s intervals so the user sees ambient liveness during
9
+ * the wait.
10
+ *
11
+ * ## Pre-fix behaviour (what the user complained about)
12
+ *
13
+ * 1. User sends a long-task prompt at t=0.
14
+ * 2. Model runs the bash command with `run_in_background:true` and
15
+ * sends one PING reply at ~+20s ("Background sleep running…").
16
+ * 3. Turn ends.
17
+ * 4. Silence-poke ladder is per-turn — it stops the moment endTurn()
18
+ * fires. There is no cross-turn ambient surface.
19
+ * 5. The user sees NOTHING for ~5 min until the framework's 300s
20
+ * silence-poke fallback fires (or — as observed in the UAT that
21
+ * drove the fix — does not fire at all, because the turn already
22
+ * ended). Production data confirms: silence-poke succeeded/fired
23
+ * rate is 0–7% across hundreds of fires.
24
+ *
25
+ * ## Post-fix behaviour (this scenario asserts)
26
+ *
27
+ * 1. Model sends one fresh reply at ~+20s — the anchor.
28
+ * 2. Turn ends.
29
+ * 3. Framework edits the anchor in place at ~+80s, ~+140s, ~+200s,
30
+ * ~+260s, ~+320s with the suffix `\n\n— still working (Nm)`.
31
+ * 4. All edits are SILENT (`disable_notification: true` on the edit
32
+ * or, equivalently, an edit which never pushes a notification).
33
+ * The user sees ambient liveness without any added pings.
34
+ * 5. Sleep completes ~+350s; the model wakes (`BashOutput` /
35
+ * background-task notification path), turn re-starts,
36
+ * `clearPending` fires — no further edits.
37
+ *
38
+ * ## What this scenario asserts
39
+ *
40
+ * 1. At least one FRESH bot message lands (the initial anchor).
41
+ * 2. At least one EDIT to the anchor lands AFTER the initial reply,
42
+ * whose text contains the framework suffix `— still working (\d+m)`.
43
+ * 3. Every observed EDIT message is `silent === true` (no push
44
+ * notification fired by the in-place edit).
45
+ * 4. Edits are anchored to the SAME `messageId` as the initial
46
+ * fresh reply (single in-place surface, not spammy new sends).
47
+ *
48
+ * The full per-message trail is dumped to console for forensic
49
+ * inspection regardless of pass / fail.
50
+ *
51
+ * Wall-clock budget: ~8 min typical; hard cap ~11 min (OVERALL_DEADLINE_MS plus the 60s test-timeout margin).
52
+ */
53
+
54
+ import { describe, expect, it } from "vitest";
55
+ import { spinUp } from "../harness.js";
56
+ import type { ObservedMessage } from "../driver.js";
57
+
58
/** Duration, in seconds, of the background `sleep` the model is asked to run. */
const SLEEP_SECONDS = 350;

/**
 * The DM sent to the agent. It asks for a single background Bash call that
 * sleeps for SLEEP_SECONDS, followed by a final reply of exactly "done".
 */
const PROMPT = [
  "This is an instrumented stress test of cross-turn pending-async ",
  "progress. Please run exactly this command via the Bash tool, and ",
  "ONLY this command, as a SINGLE call with run_in_background=true ",
  "(do not break it up, do not send any further reply until it ",
  "completes):\n\n",
  "```bash\n",
  `sleep ${SLEEP_SECONDS}\n`,
  "```\n\n",
  "After the bash command returns, send exactly the single word ",
  `"done" as your final reply.`,
].join("");

/** Upper bound on the observation loop: the sleep duration plus 240s, in milliseconds. */
const OVERALL_DEADLINE_MS = (SLEEP_SECONDS + 240) * 1_000;
73
+
74
/** One bot message (fresh send or in-place edit) recorded by the observation loop. */
interface TrailEntry {
  /** Milliseconds between the run's start timestamp and the moment the message was observed. */
  relMs: number;
  /** "edit" when the observed message was flagged as edited, "fresh" otherwise. */
  kind: "fresh" | "edit";
  /** The `silent` flag copied from the observed message. */
  silent: boolean;
  /** Id of the observed message; edits are expected to reuse the anchor's id. */
  messageId: number;
  /** Text of the message as observed. */
  text: string;
}
81
+
82
/**
 * Matches text that ends with a blank line followed by the progress marker
 * "— still working (Nm)", where N is one or more digits.
 */
const SUFFIX_RE = /\n\n— still working \(\d+m\)$/;
83
+
84
/**
 * Right-pads `s` with spaces to a width of `n` characters.
 *
 * @param s - The string to pad.
 * @param n - The minimum width of the result.
 * @returns `s` unchanged when it is already at least `n` long, otherwise `s`
 *   followed by enough spaces to reach length `n`.
 */
function pad(s: string, n: number): string {
  // padEnd defaults to a space filler and is a no-op for strings already >= n.
  return s.padEnd(n);
}
87
+
88
/**
 * UAT scenario: sends PROMPT to the agent, records every bot message seen
 * until the conversation goes quiet (or the overall deadline passes), dumps
 * the full trail to the console, then asserts the four regression
 * conditions: a fresh anchor exists, at least one suffixed in-place edit
 * landed, every edit was silent, and every edit targeted the anchor.
 */
describe("uat: cross-turn pending-async ambient progress (#1445)", () => {
  it(
    "framework edits the anchor in place during a 350s background bash",
    async () => {
      // Time allowed for the very first bot message after the prompt.
      const ANCHOR_WINDOW_MS = 90_000;
      // Each observed message extends the wait by this much. In-place edits
      // are expected about every 60s, so this window survives one late tick
      // and leaves room for the model's wake + final reply.
      const QUIESCENCE_MS = 120_000;
      // Once the final "done" reply is seen, stop observing shortly after.
      const WIND_DOWN_MS = 10_000;

      // Single source of truth for the trail table's column widths, shared
      // by the header, the separator and every data row.
      const formatRow = (
        rel: string,
        kind: string,
        silent: string,
        msg: string,
        text: string,
      ): string =>
        ` ${pad(rel, 8)} ${pad(kind, 6)} ${pad(silent, 7)} ${pad(msg, 12)} ${text}`;

      const sc = await spinUp({ agent: "test-harness" });
      try {
        const startedAt = Date.now();
        await sc.sendDM(PROMPT);
        console.log(`[cross-turn-pending] t=0 prompt sent`);

        const trail: TrailEntry[] = [];

        let quiescenceDeadline = startedAt + ANCHOR_WINDOW_MS;
        const overallDeadline = startedAt + OVERALL_DEADLINE_MS;
        let firstAnchorMsgId: number | null = null;
        let sawDone = false;

        while (Date.now() < overallDeadline) {
          // Wait no longer than whichever deadline comes first.
          const remaining =
            Math.min(quiescenceDeadline, overallDeadline) - Date.now();
          if (remaining <= 0) break;
          try {
            const msg = await sc.expectMessage(
              (m: ObservedMessage) => m.fromBot,
              { from: "bot", timeout: remaining },
            );
            const rel = Date.now() - startedAt;
            const entry: TrailEntry = {
              relMs: rel,
              kind: msg.edited ? "edit" : "fresh",
              silent: msg.silent,
              messageId: msg.messageId,
              text: msg.text,
            };
            trail.push(entry);
            console.log(
              `[cross-turn-pending] +${(rel / 1000).toFixed(1)}s ` +
                `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
                `silent=${entry.silent} text=${JSON.stringify(
                  entry.text.slice(0, 120).replace(/\n/g, " ⏎ "),
                )}`,
            );
            // The first fresh bot message is the anchor that later edits
            // must target.
            if (firstAnchorMsgId == null && entry.kind === "fresh") {
              firstAnchorMsgId = entry.messageId;
            }
            // A later fresh message containing the word "done" is treated
            // as the model's final reply. The word-boundary regex also
            // covers the exact-match case.
            const looksLikeDone =
              entry.kind === "fresh" &&
              entry.messageId !== firstAnchorMsgId &&
              /\bdone\b/.test(entry.text.trim().toLowerCase());
            if (looksLikeDone) {
              sawDone = true;
              quiescenceDeadline = Date.now() + WIND_DOWN_MS;
            } else {
              quiescenceDeadline = Date.now() + QUIESCENCE_MS;
            }
          } catch (err: unknown) {
            // Normally this is the wait timing out, i.e. quiescence was
            // reached. Log the reason so any other failure is visible in
            // the output instead of being mistaken for a timeout.
            const reason = err instanceof Error ? err.message : String(err);
            console.log(`[cross-turn-pending] observation stopped: ${reason}`);
            break;
          }
        }

        // Dump the full trail before asserting so it is available for
        // inspection whether the test passes or fails.
        console.log(
          "\n========== CROSS-TURN PENDING-PROGRESS TRAIL ==========",
        );
        console.log(`prompt: ${SLEEP_SECONDS}s background bash`);
        console.log(`total bot messages observed: ${trail.length}`);
        console.log(`anchor messageId: ${firstAnchorMsgId}`);
        console.log(`saw_done: ${sawDone}`);
        console.log("");
        console.log(formatRow("rel(s)", "kind", "silent", "msg", "text"));
        console.log(
          formatRow("-------", "-----", "------", "-----------", "----"),
        );
        for (const e of trail) {
          console.log(
            formatRow(
              (e.relMs / 1000).toFixed(1) + "s",
              e.kind,
              String(e.silent),
              String(e.messageId),
              e.text.slice(0, 80).replace(/\n/g, " ⏎ "),
            ),
          );
        }
        console.log(
          "=======================================================\n",
        );

        // ── Regression assertions ─────────────────────────────────

        // (1) At least one fresh anchor reply landed.
        const fresh = trail.filter((e) => e.kind === "fresh");
        expect(
          fresh.length,
          `no fresh bot replies observed — agent isn't responding`,
        ).toBeGreaterThanOrEqual(1);
        expect(firstAnchorMsgId).not.toBeNull();

        // (2) At least one edit carries the framework's
        //     "— still working (Nm)" suffix. This is THE regression gate.
        const edits = trail.filter((e) => e.kind === "edit");
        const editsWithSuffix = edits.filter((e) => SUFFIX_RE.test(e.text));
        expect(
          editsWithSuffix.length,
          `no in-place edits with the "— still working (Nm)" suffix ` +
            `landed during the ${SLEEP_SECONDS}s background bash. ` +
            `Total edits observed: ${edits.length}. The cross-turn ` +
            `pending-progress fix is not active — see ` +
            `\`pending-work-progress.ts\` and the gateway hooks at ` +
            `\`noteAsyncDispatch\` / \`noteOutbound\` / \`noteTurnEnd\`. ` +
            `Pre-fix this number is zero by construction.`,
        ).toBeGreaterThanOrEqual(1);

        // (3) Every observed edit is silent, checked via the
        //     receiving-side flag.
        const loudEdits = edits.filter((e) => !e.silent);
        expect(
          loudEdits.length,
          `${loudEdits.length} edit(s) pinged the device — edits ` +
            `should never fire a notification.`,
        ).toBe(0);

        // (4) Every edit targets the same messageId as the initial fresh
        //     anchor: one surface edited in place, not a chain of messages.
        const offAnchorEdits = edits.filter(
          (e) => e.messageId !== firstAnchorMsgId,
        );
        expect(
          offAnchorEdits.length,
          `${offAnchorEdits.length} edit(s) were anchored to a ` +
            `different message id than the initial reply ` +
            `(${firstAnchorMsgId}). The framework should edit a ` +
            `single anchor in place, not a chain of messages.`,
        ).toBe(0);
      } finally {
        // Always release the harness, even when an assertion fails.
        await sc.tearDown();
      }
    },
    // Test timeout: the observation cap plus a 60s margin.
    OVERALL_DEADLINE_MS + 60_000,
  );
});