switchroom 0.13.13 → 0.13.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,354 @@
1
+ /**
2
+ * Unit tests for cross-turn pending-async progress (#1445).
3
+ *
4
+ * Pins the deterministic state machine + edit cadence in isolation
5
+ * from the gateway. The integration with gateway hooks is exercised
6
+ * by the UAT scenario `silence-poke-debug-dm.test.ts`.
7
+ */
8
+
9
+ import { afterEach, beforeEach, describe, expect, it } from 'vitest'
10
+
11
+ import {
12
+ EDIT_INTERVAL_MS,
13
+ MAX_LIFETIME_MS,
14
+ TELEGRAM_MSG_CAP,
15
+ __getStateForTests,
16
+ __resetAllForTests,
17
+ __setDepsForTests,
18
+ __tickForTests,
19
+ clearPending,
20
+ noteAsyncDispatch,
21
+ noteOutbound,
22
+ noteTurnEnd,
23
+ startTurn,
24
+ type PendingProgressEditCtx,
25
+ type PendingProgressMetric,
26
+ } from '../pending-work-progress.js'
27
+
28
+ const KEY = '12345:_'
29
+
30
/**
 * Recorder returned by `setup()`: collects everything the stubbed
 * dependencies receive and carries the fake clock value.
 */
interface Capture {
  /** Contexts passed to the stubbed `editMessage`, in call order. */
  edits: Array<PendingProgressEditCtx>
  /** Events passed to the stubbed `emitMetric`, in call order. */
  metrics: Array<PendingProgressMetric>
  /** Value returned by the stubbed `nowMs`; tests assign to it directly. */
  now: number
}
35
+
36
/**
 * Resets the module under test and installs recording stubs for its
 * injected dependencies.
 *
 * @returns The recorder the stubs write into. Its `now` field is the
 *   fake clock read by the injected `nowMs`, starting at 0.
 */
function setup(): Capture {
  __resetAllForTests()
  const recorder: Capture = { edits: [], metrics: [], now: 0 }

  const editMessage = async (ctx: PendingProgressEditCtx): Promise<void> => {
    recorder.edits.push(ctx)
  }
  const emitMetric = (event: PendingProgressMetric): void => {
    recorder.metrics.push(event)
  }
  // Reads the live field so tests can move time by assigning `recorder.now`.
  const nowMs = (): number => recorder.now

  __setDepsForTests({ editMessage, emitMetric, nowMs })
  return recorder
}
50
+
51
/**
 * Yields to the microtask queue twice so the fire-and-forget promise
 * chain started by a tick can settle before the test asserts.
 */
async function flush(): Promise<void> {
  for (let hop = 0; hop < 2; hop += 1) {
    await Promise.resolve()
  }
}
56
+
57
+ describe('pending-work-progress', () => {
58
// Each test starts with the kill-switch env var unset, and the module's
// state is reset again once the test finishes.
beforeEach(() => {
  Reflect.deleteProperty(process.env, 'SWITCHROOM_DISABLE_PENDING_PROGRESS')
})
afterEach(() => {
  __resetAllForTests()
})
64
+
65
// Baseline: a turn that only replies (no async dispatch) must leave no
// state behind and must never produce edits or metrics.
it('does nothing on turns without an async dispatch', async () => {
  const cap = setup()
  startTurn(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'simple reply' })
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
  cap.now = 60_000
  __tickForTests(cap.now)
  // Edits are dispatched fire-and-forget from the tick; settle the
  // promise chain first so the "no edits" check cannot pass vacuously.
  await flush()
  expect(cap.edits).toHaveLength(0)
  expect(cap.metrics).toHaveLength(0)
})
76
+
77
// Happy path: async dispatch plus a captured reply means state exists
// once the turn ends, stamped with the clock value at turn end.
it('activates when turn ends with async dispatch + anchor', () => {
  const capture = setup()
  const anchor = { messageId: 100, text: 'worker dispatched' }

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, anchor)
  capture.now = 1_000
  noteTurnEnd(KEY)

  const state = __getStateForTests(KEY)
  expect(state).toBeDefined()
  expect(state?.activatedAt).toBe(1_000)
  expect(state?.anchorMessageId).toBe(100)
  expect(state?.anchorOriginalText).toBe('worker dispatched')

  const startedEvent = { kind: 'pending_progress_started', chatKey: KEY }
  expect(capture.metrics).toContainEqual(startedEvent)
})
94
+
95
// An async dispatch alone is not enough: without a captured outbound
// reply there is nothing to edit, so no state may be created.
it('does not activate when async dispatch happened but no anchor was captured', async () => {
  const cap = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  // no noteOutbound — model never sent a reply (silent end)
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
  cap.now = 60_000
  __tickForTests(cap.now)
  // Settle the tick's fire-and-forget promise chain before asserting
  // that nothing was edited.
  await flush()
  expect(cap.edits).toHaveLength(0)
})
106
+
107
// A reply alone is not enough either: with no async dispatch in the
// turn, ending the turn must not create state.
it('does not activate when an anchor exists but no async dispatch happened', async () => {
  const cap = setup()
  startTurn(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'just chatting' })
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
  cap.now = 60_000
  __tickForTests(cap.now)
  // Settle the tick's fire-and-forget promise chain before asserting
  // that nothing was edited.
  await flush()
  expect(cap.edits).toHaveLength(0)
})
117
+
118
// Pins the edit cadence: nothing before one full interval, then one
// edit per qualifying tick, each carrying the elapsed-minutes suffix.
it('edits anchor with elapsed-time suffix at EDIT_INTERVAL_MS cadence', async () => {
  const capture = setup()

  // Move the fake clock, run one tick, and let the edit promise settle.
  const tickAt = async (nowMs: number): Promise<void> => {
    capture.now = nowMs
    __tickForTests(capture.now)
    await flush()
  }

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, {
    messageId: 100,
    text: 'Background sleep running; awaiting completion.',
  })
  capture.now = 0
  noteTurnEnd(KEY)

  // Half an interval in — too early for any edit.
  await tickAt(EDIT_INTERVAL_MS / 2)
  expect(capture.edits).toHaveLength(0)

  // One full interval — first edit fires with the "1m" suffix.
  await tickAt(EDIT_INTERVAL_MS)
  expect(capture.edits).toHaveLength(1)
  const firstEdit = capture.edits[0]
  expect(firstEdit.messageId).toBe(100)
  expect(firstEdit.newText).toBe(
    'Background sleep running; awaiting completion.\n\n— still working (1m)',
  )

  // Three intervals in total — second edit, suffix now reads "3m".
  await tickAt(EDIT_INTERVAL_MS * 3)
  expect(capture.edits).toHaveLength(2)
  const secondEdit = capture.edits[1]
  expect(secondEdit.newText).toBe(
    'Background sleep running; awaiting completion.\n\n— still working (3m)',
  )
})
154
+
155
// Defence in depth: if the captured reply text already ends with a
// stale suffix from an earlier round, the next edit must replace it
// rather than stack a second suffix on top.
it('strips prior suffix before re-appending so anchor never accumulates', async () => {
  const capture = setup()
  const staleAnchor = {
    messageId: 100,
    text: 'worker dispatched\n\n— still working (12m)',
  }

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, staleAnchor)
  noteTurnEnd(KEY)

  capture.now = EDIT_INTERVAL_MS
  __tickForTests(capture.now)
  await flush()

  // The rewritten text is built from 'worker dispatched' alone.
  const [firstEdit] = capture.edits
  expect(firstEdit.newText).toBe(
    'worker dispatched\n\n— still working (1m)',
  )
})
174
+
175
// The user re-engaging clears the state, reports the elapsed time in
// the cleared metric, and stops all further edits.
it("clears on 'inbound' reason — user re-engaged", async () => {
  const cap = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  noteTurnEnd(KEY)
  cap.now = EDIT_INTERVAL_MS * 2
  clearPending(KEY, 'inbound')
  expect(__getStateForTests(KEY)).toBeUndefined()
  expect(cap.metrics).toContainEqual({
    kind: 'pending_progress_cleared',
    chatKey: KEY,
    elapsedMs: EDIT_INTERVAL_MS * 2,
    reason: 'inbound',
  })
  // No further edits after clear. Settle the tick's fire-and-forget
  // promise chain first so this check cannot pass vacuously.
  cap.now = EDIT_INTERVAL_MS * 3
  __tickForTests(cap.now)
  await flush()
  expect(cap.edits).toHaveLength(0)
})
195
+
196
// A handback clears the state just like an inbound message does, and
// the cleared metric carries the 'handback' reason.
it("clears on 'handback' reason — model is about to re-engage", () => {
  const capture = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  noteTurnEnd(KEY)

  clearPending(KEY, 'handback')

  expect(__getStateForTests(KEY)).toBeUndefined()
  const sawHandbackClear = capture.metrics.some(
    (metric) => metric.kind === 'pending_progress_cleared' && metric.reason === 'handback',
  )
  expect(sawHandbackClear).toBe(true)
})
206
+
207
// State survives up to MAX_LIFETIME_MS and is auto-cleared, with a
// 'timeout' metric, on the first tick past that budget.
it('times out at MAX_LIFETIME_MS', async () => {
  const capture = setup()
  const tickAt = async (nowMs: number): Promise<void> => {
    capture.now = nowMs
    __tickForTests(capture.now)
    await flush()
  }

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  capture.now = 0
  noteTurnEnd(KEY)

  // Halfway through the budget — still active.
  await tickAt(MAX_LIFETIME_MS / 2)
  expect(__getStateForTests(KEY)).toBeDefined()

  // Just past the budget — auto-cleared.
  await tickAt(MAX_LIFETIME_MS + 1)
  expect(__getStateForTests(KEY)).toBeUndefined()
  const sawTimeoutClear = capture.metrics.some(
    (metric) => metric.kind === 'pending_progress_cleared' && metric.reason === 'timeout',
  )
  expect(sawTimeoutClear).toBe(true)
})
226
+
227
// When anchor + suffix would not fit in a Telegram message the edit is
// skipped, but the cadence clock still advances.
it('skips edit (but advances cadence) if total would exceed Telegram message cap', async () => {
  const capture = setup()
  // Only 5 characters of headroom — even the smallest suffix overflows.
  const oversizedText = 'x'.repeat(TELEGRAM_MSG_CAP - 5)

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: oversizedText })
  capture.now = 0
  noteTurnEnd(KEY)

  capture.now = EDIT_INTERVAL_MS
  __tickForTests(capture.now)
  await flush()
  expect(capture.edits).toHaveLength(0)

  // lastEditAt moved forward anyway, so we don't retry on every tick.
  const state = __getStateForTests(KEY)
  expect(state?.lastEditAt).toBe(EDIT_INTERVAL_MS)
})
244
+
245
// With SWITCHROOM_DISABLE_PENDING_PROGRESS set, the same sequence that
// normally activates must create no state, no edits and no metrics.
it('honors the kill switch — no state, no edits, no metrics', async () => {
  const capture = setup()
  const killSwitch = 'SWITCHROOM_DISABLE_PENDING_PROGRESS'
  process.env[killSwitch] = '1'
  try {
    startTurn(KEY)
    noteAsyncDispatch(KEY)
    noteOutbound(KEY, { messageId: 100, text: 'wd' })
    noteTurnEnd(KEY)
    expect(__getStateForTests(KEY)).toBeUndefined()

    capture.now = EDIT_INTERVAL_MS * 3
    __tickForTests(capture.now)
    await flush()
    expect(capture.edits).toHaveLength(0)
    expect(capture.metrics).toHaveLength(0)
  } finally {
    // Never leak the switch into later tests, even if an assertion threw.
    delete process.env[killSwitch]
  }
})
263
+
264
// NOTE(review): despite the title, the body only checks that calling
// startTurn against a key already removed by clearPending leaves the
// state absent. It never calls startTurn while activation is live.
it('startTurn resets per-turn fields but NOT cross-turn activation', () => {
  const capture = setup()

  // Turn 1: dispatches async, replies, ends — pending-progress is active.
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  capture.now = 1_000
  noteTurnEnd(KEY)
  const activeState = __getStateForTests(KEY)
  expect(activeState?.activatedAt).toBe(1_000)

  // Turn 2 begins after clearPending has already run (per the original
  // author, the gateway's inbound path does this). The entry is gone,
  // so startTurn on the absent key must be harmless.
  clearPending(KEY, 'inbound')
  startTurn(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
})
281
+
282
// Regression for the reviewer's blocker #2: turn 1 activates via an
// async dispatch; whatever starts turn 2 (real inbound or synthesised
// wake) clears the state; a turn 2 that dispatches nothing async must
// not inherit turn 1's `pending=true` and re-activate on its own anchor.
it('no stale carryover: turn 1 activates, clearPending fires, turn 2 (no async) does not re-activate', async () => {
  const capture = setup()
  const turnTwoEndedAt = 91_000

  // ── Turn 1: dispatch async, reply, end — activates.
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'worker dispatched' })
  capture.now = 1_000
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)?.activatedAt).toBe(1_000)

  // ── Inbound (or handback / cron / vault grant) opens turn 2. The
  // gateway clears state here — per the original comment, this is what
  // the hooks at handleInbound + handleSessionEvent.enqueue wire up.
  capture.now = 90_000
  clearPending(KEY, 'inbound')
  expect(__getStateForTests(KEY)).toBeUndefined()

  // ── Turn 2: reply only, NO async dispatch this turn.
  noteOutbound(KEY, { messageId: 200, text: 'just answering' })
  capture.now = turnTwoEndedAt
  noteTurnEnd(KEY)

  // Must stay inactive. Before the fix this failed: the earlier turn's
  // `pending=true` was never reset, so `noteTurnEnd` re-activated
  // against turn 2's fresh anchor.
  expect(__getStateForTests(KEY)).toBeUndefined()

  // And nothing gets edited over the following poll intervals.
  capture.now = turnTwoEndedAt + EDIT_INTERVAL_MS * 3
  __tickForTests(capture.now)
  await flush()
  expect(capture.edits).toHaveLength(0)
})
320
+
321
// Two chats activate side by side; each gets its own edits, and
// clearing one leaves the other ticking.
it('multiple chats — independent state', async () => {
  const capture = setup()
  const KEY_A = 'A:_'
  const KEY_B = 'B:42'

  // Runs one full "dispatch async, reply, end turn" sequence for a chat.
  const activate = (chatKey: string, messageId: number, text: string): void => {
    startTurn(chatKey)
    noteAsyncDispatch(chatKey)
    noteOutbound(chatKey, { messageId, text })
    noteTurnEnd(chatKey)
  }
  const editsFor = (messageId: number) =>
    capture.edits.filter((edit) => edit.messageId === messageId)

  capture.now = 0
  activate(KEY_A, 10, 'wd-A')
  activate(KEY_B, 20, 'wd-B')

  capture.now = EDIT_INTERVAL_MS
  __tickForTests(capture.now)
  await flush()
  expect(capture.edits).toHaveLength(2)

  const editByMessageId = new Map<number, PendingProgressEditCtx>()
  for (const edit of capture.edits) {
    editByMessageId.set(edit.messageId, edit)
  }
  const editA = editByMessageId.get(10)
  const editB = editByMessageId.get(20)
  expect(editA?.chatId).toBe('A')
  expect(editA?.threadId).toBe(null)
  expect(editB?.chatId).toBe('B')
  expect(editB?.threadId).toBe(42)

  // Clear A only; B should keep ticking.
  clearPending(KEY_A, 'inbound')
  capture.now = EDIT_INTERVAL_MS * 2
  __tickForTests(capture.now)
  await flush()
  expect(editsFor(10)).toHaveLength(1)
  expect(editsFor(20)).toHaveLength(2)
})
354
+ })
@@ -0,0 +1,239 @@
1
+ /**
2
+ * Cross-turn pending-async progress — UAT regression gate for #1445.
3
+ *
4
+ * Verifies the post-fix behaviour shipped in `pending-work-progress.ts`:
5
+ * when a turn ends with the model having dispatched async background
6
+ * work (here `Bash` with `run_in_background:true`) and the model has
7
+ * stopped speaking, the framework keeps editing the model's last reply
8
+ * *in place* at ~60s intervals so the user sees ambient liveness during
9
+ * the wait.
10
+ *
11
+ * ## Pre-fix behaviour (what the user complained about)
12
+ *
13
+ * 1. User sends a long-task prompt at t=0.
14
+ * 2. Model runs the bash command with `run_in_background:true` and
15
+ * sends one PING reply at ~+20s ("Background sleep running…").
16
+ * 3. Turn ends.
17
+ * 4. Silence-poke ladder is per-turn — it stops the moment endTurn()
18
+ * fires. There is no cross-turn ambient surface.
19
+ * 5. The user sees NOTHING for ~5 min until the framework's 300s
20
+ * silence-poke fallback fires (or — as observed in the UAT that
21
+ * drove the fix — does not fire at all, because the turn already
22
+ * ended). Production data confirms: silence-poke succeeded/fired
23
+ * rate is 0–7% across hundreds of fires.
24
+ *
25
+ * ## Post-fix behaviour (this scenario asserts)
26
+ *
27
+ * 1. Model sends one fresh reply at ~+20s — the anchor.
28
+ * 2. Turn ends.
29
+ * 3. Framework edits the anchor in place at ~+80s, ~+140s, ~+200s,
30
+ * ~+260s, ~+320s with the suffix `\n\n— still working (Nm)`.
31
+ * 4. All edits are SILENT (Telegram edits never push a notification;
32
+ * `editMessageText` has no `disable_notification` parameter).
33
+ * The user sees ambient liveness without any added pings.
34
+ * 5. Sleep completes ~+350s; the model wakes (`BashOutput` /
35
+ * background-task notification path), turn re-starts,
36
+ * `clearPending` fires — no further edits.
37
+ *
38
+ * ## What this scenario asserts
39
+ *
40
+ * 1. At least one FRESH bot message lands (the initial anchor).
41
+ * 2. At least one EDIT to the anchor lands AFTER the initial reply,
42
+ * whose text contains the framework suffix `— still working (\d+m)`.
43
+ * 3. Every observed EDIT message is `silent === true` (no push
44
+ * notification fired by the in-place edit).
45
+ * 4. Edits are anchored to the SAME `messageId` as the initial
46
+ * fresh reply (single in-place surface, not spammy new sends).
47
+ *
48
+ * The full per-message trail is dumped to console for forensic
49
+ * inspection regardless of pass / fail.
50
+ *
51
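+ * Wall-clock budget: ~6–7 min on a normal run (350s sleep + wake + final reply); hard ceiling ~11 min (OVERALL_DEADLINE_MS of 590s plus the 60s test-timeout margin).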
52
+ */
53
+
54
+ import { describe, expect, it } from "vitest";
55
+ import { spinUp } from "../harness.js";
56
+ import type { ObservedMessage } from "../driver.js";
57
+
58
/** Duration, in seconds, of the background `sleep` the model is asked to run. */
const SLEEP_SECONDS = 350;

// The prompt is worded to elicit the natural production pattern: a
// quick ack reply, the sleep dispatched as a background Bash, the turn
// ending, and a final "done" once the sleep completes. The fix under
// test owns the ambient surface in between.
const PROMPT = [
  `Please run \`sleep ${SLEEP_SECONDS}\` in the background using the `,
  `Bash tool with \`run_in_background: true\` — this is a stress `,
  `test of the cross-turn ambient progress surface, so the sleep `,
  `duration matters. Send a brief one-line acknowledgement that `,
  `you've dispatched it (your natural beat-1 ack is fine), then `,
  `wait for it to complete. When it finishes, reply with exactly `,
  `the single word "done".`,
].join("");

/** Hard stop for the observation loop: the sleep itself plus 240s of headroom. */
const OVERALL_DEADLINE_MS = SLEEP_SECONDS * 1000 + 240_000;
75
+
76
/** One observed bot message, as recorded in the forensic trail. */
interface TrailEntry {
  /** Milliseconds between sending the prompt and observing this message. */
  relMs: number;
  /** "edit" when the observed message was flagged as edited, otherwise "fresh". */
  kind: "fresh" | "edit";
  /** The `silent` flag as reported by the observing driver. */
  silent: boolean;
  /** Id of the observed message. */
  messageId: number;
  /** Full text of the observed message. */
  text: string;
}
83
+
84
/**
 * Matches text that ENDS with the framework's progress suffix: a blank
 * line, then `— still working (<digits>m)`.
 */
const SUFFIX_RE = /\n\n— still working \(\d+m\)$/;
85
+
86
/**
 * Right-pads `s` with spaces to a width of `n` characters, for aligning
 * columns in the console trail table.
 *
 * @param s - Text to pad.
 * @param n - Minimum width in characters.
 * @returns `s` followed by enough spaces to reach `n` characters;
 *   strings already `n` or longer are returned unchanged, never truncated.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n, " ");
}
89
+
90
describe("uat: cross-turn pending-async ambient progress (#1445)", () => {
  // Sends the long-task prompt, records every bot message (fresh or
  // edited) until the conversation goes quiet, dumps the trail, then
  // asserts the in-place "still working" edits happened.
  it(
    "framework edits the anchor in place during a 350s background bash",
    async () => {
      const sc = await spinUp({ agent: "test-harness" });
      try {
        const startedAt = Date.now();
        await sc.sendDM(PROMPT);
        console.log(`[cross-turn-pending] t=0 prompt sent`);

        const trail: TrailEntry[] = [];

        // Initial wait window — give the model 90s to send its first
        // anchor reply. After that, we observe edits for the full
        // sleep duration plus headroom; once the model's final fresh
        // "done" lands we wind down within 10s.
        let quiescenceDeadline = startedAt + 90_000;
        const overallDeadline = startedAt + OVERALL_DEADLINE_MS;
        let firstAnchorMsgId: number | null = null;
        let sawDone = false;
        // Hoisted out of the loop; non-global, so `.test()` is stateless.
        const doneWord = /\bdone\b/;

        while (Date.now() < overallDeadline) {
          // Sample the clock once so both deadlines are compared
          // against the same instant.
          const now = Date.now();
          const remaining =
            Math.min(quiescenceDeadline, overallDeadline) - now;
          if (remaining <= 0) break;
          try {
            const msg = await sc.expectMessage(
              (m: ObservedMessage) => m.fromBot,
              { from: "bot", timeout: remaining },
            );
            const rel = Date.now() - startedAt;
            const entry: TrailEntry = {
              relMs: rel,
              kind: msg.edited ? "edit" : "fresh",
              silent: msg.silent,
              messageId: msg.messageId,
              text: msg.text,
            };
            trail.push(entry);
            console.log(
              `[cross-turn-pending] +${(rel / 1000).toFixed(1)}s ` +
                `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
                `silent=${entry.silent} text=${JSON.stringify(
                  entry.text.slice(0, 120).replace(/\n/g, " ⏎ "),
                )}`,
            );
            // The first fresh message is the anchor that later edits
            // are expected to target.
            if (firstAnchorMsgId == null && entry.kind === "fresh") {
              firstAnchorMsgId = entry.messageId;
            }
            // A later fresh message containing the word "done" is the
            // model's final reply. The word-boundary match also covers
            // text that is exactly "done".
            const trimmedFinal = entry.text.trim().toLowerCase();
            const looksLikeDone =
              entry.kind === "fresh" &&
              entry.messageId !== firstAnchorMsgId &&
              doneWord.test(trimmedFinal);
            if (looksLikeDone) {
              sawDone = true;
              quiescenceDeadline = Date.now() + 10_000;
            } else {
              // Generous 120s quiescence after every other message.
              // Per the scenario timeline (last edit ~+320s, sleep done
              // ~+350s) this leaves ~90s of headroom for the model's
              // wake + final reply.
              quiescenceDeadline = Date.now() + 120_000;
            }
          } catch (err: unknown) {
            // Normally this is the timeout that signals quiescence.
            // NOTE(review): any other rejection from expectMessage also
            // lands here, so log the reason rather than hiding it.
            const reason = err instanceof Error ? err.message : String(err);
            console.log(
              `[cross-turn-pending] stopped waiting for bot messages: ${reason}`,
            );
            break;
          }
        }

        // Dump full trail (printed whether or not the assertions pass).
        console.log(
          "\n========== CROSS-TURN PENDING-PROGRESS TRAIL ==========",
        );
        console.log(`prompt: ${SLEEP_SECONDS}s background bash`);
        console.log(`total bot messages observed: ${trail.length}`);
        console.log(`anchor messageId: ${firstAnchorMsgId}`);
        console.log(`saw_done: ${sawDone}`);
        console.log("");
        console.log(" rel(s) kind silent msg text");
        console.log(" ------- ----- ------ ----------- ----");
        for (const e of trail) {
          console.log(
            ` ${pad((e.relMs / 1000).toFixed(1) + "s", 8)} ` +
              `${pad(e.kind, 6)} ${pad(String(e.silent), 7)} ` +
              `${pad(String(e.messageId), 12)} ` +
              `${e.text.slice(0, 80).replace(/\n/g, " ⏎ ")}`,
          );
        }
        console.log(
          "=======================================================\n",
        );

        // ── Regression assertions ─────────────────────────────────

        // (1) at least one fresh anchor reply landed
        const fresh = trail.filter((e) => e.kind === "fresh");
        expect(
          fresh.length,
          `no fresh bot replies observed — agent isn't responding`,
        ).toBeGreaterThanOrEqual(1);
        expect(firstAnchorMsgId).not.toBeNull();

        // (2) at least one edit landed whose text carries the
        // framework's "— still working (Nm)" suffix. This is THE
        // regression gate for the fix.
        const edits = trail.filter((e) => e.kind === "edit");
        const editsWithSuffix = edits.filter((e) => SUFFIX_RE.test(e.text));
        expect(
          editsWithSuffix.length,
          `no in-place edits with the "— still working (Nm)" suffix ` +
            `landed during the ${SLEEP_SECONDS}s background bash. ` +
            `Total edits observed: ${edits.length}. The cross-turn ` +
            `pending-progress fix is not active — see ` +
            `\`pending-work-progress.ts\` and the gateway hooks at ` +
            `\`noteAsyncDispatch\` / \`noteOutbound\` / \`noteTurnEnd\`. ` +
            `Pre-fix this number is zero by construction.`,
        ).toBeGreaterThanOrEqual(1);

        // (3) every observed edit is silent. Checked via the
        // receiving-side flag so a framework regression that switched
        // to fresh sends fails loudly.
        const loudEdits = edits.filter((e) => !e.silent);
        expect(
          loudEdits.length,
          `${loudEdits.length} edit(s) pinged the device — edits ` +
            `should never fire a notification.`,
        ).toBe(0);

        // (4) every edit targets the same messageId as the initial
        // fresh anchor — the framework is editing ONE surface, not
        // spamming.
        const offAnchorEdits = edits.filter(
          (e) => e.messageId !== firstAnchorMsgId,
        );
        expect(
          offAnchorEdits.length,
          `${offAnchorEdits.length} edit(s) were anchored to a ` +
            `different message id than the initial reply ` +
            `(${firstAnchorMsgId}). The framework should edit a ` +
            `single anchor in place, not a chain of messages.`,
        ).toBe(0);
      } finally {
        // Always release the harness, even when an assertion throws.
        await sc.tearDown();
      }
    },
    // Test timeout: the observation ceiling plus 60s for spin-up and
    // tear-down.
    OVERALL_DEADLINE_MS + 60_000,
  );
});