switchroom 0.13.13 → 0.13.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +293 -92
- package/telegram-plugin/gateway/gateway.ts +223 -17
- package/telegram-plugin/pending-work-progress.ts +377 -0
- package/telegram-plugin/runtime-metrics.ts +20 -0
- package/telegram-plugin/tests/pending-work-progress.test.ts +354 -0
- package/telegram-plugin/uat/scenarios/cross-turn-pending-progress-dm.test.ts +239 -0
- package/telegram-plugin/uat/scenarios/visible-answer-stream-dm.test.ts +219 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for cross-turn pending-async progress (#1445).
|
|
3
|
+
*
|
|
4
|
+
* Pins the deterministic state machine + edit cadence in isolation
|
|
5
|
+
* from the gateway. The integration with gateway hooks is exercised
|
|
6
|
+
* by the UAT scenario `cross-turn-pending-progress-dm.test.ts`.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { afterEach, beforeEach, describe, expect, it } from 'vitest'
|
|
10
|
+
|
|
11
|
+
import {
|
|
12
|
+
EDIT_INTERVAL_MS,
|
|
13
|
+
MAX_LIFETIME_MS,
|
|
14
|
+
TELEGRAM_MSG_CAP,
|
|
15
|
+
__getStateForTests,
|
|
16
|
+
__resetAllForTests,
|
|
17
|
+
__setDepsForTests,
|
|
18
|
+
__tickForTests,
|
|
19
|
+
clearPending,
|
|
20
|
+
noteAsyncDispatch,
|
|
21
|
+
noteOutbound,
|
|
22
|
+
noteTurnEnd,
|
|
23
|
+
startTurn,
|
|
24
|
+
type PendingProgressEditCtx,
|
|
25
|
+
type PendingProgressMetric,
|
|
26
|
+
} from '../pending-work-progress.js'
|
|
27
|
+
|
|
28
|
+
// Chat key shared by the single-chat tests. NOTE(review): appears to be
// `<chatId>:<threadId>` with `_` standing for "no thread" — confirm against
// pending-work-progress.ts.
const KEY = '12345:_'
|
|
29
|
+
|
|
30
|
+
/**
 * Everything a test can observe or control through the injected deps.
 */
interface Capture {
  /** Value handed back by the injected `nowMs`; tests assign it to move the clock. */
  now: number
  /** Every context the injected `editMessage` was called with, in call order. */
  edits: PendingProgressEditCtx[]
  /** Every event the injected `emitMetric` was called with, in call order. */
  metrics: PendingProgressMetric[]
}
|
|
35
|
+
|
|
36
|
+
/**
 * Resets the module under test and installs recording deps.
 *
 * @returns the capture object the injected deps write to; its `now` field
 *   (initially 0) is what the injected clock reports.
 */
function setup(): Capture {
  __resetAllForTests()
  const captured: Capture = { edits: [], metrics: [], now: 0 }
  __setDepsForTests({
    // Record the edit instead of talking to Telegram.
    async editMessage(editCtx) {
      captured.edits.push(editCtx)
    },
    // Record the metric event instead of emitting it.
    emitMetric(event) {
      captured.metrics.push(event)
    },
    // Clock is read lazily so tests can change `captured.now` at any time.
    nowMs() {
      return captured.now
    },
  })
  return captured
}
|
|
50
|
+
|
|
51
|
+
/**
 * Yields to the microtask queue twice so the fire-and-forget promise chain
 * started by tick() can settle before the test asserts.
 */
async function flush(): Promise<void> {
  for (let hop = 0; hop < 2; hop += 1) {
    await Promise.resolve()
  }
}
|
|
56
|
+
|
|
57
|
+
describe('pending-work-progress', () => {
|
|
58
|
+
// Make sure the kill switch is off before every test, whatever the
// surrounding environment or a previous test left behind.
beforeEach(function clearKillSwitch() {
  delete process.env.SWITCHROOM_DISABLE_PENDING_PROGRESS
})

// Drop module state after every test so nothing leaks into the next one.
afterEach(function resetModuleState() {
  __resetAllForTests()
})
|
|
64
|
+
|
|
65
|
+
it('does nothing on turns without an async dispatch', async () => {
  const cap = setup()
  startTurn(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'simple reply' })
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
  cap.now = 60_000
  __tickForTests(cap.now)
  // Let tick()'s fire-and-forget chain settle before asserting a negative,
  // otherwise a late edit could slip past the check below.
  await flush()
  expect(cap.edits).toHaveLength(0)
  expect(cap.metrics).toHaveLength(0)
})
|
|
76
|
+
|
|
77
|
+
it('activates when turn ends with async dispatch + anchor', () => {
  const capture = setup()

  // One turn that both dispatches async work and sends a reply.
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'worker dispatched' })

  // End the turn with the clock at 1s.
  capture.now = 1_000
  noteTurnEnd(KEY)

  const state = __getStateForTests(KEY)
  expect(state).toBeDefined()
  expect(state?.activatedAt).toBe(1_000)
  expect(state?.anchorMessageId).toBe(100)
  expect(state?.anchorOriginalText).toBe('worker dispatched')

  const expectedStart = { kind: 'pending_progress_started', chatKey: KEY }
  expect(capture.metrics).toContainEqual(expectedStart)
})
|
|
94
|
+
|
|
95
|
+
it('does not activate when async dispatch happened but no anchor was captured', async () => {
  const cap = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  // no noteOutbound — model never sent a reply (silent end)
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
  cap.now = 60_000
  __tickForTests(cap.now)
  // Let tick()'s fire-and-forget chain settle before asserting a negative,
  // otherwise a late edit could slip past the check below.
  await flush()
  expect(cap.edits).toHaveLength(0)
})
|
|
106
|
+
|
|
107
|
+
it('does not activate when an anchor exists but no async dispatch happened', async () => {
  const cap = setup()
  startTurn(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'just chatting' })
  noteTurnEnd(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
  cap.now = 60_000
  __tickForTests(cap.now)
  // Let tick()'s fire-and-forget chain settle before asserting a negative,
  // otherwise a late edit could slip past the check below.
  await flush()
  expect(cap.edits).toHaveLength(0)
})
|
|
117
|
+
|
|
118
|
+
it('edits anchor with elapsed-time suffix at EDIT_INTERVAL_MS cadence', async () => {
  const capture = setup()
  const anchorText = 'Background sleep running; awaiting completion.'

  // Move the injected clock, run one tick, and let the edit chain settle.
  const tickAt = async (ms: number): Promise<void> => {
    capture.now = ms
    __tickForTests(capture.now)
    await flush()
  }

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: anchorText })
  capture.now = 0
  noteTurnEnd(KEY)

  // Half an interval in: too early for an edit.
  await tickAt(EDIT_INTERVAL_MS / 2)
  expect(capture.edits).toHaveLength(0)

  // One full interval: first edit, "1m" suffix.
  await tickAt(EDIT_INTERVAL_MS)
  expect(capture.edits).toHaveLength(1)
  expect(capture.edits[0].messageId).toBe(100)
  expect(capture.edits[0].newText).toBe(`${anchorText}\n\n— still working (1m)`)

  // Three intervals in total: second edit, "3m" suffix.
  await tickAt(EDIT_INTERVAL_MS * 3)
  expect(capture.edits).toHaveLength(2)
  expect(capture.edits[1].newText).toBe(`${anchorText}\n\n— still working (3m)`)
})
|
|
154
|
+
|
|
155
|
+
it('strips prior suffix before re-appending so anchor never accumulates', async () => {
  const capture = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)

  // Defence in depth: the outbound text already ends with a stale suffix
  // left over from an earlier round.
  const staleAnchor = { messageId: 100, text: 'worker dispatched\n\n— still working (12m)' }
  noteOutbound(KEY, staleAnchor)
  noteTurnEnd(KEY)

  capture.now = EDIT_INTERVAL_MS
  __tickForTests(capture.now)
  await flush()

  // The edit must be built from 'worker dispatched' alone.
  const [firstEdit] = capture.edits
  expect(firstEdit.newText).toBe('worker dispatched\n\n— still working (1m)')
})
|
|
174
|
+
|
|
175
|
+
it("clears on 'inbound' reason — user re-engaged", async () => {
  const cap = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  noteTurnEnd(KEY)
  cap.now = EDIT_INTERVAL_MS * 2
  clearPending(KEY, 'inbound')
  expect(__getStateForTests(KEY)).toBeUndefined()
  expect(cap.metrics).toContainEqual({
    kind: 'pending_progress_cleared',
    chatKey: KEY,
    elapsedMs: EDIT_INTERVAL_MS * 2,
    reason: 'inbound',
  })
  // No further edits after clear.
  cap.now = EDIT_INTERVAL_MS * 3
  __tickForTests(cap.now)
  // Let tick()'s fire-and-forget chain settle before asserting a negative,
  // otherwise a late edit could slip past the check below.
  await flush()
  expect(cap.edits).toHaveLength(0)
})
|
|
195
|
+
|
|
196
|
+
it("clears on 'handback' reason — model is about to re-engage", () => {
  const capture = setup()

  // Activate pending-progress with a normal async-dispatch turn.
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  noteTurnEnd(KEY)

  clearPending(KEY, 'handback')

  expect(__getStateForTests(KEY)).toBeUndefined()
  const isHandbackClear = (m: PendingProgressMetric): boolean =>
    m.kind === 'pending_progress_cleared' && m.reason === 'handback'
  expect(capture.metrics.some(isHandbackClear)).toBe(true)
})
|
|
206
|
+
|
|
207
|
+
it('times out at MAX_LIFETIME_MS', async () => {
  const capture = setup()

  // Move the injected clock, run one tick, and let the promise chain settle.
  const tickAt = async (ms: number): Promise<void> => {
    capture.now = ms
    __tickForTests(capture.now)
    await flush()
  }

  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  capture.now = 0
  noteTurnEnd(KEY)

  // Halfway through the budget the state is still there.
  await tickAt(MAX_LIFETIME_MS / 2)
  expect(__getStateForTests(KEY)).toBeDefined()

  // Just past the budget the state is dropped with a 'timeout' metric.
  await tickAt(MAX_LIFETIME_MS + 1)
  expect(__getStateForTests(KEY)).toBeUndefined()
  const isTimeoutClear = (m: PendingProgressMetric): boolean =>
    m.kind === 'pending_progress_cleared' && m.reason === 'timeout'
  expect(capture.metrics.some(isTimeoutClear)).toBe(true)
})
|
|
226
|
+
|
|
227
|
+
it('skips edit (but advances cadence) if total would exceed Telegram message cap', async () => {
  const capture = setup()
  startTurn(KEY)
  noteAsyncDispatch(KEY)

  // Anchor sits 5 characters under the cap, so any suffix pushes it over.
  const oversizedAnchor = 'x'.repeat(TELEGRAM_MSG_CAP - 5)
  noteOutbound(KEY, { messageId: 100, text: oversizedAnchor })
  capture.now = 0
  noteTurnEnd(KEY)

  capture.now = EDIT_INTERVAL_MS
  __tickForTests(capture.now)
  await flush()
  expect(capture.edits).toHaveLength(0)

  // lastEditAt moved forward anyway, so the edit is not retried on every tick.
  const state = __getStateForTests(KEY)
  expect(state?.lastEditAt).toBe(EDIT_INTERVAL_MS)
})
|
|
244
|
+
|
|
245
|
+
it('honors the kill switch — no state, no edits, no metrics', async () => {
  const capture = setup()
  process.env.SWITCHROOM_DISABLE_PENDING_PROGRESS = '1'
  try {
    // A turn that would normally activate pending-progress.
    startTurn(KEY)
    noteAsyncDispatch(KEY)
    noteOutbound(KEY, { messageId: 100, text: 'wd' })
    noteTurnEnd(KEY)
    expect(__getStateForTests(KEY)).toBeUndefined()

    // Ticking well past several intervals still produces nothing.
    capture.now = EDIT_INTERVAL_MS * 3
    __tickForTests(capture.now)
    await flush()
    expect(capture.edits).toHaveLength(0)
    expect(capture.metrics).toHaveLength(0)
  } finally {
    // Always switch the kill switch back off, even if an assertion threw.
    delete process.env.SWITCHROOM_DISABLE_PENDING_PROGRESS
  }
})
|
|
263
|
+
|
|
264
|
+
it('startTurn resets per-turn fields but NOT cross-turn activation', () => {
  const capture = setup()

  // Turn 1 dispatches async work, replies and ends: pending-progress is active.
  startTurn(KEY)
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'wd' })
  capture.now = 1_000
  noteTurnEnd(KEY)
  const afterTurnOne = __getStateForTests(KEY)
  expect(afterTurnOne?.activatedAt).toBe(1_000)

  // Turn 2 begins after clearPending has already removed the entry (as the
  // gateway's inbound path does). Calling startTurn for the now-absent key
  // must leave no state behind.
  clearPending(KEY, 'inbound')
  startTurn(KEY)
  expect(__getStateForTests(KEY)).toBeUndefined()
})
|
|
281
|
+
|
|
282
|
+
it('no stale carryover: turn 1 activates, clearPending fires, turn 2 (no async) does not re-activate', async () => {
  // Regression for the reviewer's blocker #2: turn 1 activates via an async
  // dispatch; whatever starts turn 2 (real inbound or synthesised wake) must
  // clear state, so a turn 2 with no async dispatch of its own cannot inherit
  // turn 1's `pending=true` and re-activate against turn 2's anchor.
  const capture = setup()

  // Turn 1: async dispatch + reply + end, which activates.
  noteAsyncDispatch(KEY)
  noteOutbound(KEY, { messageId: 100, text: 'worker dispatched' })
  capture.now = 1_000
  noteTurnEnd(KEY)
  const afterTurnOne = __getStateForTests(KEY)
  expect(afterTurnOne?.activatedAt).toBe(1_000)

  // Arrival of turn 2 (inbound / handback / cron / vault grant): the gateway
  // clears state from its handleInbound + handleSessionEvent.enqueue hooks.
  capture.now = 90_000
  clearPending(KEY, 'inbound')
  expect(__getStateForTests(KEY)).toBeUndefined()

  // Turn 2: a reply only, with NO async dispatch.
  noteOutbound(KEY, { messageId: 200, text: 'just answering' })
  capture.now = 91_000
  noteTurnEnd(KEY)

  // No activation is allowed here. Before the fix, the leftover
  // `pending=true` made `noteTurnEnd` re-activate on turn 2's fresh anchor.
  expect(__getStateForTests(KEY)).toBeUndefined()

  // And nothing is edited across the following poll intervals.
  capture.now = 91_000 + EDIT_INTERVAL_MS * 3
  __tickForTests(capture.now)
  await flush()
  expect(capture.edits).toHaveLength(0)
})
|
|
320
|
+
|
|
321
|
+
it('multiple chats — independent state', async () => {
  const capture = setup()
  const KEY_A = 'A:_'
  const KEY_B = 'B:42'

  // Move the injected clock, run one tick, and let the edit chain settle.
  const tickAt = async (ms: number): Promise<void> => {
    capture.now = ms
    __tickForTests(capture.now)
    await flush()
  }
  // Number of recorded edits that targeted the given message id.
  const editCountFor = (messageId: number): PendingProgressEditCtx[] =>
    capture.edits.filter((e) => e.messageId === messageId)

  // Chat A activates at t=0.
  startTurn(KEY_A)
  noteAsyncDispatch(KEY_A)
  noteOutbound(KEY_A, { messageId: 10, text: 'wd-A' })
  capture.now = 0
  noteTurnEnd(KEY_A)

  // Chat B activates at the same clock value.
  startTurn(KEY_B)
  noteAsyncDispatch(KEY_B)
  noteOutbound(KEY_B, { messageId: 20, text: 'wd-B' })
  noteTurnEnd(KEY_B)

  // One interval later each chat gets exactly one edit.
  await tickAt(EDIT_INTERVAL_MS)
  expect(capture.edits).toHaveLength(2)
  const editByMessage = new Map<PendingProgressEditCtx['messageId'], PendingProgressEditCtx>()
  for (const edit of capture.edits) {
    editByMessage.set(edit.messageId, edit)
  }
  expect(editByMessage.get(10)?.chatId).toBe('A')
  expect(editByMessage.get(10)?.threadId).toBe(null)
  expect(editByMessage.get(20)?.chatId).toBe('B')
  expect(editByMessage.get(20)?.threadId).toBe(42)

  // Clearing A must not stop B from ticking.
  clearPending(KEY_A, 'inbound')
  await tickAt(EDIT_INTERVAL_MS * 2)
  expect(editCountFor(10)).toHaveLength(1)
  expect(editCountFor(20)).toHaveLength(2)
})
|
|
354
|
+
})
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-turn pending-async progress — UAT regression gate for #1445.
|
|
3
|
+
*
|
|
4
|
+
* Verifies the post-fix behaviour shipped in `pending-work-progress.ts`:
|
|
5
|
+
* when a turn ends with the model having dispatched async background
|
|
6
|
+
* work (here `Bash` with `run_in_background:true`) and the model has
|
|
7
|
+
* stopped speaking, the framework keeps editing the model's last reply
|
|
8
|
+
* *in place* at ~60s intervals so the user sees ambient liveness during
|
|
9
|
+
* the wait.
|
|
10
|
+
*
|
|
11
|
+
* ## Pre-fix behaviour (what the user complained about)
|
|
12
|
+
*
|
|
13
|
+
* 1. User sends a long-task prompt at t=0.
|
|
14
|
+
* 2. Model runs the bash command with `run_in_background:true` and
|
|
15
|
+
* sends one PING reply at ~+20s ("Background sleep running…").
|
|
16
|
+
* 3. Turn ends.
|
|
17
|
+
* 4. Silence-poke ladder is per-turn — it stops the moment endTurn()
|
|
18
|
+
* fires. There is no cross-turn ambient surface.
|
|
19
|
+
* 5. The user sees NOTHING for ~5 min until the framework's 300s
|
|
20
|
+
* silence-poke fallback fires (or — as observed in the UAT that
|
|
21
|
+
* drove the fix — does not fire at all, because the turn already
|
|
22
|
+
* ended). Production data confirms: silence-poke succeeded/fired
|
|
23
|
+
* rate is 0–7% across hundreds of fires.
|
|
24
|
+
*
|
|
25
|
+
* ## Post-fix behaviour (this scenario asserts)
|
|
26
|
+
*
|
|
27
|
+
* 1. Model sends one fresh reply at ~+20s — the anchor.
|
|
28
|
+
* 2. Turn ends.
|
|
29
|
+
* 3. Framework edits the anchor in place at ~+80s, ~+140s, ~+200s,
|
|
30
|
+
* ~+260s, ~+320s with the suffix `\n\n— still working (Nm)`.
|
|
31
|
+
* 4. All edits are SILENT (`disable_notification: true` on the edit
|
|
32
|
+
* or, equivalently, an edit which never pushes a notification).
|
|
33
|
+
* The user sees ambient liveness without any added pings.
|
|
34
|
+
* 5. Sleep completes ~+350s; the model wakes (`BashOutput` /
|
|
35
|
+
* background-task notification path), turn re-starts,
|
|
36
|
+
* `clearPending` fires — no further edits.
|
|
37
|
+
*
|
|
38
|
+
* ## What this scenario asserts
|
|
39
|
+
*
|
|
40
|
+
* 1. At least one FRESH bot message lands (the initial anchor).
|
|
41
|
+
* 2. At least one EDIT to the anchor lands AFTER the initial reply,
|
|
42
|
+
* whose text contains the framework suffix `— still working (\d+m)`.
|
|
43
|
+
* 3. Every observed EDIT message is `silent === true` (no push
|
|
44
|
+
* notification fired by the in-place edit).
|
|
45
|
+
* 4. Edits are anchored to the SAME `messageId` as the initial
|
|
46
|
+
* fresh reply (single in-place surface, not spammy new sends).
|
|
47
|
+
*
|
|
48
|
+
* The full per-message trail is dumped to console for forensic
|
|
49
|
+
* inspection regardless of pass / fail.
|
|
50
|
+
*
|
|
51
|
+
* Wall-clock budget: ~8 min.
|
|
52
|
+
*/
|
|
53
|
+
|
|
54
|
+
import { describe, expect, it } from "vitest";
|
|
55
|
+
import { spinUp } from "../harness.js";
|
|
56
|
+
import type { ObservedMessage } from "../driver.js";
|
|
57
|
+
|
|
58
|
+
/** Length, in seconds, of the background `sleep` the prompt asks for. */
const SLEEP_SECONDS = 350;

// Worded to provoke the usual production sequence: a quick ack reply
// ("on it — background sleep running"), the sleep dispatched as a
// background Bash, the turn ending, and a later "done" once the sleep
// finishes. The framework fix under test is responsible for the ambient
// surface in between.
const PROMPT = [
  `Please run \`sleep ${SLEEP_SECONDS}\` in the background using the `,
  `Bash tool with \`run_in_background: true\` — this is a stress `,
  `test of the cross-turn ambient progress surface, so the sleep `,
  `duration matters. Send a brief one-line acknowledgement that `,
  `you've dispatched it (your natural beat-1 ack is fine), then `,
  `wait for it to complete. When it finishes, reply with exactly `,
  `the single word "done".`,
].join("");

/** Hard stop for the observation loop: the sleep itself plus 240s of slack. */
const OVERALL_DEADLINE_MS = SLEEP_SECONDS * 1000 + 240_000;
|
|
75
|
+
|
|
76
|
+
/** One observed bot message, as recorded for the forensic trail dump. */
interface TrailEntry {
  /** Milliseconds between sending the prompt and observing this message. */
  relMs: number;
  /** "edit" when the observed message was flagged as edited, otherwise "fresh". */
  kind: "fresh" | "edit";
  /** The observed message's `silent` flag. */
  silent: boolean;
  /** The observed message's id. */
  messageId: number;
  /** The observed message's text. */
  text: string;
}
|
|
83
|
+
|
|
84
|
+
// Matches text that ends with a blank line followed by
// "— still working (<digits>m)" — the suffix this scenario looks for on edits.
const SUFFIX_RE = /\n\n— still working \(\d+m\)$/;
|
|
85
|
+
|
|
86
|
+
/**
 * Right-pads `s` with spaces up to `n` characters for the trail table.
 *
 * @param s text to pad
 * @param n target width; strings already at least this long are returned unchanged
 * @returns `s` followed by however many spaces are needed to reach `n`
 */
function pad(s: string, n: number): string {
  return s.padEnd(n);
}
|
|
89
|
+
|
|
90
|
+
describe("uat: cross-turn pending-async ambient progress (#1445)", () => {
  // End-to-end scenario: send the long-task prompt, record every bot message
  // (fresh or edited) until things go quiet, dump the trail, then assert that
  // in-place "still working" edits landed on the initial anchor reply.
  it(
    "framework edits the anchor in place during a 350s background bash",
    async () => {
      const sc = await spinUp({ agent: "test-harness" });
      try {
        const startedAt = Date.now();
        await sc.sendDM(PROMPT);
        console.log(`[cross-turn-pending] t=0 prompt sent`);

        const trail: TrailEntry[] = [];

        // Initial wait window — give the model 90s to send its first
        // anchor reply. After every further message the window is
        // extended (see below); once the model's final fresh "done"
        // lands we wind down within 10s.
        let quiescenceDeadline = startedAt + 90_000;
        const overallDeadline = startedAt + OVERALL_DEADLINE_MS;
        let firstAnchorMsgId: number | null = null;
        let sawDone = false;
        // Hoisted out of the loop; matches "done" as a whole word, which
        // also covers a reply that is exactly "done".
        const doneWord = /\bdone\b/;

        while (Date.now() < overallDeadline) {
          const remaining = Math.min(
            quiescenceDeadline - Date.now(),
            overallDeadline - Date.now(),
          );
          if (remaining <= 0) break;
          try {
            const msg = await sc.expectMessage(
              (m: ObservedMessage) => m.fromBot,
              { from: "bot", timeout: remaining },
            );
            const rel = Date.now() - startedAt;
            const entry: TrailEntry = {
              relMs: rel,
              kind: msg.edited ? "edit" : "fresh",
              silent: msg.silent,
              messageId: msg.messageId,
              text: msg.text,
            };
            trail.push(entry);
            console.log(
              `[cross-turn-pending] +${(rel / 1000).toFixed(1)}s ` +
                `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
                `silent=${entry.silent} text=${JSON.stringify(
                  entry.text.slice(0, 120).replace(/\n/g, " ⏎ "),
                )}`,
            );
            // The first fresh message is the anchor all edits must target.
            if (firstAnchorMsgId == null && entry.kind === "fresh") {
              firstAnchorMsgId = entry.messageId;
            }
            // A later fresh message containing the word "done" is taken as
            // the model's final reply.
            const trimmedFinal = entry.text.trim().toLowerCase();
            const looksLikeDone =
              entry.kind === "fresh" &&
              entry.messageId !== firstAnchorMsgId &&
              doneWord.test(trimmedFinal);
            if (looksLikeDone) {
              sawDone = true;
              quiescenceDeadline = Date.now() + 10_000;
            } else {
              // Generous quiescence: every observed message buys another
              // 120s, so the loop keeps listening across the sleep window
              // and the model's wake + final reply.
              quiescenceDeadline = Date.now() + 120_000;
            }
          } catch (err: unknown) {
            // expectMessage rejected — presumably its timeout, i.e.
            // quiescence reached. Log the reason so a harness failure is
            // not silently mistaken for a quiet period in the output.
            const reason = err instanceof Error ? err.message : String(err);
            console.log(
              `[cross-turn-pending] +${(
                (Date.now() - startedAt) /
                1000
              ).toFixed(1)}s stopped waiting: ${reason}`,
            );
            break;
          }
        }

        // Dump full trail.
        console.log(
          "\n========== CROSS-TURN PENDING-PROGRESS TRAIL ==========",
        );
        console.log(`prompt: ${SLEEP_SECONDS}s background bash`);
        console.log(`total bot messages observed: ${trail.length}`);
        console.log(`anchor messageId: ${firstAnchorMsgId}`);
        console.log(`saw_done: ${sawDone}`);
        console.log("");
        console.log(" rel(s) kind silent msg text");
        console.log(" ------- ----- ------ ----------- ----");
        for (const e of trail) {
          console.log(
            ` ${pad((e.relMs / 1000).toFixed(1) + "s", 8)} ` +
              `${pad(e.kind, 6)} ${pad(String(e.silent), 7)} ` +
              `${pad(String(e.messageId), 12)} ` +
              `${e.text.slice(0, 80).replace(/\n/g, " ⏎ ")}`,
          );
        }
        console.log(
          "=======================================================\n",
        );

        // ── Regression assertions ─────────────────────────────────

        // (1) at least one fresh anchor reply landed
        const fresh = trail.filter((e) => e.kind === "fresh");
        expect(
          fresh.length,
          `no fresh bot replies observed — agent isn't responding`,
        ).toBeGreaterThanOrEqual(1);
        expect(firstAnchorMsgId).not.toBeNull();

        // (2) at least one edit landed whose text carries the
        // framework's "— still working (Nm)" suffix. This is THE
        // regression gate for the fix.
        const edits = trail.filter((e) => e.kind === "edit");
        const editsWithSuffix = edits.filter((e) => SUFFIX_RE.test(e.text));
        expect(
          editsWithSuffix.length,
          `no in-place edits with the "— still working (Nm)" suffix ` +
            `landed during the ${SLEEP_SECONDS}s background bash. ` +
            `Total edits observed: ${edits.length}. The cross-turn ` +
            `pending-progress fix is not active — see ` +
            `\`pending-work-progress.ts\` and the gateway hooks at ` +
            `\`noteAsyncDispatch\` / \`noteOutbound\` / \`noteTurnEnd\`. ` +
            `Pre-fix this number is zero by construction.`,
        ).toBeGreaterThanOrEqual(1);

        // (3) every observed edit is silent (an edit never pings per
        // Telegram semantics; we double-check via the receiving-side
        // flag so any framework regression that switched to fresh
        // sends fails loudly).
        const loudEdits = edits.filter((e) => !e.silent);
        expect(
          loudEdits.length,
          `${loudEdits.length} edit(s) pinged the device — edits ` +
            `should never fire a notification.`,
        ).toBe(0);

        // (4) every edit is anchored to the same messageId as the
        // initial fresh anchor — the framework is editing ONE
        // surface, not spamming.
        const offAnchorEdits = edits.filter(
          (e) => e.messageId !== firstAnchorMsgId,
        );
        expect(
          offAnchorEdits.length,
          `${offAnchorEdits.length} edit(s) were anchored to a ` +
            `different message id than the initial reply ` +
            `(${firstAnchorMsgId}). The framework should edit a ` +
            `single anchor in place, not a chain of messages.`,
        ).toBe(0);
      } finally {
        // Always release the harness, pass or fail.
        await sc.tearDown();
      }
    },
    // Per-test timeout: the observation budget plus 60s of slack.
    OVERALL_DEADLINE_MS + 60_000,
  );
});
|