switchroom 0.12.21 → 0.12.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,475 @@
1
+ /**
2
+ * Property tests for `inbound-delivery-machine.ts`.
3
+ *
4
+ * Per RFC `docs/rfcs/inbound-delivery-state-machine.md`: 5 invariants
5
+ * validated over arbitrary event schedules. A counterexample is the
6
+ * minimal evidence that the machine has a bug. The wedge-cluster
7
+ * bugs (v0.12.22 boot-wedge, overlapping-turn silence, #1564 sibling
8
+ * keys) become FAILING property tests if reintroduced.
9
+ *
10
+ * Schedules are generated by a seeded PRNG so failures are
11
+ * reproducible. We don't use fast-check (not a dependency) — the
12
+ * randomness is sufficient for the invariant shapes we're checking,
13
+ * and pure random + shrink-on-failure is documented in the RFC as
14
+ * acceptable for v1.
15
+ */
16
+
17
+ import { describe, expect, it } from 'vitest'
18
+ import {
19
+ type ChatKey,
20
+ type Effect,
21
+ type Event,
22
+ type InboundMessage,
23
+ type PermissionVerdict,
24
+ type State,
25
+ initialState,
26
+ OUTBOUND_RECENT_MS,
27
+ transition,
28
+ TURN_TTL_MS,
29
+ __chatIdOfKeyForTests,
30
+ } from '../gateway/inbound-delivery-machine'
31
+
32
+ // ─────────────────────────────────────────────────────────────────────
33
+ // Seeded PRNG (deterministic for reproducible failures)
34
+ // ─────────────────────────────────────────────────────────────────────
35
+
36
/**
 * Build a deterministic pseudo-random number generator.
 *
 * @param seed - Seed value; coerced to an unsigned 32-bit integer.
 * @returns A function that yields a float in [0, 1) on every call. Two
 *   generators created from the same seed yield the same sequence, which
 *   is what makes a failing schedule reproducible from its seed alone.
 */
function mulberry32(seed: number): () => number {
  let s = seed >>> 0
  return () => {
    // Step the 32-bit state by a fixed increment, wrapping on overflow.
    s = (s + 0x6d2b79f5) >>> 0
    let t = s
    t = Math.imul(t ^ (t >>> 15), t | 1)
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61)
    // Reinterpret as unsigned 32-bit, then scale into [0, 1).
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296
  }
}
46
+
47
+ // ─────────────────────────────────────────────────────────────────────
48
+ // Event generators
49
+ // ─────────────────────────────────────────────────────────────────────
50
+
51
// Chat ids that random keys are drawn from.
const CHATS = ['c1', 'c2', 'c3'] as const
// Thread ids that random keys are drawn from; includes '_' AND '0' to
// flag any latent sibling-key handling.
const THREADS = ['_', '1', '2', '0'] as const

/**
 * Join a chat id and a thread id into a `chat:thread` key.
 *
 * @param chat - Chat id part of the key.
 * @param thread - Thread id part of the key.
 * @returns The two parts separated by a single colon.
 */
function makeKey(chat: string, thread: string): ChatKey {
  return `${chat}:${thread}` as ChatKey
}
57
+
58
/** Mutable bookkeeping threaded through the generation of one schedule. */
interface ScheduleContext {
  /** Randomness source; every random decision consumes one call. */
  readonly rand: () => number
  /** Simulated clock, compared against `TURN_TTL_MS` by the machine. */
  now: number
  /** Last message id handed out; incremented before each new message. */
  msgIdCounter: number
  /** Last permission-request number handed out; incremented before use. */
  permIdCounter: number
}
64
+
65
/**
 * Pick a random `chat:thread` key from `CHATS` × `THREADS`.
 *
 * Consumes exactly two `ctx.rand()` draws — chat first, then thread.
 * That order must not change, or existing seeds stop reproducing the
 * same schedules.
 */
function randomKey(ctx: ScheduleContext): ChatKey {
  const c = CHATS[Math.floor(ctx.rand() * CHATS.length)]
  const t = THREADS[Math.floor(ctx.rand() * THREADS.length)]
  return makeKey(c, t)
}
70
+
71
/**
 * Create the next inbound message, with a fresh, strictly increasing id.
 *
 * @param ctx - Schedule bookkeeping; `msgIdCounter` is incremented.
 * @param isSteering - Value copied onto the message; defaults to `false`.
 * @returns A message whose payload echoes the new id.
 */
function nextMsg(ctx: ScheduleContext, isSteering = false): InboundMessage {
  const msgId = ++ctx.msgIdCounter
  return { msgId, isSteering, payload: { id: msgId } }
}
75
+
76
/**
 * Create the next permission verdict, with a fresh `req-N` request id.
 *
 * Every generated verdict has behavior `'allow'`.
 *
 * @param ctx - Schedule bookkeeping; `permIdCounter` is incremented.
 */
function nextVerdict(ctx: ScheduleContext): PermissionVerdict {
  const requestId = `req-${++ctx.permIdCounter}`
  return { requestId, behavior: 'allow', payload: { requestId } }
}
80
+
81
/**
 * Move the simulated clock forward by a random whole amount in
 * `[0, range)`. Consumes one `ctx.rand()` draw.
 *
 * @param ctx - Schedule bookkeeping; `now` is increased in place.
 * @param range - Exclusive upper bound of the step.
 */
function advanceTime(ctx: ScheduleContext, range: number): void {
  ctx.now += Math.floor(ctx.rand() * range)
}
84
+
85
/** Discriminant values of the machine's `Event` union. */
type EventKind = Event['kind']

// The event kinds the generator can emit.
// NOTE(review): nothing else in this file as shown references this
// constant — confirm whether it is still needed.
const EVENT_KINDS: readonly EventKind[] = [
  'bridgeUp',
  'bridgeDown',
  'inbound',
  'turnStart',
  'turnEnd',
  'modelOutbound',
  'permVerdict',
  'tick',
]
96
+
97
/**
 * Draw the next event of a random schedule.
 *
 * The kind is chosen from a single `ctx.rand()` draw with these weights:
 * inbound 30%, tick 15%, turnEnd / modelOutbound / turnStart 10% each,
 * permVerdict 7%, bridgeUp 9%, bridgeDown 9% — so schedules mostly
 * exercise the steady-state hot path.
 *
 * The number and order of `ctx.rand()` calls is part of the contract: a
 * seed only reproduces a schedule while that order stays unchanged.
 *
 * @param ctx - Schedule bookkeeping. A `tick` advances `ctx.now`;
 *   `inbound` and `permVerdict` bump the id counters.
 * @param prior - State before this event; read only to find keys with a
 *   turn in flight when generating a `turnEnd`.
 * @returns The generated event, stamped with the current `ctx.now`.
 */
function generateEvent(ctx: ScheduleContext, prior: State): Event {
  const r = ctx.rand()
  let kind: EventKind
  if (r < 0.3) kind = 'inbound'
  else if (r < 0.45) kind = 'tick'
  else if (r < 0.55) kind = 'turnEnd'
  else if (r < 0.65) kind = 'modelOutbound'
  else if (r < 0.75) kind = 'turnStart'
  else if (r < 0.82) kind = 'permVerdict'
  else if (r < 0.91) kind = 'bridgeUp'
  else kind = 'bridgeDown'

  switch (kind) {
    case 'bridgeUp':
      return { kind: 'bridgeUp', at: ctx.now }
    case 'bridgeDown':
      return { kind: 'bridgeDown', at: ctx.now }
    case 'inbound': {
      // Roughly one inbound in ten is a steering message.
      const isSteering = ctx.rand() < 0.1
      return { kind: 'inbound', key: randomKey(ctx), msg: nextMsg(ctx, isSteering), at: ctx.now }
    }
    case 'turnStart':
      return { kind: 'turnStart', key: randomKey(ctx), at: ctx.now }
    case 'turnEnd': {
      // Bias toward an actually-active key so turnEnd is meaningful
      // most of the time.
      const activeKeys = [...prior.perKey.entries()]
        .filter(([, v]) => v.turnStartedAt != null)
        .map(([k]) => k)
      const key = activeKeys.length > 0 && ctx.rand() < 0.8
        ? activeKeys[Math.floor(ctx.rand() * activeKeys.length)]
        : randomKey(ctx)
      return { kind: 'turnEnd', key, at: ctx.now, outboundEmitted: ctx.rand() < 0.85 }
    }
    case 'modelOutbound':
      return { kind: 'modelOutbound', key: randomKey(ctx), at: ctx.now }
    case 'permVerdict':
      return { kind: 'permVerdict', verdict: nextVerdict(ctx), at: ctx.now }
    case 'tick':
      // Tick advances time meaningfully — sometimes a small step,
      // sometimes a big one that crosses TURN_TTL_MS so the fallback
      // path is exercised.
      advanceTime(ctx, ctx.rand() < 0.1 ? TURN_TTL_MS * 2 : 5_000)
      return { kind: 'tick', now: ctx.now }
  }
}
145
+
146
+ // ─────────────────────────────────────────────────────────────────────
147
+ // Property: simulate a schedule and assert invariants
148
+ // ─────────────────────────────────────────────────────────────────────
149
+
150
/** One step of a simulated schedule: the event and what it produced. */
interface TraceEntry {
  /** The event fed to `transition`. */
  readonly event: Event
  /** Machine state the event was applied to. */
  readonly stateBefore: State
  /** Machine state returned by `transition`. */
  readonly stateAfter: State
  /** Effects returned by `transition` for this event. */
  readonly effects: readonly Effect[]
}
156
+
157
/**
 * Simulate one random schedule against the machine and record each step.
 *
 * Starts from `initialState()` with the clock at 0, then repeatedly
 * generates an event, applies `transition`, and records the before/after
 * state and emitted effects.
 *
 * @param seed - PRNG seed; the same seed always yields the same trace.
 * @param eventCount - Number of events to generate and apply.
 * @returns One trace entry per event, in order.
 */
function runSchedule(seed: number, eventCount: number): TraceEntry[] {
  const ctx: ScheduleContext = {
    rand: mulberry32(seed),
    now: 0,
    msgIdCounter: 0,
    permIdCounter: 0,
  }
  let state = initialState()
  const trace: TraceEntry[] = []
  for (let i = 0; i < eventCount; i++) {
    const event = generateEvent(ctx, state)
    const { state: stateAfter, effects } = transition(state, event)
    trace.push({ event, stateBefore: state, stateAfter, effects })
    state = stateAfter
    // Advance clock a small amount between events so timestamps don't
    // cluster at zero.
    advanceTime(ctx, 1_000)
  }
  return trace
}
177
+
178
/**
 * Render a trace as a human-readable counterexample for failure messages.
 *
 * The first line carries the seed and trace length, followed by a blank
 * line. Each event then gets one line (`#index kind json`), plus an
 * extra arrow line listing effect kinds when the event produced any.
 *
 * @param seed - Seed that generated the trace.
 * @param trace - Steps to print, usually truncated at the failing event.
 * @returns The lines joined with `\n`.
 */
function formatCounterexample(seed: number, trace: readonly TraceEntry[]): string {
  const lines = [`seed=${seed}, len=${trace.length}`, '']
  trace.forEach((step, index) => {
    lines.push(`#${index} ${step.event.kind} ${JSON.stringify(step.event)}`)
    if (step.effects.length > 0) {
      lines.push(` → ${step.effects.map((e) => e.kind).join(', ')}`)
    }
  })
  return lines.join('\n')
}
189
+
190
+ // ─────────────────────────────────────────────────────────────────────
191
+ // Invariant #1 — Every inbound is delivered XOR persisted
192
+ // ─────────────────────────────────────────────────────────────────────
193
+
194
// For every `inbound` event, the effects of that same transition must
// either deliver the message to the bridge, or buffer AND persist it —
// never a mix, never nothing.
describe('Invariant #1 — every inbound is delivered XOR persisted', () => {
  it('holds across 1000 random schedules of 50 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 50)
      for (let i = 0; i < trace.length; i++) {
        const t = trace[i]
        if (t.event.kind !== 'inbound') continue
        const msgId = t.event.msg.msgId
        const delivered = t.effects.some(
          (e) => e.kind === 'deliverToBridge' && e.msg.msgId === msgId,
        )
        const buffered = t.effects.some(
          (e) => e.kind === 'bufferInbound' && e.msg.msgId === msgId,
        )
        const persisted = t.effects.some(
          (e) => e.kind === 'persistInbound' && e.msg.msgId === msgId,
        )
        // Throws with the counterexample truncated at the failing event.
        const violation = (detail: string): never => {
          throw new Error(
            `Invariant #1 violated at event #${i}: msg ${msgId} ${detail}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
        // Contract: delivered XOR (buffered AND persisted). Never both
        // delivered AND buffered/persisted. Never neither.
        if (delivered && (buffered || persisted)) {
          // Name what was actually observed next to the delivery; this
          // branch also fires when only `persisted` is set.
          const also = [buffered ? 'buffered' : '', persisted ? 'persisted' : '']
            .filter((s) => s !== '')
            .join(' and ')
          violation(`both delivered and ${also}`)
        }
        if (!delivered && !buffered) {
          violation('neither delivered nor buffered')
        }
        if (buffered && !persisted) {
          violation('buffered but not persisted')
        }
      }
    }
    // Failures surface as thrown errors above; this keeps an explicit
    // assertion in the test body.
    expect(true).toBe(true)
  })
})
236
+
237
+ // ─────────────────────────────────────────────────────────────────────
238
+ // Invariant #2 — setTurnStarted has a matching clearTurnStarted
239
+ // before the next end-of-life event (turnEnd, bridgeDown for active,
240
+ // or tick past TURN_TTL).
241
+ // ─────────────────────────────────────────────────────────────────────
242
+
243
// Checked form of the invariant: after any tick, no key may still hold a
// `turnStartedAt` older than TURN_TTL_MS, unless a recent outbound
// (within OUTBOUND_RECENT_MS of the tick) suppressed the fallback.
describe('Invariant #2 — turnStarted is matched by clearTurnStarted before EOL', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (let i = 0; i < trace.length; i++) {
        const t = trace[i]
        // There is no "current time" outside tick events, and only ticks
        // can detect TTL expiration. A stale entry may therefore persist
        // across non-tick events until a tick processes it — that's the
        // design, so non-tick steps are not inspected.
        if (t.event.kind !== 'tick') continue
        const now = t.event.now
        for (const [k, v] of t.stateAfter.perKey) {
          if (v.turnStartedAt == null) continue
          const age = now - v.turnStartedAt
          if (age <= TURN_TTL_MS) continue
          // Recent outbound suppresses fallback (invariant #5) — if
          // suppressed, the entry is allowed to stay.
          const suppressed =
            v.lastOutboundAt != null && now - v.lastOutboundAt < OUTBOUND_RECENT_MS
          if (suppressed) continue
          throw new Error(
            `Invariant #2 violated at event #${i}: key ${k} has stale turnStartedAt ` +
              `(age=${age}ms > ${TURN_TTL_MS}ms) after a tick.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
      }
    }
    // Failures surface as thrown errors above; this keeps an explicit
    // assertion in the test body.
    expect(true).toBe(true)
  })
})
287
+
288
+ // ─────────────────────────────────────────────────────────────────────
289
+ // Invariant #3 — per-chat sibling-key cleanup on turnEnd
290
+ // ─────────────────────────────────────────────────────────────────────
291
+
292
// After a `turnEnd` for any key, no key sharing that key's chat id may
// still have `turnStartedAt` set.
describe('Invariant #3 — turnEnd sweeps sibling keys for the same chatId', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (let i = 0; i < trace.length; i++) {
        const t = trace[i]
        if (t.event.kind !== 'turnEnd') continue
        const chatId = __chatIdOfKeyForTests(t.event.key)
        // After turnEnd, no sibling key for this chatId should retain
        // turnStartedAt != null.
        for (const [k, v] of t.stateAfter.perKey) {
          if (__chatIdOfKeyForTests(k) !== chatId) continue
          if (v.turnStartedAt == null) continue
          throw new Error(
            `Invariant #3 violated at event #${i}: turnEnd for ${t.event.key} ` +
              `left sibling key ${k} with turnStartedAt=${v.turnStartedAt}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
      }
    }
    // Failures surface as thrown errors above; this keeps an explicit
    // assertion in the test body.
    expect(true).toBe(true)
  })
})
317
+
318
+ // ─────────────────────────────────────────────────────────────────────
319
+ // Invariant #4 — permVerdict delivered iff bridge alive
320
+ // ─────────────────────────────────────────────────────────────────────
321
+
322
// A permission verdict must be delivered when the bridge was alive before
// the event, persisted when it was dead, and never both.
describe('Invariant #4 — permVerdict delivered iff bridge alive', () => {
  it('holds across 1000 random schedules of 50 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 50)
      for (let i = 0; i < trace.length; i++) {
        const t = trace[i]
        if (t.event.kind !== 'permVerdict') continue
        // "Alive" is judged on the state the verdict arrived in.
        const alive = t.stateBefore.global.kind !== 'bridge_dead'
        const delivered = t.effects.some((e) => e.kind === 'deliverPermVerdict')
        const persisted = t.effects.some((e) => e.kind === 'persistPermVerdict')
        // Throws with the counterexample truncated at the failing event.
        const violation = (detail: string): never => {
          throw new Error(
            `Invariant #4 violated at #${i}: ${detail}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
        if (alive && !delivered) {
          violation('bridge alive but permVerdict not delivered')
        }
        if (!alive && !persisted) {
          violation('bridge dead but permVerdict not persisted')
        }
        if (delivered && persisted) {
          violation('permVerdict both delivered and persisted')
        }
      }
    }
    // Failures surface as thrown errors above; this keeps an explicit
    // assertion in the test body.
    expect(true).toBe(true)
  })
})
355
+
356
+ // ─────────────────────────────────────────────────────────────────────
357
+ // Invariant #5 — spurious-fallback suppression (the 2026-05-20
358
+ // overlapping-turn silence bug becomes unrepresentable)
359
+ // ─────────────────────────────────────────────────────────────────────
360
+
361
// A tick must not fire a `fallback`-level poke for a key whose last
// outbound (as recorded before the tick) is within OUTBOUND_RECENT_MS of
// the tick's `now`.
describe('Invariant #5 — fallback poke suppressed if model recently broke silence', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (let i = 0; i < trace.length; i++) {
        const t = trace[i]
        if (t.event.kind !== 'tick') continue
        const now = t.event.now
        for (const eff of t.effects) {
          if (eff.kind !== 'firePoke' || eff.level !== 'fallback') continue
          // Look up the key's lastOutboundAt at stateBefore. A key with
          // no entry, or no outbound yet, cannot violate the invariant.
          const lastOutboundAt = t.stateBefore.perKey.get(eff.key)?.lastOutboundAt
          if (lastOutboundAt == null) continue
          const sinceOutbound = now - lastOutboundAt
          if (sinceOutbound < OUTBOUND_RECENT_MS) {
            throw new Error(
              `Invariant #5 violated at #${i}: fallback fired for ${eff.key} ` +
                `but model produced outbound only ${sinceOutbound}ms ago ` +
                `(threshold ${OUTBOUND_RECENT_MS}ms).\n` +
                formatCounterexample(seed, trace.slice(0, i + 1)),
            )
          }
        }
      }
    }
    // Failures surface as thrown errors above; this keeps an explicit
    // assertion in the test body.
    expect(true).toBe(true)
  })
})
390
+
391
+ // ─────────────────────────────────────────────────────────────────────
392
+ // Targeted regression: the v0.12.22 boot-wedge case
393
+ // ─────────────────────────────────────────────────────────────────────
394
+
395
// Fixed scenario: bridge comes up, then the very first inbound arrives.
// It must be delivered straight to the bridge rather than buffered, and
// the machine must report a turn in progress afterwards.
describe('Targeted regression — v0.12.22 boot-wedge', () => {
  it('first inbound on a fresh bridge delivers, not buffers', () => {
    const booted = transition(initialState(), { kind: 'bridgeUp', at: 0 }).state
    const { state: afterInbound, effects } = transition(booted, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 1, isSteering: false, payload: {} },
      at: 100,
    })
    const delivered = effects.some((e) => e.kind === 'deliverToBridge')
    const buffered = effects.some((e) => e.kind === 'bufferInbound')
    expect(delivered).toBe(true)
    expect(buffered).toBe(false)
    expect(afterInbound.global.kind).toBe('bridge_alive_in_turn')
  })
})
412
+
413
+ // ─────────────────────────────────────────────────────────────────────
414
+ // Targeted regression: the 2026-05-20 overlapping-turn silence case
415
+ // ─────────────────────────────────────────────────────────────────────
416
+
417
// Two fixed scenarios in one test: first the case where the last outbound
// is too old to suppress the fallback (it must fire), then the case where
// the outbound is recent (it must be suppressed).
describe('Targeted regression — overlapping-turn silence (the post-v0.12.22 case)', () => {
  it("suppresses fallback when model produced outbound in prior overlapping turn", () => {
    let s = initialState()
    s = transition(s, { kind: 'bridgeUp', at: 0 }).state

    // Turn A starts at t=1000 and ends with an outbound at t=5000.
    s = transition(s, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 1, isSteering: false, payload: {} },
      at: 1000,
    }).state
    s = transition(s, {
      kind: 'turnEnd',
      key: 'c1:_' as ChatKey,
      at: 5000,
      outboundEmitted: true,
    }).state

    // Turn B starts at t=6000 (immediately after A).
    s = transition(s, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 2, isSteering: false, payload: {} },
      at: 6000,
    }).state

    // Tick at t=310_000: turn B is 304s old (past TURN_TTL_MS=300_000).
    // The model's last outbound was at t=5000 — just before turn B
    // started, but 305s before now.
    // Per invariant #5: suppress fallback iff lastOutbound is within
    // OUTBOUND_RECENT_MS (60s) of NOW. 310_000 - 5_000 = 305_000 ms;
    // not within the suppression window. So fallback SHOULD fire here.
    const tickResult = transition(s, { kind: 'tick', now: 310_000 })
    const fired = tickResult.effects.some(
      (e) => e.kind === 'firePoke' && e.level === 'fallback',
    )
    expect(fired).toBe(true)

    // Now start over and verify the suppression case: outbound at
    // t=305_000, tick at t=310_000 — only 5s elapsed since outbound.
    let s2 = initialState()
    s2 = transition(s2, { kind: 'bridgeUp', at: 0 }).state
    s2 = transition(s2, {
      kind: 'inbound',
      key: 'c2:_' as ChatKey,
      msg: { msgId: 3, isSteering: false, payload: {} },
      at: 1000,
    }).state
    // Model emitted an outbound very recently.
    s2 = transition(s2, { kind: 'modelOutbound', key: 'c2:_' as ChatKey, at: 305_000 }).state
    // Tick: turn is 309s old, outbound was 5s ago → suppress.
    const suppress = transition(s2, { kind: 'tick', now: 310_000 })
    const firedSuppressed = suppress.effects.some(
      (e) => e.kind === 'firePoke' && e.level === 'fallback',
    )
    expect(firedSuppressed).toBe(false)
  })
})
@@ -0,0 +1,157 @@
1
+ /**
2
+ * JTBD scenario — always-on vision: first message after restart replies
3
+ * quickly, NOT 5 minutes later via silence-poke fallback.
4
+ *
5
+ * Vision (`reference/vision.md`, see [[project_vision_reanchor_human_assistants]]):
6
+ * agents are always-on specialist exec-assistants. A 5-min blank
7
+ * window on the first message after restart is what BROKEN feels
8
+ * like to a user trying to use their assistant.
9
+ *
10
+ * ## The wedge this guards against
11
+ *
12
+ * Pre-v0.12.22, every agent restart produced ~5 min blank on the
13
+ * first user message in each thread:
14
+ *
15
+ * 1. User sends msg → handleInbound runs the fresh-turn branch
16
+ * → activeTurnStartedAt.set(key, now) at gateway.ts:7357
17
+ * 2. The #1556 delivery gate further down (~7551) checks
18
+ * activeTurnStartedAt.size > 0 to decide if a turn is "already
19
+ * in flight" — sees the entry it just wrote → buffer-until-idle
20
+ * 3. Inbound stuck in pendingInboundBuffer. Bridge never sees it.
21
+ * Claude never replies. activeTurnStartedAt[key] stays set.
22
+ * 4. ~300s later silence-poke framework-fallback fires, drains
23
+ * the buffer, the reply finally lands — five minutes late.
24
+ *
25
+ * Documented in `feedback_5min_restart_wedge_violates_vision.md`.
26
+ * Fix is a one-line snapshot of the live size at receipt-time before
27
+ * the fresh-turn branch mutates the Map.
28
+ *
29
+ * ## What this UAT asserts
30
+ *
31
+ * After a deliberate restart, the FIRST message in a fresh thread
32
+ * gets a reply within a budget that excludes the silence-poke
33
+ * fallback floor (300s). Concretely we assert < 120s, which is
34
+ * generous for slow LLM replies but well below the wedge symptom.
35
+ *
36
+ * The test also makes a stricter observation log so a future
37
+ * regression that lands BETWEEN "wedge fixed" (~LLM latency) and
38
+ * "wedge present" (~5 min) is visible — e.g. some slow startup
39
+ * path that takes 60-90s would pass the contract but bear
40
+ * investigation.
41
+ *
42
+ * ## Why this scenario specifically
43
+ *
44
+ * - `smoke-dm-reply.test.ts` covers steady-state inbound→reply but
45
+ * does NOT restart the agent first, so the wedge surfaces as a
46
+ * "first message is slow" pattern that the smoke test would
47
+ * silently absorb on warm runs.
48
+ * - `silent-end-recovery-dm.test.ts` covers mid-turn silent-end →
49
+ * no-reply, but at 6 min budget — too generous to catch the
50
+ * 5-min wedge as a regression.
51
+ * - This is the FIRST UAT to explicitly tie the assertion to the
52
+ * "always-on" vision and measure first-after-restart TTFO.
53
+ */
54
+
55
+ import { describe, it, expect, beforeAll } from "vitest";
56
+ import { execSync } from "node:child_process";
57
+ import { spinUp } from "../harness.js";
58
+
59
// Name of the agent this scenario restarts and then messages.
const AGENT = "test-harness";

// Budget for the marker-safe restart itself (per
// feedback_agent_restart_needs_sudo_when_running.md, restart blocks
// ~30s as the gateway's bridge reattaches).
const RESTART_BUDGET_MS = 90_000;

// Hard contract: first-after-restart reply must land in under 2 min.
// This is generous for slow LLM replies but well below the 5-min
// silence-poke fallback floor — a regression of the #1556 wedge pushes
// TTFO to ~300s, so the reply wait (capped at this budget) expires and
// the test fails.
const HARD_REPLY_BUDGET_MS = 120_000;

// Vision-aligned target: real expectation is well under 30s on a
// healthy fleet. A pass between 30-120s is yellow — covered by the
// contract but worth logging for forensic visibility.
const VISION_REPLY_BUDGET_MS = 30_000;
76
+
77
/**
 * Probe whether `sudo` can run without prompting for a password.
 *
 * Runs `sudo -n true` with a 2s timeout and discards its output.
 *
 * @returns `true` when the command exits successfully; `false` on any
 *   failure (non-zero exit, timeout, or `sudo` not being available).
 */
function canShellSudo(): boolean {
  try {
    execSync("sudo -n true", { stdio: "ignore", timeout: 2_000 });
    return true;
  } catch {
    // Any failure means the scenario cannot run here; the caller skips.
    return false;
  }
}
85
+
86
/**
 * Force-restart an agent through the `switchroom` CLI under `sudo -n`.
 *
 * Blocks until the CLI exits or `RESTART_BUDGET_MS` elapses.
 *
 * @param name - Agent name. Must start with a letter or digit and
 *   contain only letters, digits, `.`, `_` or `-`, because it is placed
 *   on a shell command line.
 * @throws Error when `name` is not a safe identifier, or when the
 *   restart command fails or times out (raised by `execSync`).
 */
function restartAgent(name: string): void {
  // Reject anything that could be read as shell syntax or as a CLI
  // option before it reaches the shell.
  if (!/^[A-Za-z0-9][A-Za-z0-9._-]*$/.test(name)) {
    throw new Error(
      `restartAgent: refusing unsafe agent name ${JSON.stringify(name)}`,
    );
  }
  // Marker-safe restart per memory feedback_compose_rollout.md +
  // feedback_agent_restart_needs_sudo_when_running.md. Apply step
  // self-elevates internally; restart needs the wrapper. We don't
  // call apply here — the agent scaffolds are already current; only
  // the in-memory state needs to reset.
  // PATH and HOME are forwarded explicitly via `env`; they are quoted so
  // values containing spaces survive word splitting.
  execSync(
    `sudo -n env PATH="$PATH" HOME="$HOME" switchroom agent restart ${name} --force`,
    { stdio: ["ignore", "pipe", "pipe"], timeout: RESTART_BUDGET_MS },
  );
}
97
+
98
// This scenario requires NOPASSWD sudo + the switchroom CLI on PATH on
// the harness host. Skip on CI runners that don't expose those.
const sudoOk = canShellSudo();

(sudoOk ? describe : describe.skip)(
  "uat: always-on after restart",
  () => {
    beforeAll(async () => {
      restartAgent(AGENT);
      // Brief settle so the bridge sidecar can finish its reattach
      // before we send the first inbound. This is a fixed delay; it
      // does not wait for the bridge-register log line.
      await new Promise<void>((resolve) => setTimeout(resolve, 5_000));
    }, RESTART_BUDGET_MS + 10_000);

    it(
      "first message after fresh restart → reply within 2 min (NOT the 5-min wedge)",
      async () => {
        const sc = await spinUp({ agent: AGENT });
        try {
          // TTFO is measured from just before the send to the moment
          // the first non-empty bot message is observed.
          const sendStart = Date.now();
          await sc.sendDM("ping — JTBD always-on UAT");

          const firstReply = await sc.expectMessage(/\S/, {
            from: "bot",
            timeout: HARD_REPLY_BUDGET_MS,
          });
          const ttfo = Date.now() - sendStart;

          expect(firstReply.text.length).toBeGreaterThan(0);

          // HARD CONTRACT: the wedge symptom is 300s+ TTFO. Anything
          // ≥ HARD_REPLY_BUDGET_MS (120s) flags a regression of the
          // #1556 self-blocking gate. The explicit throw carries the
          // diagnostic message; the expect below states the same
          // contract as an assertion.
          if (ttfo >= HARD_REPLY_BUDGET_MS) {
            throw new Error(
              `[always-on] first-post-restart reply took ${ttfo}ms — ` +
                `matches the #1556 wedge symptom (5-min silence-poke fallback). ` +
                `Vision broken; see feedback_5min_restart_wedge_violates_vision.md`,
            );
          }
          expect(ttfo).toBeLessThan(HARD_REPLY_BUDGET_MS);

          // Yellow-band log: passes the contract but degraded from the
          // vision target. Worth investigating if this fires regularly.
          if (ttfo >= VISION_REPLY_BUDGET_MS) {
            console.warn(
              `[always-on] first-post-restart TTFO=${ttfo}ms — passed ` +
                `contract (${HARD_REPLY_BUDGET_MS}ms) but slower than the ` +
                `vision target (${VISION_REPLY_BUDGET_MS}ms). Forensic flag.`,
            );
          }
        } finally {
          // Always release the harness, even when an assertion fails.
          await sc.tearDown();
        }
      },
      HARD_REPLY_BUDGET_MS + 10_000,
    );
  },
);