switchroom 0.12.22 → 0.12.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,475 @@
1
+ /**
2
+ * Property tests for `inbound-delivery-machine.ts`.
3
+ *
4
+ * Per RFC `docs/rfcs/inbound-delivery-state-machine.md`: 5 invariants
5
+ * validated over arbitrary event schedules. A counterexample is the
6
+ * minimal evidence that the machine has a bug. The wedge-cluster
7
+ * bugs (v0.12.22 boot-wedge, overlapping-turn silence, #1564 sibling
8
+ * keys) become FAILING property tests if reintroduced.
9
+ *
10
+ * Schedules are generated by a seeded PRNG so failures are
11
+ * reproducible. We don't use fast-check (not a dependency) — the
12
+ * randomness is sufficient for the invariant shapes we're checking,
13
+ * and pure random + shrink-on-failure is documented in the RFC as
14
+ * acceptable for v1.
15
+ */
16
+
17
+ import { describe, expect, it } from 'vitest'
18
+ import {
19
+ type ChatKey,
20
+ type Effect,
21
+ type Event,
22
+ type InboundMessage,
23
+ type PermissionVerdict,
24
+ type State,
25
+ initialState,
26
+ OUTBOUND_RECENT_MS,
27
+ transition,
28
+ TURN_TTL_MS,
29
+ __chatIdOfKeyForTests,
30
+ } from '../gateway/inbound-delivery-machine'
31
+
32
+ // ─────────────────────────────────────────────────────────────────────
33
+ // Seeded PRNG (deterministic for reproducible failures)
34
+ // ─────────────────────────────────────────────────────────────────────
35
+
36
/**
 * Builds a deterministic pseudo-random generator from a numeric seed.
 *
 * @param seed - Initial state; coerced to an unsigned 32-bit integer.
 * @returns A function that yields the next value of the sequence on each
 *   call, scaled into the range [0, 1).
 */
function mulberry32(seed: number): () => number {
  let state = seed >>> 0
  const next = (): number => {
    // Advance the 32-bit counter, wrapping on overflow.
    state = (state + 0x6d2b79f5) >>> 0
    const mixed = Math.imul(state ^ (state >>> 15), state | 1)
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61))
    // Force an unsigned result before dividing by 2^32.
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296
  }
  return next
}
46
+
47
+ // ─────────────────────────────────────────────────────────────────────
48
+ // Event generators
49
+ // ─────────────────────────────────────────────────────────────────────
50
+
51
// Chat and thread id pools the generators draw from. THREADS includes
// both '_' AND '0' to flag any latent sibling-key handling.
const CHATS = ['c1', 'c2', 'c3'] as const
const THREADS = ['_', '1', '2', '0'] as const

/** Joins a chat id and a thread id into a `chat:thread` key. */
function makeKey(chat: string, thread: string): ChatKey {
  const joined = [chat, thread].join(':')
  return joined as ChatKey
}
57
+
58
/** Mutable bookkeeping threaded through the generation of one schedule. */
interface ScheduleContext {
  /** Source of randomness for every generator decision. */
  readonly rand: () => number
  /** Current simulated clock value. */
  now: number
  /** Last message id handed out by `nextMsg`. */
  msgIdCounter: number
  /** Last permission-request number handed out by `nextVerdict`. */
  permIdCounter: number
}

/**
 * Draws a chat id, then a thread id (two `rand` calls, in that order),
 * and combines them into a key.
 */
function randomKey(ctx: ScheduleContext): ChatKey {
  const chatIndex = Math.floor(ctx.rand() * CHATS.length)
  const threadIndex = Math.floor(ctx.rand() * THREADS.length)
  return makeKey(CHATS[chatIndex], THREADS[threadIndex])
}
70
+
71
/**
 * Allocates the next message id on `ctx` and wraps it in an inbound message.
 *
 * @param ctx - Schedule context; `msgIdCounter` is incremented in place.
 * @param isSteering - Flag copied onto the message (defaults to `false`).
 * @returns A message whose `msgId` and `payload.id` equal the new counter value.
 */
function nextMsg(ctx: ScheduleContext, isSteering = false): InboundMessage {
  ctx.msgIdCounter += 1
  const id = ctx.msgIdCounter
  const message: InboundMessage = { msgId: id, isSteering, payload: { id } }
  return message
}
75
+
76
/**
 * Allocates the next permission-request id on `ctx` (`req-1`, `req-2`, …)
 * and builds an `allow` verdict carrying that id.
 *
 * @param ctx - Schedule context; `permIdCounter` is incremented in place.
 */
function nextVerdict(ctx: ScheduleContext): PermissionVerdict {
  ctx.permIdCounter += 1
  const requestId = 'req-' + String(ctx.permIdCounter)
  return { requestId, behavior: 'allow', payload: { requestId } }
}
80
+
81
/**
 * Moves the simulated clock forward by `floor(rand() * range)`, drawing
 * exactly one value from `ctx.rand`.
 *
 * @param ctx - Schedule context; `now` is updated in place.
 * @param range - Scale applied to the random draw before flooring.
 */
function advanceTime(ctx: ScheduleContext, range: number): void {
  const step = Math.floor(ctx.rand() * range)
  ctx.now = ctx.now + step
}
84
+
85
type EventKind = Event['kind']
const EVENT_KINDS: readonly EventKind[] = [
  'bridgeUp',
  'bridgeDown',
  'inbound',
  'turnStart',
  'turnEnd',
  'modelOutbound',
  'permVerdict',
  'tick',
]

// Cumulative upper bounds for the event-kind roll, checked in order.
// Inbound, tick and turnEnd get the widest bands so schedules exercise
// the steady-state hot path; a roll at or above the last bound falls
// through to 'bridgeDown'.
const KIND_THRESHOLDS: ReadonlyArray<readonly [number, EventKind]> = [
  [0.3, 'inbound'],
  [0.45, 'tick'],
  [0.55, 'turnEnd'],
  [0.65, 'modelOutbound'],
  [0.75, 'turnStart'],
  [0.82, 'permVerdict'],
  [0.91, 'bridgeUp'],
]

/**
 * Produces the next random event of a schedule.
 *
 * @param ctx - Schedule context supplying randomness, the clock and the
 *   id counters; a generated tick advances `ctx.now` before being returned.
 * @param prior - State before the event. Only read to find keys with an
 *   active turn, so that `turnEnd` usually targets one of them.
 * @returns The generated event, stamped with the current `ctx.now`.
 */
function generateEvent(ctx: ScheduleContext, prior: State): Event {
  const roll = ctx.rand()
  let kind: EventKind = 'bridgeDown'
  for (const [bound, candidate] of KIND_THRESHOLDS) {
    if (roll < bound) {
      kind = candidate
      break
    }
  }

  // Only the tick branch moves the clock, so every other branch can
  // share this reading.
  const at = ctx.now

  switch (kind) {
    case 'bridgeUp':
      return { kind: 'bridgeUp', at }
    case 'bridgeDown':
      return { kind: 'bridgeDown', at }
    case 'inbound': {
      // Draw order matters for reproducibility: steering flag, then key,
      // then message id.
      const isSteering = ctx.rand() < 0.1
      const key = randomKey(ctx)
      const msg = nextMsg(ctx, isSteering)
      return { kind: 'inbound', key, msg, at }
    }
    case 'turnStart':
      return { kind: 'turnStart', key: randomKey(ctx), at }
    case 'turnEnd': {
      // Bias toward an actually-active key so turnEnd is meaningful
      // most of the time.
      const activeKeys: ChatKey[] = []
      for (const [candidateKey, entry] of prior.perKey.entries()) {
        if (entry.turnStartedAt != null) activeKeys.push(candidateKey)
      }
      const useActive = activeKeys.length > 0 && ctx.rand() < 0.8
      const key = useActive
        ? activeKeys[Math.floor(ctx.rand() * activeKeys.length)]
        : randomKey(ctx)
      const outboundEmitted = ctx.rand() < 0.85
      return { kind: 'turnEnd', key, at, outboundEmitted }
    }
    case 'modelOutbound':
      return { kind: 'modelOutbound', key: randomKey(ctx), at }
    case 'permVerdict':
      return { kind: 'permVerdict', verdict: nextVerdict(ctx), at }
    case 'tick': {
      // Tick advances time meaningfully — usually a small step, sometimes
      // a big one that can cross TURN_TTL_MS so the fallback path is
      // exercised.
      const range = ctx.rand() < 0.1 ? TURN_TTL_MS * 2 : 5_000
      advanceTime(ctx, range)
      return { kind: 'tick', now: ctx.now }
    }
  }
}
145
+
146
+ // ─────────────────────────────────────────────────────────────────────
147
+ // Property: simulate a schedule and assert invariants
148
+ // ─────────────────────────────────────────────────────────────────────
149
+
150
/** One step of a simulated schedule: the event plus what the machine did with it. */
interface TraceEntry {
  readonly event: Event
  readonly stateBefore: State
  readonly stateAfter: State
  readonly effects: readonly Effect[]
}

/**
 * Generates a schedule of random events from `seed` and feeds it through
 * `transition`, starting from `initialState()`.
 *
 * @param seed - Seed for the PRNG; the same seed yields the same trace.
 * @param eventCount - Number of events to generate and apply.
 * @returns One trace entry per event, in application order.
 */
function runSchedule(seed: number, eventCount: number): TraceEntry[] {
  const ctx: ScheduleContext = {
    rand: mulberry32(seed),
    now: 0,
    msgIdCounter: 0,
    permIdCounter: 0,
  }
  const trace: TraceEntry[] = []
  let current = initialState()
  for (let step = 0; step < eventCount; step += 1) {
    const event = generateEvent(ctx, current)
    const result = transition(current, event)
    trace.push({
      event,
      stateBefore: current,
      stateAfter: result.state,
      effects: result.effects,
    })
    current = result.state
    // Nudge the clock between events so timestamps don't cluster at zero.
    advanceTime(ctx, 1_000)
  }
  return trace
}
177
+
178
/**
 * Renders a trace as a multi-line counterexample report.
 *
 * The first line carries the seed and trace length, followed by a blank
 * line. Each step then contributes its index, event kind and JSON-encoded
 * event, plus a second line listing effect kinds when the step produced any.
 */
function formatCounterexample(seed: number, trace: TraceEntry[]): string {
  const header = `seed=${seed}, len=${trace.length}`
  const body = trace.flatMap((step, index) => {
    const eventLine = `#${index} ${step.event.kind} ${JSON.stringify(step.event)}`
    if (step.effects.length === 0) return [eventLine]
    const kinds = step.effects.map((effect) => effect.kind).join(', ')
    return [eventLine, ` → ${kinds}`]
  })
  return [header, '', ...body].join('\n')
}
189
+
190
+ // ─────────────────────────────────────────────────────────────────────
191
+ // Invariant #1 — Every inbound is delivered XOR persisted
192
+ // ─────────────────────────────────────────────────────────────────────
193
+
194
describe('Invariant #1 — every inbound is delivered XOR persisted', () => {
  it('holds across 1000 random schedules of 50 events', () => {
    // Count the inbound events actually examined so the property cannot
    // pass vacuously if the generator ever stops emitting them.
    let inboundsChecked = 0
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 50)
      for (const [i, t] of trace.entries()) {
        if (t.event.kind !== 'inbound') continue
        inboundsChecked++
        const msgId = t.event.msg.msgId
        const delivered = t.effects.some(
          (e) => e.kind === 'deliverToBridge' && e.msg.msgId === msgId,
        )
        const buffered = t.effects.some(
          (e) => e.kind === 'bufferInbound' && e.msg.msgId === msgId,
        )
        const persisted = t.effects.some(
          (e) => e.kind === 'persistInbound' && e.msg.msgId === msgId,
        )
        // Builds the failure with the minimal prefix of the trace that
        // reproduces it.
        const violation = (detail: string): Error =>
          new Error(
            `Invariant #1 violated at event #${i}: msg ${msgId} ${detail}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        // Contract: delivered XOR (buffered AND persisted). Never both
        // delivered AND buffered/persisted. Never neither.
        if (delivered && (buffered || persisted)) {
          throw violation('both delivered and buffered/persisted')
        }
        if (!delivered && !buffered) {
          throw violation('neither delivered nor buffered')
        }
        if (buffered && !persisted) {
          throw violation('buffered but not persisted')
        }
      }
    }
    expect(inboundsChecked).toBeGreaterThan(0)
  })
})
236
+
237
+ // ─────────────────────────────────────────────────────────────────────
238
+ // Invariant #2 — setTurnStarted has a matching clearTurnStarted
239
+ // before the next end-of-life event (turnEnd, bridgeDown for active,
240
+ // or tick past TURN_TTL).
241
+ // ─────────────────────────────────────────────────────────────────────
242
+
243
describe('Invariant #2 — turnStarted is matched by clearTurnStarted before EOL', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    // Count the ticks actually examined so the property cannot pass
    // vacuously if the generator ever stops emitting them.
    let ticksChecked = 0
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      // Rather than pairing every set with its clear, check the simpler
      // equivalent: after a tick, perKey must not contain a turnStartedAt
      // entry that is STALE BEYOND TTL. The tick handler is the gate.
      for (const [i, t] of trace.entries()) {
        // There is no "current time" outside tick events. Only ticks can
        // detect TTL expiration, so a stale entry may persist across
        // non-tick events until a tick processes it. That's the design.
        if (t.event.kind !== 'tick') continue
        ticksChecked++
        const now = t.event.now
        for (const [k, v] of t.stateAfter.perKey) {
          if (v.turnStartedAt == null) continue
          const age = now - v.turnStartedAt
          if (age <= TURN_TTL_MS) continue
          // Recent outbound suppresses fallback (invariant #5) — if
          // suppressed, the entry is allowed to stay.
          const recentOutbound =
            v.lastOutboundAt != null && now - v.lastOutboundAt < OUTBOUND_RECENT_MS
          if (recentOutbound) continue
          throw new Error(
            `Invariant #2 violated at event #${i}: key ${k} has stale turnStartedAt ` +
              `(age=${age}ms > ${TURN_TTL_MS}ms) after a tick.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
      }
    }
    expect(ticksChecked).toBeGreaterThan(0)
  })
})
287
+
288
+ // ─────────────────────────────────────────────────────────────────────
289
+ // Invariant #3 — per-chat sibling-key cleanup on turnEnd
290
+ // ─────────────────────────────────────────────────────────────────────
291
+
292
describe('Invariant #3 — turnEnd sweeps sibling keys for the same chatId', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    // Count the turnEnd events actually examined so the property cannot
    // pass vacuously if the generator ever stops emitting them.
    let turnEndsChecked = 0
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (const [i, t] of trace.entries()) {
        if (t.event.kind !== 'turnEnd') continue
        turnEndsChecked++
        const endedKey = t.event.key
        const chatId = __chatIdOfKeyForTests(endedKey)
        // After turnEnd, no key belonging to this chatId may retain
        // turnStartedAt != null.
        for (const [k, v] of t.stateAfter.perKey) {
          if (__chatIdOfKeyForTests(k) !== chatId) continue
          if (v.turnStartedAt == null) continue
          throw new Error(
            `Invariant #3 violated at event #${i}: turnEnd for ${endedKey} ` +
              `left sibling key ${k} with turnStartedAt=${v.turnStartedAt}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
      }
    }
    expect(turnEndsChecked).toBeGreaterThan(0)
  })
})
317
+
318
+ // ─────────────────────────────────────────────────────────────────────
319
+ // Invariant #4 — permVerdict delivered iff bridge alive
320
+ // ─────────────────────────────────────────────────────────────────────
321
+
322
describe('Invariant #4 — permVerdict delivered iff bridge alive', () => {
  it('holds across 1000 random schedules of 50 events', () => {
    // Count the verdicts actually examined so the property cannot pass
    // vacuously if the generator ever stops emitting them.
    let verdictsChecked = 0
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 50)
      for (const [i, t] of trace.entries()) {
        if (t.event.kind !== 'permVerdict') continue
        verdictsChecked++
        // Liveness is judged on the state the verdict arrived in.
        const alive = t.stateBefore.global.kind !== 'bridge_dead'
        const delivered = t.effects.some((e) => e.kind === 'deliverPermVerdict')
        const persisted = t.effects.some((e) => e.kind === 'persistPermVerdict')
        // Builds the failure with the minimal prefix of the trace that
        // reproduces it.
        const violation = (detail: string): Error =>
          new Error(
            `Invariant #4 violated at #${i}: ${detail}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        if (alive && !delivered) {
          throw violation('bridge alive but permVerdict not delivered')
        }
        if (!alive && !persisted) {
          throw violation('bridge dead but permVerdict not persisted')
        }
        if (delivered && persisted) {
          throw violation('permVerdict both delivered and persisted')
        }
      }
    }
    expect(verdictsChecked).toBeGreaterThan(0)
  })
})
355
+
356
+ // ─────────────────────────────────────────────────────────────────────
357
+ // Invariant #5 — spurious-fallback suppression (the 2026-05-20
358
+ // overlapping-turn silence bug becomes unrepresentable)
359
+ // ─────────────────────────────────────────────────────────────────────
360
+
361
describe('Invariant #5 — fallback poke suppressed if model recently broke silence', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    // Count the ticks actually examined so the property cannot pass
    // vacuously if the generator ever stops emitting them. This does not
    // guarantee that any fallback poke fired — only that ticks were
    // inspected for one.
    let ticksChecked = 0
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (const [i, t] of trace.entries()) {
        if (t.event.kind !== 'tick') continue
        ticksChecked++
        const now = t.event.now
        for (const eff of t.effects) {
          if (eff.kind !== 'firePoke' || eff.level !== 'fallback') continue
          // Judge recency against the key's lastOutboundAt as it was
          // BEFORE the tick was processed. A key with no entry, or no
          // recorded outbound, has nothing that could suppress the poke.
          const perKey = t.stateBefore.perKey.get(eff.key)
          if (perKey == null || perKey.lastOutboundAt == null) continue
          const sinceOutbound = now - perKey.lastOutboundAt
          if (sinceOutbound < OUTBOUND_RECENT_MS) {
            throw new Error(
              `Invariant #5 violated at #${i}: fallback fired for ${eff.key} ` +
                `but model produced outbound only ${sinceOutbound}ms ago ` +
                `(threshold ${OUTBOUND_RECENT_MS}ms).\n` +
                formatCounterexample(seed, trace.slice(0, i + 1)),
            )
          }
        }
      }
    }
    expect(ticksChecked).toBeGreaterThan(0)
  })
})
390
+
391
+ // ─────────────────────────────────────────────────────────────────────
392
+ // Targeted regression: the v0.12.22 boot-wedge case
393
+ // ─────────────────────────────────────────────────────────────────────
394
+
395
describe('Targeted regression — v0.12.22 boot-wedge', () => {
  it('first inbound on a fresh bridge delivers, not buffers', () => {
    // Bring the bridge up from the initial state, then send the very
    // first inbound message.
    const booted = transition(initialState(), { kind: 'bridgeUp', at: 0 }).state
    const result = transition(booted, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 1, isSteering: false, payload: {} },
      at: 100,
    })

    const effectKinds = result.effects.map((effect) => effect.kind)
    expect(effectKinds.includes('deliverToBridge')).toBe(true)
    expect(effectKinds.includes('bufferInbound')).toBe(false)
    expect(result.state.global.kind).toBe('bridge_alive_in_turn')
  })
})
412
+
413
+ // ─────────────────────────────────────────────────────────────────────
414
+ // Targeted regression: the 2026-05-20 overlapping-turn silence case
415
+ // ─────────────────────────────────────────────────────────────────────
416
+
417
describe('Targeted regression — overlapping-turn silence (the post-v0.12.22 case)', () => {
  // Tick time shared by both scenarios. Every turn below starts at or
  // before t=6000, so at this tick each is more than TURN_TTL_MS old.
  const tickAt = TURN_TTL_MS + 10_000

  /** True when the effect list contains a fallback-level poke. */
  const firedFallback = (effects: readonly Effect[]): boolean =>
    effects.some((e) => e.kind === 'firePoke' && e.level === 'fallback')

  it('fires fallback when the last outbound is older than OUTBOUND_RECENT_MS', () => {
    let s = initialState()
    s = transition(s, { kind: 'bridgeUp', at: 0 }).state

    // Turn A starts and ends with an outbound at t=5000
    s = transition(s, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 1, isSteering: false, payload: {} },
      at: 1000,
    }).state
    s = transition(s, {
      kind: 'turnEnd',
      key: 'c1:_' as ChatKey,
      at: 5000,
      outboundEmitted: true,
    }).state

    // Turn B starts at t=6000 (immediately after A)
    s = transition(s, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 2, isSteering: false, payload: {} },
      at: 6000,
    }).state

    // At the tick, turn B is TURN_TTL_MS + 4s old. Turn A's outbound at
    // t=5000 was recent RELATIVE TO TURN B'S START, but it is
    // TURN_TTL_MS + 5s before NOW. Per invariant #5, suppression only
    // applies when the last outbound is within OUTBOUND_RECENT_MS of NOW
    // (this assumes TURN_TTL_MS + 5s >= OUTBOUND_RECENT_MS), so fallback
    // SHOULD fire here.
    const tickResult = transition(s, { kind: 'tick', now: tickAt })
    expect(firedFallback(tickResult.effects)).toBe(true)
  })

  it('suppresses fallback when the model produced an outbound within OUTBOUND_RECENT_MS', () => {
    let s = initialState()
    s = transition(s, { kind: 'bridgeUp', at: 0 }).state
    s = transition(s, {
      kind: 'inbound',
      key: 'c2:_' as ChatKey,
      msg: { msgId: 3, isSteering: false, payload: {} },
      at: 1000,
    }).state
    // Model emitted an outbound very recently: 5s before the tick
    // (assumes OUTBOUND_RECENT_MS > 5s).
    s = transition(s, {
      kind: 'modelOutbound',
      key: 'c2:_' as ChatKey,
      at: tickAt - 5_000,
    }).state

    // Tick: the turn is past TURN_TTL_MS, but the outbound was only 5s
    // ago → suppress.
    const tickResult = transition(s, { kind: 'tick', now: tickAt })
    expect(firedFallback(tickResult.effects)).toBe(false)
  })
})
@@ -0,0 +1,127 @@
1
+ /**
2
+ * JTBD scenario — short happy path: trivial questions reply FAST.
3
+ *
4
+ * Serves: `reference/know-what-my-agent-is-doing.md` — the short-path
5
+ * contract: a question with no real work should produce a plain reply
6
+ * with no ceremony (no soft-commit, no progress chunks) within a tight
7
+ * budget. Users judge agent speed on THIS path more than any other.
8
+ *
9
+ * Also serves: the always-on vision (`reference/vision.md`). An agent
10
+ * that takes 30+ seconds to answer "what's 2+2" is not "always-on" —
11
+ * it's awake but unresponsive.
12
+ *
13
+ * ## Targets
14
+ *
15
+ * From `reference/conversational-pacing.md` and the post-v0.12.22
16
+ * baseline measurements:
17
+ *
18
+ * - **TTFO p95 (vision target):** < 30s — the published contract.
19
+ * This test asserts the FAST-trivial case, not p95, so we tighten.
20
+ * - **Trivial-prompt TTFO (this test):** < 12s as hard contract,
21
+ * < 6s as the vision target. The mtcute post-restart UAT measured
22
+ * 19.4s on a COLD-START fresh-restart; a warm fast-trivial should
23
+ * be materially faster — the dominant cost on cold start is
24
+ * boot+session-resume which doesn't apply here.
25
+ * - **Soft-commit ceremony:** must NOT fire for trivial prompts.
26
+ * If the reply contains a soft-commit preamble ("let me check
27
+ * that for you, back in a few"), the conversational-pacing
28
+ * prompt classified the trivial prompt as slow — a regression. Note: this test only logs a warning when it sees such phrasing; it does not fail.
29
+ *
30
+ * ## What this catches that other UATs don't
31
+ *
32
+ * - `jtbd-soft-commit-dm.test.ts` exercises slow prompts (the soft
33
+ * commit SHOULD fire). This test asserts the inverse — fast prompts
34
+ * should skip ceremony.
35
+ * - `jtbd-always-on-after-restart-dm.test.ts` asserts <120s after a
36
+ * cold restart. This test asserts <12s on a warm agent — a much
37
+ * tighter bar that catches steady-state latency regressions
38
+ * (model swap, MCP server slowdown, gateway middleware cost, etc.).
39
+ * - `smoke-dm-reply.test.ts` confirms the agent replies AT ALL but
40
+ * has no latency assertion — a 50s reply would pass smoke. This
41
+ * one fails.
42
+ *
43
+ * ## Forensic signal on a yellow-band pass
44
+ *
45
+ * If TTFO lands in 6-12s, the test passes but logs a forensic warning
46
+ * so a future regression in this code path is visible BEFORE it
47
+ * crosses the hard contract. Yellow-band drift is the canary for
48
+ * "something's getting slower" — better to chase it at 8s than at 28s.
49
+ */
50
+
51
+ import { describe, it, expect } from "vitest";
52
+ import { spinUp } from "../harness.js";
53
+
54
// Agent name handed to the harness when spinning up the scenario.
const AGENT = "test-harness";

// Hard contract for trivial-prompt TTFO (12 seconds, in milliseconds).
const HARD_TTFO_MS = 12 * 1_000;

// Vision target: trivial prompts feel near-instant (6 seconds, in milliseconds).
const VISION_TTFO_MS = 6 * 1_000;

// A prompt with no real work behind it.
const TRIVIAL_PROMPT = "Reply with just the number: what is 2 + 2?";

// Case-insensitive patterns treated as evidence of a soft-commit preamble.
const SOFT_COMMIT_PHRASES: readonly RegExp[] = [
  /let me/i,
  /back in/i,
  /one (sec|moment)/i,
  /checking/i,
  /looking into/i,
  /hold on/i,
];
72
+
73
describe("uat: short happy path — trivial prompt is FAST", () => {
  it(
    `trivial prompt → reply lands within ${HARD_TTFO_MS / 1000}s`,
    async () => {
      const sc = await spinUp({ agent: AGENT });
      try {
        // The clock starts before the send, so TTFO includes the time
        // taken to dispatch the DM.
        const sendStart = Date.now();
        await sc.sendDM(TRIVIAL_PROMPT);

        // Wait slightly past the hard contract so a late reply is reported
        // as a latency regression (below) rather than as a missing reply.
        const firstReply = await sc.expectMessage(/\S/, {
          from: "bot",
          timeout: HARD_TTFO_MS + 5_000,
        });
        const ttfo = Date.now() - sendStart;

        expect(firstReply.text.length).toBeGreaterThan(0);

        // Hard contract. Thrown explicitly so the failure names the
        // regression; no separate expect() on the same bound is needed.
        if (ttfo >= HARD_TTFO_MS) {
          throw new Error(
            `[fast-trivial] TTFO=${ttfo}ms exceeds hard contract ` +
              `${HARD_TTFO_MS}ms — trivial-prompt latency regression.`,
          );
        }

        // NOTE(review): soft-commit phrasing is reported as a warning only;
        // it does not fail the test. Confirm that is intended — the
        // patterns are broad (e.g. /let me/i), so warning-only may be
        // deliberate.
        const triggeredSoftCommit = SOFT_COMMIT_PHRASES.some((re) =>
          re.test(firstReply.text),
        );
        if (triggeredSoftCommit) {
          console.warn(
            `[fast-trivial] First reply contains soft-commit phrasing — ` +
              `the conversational-pacing prompt likely classified the ` +
              `trivial prompt as slow. Text: ${JSON.stringify(firstReply.text.slice(0, 200))}`,
          );
        }

        // Yellow band: passed the hard contract but missed the vision
        // target. Logged so drift is visible before it becomes a failure.
        if (ttfo >= VISION_TTFO_MS) {
          console.warn(
            `[fast-trivial] TTFO=${ttfo}ms — passed hard contract ` +
              `(${HARD_TTFO_MS}ms) but slower than the vision target ` +
              `(${VISION_TTFO_MS}ms). Forensic canary for delivery-path drift.`,
          );
        } else {
          console.log(
            `[fast-trivial] TTFO=${ttfo}ms — within vision target ` +
              `(<${VISION_TTFO_MS}ms). Snappy.`,
          );
        }
      } finally {
        await sc.tearDown();
      }
    },
    HARD_TTFO_MS + 15_000,
  );
});