switchroom 0.12.22 → 0.12.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +23 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +350 -105
- package/telegram-plugin/gateway/gateway.ts +35 -0
- package/telegram-plugin/gateway/inbound-delivery-machine-shadow.ts +117 -0
- package/telegram-plugin/gateway/inbound-delivery-machine.ts +435 -0
- package/telegram-plugin/tests/inbound-delivery-machine.test.ts +475 -0
- package/telegram-plugin/uat/scenarios/jtbd-fast-trivial-dm.test.ts +127 -0
- package/telegram-plugin/uat/scenarios/jtbd-memory-survives-restart-dm.test.ts +239 -0
- package/telegram-plugin/uat/scenarios/jtbd-wake-audit-content-dm.test.ts +145 -0
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Property tests for `inbound-delivery-machine.ts`.
|
|
3
|
+
*
|
|
4
|
+
* Per RFC `docs/rfcs/inbound-delivery-state-machine.md`: 5 invariants
|
|
5
|
+
* validated over arbitrary event schedules. A counterexample is the
|
|
6
|
+
* minimal evidence that the machine has a bug. The wedge-cluster
|
|
7
|
+
* bugs (v0.12.22 boot-wedge, overlapping-turn silence, #1564 sibling
|
|
8
|
+
* keys) become FAILING property tests if reintroduced.
|
|
9
|
+
*
|
|
10
|
+
* Schedules are generated by a seeded PRNG so failures are
|
|
11
|
+
* reproducible. We don't use fast-check (not a dependency) — the
|
|
12
|
+
* randomness is sufficient for the invariant shapes we're checking,
|
|
13
|
+
* and pure random + shrink-on-failure is documented in the RFC as
|
|
14
|
+
* acceptable for v1.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { describe, expect, it } from 'vitest'
|
|
18
|
+
import {
|
|
19
|
+
type ChatKey,
|
|
20
|
+
type Effect,
|
|
21
|
+
type Event,
|
|
22
|
+
type InboundMessage,
|
|
23
|
+
type PermissionVerdict,
|
|
24
|
+
type State,
|
|
25
|
+
initialState,
|
|
26
|
+
OUTBOUND_RECENT_MS,
|
|
27
|
+
transition,
|
|
28
|
+
TURN_TTL_MS,
|
|
29
|
+
__chatIdOfKeyForTests,
|
|
30
|
+
} from '../gateway/inbound-delivery-machine'
|
|
31
|
+
|
|
32
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
33
|
+
// Seeded PRNG (deterministic for reproducible failures)
|
|
34
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
35
|
+
|
|
36
|
+
/**
 * Builds a mulberry32 pseudo-random generator.
 *
 * The seed is coerced to an unsigned 32-bit integer. Every call of the
 * returned function advances the internal counter by a fixed increment,
 * scrambles it with two multiply/xor-shift rounds, and yields a float in
 * [0, 1). The same seed always replays the same sequence, which is what
 * lets a failing schedule be reproduced from its seed alone.
 *
 * @param seed - Any number; only its low 32 bits are used.
 * @returns A zero-argument function producing the next pseudo-random float.
 */
function mulberry32(seed: number): () => number {
  const INCREMENT = 0x6d2b79f5
  const TWO_POW_32 = 4294967296
  let counter = seed >>> 0
  return function next(): number {
    counter = (counter + INCREMENT) >>> 0
    const firstMix = Math.imul(counter ^ (counter >>> 15), counter | 1)
    const secondMix = firstMix ^ (firstMix + Math.imul(firstMix ^ (firstMix >>> 7), firstMix | 61))
    // `>>> 0` reinterprets the signed 32-bit result as unsigned before scaling.
    return ((secondMix ^ (secondMix >>> 14)) >>> 0) / TWO_POW_32
  }
}
|
|
46
|
+
|
|
47
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
48
|
+
// Event generators
|
|
49
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
/** Chat ids the generators pick from when building a composite key. */
const CHATS = ['c1', 'c2', 'c3'] as const

/**
 * Thread ids the generators pick from. Both '_' and '0' are present on
 * purpose, to flag any latent sibling-key handling.
 */
const THREADS = ['_', '1', '2', '0'] as const
|
|
53
|
+
|
|
54
|
+
/**
 * Joins a chat id and a thread id into a `chat:thread` composite key.
 *
 * @param chat - Chat part, placed before the colon.
 * @param thread - Thread part, placed after the colon.
 * @returns The joined string, cast to `ChatKey`.
 */
function makeKey(chat: string, thread: string): ChatKey {
  const composite = [chat, thread].join(':')
  return composite as ChatKey
}
|
|
57
|
+
|
|
58
|
+
/** Mutable bookkeeping threaded through the generation of one schedule. */
interface ScheduleContext {
  /** Random source every generator draws from. */
  readonly rand: () => number
  /** Simulated clock value stamped onto generated events. */
  now: number
  /** Most recent message id handed out by `nextMsg`. */
  msgIdCounter: number
  /** Most recent request number handed out by `nextVerdict`. */
  permIdCounter: number
}
|
|
64
|
+
|
|
65
|
+
/**
 * Picks a random `chat:thread` key from the CHATS × THREADS grid.
 *
 * Draws from `ctx.rand` exactly twice — chat first, thread second — so
 * the number and order of draws per call stays fixed.
 */
function randomKey(ctx: ScheduleContext): ChatKey {
  const chatIndex = Math.floor(ctx.rand() * CHATS.length)
  const threadIndex = Math.floor(ctx.rand() * THREADS.length)
  return makeKey(CHATS[chatIndex], THREADS[threadIndex])
}
|
|
70
|
+
|
|
71
|
+
/**
 * Produces the next inbound message for a schedule.
 *
 * Increments `ctx.msgIdCounter` and uses the new value both as `msgId`
 * and as `payload.id`.
 *
 * @param ctx - Schedule context whose message counter is advanced.
 * @param isSteering - Copied onto the message; defaults to `false`.
 */
function nextMsg(ctx: ScheduleContext, isSteering = false): InboundMessage {
  ctx.msgIdCounter += 1
  const id = ctx.msgIdCounter
  return { msgId: id, isSteering, payload: { id } }
}
|
|
75
|
+
|
|
76
|
+
/**
 * Produces the next permission verdict for a schedule.
 *
 * Increments `ctx.permIdCounter` and derives a `req-<n>` request id from
 * it. The behavior is always `'allow'`, and the payload echoes the
 * request id.
 */
function nextVerdict(ctx: ScheduleContext): PermissionVerdict {
  ctx.permIdCounter += 1
  const requestId = 'req-' + ctx.permIdCounter
  return { requestId, behavior: 'allow', payload: { requestId } }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Moves the simulated clock forward by a random whole amount.
 *
 * The step is `floor(rand() * range)`, so it is an integer strictly
 * below `range` (for a positive range) and may be zero.
 *
 * @param ctx - Schedule context whose `now` is advanced in place.
 * @param range - Exclusive upper bound of the step.
 */
function advanceTime(ctx: ScheduleContext, range: number): void {
  const step = Math.floor(ctx.rand() * range)
  ctx.now = ctx.now + step
}
|
|
84
|
+
|
|
85
|
+
/** Discriminant tag of the machine's `Event` union. */
type EventKind = Event['kind']

/**
 * The eight event kinds that `generateEvent` can produce.
 *
 * NOTE(review): nothing else in this test file reads this list — confirm
 * whether it is still needed.
 */
const EVENT_KINDS: ReadonlyArray<EventKind> = [
  'bridgeUp',
  'bridgeDown',
  'inbound',
  'turnStart',
  'turnEnd',
  'modelOutbound',
  'permVerdict',
  'tick',
]
|
|
96
|
+
|
|
97
|
+
/**
 * Draws one random event for a schedule.
 *
 * The event kind is chosen by a weighted roll: inbound, tick and turnEnd
 * are weighted above the rarer operations so schedules mostly exercise
 * the steady-state hot path. The event's fields are then filled from
 * `ctx`. The number and order of `ctx.rand()` draws per kind is part of
 * the contract, because it determines which schedule a seed replays.
 *
 * @param ctx - Schedule context (random source, clock, id counters).
 *   Mutated by `inbound`, `permVerdict` and `tick` events.
 * @param prior - Machine state before this event; only read, to bias
 *   `turnEnd` toward keys that currently have a turn started.
 */
function generateEvent(ctx: ScheduleContext, prior: State): Event {
  // Cumulative upper bounds: the first bound the roll falls under wins.
  // Anything at or above the last bound becomes 'bridgeDown'.
  const cumulativeBounds: ReadonlyArray<readonly [number, EventKind]> = [
    [0.3, 'inbound'],
    [0.45, 'tick'],
    [0.55, 'turnEnd'],
    [0.65, 'modelOutbound'],
    [0.75, 'turnStart'],
    [0.82, 'permVerdict'],
    [0.91, 'bridgeUp'],
  ]
  const roll = ctx.rand()
  const kind: EventKind = cumulativeBounds.find(([bound]) => roll < bound)?.[1] ?? 'bridgeDown'

  if (kind === 'bridgeUp') {
    return { kind: 'bridgeUp', at: ctx.now }
  }
  if (kind === 'bridgeDown') {
    return { kind: 'bridgeDown', at: ctx.now }
  }
  if (kind === 'inbound') {
    // Draw order: steering flag, then key (two draws), then the message.
    const steering = ctx.rand() < 0.1
    const key = randomKey(ctx)
    const msg = nextMsg(ctx, steering)
    return { kind: 'inbound', key, msg, at: ctx.now }
  }
  if (kind === 'turnStart') {
    return { kind: 'turnStart', key: randomKey(ctx), at: ctx.now }
  }
  if (kind === 'turnEnd') {
    // Bias toward a key with a turn in progress so turnEnd is meaningful
    // most of the time.
    const keysInTurn: ChatKey[] = []
    for (const [candidate, slot] of prior.perKey) {
      if (slot.turnStartedAt != null) keysInTurn.push(candidate)
    }
    let key: ChatKey
    // The bias roll is only drawn when there is at least one such key.
    if (keysInTurn.length > 0 && ctx.rand() < 0.8) {
      key = keysInTurn[Math.floor(ctx.rand() * keysInTurn.length)]
    } else {
      key = randomKey(ctx)
    }
    const outboundEmitted = ctx.rand() < 0.85
    return { kind: 'turnEnd', key, at: ctx.now, outboundEmitted }
  }
  if (kind === 'modelOutbound') {
    return { kind: 'modelOutbound', key: randomKey(ctx), at: ctx.now }
  }
  if (kind === 'permVerdict') {
    return { kind: 'permVerdict', verdict: nextVerdict(ctx), at: ctx.now }
  }

  // Remaining kind: 'tick'. A tick advances time meaningfully — usually a
  // small step, occasionally a big one that can cross TURN_TTL_MS so the
  // fallback path is exercised.
  const span = ctx.rand() < 0.1 ? TURN_TTL_MS * 2 : 5_000
  advanceTime(ctx, span)
  return { kind: 'tick', now: ctx.now }
}
|
|
145
|
+
|
|
146
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
147
|
+
// Property: simulate a schedule and assert invariants
|
|
148
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
149
|
+
|
|
150
|
+
/** One recorded step of a simulated schedule. */
interface TraceEntry {
  /** The event fed to `transition` at this step. */
  readonly event: Event
  /** Machine state the event was applied to. */
  readonly stateBefore: State
  /** Machine state returned by `transition` for this event. */
  readonly stateAfter: State
  /** Effects returned by `transition` for this event. */
  readonly effects: readonly Effect[]
}
|
|
156
|
+
|
|
157
|
+
/**
 * Simulates the machine over a pseudo-random schedule.
 *
 * Starts from `initialState()` with the clock and id counters at zero,
 * then repeats `eventCount` times: generate an event against the current
 * state, apply `transition`, record the step, and nudge the clock.
 *
 * @param seed - PRNG seed; the same seed replays the same schedule.
 * @param eventCount - Number of events to generate and apply.
 * @returns One trace entry per event, in application order.
 */
function runSchedule(seed: number, eventCount: number): TraceEntry[] {
  const ctx: ScheduleContext = { rand: mulberry32(seed), now: 0, msgIdCounter: 0, permIdCounter: 0 }
  const steps: TraceEntry[] = []
  let current = initialState()
  for (let step = 0; step < eventCount; step += 1) {
    const before = current
    const event = generateEvent(ctx, before)
    const outcome = transition(before, event)
    steps.push({ event, stateBefore: before, stateAfter: outcome.state, effects: outcome.effects })
    current = outcome.state
    // Small clock advance between events so timestamps don't cluster at zero.
    advanceTime(ctx, 1_000)
  }
  return steps
}
|
|
177
|
+
|
|
178
|
+
/**
 * Renders a trace as a human-readable counterexample.
 *
 * Output is a `seed=…, len=…` header, a blank line, then one `#index
 * kind json` line per event. Events that produced effects get an extra
 * line listing the effect kinds after an arrow.
 *
 * @param seed - Seed that produced the trace, for reproduction.
 * @param trace - Steps to print, usually the prefix up to the failure.
 */
function formatCounterexample(seed: number, trace: TraceEntry[]): string {
  const header = [`seed=${seed}, len=${trace.length}`, '']
  const body = trace.flatMap((entry, index) => {
    const eventLine = `#${index} ${entry.event.kind} ${JSON.stringify(entry.event)}`
    if (entry.effects.length === 0) return [eventLine]
    const effectKinds = entry.effects.map((e) => e.kind).join(', ')
    return [eventLine, ` → ${effectKinds}`]
  })
  return [...header, ...body].join('\n')
}
|
|
189
|
+
|
|
190
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
191
|
+
// Invariant #1 — Every inbound is delivered XOR persisted
|
|
192
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
193
|
+
|
|
194
|
+
describe('Invariant #1 — every inbound is delivered XOR persisted', () => {
  it('holds across 1000 random schedules of 50 events', () => {
    // Number of inbound events actually inspected. Asserted at the end so
    // the property cannot pass vacuously if the generator ever stops
    // producing inbound events.
    let inboundsChecked = 0
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 50)
      for (let i = 0; i < trace.length; i++) {
        const t = trace[i]
        if (t.event.kind !== 'inbound') continue
        inboundsChecked++
        const msgId = t.event.msg.msgId
        // Only effects emitted by THIS transition, for THIS message, count.
        const delivered = t.effects.some(
          (e) => e.kind === 'deliverToBridge' && e.msg.msgId === msgId,
        )
        const buffered = t.effects.some(
          (e) => e.kind === 'bufferInbound' && e.msg.msgId === msgId,
        )
        const persisted = t.effects.some(
          (e) => e.kind === 'persistInbound' && e.msg.msgId === msgId,
        )
        // Contract: delivered XOR (buffered AND persisted). Never both
        // delivered AND buffered. Never neither.
        if (delivered && (buffered || persisted)) {
          // Report both flags: this branch also fires when the message was
          // delivered and persisted without being buffered.
          throw new Error(
            `Invariant #1 violated at event #${i}: msg ${msgId} delivered but also ` +
              `buffered=${buffered}, persisted=${persisted}.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
        if (!delivered && !buffered) {
          throw new Error(
            `Invariant #1 violated at event #${i}: msg ${msgId} neither delivered nor buffered.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
        if (buffered && !persisted) {
          throw new Error(
            `Invariant #1 violated at event #${i}: msg ${msgId} buffered but not persisted.\n` +
              formatCounterexample(seed, trace.slice(0, i + 1)),
          )
        }
      }
    }
    expect(inboundsChecked).toBeGreaterThan(0)
  })
})
|
|
236
|
+
|
|
237
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
238
|
+
// Invariant #2 — setTurnStarted has a matching clearTurnStarted
|
|
239
|
+
// before the next end-of-life event (turnEnd, bridgeDown for active,
|
|
240
|
+
// or tick past TURN_TTL).
|
|
241
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
242
|
+
|
|
243
|
+
describe('Invariant #2 — turnStarted is matched by clearTurnStarted before EOL', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      // Checked form of the invariant: right after a tick, no key may
      // still carry a turnStartedAt older than TURN_TTL_MS, unless a
      // recent outbound suppressed the fallback (invariant #5).
      //
      // Only tick events carry a "current time", so only ticks can detect
      // TTL expiry. A stale entry is allowed to survive non-tick events
      // until the next tick processes it. bridgeDown is not required to
      // clear the entry either.
      for (const [index, step] of trace.entries()) {
        const event = step.event
        if (event.kind !== 'tick') continue
        const tickNow = event.now
        for (const [key, slot] of step.stateAfter.perKey) {
          if (slot.turnStartedAt == null) continue
          const age = tickNow - slot.turnStartedAt
          if (!(age > TURN_TTL_MS)) continue
          // The entry may legitimately remain only while the last outbound
          // is still inside the suppression window.
          const notSuppressed =
            slot.lastOutboundAt == null || tickNow - slot.lastOutboundAt >= OUTBOUND_RECENT_MS
          if (!notSuppressed) continue
          throw new Error(
            `Invariant #2 violated at event #${index}: key ${key} has stale turnStartedAt ` +
              `(age=${age}ms > ${TURN_TTL_MS}ms) after a tick.\n` +
              formatCounterexample(seed, trace.slice(0, index + 1)),
          )
        }
      }
    }
    expect(true).toBe(true)
  })
})
|
|
287
|
+
|
|
288
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
289
|
+
// Invariant #3 — per-chat sibling-key cleanup on turnEnd
|
|
290
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
291
|
+
|
|
292
|
+
describe('Invariant #3 — turnEnd sweeps sibling keys for the same chatId', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (const [index, step] of trace.entries()) {
        const event = step.event
        if (event.kind !== 'turnEnd') continue
        const endedChat = __chatIdOfKeyForTests(event.key)
        // After turnEnd, no key for the same chat — the ended key or any
        // sibling thread — may still have turnStartedAt set.
        const leftover = [...step.stateAfter.perKey].find(
          ([key, slot]) => __chatIdOfKeyForTests(key) === endedChat && slot.turnStartedAt != null,
        )
        if (leftover === undefined) continue
        const [staleKey, staleSlot] = leftover
        throw new Error(
          `Invariant #3 violated at event #${index}: turnEnd for ${event.key} ` +
            `left sibling key ${staleKey} with turnStartedAt=${staleSlot.turnStartedAt}.\n` +
            formatCounterexample(seed, trace.slice(0, index + 1)),
        )
      }
    }
    expect(true).toBe(true)
  })
})
|
|
317
|
+
|
|
318
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
319
|
+
// Invariant #4 — permVerdict delivered iff bridge alive
|
|
320
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
321
|
+
|
|
322
|
+
describe('Invariant #4 — permVerdict delivered iff bridge alive', () => {
  it('holds across 1000 random schedules of 50 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 50)
      for (const [index, step] of trace.entries()) {
        if (step.event.kind !== 'permVerdict') continue

        // Builds the failure with the schedule prefix up to this step.
        const violation = (detail: string): Error =>
          new Error(
            `Invariant #4 violated at #${index}: ${detail}\n` +
              formatCounterexample(seed, trace.slice(0, index + 1)),
          )

        // "Alive" is judged on the state the verdict arrived in.
        const bridgeAlive = step.stateBefore.global.kind !== 'bridge_dead'
        const wasDelivered = step.effects.some((e) => e.kind === 'deliverPermVerdict')
        const wasPersisted = step.effects.some((e) => e.kind === 'persistPermVerdict')

        if (bridgeAlive && !wasDelivered) {
          throw violation('bridge alive but permVerdict not delivered.')
        }
        if (!bridgeAlive && !wasPersisted) {
          throw violation('bridge dead but permVerdict not persisted.')
        }
        // With the two checks above, this also rules out delivery to a
        // dead bridge and persistence on a live one.
        if (wasDelivered && wasPersisted) {
          throw violation('permVerdict both delivered and persisted.')
        }
      }
    }
    expect(true).toBe(true)
  })
})
|
|
355
|
+
|
|
356
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
357
|
+
// Invariant #5 — spurious-fallback suppression (the 2026-05-20
|
|
358
|
+
// overlapping-turn silence bug becomes unrepresentable)
|
|
359
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
360
|
+
|
|
361
|
+
describe('Invariant #5 — fallback poke suppressed if model recently broke silence', () => {
  it('holds across 1000 random schedules of 100 events', () => {
    for (let seed = 1; seed <= 1000; seed++) {
      const trace = runSchedule(seed, 100)
      for (const [index, step] of trace.entries()) {
        const event = step.event
        if (event.kind !== 'tick') continue
        const tickNow = event.now
        for (const effect of step.effects) {
          const isFallbackPoke = effect.kind === 'firePoke' && effect.level === 'fallback'
          if (!isFallbackPoke) continue
          // Judge recency against the state the tick was applied to. Keys
          // with no entry or no recorded outbound cannot violate this.
          const lastOutboundAt = step.stateBefore.perKey.get(effect.key)?.lastOutboundAt
          if (lastOutboundAt == null) continue
          const sinceOutbound = tickNow - lastOutboundAt
          if (!(sinceOutbound < OUTBOUND_RECENT_MS)) continue
          throw new Error(
            `Invariant #5 violated at #${index}: fallback fired for ${effect.key} ` +
              `but model produced outbound only ${sinceOutbound}ms ago ` +
              `(threshold ${OUTBOUND_RECENT_MS}ms).\n` +
              formatCounterexample(seed, trace.slice(0, index + 1)),
          )
        }
      }
    }
    expect(true).toBe(true)
  })
})
|
|
390
|
+
|
|
391
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
392
|
+
// Targeted regression: the v0.12.22 boot-wedge case
|
|
393
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
394
|
+
|
|
395
|
+
describe('Targeted regression — v0.12.22 boot-wedge', () => {
  it('first inbound on a fresh bridge delivers, not buffers', () => {
    // Fresh machine, bridge comes up, then the very first inbound arrives.
    const afterBridgeUp = transition(initialState(), { kind: 'bridgeUp', at: 0 }).state
    const firstInbound = transition(afterBridgeUp, {
      kind: 'inbound',
      key: 'c1:_' as ChatKey,
      msg: { msgId: 1, isSteering: false, payload: {} },
      at: 100,
    })

    const effectKinds = firstInbound.effects.map((e) => e.kind)
    // The message must go straight to the bridge rather than being parked.
    expect(effectKinds.includes('deliverToBridge')).toBe(true)
    expect(effectKinds.includes('bufferInbound')).toBe(false)
    expect(firstInbound.state.global.kind).toBe('bridge_alive_in_turn')
  })
})
|
|
412
|
+
|
|
413
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
414
|
+
// Targeted regression: the 2026-05-20 overlapping-turn silence case
|
|
415
|
+
// ─────────────────────────────────────────────────────────────────────
|
|
416
|
+
|
|
417
|
+
describe('Targeted regression — overlapping-turn silence (the post-v0.12.22 case)', () => {
  it("suppresses fallback when model produced outbound in prior overlapping turn", () => {
    // Applies the events in order to a fresh machine and returns the final state.
    const replay = (events: readonly Event[]): State =>
      events.reduce((acc, ev) => transition(acc, ev).state, initialState())
    const firesFallback = (effects: readonly Effect[]): boolean =>
      effects.some((e) => e.kind === 'firePoke' && e.level === 'fallback')

    const chatA = 'c1:_' as ChatKey
    const chatB = 'c2:_' as ChatKey

    // Case 1 — stale outbound. Turn A runs from t=1000 and ends at t=5000
    // having emitted an outbound. Turn B starts at t=6000, right after A.
    const staleOutbound = replay([
      { kind: 'bridgeUp', at: 0 },
      { kind: 'inbound', key: chatA, msg: { msgId: 1, isSteering: false, payload: {} }, at: 1000 },
      { kind: 'turnEnd', key: chatA, at: 5000, outboundEmitted: true },
      { kind: 'inbound', key: chatA, msg: { msgId: 2, isSteering: false, payload: {} }, at: 6000 },
    ])
    // Tick at t=310,000: turn B is 304s old (past TURN_TTL_MS=300_000).
    // The only outbound was at t=5000, i.e. 305s before now. Invariant #5
    // suppresses fallback only when the last outbound is within
    // OUTBOUND_RECENT_MS (60s) of NOW, so fallback SHOULD fire here.
    const staleTick = transition(staleOutbound, { kind: 'tick', now: 310_000 })
    expect(firesFallback(staleTick.effects)).toBe(true)

    // Case 2 — recent outbound, on a fresh machine. The turn starts at
    // t=1000 and the model emits an outbound at t=305_000.
    const recentOutbound = replay([
      { kind: 'bridgeUp', at: 0 },
      { kind: 'inbound', key: chatB, msg: { msgId: 3, isSteering: false, payload: {} }, at: 1000 },
      { kind: 'modelOutbound', key: chatB, at: 305_000 },
    ])
    // Tick at t=310_000: the turn is 309s old, but the outbound was only
    // 5s ago → fallback must be suppressed.
    const recentTick = transition(recentOutbound, { kind: 'tick', now: 310_000 })
    expect(firesFallback(recentTick.effects)).toBe(false)
  })
})
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD scenario — short happy path: trivial questions reply FAST.
|
|
3
|
+
*
|
|
4
|
+
* Serves: `reference/know-what-my-agent-is-doing.md` — the short-path
|
|
5
|
+
* contract: a question with no real work should produce a plain reply
|
|
6
|
+
* with no ceremony (no soft-commit, no progress chunks) within a tight
|
|
7
|
+
* budget. Users judge agent speed on THIS path more than any other.
|
|
8
|
+
*
|
|
9
|
+
* Also serves: the always-on vision (`reference/vision.md`). An agent
|
|
10
|
+
* that takes 30+ seconds to answer "what's 2+2" is not "always-on" —
|
|
11
|
+
* it's awake but unresponsive.
|
|
12
|
+
*
|
|
13
|
+
* ## Targets
|
|
14
|
+
*
|
|
15
|
+
* From `reference/conversational-pacing.md` and the post-v0.12.22
|
|
16
|
+
* baseline measurements:
|
|
17
|
+
*
|
|
18
|
+
* - **TTFO p95 (vision target):** < 30s — the published contract.
|
|
19
|
+
* This test asserts the FAST-trivial case, not p95, so we tighten.
|
|
20
|
+
* - **Trivial-prompt TTFO (this test):** < 12s as hard contract,
|
|
21
|
+
* < 6s as the vision target. The mtcute post-restart UAT measured
|
|
22
|
+
* 19.4s on a COLD-START fresh-restart; a warm fast-trivial should
|
|
23
|
+
* be materially faster — the dominant cost on cold start is
|
|
24
|
+
* boot+session-resume which doesn't apply here.
|
|
25
|
+
* - **Soft-commit ceremony:** must NOT fire for trivial prompts.
|
|
26
|
+
* If the reply contains a soft-commit preamble ("let me check
|
|
27
|
+
* that for you, back in a few"), the conversational-pacing
|
|
28
|
+
* prompt classified the trivial prompt as slow — a regression.
|
|
29
|
+
*
|
|
30
|
+
* ## What this catches that other UATs don't
|
|
31
|
+
*
|
|
32
|
+
* - `jtbd-soft-commit-dm.test.ts` exercises slow prompts (the soft
|
|
33
|
+
* commit SHOULD fire). This test asserts the inverse — fast prompts
|
|
34
|
+
* should skip ceremony.
|
|
35
|
+
* - `jtbd-always-on-after-restart-dm.test.ts` asserts <120s after a
|
|
36
|
+
* cold restart. This test asserts <12s on a warm agent — a much
|
|
37
|
+
* tighter bar that catches steady-state latency regressions
|
|
38
|
+
* (model swap, MCP server slowdown, gateway middleware cost, etc.).
|
|
39
|
+
* - `smoke-dm-reply.test.ts` confirms the agent replies AT ALL but
|
|
40
|
+
* has no latency assertion — a 50s reply would pass smoke. This
|
|
41
|
+
* one fails.
|
|
42
|
+
*
|
|
43
|
+
* ## Forensic signal on a yellow-band pass
|
|
44
|
+
*
|
|
45
|
+
* If TTFO lands in 6-12s, the test passes but logs a forensic warning
|
|
46
|
+
* so a future regression in this code path is visible BEFORE it
|
|
47
|
+
* crosses the hard contract. Yellow-band drift is the canary for
|
|
48
|
+
* "something's getting slower" — better to chase it at 8s than at 28s.
|
|
49
|
+
*/
|
|
50
|
+
|
|
51
|
+
import { describe, it, expect } from "vitest";
|
|
52
|
+
import { spinUp } from "../harness.js";
|
|
53
|
+
|
|
54
|
+
/** Agent name passed to `spinUp` for this scenario. */
const AGENT = "test-harness";

/** Hard contract for trivial-prompt TTFO, in milliseconds. */
const HARD_TTFO_MS = 12_000;

/** Vision target, in milliseconds: trivial prompts feel near-instant. */
const VISION_TTFO_MS = 6_000;

/** The trivial question sent to the agent. */
const TRIVIAL_PROMPT = "Reply with just the number: what is 2 + 2?";

/**
 * Case-insensitive patterns tested against the first reply. A match is
 * treated as a sign of soft-commit phrasing.
 */
const SOFT_COMMIT_PHRASES: readonly RegExp[] = [
  /let me/i,
  /back in/i,
  /one (sec|moment)/i,
  /checking/i,
  /looking into/i,
  /hold on/i,
];
|
|
72
|
+
|
|
73
|
+
describe("uat: short happy path — trivial prompt is FAST", () => {
  it(
    `trivial prompt → reply lands within ${HARD_TTFO_MS / 1000}s`,
    async () => {
      const sc = await spinUp({ agent: AGENT });
      try {
        // TTFO is measured from just before the DM is sent until the first
        // non-blank bot message has been observed.
        const sendStart = Date.now();
        await sc.sendDM(TRIVIAL_PROMPT);

        // Wait slightly past the hard contract so a late reply is reported
        // as a latency failure below rather than as a missing message.
        const firstReply = await sc.expectMessage(/\S/, {
          from: "bot",
          timeout: HARD_TTFO_MS + 5_000,
        });
        const ttfo = Date.now() - sendStart;

        expect(firstReply.text.length).toBeGreaterThan(0);

        // Hard contract. A single assertion carries the diagnostic message;
        // a separate manual throw in front of it would make it unreachable.
        expect(
          ttfo,
          `[fast-trivial] TTFO=${ttfo}ms exceeds hard contract ` +
            `${HARD_TTFO_MS}ms — trivial-prompt latency regression.`,
        ).toBeLessThan(HARD_TTFO_MS);

        // Advisory only: soft-commit phrasing is logged, not failed.
        const triggeredSoftCommit = SOFT_COMMIT_PHRASES.some((re) =>
          re.test(firstReply.text),
        );
        if (triggeredSoftCommit) {
          console.warn(
            `[fast-trivial] First reply contains soft-commit phrasing — ` +
              `the conversational-pacing prompt likely classified the ` +
              `trivial prompt as slow. Text: ${JSON.stringify(firstReply.text.slice(0, 200))}`,
          );
        }

        // Yellow band: passed the hard contract but missed the vision target.
        if (ttfo >= VISION_TTFO_MS) {
          console.warn(
            `[fast-trivial] TTFO=${ttfo}ms — passed hard contract ` +
              `(${HARD_TTFO_MS}ms) but slower than the vision target ` +
              `(${VISION_TTFO_MS}ms). Forensic canary for delivery-path drift.`,
          );
        } else {
          console.log(
            `[fast-trivial] TTFO=${ttfo}ms — within vision target ` +
              `(<${VISION_TTFO_MS}ms). Snappy.`,
          );
        }
      } finally {
        // Always release the harness, including when an assertion failed.
        await sc.tearDown();
      }
    },
    HARD_TTFO_MS + 15_000,
  );
});
|