switchroom 0.13.3 → 0.13.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -45
- package/dist/agent-scheduler/index.js +80 -80
- package/dist/auth-broker/index.js +80 -80
- package/dist/cli/drive-write-pretool.mjs +10 -10
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +485 -566
- package/dist/host-control/main.js +99 -99
- package/dist/vault/approvals/kernel-server.js +82 -82
- package/dist/vault/broker/server.js +83 -83
- package/package.json +1 -1
- package/profiles/_base/start.sh.hbs +8 -8
- package/profiles/_shared/telegram-style.md.hbs +1 -1
- package/profiles/_shared/vault-protocol.md.hbs +12 -0
- package/profiles/default/CLAUDE.md +192 -0
- package/profiles/default/CLAUDE.md.hbs +1 -1
- package/telegram-plugin/dist/bridge/bridge.js +112 -112
- package/telegram-plugin/dist/gateway/gateway.js +210 -192
- package/telegram-plugin/dist/server.js +160 -160
- package/telegram-plugin/runtime-metrics.ts +14 -8
- package/telegram-plugin/silence-poke.ts +49 -1
- package/telegram-plugin/tests/silence-poke.test.ts +135 -3
- package/telegram-plugin/uat/scenarios/bridge-flap-resilience-dm.test.ts +166 -0
- package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts +217 -0
- package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +16 -11
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
import { existsSync, mkdirSync, appendFileSync } from 'node:fs'
|
|
20
20
|
import { dirname, join } from 'node:path'
|
|
21
21
|
import { captureEvent } from './analytics-posthog.js'
|
|
22
|
+
import type { PokeLevel } from './silence-poke.js'
|
|
22
23
|
|
|
23
24
|
export type RuntimeMetricEvent =
|
|
24
25
|
/**
|
|
@@ -62,28 +63,33 @@ export type RuntimeMetricEvent =
|
|
|
62
63
|
ended_via: 'reply' | 'stream_reply_done' | 'silent' | 'forced' | 'framework_fallback'
|
|
63
64
|
}
|
|
64
65
|
/**
|
|
65
|
-
* Framework safety-net: a silence-poke was armed
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
*
|
|
69
|
-
*
|
|
66
|
+
* Framework safety-net: a silence-poke was armed. `ack` is the early
|
|
67
|
+
* (~10s) ack-budget poke — the model has sent NOTHING this turn and is
|
|
68
|
+
* leaving the user on a silent chat. `soft` (75s) / `firm` (180s) are
|
|
69
|
+
* the silence-since-last-outbound ladder. The system-reminder appended
|
|
70
|
+
* to the next tool result nudges the model to send an update. Doubles
|
|
71
|
+
* as a design-health signal — if these fire frequently, the
|
|
72
|
+
* conversational-pacing prompt isn't doing its job.
|
|
70
73
|
*/
|
|
71
74
|
| {
|
|
72
75
|
kind: 'silence_poke_fired'
|
|
73
76
|
key: string
|
|
74
|
-
level:
|
|
77
|
+
level: PokeLevel
|
|
75
78
|
silence_ms: number
|
|
76
79
|
subagent_wait: boolean
|
|
77
80
|
}
|
|
78
81
|
/**
|
|
79
82
|
* The model sent an outbound message within the success window
|
|
80
83
|
* (default 15s) after a poke fired. Pair with `silence_poke_fired`
|
|
81
|
-
* to compute success rate — the design target is >80%.
|
|
84
|
+
* to compute success rate — the design target is >80%. (`ack`-level
|
|
85
|
+
* success is not currently emitted — the ack poke sits outside the
|
|
86
|
+
* `pokesFired` ladder noteOutbound measures against; the type admits
|
|
87
|
+
* `ack` only so the silence-poke metric union stays assignable.)
|
|
82
88
|
*/
|
|
83
89
|
| {
|
|
84
90
|
kind: 'silence_poke_succeeded'
|
|
85
91
|
key: string
|
|
86
|
-
level:
|
|
92
|
+
level: PokeLevel
|
|
87
93
|
latency_ms: number
|
|
88
94
|
}
|
|
89
95
|
/**
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
* pacing prompt still applies; only the framework safety net is off.
|
|
44
44
|
*/
|
|
45
45
|
|
|
46
|
-
export type PokeLevel = 'soft' | 'firm'
|
|
46
|
+
export type PokeLevel = 'ack' | 'soft' | 'firm'
|
|
47
47
|
|
|
48
48
|
/** #1292: snapshot of an in-flight tool call, surfaced in the 300s
|
|
49
49
|
* framework-fallback message so the user sees the actual observable
|
|
@@ -76,6 +76,10 @@ export interface SilencePokeState {
|
|
|
76
76
|
lastThinkingAt: number | null
|
|
77
77
|
/** True once the 300s framework fallback has fired this turn. */
|
|
78
78
|
fallbackFired: boolean
|
|
79
|
+
/** True once the early ack-budget poke has fired this turn. One-shot:
|
|
80
|
+
* the ack nudge is specifically about the *first* outbound, so it
|
|
81
|
+
* never re-arms even after the model later goes quiet again. */
|
|
82
|
+
ackPokeFired: boolean
|
|
79
83
|
/** Wall-clock ms of last poke fire — used for poke-success latency. */
|
|
80
84
|
lastPokeFiredAt: number | null
|
|
81
85
|
/** #1292: in-flight tool calls keyed by toolUseId. Populated by
|
|
@@ -91,6 +95,12 @@ export interface SilencePokeState {
|
|
|
91
95
|
}
|
|
92
96
|
|
|
93
97
|
export interface ThresholdsMs {
|
|
98
|
+
/** Ack budget: if NO outbound at all has landed this many ms after
|
|
99
|
+
* turn start, arm an 'ack' poke. This is the framework enforcing the
|
|
100
|
+
* human-baseline "acknowledge within a beat" — far tighter than the
|
|
101
|
+
* 75s `soft` threshold, which measures silence-since-last-outbound
|
|
102
|
+
* and is the wrong instrument for "you never said hello." */
|
|
103
|
+
ack: number
|
|
94
104
|
soft: number
|
|
95
105
|
firm: number
|
|
96
106
|
fallback: number
|
|
@@ -101,6 +111,7 @@ export interface ThresholdsMs {
|
|
|
101
111
|
}
|
|
102
112
|
|
|
103
113
|
export const DEFAULT_THRESHOLDS: ThresholdsMs = {
|
|
114
|
+
ack: 10_000,
|
|
104
115
|
soft: 75_000,
|
|
105
116
|
firm: 180_000,
|
|
106
117
|
fallback: 300_000,
|
|
@@ -176,6 +187,7 @@ export function startTurn(key: string, now: number): void {
|
|
|
176
187
|
subagentDispatchActive: false,
|
|
177
188
|
lastThinkingAt: null,
|
|
178
189
|
fallbackFired: false,
|
|
190
|
+
ackPokeFired: false,
|
|
179
191
|
lastPokeFiredAt: null,
|
|
180
192
|
inFlightTools: new Map(),
|
|
181
193
|
})
|
|
@@ -340,6 +352,16 @@ export function endTurn(key: string): void {
|
|
|
340
352
|
|
|
341
353
|
/** Verbatim poke text. Wording is load-bearing — see issue #1122 design. */
|
|
342
354
|
export function formatPokeText(level: PokeLevel): string {
|
|
355
|
+
if (level === 'ack') {
|
|
356
|
+
return (
|
|
357
|
+
"[silence-poke] You haven't sent the user anything yet this turn — "
|
|
358
|
+
+ 'they are looking at a silent chat. Send a short, human one-line '
|
|
359
|
+
+ 'acknowledgement now via `reply` (e.g. "on it — checking"), in your '
|
|
360
|
+
+ "persona's voice, before you do any more work. A good colleague "
|
|
361
|
+
+ "answers in a beat; don't leave the message hanging while you think. "
|
|
362
|
+
+ 'If the full answer is genuinely seconds away, send that instead.'
|
|
363
|
+
)
|
|
364
|
+
}
|
|
343
365
|
if (level === 'soft') {
|
|
344
366
|
return (
|
|
345
367
|
"[silence-poke] You've been silent to the user for 75s. If you're "
|
|
@@ -437,6 +459,32 @@ function tick(now: number): void {
|
|
|
437
459
|
? thresholds.subagentSoft
|
|
438
460
|
: thresholds.soft
|
|
439
461
|
|
|
462
|
+
// Ack budget — the framework enforcing the human-baseline "answer
|
|
463
|
+
// in a beat." Fires once, only when NOTHING has been sent this turn
|
|
464
|
+
// (`lastOutboundAt == null`), well before the 75s `soft` threshold.
|
|
465
|
+
// `soft` measures silence-since-last-outbound and is the wrong
|
|
466
|
+
// instrument for "you never acknowledged me." Independent of the
|
|
467
|
+
// soft/firm/fallback ladder: if the model never acks, it still
|
|
468
|
+
// escalates soft → firm → fallback on schedule after this.
|
|
469
|
+
if (
|
|
470
|
+
!s.ackPokeFired
|
|
471
|
+
&& s.lastOutboundAt == null
|
|
472
|
+
&& s.pokesFired === 0
|
|
473
|
+
&& silence >= thresholds.ack
|
|
474
|
+
) {
|
|
475
|
+
s.pokeArmed = { level: 'ack' }
|
|
476
|
+
s.ackPokeFired = true
|
|
477
|
+
s.lastPokeFiredAt = now
|
|
478
|
+
activeDeps.emitMetric({
|
|
479
|
+
kind: 'silence_poke_fired',
|
|
480
|
+
key,
|
|
481
|
+
level: 'ack',
|
|
482
|
+
silence_ms: silence,
|
|
483
|
+
subagent_wait: s.subagentDispatchActive,
|
|
484
|
+
})
|
|
485
|
+
continue
|
|
486
|
+
}
|
|
487
|
+
|
|
440
488
|
if (s.pokesFired === 0 && silence >= softThreshold) {
|
|
441
489
|
s.pokeArmed = { level: 'soft' }
|
|
442
490
|
s.pokesFired = 1
|
|
@@ -33,7 +33,15 @@ function setupDeps(opts?: { thresholds?: Partial<typeof DEFAULT_THRESHOLDS> }):
|
|
|
33
33
|
__setDepsForTests({
|
|
34
34
|
emitMetric: (e) => fixtures.emitted.push(e),
|
|
35
35
|
onFrameworkFallback: (ctx) => { fixtures.fallbacks.push(ctx) },
|
|
36
|
-
|
|
36
|
+
// The ack budget (a new poke that fires *earlier* than `soft`) is
|
|
37
|
+
// disabled by default in this fixture so the soft/firm/fallback
|
|
38
|
+
// ladder tests stay isolated from it. The 'ack budget' describe
|
|
39
|
+
// block opts back in with a real value.
|
|
40
|
+
thresholdsMs: {
|
|
41
|
+
...DEFAULT_THRESHOLDS,
|
|
42
|
+
ack: Number.MAX_SAFE_INTEGER,
|
|
43
|
+
...(opts?.thresholds ?? {}),
|
|
44
|
+
},
|
|
37
45
|
})
|
|
38
46
|
return fixtures
|
|
39
47
|
}
|
|
@@ -139,6 +147,127 @@ describe('silence-poke — escalation ladder', () => {
|
|
|
139
147
|
})
|
|
140
148
|
})
|
|
141
149
|
|
|
150
|
+
// PR1 (human-feel UX epic): the ack budget. A person you message
|
|
151
|
+
// answers in a beat — the framework enforces that baseline by arming an
|
|
152
|
+
// 'ack' poke if NOTHING has been sent within `thresholds.ack` of turn
|
|
153
|
+
// start. It is a one-shot nudge (the model still authors every word),
|
|
154
|
+
// deliberately OUTSIDE the soft/firm/fallback `pokesFired` ladder: if
|
|
155
|
+
// the model never acks, the ladder still escalates on its own schedule.
|
|
156
|
+
// See `reference/conversational-pacing.md` and the "Open with an
|
|
157
|
+
// acknowledgement" bullet in `profiles/_shared/telegram-style.md.hbs`.
|
|
158
|
+
//
|
|
159
|
+
// NB: `setupDeps` disables the ack budget by default (ack = Number.MAX_SAFE_INTEGER);
|
|
160
|
+
// every test here opts back in with a real `ack` threshold.
|
|
161
|
+
describe('silence-poke — ack budget (PR1 human-feel UX)', () => {
  /** Ack threshold every test in this suite opts back in with. */
  const ACK_BUDGET_MS = 10_000
  const CHAT_KEY = 'chat:0'

  /** Install test deps with the ack budget re-enabled. */
  const armAckBudget = () => setupDeps({ thresholds: { ack: ACK_BUDGET_MS } })

  /** Number of `silence_poke_fired` metrics emitted so far at `level`. */
  const firedCount = (
    fixtures: ReturnType<typeof setupDeps>,
    level: 'ack' | 'soft',
  ): number =>
    fixtures.emitted.filter(
      (e) => e.kind === 'silence_poke_fired' && e.level === level,
    ).length

  it('arms an ack poke at the ack threshold when nothing has been sent', () => {
    const fixtures = armAckBudget()
    startTurn(CHAT_KEY, 0)

    // One second short of the budget: nothing armed, nothing emitted.
    __tickForTests(9_000)
    expect(consumeArmedPoke()).toBeNull()
    expect(fixtures.emitted).toHaveLength(0)

    // Exactly at the budget: a single ack-level metric and an armed poke.
    __tickForTests(10_000)
    expect(fixtures.emitted).toEqual([
      expect.objectContaining({ kind: 'silence_poke_fired', level: 'ack' }),
    ])
    const pokeText = consumeArmedPoke()
    expect(pokeText).toContain('[silence-poke]')
    expect(pokeText).toContain('reply')
  })

  it('does NOT arm an ack poke if an outbound landed before the budget', () => {
    const fixtures = armAckBudget()
    startTurn(CHAT_KEY, 0)
    // The model acknowledged quickly, well inside the budget.
    noteOutbound(CHAT_KEY, 3_000)
    for (const at of [10_000, 20_000]) {
      __tickForTests(at)
    }
    expect(consumeArmedPoke()).toBeNull()
    expect(firedCount(fixtures, 'ack')).toBe(0)
  })

  it('is one-shot — never re-arms even if the model goes quiet again', () => {
    const fixtures = armAckBudget()
    startTurn(CHAT_KEY, 0)
    __tickForTests(10_000) // ack fires
    consumeArmedPoke() // drain it
    noteOutbound(CHAT_KEY, 12_000) // model finally acks
    // A second stretch of silence belongs to the soft poke: the ack poke
    // is only ever about the FIRST outbound, so it must not fire twice.
    __tickForTests(40_000)
    expect(firedCount(fixtures, 'ack')).toBe(1)
  })

  it('ackPokeFired resets across turns even when endTurn was skipped (CC-5 invariant)', () => {
    // Same shape as the subagentDispatchActive CC-5 guard. `ackPokeFired`
    // is turn-scoped; when an abnormal abort skips endTurn, the only thing
    // stopping it leaking into the next turn is startTurn overwriting the
    // whole state. Pinned here so a read-modify-write refactor of
    // startTurn fails loudly.
    armAckBudget()
    startTurn('k', 0)
    __tickForTests(10_000) // ack fires
    expect(__getStateForTests('k')?.ackPokeFired).toBe(true)
    // Second turn on the same key with no endTurn in between.
    startTurn('k', 1_000_000)
    expect(__getStateForTests('k')?.ackPokeFired).toBe(false)
  })

  it('does not advance the ladder — soft still requires a full 75s of silence', () => {
    // The ack poke lives outside `pokesFired`, so firing it must not
    // shorten the wait for the soft poke.
    const fixtures = armAckBudget()
    startTurn(CHAT_KEY, 0)
    __tickForTests(10_000) // ack
    consumeArmedPoke()
    __tickForTests(70_000) // 70s total — still under the 75s soft threshold
    expect(firedCount(fixtures, 'soft')).toBe(0)
    __tickForTests(75_000)
    expect(firedCount(fixtures, 'soft')).toBe(1)
  })

  it('still escalates ack -> soft -> firm -> fallback on a turn that never acks', () => {
    const fixtures = armAckBudget()
    startTurn(CHAT_KEY, 0)
    __tickForTests(10_000) // ack
    consumeArmedPoke()
    __tickForTests(75_000) // soft
    consumeArmedPoke()
    __tickForTests(180_000) // firm
    consumeArmedPoke()
    __tickForTests(300_000) // fallback

    // Collapse each emitted metric to a short label so the order is easy to pin.
    const trail = fixtures.emitted.map((e) => {
      if (e.kind === 'silence_poke_fired') {
        return `poke:${e.level}`
      }
      if (e.kind === 'silence_fallback_sent') {
        return `fallback:${e.fallback_kind}`
      }
      return e.kind
    })
    expect(trail).toEqual([
      'poke:ack',
      'poke:soft',
      'poke:firm',
      'fallback:working',
    ])
  })

  it('formatPokeText("ack") nudges for a human acknowledgement via reply', () => {
    const pokeText = formatPokeText('ack')
    expect(pokeText).toContain('[silence-poke]')
    expect(pokeText.toLowerCase()).toContain('acknowledg')
    expect(pokeText).toContain('reply')
  })
})
|
|
270
|
+
|
|
142
271
|
describe('silence-poke — outbound resets clock + success measurement', () => {
|
|
143
272
|
it('noteOutbound resets the silence clock', () => {
|
|
144
273
|
setupDeps()
|
|
@@ -608,7 +737,9 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
|
|
|
608
737
|
__setDepsForTests({
|
|
609
738
|
emitMetric: (e) => fx.emitted.push(e),
|
|
610
739
|
onFrameworkFallback: () => { throw new Error('oh no') },
|
|
611
|
-
|
|
740
|
+
// ack budget out of the way — this test exercises the
|
|
741
|
+
// soft/firm/fallback ladder under a throwing fallback handler.
|
|
742
|
+
thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
|
|
612
743
|
})
|
|
613
744
|
startTurn('k', 0)
|
|
614
745
|
expect(() => {
|
|
@@ -625,7 +756,8 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
|
|
|
625
756
|
__setDepsForTests({
|
|
626
757
|
emitMetric: (e) => fx.emitted.push(e),
|
|
627
758
|
onFrameworkFallback: () => Promise.reject(new Error('async fail')),
|
|
628
|
-
|
|
759
|
+
// ack budget out of the way — see the throwing-handler test above.
|
|
760
|
+
thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
|
|
629
761
|
})
|
|
630
762
|
startTurn('k', 0)
|
|
631
763
|
__tickForTests(75_000)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bridge-flap resilience scenario — regression guard for #1613 / #1616.
|
|
3
|
+
*
|
|
4
|
+
* ## The bug this guards
|
|
5
|
+
*
|
|
6
|
+
* The handoff-briefing summarizer shells out to a headless `claude -p`
|
|
7
|
+
* once per turn (handoff Stop hook). Before #1616 it ran without
|
|
8
|
+
* `--strict-mcp-config`, so it auto-discovered the agent's project
|
|
9
|
+
* `.mcp.json` and started every MCP server in it — including
|
|
10
|
+
* `switchroom-telegram`. That spun up a *second* telegram bridge
|
|
11
|
+
* process which registered against the same gateway socket as the
|
|
12
|
+
* live agent's real bridge; the two collided under the gateway's
|
|
13
|
+
* register-race close, producing an A↔B "bridge reconnect race" flap
|
|
14
|
+
* every ~2s for the ~7-9s the `claude -p` lived. The handoff hook
|
|
15
|
+
* fires every turn, so did the flap. A turn whose completion landed
|
|
16
|
+
* inside a flap burst could have its `turn_end` signal eaten — the
|
|
17
|
+
* agent looked wedged for that turn.
|
|
18
|
+
*
|
|
19
|
+
* The fix (#1616): the summarizer passes `--strict-mcp-config`, so
|
|
20
|
+
* the headless `claude -p` loads zero MCP servers and never spawns a
|
|
21
|
+
* competing bridge. The structural guard against a new offending
|
|
22
|
+
* callsite is `tests/bridge-flap-regression-guard.test.ts`; this
|
|
23
|
+
* scenario is the behavioural backstop.
|
|
24
|
+
*
|
|
25
|
+
* ## What this scenario asserts (root-cause-agnostic by design)
|
|
26
|
+
*
|
|
27
|
+
* The checks are symptom-based, so they catch a flap reintroduced by
|
|
28
|
+
* ANY future change — not only a regression of #1616:
|
|
29
|
+
*
|
|
30
|
+
* 1. Send a handful of DMs in succession — each drives a turn (and a
|
|
31
|
+
* handoff-hook fire). **Primary assertion:** every DM gets a reply
|
|
32
|
+
* within budget. Directly catches both the flap (eats turn_end)
|
|
33
|
+
* and the wedge (a zero-bridge gap strands the inbound).
|
|
34
|
+
* 2. **Forensic assertion:** inspect the agent's gateway-supervisor.log
|
|
35
|
+
* over the test window and assert the `bridge disconnected` density
|
|
36
|
+
* stays BELOW a flap threshold. One healthy persistent bridge
|
|
37
|
+
* produces only a trickle of disconnects; a sustained reconnect
|
|
38
|
+
* race produces dozens in tight ~2s bursts.
|
|
39
|
+
*
|
|
40
|
+
* ## Why the log inspection
|
|
41
|
+
*
|
|
42
|
+
* A flap is a server-side phenomenon that does not always surface as
|
|
43
|
+
* a missed reply (a burst can self-heal in ~20-30s). The `bridge
|
|
44
|
+
* disconnected` count is the transport-agnostic flap symptom. This
|
|
45
|
+
* scenario shells into the agent container via `docker exec` to read
|
|
46
|
+
* the gateway log; if docker is unavailable the log assertion is
|
|
47
|
+
* skipped with a warning and the responsiveness checks still run.
|
|
48
|
+
*
|
|
49
|
+
* ## Tolerances
|
|
50
|
+
*
|
|
51
|
+
* - `DISCONNECT_FLAP_THRESHOLD` is the max acceptable `bridge
|
|
52
|
+
* disconnected` count over the window. Post-#1616 a healthy ~4-turn
|
|
53
|
+
* run sits well under 16 (measured ~8-13, including anonymous
|
|
54
|
+
* probe-connection churn); a sustained flap is 20-40+. 16 sits
|
|
55
|
+
* comfortably in the gap.
|
|
56
|
+
*/
|
|
57
|
+
|
|
58
|
+
import { describe, expect, it } from "vitest";
|
|
59
|
+
import { execSync } from "node:child_process";
|
|
60
|
+
import { spinUp } from "../harness.js";
|
|
61
|
+
import type { ObservedMessage } from "../driver.js";
|
|
62
|
+
|
|
63
|
+
const AGENT = "test-harness";
|
|
64
|
+
const CONTAINER = `switchroom-${AGENT}`;
|
|
65
|
+
const GATEWAY_LOG = "/var/log/switchroom/gateway-supervisor.log";
|
|
66
|
+
|
|
67
|
+
const DM_COUNT = 4;
|
|
68
|
+
const PER_DM_TIMEOUT_MS = 30_000;
|
|
69
|
+
const OVERALL_DEADLINE_MS = 180_000;
|
|
70
|
+
|
|
71
|
+
// Post-#1616 a healthy ~4-turn run logs ~8-13 `bridge disconnected`
|
|
72
|
+
// lines (one persistent bridge + anonymous probe-connection churn).
|
|
73
|
+
// A sustained A↔B flap produces 20-40+ in tight ~2s bursts. 16 sits
|
|
74
|
+
// in the gap.
|
|
75
|
+
const DISCONNECT_FLAP_THRESHOLD = 16;
|
|
76
|
+
|
|
77
|
+
/**
 * Total line count of the agent's gateway-supervisor.log.
 *
 * Runs `wc -l` on the log inside the agent container via `docker exec`.
 *
 * @returns The line count, or `null` if the container/log is unreachable
 *   (CI without the container) or the command output is not a
 *   non-negative integer.
 */
function gatewayLogLineCount(): number | null {
  try {
    const out = execSync(
      `docker exec ${CONTAINER} sh -lc 'wc -l < ${GATEWAY_LOG}'`,
      { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] },
    );
    const count = Number.parseInt(out.trim(), 10);
    // parseInt yields NaN on empty or unexpected output. NaN is not null,
    // so report it as "unavailable" rather than letting it pass a
    // null-check and be used as a line offset.
    return Number.isSafeInteger(count) && count >= 0 ? count : null;
  } catch {
    // docker missing, container not running, or log not readable.
    return null;
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Count `bridge disconnected` lines after `sinceLine` in the gateway log.
 *
 * Runs a single awk program inside the agent container via `docker exec`.
 * A single awk (instead of `awk | grep -c … || true`) lets a missing or
 * unreadable log surface as a non-zero exit, so it is reported as `null`
 * rather than as a misleading count of 0.
 *
 * @param sinceLine Number of leading log lines to ignore; must be a
 *   non-negative integer.
 * @returns The number of matching lines, or `null` if `sinceLine` is
 *   invalid, the container/log is unreachable, or the output is not a
 *   non-negative integer.
 */
function disconnectCountSince(sinceLine: number): number | null {
  // The value is interpolated into the awk program; NaN or a fractional
  // or negative offset would silently change which lines are counted.
  if (!Number.isSafeInteger(sinceLine) || sinceLine < 0) {
    return null;
  }
  try {
    const out = execSync(
      `docker exec ${CONTAINER} sh -lc ` +
        `'awk "NR>${sinceLine} && /bridge disconnected/ { n++ } END { print n+0 }" ${GATEWAY_LOG}'`,
      { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] },
    );
    const count = Number.parseInt(out.trim(), 10);
    return Number.isSafeInteger(count) && count >= 0 ? count : null;
  } catch {
    // docker missing, container gone, or awk could not read the log.
    return null;
  }
}
|
|
104
|
+
|
|
105
|
+
describe("uat: bridge-flap resilience — agent stays responsive, gateway does not flap", () => {
  /** Accepts a bot-authored message that is not an edit. */
  const isFreshBotMessage = (m: ObservedMessage) => m.fromBot && !m.edited;

  it(
    "every DM gets a reply and the gateway does not flap across turns",
    async () => {
      // Snapshot the log length first so only lines written during this
      // run are counted afterwards. null means docker/the log is unavailable.
      const logLinesBefore = gatewayLogLineCount();
      const sc = await spinUp({ agent: AGENT });
      try {
        const deadlineAt = Date.now() + OVERALL_DEADLINE_MS;

        for (let turn = 1; turn <= DM_COUNT; turn += 1) {
          await sc.sendDM(`flap-resilience probe ${turn}/${DM_COUNT}: reply with OK${turn}`);

          // Each turn gets the per-DM budget, capped by what is left overall.
          const leftOverall = deadlineAt - Date.now();
          const budget = Math.min(PER_DM_TIMEOUT_MS, leftOverall);
          expect(
            budget,
            `overall deadline hit before DM ${turn} — earlier turns were too slow`,
          ).toBeGreaterThan(0);

          const reply = await sc.expectMessage(isFreshBotMessage, {
            from: "bot",
            timeout: budget,
          });
          expect(
            reply.text.length,
            `DM ${turn}/${DM_COUNT} produced an empty reply — a flap may have ` +
              `eaten the turn_end signal`,
          ).toBeGreaterThan(0);
        }

        // Every turn got a reply. Now look at the server-side flap signal,
        // if the gateway log was reachable at the start.
        if (logLinesBefore == null) {
          console.warn(
            "[bridge-flap-resilience] docker exec unavailable — skipping " +
              "the gateway-log flap assertion; responsiveness checks passed.",
          );
        } else {
          const disconnects = disconnectCountSince(logLinesBefore);
          expect(
            disconnects,
            "could not read gateway log after the run — container went away",
          ).not.toBeNull();
          expect(
            disconnects as number,
            `gateway logged ${disconnects} "bridge disconnected" lines across ` +
              `${DM_COUNT} turns — at/above the flap threshold ` +
              `(${DISCONNECT_FLAP_THRESHOLD}). A parasitic bridge is racing the ` +
              `live one — check for a headless 'claude -p' spawned without ` +
              `--strict-mcp-config (#1613/#1616).`,
          ).toBeLessThan(DISCONNECT_FLAP_THRESHOLD);
        }
      } finally {
        await sc.tearDown();
      }
    },
    // Vitest timeout: the scenario budget plus headroom for spin-up/tear-down.
    OVERALL_DEADLINE_MS + 30_000,
  );
});
|