switchroom 0.13.3 → 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@
19
19
  import { existsSync, mkdirSync, appendFileSync } from 'node:fs'
20
20
  import { dirname, join } from 'node:path'
21
21
  import { captureEvent } from './analytics-posthog.js'
22
+ import type { PokeLevel } from './silence-poke.js'
22
23
 
23
24
  export type RuntimeMetricEvent =
24
25
  /**
@@ -62,28 +63,33 @@ export type RuntimeMetricEvent =
62
63
  ended_via: 'reply' | 'stream_reply_done' | 'silent' | 'forced' | 'framework_fallback'
63
64
  }
64
65
  /**
65
- * Framework safety-net: a silence-poke was armed at 75s (soft) or
66
- * 180s (firm). The system-reminder appended to the next tool result
67
- * nudges the model to send an update. Doubles as a design-health
68
- * signal — if these fire frequently, the conversational-pacing
69
- * prompt isn't doing its job.
66
+ * Framework safety-net: a silence-poke was armed. `ack` is the early
67
+ * (~10s) ack-budget poke — the model has sent NOTHING this turn and is
68
+ * leaving the user on a silent chat. `soft` (75s) / `firm` (180s) are
69
+ * the silence-since-last-outbound ladder. The system-reminder appended
70
+ * to the next tool result nudges the model to send an update. Doubles
71
+ * as a design-health signal — if these fire frequently, the
72
+ * conversational-pacing prompt isn't doing its job.
70
73
  */
71
74
  | {
72
75
  kind: 'silence_poke_fired'
73
76
  key: string
74
- level: 'soft' | 'firm'
77
+ level: PokeLevel
75
78
  silence_ms: number
76
79
  subagent_wait: boolean
77
80
  }
78
81
  /**
79
82
  * The model sent an outbound message within the success window
80
83
  * (default 15s) after a poke fired. Pair with `silence_poke_fired`
81
- * to compute success rate — the design target is >80%.
84
+ * to compute success rate — the design target is >80%. (`ack`-level
85
+ * success is not currently emitted — the ack poke sits outside the
86
+ * `pokesFired` ladder that `noteOutbound` measures against; the type admits
87
+ * `ack` only so the silence-poke metric union stays assignable.)
82
88
  */
83
89
  | {
84
90
  kind: 'silence_poke_succeeded'
85
91
  key: string
86
- level: 'soft' | 'firm'
92
+ level: PokeLevel
87
93
  latency_ms: number
88
94
  }
89
95
  /**
@@ -43,7 +43,7 @@
43
43
  * pacing prompt still applies; only the framework safety net is off.
44
44
  */
45
45
 
46
- export type PokeLevel = 'soft' | 'firm'
46
+ export type PokeLevel = 'ack' | 'soft' | 'firm'
47
47
 
48
48
  /** #1292: snapshot of an in-flight tool call, surfaced in the 300s
49
49
  * framework-fallback message so the user sees the actual observable
@@ -76,6 +76,10 @@ export interface SilencePokeState {
76
76
  lastThinkingAt: number | null
77
77
  /** True once the 300s framework fallback has fired this turn. */
78
78
  fallbackFired: boolean
79
+ /** True once the early ack-budget poke has fired this turn. One-shot:
80
+ * the ack nudge is specifically about the *first* outbound, so it
81
+ * never re-arms even after the model later goes quiet again. */
82
+ ackPokeFired: boolean
79
83
  /** Wall-clock ms of last poke fire — used for poke-success latency. */
80
84
  lastPokeFiredAt: number | null
81
85
  /** #1292: in-flight tool calls keyed by toolUseId. Populated by
@@ -91,6 +95,12 @@ export interface SilencePokeState {
91
95
  }
92
96
 
93
97
  export interface ThresholdsMs {
98
+ /** Ack budget: if NO outbound at all has landed this many ms after
99
+ * turn start, arm an 'ack' poke. This is the framework enforcing the
100
+ * human-baseline "acknowledge within a beat" — far tighter than the
101
+ * 75s `soft` threshold, which measures silence-since-last-outbound
102
+ * and is the wrong instrument for "you never said hello." */
103
+ ack: number
94
104
  soft: number
95
105
  firm: number
96
106
  fallback: number
@@ -101,6 +111,7 @@ export interface ThresholdsMs {
101
111
  }
102
112
 
103
113
  export const DEFAULT_THRESHOLDS: ThresholdsMs = {
114
+ ack: 10_000,
104
115
  soft: 75_000,
105
116
  firm: 180_000,
106
117
  fallback: 300_000,
@@ -176,6 +187,7 @@ export function startTurn(key: string, now: number): void {
176
187
  subagentDispatchActive: false,
177
188
  lastThinkingAt: null,
178
189
  fallbackFired: false,
190
+ ackPokeFired: false,
179
191
  lastPokeFiredAt: null,
180
192
  inFlightTools: new Map(),
181
193
  })
@@ -340,6 +352,16 @@ export function endTurn(key: string): void {
340
352
 
341
353
  /** Verbatim poke text. Wording is load-bearing — see issue #1122 design. */
342
354
  export function formatPokeText(level: PokeLevel): string {
355
+ if (level === 'ack') {
356
+ return (
357
+ "[silence-poke] You haven't sent the user anything yet this turn — "
358
+ + 'they are looking at a silent chat. Send a short, human one-line '
359
+ + 'acknowledgement now via `reply` (e.g. "on it — checking"), in your '
360
+ + "persona's voice, before you do any more work. A good colleague "
361
+ + "answers in a beat; don't leave the message hanging while you think. "
362
+ + 'If the full answer is genuinely seconds away, send that instead.'
363
+ )
364
+ }
343
365
  if (level === 'soft') {
344
366
  return (
345
367
  "[silence-poke] You've been silent to the user for 75s. If you're "
@@ -437,6 +459,32 @@ function tick(now: number): void {
437
459
  ? thresholds.subagentSoft
438
460
  : thresholds.soft
439
461
 
462
+ // Ack budget — the framework enforcing the human-baseline "answer
463
+ // in a beat." Fires once, only when NOTHING has been sent this turn
464
+ // (`lastOutboundAt == null`), well before the 75s `soft` threshold.
465
+ // `soft` measures silence-since-last-outbound and is the wrong
466
+ // instrument for "you never acknowledged me." Independent of the
467
+ // soft/firm/fallback ladder: if the model never acks, it still
468
+ // escalates soft → firm → fallback on schedule after this.
469
+ if (
470
+ !s.ackPokeFired
471
+ && s.lastOutboundAt == null
472
+ && s.pokesFired === 0
473
+ && silence >= thresholds.ack
474
+ ) {
475
+ s.pokeArmed = { level: 'ack' }
476
+ s.ackPokeFired = true
477
+ s.lastPokeFiredAt = now
478
+ activeDeps.emitMetric({
479
+ kind: 'silence_poke_fired',
480
+ key,
481
+ level: 'ack',
482
+ silence_ms: silence,
483
+ subagent_wait: s.subagentDispatchActive,
484
+ })
485
+ continue
486
+ }
487
+
440
488
  if (s.pokesFired === 0 && silence >= softThreshold) {
441
489
  s.pokeArmed = { level: 'soft' }
442
490
  s.pokesFired = 1
@@ -33,7 +33,15 @@ function setupDeps(opts?: { thresholds?: Partial<typeof DEFAULT_THRESHOLDS> }):
33
33
  __setDepsForTests({
34
34
  emitMetric: (e) => fixtures.emitted.push(e),
35
35
  onFrameworkFallback: (ctx) => { fixtures.fallbacks.push(ctx) },
36
- thresholdsMs: { ...DEFAULT_THRESHOLDS, ...(opts?.thresholds ?? {}) },
36
+ // The ack budget (a new poke that fires *earlier* than `soft`) is
37
+ // disabled by default in this fixture so the soft/firm/fallback
38
+ // ladder tests stay isolated from it. The 'ack budget' describe
39
+ // block opts back in with a real value.
40
+ thresholdsMs: {
41
+ ...DEFAULT_THRESHOLDS,
42
+ ack: Number.MAX_SAFE_INTEGER,
43
+ ...(opts?.thresholds ?? {}),
44
+ },
37
45
  })
38
46
  return fixtures
39
47
  }
@@ -139,6 +147,127 @@ describe('silence-poke — escalation ladder', () => {
139
147
  })
140
148
  })
141
149
 
150
+ // PR1 (human-feel UX epic): the ack budget. A person you message
151
+ // answers in a beat — the framework enforces that baseline by arming an
152
+ // 'ack' poke if NOTHING has been sent within `thresholds.ack` of turn
153
+ // start. It is a one-shot nudge (the model still authors every word),
154
+ // deliberately OUTSIDE the soft/firm/fallback `pokesFired` ladder: if
155
+ // the model never acks, the ladder still escalates on its own schedule.
156
+ // See `reference/conversational-pacing.md` and the "Open with an
157
+ // acknowledgement" bullet in `profiles/_shared/telegram-style.md.hbs`.
158
+ //
159
+ // NB: `setupDeps` disables the ack budget by default (ack = Number.MAX_SAFE_INTEGER);
160
+ // every test here opts back in with a real `ack` threshold.
161
+ describe('silence-poke — ack budget (PR1 human-feel UX)', () => {
162
+ it('arms an ack poke at the ack threshold when nothing has been sent', () => {
163
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
164
+ startTurn('chat:0', 0)
165
+
166
+ __tickForTests(9_000) // before the ack budget
167
+ expect(consumeArmedPoke()).toBeNull()
168
+ expect(fx.emitted).toHaveLength(0)
169
+
170
+ __tickForTests(10_000) // at the ack budget
171
+ expect(fx.emitted).toEqual([
172
+ expect.objectContaining({ kind: 'silence_poke_fired', level: 'ack' }),
173
+ ])
174
+ const text = consumeArmedPoke()
175
+ expect(text).toContain('[silence-poke]')
176
+ expect(text).toContain('reply')
177
+ })
178
+
179
+ it('does NOT arm an ack poke if an outbound landed before the budget', () => {
180
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
181
+ startTurn('chat:0', 0)
182
+ noteOutbound('chat:0', 3_000) // model acked fast — inside the budget
183
+ __tickForTests(10_000)
184
+ __tickForTests(20_000)
185
+ expect(consumeArmedPoke()).toBeNull()
186
+ expect(
187
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'ack'),
188
+ ).toHaveLength(0)
189
+ })
190
+
191
+ it('is one-shot — never re-arms even if the model goes quiet again', () => {
192
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
193
+ startTurn('chat:0', 0)
194
+ __tickForTests(10_000) // ack fires
195
+ consumeArmedPoke() // drain it
196
+ noteOutbound('chat:0', 12_000) // model finally acks
197
+ // The model goes quiet again. The ack poke is specifically about the
198
+ // FIRST outbound — it must not fire twice. A later silence is the
199
+ // soft poke's job, not the ack budget's.
200
+ __tickForTests(40_000)
201
+ expect(
202
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'ack'),
203
+ ).toHaveLength(1)
204
+ })
205
+
206
+ it('ackPokeFired resets across turns even when endTurn was skipped (CC-5 invariant)', () => {
207
+ // Mirrors the subagentDispatchActive CC-5 guard: `ackPokeFired` is a
208
+ // turn-scoped one-shot flag, and the only thing that keeps it from
209
+ // leaking into the next turn (when an abnormal abort skips endTurn)
210
+ // is startTurn's unconditional state overwrite. Pin that here so a
211
+ // future read-modify-write refactor of startTurn fails loud.
212
+ setupDeps({ thresholds: { ack: 10_000 } })
213
+ startTurn('k', 0)
214
+ __tickForTests(10_000) // ack fires
215
+ expect(__getStateForTests('k')?.ackPokeFired).toBe(true)
216
+ // Turn 2 in the same key, no endTurn — startTurn MUST clear the flag.
217
+ startTurn('k', 1_000_000)
218
+ expect(__getStateForTests('k')?.ackPokeFired).toBe(false)
219
+ })
220
+
221
+ it('does not advance the ladder — soft still requires a full 75s of silence', () => {
222
+ // The ack poke is deliberately outside `pokesFired`. After it fires,
223
+ // a soft poke must still wait the normal 75s.
224
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
225
+ startTurn('chat:0', 0)
226
+ __tickForTests(10_000) // ack
227
+ consumeArmedPoke()
228
+ __tickForTests(70_000) // 70s total — under the 75s soft threshold
229
+ expect(
230
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'soft'),
231
+ ).toHaveLength(0)
232
+ __tickForTests(75_000)
233
+ expect(
234
+ fx.emitted.filter((e) => e.kind === 'silence_poke_fired' && e.level === 'soft'),
235
+ ).toHaveLength(1)
236
+ })
237
+
238
+ it('still escalates ack -> soft -> firm -> fallback on a turn that never acks', () => {
239
+ const fx = setupDeps({ thresholds: { ack: 10_000 } })
240
+ startTurn('chat:0', 0)
241
+ __tickForTests(10_000) // ack
242
+ consumeArmedPoke()
243
+ __tickForTests(75_000) // soft
244
+ consumeArmedPoke()
245
+ __tickForTests(180_000) // firm
246
+ consumeArmedPoke()
247
+ __tickForTests(300_000) // fallback
248
+ const trail = fx.emitted.map((e) =>
249
+ e.kind === 'silence_poke_fired'
250
+ ? `poke:${e.level}`
251
+ : e.kind === 'silence_fallback_sent'
252
+ ? `fallback:${e.fallback_kind}`
253
+ : e.kind,
254
+ )
255
+ expect(trail).toEqual([
256
+ 'poke:ack',
257
+ 'poke:soft',
258
+ 'poke:firm',
259
+ 'fallback:working',
260
+ ])
261
+ })
262
+
263
+ it('formatPokeText("ack") nudges for a human acknowledgement via reply', () => {
264
+ const text = formatPokeText('ack')
265
+ expect(text).toContain('[silence-poke]')
266
+ expect(text.toLowerCase()).toContain('acknowledg')
267
+ expect(text).toContain('reply')
268
+ })
269
+ })
270
+
142
271
  describe('silence-poke — outbound resets clock + success measurement', () => {
143
272
  it('noteOutbound resets the silence clock', () => {
144
273
  setupDeps()
@@ -608,7 +737,9 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
608
737
  __setDepsForTests({
609
738
  emitMetric: (e) => fx.emitted.push(e),
610
739
  onFrameworkFallback: () => { throw new Error('oh no') },
611
- thresholdsMs: DEFAULT_THRESHOLDS,
740
+ // ack budget out of the way — this test exercises the
741
+ // soft/firm/fallback ladder under a throwing fallback handler.
742
+ thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
612
743
  })
613
744
  startTurn('k', 0)
614
745
  expect(() => {
@@ -625,7 +756,8 @@ describe('silence-poke — fallback handler errors do not break timer', () => {
625
756
  __setDepsForTests({
626
757
  emitMetric: (e) => fx.emitted.push(e),
627
758
  onFrameworkFallback: () => Promise.reject(new Error('async fail')),
628
- thresholdsMs: DEFAULT_THRESHOLDS,
759
+ // ack budget out of the way — see the throwing-handler test above.
760
+ thresholdsMs: { ...DEFAULT_THRESHOLDS, ack: Number.MAX_SAFE_INTEGER },
629
761
  })
630
762
  startTurn('k', 0)
631
763
  __tickForTests(75_000)
@@ -0,0 +1,166 @@
1
+ /**
2
+ * Bridge-flap resilience scenario — regression guard for #1613 / #1616.
3
+ *
4
+ * ## The bug this guards
5
+ *
6
+ * The handoff-briefing summarizer shells out to a headless `claude -p`
7
+ * once per turn (handoff Stop hook). Before #1616 it ran without
8
+ * `--strict-mcp-config`, so it auto-discovered the agent's project
9
+ * `.mcp.json` and started every MCP server in it — including
10
+ * `switchroom-telegram`. That spun up a *second* telegram bridge
11
+ * process which registered against the same gateway socket as the
12
+ * live agent's real bridge; the two collided under the gateway's
13
+ * register-race close, producing an A↔B "bridge reconnect race" flap
14
+ * every ~2s for the ~7-9s the `claude -p` lived. The handoff hook
15
+ * fires every turn, so the flap did too. A turn whose completion landed
16
+ * inside a flap burst could have its `turn_end` signal eaten — the
17
+ * agent looked wedged for that turn.
18
+ *
19
+ * The fix (#1616): the summarizer passes `--strict-mcp-config`, so
20
+ * the headless `claude -p` loads zero MCP servers and never spawns a
21
+ * competing bridge. The structural guard against a new offending
22
+ * callsite is `tests/bridge-flap-regression-guard.test.ts`; this
23
+ * scenario is the behavioural backstop.
24
+ *
25
+ * ## What this scenario asserts (root-cause-agnostic by design)
26
+ *
27
+ * The checks are symptom-based, so they catch a flap reintroduced by
28
+ * ANY future change — not only a regression of #1616:
29
+ *
30
+ * 1. Send a handful of DMs in succession — each drives a turn (and a
31
+ * handoff-hook fire). **Primary assertion:** every DM gets a reply
32
+ * within budget. Directly catches both the flap (eats turn_end)
33
+ * and the wedge (a zero-bridge gap strands the inbound).
34
+ * 2. **Forensic assertion:** inspect the agent's gateway-supervisor.log
35
+ * over the test window and assert the `bridge disconnected` count
36
+ * stays BELOW a flap threshold. One healthy persistent bridge
37
+ * produces only a trickle of disconnects; a sustained reconnect
38
+ * race produces dozens in tight ~2s bursts.
39
+ *
40
+ * ## Why the log inspection
41
+ *
42
+ * A flap is a server-side phenomenon that does not always surface as
43
+ * a missed reply (a burst can self-heal in ~20-30s). The `bridge
44
+ * disconnected` count is the transport-agnostic flap symptom. This
45
+ * scenario shells into the agent container via `docker exec` to read
46
+ * the gateway log; if docker is unavailable the log assertion is
47
+ * skipped with a warning and the responsiveness checks still run.
48
+ *
49
+ * ## Tolerances
50
+ *
51
+ * - `DISCONNECT_FLAP_THRESHOLD` is the max acceptable `bridge
52
+ * disconnected` count over the window. Post-#1616 a healthy ~4-turn
53
+ * run sits well under 16 (measured ~8-13, including anonymous
54
+ * probe-connection churn); a sustained flap is 20-40+. 16 sits
55
+ * comfortably in the gap.
56
+ */
57
+
58
+ import { describe, expect, it } from "vitest";
59
+ import { execSync } from "node:child_process";
60
+ import { spinUp } from "../harness.js";
61
+ import type { ObservedMessage } from "../driver.js";
62
+
63
+ const AGENT = "test-harness";
64
+ const CONTAINER = `switchroom-${AGENT}`;
65
+ const GATEWAY_LOG = "/var/log/switchroom/gateway-supervisor.log";
66
+
67
+ const DM_COUNT = 4;
68
+ const PER_DM_TIMEOUT_MS = 30_000;
69
+ const OVERALL_DEADLINE_MS = 180_000;
70
+
71
+ // Post-#1616 a healthy ~4-turn run logs ~8-13 `bridge disconnected`
72
+ // lines (one persistent bridge + anonymous probe-connection churn).
73
+ // A sustained A↔B flap produces 20-40+ in tight ~2s bursts. 16 sits
74
+ // in the gap.
75
+ const DISCONNECT_FLAP_THRESHOLD = 16;
76
+
77
+ /** Total line count of the agent's gateway-supervisor.log, or null if
78
+ * the container/log is unreachable (CI without the container). */
79
+ function gatewayLogLineCount(): number | null {
80
+ try {
81
+ const out = execSync(
82
+ `docker exec ${CONTAINER} sh -lc 'wc -l < ${GATEWAY_LOG}'`,
83
+ { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] },
84
+ );
85
+ return parseInt(out.trim(), 10);
86
+ } catch {
87
+ return null;
88
+ }
89
+ }
90
+
91
+ /** Count `bridge disconnected` lines after `sinceLine` in the log. */
92
+ function disconnectCountSince(sinceLine: number): number | null {
93
+ try {
94
+ const out = execSync(
95
+ `docker exec ${CONTAINER} sh -lc ` +
96
+ `'awk "NR>${sinceLine}" ${GATEWAY_LOG} | grep -c "bridge disconnected" || true'`,
97
+ { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] },
98
+ );
99
+ return parseInt(out.trim(), 10);
100
+ } catch {
101
+ return null;
102
+ }
103
+ }
104
+
105
+ describe("uat: bridge-flap resilience — agent stays responsive, gateway does not flap", () => {
106
+ it(
107
+ "every DM gets a reply and the gateway does not flap across turns",
108
+ async () => {
109
+ const baselineLine = gatewayLogLineCount();
110
+ const sc = await spinUp({ agent: AGENT });
111
+ try {
112
+ const overallDeadline = Date.now() + OVERALL_DEADLINE_MS;
113
+
114
+ for (let i = 1; i <= DM_COUNT; i++) {
115
+ await sc.sendDM(`flap-resilience probe ${i}/${DM_COUNT}: reply with OK${i}`);
116
+
117
+ const remaining = Math.min(
118
+ PER_DM_TIMEOUT_MS,
119
+ overallDeadline - Date.now(),
120
+ );
121
+ expect(
122
+ remaining,
123
+ `overall deadline hit before DM ${i} — earlier turns were too slow`,
124
+ ).toBeGreaterThan(0);
125
+
126
+ const reply = await sc.expectMessage(
127
+ (m: ObservedMessage) => m.fromBot && !m.edited,
128
+ { from: "bot", timeout: remaining },
129
+ );
130
+ expect(
131
+ reply.text.length,
132
+ `DM ${i}/${DM_COUNT} produced an empty reply — a flap may have ` +
133
+ `eaten the turn_end signal`,
134
+ ).toBeGreaterThan(0);
135
+ }
136
+
137
+ // Responsiveness held for all DM_COUNT turns. Now check the
138
+ // server-side flap signal.
139
+ if (baselineLine == null) {
140
+ console.warn(
141
+ "[bridge-flap-resilience] docker exec unavailable — skipping " +
142
+ "the gateway-log flap assertion; responsiveness checks passed.",
143
+ );
144
+ return;
145
+ }
146
+
147
+ const disconnectCount = disconnectCountSince(baselineLine);
148
+ expect(
149
+ disconnectCount,
150
+ "could not read gateway log after the run — container went away",
151
+ ).not.toBeNull();
152
+ expect(
153
+ disconnectCount as number,
154
+ `gateway logged ${disconnectCount} "bridge disconnected" lines across ` +
155
+ `${DM_COUNT} turns — at/above the flap threshold ` +
156
+ `(${DISCONNECT_FLAP_THRESHOLD}). A parasitic bridge is racing the ` +
157
+ `live one — check for a headless 'claude -p' spawned without ` +
158
+ `--strict-mcp-config (#1613/#1616).`,
159
+ ).toBeLessThan(DISCONNECT_FLAP_THRESHOLD);
160
+ } finally {
161
+ await sc.tearDown();
162
+ }
163
+ },
164
+ OVERALL_DEADLINE_MS + 30_000,
165
+ );
166
+ });