switchroom 0.14.2 → 0.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Behavioral tests for the always-allow grant verification helper
3
+ * (`isRulePersisted` in `permission-rule.ts`).
4
+ *
5
+ * The `perm:always:*` handler in gateway.ts calls `isRulePersisted` after
6
+ * `switchroom agent grant` returns to confirm the rule actually landed in
7
+ * `tools.allow`. These tests drive the invariants that the structural test
8
+ * in `always-allow-grant.test.ts` can only pin by text-slicing:
9
+ *
10
+ * 1. exec "succeeds" but the reloaded allow-list does NOT contain the
11
+ * rule → isRulePersisted returns false (loud-failure path).
12
+ * 2. reloaded allow-list DOES contain the rule → returns true
13
+ * (success path).
14
+ * 3. Realistic rule values (`Skill(garmin)`, `Bash`, `mcp__x__y`) round-
15
+ * trip correctly — guards against normalization divergence where the
16
+ * value written by `agent grant` and the value read back from yaml
17
+ * diverge in shape.
18
+ *
19
+ * Because `isRulePersisted` is a pure function (takes the already-resolved
20
+ * allow-list directly), no mocking of `loadSwitchroomConfig` /
21
+ * `resolveAgentConfig` is required here. The handler's interaction with
22
+ * those config loaders is covered by the structural test.
23
+ */
24
+
25
+ import { describe, it, expect } from 'vitest'
26
+ import { isRulePersisted, resolveAlwaysAllowRule } from '../permission-rule.js'
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // Core behavioral invariants
30
+ // ---------------------------------------------------------------------------
31
+
32
+ describe('isRulePersisted — failure path', () => {
33
+ it('returns false when the allow-list is empty (exec succeeded but nothing was written)', () => {
34
+ expect(isRulePersisted([], 'Bash')).toBe(false)
35
+ })
36
+
37
+ it('returns false when the rule is absent from a non-empty allow-list', () => {
38
+ expect(isRulePersisted(['Read', 'Write', 'Edit'], 'Bash')).toBe(false)
39
+ })
40
+
41
+ it('returns false for a Skill rule when the list only contains the bare tool name', () => {
42
+ // `agent grant` for Skill(garmin) should write `Skill(garmin)`, not
43
+ // `Skill`. If the yaml ended up with the wrong shape, the verification
44
+ // must catch it.
45
+ expect(isRulePersisted(['Skill'], 'Skill(garmin)')).toBe(false)
46
+ })
47
+
48
+ it('returns false for a bare tool name when only the parameterized form is present', () => {
49
+ expect(isRulePersisted(['Skill(garmin)'], 'Bash')).toBe(false)
50
+ })
51
+
52
+ it('returns false when a similar-looking rule is present but not an exact match', () => {
53
+ expect(isRulePersisted(['mcp__garmin__read_activity'], 'mcp__garmin__list_activities')).toBe(false)
54
+ })
55
+ })
56
+
57
+ describe('isRulePersisted — success path', () => {
58
+ it('returns true when the exact rule is present', () => {
59
+ expect(isRulePersisted(['Read', 'Bash', 'Write'], 'Bash')).toBe(true)
60
+ })
61
+
62
+ it('returns true when the rule is the only entry', () => {
63
+ expect(isRulePersisted(['Skill(garmin)'], 'Skill(garmin)')).toBe(true)
64
+ })
65
+
66
+ it('returns true for a namespaced MCP tool rule', () => {
67
+ expect(isRulePersisted(['mcp__garmin__list_activities', 'Bash'], 'mcp__garmin__list_activities')).toBe(true)
68
+ })
69
+ })
70
+
71
+ // ---------------------------------------------------------------------------
72
+ // Round-trip: resolveAlwaysAllowRule → isRulePersisted
73
+ // Simulates the full handler flow: resolve the rule from a permission_request,
74
+ // "grant" it (allow-list contains the resolved rule.rule), then verify.
75
+ // Guards against normalization divergence between the value the handler
76
+ // resolves and the value `agent grant` writes + the config reader returns.
77
+ // ---------------------------------------------------------------------------
78
+
79
+ describe('rule round-trip through isRulePersisted', () => {
80
+ it('Skill tool: resolved rule persists correctly', () => {
81
+ const rule = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'garmin' }))
82
+ expect(rule).not.toBeNull()
83
+ // Simulate: allow-list now contains the rule that `agent grant` wrote.
84
+ expect(isRulePersisted([rule!.rule], rule!.rule)).toBe(true)
85
+ // Confirm the written form is `Skill(garmin)` — not a bare `Skill`.
86
+ expect(rule!.rule).toBe('Skill(garmin)')
87
+ })
88
+
89
+ it('Skill tool: absent rule is detected', () => {
90
+ const rule = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'garmin' }))
91
+ expect(rule).not.toBeNull()
92
+ // allow-list was not updated (silent grant failure).
93
+ expect(isRulePersisted([], rule!.rule)).toBe(false)
94
+ expect(isRulePersisted(['Skill'], rule!.rule)).toBe(false)
95
+ })
96
+
97
+ it('Bash tool: round-trips correctly', () => {
98
+ const rule = resolveAlwaysAllowRule('Bash', undefined)
99
+ expect(rule).not.toBeNull()
100
+ expect(rule!.rule).toBe('Bash')
101
+ expect(isRulePersisted(['Bash', 'Read'], rule!.rule)).toBe(true)
102
+ expect(isRulePersisted(['Read'], rule!.rule)).toBe(false)
103
+ })
104
+
105
+ it('MCP tool: round-trips with exact namespaced form', () => {
106
+ const toolName = 'mcp__garmin__list_activities'
107
+ const rule = resolveAlwaysAllowRule(toolName, undefined)
108
+ expect(rule).not.toBeNull()
109
+ expect(rule!.rule).toBe(toolName)
110
+ expect(isRulePersisted([toolName], rule!.rule)).toBe(true)
111
+ expect(isRulePersisted(['mcp__garmin__read_activity'], rule!.rule)).toBe(false)
112
+ })
113
+
114
+ it('multiple Skill tools do not cross-contaminate', () => {
115
+ const garmin = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'garmin' }))
116
+ const mail = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'mail' }))
117
+ expect(garmin).not.toBeNull()
118
+ expect(mail).not.toBeNull()
119
+ // Allow-list only has garmin's rule.
120
+ const allowList = [garmin!.rule]
121
+ expect(isRulePersisted(allowList, garmin!.rule)).toBe(true)
122
+ expect(isRulePersisted(allowList, mail!.rule)).toBe(false)
123
+ })
124
+ })
@@ -0,0 +1,93 @@
1
+ /**
2
+ * PR3b cutover — the turn-in-flight GATE now reads the delivery state
3
+ * machine (`isMachineInTurn`) instead of the PR3b `claudeBusyKeys` set.
4
+ *
5
+ * The bug this closes (gymbro/clerk, 2026-05-28): `claudeBusyKeys` is a
6
+ * per-delivery Set — every delivery `.add`s a key, but turn-end `.delete`s
7
+ * exactly one. When a turn-end is missed (or fires under a non-matching
8
+ * key) the set keeps an orphan, `size > 0` reads true forever, and EVERY
9
+ * subsequent inbound buffers as "held mid-turn" until the 5-min
10
+ * framework-fallback force-drains it.
11
+ *
12
+ * The machine cannot accumulate orphans: global state holds ONE
13
+ * `activeTurn`, so any matching turnEnd returns it to idle, and the TTL
14
+ * `tick` self-heals a missed turnEnd. These tests pin both the normal
15
+ * reopen and the dangle-recovery path on the accessors the gate reads.
16
+ */
17
+
18
+ import { describe, expect, it, beforeEach } from 'vitest'
19
+ import {
20
+ shadowEmit,
21
+ isMachineInTurn,
22
+ isDeliveryCutoverEnabled,
23
+ __shadowResetForTests,
24
+ } from '../gateway/inbound-delivery-machine-shadow.js'
25
+ import { TURN_TTL_MS, type ChatKey } from '../gateway/inbound-delivery-machine.js'
26
+
27
+ const KEY_A = '111:_' as ChatKey
28
+ const KEY_B = '222:_' as ChatKey
29
+
30
+ function inbound(key: ChatKey, at: number, msgId = 1) {
31
+ shadowEmit({ kind: 'inbound', key, msg: { msgId, isSteering: false, payload: null }, at })
32
+ }
33
+
34
+ describe('PR3b cutover gate accessors', () => {
35
+ beforeEach(() => __shadowResetForTests())
36
+
37
+ it('enabled by default (shadow on, no kill-switch in test env)', () => {
38
+ expect(isDeliveryCutoverEnabled()).toBe(true)
39
+ })
40
+
41
+ it('reads idle before any turn (bridge alive)', () => {
42
+ shadowEmit({ kind: 'bridgeUp', at: 1000 })
43
+ expect(isMachineInTurn()).toBe(false)
44
+ })
45
+
46
+ it('flips in-turn on a fresh inbound and reopens on turnEnd (the gate reopen)', () => {
47
+ shadowEmit({ kind: 'bridgeUp', at: 1000 })
48
+ inbound(KEY_A, 2000)
49
+ expect(isMachineInTurn()).toBe(true)
50
+ shadowEmit({ kind: 'turnEnd', key: KEY_A, at: 3000, outboundEmitted: true })
51
+ // Gate reopens immediately — this is the path claudeBusyKeys dangled on.
52
+ expect(isMachineInTurn()).toBe(false)
53
+ })
54
+
55
+ it('self-heals a MISSED turnEnd via the TTL tick (the dangle the fix kills)', () => {
56
+ shadowEmit({ kind: 'bridgeUp', at: 1000 })
57
+ // Turn A starts via enqueue (turnStart), then turn B starts before A's
58
+ // turnEnd ever lands — the orphan scenario. The machine keeps
59
+ // activeTurn=A (turnStart is a no-op on global when already in_turn),
60
+ // so a later turnEnd(B) does NOT match and would leave A dangling.
61
+ shadowEmit({ kind: 'turnStart', key: KEY_A, at: 2000 })
62
+ shadowEmit({ kind: 'turnStart', key: KEY_B, at: 3000 })
63
+ shadowEmit({ kind: 'turnEnd', key: KEY_B, at: 4000, outboundEmitted: true })
64
+ // Without tick, the gate would still read in-turn (activeTurn=A stuck).
65
+ expect(isMachineInTurn()).toBe(true)
66
+ // TTL tick past A's start clears the orphan and reopens the gate —
67
+ // the structural guarantee claudeBusyKeys lacked.
68
+ shadowEmit({ kind: 'tick', now: 2000 + TURN_TTL_MS + 1 })
69
+ expect(isMachineInTurn()).toBe(false)
70
+ })
71
+
72
+ it('does NOT clear a long-but-ACTIVE turn (modelOutbound suppression)', () => {
73
+ shadowEmit({ kind: 'bridgeUp', at: 1000 })
74
+ shadowEmit({ kind: 'turnStart', key: KEY_A, at: 2000 })
75
+ // Model is still streaming just before the TTL boundary.
76
+ const justBeforeTtl = 2000 + TURN_TTL_MS - 5_000
77
+ shadowEmit({ kind: 'modelOutbound', key: KEY_A, at: justBeforeTtl })
78
+ // Tick past TTL — but recent outbound is within the suppression window,
79
+ // so the turn is NOT cleared (parity with the imperative silence-poke).
80
+ shadowEmit({ kind: 'tick', now: 2000 + TURN_TTL_MS + 1 })
81
+ expect(isMachineInTurn()).toBe(true)
82
+ })
83
+
84
+ it('a buffered sibling inbound does not change the active turn', () => {
85
+ shadowEmit({ kind: 'bridgeUp', at: 1000 })
86
+ inbound(KEY_A, 2000) // fresh turn A
87
+ inbound(KEY_B, 2500) // mid-turn — buffered, must NOT start a new turn
88
+ expect(isMachineInTurn()).toBe(true)
89
+ shadowEmit({ kind: 'turnEnd', key: KEY_A, at: 3000, outboundEmitted: true })
90
+ // A ended; nothing else active → gate reopens so B can drain.
91
+ expect(isMachineInTurn()).toBe(false)
92
+ })
93
+ })
@@ -7,6 +7,7 @@ import {
7
7
  verbForTool,
8
8
  describeToolUse,
9
9
  appendActivityLine,
10
+ appendActivityLabel,
10
11
  renderActivityFeed,
11
12
  MIRROR_MAX_LINES,
12
13
  } from "../tool-activity-summary.js";
@@ -328,3 +329,21 @@ describe("appendActivityLine + renderActivityFeed — accumulating draft feed",
328
329
  expect(renderActivityFeed([])).toBeNull();
329
330
  });
330
331
  });
332
+
333
+ describe("appendActivityLabel — precomputed label feed (tool_label path)", () => {
334
+ it("accumulates precomputed labels, dedups consecutive, ignores empty", () => {
335
+ const lines: string[] = [];
336
+ expect(appendActivityLabel(lines, "Searching memory")).toBe("· Searching memory");
337
+ expect(appendActivityLabel(lines, "List workspace")).toBe(
338
+ "· Searching memory\n· List workspace",
339
+ );
340
+ // consecutive dup collapses
341
+ appendActivityLabel(lines, "List workspace");
342
+ expect(lines).toEqual(["Searching memory", "List workspace"]);
343
+ // empty / whitespace / undefined → null, no push
344
+ expect(appendActivityLabel(lines, "")).toBeNull();
345
+ expect(appendActivityLabel(lines, " ")).toBeNull();
346
+ expect(appendActivityLabel(lines, undefined)).toBeNull();
347
+ expect(lines.length).toBe(2);
348
+ });
349
+ });
@@ -83,6 +83,42 @@ describe('tool-label-sidecar', () => {
83
83
  s.stop()
84
84
  })
85
85
 
86
+ it('replays pre-existing rows to a subscriber that attaches after construction', () => {
87
+ // Regression: the gateway's session-tail constructs the sidecar (which
88
+ // does an initial drain of the file) and only THEN wires `onLabel`. On a
89
+ // fast/clustered turn — or a resumed/flipped session — the hook has
90
+ // already written labels, so the initial drain consumed them with an
91
+ // empty subscriber set. Before the replay fix the late subscriber got
92
+ // nothing, so the real-time draft-mirror never fired (every label lost).
93
+ const sessionId = 'sess-replay'
94
+ const f = join(stateDir, `tool-labels-${sessionId}.jsonl`)
95
+ writeFileSync(
96
+ f,
97
+ JSON.stringify({ ts: 1, tool_use_id: 'A', agent_id: 'g', label: 'Reading foo.ts', tool_name: 'Read' }) + '\n' +
98
+ JSON.stringify({ ts: 2, tool_use_id: 'B', agent_id: 'g', label: 'List workspace', tool_name: 'Bash' }) + '\n',
99
+ )
100
+ const sched = makeManualScheduler()
101
+ const s = createToolLabelSidecar({ stateDir, sessionId, scheduler: sched })
102
+ // Subscribe AFTER construction (the real ensureSidecar ordering).
103
+ const seen: Array<[string, string, string]> = []
104
+ s.onLabel((id, label, toolName) => seen.push([id, label, toolName]))
105
+ expect(seen).toEqual([
106
+ ['A', 'Reading foo.ts', 'Read'],
107
+ ['B', 'List workspace', 'Bash'],
108
+ ])
109
+
110
+ // And a row appended afterwards still reaches the subscriber exactly once
111
+ // (no double-emit of the replayed rows).
112
+ appendFileSync(f, JSON.stringify({ ts: 3, tool_use_id: 'C', agent_id: 'g', label: 'Searching memory', tool_name: 'mcp__hindsight__recall' }) + '\n')
113
+ s.poll()
114
+ expect(seen).toEqual([
115
+ ['A', 'Reading foo.ts', 'Read'],
116
+ ['B', 'List workspace', 'Bash'],
117
+ ['C', 'Searching memory', 'mcp__hindsight__recall'],
118
+ ])
119
+ s.stop()
120
+ })
121
+
86
122
  it('ignores malformed JSON lines', () => {
87
123
  const sessionId = 'sess4'
88
124
  const sched = makeManualScheduler()
@@ -382,3 +382,21 @@ export function renderActivityFeed(lines: string[]): string | null {
382
382
  const body = shown.map((l) => `· ${l}`).join("\n");
383
383
  return hidden > 0 ? `· +${hidden} earlier…\n${body}` : body;
384
384
  }
385
+
386
+ /**
387
+ * Like appendActivityLine, but for a pre-computed label (from the
388
+ * real-time PreToolUse sidecar / `tool_label` event) — the hook already
389
+ * rendered the friendly text, so we skip describeToolUse. Returns the
390
+ * rendered feed, or null when the label is missing, empty, or whitespace-only.
391
+ */
392
+ export function appendActivityLabel(
393
+ lines: string[],
394
+ label: string | undefined,
395
+ ): string | null {
396
+ const l = (label ?? "").trim();
397
+ if (l.length === 0) return null;
398
+ if (lines.length === 0 || lines[lines.length - 1] !== l) {
399
+ lines.push(l);
400
+ }
401
+ return renderActivityFeed(lines);
402
+ }
@@ -40,8 +40,11 @@ export interface ToolLabelRow {
40
40
  export interface ToolLabelSidecar {
41
41
  /** Synchronous label lookup. */
42
42
  getLabel(toolUseId: string): string | undefined
43
- /** Subscribe to "label arrived" notifications. */
44
- onLabel(cb: (toolUseId: string, label: string) => void): () => void
43
+ /** Subscribe to "label arrived" notifications. Fires once per new
44
+ * sidecar line, in real time (~pollMs after the hook's appendFileSync),
45
+ * independent of when the claude transcript flushes. `toolName` lets
46
+ * subscribers filter surface tools (reply/react) from a live feed. */
47
+ onLabel(cb: (toolUseId: string, label: string, toolName: string) => void): () => void
45
48
  /** Force a re-poll (tests). */
46
49
  poll(): void
47
50
  /** Stop polling and release resources. */
@@ -63,7 +66,15 @@ export interface SidecarOptions {
63
66
  export function createToolLabelSidecar(opts: SidecarOptions): ToolLabelSidecar {
64
67
  const path = join(opts.stateDir, `tool-labels-${opts.sessionId}.jsonl`)
65
68
  const labels = new Map<string, string>()
66
- const subscribers = new Set<(toolUseId: string, label: string) => void>()
69
+ // Ordered log of every row ingested so far (label + tool_name), used to
70
+ // replay history to a subscriber that attaches AFTER rows were already
71
+ // read. Without this, a sidecar whose file is already populated when
72
+ // `onLabel` is wired (fast/clustered turns, resumed/flipped sessions —
73
+ // the gateway's `ensureSidecar` subscribes *after* construction's initial
74
+ // drain) would silently lose every pre-existing label, breaking the
75
+ // real-time draft-mirror determinism the sidecar exists to provide.
76
+ const seen: Array<{ toolUseId: string; label: string; toolName: string }> = []
77
+ const subscribers = new Set<(toolUseId: string, label: string, toolName: string) => void>()
67
78
  let offset = 0
68
79
  let stopped = false
69
80
 
@@ -84,13 +95,19 @@ export function createToolLabelSidecar(opts: SidecarOptions): ToolLabelSidecar {
84
95
  } catch {
85
96
  continue
86
97
  }
87
- if (!row || typeof row.tool_use_id !== 'string' || typeof row.label !== 'string') continue
98
+ if (
99
+ !row ||
100
+ typeof row.tool_use_id !== 'string' ||
101
+ typeof row.label !== 'string' ||
102
+ typeof row.tool_name !== 'string'
103
+ ) continue
88
104
  // First write wins — sidecar lines are append-only and we don't
89
105
  // expect duplicates, but if one lands we keep the earliest.
90
106
  if (labels.has(row.tool_use_id)) continue
91
107
  labels.set(row.tool_use_id, row.label)
108
+ seen.push({ toolUseId: row.tool_use_id, label: row.label, toolName: row.tool_name })
92
109
  for (const cb of subscribers) {
93
- try { cb(row.tool_use_id, row.label) } catch { /* ignore */ }
110
+ try { cb(row.tool_use_id, row.label, row.tool_name) } catch { /* ignore */ }
94
111
  }
95
112
  }
96
113
  }
@@ -126,6 +143,15 @@ export function createToolLabelSidecar(opts: SidecarOptions): ToolLabelSidecar {
126
143
  return labels.get(toolUseId)
127
144
  },
128
145
  onLabel(cb) {
146
+ // Replay rows already ingested before this subscriber attached, then
147
+ // register for future rows. Single-threaded: no row can be ingested
148
+ // between the replay loop and the add, so each row reaches `cb`
149
+ // exactly once. This is what makes the draft-mirror deterministic
150
+ // regardless of when the gateway subscribes relative to the hook's
151
+ // writes (see the `seen` declaration above).
152
+ for (const r of seen) {
153
+ try { cb(r.toolUseId, r.label, r.toolName) } catch { /* ignore */ }
154
+ }
129
155
  subscribers.add(cb)
130
156
  return () => subscribers.delete(cb)
131
157
  },
@@ -215,9 +215,13 @@ const CC2_CASES: readonly CC2Case[] = [
215
215
  },
216
216
  {
217
217
  name: "long-running with planned check-ins",
218
+ // Use python time.sleep, NOT the `sleep` command — Claude Code's bash
219
+ // sandbox blocks standalone `sleep` ("foreground sleep is sandboxed
220
+ // away"), which made this case un-runnable (agent replied instantly).
218
221
  prompt:
219
- "Run `bash` with `sleep 5 && echo step1`, send a brief update, " +
220
- "then `sleep 5 && echo step2`, send another brief update, then " +
222
+ "Run `bash` with `python3 -c 'import time; time.sleep(5)'` then echo " +
223
+ "step1, send a brief update, then `python3 -c 'import time; " +
224
+ "time.sleep(5)'` then echo step2, send another brief update, then " +
221
225
  "send a final 'done' as your answer.",
222
226
  },
223
227
  ];
@@ -262,12 +266,27 @@ async function assertMidTurnSilent(
262
266
  )
263
267
  .join("\n");
264
268
 
265
- const last = collected[collected.length - 1];
266
- expect(last.silent, `final answer was silent won't ping. Trail:\n${trail}`).toBe(
267
- false,
268
- );
269
-
270
- const midTurn = collected.slice(0, -1);
269
+ // The model habitually emits a trailing trivial confirmation ("Done.",
270
+ // "Sent.", "OK") as a separate SILENT message AFTER its real pinged
271
+ // answer. That's pacing noise (the turn-pacing directive discourages
272
+ // it), not the final answer — so don't treat it as the
273
+ // "final-answer-must-ping" target. Find the last SUBSTANTIVE message
274
+ // and assert that one pinged; trailing trivial confirmations are
275
+ // ignored for this invariant (they're correctly silent anyway).
276
+ const TRIVIAL_TAIL = /^(done|sent|ok|okay|ack|got it|hope (that|this) helps)\b[.! ]*$/i;
277
+ const isTrivial = (m: ObservedMessage) => TRIVIAL_TAIL.test(m.text.trim());
278
+ let finalIdx = collected.length - 1;
279
+ while (finalIdx > 0 && isTrivial(collected[finalIdx])) finalIdx--;
280
+ const finalAnswer = collected[finalIdx];
281
+ expect(
282
+ finalAnswer.silent,
283
+ `final substantive answer was silent — won't ping. Trail:\n${trail}`,
284
+ ).toBe(false);
285
+
286
+ // Everything BEFORE the final substantive answer must be silent
287
+ // (mid-turn updates ping-free). Trailing trivial confirmations after
288
+ // it are already silent and are not "mid-turn" — exclude them too.
289
+ const midTurn = collected.slice(0, finalIdx);
271
290
  const loudMidTurn = midTurn.filter((m) => !m.silent);
272
291
  expect(
273
292
  loudMidTurn.length,
@@ -334,12 +353,19 @@ async function assertSilencePokeFires(
334
353
  // Single bash call so the poke piggybacks the single tool result.
335
354
  // Without the explicit "no replies" instruction the model might
336
355
  // soft-commit; that resets the silence clock but a single >75s
337
- // sleep still pushes post-commit silence past the threshold.
356
+ // wait still pushes post-commit silence past the threshold.
357
+ //
358
+ // Use python time.sleep, NOT the `sleep` command — Claude Code's bash
359
+ // sandbox blocks standalone `sleep` ("foreground sleep is sandboxed
360
+ // away to prevent burning cache windows"), so a `sleep 80` prompt made
361
+ // the agent reply instantly instead of going silent, breaking this
362
+ // case. python3 time.sleep is a genuine foreground wait the sandbox
363
+ // doesn't special-case.
338
364
  const prompt =
339
- `Run exactly one Bash tool call: \`sleep ${sleepSeconds}\`. Do NOT ` +
340
- `send any reply before the sleep completes — no soft commit, no ` +
341
- `mid-turn updates. When the sleep returns, send one brief 'done' ` +
342
- `reply.`;
365
+ `Run exactly one Bash tool call: \`python3 -c 'import time; ` +
366
+ `time.sleep(${sleepSeconds})'\`. Do NOT send any reply before it ` +
367
+ `completes — no soft commit, no mid-turn updates. When it returns, ` +
368
+ `send one brief 'done' reply.`;
343
369
 
344
370
  await scenario.sendDM(prompt);
345
371