switchroom 0.14.2 → 0.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +81 -5
- package/package.json +1 -1
- package/telegram-plugin/dist/bridge/bridge.js +15 -2
- package/telegram-plugin/dist/gateway/gateway.js +97 -132
- package/telegram-plugin/dist/server.js +15 -2
- package/telegram-plugin/gateway/gateway.ts +174 -29
- package/telegram-plugin/gateway/inbound-delivery-machine-shadow.ts +33 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +13 -4
- package/telegram-plugin/permission-rule.ts +22 -0
- package/telegram-plugin/session-tail.ts +18 -0
- package/telegram-plugin/tests/always-allow-grant.test.ts +147 -0
- package/telegram-plugin/tests/always-allow-persist.test.ts +124 -0
- package/telegram-plugin/tests/inbound-delivery-cutover-gate.test.ts +93 -0
- package/telegram-plugin/tests/tool-activity-summary.test.ts +19 -0
- package/telegram-plugin/tests/tool-label-sidecar.test.ts +36 -0
- package/telegram-plugin/tool-activity-summary.ts +18 -0
- package/telegram-plugin/tool-label-sidecar.ts +31 -5
- package/telegram-plugin/uat/scenarios/fuzz-status-ask-dm.test.ts +39 -13
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Behavioral tests for the always-allow grant verification helper
|
|
3
|
+
* (`isRulePersisted` in `permission-rule.ts`).
|
|
4
|
+
*
|
|
5
|
+
* The `perm:always:*` handler in gateway.ts calls `isRulePersisted` after
|
|
6
|
+
* `switchroom agent grant` returns to confirm the rule actually landed in
|
|
7
|
+
* `tools.allow`. These tests drive the invariants that the structural test
|
|
8
|
+
* in `always-allow-grant.test.ts` can only pin by text-slicing:
|
|
9
|
+
*
|
|
10
|
+
* 1. exec "succeeds" but the reloaded allow-list does NOT contain the
|
|
11
|
+
* rule → isRulePersisted returns false (loud-failure path).
|
|
12
|
+
* 2. reloaded allow-list DOES contain the rule → returns true
|
|
13
|
+
* (success path).
|
|
14
|
+
* 3. Realistic rule values (`Skill(garmin)`, `Bash`, `mcp__x__y`) round-
|
|
15
|
+
* trip correctly — guards against normalization divergence where the
|
|
16
|
+
* value written by `agent grant` and the value read back from yaml
|
|
17
|
+
* diverge in shape.
|
|
18
|
+
*
|
|
19
|
+
* Because `isRulePersisted` is a pure function (takes the already-resolved
|
|
20
|
+
* allow-list directly), no mocking of `loadSwitchroomConfig` /
|
|
21
|
+
* `resolveAgentConfig` is required here. The handler's interaction with
|
|
22
|
+
* those config loaders is covered by the structural test.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import { describe, it, expect } from 'vitest'
|
|
26
|
+
import { isRulePersisted, resolveAlwaysAllowRule } from '../permission-rule.js'
|
|
27
|
+
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// Core behavioral invariants
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
describe('isRulePersisted — failure path', () => {
|
|
33
|
+
it('returns false when the allow-list is empty (exec succeeded but nothing was written)', () => {
|
|
34
|
+
expect(isRulePersisted([], 'Bash')).toBe(false)
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
it('returns false when the rule is absent from a non-empty allow-list', () => {
|
|
38
|
+
expect(isRulePersisted(['Read', 'Write', 'Edit'], 'Bash')).toBe(false)
|
|
39
|
+
})
|
|
40
|
+
|
|
41
|
+
it('returns false for a Skill rule when the list only contains the bare tool name', () => {
|
|
42
|
+
// `agent grant` for Skill(garmin) should write `Skill(garmin)`, not
|
|
43
|
+
// `Skill`. If the yaml ended up with the wrong shape, the verification
|
|
44
|
+
// must catch it.
|
|
45
|
+
expect(isRulePersisted(['Skill'], 'Skill(garmin)')).toBe(false)
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
it('returns false for a bare tool name when only the parameterized form is present', () => {
  // Query the bare form of the SAME tool that is stored in parameterized
  // form. An unrelated tool name (e.g. `Bash`) would be absent trivially
  // and would never exercise the bare-vs-parameterized comparison this
  // test is named for: a stored `Skill(garmin)` must not count as a
  // persisted grant for bare `Skill`.
  expect(isRulePersisted(['Skill(garmin)'], 'Skill')).toBe(false)
})
|
|
51
|
+
|
|
52
|
+
it('returns false when a similar-looking rule is present but not an exact match', () => {
|
|
53
|
+
expect(isRulePersisted(['mcp__garmin__read_activity'], 'mcp__garmin__list_activities')).toBe(false)
|
|
54
|
+
})
|
|
55
|
+
})
|
|
56
|
+
|
|
57
|
+
describe('isRulePersisted — success path', () => {
|
|
58
|
+
it('returns true when the exact rule is present', () => {
|
|
59
|
+
expect(isRulePersisted(['Read', 'Bash', 'Write'], 'Bash')).toBe(true)
|
|
60
|
+
})
|
|
61
|
+
|
|
62
|
+
it('returns true when the rule is the only entry', () => {
|
|
63
|
+
expect(isRulePersisted(['Skill(garmin)'], 'Skill(garmin)')).toBe(true)
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
it('returns true for a namespaced MCP tool rule', () => {
|
|
67
|
+
expect(isRulePersisted(['mcp__garmin__list_activities', 'Bash'], 'mcp__garmin__list_activities')).toBe(true)
|
|
68
|
+
})
|
|
69
|
+
})
|
|
70
|
+
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
// Round-trip: resolveAlwaysAllowRule → isRulePersisted
|
|
73
|
+
// Simulates the full handler flow: resolve the rule from a permission_request,
|
|
74
|
+
// "grant" it (allow-list contains the resolved rule.rule), then verify.
|
|
75
|
+
// Guards against normalization divergence between the value the handler
|
|
76
|
+
// resolves and the value `agent grant` writes + the config reader returns.
|
|
77
|
+
// ---------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
describe('rule round-trip through isRulePersisted', () => {
|
|
80
|
+
it('Skill tool: resolved rule persists correctly', () => {
|
|
81
|
+
const rule = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'garmin' }))
|
|
82
|
+
expect(rule).not.toBeNull()
|
|
83
|
+
// Simulate: allow-list now contains the rule that `agent grant` wrote.
|
|
84
|
+
expect(isRulePersisted([rule!.rule], rule!.rule)).toBe(true)
|
|
85
|
+
// Confirm the written form is `Skill(garmin)` — not a bare `Skill`.
|
|
86
|
+
expect(rule!.rule).toBe('Skill(garmin)')
|
|
87
|
+
})
|
|
88
|
+
|
|
89
|
+
it('Skill tool: absent rule is detected', () => {
|
|
90
|
+
const rule = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'garmin' }))
|
|
91
|
+
expect(rule).not.toBeNull()
|
|
92
|
+
// allow-list was not updated (silent grant failure).
|
|
93
|
+
expect(isRulePersisted([], rule!.rule)).toBe(false)
|
|
94
|
+
expect(isRulePersisted(['Skill'], rule!.rule)).toBe(false)
|
|
95
|
+
})
|
|
96
|
+
|
|
97
|
+
it('Bash tool: round-trips correctly', () => {
|
|
98
|
+
const rule = resolveAlwaysAllowRule('Bash', undefined)
|
|
99
|
+
expect(rule).not.toBeNull()
|
|
100
|
+
expect(rule!.rule).toBe('Bash')
|
|
101
|
+
expect(isRulePersisted(['Bash', 'Read'], rule!.rule)).toBe(true)
|
|
102
|
+
expect(isRulePersisted(['Read'], rule!.rule)).toBe(false)
|
|
103
|
+
})
|
|
104
|
+
|
|
105
|
+
it('MCP tool: round-trips with exact namespaced form', () => {
|
|
106
|
+
const toolName = 'mcp__garmin__list_activities'
|
|
107
|
+
const rule = resolveAlwaysAllowRule(toolName, undefined)
|
|
108
|
+
expect(rule).not.toBeNull()
|
|
109
|
+
expect(rule!.rule).toBe(toolName)
|
|
110
|
+
expect(isRulePersisted([toolName], rule!.rule)).toBe(true)
|
|
111
|
+
expect(isRulePersisted(['mcp__garmin__read_activity'], rule!.rule)).toBe(false)
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
it('multiple Skill tools do not cross-contaminate', () => {
|
|
115
|
+
const garmin = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'garmin' }))
|
|
116
|
+
const mail = resolveAlwaysAllowRule('Skill', JSON.stringify({ skill: 'mail' }))
|
|
117
|
+
expect(garmin).not.toBeNull()
|
|
118
|
+
expect(mail).not.toBeNull()
|
|
119
|
+
// Allow-list only has garmin's rule.
|
|
120
|
+
const allowList = [garmin!.rule]
|
|
121
|
+
expect(isRulePersisted(allowList, garmin!.rule)).toBe(true)
|
|
122
|
+
expect(isRulePersisted(allowList, mail!.rule)).toBe(false)
|
|
123
|
+
})
|
|
124
|
+
})
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PR3b cutover — the turn-in-flight GATE now reads the delivery state
|
|
3
|
+
* machine (`isMachineInTurn`) instead of the PR3b `claudeBusyKeys` set.
|
|
4
|
+
*
|
|
5
|
+
* The bug this closes (gymbro/clerk, 2026-05-28): `claudeBusyKeys` is a
|
|
6
|
+
* per-delivery Set — every delivery `.add`s a key, but turn-end `.delete`s
|
|
7
|
+
* exactly one. When a turn-end is missed (or fires under a non-matching
|
|
8
|
+
* key) the set keeps an orphan, `size > 0` reads true forever, and EVERY
|
|
9
|
+
* subsequent inbound buffers as "held mid-turn" until the 5-min
|
|
10
|
+
* framework-fallback force-drains it.
|
|
11
|
+
*
|
|
12
|
+
* The machine cannot accumulate orphans: global state holds ONE
|
|
13
|
+
* `activeTurn`, so any matching turnEnd returns it to idle, and the TTL
|
|
14
|
+
* `tick` self-heals a missed turnEnd. These tests pin both the normal
|
|
15
|
+
* reopen and the dangle-recovery path on the accessors the gate reads.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { describe, expect, it, beforeEach } from 'vitest'
|
|
19
|
+
import {
|
|
20
|
+
shadowEmit,
|
|
21
|
+
isMachineInTurn,
|
|
22
|
+
isDeliveryCutoverEnabled,
|
|
23
|
+
__shadowResetForTests,
|
|
24
|
+
} from '../gateway/inbound-delivery-machine-shadow.js'
|
|
25
|
+
import { TURN_TTL_MS, type ChatKey } from '../gateway/inbound-delivery-machine.js'
|
|
26
|
+
|
|
27
|
+
const KEY_A = '111:_' as ChatKey
|
|
28
|
+
const KEY_B = '222:_' as ChatKey
|
|
29
|
+
|
|
30
|
+
/**
 * Test helper: feed one non-steering `inbound` event into the shadow
 * delivery machine.
 *
 * @param key   Chat key the inbound message belongs to.
 * @param at    Timestamp carried by the event.
 * @param msgId Message id placed on the event's `msg` (defaults to 1).
 */
function inbound(key: ChatKey, at: number, msgId = 1) {
  const msg = { msgId, isSteering: false, payload: null }
  shadowEmit({ kind: 'inbound', key, msg, at })
}
|
|
33
|
+
|
|
34
|
+
describe('PR3b cutover gate accessors', () => {
|
|
35
|
+
beforeEach(() => __shadowResetForTests())
|
|
36
|
+
|
|
37
|
+
it('enabled by default (shadow on, no kill-switch in test env)', () => {
|
|
38
|
+
expect(isDeliveryCutoverEnabled()).toBe(true)
|
|
39
|
+
})
|
|
40
|
+
|
|
41
|
+
it('reads idle before any turn (bridge alive)', () => {
|
|
42
|
+
shadowEmit({ kind: 'bridgeUp', at: 1000 })
|
|
43
|
+
expect(isMachineInTurn()).toBe(false)
|
|
44
|
+
})
|
|
45
|
+
|
|
46
|
+
it('flips in-turn on a fresh inbound and reopens on turnEnd (the gate reopen)', () => {
|
|
47
|
+
shadowEmit({ kind: 'bridgeUp', at: 1000 })
|
|
48
|
+
inbound(KEY_A, 2000)
|
|
49
|
+
expect(isMachineInTurn()).toBe(true)
|
|
50
|
+
shadowEmit({ kind: 'turnEnd', key: KEY_A, at: 3000, outboundEmitted: true })
|
|
51
|
+
// Gate reopens immediately — this is the path claudeBusyKeys dangled on.
|
|
52
|
+
expect(isMachineInTurn()).toBe(false)
|
|
53
|
+
})
|
|
54
|
+
|
|
55
|
+
it('self-heals a MISSED turnEnd via the TTL tick (the dangle the fix kills)', () => {
|
|
56
|
+
shadowEmit({ kind: 'bridgeUp', at: 1000 })
|
|
57
|
+
// Turn A starts via enqueue (turnStart), then turn B starts before A's
|
|
58
|
+
// turnEnd ever lands — the orphan scenario. The machine keeps
|
|
59
|
+
// activeTurn=A (turnStart is a no-op on global when already in_turn),
|
|
60
|
+
// so a later turnEnd(B) does NOT match and would leave A dangling.
|
|
61
|
+
shadowEmit({ kind: 'turnStart', key: KEY_A, at: 2000 })
|
|
62
|
+
shadowEmit({ kind: 'turnStart', key: KEY_B, at: 3000 })
|
|
63
|
+
shadowEmit({ kind: 'turnEnd', key: KEY_B, at: 4000, outboundEmitted: true })
|
|
64
|
+
// Without tick, the gate would still read in-turn (activeTurn=A stuck).
|
|
65
|
+
expect(isMachineInTurn()).toBe(true)
|
|
66
|
+
// TTL tick past A's start clears the orphan and reopens the gate —
|
|
67
|
+
// the structural guarantee claudeBusyKeys lacked.
|
|
68
|
+
shadowEmit({ kind: 'tick', now: 2000 + TURN_TTL_MS + 1 })
|
|
69
|
+
expect(isMachineInTurn()).toBe(false)
|
|
70
|
+
})
|
|
71
|
+
|
|
72
|
+
it('does NOT clear a long-but-ACTIVE turn (modelOutbound suppression)', () => {
|
|
73
|
+
shadowEmit({ kind: 'bridgeUp', at: 1000 })
|
|
74
|
+
shadowEmit({ kind: 'turnStart', key: KEY_A, at: 2000 })
|
|
75
|
+
// Model is still streaming just before the TTL boundary.
|
|
76
|
+
const justBeforeTtl = 2000 + TURN_TTL_MS - 5_000
|
|
77
|
+
shadowEmit({ kind: 'modelOutbound', key: KEY_A, at: justBeforeTtl })
|
|
78
|
+
// Tick past TTL — but recent outbound is within the suppression window,
|
|
79
|
+
// so the turn is NOT cleared (parity with the imperative silence-poke).
|
|
80
|
+
shadowEmit({ kind: 'tick', now: 2000 + TURN_TTL_MS + 1 })
|
|
81
|
+
expect(isMachineInTurn()).toBe(true)
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
it('a buffered sibling inbound does not change the active turn', () => {
|
|
85
|
+
shadowEmit({ kind: 'bridgeUp', at: 1000 })
|
|
86
|
+
inbound(KEY_A, 2000) // fresh turn A
|
|
87
|
+
inbound(KEY_B, 2500) // mid-turn — buffered, must NOT start a new turn
|
|
88
|
+
expect(isMachineInTurn()).toBe(true)
|
|
89
|
+
shadowEmit({ kind: 'turnEnd', key: KEY_A, at: 3000, outboundEmitted: true })
|
|
90
|
+
// A ended; nothing else active → gate reopens so B can drain.
|
|
91
|
+
expect(isMachineInTurn()).toBe(false)
|
|
92
|
+
})
|
|
93
|
+
})
|
|
@@ -7,6 +7,7 @@ import {
|
|
|
7
7
|
verbForTool,
|
|
8
8
|
describeToolUse,
|
|
9
9
|
appendActivityLine,
|
|
10
|
+
appendActivityLabel,
|
|
10
11
|
renderActivityFeed,
|
|
11
12
|
MIRROR_MAX_LINES,
|
|
12
13
|
} from "../tool-activity-summary.js";
|
|
@@ -328,3 +329,21 @@ describe("appendActivityLine + renderActivityFeed — accumulating draft feed",
|
|
|
328
329
|
expect(renderActivityFeed([])).toBeNull();
|
|
329
330
|
});
|
|
330
331
|
});
|
|
332
|
+
|
|
333
|
+
describe("appendActivityLabel — precomputed label feed (tool_label path)", () => {
|
|
334
|
+
it("accumulates precomputed labels, dedups consecutive, ignores empty", () => {
|
|
335
|
+
const lines: string[] = [];
|
|
336
|
+
expect(appendActivityLabel(lines, "Searching memory")).toBe("· Searching memory");
|
|
337
|
+
expect(appendActivityLabel(lines, "List workspace")).toBe(
|
|
338
|
+
"· Searching memory\n· List workspace",
|
|
339
|
+
);
|
|
340
|
+
// consecutive dup collapses
|
|
341
|
+
appendActivityLabel(lines, "List workspace");
|
|
342
|
+
expect(lines).toEqual(["Searching memory", "List workspace"]);
|
|
343
|
+
// empty / whitespace → null, no push
|
|
344
|
+
expect(appendActivityLabel(lines, "")).toBeNull();
|
|
345
|
+
expect(appendActivityLabel(lines, " ")).toBeNull();
|
|
346
|
+
expect(appendActivityLabel(lines, undefined)).toBeNull();
|
|
347
|
+
expect(lines.length).toBe(2);
|
|
348
|
+
});
|
|
349
|
+
});
|
|
@@ -83,6 +83,42 @@ describe('tool-label-sidecar', () => {
|
|
|
83
83
|
s.stop()
|
|
84
84
|
})
|
|
85
85
|
|
|
86
|
+
it('replays pre-existing rows to a subscriber that attaches after construction', () => {
|
|
87
|
+
// Regression: the gateway's session-tail constructs the sidecar (which
|
|
88
|
+
// does an initial drain of the file) and only THEN wires `onLabel`. On a
|
|
89
|
+
// fast/clustered turn — or a resumed/flipped session — the hook has
|
|
90
|
+
// already written labels, so the initial drain consumed them with an
|
|
91
|
+
// empty subscriber set. Before the replay fix the late subscriber got
|
|
92
|
+
// nothing, so the real-time draft-mirror never fired (every label lost).
|
|
93
|
+
const sessionId = 'sess-replay'
|
|
94
|
+
const f = join(stateDir, `tool-labels-${sessionId}.jsonl`)
|
|
95
|
+
writeFileSync(
|
|
96
|
+
f,
|
|
97
|
+
JSON.stringify({ ts: 1, tool_use_id: 'A', agent_id: 'g', label: 'Reading foo.ts', tool_name: 'Read' }) + '\n' +
|
|
98
|
+
JSON.stringify({ ts: 2, tool_use_id: 'B', agent_id: 'g', label: 'List workspace', tool_name: 'Bash' }) + '\n',
|
|
99
|
+
)
|
|
100
|
+
const sched = makeManualScheduler()
|
|
101
|
+
const s = createToolLabelSidecar({ stateDir, sessionId, scheduler: sched })
|
|
102
|
+
// Subscribe AFTER construction (the real ensureSidecar ordering).
|
|
103
|
+
const seen: Array<[string, string, string]> = []
|
|
104
|
+
s.onLabel((id, label, toolName) => seen.push([id, label, toolName]))
|
|
105
|
+
expect(seen).toEqual([
|
|
106
|
+
['A', 'Reading foo.ts', 'Read'],
|
|
107
|
+
['B', 'List workspace', 'Bash'],
|
|
108
|
+
])
|
|
109
|
+
|
|
110
|
+
// And a row appended afterwards still reaches the subscriber exactly once
|
|
111
|
+
// (no double-emit of the replayed rows).
|
|
112
|
+
appendFileSync(f, JSON.stringify({ ts: 3, tool_use_id: 'C', agent_id: 'g', label: 'Searching memory', tool_name: 'mcp__hindsight__recall' }) + '\n')
|
|
113
|
+
s.poll()
|
|
114
|
+
expect(seen).toEqual([
|
|
115
|
+
['A', 'Reading foo.ts', 'Read'],
|
|
116
|
+
['B', 'List workspace', 'Bash'],
|
|
117
|
+
['C', 'Searching memory', 'mcp__hindsight__recall'],
|
|
118
|
+
])
|
|
119
|
+
s.stop()
|
|
120
|
+
})
|
|
121
|
+
|
|
86
122
|
it('ignores malformed JSON lines', () => {
|
|
87
123
|
const sessionId = 'sess4'
|
|
88
124
|
const sched = makeManualScheduler()
|
|
@@ -382,3 +382,21 @@ export function renderActivityFeed(lines: string[]): string | null {
|
|
|
382
382
|
const body = shown.map((l) => `· ${l}`).join("\n");
|
|
383
383
|
return hidden > 0 ? `· +${hidden} earlier…\n${body}` : body;
|
|
384
384
|
}
|
|
385
|
+
|
|
386
|
+
/**
 * Append an already-rendered activity label to the accumulating feed.
 *
 * Counterpart of appendActivityLine for labels that arrive pre-computed
 * (the real-time PreToolUse sidecar / `tool_label` event): the hook has
 * already produced the friendly text, so describeToolUse is skipped.
 *
 * @param lines Feed entries collected so far. Mutated in place: the
 *              trimmed label is pushed unless it equals the last entry.
 * @param label Pre-rendered label text; may be undefined.
 * @returns The feed as rendered by renderActivityFeed, or null when the
 *          label is missing or blank (in which case `lines` is untouched).
 */
export function appendActivityLabel(
  lines: string[],
  label: string | undefined,
): string | null {
  if (label == null) return null;

  const text = label.trim();
  if (text === "") return null;

  // Collapse consecutive duplicates: only a label that differs from the
  // most recent entry is recorded.
  const repeatsLast = lines.length > 0 && lines[lines.length - 1] === text;
  if (!repeatsLast) {
    lines.push(text);
  }

  return renderActivityFeed(lines);
}
|
|
@@ -40,8 +40,11 @@ export interface ToolLabelRow {
|
|
|
40
40
|
export interface ToolLabelSidecar {
|
|
41
41
|
/** Synchronous label lookup. */
|
|
42
42
|
getLabel(toolUseId: string): string | undefined
|
|
43
|
-
/** Subscribe to "label arrived" notifications.
|
|
44
|
-
|
|
43
|
+
/** Subscribe to "label arrived" notifications. Fires once per new
|
|
44
|
+
* sidecar line, in real time (~pollMs after the hook's appendFileSync),
|
|
45
|
+
* independent of when the claude transcript flushes. `toolName` lets
|
|
46
|
+
* subscribers filter surface tools (reply/react) from a live feed. */
|
|
47
|
+
onLabel(cb: (toolUseId: string, label: string, toolName: string) => void): () => void
|
|
45
48
|
/** Force a re-poll (tests). */
|
|
46
49
|
poll(): void
|
|
47
50
|
/** Stop polling and release resources. */
|
|
@@ -63,7 +66,15 @@ export interface SidecarOptions {
|
|
|
63
66
|
export function createToolLabelSidecar(opts: SidecarOptions): ToolLabelSidecar {
|
|
64
67
|
const path = join(opts.stateDir, `tool-labels-${opts.sessionId}.jsonl`)
|
|
65
68
|
const labels = new Map<string, string>()
|
|
66
|
-
|
|
69
|
+
// Ordered log of every row ingested so far (label + tool_name), used to
|
|
70
|
+
// replay history to a subscriber that attaches AFTER rows were already
|
|
71
|
+
// read. Without this, a sidecar whose file is already populated when
|
|
72
|
+
// `onLabel` is wired (fast/clustered turns, resumed/flipped sessions —
|
|
73
|
+
// the gateway's `ensureSidecar` subscribes *after* construction's initial
|
|
74
|
+
// drain) would silently lose every pre-existing label, breaking the
|
|
75
|
+
// real-time draft-mirror determinism the sidecar exists to provide.
|
|
76
|
+
const seen: Array<{ toolUseId: string; label: string; toolName: string }> = []
|
|
77
|
+
const subscribers = new Set<(toolUseId: string, label: string, toolName: string) => void>()
|
|
67
78
|
let offset = 0
|
|
68
79
|
let stopped = false
|
|
69
80
|
|
|
@@ -84,13 +95,19 @@ export function createToolLabelSidecar(opts: SidecarOptions): ToolLabelSidecar {
|
|
|
84
95
|
} catch {
|
|
85
96
|
continue
|
|
86
97
|
}
|
|
87
|
-
if (
|
|
98
|
+
if (
|
|
99
|
+
!row ||
|
|
100
|
+
typeof row.tool_use_id !== 'string' ||
|
|
101
|
+
typeof row.label !== 'string' ||
|
|
102
|
+
typeof row.tool_name !== 'string'
|
|
103
|
+
) continue
|
|
88
104
|
// First write wins — sidecar lines are append-only and we don't
|
|
89
105
|
// expect duplicates, but if one lands we keep the earliest.
|
|
90
106
|
if (labels.has(row.tool_use_id)) continue
|
|
91
107
|
labels.set(row.tool_use_id, row.label)
|
|
108
|
+
seen.push({ toolUseId: row.tool_use_id, label: row.label, toolName: row.tool_name })
|
|
92
109
|
for (const cb of subscribers) {
|
|
93
|
-
try { cb(row.tool_use_id, row.label) } catch { /* ignore */ }
|
|
110
|
+
try { cb(row.tool_use_id, row.label, row.tool_name) } catch { /* ignore */ }
|
|
94
111
|
}
|
|
95
112
|
}
|
|
96
113
|
}
|
|
@@ -126,6 +143,15 @@ export function createToolLabelSidecar(opts: SidecarOptions): ToolLabelSidecar {
|
|
|
126
143
|
return labels.get(toolUseId)
|
|
127
144
|
},
|
|
128
145
|
onLabel(cb) {
|
|
146
|
+
// Replay rows already ingested before this subscriber attached, then
|
|
147
|
+
// register for future rows. Single-threaded: no row can be ingested
|
|
148
|
+
// between the replay loop and the add, so each row reaches `cb`
|
|
149
|
+
// exactly once. This is what makes the draft-mirror deterministic
|
|
150
|
+
// regardless of when the gateway subscribes relative to the hook's
|
|
151
|
+
// writes (see the `seen` declaration above).
|
|
152
|
+
for (const r of seen) {
|
|
153
|
+
try { cb(r.toolUseId, r.label, r.toolName) } catch { /* ignore */ }
|
|
154
|
+
}
|
|
129
155
|
subscribers.add(cb)
|
|
130
156
|
return () => subscribers.delete(cb)
|
|
131
157
|
},
|
|
@@ -215,9 +215,13 @@ const CC2_CASES: readonly CC2Case[] = [
|
|
|
215
215
|
},
|
|
216
216
|
{
|
|
217
217
|
name: "long-running with planned check-ins",
|
|
218
|
+
// Use python time.sleep, NOT the `sleep` command — Claude Code's bash
|
|
219
|
+
// sandbox blocks standalone `sleep` ("foreground sleep is sandboxed
|
|
220
|
+
// away"), which made this case un-runnable (agent replied instantly).
|
|
218
221
|
prompt:
|
|
219
|
-
"Run `bash` with `
|
|
220
|
-
"
|
|
222
|
+
"Run `bash` with `python3 -c 'import time; time.sleep(5)'` then echo " +
|
|
223
|
+
"step1, send a brief update, then `python3 -c 'import time; " +
|
|
224
|
+
"time.sleep(5)'` then echo step2, send another brief update, then " +
|
|
221
225
|
"send a final 'done' as your answer.",
|
|
222
226
|
},
|
|
223
227
|
];
|
|
@@ -262,12 +266,27 @@ async function assertMidTurnSilent(
|
|
|
262
266
|
)
|
|
263
267
|
.join("\n");
|
|
264
268
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
|
|
269
|
+
// The model habitually emits a trailing trivial confirmation ("Done.",
|
|
270
|
+
// "Sent.", "OK") as a separate SILENT message AFTER its real pinged
|
|
271
|
+
// answer. That's pacing noise (the turn-pacing directive discourages
|
|
272
|
+
// it), not the final answer — so don't treat it as the
|
|
273
|
+
// "final-answer-must-ping" target. Find the last SUBSTANTIVE message
|
|
274
|
+
// and assert that one pinged; trailing trivial confirmations are
|
|
275
|
+
// ignored for this invariant (they're correctly silent anyway).
|
|
276
|
+
const TRIVIAL_TAIL = /^(done|sent|ok|okay|ack|got it|hope (that|this) helps)\b[.! ]*$/i;
|
|
277
|
+
const isTrivial = (m: ObservedMessage) => TRIVIAL_TAIL.test(m.text.trim());
|
|
278
|
+
let finalIdx = collected.length - 1;
|
|
279
|
+
while (finalIdx > 0 && isTrivial(collected[finalIdx])) finalIdx--;
|
|
280
|
+
const finalAnswer = collected[finalIdx];
|
|
281
|
+
expect(
|
|
282
|
+
finalAnswer.silent,
|
|
283
|
+
`final substantive answer was silent — won't ping. Trail:\n${trail}`,
|
|
284
|
+
).toBe(false);
|
|
285
|
+
|
|
286
|
+
// Everything BEFORE the final substantive answer must be silent
|
|
287
|
+
// (mid-turn updates ping-free). Trailing trivial confirmations after
|
|
288
|
+
// it are already silent and are not "mid-turn" — exclude them too.
|
|
289
|
+
const midTurn = collected.slice(0, finalIdx);
|
|
271
290
|
const loudMidTurn = midTurn.filter((m) => !m.silent);
|
|
272
291
|
expect(
|
|
273
292
|
loudMidTurn.length,
|
|
@@ -334,12 +353,19 @@ async function assertSilencePokeFires(
|
|
|
334
353
|
// Single bash call so the poke piggybacks the single tool result.
|
|
335
354
|
// Without the explicit "no replies" instruction the model might
|
|
336
355
|
// soft-commit; that resets the silence clock but a single >75s
|
|
337
|
-
//
|
|
356
|
+
// wait still pushes post-commit silence past the threshold.
|
|
357
|
+
//
|
|
358
|
+
// Use python time.sleep, NOT the `sleep` command — Claude Code's bash
|
|
359
|
+
// sandbox blocks standalone `sleep` ("foreground sleep is sandboxed
|
|
360
|
+
// away to prevent burning cache windows"), so a `sleep 80` prompt made
|
|
361
|
+
// the agent reply instantly instead of going silent, breaking this
|
|
362
|
+
// case. python3 time.sleep is a genuine foreground wait the sandbox
|
|
363
|
+
// doesn't special-case.
|
|
338
364
|
const prompt =
|
|
339
|
-
`Run exactly one Bash tool call: \`
|
|
340
|
-
`send any reply before
|
|
341
|
-
`mid-turn updates. When
|
|
342
|
-
`reply.`;
|
|
365
|
+
`Run exactly one Bash tool call: \`python3 -c 'import time; ` +
|
|
366
|
+
`time.sleep(${sleepSeconds})'\`. Do NOT send any reply before it ` +
|
|
367
|
+
`completes — no soft commit, no mid-turn updates. When it returns, ` +
|
|
368
|
+
`send one brief 'done' reply.`;
|
|
343
369
|
|
|
344
370
|
await scenario.sendDM(prompt);
|
|
345
371
|
|