switchroom 0.13.20 → 0.13.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +33 -3
- package/package.json +1 -1
- package/profiles/_base/start.sh.hbs +7 -6
- package/profiles/_shared/telegram-style.md.hbs +3 -3
- package/telegram-plugin/dist/gateway/gateway.js +87 -25
- package/telegram-plugin/gateway/disconnect-flush.ts +37 -0
- package/telegram-plugin/gateway/gateway.ts +100 -7
- package/telegram-plugin/gateway/inbound-delivery-gate.ts +37 -4
- package/telegram-plugin/handoff-continuity.ts +8 -2
- package/telegram-plugin/recent-outbound-dedup.ts +51 -5
- package/telegram-plugin/runtime-metrics.ts +5 -1
- package/telegram-plugin/subagent-watcher.ts +25 -3
- package/telegram-plugin/tests/gateway-disconnect-flush.test.ts +114 -0
- package/telegram-plugin/tests/handoff-continuity.test.ts +15 -2
- package/telegram-plugin/tests/inbound-delivery-gate.test.ts +77 -4
- package/telegram-plugin/tests/recent-outbound-dedup.test.ts +72 -0
- package/telegram-plugin/tests/subagent-watcher-enoent-deregister.test.ts +152 -0
- package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +72 -45
- package/vendor/hindsight-memory/scripts/recall.py +18 -2
|
@@ -41,13 +41,86 @@ describe('decideInboundDelivery', () => {
|
|
|
41
41
|
).toBe('deliver')
|
|
42
42
|
})
|
|
43
43
|
|
|
44
|
-
it('is total: the ONLY deferral path is mid-turn AND not steering', () => {
|
|
44
|
+
it('is total: the ONLY deferral path is mid-turn AND not steering AND not interrupt', () => {
|
|
45
45
|
for (const turnInFlight of [true, false]) {
|
|
46
46
|
for (const isSteering of [true, false]) {
|
|
47
|
-
const
|
|
48
|
-
|
|
49
|
-
|
|
47
|
+
for (const isInterrupt of [true, false]) {
|
|
48
|
+
const decision = decideInboundDelivery({ turnInFlight, isSteering, isInterrupt })
|
|
49
|
+
const expectBuffer = turnInFlight && !isSteering && !isInterrupt
|
|
50
|
+
expect(decision).toBe(expectBuffer ? 'buffer-until-idle' : 'deliver')
|
|
51
|
+
}
|
|
50
52
|
}
|
|
51
53
|
}
|
|
52
54
|
})
|
|
55
|
+
|
|
56
|
+
// ─── Interrupt-marker carve-out (2026-05-24 fix for the stranded-body bug) ──
|
|
57
|
+
// Live UAT trace: user fires `! actually do X` mid-turn. SIGINT delivered
|
|
58
|
+
// to claude via tmux send-keys. The killed turn does NOT emit
|
|
59
|
+
// turn_complete in many cases (mid-tool-call kill, in-flight subagent),
|
|
60
|
+
// so the post-`!` body sits in pendingInboundBuffer forever — the
|
|
61
|
+
// turn-complete drain trigger never fires. The user never gets a reply
|
|
62
|
+
// to their replacement instruction.
|
|
63
|
+
//
|
|
64
|
+
// The carve-out is a peer of isSteering: an interrupt body is by
|
|
65
|
+
// definition an intentional mid-turn delivery — the user explicitly
|
|
66
|
+
// asked for "stop and do this instead".
|
|
67
|
+
describe('interrupt-marker carve-out', () => {
|
|
68
|
+
it('delivers a `!`-interrupt body mid-turn (does NOT buffer)', () => {
|
|
69
|
+
// The headline regression fix. Without the carve-out the killed turn
|
|
70
|
+
// strands the body indefinitely.
|
|
71
|
+
expect(
|
|
72
|
+
decideInboundDelivery({
|
|
73
|
+
turnInFlight: true,
|
|
74
|
+
isSteering: false,
|
|
75
|
+
isInterrupt: true,
|
|
76
|
+
}),
|
|
77
|
+
).toBe('deliver')
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
it('delivers a `!`-interrupt body even when claude is idle (no harm)', () => {
|
|
81
|
+
expect(
|
|
82
|
+
decideInboundDelivery({
|
|
83
|
+
turnInFlight: false,
|
|
84
|
+
isSteering: false,
|
|
85
|
+
isInterrupt: true,
|
|
86
|
+
}),
|
|
87
|
+
).toBe('deliver')
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
it('isInterrupt is optional — omitting it preserves legacy behavior', () => {
|
|
91
|
+
// Backward-compat for callers that haven't been updated yet. Mirrors
|
|
92
|
+
// the optional-default pattern used in other gateway predicates this
|
|
93
|
+
// session (silent-reply-anchor wasOverPingSuppressed, recent-outbound-
|
|
94
|
+
// dedup turnKey).
|
|
95
|
+
expect(
|
|
96
|
+
decideInboundDelivery({ turnInFlight: true, isSteering: false }),
|
|
97
|
+
).toBe('buffer-until-idle')
|
|
98
|
+
expect(
|
|
99
|
+
decideInboundDelivery({ turnInFlight: false, isSteering: false }),
|
|
100
|
+
).toBe('deliver')
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
it('explicit isInterrupt:false is identical to omitting it', () => {
|
|
104
|
+
expect(
|
|
105
|
+
decideInboundDelivery({
|
|
106
|
+
turnInFlight: true,
|
|
107
|
+
isSteering: false,
|
|
108
|
+
isInterrupt: false,
|
|
109
|
+
}),
|
|
110
|
+
).toBe('buffer-until-idle')
|
|
111
|
+
})
|
|
112
|
+
|
|
113
|
+
it('interrupt + steering combination delivers (both are exempt paths)', () => {
|
|
114
|
+
// Pathological prompt: `! /steer change tactics`. parseInterruptMarker
|
|
115
|
+
// strips the `!`, then steering parse sees `/steer`. Either flag
|
|
116
|
+
// alone delivers; both together still deliver. No regression.
|
|
117
|
+
expect(
|
|
118
|
+
decideInboundDelivery({
|
|
119
|
+
turnInFlight: true,
|
|
120
|
+
isSteering: true,
|
|
121
|
+
isInterrupt: true,
|
|
122
|
+
}),
|
|
123
|
+
).toBe('deliver')
|
|
124
|
+
})
|
|
125
|
+
})
|
|
53
126
|
})
|
|
@@ -190,3 +190,75 @@ describe('OutboundDedupCache — multiple entries per chat', () => {
|
|
|
190
190
|
expect(cache.check('chat', undefined, LONG_HTML, 6000)).not.toBeNull()
|
|
191
191
|
})
|
|
192
192
|
})
|
|
193
|
+
|
|
194
|
+
// ─── turnKey carve-out (2026-05-23 cross-turn-swallow fix) ───────────────────
|
|
195
|
+
// Without turnKey awareness, the 60s TTL eats the SECOND turn's reply when a
|
|
196
|
+
// user asks similar questions back-to-back (forensic audit on midturn-silent
|
|
197
|
+
// UAT). The carve-out: both sides non-null + distinct ⇒ treat as miss.
|
|
198
|
+
// Within-turn (#546 retry race) protection unchanged: same turnKey on both
|
|
199
|
+
// sides ⇒ legacy hit. Null on either side ⇒ legacy hit.
|
|
200
|
+
|
|
201
|
+
const LONG_TEXT = 'long enough text to count as content for the dedup floor'
|
|
202
|
+
|
|
203
|
+
describe('OutboundDedupCache — turnKey carve-out', () => {
|
|
204
|
+
it('cross-turn identical content with distinct non-null turnKeys MISSES', () => {
|
|
205
|
+
// The headline bug: dedup was eating user replies across turns.
|
|
206
|
+
const cache = new OutboundDedupCache()
|
|
207
|
+
cache.record('chat', undefined, LONG_TEXT, 1000, 'turn-A')
|
|
208
|
+
expect(
|
|
209
|
+
cache.check('chat', undefined, LONG_TEXT, 5000, 'turn-B'),
|
|
210
|
+
).toBeNull()
|
|
211
|
+
})
|
|
212
|
+
|
|
213
|
+
it('within-turn duplicates (same turnKey) STILL HIT — preserves #546 protection', () => {
|
|
214
|
+
// Same-turn retry race the module was built for.
|
|
215
|
+
const cache = new OutboundDedupCache()
|
|
216
|
+
cache.record('chat', undefined, LONG_TEXT, 1000, 'turn-A')
|
|
217
|
+
expect(
|
|
218
|
+
cache.check('chat', undefined, LONG_TEXT, 10_000, 'turn-A'),
|
|
219
|
+
).not.toBeNull()
|
|
220
|
+
})
|
|
221
|
+
|
|
222
|
+
it('record-null + check-non-null → legacy hit', () => {
|
|
223
|
+
// Boot-time / silent-marker callers pass null on record; later
|
|
224
|
+
// executeReply checks with a turnKey. Legacy match must persist
|
|
225
|
+
// for the #546 protection to cover these cross-context cases.
|
|
226
|
+
const cache = new OutboundDedupCache()
|
|
227
|
+
cache.record('chat', undefined, LONG_TEXT, 1000, null)
|
|
228
|
+
expect(
|
|
229
|
+
cache.check('chat', undefined, LONG_TEXT, 5000, 'turn-A'),
|
|
230
|
+
).not.toBeNull()
|
|
231
|
+
})
|
|
232
|
+
|
|
233
|
+
it('record-non-null + check-null → legacy hit', () => {
|
|
234
|
+
// Symmetric direction: turn-flush records with turnKey, a later
|
|
235
|
+
// null-context probe (rare but possible) still matches.
|
|
236
|
+
const cache = new OutboundDedupCache()
|
|
237
|
+
cache.record('chat', undefined, LONG_TEXT, 1000, 'turn-A')
|
|
238
|
+
expect(
|
|
239
|
+
cache.check('chat', undefined, LONG_TEXT, 5000, null),
|
|
240
|
+
).not.toBeNull()
|
|
241
|
+
})
|
|
242
|
+
|
|
243
|
+
it('cross-turn entry does NOT shadow a same-turn match later in the list', () => {
|
|
244
|
+
// Edge case the predicate must handle: when the scan hits a stale
|
|
245
|
+
// cross-turn entry whose hash matches, it must keep scanning past
|
|
246
|
+
// it to find a real same-turn match. (The carve-out is implemented
|
|
247
|
+
// as `continue`, not `return null`.)
|
|
248
|
+
const cache = new OutboundDedupCache()
|
|
249
|
+
cache.record('chat', undefined, LONG_TEXT, 1000, 'turn-A') // older, cross-turn
|
|
250
|
+
cache.record('chat', undefined, LONG_TEXT, 3000, 'turn-B') // newer, same turn as query
|
|
251
|
+
expect(
|
|
252
|
+
cache.check('chat', undefined, LONG_TEXT, 5000, 'turn-B'),
|
|
253
|
+
).not.toBeNull()
|
|
254
|
+
})
|
|
255
|
+
|
|
256
|
+
it('legacy 4-arg API still compiles + matches (default turnKey=null)', () => {
|
|
257
|
+
// Backward-compat smoke test — older callers that haven't been
|
|
258
|
+
// updated to pass turnKey continue to behave as the original test
|
|
259
|
+
// suite pins.
|
|
260
|
+
const cache = new OutboundDedupCache()
|
|
261
|
+
cache.record('chat', undefined, LONG_TEXT, 1000)
|
|
262
|
+
expect(cache.check('chat', undefined, LONG_TEXT, 5000)).not.toBeNull()
|
|
263
|
+
})
|
|
264
|
+
})
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the readSubTail ENOENT/EACCES deregister path.
|
|
3
|
+
*
|
|
4
|
+
* Production symptom: clerk agent's gateway-supervisor.log was growing
|
|
5
|
+
* at ~30 ENOENT lines/sec sustained (540k+ in 3 days) because the
|
|
6
|
+
* watcher's poll loop kept statx-ing JSONL files Claude Code had
|
|
7
|
+
* already reaped along with the parent session's `subagents/` dir.
|
|
8
|
+
* Same shape on klanker with EACCES (635 events) — likely a perm
|
|
9
|
+
* flip during cleanup.
|
|
10
|
+
*
|
|
11
|
+
* Fix shape: when readSubTail's statSync throws ENOENT or EACCES,
|
|
12
|
+
* log ONE line + invoke the onFileVanished callback so the watcher
|
|
13
|
+
* factory can call cleanupTerminalAgent and stop polling. Other
|
|
14
|
+
* errors (parse, malformed JSONL) keep the legacy per-poll log line.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { describe, it, expect, vi } from 'vitest'
|
|
18
|
+
import { readSubTail } from '../subagent-watcher.js'
|
|
19
|
+
import type { WorkerEntry } from '../subagent-watcher.js'
|
|
20
|
+
|
|
21
|
+
function fakeFsThrowingFromStat(code: 'ENOENT' | 'EACCES' | 'EOTHER') {
|
|
22
|
+
const err = new Error(`fake ${code}`) as NodeJS.ErrnoException
|
|
23
|
+
err.code = code
|
|
24
|
+
return {
|
|
25
|
+
existsSync: () => true,
|
|
26
|
+
readdirSync: () => [],
|
|
27
|
+
statSync: () => { throw err },
|
|
28
|
+
openSync: () => -1,
|
|
29
|
+
closeSync: () => {},
|
|
30
|
+
readSync: () => 0,
|
|
31
|
+
watch: () => ({ close: () => {} } as ReturnType<typeof require>),
|
|
32
|
+
} as unknown as Parameters<typeof readSubTail>[4]
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
function makeEntry(): WorkerEntry {
|
|
36
|
+
return {
|
|
37
|
+
agentId: 'a1234567890abcdef',
|
|
38
|
+
filePath: '/tmp/fake/agent-a1234567890abcdef.jsonl',
|
|
39
|
+
dispatchedAt: 0,
|
|
40
|
+
lastActivityAt: 0,
|
|
41
|
+
toolCount: 0,
|
|
42
|
+
state: 'running',
|
|
43
|
+
completionNotified: false,
|
|
44
|
+
stallNotified: false,
|
|
45
|
+
historical: false,
|
|
46
|
+
description: '',
|
|
47
|
+
lastSummaryLine: '',
|
|
48
|
+
lastResultText: '',
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
function makeTail() {
|
|
53
|
+
return {
|
|
54
|
+
cursor: 0,
|
|
55
|
+
pendingPartial: '',
|
|
56
|
+
hasEmittedStart: false,
|
|
57
|
+
watcher: null,
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
describe('readSubTail — ENOENT/EACCES deregister', () => {
|
|
62
|
+
it('fires onFileVanished and logs ONCE on ENOENT', () => {
|
|
63
|
+
const onFileVanished = vi.fn()
|
|
64
|
+
const log = vi.fn()
|
|
65
|
+
const entry = makeEntry()
|
|
66
|
+
|
|
67
|
+
readSubTail(
|
|
68
|
+
entry,
|
|
69
|
+
makeTail(),
|
|
70
|
+
0,
|
|
71
|
+
vi.fn(),
|
|
72
|
+
fakeFsThrowingFromStat('ENOENT'),
|
|
73
|
+
log,
|
|
74
|
+
null,
|
|
75
|
+
null,
|
|
76
|
+
undefined,
|
|
77
|
+
onFileVanished,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
expect(onFileVanished).toHaveBeenCalledTimes(1)
|
|
81
|
+
expect(onFileVanished).toHaveBeenCalledWith('a1234567890abcdef', 'ENOENT')
|
|
82
|
+
expect(log).toHaveBeenCalledTimes(1)
|
|
83
|
+
expect(log.mock.calls[0][0]).toMatch(/JSONL vanished for a1234567890abcdef \(ENOENT\) — deregistering/)
|
|
84
|
+
expect(log.mock.calls[0][0]).not.toMatch(/read error/)
|
|
85
|
+
})
|
|
86
|
+
|
|
87
|
+
it('fires onFileVanished and logs ONCE on EACCES (klanker pattern)', () => {
|
|
88
|
+
const onFileVanished = vi.fn()
|
|
89
|
+
const log = vi.fn()
|
|
90
|
+
|
|
91
|
+
readSubTail(
|
|
92
|
+
makeEntry(),
|
|
93
|
+
makeTail(),
|
|
94
|
+
0,
|
|
95
|
+
vi.fn(),
|
|
96
|
+
fakeFsThrowingFromStat('EACCES'),
|
|
97
|
+
log,
|
|
98
|
+
null,
|
|
99
|
+
null,
|
|
100
|
+
undefined,
|
|
101
|
+
onFileVanished,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
expect(onFileVanished).toHaveBeenCalledTimes(1)
|
|
105
|
+
expect(onFileVanished).toHaveBeenCalledWith('a1234567890abcdef', 'EACCES')
|
|
106
|
+
expect(log).toHaveBeenCalledTimes(1)
|
|
107
|
+
expect(log.mock.calls[0][0]).toMatch(/EACCES/)
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
it('still logs the legacy "read error" for unexpected error codes', () => {
|
|
111
|
+
// Regression guard: parse errors, EIO, EBUSY, etc. must still
|
|
112
|
+
// surface their detail. Only file-vanished codes are deregistered.
|
|
113
|
+
const onFileVanished = vi.fn()
|
|
114
|
+
const log = vi.fn()
|
|
115
|
+
|
|
116
|
+
readSubTail(
|
|
117
|
+
makeEntry(),
|
|
118
|
+
makeTail(),
|
|
119
|
+
0,
|
|
120
|
+
vi.fn(),
|
|
121
|
+
fakeFsThrowingFromStat('EOTHER'),
|
|
122
|
+
log,
|
|
123
|
+
null,
|
|
124
|
+
null,
|
|
125
|
+
undefined,
|
|
126
|
+
onFileVanished,
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
expect(onFileVanished).not.toHaveBeenCalled()
|
|
130
|
+
expect(log).toHaveBeenCalledTimes(1)
|
|
131
|
+
expect(log.mock.calls[0][0]).toMatch(/read error a1234567890abcdef/)
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
it('omitting onFileVanished is safe (optional callback)', () => {
|
|
135
|
+
const log = vi.fn()
|
|
136
|
+
|
|
137
|
+
expect(() =>
|
|
138
|
+
readSubTail(
|
|
139
|
+
makeEntry(),
|
|
140
|
+
makeTail(),
|
|
141
|
+
0,
|
|
142
|
+
vi.fn(),
|
|
143
|
+
fakeFsThrowingFromStat('ENOENT'),
|
|
144
|
+
log,
|
|
145
|
+
null,
|
|
146
|
+
null,
|
|
147
|
+
undefined,
|
|
148
|
+
),
|
|
149
|
+
).not.toThrow()
|
|
150
|
+
expect(log).toHaveBeenCalledTimes(1)
|
|
151
|
+
})
|
|
152
|
+
})
|
|
@@ -1,38 +1,35 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* JTBD scenario — rapid follow-ups (steering vs queued classification).
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Live contract codified in `_shared/telegram-style.md.hbs` and
|
|
5
|
+
* `reference/steer-or-queue-mid-flight.md` (default-flip commits
|
|
6
|
+
* `4fff90bf` + `597a58af`, 2026-04-17):
|
|
5
7
|
*
|
|
6
|
-
* - A follow-up
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
* - A follow-up prefixed with `/
|
|
10
|
-
*
|
|
11
|
-
* in-flight
|
|
8
|
+
* - A mid-turn follow-up with NO prefix is `queued="true"` — new
|
|
9
|
+
* independent task. The agent should NOT reference the in-flight
|
|
10
|
+
* work.
|
|
11
|
+
* - A mid-turn follow-up prefixed with `/steer ` or `/s ` is
|
|
12
|
+
* `steering="true"` — course-correction; the agent continues the
|
|
13
|
+
* in-flight task incorporating the new guidance.
|
|
14
|
+
* - Legacy `/queue ` / `/q ` is a redundant alias for the default;
|
|
15
|
+
* still works.
|
|
12
16
|
*
|
|
13
|
-
* This UAT fires both shapes and asserts the agent
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
* top of its reply. So we can pattern-match on that.
|
|
17
|
+
* This UAT fires both shapes and asserts the agent narrates the
|
|
18
|
+
* classification correctly. The prior version of this scenario
|
|
19
|
+
* (2026-05-13 / PR #1132) tested the pre-flip contract with
|
|
20
|
+
* too-loose assertions (`/md5/i` regex passes on the queued path
|
|
21
|
+
* by coincidence — the model answers "use md5" fresh and the reply
|
|
22
|
+
* contains "md5"). After unskipping with the corrected contract,
|
|
23
|
+
* the assertions check for the italic classification line the
|
|
24
|
+
* prompt instructs the agent to emit.
|
|
22
25
|
*/
|
|
23
26
|
|
|
24
27
|
import { describe, it, expect } from "vitest";
|
|
25
28
|
import { spinUp } from "../harness.js";
|
|
26
29
|
|
|
27
|
-
|
|
28
|
-
// surface "md5"; queued didn't produce the expected fresh-task reply).
|
|
29
|
-
// May be real classification bugs, may be prompt fragility — neither
|
|
30
|
-
// has been root-caused. Excluded from the buildkite gate so it doesn't
|
|
31
|
-
// block every PR touching telegram-plugin/. Run locally via
|
|
32
|
-
// `bun run test:uat` once classification has been investigated.
|
|
33
|
-
describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
30
|
+
describe("uat: rapid follow-ups — steering vs queued classification", () => {
|
|
34
31
|
it(
|
|
35
|
-
"follow-up
|
|
32
|
+
"follow-up with /steer prefix → agent self-narrates as steering",
|
|
36
33
|
async () => {
|
|
37
34
|
const sc = await spinUp({ agent: "test-harness" });
|
|
38
35
|
try {
|
|
@@ -43,26 +40,39 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
|
43
40
|
+ "Show the work step by step with a 2-second pause between.",
|
|
44
41
|
);
|
|
45
42
|
await new Promise((r) => setTimeout(r, 3_000));
|
|
46
|
-
// Steer: change the algorithm
|
|
47
|
-
await sc.sendDM("actually use md5 not sha256");
|
|
43
|
+
// Steer: change the algorithm using the explicit /steer prefix.
|
|
44
|
+
await sc.sendDM("/steer actually use md5 not sha256");
|
|
48
45
|
|
|
49
|
-
// The agent should reply mentioning md5
|
|
50
|
-
//
|
|
51
|
-
//
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
46
|
+
// The agent should reply mentioning md5 AND surface the italic
|
|
47
|
+
// classification line per the prompt
|
|
48
|
+
// ("_↪️ treating as steer on the prior task_" or similar).
|
|
49
|
+
// We match either explicit-steer narration OR the steer emoji
|
|
50
|
+
// (`↪️`) to allow for natural-language variation while still
|
|
51
|
+
// failing if no narration appears (the previous version of
|
|
52
|
+
// this UAT was too loose — bare `/md5/i` passed by coincidence
|
|
53
|
+
// on the queued path).
|
|
54
|
+
const reply = await sc.expectMessage(
|
|
55
|
+
(m) => {
|
|
56
|
+
const txt = m.text;
|
|
57
|
+
const mentionsMd5 = /\bmd5\b/i.test(txt);
|
|
58
|
+
const narratesSteer =
|
|
59
|
+
/↪️|\bsteer(ing)?\b|continuing the (prior|original|in-flight) task|amendment|course[- ]correct/i.test(
|
|
60
|
+
txt,
|
|
61
|
+
);
|
|
62
|
+
return mentionsMd5 && narratesSteer;
|
|
63
|
+
},
|
|
64
|
+
{ from: "bot", timeout: 120_000 },
|
|
65
|
+
);
|
|
56
66
|
expect(reply.text.toLowerCase()).toContain("md5");
|
|
57
67
|
} finally {
|
|
58
68
|
await sc.tearDown();
|
|
59
69
|
}
|
|
60
70
|
},
|
|
61
|
-
|
|
71
|
+
180_000,
|
|
62
72
|
);
|
|
63
73
|
|
|
64
74
|
it(
|
|
65
|
-
"follow-up
|
|
75
|
+
"follow-up with no prefix mid-turn → agent treats as queued (new task)",
|
|
66
76
|
async () => {
|
|
67
77
|
const sc = await spinUp({ agent: "test-harness" });
|
|
68
78
|
try {
|
|
@@ -71,9 +81,10 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
|
71
81
|
+ "Use bash.",
|
|
72
82
|
);
|
|
73
83
|
await new Promise((r) => setTimeout(r, 3_000));
|
|
74
|
-
//
|
|
75
|
-
//
|
|
76
|
-
|
|
84
|
+
// No prefix — the default-flipped contract says this is a
|
|
85
|
+
// QUEUED new task. The agent should NOT reference the
|
|
86
|
+
// counting work.
|
|
87
|
+
await sc.sendDM("what is 2+2?");
|
|
77
88
|
|
|
78
89
|
// First reply should be from the counting task (still
|
|
79
90
|
// in-flight). Then a second reply for the queued task.
|
|
@@ -81,16 +92,32 @@ describe.skip("uat: rapid follow-ups — steering vs queued", () => {
|
|
|
81
92
|
from: "bot",
|
|
82
93
|
timeout: 60_000,
|
|
83
94
|
});
|
|
84
|
-
|
|
85
|
-
//
|
|
86
|
-
//
|
|
95
|
+
|
|
96
|
+
// Second reply: the queued task's answer. We want to see
|
|
97
|
+
// EITHER the italic queued-narration line OR a fresh "4"
|
|
98
|
+
// answer that doesn't reference the counting work.
|
|
87
99
|
const secondReply = await sc.expectMessage(
|
|
88
|
-
(m) =>
|
|
89
|
-
m.messageId
|
|
90
|
-
|
|
100
|
+
(m) => {
|
|
101
|
+
if (m.messageId <= firstReply.messageId) return false;
|
|
102
|
+
const txt = m.text;
|
|
103
|
+
const answersTheQuestion =
|
|
104
|
+
/\b4\b|\bfour\b|two\s+plus\s+two|2\s*\+\s*2/i.test(txt);
|
|
105
|
+
const narratesQueued =
|
|
106
|
+
/📥|\bqueued\b|new\s+(?:independent\s+)?task|fresh\s+task/i.test(
|
|
107
|
+
txt,
|
|
108
|
+
);
|
|
109
|
+
// Pass if either: the explicit narration is present, OR the
|
|
110
|
+
// reply answers cleanly without referencing the counting
|
|
111
|
+
// task. The latter is the substantive behavioural check —
|
|
112
|
+
// the queued task is isolated from the in-flight context.
|
|
113
|
+
const isolatedFromCounting = !/\bcount(ing)?\b|\bsleep\b/i.test(
|
|
114
|
+
txt,
|
|
115
|
+
);
|
|
116
|
+
return answersTheQuestion && (narratesQueued || isolatedFromCounting);
|
|
117
|
+
},
|
|
91
118
|
{ from: "bot", timeout: 120_000 },
|
|
92
119
|
);
|
|
93
|
-
expect(secondReply.text).toMatch(/4|
|
|
120
|
+
expect(secondReply.text).toMatch(/4|four|2\s*\+\s*2/i);
|
|
94
121
|
} finally {
|
|
95
122
|
await sc.tearDown();
|
|
96
123
|
}
|
|
@@ -556,7 +556,15 @@ def main():
|
|
|
556
556
|
max_tokens=config.get("recallMaxTokens", 1024),
|
|
557
557
|
budget=config.get("recallBudget", "mid"),
|
|
558
558
|
types=config.get("recallTypes"),
|
|
559
|
-
timeout
|
|
559
|
+
# 8s in-script timeout leaves 4s headroom inside the 12s
|
|
560
|
+
# UserPromptSubmit hook ceiling (see hooks.json:20) for cache
|
|
561
|
+
# write + block formatting. Tightened from 10s in switchroom
|
|
562
|
+
# v0.13.22: the 2026-05-24 audit showed 17-26% of turns
|
|
563
|
+
# breaching the 12s hook timeout on heavy agents (finn /
|
|
564
|
+
# gymbro / klanker), which dropped the recall entirely; an
|
|
565
|
+
# earlier-hard-timeout failure returns cleanly with no
|
|
566
|
+
# memories instead of blowing past the hook ceiling.
|
|
567
|
+
timeout=8,
|
|
560
568
|
)
|
|
561
569
|
results = response.get("results", [])
|
|
562
570
|
except Exception as e:
|
|
@@ -577,7 +585,15 @@ def main():
|
|
|
577
585
|
max_tokens=config.get("recallMaxTokens", 1024),
|
|
578
586
|
budget=config.get("recallBudget", "mid"),
|
|
579
587
|
types=config.get("recallTypes"),
|
|
580
|
-
timeout
|
|
588
|
+
# 8s in-script timeout leaves 4s headroom inside the 12s
|
|
589
|
+
# UserPromptSubmit hook ceiling (see hooks.json:20) for cache
|
|
590
|
+
# write + block formatting. Tightened from 10s in switchroom
|
|
591
|
+
# v0.13.22: the 2026-05-24 audit showed 17-26% of turns
|
|
592
|
+
# breaching the 12s hook timeout on heavy agents (finn /
|
|
593
|
+
# gymbro / klanker), which dropped the recall entirely; an
|
|
594
|
+
# earlier-hard-timeout failure returns cleanly with no
|
|
595
|
+
# memories instead of blowing past the hook ceiling.
|
|
596
|
+
timeout=8,
|
|
581
597
|
)
|
|
582
598
|
extra_results = extra_response.get("results", [])
|
|
583
599
|
if extra_results:
|