switchroom 0.12.18 → 0.12.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +85 -81
- package/dist/auth-broker/index.js +85 -81
- package/dist/cli/drive-write-pretool.mjs +10 -10
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +1284 -998
- package/dist/host-control/main.js +104 -100
- package/dist/vault/approvals/kernel-server.js +87 -83
- package/dist/vault/broker/server.js +94 -90
- package/package.json +2 -2
- package/telegram-plugin/dist/bridge/bridge.js +112 -112
- package/telegram-plugin/dist/gateway/gateway.js +448 -209
- package/telegram-plugin/dist/server.js +160 -160
- package/telegram-plugin/gateway/gateway.ts +180 -13
- package/telegram-plugin/gateway/inbound-delivery-gate.ts +85 -0
- package/telegram-plugin/gateway/inbound-spool.ts +272 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +42 -3
- package/telegram-plugin/gateway/turn-state-purge.ts +71 -0
- package/telegram-plugin/tests/inbound-delivery-gate.test.ts +53 -0
- package/telegram-plugin/tests/inbound-spool.test.ts +229 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +66 -0
- package/telegram-plugin/tests/turn-state-purge.test.ts +109 -0
|
@@ -17,7 +17,7 @@ import { execFileSync, execSync, spawn } from 'child_process'
|
|
|
17
17
|
import {
|
|
18
18
|
readFileSync, writeFileSync, mkdirSync, readdirSync, rmSync,
|
|
19
19
|
statSync, renameSync, realpathSync, chmodSync, openSync, closeSync,
|
|
20
|
-
existsSync, unlinkSync,
|
|
20
|
+
existsSync, unlinkSync, appendFileSync,
|
|
21
21
|
} from 'fs'
|
|
22
22
|
import { homedir } from 'os'
|
|
23
23
|
import { join, extname, sep, basename } from 'path'
|
|
@@ -249,6 +249,9 @@ import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js
|
|
|
249
249
|
import { handleRequestDriveApproval } from './drive-write-approval.js'
|
|
250
250
|
import { buildDiffPreviewCard } from './diff-preview-card.js'
|
|
251
251
|
import { createPendingInboundBuffer, redeliverBufferedInbound, idleDrainTick } from './pending-inbound-buffer.js'
|
|
252
|
+
import { createInboundSpool } from './inbound-spool.js'
|
|
253
|
+
import { purgeStaleTurnsForChat } from './turn-state-purge.js'
|
|
254
|
+
import { decideInboundDelivery } from './inbound-delivery-gate.js'
|
|
252
255
|
import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
|
|
253
256
|
import {
|
|
254
257
|
buildVaultGrantApprovedInbound,
|
|
@@ -1278,6 +1281,30 @@ function purgeReactionTracking(key: string): void {
|
|
|
1278
1281
|
// response to the client was already sent when the restart was
|
|
1279
1282
|
// scheduled, so nobody is waiting on this.
|
|
1280
1283
|
if (activeTurnStartedAt.size === 0) {
|
|
1284
|
+
// #1556: the deterministic delivery point. claude has just gone
|
|
1285
|
+
// idle — flush any inbound held mid-turn so the channel
|
|
1286
|
+
// notification lands at the idle prompt and submits as a fresh
|
|
1287
|
+
// turn (instead of stranding in the composer, the lawgpt wedge).
|
|
1288
|
+
// Zero-churn: depth check first, no work on the common empty path.
|
|
1289
|
+
// Lossless: redeliver re-buffers any per-message miss (bridge
|
|
1290
|
+
// mid-reconnect), which onClientRegistered then drains.
|
|
1291
|
+
const selfAgentForFlush = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
1292
|
+
if (pendingInboundBuffer.depth(selfAgentForFlush) > 0) {
|
|
1293
|
+
const fr = redeliverBufferedInbound(
|
|
1294
|
+
pendingInboundBuffer,
|
|
1295
|
+
selfAgentForFlush,
|
|
1296
|
+
(m) => ipcServer.sendToAgent(selfAgentForFlush, m),
|
|
1297
|
+
inboundSpool,
|
|
1298
|
+
)
|
|
1299
|
+
if (fr.redelivered > 0) {
|
|
1300
|
+
process.stderr.write(
|
|
1301
|
+
`telegram gateway: turn-complete flushed ${fr.redelivered}/${fr.drained} ` +
|
|
1302
|
+
`held inbound for ${selfAgentForFlush}` +
|
|
1303
|
+
`${fr.rebuffered > 0 ? ` (${fr.rebuffered} re-buffered)` : ''}\n`,
|
|
1304
|
+
)
|
|
1305
|
+
}
|
|
1306
|
+
}
|
|
1307
|
+
|
|
1281
1308
|
if (pendingRestarts.size > 0) {
|
|
1282
1309
|
for (const [agentName, _timestamp] of pendingRestarts.entries()) {
|
|
1283
1310
|
triggerSelfRestart(agentName, 'turn-complete-pending-restart');
|
|
@@ -1292,6 +1319,32 @@ function purgeReactionTracking(key: string): void {
|
|
|
1292
1319
|
}
|
|
1293
1320
|
}
|
|
1294
1321
|
|
|
1322
|
+
/**
 * Ends `turn` by nulling the module-level `currentTurn` and purging the
 * turn's reaction tracking as one inseparable step.
 *
 * Acts only while `turn` is still the live `currentTurn` (identity
 * comparison). If the atom was already nulled, or a newer turn has been
 * swapped in, the call is a no-op, so repeated or late calls are harmless.
 *
 * The purge targets `statusKey(turn.sessionChatId, turn.sessionThreadId)`.
 * Pairing it with the null is the purpose of this helper: a statusKey left
 * behind in `activeTurnStartedAt` after the turn is nulled makes the #1556
 * turn-gate treat every new inbound as mid-turn and hold it indefinitely
 * (gymbro / klanker held-mid-turn symptom, 2026-05-20). The silence-poke
 * framework-fallback sweeps sibling keys as defense-in-depth
 * (`turn-state-purge.ts`); this helper closes the leak where it starts.
 *
 * @param turn The turn being ended.
 */
function endCurrentTurnAtomic(turn: CurrentTurn): void {
  const stillLive = currentTurn === turn
  if (stillLive) {
    // Null before purging: purgeReactionTracking runs with the atom already cleared.
    currentTurn = null
    purgeReactionTracking(statusKey(turn.sessionChatId, turn.sessionThreadId))
  }
}
|
|
1347
|
+
|
|
1295
1348
|
/**
|
|
1296
1349
|
* Model-idle proactive-compaction check. Called ONLY from the
|
|
1297
1350
|
* activeTurnStartedAt.size === 0 gate above (never mid-turn). Opt-in via
|
|
@@ -2985,6 +3038,23 @@ silencePoke.startTimer({
|
|
|
2985
3038
|
// for this chat starts a fresh turn instead of queueing forever.
|
|
2986
3039
|
silencePoke.endTurn(fbKey)
|
|
2987
3040
|
purgeReactionTracking(fbKey)
|
|
3041
|
+
// Defense-in-depth: the fallback's purgeReactionTracking above
|
|
3042
|
+
// clears the canonical statusKey(chatId, threadId) for fbKey
|
|
3043
|
+
// only. activeTurnStartedAt can hold sibling entries for the
|
|
3044
|
+
// SAME chat (different threads, or a `null` vs `undefined`-thread
|
|
3045
|
+
// variant left over from a normal turn-end path that nulled
|
|
3046
|
+
// currentTurn without invoking purgeReactionTracking — the
|
|
3047
|
+
// gymbro/klanker held-mid-turn symptom, 2026-05-20). Any sibling
|
|
3048
|
+
// for fbChatId is by definition stale when THIS fallback fires
|
|
3049
|
+
// (the chat has been silent ≥5 min); sweep them via the same
|
|
3050
|
+
// purger. Multi-chat-safe — only touches keys for fbChatId, so
|
|
3051
|
+
// #1546's intentional cross-chat safety guard is preserved.
|
|
3052
|
+
// See turn-state-purge.ts.
|
|
3053
|
+
const fbExtraPurge = purgeStaleTurnsForChat(
|
|
3054
|
+
fbChatId,
|
|
3055
|
+
activeTurnStartedAt.keys(),
|
|
3056
|
+
purgeReactionTracking,
|
|
3057
|
+
)
|
|
2988
3058
|
// Null `currentTurn` if it's still pointing at the wedged turn —
|
|
2989
3059
|
// when claude eventually fires a late `turn_end` for this session
|
|
2990
3060
|
// (or never does), the handler's `const turn = currentTurn` snapshot
|
|
@@ -3011,13 +3081,15 @@ silencePoke.startTimer({
|
|
|
3011
3081
|
pendingInboundBuffer,
|
|
3012
3082
|
fbSelfAgent,
|
|
3013
3083
|
(m) => ipcServer.sendToAgent(fbSelfAgent, m),
|
|
3084
|
+
inboundSpool,
|
|
3014
3085
|
)
|
|
3015
3086
|
process.stderr.write(
|
|
3016
3087
|
`telegram gateway: silence-poke framework-fallback ended wedged turn ` +
|
|
3017
3088
|
`chat=${fbChatId} thread=${ctx.threadId ?? '-'} silence_ms=${ctx.silenceMs} ` +
|
|
3018
3089
|
`currentTurn_nulled=${turnMatchesFallback} ` +
|
|
3019
3090
|
`drained_buffered=${fbRedeliver.redelivered}/${fbRedeliver.drained}` +
|
|
3020
|
-
`${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}
|
|
3091
|
+
`${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}` +
|
|
3092
|
+
`${fbExtraPurge.purged.length > 0 ? ` extra_keys_purged=${fbExtraPurge.purged.length}` : ''}\n`,
|
|
3021
3093
|
)
|
|
3022
3094
|
},
|
|
3023
3095
|
})
|
|
@@ -3029,7 +3101,42 @@ silencePoke.startTimer({
|
|
|
3029
3101
|
// vault_request_access card during the 100ms bridge-reconnect window
|
|
3030
3102
|
// would mint the grant but silently drop the `vault_grant_approved`
|
|
3031
3103
|
// inbound, leaving the agent stuck waiting for a manual poke.
|
|
3032
|
-
|
|
3104
|
+
// Durable inbound spool on the persistent per-agent volume
|
|
3105
|
+
// (STATE_DIR = /state/agent/telegram in prod — survives container
|
|
3106
|
+
// recreate). Makes the "⏳ your message is queued and will be
|
|
3107
|
+
// processed when it reconnects" promise deterministic across a
|
|
3108
|
+
// gateway/container restart (finn/carrie lost-on-restart incident,
|
|
3109
|
+
// 2026-05-19). STATIC mode has no runtime/bridge, so no spool.
|
|
3110
|
+
const inboundSpool = STATIC
|
|
3111
|
+
? undefined
|
|
3112
|
+
: createInboundSpool({
|
|
3113
|
+
path: join(STATE_DIR, 'inbound-spool.jsonl'),
|
|
3114
|
+
fs: {
|
|
3115
|
+
appendFileSync: (p, d) => appendFileSync(p, d),
|
|
3116
|
+
readFileSync: (p) => readFileSync(p, 'utf8'),
|
|
3117
|
+
writeFileSync: (p, d) => writeFileSync(p, d),
|
|
3118
|
+
renameSync: (a, b) => renameSync(a, b),
|
|
3119
|
+
existsSync: (p) => existsSync(p),
|
|
3120
|
+
statSizeSync: (p) => statSync(p).size,
|
|
3121
|
+
},
|
|
3122
|
+
})
|
|
3123
|
+
const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
|
|
3124
|
+
// Boot-replay: re-queue every un-acked spooled inbound into the
|
|
3125
|
+
// in-memory buffer so the existing drain triggers (onClientRegistered
|
|
3126
|
+
// / silence-poke #1546 / idle-drain #1549) deliver them. push →
|
|
3127
|
+
// spool.put dedups on the already-live id, so this re-push does NOT
|
|
3128
|
+
// double-append. This is what makes a queued message survive a
|
|
3129
|
+
// restart instead of being silently lost.
|
|
3130
|
+
if (inboundSpool != null) {
|
|
3131
|
+
const replay = inboundSpool.liveEntries()
|
|
3132
|
+
for (const e of replay) pendingInboundBuffer.push(e.agent, e.msg)
|
|
3133
|
+
if (replay.length > 0) {
|
|
3134
|
+
process.stderr.write(
|
|
3135
|
+
`telegram gateway: inbound-spool boot-replay re-queued ${replay.length} ` +
|
|
3136
|
+
`un-acked inbound (durable-queue, survives restart)\n`,
|
|
3137
|
+
)
|
|
3138
|
+
}
|
|
3139
|
+
}
|
|
3033
3140
|
const pendingPermissionBuffer = createPendingPermissionBuffer()
|
|
3034
3141
|
|
|
3035
3142
|
/**
|
|
@@ -3080,6 +3187,12 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
3080
3187
|
for (const msg of pending) {
|
|
3081
3188
|
try {
|
|
3082
3189
|
client.send(msg)
|
|
3190
|
+
// Confirmed delivery to the just-registered live bridge →
|
|
3191
|
+
// tombstone the durable spool entry so it isn't boot-replayed
|
|
3192
|
+
// again. A throw below leaves it spooled (un-acked) so the
|
|
3193
|
+
// idle-drain / escalation path still recovers it — strictly
|
|
3194
|
+
// safer than the old log-and-drop.
|
|
3195
|
+
inboundSpool?.ack(msg)
|
|
3083
3196
|
} catch (err) {
|
|
3084
3197
|
process.stderr.write(
|
|
3085
3198
|
`telegram gateway: pending-inbound drain failed agent=${client.agentName} ` +
|
|
@@ -3542,12 +3655,17 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
3542
3655
|
//
|
|
3543
3656
|
// This is the third drain trigger. It's gated to be zero-cost and
|
|
3544
3657
|
// zero-churn: skip entirely when nothing is buffered (one Map.get, no
|
|
3545
|
-
// log)
|
|
3546
|
-
//
|
|
3547
|
-
//
|
|
3548
|
-
//
|
|
3549
|
-
//
|
|
3550
|
-
//
|
|
3658
|
+
// log), when the bridge isn't alive (exactly sendToAgent's own guard —
|
|
3659
|
+
// so we never drain into a dead bridge and re-buffer/log-spin), OR
|
|
3660
|
+
// when a turn is in flight. The turn gate is #1556: a message
|
|
3661
|
+
// delivered while a turn is active is NOT safely queued by the bridge
|
|
3662
|
+
// — claude types it into its TUI composer and the auto-submit races
|
|
3663
|
+
// turn-completion, stranding it (the lawgpt wedge). Draining only at
|
|
3664
|
+
// `activeTurnStartedAt.size === 0` guarantees the channel notification
|
|
3665
|
+
// lands at an idle prompt and submits as a fresh turn. Only when there
|
|
3666
|
+
// IS a buffered message AND a live bridge AND no active turn do we
|
|
3667
|
+
// reuse the #1546 `redeliverBufferedInbound` (lossless: re-buffers any
|
|
3668
|
+
// per-message miss).
|
|
3551
3669
|
const IDLE_DRAIN_INTERVAL_MS = 5000
|
|
3552
3670
|
if (!STATIC) {
|
|
3553
3671
|
setInterval(() => {
|
|
@@ -3556,10 +3674,14 @@ if (!STATIC) {
|
|
|
3556
3674
|
pendingInboundBuffer,
|
|
3557
3675
|
selfAgent,
|
|
3558
3676
|
() => {
|
|
3677
|
+
// #1556: never drain mid-turn — that re-creates the composer
|
|
3678
|
+
// wedge this buffer exists to prevent.
|
|
3679
|
+
if (activeTurnStartedAt.size > 0) return false
|
|
3559
3680
|
const c = ipcServer.getClient(selfAgent)
|
|
3560
3681
|
return c != null && c.isAlive()
|
|
3561
3682
|
},
|
|
3562
3683
|
(m) => ipcServer.sendToAgent(selfAgent, m),
|
|
3684
|
+
inboundSpool,
|
|
3563
3685
|
)
|
|
3564
3686
|
if (r != null && r.redelivered > 0) {
|
|
3565
3687
|
process.stderr.write(
|
|
@@ -3568,6 +3690,28 @@ if (!STATIC) {
|
|
|
3568
3690
|
`${r.rebuffered > 0 ? ` (${r.rebuffered} re-buffered)` : ''}\n`,
|
|
3569
3691
|
)
|
|
3570
3692
|
}
|
|
3693
|
+
// Bounded escalation: a spooled inbound still un-acked past its
|
|
3694
|
+
// bound (default 15 min — well past the 5-min silence-poke ladder)
|
|
3695
|
+
// is undeliverable in practice. Retract the "will be processed"
|
|
3696
|
+
// promise EXPLICITLY (honest failure) instead of letting it sit
|
|
3697
|
+
// forever. This is what makes the guarantee deterministic: every
|
|
3698
|
+
// queued message ends either delivered or visibly retracted.
|
|
3699
|
+
inboundSpool?.sweepEscalations((e) => {
|
|
3700
|
+
const chat = e.msg.chatId
|
|
3701
|
+
const threadOpts =
|
|
3702
|
+
typeof e.msg.meta?.threadId === 'string' && e.msg.meta.threadId
|
|
3703
|
+
? { message_thread_id: Number(e.msg.meta.threadId) }
|
|
3704
|
+
: {}
|
|
3705
|
+
void swallowingApiCall(
|
|
3706
|
+
() =>
|
|
3707
|
+
bot.api.sendMessage(
|
|
3708
|
+
chat,
|
|
3709
|
+
"⚠️ I couldn't deliver an earlier message to the agent after repeated retries (it survived restarts but the agent never picked it up). Please resend it.",
|
|
3710
|
+
{ ...threadOpts },
|
|
3711
|
+
),
|
|
3712
|
+
{ chat_id: chat, verb: 'inbound-spool-escalation' },
|
|
3713
|
+
)
|
|
3714
|
+
})
|
|
3571
3715
|
}, IDLE_DRAIN_INTERVAL_MS).unref()
|
|
3572
3716
|
}
|
|
3573
3717
|
|
|
@@ -5587,7 +5731,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5587
5731
|
turn.answerStream = null
|
|
5588
5732
|
}
|
|
5589
5733
|
// Null the atom — this turn is being abandoned.
|
|
5590
|
-
|
|
5734
|
+
endCurrentTurnAtomic(turn)
|
|
5591
5735
|
// #549 fix — context-exhaustion teardown also resets preamble state.
|
|
5592
5736
|
preambleSuppressor.reset()
|
|
5593
5737
|
}
|
|
@@ -5785,7 +5929,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5785
5929
|
// returns early at handler entry. A new `enqueue` swaps in a
|
|
5786
5930
|
// fresh atom; the silent-turn teardown doesn't need to preserve
|
|
5787
5931
|
// any of the prior turn's state.
|
|
5788
|
-
|
|
5932
|
+
endCurrentTurnAtomic(turn)
|
|
5789
5933
|
// #549 fix — silent-marker teardown drops any pending preamble.
|
|
5790
5934
|
preambleSuppressor.dropNow()
|
|
5791
5935
|
return
|
|
@@ -5819,7 +5963,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5819
5963
|
// sendMessage await for this turn will see currentTurn == null
|
|
5820
5964
|
// and bail; a new enqueue will swap in a fresh atom. The
|
|
5821
5965
|
// `backstop*` locals above hold everything the IIFE needs.
|
|
5822
|
-
|
|
5966
|
+
endCurrentTurnAtomic(turn)
|
|
5823
5967
|
// #549 fix — turn-flush takes ownership of the captured-text
|
|
5824
5968
|
// backup; reset the preamble buffer (its content is already in
|
|
5825
5969
|
// the captured `capturedText`, which turn-flush is about to send).
|
|
@@ -6091,7 +6235,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
6091
6235
|
// #1067: null the atom in one assignment, replacing the seven
|
|
6092
6236
|
// field clears the pre-refactor version did. Any late-arriving
|
|
6093
6237
|
// event for this turn will see currentTurn == null and bail.
|
|
6094
|
-
|
|
6238
|
+
endCurrentTurnAtomic(turn)
|
|
6095
6239
|
// #549 fix — preamble flush already happened at the TOP of this
|
|
6096
6240
|
// turn_end handler (before turn.answerStream is nulled). See
|
|
6097
6241
|
// comment near line 3431.
|
|
@@ -7377,6 +7521,29 @@ async function handleInbound(
|
|
|
7377
7521
|
// push to pendingInboundBuffer, which onClientRegistered drains on
|
|
7378
7522
|
// the next bridge register — so the notice below is now truthful.
|
|
7379
7523
|
const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
7524
|
+
|
|
7525
|
+
// #1556: turn-gated delivery. A non-steering inbound that arrives
|
|
7526
|
+
// mid-turn must NOT be sent to the bridge now — claude would type it
|
|
7527
|
+
// into its TUI composer and the auto-submit races turn-completion,
|
|
7528
|
+
// stranding the message (the lawgpt wedge, 2026-05-19). Buffer it;
|
|
7529
|
+
// `purgeReactionTracking`'s turn-complete hook and the turn-gated
|
|
7530
|
+
// idle-drain flush it the instant claude goes idle, where the channel
|
|
7531
|
+
// notification submits cleanly as a fresh turn. Steering messages are
|
|
7532
|
+
// exempt — reaching claude mid-turn is the whole point of /steer.
|
|
7533
|
+
if (
|
|
7534
|
+
decideInboundDelivery({
|
|
7535
|
+
turnInFlight: activeTurnStartedAt.size > 0,
|
|
7536
|
+
isSteering,
|
|
7537
|
+
}) === 'buffer-until-idle'
|
|
7538
|
+
) {
|
|
7539
|
+
pendingInboundBuffer.push(selfAgent, inboundMsg)
|
|
7540
|
+
process.stderr.write(
|
|
7541
|
+
`telegram gateway: inbound held mid-turn agent=${selfAgent} ` +
|
|
7542
|
+
`chat=${chat_id} msg=${msgId ?? '-'} — will flush on turn-complete\n`,
|
|
7543
|
+
)
|
|
7544
|
+
return
|
|
7545
|
+
}
|
|
7546
|
+
|
|
7380
7547
|
const delivered = ipcServer.sendToAgent(selfAgent, inboundMsg)
|
|
7381
7548
|
if (!delivered) {
|
|
7382
7549
|
pendingInboundBuffer.push(selfAgent, inboundMsg)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inbound delivery gate (#1556 — the lawgpt composer-wedge).
|
|
3
|
+
*
|
|
4
|
+
* Pure decision: given the live turn state, should a freshly-received
|
|
5
|
+
* Telegram inbound be delivered to the bridge *now*, or held in the
|
|
6
|
+
* pending-inbound buffer until claude is idle?
|
|
7
|
+
*
|
|
8
|
+
* ## Why this exists
|
|
9
|
+
*
|
|
10
|
+
* The gateway used to `ipcServer.sendToAgent(inbound)` unconditionally,
|
|
11
|
+
* buffering ONLY when the bridge was offline. The load-bearing (and
|
|
12
|
+
* false) assumption — stated verbatim in three places before this fix
|
|
13
|
+
* (`pending-inbound-buffer.ts`, the idle-drain comment, and the
|
|
14
|
+
* implicit unconditional send) — was:
|
|
15
|
+
*
|
|
16
|
+
* "a message delivered while a turn is active is queued normally by
|
|
17
|
+
* the bridge, same as a live arrival, not lost."
|
|
18
|
+
*
|
|
19
|
+
* It is not. The bridge converts an inbound into an MCP
|
|
20
|
+
* `notifications/claude/channel` notification (`bridge.ts:onInbound`).
|
|
21
|
+
* When claude receives that notification mid-turn, the unmodified CLI
|
|
22
|
+
* types the text into its TUI composer and relies on an auto-submit
|
|
23
|
+
* once the turn ends. That submit races turn-completion and frequently
|
|
24
|
+
* does not fire — the message strands in the composer, claude sits at
|
|
25
|
+
* an idle prompt with the user's instruction un-actioned, and nothing
|
|
26
|
+
* self-heals it (the turn-active watchdog only catches *in-turn* hangs;
|
|
27
|
+
* this is *between-turns*-with-undelivered-input, which reads as
|
|
28
|
+
* healthy idle). Observed live: agent `lawgpt`, 2026-05-19 — a
|
|
29
|
+
* follow-up message sat unsubmitted indefinitely; only a restart
|
|
30
|
+
* cleared it, and the restart *lost* the message.
|
|
31
|
+
*
|
|
32
|
+
* ## The deterministic guarantee
|
|
33
|
+
*
|
|
34
|
+
* A non-steering inbound on the Telegram `handleInbound` path is
|
|
35
|
+
* delivered to the bridge ONLY when no turn is in flight. The channel
|
|
36
|
+
* notification therefore always lands at an idle claude prompt, where
|
|
37
|
+
* it submits cleanly as a fresh turn. It can be *delayed* (until the
|
|
38
|
+
* current turn completes) but can never strand in the composer. The
|
|
39
|
+
* turn-complete hook (`purgeReactionTracking`) and the turn-gated
|
|
40
|
+
* idle-drain timer flush the buffer the instant
|
|
41
|
+
* `activeTurnStartedAt.size === 0`.
|
|
42
|
+
*
|
|
43
|
+
* Scope: this gates the Telegram `handleInbound` path only — the one
|
|
44
|
+
* the lawgpt wedge hit. The `inject_inbound` IPC path (cron / synthetic
|
|
45
|
+
* operator wakeups) reaches the bridge directly and is deliberately
|
|
46
|
+
* NOT gated here: cron fires carry at-least-once replay semantics and
|
|
47
|
+
* their delivery contract is a separate product decision, out of scope
|
|
48
|
+
* for this bug.
|
|
49
|
+
*
|
|
50
|
+
* ## Steering is deliberately exempt
|
|
51
|
+
*
|
|
52
|
+
* An explicit `/steer` (`/s`) message is *meant* to reach claude
|
|
53
|
+
* mid-turn — that is the whole point of the steering feature (redirect
|
|
54
|
+
* the agent while it works). Steering messages keep immediate delivery.
|
|
55
|
+
* The wedge only ever affected the queued-mid-turn default path.
|
|
56
|
+
*/
|
|
57
|
+
|
|
58
|
+
export interface InboundDeliveryGateInput {
|
|
59
|
+
/** A turn is in flight RIGHT NOW (live: `activeTurnStartedAt.size > 0`),
|
|
60
|
+
* evaluated at delivery time — not a receipt-time snapshot, so a turn
|
|
61
|
+
* that completed between receipt and here correctly reads as idle. */
|
|
62
|
+
turnInFlight: boolean
|
|
63
|
+
/** This inbound carried an explicit `/steer` (`/s`) prefix and is an
|
|
64
|
+
* intentional mid-turn redirect. */
|
|
65
|
+
isSteering: boolean
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
export type InboundDeliveryDecision =
|
|
69
|
+
/** Send to the bridge now (idle prompt, or an intentional steer). */
|
|
70
|
+
| 'deliver'
|
|
71
|
+
/** Hold in the pending-inbound buffer; the turn-complete hook /
|
|
72
|
+
* turn-gated idle-drain flushes it when claude goes idle. */
|
|
73
|
+
| 'buffer-until-idle'
|
|
74
|
+
|
|
75
|
+
/**
 * Decides whether an inbound goes to the bridge now or waits for idle.
 *
 * Pure function of its input. Delivery is deferred in exactly one case:
 * a turn is in flight AND the message is not a steering message. An idle
 * agent delivers at once; a steering message delivers even mid-turn
 * because reaching the agent mid-turn is its intent.
 *
 * @param input Live turn state plus the steering flag for this inbound.
 * @returns `'buffer-until-idle'` to hold the message, otherwise `'deliver'`.
 */
export function decideInboundDelivery(
  input: InboundDeliveryGateInput,
): InboundDeliveryDecision {
  const { turnInFlight, isSteering } = input
  const holdForIdle = turnInFlight && !isSteering
  return holdForIdle ? 'buffer-until-idle' : 'deliver'
}
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* inbound-spool.ts — durable, crash-tolerant spool for buffered inbound.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists: `pending-inbound-buffer.ts` is in-memory only. A
|
|
5
|
+
* gateway/container restart (switchroom update, agent restart, a
|
|
6
|
+
* self-restart, an OOM) destroys it — so the user-facing promise
|
|
7
|
+
* "⏳ your message is queued and will be processed when it reconnects"
|
|
8
|
+
* (gateway.ts) is a lie across a restart. Proven twice: finn and
|
|
9
|
+
* carrie (2026-05-19) lost the user's message on restart and the user
|
|
10
|
+
* had to resend. #1546/#1549 only shrank the in-memory delivery
|
|
11
|
+
* window; they cannot survive process death.
|
|
12
|
+
*
|
|
13
|
+
* This module makes the promise DETERMINISTIC: every buffered inbound
|
|
14
|
+
* is also appended to a JSONL spool on the persistent per-agent volume
|
|
15
|
+
* (`/state/agent/telegram/…`, survives container recreate). On boot the
|
|
16
|
+
* gateway replays un-acked entries back into the in-memory buffer, so
|
|
17
|
+
* the existing drain machinery delivers them. An entry is acked (and
|
|
18
|
+
* tombstoned) ONLY on confirmed delivery to a live registered bridge.
|
|
19
|
+
* Un-acked entries older than `escalateAfterMs` are surfaced to the
|
|
20
|
+
* user via an explicit "couldn't deliver — resend?" callback and then
|
|
21
|
+
* dropped: the promise is then ALWAYS resolved — kept, or visibly
|
|
22
|
+
* retracted — never silently lost.
|
|
23
|
+
*
|
|
24
|
+
* Scope (v1): the ack is "delivered to a live registered bridge", not
|
|
25
|
+
* "claude consumed it". A true claude→gateway consumption-ack needs a
|
|
26
|
+
* new bidirectional bridge protocol (high blast radius) and is a
|
|
27
|
+
* documented follow-up. v1 already eliminates the silent-loss-on-
|
|
28
|
+
* restart class — the actual incident class.
|
|
29
|
+
*
|
|
30
|
+
* Crash-consistency: append-only JSONL, one self-contained JSON object
|
|
31
|
+
* per line, written with a trailing newline in a single `appendFileSync`
|
|
32
|
+
* (atomic for small writes on local fs). A torn final line on a crash
|
|
33
|
+
* mid-write is tolerated: replay skips any line that does not
|
|
34
|
+
* round-trip `JSON.parse` + shape-check. Acks are themselves appended
|
|
35
|
+
* as tombstone lines (`{t:"ack",id}`) rather than rewriting the file;
|
|
36
|
+
* a bounded `compact()` rewrites the file dropping acked/escalated ids
|
|
37
|
+
* when it grows past `compactAtBytes`.
|
|
38
|
+
*
|
|
39
|
+
* This module is PURE w.r.t. its injected fs + clock seams so the
|
|
40
|
+
* crash/dedup/replay/escalation logic is unit-tested without a real
|
|
41
|
+
* gateway (mirrors the #1544/#1546/#1549 pure-seam idiom).
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
import type { InboundMessage } from './ipc-protocol.js'
|
|
45
|
+
|
|
46
|
+
/**
 * Builds the stable dedup id under which an inbound is spooled.
 *
 * - A message with a positive numeric `messageId` gets
 *   `m:<chatId>:<messageId>`.
 * - Anything else (synthetic / cron inbounds use `messageId` 0) gets
 *   `s:<chatId>:<source>:<ts>`, where `<source>` is `meta.source` or `-`
 *   when absent. Retries of the same logical event share an id, while
 *   events with a different `ts` stay distinct.
 *
 * @param msg The inbound to derive an id for.
 * @returns The dedup id string.
 */
export function spoolId(msg: InboundMessage): string {
  const { chatId, messageId } = msg
  const hasRealMessageId = typeof messageId === 'number' && messageId > 0
  if (hasRealMessageId) return `m:${chatId}:${messageId}`

  const source = msg.meta?.source ?? '-'
  return `s:${chatId}:${source}:${msg.ts}`
}
|
|
58
|
+
|
|
59
|
+
interface SpoolRecord {
|
|
60
|
+
t: 'put' | 'ack'
|
|
61
|
+
id: string
|
|
62
|
+
/** Present only on `put`. The full inbound to replay. */
|
|
63
|
+
msg?: InboundMessage
|
|
64
|
+
/** Present only on `put`. Owning agent (replay re-pushes per agent). */
|
|
65
|
+
agent?: string
|
|
66
|
+
/** Present only on `put`. ms epoch first-spooled — drives escalation. */
|
|
67
|
+
firstAt?: number
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
export interface InboundSpoolFsSeam {
|
|
71
|
+
appendFileSync: (path: string, data: string) => void
|
|
72
|
+
readFileSync: (path: string) => string
|
|
73
|
+
writeFileSync: (path: string, data: string) => void
|
|
74
|
+
/** Atomic same-dir replace (POSIX rename). Used so compaction can't
|
|
75
|
+
* lose entries to a crash mid-rewrite. */
|
|
76
|
+
renameSync: (from: string, to: string) => void
|
|
77
|
+
existsSync: (path: string) => boolean
|
|
78
|
+
statSizeSync: (path: string) => number
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
export interface InboundSpoolOptions {
|
|
82
|
+
path: string
|
|
83
|
+
fs: InboundSpoolFsSeam
|
|
84
|
+
now?: () => number
|
|
85
|
+
log?: (line: string) => void
|
|
86
|
+
/** Un-acked entries older than this are escalated then dropped.
|
|
87
|
+
* Default 15 min — comfortably past the 5-min silence-poke ladder
|
|
88
|
+
* so self-heal gets every chance before we retract the promise. */
|
|
89
|
+
escalateAfterMs?: number
|
|
90
|
+
/** Rewrite-compact the JSONL once it exceeds this. Default 256 KiB. */
|
|
91
|
+
compactAtBytes?: number
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
export interface ReplayEntry {
|
|
95
|
+
agent: string
|
|
96
|
+
msg: InboundMessage
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
export interface InboundSpool {
|
|
100
|
+
/** Durably record `msg` for `agent`. Idempotent by spoolId: a
|
|
101
|
+
* re-spool of an already-live id is a no-op (returns false). */
|
|
102
|
+
put: (agent: string, msg: InboundMessage) => boolean
|
|
103
|
+
/** Tombstone `id` — call ONLY on confirmed delivery to a live
|
|
104
|
+
* registered bridge. Idempotent. */
|
|
105
|
+
ack: (msg: InboundMessage) => void
|
|
106
|
+
/** Live (un-acked) entries, oldest first. Used at boot to re-push
|
|
107
|
+
* into the in-memory buffer. Pure read — does not mutate. */
|
|
108
|
+
liveEntries: () => ReplayEntry[]
|
|
109
|
+
/** Escalate+drop entries older than `escalateAfterMs`. Calls
|
|
110
|
+
* `onEscalate` once per dropped entry (post the "couldn't deliver"
|
|
111
|
+
* card there). Returns the count escalated. Safe to call on a timer. */
|
|
112
|
+
sweepEscalations: (onEscalate: (e: ReplayEntry) => void) => number
|
|
113
|
+
/** Test/observability: count of live (un-acked) ids. */
|
|
114
|
+
liveCount: () => number
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
export function createInboundSpool(opts: InboundSpoolOptions): InboundSpool {
|
|
118
|
+
const { path, fs } = opts
|
|
119
|
+
const now = opts.now ?? Date.now
|
|
120
|
+
const log = opts.log ?? ((l: string) => process.stderr.write(l))
|
|
121
|
+
const escalateAfterMs = opts.escalateAfterMs ?? 15 * 60 * 1000
|
|
122
|
+
const compactAtBytes = opts.compactAtBytes ?? 256 * 1024
|
|
123
|
+
|
|
124
|
+
// In-memory projection of the on-disk log, rebuilt from the file at
|
|
125
|
+
// construction. `live` maps spoolId → the put record (insertion order
|
|
126
|
+
// preserved via the Map). An `ack` deletes from `live`.
|
|
127
|
+
const live = new Map<string, { agent: string; msg: InboundMessage; firstAt: number }>()
|
|
128
|
+
|
|
129
|
+
/**
 * Parses one line of the JSONL spool into a shape-checked record.
 *
 * Returns `null` for anything that is not a well-formed record so the
 * caller can skip it: a blank line, a torn / partial line left by a crash
 * mid-append, JSON that is not a plain object, an unknown `t`, or a
 * missing / empty `id`. A `put` must additionally carry a plain-object
 * `msg`, a non-empty string `agent` and a finite numeric `firstAt`.
 *
 * @param line Raw line from the spool file; surrounding whitespace is ignored.
 * @returns The parsed record, or `null` when the line fails the shape check.
 */
function parseLine(line: string): SpoolRecord | null {
  const s = line.trim()
  if (!s) return null
  let rec: unknown
  try {
    rec = JSON.parse(s)
  } catch {
    return null // torn / partial line from a crash mid-append — skip
  }
  if (rec == null || typeof rec !== 'object' || Array.isArray(rec)) return null
  const r = rec as Record<string, unknown>
  if (r.t !== 'put' && r.t !== 'ack') return null
  if (typeof r.id !== 'string' || r.id.length === 0) return null
  if (r.t === 'put') {
    // Arrays are `typeof 'object'` too but are never a valid inbound;
    // reject them here so a corrupt line is skipped rather than replayed.
    if (r.msg == null || typeof r.msg !== 'object' || Array.isArray(r.msg)) return null
    if (typeof r.agent !== 'string' || r.agent.length === 0) return null
    // JSON.parse yields Infinity for overflowing literals such as 1e999,
    // which is a `number` but not a usable timestamp.
    if (typeof r.firstAt !== 'number' || !Number.isFinite(r.firstAt)) return null
  }
  return r as unknown as SpoolRecord
}
|
|
149
|
+
|
|
150
|
+
/**
 * Rebuilds the in-memory `live` projection from the spool file.
 *
 * Starts from an empty map and replays the file top to bottom: a `put`
 * sets the entry for its id (a later put for the same id overwrites it),
 * an `ack` removes it. Lines that `parseLine` rejects — including a torn
 * last line — are skipped. A missing file leaves `live` empty.
 *
 * A read failure also leaves `live` empty, but it is logged: whatever was
 * spooled before is not replayed, and that should be visible to the
 * operator rather than silent.
 */
function hydrate(): void {
  live.clear()
  if (!fs.existsSync(path)) return
  let raw = ''
  try {
    raw = fs.readFileSync(path)
  } catch (err) {
    // The thrown value is `unknown`; only read `.message` off a real Error.
    const reason = err instanceof Error ? err.message : String(err)
    log(
      `inbound-spool: hydrate FAILED path=${path}: ` +
        `${reason} — spooled inbound not replayed\n`,
    )
    return
  }
  for (const line of raw.split('\n')) {
    const rec = parseLine(line)
    if (rec == null) continue
    if (rec.t === 'put') {
      // Last put for an id wins; an ack later removes it.
      live.set(rec.id, {
        agent: rec.agent as string,
        msg: rec.msg as InboundMessage,
        firstAt: rec.firstAt as number,
      })
    } else {
      live.delete(rec.id)
    }
  }
}
|
|
175
|
+
|
|
176
|
+
function appendRecord(rec: SpoolRecord): void {
|
|
177
|
+
try {
|
|
178
|
+
fs.appendFileSync(path, JSON.stringify(rec) + '\n')
|
|
179
|
+
} catch (err) {
|
|
180
|
+
// Durability is best-effort relative to fs availability; a spool
|
|
181
|
+
// write failure must NOT break live delivery. Log loudly — a
|
|
182
|
+
// persistently failing spool means we're back to in-memory-only
|
|
183
|
+
// semantics and the operator should know.
|
|
184
|
+
log(
|
|
185
|
+
`inbound-spool: append FAILED path=${path} id=${rec.id} t=${rec.t}: ` +
|
|
186
|
+
`${(err as Error).message} — durability degraded to in-memory\n`,
|
|
187
|
+
)
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
function maybeCompact(): void {
|
|
192
|
+
let size = 0
|
|
193
|
+
try {
|
|
194
|
+
size = fs.existsSync(path) ? fs.statSizeSync(path) : 0
|
|
195
|
+
} catch {
|
|
196
|
+
return
|
|
197
|
+
}
|
|
198
|
+
if (size <= compactAtBytes) return
|
|
199
|
+
// Rewrite the file as exactly the current live set (one put per
|
|
200
|
+
// live id, no acks). ATOMIC: write a sibling tmp then rename over
|
|
201
|
+
// the real path. rename(2) is atomic within a filesystem, so a
|
|
202
|
+
// crash at any point leaves EITHER the full pre-compaction log OR
|
|
203
|
+
// the full compacted log on disk — never a truncated/torn file
|
|
204
|
+
// that loses live entries after the tear. (Plain writeFileSync is
|
|
205
|
+
// not atomic; a crash mid-write of a >256 KiB rewrite could drop
|
|
206
|
+
// entries past the tear — the residual the reviewer flagged.)
|
|
207
|
+
const lines: string[] = []
|
|
208
|
+
for (const [id, e] of live) {
|
|
209
|
+
lines.push(
|
|
210
|
+
JSON.stringify({ t: 'put', id, agent: e.agent, msg: e.msg, firstAt: e.firstAt } satisfies SpoolRecord),
|
|
211
|
+
)
|
|
212
|
+
}
|
|
213
|
+
const tmp = path + '.compact.tmp'
|
|
214
|
+
try {
|
|
215
|
+
fs.writeFileSync(tmp, lines.length ? lines.join('\n') + '\n' : '')
|
|
216
|
+
fs.renameSync(tmp, path)
|
|
217
|
+
log(`inbound-spool: compacted path=${path} live=${live.size}\n`)
|
|
218
|
+
} catch (err) {
|
|
219
|
+
// Compaction is opportunistic — a failure keeps the (larger but
|
|
220
|
+
// correct) append-only log; never lose data trying to shrink it.
|
|
221
|
+
log(`inbound-spool: compact FAILED path=${path}: ${(err as Error).message}\n`)
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
hydrate()
|
|
226
|
+
|
|
227
|
+
return {
|
|
228
|
+
put(agent, msg) {
|
|
229
|
+
const id = spoolId(msg)
|
|
230
|
+
if (live.has(id)) return false // dedup: already spooled & un-acked
|
|
231
|
+
const firstAt = now()
|
|
232
|
+
live.set(id, { agent, msg, firstAt })
|
|
233
|
+
appendRecord({ t: 'put', id, agent, msg, firstAt })
|
|
234
|
+
maybeCompact()
|
|
235
|
+
return true
|
|
236
|
+
},
|
|
237
|
+
ack(msg) {
|
|
238
|
+
const id = spoolId(msg)
|
|
239
|
+
if (!live.has(id)) return // idempotent / unknown id
|
|
240
|
+
live.delete(id)
|
|
241
|
+
appendRecord({ t: 'ack', id })
|
|
242
|
+
maybeCompact()
|
|
243
|
+
},
|
|
244
|
+
liveEntries() {
|
|
245
|
+
// Insertion order = Map iteration order = oldest first.
|
|
246
|
+
return [...live.values()].map((e) => ({ agent: e.agent, msg: e.msg }))
|
|
247
|
+
},
|
|
248
|
+
sweepEscalations(onEscalate) {
|
|
249
|
+
const cutoff = now() - escalateAfterMs
|
|
250
|
+
let n = 0
|
|
251
|
+
for (const [id, e] of [...live.entries()]) {
|
|
252
|
+
if (e.firstAt > cutoff) continue
|
|
253
|
+
live.delete(id)
|
|
254
|
+
appendRecord({ t: 'ack', id }) // tombstone — promise retracted
|
|
255
|
+
try {
|
|
256
|
+
onEscalate({ agent: e.agent, msg: e.msg })
|
|
257
|
+
} catch (err) {
|
|
258
|
+
log(`inbound-spool: onEscalate threw id=${id}: ${(err as Error).message}\n`)
|
|
259
|
+
}
|
|
260
|
+
n++
|
|
261
|
+
}
|
|
262
|
+
if (n > 0) {
|
|
263
|
+
log(`inbound-spool: escalated+dropped ${n} undelivered entr${n === 1 ? 'y' : 'ies'} (older than ${escalateAfterMs}ms)\n`)
|
|
264
|
+
maybeCompact()
|
|
265
|
+
}
|
|
266
|
+
return n
|
|
267
|
+
},
|
|
268
|
+
liveCount() {
|
|
269
|
+
return live.size
|
|
270
|
+
},
|
|
271
|
+
}
|
|
272
|
+
}
|