switchroom 0.12.18 → 0.12.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +80 -80
- package/dist/auth-broker/index.js +80 -80
- package/dist/cli/drive-write-pretool.mjs +10 -10
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +359 -361
- package/dist/host-control/main.js +99 -99
- package/dist/vault/approvals/kernel-server.js +82 -82
- package/dist/vault/broker/server.js +83 -83
- package/package.json +1 -1
- package/telegram-plugin/dist/bridge/bridge.js +112 -112
- package/telegram-plugin/dist/gateway/gateway.js +410 -199
- package/telegram-plugin/dist/server.js +160 -160
- package/telegram-plugin/gateway/gateway.ts +130 -8
- package/telegram-plugin/gateway/inbound-delivery-gate.ts +85 -0
- package/telegram-plugin/gateway/inbound-spool.ts +272 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +42 -3
- package/telegram-plugin/tests/inbound-delivery-gate.test.ts +53 -0
- package/telegram-plugin/tests/inbound-spool.test.ts +229 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +66 -0
|
@@ -17,7 +17,7 @@ import { execFileSync, execSync, spawn } from 'child_process'
|
|
|
17
17
|
import {
|
|
18
18
|
readFileSync, writeFileSync, mkdirSync, readdirSync, rmSync,
|
|
19
19
|
statSync, renameSync, realpathSync, chmodSync, openSync, closeSync,
|
|
20
|
-
existsSync, unlinkSync,
|
|
20
|
+
existsSync, unlinkSync, appendFileSync,
|
|
21
21
|
} from 'fs'
|
|
22
22
|
import { homedir } from 'os'
|
|
23
23
|
import { join, extname, sep, basename } from 'path'
|
|
@@ -249,6 +249,8 @@ import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js
|
|
|
249
249
|
import { handleRequestDriveApproval } from './drive-write-approval.js'
|
|
250
250
|
import { buildDiffPreviewCard } from './diff-preview-card.js'
|
|
251
251
|
import { createPendingInboundBuffer, redeliverBufferedInbound, idleDrainTick } from './pending-inbound-buffer.js'
|
|
252
|
+
import { createInboundSpool } from './inbound-spool.js'
|
|
253
|
+
import { decideInboundDelivery } from './inbound-delivery-gate.js'
|
|
252
254
|
import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
|
|
253
255
|
import {
|
|
254
256
|
buildVaultGrantApprovedInbound,
|
|
@@ -1278,6 +1280,30 @@ function purgeReactionTracking(key: string): void {
|
|
|
1278
1280
|
// response to the client was already sent when the restart was
|
|
1279
1281
|
// scheduled, so nobody is waiting on this.
|
|
1280
1282
|
if (activeTurnStartedAt.size === 0) {
|
|
1283
|
+
// #1556: the deterministic delivery point. claude has just gone
|
|
1284
|
+
// idle — flush any inbound held mid-turn so the channel
|
|
1285
|
+
// notification lands at the idle prompt and submits as a fresh
|
|
1286
|
+
// turn (instead of stranding in the composer, the lawgpt wedge).
|
|
1287
|
+
// Zero-churn: depth check first, no work on the common empty path.
|
|
1288
|
+
// Lossless: redeliver re-buffers any per-message miss (bridge
|
|
1289
|
+
// mid-reconnect), which onClientRegistered then drains.
|
|
1290
|
+
const selfAgentForFlush = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
1291
|
+
if (pendingInboundBuffer.depth(selfAgentForFlush) > 0) {
|
|
1292
|
+
const fr = redeliverBufferedInbound(
|
|
1293
|
+
pendingInboundBuffer,
|
|
1294
|
+
selfAgentForFlush,
|
|
1295
|
+
(m) => ipcServer.sendToAgent(selfAgentForFlush, m),
|
|
1296
|
+
inboundSpool,
|
|
1297
|
+
)
|
|
1298
|
+
if (fr.redelivered > 0) {
|
|
1299
|
+
process.stderr.write(
|
|
1300
|
+
`telegram gateway: turn-complete flushed ${fr.redelivered}/${fr.drained} ` +
|
|
1301
|
+
`held inbound for ${selfAgentForFlush}` +
|
|
1302
|
+
`${fr.rebuffered > 0 ? ` (${fr.rebuffered} re-buffered)` : ''}\n`,
|
|
1303
|
+
)
|
|
1304
|
+
}
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1281
1307
|
if (pendingRestarts.size > 0) {
|
|
1282
1308
|
for (const [agentName, _timestamp] of pendingRestarts.entries()) {
|
|
1283
1309
|
triggerSelfRestart(agentName, 'turn-complete-pending-restart');
|
|
@@ -3011,6 +3037,7 @@ silencePoke.startTimer({
|
|
|
3011
3037
|
pendingInboundBuffer,
|
|
3012
3038
|
fbSelfAgent,
|
|
3013
3039
|
(m) => ipcServer.sendToAgent(fbSelfAgent, m),
|
|
3040
|
+
inboundSpool,
|
|
3014
3041
|
)
|
|
3015
3042
|
process.stderr.write(
|
|
3016
3043
|
`telegram gateway: silence-poke framework-fallback ended wedged turn ` +
|
|
@@ -3029,7 +3056,42 @@ silencePoke.startTimer({
|
|
|
3029
3056
|
// vault_request_access card during the 100ms bridge-reconnect window
|
|
3030
3057
|
// would mint the grant but silently drop the `vault_grant_approved`
|
|
3031
3058
|
// inbound, leaving the agent stuck waiting for a manual poke.
|
|
3032
|
-
|
|
3059
|
+
// Durable inbound spool on the persistent per-agent volume
|
|
3060
|
+
// (STATE_DIR = /state/agent/telegram in prod — survives container
|
|
3061
|
+
// recreate). Makes the "⏳ your message is queued and will be
|
|
3062
|
+
// processed when it reconnects" promise deterministic across a
|
|
3063
|
+
// gateway/container restart (finn/carrie lost-on-restart incident,
|
|
3064
|
+
// 2026-05-19). STATIC mode has no runtime/bridge, so no spool.
|
|
3065
|
+
const inboundSpool = STATIC
|
|
3066
|
+
? undefined
|
|
3067
|
+
: createInboundSpool({
|
|
3068
|
+
path: join(STATE_DIR, 'inbound-spool.jsonl'),
|
|
3069
|
+
fs: {
|
|
3070
|
+
appendFileSync: (p, d) => appendFileSync(p, d),
|
|
3071
|
+
readFileSync: (p) => readFileSync(p, 'utf8'),
|
|
3072
|
+
writeFileSync: (p, d) => writeFileSync(p, d),
|
|
3073
|
+
renameSync: (a, b) => renameSync(a, b),
|
|
3074
|
+
existsSync: (p) => existsSync(p),
|
|
3075
|
+
statSizeSync: (p) => statSync(p).size,
|
|
3076
|
+
},
|
|
3077
|
+
})
|
|
3078
|
+
const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
|
|
3079
|
+
// Boot-replay: re-queue every un-acked spooled inbound into the
|
|
3080
|
+
// in-memory buffer so the existing drain triggers (onClientRegistered
|
|
3081
|
+
// / silence-poke #1546 / idle-drain #1549) deliver them. push →
|
|
3082
|
+
// spool.put dedups on the already-live id, so this re-push does NOT
|
|
3083
|
+
// double-append. This is what makes a queued message survive a
|
|
3084
|
+
// restart instead of being silently lost.
|
|
3085
|
+
if (inboundSpool != null) {
|
|
3086
|
+
const replay = inboundSpool.liveEntries()
|
|
3087
|
+
for (const e of replay) pendingInboundBuffer.push(e.agent, e.msg)
|
|
3088
|
+
if (replay.length > 0) {
|
|
3089
|
+
process.stderr.write(
|
|
3090
|
+
`telegram gateway: inbound-spool boot-replay re-queued ${replay.length} ` +
|
|
3091
|
+
`un-acked inbound (durable-queue, survives restart)\n`,
|
|
3092
|
+
)
|
|
3093
|
+
}
|
|
3094
|
+
}
|
|
3033
3095
|
const pendingPermissionBuffer = createPendingPermissionBuffer()
|
|
3034
3096
|
|
|
3035
3097
|
/**
|
|
@@ -3080,6 +3142,12 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
3080
3142
|
for (const msg of pending) {
|
|
3081
3143
|
try {
|
|
3082
3144
|
client.send(msg)
|
|
3145
|
+
// Confirmed delivery to the just-registered live bridge →
|
|
3146
|
+
// tombstone the durable spool entry so it isn't boot-replayed
|
|
3147
|
+
// again. A throw below leaves it spooled (un-acked) so the
|
|
3148
|
+
// idle-drain / escalation path still recovers it — strictly
|
|
3149
|
+
// safer than the old log-and-drop.
|
|
3150
|
+
inboundSpool?.ack(msg)
|
|
3083
3151
|
} catch (err) {
|
|
3084
3152
|
process.stderr.write(
|
|
3085
3153
|
`telegram gateway: pending-inbound drain failed agent=${client.agentName} ` +
|
|
@@ -3542,12 +3610,17 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
3542
3610
|
//
|
|
3543
3611
|
// This is the third drain trigger. It's gated to be zero-cost and
|
|
3544
3612
|
// zero-churn: skip entirely when nothing is buffered (one Map.get, no
|
|
3545
|
-
// log)
|
|
3546
|
-
//
|
|
3547
|
-
//
|
|
3548
|
-
//
|
|
3549
|
-
//
|
|
3550
|
-
//
|
|
3613
|
+
// log), when the bridge isn't alive (exactly sendToAgent's own guard —
|
|
3614
|
+
// so we never drain into a dead bridge and re-buffer/log-spin), OR
|
|
3615
|
+
// when a turn is in flight. The turn gate is #1556: a message
|
|
3616
|
+
// delivered while a turn is active is NOT safely queued by the bridge
|
|
3617
|
+
// — claude types it into its TUI composer and the auto-submit races
|
|
3618
|
+
// turn-completion, stranding it (the lawgpt wedge). Draining only at
|
|
3619
|
+
// `activeTurnStartedAt.size === 0` guarantees the channel notification
|
|
3620
|
+
// lands at an idle prompt and submits as a fresh turn. Only when there
|
|
3621
|
+
// IS a buffered message AND a live bridge AND no active turn do we
|
|
3622
|
+
// reuse the #1546 `redeliverBufferedInbound` (lossless: re-buffers any
|
|
3623
|
+
// per-message miss).
|
|
3551
3624
|
const IDLE_DRAIN_INTERVAL_MS = 5000
|
|
3552
3625
|
if (!STATIC) {
|
|
3553
3626
|
setInterval(() => {
|
|
@@ -3556,10 +3629,14 @@ if (!STATIC) {
|
|
|
3556
3629
|
pendingInboundBuffer,
|
|
3557
3630
|
selfAgent,
|
|
3558
3631
|
() => {
|
|
3632
|
+
// #1556: never drain mid-turn — that re-creates the composer
|
|
3633
|
+
// wedge this buffer exists to prevent.
|
|
3634
|
+
if (activeTurnStartedAt.size > 0) return false
|
|
3559
3635
|
const c = ipcServer.getClient(selfAgent)
|
|
3560
3636
|
return c != null && c.isAlive()
|
|
3561
3637
|
},
|
|
3562
3638
|
(m) => ipcServer.sendToAgent(selfAgent, m),
|
|
3639
|
+
inboundSpool,
|
|
3563
3640
|
)
|
|
3564
3641
|
if (r != null && r.redelivered > 0) {
|
|
3565
3642
|
process.stderr.write(
|
|
@@ -3568,6 +3645,28 @@ if (!STATIC) {
|
|
|
3568
3645
|
`${r.rebuffered > 0 ? ` (${r.rebuffered} re-buffered)` : ''}\n`,
|
|
3569
3646
|
)
|
|
3570
3647
|
}
|
|
3648
|
+
// Bounded escalation: a spooled inbound still un-acked past its
|
|
3649
|
+
// bound (default 15 min — well past the 5-min silence-poke ladder)
|
|
3650
|
+
// is undeliverable in practice. Retract the "will be processed"
|
|
3651
|
+
// promise EXPLICITLY (honest failure) instead of letting it sit
|
|
3652
|
+
// forever. This is what makes the guarantee deterministic: every
|
|
3653
|
+
// queued message ends either delivered or visibly retracted.
|
|
3654
|
+
inboundSpool?.sweepEscalations((e) => {
|
|
3655
|
+
const chat = e.msg.chatId
|
|
3656
|
+
const threadOpts =
|
|
3657
|
+
typeof e.msg.meta?.threadId === 'string' && e.msg.meta.threadId
|
|
3658
|
+
? { message_thread_id: Number(e.msg.meta.threadId) }
|
|
3659
|
+
: {}
|
|
3660
|
+
void swallowingApiCall(
|
|
3661
|
+
() =>
|
|
3662
|
+
bot.api.sendMessage(
|
|
3663
|
+
chat,
|
|
3664
|
+
"⚠️ I couldn't deliver an earlier message to the agent after repeated retries (it survived restarts but the agent never picked it up). Please resend it.",
|
|
3665
|
+
{ ...threadOpts },
|
|
3666
|
+
),
|
|
3667
|
+
{ chat_id: chat, verb: 'inbound-spool-escalation' },
|
|
3668
|
+
)
|
|
3669
|
+
})
|
|
3571
3670
|
}, IDLE_DRAIN_INTERVAL_MS).unref()
|
|
3572
3671
|
}
|
|
3573
3672
|
|
|
@@ -7377,6 +7476,29 @@ async function handleInbound(
|
|
|
7377
7476
|
// push to pendingInboundBuffer, which onClientRegistered drains on
|
|
7378
7477
|
// the next bridge register — so the notice below is now truthful.
|
|
7379
7478
|
const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
7479
|
+
|
|
7480
|
+
// #1556: turn-gated delivery. A non-steering inbound that arrives
|
|
7481
|
+
// mid-turn must NOT be sent to the bridge now — claude would type it
|
|
7482
|
+
// into its TUI composer and the auto-submit races turn-completion,
|
|
7483
|
+
// stranding the message (the lawgpt wedge, 2026-05-19). Buffer it;
|
|
7484
|
+
// `purgeReactionTracking`'s turn-complete hook and the turn-gated
|
|
7485
|
+
// idle-drain flush it the instant claude goes idle, where the channel
|
|
7486
|
+
// notification submits cleanly as a fresh turn. Steering messages are
|
|
7487
|
+
// exempt — reaching claude mid-turn is the whole point of /steer.
|
|
7488
|
+
if (
|
|
7489
|
+
decideInboundDelivery({
|
|
7490
|
+
turnInFlight: activeTurnStartedAt.size > 0,
|
|
7491
|
+
isSteering,
|
|
7492
|
+
}) === 'buffer-until-idle'
|
|
7493
|
+
) {
|
|
7494
|
+
pendingInboundBuffer.push(selfAgent, inboundMsg)
|
|
7495
|
+
process.stderr.write(
|
|
7496
|
+
`telegram gateway: inbound held mid-turn agent=${selfAgent} ` +
|
|
7497
|
+
`chat=${chat_id} msg=${msgId ?? '-'} — will flush on turn-complete\n`,
|
|
7498
|
+
)
|
|
7499
|
+
return
|
|
7500
|
+
}
|
|
7501
|
+
|
|
7380
7502
|
const delivered = ipcServer.sendToAgent(selfAgent, inboundMsg)
|
|
7381
7503
|
if (!delivered) {
|
|
7382
7504
|
pendingInboundBuffer.push(selfAgent, inboundMsg)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inbound delivery gate (#1556 — the lawgpt composer-wedge).
|
|
3
|
+
*
|
|
4
|
+
* Pure decision: given the live turn state, should a freshly-received
|
|
5
|
+
* Telegram inbound be delivered to the bridge *now*, or held in the
|
|
6
|
+
* pending-inbound buffer until claude is idle?
|
|
7
|
+
*
|
|
8
|
+
* ## Why this exists
|
|
9
|
+
*
|
|
10
|
+
* The gateway used to `ipcServer.sendToAgent(inbound)` unconditionally,
|
|
11
|
+
* buffering ONLY when the bridge was offline. The load-bearing (and
|
|
12
|
+
* false) assumption — stated verbatim in three places before this fix
|
|
13
|
+
* (`pending-inbound-buffer.ts`, the idle-drain comment, and the
|
|
14
|
+
* implicit unconditional send) — was:
|
|
15
|
+
*
|
|
16
|
+
* "a message delivered while a turn is active is queued normally by
|
|
17
|
+
* the bridge, same as a live arrival, not lost."
|
|
18
|
+
*
|
|
19
|
+
* It is not. The bridge converts an inbound into an MCP
|
|
20
|
+
* `notifications/claude/channel` notification (`bridge.ts:onInbound`).
|
|
21
|
+
* When claude receives that notification mid-turn, the unmodified CLI
|
|
22
|
+
* types the text into its TUI composer and relies on an auto-submit
|
|
23
|
+
* once the turn ends. That submit races turn-completion and frequently
|
|
24
|
+
* does not fire — the message strands in the composer, claude sits at
|
|
25
|
+
* an idle prompt with the user's instruction un-actioned, and nothing
|
|
26
|
+
* self-heals it (the turn-active watchdog only catches *in-turn* hangs;
|
|
27
|
+
* this is *between-turns*-with-undelivered-input, which reads as
|
|
28
|
+
* healthy idle). Observed live: agent `lawgpt`, 2026-05-19 — a
|
|
29
|
+
* follow-up message sat unsubmitted indefinitely; only a restart
|
|
30
|
+
* cleared it, and the restart *lost* the message.
|
|
31
|
+
*
|
|
32
|
+
* ## The deterministic guarantee
|
|
33
|
+
*
|
|
34
|
+
* A non-steering inbound on the Telegram `handleInbound` path is
|
|
35
|
+
* delivered to the bridge ONLY when no turn is in flight. The channel
|
|
36
|
+
* notification therefore always lands at an idle claude prompt, where
|
|
37
|
+
* it submits cleanly as a fresh turn. It can be *delayed* (until the
|
|
38
|
+
* current turn completes) but can never strand in the composer. The
|
|
39
|
+
* turn-complete hook (`purgeReactionTracking`) and the turn-gated
|
|
40
|
+
* idle-drain timer flush the buffer the instant
|
|
41
|
+
* `activeTurnStartedAt.size === 0`.
|
|
42
|
+
*
|
|
43
|
+
* Scope: this gates the Telegram `handleInbound` path only — the one
|
|
44
|
+
* the lawgpt wedge hit. The `inject_inbound` IPC path (cron / synthetic
|
|
45
|
+
* operator wakeups) reaches the bridge directly and is deliberately
|
|
46
|
+
* NOT gated here: cron fires carry at-least-once replay semantics and
|
|
47
|
+
* their delivery contract is a separate product decision, out of scope
|
|
48
|
+
* for this bug.
|
|
49
|
+
*
|
|
50
|
+
* ## Steering is deliberately exempt
|
|
51
|
+
*
|
|
52
|
+
* An explicit `/steer` (`/s`) message is *meant* to reach claude
|
|
53
|
+
* mid-turn — that is the whole point of the steering feature (redirect
|
|
54
|
+
* the agent while it works). Steering messages keep immediate delivery.
|
|
55
|
+
* The wedge only ever affected the queued-mid-turn default path.
|
|
56
|
+
*/
|
|
57
|
+
|
|
58
|
+
/** Live state the delivery gate needs to rule on one freshly received inbound. */
export interface InboundDeliveryGateInput {
  /**
   * Whether a turn is in flight at the moment of delivery (the gateway passes
   * `activeTurnStartedAt.size > 0`). Evaluate it at delivery time rather than
   * snapshotting it at receipt, so a turn that finished in between reads as idle.
   */
  turnInFlight: boolean
  /**
   * Whether the inbound carried an explicit `/steer` (`/s`) prefix, i.e. it is
   * an intentional mid-turn redirect.
   */
  isSteering: boolean
}

/**
 * Outcome of the gate:
 * - `'deliver'` — send to the bridge now (idle prompt, or an intentional steer).
 * - `'buffer-until-idle'` — hold in the pending-inbound buffer until the
 *   turn-complete hook / turn-gated idle-drain flushes it.
 */
export type InboundDeliveryDecision = 'deliver' | 'buffer-until-idle'

/**
 * Pure decision function with no side effects.
 *
 * Delivery is deferred in exactly one situation: a turn is currently in flight
 * and the message is not a steering message. An idle agent gets the message
 * immediately, and a steering message is delivered even mid-turn because
 * reaching the agent mid-turn is its purpose.
 *
 * @param input - Live turn state plus the steering flag for this inbound.
 * @returns `'buffer-until-idle'` when delivery must wait, otherwise `'deliver'`.
 */
export function decideInboundDelivery(
  input: InboundDeliveryGateInput,
): InboundDeliveryDecision {
  const { turnInFlight, isSteering } = input
  const mustWaitForIdle = turnInFlight && !isSteering
  return mustWaitForIdle ? 'buffer-until-idle' : 'deliver'
}
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* inbound-spool.ts — durable, crash-tolerant spool for buffered inbound.
|
|
3
|
+
*
|
|
4
|
+
* Why this exists: `pending-inbound-buffer.ts` is in-memory only. A
|
|
5
|
+
* gateway/container restart (switchroom update, agent restart, a
|
|
6
|
+
* self-restart, an OOM) destroys it — so the user-facing promise
|
|
7
|
+
* "⏳ your message is queued and will be processed when it reconnects"
|
|
8
|
+
* (gateway.ts) is a lie across a restart. Proven twice: finn and
|
|
9
|
+
* carrie (2026-05-19) lost the user's message on restart and the user
|
|
10
|
+
* had to resend. #1546/#1549 only shrank the in-memory delivery
|
|
11
|
+
* window; they cannot survive process death.
|
|
12
|
+
*
|
|
13
|
+
* This module makes the promise DETERMINISTIC: every buffered inbound
|
|
14
|
+
* is also appended to a JSONL spool on the persistent per-agent volume
|
|
15
|
+
* (`/state/agent/telegram/…`, survives container recreate). On boot the
|
|
16
|
+
* gateway replays un-acked entries back into the in-memory buffer, so
|
|
17
|
+
* the existing drain machinery delivers them. An entry is acked (and
|
|
18
|
+
* tombstoned) ONLY on confirmed delivery to a live registered bridge.
|
|
19
|
+
* Un-acked entries older than `escalateAfterMs` are surfaced to the
|
|
20
|
+
* user via an explicit "couldn't deliver — resend?" callback and then
|
|
21
|
+
* dropped: the promise is then ALWAYS resolved — kept, or visibly
|
|
22
|
+
* retracted — never silently lost.
|
|
23
|
+
*
|
|
24
|
+
* Scope (v1): the ack is "delivered to a live registered bridge", not
|
|
25
|
+
* "claude consumed it". A true claude→gateway consumption-ack needs a
|
|
26
|
+
* new bidirectional bridge protocol (high blast radius) and is a
|
|
27
|
+
* documented follow-up. v1 already eliminates the silent-loss-on-
|
|
28
|
+
* restart class — the actual incident class.
|
|
29
|
+
*
|
|
30
|
+
* Crash-consistency: append-only JSONL, one self-contained JSON object
|
|
31
|
+
* per line, written with a trailing newline in a single `appendFileSync`
|
|
32
|
+
* (atomic for small writes on local fs). A torn final line on a crash
|
|
33
|
+
* mid-write is tolerated: replay skips any line that does not
|
|
34
|
+
* round-trip `JSON.parse` + shape-check. Acks are themselves appended
|
|
35
|
+
* as tombstone lines (`{t:"ack",id}`) rather than rewriting the file;
|
|
36
|
+
* a bounded `compact()` rewrites the file dropping acked/escalated ids
|
|
37
|
+
* when it grows past `compactAtBytes`.
|
|
38
|
+
*
|
|
39
|
+
* This module is PURE w.r.t. its injected fs + clock seams so the
|
|
40
|
+
* crash/dedup/replay/escalation logic is unit-tested without a real
|
|
41
|
+
* gateway (mirrors the #1544/#1546/#1549 pure-seam idiom).
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
import type { InboundMessage } from './ipc-protocol.js'
|
|
45
|
+
|
|
46
|
+
/** Stable dedup id for an inbound. Real Telegram messages have a
|
|
47
|
+
* unique (chatId, messageId). Synthetic/cron inbounds use messageId
|
|
48
|
+
* 0 — fall back to a deterministic id from source+ts so retried
|
|
49
|
+
* synthetics of the SAME logical event dedup, but distinct events
|
|
50
|
+
* (different ts) do not collapse. */
|
|
51
|
+
/**
 * Stable dedup id for an inbound.
 *
 * Messages with a positive numeric `messageId` are keyed by
 * `(chatId, messageId)`. Anything else (synthetic/cron inbounds use
 * messageId 0) falls back to `chatId + meta.source + ts`, so a retry of the
 * SAME logical event dedups while distinct events (different ts) do not
 * collapse.
 *
 * @param msg - The inbound to derive an id for.
 * @returns `m:<chatId>:<messageId>` or `s:<chatId>:<source|->:<ts>`.
 */
export function spoolId(msg: InboundMessage): string {
  if (typeof msg.messageId === 'number' && msg.messageId > 0) {
    return `m:${msg.chatId}:${msg.messageId}`
  }
  const src = msg.meta?.source ?? '-'
  return `s:${msg.chatId}:${src}:${msg.ts}`
}

/** One JSONL line in the spool file: either a `put` or an `ack` tombstone. */
interface SpoolRecord {
  t: 'put' | 'ack'
  id: string
  /** Present only on `put`. The full inbound to replay. */
  msg?: InboundMessage
  /** Present only on `put`. Owning agent (replay re-pushes per agent). */
  agent?: string
  /** Present only on `put`. ms epoch first-spooled — drives escalation. */
  firstAt?: number
}

/** Injected filesystem seam, so the spool logic can be tested without real I/O. */
export interface InboundSpoolFsSeam {
  /** Append `data` to `path`. Used for every put/ack record. */
  appendFileSync: (path: string, data: string) => void
  /** Return the whole file as a string. Used once, at construction. */
  readFileSync: (path: string) => string
  /** Overwrite `path` with `data`. Used only for the compaction temp file. */
  writeFileSync: (path: string, data: string) => void
  /** Atomic same-dir replace (POSIX rename). Used so compaction can't
   *  lose entries to a crash mid-rewrite. */
  renameSync: (from: string, to: string) => void
  existsSync: (path: string) => boolean
  /** Size of the file; compared against `compactAtBytes`. */
  statSizeSync: (path: string) => number
}

export interface InboundSpoolOptions {
  /** Location of the JSONL spool file. */
  path: string
  fs: InboundSpoolFsSeam
  /** Clock seam (ms epoch). Defaults to `Date.now`. */
  now?: () => number
  /** Log sink. Defaults to writing the line to stderr. */
  log?: (line: string) => void
  /** Un-acked entries older than this are escalated then dropped.
   *  Default 15 min — comfortably past the 5-min silence-poke ladder
   *  so self-heal gets every chance before we retract the promise. */
  escalateAfterMs?: number
  /** Rewrite-compact the JSONL once it exceeds this. Default 256 KiB. */
  compactAtBytes?: number
}

export interface ReplayEntry {
  agent: string
  msg: InboundMessage
}

export interface InboundSpool {
  /** Durably record `msg` for `agent`. Idempotent by spoolId: a
   *  re-spool of an already-live id is a no-op (returns false). */
  put: (agent: string, msg: InboundMessage) => boolean
  /** Tombstone the entry for `msg` — call ONLY on confirmed delivery to a
   *  live registered bridge. Idempotent; an unknown id is ignored. */
  ack: (msg: InboundMessage) => void
  /** Live (un-acked) entries, oldest first. Used at boot to re-push
   *  into the in-memory buffer. Pure read — does not mutate. */
  liveEntries: () => ReplayEntry[]
  /** Escalate+drop entries older than `escalateAfterMs`. Calls
   *  `onEscalate` once per dropped entry (post the "couldn't deliver"
   *  card there). Returns the count escalated. Safe to call on a timer. */
  sweepEscalations: (onEscalate: (e: ReplayEntry) => void) => number
  /** Test/observability: count of live (un-acked) ids. */
  liveCount: () => number
}

/**
 * Create a durable, append-only JSONL spool for buffered inbound messages.
 *
 * The file at `opts.path` is read once at construction to rebuild the set of
 * live (un-acked) entries; after that every `put` / `ack` / escalation
 * appends one record. Filesystem failures are logged and never thrown, so a
 * broken spool degrades to in-memory-only behaviour instead of breaking
 * delivery.
 *
 * @param opts - Spool path, fs seam, and optional clock / log / bounds.
 * @returns The spool handle.
 */
export function createInboundSpool(opts: InboundSpoolOptions): InboundSpool {
  const { path, fs } = opts
  const now = opts.now ?? Date.now
  const log = opts.log ?? ((l: string) => process.stderr.write(l))
  const escalateAfterMs = opts.escalateAfterMs ?? 15 * 60 * 1000
  const compactAtBytes = opts.compactAtBytes ?? 256 * 1024

  // In-memory projection of the on-disk log, rebuilt from the file at
  // construction. `live` maps spoolId → the put record (insertion order
  // preserved via the Map). An `ack` deletes from `live`.
  const live = new Map<string, { agent: string; msg: InboundMessage; firstAt: number }>()

  // True whenever the file might not end in '\n': a torn final line left by
  // a crash mid-append, an unreadable file, or a failed append of our own.
  // The next append is then prefixed with '\n' so the new record starts on
  // its own line. Without this, the first record written after a torn tail
  // is glued onto the fragment, fails JSON.parse at the next hydrate, and is
  // silently lost. A redundant blank line is harmless (parseLine skips it).
  let tailMayBeTorn = false

  /** Render a caught value for logging without assuming it is an Error. */
  function errText(err: unknown): string {
    return err instanceof Error ? err.message : String(err)
  }

  /** Parse + shape-check one line; returns null for anything unusable. */
  function parseLine(line: string): SpoolRecord | null {
    const s = line.trim()
    if (!s) return null
    let rec: unknown
    try {
      rec = JSON.parse(s)
    } catch {
      return null // torn / partial line from a crash mid-append — skip
    }
    if (rec == null || typeof rec !== 'object') return null
    const r = rec as Record<string, unknown>
    if (r.t !== 'put' && r.t !== 'ack') return null
    if (typeof r.id !== 'string' || r.id.length === 0) return null
    if (r.t === 'put') {
      if (r.msg == null || typeof r.msg !== 'object') return null
      if (typeof r.agent !== 'string' || r.agent.length === 0) return null
      if (typeof r.firstAt !== 'number') return null
    }
    return r as unknown as SpoolRecord
  }

  /** Rebuild `live` from the file. Tolerates a torn last line. */
  function hydrate(): void {
    live.clear()
    tailMayBeTorn = false
    if (!fs.existsSync(path)) return
    let raw = ''
    try {
      raw = fs.readFileSync(path)
    } catch {
      // We cannot see how the file ends — assume the worst for appends.
      tailMayBeTorn = true
      return
    }
    tailMayBeTorn = raw.length > 0 && !raw.endsWith('\n')
    for (const line of raw.split('\n')) {
      const rec = parseLine(line)
      if (rec == null) continue
      if (rec.t === 'put') {
        // Last put for an id wins; an ack later removes it.
        live.set(rec.id, {
          agent: rec.agent as string,
          msg: rec.msg as InboundMessage,
          firstAt: rec.firstAt as number,
        })
      } else {
        live.delete(rec.id)
      }
    }
  }

  /** Append one record as a single newline-terminated write. Never throws. */
  function appendRecord(rec: SpoolRecord): void {
    const lead = tailMayBeTorn ? '\n' : ''
    try {
      fs.appendFileSync(path, lead + JSON.stringify(rec) + '\n')
      tailMayBeTorn = false
    } catch (err) {
      // The write may have landed partially; isolate whatever comes next.
      tailMayBeTorn = true
      // Durability is best-effort relative to fs availability; a spool
      // write failure must NOT break live delivery. Log loudly — a
      // persistently failing spool means we're back to in-memory-only
      // semantics and the operator should know.
      log(
        `inbound-spool: append FAILED path=${path} id=${rec.id} t=${rec.t}: ` +
          `${errText(err)} — durability degraded to in-memory\n`,
      )
    }
  }

  /** Rewrite the file as exactly the live set once it exceeds `compactAtBytes`. */
  function maybeCompact(): void {
    let size = 0
    try {
      size = fs.existsSync(path) ? fs.statSizeSync(path) : 0
    } catch {
      return
    }
    if (size <= compactAtBytes) return
    // One put per live id, no acks. ATOMIC: write a sibling tmp then rename
    // over the real path, so a crash at any point leaves EITHER the full
    // pre-compaction log OR the full compacted log on disk — never a
    // truncated file that loses live entries.
    const lines: string[] = []
    for (const [id, e] of live) {
      lines.push(
        JSON.stringify({ t: 'put', id, agent: e.agent, msg: e.msg, firstAt: e.firstAt } satisfies SpoolRecord),
      )
    }
    const tmp = path + '.compact.tmp'
    try {
      fs.writeFileSync(tmp, lines.length ? lines.join('\n') + '\n' : '')
      fs.renameSync(tmp, path)
      // The freshly written file is empty or newline-terminated.
      tailMayBeTorn = false
      log(`inbound-spool: compacted path=${path} live=${live.size}\n`)
    } catch (err) {
      // Compaction is opportunistic — a failure keeps the (larger but
      // correct) append-only log; never lose data trying to shrink it.
      log(`inbound-spool: compact FAILED path=${path}: ${errText(err)}\n`)
    }
  }

  hydrate()

  return {
    put(agent, msg) {
      const id = spoolId(msg)
      if (live.has(id)) return false // dedup: already spooled & un-acked
      const firstAt = now()
      live.set(id, { agent, msg, firstAt })
      appendRecord({ t: 'put', id, agent, msg, firstAt })
      maybeCompact()
      return true
    },
    ack(msg) {
      const id = spoolId(msg)
      if (!live.has(id)) return // idempotent / unknown id
      live.delete(id)
      appendRecord({ t: 'ack', id })
      maybeCompact()
    },
    liveEntries() {
      // Insertion order = Map iteration order = oldest first.
      return [...live.values()].map((e) => ({ agent: e.agent, msg: e.msg }))
    },
    sweepEscalations(onEscalate) {
      const cutoff = now() - escalateAfterMs
      let n = 0
      // Iterate a snapshot: entries are deleted from `live` inside the loop.
      for (const [id, e] of [...live.entries()]) {
        if (e.firstAt > cutoff) continue
        live.delete(id)
        appendRecord({ t: 'ack', id }) // tombstone — promise retracted
        try {
          onEscalate({ agent: e.agent, msg: e.msg })
        } catch (err) {
          // A failing callback must not stop the sweep for other entries.
          log(`inbound-spool: onEscalate threw id=${id}: ${errText(err)}\n`)
        }
        n++
      }
      if (n > 0) {
        log(`inbound-spool: escalated+dropped ${n} undelivered entr${n === 1 ? 'y' : 'ies'} (older than ${escalateAfterMs}ms)\n`)
        maybeCompact()
      }
      return n
    },
    liveCount() {
      return live.size
    },
  }
}
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
*/
|
|
31
31
|
|
|
32
32
|
import type { InboundMessage } from './ipc-protocol.js'
|
|
33
|
+
import type { InboundSpool } from './inbound-spool.js'
|
|
33
34
|
|
|
34
35
|
/** Default cap per agent. Tuned for `should fit a reasonable backlog of
|
|
35
36
|
* approval cards stacked while bridge is offline` but no more. */
|
|
@@ -52,6 +53,19 @@ export interface PendingInboundBuffer {
|
|
|
52
53
|
export interface PendingInboundBufferOptions {
|
|
53
54
|
capPerAgent?: number
|
|
54
55
|
log?: (line: string) => void
|
|
56
|
+
/**
|
|
57
|
+
* Durable spool. When set, every `push` is also recorded on the
|
|
58
|
+
* persistent per-agent volume so a gateway/container restart cannot
|
|
59
|
+
* silently lose the message (the finn/carrie incident class). The
|
|
60
|
+
* in-memory queue stays the hot path + cap; the spool is the
|
|
61
|
+
* crash-survivable record, acked only on confirmed delivery (by
|
|
62
|
+
* `redeliverBufferedInbound`/`idleDrainTick`), boot-replayed by the
|
|
63
|
+
* gateway, and escalated-then-dropped if undeliverable past its
|
|
64
|
+
* bound. The in-memory cap eviction does NOT touch the spool — an
|
|
65
|
+
* evicted-from-memory entry survives in the spool (strictly safer
|
|
66
|
+
* than the old silent in-memory drop).
|
|
67
|
+
*/
|
|
68
|
+
spool?: InboundSpool
|
|
55
69
|
}
|
|
56
70
|
|
|
57
71
|
/**
|
|
@@ -72,6 +86,7 @@ export function redeliverBufferedInbound(
|
|
|
72
86
|
buffer: PendingInboundBuffer,
|
|
73
87
|
agent: string,
|
|
74
88
|
send: (msg: InboundMessage) => boolean,
|
|
89
|
+
spool?: InboundSpool,
|
|
75
90
|
): { drained: number; redelivered: number; rebuffered: number } {
|
|
76
91
|
const pending = buffer.drain(agent)
|
|
77
92
|
let redelivered = 0
|
|
@@ -85,6 +100,11 @@ export function redeliverBufferedInbound(
|
|
|
85
100
|
}
|
|
86
101
|
if (delivered) {
|
|
87
102
|
redelivered++
|
|
103
|
+
// Confirmed delivery to a live registered bridge → the durable
|
|
104
|
+
// promise is kept; tombstone the spool entry so it is NOT
|
|
105
|
+
// boot-replayed again. A miss leaves it spooled (re-pushed below
|
|
106
|
+
// AND still live in the spool) for the next drain / escalation.
|
|
107
|
+
spool?.ack(msg)
|
|
88
108
|
} else {
|
|
89
109
|
buffer.push(agent, msg)
|
|
90
110
|
rebuffered++
|
|
@@ -107,8 +127,19 @@ export function redeliverBufferedInbound(
|
|
|
107
127
|
* which would re-buffer+log-spin every tick; onClientRegistered
|
|
108
128
|
* will drain on the eventual reconnect instead)
|
|
109
129
|
* - otherwise → `redeliverBufferedInbound` (lossless: re-buffers any
|
|
110
|
-
* per-message miss).
|
|
111
|
-
*
|
|
130
|
+
* per-message miss).
|
|
131
|
+
*
|
|
132
|
+
* NOTE (#1556): a message delivered mid-turn is NOT safely queued by
|
|
133
|
+
* the bridge — the prior "queued normally, same as a live arrival"
|
|
134
|
+
* claim here was the false assumption behind the lawgpt composer
|
|
135
|
+
* wedge. claude types a mid-turn channel notification into its TUI
|
|
136
|
+
* composer and the auto-submit races turn-completion, stranding it.
|
|
137
|
+
* The `idleDrainTick` caller therefore also gates on
|
|
138
|
+
* `activeTurnStartedAt.size === 0`, so this function is never invoked
|
|
139
|
+
* mid-turn. The Telegram `handleInbound` delivery path is turn-gated
|
|
140
|
+
* (gateway.ts); the `inject_inbound` cron/synthetic path is a separate
|
|
141
|
+
* delivery contract and deliberately not gated — see
|
|
142
|
+
* `inbound-delivery-gate.ts`.
|
|
112
143
|
*
|
|
113
144
|
* Returns the redeliver counts only when it actually ran, else null
|
|
114
145
|
* (so the caller logs only on a real flush).
|
|
@@ -118,11 +149,12 @@ export function idleDrainTick(
|
|
|
118
149
|
agent: string,
|
|
119
150
|
isBridgeAlive: () => boolean,
|
|
120
151
|
send: (msg: InboundMessage) => boolean,
|
|
152
|
+
spool?: InboundSpool,
|
|
121
153
|
): { drained: number; redelivered: number; rebuffered: number } | null {
|
|
122
154
|
if (!agent) return null
|
|
123
155
|
if (buffer.depth(agent) === 0) return null
|
|
124
156
|
if (!isBridgeAlive()) return null
|
|
125
|
-
return redeliverBufferedInbound(buffer, agent, send)
|
|
157
|
+
return redeliverBufferedInbound(buffer, agent, send, spool)
|
|
126
158
|
}
|
|
127
159
|
|
|
128
160
|
export function createPendingInboundBuffer(
|
|
@@ -130,6 +162,7 @@ export function createPendingInboundBuffer(
|
|
|
130
162
|
): PendingInboundBuffer {
|
|
131
163
|
const cap = opts.capPerAgent ?? DEFAULT_PENDING_INBOUND_CAP
|
|
132
164
|
const log = opts.log ?? ((line: string) => process.stderr.write(line))
|
|
165
|
+
const spool = opts.spool
|
|
133
166
|
const queues = new Map<string, InboundMessage[]>()
|
|
134
167
|
|
|
135
168
|
return {
|
|
@@ -149,6 +182,12 @@ export function createPendingInboundBuffer(
|
|
|
149
182
|
)
|
|
150
183
|
}
|
|
151
184
|
q.push(msg)
|
|
185
|
+
// Durable record FIRST-class to the in-memory queue: spool BEFORE
|
|
186
|
+
// returning, regardless of the cap eviction above — an entry the
|
|
187
|
+
// in-memory cap drops still survives in the spool (boot-replayed /
|
|
188
|
+
// escalated), which is the whole point. spool.put dedups by
|
|
189
|
+
// spoolId so a boot-replay re-push is a no-op here.
|
|
190
|
+
spool?.put(agent, msg)
|
|
152
191
|
log(
|
|
153
192
|
`pending-inbound-buffer: agent=${agent} buffered source=${msg.meta?.source ?? '-'} ` +
|
|
154
193
|
`depth_after=${q.length} evicted=${evicted}\n`,
|