switchroom 0.12.18 → 0.12.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ import { execFileSync, execSync, spawn } from 'child_process'
17
17
  import {
18
18
  readFileSync, writeFileSync, mkdirSync, readdirSync, rmSync,
19
19
  statSync, renameSync, realpathSync, chmodSync, openSync, closeSync,
20
- existsSync, unlinkSync,
20
+ existsSync, unlinkSync, appendFileSync,
21
21
  } from 'fs'
22
22
  import { homedir } from 'os'
23
23
  import { join, extname, sep, basename } from 'path'
@@ -249,6 +249,8 @@ import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js
249
249
  import { handleRequestDriveApproval } from './drive-write-approval.js'
250
250
  import { buildDiffPreviewCard } from './diff-preview-card.js'
251
251
  import { createPendingInboundBuffer, redeliverBufferedInbound, idleDrainTick } from './pending-inbound-buffer.js'
252
+ import { createInboundSpool } from './inbound-spool.js'
253
+ import { decideInboundDelivery } from './inbound-delivery-gate.js'
252
254
  import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
253
255
  import {
254
256
  buildVaultGrantApprovedInbound,
@@ -1278,6 +1280,30 @@ function purgeReactionTracking(key: string): void {
1278
1280
  // response to the client was already sent when the restart was
1279
1281
  // scheduled, so nobody is waiting on this.
1280
1282
  if (activeTurnStartedAt.size === 0) {
1283
+ // #1556: the deterministic delivery point. claude has just gone
1284
+ // idle — flush any inbound held mid-turn so the channel
1285
+ // notification lands at the idle prompt and submits as a fresh
1286
+ // turn (instead of stranding in the composer, the lawgpt wedge).
1287
+ // Zero-churn: depth check first, no work on the common empty path.
1288
+ // Lossless: redeliver re-buffers any per-message miss (bridge
1289
+ // mid-reconnect), which onClientRegistered then drains.
1290
+ const selfAgentForFlush = process.env.SWITCHROOM_AGENT_NAME ?? ''
1291
+ if (pendingInboundBuffer.depth(selfAgentForFlush) > 0) {
1292
+ const fr = redeliverBufferedInbound(
1293
+ pendingInboundBuffer,
1294
+ selfAgentForFlush,
1295
+ (m) => ipcServer.sendToAgent(selfAgentForFlush, m),
1296
+ inboundSpool,
1297
+ )
1298
+ if (fr.redelivered > 0) {
1299
+ process.stderr.write(
1300
+ `telegram gateway: turn-complete flushed ${fr.redelivered}/${fr.drained} ` +
1301
+ `held inbound for ${selfAgentForFlush}` +
1302
+ `${fr.rebuffered > 0 ? ` (${fr.rebuffered} re-buffered)` : ''}\n`,
1303
+ )
1304
+ }
1305
+ }
1306
+
1281
1307
  if (pendingRestarts.size > 0) {
1282
1308
  for (const [agentName, _timestamp] of pendingRestarts.entries()) {
1283
1309
  triggerSelfRestart(agentName, 'turn-complete-pending-restart');
@@ -3011,6 +3037,7 @@ silencePoke.startTimer({
3011
3037
  pendingInboundBuffer,
3012
3038
  fbSelfAgent,
3013
3039
  (m) => ipcServer.sendToAgent(fbSelfAgent, m),
3040
+ inboundSpool,
3014
3041
  )
3015
3042
  process.stderr.write(
3016
3043
  `telegram gateway: silence-poke framework-fallback ended wedged turn ` +
@@ -3029,7 +3056,42 @@ silencePoke.startTimer({
3029
3056
  // vault_request_access card during the 100ms bridge-reconnect window
3030
3057
  // would mint the grant but silently drop the `vault_grant_approved`
3031
3058
  // inbound, leaving the agent stuck waiting for a manual poke.
3032
- const pendingInboundBuffer = createPendingInboundBuffer()
3059
+ // Durable inbound spool on the persistent per-agent volume
3060
+ // (STATE_DIR = /state/agent/telegram in prod — survives container
3061
+ // recreate). Makes the "⏳ your message is queued and will be
3062
+ // processed when it reconnects" promise deterministic across a
3063
+ // gateway/container restart (finn/carrie lost-on-restart incident,
3064
+ // 2026-05-19). STATIC mode has no runtime/bridge, so no spool.
3065
+ const inboundSpool = STATIC
3066
+ ? undefined
3067
+ : createInboundSpool({
3068
+ path: join(STATE_DIR, 'inbound-spool.jsonl'),
3069
+ fs: {
3070
+ appendFileSync: (p, d) => appendFileSync(p, d),
3071
+ readFileSync: (p) => readFileSync(p, 'utf8'),
3072
+ writeFileSync: (p, d) => writeFileSync(p, d),
3073
+ renameSync: (a, b) => renameSync(a, b),
3074
+ existsSync: (p) => existsSync(p),
3075
+ statSizeSync: (p) => statSync(p).size,
3076
+ },
3077
+ })
3078
+ const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
3079
+ // Boot-replay: re-queue every un-acked spooled inbound into the
3080
+ // in-memory buffer so the existing drain triggers (onClientRegistered
3081
+ // / silence-poke #1546 / idle-drain #1549) deliver them. push →
3082
+ // spool.put dedups on the already-live id, so this re-push does NOT
3083
+ // double-append. This is what makes a queued message survive a
3084
+ // restart instead of being silently lost.
3085
+ if (inboundSpool != null) {
3086
+ const replay = inboundSpool.liveEntries()
3087
+ for (const e of replay) pendingInboundBuffer.push(e.agent, e.msg)
3088
+ if (replay.length > 0) {
3089
+ process.stderr.write(
3090
+ `telegram gateway: inbound-spool boot-replay re-queued ${replay.length} ` +
3091
+ `un-acked inbound (durable-queue, survives restart)\n`,
3092
+ )
3093
+ }
3094
+ }
3033
3095
  const pendingPermissionBuffer = createPendingPermissionBuffer()
3034
3096
 
3035
3097
  /**
@@ -3080,6 +3142,12 @@ const ipcServer: IpcServer = createIpcServer({
3080
3142
  for (const msg of pending) {
3081
3143
  try {
3082
3144
  client.send(msg)
3145
+ // Confirmed delivery to the just-registered live bridge →
3146
+ // tombstone the durable spool entry so it isn't boot-replayed
3147
+ // again. A throw below leaves it spooled (un-acked) so the
3148
+ // idle-drain / escalation path still recovers it — strictly
3149
+ // safer than the old log-and-drop.
3150
+ inboundSpool?.ack(msg)
3083
3151
  } catch (err) {
3084
3152
  process.stderr.write(
3085
3153
  `telegram gateway: pending-inbound drain failed agent=${client.agentName} ` +
@@ -3542,12 +3610,17 @@ const ipcServer: IpcServer = createIpcServer({
3542
3610
  //
3543
3611
  // This is the third drain trigger. It's gated to be zero-cost and
3544
3612
  // zero-churn: skip entirely when nothing is buffered (one Map.get, no
3545
- // log) or when the bridge isn't alive (exactly sendToAgent's own
3546
- // guard — so we never drain into a dead bridge and re-buffer/log-spin).
3547
- // Only when there IS a buffered message AND a live bridge do we reuse
3548
- // the #1546 `redeliverBufferedInbound` (lossless: re-buffers any
3549
- // per-message miss). A message delivered while a turn is active is
3550
- // queued normally by the bridge same as a live arrival, not lost.
3613
+ // log), when the bridge isn't alive (exactly sendToAgent's own guard —
3614
+ // so we never drain into a dead bridge and re-buffer/log-spin), OR
3615
+ // when a turn is in flight. The turn gate is #1556: a message
3616
+ // delivered while a turn is active is NOT safely queued by the bridge:
3617
+ // claude types it into its TUI composer and the auto-submit races
3618
+ // turn-completion, stranding it (the lawgpt wedge). Draining only at
3619
+ // `activeTurnStartedAt.size === 0` guarantees the channel notification
3620
+ // lands at an idle prompt and submits as a fresh turn. Only when there
3621
+ // IS a buffered message AND a live bridge AND no active turn do we
3622
+ // reuse the #1546 `redeliverBufferedInbound` (lossless: re-buffers any
3623
+ // per-message miss).
3551
3624
  const IDLE_DRAIN_INTERVAL_MS = 5000
3552
3625
  if (!STATIC) {
3553
3626
  setInterval(() => {
@@ -3556,10 +3629,14 @@ if (!STATIC) {
3556
3629
  pendingInboundBuffer,
3557
3630
  selfAgent,
3558
3631
  () => {
3632
+ // #1556: never drain mid-turn — that re-creates the composer
3633
+ // wedge this buffer exists to prevent.
3634
+ if (activeTurnStartedAt.size > 0) return false
3559
3635
  const c = ipcServer.getClient(selfAgent)
3560
3636
  return c != null && c.isAlive()
3561
3637
  },
3562
3638
  (m) => ipcServer.sendToAgent(selfAgent, m),
3639
+ inboundSpool,
3563
3640
  )
3564
3641
  if (r != null && r.redelivered > 0) {
3565
3642
  process.stderr.write(
@@ -3568,6 +3645,28 @@ if (!STATIC) {
3568
3645
  `${r.rebuffered > 0 ? ` (${r.rebuffered} re-buffered)` : ''}\n`,
3569
3646
  )
3570
3647
  }
3648
+ // Bounded escalation: a spooled inbound still un-acked past its
3649
+ // bound (default 15 min — well past the 5-min silence-poke ladder)
3650
+ // is undeliverable in practice. Retract the "will be processed"
3651
+ // promise EXPLICITLY (honest failure) instead of letting it sit
3652
+ // forever. This is what makes the guarantee deterministic: every
3653
+ // queued message ends either delivered or visibly retracted.
3654
+ inboundSpool?.sweepEscalations((e) => {
3655
+ const chat = e.msg.chatId
3656
+ const threadOpts =
3657
+ typeof e.msg.meta?.threadId === 'string' && e.msg.meta.threadId
3658
+ ? { message_thread_id: Number(e.msg.meta.threadId) }
3659
+ : {}
3660
+ void swallowingApiCall(
3661
+ () =>
3662
+ bot.api.sendMessage(
3663
+ chat,
3664
+ "⚠️ I couldn't deliver an earlier message to the agent after repeated retries (it survived restarts but the agent never picked it up). Please resend it.",
3665
+ { ...threadOpts },
3666
+ ),
3667
+ { chat_id: chat, verb: 'inbound-spool-escalation' },
3668
+ )
3669
+ })
3571
3670
  }, IDLE_DRAIN_INTERVAL_MS).unref()
3572
3671
  }
3573
3672
 
@@ -7377,6 +7476,29 @@ async function handleInbound(
7377
7476
  // push to pendingInboundBuffer, which onClientRegistered drains on
7378
7477
  // the next bridge register — so the notice below is now truthful.
7379
7478
  const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
7479
+
7480
+ // #1556: turn-gated delivery. A non-steering inbound that arrives
7481
+ // mid-turn must NOT be sent to the bridge now — claude would type it
7482
+ // into its TUI composer and the auto-submit races turn-completion,
7483
+ // stranding the message (the lawgpt wedge, 2026-05-19). Buffer it;
7484
+ // `purgeReactionTracking`'s turn-complete hook and the turn-gated
7485
+ // idle-drain flush it the instant claude goes idle, where the channel
7486
+ // notification submits cleanly as a fresh turn. Steering messages are
7487
+ // exempt — reaching claude mid-turn is the whole point of /steer.
7488
+ if (
7489
+ decideInboundDelivery({
7490
+ turnInFlight: activeTurnStartedAt.size > 0,
7491
+ isSteering,
7492
+ }) === 'buffer-until-idle'
7493
+ ) {
7494
+ pendingInboundBuffer.push(selfAgent, inboundMsg)
7495
+ process.stderr.write(
7496
+ `telegram gateway: inbound held mid-turn agent=${selfAgent} ` +
7497
+ `chat=${chat_id} msg=${msgId ?? '-'} — will flush on turn-complete\n`,
7498
+ )
7499
+ return
7500
+ }
7501
+
7380
7502
  const delivered = ipcServer.sendToAgent(selfAgent, inboundMsg)
7381
7503
  if (!delivered) {
7382
7504
  pendingInboundBuffer.push(selfAgent, inboundMsg)
@@ -0,0 +1,85 @@
1
+ /**
2
+ * Inbound delivery gate (#1556 — the lawgpt composer-wedge).
3
+ *
4
+ * Pure decision: given the live turn state, should a freshly-received
5
+ * Telegram inbound be delivered to the bridge *now*, or held in the
6
+ * pending-inbound buffer until claude is idle?
7
+ *
8
+ * ## Why this exists
9
+ *
10
+ * The gateway used to `ipcServer.sendToAgent(inbound)` unconditionally,
11
+ * buffering ONLY when the bridge was offline. The load-bearing (and
12
+ * false) assumption — stated verbatim in three places before this fix
13
+ * (`pending-inbound-buffer.ts`, the idle-drain comment, and the
14
+ * implicit unconditional send) — was:
15
+ *
16
+ * "a message delivered while a turn is active is queued normally by
17
+ * the bridge, same as a live arrival, not lost."
18
+ *
19
+ * It is not. The bridge converts an inbound into an MCP
20
+ * `notifications/claude/channel` notification (`bridge.ts:onInbound`).
21
+ * When claude receives that notification mid-turn, the unmodified CLI
22
+ * types the text into its TUI composer and relies on an auto-submit
23
+ * once the turn ends. That submit races turn-completion and frequently
24
+ * does not fire — the message strands in the composer, claude sits at
25
+ * an idle prompt with the user's instruction un-actioned, and nothing
26
+ * self-heals it (the turn-active watchdog only catches *in-turn* hangs;
27
+ * this is *between-turns*-with-undelivered-input, which reads as
28
+ * healthy idle). Observed live: agent `lawgpt`, 2026-05-19 — a
29
+ * follow-up message sat unsubmitted indefinitely; only a restart
30
+ * cleared it, and the restart *lost* the message.
31
+ *
32
+ * ## The deterministic guarantee
33
+ *
34
+ * A non-steering inbound on the Telegram `handleInbound` path is
35
+ * delivered to the bridge ONLY when no turn is in flight. The channel
36
+ * notification therefore always lands at an idle claude prompt, where
37
+ * it submits cleanly as a fresh turn. It can be *delayed* (until the
38
+ * current turn completes) but can never strand in the composer. The
39
+ * turn-complete hook (`purgeReactionTracking`) and the turn-gated
40
+ * idle-drain timer flush the buffer the instant
41
+ * `activeTurnStartedAt.size === 0`.
42
+ *
43
+ * Scope: this gates the Telegram `handleInbound` path only — the one
44
+ * the lawgpt wedge hit. The `inject_inbound` IPC path (cron / synthetic
45
+ * operator wakeups) reaches the bridge directly and is deliberately
46
+ * NOT gated here: cron fires carry at-least-once replay semantics and
47
+ * their delivery contract is a separate product decision, out of scope
48
+ * for this bug.
49
+ *
50
+ * ## Steering is deliberately exempt
51
+ *
52
+ * An explicit `/steer` (`/s`) message is *meant* to reach claude
53
+ * mid-turn — that is the whole point of the steering feature (redirect
54
+ * the agent while it works). Steering messages keep immediate delivery.
55
+ * The wedge only ever affected the queued-mid-turn default path.
56
+ */
57
+
58
/** Live state the delivery gate needs in order to decide. */
export interface InboundDeliveryGateInput {
  /**
   * True when a turn is in flight at the moment of delivery (the gateway
   * passes `activeTurnStartedAt.size > 0`). It must be evaluated at
   * delivery time rather than captured at receipt, so a turn that
   * finished in between reads as idle.
   */
  turnInFlight: boolean
  /**
   * True when the inbound carried an explicit `/steer` (`/s`) prefix,
   * i.e. it is an intentional mid-turn redirect.
   */
  isSteering: boolean
}

/**
 * Outcome of the gate.
 *
 * - `'deliver'`: send to the bridge now (idle prompt, or an intentional
 *   steer).
 * - `'buffer-until-idle'`: hold in the pending-inbound buffer; the
 *   turn-complete hook / turn-gated idle-drain flushes it once claude is
 *   idle.
 */
export type InboundDeliveryDecision = 'deliver' | 'buffer-until-idle'

/**
 * Pure decision function. Delivery is deferred in exactly one case: a
 * turn is in flight and the message is not a steering message. Every
 * other combination delivers immediately.
 *
 * @param input - Live turn state plus the steering flag for this inbound.
 * @returns `'buffer-until-idle'` for a non-steering inbound during a
 *   turn, otherwise `'deliver'`.
 */
export function decideInboundDelivery(
  input: InboundDeliveryGateInput,
): InboundDeliveryDecision {
  const { turnInFlight, isSteering } = input
  const mustWaitForIdle = turnInFlight && !isSteering
  return mustWaitForIdle ? 'buffer-until-idle' : 'deliver'
}
@@ -0,0 +1,272 @@
1
+ /**
2
+ * inbound-spool.ts — durable, crash-tolerant spool for buffered inbound.
3
+ *
4
+ * Why this exists: `pending-inbound-buffer.ts` is in-memory only. A
5
+ * gateway/container restart (switchroom update, agent restart, a
6
+ * self-restart, an OOM) destroys it — so the user-facing promise
7
+ * "⏳ your message is queued and will be processed when it reconnects"
8
+ * (gateway.ts) is a lie across a restart. Proven twice: finn and
9
+ * carrie (2026-05-19) lost the user's message on restart and the user
10
+ * had to resend. #1546/#1549 only shrank the in-memory delivery
11
+ * window; they cannot survive process death.
12
+ *
13
+ * This module makes the promise DETERMINISTIC: every buffered inbound
14
+ * is also appended to a JSONL spool on the persistent per-agent volume
15
+ * (`/state/agent/telegram/…`, survives container recreate). On boot the
16
+ * gateway replays un-acked entries back into the in-memory buffer, so
17
+ * the existing drain machinery delivers them. An entry is acked (and
18
+ * tombstoned) ONLY on confirmed delivery to a live registered bridge.
19
+ * Un-acked entries older than `escalateAfterMs` are surfaced to the
20
+ * user via an explicit "couldn't deliver — resend?" callback and then
21
+ * dropped: the promise is then ALWAYS resolved — kept, or visibly
22
+ * retracted — never silently lost.
23
+ *
24
+ * Scope (v1): the ack is "delivered to a live registered bridge", not
25
+ * "claude consumed it". A true claude→gateway consumption-ack needs a
26
+ * new bidirectional bridge protocol (high blast radius) and is a
27
+ * documented follow-up. v1 already eliminates the silent-loss-on-
28
+ * restart class — the actual incident class.
29
+ *
30
+ * Crash-consistency: append-only JSONL, one self-contained JSON object
31
+ * per line, written with a trailing newline in a single `appendFileSync`
32
+ * (atomic for small writes on local fs). A torn final line on a crash
33
+ * mid-write is tolerated: replay skips any line that does not
34
+ * round-trip `JSON.parse` + shape-check. Acks are themselves appended
35
+ * as tombstone lines (`{t:"ack",id}`) rather than rewriting the file;
36
+ * a bounded `compact()` rewrites the file dropping acked/escalated ids
37
+ * when it grows past `compactAtBytes`.
38
+ *
39
+ * This module is PURE w.r.t. its injected fs + clock seams so the
40
+ * crash/dedup/replay/escalation logic is unit-tested without a real
41
+ * gateway (mirrors the #1544/#1546/#1549 pure-seam idiom).
42
+ */
43
+
44
+ import type { InboundMessage } from './ipc-protocol.js'
45
+
46
/**
 * Stable dedup id for an inbound.
 *
 * A message with a positive numeric `messageId` is keyed by
 * `(chatId, messageId)`. Anything else (synthetic/cron inbounds use
 * messageId 0) falls back to `chatId + meta.source + ts`, so retries of
 * the SAME logical event dedup while distinct events (different ts) do
 * not collapse.
 *
 * @param msg - The inbound to key.
 * @returns `m:<chatId>:<messageId>` or `s:<chatId>:<source|->:<ts>`.
 */
export function spoolId(msg: InboundMessage): string {
  if (typeof msg.messageId === 'number' && msg.messageId > 0) {
    return `m:${msg.chatId}:${msg.messageId}`
  }
  const src = msg.meta?.source ?? '-'
  return `s:${msg.chatId}:${src}:${msg.ts}`
}

/** One JSONL line of the spool: a `put` (payload) or an `ack` (tombstone). */
interface SpoolRecord {
  t: 'put' | 'ack'
  id: string
  /** Present only on `put`. The full inbound to replay. */
  msg?: InboundMessage
  /** Present only on `put`. Owning agent (replay re-pushes per agent). */
  agent?: string
  /** Present only on `put`. ms epoch first-spooled — drives escalation. */
  firstAt?: number
}

/** Injected filesystem seam so the spool logic can run against a fake fs. */
export interface InboundSpoolFsSeam {
  appendFileSync: (path: string, data: string) => void
  readFileSync: (path: string) => string
  writeFileSync: (path: string, data: string) => void
  /** Atomic same-dir replace (POSIX rename). Used so compaction can't
   *  lose entries to a crash mid-rewrite. */
  renameSync: (from: string, to: string) => void
  existsSync: (path: string) => boolean
  statSizeSync: (path: string) => number
}

export interface InboundSpoolOptions {
  /** Location of the JSONL spool file. */
  path: string
  fs: InboundSpoolFsSeam
  /** Clock seam, ms epoch. Defaults to `Date.now`. */
  now?: () => number
  /** Log sink. Defaults to `process.stderr.write`. */
  log?: (line: string) => void
  /** Un-acked entries older than this are escalated then dropped.
   *  Default 15 min. */
  escalateAfterMs?: number
  /** Rewrite-compact the JSONL once it exceeds this. Default 256 KiB. */
  compactAtBytes?: number
}

export interface ReplayEntry {
  agent: string
  msg: InboundMessage
}

export interface InboundSpool {
  /** Durably record `msg` for `agent`. Idempotent by spoolId: a
   *  re-spool of an already-live id is a no-op (returns false). */
  put: (agent: string, msg: InboundMessage) => boolean
  /** Tombstone the live entry whose id is `spoolId(msg)` — call ONLY on
   *  confirmed delivery to a live registered bridge. Idempotent: an
   *  unknown or already-acked id is ignored and nothing is written. */
  ack: (msg: InboundMessage) => void
  /** Live (un-acked) entries, oldest first. Used at boot to re-push
   *  into the in-memory buffer. Pure read — does not mutate. */
  liveEntries: () => ReplayEntry[]
  /** Escalate+drop entries at least `escalateAfterMs` old. Calls
   *  `onEscalate` once per dropped entry. Returns the count escalated.
   *  Safe to call on a timer.
   *  NOTE(review): this removes the entry from the spool only; any copy
   *  held elsewhere (e.g. an in-memory buffer) is the caller's to drop —
   *  confirm the caller does so, or an escalated message may still be
   *  delivered later. */
  sweepEscalations: (onEscalate: (e: ReplayEntry) => void) => number
  /** Test/observability: count of live (un-acked) ids. */
  liveCount: () => number
}

/** Render any thrown value for a log line without assuming it is an Error
 *  (reading `.message` off `null`/`undefined` would itself throw). */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * Create a durable spool backed by an append-only JSONL file.
 *
 * The file is read once at construction to rebuild the live (un-acked)
 * set; afterwards every `put`/`ack`/escalation appends one line and the
 * in-memory projection is kept in step. Filesystem failures on append or
 * compaction are logged and never thrown, so a broken spool degrades to
 * in-memory-only behaviour instead of breaking the caller.
 *
 * @param opts - Spool path, fs seam and optional clock/log/threshold overrides.
 * @returns The spool handle.
 */
export function createInboundSpool(opts: InboundSpoolOptions): InboundSpool {
  const { path, fs } = opts
  const now = opts.now ?? Date.now
  const log = opts.log ?? ((l: string) => process.stderr.write(l))
  const escalateAfterMs = opts.escalateAfterMs ?? 15 * 60 * 1000
  const compactAtBytes = opts.compactAtBytes ?? 256 * 1024

  // In-memory projection of the on-disk log. Maps spoolId → put record;
  // Map iteration order is insertion order, which gives "oldest first".
  const live = new Map<string, { agent: string; msg: InboundMessage; firstAt: number }>()

  // True when the file's last byte is (or may be) something other than a
  // newline — i.e. a crash tore the final record. The next append must
  // start on a fresh line, otherwise it is glued onto the torn fragment
  // and the new, perfectly valid record becomes unparseable on the next
  // hydrate (a lost put or a lost ack).
  let needsLeadingNewline = false

  /** Parse + shape-check one line; null for blank, torn or malformed lines. */
  function parseLine(line: string): SpoolRecord | null {
    const s = line.trim()
    if (!s) return null
    let rec: unknown
    try {
      rec = JSON.parse(s)
    } catch {
      return null // torn / partial line from a crash mid-append — skip
    }
    if (rec == null || typeof rec !== 'object') return null
    const r = rec as Record<string, unknown>
    if (r.t !== 'put' && r.t !== 'ack') return null
    if (typeof r.id !== 'string' || r.id.length === 0) return null
    if (r.t === 'put') {
      if (r.msg == null || typeof r.msg !== 'object' || Array.isArray(r.msg)) return null
      if (typeof r.agent !== 'string' || r.agent.length === 0) return null
      // `1e999` parses to Infinity; a non-finite firstAt would never escalate.
      if (typeof r.firstAt !== 'number' || !Number.isFinite(r.firstAt)) return null
    }
    return r as unknown as SpoolRecord
  }

  /** Rebuild `live` from the file. Tolerates torn/malformed lines. */
  function hydrate(): void {
    live.clear()
    needsLeadingNewline = false
    if (!fs.existsSync(path)) return
    let raw: string
    try {
      raw = fs.readFileSync(path)
    } catch (err) {
      log(
        `inbound-spool: hydrate read FAILED path=${path}: ${describeError(err)} ` +
          `— starting with an empty live set\n`,
      )
      // Tail state is unknown; a spare blank line is harmless to replay.
      needsLeadingNewline = true
      return
    }
    needsLeadingNewline = raw.length > 0 && !raw.endsWith('\n')
    for (const line of raw.split('\n')) {
      const rec = parseLine(line)
      if (rec == null) continue
      if (rec.t === 'put') {
        // Last put for an id wins; an ack later removes it.
        live.set(rec.id, {
          agent: rec.agent as string,
          msg: rec.msg as InboundMessage,
          firstAt: rec.firstAt as number,
        })
      } else {
        live.delete(rec.id)
      }
    }
  }

  /** Append one record as a single write. Never throws. */
  function appendRecord(rec: SpoolRecord): void {
    try {
      const line = (needsLeadingNewline ? '\n' : '') + JSON.stringify(rec) + '\n'
      fs.appendFileSync(path, line)
      needsLeadingNewline = false
    } catch (err) {
      // A spool write failure must NOT break live delivery. Log loudly:
      // a persistently failing spool means in-memory-only semantics.
      log(
        `inbound-spool: append FAILED path=${path} id=${rec.id} t=${rec.t}: ` +
          `${describeError(err)} — durability degraded to in-memory\n`,
      )
    }
  }

  /** Rewrite the file as exactly the live set once it outgrows the threshold. */
  function maybeCompact(): void {
    let size = 0
    try {
      size = fs.existsSync(path) ? fs.statSizeSync(path) : 0
    } catch {
      return
    }
    if (size <= compactAtBytes) return
    // One put per live id, no acks. Written to a sibling tmp and renamed
    // over the real path so the swap happens in a single rename call.
    const lines: string[] = []
    for (const [id, e] of live) {
      const rec: SpoolRecord = { t: 'put', id, agent: e.agent, msg: e.msg, firstAt: e.firstAt }
      lines.push(JSON.stringify(rec))
    }
    const tmp = path + '.compact.tmp'
    try {
      fs.writeFileSync(tmp, lines.length ? lines.join('\n') + '\n' : '')
      fs.renameSync(tmp, path)
      // The rewritten file is empty or newline-terminated.
      needsLeadingNewline = false
      log(`inbound-spool: compacted path=${path} live=${live.size}\n`)
    } catch (err) {
      // Compaction is opportunistic — a failure keeps the (larger but
      // correct) append-only log; never lose data trying to shrink it.
      log(`inbound-spool: compact FAILED path=${path}: ${describeError(err)}\n`)
    }
  }

  hydrate()

  return {
    put(agent, msg) {
      const id = spoolId(msg)
      if (live.has(id)) return false // dedup: already spooled & un-acked
      const firstAt = now()
      live.set(id, { agent, msg, firstAt })
      appendRecord({ t: 'put', id, agent, msg, firstAt })
      maybeCompact()
      return true
    },
    ack(msg) {
      const id = spoolId(msg)
      if (!live.has(id)) return // idempotent / unknown id
      live.delete(id)
      appendRecord({ t: 'ack', id })
      maybeCompact()
    },
    liveEntries() {
      // Insertion order = Map iteration order = oldest first.
      return [...live.values()].map((e) => ({ agent: e.agent, msg: e.msg }))
    },
    sweepEscalations(onEscalate) {
      const cutoff = now() - escalateAfterMs
      let n = 0
      // Snapshot the entries: the loop deletes from `live` as it goes.
      for (const [id, e] of [...live.entries()]) {
        if (e.firstAt > cutoff) continue
        live.delete(id)
        appendRecord({ t: 'ack', id }) // tombstone — promise retracted
        try {
          onEscalate({ agent: e.agent, msg: e.msg })
        } catch (err) {
          log(`inbound-spool: onEscalate threw id=${id}: ${describeError(err)}\n`)
        }
        n++
      }
      if (n > 0) {
        log(`inbound-spool: escalated+dropped ${n} undelivered entr${n === 1 ? 'y' : 'ies'} (older than ${escalateAfterMs}ms)\n`)
        maybeCompact()
      }
      return n
    },
    liveCount() {
      return live.size
    },
  }
}
@@ -30,6 +30,7 @@
30
30
  */
31
31
 
32
32
  import type { InboundMessage } from './ipc-protocol.js'
33
+ import type { InboundSpool } from './inbound-spool.js'
33
34
 
34
35
  /** Default cap per agent. Tuned for `should fit a reasonable backlog of
35
36
  * approval cards stacked while bridge is offline` but no more. */
@@ -52,6 +53,19 @@ export interface PendingInboundBuffer {
52
53
  export interface PendingInboundBufferOptions {
53
54
  capPerAgent?: number
54
55
  log?: (line: string) => void
56
+ /**
57
+ * Durable spool. When set, every `push` is also recorded on the
58
+ * persistent per-agent volume so a gateway/container restart cannot
59
+ * silently lose the message (the finn/carrie incident class). The
60
+ * in-memory queue stays the hot path + cap; the spool is the
61
+ * crash-survivable record, acked only on confirmed delivery (by
62
+ * `redeliverBufferedInbound`/`idleDrainTick`), boot-replayed by the
63
+ * gateway, and escalated-then-dropped if undeliverable past its
64
+ * bound. The in-memory cap eviction does NOT touch the spool — an
65
+ * evicted-from-memory entry survives in the spool (strictly safer
66
+ * than the old silent in-memory drop).
67
+ */
68
+ spool?: InboundSpool
55
69
  }
56
70
 
57
71
  /**
@@ -72,6 +86,7 @@ export function redeliverBufferedInbound(
72
86
  buffer: PendingInboundBuffer,
73
87
  agent: string,
74
88
  send: (msg: InboundMessage) => boolean,
89
+ spool?: InboundSpool,
75
90
  ): { drained: number; redelivered: number; rebuffered: number } {
76
91
  const pending = buffer.drain(agent)
77
92
  let redelivered = 0
@@ -85,6 +100,11 @@ export function redeliverBufferedInbound(
85
100
  }
86
101
  if (delivered) {
87
102
  redelivered++
103
+ // Confirmed delivery to a live registered bridge → the durable
104
+ // promise is kept; tombstone the spool entry so it is NOT
105
+ // boot-replayed again. A miss leaves it spooled (re-pushed below
106
+ // AND still live in the spool) for the next drain / escalation.
107
+ spool?.ack(msg)
88
108
  } else {
89
109
  buffer.push(agent, msg)
90
110
  rebuffered++
@@ -107,8 +127,19 @@ export function redeliverBufferedInbound(
107
127
  * which would re-buffer+log-spin every tick; onClientRegistered
108
128
  * will drain on the eventual reconnect instead)
109
129
  * - otherwise → `redeliverBufferedInbound` (lossless: re-buffers any
110
- * per-message miss). A message delivered mid-turn is queued
111
- * normally by the bridge, same as a live arrival — not lost.
130
+ * per-message miss).
131
+ *
132
+ * NOTE (#1556): a message delivered mid-turn is NOT safely queued by
133
+ * the bridge — the prior "queued normally, same as a live arrival"
134
+ * claim here was the false assumption behind the lawgpt composer
135
+ * wedge. claude types a mid-turn channel notification into its TUI
136
+ * composer and the auto-submit races turn-completion, stranding it.
137
+ * The `idleDrainTick` caller therefore also gates on
138
+ * `activeTurnStartedAt.size === 0`, so this function is never invoked
139
+ * mid-turn. The Telegram `handleInbound` delivery path is turn-gated
140
+ * (gateway.ts); the `inject_inbound` cron/synthetic path is a separate
141
+ * delivery contract and deliberately not gated — see
142
+ * `inbound-delivery-gate.ts`.
112
143
  *
113
144
  * Returns the redeliver counts only when it actually ran, else null
114
145
  * (so the caller logs only on a real flush).
@@ -118,11 +149,12 @@ export function idleDrainTick(
118
149
  agent: string,
119
150
  isBridgeAlive: () => boolean,
120
151
  send: (msg: InboundMessage) => boolean,
152
+ spool?: InboundSpool,
121
153
  ): { drained: number; redelivered: number; rebuffered: number } | null {
122
154
  if (!agent) return null
123
155
  if (buffer.depth(agent) === 0) return null
124
156
  if (!isBridgeAlive()) return null
125
- return redeliverBufferedInbound(buffer, agent, send)
157
+ return redeliverBufferedInbound(buffer, agent, send, spool)
126
158
  }
127
159
 
128
160
  export function createPendingInboundBuffer(
@@ -130,6 +162,7 @@ export function createPendingInboundBuffer(
130
162
  ): PendingInboundBuffer {
131
163
  const cap = opts.capPerAgent ?? DEFAULT_PENDING_INBOUND_CAP
132
164
  const log = opts.log ?? ((line: string) => process.stderr.write(line))
165
+ const spool = opts.spool
133
166
  const queues = new Map<string, InboundMessage[]>()
134
167
 
135
168
  return {
@@ -149,6 +182,12 @@ export function createPendingInboundBuffer(
149
182
  )
150
183
  }
151
184
  q.push(msg)
185
+ // Durable record FIRST-class to the in-memory queue: spool BEFORE
186
+ // returning, regardless of the cap eviction above — an entry the
187
+ // in-memory cap drops still survives in the spool (boot-replayed /
188
+ // escalated), which is the whole point. spool.put dedups by
189
+ // spoolId so a boot-replay re-push is a no-op here.
190
+ spool?.put(agent, msg)
152
191
  log(
153
192
  `pending-inbound-buffer: agent=${agent} buffered source=${msg.meta?.source ?? '-'} ` +
154
193
  `depth_after=${q.length} evicted=${evicted}\n`,