switchroom 0.12.18 → 0.12.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,7 @@ import { execFileSync, execSync, spawn } from 'child_process'
17
17
  import {
18
18
  readFileSync, writeFileSync, mkdirSync, readdirSync, rmSync,
19
19
  statSync, renameSync, realpathSync, chmodSync, openSync, closeSync,
20
- existsSync, unlinkSync,
20
+ existsSync, unlinkSync, appendFileSync,
21
21
  } from 'fs'
22
22
  import { homedir } from 'os'
23
23
  import { join, extname, sep, basename } from 'path'
@@ -249,6 +249,9 @@ import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js
249
249
  import { handleRequestDriveApproval } from './drive-write-approval.js'
250
250
  import { buildDiffPreviewCard } from './diff-preview-card.js'
251
251
  import { createPendingInboundBuffer, redeliverBufferedInbound, idleDrainTick } from './pending-inbound-buffer.js'
252
+ import { createInboundSpool } from './inbound-spool.js'
253
+ import { purgeStaleTurnsForChat } from './turn-state-purge.js'
254
+ import { decideInboundDelivery } from './inbound-delivery-gate.js'
252
255
  import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
253
256
  import {
254
257
  buildVaultGrantApprovedInbound,
@@ -1278,6 +1281,30 @@ function purgeReactionTracking(key: string): void {
1278
1281
  // response to the client was already sent when the restart was
1279
1282
  // scheduled, so nobody is waiting on this.
1280
1283
  if (activeTurnStartedAt.size === 0) {
1284
+ // #1556: the deterministic delivery point. claude has just gone
1285
+ // idle — flush any inbound held mid-turn so the channel
1286
+ // notification lands at the idle prompt and submits as a fresh
1287
+ // turn (instead of stranding in the composer, the lawgpt wedge).
1288
+ // Zero-churn: depth check first, no work on the common empty path.
1289
+ // Lossless: redeliver re-buffers any per-message miss (bridge
1290
+ // mid-reconnect), which onClientRegistered then drains.
1291
+ const selfAgentForFlush = process.env.SWITCHROOM_AGENT_NAME ?? ''
1292
+ if (pendingInboundBuffer.depth(selfAgentForFlush) > 0) {
1293
+ const fr = redeliverBufferedInbound(
1294
+ pendingInboundBuffer,
1295
+ selfAgentForFlush,
1296
+ (m) => ipcServer.sendToAgent(selfAgentForFlush, m),
1297
+ inboundSpool,
1298
+ )
1299
+ if (fr.redelivered > 0) {
1300
+ process.stderr.write(
1301
+ `telegram gateway: turn-complete flushed ${fr.redelivered}/${fr.drained} ` +
1302
+ `held inbound for ${selfAgentForFlush}` +
1303
+ `${fr.rebuffered > 0 ? ` (${fr.rebuffered} re-buffered)` : ''}\n`,
1304
+ )
1305
+ }
1306
+ }
1307
+
1281
1308
  if (pendingRestarts.size > 0) {
1282
1309
  for (const [agentName, _timestamp] of pendingRestarts.entries()) {
1283
1310
  triggerSelfRestart(agentName, 'turn-complete-pending-restart');
@@ -1292,6 +1319,32 @@ function purgeReactionTracking(key: string): void {
1292
1319
  }
1293
1320
  }
1294
1321
 
1322
+ /**
1323
+ * Atomic null-and-purge for a wedged turn. Every site that ends a
1324
+ * turn by nulling `currentTurn` MUST also clear the turn's statusKey
1325
+ * from `activeTurnStartedAt` — else a dangling entry survives and
1326
+ * `#1556`'s turn-gate holds every new inbound mid-turn forever
1327
+ * (gymbro / klanker held-mid-turn symptom, 2026-05-20).
1328
+ *
1329
+ * Before this helper existed, three turn-end paths (silent-marker / turn-flush /
1330
+ * `turn_end`) nulled `currentTurn` on code-paths whose
1331
+ * `purgeReactionTracking` calls weren't reached on every branch,
1332
+ * leaving sibling entries under the turn's statusKey that the
1333
+ * silence-poke framework-fallback's `purgeReactionTracking(fbKey)`
1334
+ * couldn't catch (different key shape). The fallback now also sweeps
1335
+ * siblings for `fbChatId` (`turn-state-purge.ts`) as defense-in-depth,
1336
+ * but THIS helper closes the leak at origin: null and purge are
1337
+ * inseparable at every call site.
1338
+ *
1339
+ * Idempotent: a second purge is a no-op `.delete()` on a key already
1340
+ * gone — handlers that already purge elsewhere are unharmed.
1341
+ */
1342
+ function endCurrentTurnAtomic(turn: CurrentTurn): void {
1343
+ if (currentTurn !== turn) return
1344
+ currentTurn = null
1345
+ purgeReactionTracking(statusKey(turn.sessionChatId, turn.sessionThreadId))
1346
+ }
1347
+
1295
1348
  /**
1296
1349
  * Model-idle proactive-compaction check. Called ONLY from the
1297
1350
  * activeTurnStartedAt.size === 0 gate above (never mid-turn). Opt-in via
@@ -2985,6 +3038,23 @@ silencePoke.startTimer({
2985
3038
  // for this chat starts a fresh turn instead of queueing forever.
2986
3039
  silencePoke.endTurn(fbKey)
2987
3040
  purgeReactionTracking(fbKey)
3041
+ // Defense-in-depth: the fallback's purgeReactionTracking above
3042
+ // clears the canonical statusKey(chatId, threadId) for fbKey
3043
+ // only. activeTurnStartedAt can hold sibling entries for the
3044
+ // SAME chat (different threads, or a `null` vs `undefined`-thread
3045
+ // variant left over from a normal turn-end path that nulled
3046
+ // currentTurn without invoking purgeReactionTracking — the
3047
+ // gymbro/klanker held-mid-turn symptom, 2026-05-20). Any sibling
3048
+ // for fbChatId is by definition stale when THIS fallback fires
3049
+ // (the chat has been silent ≥5 min); sweep them via the same
3050
+ // purger. Multi-chat-safe — only touches keys for fbChatId, so
3051
+ // #1546's intentional cross-chat safety guard is preserved.
3052
+ // See turn-state-purge.ts.
3053
+ const fbExtraPurge = purgeStaleTurnsForChat(
3054
+ fbChatId,
3055
+ activeTurnStartedAt.keys(),
3056
+ purgeReactionTracking,
3057
+ )
2988
3058
  // Null `currentTurn` if it's still pointing at the wedged turn —
2989
3059
  // when claude eventually fires a late `turn_end` for this session
2990
3060
  // (or never does), the handler's `const turn = currentTurn` snapshot
@@ -3011,13 +3081,15 @@ silencePoke.startTimer({
3011
3081
  pendingInboundBuffer,
3012
3082
  fbSelfAgent,
3013
3083
  (m) => ipcServer.sendToAgent(fbSelfAgent, m),
3084
+ inboundSpool,
3014
3085
  )
3015
3086
  process.stderr.write(
3016
3087
  `telegram gateway: silence-poke framework-fallback ended wedged turn ` +
3017
3088
  `chat=${fbChatId} thread=${ctx.threadId ?? '-'} silence_ms=${ctx.silenceMs} ` +
3018
3089
  `currentTurn_nulled=${turnMatchesFallback} ` +
3019
3090
  `drained_buffered=${fbRedeliver.redelivered}/${fbRedeliver.drained}` +
3020
- `${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}\n`,
3091
+ `${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}` +
3092
+ `${fbExtraPurge.purged.length > 0 ? ` extra_keys_purged=${fbExtraPurge.purged.length}` : ''}\n`,
3021
3093
  )
3022
3094
  },
3023
3095
  })
@@ -3029,7 +3101,42 @@ silencePoke.startTimer({
3029
3101
  // vault_request_access card during the 100ms bridge-reconnect window
3030
3102
  // would mint the grant but silently drop the `vault_grant_approved`
3031
3103
  // inbound, leaving the agent stuck waiting for a manual poke.
3032
- const pendingInboundBuffer = createPendingInboundBuffer()
3104
+ // Durable inbound spool on the persistent per-agent volume
3105
+ // (STATE_DIR = /state/agent/telegram in prod — survives container
3106
+ // recreate). Makes the "⏳ your message is queued and will be
3107
+ // processed when it reconnects" promise deterministic across a
3108
+ // gateway/container restart (finn/carrie lost-on-restart incident,
3109
+ // 2026-05-19). STATIC mode has no runtime/bridge, so no spool.
3110
+ const inboundSpool = STATIC
3111
+ ? undefined
3112
+ : createInboundSpool({
3113
+ path: join(STATE_DIR, 'inbound-spool.jsonl'),
3114
+ fs: {
3115
+ appendFileSync: (p, d) => appendFileSync(p, d),
3116
+ readFileSync: (p) => readFileSync(p, 'utf8'),
3117
+ writeFileSync: (p, d) => writeFileSync(p, d),
3118
+ renameSync: (a, b) => renameSync(a, b),
3119
+ existsSync: (p) => existsSync(p),
3120
+ statSizeSync: (p) => statSync(p).size,
3121
+ },
3122
+ })
3123
+ const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
3124
+ // Boot-replay: re-queue every un-acked spooled inbound into the
3125
+ // in-memory buffer so the existing drain triggers (onClientRegistered
3126
+ // / silence-poke #1546 / idle-drain #1549) deliver them. push →
3127
+ // spool.put dedups on the already-live id, so this re-push does NOT
3128
+ // double-append. This is what makes a queued message survive a
3129
+ // restart instead of being silently lost.
3130
+ if (inboundSpool != null) {
3131
+ const replay = inboundSpool.liveEntries()
3132
+ for (const e of replay) pendingInboundBuffer.push(e.agent, e.msg)
3133
+ if (replay.length > 0) {
3134
+ process.stderr.write(
3135
+ `telegram gateway: inbound-spool boot-replay re-queued ${replay.length} ` +
3136
+ `un-acked inbound (durable-queue, survives restart)\n`,
3137
+ )
3138
+ }
3139
+ }
3033
3140
  const pendingPermissionBuffer = createPendingPermissionBuffer()
3034
3141
 
3035
3142
  /**
@@ -3080,6 +3187,12 @@ const ipcServer: IpcServer = createIpcServer({
3080
3187
  for (const msg of pending) {
3081
3188
  try {
3082
3189
  client.send(msg)
3190
+ // Confirmed delivery to the just-registered live bridge →
3191
+ // tombstone the durable spool entry so it isn't boot-replayed
3192
+ // again. If `client.send` throws, the ack is skipped and it stays spooled (un-acked) so the
3193
+ // idle-drain / escalation path still recovers it — strictly
3194
+ // safer than the old log-and-drop.
3195
+ inboundSpool?.ack(msg)
3083
3196
  } catch (err) {
3084
3197
  process.stderr.write(
3085
3198
  `telegram gateway: pending-inbound drain failed agent=${client.agentName} ` +
@@ -3542,12 +3655,17 @@ const ipcServer: IpcServer = createIpcServer({
3542
3655
  //
3543
3656
  // This is the third drain trigger. It's gated to be zero-cost and
3544
3657
  // zero-churn: skip entirely when nothing is buffered (one Map.get, no
3545
- // log) or when the bridge isn't alive (exactly sendToAgent's own
3546
- // guard — so we never drain into a dead bridge and re-buffer/log-spin).
3547
- // Only when there IS a buffered message AND a live bridge do we reuse
3548
- // the #1546 `redeliverBufferedInbound` (lossless: re-buffers any
3549
- // per-message miss). A message delivered while a turn is active is
3550
- // queued normally by the bridge same as a live arrival, not lost.
3658
+ // log), when the bridge isn't alive (exactly sendToAgent's own guard —
3659
+ // so we never drain into a dead bridge and re-buffer/log-spin), OR
3660
+ // when a turn is in flight. The turn gate is #1556: a message
3661
+ // delivered while a turn is active is NOT safely queued by the bridge —
3662
+ // claude types it into its TUI composer and the auto-submit races
3663
+ // turn-completion, stranding it (the lawgpt wedge). Draining only at
3664
+ // `activeTurnStartedAt.size === 0` guarantees the channel notification
3665
+ // lands at an idle prompt and submits as a fresh turn. Only when there
3666
+ // IS a buffered message AND a live bridge AND no active turn do we
3667
+ // reuse the #1546 `redeliverBufferedInbound` (lossless: re-buffers any
3668
+ // per-message miss).
3551
3669
  const IDLE_DRAIN_INTERVAL_MS = 5000
3552
3670
  if (!STATIC) {
3553
3671
  setInterval(() => {
@@ -3556,10 +3674,14 @@ if (!STATIC) {
3556
3674
  pendingInboundBuffer,
3557
3675
  selfAgent,
3558
3676
  () => {
3677
+ // #1556: never drain mid-turn — that re-creates the composer
3678
+ // wedge this buffer exists to prevent.
3679
+ if (activeTurnStartedAt.size > 0) return false
3559
3680
  const c = ipcServer.getClient(selfAgent)
3560
3681
  return c != null && c.isAlive()
3561
3682
  },
3562
3683
  (m) => ipcServer.sendToAgent(selfAgent, m),
3684
+ inboundSpool,
3563
3685
  )
3564
3686
  if (r != null && r.redelivered > 0) {
3565
3687
  process.stderr.write(
@@ -3568,6 +3690,28 @@ if (!STATIC) {
3568
3690
  `${r.rebuffered > 0 ? ` (${r.rebuffered} re-buffered)` : ''}\n`,
3569
3691
  )
3570
3692
  }
3693
+ // Bounded escalation: a spooled inbound still un-acked past its
3694
+ // bound (default 15 min — well past the 5-min silence-poke ladder)
3695
+ // is undeliverable in practice. Retract the "will be processed"
3696
+ // promise EXPLICITLY (honest failure) instead of letting it sit
3697
+ // forever. This is what makes the guarantee deterministic: every
3698
+ // queued message ends either delivered or visibly retracted.
3699
+ inboundSpool?.sweepEscalations((e) => {
3700
+ const chat = e.msg.chatId
3701
+ const threadOpts =
3702
+ typeof e.msg.meta?.threadId === 'string' && e.msg.meta.threadId
3703
+ ? { message_thread_id: Number(e.msg.meta.threadId) }
3704
+ : {}
3705
+ void swallowingApiCall(
3706
+ () =>
3707
+ bot.api.sendMessage(
3708
+ chat,
3709
+ "⚠️ I couldn't deliver an earlier message to the agent after repeated retries (it survived restarts but the agent never picked it up). Please resend it.",
3710
+ { ...threadOpts },
3711
+ ),
3712
+ { chat_id: chat, verb: 'inbound-spool-escalation' },
3713
+ )
3714
+ })
3571
3715
  }, IDLE_DRAIN_INTERVAL_MS).unref()
3572
3716
  }
3573
3717
 
@@ -5587,7 +5731,7 @@ function handleSessionEvent(ev: SessionEvent): void {
5587
5731
  turn.answerStream = null
5588
5732
  }
5589
5733
  // Null the atom — this turn is being abandoned.
5590
- if (currentTurn === turn) currentTurn = null
5734
+ endCurrentTurnAtomic(turn)
5591
5735
  // #549 fix — context-exhaustion teardown also resets preamble state.
5592
5736
  preambleSuppressor.reset()
5593
5737
  }
@@ -5785,7 +5929,7 @@ function handleSessionEvent(ev: SessionEvent): void {
5785
5929
  // returns early at handler entry. A new `enqueue` swaps in a
5786
5930
  // fresh atom; the silent-turn teardown doesn't need to preserve
5787
5931
  // any of the prior turn's state.
5788
- if (currentTurn === turn) currentTurn = null
5932
+ endCurrentTurnAtomic(turn)
5789
5933
  // #549 fix — silent-marker teardown drops any pending preamble.
5790
5934
  preambleSuppressor.dropNow()
5791
5935
  return
@@ -5819,7 +5963,7 @@ function handleSessionEvent(ev: SessionEvent): void {
5819
5963
  // sendMessage await for this turn will see currentTurn == null
5820
5964
  // and bail; a new enqueue will swap in a fresh atom. The
5821
5965
  // `backstop*` locals above hold everything the IIFE needs.
5822
- if (currentTurn === turn) currentTurn = null
5966
+ endCurrentTurnAtomic(turn)
5823
5967
  // #549 fix — turn-flush takes ownership of the captured-text
5824
5968
  // backup; reset the preamble buffer (its content is already in
5825
5969
  // the captured `capturedText`, which turn-flush is about to send).
@@ -6091,7 +6235,7 @@ function handleSessionEvent(ev: SessionEvent): void {
6091
6235
  // #1067: null the atom in one assignment, replacing the seven
6092
6236
  // field clears the pre-refactor version did. Any late-arriving
6093
6237
  // event for this turn will see currentTurn == null and bail.
6094
- if (currentTurn === turn) currentTurn = null
6238
+ endCurrentTurnAtomic(turn)
6095
6239
  // #549 fix — preamble flush already happened at the TOP of this
6096
6240
  // turn_end handler (before turn.answerStream is nulled). See
6097
6241
  // comment near line 3431.
@@ -7377,6 +7521,29 @@ async function handleInbound(
7377
7521
  // push to pendingInboundBuffer, which onClientRegistered drains on
7378
7522
  // the next bridge register — so the notice below is now truthful.
7379
7523
  const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
7524
+
7525
+ // #1556: turn-gated delivery. A non-steering inbound that arrives
7526
+ // mid-turn must NOT be sent to the bridge now — claude would type it
7527
+ // into its TUI composer and the auto-submit races turn-completion,
7528
+ // stranding the message (the lawgpt wedge, 2026-05-19). Buffer it;
7529
+ // `purgeReactionTracking`'s turn-complete hook and the turn-gated
7530
+ // idle-drain flush it the instant claude goes idle, where the channel
7531
+ // notification submits cleanly as a fresh turn. Steering messages are
7532
+ // exempt — reaching claude mid-turn is the whole point of /steer.
7533
+ if (
7534
+ decideInboundDelivery({
7535
+ turnInFlight: activeTurnStartedAt.size > 0,
7536
+ isSteering,
7537
+ }) === 'buffer-until-idle'
7538
+ ) {
7539
+ pendingInboundBuffer.push(selfAgent, inboundMsg)
7540
+ process.stderr.write(
7541
+ `telegram gateway: inbound held mid-turn agent=${selfAgent} ` +
7542
+ `chat=${chat_id} msg=${msgId ?? '-'} — will flush on turn-complete\n`,
7543
+ )
7544
+ return
7545
+ }
7546
+
7380
7547
  const delivered = ipcServer.sendToAgent(selfAgent, inboundMsg)
7381
7548
  if (!delivered) {
7382
7549
  pendingInboundBuffer.push(selfAgent, inboundMsg)
@@ -0,0 +1,85 @@
1
+ /**
2
+ * Inbound delivery gate (#1556 — the lawgpt composer-wedge).
3
+ *
4
+ * Pure decision: given the live turn state, should a freshly-received
5
+ * Telegram inbound be delivered to the bridge *now*, or held in the
6
+ * pending-inbound buffer until claude is idle?
7
+ *
8
+ * ## Why this exists
9
+ *
10
+ * The gateway used to `ipcServer.sendToAgent(inbound)` unconditionally,
11
+ * buffering ONLY when the bridge was offline. The load-bearing (and
12
+ * false) assumption — stated or implied in three places before this fix
13
+ * (`pending-inbound-buffer.ts`, the idle-drain comment, and the
14
+ * implicit unconditional send) — was:
15
+ *
16
+ * "a message delivered while a turn is active is queued normally by
17
+ * the bridge, same as a live arrival, not lost."
18
+ *
19
+ * It is not. The bridge converts an inbound into an MCP
20
+ * `notifications/claude/channel` notification (`bridge.ts:onInbound`).
21
+ * When claude receives that notification mid-turn, the unmodified CLI
22
+ * types the text into its TUI composer and relies on an auto-submit
23
+ * once the turn ends. That submit races turn-completion and frequently
24
+ * does not fire — the message strands in the composer, claude sits at
25
+ * an idle prompt with the user's instruction un-actioned, and nothing
26
+ * self-heals it (the turn-active watchdog only catches *in-turn* hangs;
27
+ * this is *between-turns*-with-undelivered-input, which reads as
28
+ * healthy idle). Observed live: agent `lawgpt`, 2026-05-19 — a
29
+ * follow-up message sat unsubmitted indefinitely; only a restart
30
+ * cleared it, and the restart *lost* the message.
31
+ *
32
+ * ## The deterministic guarantee
33
+ *
34
+ * A non-steering inbound on the Telegram `handleInbound` path is
35
+ * delivered to the bridge ONLY when no turn is in flight. The channel
36
+ * notification therefore always lands at an idle claude prompt, where
37
+ * it submits cleanly as a fresh turn. It can be *delayed* (until the
38
+ * current turn completes) but can never strand in the composer. The
39
+ * turn-complete hook (`purgeReactionTracking`) and the turn-gated
40
+ * idle-drain timer flush the buffer the instant
41
+ * `activeTurnStartedAt.size === 0`.
42
+ *
43
+ * Scope: this gates the Telegram `handleInbound` path only — the one
44
+ * the lawgpt wedge hit. The `inject_inbound` IPC path (cron / synthetic
45
+ * operator wakeups) reaches the bridge directly and is deliberately
46
+ * NOT gated here: cron fires carry at-least-once replay semantics and
47
+ * their delivery contract is a separate product decision, out of scope
48
+ * for this bug.
49
+ *
50
+ * ## Steering is deliberately exempt
51
+ *
52
+ * An explicit `/steer` (`/s`) message is *meant* to reach claude
53
+ * mid-turn — that is the whole point of the steering feature (redirect
54
+ * the agent while it works). Steering messages keep immediate delivery.
55
+ * The wedge only ever affected the queued-mid-turn default path.
56
+ */
57
+
58
+ export interface InboundDeliveryGateInput {
59
+ /** A turn is in flight RIGHT NOW (live: `activeTurnStartedAt.size > 0`),
60
+ * evaluated at delivery time — not a receipt-time snapshot, so a turn
61
+ * that completed between receipt and here correctly reads as idle. */
62
+ turnInFlight: boolean
63
+ /** This inbound carried an explicit `/steer` (`/s`) prefix and is an
64
+ * intentional mid-turn redirect. */
65
+ isSteering: boolean
66
+ }
67
+
68
+ export type InboundDeliveryDecision =
69
+ /** Send to the bridge now (idle prompt, or an intentional steer). */
70
+ | 'deliver'
71
+ /** Hold in the pending-inbound buffer; the turn-complete hook /
72
+ * turn-gated idle-drain flushes it when claude goes idle. */
73
+ | 'buffer-until-idle'
74
+
75
+ /**
76
+ * Pure. The ONLY condition that defers delivery is "a turn is in flight
77
+ * AND this is not a steering message". Everything else delivers
78
+ * immediately (idle → submits at once; steering → intentional mid-turn).
79
+ */
80
+ export function decideInboundDelivery(
81
+ input: InboundDeliveryGateInput,
82
+ ): InboundDeliveryDecision {
83
+ if (input.turnInFlight && !input.isSteering) return 'buffer-until-idle'
84
+ return 'deliver'
85
+ }
@@ -0,0 +1,272 @@
1
+ /**
2
+ * inbound-spool.ts — durable, crash-tolerant spool for buffered inbound.
3
+ *
4
+ * Why this exists: `pending-inbound-buffer.ts` is in-memory only. A
5
+ * gateway/container restart (switchroom update, agent restart, a
6
+ * self-restart, an OOM) destroys it — so the user-facing promise
7
+ * "⏳ your message is queued and will be processed when it reconnects"
8
+ * (gateway.ts) is a lie across a restart. Proven twice: finn and
9
+ * carrie (2026-05-19) lost the user's message on restart and the user
10
+ * had to resend. #1546/#1549 only shrank the in-memory delivery
11
+ * window; they cannot survive process death.
12
+ *
13
+ * This module makes the promise DETERMINISTIC: every buffered inbound
14
+ * is also appended to a JSONL spool on the persistent per-agent volume
15
+ * (`/state/agent/telegram/…`, survives container recreate). On boot the
16
+ * gateway replays un-acked entries back into the in-memory buffer, so
17
+ * the existing drain machinery delivers them. An entry is acked (and
18
+ * tombstoned) ONLY on confirmed delivery to a live registered bridge.
19
+ * Un-acked entries older than `escalateAfterMs` are surfaced to the
20
+ * user via an explicit "couldn't deliver — resend?" callback and then
21
+ * dropped: the promise is then ALWAYS resolved — kept, or visibly
22
+ * retracted — never silently lost.
23
+ *
24
+ * Scope (v1): the ack is "delivered to a live registered bridge", not
25
+ * "claude consumed it". A true claude→gateway consumption-ack needs a
26
+ * new bidirectional bridge protocol (high blast radius) and is a
27
+ * documented follow-up. v1 already eliminates the silent-loss-on-
28
+ * restart class — the actual incident class.
29
+ *
30
+ * Crash-consistency: append-only JSONL, one self-contained JSON object
31
+ * per line, written with a trailing newline in a single `appendFileSync`
32
+ * (atomic for small writes on local fs). A torn final line on a crash
33
+ * mid-write is tolerated: replay skips any line that does not
34
+ * round-trip `JSON.parse` + shape-check. Acks are themselves appended
35
+ * as tombstone lines (`{t:"ack",id}`) rather than rewriting the file;
36
+ * a bounded `maybeCompact()` rewrites the file dropping acked/escalated ids
37
+ * when it grows past `compactAtBytes`.
38
+ *
39
+ * This module is PURE w.r.t. its injected fs + clock seams so the
40
+ * crash/dedup/replay/escalation logic is unit-tested without a real
41
+ * gateway (mirrors the #1544/#1546/#1549 pure-seam idiom).
42
+ */
43
+
44
+ import type { InboundMessage } from './ipc-protocol.js'
45
+
46
+ /** Stable dedup id for an inbound. Real Telegram messages have a
47
+ * unique (chatId, messageId). Synthetic/cron inbounds use messageId
48
+ * 0 — fall back to a deterministic id from source+ts so retried
49
+ * synthetics of the SAME logical event dedup, but distinct events
50
+ * (different ts) do not collapse. */
51
+ export function spoolId(msg: InboundMessage): string {
52
+ if (typeof msg.messageId === 'number' && msg.messageId > 0) {
53
+ return `m:${msg.chatId}:${msg.messageId}`
54
+ }
55
+ const src = msg.meta?.source ?? '-'
56
+ return `s:${msg.chatId}:${src}:${msg.ts}`
57
+ }
58
+
59
+ interface SpoolRecord {
60
+ t: 'put' | 'ack'
61
+ id: string
62
+ /** Present only on `put`. The full inbound to replay. */
63
+ msg?: InboundMessage
64
+ /** Present only on `put`. Owning agent (replay re-pushes per agent). */
65
+ agent?: string
66
+ /** Present only on `put`. ms epoch first-spooled — drives escalation. */
67
+ firstAt?: number
68
+ }
69
+
70
+ export interface InboundSpoolFsSeam {
71
+ appendFileSync: (path: string, data: string) => void
72
+ readFileSync: (path: string) => string
73
+ writeFileSync: (path: string, data: string) => void
74
+ /** Atomic same-dir replace (POSIX rename). Used so compaction can't
75
+ * lose entries to a crash mid-rewrite. */
76
+ renameSync: (from: string, to: string) => void
77
+ existsSync: (path: string) => boolean
78
+ statSizeSync: (path: string) => number
79
+ }
80
+
81
+ export interface InboundSpoolOptions {
82
+ path: string
83
+ fs: InboundSpoolFsSeam
84
+ now?: () => number
85
+ log?: (line: string) => void
86
+ /** Un-acked entries older than this are escalated then dropped.
87
+ * Default 15 min — comfortably past the 5-min silence-poke ladder
88
+ * so self-heal gets every chance before we retract the promise. */
89
+ escalateAfterMs?: number
90
+ /** Rewrite-compact the JSONL once it exceeds this. Default 256 KiB. */
91
+ compactAtBytes?: number
92
+ }
93
+
94
+ export interface ReplayEntry {
95
+ agent: string
96
+ msg: InboundMessage
97
+ }
98
+
99
+ export interface InboundSpool {
100
+ /** Durably record `msg` for `agent`. Idempotent by spoolId: a
101
+ * re-spool of an already-live id is a no-op (returns false). */
102
+ put: (agent: string, msg: InboundMessage) => boolean
103
+ /** Tombstone the spool id of `msg` — call ONLY on confirmed delivery to a live
104
+ * registered bridge. Idempotent. */
105
+ ack: (msg: InboundMessage) => void
106
+ /** Live (un-acked) entries, oldest first. Used at boot to re-push
107
+ * into the in-memory buffer. Pure read — does not mutate. */
108
+ liveEntries: () => ReplayEntry[]
109
+ /** Escalate+drop entries older than `escalateAfterMs`. Calls
110
+ * `onEscalate` once per dropped entry (post the "couldn't deliver"
111
+ * card there). Returns the count escalated. Safe to call on a timer. */
112
+ sweepEscalations: (onEscalate: (e: ReplayEntry) => void) => number
113
+ /** Test/observability: count of live (un-acked) ids. */
114
+ liveCount: () => number
115
+ }
116
+
117
+ export function createInboundSpool(opts: InboundSpoolOptions): InboundSpool {
118
+ const { path, fs } = opts
119
+ const now = opts.now ?? Date.now
120
+ const log = opts.log ?? ((l: string) => process.stderr.write(l))
121
+ const escalateAfterMs = opts.escalateAfterMs ?? 15 * 60 * 1000
122
+ const compactAtBytes = opts.compactAtBytes ?? 256 * 1024
123
+
124
+ // In-memory projection of the on-disk log, rebuilt from the file at
125
+ // construction. `live` maps spoolId → the put record (insertion order
126
+ // preserved via the Map). An `ack` deletes from `live`.
127
+ const live = new Map<string, { agent: string; msg: InboundMessage; firstAt: number }>()
128
+
129
+ function parseLine(line: string): SpoolRecord | null {
130
+ const s = line.trim()
131
+ if (!s) return null
132
+ let rec: unknown
133
+ try {
134
+ rec = JSON.parse(s)
135
+ } catch {
136
+ return null // torn / partial line from a crash mid-append — skip
137
+ }
138
+ if (rec == null || typeof rec !== 'object') return null
139
+ const r = rec as Record<string, unknown>
140
+ if (r.t !== 'put' && r.t !== 'ack') return null
141
+ if (typeof r.id !== 'string' || r.id.length === 0) return null
142
+ if (r.t === 'put') {
143
+ if (r.msg == null || typeof r.msg !== 'object') return null
144
+ if (typeof r.agent !== 'string' || r.agent.length === 0) return null
145
+ if (typeof r.firstAt !== 'number') return null
146
+ }
147
+ return r as unknown as SpoolRecord
148
+ }
149
+
150
+ // Rebuild `live` from the file. Tolerates a torn last line.
151
+ function hydrate(): void {
152
+ live.clear()
153
+ if (!fs.existsSync(path)) return
154
+ let raw = ''
155
+ try {
156
+ raw = fs.readFileSync(path)
157
+ } catch {
158
+ return
159
+ }
160
+ for (const line of raw.split('\n')) {
161
+ const rec = parseLine(line)
162
+ if (rec == null) continue
163
+ if (rec.t === 'put') {
164
+ // Last put for an id wins; an ack later removes it.
165
+ live.set(rec.id, {
166
+ agent: rec.agent as string,
167
+ msg: rec.msg as InboundMessage,
168
+ firstAt: rec.firstAt as number,
169
+ })
170
+ } else {
171
+ live.delete(rec.id)
172
+ }
173
+ }
174
+ }
175
+
176
+ function appendRecord(rec: SpoolRecord): void {
177
+ try {
178
+ fs.appendFileSync(path, JSON.stringify(rec) + '\n')
179
+ } catch (err) {
180
+ // Durability is best-effort relative to fs availability; a spool
181
+ // write failure must NOT break live delivery. Log loudly — a
182
+ // persistently failing spool means we're back to in-memory-only
183
+ // semantics and the operator should know.
184
+ log(
185
+ `inbound-spool: append FAILED path=${path} id=${rec.id} t=${rec.t}: ` +
186
+ `${(err as Error).message} — durability degraded to in-memory\n`,
187
+ )
188
+ }
189
+ }
190
+
191
+ function maybeCompact(): void {
192
+ let size = 0
193
+ try {
194
+ size = fs.existsSync(path) ? fs.statSizeSync(path) : 0
195
+ } catch {
196
+ return
197
+ }
198
+ if (size <= compactAtBytes) return
199
+ // Rewrite the file as exactly the current live set (one put per
200
+ // live id, no acks). ATOMIC: write a sibling tmp then rename over
201
+ // the real path. rename(2) is atomic within a filesystem, so a
202
+ // crash at any point leaves EITHER the full pre-compaction log OR
203
+ // the full compacted log on disk — never a truncated/torn file
204
+ // that loses live entries after the tear. (Plain writeFileSync is
205
+ // not atomic; a crash mid-write of a >256 KiB rewrite could drop
206
+ // entries past the tear — the residual the reviewer flagged.)
207
+ const lines: string[] = []
208
+ for (const [id, e] of live) {
209
+ lines.push(
210
+ JSON.stringify({ t: 'put', id, agent: e.agent, msg: e.msg, firstAt: e.firstAt } satisfies SpoolRecord),
211
+ )
212
+ }
213
+ const tmp = path + '.compact.tmp'
214
+ try {
215
+ fs.writeFileSync(tmp, lines.length ? lines.join('\n') + '\n' : '')
216
+ fs.renameSync(tmp, path)
217
+ log(`inbound-spool: compacted path=${path} live=${live.size}\n`)
218
+ } catch (err) {
219
+ // Compaction is opportunistic — a failure keeps the (larger but
220
+ // correct) append-only log; never lose data trying to shrink it.
221
+ log(`inbound-spool: compact FAILED path=${path}: ${(err as Error).message}\n`)
222
+ }
223
+ }
224
+
225
+ hydrate()
226
+
227
+ return {
228
+ put(agent, msg) {
229
+ const id = spoolId(msg)
230
+ if (live.has(id)) return false // dedup: already spooled & un-acked
231
+ const firstAt = now()
232
+ live.set(id, { agent, msg, firstAt })
233
+ appendRecord({ t: 'put', id, agent, msg, firstAt })
234
+ maybeCompact()
235
+ return true
236
+ },
237
+ ack(msg) {
238
+ const id = spoolId(msg)
239
+ if (!live.has(id)) return // idempotent / unknown id
240
+ live.delete(id)
241
+ appendRecord({ t: 'ack', id })
242
+ maybeCompact()
243
+ },
244
+ liveEntries() {
245
+ // Insertion order = Map iteration order = oldest first.
246
+ return [...live.values()].map((e) => ({ agent: e.agent, msg: e.msg }))
247
+ },
248
+ sweepEscalations(onEscalate) {
249
+ const cutoff = now() - escalateAfterMs
250
+ let n = 0
251
+ for (const [id, e] of [...live.entries()]) {
252
+ if (e.firstAt > cutoff) continue
253
+ live.delete(id)
254
+ appendRecord({ t: 'ack', id }) // tombstone — promise retracted
255
+ try {
256
+ onEscalate({ agent: e.agent, msg: e.msg })
257
+ } catch (err) {
258
+ log(`inbound-spool: onEscalate threw id=${id}: ${(err as Error).message}\n`)
259
+ }
260
+ n++
261
+ }
262
+ if (n > 0) {
263
+ log(`inbound-spool: escalated+dropped ${n} undelivered entr${n === 1 ? 'y' : 'ies'} (older than ${escalateAfterMs}ms)\n`)
264
+ maybeCompact()
265
+ }
266
+ return n
267
+ },
268
+ liveCount() {
269
+ return live.size
270
+ },
271
+ }
272
+ }