switchroom 0.12.13 → 0.12.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,14 +22,58 @@
22
22
  */
23
23
 
24
24
  import { type Context, InlineKeyboard } from "grammy";
25
- import { parseApprovalCallback, ttlMsFromToken } from "./approval-card.js";
26
25
  import {
27
- approvalConsume,
28
- approvalRecord,
29
- } from "../../src/vault/approvals/client.js";
26
+ parseApprovalCallback,
27
+ ttlMsFromToken,
28
+ type ApprovalChoice,
29
+ } from "./approval-card.js";
30
+ import { approvalConsumeRecord } from "../../src/vault/approvals/client.js";
30
31
  import type { ApprovalDecisionMode } from "../../src/vault/approvals/schema.js";
31
32
  import { scopeToOpenInDriveButton } from "../../src/drive/deep-links.js";
32
33
 
34
+ /**
35
+ * Resolve a tapped approval choice to its decision tuple — PURE, no
36
+ * kernel I/O, so the `bad ttl token` branch (the only fallible path in
37
+ * the old inline switch) is unit-testable without mocking grammy.
38
+ *
39
+ * Extracted (PR-5) from `handleApprovalCallback` so PR-4's invariant —
40
+ * "compute + validate the decision BEFORE burning the single-use
41
+ * nonce" — is now structural, not a comment: the handler calls this
42
+ * first and only proceeds to `approvalConsumeRecord` on `ok: true`. A
43
+ * malformed ttl token returns `{ ok: false }` and the nonce is never
44
+ * touched (operator can re-tap a valid choice).
45
+ */
46
+ export type ResolvedApprovalDecision =
47
+ | {
48
+ ok: true;
49
+ decision: ApprovalDecisionMode;
50
+ granted: boolean;
51
+ ttl_ms: number | null;
52
+ displayMode: string;
53
+ }
54
+ | { ok: false; error: string };
55
+
56
+ export function resolveApprovalDecision(
57
+ choice: ApprovalChoice,
58
+ ): ResolvedApprovalDecision {
59
+ switch (choice.kind) {
60
+ case "deny":
61
+ return { ok: true, decision: "deny", granted: false, ttl_ms: null, displayMode: "denied" };
62
+ case "once":
63
+ // No expiry — recorded as a one-shot grant; the agent calls
64
+ // approval_lookup at most once, then proceeds. /approvals revoke
65
+ // can still target the row by id.
66
+ return { ok: true, decision: "allow_once", granted: true, ttl_ms: null, displayMode: "granted once" };
67
+ case "always":
68
+ return { ok: true, decision: "allow_always", granted: true, ttl_ms: null, displayMode: "granted always" };
69
+ case "ttl": {
70
+ const ms = ttlMsFromToken(choice.param);
71
+ if (ms === null) return { ok: false, error: "bad ttl token" };
72
+ return { ok: true, decision: "allow_ttl", granted: true, ttl_ms: ms, displayMode: `granted for ${choice.param}` };
73
+ }
74
+ }
75
+ }
76
+
33
77
  /**
34
78
  * Build the post-tap keyboard for a granted decision. Today this is
35
79
  * just the `[ 📖 Open in Drive ]` button when the granted scope names
@@ -57,55 +101,21 @@ export async function handleApprovalCallback(
57
101
  return;
58
102
  }
59
103
 
60
- const consumed = await approvalConsume(parsed.request_id);
61
- if (consumed === null) {
62
- await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
104
+ // Resolve + validate the decision BEFORE burning the single-use
105
+ // nonce (PR-4 invariant, now structural via the pure
106
+ // resolveApprovalDecision — see its doc). A malformed ttl token
107
+ // returns { ok: false } here and the nonce is never touched, so the
108
+ // operator can re-tap a valid choice; pre-fix this validation ran
109
+ // AFTER approvalConsume(), burning the nonce with no decision
110
+ // recorded → the agent's approval_lookup poll never saw a verdict
111
+ // and the turn wedged. There is now NO fallible step between the
112
+ // consume→record below.
113
+ const resolved = resolveApprovalDecision(parsed.choice);
114
+ if (!resolved.ok) {
115
+ await ctx.answerCallbackQuery({ text: resolved.error });
63
116
  return;
64
117
  }
65
- if (!consumed.consumed) {
66
- // Single-use enforcement: someone already tapped, or the nonce
67
- // expired/unknown. Match the RFC §8.1 wording.
68
- await ctx.answerCallbackQuery({ text: "this prompt expired" });
69
- return;
70
- }
71
-
72
- // Compute decision + ttl from the choice variant.
73
- let decision: ApprovalDecisionMode;
74
- let granted: boolean;
75
- let ttl_ms: number | null = null;
76
- let displayMode: string;
77
- switch (parsed.choice.kind) {
78
- case "deny":
79
- decision = "deny";
80
- granted = false;
81
- displayMode = "denied";
82
- break;
83
- case "once":
84
- decision = "allow_once";
85
- granted = true;
86
- // No expiry — recorded as a one-shot grant; the agent calls
87
- // approval_lookup at most once, then proceeds. /approvals revoke
88
- // can still target the row by id.
89
- displayMode = "granted once";
90
- break;
91
- case "always":
92
- decision = "allow_always";
93
- granted = true;
94
- displayMode = "granted always";
95
- break;
96
- case "ttl": {
97
- decision = "allow_ttl";
98
- granted = true;
99
- const ms = ttlMsFromToken(parsed.choice.param);
100
- if (ms === null) {
101
- await ctx.answerCallbackQuery({ text: "bad ttl token" });
102
- return;
103
- }
104
- ttl_ms = ms;
105
- displayMode = `granted for ${parsed.choice.param}`;
106
- break;
107
- }
108
- }
118
+ const { decision, granted, ttl_ms, displayMode } = resolved;
109
119
 
110
120
  const granted_by_user_id = ctx.from?.id ?? 0;
111
121
  // Approver set at decision time = the chat that received the card. We
@@ -114,18 +124,37 @@ export async function handleApprovalCallback(
114
124
  // when each surface migrates and starts passing access.allowFrom.
115
125
  const approver_set = [String(granted_by_user_id)];
116
126
 
117
- const decision_id = await approvalRecord({
127
+ // PR-6: atomic consume+record — ONE round-trip; the kernel burns the
128
+ // single-use nonce AND writes the decision in one SQLite transaction.
129
+ // If the record fails the burn rolls back, so `null` genuinely means
130
+ // "nothing happened, safe to retry" — there is no burned-nonce /
131
+ // no-decision wedge any more (the residual the shipped permission-TTL
132
+ // auto-deny used to backstop). resolveApprovalDecision already
133
+ // validated the ttl above, so no fallible step precedes this call.
134
+ const result = await approvalConsumeRecord({
118
135
  request_id: parsed.request_id,
119
136
  decision,
120
137
  approver_set,
121
138
  granted_by_user_id,
122
139
  ttl_ms,
123
140
  });
124
-
125
- if (decision_id === null) {
141
+ if (result === null) {
142
+ await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
143
+ return;
144
+ }
145
+ if (!result.consumed) {
146
+ // Already tapped / expired / unknown — single-use is enforced
147
+ // kernel-side and NO decision was written. RFC §8.1 wording.
148
+ await ctx.answerCallbackQuery({ text: "this prompt expired" });
149
+ return;
150
+ }
151
+ if (!result.decision_id) {
152
+ // Defensive: consumed:true must carry a decision_id. Kept distinct
153
+ // from the unreachable message for operator triage.
126
154
  await ctx.answerCallbackQuery({ text: "kernel record failed" });
127
155
  return;
128
156
  }
157
+ const decision_id: string = result.decision_id;
129
158
 
130
159
  // Edit the original card to its post-tap state. Drop the original
131
160
  // action keyboard either way; on a successful grant for a Drive
@@ -138,8 +167,8 @@ export async function handleApprovalCallback(
138
167
  ? ` · /approvals revoke <code>${decision_id}</code>`
139
168
  : "");
140
169
 
141
- const postTapKeyboard = granted && consumed.scope
142
- ? buildGrantedKeyboard(consumed.scope)
170
+ const postTapKeyboard = granted && result.scope
171
+ ? buildGrantedKeyboard(result.scope)
143
172
  : undefined;
144
173
 
145
174
  try {
@@ -245,10 +245,14 @@ import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
245
245
  import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
246
246
  import { handleRequestDriveApproval } from './drive-write-approval.js'
247
247
  import { buildDiffPreviewCard } from './diff-preview-card.js'
248
- import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
248
+ import { createPendingInboundBuffer, redeliverBufferedInbound } from './pending-inbound-buffer.js'
249
+ import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
249
250
  import {
250
251
  buildVaultGrantApprovedInbound,
251
252
  buildVaultGrantDeniedInbound,
253
+ buildVaultSaveCompletedInbound,
254
+ buildVaultSaveFailedInbound,
255
+ buildVaultSaveDiscardedInbound,
252
256
  } from './vault-grant-inbound-builders.js'
253
257
  import { createPollHealthCheck, type PollHealthCheckHandle } from './poll-health.js'
254
258
  import type {
@@ -262,6 +266,7 @@ import type {
262
266
  PtyPartialForward,
263
267
  InboundMessage,
264
268
  InjectInboundMessage,
269
+ PermissionEvent,
265
270
  } from './ipc-protocol.js'
266
271
  import { DebounceBuffer, HourCap, buildReactionInboundMeta, buildReactionInboundText, evaluateTriggerCandidate, isGroupChat, resolveReactionsConfig, truncatePreview, type PendingReaction, type ReactionBatch, type ReactionsResolvedConfig } from './reaction-trigger.js'
267
272
  import { writePidFile, clearPidFile } from './pid-file.js'
@@ -2093,7 +2098,23 @@ const pendingStateReaper = setInterval(() => {
2093
2098
  if (now - v.startedAt > VAULT_INPUT_TTL_MS) pendingVaultOps.delete(k)
2094
2099
  }
2095
2100
  for (const [k, v] of pendingPermissions) {
2096
- if (now - v.startedAt > PERMISSION_TTL_MS) pendingPermissions.delete(k)
2101
+ if (now - v.startedAt > PERMISSION_TTL_MS) {
2102
+ // Don't just drop it: the claude turn is suspended INSIDE the MCP
2103
+ // permission call waiting for a verdict. A silent delete left it
2104
+ // wedged forever when the operator never tapped — permanent
2105
+ // silence, the exact symptom this series fixes. Auto-deny so the
2106
+ // call unblocks; claude then tells the user it couldn't get
2107
+ // permission (or takes a fallback). Routed through
2108
+ // dispatchPermissionVerdict so it's buffered+redelivered too if
2109
+ // the bridge is also offline at sweep time.
2110
+ dispatchPermissionVerdict({ type: 'permission', requestId: k, behavior: 'deny' })
2111
+ process.stderr.write(
2112
+ `telegram gateway: permission TTL expired — auto-deny request=${k} ` +
2113
+ `tool=${v.tool_name} (no operator response in ` +
2114
+ `${Math.round(PERMISSION_TTL_MS / 60000)}m)\n`,
2115
+ )
2116
+ pendingPermissions.delete(k)
2117
+ }
2097
2118
  }
2098
2119
  for (const [k, v] of vaultPassphraseCache) {
2099
2120
  if (now > v.expiresAt) vaultPassphraseCache.delete(k)
@@ -2722,10 +2743,27 @@ silencePoke.startTimer({
2722
2743
  try {
2723
2744
  clearSilentEndState(fbKey)
2724
2745
  } catch { /* best-effort */ }
2746
+ // Self-heal the inbound buffer. pendingInboundBuffer otherwise
2747
+ // drains ONLY on bridge re-register (onClientRegistered). After a
2748
+ // network storm that settles with the bridge STILL connected, user
2749
+ // messages buffered during the flap sit forever — until a manual
2750
+ // restart forces a re-register (the fleet-update thundering-herd
2751
+ // incident, 2026-05-19: agents "not responding", logs show
2752
+ // pending-inbound-buffer depth>0 with no drain). Flushing on
2753
+ // wedge-clear makes the agent self-heal. selfAgent-keyed; a miss
2754
+ // re-buffers so nothing is lost if the bridge is genuinely offline.
2755
+ const fbSelfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
2756
+ const fbRedeliver = redeliverBufferedInbound(
2757
+ pendingInboundBuffer,
2758
+ fbSelfAgent,
2759
+ (m) => ipcServer.sendToAgent(fbSelfAgent, m),
2760
+ )
2725
2761
  process.stderr.write(
2726
2762
  `telegram gateway: silence-poke framework-fallback ended wedged turn ` +
2727
2763
  `chat=${fbChatId} thread=${ctx.threadId ?? '-'} silence_ms=${ctx.silenceMs} ` +
2728
- `currentTurn_nulled=${turnMatchesFallback}\n`,
2764
+ `currentTurn_nulled=${turnMatchesFallback} ` +
2765
+ `drained_buffered=${fbRedeliver.redelivered}/${fbRedeliver.drained}` +
2766
+ `${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}\n`,
2729
2767
  )
2730
2768
  },
2731
2769
  })
@@ -2738,6 +2776,36 @@ silencePoke.startTimer({
2738
2776
  // would mint the grant but silently drop the `vault_grant_approved`
2739
2777
  // inbound, leaving the agent stuck waiting for a manual poke.
2740
2778
  const pendingInboundBuffer = createPendingInboundBuffer()
2779
+ const pendingPermissionBuffer = createPendingPermissionBuffer()
2780
+
2781
+ /**
2782
+ * Deliver a permission verdict to this agent's bridge, buffering on a
2783
+ * miss so it's redelivered when the bridge reconnects. Replaces the
2784
+ * bare `ipcServer.broadcast({type:'permission',...})` at every verdict
2785
+ * site (and the TTL-sweep auto-deny). broadcast was fire-and-forget:
2786
+ * a verdict produced while the bridge was mid-reconnect was dropped
2787
+ * and the claude turn stayed suspended INSIDE the MCP permission call
2788
+ * forever — the user tapped Approve/Deny and nothing happened, no
2789
+ * further output, permanent silence. sendToAgent is registered-keyed
2790
+ * and returns a real delivered bool; on a miss we buffer and
2791
+ * onClientRegistered re-sends so the reconnecting bridge relays the
2792
+ * verdict to the still-suspended call. A late verdict for a dead
2793
+ * request_id is harmless — the bridge relays it and Claude Code
2794
+ * ignores an unknown request_id. (Function declaration so the
2795
+ * TTL sweep defined earlier in this file can reference it; ipcServer/pendingPermissionBuffer
2796
+ * are resolved at call-time, after module init.)
2797
+ */
2798
+ function dispatchPermissionVerdict(ev: PermissionEvent): void {
2799
+ const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
2800
+ const delivered = ipcServer.sendToAgent(selfAgent, ev)
2801
+ if (!delivered) {
2802
+ pendingPermissionBuffer.push(selfAgent, ev)
2803
+ process.stderr.write(
2804
+ `telegram gateway: permission verdict buffered (bridge offline) ` +
2805
+ `request=${ev.requestId} behavior=${ev.behavior}\n`,
2806
+ )
2807
+ }
2808
+ }
2741
2809
 
2742
2810
  const ipcServer: IpcServer = createIpcServer({
2743
2811
  socketPath: SOCKET_PATH,
@@ -2765,6 +2833,22 @@ const ipcServer: IpcServer = createIpcServer({
2765
2833
  )
2766
2834
  }
2767
2835
  }
2836
+ // PR-3: drain permission verdicts missed while the bridge was
2837
+ // offline. A claude turn suspended inside the MCP permission call
2838
+ // is unblocked the moment the reconnecting bridge relays the
2839
+ // verdict; without this the verdict (incl. the TTL auto-deny) was
2840
+ // lost and the turn stayed silent forever.
2841
+ const pendingVerdicts = pendingPermissionBuffer.drain(client.agentName)
2842
+ for (const ev of pendingVerdicts) {
2843
+ try {
2844
+ client.send(ev)
2845
+ } catch (err) {
2846
+ process.stderr.write(
2847
+ `telegram gateway: pending-permission drain failed agent=${client.agentName} ` +
2848
+ `request=${ev.requestId} behavior=${ev.behavior}: ${(err as Error).message}\n`,
2849
+ )
2850
+ }
2851
+ }
2768
2852
  }
2769
2853
 
2770
2854
  // If the agent reconnected after a /restart (or any restart), post a boot
@@ -6251,7 +6335,7 @@ async function handleInbound(
6251
6335
  // Forward permission reply to connected bridge
6252
6336
  const behavior = permMatch[1]!.toLowerCase().startsWith('y') ? 'allow' : 'deny'
6253
6337
  const request_id = permMatch[2]!.toLowerCase()
6254
- ipcServer.broadcast({
6338
+ dispatchPermissionVerdict({
6255
6339
  type: 'permission',
6256
6340
  requestId: request_id,
6257
6341
  behavior,
@@ -6981,17 +7065,28 @@ async function handleInbound(
6981
7065
  },
6982
7066
  }
6983
7067
 
6984
- // Try to send to a connected bridge. If no bridge connected, tell the user.
6985
- ipcServer.broadcast(inboundMsg)
6986
- const delivered = ipcServer.clientCount() > 0
6987
-
7068
+ // Deliver to THIS agent's registered bridge, buffering on miss.
7069
+ // broadcast()/clientCount() were the wrong primitives: broadcast is
7070
+ // not registered-keyed (writes to any alive socket incl. an
7071
+ // unregistered pre-handshake one) and yields no delivered signal,
7072
+ // and clientCount() counts unregistered sockets — so a bridge
7073
+ // mid-reconnect made clientCount()>0, the message was broadcast into
7074
+ // a non-registered socket, the "restarting" notice was suppressed,
7075
+ // and the user's message was silently lost. The old "queued either
7076
+ // way" comment was false: broadcast does not queue. sendToAgent is
7077
+ // registered-keyed + returns a real delivered bool; on a miss we
7078
+ // push to pendingInboundBuffer, which onClientRegistered drains on
7079
+ // the next bridge register — so the notice below is now truthful.
7080
+ const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
7081
+ const delivered = ipcServer.sendToAgent(selfAgent, inboundMsg)
6988
7082
  if (!delivered) {
7083
+ pendingInboundBuffer.push(selfAgent, inboundMsg)
6989
7084
  const threadOpts = messageThreadId != null ? { message_thread_id: messageThreadId } : {}
6990
7085
  // #1075: thread-id-bearing — swallow via robustApiCall so a deleted
6991
- // topic doesn't crash the gateway. Fire-and-forget; the user-visible
6992
- // hint is non-critical (the inbound is queued either way).
7086
+ // topic doesn't crash the gateway. Fire-and-forget; the inbound is
7087
+ // genuinely buffered now, so the hint is accurate, not a guess.
6993
7088
  void swallowingApiCall(
6994
- () => bot.api.sendMessage(chat_id, '⏳ Agent is restarting, please wait…', { ...threadOpts }),
7089
+ () => bot.api.sendMessage(chat_id, '⏳ Agent is restarting your message is queued and will be processed when it reconnects.', { ...threadOpts }),
6995
7090
  {
6996
7091
  chat_id,
6997
7092
  verb: 'agent-restarting-notice',
@@ -8847,7 +8942,7 @@ async function handlePermissionSlash(ctx: Context, behavior: 'allow' | 'deny'):
8847
8942
  return
8848
8943
  }
8849
8944
  // Forward to connected bridges — same IPC the button handler uses.
8850
- ipcServer.broadcast({ type: 'permission', requestId: request_id, behavior })
8945
+ dispatchPermissionVerdict({ type: 'permission', requestId: request_id, behavior })
8851
8946
  pendingPermissions.delete(request_id)
8852
8947
  process.stderr.write(
8853
8948
  `[telegram gateway] slash-${behavior} request_id=${request_id} tool=${details.tool_name} by=${senderId}\n`,
@@ -10039,6 +10134,21 @@ async function handleVaultRequestSaveCallback(ctx: Context, data: string): Promi
10039
10134
  )
10040
10135
  .catch(() => {})
10041
10136
  }
10137
+ // Wake the agent that called vault_request_save — symmetric with
10138
+ // the vra: approve/deny path (#1052/#1150/#1156). Without this the
10139
+ // tool returned "waiting for operator", the turn ended, and a
10140
+ // Discard left the agent silently idle forever.
10141
+ const discardInbound = buildVaultSaveDiscardedInbound({
10142
+ ctx: { agent: pending.agent, key: pending.key, chat_id: pending.chat_id },
10143
+ stageId,
10144
+ operatorId: senderId,
10145
+ })
10146
+ const dDelivered = ipcServer.sendToAgent(pending.agent, discardInbound)
10147
+ process.stderr.write(
10148
+ `telegram gateway: vault_save_discarded injection agent=${pending.agent} ` +
10149
+ `key=${pending.key} stage=${stageId} delivered=${dDelivered}\n`,
10150
+ )
10151
+ if (!dDelivered) pendingInboundBuffer.push(pending.agent, discardInbound)
10042
10152
  return
10043
10153
  }
10044
10154
 
@@ -10143,6 +10253,22 @@ async function handleVaultRequestSaveCallback(ctx: Context, data: string): Promi
10143
10253
  // retry by re-invoking the same MCP tool, but the value will be
10144
10254
  // re-staged with a new ID. Drop the current stage.
10145
10255
  pendingVaultRequestSaves.delete(stageId)
10256
+ // Wake the waiting agent with the failure (symmetric with the
10257
+ // success/discard paths) so it doesn't assume vault:<key> exists.
10258
+ const failReason =
10259
+ (write.output || 'vault write error').split('\n')[0]!.slice(0, 200)
10260
+ const failInbound = buildVaultSaveFailedInbound({
10261
+ ctx: { agent: pending.agent, key: pending.key, chat_id: pending.chat_id },
10262
+ stageId,
10263
+ operatorId: senderId,
10264
+ reason: failReason,
10265
+ })
10266
+ const fDelivered = ipcServer.sendToAgent(pending.agent, failInbound)
10267
+ process.stderr.write(
10268
+ `telegram gateway: vault_save_failed injection agent=${pending.agent} ` +
10269
+ `key=${pending.key} stage=${stageId} delivered=${fDelivered}\n`,
10270
+ )
10271
+ if (!fDelivered) pendingInboundBuffer.push(pending.agent, failInbound)
10146
10272
  return
10147
10273
  }
10148
10274
 
@@ -10158,6 +10284,20 @@ async function handleVaultRequestSaveCallback(ctx: Context, data: string): Promi
10158
10284
  )
10159
10285
  .catch(() => {})
10160
10286
  }
10287
+ // Wake the agent that called vault_request_save so it resumes the
10288
+ // task that was blocked on this credential (symmetric with the
10289
+ // vra: approve path; buffered if the bridge is mid-reconnect).
10290
+ const okInbound = buildVaultSaveCompletedInbound({
10291
+ ctx: { agent: pending.agent, key: pending.key, chat_id: pending.chat_id },
10292
+ stageId,
10293
+ operatorId: senderId,
10294
+ })
10295
+ const okDelivered = ipcServer.sendToAgent(pending.agent, okInbound)
10296
+ process.stderr.write(
10297
+ `telegram gateway: vault_save_completed injection agent=${pending.agent} ` +
10298
+ `key=${pending.key} stage=${stageId} delivered=${okDelivered}\n`,
10299
+ )
10300
+ if (!okDelivered) pendingInboundBuffer.push(pending.agent, okInbound)
10161
10301
  return
10162
10302
  }
10163
10303
 
@@ -12084,16 +12224,25 @@ bot.on('callback_query:data', async ctx => {
12084
12224
  process.stderr.write(
12085
12225
  `telegram gateway: button_callback chatId=${cbChatId} user=${ctx.from.id} data=${JSON.stringify(agentCb.raw)} btnText=${JSON.stringify(buttonText ?? null)}\n`,
12086
12226
  )
12087
- ipcServer.broadcast(inboundMsg)
12088
- if (ipcServer.clientCount() === 0) {
12089
- // No bridge connected — the agent's gone. Tell the user so they
12090
- // don't think the button silently swallowed their tap.
12227
+ // Registered-keyed delivery + buffer-on-miss (same fix as the
12228
+ // normal-inbound path above): broadcast()/clientCount() lost the
12229
+ // tap whenever the bridge was mid-reconnect (clientCount() counts
12230
+ // unregistered sockets, so the notice was suppressed AND nothing
12231
+ // was actually queued). sendToAgent → pendingInboundBuffer (drained
12232
+ // by onClientRegistered) makes the "queued" promise real.
12233
+ const selfAgentBtn = process.env.SWITCHROOM_AGENT_NAME ?? ''
12234
+ const btnDelivered = ipcServer.sendToAgent(selfAgentBtn, inboundMsg)
12235
+ if (!btnDelivered) {
12236
+ pendingInboundBuffer.push(selfAgentBtn, inboundMsg)
12237
+ // No registered bridge — the agent's mid-restart. Tell the user
12238
+ // so they don't think the button silently swallowed their tap;
12239
+ // the tap is genuinely buffered now and replays on reconnect.
12091
12240
  // #1075: thread-id-bearing — swallow on THREAD_NOT_FOUND.
12092
12241
  void swallowingApiCall(
12093
12242
  () =>
12094
12243
  bot.api.sendMessage(
12095
12244
  cbChatId,
12096
- '⏳ Agent is restarting — your button tap was queued but won\'t be processed until it comes back.',
12245
+ '⏳ Agent is restarting — your button tap is queued and will be processed when it comes back.',
12097
12246
  cbThreadId != null ? { message_thread_id: cbThreadId } : {},
12098
12247
  ),
12099
12248
  {
@@ -12222,7 +12371,7 @@ bot.on('callback_query:data', async ctx => {
12222
12371
  // otherwise the rule may be unsafe to honour at scale and we
12223
12372
  // fall back to single-use allow.
12224
12373
  synthInbound: () => {
12225
- ipcServer.broadcast({
12374
+ dispatchPermissionVerdict({
12226
12375
  type: 'permission',
12227
12376
  requestId: request_id,
12228
12377
  behavior: 'allow',
@@ -12260,7 +12409,7 @@ bot.on('callback_query:data', async ctx => {
12260
12409
  newText: baseText ? `${baseText}\n\n${label}` : label,
12261
12410
  parseMode: 'HTML',
12262
12411
  synthInbound: () => {
12263
- ipcServer.broadcast({
12412
+ dispatchPermissionVerdict({
12264
12413
  type: 'permission',
12265
12414
  requestId: request_id,
12266
12415
  behavior: behavior as 'allow' | 'deny',
@@ -54,6 +54,45 @@ export interface PendingInboundBufferOptions {
54
54
  log?: (line: string) => void
55
55
  }
56
56
 
57
+ /**
58
+ * Drain `agent`'s buffered inbound and re-deliver each via `send`. A
59
+ * `send` returning false (or throwing) means "not delivered" — the
60
+ * message is re-buffered so nothing is lost when the bridge is still
61
+ * offline. Returns counts for observability.
62
+ *
63
+ * This exists because `drain` is otherwise only called on bridge
64
+ * re-register (`onClientRegistered`). After a network storm that
65
+ * settles with the bridge STILL connected, messages buffered during
66
+ * the flap never drain — they sit until a manual restart forces a
67
+ * re-register. The silence-poke framework fallback calls this on
68
+ * wedge-clear so the agent self-heals (fleet-update thundering-herd
69
+ * incident, 2026-05-19).
70
+ */
71
+ export function redeliverBufferedInbound(
72
+ buffer: PendingInboundBuffer,
73
+ agent: string,
74
+ send: (msg: InboundMessage) => boolean,
75
+ ): { drained: number; redelivered: number; rebuffered: number } {
76
+ const pending = buffer.drain(agent)
77
+ let redelivered = 0
78
+ let rebuffered = 0
79
+ for (const msg of pending) {
80
+ let delivered = false
81
+ try {
82
+ delivered = send(msg)
83
+ } catch {
84
+ delivered = false
85
+ }
86
+ if (delivered) {
87
+ redelivered++
88
+ } else {
89
+ buffer.push(agent, msg)
90
+ rebuffered++
91
+ }
92
+ }
93
+ return { drained: pending.length, redelivered, rebuffered }
94
+ }
95
+
57
96
  export function createPendingInboundBuffer(
58
97
  opts: PendingInboundBufferOptions = {},
59
98
  ): PendingInboundBuffer {
@@ -0,0 +1,112 @@
1
+ /**
2
+ * Per-agent buffer for permission verdicts the gateway couldn't deliver
3
+ * because no live IPC client was registered for the agent at send-time.
4
+ *
5
+ * Background (PR-3 of the callback→model-continuation series): a
6
+ * tool/skill/MCP permission request suspends the claude turn *inside*
7
+ * the MCP permission call until the gateway relays the operator's
8
+ * Approve/Deny verdict back (`{type:'permission'}` → bridge
9
+ * `onPermission` → Claude Code). The verdict sites previously used
10
+ * `ipcServer.broadcast(...)`, which is fire-and-forget: if the bridge
11
+ * was mid-reconnect at the exact moment the operator tapped (every
12
+ * agent/gateway restart, claude session bounce), the verdict was
13
+ * dropped and the model stayed wedged forever — the user's tap did
14
+ * nothing and they were left silent.
15
+ *
16
+ * This is the permission-verdict analog of `pending-inbound-buffer.ts`:
17
+ * the verdict sites now `sendToAgent` (registered-keyed, real delivered
18
+ * bool) and on a miss `push()` here; `onClientRegistered` `drain()`s
19
+ * and re-sends so a reconnecting bridge relays the missed verdict to
20
+ * the still-suspended permission call.
21
+ *
22
+ * Contract mirrors pending-inbound-buffer:
23
+ * - `push(agent, ev)` best-effort, synchronous, bounded.
24
+ * - `drain(agent)` returns ALL pending verdicts in insertion order
25
+ * and clears them; called from `onClientRegistered`.
26
+ * - In-memory only; survives reconnect within one gateway lifetime,
27
+ * not a gateway restart. A late-redelivered verdict for a
28
+ * request_id claude no longer has is harmless — the bridge relays
29
+ * it and Claude Code ignores an unknown request_id. The TTL-sweep
30
+ * auto-deny is the independent backstop for "operator never tapped".
31
+ *
32
+ * Per-agent cap prevents a never-reconnecting bridge from leaking
33
+ * memory; on overflow the OLDEST verdict is dropped (freshest is most
34
+ * relevant) and logged.
35
+ */
36
+
37
+ import type { PermissionEvent } from './ipc-protocol.js'
38
+
39
+ /** Default cap per agent — a reasonable backlog of permission cards
40
+ * stacked while the bridge is offline, no more. */
41
+ export const DEFAULT_PENDING_PERMISSION_CAP = 32
42
+
43
+ export interface PendingPermissionBuffer {
44
+ /** Append `ev` to `agent`'s queue. Returns true if accepted without
45
+ * eviction, false if the cap forced dropping the oldest (the new
46
+ * entry is STILL accepted). */
47
+ push: (agent: string, ev: PermissionEvent) => boolean
48
+ /** Pop and return all pending verdicts for `agent` (insertion order).
49
+ * Empty array when none. Idempotent. */
50
+ drain: (agent: string) => PermissionEvent[]
51
+ /** Test-only: current depth for `agent`. */
52
+ depth: (agent: string) => number
53
+ /** Test-only: total depth across all agents. */
54
+ totalDepth: () => number
55
+ }
56
+
57
+ export interface PendingPermissionBufferOptions {
58
+ capPerAgent?: number
59
+ log?: (line: string) => void
60
+ }
61
+
62
+ export function createPendingPermissionBuffer(
63
+ opts: PendingPermissionBufferOptions = {},
64
+ ): PendingPermissionBuffer {
65
+ const cap = opts.capPerAgent ?? DEFAULT_PENDING_PERMISSION_CAP
66
+ const log = opts.log ?? ((line: string) => process.stderr.write(line))
67
+ const queues = new Map<string, PermissionEvent[]>()
68
+
69
+ return {
70
+ push(agent, ev) {
71
+ let q = queues.get(agent)
72
+ if (q == null) {
73
+ q = []
74
+ queues.set(agent, q)
75
+ }
76
+ let evicted = false
77
+ if (q.length >= cap) {
78
+ const dropped = q.shift()
79
+ evicted = true
80
+ log(
81
+ `pending-permission-buffer: agent=${agent} cap=${cap} reached — ` +
82
+ `dropped oldest verdict request=${dropped?.requestId ?? '-'} ` +
83
+ `behavior=${dropped?.behavior ?? '-'}\n`,
84
+ )
85
+ }
86
+ q.push(ev)
87
+ log(
88
+ `pending-permission-buffer: agent=${agent} buffered request=${ev.requestId} ` +
89
+ `behavior=${ev.behavior} depth_after=${q.length} evicted=${evicted}\n`,
90
+ )
91
+ return !evicted
92
+ },
93
+ drain(agent) {
94
+ const q = queues.get(agent)
95
+ if (q == null || q.length === 0) return []
96
+ queues.delete(agent)
97
+ log(
98
+ `pending-permission-buffer: drained agent=${agent} count=${q.length} ` +
99
+ `requests=[${q.map((e) => e.requestId).join(',')}]\n`,
100
+ )
101
+ return q
102
+ },
103
+ depth(agent) {
104
+ return queues.get(agent)?.length ?? 0
105
+ },
106
+ totalDepth() {
107
+ let n = 0
108
+ for (const q of queues.values()) n += q.length
109
+ return n
110
+ },
111
+ }
112
+ }