switchroom 0.12.14 → 0.12.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,14 +22,58 @@
22
22
  */
23
23
 
24
24
  import { type Context, InlineKeyboard } from "grammy";
25
- import { parseApprovalCallback, ttlMsFromToken } from "./approval-card.js";
26
25
  import {
27
- approvalConsume,
28
- approvalRecord,
29
- } from "../../src/vault/approvals/client.js";
26
+ parseApprovalCallback,
27
+ ttlMsFromToken,
28
+ type ApprovalChoice,
29
+ } from "./approval-card.js";
30
+ import { approvalConsumeRecord } from "../../src/vault/approvals/client.js";
30
31
  import type { ApprovalDecisionMode } from "../../src/vault/approvals/schema.js";
31
32
  import { scopeToOpenInDriveButton } from "../../src/drive/deep-links.js";
32
33
 
34
/**
 * Result of mapping a tapped approval choice onto a decision.
 *
 * - `ok: true` carries the decision mode, whether it is a grant, the
 *   optional ttl in milliseconds (`null` when the choice has no expiry)
 *   and the human-readable label shown on the edited card.
 * - `ok: false` carries a short error string; no decision was derived.
 */
export type ResolvedApprovalDecision =
  | {
      ok: true;
      decision: ApprovalDecisionMode;
      granted: boolean;
      ttl_ms: number | null;
      displayMode: string;
    }
  | { ok: false; error: string };

/**
 * Translate the tapped choice into the tuple the handler records.
 *
 * Takes no grammy context and awaits nothing, so every branch —
 * including the `bad ttl token` failure, the only non-success outcome —
 * can be unit-tested with a plain object.
 *
 * `handleApprovalCallback` calls this first and only goes on to
 * `approvalConsumeRecord` when the result is `ok: true`; a malformed ttl
 * token therefore yields `{ ok: false }` before the single-use nonce is
 * touched, and the operator can re-tap a valid choice.
 *
 * @param choice - Parsed choice from the approval card callback.
 * @returns The resolved decision, or `{ ok: false, error }` when the ttl
 *   token cannot be converted by `ttlMsFromToken`.
 */
export function resolveApprovalDecision(
  choice: ApprovalChoice,
): ResolvedApprovalDecision {
  // All successful outcomes share one shape; assemble it in one place.
  const resolved = (
    decision: ApprovalDecisionMode,
    granted: boolean,
    ttl_ms: number | null,
    displayMode: string,
  ): ResolvedApprovalDecision => ({ ok: true, decision, granted, ttl_ms, displayMode });

  switch (choice.kind) {
    case "deny":
      return resolved("deny", false, null, "denied");
    case "once":
      // One-shot grant with no expiry: the agent calls approval_lookup
      // at most once, then proceeds. /approvals revoke can still target
      // the row by id.
      return resolved("allow_once", true, null, "granted once");
    case "always":
      return resolved("allow_always", true, null, "granted always");
    case "ttl": {
      const ttlMs = ttlMsFromToken(choice.param);
      return ttlMs === null
        ? { ok: false, error: "bad ttl token" }
        : resolved("allow_ttl", true, ttlMs, `granted for ${choice.param}`);
    }
  }
}
76
+
33
77
  /**
34
78
  * Build the post-tap keyboard for a granted decision. Today this is
35
79
  * just the `[ 📖 Open in Drive ]` button when the granted scope names
@@ -57,66 +101,21 @@ export async function handleApprovalCallback(
57
101
  return;
58
102
  }
59
103
 
60
- // Compute decision + ttl from the choice variant BEFORE burning the
61
- // single-use nonce. This block has a fallible early-return (the
62
- // `bad ttl token` path). Pre-fix it ran AFTER approvalConsume(), so a
63
- // malformed ttl token burned the nonce but recorded no decision the
64
- // agent's approval_lookup poll never saw a verdict and the turn
65
- // wedged (pre-PR-3: forever; now bounded by PR-3's PERMISSION_TTL
66
- // auto-deny). approvalConsume stays the atomic single-use guard; it
67
- // simply doesn't fire until we have a valid decision to record
68
- // immediately after. There is now NO fallible step between
69
- // consume→record; the only residual gap is the inherent 1-RPC
70
- // consume/record non-atomicity (backstopped by PR-3's TTL auto-deny;
71
- // a fully atomic kernel consume+record is a tracked follow-up).
72
- let decision: ApprovalDecisionMode;
73
- let granted: boolean;
74
- let ttl_ms: number | null = null;
75
- let displayMode: string;
76
- switch (parsed.choice.kind) {
77
- case "deny":
78
- decision = "deny";
79
- granted = false;
80
- displayMode = "denied";
81
- break;
82
- case "once":
83
- decision = "allow_once";
84
- granted = true;
85
- // No expiry — recorded as a one-shot grant; the agent calls
86
- // approval_lookup at most once, then proceeds. /approvals revoke
87
- // can still target the row by id.
88
- displayMode = "granted once";
89
- break;
90
- case "always":
91
- decision = "allow_always";
92
- granted = true;
93
- displayMode = "granted always";
94
- break;
95
- case "ttl": {
96
- decision = "allow_ttl";
97
- granted = true;
98
- const ms = ttlMsFromToken(parsed.choice.param);
99
- if (ms === null) {
100
- await ctx.answerCallbackQuery({ text: "bad ttl token" });
101
- return;
102
- }
103
- ttl_ms = ms;
104
- displayMode = `granted for ${parsed.choice.param}`;
105
- break;
106
- }
107
- }
108
-
109
- const consumed = await approvalConsume(parsed.request_id);
110
- if (consumed === null) {
111
- await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
112
- return;
113
- }
114
- if (!consumed.consumed) {
115
- // Single-use enforcement: someone already tapped, or the nonce
116
- // expired/unknown. Match the RFC §8.1 wording.
117
- await ctx.answerCallbackQuery({ text: "this prompt expired" });
104
+ // Resolve + validate the decision BEFORE burning the single-use
105
+ // nonce (PR-4 invariant, now structural via the pure
106
+ // resolveApprovalDecision — see its doc). A malformed ttl token
107
+ // returns { ok: false } here and the nonce is never touched, so the
108
+ // operator can re-tap a valid choice; pre-fix this validation ran
109
+ // AFTER approvalConsume(), burning the nonce with no decision
110
+ // recorded — the agent's approval_lookup poll never saw a verdict
111
+ // and the turn wedged. There is now NO fallible step between the
112
+ // consume→record below.
113
+ const resolved = resolveApprovalDecision(parsed.choice);
114
+ if (!resolved.ok) {
115
+ await ctx.answerCallbackQuery({ text: resolved.error });
118
116
  return;
119
117
  }
118
+ const { decision, granted, ttl_ms, displayMode } = resolved;
120
119
 
121
120
  const granted_by_user_id = ctx.from?.id ?? 0;
122
121
  // Approver set at decision time = the chat that received the card. We
@@ -125,18 +124,37 @@ export async function handleApprovalCallback(
125
124
  // when each surface migrates and starts passing access.allowFrom.
126
125
  const approver_set = [String(granted_by_user_id)];
127
126
 
128
- const decision_id = await approvalRecord({
127
+ // PR-6: atomic consume+record — ONE round-trip; the kernel burns the
128
+ // single-use nonce AND writes the decision in one SQLite transaction.
129
+ // If the record fails the burn rolls back, so `null` genuinely means
130
+ // "nothing happened, safe to retry" — there is no burned-nonce /
131
+ // no-decision wedge any more (the residual the shipped permission-TTL
132
+ // auto-deny used to backstop). resolveApprovalDecision already
133
+ // validated the ttl above, so no fallible step precedes this call.
134
+ const result = await approvalConsumeRecord({
129
135
  request_id: parsed.request_id,
130
136
  decision,
131
137
  approver_set,
132
138
  granted_by_user_id,
133
139
  ttl_ms,
134
140
  });
135
-
136
- if (decision_id === null) {
141
+ if (result === null) {
142
+ await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
143
+ return;
144
+ }
145
+ if (!result.consumed) {
146
+ // Already tapped / expired / unknown — single-use is enforced
147
+ // kernel-side and NO decision was written. RFC §8.1 wording.
148
+ await ctx.answerCallbackQuery({ text: "this prompt expired" });
149
+ return;
150
+ }
151
+ if (!result.decision_id) {
152
+ // Defensive: consumed:true must carry a decision_id. Kept distinct
153
+ // from the unreachable message for operator triage.
137
154
  await ctx.answerCallbackQuery({ text: "kernel record failed" });
138
155
  return;
139
156
  }
157
+ const decision_id: string = result.decision_id;
140
158
 
141
159
  // Edit the original card to its post-tap state. Drop the original
142
160
  // action keyboard either way; on a successful grant for a Drive
@@ -149,8 +167,8 @@ export async function handleApprovalCallback(
149
167
  ? ` · /approvals revoke <code>${decision_id}</code>`
150
168
  : "");
151
169
 
152
- const postTapKeyboard = granted && consumed.scope
153
- ? buildGrantedKeyboard(consumed.scope)
170
+ const postTapKeyboard = granted && result.scope
171
+ ? buildGrantedKeyboard(result.scope)
154
172
  : undefined;
155
173
 
156
174
  try {
@@ -245,7 +245,7 @@ import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
245
245
  import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
246
246
  import { handleRequestDriveApproval } from './drive-write-approval.js'
247
247
  import { buildDiffPreviewCard } from './diff-preview-card.js'
248
- import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
248
+ import { createPendingInboundBuffer, redeliverBufferedInbound } from './pending-inbound-buffer.js'
249
249
  import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
250
250
  import {
251
251
  buildVaultGrantApprovedInbound,
@@ -2743,10 +2743,27 @@ silencePoke.startTimer({
2743
2743
  try {
2744
2744
  clearSilentEndState(fbKey)
2745
2745
  } catch { /* best-effort */ }
2746
+ // Self-heal the inbound buffer. pendingInboundBuffer otherwise
2747
+ // drains ONLY on bridge re-register (onClientRegistered). After a
2748
+ // network storm that settles with the bridge STILL connected, user
2749
+ // messages buffered during the flap sit forever — until a manual
2750
+ // restart forces a re-register (the fleet-update thundering-herd
2751
+ // incident, 2026-05-19: agents "not responding", logs show
2752
+ // pending-inbound-buffer depth>0 with no drain). Flushing on
2753
+ // wedge-clear makes the agent self-heal. selfAgent-keyed; a miss
2754
+ // re-buffers so nothing is lost if the bridge is genuinely offline.
2755
+ const fbSelfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
2756
+ const fbRedeliver = redeliverBufferedInbound(
2757
+ pendingInboundBuffer,
2758
+ fbSelfAgent,
2759
+ (m) => ipcServer.sendToAgent(fbSelfAgent, m),
2760
+ )
2746
2761
  process.stderr.write(
2747
2762
  `telegram gateway: silence-poke framework-fallback ended wedged turn ` +
2748
2763
  `chat=${fbChatId} thread=${ctx.threadId ?? '-'} silence_ms=${ctx.silenceMs} ` +
2749
- `currentTurn_nulled=${turnMatchesFallback}\n`,
2764
+ `currentTurn_nulled=${turnMatchesFallback} ` +
2765
+ `drained_buffered=${fbRedeliver.redelivered}/${fbRedeliver.drained}` +
2766
+ `${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}\n`,
2750
2767
  )
2751
2768
  },
2752
2769
  })
@@ -54,6 +54,45 @@ export interface PendingInboundBufferOptions {
54
54
  log?: (line: string) => void
55
55
  }
56
56
 
57
/**
 * Empty `agent`'s queue and offer every queued message to `send`.
 *
 * A message counts as delivered only when `send` returns a truthy
 * value. A falsy return or a thrown error counts as a miss, and the
 * message is pushed back onto the buffer so it is not lost while the
 * bridge is still offline. A miss does not stop the loop: later
 * messages are still attempted.
 *
 * Why this exists: `drain` is otherwise only invoked on bridge
 * re-register (`onClientRegistered`). When a network storm settles with
 * the bridge STILL connected, anything buffered during the flap would
 * stay queued until a manual restart forced a re-register. The
 * silence-poke framework fallback calls this on wedge-clear so the
 * agent recovers on its own (fleet-update thundering-herd incident,
 * 2026-05-19).
 *
 * @param buffer - Buffer to drain and, on a miss, push back into.
 * @param agent - Agent whose queued messages are re-delivered.
 * @param send - Delivery attempt; truthy means delivered.
 * @returns Counts for observability: messages drained, delivered, and
 *   re-buffered.
 */
export function redeliverBufferedInbound(
  buffer: PendingInboundBuffer,
  agent: string,
  send: (msg: InboundMessage) => boolean,
): { drained: number; redelivered: number; rebuffered: number } {
  // A throwing `send` is treated exactly like one that reports failure.
  const attempt = (message: InboundMessage): boolean => {
    try {
      return send(message)
    } catch {
      return false
    }
  }

  const queued = buffer.drain(agent)
  const tally = { redelivered: 0, rebuffered: 0 }
  for (const message of queued) {
    if (attempt(message)) {
      tally.redelivered += 1
      continue
    }
    buffer.push(agent, message)
    tally.rebuffered += 1
  }
  return {
    drained: queued.length,
    redelivered: tally.redelivered,
    rebuffered: tally.rebuffered,
  }
}
95
+
57
96
  export function createPendingInboundBuffer(
58
97
  opts: PendingInboundBufferOptions = {},
59
98
  ): PendingInboundBuffer {
@@ -7,7 +7,7 @@
7
7
  */
8
8
 
9
9
  import { describe, it, expect } from 'vitest'
10
- import { createPendingInboundBuffer, DEFAULT_PENDING_INBOUND_CAP } from '../gateway/pending-inbound-buffer.js'
10
+ import { createPendingInboundBuffer, redeliverBufferedInbound, DEFAULT_PENDING_INBOUND_CAP } from '../gateway/pending-inbound-buffer.js'
11
11
  import type { InboundMessage } from '../gateway/ipc-protocol.js'
12
12
 
13
13
  function inbound(source: string, ts = Date.now()): InboundMessage {
@@ -130,3 +130,73 @@ describe('pending-inbound-buffer', () => {
130
130
  expect(buf.totalDepth()).toBe(1)
131
131
  })
132
132
  })
133
+
134
// Covers redeliverBufferedInbound: full delivery, re-buffering on a
// false return or a throw, partial delivery, the empty-buffer no-op,
// and isolation between agents.
describe('redeliverBufferedInbound — wedge-clear self-heal (fleet-update incident 2026-05-19)', () => {
  // Every case starts from an empty buffer whose logging is silenced.
  const quietBuffer = () => createPendingInboundBuffer({ log: () => {} })

  it('delivers every buffered message and empties the buffer when send succeeds', () => {
    const buffer = quietBuffer()
    buffer.push('klanker', inbound('user', 1))
    buffer.push('klanker', inbound('user', 2))

    const deliveredIds: number[] = []
    const counts = redeliverBufferedInbound(buffer, 'klanker', (message) => {
      deliveredIds.push(message.messageId as number)
      return true
    })

    expect(counts).toEqual({ drained: 2, redelivered: 2, rebuffered: 0 })
    // Delivery order must match insertion order (FIFO).
    expect(deliveredIds).toEqual([1, 2])
    expect(buffer.depth('klanker')).toBe(0)
  })

  it('re-buffers (loses nothing) when the bridge is still offline — send returns false', () => {
    const buffer = quietBuffer()
    buffer.push('klanker', inbound('user', 1))
    buffer.push('klanker', inbound('cron', 2))

    const counts = redeliverBufferedInbound(buffer, 'klanker', () => false)

    expect(counts).toEqual({ drained: 2, redelivered: 0, rebuffered: 2 })
    // Both messages are still queued, in their original order.
    expect(buffer.depth('klanker')).toBe(2)
    const remainingSources = buffer.drain('klanker').map((message) => message.meta?.source)
    expect(remainingSources).toEqual(['user', 'cron'])
  })

  it('treats a throwing send as not-delivered and re-buffers', () => {
    const buffer = quietBuffer()
    buffer.push('klanker', inbound('user', 1))

    const counts = redeliverBufferedInbound(buffer, 'klanker', () => {
      throw new Error('bridge write failed')
    })

    expect(counts).toEqual({ drained: 1, redelivered: 0, rebuffered: 1 })
    expect(buffer.depth('klanker')).toBe(1)
  })

  it('mixed: delivers what it can, re-buffers only the misses', () => {
    const buffer = quietBuffer()
    buffer.push('klanker', inbound('a', 1))
    buffer.push('klanker', inbound('b', 2))
    buffer.push('klanker', inbound('c', 3))

    // Only the second attempt reports failure.
    let attempts = 0
    const counts = redeliverBufferedInbound(buffer, 'klanker', () => {
      attempts += 1
      return attempts !== 2
    })

    expect(counts).toEqual({ drained: 3, redelivered: 2, rebuffered: 1 })
    const remainingSources = buffer.drain('klanker').map((message) => message.meta?.source)
    expect(remainingSources).toEqual(['b'])
  })

  it('is a no-op on an empty buffer (no send calls)', () => {
    const buffer = quietBuffer()

    let sendCalls = 0
    const counts = redeliverBufferedInbound(buffer, 'klanker', () => {
      sendCalls += 1
      return true
    })

    expect(counts).toEqual({ drained: 0, redelivered: 0, rebuffered: 0 })
    expect(sendCalls).toBe(0)
  })

  it('only touches the named agent', () => {
    const buffer = quietBuffer()
    buffer.push('klanker', inbound('user', 1))
    buffer.push('clerk', inbound('user', 2))

    redeliverBufferedInbound(buffer, 'klanker', () => true)

    expect(buffer.depth('klanker')).toBe(0)
    // The other agent's queue is left alone.
    expect(buffer.depth('clerk')).toBe(1)
  })
})