npm - switchroom - Versions diffs - 0.12.14 → 0.12.15 - Mend

switchroom 0.12.14 → 0.12.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/cli/switchroom.js +367 -278
package/dist/vault/approvals/kernel-server.js +68 -1
package/dist/vault/broker/server.js +21 -1
package/package.json +1 -1
package/telegram-plugin/dist/gateway/gateway.js +96 -70
package/telegram-plugin/gateway/approval-callback.test.ts +49 -1
package/telegram-plugin/gateway/approval-callback.ts +85 -67
package/telegram-plugin/gateway/gateway.ts +19 -2
package/telegram-plugin/gateway/pending-inbound-buffer.ts +39 -0
package/telegram-plugin/tests/pending-inbound-buffer.test.ts +71 -1

package/telegram-plugin/gateway/approval-callback.ts CHANGED Viewed

@@ -22,14 +22,58 @@
  */
 import { type Context, InlineKeyboard } from "grammy";
-import { parseApprovalCallback, ttlMsFromToken } from "./approval-card.js";
 import {
-  approvalConsume,
-  approvalRecord,
-} from "../../src/vault/approvals/client.js";
+  parseApprovalCallback,
+  ttlMsFromToken,
+  type ApprovalChoice,
+} from "./approval-card.js";
+import { approvalConsumeRecord } from "../../src/vault/approvals/client.js";
 import type { ApprovalDecisionMode } from "../../src/vault/approvals/schema.js";
 import { scopeToOpenInDriveButton } from "../../src/drive/deep-links.js";
+/**
+ * Resolve a tapped approval choice to its decision tuple — PURE, no
+ * kernel I/O, so the `bad ttl token` branch (the only fallible path in
+ * the old inline switch) is unit-testable without mocking grammy.
+ *
+ * Extracted (PR-5) from `handleApprovalCallback` so PR-4's invariant —
+ * "compute + validate the decision BEFORE burning the single-use
+ * nonce" — is now structural, not a comment: the handler calls this
+ * first and only proceeds to `approvalConsumeRecord` on `ok: true`. A
+ * malformed ttl token returns `{ ok: false }` and the nonce is never
+ * touched (operator can re-tap a valid choice).
+ *
+ * Result shape:
+ *  - `ok: true`  — `decision`, `granted`, `ttl_ms` and `displayMode` are
+ *    all set; `ttl_ms` is non-null only for the `ttl` choice, and
+ *    `granted` is false only for `deny`.
+ *  - `ok: false` — `error` is the text the handler passes to
+ *    `answerCallbackQuery` (the only value produced here is
+ *    "bad ttl token").
+ */
+export type ResolvedApprovalDecision =
+  | {
+      ok: true;
+      decision: ApprovalDecisionMode;
+      granted: boolean;
+      ttl_ms: number | null;
+      displayMode: string;
+    }
+  | { ok: false; error: string };
+export function resolveApprovalDecision(
+  choice: ApprovalChoice,
+): ResolvedApprovalDecision {
+  switch (choice.kind) { // no default arm — assumes ApprovalChoice.kind is exactly these four variants; confirm in approval-card
+    case "deny":
+      return { ok: true, decision: "deny", granted: false, ttl_ms: null, displayMode: "denied" };
+    case "once":
+      // No expiry — recorded as a one-shot grant; the agent calls
+      // approval_lookup at most once, then proceeds. /approvals revoke
+      // can still target the row by id.
+      return { ok: true, decision: "allow_once", granted: true, ttl_ms: null, displayMode: "granted once" };
+    case "always":
+      return { ok: true, decision: "allow_always", granted: true, ttl_ms: null, displayMode: "granted always" };
+    case "ttl": { // the only branch that can fail: ttlMsFromToken returns null for a token it rejects
+      const ms = ttlMsFromToken(choice.param);
+      if (ms === null) return { ok: false, error: "bad ttl token" };
+      return { ok: true, decision: "allow_ttl", granted: true, ttl_ms: ms, displayMode: `granted for ${choice.param}` };
+    }
+  }
+}
 /**
  * Build the post-tap keyboard for a granted decision. Today this is
  * just the `[ 📖 Open in Drive ]` button when the granted scope names
@@ -57,66 +101,21 @@ export async function handleApprovalCallback(
     return;
   }
-  // Compute decision + ttl from the choice variant BEFORE burning the
-  // single-use nonce. This block has a fallible early-return (the
-  // `bad ttl token` path). Pre-fix it ran AFTER approvalConsume(), so a
-  // malformed ttl token burned the nonce but recorded no decision — the
-  // agent's approval_lookup poll never saw a verdict and the turn
-  // wedged (pre-PR-3: forever; now bounded by PR-3's PERMISSION_TTL
-  // auto-deny). approvalConsume stays the atomic single-use guard; it
-  // simply doesn't fire until we have a valid decision to record
-  // immediately after. There is now NO fallible step between
-  // consume→record; the only residual gap is the inherent 1-RPC
-  // consume/record non-atomicity (backstopped by PR-3's TTL auto-deny;
-  // a fully atomic kernel consume+record is a tracked follow-up).
-  let decision: ApprovalDecisionMode;
-  let granted: boolean;
-  let ttl_ms: number | null = null;
-  let displayMode: string;
-  switch (parsed.choice.kind) {
-    case "deny":
-      decision = "deny";
-      granted = false;
-      displayMode = "denied";
-      break;
-    case "once":
-      decision = "allow_once";
-      granted = true;
-      // No expiry — recorded as a one-shot grant; the agent calls
-      // approval_lookup at most once, then proceeds. /approvals revoke
-      // can still target the row by id.
-      displayMode = "granted once";
-      break;
-    case "always":
-      decision = "allow_always";
-      granted = true;
-      displayMode = "granted always";
-      break;
-    case "ttl": {
-      decision = "allow_ttl";
-      granted = true;
-      const ms = ttlMsFromToken(parsed.choice.param);
-      if (ms === null) {
-        await ctx.answerCallbackQuery({ text: "bad ttl token" });
-        return;
-      }
-      ttl_ms = ms;
-      displayMode = `granted for ${parsed.choice.param}`;
-      break;
-    }
-  }
-  const consumed = await approvalConsume(parsed.request_id);
-  if (consumed === null) {
-    await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
-    return;
-  }
-  if (!consumed.consumed) {
-    // Single-use enforcement: someone already tapped, or the nonce
-    // expired/unknown. Match the RFC §8.1 wording.
-    await ctx.answerCallbackQuery({ text: "this prompt expired" });
+  // Resolve + validate the decision BEFORE burning the single-use
+  // nonce (PR-4 invariant, now structural via the pure
+  // resolveApprovalDecision — see its doc). A malformed ttl token
+  // returns { ok: false } here and the nonce is never touched, so the
+  // operator can re-tap a valid choice; pre-fix this validation ran
+  // AFTER approvalConsume(), burning the nonce with no decision
+  // recorded → the agent's approval_lookup poll never saw a verdict
+  // and the turn wedged. There is now NO fallible step between this
+  // validation and the atomic consume+record call below.
+  const resolved = resolveApprovalDecision(parsed.choice);
+  if (!resolved.ok) {
+    await ctx.answerCallbackQuery({ text: resolved.error });
     return;
   }
+  const { decision, granted, ttl_ms, displayMode } = resolved;
   const granted_by_user_id = ctx.from?.id ?? 0;
   // Approver set at decision time = the chat that received the card. We
@@ -125,18 +124,37 @@ export async function handleApprovalCallback(
   // when each surface migrates and starts passing access.allowFrom.
   const approver_set = [String(granted_by_user_id)];
-  const decision_id = await approvalRecord({
+  // PR-6: atomic consume+record — ONE round-trip; the kernel burns the
+  // single-use nonce AND writes the decision in one SQLite transaction.
+  // If the record fails the burn rolls back, so `null` genuinely means
+  // "nothing happened, safe to retry" — there is no burned-nonce /
+  // no-decision wedge any more (the residual the shipped permission-TTL
+  // auto-deny used to backstop). resolveApprovalDecision already
+  // validated the ttl above, so no fallible step precedes this call.
+  const result = await approvalConsumeRecord({
     request_id: parsed.request_id,
     decision,
     approver_set,
     granted_by_user_id,
     ttl_ms,
   });
-  if (decision_id === null) {
+  if (result === null) {
+    await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
+    return;
+  }
+  if (!result.consumed) {
+    // Already tapped / expired / unknown — single-use is enforced
+    // kernel-side and NO decision was written. RFC §8.1 wording.
+    await ctx.answerCallbackQuery({ text: "this prompt expired" });
+    return;
+  }
+  if (!result.decision_id) {
+    // Defensive: consumed:true must carry a decision_id. Kept distinct
+    // from the unreachable message for operator triage.
     await ctx.answerCallbackQuery({ text: "kernel record failed" });
     return;
   }
+  const decision_id: string = result.decision_id;
   // Edit the original card to its post-tap state. Drop the original
   // action keyboard either way; on a successful grant for a Drive
@@ -149,8 +167,8 @@ export async function handleApprovalCallback(
       ? ` · /approvals revoke <code>${decision_id}</code>`
       : "");
-  const postTapKeyboard = granted && consumed.scope
-    ? buildGrantedKeyboard(consumed.scope)
+  const postTapKeyboard = granted && result.scope
+    ? buildGrantedKeyboard(result.scope)
     : undefined;
   try {

package/telegram-plugin/gateway/gateway.ts CHANGED Viewed

@@ -245,7 +245,7 @@ import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
 import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
 import { handleRequestDriveApproval } from './drive-write-approval.js'
 import { buildDiffPreviewCard } from './diff-preview-card.js'
-import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
+import { createPendingInboundBuffer, redeliverBufferedInbound } from './pending-inbound-buffer.js'
 import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
 import {
   buildVaultGrantApprovedInbound,
@@ -2743,10 +2743,27 @@ silencePoke.startTimer({
     try {
       clearSilentEndState(fbKey)
     } catch { /* best-effort */ }
+    // Self-heal the inbound buffer. pendingInboundBuffer otherwise
+    // drains ONLY on bridge re-register (onClientRegistered). After a
+    // network storm that settles with the bridge STILL connected, user
+    // messages buffered during the flap sit forever — until a manual
+    // restart forces a re-register (the fleet-update thundering-herd
+    // incident, 2026-05-19: agents "not responding", logs show
+    // pending-inbound-buffer depth>0 with no drain). Flushing on
+    // wedge-clear makes the agent self-heal. selfAgent-keyed; a miss
+    // re-buffers so nothing is lost if the bridge is genuinely offline.
+    const fbSelfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
+    const fbRedeliver = redeliverBufferedInbound(
+      pendingInboundBuffer,
+      fbSelfAgent,
+      (m) => ipcServer.sendToAgent(fbSelfAgent, m),
+    )
     process.stderr.write(
       `telegram gateway: silence-poke framework-fallback ended wedged turn ` +
       `chat=${fbChatId} thread=${ctx.threadId ?? '-'} silence_ms=${ctx.silenceMs} ` +
-      `currentTurn_nulled=${turnMatchesFallback}\n`,
+      `currentTurn_nulled=${turnMatchesFallback} ` +
+      `drained_buffered=${fbRedeliver.redelivered}/${fbRedeliver.drained}` +
+      `${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}\n`,
     )
   },
 })

package/telegram-plugin/gateway/pending-inbound-buffer.ts CHANGED Viewed

@@ -54,6 +54,45 @@ export interface PendingInboundBufferOptions {
   log?: (line: string) => void
 }
+/**
+ * Drain `agent`'s buffered inbound and re-deliver each via `send`. A
+ * `send` returning false (or throwing) means "not delivered" — the
+ * message is re-buffered so nothing is lost when the bridge is still
+ * offline. Returns counts for observability.
+ *
+ * This exists because `drain` is otherwise only called on bridge
+ * re-register (`onClientRegistered`). After a network storm that
+ * settles with the bridge STILL connected, messages buffered during
+ * the flap never drain — they sit until a manual restart forces a
+ * re-register. The silence-poke framework fallback calls this on
+ * wedge-clear so the agent self-heals (fleet-update thundering-herd
+ * incident, 2026-05-19).
+ *
+ * @param buffer - buffer that is drained and, on a miss, pushed back into.
+ * @param agent - key whose pending messages are drained; misses are
+ *   re-buffered under the same key.
+ * @param send - synchronous delivery attempt; `true` means delivered,
+ *   `false` or a throw means not delivered.
+ * @returns `drained` (messages taken out of the buffer), `redelivered`
+ *   (sends that returned true) and `rebuffered` (messages pushed back);
+ *   `redelivered + rebuffered` always equals `drained`.
+ *
+ * NOTE(review): the loop keeps sending after a failed send, so a
+ * re-buffered message can end up behind later messages that were
+ * delivered in the same pass. Re-buffering goes through `buffer.push`,
+ * so any cap or eviction the buffer applies on push presumably applies
+ * here as well — confirm against `createPendingInboundBuffer`.
+ */
+export function redeliverBufferedInbound(
+  buffer: PendingInboundBuffer,
+  agent: string,
+  send: (msg: InboundMessage) => boolean,
+): { drained: number; redelivered: number; rebuffered: number } {
+  const pending = buffer.drain(agent)
+  let redelivered = 0
+  let rebuffered = 0
+  for (const msg of pending) {
+    let delivered = false
+    try {
+      delivered = send(msg)
+    } catch { // the thrown error is discarded; only the not-delivered outcome is kept
+      delivered = false
+    }
+    if (delivered) {
+      redelivered++
+    } else {
+      buffer.push(agent, msg) // put the miss back under the same agent key
+      rebuffered++
+    }
+  }
+  return { drained: pending.length, redelivered, rebuffered }
+}
 export function createPendingInboundBuffer(
   opts: PendingInboundBufferOptions = {},
 ): PendingInboundBuffer {

package/telegram-plugin/tests/pending-inbound-buffer.test.ts CHANGED Viewed

@@ -7,7 +7,7 @@
  */
 import { describe, it, expect } from 'vitest'
-import { createPendingInboundBuffer, DEFAULT_PENDING_INBOUND_CAP } from '../gateway/pending-inbound-buffer.js'
+import { createPendingInboundBuffer, redeliverBufferedInbound, DEFAULT_PENDING_INBOUND_CAP } from '../gateway/pending-inbound-buffer.js'
 import type { InboundMessage } from '../gateway/ipc-protocol.js'
 function inbound(source: string, ts = Date.now()): InboundMessage {
@@ -130,3 +130,73 @@ describe('pending-inbound-buffer', () => {
     expect(buf.totalDepth()).toBe(1)
   })
 })
+describe('redeliverBufferedInbound — wedge-clear self-heal (fleet-update incident 2026-05-19)', () => {
+  it('delivers every buffered message and empties the buffer when send succeeds', () => {
+    const buf = createPendingInboundBuffer({ log: () => {} })
+    buf.push('klanker', inbound('user', 1))
+    buf.push('klanker', inbound('user', 2))
+    const seen: number[] = []
+    const r = redeliverBufferedInbound(buf, 'klanker', (m) => {
+      seen.push(m.messageId as number) // NOTE(review): assumes inbound() derives messageId from its 2nd argument — confirm in the helper
+      return true
+    })
+    expect(r).toEqual({ drained: 2, redelivered: 2, rebuffered: 0 })
+    expect(seen).toEqual([1, 2]) // FIFO preserved
+    expect(buf.depth('klanker')).toBe(0)
+  })
+  it('re-buffers (loses nothing) when the bridge is still offline — send returns false', () => {
+    const buf = createPendingInboundBuffer({ log: () => {} })
+    buf.push('klanker', inbound('user', 1))
+    buf.push('klanker', inbound('cron', 2))
+    const r = redeliverBufferedInbound(buf, 'klanker', () => false)
+    expect(r).toEqual({ drained: 2, redelivered: 0, rebuffered: 2 })
+    expect(buf.depth('klanker')).toBe(2) // still there, nothing lost
+    expect(buf.drain('klanker').map((m) => m.meta?.source)).toEqual(['user', 'cron']) // re-buffered in the original order
+  })
+  it('treats a throwing send as not-delivered and re-buffers', () => {
+    const buf = createPendingInboundBuffer({ log: () => {} })
+    buf.push('klanker', inbound('user', 1))
+    const r = redeliverBufferedInbound(buf, 'klanker', () => {
+      throw new Error('bridge write failed')
+    })
+    expect(r).toEqual({ drained: 1, redelivered: 0, rebuffered: 1 })
+    expect(buf.depth('klanker')).toBe(1)
+  })
+  it('mixed: delivers what it can, re-buffers only the misses', () => {
+    const buf = createPendingInboundBuffer({ log: () => {} })
+    buf.push('klanker', inbound('a', 1))
+    buf.push('klanker', inbound('b', 2))
+    buf.push('klanker', inbound('c', 3))
+    let n = 0
+    const r = redeliverBufferedInbound(buf, 'klanker', () => {
+      n++
+      return n !== 2 // 2nd send fails
+    })
+    expect(r).toEqual({ drained: 3, redelivered: 2, rebuffered: 1 })
+    expect(buf.drain('klanker').map((m) => m.meta?.source)).toEqual(['b']) // only the failed 2nd message remains; 'a' and 'c' went out
+  })
+  it('is a no-op on an empty buffer (no send calls)', () => {
+    const buf = createPendingInboundBuffer({ log: () => {} })
+    let calls = 0
+    const r = redeliverBufferedInbound(buf, 'klanker', () => {
+      calls++
+      return true
+    })
+    expect(r).toEqual({ drained: 0, redelivered: 0, rebuffered: 0 })
+    expect(calls).toBe(0) // send is never invoked when nothing was buffered
+  })
+  it('only touches the named agent', () => {
+    const buf = createPendingInboundBuffer({ log: () => {} })
+    buf.push('klanker', inbound('user', 1))
+    buf.push('clerk', inbound('user', 2))
+    redeliverBufferedInbound(buf, 'klanker', () => true)
+    expect(buf.depth('klanker')).toBe(0)
+    expect(buf.depth('clerk')).toBe(1) // untouched
+  })
+})