npm - switchroom - Versions diffs - 0.13.33 → 0.13.36 - Mend

switchroom 0.13.33 → 0.13.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/bin/timezone-hook.sh +1 -1
package/dist/agent-scheduler/index.js +8 -1
package/dist/auth-broker/index.js +8 -1
package/dist/cli/switchroom.js +176 -26
package/dist/host-control/main.js +5222 -203
package/dist/vault/approvals/kernel-server.js +9 -2
package/dist/vault/broker/server.js +9 -2
package/package.json +1 -1
package/profiles/default/CLAUDE.md.hbs +1 -1
package/telegram-plugin/dist/gateway/gateway.js +234 -31
package/telegram-plugin/docs/waiting-ux-spec.md +40 -0
package/telegram-plugin/gateway/config-approval-handler.test.ts +188 -1
package/telegram-plugin/gateway/config-approval-handler.ts +170 -15
package/telegram-plugin/gateway/diff-preview-card.test.ts +2 -2
package/telegram-plugin/gateway/diff-preview-card.ts +2 -2
package/telegram-plugin/gateway/drive-write-approval.test.ts +70 -0
package/telegram-plugin/gateway/drive-write-approval.ts +51 -2
package/telegram-plugin/gateway/error-envelope-card.ts +64 -0
package/telegram-plugin/gateway/gateway.ts +112 -15
package/telegram-plugin/gateway/ipc-protocol.ts +10 -1
package/telegram-plugin/gateway/oversize-card-body.test.ts +108 -0
package/telegram-plugin/gateway/oversize-card-body.ts +114 -0
package/telegram-plugin/gateway/unhandled-rejection-policy.ts +46 -1
package/telegram-plugin/hooks/silent-end-interrupt-stop.mjs +118 -41
package/telegram-plugin/hooks/silent-end-scan.mjs +190 -0
package/telegram-plugin/pending-work-progress.ts +37 -1
package/telegram-plugin/tests/boot-clears-clean-shutdown-marker.test.ts +75 -0
package/telegram-plugin/tests/error-envelope-unlock-card.test.ts +79 -0
package/telegram-plugin/tests/pending-work-progress.test.ts +134 -0
package/telegram-plugin/tests/silent-end-integration.test.ts +268 -0
package/telegram-plugin/tests/silent-end-interrupt-stop-integration.test.ts +242 -0
package/telegram-plugin/tests/silent-end-interrupt-stop-scan.test.ts +314 -0
package/telegram-plugin/tests/silent-end.test.ts +227 -38
package/telegram-plugin/tests/unhandled-rejection-policy.test.ts +51 -6

package/telegram-plugin/gateway/error-envelope-card.ts ADDED Viewed

@@ -0,0 +1,64 @@
+/**
+ * Render a one-tap unlock card for hostd error_envelopes that carry a
+ * `flip_yaml_flag` fix (#1758 Phase 1).
+ *
+ * CRITICAL safety: the `yaml_path` MUST be on the
+ * `UNLOCK_CARD_YAML_ALLOWLIST` exported from
+ * `src/host-control/config-edit-validator.ts`. A malformed or hostile
+ * envelope from any backend could otherwise nudge the operator into
+ * one-tap-approving an arbitrary flag flip. Non-allowlisted paths fall
+ * back to plain-text rendering (the caller surfaces `resp.error` as
+ * today).
+ *
+ * Phase 1 scope: ONLY `flip_yaml_flag`. `request_vault_grant` is
+ * explicitly deferred to a later phase (still plain-text rendered).
+ */
+import type { HostdResponse } from "../../src/host-control/protocol.js";
+import { isAllowlistedYamlPath } from "../../src/host-control/config-edit-validator.js";
+import {
+  buildApprovalCard,
+  type BuiltApprovalCard,
+} from "./approval-card.js";
+/**
+ * Result of deciding how to surface a hostd error envelope: either a
+ * built approval card together with the `yaml_path` / `to` pair taken
+ * from the envelope's fix, or a marker telling the caller to keep
+ * rendering the error as plain text.
+ */
+export type UnlockCardOutcome =
+  | { kind: "card"; card: BuiltApprovalCard; yaml_path: string; to: unknown }
+  | { kind: "plain-text" };
+/**
+ * Build the one-tap unlock card for a hostd response, or report that
+ * the caller should keep rendering the error as plain text.
+ *
+ * A card is produced only when the response's `error_envelope`
+ * carries a fix of kind `flip_yaml_flag` AND that fix's `yaml_path`
+ * passes `isAllowlistedYamlPath`. Every other shape yields
+ * `{ kind: "plain-text" }`.
+ *
+ * @param resp - hostd response whose `error_envelope` is inspected.
+ * @param agentName - Forwarded to the card builder as `agent`.
+ * @param approvalRequestId - The 32-hex nonce minted by the approval
+ *   kernel, forwarded as `request_id`. Binding the card to that nonce
+ *   and recording the apply-on-tap intent is the caller's job.
+ * @returns The built card plus the fix's `yaml_path` / `to`, or the
+ *   plain-text marker.
+ */
+export function renderErrorEnvelopeCard(
+  resp: HostdResponse,
+  agentName: string,
+  approvalRequestId: string,
+): UnlockCardOutcome {
+  const envelope = resp.error_envelope;
+  const fix = envelope?.fix;
+  // No envelope, no fix, or a fix other than `flip_yaml_flag`
+  // (request_vault_grant is Phase-2 work; nothing else has an
+  // unlock-card UX): the caller falls back to plain-text rendering.
+  if (!envelope || !fix || fix.kind !== "flip_yaml_flag") {
+    return { kind: "plain-text" };
+  }
+
+  const yaml_path = fix.yaml_path;
+  const to = fix.to;
+  // Defense-in-depth: never render a one-tap card for a path the
+  // operator hasn't explicitly opted into.
+  const allowlisted = isAllowlistedYamlPath(yaml_path);
+  if (!allowlisted) {
+    return { kind: "plain-text" };
+  }
+
+  // Optional `why` detail is appended to the human summary.
+  const whySuffix = envelope.why ? ` — ${envelope.why}` : "";
+  const card = buildApprovalCard({
+    request_id: approvalRequestId,
+    agent: agentName,
+    scope_humanized: `flip ${yaml_path} → ${JSON.stringify(to)}`,
+    why: envelope.human + whySuffix,
+    offer_always: false,
+    offer_ttl: false,
+  });
+  return { kind: "card", card, yaml_path, to };
+}

package/telegram-plugin/gateway/gateway.ts CHANGED Viewed

@@ -319,12 +319,16 @@ import {
 import {
   writeCleanShutdownMarker,
   readCleanShutdownMarker,
-  // clearCleanShutdownMarker is intentionally NOT imported here —
-  // the marker is a single self-overwriting file; staleness is bounded by
-  // `shouldSuppressRecoveryBanner` (DEFAULT_MAX_AGE_MS), so leaving it on
-  // disk is harmless. Pre-#142 the agent-side `session-greeting.sh` did
-  // the cleanup after rendering its "Restarted <reason>" row; that script
-  // was deleted in #142 PR 1.
+  // 2026-05-25 — clearCleanShutdownMarker IS imported and called on every
+  // boot after the marker is read. The earlier "intentionally NOT imported"
+  // comment was wrong: it assumed every shutdown writes a fresh marker, but
+  // unhandledRejection / uncaughtException paths explicitly SKIP the write
+  // (gateway.ts:15107 — "crash path"). A marker from a prior graceful
+  // shutdown then sits on disk for hours and triggers a misleading stale-
+  // marker crash banner on the next boot after an unhandled rejection.
+  // Clearing on boot collapses the marker to "describes the immediately
+  // preceding shutdown only" semantics.
+  clearCleanShutdownMarker,
   shouldSuppressRecoveryBanner,
   resolveShutdownMarker,
   DEFAULT_MAX_AGE_MS as CLEAN_SHUTDOWN_MAX_AGE_MS,
@@ -3434,6 +3438,14 @@ pendingProgress.startTimer({
     )
   },
   emitMetric: (event) => emitRuntimeMetric(event),
+  // #1760 defense-in-depth: if a newer turn for this chat is active at
+  // tick time, the prior turn's pending-progress is stale (the
+  // canonical teardown was missed). Drop the ticker instead of editing
+  // the old anchor — see pending-work-progress.ts's docblock.
+  isActiveTurnNewerThan: (key, activatedAt) => {
+    const turnStartedAt = activeTurnStartedAt.get(key)
+    return turnStartedAt != null && turnStartedAt > activatedAt
+  },
 })
 // Per-agent buffer for synthetic inbounds the gateway couldn't deliver
@@ -4109,6 +4121,24 @@ const ipcServer: IpcServer = createIpcServer({
           )
         }
       },
+      // #1762: send the full diff as a `.patch` attachment when the
+      // card body would exceed Telegram's 4096-char sendMessage limit.
+      postAttachment: async (args) => {
+        const input = new InputFile(Buffer.from(args.content, 'utf8'), args.filename)
+        await robustApiCall(
+          () =>
+            bot.api.sendDocument(args.chatId, input, {
+              ...(args.threadId !== undefined
+                ? { message_thread_id: args.threadId }
+                : {}),
+            }),
+          {
+            chat_id: String(args.chatId),
+            verb: 'config-approval-attachment',
+            ...(args.threadId !== undefined ? { threadId: args.threadId } : {}),
+          },
+        )
+      },
       log: (m) =>
         process.stderr.write(`telegram gateway: config-approval — ${m}\n`),
     })
@@ -4594,12 +4624,21 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
   // #1122 KPI: a `reply` always produces a fresh user-visible outbound
   // message — count it for the outbound-gap / TTFO KPI AND reset the
   // silence-poke clock so the next poke is measured from this send.
-  // Also clear any silent-end state file so the Stop hook doesn't fire
-  // a stale block when the session ends (deterministic restore of the
-  // detection PR3 inadvertently removed).
   signalTracker.noteOutbound(statusKey(chat_id, threadId), Date.now())
   silencePoke.noteOutbound(statusKey(chat_id, threadId), Date.now())
-  clearSilentEndState(statusKey(chat_id, threadId))
+  // #1741 — only clear silent-end state on a plausibly-final reply.
+  // An interim ack (disable_notification:true, short text, no done)
+  // must NOT clear the state file; otherwise a turn that ends with
+  // ack-only + answer-as-transcript leaves no state for the Stop
+  // hook to act on if `turn_end` never lands (the `turn_duration`
+  // system event is unreliable for trivial-prompt turns — see the
+  // executeReply finalize comments). Final-answer replies still
+  // clear; the main turn-end path also re-writes the state when
+  // finalAnswerDelivered=false, so this is a belt-and-braces gate
+  // for the turn_end-missing case (#1741).
+  if (isFinalAnswerReply({ text: rawText, disableNotification })) {
+    clearSilentEndState(statusKey(chat_id, threadId))
+  }
   if (previewMessageId != null && reply_to != null && replyMode !== 'off') {
     await deleteStalePreview(previewMessageId)
@@ -4680,6 +4719,10 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
           //   - #1664 silent-end re-prompt fires even when the
           //     accumulated silent content qualifies as substantive;
           //   - retries within the dedup window may double-send.
+          // #1760 primary fix — clear any stale prior-turn ticker
+          // before re-anchoring on this silent-reply edit. See the
+          // matching comment at the executeReply finalize site below.
+          pendingProgress.clearPending(statusKey(chat_id, threadId), 'reply_finalize')
           pendingProgress.noteOutbound(statusKey(chat_id, threadId), {
             messageId: decision.messageId,
             text: decision.mergedText,
@@ -4873,6 +4916,14 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
   if (sentIds.length === chunks.length && chunks.length > 0) {
     const anchorMsgId = sentIds[chunks.length - 1]
     if (typeof anchorMsgId === 'number') {
+      // #1760 primary fix — clear any stale prior-turn ticker BEFORE
+      // re-anchoring. The canonical teardown wires (turn_end,
+      // subagent_handback, inbound) can be missed (e.g. SDK turn_end
+      // event dropped, as in the #1760 live evidence). Tearing down on
+      // every reply-finalize is idempotent and resilient: it's a no-op
+      // when nothing is active, and drops a stale ambient before the
+      // new turn captures its anchor.
+      pendingProgress.clearPending(statusKey(chat_id, threadId), 'reply_finalize')
       pendingProgress.noteOutbound(statusKey(chat_id, threadId), {
         messageId: anchorMsgId,
         text: chunks[chunks.length - 1],
@@ -5170,7 +5221,19 @@ async function executeStreamReply(args: Record<string, unknown>): Promise<unknow
       const sKey = statusKey(streamChatId, streamThreadId)
       signalTracker.noteOutbound(sKey, Date.now())
       silencePoke.noteOutbound(sKey, Date.now())
-      clearSilentEndState(sKey)
+      // #1741 — see executeReply for the rationale: only a plausibly-
+      // final stream_reply clears the silent-end state. An interim
+      // ack via stream_reply must NOT clear; the Stop hook needs
+      // the state to persist if turn_end fails to land.
+      if (
+        isFinalAnswerReply({
+          text: (args.text as string | undefined) ?? '',
+          disableNotification: args.disable_notification === true,
+          done: args.done === true,
+        })
+      ) {
+        clearSilentEndState(sKey)
+      }
     }
   }
@@ -5298,6 +5361,10 @@ async function executeStreamReply(args: Record<string, unknown>): Promise<unknow
       streamFormat === 'html' ? 'HTML'
       : streamFormat === 'markdownv2' ? 'MarkdownV2'
       : undefined
+    // #1760 primary fix — clear any stale prior-turn ticker before
+    // re-anchoring on stream_reply done. See the matching comment at
+    // the executeReply finalize site.
+    pendingProgress.clearPending(statusKey(sChatId, sThreadId), 'reply_finalize')
     pendingProgress.noteOutbound(statusKey(sChatId, sThreadId), {
       messageId: result.messageId,
       text: args.text as string,
@@ -5320,6 +5387,20 @@ async function executeStreamReply(args: Record<string, unknown>): Promise<unknow
     })
   ) {
     turn.finalAnswerDelivered = true
+    // #1744 follow-up — stream_reply edge case. The first-emit gate at
+    // L5178 only clears silent-end state on the FIRST emit of a stream.
+    // If a stream's first emit was ack-shaped (disable_notification:true,
+    // short text, no done) it correctly did NOT clear the state. But a
+    // LATER emit in the same stream may flip `done=true` or carry
+    // substantive text — that's the real final answer landing, and the
+    // state file must be cleared here too. clearSilentEndState is
+    // idempotent (no-op when the file is absent or the turnKey doesn't
+    // match), so calling it unconditionally on every final-answer-shaped
+    // emit is safe even if the first-emit path already cleared.
+    const streamThreadIdForClear = args.message_thread_id != null
+      ? Number(args.message_thread_id)
+      : undefined
+    clearSilentEndState(statusKey(streamChatId, streamThreadIdForClear))
   }
   // v0.13.30 follow-up — release the buffer gate on every successful
   // stream_reply too. Same rationale as executeReply: short replies
@@ -15618,10 +15699,26 @@ void (async () => {
             } else {
               process.stderr.write(`telegram gateway: boot.clean_shutdown_marker_stale age=${ageSec}s signal=${cleanMarker.signal}${reasonTag}\n`)
             }
-            // No clearCleanShutdownMarker() call — the marker is a single
-            // self-overwriting file, age-gated by shouldSuppressRecoveryBanner,
-            // so leaving it on disk is harmless. (Pre-#142 the agent-side
-            // session-greeting.sh did the cleanup; that script is deleted.)
+            // 2026-05-25 — clear the marker after this boot has read it.
+            // Pre-fix the comment here claimed the file was "self-
+            // overwriting, age-gated, harmless to leave on disk" — that's
+            // true ONLY for the cycle where every shutdown writes a fresh
+            // marker. The unhandledRejection / uncaughtException paths
+            // explicitly SKIP writing (gateway.ts:15107 — the "crash path")
+            // so a marker from an earlier graceful shutdown sits on disk
+            // for hours, then on the next boot looks stale-by-age and
+            // fires a misleading agent-crashed banner with detail
+            // `clean-shutdown marker stale age=39976s` (clerk 2026-05-25
+            // 01:11). Clearing now means the marker only ever describes
+            // the IMMEDIATELY PRECEDING shutdown, not "some shutdown in
+            // the past". After this clear: a subsequent crash with no
+            // marker write = no marker file = correctly classified
+            // 'crash' via the sessionMarker fallback (boot-reason.ts:84);
+            // a graceful shutdown writes a fresh marker that the next
+            // boot reads + clears. The historical session-greeting.sh
+            // ownership the old comment referred to is gone since #142
+            // but the GC step was never re-homed — this is it.
+            clearCleanShutdownMarker(GATEWAY_CLEAN_SHUTDOWN_MARKER_PATH)
           }
           if (marker) {

package/telegram-plugin/gateway/ipc-protocol.ts CHANGED Viewed

@@ -107,8 +107,17 @@ export interface ConfigApprovalResolvedEvent {
   /** Echoes the requestId from the originating request_config_approval. */
   requestId: string;
   verdict: "approve" | "deny" | "timeout";
-  /** Diagnostic detail when present (currently unused; reserved). */
+  /** Diagnostic detail when present. */
   reason?: string;
+  /**
+   * Distinguishes an actual operator tap-deny (`"operator"`) from a
+   * gateway-side dispatch failure that auto-denied because the card
+   * never reached the operator (`"dispatch_failure"`). Only set on
+   * `verdict: "deny"` events. Caller (hostd) maps `dispatch_failure`
+   * to a distinct error code so the failure isn't misattributed to
+   * the operator. Issue #1762.
+   */
+  denySource?: "operator" | "dispatch_failure";
 }
 export type GatewayToClient =

package/telegram-plugin/gateway/oversize-card-body.test.ts ADDED Viewed

@@ -0,0 +1,108 @@
+/**
+ * Tests for the shared truncateRawToFit helper (#1767).
+ *
+ * Covers:
+ *   - No-op when the rendered body already fits.
+ *   - Binary-search shrinks the raw slice; result fits under `cap`.
+ *   - Line-snap when newlines exist; sentinel appended.
+ *   - Char-truncation fallback when a single unbroken line exceeds
+ *     the budget (no `\n` to snap to).
+ *   - Defensive hard-cut when even the framing-alone overflows.
+ */
+import { describe, it, expect } from "vitest";
+import { truncateRawToFit } from "./oversize-card-body.js";
+const SENTINEL = "\n[… truncated]";
+/**
+ * Build a render closure that imitates a `<pre>`-wrapped body under a
+ * fixed header, replacing each `&` in its input with
+ * `escapeMultiplier` consecutive `&` characters to simulate escape
+ * inflation.
+ */
+function frame(escapeMultiplier: number) {
+  return (raw: string) => {
+    const widenedAmp = "&".repeat(escapeMultiplier);
+    const inflated = raw.split("&").join(widenedAmp);
+    return "<b>Hdr</b>\n<pre>" + inflated + "</pre>";
+  };
+}
+// Behavioral suite for truncateRawToFit: no-op fit, binary-search
+// shrink, newline snap, char-truncation fallback, defensive hard cut.
+describe("truncateRawToFit", () => {
+  it("returns the full body unchanged when it fits under cap", () => {
+    const raw = "small content";
+    const { body, truncated } = truncateRawToFit({
+      raw,
+      render: (s) => `<pre>${s}</pre>`,
+      cap: 100,
+      sentinel: SENTINEL,
+    });
+    expect(truncated).toBe(false);
+    expect(body).toBe("<pre>small content</pre>");
+  });
+  it("shrinks raw via binary-search until the rendered body fits the cap (5x escape inflation)", () => {
+    const raw = "&".repeat(2000); // 2000 chars raw, 10000 after 5x inflate
+    const { body, truncated } = truncateRawToFit({
+      raw,
+      render: frame(5),
+      cap: 1000,
+      sentinel: SENTINEL,
+    });
+    expect(truncated).toBe(true);
+    expect(body.length).toBeLessThanOrEqual(1000);
+    expect(body).toContain("truncated");
+  });
+  it("snaps to the last newline within the chosen raw prefix (no mid-line cut)", () => {
+    const raw = ["line-a", "line-b", "line-c", "line-d", "line-e"]
+      .map((l) => l.repeat(50))
+      .join("\n");
+    const { body, truncated } = truncateRawToFit({
+      raw,
+      render: (s) => `<pre>${s}</pre>`,
+      cap: 600,
+      sentinel: SENTINEL,
+    });
+    expect(truncated).toBe(true);
+    // Budget: cap 600 minus `<pre>` + `</pre>` (11) minus the sentinel
+    // (14) leaves 575 raw chars — all of line-a (300), its newline,
+    // and a partial line-b. Snapping must drop that partial line
+    // entirely, so the body is exactly line-a followed by the
+    // sentinel. Pinning the exact string catches ANY mid-line cut,
+    // not just one that happens to end in `line-`.
+    expect(body).toBe(`<pre>${"line-a".repeat(50)}${SENTINEL}</pre>`);
+    expect(body.length).toBeLessThanOrEqual(600);
+  });
+  it("falls through to char-truncation when a single unbroken line exceeds cap", () => {
+    const raw = "x".repeat(5000); // no newlines at all
+    const { body, truncated } = truncateRawToFit({
+      raw,
+      render: (s) => `<pre>${s}</pre>`,
+      cap: 500,
+      sentinel: SENTINEL,
+    });
+    expect(truncated).toBe(true);
+    expect(body.length).toBeLessThanOrEqual(500);
+    // Some of the line content survives — the helper shouldn't
+    // collapse to "<pre>" + sentinel only.
+    expect(body).toMatch(/x{100,}/);
+    expect(body).toContain("truncated");
+  });
+  it("hard-cuts at hardLimit when framing alone overflows (defensive)", () => {
+    // Framing-alone is 5000 chars (way past cap), and the renderer
+    // ignores its input — so no slice of raw can ever fit. The
+    // helper should hard-cut to hardLimit rather than loop forever.
+    const huge = "Z".repeat(5000);
+    const { body, truncated } = truncateRawToFit({
+      raw: "ignored",
+      render: () => huge,
+      cap: 1000,
+      sentinel: SENTINEL,
+      hardLimit: 1100,
+    });
+    expect(truncated).toBe(true);
+    expect(body.length).toBeLessThanOrEqual(1100);
+  });
+});

package/telegram-plugin/gateway/oversize-card-body.ts ADDED Viewed

@@ -0,0 +1,114 @@
+/**
+ * Shared "render-and-fit" helper for approval cards that wrap
+ * user-supplied content in HTML framing. (#1762 / #1767)
+ *
+ * Telegram's `sendMessage` caps the body at 4096 chars and we render
+ * with `parse_mode=HTML`. Worst-case escape inflates raw content up
+ * to 5x (`&` → `&amp;`), so a naive raw-input cap is unsafe — the
+ * post-escape body can blow past the limit and `sendMessage` then
+ * returns a generic 400 that surfaces upstream as a silent
+ * `E_DENIED`.
+ *
+ * This helper binary-searches the largest prefix of the RAW content
+ * whose rendered body still fits under `cap`, snaps to the last
+ * newline so we don't cut mid-line (and never cut mid-entity like
+ * `&am|p;` — raw doesn't contain entities yet), and appends a
+ * sentinel pointing at the attached full content (if any).
+ *
+ * Callers own the framing: pass a `render(slice)` closure that
+ * embeds the slice in whatever escaped envelope they want, and the
+ * helper guarantees the returned `body` fits.
+ *
+ * Both `config-approval-handler.ts` (config-edit diffs) and
+ * `drive-write-approval.ts` (Drive write preview cards) use this.
+ */
+/** Arguments accepted by `truncateRawToFit`. */
+export interface TruncateRawToFitInput {
+  /** Raw, un-escaped content; truncation always keeps a prefix of this string. */
+  raw: string;
+  /**
+   * Build the full rendered card body from a (possibly truncated)
+   * raw slice. The closure owns HTML escaping + all framing. Called
+   * once with the full `raw`, O(log n) times during the binary
+   * search, and once more for the final body; keep it cheap. The
+   * binary search is only meaningful if a longer slice never renders
+   * shorter than a shorter one.
+   */
+  render: (rawSlice: string) => string;
+  /**
+   * Maximum rendered length (chars, measured with `string.length`).
+   * Should be set below Telegram's 4096 hard limit to leave margin
+   * for invisible framing wobble.
+   */
+  cap: number;
+  /**
+   * Marker appended to the truncated slice before re-rendering — e.g.
+   * `"\n[… diff continues, see attached file]"`. The render closure
+   * receives `rawSlice + sentinel` so the marker is visible inside
+   * the same envelope (code block etc.). Not used when the full
+   * content already fits.
+   */
+  sentinel: string;
+  /**
+   * Absolute hard cap for the defensive last-resort cut of the
+   * rendered body; when that cut triggers, at most `hardLimit - 1`
+   * chars are kept. Default `cap + 196`.
+   */
+  hardLimit?: number;
+}
+/** Outcome returned by `truncateRawToFit`. */
+export interface TruncateRawToFitResult {
+  /**
+   * Rendered body. Fits within `cap` whenever the full content, or
+   * some raw prefix plus the sentinel, renders within `cap`. When the
+   * framing alone overflows it may exceed `cap`, and is only cut once
+   * it is longer than `hardLimit` (defensive).
+   */
+  body: string;
+  /** True iff the full render did not fit (raw was sliced or the body hard-cut). */
+  truncated: boolean;
+}
+/**
+ * Drop a trailing UTF-16 high surrogate left behind when a slice
+ * boundary lands in the middle of a surrogate pair.
+ */
+function dropDanglingHighSurrogate(text: string): string {
+  if (text.length === 0) return text;
+  const last = text.charCodeAt(text.length - 1);
+  return last >= 0xd800 && last <= 0xdbff ? text.slice(0, -1) : text;
+}
+/**
+ * Try the full content first; if it fits, return as-is. Otherwise
+ * binary-search the largest raw prefix whose rendered body fits,
+ * snap to the last newline boundary, append the sentinel, re-render
+ * and return.
+ *
+ * When no newline is available to snap to, the prefix is cut by
+ * character; a cut that lands inside a surrogate pair is moved back
+ * one code unit so the body never carries half of an astral
+ * character (emoji etc.).
+ *
+ * Defensive last resort: if even the empty-slice + sentinel render
+ * overflows (means the framing alone exceeds `cap` — caller bug or
+ * adversarial reason field that slipped past clipping), we hard-cut
+ * the rendered body once it is longer than `hardLimit`. Should be
+ * unreachable in production but cheaper than crashing.
+ *
+ * @param input - Raw content, render closure, `cap`, sentinel and
+ *   optional `hardLimit` (defaults to `cap + 196`).
+ * @returns The rendered body and whether truncation happened.
+ */
+export function truncateRawToFit(
+  input: TruncateRawToFitInput,
+): TruncateRawToFitResult {
+  const { raw, render, cap, sentinel } = input;
+  const hardLimit = input.hardLimit ?? cap + 196;
+  const fullBody = render(raw);
+  if (fullBody.length <= cap) {
+    return { body: fullBody, truncated: false };
+  }
+  // Binary-search the largest raw prefix length whose rendered body
+  // fits (with sentinel suffixed before render). We track the best
+  // length so we can snap after the loop.
+  let lo = 0;
+  let hi = raw.length;
+  let bestSliceLen = 0;
+  while (lo <= hi) {
+    const mid = (lo + hi) >>> 1;
+    const candidate = raw.slice(0, mid) + sentinel;
+    if (render(candidate).length <= cap) {
+      bestSliceLen = mid;
+      lo = mid + 1;
+    } else {
+      hi = mid - 1;
+    }
+  }
+  // Snap to the last newline within the chosen raw prefix so we
+  // never cut a line in half. If a single unbroken line exceeds
+  // the budget, fall through with the char-truncated slice — the
+  // caller's framing (e.g. `<pre>`) handles the visual gracefully.
+  let chosenRaw = raw.slice(0, bestSliceLen);
+  const lastNl = chosenRaw.lastIndexOf("\n");
+  if (lastNl > 0) chosenRaw = chosenRaw.slice(0, lastNl);
+  // A char-level cut may have split a surrogate pair; shortening by
+  // one code unit keeps the text well-formed and can only make the
+  // rendered body smaller.
+  chosenRaw = dropDanglingHighSurrogate(chosenRaw);
+  let body = render(chosenRaw + sentinel);
+  // Defensive: framing-alone overflow. Hard-cut so the outbound
+  // sendMessage at least has a chance of succeeding. The end index
+  // is clamped at 0 so a non-positive `hardLimit` cannot turn into a
+  // negative-index slice that keeps almost the whole body.
+  if (body.length > hardLimit) {
+    body = dropDanglingHighSurrogate(
+      body.slice(0, Math.max(0, hardLimit - 1)),
+    );
+  }
+  return { body, truncated: true };
+}

package/telegram-plugin/gateway/unhandled-rejection-policy.ts CHANGED Viewed

@@ -10,13 +10,15 @@
  * Pure helper so it can be tested without spinning up the gateway.
  */
-import { GrammyError } from 'grammy'
+import { GrammyError, HttpError } from 'grammy'
 export type RejectionAction = 'shutdown' | 'log_only'
 export interface RejectionPolicyOptions {
   /** Allow tests to inject error type detection without depending on grammy. */
   isGrammyError?: (err: unknown) => boolean
+  /** Allow tests to inject HttpError detection without depending on grammy. */
+  isHttpError?: (err: unknown) => boolean
 }
 /**
@@ -42,9 +44,52 @@ export function classifyRejection(
       ? opts.isGrammyError(err)
       : err instanceof GrammyError
+  // Transient network-layer failures: grammy throws an `HttpError` wrapping
+  // the underlying fetch failure (ECONNRESET, ETIMEDOUT, fetch failed, DNS
+  // failures, etc.). These are the SAME class `retry-api-call.ts:146-162`
+  // already retries with exponential backoff — if one leaks past the retry
+  // policy (3 attempts exhausted, or a fire-and-forget callsite without
+  // robustApiCall wrapping), crashing the gateway turns one bad packet into
+  // a crash banner. log_only is the right posture: the request failed, the
+  // user-visible UX recovers on the next retry cycle, and a daemon that
+  // crashes on network errors isn't always-on.
+  //
+  // Surfaced 2026-05-25 on clerk via the boot-card sendMessage path: an
+  // HttpError leaked past the boot-card's try/catch (the async post-settle
+  // probe-loop IIFE at boot-card.ts:616 had no .catch on its outer void),
+  // triggering an unhandledRejection → shutdown → user-visible
+  // "agent-crashed" banner for what was really just a transient network hiccup.
+  const isHttp =
+    opts.isHttpError != null
+      ? opts.isHttpError(err)
+      : err instanceof HttpError
+  if (isHttp) return 'log_only'
   if (!isGrammy) return 'shutdown'
   const e = err as { error_code?: number; description?: string }
+  // 429 (Too Many Requests / flood-wait): grammy's flood-wait response.
+  // Already handled in retry-api-call.ts:100-108 with the
+  // `parameters.retry_after` backoff. If one leaks past — caller exceeded
+  // maxRetries=3 of sustained 429s, or didn't wrap in robustApiCall — the
+  // right posture is log_only (matches the HttpError rationale above).
+  // The bot is rate-limited; crashing makes it worse (boot fires more
+  // API calls that hit fresh 429s).
+  //
+  // Surfaced 2026-05-25 on clerk via a sendMessage that exceeded the 3-
+  // attempt retry budget; the rejection bubbled to this handler, triggered
+  // shutdown, and posted an "agent-crashed" operator-event banner.
+  if (e.error_code === 429) return 'log_only'
+  // 5xx (Bad Gateway / Service Unavailable / Gateway Timeout): Telegram
+  // intermittently returns these during their own load events. Same
+  // posture as 429 — retry policy already backs off and re-tries; if
+  // one leaks past, log don't crash.
+  if (typeof e.error_code === 'number' && e.error_code >= 500 && e.error_code < 600) {
+    return 'log_only'
+  }
   if (e.error_code !== 400) return 'shutdown'
   const desc = (e.description ?? '').toLowerCase()