npm - switchroom - Versions diffs - 0.15.41 → 0.15.43 - Mend

switchroom 0.15.41 → 0.15.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/dist/agent-scheduler/index.js +2 -1
package/dist/auth-broker/index.js +2 -1
package/dist/cli/notion-write-pretool.mjs +2 -1
package/dist/cli/switchroom.js +157 -13
package/dist/cli/ui/index.html +31 -0
package/dist/host-control/main.js +2 -1
package/dist/vault/approvals/kernel-server.js +2 -1
package/dist/vault/broker/server.js +2 -1
package/package.json +1 -1
package/telegram-plugin/dist/gateway/gateway.js +397 -226
package/telegram-plugin/gateway/context-occupancy.ts +91 -0
package/telegram-plugin/gateway/gateway.ts +204 -63
package/telegram-plugin/gateway/hostd-dispatch.ts +1 -1
package/telegram-plugin/gateway/idle-clear.ts +72 -0
package/telegram-plugin/gateway/poll-health.ts +9 -4
package/telegram-plugin/gateway/poll-stall-recovery.ts +59 -0
package/telegram-plugin/tests/context-occupancy.test.ts +55 -0
package/telegram-plugin/tests/idle-clear.test.ts +62 -0
package/telegram-plugin/tests/poll-stall-recovery.test.ts +32 -0
package/telegram-plugin/tests/welcome-text.test.ts +10 -11
package/telegram-plugin/welcome-text.ts +11 -12

package/telegram-plugin/gateway/context-occupancy.ts ADDED Viewed

@@ -0,0 +1,91 @@
+/**
+ * Context-headroom snapshot (RFC reference/rfcs/context-headroom-surface.md).
+ *
+ * The gateway already computes working-context occupancy at the turn-end idle
+ * gate for proactive-compaction (gateway.ts ~3000). This module turns that
+ * value into a small on-disk snapshot the host surfaces (`switchroom status` /
+ * `doctor` / web) read, so the operator can SEE each agent's headroom-to-
+ * compaction — the predictability won by ENABLE_TOOL_SEARCH=true made visible.
+ *
+ * Pure + side-effect-light so it's unit-testable; the write is best-effort and
+ * never throws (a missing snapshot reads as `unknown`, never an error).
+ */
+import { mkdirSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+/** Filename written under the agent's state dir. Shared with the host reader. */
+export const CONTEXT_OCCUPANCY_FILENAME = "context-occupancy.json";
+/**
+ * Occupancy ≥ this fraction of the cap → "tight" (compaction imminent).
+ * Compared against `pct` with `>=`, so a ratio of exactly 0.8 already counts
+ * as tight.
+ */
+export const TIGHT_FRACTION = 0.8;
+export type ContextState = "ok" | "tight" | "unknown";
+export interface ContextOccupancy {
+  /**
+   * Live working-context tokens (latest turn input + cache_read + cache_creation).
+   * Reported as 0 when `state` is "unknown" (the measured value was unusable).
+   */
+  occupancy: number;
+  /**
+   * session.max_context_tokens, or null when unset (native compaction only).
+   * With a usable occupancy, a cap that is <= 0 or NaN is also stored as null.
+   */
+  cap: number | null;
+  /**
+   * cap - occupancy, or null when no cap (or when state is "unknown").
+   * Goes negative once occupancy exceeds the cap.
+   */
+  headroom: number | null;
+  /** occupancy / cap (0..1+), or null when no cap (or when state is "unknown"). */
+  pct: number | null;
+  /**
+   * ok / tight / unknown. `unknown` only when occupancy is unmeasurable;
+   * `tight` when pct >= TIGHT_FRACTION; `ok` otherwise, including "no cap".
+   */
+  state: ContextState;
+  /** epoch ms (host/container clock) when computed. */
+  computedAt: number;
+}
+/**
+ * Build the snapshot from a measured occupancy + the resolved cap. Pure.
+ *   - occupancy negative or non-finite (NaN, ±Infinity) → state "unknown":
+ *     occupancy is reported as 0, headroom/pct are null, and the cap is
+ *     echoed back un-normalized (only null/undefined become null here, so a
+ *     zero or negative cap is passed through as given).
+ *   - cap null/undefined, NaN, or <= 0 → treated as "no cap": no ratio;
+ *     state "ok" (occupancy known, just no ceiling set).
+ *   - otherwise → headroom = cap - occupancy (negative when over the cap),
+ *     pct = occupancy / cap, state "tight" once pct >= TIGHT_FRACTION.
+ *
+ * @param occupancy Measured working-context tokens.
+ * @param cap Configured token ceiling, or null/undefined when none is set.
+ * @param now Timestamp stored verbatim as `computedAt`.
+ * @returns A newly built snapshot object; nothing is written to disk here.
+ */
+export function buildContextOccupancy(
+  occupancy: number,
+  cap: number | null | undefined,
+  now: number,
+): ContextOccupancy {
+  if (!Number.isFinite(occupancy) || occupancy < 0) {
+    return { occupancy: 0, cap: cap ?? null, headroom: null, pct: null, state: "unknown", computedAt: now };
+  }
+  const c = cap != null && cap > 0 ? cap : null; // NaN fails `> 0`, so it is treated as "no cap" too
+  if (c == null) {
+    return { occupancy, cap: null, headroom: null, pct: null, state: "ok", computedAt: now };
+  }
+  const pct = occupancy / c;
+  return {
+    occupancy,
+    cap: c,
+    headroom: c - occupancy,
+    pct,
+    state: pct >= TIGHT_FRACTION ? "tight" : "ok",
+    computedAt: now,
+  };
+}
+/**
+ * Write `<stateDir>/context-occupancy.json`. Best-effort — callers wrap in
+ * try/catch but this also swallows internally so a write failure never
+ * disrupts the turn-end gate.
+ *
+ * Creates `stateDir` (recursively) first, then writes the snapshot as
+ * 2-space-indented JSON followed by a newline. The file is written in place
+ * (no temp file + rename), so presumably a concurrent reader could observe a
+ * partially written file — NOTE(review): confirm the host reader tolerates it.
+ *
+ * @param stateDir Directory that receives the snapshot file.
+ * @param snapshot Value to serialize.
+ * @param deps Optional `mkdir` and `writeFile` replacements (e.g. for tests);
+ *   each one defaults to the synchronous node:fs call. Errors thrown by an
+ *   injected replacement are swallowed as well.
+ */
+export function writeContextOccupancySnapshot(
+  stateDir: string,
+  snapshot: ContextOccupancy,
+  deps?: {
+    mkdir?: (p: string, o: { recursive: true }) => void;
+    writeFile?: (p: string, d: string) => void;
+  },
+): void {
+  try {
+    const path = join(stateDir, CONTEXT_OCCUPANCY_FILENAME);
+    (deps?.mkdir ?? ((p, o) => mkdirSync(p, o)))(stateDir, { recursive: true });
+    (deps?.writeFile ?? ((p, d) => writeFileSync(p, d)))(
+      path,
+      JSON.stringify(snapshot, null, 2) + "\n",
+    );
+  } catch {
+    /* best-effort — never break the turn-end gate */
+  }
+}

package/telegram-plugin/gateway/gateway.ts CHANGED Viewed

@@ -208,7 +208,6 @@ import {
   switchroomHelpText as buildSwitchroomHelpText,
   restartAckText as buildRestartAckText,
   newSessionAckText as buildNewSessionAckText,
-  resetSessionAckText as buildResetSessionAckText,
   TELEGRAM_BASE_COMMANDS,
   TELEGRAM_SWITCHROOM_COMMANDS,
   type AgentMetadata, type AuthSummary, type StatusProbeRow,
@@ -266,7 +265,7 @@ import {
 import { DEFAULT_SLOT } from '../../src/auth/accounts.js'
 import { currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
 import { injectSlashCommand as injectSlashCommandImpl } from '../../src/agents/inject.js'
-import { handleInjectCommand } from './inject-handler.js'
+import { handleInjectCommand, type InjectDeps } from './inject-handler.js'
 import {
   parseModelCommand,
   handleModelCommand,
@@ -293,7 +292,9 @@ import { refreshBanner } from '../slot-banner-driver.js'
 import { loadConfig as loadSwitchroomConfig, findConfigFile as findSwitchroomConfigFile } from '../../src/config/loader.js'; import { resolveAgentConfig } from '../../src/config/merge.js'
 import { resolveOutboundTopic as resolveOutboundTopicHelper, topicForRecipient, type TopicRouterConfig as _OutboundRouterConfig } from '../../src/telegram/topic-router.js'
 import { readTurnUsages } from '../../src/agents/perf.js'
+import { buildContextOccupancy, writeContextOccupancySnapshot } from './context-occupancy.js'
 import { decideProactiveCompact, initialCompactState, type CompactState } from './proactive-compact.js'
+import { decideIdleClear, idleDurationToMs, DEFAULT_IDLE_CLEAR_MS } from './idle-clear.js'
 import { nextCompactNotify, idleCompactNotifyState, type CompactNotifyState } from './compact-notify.js'
 import {
   tryHostdDispatch,
@@ -369,6 +370,7 @@ import {
   foregroundFinishAction,
 } from './foreground-nesting.js'
 import { createPollHealthCheck, type PollHealthCheckHandle } from './poll-health.js'
+import { recoverFromPollStall } from './poll-stall-recovery.js'
 import type {
   ToolCallMessage,
   ToolCallResult,
@@ -1351,6 +1353,11 @@ function checkApprovals(): void {
   }
 }
 if (!STATIC) setInterval(checkApprovals, 5000).unref()
+// Idle auto-clear: check wall-clock idle every IDLE_CLEAR_CHECK_MS (default 60s, env-overridable);
+// maybeIdleClear no-ops when disabled ('0s'), mid-turn, or already cleared this idle period.
+// maybeIdleClear is a hoisted declaration; its `let` state is initialized before the first tick fires.
+const IDLE_CLEAR_CHECK_MS = Number(process.env.SWITCHROOM_IDLE_CLEAR_CHECK_MS ?? 60_000) // '' → 0, non-numeric → NaN
+if (!STATIC && IDLE_CLEAR_CHECK_MS > 0) setInterval(maybeIdleClear, IDLE_CLEAR_CHECK_MS).unref() // 0 / NaN / negative → no timer at all
 // ─── Thread / status / stream state ───────────────────────────────────────
 const chatThreadMap = new Map<string, number>()
@@ -2796,6 +2803,45 @@ function purgeReactionTracking(key: string, endingTurn?: CurrentTurn): void {
       // moot, so only evaluate when no restart drained this pass.
       maybeProactiveCompact();
     }
+    // Context-headroom snapshot (RFC context-headroom-surface) — write the
+    // current occupancy + cap so `switchroom status`/`doctor`/web can show
+    // headroom. Independent of proactive-compaction (writes even when no cap
+    // is configured) and best-effort (never throws). Runs on the same idle
+    // signal — never mid-turn.
+    snapshotContextOccupancy();
+  }
+}
+/**
+ * Write the per-agent context-occupancy snapshot from the same live occupancy
+ * proactive-compaction reads — but unconditionally (even with no cap set), so
+ * the operator always sees headroom. Best-effort; never throws.
+ *
+ * Returns without writing when SWITCHROOM_AGENT_NAME or
+ * `lastSessionActiveFile` is unset, or when `readTurnUsages(file, 1)` yields
+ * no turns. The cap is re-resolved from the switchroom config on every call;
+ * if that lookup throws, the snapshot is still written, just without a cap.
+ */
+function snapshotContextOccupancy(): void {
+  try {
+    const agentName = process.env.SWITCHROOM_AGENT_NAME;
+    const file = lastSessionActiveFile;
+    if (!agentName || !file) return;
+    const turns = readTurnUsages(file, 1);
+    if (turns.length === 0) return;
+    const t = turns[0];
+    const occupancy = t.input + t.cacheRead + t.cacheCreate; // sum of the three usage counters of that turn
+    let cap: number | null = null;
+    try {
+      const cfg = loadSwitchroomConfig();
+      const rawAgent = cfg.agents?.[agentName] ?? {};
+      const resolved = resolveAgentConfig(cfg.defaults, cfg.profiles, rawAgent);
+      cap = resolved.session?.max_context_tokens ?? null;
+    } catch {
+      cap = null; // config unreadable → show occupancy without a ratio
+    }
+    const stateDir = process.env.SWITCHROOM_AGENT_STATE_DIR ?? "/state/agent"; // fallback path when the env var is unset
+    writeContextOccupancySnapshot(
+      stateDir,
+      buildContextOccupancy(occupancy, cap, Date.now()),
+    );
+  } catch {
+    /* best-effort — never disrupt the idle gate */
   }
 }
@@ -3057,6 +3103,111 @@ function maybeProactiveCompact(): void {
     });
 }
+// ─── Idle auto-clear ──────────────────────────────────────────────────────
+// Wall-clock idle → /clear (idle-clear.ts). Independent of proactive-compact
+// (occupancy-driven at the turn-end gate): a fully-idle agent never ends a
+// turn, so this runs on its own interval. Any activity (inbound / turn start /
+// cron fire) resets the timer via markIdleActivity(); fires once per idle
+// period; never mid-turn (turnInFlightForGate, the same gate compaction uses).
+let lastIdleActivityAt = Date.now(); // epoch ms of the latest activity; starts at module load
+let idleAutoCleared = false; // true once /clear was dispatched for the current idle period
+let idleClearDispatching = false; // true while a /clear inject is in flight (re-entry guard)
+/**
+ * Reset the idle timer + re-arm auto-clear. Call on ANY activity.
+ * Only touches `lastIdleActivityAt` and `idleAutoCleared`; an in-flight
+ * dispatch (`idleClearDispatching`) is left alone.
+ */
+function markIdleActivity(): void {
+  lastIdleActivityAt = Date.now();
+  idleAutoCleared = false;
+}
+/**
+ * Idle window in ms: env override → per-agent config → 3h default. 0 disables.
+ *
+ * - SWITCHROOM_IDLE_CLEAR_MS, when set and non-empty, wins outright and is
+ *   read as a plain number of milliseconds (duration strings such as "3h"
+ *   are not parsed here); a negative or non-finite value yields the default.
+ * - Otherwise `session.idle_clear_after` from the resolved agent config is
+ *   parsed with idleDurationToMs; unset or malformed yields the default.
+ * - A missing SWITCHROOM_AGENT_NAME, or any error while loading/resolving
+ *   the config, also yields the default.
+ *
+ * Called from maybeIdleClear on each evaluation, so the config is re-read
+ * every time rather than cached.
+ */
+function resolveIdleClearMs(): number {
+  const env = process.env.SWITCHROOM_IDLE_CLEAR_MS;
+  if (env != null && env !== '') {
+    const n = Number(env);
+    return Number.isFinite(n) && n >= 0 ? n : DEFAULT_IDLE_CLEAR_MS;
+  }
+  try {
+    const agentName = process.env.SWITCHROOM_AGENT_NAME;
+    if (!agentName) return DEFAULT_IDLE_CLEAR_MS;
+    const cfg = loadSwitchroomConfig();
+    const rawAgent = cfg.agents?.[agentName] ?? {};
+    const resolved = resolveAgentConfig(cfg.defaults, cfg.profiles, rawAgent);
+    const raw = resolved.session?.idle_clear_after;
+    if (raw == null) return DEFAULT_IDLE_CLEAR_MS; // unset → on by default (3h)
+    const ms = idleDurationToMs(raw);
+    return ms == null ? DEFAULT_IDLE_CLEAR_MS : ms; // malformed duration → default, not disabled
+  } catch {
+    return DEFAULT_IDLE_CLEAR_MS; // config unreadable → keep the default on
+  }
+}
+/**
+ * Evaluate idle auto-clear (runs on IDLE_CLEAR_CHECK_MS interval).
+ *
+ * No-ops while a previous dispatch is still in flight, when
+ * SWITCHROOM_AGENT_NAME is unset, or when decideIdleClear says not to clear.
+ * Otherwise it logs to stderr, injects `/clear`, and posts the operator
+ * notice once the inject resolves. An inject failure is only logged:
+ * `idleAutoCleared` stays true, so there is no retry until the next activity
+ * re-arms it via markIdleActivity().
+ */
+function maybeIdleClear(): void {
+  if (idleClearDispatching) return;
+  const agentName = process.env.SWITCHROOM_AGENT_NAME;
+  if (!agentName) return;
+  const idleClearMs = resolveIdleClearMs();
+  const decision = decideIdleClear(
+    {
+      lastActivityAt: lastIdleActivityAt,
+      idleClearMs,
+      alreadyCleared: idleAutoCleared,
+      turnInFlight: turnInFlightForGate(),
+    },
+    Date.now(),
+  );
+  if (!decision.clear) return;
+  // Fire once per idle period — set BEFORE the await so the next tick can't
+  // double-dispatch. markIdleActivity() re-arms on the next real activity.
+  idleAutoCleared = true;
+  idleClearDispatching = true;
+  process.stderr.write(
+    `telegram gateway: idle auto-/clear for ${agentName} ` +
+      `(idle >= ${Math.round(idleClearMs / 60_000)}m)\n`,
+  );
+  // Accepted check-to-send race (same as maybeProactiveCompact): a new inbound
+  // could arrive between the gate check and the tmux send; /clear then lands in
+  // claude's prompt buffer and runs at the next idle prompt (inject.ts FUTURE-GAP).
+  void injectSlashCommandImpl(agentName, '/clear')
+    .then(() => { void postIdleClearNotice(idleClearMs); })
+    .catch((err: unknown) => {
+      process.stderr.write(
+        `telegram gateway: idle /clear inject failed for ` +
+          `${agentName}: ${err instanceof Error ? err.message : String(err)}\n`,
+      );
+    })
+    .finally(() => { idleClearDispatching = false; }); // release the re-entry guard on success and failure alike
+}
+/**
+ * Subtle one-line notice so the operator knows the session was auto-cleared.
+ *
+ * Sent to the first `allowFrom` entry returned by loadAccess(); silently
+ * skipped when there is none. The thread id comes from topicForRecipient,
+ * fed with the topic resolved for kind 'compact-watchdog' or, failing that,
+ * the `chatThreadMap` entry for that chat. Best-effort: any error is swallowed.
+ *
+ * @param idleClearMs Idle window that elapsed; rendered in hours, rounded to
+ *   one decimal place.
+ */
+async function postIdleClearNotice(idleClearMs: number): Promise<void> {
+  try {
+    const chatId = loadAccess().allowFrom[0];
+    if (!chatId) return;
+    const threadId = topicForRecipient({
+      recipientChatId: chatId,
+      resolvedTopic:
+        resolveAgentOutboundTopic({ kind: 'compact-watchdog' })
+        ?? chatThreadMap.get(chatId),
+      supergroupChatId: resolveAgentSupergroupChatId(),
+    });
+    const hrs = Math.round((idleClearMs / 3_600_000) * 10) / 10; // e.g. 10_800_000 → 3, 5_400_000 → 1.5
+    const text =
+      `🧹 <b>Cleared after ${hrs}h idle</b> — fresh slate next message; ` +
+      `long-term memory is in Hindsight.`;
+    await swallowingApiCall(
+      () =>
+        bot.api.sendMessage(chatId, text, {
+          parse_mode: 'HTML',
+          ...(threadId != null ? { message_thread_id: threadId } : {}),
+        }),
+      { chat_id: chatId, verb: 'idleAutoClear.notice' },
+    );
+  } catch {
+    /* best-effort notice — the /clear itself still happened */
+  }
+}
 /**
  * Post the START card for a proactive compaction. Best-effort: a failed
  * send just means no card (the compaction itself still happens). The
@@ -6690,6 +6841,10 @@ const ipcServer: IpcServer = createIpcServer({
   },
   onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
+    // Cron fires (incl. cheap-cron, whose session events are dropped before
+    // currentTurn is set) are real activity — re-arm idle auto-clear so a
+    // working scheduled agent isn't wiped after 3h of "no inbound".
+    markIdleActivity()
     const promptKey = typeof msg.inbound.meta?.prompt_key === 'string'
       ? msg.inbound.meta.prompt_key
       : 'unknown'
@@ -9994,6 +10149,7 @@ function handleSessionEvent(ev: SessionEvent): void {
           isDm: isDmChatId(ev.chatId),
         }
         currentTurn = next
+        markIdleActivity() // any turn start (main session) is activity — re-arm idle clear
         // Status-surface observability: one line at every turn SET so a later
         // dark card is traceable to which turn/topic key it belonged to.
         process.stderr.write(
@@ -11672,6 +11828,7 @@ async function handleInbound(
   // (image_path_2, attachment_file_id_2, …) alongside the primary.
   extraAttachments?: CoalesceAttachment[],
 ): Promise<void> {
+  markIdleActivity() // any inbound resets the idle auto-clear timer + re-arms
   const isTopicMessage = ctx.message?.is_topic_message ?? false
   const messageThreadId = ctx.message?.message_thread_id
@@ -14452,19 +14609,37 @@ bot.command('agents', async ctx => {
 // /inject — #725 Phase 2 slash-command bridge. Implementation in
 // inject-handler.ts so it's unit-testable without booting the bot.
-bot.command('inject', async ctx => {
-  await handleInjectCommand(ctx, {
-    isAuthorized: isAuthorizedSender,
+// Shared deps for the inject-backed commands. `/inject <verb>` uses the
+// defaults; first-class /compact and /clear pass `open` (anyone in the chat —
+// operator decision, single-tenant trust) + a `fixedVerb` so they don't need
+// the `/inject` prefix.
+function buildInjectDeps(opts?: { open?: boolean; fixedVerb?: string }): InjectDeps {
+  return {
+    isAuthorized: opts?.open ? () => true : isAuthorizedSender,
     inject: injectSlashCommandImpl,
     // accent is already inlined into the body by the handler via
     // buildAccentHeader; switchroomReply doesn't need to know about it.
-    reply: async (ctx, text, opts) => switchroomReply(ctx, text, { html: opts?.html }),
+    reply: async (ctx, text, replyOpts) => switchroomReply(ctx, text, { html: replyOpts?.html }),
     getAgentName: getMyAgentName,
-    getArgs: getCommandArgs,
+    getArgs: opts?.fixedVerb != null ? () => opts.fixedVerb as string : getCommandArgs,
     escapeHtml: escapeHtmlForTg,
     preBlock,
     formatOutput: formatSwitchroomOutput,
-  })
+  }
+}
+bot.command('inject', async ctx => {
+  await handleInjectCommand(ctx, buildInjectDeps())
+})
+// /compact + /clear — first-class session-control commands, open to anyone in
+// the chat. Both are in the INJECT_COMMANDS allowlist; they ride the same
+// inject primitive as `/inject compact` / `/inject clear`.
+bot.command('compact', async ctx => {
+  await handleInjectCommand(ctx, buildInjectDeps({ open: true, fixedVerb: '/compact' }))
+})
+bot.command('clear', async ctx => {
+  await handleInjectCommand(ctx, buildInjectDeps({ open: true, fixedVerb: '/clear' }))
 })
 // /model — model dashboard + switch for this agent's live session.
@@ -14705,7 +14880,8 @@ function flushAgentHandoff(agentDir: string): number {
   return removed
 }
-async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Promise<void> {
+async function handleNewCommand(ctx: Context): Promise<void> {
+  const kind = 'new' // /reset removed (was a pure alias); keep the string for messages
   if (!isAuthorizedSender(ctx)) return
   const name = (typeof ctx.match === "string" ? ctx.match : "").trim() || getMyAgentName()
   try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
@@ -14755,9 +14931,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
   const chatId = String(ctx.chat!.id)
   const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
-  const ackText = kind === 'new'
-    ? buildNewSessionAckText(name, flushed > 0)
-    : buildResetSessionAckText(name, flushed > 0)
+  const ackText = buildNewSessionAckText(name, flushed > 0)
   let ackId: number | null = null
   // #1075: thread-id-bearing — fall back to main chat.
   try {
@@ -14823,8 +14997,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
   )
 }
-bot.command('new', async ctx => handleNewOrResetCommand(ctx, 'new'))
-bot.command('reset', async ctx => handleNewOrResetCommand(ctx, 'reset'))
+bot.command('new', async ctx => handleNewCommand(ctx))
 // /update — host update from Telegram (#919). Default = dry-run plan
 // (`switchroom update --check`); explicit `apply` triggers the real
@@ -21118,54 +21291,32 @@ process.on('uncaughtException', err => {
 let runnerHandle: RunnerHandle | null = null
 // Long-poll health-check handle (issue #56). Created once per process, started
-// after the runner comes up, stopped on clean shutdown. The `onStall` callback
-// stops the runner so the outer retry loop can restart it.
+// after the runner comes up, stopped on clean shutdown. On a confirmed stall
+// the gateway EXITS non-zero and the supervisor restarts it with a fresh runner
+// (see recoverFromPollStall + the 2026-06-18 incident note there). It does NOT
+// try to stop()+re-run the runner in place — grammy's stop() blocks on a non-
+// abortable getUpdates retry backoff during a network outage, which hung the
+// whole fleet deaf.
 //
 // Interval and threshold are configurable via env for ops/testing flexibility:
-//   SWITCHROOM_POLL_HEALTH_INTERVAL_MS — default 5 min
-//   SWITCHROOM_POLL_HEALTH_THRESHOLD   — default 3
+//   SWITCHROOM_POLL_HEALTH_INTERVAL_MS — default 60s (fast self-heal after a flap)
+//   SWITCHROOM_POLL_HEALTH_THRESHOLD   — default 3 (a single blip must not trip it)
 const POLL_HEALTH_INTERVAL_MS = Number(
-  process.env.SWITCHROOM_POLL_HEALTH_INTERVAL_MS ?? 5 * 60_000,
+  process.env.SWITCHROOM_POLL_HEALTH_INTERVAL_MS ?? 60_000,
 )
 const POLL_HEALTH_THRESHOLD = Number(
   process.env.SWITCHROOM_POLL_HEALTH_THRESHOLD ?? 3,
 )
-/** Sentinel error thrown by onStall so the outer for-loop retries rather
- *  than exiting. The catch block recognises this specific message. */
-class PollStallError extends Error {
-  constructor() {
-    super('poll_stall_restart')
-    this.name = 'PollStallError'
-  }
-}
 let pollHealthCheck: PollHealthCheckHandle | null = null
 if (POLL_HEALTH_INTERVAL_MS > 0) {
   pollHealthCheck = createPollHealthCheck({
     ping: () => bot.api.getMe(),
     onStall: async () => {
-      const agentName = process.env.SWITCHROOM_AGENT_NAME ?? '-'
-      process.stderr.write(
-        `telegram gateway: poll.health_check.stall_recovery stopping runner agent=${agentName}\n`,
-      )
-      if (runnerHandle != null && runnerHandle.isRunning()) {
-        try {
-          await runnerHandle.stop()
-        } catch (err) {
-          process.stderr.write(
-            `telegram gateway: poll.health_check.stall_recovery runner.stop error: ${(err as Error).message}\n`,
-          )
-        }
-      }
-      // runnerHandle.stop() causes task() to resolve. That would normally
-      // hit the `return` below and exit the startup IIFE. Instead we throw
-      // PollStallError from inside task()'s continuation by surfacing it
-      // through the outer catch block — but task() itself doesn't throw here.
-      //
-      // The simpler fix: set runnerHandle to a sentinel that the code below
-      // `await runnerHandle.task()` checks to decide continue vs return.
-      runnerHandle = null
+      // Exit non-zero → _switchroom_supervise restarts the gateway sidecar
+      // with a fresh runner. Never awaits runnerHandle.stop() (it hangs on a
+      // wedged source). recoverFromPollStall exits 1, never 78.
+      recoverFromPollStall({ agentName: process.env.SWITCHROOM_AGENT_NAME ?? '-' })
     },
     intervalMs: POLL_HEALTH_INTERVAL_MS,
     failureThreshold: POLL_HEALTH_THRESHOLD,
@@ -22247,20 +22398,10 @@ void (async () => {
       pollHealthCheck?.stop()
       pollHealthCheck?.start()
       await runnerHandle.task()
-      // If onStall fired, it called runnerHandle.stop() which resolved task()
-      // above, then set runnerHandle = null. Detect that here and continue the
-      // loop to restart the runner. A normal clean exit leaves runnerHandle non-
-      // null (the stopped handle is still non-null at this point), so we can
-      // distinguish: null means stall-triggered, non-null means clean exit.
-      if (runnerHandle === null) {
-        const agentName = process.env.SWITCHROOM_AGENT_NAME ?? '-'
-        process.stderr.write(
-          `telegram gateway: poll.health_check.stall_recovery restarting runner agent=${agentName}\n`,
-        )
-        // Brief pause so the Telegram API can close the stalled connection.
-        await new Promise(r => setTimeout(r, 2000))
-        continue
-      }
+      // task() resolves only on clean shutdown (shutdown-drain stops the
+      // runner) — exit the startup IIFE. Stall recovery no longer routes here:
+      // onStall exits the process and the supervisor restarts the gateway
+      // (see recoverFromPollStall). The 409 path below re-runs in place.
       return
     } catch (err) {
       if (err instanceof GrammyError && err.error_code === 409) {

package/telegram-plugin/gateway/hostd-dispatch.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
  * Hostd dispatch helpers for the gateway's self-restart slash-commands
  * (#1175 RFC C, Phase 2). When the operator has opted into
- * `host_control.enabled: true`, /restart, /new, /reset, and
+ * `host_control.enabled: true`, /restart, /new, and
  * /update apply route through the per-agent hostd UDS instead of the
  * in-container `spawnSwitchroomDetached` shellout.
  *

package/telegram-plugin/gateway/idle-clear.ts ADDED Viewed

@@ -0,0 +1,72 @@
+/**
+ * Idle auto-clear: wipe a session's working context after a wall-clock idle
+ * period (default 3h), so a long-untouched agent starts fresh next message
+ * instead of resuming a stale, context-heavy thread. Long-term memory lives in
+ * Hindsight, so a clear loses only the in-session scratch.
+ *
+ * Sibling of proactive-compact.ts (occupancy-driven /compact at the turn-end
+ * idle gate). This one is wall-clock-driven: it fires `/clear` from a periodic
+ * interval because a fully-idle agent never ends a turn, so the turn-end gate
+ * alone would never see it. Both inject via the same primitive and both refuse
+ * to fire mid-turn (turnInFlight guard).
+ *
+ * The decider is pure so the fire-once / re-arm / not-mid-turn / disabled logic
+ * is unit-tested without the gateway.
+ */
+/** Default idle window when `session.idle_clear_after` is unset (3h). ON by default. */
+export const DEFAULT_IDLE_CLEAR_MS = 3 * 60 * 60 * 1000;
+export interface IdleClearState {
+  /** Epoch ms of the last activity (inbound, turn start, cron fire). */
+  lastActivityAt: number;
+  /**
+   * Idle window in ms. <= 0 disables auto-clear. Must be a real number:
+   * NaN is not caught by the `<= 0` check in decideIdleClear.
+   */
+  idleClearMs: number;
+  /** Already auto-cleared since the last activity? Prevents re-clearing every tick. */
+  alreadyCleared: boolean;
+  /** A turn is in flight — never clear mid-turn. */
+  turnInFlight: boolean;
+}
+export interface IdleClearDecision {
+  clear: boolean; // true → auto-clear should fire on this evaluation
+}
+/**
+ * Decide whether to auto-clear this evaluation. Fires exactly once per idle
+ * period: only when enabled, not mid-turn, not already cleared, and the idle
+ * window has elapsed. The caller sets `alreadyCleared` on fire and resets it
+ * (with `lastActivityAt`) on the next activity to re-arm.
+ *
+ * The window counts as elapsed when `now - lastActivityAt >= idleClearMs`
+ * (inclusive). Pure: reads `state` and `now` only.
+ *
+ * NOTE(review): a NaN `idleClearMs` or `lastActivityAt` makes both numeric
+ * comparisons false, so (with no turn in flight and not already cleared) the
+ * result is `{ clear: true }`. Callers must pass finite numbers.
+ *
+ * @param state Current idle bookkeeping supplied by the caller.
+ * @param now Current time, in the same unit/epoch as `state.lastActivityAt`.
+ * @returns `{ clear: true }` when auto-clear should fire now, else `{ clear: false }`.
+ */
+export function decideIdleClear(
+  state: IdleClearState,
+  now: number,
+): IdleClearDecision {
+  if (state.idleClearMs <= 0) return { clear: false }; // disabled
+  if (state.turnInFlight) return { clear: false }; // never mid-turn
+  if (state.alreadyCleared) return { clear: false }; // once per idle period
+  if (now - state.lastActivityAt < state.idleClearMs) return { clear: false }; // idle window not yet elapsed
+  return { clear: true };
+}
+/**
+ * Parse a `^\d+[smh]$` duration (the SessionSchema format, e.g. "3h", "30m",
+ * "7200s") to ms. Returns null on a malformed string so the caller can fall
+ * back to the default. Kept local (vs importing the web module's parser) to
+ * avoid cross-package coupling.
+ *
+ * Leading/trailing whitespace is trimmed before matching. Only a lowercase
+ * `s`, `m` or `h` suffix and an unsigned integer count are accepted, so
+ * "1.5h", "3H" and "-1m" all return null. A zero count ("0s") returns 0.
+ *
+ * @param raw Duration string to parse.
+ * @returns Milliseconds, or null when `raw` does not match the format.
+ */
+export function idleDurationToMs(raw: string): number | null {
+  const m = /^(\d+)([smh])$/.exec(raw.trim());
+  if (!m) return null;
+  const n = Number(m[1]);
+  switch (m[2]) {
+    case "s":
+      return n * 1000;
+    case "m":
+      return n * 60_000;
+    case "h":
+      return n * 3_600_000;
+    default:
+      return null; // not reachable via the regex above, which only admits s/m/h
+  }
+}

package/telegram-plugin/gateway/poll-health.ts CHANGED Viewed

@@ -10,18 +10,23 @@
  *
  * Fix:
  *   A separate setInterval calls `getMe()` (a lightweight Bot API
- *   endpoint) every HEALTH_INTERVAL_MS. Three consecutive failures
- *   constitute a stall: we stop the runner, wait RESTART_GRACE_MS
- *   for the in-flight request to die, then let the caller restart it.
+ *   endpoint) every `intervalMs`. `failureThreshold` consecutive
+ *   failures constitute a stall and fire `onStall`.
  *
  *   A single failure doesn't count — transient network blips happen.
  *   The threshold must be >= 3 so a brief Telegram outage (e.g. a
  *   data-centre hiccup) doesn't cause thrashing.
  *
+ * Recovery (see gateway.ts onStall → poll-stall-recovery.ts):
+ *   `onStall` does NOT stop()+re-run the runner — grammy's stop() blocks
+ *   on a non-abortable getUpdates retry backoff during an outage, which hung
+ *   the fleet deaf (2026-06-18). It exits the process non-zero and the
+ *   supervisor restarts the gateway with a fresh runner.
+ *
  * Usage:
  *   const hc = createPollHealthCheck({
  *     ping:  () => bot.api.getMe(),
- *     onStall: async () => { await runnerHandle.stop(); … restart … },
+ *     onStall: async () => { recoverFromPollStall({ agentName }); },
  *     log:   (msg) => process.stderr.write(msg),
  *   });
  *   // start after the runner is up:

package/telegram-plugin/gateway/poll-stall-recovery.ts ADDED Viewed

@@ -0,0 +1,59 @@
+/**
+ * Telegram poll stall recovery (incident 2026-06-18: a network flap left the
+ * whole agent fleet alive-but-deaf for ~30 min until a manual restart).
+ *
+ * When the long-poll health-check (poll-health.ts) detects a stall — 3
+ * consecutive `getMe` failures — the OLD recovery did `await runnerHandle.stop()`
+ * then re-ran the runner in place. That HUNG: grammy's `stop()` returns the
+ * runner task promise, which is blocked on a non-abortable `getUpdates` retry
+ * backoff (`@grammyjs/runner` maxRetryTime 15h, exponential, plain setTimeout
+ * that ignores the abort signal). During a network outage the source never
+ * terminates, so `stop()` never resolves, the re-run never fires, and the
+ * gateway stays deaf.
+ *
+ * Recovery is now a clean non-zero process EXIT: `_switchroom_supervise`
+ * (profiles/_base/start.sh.hbs) restarts the gateway sidecar with a fresh
+ * runner. Same mechanism the manual fix used and the run loop's catch-block
+ * already relies on; it restarts only the gateway (claude/tmux untouched,
+ * in-flight turns recovered by boot-resume). Extracted as its own module with
+ * an injectable `exit` so the exit code is unit-testable (mirrors
+ * startup-network-retry.ts) — gateway.ts itself can't be imported in a test.
+ */
+export interface PollStallRecoveryDeps {
+  /** Process exit. Injectable for tests; defaults to process.exit. */
+  exit?: (code: number) => void;
+  /**
+   * Logger. Defaults to process.stderr (a trailing newline is appended when
+   * missing). An injected logger receives the message without a newline.
+   */
+  log?: (msg: string) => void;
+  /** Agent name for the log line. Defaults to "-". */
+  agentName?: string;
+}
+/**
+ * Exit code for a poll stall. Always 1 — NEVER 78, which this module's
+ * documentation describes as the supervisor's permanent-quarantine sentinel.
+ */
+export const POLL_STALL_EXIT_CODE = 1;
+/**
+ * Recover from a confirmed Telegram poll stall by exiting non-zero so the
+ * supervisor restarts the gateway with a fresh runner.
+ *
+ * MUST exit 1 — NEVER 78. Exit 78 (EX_CONFIG) is the supervisor's permanent-
+ * quarantine sentinel (start.sh.hbs); quarantining on a *transient* network
+ * stall would leave the gateway dead even after connectivity returns — the
+ * exact failure this fix exists to prevent.
+ *
+ * Emits one log line (exit code, pid, agent name, rationale) and then calls
+ * `exit(POLL_STALL_EXIT_CODE)`. With the default `process.exit` this does not
+ * return; with an injected `exit` that returns, the function returns normally.
+ *
+ * @param deps Optional overrides for `exit`, `log` and `agentName`; every
+ *   field has a default, so calling with no argument is valid.
+ */
+export function recoverFromPollStall(deps: PollStallRecoveryDeps = {}): void {
+  const exit = deps.exit ?? ((code: number) => process.exit(code));
+  const log =
+    deps.log ??
+    ((msg: string) => {
+      process.stderr.write(msg.endsWith("\n") ? msg : msg + "\n");
+    });
+  const agentName = deps.agentName ?? "-";
+  log(
+    `telegram gateway: poll.health_check.stall_recovery exiting code=${POLL_STALL_EXIT_CODE} ` +
+      `pid=${process.pid} agent=${agentName} — supervisor will restart the gateway with a fresh runner ` +
+      `(not awaiting runnerHandle.stop(): grammy stop() blocks on a non-abortable getUpdates retry backoff during an outage)`,
+  );
+  exit(POLL_STALL_EXIT_CODE);
+}