switchroom 0.15.40 → 0.15.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ /**
2
+ * Context-headroom snapshot (RFC reference/rfcs/context-headroom-surface.md).
3
+ *
4
+ * The gateway already computes working-context occupancy at the turn-end idle
5
+ * gate for proactive-compaction (gateway.ts ~3000). This module turns that
6
+ * value into a small on-disk snapshot the host surfaces (`switchroom status` /
7
+ * `doctor` / web) read, so the operator can SEE each agent's headroom-to-
8
+ * compaction — the predictability won by ENABLE_TOOL_SEARCH=true made visible.
9
+ *
10
+ * Pure + side-effect-light so it's unit-testable; the write is best-effort and
11
+ * never throws (a missing snapshot reads as `unknown`, never an error).
12
+ */
13
+
14
+ import { mkdirSync, writeFileSync } from "node:fs";
15
+ import { join } from "node:path";
16
+
17
+ /** Filename written under the agent's state dir. Shared with the host reader. */
18
+ export const CONTEXT_OCCUPANCY_FILENAME = "context-occupancy.json";
19
+
20
/** Occupancy ≥ this fraction of the cap → "tight" (compaction imminent). */
export const TIGHT_FRACTION = 0.8;

/** Headroom classification carried by a snapshot. */
export type ContextState = "ok" | "tight" | "unknown";

/** Shape of the context-occupancy snapshot. */
export interface ContextOccupancy {
  /** Live working-context tokens (latest turn input + cache_read + cache_creation). */
  occupancy: number;
  /** session.max_context_tokens, or null when unset / not a usable positive number. */
  cap: number | null;
  /** cap - occupancy, or null when no cap. */
  headroom: number | null;
  /** occupancy / cap (0..1+), or null when no cap. */
  pct: number | null;
  /** ok / tight / unknown. `unknown` only when occupancy is unmeasurable. */
  state: ContextState;
  /** epoch ms (host/container clock) when computed. */
  computedAt: number;
}

/**
 * Build the snapshot from a measured occupancy + the resolved cap. Pure.
 *
 * The cap is normalised once, up front, so every branch reports it the same
 * way: anything that is not a finite, positive number (null, undefined, 0,
 * negative, NaN, ±Infinity) is treated as "no cap" and surfaces as `null`.
 *
 *   - occupancy NaN / ±Infinity / negative → state "unknown", occupancy 0,
 *     no ratio (headroom and pct are null).
 *   - no usable cap → state "ok" (occupancy known, just no ceiling set).
 *   - otherwise → ratio computed; "tight" once pct reaches TIGHT_FRACTION.
 *
 * @param occupancy measured working-context tokens
 * @param cap       configured ceiling, if any
 * @param now       epoch ms stamped into `computedAt`
 * @returns the snapshot; this function never throws
 */
export function buildContextOccupancy(
  occupancy: number,
  cap: number | null | undefined,
  now: number,
): ContextOccupancy {
  // Normalise before branching: the "unknown" branch previously echoed the raw
  // cap (e.g. 0 or -5) while the measured branches reported null for it.
  const c = cap != null && Number.isFinite(cap) && cap > 0 ? cap : null;

  if (!Number.isFinite(occupancy) || occupancy < 0) {
    return { occupancy: 0, cap: c, headroom: null, pct: null, state: "unknown", computedAt: now };
  }
  if (c == null) {
    return { occupancy, cap: null, headroom: null, pct: null, state: "ok", computedAt: now };
  }

  const pct = occupancy / c;
  return {
    occupancy,
    cap: c,
    headroom: c - occupancy,
    pct,
    state: pct >= TIGHT_FRACTION ? "tight" : "ok",
    computedAt: now,
  };
}
67
+
68
/**
 * Persist a snapshot as `<stateDir>/context-occupancy.json` (pretty-printed
 * JSON followed by a newline), creating `stateDir` recursively first.
 *
 * Best-effort by design: any failure while creating the directory,
 * serialising, or writing is swallowed so the caller is never disrupted.
 *
 * @param stateDir directory the snapshot file is written into
 * @param snapshot snapshot to serialise
 * @param deps     optional replacements for the filesystem calls; the
 *                 defaults are `mkdirSync` / `writeFileSync`
 */
export function writeContextOccupancySnapshot(
  stateDir: string,
  snapshot: ContextOccupancy,
  deps?: {
    mkdir?: (p: string, o: { recursive: true }) => void;
    writeFile?: (p: string, d: string) => void;
  },
): void {
  try {
    const target = join(stateDir, CONTEXT_OCCUPANCY_FILENAME);

    const ensureDir =
      deps?.mkdir ??
      ((dir: string, opts: { recursive: true }): void => {
        mkdirSync(dir, opts);
      });
    ensureDir(stateDir, { recursive: true });

    const persist =
      deps?.writeFile ??
      ((file: string, data: string): void => {
        writeFileSync(file, data);
      });
    persist(target, `${JSON.stringify(snapshot, null, 2)}\n`);
  } catch {
    /* best-effort — never break the turn-end gate */
  }
}
@@ -208,7 +208,6 @@ import {
208
208
  switchroomHelpText as buildSwitchroomHelpText,
209
209
  restartAckText as buildRestartAckText,
210
210
  newSessionAckText as buildNewSessionAckText,
211
- resetSessionAckText as buildResetSessionAckText,
212
211
  TELEGRAM_BASE_COMMANDS,
213
212
  TELEGRAM_SWITCHROOM_COMMANDS,
214
213
  type AgentMetadata, type AuthSummary, type StatusProbeRow,
@@ -266,7 +265,7 @@ import {
266
265
  import { DEFAULT_SLOT } from '../../src/auth/accounts.js'
267
266
  import { currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
268
267
  import { injectSlashCommand as injectSlashCommandImpl } from '../../src/agents/inject.js'
269
- import { handleInjectCommand } from './inject-handler.js'
268
+ import { handleInjectCommand, type InjectDeps } from './inject-handler.js'
270
269
  import {
271
270
  parseModelCommand,
272
271
  handleModelCommand,
@@ -293,7 +292,9 @@ import { refreshBanner } from '../slot-banner-driver.js'
293
292
  import { loadConfig as loadSwitchroomConfig, findConfigFile as findSwitchroomConfigFile } from '../../src/config/loader.js'; import { resolveAgentConfig } from '../../src/config/merge.js'
294
293
  import { resolveOutboundTopic as resolveOutboundTopicHelper, topicForRecipient, type TopicRouterConfig as _OutboundRouterConfig } from '../../src/telegram/topic-router.js'
295
294
  import { readTurnUsages } from '../../src/agents/perf.js'
295
+ import { buildContextOccupancy, writeContextOccupancySnapshot } from './context-occupancy.js'
296
296
  import { decideProactiveCompact, initialCompactState, type CompactState } from './proactive-compact.js'
297
+ import { decideIdleClear, idleDurationToMs, DEFAULT_IDLE_CLEAR_MS } from './idle-clear.js'
297
298
  import { nextCompactNotify, idleCompactNotifyState, type CompactNotifyState } from './compact-notify.js'
298
299
  import {
299
300
  tryHostdDispatch,
@@ -369,6 +370,7 @@ import {
369
370
  foregroundFinishAction,
370
371
  } from './foreground-nesting.js'
371
372
  import { createPollHealthCheck, type PollHealthCheckHandle } from './poll-health.js'
373
+ import { recoverFromPollStall } from './poll-stall-recovery.js'
372
374
  import type {
373
375
  ToolCallMessage,
374
376
  ToolCallResult,
@@ -1351,6 +1353,11 @@ function checkApprovals(): void {
1351
1353
  }
1352
1354
  }
1353
1355
  if (!STATIC) setInterval(checkApprovals, 5000).unref()
1356
// Idle auto-clear: check wall-clock idle every minute; maybeIdleClear no-ops
// when disabled ('0s'), mid-turn, or already cleared this idle period. The
// `let` state + maybeIdleClear are hoisted/initialized before this fires.
//
// SWITCHROOM_IDLE_CLEAR_CHECK_MS overrides the check interval; an explicit 0
// turns the periodic check off. An unset, blank, or malformed value falls back
// to the 60s default instead of becoming NaN/0 and silently disabling the
// check — the same "bad value → default" rule resolveIdleClearMs applies to
// SWITCHROOM_IDLE_CLEAR_MS.
const IDLE_CLEAR_CHECK_MS = (() => {
  const fallbackMs = 60_000
  const raw = process.env.SWITCHROOM_IDLE_CLEAR_CHECK_MS
  if (raw == null || raw.trim() === '') return fallbackMs
  const parsed = Number(raw)
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallbackMs
})()
if (!STATIC && IDLE_CLEAR_CHECK_MS > 0) setInterval(maybeIdleClear, IDLE_CLEAR_CHECK_MS).unref()
1354
1361
 
1355
1362
  // ─── Thread / status / stream state ───────────────────────────────────────
1356
1363
  const chatThreadMap = new Map<string, number>()
@@ -2796,6 +2803,45 @@ function purgeReactionTracking(key: string, endingTurn?: CurrentTurn): void {
2796
2803
  // moot, so only evaluate when no restart drained this pass.
2797
2804
  maybeProactiveCompact();
2798
2805
  }
2806
+ // Context-headroom snapshot (RFC context-headroom-surface) — write the
2807
+ // current occupancy + cap so `switchroom status`/`doctor`/web can show
2808
+ // headroom. Independent of proactive-compaction (writes even when no cap
2809
+ // is configured) and best-effort (never throws). Runs on the same idle
2810
+ // signal — never mid-turn.
2811
+ snapshotContextOccupancy();
2812
+ }
2813
+ }
2814
+
2815
/**
 * Refresh the per-agent context-occupancy snapshot file.
 *
 * Occupancy is the sum of `input + cacheRead + cacheCreate` of the first
 * entry returned by `readTurnUsages(file, 1)` (presumably the most recent
 * turn — confirm against perf.ts). The cap comes from the resolved agent
 * config's `session.max_context_tokens`; when the config cannot be loaded the
 * snapshot is still written, just without a cap. Nothing is written when the
 * agent name or session file is missing, or when no turn usage is available.
 * Best-effort: every failure is swallowed.
 */
function snapshotContextOccupancy(): void {
  // Cap lookup is isolated so a broken config degrades to "no cap" rather
  // than suppressing the snapshot altogether.
  const lookupCap = (agent: string): number | null => {
    try {
      const config = loadSwitchroomConfig();
      const agentEntry = config.agents?.[agent] ?? {};
      const merged = resolveAgentConfig(config.defaults, config.profiles, agentEntry);
      return merged.session?.max_context_tokens ?? null;
    } catch {
      return null; // config unreadable → show occupancy without a ratio
    }
  };

  try {
    const agentName = process.env.SWITCHROOM_AGENT_NAME;
    const sessionFile = lastSessionActiveFile;
    if (!agentName || !sessionFile) return;

    const usages = readTurnUsages(sessionFile, 1);
    if (usages.length === 0) return;
    const latest = usages[0];
    const occupancy = latest.input + latest.cacheRead + latest.cacheCreate;

    const cap = lookupCap(agentName);
    const stateDir = process.env.SWITCHROOM_AGENT_STATE_DIR ?? "/state/agent";
    const snapshot = buildContextOccupancy(occupancy, cap, Date.now());
    writeContextOccupancySnapshot(stateDir, snapshot);
  } catch {
    /* best-effort — never disrupt the idle gate */
  }
}
2801
2847
 
@@ -3057,6 +3103,111 @@ function maybeProactiveCompact(): void {
3057
3103
  });
3058
3104
  }
3059
3105
 
3106
+ // ─── Idle auto-clear ──────────────────────────────────────────────────────
3107
+ // Wall-clock idle → /clear (idle-clear.ts). Independent of proactive-compact
3108
+ // (occupancy-driven at the turn-end gate): a fully-idle agent never ends a
3109
+ // turn, so this runs on its own interval. Any activity (inbound / turn start /
3110
+ // cron fire) resets the timer via markIdleActivity(); fires once per idle
3111
+ // period; never mid-turn (turnInFlightForGate, the same gate compaction uses).
3112
+ let lastIdleActivityAt = Date.now();
3113
+ let idleAutoCleared = false;
3114
+ let idleClearDispatching = false;
3115
+
3116
/**
 * Record activity: re-arms idle auto-clear (clears the "already cleared"
 * latch) and restarts the idle clock from now. Call on ANY activity.
 */
function markIdleActivity(): void {
  idleAutoCleared = false;
  lastIdleActivityAt = Date.now();
}
3121
+
3122
/**
 * Resolve the idle window in milliseconds. A result of 0 disables auto-clear.
 *
 * Precedence:
 *   1. `SWITCHROOM_IDLE_CLEAR_MS` when set and non-empty — used as-is if it is
 *      a finite number >= 0, otherwise the default.
 *   2. The agent's resolved `session.idle_clear_after` duration string.
 *   3. `DEFAULT_IDLE_CLEAR_MS` — also used when the agent name is missing, the
 *      setting is unset or unparseable, or the config cannot be read.
 */
function resolveIdleClearMs(): number {
  const override = process.env.SWITCHROOM_IDLE_CLEAR_MS;
  if (override != null && override !== '') {
    const parsed = Number(override);
    if (!Number.isFinite(parsed) || parsed < 0) return DEFAULT_IDLE_CLEAR_MS;
    return parsed;
  }

  const agentName = process.env.SWITCHROOM_AGENT_NAME;
  if (!agentName) return DEFAULT_IDLE_CLEAR_MS;

  try {
    const config = loadSwitchroomConfig();
    const agentEntry = config.agents?.[agentName] ?? {};
    const merged = resolveAgentConfig(config.defaults, config.profiles, agentEntry);
    const configured = merged.session?.idle_clear_after;
    // unset → on by default (3h)
    if (configured == null) return DEFAULT_IDLE_CLEAR_MS;
    return idleDurationToMs(configured) ?? DEFAULT_IDLE_CLEAR_MS;
  } catch {
    // config unreadable → keep the default on
    return DEFAULT_IDLE_CLEAR_MS;
  }
}
3143
+
3144
/**
 * Periodic idle auto-clear check (driven by the IDLE_CLEAR_CHECK_MS interval).
 *
 * Does nothing while a previous dispatch is still in flight or when the agent
 * name is unknown. Otherwise it asks `decideIdleClear` and, on a positive
 * decision, injects `/clear` and then posts the operator notice.
 */
function maybeIdleClear(): void {
  const agentName = process.env.SWITCHROOM_AGENT_NAME;
  if (idleClearDispatching || !agentName) return;

  const idleClearMs = resolveIdleClearMs();
  const verdict = decideIdleClear(
    {
      lastActivityAt: lastIdleActivityAt,
      idleClearMs,
      alreadyCleared: idleAutoCleared,
      turnInFlight: turnInFlightForGate(),
    },
    Date.now(),
  );
  if (!verdict.clear) return;

  // Latch both flags before anything asynchronous starts so the next interval
  // tick cannot dispatch a second /clear. markIdleActivity() re-arms the
  // once-per-idle-period latch on the next real activity.
  idleAutoCleared = true;
  idleClearDispatching = true;

  const idleMinutes = Math.round(idleClearMs / 60_000);
  process.stderr.write(
    `telegram gateway: idle auto-/clear for ${agentName} (idle >= ${idleMinutes}m)\n`,
  );

  const announce = (): void => {
    void postIdleClearNotice(idleClearMs);
  };
  const reportFailure = (err: unknown): void => {
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(
      `telegram gateway: idle /clear inject failed for ${agentName}: ${reason}\n`,
    );
  };
  const release = (): void => {
    idleClearDispatching = false;
  };

  // Accepted check-to-send race (same as maybeProactiveCompact): a new inbound
  // could arrive between the gate check and the tmux send; /clear then lands in
  // claude's prompt buffer and runs at the next idle prompt (inject.ts FUTURE-GAP).
  void injectSlashCommandImpl(agentName, '/clear')
    .then(announce)
    .catch(reportFailure)
    .finally(release);
}
3181
+
3182
/**
 * Send a short HTML notice telling the operator the session was auto-cleared.
 *
 * The recipient is the first `allowFrom` entry of the access config; nothing
 * is sent when there is none. The topic is the agent's outbound topic for the
 * 'compact-watchdog' kind, falling back to the chat's entry in chatThreadMap.
 * Best-effort: any failure is swallowed, since the /clear has already run.
 *
 * @param idleClearMs idle window that elapsed; shown in hours, rounded to one
 *                    decimal place
 */
async function postIdleClearNotice(idleClearMs: number): Promise<void> {
  try {
    const chatId = loadAccess().allowFrom[0];
    if (!chatId) return;

    const resolvedTopic =
      resolveAgentOutboundTopic({ kind: 'compact-watchdog' }) ?? chatThreadMap.get(chatId);
    const threadId = topicForRecipient({
      recipientChatId: chatId,
      resolvedTopic,
      supergroupChatId: resolveAgentSupergroupChatId(),
    });

    const idleHours = Math.round((idleClearMs / 3_600_000) * 10) / 10;
    const text = `🧹 <b>Cleared after ${idleHours}h idle</b> — fresh slate next message; long-term memory is in Hindsight.`;

    const send = () =>
      bot.api.sendMessage(chatId, text, {
        parse_mode: 'HTML',
        // Only attach a thread id when one was resolved.
        ...(threadId != null ? { message_thread_id: threadId } : {}),
      });
    await swallowingApiCall(send, { chat_id: chatId, verb: 'idleAutoClear.notice' });
  } catch {
    /* best-effort notice — the /clear itself still happened */
  }
}
3210
+
3060
3211
  /**
3061
3212
  * Post the START card for a proactive compaction. Best-effort: a failed
3062
3213
  * send just means no card (the compaction itself still happens). The
@@ -6690,6 +6841,10 @@ const ipcServer: IpcServer = createIpcServer({
6690
6841
  },
6691
6842
 
6692
6843
  onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
6844
+ // Cron fires (incl. cheap-cron, whose session events are dropped before
6845
+ // currentTurn is set) are real activity — re-arm idle auto-clear so a
6846
+ // working scheduled agent isn't wiped after 3h of "no inbound".
6847
+ markIdleActivity()
6693
6848
  const promptKey = typeof msg.inbound.meta?.prompt_key === 'string'
6694
6849
  ? msg.inbound.meta.prompt_key
6695
6850
  : 'unknown'
@@ -9994,6 +10149,7 @@ function handleSessionEvent(ev: SessionEvent): void {
9994
10149
  isDm: isDmChatId(ev.chatId),
9995
10150
  }
9996
10151
  currentTurn = next
10152
+ markIdleActivity() // any turn start (main session) is activity — re-arm idle clear
9997
10153
  // Status-surface observability: one line at every turn SET so a later
9998
10154
  // dark card is traceable to which turn/topic key it belonged to.
9999
10155
  process.stderr.write(
@@ -11672,6 +11828,7 @@ async function handleInbound(
11672
11828
  // (image_path_2, attachment_file_id_2, …) alongside the primary.
11673
11829
  extraAttachments?: CoalesceAttachment[],
11674
11830
  ): Promise<void> {
11831
+ markIdleActivity() // any inbound resets the idle auto-clear timer + re-arms
11675
11832
  const isTopicMessage = ctx.message?.is_topic_message ?? false
11676
11833
  const messageThreadId = ctx.message?.message_thread_id
11677
11834
 
@@ -14452,19 +14609,37 @@ bot.command('agents', async ctx => {
14452
14609
 
14453
14610
  // /inject — #725 Phase 2 slash-command bridge. Implementation in
14454
14611
  // inject-handler.ts so it's unit-testable without booting the bot.
14455
- bot.command('inject', async ctx => {
14456
- await handleInjectCommand(ctx, {
14457
- isAuthorized: isAuthorizedSender,
14612
+ // Shared deps for the inject-backed commands. `/inject <verb>` uses the
14613
+ // defaults; first-class /compact and /clear pass `open` (anyone in the chat —
14614
+ // operator decision, single-tenant trust) + a `fixedVerb` so they don't need
14615
+ // the `/inject` prefix.
14616
+ function buildInjectDeps(opts?: { open?: boolean; fixedVerb?: string }): InjectDeps {
14617
+ return {
14618
+ isAuthorized: opts?.open ? () => true : isAuthorizedSender,
14458
14619
  inject: injectSlashCommandImpl,
14459
14620
  // accent is already inlined into the body by the handler via
14460
14621
  // buildAccentHeader; switchroomReply doesn't need to know about it.
14461
- reply: async (ctx, text, opts) => switchroomReply(ctx, text, { html: opts?.html }),
14622
+ reply: async (ctx, text, replyOpts) => switchroomReply(ctx, text, { html: replyOpts?.html }),
14462
14623
  getAgentName: getMyAgentName,
14463
- getArgs: getCommandArgs,
14624
+ getArgs: opts?.fixedVerb != null ? () => opts.fixedVerb as string : getCommandArgs,
14464
14625
  escapeHtml: escapeHtmlForTg,
14465
14626
  preBlock,
14466
14627
  formatOutput: formatSwitchroomOutput,
14467
- })
14628
+ }
14629
+ }
14630
+
14631
+ bot.command('inject', async ctx => {
14632
+ await handleInjectCommand(ctx, buildInjectDeps())
14633
+ })
14634
+
14635
+ // /compact + /clear — first-class session-control commands, open to anyone in
14636
+ // the chat. Both are in the INJECT_COMMANDS allowlist; they ride the same
14637
+ // inject primitive as `/inject compact` / `/inject clear`.
14638
+ bot.command('compact', async ctx => {
14639
+ await handleInjectCommand(ctx, buildInjectDeps({ open: true, fixedVerb: '/compact' }))
14640
+ })
14641
+ bot.command('clear', async ctx => {
14642
+ await handleInjectCommand(ctx, buildInjectDeps({ open: true, fixedVerb: '/clear' }))
14468
14643
  })
14469
14644
 
14470
14645
  // /model — model dashboard + switch for this agent's live session.
@@ -14705,7 +14880,8 @@ function flushAgentHandoff(agentDir: string): number {
14705
14880
  return removed
14706
14881
  }
14707
14882
 
14708
- async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Promise<void> {
14883
+ async function handleNewCommand(ctx: Context): Promise<void> {
14884
+ const kind = 'new' // /reset removed (was a pure alias); keep the string for messages
14709
14885
  if (!isAuthorizedSender(ctx)) return
14710
14886
  const name = (typeof ctx.match === "string" ? ctx.match : "").trim() || getMyAgentName()
14711
14887
  try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
@@ -14755,9 +14931,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
14755
14931
 
14756
14932
  const chatId = String(ctx.chat!.id)
14757
14933
  const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
14758
- const ackText = kind === 'new'
14759
- ? buildNewSessionAckText(name, flushed > 0)
14760
- : buildResetSessionAckText(name, flushed > 0)
14934
+ const ackText = buildNewSessionAckText(name, flushed > 0)
14761
14935
  let ackId: number | null = null
14762
14936
  // #1075: thread-id-bearing — fall back to main chat.
14763
14937
  try {
@@ -14823,8 +14997,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
14823
14997
  )
14824
14998
  }
14825
14999
 
14826
- bot.command('new', async ctx => handleNewOrResetCommand(ctx, 'new'))
14827
- bot.command('reset', async ctx => handleNewOrResetCommand(ctx, 'reset'))
15000
+ bot.command('new', async ctx => handleNewCommand(ctx))
14828
15001
 
14829
15002
  // /update — host update from Telegram (#919). Default = dry-run plan
14830
15003
  // (`switchroom update --check`); explicit `apply` triggers the real
@@ -21118,54 +21291,32 @@ process.on('uncaughtException', err => {
21118
21291
  let runnerHandle: RunnerHandle | null = null
21119
21292
 
21120
21293
  // Long-poll health-check handle (issue #56). Created once per process, started
21121
- // after the runner comes up, stopped on clean shutdown. The `onStall` callback
21122
- // stops the runner so the outer retry loop can restart it.
21294
+ // after the runner comes up, stopped on clean shutdown. On a confirmed stall
21295
+ // the gateway EXITS non-zero and the supervisor restarts it with a fresh runner
21296
+ // (see recoverFromPollStall + the 2026-06-18 incident note there). It does NOT
21297
+ // try to stop()+re-run the runner in place — grammy's stop() blocks on a non-
21298
+ // abortable getUpdates retry backoff during a network outage, which hung the
21299
+ // whole fleet deaf.
21123
21300
  //
21124
21301
  // Interval and threshold are configurable via env for ops/testing flexibility:
21125
- // SWITCHROOM_POLL_HEALTH_INTERVAL_MS — default 5 min
21126
- // SWITCHROOM_POLL_HEALTH_THRESHOLD — default 3
21302
+ // SWITCHROOM_POLL_HEALTH_INTERVAL_MS — default 60s (fast self-heal after a flap)
21303
+ // SWITCHROOM_POLL_HEALTH_THRESHOLD — default 3 (a single blip must not trip it)
21127
21304
  const POLL_HEALTH_INTERVAL_MS = Number(
21128
- process.env.SWITCHROOM_POLL_HEALTH_INTERVAL_MS ?? 5 * 60_000,
21305
+ process.env.SWITCHROOM_POLL_HEALTH_INTERVAL_MS ?? 60_000,
21129
21306
  )
21130
21307
  const POLL_HEALTH_THRESHOLD = Number(
21131
21308
  process.env.SWITCHROOM_POLL_HEALTH_THRESHOLD ?? 3,
21132
21309
  )
21133
21310
 
21134
- /** Sentinel error thrown by onStall so the outer for-loop retries rather
21135
- * than exiting. The catch block recognises this specific message. */
21136
- class PollStallError extends Error {
21137
- constructor() {
21138
- super('poll_stall_restart')
21139
- this.name = 'PollStallError'
21140
- }
21141
- }
21142
-
21143
21311
  let pollHealthCheck: PollHealthCheckHandle | null = null
21144
21312
  if (POLL_HEALTH_INTERVAL_MS > 0) {
21145
21313
  pollHealthCheck = createPollHealthCheck({
21146
21314
  ping: () => bot.api.getMe(),
21147
21315
  onStall: async () => {
21148
- const agentName = process.env.SWITCHROOM_AGENT_NAME ?? '-'
21149
- process.stderr.write(
21150
- `telegram gateway: poll.health_check.stall_recovery stopping runner agent=${agentName}\n`,
21151
- )
21152
- if (runnerHandle != null && runnerHandle.isRunning()) {
21153
- try {
21154
- await runnerHandle.stop()
21155
- } catch (err) {
21156
- process.stderr.write(
21157
- `telegram gateway: poll.health_check.stall_recovery runner.stop error: ${(err as Error).message}\n`,
21158
- )
21159
- }
21160
- }
21161
- // runnerHandle.stop() causes task() to resolve. That would normally
21162
- // hit the `return` below and exit the startup IIFE. Instead we throw
21163
- // PollStallError from inside task()'s continuation by surfacing it
21164
- // through the outer catch block — but task() itself doesn't throw here.
21165
- //
21166
- // The simpler fix: set runnerHandle to a sentinel that the code below
21167
- // `await runnerHandle.task()` checks to decide continue vs return.
21168
- runnerHandle = null
21316
+ // Exit non-zero so that _switchroom_supervise restarts the gateway sidecar
21317
+ // with a fresh runner. Never awaits runnerHandle.stop() (it hangs on a
21318
+ // wedged source). recoverFromPollStall exits 1, never 78.
21319
+ recoverFromPollStall({ agentName: process.env.SWITCHROOM_AGENT_NAME ?? '-' })
21169
21320
  },
21170
21321
  intervalMs: POLL_HEALTH_INTERVAL_MS,
21171
21322
  failureThreshold: POLL_HEALTH_THRESHOLD,
@@ -22247,20 +22398,10 @@ void (async () => {
22247
22398
  pollHealthCheck?.stop()
22248
22399
  pollHealthCheck?.start()
22249
22400
  await runnerHandle.task()
22250
- // If onStall fired, it called runnerHandle.stop() which resolved task()
22251
- // above, then set runnerHandle = null. Detect that here and continue the
22252
- // loop to restart the runner. A normal clean exit leaves runnerHandle non-
22253
- // null (the stopped handle is still non-null at this point), so we can
22254
- // distinguish: null means stall-triggered, non-null means clean exit.
22255
- if (runnerHandle === null) {
22256
- const agentName = process.env.SWITCHROOM_AGENT_NAME ?? '-'
22257
- process.stderr.write(
22258
- `telegram gateway: poll.health_check.stall_recovery restarting runner agent=${agentName}\n`,
22259
- )
22260
- // Brief pause so the Telegram API can close the stalled connection.
22261
- await new Promise(r => setTimeout(r, 2000))
22262
- continue
22263
- }
22401
+ // task() resolves only on clean shutdown (shutdown-drain stops the
22402
+ // runner), so exit the startup IIFE. Stall recovery no longer routes here:
22403
+ // onStall exits the process and the supervisor restarts the gateway
22404
+ // (see recoverFromPollStall). The 409 path below re-runs in place.
22264
22405
  return
22265
22406
  } catch (err) {
22266
22407
  if (err instanceof GrammyError && err.error_code === 409) {
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * Hostd dispatch helpers for the gateway's self-restart slash-commands
3
3
  * (#1175 RFC C, Phase 2). When the operator has opted into
4
- * `host_control.enabled: true`, /restart, /new, /reset, and
4
+ * `host_control.enabled: true`, /restart, /new, and
5
5
  * /update apply route through the per-agent hostd UDS instead of the
6
6
  * in-container `spawnSwitchroomDetached` shellout.
7
7
  *
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Idle auto-clear: wipe a session's working context after a wall-clock idle
3
+ * period (default 3h), so a long-untouched agent starts fresh next message
4
+ * instead of resuming a stale, context-heavy thread. Long-term memory lives in
5
+ * Hindsight, so a clear loses only the in-session scratch.
6
+ *
7
+ * Sibling of proactive-compact.ts (occupancy-driven /compact at the turn-end
8
+ * idle gate). This one is wall-clock-driven: it fires `/clear` from a periodic
9
+ * interval because a fully-idle agent never ends a turn, so the turn-end gate
10
+ * alone would never see it. Both inject via the same primitive and both refuse
11
+ * to fire mid-turn (turnInFlight guard).
12
+ *
13
+ * The decider is pure so the fire-once / re-arm / not-mid-turn / disabled logic
14
+ * is unit-tested without the gateway.
15
+ */
16
+
17
+ /** Default idle window when `session.idle_clear_after` is unset (3h). ON by default. */
18
+ export const DEFAULT_IDLE_CLEAR_MS = 3 * 60 * 60 * 1000;
19
+
20
/** Inputs to a single idle auto-clear evaluation. */
export interface IdleClearState {
  /** Epoch ms of the last activity (inbound, turn start, cron fire). */
  lastActivityAt: number;
  /** Idle window in ms. <= 0 (or NaN) disables auto-clear. */
  idleClearMs: number;
  /** Already auto-cleared since the last activity? Prevents re-clearing every tick. */
  alreadyCleared: boolean;
  /** A turn is in flight — never clear mid-turn. */
  turnInFlight: boolean;
}

/** Result of an idle auto-clear evaluation. */
export interface IdleClearDecision {
  clear: boolean;
}

/**
 * Decide whether to auto-clear this evaluation. Fires exactly once per idle
 * period: only when enabled, not mid-turn, not already cleared, and the idle
 * window has elapsed. The caller sets `alreadyCleared` on fire and resets it
 * (with `lastActivityAt`) on the next activity to re-arm.
 *
 * Clearing is destructive, so every numeric check fails closed: a NaN window,
 * NaN `lastActivityAt`, or NaN `now` yields `{ clear: false }` rather than
 * slipping through a comparison that NaN makes false.
 *
 * @param state current idle-clear inputs
 * @param now   epoch ms of this evaluation
 * @returns `{ clear: true }` only when all conditions above hold
 */
export function decideIdleClear(
  state: IdleClearState,
  now: number,
): IdleClearDecision {
  const { lastActivityAt, idleClearMs, alreadyCleared, turnInFlight } = state;

  // `!(x > 0)` rather than `x <= 0` so NaN counts as disabled.
  if (!(idleClearMs > 0)) return { clear: false }; // disabled
  if (turnInFlight) return { clear: false }; // never mid-turn
  if (alreadyCleared) return { clear: false }; // once per idle period

  // Positive form of the elapsed test: a NaN difference is "not elapsed".
  const idleFor = now - lastActivityAt;
  if (!(idleFor >= idleClearMs)) return { clear: false };
  return { clear: true };
}
51
+
52
/** Milliseconds per supported duration unit suffix. */
const IDLE_DURATION_UNIT_MS: Readonly<Record<string, number>> = {
  s: 1_000,
  m: 60_000,
  h: 3_600_000,
};

/**
 * Parse a `^\d+[smh]$` duration (the SessionSchema format, e.g. "3h", "30m",
 * "7200s") to ms. Surrounding whitespace is ignored. Kept local (vs importing
 * the web module's parser) to avoid cross-package coupling.
 *
 * Returns null — so the caller can fall back to the default — when:
 *   - the value is not a string at runtime (e.g. a bare number from config),
 *   - the string does not match the format, or
 *   - the result is not a safe integer (an absurdly long digit run would
 *     otherwise overflow to Infinity or lose precision).
 *
 * @param raw duration string
 * @returns the duration in milliseconds, or null when it cannot be parsed
 */
export function idleDurationToMs(raw: string): number | null {
  if (typeof raw !== "string") return null;

  const match = /^(\d+)([smh])$/.exec(raw.trim());
  if (!match) return null;

  const unitMs = IDLE_DURATION_UNIT_MS[match[2] ?? ""];
  if (unitMs === undefined) return null;

  const ms = Number(match[1]) * unitMs;
  return Number.isSafeInteger(ms) ? ms : null;
}
@@ -10,18 +10,23 @@
10
10
  *
11
11
  * Fix:
12
12
  * A separate setInterval calls `getMe()` (a lightweight Bot API
13
- * endpoint) every HEALTH_INTERVAL_MS. Three consecutive failures
14
- * constitute a stall: we stop the runner, wait RESTART_GRACE_MS
15
- * for the in-flight request to die, then let the caller restart it.
13
+ * endpoint) every `intervalMs`. `failureThreshold` consecutive
14
+ * failures constitute a stall and fire `onStall`.
16
15
  *
17
16
  * A single failure doesn't count — transient network blips happen.
18
17
  * The threshold must be >= 3 so a brief Telegram outage (e.g. a
19
18
  * data-centre hiccup) doesn't cause thrashing.
20
19
  *
20
+ * Recovery (see gateway.ts onStall → poll-stall-recovery.ts):
21
+ * `onStall` does NOT stop()+re-run the runner — grammy's stop() blocks
22
+ * on a non-abortable getUpdates retry backoff during an outage, which hung
23
+ * the fleet deaf (2026-06-18). It exits the process non-zero and the
24
+ * supervisor restarts the gateway with a fresh runner.
25
+ *
21
26
  * Usage:
22
27
  * const hc = createPollHealthCheck({
23
28
  * ping: () => bot.api.getMe(),
24
- * onStall: async () => { await runnerHandle.stop(); … restart … },
29
+ * onStall: async () => { recoverFromPollStall({ agentName }); },
25
30
  * log: (msg) => process.stderr.write(msg),
26
31
  * });
27
32
  * // start after the runner is up:
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Telegram poll stall recovery (incident 2026-06-18: a network flap left the
3
+ * whole agent fleet alive-but-deaf for ~30 min until a manual restart).
4
+ *
5
+ * When the long-poll health-check (poll-health.ts) detects a stall — 3
6
+ * consecutive `getMe` failures — the OLD recovery did `await runnerHandle.stop()`
7
+ * then re-ran the runner in place. That HUNG: grammy's `stop()` returns the
8
+ * runner task promise, which is blocked on a non-abortable `getUpdates` retry
9
+ * backoff (`@grammyjs/runner` maxRetryTime 15h, exponential, plain setTimeout
10
+ * that ignores the abort signal). During a network outage the source never
11
+ * terminates, so `stop()` never resolves, the re-run never fires, and the
12
+ * gateway stays deaf.
13
+ *
14
+ * Recovery is now a clean non-zero process EXIT: `_switchroom_supervise`
15
+ * (profiles/_base/start.sh.hbs) restarts the gateway sidecar with a fresh
16
+ * runner. Same mechanism the manual fix used and the run loop's catch-block
17
+ * already relies on; it restarts only the gateway (claude/tmux untouched,
18
+ * in-flight turns recovered by boot-resume). Extracted as its own module with
19
+ * an injectable `exit` so the exit code is unit-testable (mirrors
20
+ * startup-network-retry.ts) — gateway.ts itself can't be imported in a test.
21
+ */
22
+
23
/** Injectable collaborators for {@link recoverFromPollStall}. */
export interface PollStallRecoveryDeps {
  /** Process exit. Injectable for tests; defaults to process.exit. */
  exit?: (code: number) => void;
  /** Logger. Defaults to process.stderr. */
  log?: (msg: string) => void;
  /** Agent name for the log line. */
  agentName?: string;
}

/** Exit code for a poll stall. Always 1 — NEVER 78. */
export const POLL_STALL_EXIT_CODE = 1;

/**
 * Recover from a confirmed Telegram poll stall by exiting non-zero so the
 * supervisor restarts the gateway with a fresh runner.
 *
 * MUST exit 1 — NEVER 78. Exit 78 (EX_CONFIG) is the supervisor's permanent-
 * quarantine sentinel (start.sh.hbs); quarantining on a *transient* network
 * stall would leave the gateway dead even after connectivity returns — the
 * exact failure this fix exists to prevent.
 *
 * The log line is best-effort: if the logger throws, the exit still happens.
 * Otherwise a failing logger would leave the process alive, which is the
 * condition this function exists to end.
 *
 * @param deps optional `exit`, `log` and `agentName` overrides
 */
export function recoverFromPollStall(deps: PollStallRecoveryDeps = {}): void {
  const exit = deps.exit ?? ((code: number) => process.exit(code));
  const log =
    deps.log ??
    ((msg: string) => {
      process.stderr.write(msg.endsWith("\n") ? msg : msg + "\n");
    });
  const agentName = deps.agentName ?? "-";

  try {
    log(
      `telegram gateway: poll.health_check.stall_recovery exiting code=${POLL_STALL_EXIT_CODE} ` +
        `pid=${process.pid} agent=${agentName} — supervisor will restart the gateway with a fresh runner ` +
        `(not awaiting runnerHandle.stop(): grammy stop() blocks on a non-abortable getUpdates retry backoff during an outage)`,
    );
  } catch {
    /* logging is best-effort — the exit below must always run */
  }
  exit(POLL_STALL_EXIT_CODE);
}
59
+ }