polygram 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -214,14 +214,25 @@ const CODES = {
214
214
  isTransient: false,
215
215
  autoRecover: null,
216
216
  },
217
- // TURN_TIMEOUT: per-turn time cap (idle default 10 min, configurable per
218
- // chat/topic UMI runs 60 min). Mirror of the tmux wall-clock ceiling
219
- // typically a runaway, not a wedge. Not transient (auto-retry would just
220
- // runaway again). Copy must not name a number: the 2026-06-11 UMI false-⏱
221
- // rendered "10-minute" under a 60-minute cap.
217
+ // TURN_TIMEOUT: per-turn time cap fired because the turn went QUIET with no
218
+ // detectable progress (0.16: the busy-aware checkpoint extends a turn that's
219
+ // provably working, so reaching this code means the probe saw no streaming /
220
+ // no active shell — a genuine stall/wedge, not a long-but-working turn). Not
221
+ // transient. Copy must not name a number (the 2026-06-11 UMI false-⏱ rendered
222
+ // "10-minute" under a 60-minute cap).
222
223
  TURN_TIMEOUT: {
223
224
  kind: 'turnTimeout',
224
- userMessage: '⏱ This one ran past its time cap with no reply. Resend if the answer still matters.',
225
+ userMessage: '⏱ This one went quiet with no progress, so I stopped waiting — send /stop to clear it, or resend if you still need it.',
226
+ isTransient: false,
227
+ autoRecover: null,
228
+ },
229
+ // TURN_MAX_EXCEEDED (0.16): the busy-aware checkpoint kept extending a turn
230
+ // that WAS still working, until it hit the hard wall-clock backstop
231
+ // (turnHardMaxMs, default 90 min). Distinct from TURN_TIMEOUT (which means
232
+ // "went quiet") — here it ran genuinely long and we capped it for safety.
233
+ TURN_MAX_EXCEEDED: {
234
+ kind: 'turnMaxExceeded',
235
+ userMessage: '⏱ This ran past the max time and I had to stop it. Resend if you still need it — or break it into smaller steps.',
225
236
  isTransient: false,
226
237
  autoRecover: null,
227
238
  },
@@ -416,8 +427,38 @@ function detectWedgedSessionError(text) {
416
427
  return cls;
417
428
  }
418
429
 
430
+ /**
431
+ * 0.16: decide how the streamed-reply catch (polygram.js handleMessage) should
432
+ * cap the bubble + set the reactor when a turn ends in error. Extracted as a
433
+ * pure fn so the decision is unit-testable (the catch itself isn't unit-reachable).
434
+ *
435
+ * Returns { errorSuffix, reactorState }:
436
+ * - errorSuffix: appended to streamer.finalize('') (currently always 'stream interrupted'; the caller treats null as no suffix)
437
+ * - reactorState: reactor.setState(...) value
438
+ *
439
+ * Turn-end timeouts (TURN_TIMEOUT = went quiet, TURN_MAX_EXCEEDED = hit hard cap)
440
+ * are real stops → the "stream interrupted" suffix is honest here. Note: the cli
441
+ * backend's TURN_TIMEOUT err.message is `turn timeout (...)` which does NOT match
442
+ * the legacy /wall-clock ceiling|idle.../ regex, so err.code is checked first (a
443
+ * v1-review correction); the message regex stays only as a fallback for backends that set no code.
444
+ */
445
+ function classifyTurnEndError(err) {
446
+ const code = err?.code;
447
+ // The cli backend sets err.code (TURN_TIMEOUT / TURN_MAX_EXCEEDED). The SDK +
448
+ // tmux backends reject with a MESSAGE and NO code (e.g. "Turn exceeded 1800s
449
+ // wall-clock ceiling" / "Timeout: 600s idle with no Claude activity"), so we
450
+ // MUST keep the legacy message regex as a fallback — without it those
451
+ // backends' timeouts flip from the calm ⏱ TIMEOUT reactor to the scary ERROR
452
+ // one (regression caught in the 0.16 code review).
453
+ const isTimeout = code === 'TURN_TIMEOUT'
454
+ || code === 'TURN_MAX_EXCEEDED'
455
+ || /wall-clock ceiling|idle with no Claude activity/i.test(err?.message || '');
456
+ return { errorSuffix: 'stream interrupted', reactorState: isTimeout ? 'TIMEOUT' : 'ERROR' };
457
+ }
458
+
419
459
  module.exports = {
420
460
  classify,
461
+ classifyTurnEndError,
421
462
  detectWedgedSessionError,
422
463
  isTransientHttpError,
423
464
  PATTERNS,
@@ -97,7 +97,8 @@ const INPUT_LEDGER_CAP = 64;
97
97
  // (Envelope shape verified from prod JSONL + the P0 spike — Q1.)
98
98
  const UPS_ENVELOPE_TURN_ID_RE = /<channel\s[^>]*turn_id="([0-9a-f-]{36})"/g;
99
99
  const DEFAULT_TURN_TIMEOUT_MS = 600_000; // 10 min idle cap (resets on each reply — Review F#13)
100
- const DEFAULT_TURN_ABSOLUTE_MS = 1_800_000; // 30 min absolute wall-clock ceiling (no reset)
100
+ const DEFAULT_TURN_ABSOLUTE_MS = 1_800_000; // 30 min busy-aware checkpoint interval (0.16: re-arms while working)
101
+ const DEFAULT_TURN_HARD_MAX_MS = 5_400_000; // 90 min hard wall-clock backstop (0.16: extension can't exceed this)
101
102
  const DEFAULT_INTERRUPT_GRACE_MS = 5_000; // after Ctrl-C, wait this long for Claude to ack before synthesizing 'interrupted'
102
103
  const DEFAULT_MAX_REPLIES_PER_TURN = 20; // P1 #12: cap on quiet-window resets to prevent chatty-Claude hang
103
104
  const PING_INTERVAL_MS = 10_000;
@@ -223,6 +224,7 @@ class CliProcess extends Process {
223
224
  deliveryWatchdogMs = DEFAULT_DELIVERY_WATCHDOG_MS,
224
225
  turnTimeoutMs = DEFAULT_TURN_TIMEOUT_MS,
225
226
  turnAbsoluteMs = DEFAULT_TURN_ABSOLUTE_MS,
227
+ turnHardMaxMs = DEFAULT_TURN_HARD_MAX_MS,
226
228
  bgWorkStallMs = DEFAULT_BG_WORK_STALL_MS,
227
229
  interruptGraceMs = DEFAULT_INTERRUPT_GRACE_MS,
228
230
  maxRepliesPerTurn = DEFAULT_MAX_REPLIES_PER_TURN,
@@ -258,6 +260,7 @@ class CliProcess extends Process {
258
260
  this.deliveryWatchdogMs = deliveryWatchdogMs;
259
261
  this.turnTimeoutMs = turnTimeoutMs;
260
262
  this.turnAbsoluteMs = turnAbsoluteMs;
263
+ this.turnHardMaxMs = turnHardMaxMs;
261
264
  this.bgWorkStallMs = bgWorkStallMs;
262
265
  this.interruptGraceMs = interruptGraceMs;
263
266
  this.maxRepliesPerTurn = maxRepliesPerTurn;
@@ -2083,7 +2086,12 @@ class CliProcess extends Process {
2083
2086
  // Added absoluteTimer as the true wall-clock ceiling at 30 min so a
2084
2087
  // legitimate 15-min "replies every 60s" turn isn't killed mid-stream
2085
2088
  // while still bounding runaways.
2086
- const fireTimeout = (reason) => {
2089
+ // 0.16: `reason` is one of {'idle','absolute','hard-max'}. The absolute checkpoint
2090
+ // (_checkpointAbsolute) passes its already-captured `probeResult` so we
2091
+ // don't double capture-pane on the give-up path. err.code is mapped from
2092
+ // reason: 'hard-max' → TURN_MAX_EXCEEDED (ran long while working), else
2093
+ // → TURN_TIMEOUT (went quiet / idle).
2094
+ const fireTimeout = (reason, probeResult = null) => {
2087
2095
  if (!this.pendingTurns.has(turnId)) return;
2088
2096
  const pending = this.pendingTurns.get(turnId);
2089
2097
  // 0.13 D1 (S9): unblock any open ask FIRST — claude must never stay
@@ -2105,7 +2113,11 @@ class CliProcess extends Process {
2105
2113
  if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer);
2106
2114
  if (pending._onStop) this.off('stop-hook', pending._onStop);
2107
2115
  this.inFlight = this.pendingTurns.size > 0;
2108
- const turnTimeoutMs = reason === 'absolute' ? this.turnAbsoluteMs : (opts.maxTurnMs || this.turnTimeoutMs);
2116
+ const turnTimeoutMs = reason === 'hard-max'
2117
+ ? (pending._turnHardMaxMs || this.turnHardMaxMs)
2118
+ : reason === 'absolute'
2119
+ ? this.turnAbsoluteMs
2120
+ : (opts.maxTurnMs || this.turnTimeoutMs);
2109
2121
 
2110
2122
  // 0.13 D1 ceiling-resolve: a ceiling expiring on a turn with delivered
2111
2123
  // replies RESOLVES it — the user already has their answer; rejecting
@@ -2150,10 +2162,10 @@ class CliProcess extends Process {
2150
2162
  // 0.12.3 wedge characterization (docs/0.13-turn-wedge-autorecovery-spec.md):
2151
2163
  // a zero-reply turn hit the ceiling = claude wedged (no hooks AND no
2152
2164
  // "esc to interrupt" the whole window). Capture the TUI pane tail + busy
2153
- // flags to learn WHAT state claude is stuck in (a tool, an unrecognized
2154
- // prompt, the idle ❯, blank). Fire-and-forget: never blocks or changes
2155
- // the kill path; the tmux session is still alive at this point.
2156
- this.probeBusyState().then((probe) => {
2165
+ // flags to learn WHAT state claude is stuck in. 0.16: reuse the probe the
2166
+ // absolute checkpoint already captured (probeResult) to avoid a second
2167
+ // capture-pane; only probe fresh on the idle-timer path (no prior probe).
2168
+ const logProbe = (probe) => {
2157
2169
  this._logEvent('turn-timeout-pane', {
2158
2170
  reason,
2159
2171
  streaming: probe.streaming,
@@ -2162,10 +2174,12 @@ class CliProcess extends Process {
2162
2174
  captured: probe.captured,
2163
2175
  pane_tail: probe.paneTail,
2164
2176
  });
2165
- }).catch(() => { /* telemetry best-effort — never throws into the kill path */ });
2177
+ };
2178
+ if (probeResult) { try { logProbe(probeResult); } catch { /* best-effort */ } }
2179
+ else this.probeBusyState().then(logProbe).catch(() => { /* telemetry best-effort */ });
2166
2180
  this.emit('idle');
2167
2181
  const err = new Error(`turn timeout (${turnTimeoutMs}ms, reason=${reason})`);
2168
- err.code = 'TURN_TIMEOUT';
2182
+ err.code = reason === 'hard-max' ? 'TURN_MAX_EXCEEDED' : 'TURN_TIMEOUT';
2169
2183
  reject(err);
2170
2184
  };
2171
2185
  const pending = {
@@ -2179,15 +2193,24 @@ class CliProcess extends Process {
2179
2193
  // hardTimer = idle ceiling. Resets on any activity (_noteActivity)
2180
2194
  // so a chatty or tool-heavy turn isn't killed at 10 min wall-clock.
2181
2195
  hardTimer: setTimeout(() => fireTimeout('idle'), opts.maxTurnMs || this.turnTimeoutMs),
2182
- // absoluteTimer = wall-clock ceiling. Does NOT reset. Bounds true
2183
- // runaways. 30 min default high enough that legitimate
2184
- // multi-step refactors complete, low enough to catch infinite
2185
- // chatter.
2186
- absoluteTimer: setTimeout(() => fireTimeout('absolute'), this.turnAbsoluteMs),
2196
+ // absoluteTimer = busy-aware checkpoint (0.16). Fires every
2197
+ // turnAbsoluteMs (30min) as a LIVENESS CHECK: if the turn is provably
2198
+ // working (streaming/active shell + progress since last checkpoint) and
2199
+ // under the hard backstop, re-arm; else give up. Replaces the old
2200
+ // one-shot 30-min guillotine that cut actively-working turns.
2201
+ absoluteTimer: setTimeout(() => this._checkpointAbsolute(turnId), this.turnAbsoluteMs),
2187
2202
  // Review F#13: attach fireTimeout so activity can reset the idle
2188
2203
  // timer (creates a fresh setTimeout with the same closure).
2189
2204
  _fireTimeout: fireTimeout,
2190
2205
  startedAt: Date.now(),
2206
+ // 0.16: hard wall-clock backstop for this turn (per-send override →
2207
+ // instance default). Enforced only at checkpoints: no re-arm once elapsed reaches it, so the stop can land up to one turnAbsoluteMs later.
2208
+ _turnHardMaxMs: opts.maxTurnHardMs || this.turnHardMaxMs,
2209
+ // 0.16: checkpoint progress-tracking (MF-A) — extend only if activity
2210
+ // advanced since the previous checkpoint, not just "a shell exists".
2211
+ _lastCheckpointActivityAt: Date.now(),
2212
+ _lastCheckpointPaneTail: null,
2213
+ _extended: false,
2191
2214
  };
2192
2215
  this.pendingTurns.set(turnId, pending);
2193
2216
 
@@ -2332,6 +2355,81 @@ class CliProcess extends Process {
2332
2355
  this._interruptGraceTimer.unref?.();
2333
2356
  }
2334
2357
 
2358
+ /**
2359
+ * 0.16 busy-aware ceiling checkpoint. Armed by the per-turn absoluteTimer
2360
+ * every `turnAbsoluteMs` (30min). Decides whether to EXTEND a still-working
2361
+ * turn or give up:
2362
+ *
2363
+ * - replied turn → resolve gracefully (delegate to fireTimeout, which takes
2364
+ * its "0.13 D1 ceiling-resolve" branch).
2365
+ * - probe says working AND progress advanced since last checkpoint AND
2366
+ * elapsed < hard backstop → re-arm (turn stays pending, /stop keeps
2367
+ * working, the live reply lands in the same bubble). Ping once.
2368
+ * - not working / no progress → give up as 'idle' → TURN_TIMEOUT (went quiet).
2369
+ * - elapsed ≥ hard backstop → give up as 'hard-max' → TURN_MAX_EXCEEDED.
2370
+ *
2371
+ * MF-A: "working" requires evidence of PROGRESS (streaming now, or activity /
2372
+ * pane changed since the last checkpoint), not merely a shell's existence — a
2373
+ * hung/zombie background shell would otherwise extend to the hard max.
2374
+ * MF-C: re-check pendingTurns AFTER the async probe (the turn can resolve /
2375
+ * abort / kill during the capture-pane round-trip — TOCTOU), and reassign
2376
+ * pending.absoluteTimer so teardown sites clear the live handle.
2377
+ */
2378
+ async _checkpointAbsolute(turnId) {
2379
+ if (!this.pendingTurns.has(turnId)) return;
2380
+ let pending = this.pendingTurns.get(turnId);
2381
+ // Replied turn (or consumed-acked): the ceiling RESOLVES it, never extends.
2382
+ if ((pending.replies?.length || 0) > 0
2383
+ || (pending.seen === true && pending._consumedAcked === true)) {
2384
+ pending._fireTimeout('absolute');
2385
+ return;
2386
+ }
2387
+ let probe = null;
2388
+ try { probe = await this.probeBusyState(); } catch { probe = null; }
2389
+ // MF-C TOCTOU: the turn may have settled during the capture-pane await.
2390
+ if (!this.pendingTurns.has(turnId)) return;
2391
+ pending = this.pendingTurns.get(turnId);
2392
+ // Also bail if the turn entered finalization DURING the probe — a reply
2393
+ // landed, or it's in stop-grace, or it consumed-acked. Re-arming or pinging
2394
+ // now would resurrect a settling turn (spurious "still working" right as the
2395
+ // real answer lands). It will finalize through its own quiet/grace path.
2396
+ if (pending._stopGracePending
2397
+ || (pending.replies?.length || 0) > 0
2398
+ || (pending.seen === true && pending._consumedAcked === true)) return;
2399
+ const now = Date.now();
2400
+ const elapsed = now - pending.startedAt;
2401
+ const maxMs = pending._turnHardMaxMs || this.turnHardMaxMs;
2402
+ const streaming = !!(probe && probe.streaming);
2403
+ const hasShell = !!(probe && (probe.backgroundShell || probe.shellCount > 0));
2404
+ const lastAct = Math.max(this._lastActivityAt || 0, this._lastHookEventAt || 0);
2405
+ // MF-A progress delta: streaming NOW is live proof; otherwise require that
2406
+ // activity advanced OR the pane changed since the previous checkpoint.
2407
+ const progressed = streaming
2408
+ || (lastAct > (pending._lastCheckpointActivityAt || pending.startedAt))
2409
+ || (!!probe && probe.paneTail != null && probe.paneTail !== pending._lastCheckpointPaneTail);
2410
+ const working = (streaming || hasShell) && progressed;
2411
+
2412
+ if (working && elapsed < maxMs) {
2413
+ pending._lastCheckpointActivityAt = lastAct || pending._lastCheckpointActivityAt;
2414
+ pending._lastCheckpointPaneTail = (probe && probe.paneTail) || pending._lastCheckpointPaneTail;
2415
+ // MF-C: reassign the live handle so cleanup sites clear THIS timer.
2416
+ pending.absoluteTimer = setTimeout(() => this._checkpointAbsolute(turnId), this.turnAbsoluteMs);
2417
+ this._logEvent('turn-extended', {
2418
+ turn_id: turnId, elapsed_ms: elapsed, streaming, shell_count: probe ? probe.shellCount : 0,
2419
+ });
2420
+ // Progress ping ONCE per turn (first extension) — emits an event polygram
2421
+ // turns into a single "still working" message (honest: probe-confirmed).
2422
+ if (!pending._extended) {
2423
+ pending._extended = true;
2424
+ this.emit('turn-extended', { sessionKey: this.sessionKey, turnId, elapsedMs: elapsed });
2425
+ }
2426
+ return;
2427
+ }
2428
+ // Give up: hard-max (was working but ran too long) vs idle (went quiet).
2429
+ const reason = elapsed >= maxMs ? 'hard-max' : 'idle';
2430
+ pending._fireTimeout(reason, probe);
2431
+ }
2432
+
2335
2433
  /**
2336
2434
  * Is claude actually still working, regardless of the resolved-turn flag?
2337
2435
  *
@@ -2425,6 +2523,16 @@ class CliProcess extends Process {
2425
2523
  return this._bgWorkSince !== null;
2426
2524
  }
2427
2525
 
2526
+ /**
2527
+ * 0.16 (MF-B): does any in-flight turn have a busy-aware ceiling EXTENSION
2528
+ * active? Such a turn can hold its slot up to the hard backstop, so the LRU
2529
+ * treats it as a durable pin (soft-overflow) rather than a transient turn.
2530
+ */
2531
+ hasExtendedTurn() {
2532
+ for (const p of this.pendingTurns.values()) if (p._extended) return true;
2533
+ return false;
2534
+ }
2535
+
2428
2536
  /**
2429
2537
  * Resolve the model / effort for a spawn context using the topic→chat→
2430
2538
  * fallback precedence (mirrors the spawn path). Single source of truth shared
@@ -53,6 +53,12 @@ const CALLBACK_TO_EVENT = {
53
53
  // posts/edits a "⏳ working in background" status message so a long job reads as
54
54
  // working, not stuck. See docs/0.12.0-background-work-lifecycle-plan.md.
55
55
  onBgWorkStatus: 'bg-work-status',
56
+ // 0.16 busy-aware ceiling: CliProcess emits 'turn-extended' the FIRST time a
57
+ // turn passes the 30-min checkpoint while still provably working. The callback
58
+ // posts a one-time "⏳ still working — /stop to cancel" message so a long turn
59
+ // reads as alive (not the old false "stream interrupted"). See
60
+ // docs/0.16-turn-ceiling-busy-aware-spec.md.
61
+ onTurnExtended: 'turn-extended',
56
62
  // 0.12 interactive questions: CliProcess emits 'question-asked'
57
63
  // {sessionKey, chatId, threadId, turnId, toolCallId, questions} when claude calls
58
64
  // the `ask` tool. The callback (polygram) renders the Telegram inline keyboard;
@@ -351,6 +357,12 @@ class ProcessManager {
351
357
  _hasPinnedSession() {
352
358
  for (const p of this.procs.values()) {
353
359
  if (!p.inFlight && p.hasActiveBackgroundWork()) return true;
360
+ // 0.16 (MF-B): an extended busy-aware-ceiling turn is a DURABLE blocker —
361
+ // it can hold its slot up to the hard backstop (90min), not "seconds" like
362
+ // a normal in-flight turn. Treat it as a pin so getOrSpawn SOFT-overflows
363
+ // (spawn over budget + warn) instead of park-then-reject, which would deny
364
+ // service to other chats for the full 5-min LRU wait.
365
+ if (p.inFlight && typeof p.hasExtendedTurn === 'function' && p.hasExtendedTurn()) return true;
354
366
  }
355
367
  return false;
356
368
  }
@@ -359,6 +371,7 @@ class ProcessManager {
359
371
  const keys = [];
360
372
  for (const [k, p] of this.procs.entries()) {
361
373
  if (!p.inFlight && p.hasActiveBackgroundWork()) keys.push(k);
374
+ else if (p.inFlight && typeof p.hasExtendedTurn === 'function' && p.hasExtendedTurn()) keys.push(k);
362
375
  }
363
376
  return keys;
364
377
  }
@@ -411,6 +411,41 @@ function createSdkCallbacks({
411
411
  }
412
412
  },
413
413
 
414
+ // 0.16 busy-aware ceiling: CliProcess emits 'turn-extended' the FIRST time a
415
+ // turn passes the 30-min checkpoint while still provably working. Post ONE
416
+ // honest "still working — /stop" message so a long turn reads as alive
417
+ // instead of the old false "stream interrupted". The cli side flags
418
+ // _extended once per turn, so this fires at most once per long turn.
419
+ // Opt-out per chat via progressPings:false. NO "ask how it's going" — a
420
+ // foreground-streaming turn can't answer (review F1).
421
+ onTurnExtended: async (sessionKey, payload) => {
422
+ try {
423
+ if (!bot) return;
424
+ const chatId = getChatIdFromKey(sessionKey);
425
+ const chatCfg = (config && config.chats && config.chats[chatId]) || {};
426
+ // Precedence: per-chat overrides default (a chat can re-enable pings even
427
+ // if the global default disables them, and vice-versa). Default ON.
428
+ const chatPings = chatCfg.progressPings;
429
+ const enabled = chatPings !== undefined
430
+ ? chatPings !== false
431
+ : (config && config.defaults && config.defaults.progressPings) !== false;
432
+ if (!enabled) return;
433
+ const threadIdRaw = getThreadIdFromKey(sessionKey);
434
+ const threadId = threadIdRaw ? parseInt(threadIdRaw, 10) : null;
435
+ await tg(bot, 'sendMessage', {
436
+ chat_id: chatId,
437
+ text: '⏳ Still working on this — it\'s taking a while. Send /stop to cancel.',
438
+ ...(Number.isInteger(threadId) && { message_thread_id: threadId }),
439
+ }, { source: 'turn-extended', botName });
440
+ logEvent('turn-extended-ping', {
441
+ chat_id: chatId, session_key: sessionKey, thread_id: threadIdRaw,
442
+ elapsed_ms: payload?.elapsedMs ?? null,
443
+ });
444
+ } catch (err) {
445
+ logger.error?.(`[${botName}] turn-extended handler: ${err.message}`);
446
+ }
447
+ },
448
+
414
449
  // R8: a failed autosteer paste. injectUserMessage fires
415
450
  // `inject-fail` when its fire-and-forget paste rejects (tmux
416
451
  // server gone, paste-buffer error, etc.). Before this handler was
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polygram",
3
- "version": "0.15.0",
3
+ "version": "0.16.0",
4
4
  "description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
5
5
  "main": "lib/ipc/client.js",
6
6
  "bin": {
package/polygram.js CHANGED
@@ -108,7 +108,7 @@ const { startTyping } = require('./lib/telegram/typing');
108
108
  const { createReactionManager, classifyToolName } = require('./lib/telegram/reactions');
109
109
  const { createMediaGroupBuffer } = require('./lib/media-group-buffer');
110
110
  const { applyReactionToMessages } = require('./lib/telegram/album-reactions');
111
- const { classify: classifyError, detectWedgedSessionError, isTransientHttpError } = require('./lib/error/classify');
111
+ const { classify: classifyError, classifyTurnEndError, detectWedgedSessionError, isTransientHttpError } = require('./lib/error/classify');
112
112
  const { createAutoResumeTracker, isAutoResumable } = require('./lib/db/auto-resume');
113
113
  const { resolveReplayWindowMs } = require('./lib/db/replay-window');
114
114
  const { pruneEvents, resolveRetentionPolicy, validatePolicy } = require('./lib/db/events-retention');
@@ -510,6 +510,10 @@ async function sendToProcess(sessionKey, prompt, context = {}, { onDispatched }
510
510
  const chatConfig = config.chats[chatId];
511
511
  const timeoutMs = (chatConfig.timeout || config.defaults.timeout) * 1000;
512
512
  const maxTurnMs = (chatConfig.maxTurn || config.defaults?.maxTurn || 1800) * 1000;
513
+ // 0.16 busy-aware ceiling: hard wall-clock backstop for a turn that keeps
514
+ // extending while provably working (cli backend). Per-chat → default →
515
+ // 90 min. The checkpoint never extends a turn past this.
516
+ const maxTurnHardMs = (chatConfig.maxTurnHard || config.defaults?.maxTurnHard || 5400) * 1000;
513
517
 
514
518
  // 0.12 Phase 2.1: HeartbeatReactor binding removed for CliProcess.
515
519
  // 0.11.0-channels needed a random-cycling working-pool reactor because
@@ -540,7 +544,7 @@ async function sendToProcess(sessionKey, prompt, context = {}, { onDispatched }
540
544
  // starts, which is the correct UX (and what the user already expects).
541
545
  const release = await stdinLock.acquire(sessionKey);
542
546
  try {
543
- const turnP = pm.send(sessionKey, prompt, { timeoutMs, maxTurnMs, context });
547
+ const turnP = pm.send(sessionKey, prompt, { timeoutMs, maxTurnMs, maxTurnHardMs, context });
544
548
  // Phase 3 §4: pm.send synchronously kicks off the turn — the
545
549
  // process is now inFlight. Signal the committed-intent latch so
546
550
  // it can release; a concurrent handler will then correctly see
@@ -1711,12 +1715,14 @@ async function handleMessage(sessionKey, chatId, msg, bot) {
1711
1715
  // On shutdown, leave the reactor state as-is — boot-replay's
1712
1716
  // fresh dispatch will set its own reactor.
1713
1717
  } else {
1714
- await streamer.finalize('', { errorSuffix: 'stream interrupted' }).catch(() => {});
1715
- if (/wall-clock ceiling|idle with no Claude activity/i.test(err?.message || '')) {
1716
- reactor.setState('TIMEOUT');
1717
- } else {
1718
- reactor.setState('ERROR');
1719
- }
1718
+ // 0.16: branch the bubble suffix + reactor on err.code via the pure
1719
+ // classifyTurnEndError helper (the cli TURN_TIMEOUT message is
1720
+ // `turn timeout (...)`, which does NOT match the legacy regex — branch on
1721
+ // code first, legacy message text only as a fallback). TURN_TIMEOUT (went quiet) / TURN_MAX_EXCEEDED (hit hard
1722
+ // cap) → TIMEOUT reactor; anything else → ERROR.
1723
+ const { errorSuffix, reactorState } = classifyTurnEndError(err);
1724
+ await streamer.finalize('', errorSuffix ? { errorSuffix } : {}).catch(() => {});
1725
+ reactor.setState(reactorState);
1720
1726
  }
1721
1727
  throw err;
1722
1728
  } finally {