npm - polygram - Versions diffs - 0.8.0-rc.53 → 0.8.0-rc.55 - Mend

polygram 0.8.0-rc.53 → 0.8.0-rc.55

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/.claude-plugin/plugin.json +1 -1
package/lib/auto-resume.js +101 -0
package/package.json +1 -1
package/polygram.js +179 -11

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "$schema": "https://anthropic.com/claude-code/plugin.schema.json",
   "name": "polygram",
-  "version": "0.8.0-rc.53",
+  "version": "0.8.0-rc.55",
   "description": "Telegram integration for Claude Code that preserves the OpenClaw per-chat session model. Migration target for OpenClaw users. Multi-bot, multi-chat, per-topic isolation; SQLite transcripts; inline-keyboard approvals. Bundles /polygram:status|logs|pair-code|approvals admin commands and a history skill.",
   "keywords": [
     "telegram",

package/lib/auto-resume.js ADDED Viewed

@@ -0,0 +1,101 @@
+/**
+ * rc.54: auto-resume on 300s no-activity timeout.
+ *
+ * Background — the rc.54 incident pattern:
+ *   When polygram's per-turn watchdog fires "Timeout: 300s idle with
+ *   no Claude activity", the running SDK Query is torn down and the
+ *   user gets the friendly "⏳ I went quiet too long without finishing.
+ *   Try resending or simplifying." message. The session_id is preserved,
+ *   so the *next* user message resumes context — but the work the user
+ *   was waiting for is dropped on the floor.
+ *
+ *   Most timeouts are wedged tool calls (long Bash, hanging MCP, stuck
+ *   subagent). The wedged subprocess is dead by the time the watchdog
+ *   fires; a fresh resume of the same session_id will spawn a clean
+ *   Query and Claude has full prior context to continue.
+ *
+ * What this module provides: a per-session cooldown tracker so we
+ * don't auto-resume in a tight loop when the wedge is permanent.
+ *
+ *   - markAttempt(sessionKey) — record we just tried an auto-resume
+ *   - isInCooldown(sessionKey) — true if we attempted within the
+ *     cooldown window (default 10 min). Caller skips auto-resume and
+ *     falls back to the existing user-facing timeout reply.
+ *   - clear(sessionKey) — drop the timestamp (e.g. a successful turn
+ *     completed since the auto-resume — we're back to healthy).
+ */
+'use strict';
+const DEFAULT_COOLDOWN_MS = 10 * 60 * 1000; // 10 min
+/**
+ * Build an in-memory, per-session cooldown tracker for auto-resume.
+ *
+ * Attempt timestamps live in a Map keyed by sessionKey. Expired
+ * entries are pruned (lazily in isInCooldown, swept in markAttempt)
+ * so sessions that are marked but never clear()ed do not accumulate
+ * for the lifetime of the process.
+ *
+ * @param {object} [options]
+ * @param {number} [options.cooldownMs=DEFAULT_COOLDOWN_MS] Length of
+ *   the cooldown window in milliseconds.
+ * @param {() => number} [options.now=Date.now] Clock returning
+ *   milliseconds; injectable so tests can control time.
+ * @returns {object} Tracker exposing isInCooldown, markAttempt, clear,
+ *   reset, plus the _size/_get test hooks.
+ */
+function createAutoResumeTracker({ cooldownMs = DEFAULT_COOLDOWN_MS, now = Date.now } = {}) {
+  const lastAttemptAt = new Map();
+  // Same comparison the cooldown has always used: an attempt stays
+  // "fresh" while strictly less than cooldownMs has elapsed.
+  const isFresh = (attemptedAt, at) => at - attemptedAt < cooldownMs;
+  return {
+    /**
+     * Returns true if the most recent attempt for this sessionKey was
+     * within `cooldownMs` ago. Use to gate further auto-resume
+     * attempts when a wedge keeps recurring. An entry found to be
+     * past its window is dropped on the spot.
+     */
+    isInCooldown(sessionKey) {
+      const ts = lastAttemptAt.get(sessionKey);
+      if (ts == null) return false;
+      if (isFresh(ts, now())) return true;
+      // Expired: forget it so the Map only holds live cooldowns.
+      lastAttemptAt.delete(sessionKey);
+      return false;
+    },
+    /**
+     * Record an auto-resume attempt. Call BEFORE dispatching the
+     * resumed turn so a fast follow-up timeout can still see this
+     * session is in cooldown. Also sweeps expired entries of other
+     * sessions; attempts are rare, so the linear pass is cheap.
+     */
+    markAttempt(sessionKey) {
+      const at = now();
+      // Deleting the entry currently being visited is safe during
+      // Map iteration.
+      for (const [key, ts] of lastAttemptAt) {
+        if (!isFresh(ts, at)) lastAttemptAt.delete(key);
+      }
+      lastAttemptAt.set(sessionKey, at);
+    },
+    /**
+     * Clear the cooldown for a session — called when a normal turn
+     * succeeds, signalling the session is healthy again. Without
+     * this, a session that auto-resumed once would be locked out of
+     * future auto-resumes for the full cooldown window even after
+     * recovery.
+     */
+    clear(sessionKey) {
+      lastAttemptAt.delete(sessionKey);
+    },
+    /**
+     * Reset all tracked sessions. Called by daemon reload, tests.
+     */
+    reset() {
+      lastAttemptAt.clear();
+    },
+    // Test hooks
+    _size() { return lastAttemptAt.size; },
+    _get(sessionKey) { return lastAttemptAt.get(sessionKey); },
+  };
+}
+/**
+ * Decide whether a failed turn is a candidate for auto-resume.
+ *
+ * Gates:
+ *   - error message matches the 300s no-activity timeout pattern
+ *     (NOT the wall-clock ceiling — that's usually a runaway, not
+ *     a wedge; resuming might just run away again)
+ *   - NOT user-aborted (the user explicitly /stop'd; never resume)
+ *   - NOT a boot-replay (the user typed this minutes ago and moved
+ *     on; resuming now is more confusing than helpful)
+ *   - NOT during shutdown (boot replay will pick it up)
+ *
+ * @param {object} [ctx] Omitted or empty means "not resumable".
+ * @param {Error|string} [ctx.error] The failure; `error.message` is
+ *   inspected when present, otherwise the value is stringified.
+ * @param {boolean} [ctx.aborted] Turn was aborted by the user.
+ * @param {boolean} [ctx.replay] Turn was a boot-replay.
+ * @param {boolean} [ctx.shuttingDown] Process is shutting down.
+ * @returns {boolean} true only for the idle-timeout error with none
+ *   of the three flags set.
+ */
+function isAutoResumable({ error, aborted, replay, shuttingDown } = {}) {
+  if (aborted || replay || shuttingDown) return false;
+  const msg = String(error?.message || error || '');
+  // Case-insensitive substring match on the watchdog's idle-timeout
+  // wording; the wall-clock ceiling message does not contain it.
+  return /idle with no Claude activity/i.test(msg);
+}
+module.exports = {
+  createAutoResumeTracker,
+  isAutoResumable,
+  DEFAULT_COOLDOWN_MS,
+};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "polygram",
-  "version": "0.8.0-rc.53",
+  "version": "0.8.0-rc.55",
   "description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
   "main": "lib/ipc-client.js",
   "bin": {

package/polygram.js CHANGED Viewed

@@ -65,6 +65,7 @@ const { redactBotToken } = require('./lib/net-errors');
 const { createReactionManager, classifyToolName } = require('./lib/status-reactions');
 const { createMediaGroupBuffer } = require('./lib/media-group-buffer');
 const { classify: classifyError, isTransientHttpError } = require('./lib/error-classify');
+const { createAutoResumeTracker, isAutoResumable } = require('./lib/auto-resume');
 const {
   createStore: createApprovalsStore,
   matchesAnyPattern: matchesApprovalPattern,
@@ -1118,6 +1119,94 @@ const abortGrace = createAbortGrace();
 function markSessionAborted(sessionKey) { abortGrace.mark(sessionKey); }
 function isSessionRecentlyAborted(sessionKey) { return abortGrace.isRecent(sessionKey); }
+// rc.54: per-session cooldown for auto-resume on 300s no-activity
+// timeout. Without the cooldown, a permanently-wedged tool would
+// trigger an infinite resume → timeout → resume loop.
+// Created with no options, so the tracker runs on its defaults: the
+// cooldown window is DEFAULT_COOLDOWN_MS from lib/auto-resume.js
+// (10 minutes).
+const autoResumeTracker = createAutoResumeTracker();
+// rc.54: run one auto-resume attempt for a turn that died on the idle
+// watchdog. In order:
+//   1. post a 🔁 notice as a reply to the user's original message;
+//   2. push a plain-text continuation prompt through sendToProcess
+//      for the same sessionKey, with inert streamer/reactor stubs;
+//   3. chunk the returned text and deliver it as a reply to the
+//      original message.
+// NOTE(review): the actual resume wiring (saved session_id, `--resume`
+// on a freshly spawned Query) lives outside this function — per the
+// rc.54 design notes sendToProcess takes care of it; confirm there.
+//
+// Resolves with result.text after delivery has been awaited. Rejects
+// when the turn reports an error, yields no text, or deliverReplies
+// rejects; the caller is expected to fall back to the normal timeout
+// reply. A failed 🔁 notice is only logged.
+async function attemptAutoResume(sessionKey, chatId, originalMsg, bot) {
+  const threadId = originalMsg.message_thread_id || null;
+  // Step 1 — the notice. Purely informational, so a send failure is
+  // logged and the resume carries on.
+  const notice = {
+    chat_id: chatId,
+    text: '🔁 Auto-resuming after timeout — continuing where the previous turn left off.',
+    reply_parameters: { message_id: originalMsg.message_id },
+  };
+  if (threadId) notice.message_thread_id = threadId;
+  const logNoticeFailure = (sendErr) => {
+    console.error(`[${sessionKey}] auto-resume indicator send failed: ${sendErr.message}`);
+  };
+  await tg(bot, 'sendMessage', notice, { source: 'auto-resume-indicator', botName: BOT_NAME })
+    .catch(logNoticeFailure);
+  // Step 2 — the continuation prompt. Plain text, no XML wrapper: it
+  // only has to say WHAT happened and ask Claude to carry on.
+  const continuation = '[polygram] Your previous turn timed out at 300s with no Claude activity (likely a wedged tool call — long Bash, hanging MCP, or stuck subagent). Continue from where you left off; do not restart from scratch. If the same operation would just hang again, abort it and tell me.';
+  // The resumed turn is not streamed; its text goes out in one batch
+  // in step 3. Streamer and reactor are therefore do-nothing stubs.
+  // NOTE(review): the original notes say pm only needs these methods
+  // to exist — verify against the process manager.
+  const doNothing = () => {};
+  const doNothingAsync = async () => {};
+  const turnOptions = {
+    streamer: {
+      onChunk: doNothingAsync,
+      forceNewMessage: doNothing,
+      finalize: async () => ({ streamed: false }),
+      flushDraft: doNothingAsync,
+      discard: doNothingAsync,
+    },
+    reactor: {
+      setState: doNothing,
+      heartbeat: doNothing,
+      clear: doNothingAsync,
+      stop: doNothing,
+    },
+    sourceMsgId: originalMsg.message_id,
+    threadId,
+    onFirstStream: doNothing,
+  };
+  const result = await sendToProcess(sessionKey, continuation, turnOptions);
+  const turnError = result?.error;
+  if (turnError) {
+    throw new Error(`auto-resume turn errored: ${String(turnError).slice(0, 200)}`);
+  }
+  const replyText = result?.text;
+  if (!replyText) {
+    throw new Error('auto-resume turn produced no text');
+  }
+  // Step 3 — deliver through the shared chunking + reply helpers,
+  // replying to the original user message.
+  await deliverReplies({
+    bot,
+    send: (targetBot, method, params, sendMeta) => tg(targetBot, method, params, sendMeta),
+    chatId,
+    threadId,
+    chunks: chunkMarkdownText(replyText, TG_MAX_LEN),
+    replyToMessageId: originalMsg.message_id,
+    meta: { source: 'auto-resume-reply', botName: BOT_NAME },
+    logger: { error: (detail) => console.error(`[${sessionKey}] auto-resume deliver: ${detail}`) },
+  });
+  return replyText;
+}
 // Called by bot.on('message') for every regular (non-admin, non-pair)
 // message. Runs handleMessage in a fire-and-forget manner with centralised
 // error handling. Replaces the old processQueue loop.
@@ -1163,6 +1252,28 @@ function dispatchHandleMessage(sessionKey, chatId, msg, bot) {
       aborted: wasAborted || undefined,
       replay: isReplay || undefined,
     });
+    // rc.55: surface replay failures with a meaningful message. Pre-rc.55
+    // any boot-replay turn that failed for ANY reason was silently dropped
+    // (the original logic assumed "user typed this minutes ago and moved
+    // on"). But the rc.51-onward boot-replay path is a recovery primitive,
+    // not stale-message handling — when it fails, the user IS still waiting
+    // for their answer. The Shumabit@UMI thread :24 wall-clock incident on
+    // 2026-05-04 hit exactly this: original turn SIGHUP'd by deploy → boot-
+    // replay redispatched → replay hit 1800s wall-clock → user saw nothing
+    // and didn't know their work had been lost.
+    //
+    // Now: send a tailored message on replay failures. Still suppress when
+    // the replay itself was killed by ANOTHER shutdown (the next boot will
+    // redispatch — same logic as before, just narrower).
+    if (isReplay && !wasAborted && !isShuttingDown) {
+      tg(bot, 'sendMessage', {
+        chat_id: chatId,
+        text: '⚠️ This turn was interrupted and didn\'t complete on retry — please rephrase or simplify, or split into smaller steps.',
+        reply_parameters: { message_id: msg.message_id },
+      }, { source: 'error-reply', botName: BOT_NAME }).catch((replyErr) => {
+        console.error(`[${sessionKey}] failed to send replay-failure reply: ${replyErr.message}`);
+      });
+    }
     // Suppress the user-facing error reply when:
     //  - boot replay (user typed this minutes ago and moved on)
     //  - polygram is shutting down (the failure is "Process killed" /
@@ -1170,6 +1281,49 @@ function dispatchHandleMessage(sessionKey, chatId, msg, bot) {
     //    re-dispatch it on next start)
     //  - user just /stop'd (already saw their abort acknowledgement)
     if (!wasAborted && !isReplay && !isShuttingDown) {
+      // rc.54: auto-resume on 300s no-activity timeout. Spawn a fresh
+      // Query resuming the same session_id and inject a continuation
+      // nudge. This recovers from wedged tool calls (long Bash, hung
+      // MCP, stuck subagent) that polygram's watchdog catches but
+      // currently leaves the user stranded with "try resending".
+      // Skipped when the failed turn was ITSELF an auto-resume
+      // (msg._isAutoResume) to prevent recursion; per-session
+      // cooldown blocks tight loops on permanent wedges.
+      const isResumeTurn = msg._isAutoResume === true;
+      const resumable = !isResumeTurn && isAutoResumable({
+        error: err, aborted: wasAborted, replay: isReplay, shuttingDown: isShuttingDown,
+      });
+      if (resumable && !autoResumeTracker.isInCooldown(sessionKey)) {
+        autoResumeTracker.markAttempt(sessionKey);
+        logEvent('auto-resume-attempted', {
+          chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
+          original_error: err.message?.slice(0, 200),
+        });
+        attemptAutoResume(sessionKey, chatId, msg, bot)
+          .then(() => {
+            logEvent('auto-resume-success', {
+              chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
+            });
+            autoResumeTracker.clear(sessionKey);
+          })
+          .catch((resumeErr) => {
+            console.error(`[${sessionKey}] auto-resume failed: ${resumeErr?.message}`);
+            logEvent('auto-resume-failed', {
+              chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
+              error: resumeErr?.message?.slice(0, 200),
+            });
+            // Fall back to the original error reply so the user isn't
+            // left with just the 🔁 indicator and no answer.
+            const fallbackText = errorReplyText(err);
+            if (fallbackText) {
+              tg(bot, 'sendMessage', {
+                chat_id: chatId, text: fallbackText,
+                reply_parameters: { message_id: msg.message_id },
+              }, { source: 'error-reply', botName: BOT_NAME }).catch(() => {});
+            }
+          });
+        return;
+      }
       // 0.7.7: errorReplyText may return null when the classifier
       // says "suppress reply" (e.g. INTERRUPTED inside abort grace —
       // user already saw their /stop ack). Skip the send call in
@@ -2943,19 +3097,33 @@ async function handleMessage(sessionKey, chatId, msg, bot) {
     // without the scary "⚠ stream interrupted" banner. The user has already
     // seen their "Остановлено." ack; adding a warning to the partial bubble
     // just reads as "something crashed".
+    //
+    // rc.55: SAME quiet-finalize path during shutdown. Pre-rc.55 a deploy
+    // that landed mid-turn appended "⚠ stream interrupted" to whatever
+    // had streamed so far — the user saw a scary symbol every time we
+    // kickstart-k'd. polygram's boot-replay (rc.51) redispatches the
+    // turn from the same session_id, so the recovery is automatic; the
+    // user shouldn't be told "we crashed". Skip the suffix; let the
+    // partial bubble stand silently. The redispatched turn streams a
+    // fresh bubble with the full answer below.
     const abortedByUser = isSessionRecentlyAborted(sessionKey);
-    if (abortedByUser) {
+    const quietFinalize = abortedByUser || isShuttingDown;
+    if (quietFinalize) {
       await streamer.finalize('').catch(() => {});
-      // 0.8.0-rc.13: clear the in-flight emoji on abort so the user
-      // sees a clean message after their /stop ack — pre-rc.13 the
-      // last 👀 / 🤔 / ✍ stayed stuck on the message indefinitely
-      // because reactor.stop() (in finally) only kills timers, not
-      // the visible reaction. We DON'T set 🤯/😨 (those are for
-      // unexpected errors); the user just wants their stop honored.
-      await reactor.clear().catch(() => {});
-      // rc.14: clear ✍ on autosteered followups too (per-msg
-      // reactors are already GC'd in their own handleMessage scopes).
-      await clearAutosteeredReactions(sessionKey).catch(() => {});
+      if (abortedByUser) {
+        // 0.8.0-rc.13: clear the in-flight emoji on abort so the user
+        // sees a clean message after their /stop ack — pre-rc.13 the
+        // last 👀 / 🤔 / ✍ stayed stuck on the message indefinitely
+        // because reactor.stop() (in finally) only kills timers, not
+        // the visible reaction. We DON'T set 🤯/😨 (those are for
+        // unexpected errors); the user just wants their stop honored.
+        await reactor.clear().catch(() => {});
+        // rc.14: clear ✍ on autosteered followups too (per-msg
+        // reactors are already GC'd in their own handleMessage scopes).
+        await clearAutosteeredReactions(sessionKey).catch(() => {});
+      }
+      // On shutdown, leave the reactor state as-is — boot-replay's
+      // fresh dispatch will set its own reactor.
     } else {
       await streamer.finalize('', { errorSuffix: 'stream interrupted' }).catch(() => {});
       if (/wall-clock ceiling|idle with no Claude activity/i.test(err?.message || '')) {