polygram 0.8.0-rc.53 → 0.8.0-rc.55

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "https://anthropic.com/claude-code/plugin.schema.json",
3
3
  "name": "polygram",
4
- "version": "0.8.0-rc.53",
4
+ "version": "0.8.0-rc.55",
5
5
  "description": "Telegram integration for Claude Code that preserves the OpenClaw per-chat session model. Migration target for OpenClaw users. Multi-bot, multi-chat, per-topic isolation; SQLite transcripts; inline-keyboard approvals. Bundles /polygram:status|logs|pair-code|approvals admin commands and a history skill.",
6
6
  "keywords": [
7
7
  "telegram",
@@ -0,0 +1,101 @@
1
+ /**
2
+ * rc.54: auto-resume on 300s no-activity timeout.
3
+ *
4
+ * Background — the rc.54 incident pattern:
5
+ * When polygram's per-turn watchdog fires "Timeout: 300s idle with
6
+ * no Claude activity", the running SDK Query is torn down and the
7
+ * user gets the friendly "⏳ I went quiet too long without finishing.
8
+ * Try resending or simplifying." message. The session_id is preserved,
9
+ * so the *next* user message resumes context — but the work the user
10
+ * was waiting for is dropped on the floor.
11
+ *
12
+ * Most timeouts are wedged tool calls (long Bash, hanging MCP, stuck
13
+ * subagent). The wedged subprocess is dead by the time the watchdog
14
+ * fires; a fresh resume of the same session_id will spawn a clean
15
+ * Query and Claude has full prior context to continue.
16
+ *
17
+ * What this module provides: a per-session cooldown tracker so we
18
+ * don't auto-resume in a tight loop when the wedge is permanent.
19
+ *
20
+ * - markAttempt(sessionKey) — record we just tried an auto-resume
21
+ * - isInCooldown(sessionKey) — true if we attempted within the
22
+ * cooldown window (default 10 min). Caller skips auto-resume and
23
+ * falls back to the existing user-facing timeout reply.
24
+ * - clear(sessionKey) — drop the timestamp (e.g. a successful turn
25
+ * completed since the auto-resume — we're back to healthy).
26
+ */
27
+
28
+ 'use strict';
29
+
30
const DEFAULT_COOLDOWN_MS = 10 * 60 * 1000; // 10 min

/**
 * Create a per-session cooldown tracker for auto-resume attempts.
 *
 * The tracker remembers when each session last attempted an auto-resume so
 * the caller can refuse to auto-resume again while a wedge keeps recurring.
 *
 * @param {object} [options]
 * @param {number} [options.cooldownMs=DEFAULT_COOLDOWN_MS] Cooldown window in
 *   milliseconds. `null`, `undefined`, or a value that coerces to NaN falls
 *   back to DEFAULT_COOLDOWN_MS; otherwise the comparison
 *   `elapsed < cooldownMs` would always be false and the cooldown would be
 *   silently disabled. `0` still means "no cooldown".
 * @param {() => number} [options.now=Date.now] Clock returning milliseconds;
 *   injectable so tests can control time.
 * @returns {{
 *   isInCooldown: (sessionKey: string) => boolean,
 *   markAttempt: (sessionKey: string) => void,
 *   clear: (sessionKey: string) => void,
 *   reset: () => void,
 *   _size: () => number,
 *   _get: (sessionKey: string) => (number|undefined)
 * }}
 */
function createAutoResumeTracker({ cooldownMs = DEFAULT_COOLDOWN_MS, now = Date.now } = {}) {
  // Number() mirrors the coercion the `<` operator would apply, so numeric
  // strings and other previously-working values keep their meaning.
  const requestedWindow = cooldownMs == null ? DEFAULT_COOLDOWN_MS : Number(cooldownMs);
  const windowMs = Number.isNaN(requestedWindow) ? DEFAULT_COOLDOWN_MS : requestedWindow;

  // sessionKey -> timestamp (per `now()`) of the most recent attempt.
  const lastAttemptAt = new Map();

  // Single definition of "still cooling down" shared by the read path and
  // the eviction sweep so the two can never disagree.
  const isWithinWindow = (attemptedAt, currentTime) => currentTime - attemptedAt < windowMs;

  // Drop entries whose cooldown has lapsed. Without this, a session whose
  // auto-resume failed and which never later succeeded would stay in the
  // map for the lifetime of the process.
  function evictExpired(currentTime) {
    for (const [key, attemptedAt] of lastAttemptAt) {
      if (!isWithinWindow(attemptedAt, currentTime)) {
        lastAttemptAt.delete(key);
      }
    }
  }

  return {
    /**
     * Returns true if the most recent attempt for this sessionKey was
     * less than the cooldown window ago. Use to gate further auto-resume
     * attempts when a wedge keeps recurring.
     */
    isInCooldown(sessionKey) {
      const attemptedAt = lastAttemptAt.get(sessionKey);
      if (attemptedAt == null) return false;
      return isWithinWindow(attemptedAt, now());
    },

    /**
     * Record an auto-resume attempt. Call BEFORE dispatching the
     * resumed turn so a fast follow-up timeout can still see this
     * session is in cooldown. Also sweeps out entries for other
     * sessions whose cooldown has already expired.
     */
    markAttempt(sessionKey) {
      const currentTime = now();
      evictExpired(currentTime);
      lastAttemptAt.set(sessionKey, currentTime);
    },

    /**
     * Clear the cooldown for a session, signalling it is healthy again.
     * Without this, a session that auto-resumed once would be locked
     * out of further auto-resumes for the full cooldown window even
     * after it recovered.
     */
    clear(sessionKey) {
      lastAttemptAt.delete(sessionKey);
    },

    /**
     * Reset all tracked sessions. Called by daemon reload, tests.
     */
    reset() {
      lastAttemptAt.clear();
    },

    // Test hooks
    _size() { return lastAttemptAt.size; },
    _get(sessionKey) { return lastAttemptAt.get(sessionKey); },
  };
}
78
+
79
/**
 * Decide whether an error is a candidate for auto-resume.
 *
 * Gates:
 *   - error message matches the 300s no-activity timeout pattern
 *     (NOT the wall-clock ceiling — that's usually a runaway, not
 *     a wedge; resuming might just run away again)
 *   - NOT user-aborted (the user explicitly /stop'd; never resume)
 *   - NOT a boot-replay (the user typed this minutes ago and moved
 *     on; resuming now is more confusing than helpful)
 *   - NOT during shutdown (boot replay will pick it up)
 *
 * @param {object} [context] Omitting it entirely is treated as "nothing
 *   to resume" rather than throwing on destructuring.
 * @param {Error|string|*} [context.error] The failure; either an object
 *   with a `message` or a value that is stringified.
 * @param {boolean} [context.aborted] The user aborted the turn.
 * @param {boolean} [context.replay] The turn was a boot-replay.
 * @param {boolean} [context.shuttingDown] The process is shutting down.
 * @returns {boolean} true only for the idle/no-activity timeout when none
 *   of the suppressing flags are set.
 */
function isAutoResumable({ error, aborted, replay, shuttingDown } = {}) {
  if (aborted || replay || shuttingDown) return false;
  // Accept Error-like objects and bare strings alike.
  const msg = String(error?.message || error || '');
  return /idle with no Claude activity/i.test(msg);
}
96
+
97
+ module.exports = {
98
+ createAutoResumeTracker,
99
+ isAutoResumable,
100
+ DEFAULT_COOLDOWN_MS,
101
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polygram",
3
- "version": "0.8.0-rc.53",
3
+ "version": "0.8.0-rc.55",
4
4
  "description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
5
5
  "main": "lib/ipc-client.js",
6
6
  "bin": {
package/polygram.js CHANGED
@@ -65,6 +65,7 @@ const { redactBotToken } = require('./lib/net-errors');
65
65
  const { createReactionManager, classifyToolName } = require('./lib/status-reactions');
66
66
  const { createMediaGroupBuffer } = require('./lib/media-group-buffer');
67
67
  const { classify: classifyError, isTransientHttpError } = require('./lib/error-classify');
68
+ const { createAutoResumeTracker, isAutoResumable } = require('./lib/auto-resume');
68
69
  const {
69
70
  createStore: createApprovalsStore,
70
71
  matchesAnyPattern: matchesApprovalPattern,
@@ -1118,6 +1119,94 @@ const abortGrace = createAbortGrace();
1118
1119
  function markSessionAborted(sessionKey) { abortGrace.mark(sessionKey); }
1119
1120
  function isSessionRecentlyAborted(sessionKey) { return abortGrace.isRecent(sessionKey); }
1120
1121
 
1122
+ // rc.54: per-session cooldown for auto-resume on 300s no-activity
1123
+ // timeout. Without the cooldown, a permanently-wedged tool would
1124
+ // trigger an infinite resume → timeout → resume loop.
1125
+ const autoResumeTracker = createAutoResumeTracker();
1126
+
1127
// rc.54: run one auto-resume turn for a session whose previous turn hit
// the 300s no-activity timeout.
//
// Per the rc.54 design notes, the timed-out Query has already been torn
// down by the time this runs, and dispatching on the same sessionKey is
// expected to resume the saved session_id, so Claude keeps its prior
// context. This function only has to (1) tell the user a resume is under
// way, (2) send a continuation prompt, and (3) deliver the answer.
//
// Resolves with the reply text once it has been handed to deliverReplies.
// Rejects if the resumed turn reports an error, yields no text, or the
// dispatch/delivery itself rejects; the caller is responsible for logging
// that and falling back to the standard timeout reply.
async function attemptAutoResume(sessionKey, chatId, originalMsg, bot) {
  const replyTargetId = originalMsg.message_id;
  const threadId = originalMsg.message_thread_id || null;

  // Step 1: user-visible indicator, threaded under the original message.
  const indicatorParams = {
    chat_id: chatId,
    text: '🔁 Auto-resuming after timeout — continuing where the previous turn left off.',
    reply_parameters: { message_id: replyTargetId },
  };
  if (threadId) {
    indicatorParams.message_thread_id = threadId;
  }
  // The indicator is purely informational, so a failed send is logged and
  // the resume carries on regardless.
  const logIndicatorFailure = (sendErr) => {
    console.error(`[${sessionKey}] auto-resume indicator send failed: ${sendErr.message}`);
  };
  await tg(bot, 'sendMessage', indicatorParams, {
    source: 'auto-resume-indicator',
    botName: BOT_NAME,
  }).catch(logIndicatorFailure);

  // Step 2: plain-text continuation prompt (no XML wrapper). It states
  // what happened and asks Claude to continue rather than start over.
  const continuation = '[polygram] Your previous turn timed out at 300s with no Claude activity (likely a wedged tool call — long Bash, hanging MCP, or stuck subagent). Continue from where you left off; do not restart from scratch. If the same operation would just hang again, abort it and tell me.';

  // Step 3: the resumed turn is not streamed; its text is sent in one go
  // afterwards. Inert streamer/reactor stand-ins are passed so the
  // dispatch path has objects to call.
  const silentStreamer = {
    onChunk: async () => {},
    forceNewMessage: () => {},
    finalize: async () => ({ streamed: false }),
    flushDraft: async () => {},
    discard: async () => {},
  };
  const silentReactor = {
    setState: () => {},
    heartbeat: () => {},
    clear: async () => {},
    stop: () => {},
  };

  const turn = await sendToProcess(sessionKey, continuation, {
    streamer: silentStreamer,
    reactor: silentReactor,
    sourceMsgId: replyTargetId,
    threadId,
    onFirstStream: () => {},
  });

  if (turn?.error) {
    throw new Error(`auto-resume turn errored: ${String(turn.error).slice(0, 200)}`);
  }
  const replyText = turn?.text;
  if (!replyText) {
    throw new Error('auto-resume turn produced no text');
  }

  // Step 4: deliver through the shared chunking + delivery helpers,
  // replying to the original user message.
  const deliveryLogger = {
    error: (m) => console.error(`[${sessionKey}] auto-resume deliver: ${m}`),
  };
  await deliverReplies({
    bot,
    send: (b, method, params, m) => tg(b, method, params, m),
    chatId,
    threadId,
    chunks: chunkMarkdownText(replyText, TG_MAX_LEN),
    replyToMessageId: replyTargetId,
    meta: { source: 'auto-resume-reply', botName: BOT_NAME },
    logger: deliveryLogger,
  });

  return replyText;
}
1209
+
1121
1210
  // Called by bot.on('message') for every regular (non-admin, non-pair)
1122
1211
  // message. Runs handleMessage in a fire-and-forget manner with centralised
1123
1212
  // error handling. Replaces the old processQueue loop.
@@ -1163,6 +1252,28 @@ function dispatchHandleMessage(sessionKey, chatId, msg, bot) {
1163
1252
  aborted: wasAborted || undefined,
1164
1253
  replay: isReplay || undefined,
1165
1254
  });
1255
+ // rc.55: surface replay failures with a meaningful message. Pre-rc.55
1256
+ // any boot-replay turn that failed for ANY reason was silently dropped
1257
+ // (the original logic assumed "user typed this minutes ago and moved
1258
+ // on"). But the rc.51-onward boot-replay path is a recovery primitive,
1259
+ // not stale-message handling — when it fails, the user IS still waiting
1260
+ // for their answer. The Shumabit@UMI thread :24 wall-clock incident on
1261
+ // 2026-05-04 hit exactly this: original turn SIGHUP'd by deploy → boot-
1262
+ // replay redispatched → replay hit 1800s wall-clock → user saw nothing
1263
+ // and didn't know their work had been lost.
1264
+ //
1265
+ // Now: send a tailored message on replay failures. Still suppress when
1266
+ // the replay itself was killed by ANOTHER shutdown (the next boot will
1267
+ // redispatch — same logic as before, just narrower).
1268
+ if (isReplay && !wasAborted && !isShuttingDown) {
1269
+ tg(bot, 'sendMessage', {
1270
+ chat_id: chatId,
1271
+ text: '⚠️ This turn was interrupted and didn\'t complete on retry — please rephrase or simplify, or split into smaller steps.',
1272
+ reply_parameters: { message_id: msg.message_id },
1273
+ }, { source: 'error-reply', botName: BOT_NAME }).catch((replyErr) => {
1274
+ console.error(`[${sessionKey}] failed to send replay-failure reply: ${replyErr.message}`);
1275
+ });
1276
+ }
1166
1277
  // Suppress the user-facing error reply when:
1167
1278
  // - boot replay (user typed this minutes ago and moved on)
1168
1279
  // - polygram is shutting down (the failure is "Process killed" /
@@ -1170,6 +1281,49 @@ function dispatchHandleMessage(sessionKey, chatId, msg, bot) {
1170
1281
  // re-dispatch it on next start)
1171
1282
  // - user just /stop'd (already saw their abort acknowledgement)
1172
1283
  if (!wasAborted && !isReplay && !isShuttingDown) {
1284
+ // rc.54: auto-resume on 300s no-activity timeout. Spawn a fresh
1285
+ // Query resuming the same session_id and inject a continuation
1286
+ // nudge. This recovers from wedged tool calls (long Bash, hung
1287
+ // MCP, stuck subagent) that polygram's watchdog catches but
1288
+ // currently leaves the user stranded with "try resending".
1289
+ // Skipped when the failed turn was ITSELF an auto-resume
1290
+ // (msg._isAutoResume) to prevent recursion; per-session
1291
+ // cooldown blocks tight loops on permanent wedges.
1292
+ const isResumeTurn = msg._isAutoResume === true;
1293
+ const resumable = !isResumeTurn && isAutoResumable({
1294
+ error: err, aborted: wasAborted, replay: isReplay, shuttingDown: isShuttingDown,
1295
+ });
1296
+ if (resumable && !autoResumeTracker.isInCooldown(sessionKey)) {
1297
+ autoResumeTracker.markAttempt(sessionKey);
1298
+ logEvent('auto-resume-attempted', {
1299
+ chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
1300
+ original_error: err.message?.slice(0, 200),
1301
+ });
1302
+ attemptAutoResume(sessionKey, chatId, msg, bot)
1303
+ .then(() => {
1304
+ logEvent('auto-resume-success', {
1305
+ chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
1306
+ });
1307
+ autoResumeTracker.clear(sessionKey);
1308
+ })
1309
+ .catch((resumeErr) => {
1310
+ console.error(`[${sessionKey}] auto-resume failed: ${resumeErr?.message}`);
1311
+ logEvent('auto-resume-failed', {
1312
+ chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
1313
+ error: resumeErr?.message?.slice(0, 200),
1314
+ });
1315
+ // Fall back to the original error reply so the user isn't
1316
+ // left with just the 🔁 indicator and no answer.
1317
+ const fallbackText = errorReplyText(err);
1318
+ if (fallbackText) {
1319
+ tg(bot, 'sendMessage', {
1320
+ chat_id: chatId, text: fallbackText,
1321
+ reply_parameters: { message_id: msg.message_id },
1322
+ }, { source: 'error-reply', botName: BOT_NAME }).catch(() => {});
1323
+ }
1324
+ });
1325
+ return;
1326
+ }
1173
1327
  // 0.7.7: errorReplyText may return null when the classifier
1174
1328
  // says "suppress reply" (e.g. INTERRUPTED inside abort grace —
1175
1329
  // user already saw their /stop ack). Skip the send call in
@@ -2943,19 +3097,33 @@ async function handleMessage(sessionKey, chatId, msg, bot) {
2943
3097
  // without the scary "⚠ stream interrupted" banner. The user has already
2944
3098
  // seen their "Остановлено." ack; adding a warning to the partial bubble
2945
3099
  // just reads as "something crashed".
3100
+ //
3101
+ // rc.55: SAME quiet-finalize path during shutdown. Pre-rc.55 a deploy
3102
+ // that landed mid-turn appended "⚠ stream interrupted" to whatever
3103
+ // had streamed so far — the user saw a scary symbol every time we
3104
+ // kickstart-k'd. polygram's boot-replay (rc.51) redispatches the
3105
+ // turn from the same session_id, so the recovery is automatic; the
3106
+ // user shouldn't be told "we crashed". Skip the suffix; let the
3107
+ // partial bubble stand silently. The redispatched turn streams a
3108
+ // fresh bubble with the full answer below.
2946
3109
  const abortedByUser = isSessionRecentlyAborted(sessionKey);
2947
- if (abortedByUser) {
3110
+ const quietFinalize = abortedByUser || isShuttingDown;
3111
+ if (quietFinalize) {
2948
3112
  await streamer.finalize('').catch(() => {});
2949
- // 0.8.0-rc.13: clear the in-flight emoji on abort so the user
2950
- // sees a clean message after their /stop ack pre-rc.13 the
2951
- // last 👀 / 🤔 / stayed stuck on the message indefinitely
2952
- // because reactor.stop() (in finally) only kills timers, not
2953
- // the visible reaction. We DON'T set 🤯/😨 (those are for
2954
- // unexpected errors); the user just wants their stop honored.
2955
- await reactor.clear().catch(() => {});
2956
- // rc.14: clear on autosteered followups too (per-msg
2957
- // reactors are already GC'd in their own handleMessage scopes).
2958
- await clearAutosteeredReactions(sessionKey).catch(() => {});
3113
+ if (abortedByUser) {
3114
+ // 0.8.0-rc.13: clear the in-flight emoji on abort so the user
3115
+ // sees a clean message after their /stop ack. Pre-rc.13 the
3116
+ // last 👀 / 🤔 / stayed stuck on the message indefinitely
3117
+ // because reactor.stop() (in finally) only kills timers, not
3118
+ // the visible reaction. We DON'T set 🤯/😨 (those are for
3119
+ // unexpected errors); the user just wants their stop honored.
3120
+ await reactor.clear().catch(() => {});
3121
+ // rc.14: clear on autosteered followups too (per-msg
3122
+ // reactors are already GC'd in their own handleMessage scopes).
3123
+ await clearAutosteeredReactions(sessionKey).catch(() => {});
3124
+ }
3125
+ // On shutdown, leave the reactor state as-is — boot-replay's
3126
+ // fresh dispatch will set its own reactor.
2959
3127
  } else {
2960
3128
  await streamer.finalize('', { errorSuffix: 'stream interrupted' }).catch(() => {});
2961
3129
  if (/wall-clock ceiling|idle with no Claude activity/i.test(err?.message || '')) {