polygram 0.8.0-rc.49 → 0.8.0-rc.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "$schema": "https://anthropic.com/claude-code/plugin.schema.json",
3
3
  "name": "polygram",
4
- "version": "0.8.0-rc.49",
4
+ "version": "0.8.0-rc.50",
5
5
  "description": "Telegram integration for Claude Code that preserves the OpenClaw per-chat session model. Migration target for OpenClaw users. Multi-bot, multi-chat, per-topic isolation; SQLite transcripts; inline-keyboard approvals. Bundles /polygram:status|logs|pair-code|approvals admin commands and a history skill.",
6
6
  "keywords": [
7
7
  "telegram",
@@ -0,0 +1,240 @@
1
+ /**
2
+ * rc.50: process-guard helpers — orphan-detection PID file + safety
3
+ * handlers for uncaughtException / unhandledRejection that don't
4
+ * re-enter on broken stdout.
5
+ *
6
+ * Background — the rc.50 incident:
7
+ * PID 6335 (rc.48) was orphaned when its tmux pane was destroyed
8
+ * during `launchctl kickstart -k`. polygram's existing SIGHUP
9
+ * handler should have drained cleanly, but during the drain
10
+ * `console.error` inside the uncaughtException handler itself
11
+ * threw EIO (stdout was wired to a now-destroyed pty). That fired
12
+ * the same handler, which logged again, which threw EIO again — a
13
+ * tight re-entrant loop that hijacked the event loop and prevented
14
+ * shutdown from completing. The orphan ran for 3+ hours writing
15
+ * 3.59M+ uncaught-exception rows to the DB at ~12k/sec, and
16
+ * polled the same Telegram bot token in parallel with the new
17
+ * daemon.
18
+ *
19
+ * This module provides three primitives — claimPidFile,
20
+ * releasePidFile and installSafetyHandlers — which polygram.js wires together at boot.
21
+ */
22
+
23
+ 'use strict';
24
+
25
+ const fs = require('fs');
26
+
27
/**
 * Boot-time orphan detection. Writes our PID to `pidPath`. If the file
 * already exists and names a different, live PID, that process is sent
 * SIGTERM, then SIGKILL if it is still alive after `sigtermWaitMs`.
 * The PID file is (re)written with our own PID in every case.
 *
 * `priorAction` in the result is one of:
 *   'no-prior'              – no PID file existed
 *   'malformed-overwritten' – file unreadable, empty, non-numeric or "0"
 *   'self-skip'             – file already named this process
 *   'stale-overwritten'     – file named a PID that is no longer alive
 *   'sigterm-killed'        – prior process exited after SIGTERM
 *   'sigkill-killed'        – prior process exited after SIGKILL
 *   'kill-failed'           – prior process was still alive after SIGKILL
 *                             and `sigkillWaitMs` (e.g. we lack permission)
 *
 * NOTE(review): the PID read from the file is trusted as-is. If the OS has
 * recycled that PID for an unrelated process, that process receives the
 * signals — there is no identity check here.
 *
 * @param {string} pidPath - Location of the PID file.
 * @param {object} [opts]
 * @param {object} [opts.logger=console] - Sink exposing an optional `log(msg)`.
 * @param {number} [opts.sigtermWaitMs=2000] - Grace period after SIGTERM.
 * @param {number} [opts.sigkillWaitMs=1000] - How long to wait for death after SIGKILL.
 * @returns {{ priorPid: number|null, priorAction: string }}
 */
function claimPidFile(pidPath, { logger = console, sigtermWaitMs = 2000, sigkillWaitMs = 1000 } = {}) {
  const ownPid = process.pid;
  let priorPid = null;
  let priorAction = 'no-prior';

  // Logging must never abort the claim: a throwing log sink (e.g. stdout
  // wired to a destroyed pty) would otherwise stop boot before the PID
  // file is written.
  const log = (line) => {
    try { logger?.log?.(line); } catch { /* broken log sink — keep going */ }
  };

  // Synchronously poll until `pid` is gone or `timeoutMs` elapses.
  // Returns true when the process is no longer alive. Blocking is
  // intentional: the caller must not proceed while the orphan may still
  // be running.
  const waitForExit = (pid, timeoutMs, stepMs) => {
    const startedAt = Date.now();
    while (Date.now() - startedAt < timeoutMs && isAlive(pid)) {
      sleepSync(stepMs);
    }
    return !isAlive(pid);
  };

  if (fs.existsSync(pidPath)) {
    let raw = '';
    try {
      raw = fs.readFileSync(pidPath, 'utf8').trim();
    } catch { /* unreadable — handled as malformed below */ }

    // Digits only: rejects negatives and garbage. "0" parses to 0 and is
    // rejected by the falsy check, so we never signal PID 0.
    const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : null;

    if (!parsed) {
      priorAction = 'malformed-overwritten';
    } else if (parsed === ownPid) {
      // Re-entrant call from same process — write but don't kill self.
      priorPid = parsed;
      priorAction = 'self-skip';
    } else if (!isAlive(parsed)) {
      priorPid = parsed;
      priorAction = 'stale-overwritten';
    } else {
      priorPid = parsed;
      log(`[orphan-guard] prior daemon PID ${parsed} still alive — sending SIGTERM`);
      try { process.kill(parsed, 'SIGTERM'); } catch { /* raced with exit, or not permitted */ }

      if (waitForExit(parsed, sigtermWaitMs, 50)) {
        priorAction = 'sigterm-killed';
      } else {
        log(`[orphan-guard] PID ${parsed} ignored SIGTERM — escalating to SIGKILL`);
        try { process.kill(parsed, 'SIGKILL'); } catch { /* raced with exit, or not permitted */ }

        // SIGKILL delivery is asynchronous, so poll for actual death and
        // report honestly when it did not happen.
        if (waitForExit(parsed, sigkillWaitMs, 20)) {
          priorAction = 'sigkill-killed';
        } else {
          priorAction = 'kill-failed';
          log(`[orphan-guard] PID ${parsed} still alive after SIGKILL — could not remove prior daemon`);
        }
      }
    }
  }

  fs.writeFileSync(pidPath, `${ownPid}\n`, { mode: 0o600 });
  return { priorPid, priorAction };
}
90
+
91
/**
 * Delete the PID file on clean shutdown, but only when it still contains
 * OUR PID. If another process has already rewritten the file, it is left
 * alone.
 *
 * Best-effort: every filesystem error (missing file, unreadable file,
 * failed unlink) is swallowed, so calling this repeatedly or on a path
 * that does not exist is harmless.
 *
 * @param {string} pidPath - Location of the PID file.
 * @returns {void}
 */
function releasePidFile(pidPath) {
  try {
    // Read directly instead of existsSync-then-read: a missing file simply
    // throws ENOENT into the catch below, which removes the extra
    // check-then-act window.
    const owner = fs.readFileSync(pidPath, 'utf8').trim();
    if (owner === String(process.pid)) {
      fs.unlinkSync(pidPath);
    }
    // Otherwise the file names a different PID; leave it in place.
  } catch {
    // Deliberately ignored — nothing useful to do about it at shutdown.
  }
}
106
+
107
/**
 * Build an uncaughtException handler that:
 *   1. Never throws. Extracting the message/stack, calling `logger.error`
 *      and calling `logEvent` are each wrapped in try/catch, so a broken
 *      log sink or a hostile thrown value (non-string `stack`, object
 *      that cannot be stringified) cannot escape the handler.
 *   2. Tracks repetitions of the same exception message in a sliding
 *      window. When the same message has fired `eioThreshold` times within
 *      `eioWindowMs`, it logs once more, records a 'panic-exit' event and
 *      calls `panicExit(2)`. After that the handler is inert.
 *
 * Per-message bookkeeping is bounded: only the most recently seen
 * messages are tracked, so a long-running process that sees many distinct
 * messages does not accumulate state forever.
 *
 * @param {object} opts
 * @param {object} opts.logger - { error(msg) } sink for human-readable logs.
 * @param {function(string, object)} opts.logEvent - DB persist sink.
 * @param {string} opts.botName
 * @param {number} [opts.eioThreshold=100]
 * @param {number} [opts.eioWindowMs=5000]
 * @param {function(number)} [opts.panicExit=process.exit]
 * @param {function(): number} [opts.now=Date.now]
 * @returns {function(Error)}
 */
function _makeUncaughtHandler({
  logger,
  logEvent,
  botName,
  eioThreshold = 100,
  eioWindowMs = 5000,
  panicExit = (code) => process.exit(code),
  now = Date.now,
} = {}) {
  // Upper bound on distinct messages tracked at once (least recently seen
  // is evicted first).
  const MAX_TRACKED_MESSAGES = 256;
  // Per-message sliding-window timestamps, kept in least- to
  // most-recently-seen order. Map<message, number[]>.
  const recent = new Map();
  let panicked = false;

  // Pull a bounded message and a short stack out of whatever was thrown.
  // Anything can be thrown in JS, so every access is guarded.
  const describe = (err) => {
    let msg = 'unknown';
    try {
      msg = String(err?.message || err || 'unknown').slice(0, 500);
    } catch { /* value could not be stringified */ }
    let stack = '';
    try {
      const rawStack = err?.stack;
      if (typeof rawStack === 'string') {
        stack = rawStack.split('\n').slice(0, 5).join('\n');
      }
    } catch { /* throwing getter */ }
    return { msg, stack };
  };

  return function uncaughtHandler(err) {
    if (panicked) return; // bail — we're on our way out
    const { msg, stack } = describe(err);

    // 1. Log defensively. The sink may be broken; must not re-throw.
    try {
      logger?.error?.(`[polygram] uncaughtException: ${msg}\n${stack}`);
    } catch { /* swallow — broken log sink */ }

    // 2. Persist defensively. The store may be unavailable.
    try {
      logEvent?.('uncaught-exception', { message: msg, bot_name: botName });
    } catch { /* swallow */ }

    // 3. Storm circuit breaker: same message N times in window → exit.
    const t = now();
    let timestamps = recent.get(msg);
    if (timestamps) {
      recent.delete(msg); // re-inserted below so it becomes most recent
    } else {
      timestamps = [];
    }
    recent.set(msg, timestamps);
    if (recent.size > MAX_TRACKED_MESSAGES) {
      // Map iterates in insertion order, so the first key is the least
      // recently seen message.
      recent.delete(recent.keys().next().value);
    }

    timestamps.push(t);
    // Drop expired.
    while (timestamps.length && t - timestamps[0] > eioWindowMs) timestamps.shift();

    if (timestamps.length >= eioThreshold) {
      panicked = true;
      try {
        logger?.error?.(`[polygram] uncaughtException circuit breaker: ${timestamps.length}× "${msg}" in ${eioWindowMs}ms — exit(2)`);
      } catch { /* swallow — broken log sink */ }
      try {
        logEvent?.('panic-exit', { message: msg, count: timestamps.length, window_ms: eioWindowMs, bot_name: botName });
      } catch { /* swallow */ }
      panicExit(2);
    }
  };
}
177
+
178
/**
 * Build an unhandledRejection handler with the same defensive posture as
 * the uncaughtException handler, but with its own repetition counter
 * (rejections and exceptions can come from different code paths and
 * shouldn't share a budget).
 *
 * Differences from the exception handler, both applied by wrapping the
 * sinks passed to the shared factory:
 *   - persisted events use kind 'unhandled-rejection' ('panic-exit' is
 *     passed through unchanged);
 *   - human-readable log lines are labelled "unhandledRejection" rather
 *     than "uncaughtException", so the two sources can be told apart.
 *
 * @param {object} [opts] - Same options as `_makeUncaughtHandler`.
 * @returns {function(*)} Handler taking the rejection reason.
 */
function _makeUnhandledRejectionHandler(opts = {}) {
  const { logger, logEvent } = opts;
  const EXCEPTION_LABEL = '[polygram] uncaughtException';
  const REJECTION_LABEL = '[polygram] unhandledRejection';

  // Swap the label at the front of lines produced by the shared factory;
  // anything else is forwarded untouched.
  const relabel = (line) => (
    typeof line === 'string' && line.startsWith(EXCEPTION_LABEL)
      ? `${REJECTION_LABEL}${line.slice(EXCEPTION_LABEL.length)}`
      : line
  );

  const inner = _makeUncaughtHandler({
    ...opts,
    logger: logger
      ? { error: (line) => logger.error?.(relabel(line)) }
      : undefined,
    // Override the 'kind' written to events table.
    logEvent: logEvent
      ? (kind, detail) => logEvent(kind === 'panic-exit' ? 'panic-exit' : 'unhandled-rejection', detail)
      : undefined,
  });

  return (reason /* , promise */) => {
    let err;
    if (reason instanceof Error) {
      err = reason;
    } else {
      // A rejection reason can be any value, including ones that throw
      // when stringified; never let that escape the handler.
      try {
        err = new Error(String(reason));
      } catch {
        err = new Error('unknown');
      }
    }
    inner(err);
  };
}
194
+
195
/**
 * Convenience wrapper: build the uncaughtException and unhandledRejection
 * handlers from the same options and register both on `process`.
 *
 * @param {object} opts - Passed unchanged to both handler factories.
 * @returns {{ uninstall: function() }} `uninstall()` removes exactly the
 *   two listeners registered by this call.
 */
function installSafetyHandlers(opts) {
  // [event name, listener] pairs, in registration order.
  const registrations = [
    ['uncaughtException', _makeUncaughtHandler(opts)],
    ['unhandledRejection', _makeUnhandledRejectionHandler(opts)],
  ];

  for (const [eventName, listener] of registrations) {
    process.on(eventName, listener);
  }

  return {
    uninstall() {
      for (const [eventName, listener] of registrations) {
        process.off(eventName, listener);
      }
    },
  };
}
211
+
212
+ // ─── helpers ─────────────────────────────────────────────────────────
213
+
214
/**
 * Report whether a process with the given PID currently exists, by
 * sending it signal 0 (existence/permission probe; nothing is delivered).
 *
 * Only positive integer PIDs are probed. Zero and negative values are
 * rejected up front because `kill` interprets them as process-group
 * targets, which would make the probe succeed without any single process
 * being identified.
 *
 * @param {number|string} pid - PID to probe; numeric strings are accepted.
 * @returns {boolean} true if the process exists (including when we lack
 *   permission to signal it), false otherwise.
 */
function isAlive(pid) {
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 0) return false;
  try {
    process.kill(target, 0);
    return true;
  } catch (err) {
    // ESRCH = no such process. EPERM = exists but we lack rights — treat
    // as alive: we cannot signal it, but it is there.
    return err?.code === 'EPERM';
  }
}
226
+
227
/**
 * Block the current thread for roughly `ms` milliseconds.
 *
 * Implemented with `Atomics.wait` on a private buffer that nothing ever
 * notifies, so the call returns only when the timeout elapses. This is a
 * blocking wait, not a spin loop.
 *
 * Non-finite, non-numeric, zero or negative durations return immediately.
 * This matters because `Atomics.wait` treats a NaN timeout as "wait
 * forever", which would hang the process here.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {void}
 */
function sleepSync(ms) {
  const timeoutMs = Number(ms);
  if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) return;
  const cell = new Int32Array(new SharedArrayBuffer(4));
  Atomics.wait(cell, 0, 0, timeoutMs);
}
233
+
234
+ module.exports = {
235
+ claimPidFile,
236
+ releasePidFile,
237
+ installSafetyHandlers,
238
+ _makeUncaughtHandler,
239
+ _makeUnhandledRejectionHandler,
240
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polygram",
3
- "version": "0.8.0-rc.49",
3
+ "version": "0.8.0-rc.50",
4
4
  "description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
5
5
  "main": "lib/ipc-client.js",
6
6
  "bin": {
package/polygram.js CHANGED
@@ -20,6 +20,7 @@ const { Bot } = require('grammy');
20
20
  const { spawn } = require('child_process');
21
21
  const fs = require('fs');
22
22
  const path = require('path');
23
+ const processGuard = require('./lib/process-guard');
23
24
  const dbClient = require('./lib/db');
24
25
  const { migrateJsonToDb, getClaudeSessionId } = require('./lib/sessions');
25
26
  const { buildPrompt } = require('./lib/prompt');
@@ -85,6 +86,7 @@ const SESSIONS_JSON_PATH = path.join(DATA_DIR, 'sessions.json'); // legacy, impo
85
86
  const DB_DIR = DATA_DIR;
86
87
  // DB_PATH is resolved in main() from --db or <bot>.db default.
87
88
  let DB_PATH = null;
89
+ let PID_PATH = null; // rc.50: orphan-detection PID file
88
90
  const STICKERS_PATH = process.env.POLYGRAM_STICKERS
89
91
  || path.join(DATA_DIR, 'stickers.json');
90
92
  const INBOX_DIR = process.env.POLYGRAM_INBOX || path.join(DATA_DIR, 'inbox');
@@ -3535,6 +3537,17 @@ async function main() {
3535
3537
  DB_PATH = dbOverride || path.join(DB_DIR, `${BOT_NAME}.db`);
3536
3538
  console.log(`[polygram] bot: ${BOT_NAME} (${Object.keys(config.chats).length} chats) db: ${DB_PATH}`);
3537
3539
 
3540
+ // rc.50: claim our PID file BEFORE binding the bot token. If a
3541
+ // prior daemon (orphan from a botched restart) is still running,
3542
+ // SIGTERM/SIGKILL it first. Two daemons sharing one Telegram bot
3543
+ // token + SQLite DB caused the rc.50 incident's user-visible
3544
+ // damage; this stops the cascade at boot.
3545
+ PID_PATH = path.join(DB_DIR, `${BOT_NAME}.pid`);
3546
+ const pidClaim = processGuard.claimPidFile(PID_PATH, { logger: console });
3547
+ if (pidClaim.priorAction !== 'no-prior') {
3548
+ console.log(`[orphan-guard] prior=${pidClaim.priorPid ?? '?'} action=${pidClaim.priorAction}`);
3549
+ }
3550
+
3538
3551
  try {
3539
3552
  db = dbClient.open(DB_PATH);
3540
3553
  console.log(`[db] opened ${DB_PATH}`);
@@ -3560,38 +3573,28 @@ async function main() {
3560
3573
  process.exit(1);
3561
3574
  }
3562
3575
 
3563
- // 0.8.0 Phase 1 step 11: belt-and-suspenders unhandledRejection
3564
- // logger. The new pm wraps every Query iteration in try/catch so
3565
- // SDK throws never leak — but if a callback ever does throw async
3566
- // (canUseTool body, onResult handler, etc.) the rejection could
3567
- // escape to the global handler. Without this, Node's default is to
3568
- // exit the process. With this, we log + persist and keep running
3569
- // so other chats are unaffected.
3570
- process.on('unhandledRejection', (reason, promise) => {
3571
- const reasonStr = reason instanceof Error
3572
- ? `${reason.message}\n${(reason.stack || '').split('\n').slice(0, 3).join('\n')}`
3573
- : String(reason);
3574
- console.error(`[polygram] unhandledRejection: ${reasonStr.slice(0, 1000)}`);
3575
- try {
3576
- db.logEvent('unhandled-rejection', {
3577
- reason: String(reason instanceof Error ? reason.message : reason).slice(0, 500),
3578
- bot_name: BOT_NAME,
3579
- });
3580
- } catch { /* swallow — db might be closing */ }
3581
- });
3582
- // Same defensive posture for uncaughtException — Node's default is
3583
- // exit on these. We want to log + persist + survive (the affected
3584
- // chat's iteration loop will have rejected its pendings via the
3585
- // catch in pm's _runIteration, so user-visible UX is "their turn
3586
- // failed", not "bot died").
3587
- process.on('uncaughtException', (err) => {
3588
- console.error(`[polygram] uncaughtException: ${err?.message}\n${err?.stack?.split('\n').slice(0, 5).join('\n')}`);
3589
- try {
3590
- db.logEvent('uncaught-exception', {
3591
- message: String(err?.message || err).slice(0, 500),
3592
- bot_name: BOT_NAME,
3593
- });
3594
- } catch { /* swallow */ }
3576
+ // 0.8.0 Phase 1 step 11 + rc.50: defensive uncaughtException +
3577
+ // unhandledRejection handlers. The new pm wraps every Query
3578
+ // iteration in try/catch so SDK throws never leak — but if a
3579
+ // callback ever does throw async (canUseTool body, onResult
3580
+ // handler, etc.) the rejection could escape. Node's default is
3581
+ // process exit; we log + persist + survive so other chats keep
3582
+ // running.
3583
+ //
3584
+ // rc.50 hardening (after the PID-6335 orphan-storm incident):
3585
+ // 1. Both handlers wrap their loggers in try/catch — pre-rc.50,
3586
+ // a bare console.error inside the uncaughtException handler
3587
+ // threw EIO when stdout was wired to a destroyed pty. That
3588
+ // re-fired the same handler infinitely, hijacking the event
3589
+ // loop and preventing the SIGHUP shutdown drain from running.
3590
+ // 2. Storm circuit breaker: same message firing >100× in 5s →
3591
+ // panic exit(2). Lets launchd restart cleanly instead of
3592
+ // letting the process zombie at ~12k EIO/sec writing to DB.
3593
+ // Lives in lib/process-guard.js.
3594
+ processGuard.installSafetyHandlers({
3595
+ logger: console,
3596
+ logEvent: (kind, detail) => { try { db.logEvent(kind, detail); } catch {} },
3597
+ botName: BOT_NAME,
3595
3598
  });
3596
3599
 
3597
3600
  const cap = config.maxWarmProcesses || DEFAULT_MAX_WARM_PROCS;
@@ -3913,6 +3916,11 @@ async function main() {
3913
3916
  if (db) {
3914
3917
  try { db.logEvent('polygram-stop'); db.raw.close(); } catch {}
3915
3918
  }
3919
+ // rc.50: release our PID file claim so the next boot doesn't try
3920
+ // to kill us. releasePidFile is idempotent and only deletes the
3921
+ // file when its content matches our PID — a new daemon that
3922
+ // already claimed the slot is left alone.
3923
+ if (PID_PATH) processGuard.releasePidFile(PID_PATH);
3916
3924
  setTimeout(() => process.exit(0), 100);
3917
3925
  };
3918
3926
  process.on('SIGINT', shutdown);