polygram 0.8.0-rc.49 → 0.8.0-rc.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/lib/db.js +21 -0
- package/lib/process-guard.js +240 -0
- package/package.json +1 -1
- package/polygram.js +53 -34
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://anthropic.com/claude-code/plugin.schema.json",
|
|
3
3
|
"name": "polygram",
|
|
4
|
-
"version": "0.8.0-rc.49",
|
|
4
|
+
"version": "0.8.0-rc.51",
|
|
5
5
|
"description": "Telegram integration for Claude Code that preserves the OpenClaw per-chat session model. Migration target for OpenClaw users. Multi-bot, multi-chat, per-topic isolation; SQLite transcripts; inline-keyboard approvals. Bundles /polygram:status|logs|pair-code|approvals admin commands and a history skill.",
|
|
6
6
|
"keywords": [
|
|
7
7
|
"telegram",
|
package/lib/db.js
CHANGED
|
@@ -498,6 +498,27 @@ function wrap(db) {
|
|
|
498
498
|
// Treating ambiguous states as "replied" costs us occasional missed
|
|
499
499
|
// replies (recoverable: user resends) to prevent duplicates
|
|
500
500
|
// (irrecoverable: user has to mentally dedupe two answers).
|
|
501
|
+
// rc.51: stricter dedupe than hasOutboundReplyTo for boot-replay.
|
|
502
|
+
// A `turn_metrics` row is only inserted when a turn definitively
|
|
503
|
+
// completes (onResult callback). If no row exists for this inbound
|
|
504
|
+
// msg_id, the turn never finished — even if intermediate ack-bubbles
|
|
505
|
+
// were already sent. The rc.50 incident's lost msg 12158 had a
|
|
506
|
+
// partial "I'll write a quick inline script..." outbound but no
|
|
507
|
+
// turn_metrics, and was being silently skipped by replay-dedupe.
|
|
508
|
+
//
|
|
509
|
+
// Caveat: a row whose `error` is set (transient/aborted/timeout)
|
|
510
|
+
// does NOT count as complete — the turn started but failed. Boot
|
|
511
|
+
// replay should redispatch within window so the user gets a real
|
|
512
|
+
// answer.
|
|
513
|
+
hasCompletedTurnFor({ chat_id, msg_id }) {
|
|
514
|
+
const row = db.prepare(`
|
|
515
|
+
SELECT 1 FROM turn_metrics
|
|
516
|
+
WHERE chat_id = ? AND msg_id = ? AND error IS NULL
|
|
517
|
+
LIMIT 1
|
|
518
|
+
`).get(String(chat_id), msg_id);
|
|
519
|
+
return !!row;
|
|
520
|
+
},
|
|
521
|
+
|
|
501
522
|
hasOutboundReplyTo({ chat_id, msg_id }) {
|
|
502
523
|
const row = db.prepare(`
|
|
503
524
|
SELECT 1 FROM messages
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* rc.50: process-guard helpers — orphan-detection PID file + safety
|
|
3
|
+
* handlers for uncaughtException / unhandledRejection that don't
|
|
4
|
+
* re-enter on broken stdout.
|
|
5
|
+
*
|
|
6
|
+
* Background — the rc.50 incident:
|
|
7
|
+
* PID 6335 (rc.48) was orphaned when its tmux pane was destroyed
|
|
8
|
+
* during `launchctl kickstart -k`. polygram's existing SIGHUP
|
|
9
|
+
* handler should have drained cleanly, but during the drain
|
|
10
|
+
* `console.error` inside the uncaughtException handler itself
|
|
11
|
+
* threw EIO (stdout was wired to a now-destroyed pty). That fired
|
|
12
|
+
* the same handler, which logged again, which threw EIO again — a
|
|
13
|
+
* tight re-entrant loop that hijacked the event loop and prevented
|
|
14
|
+
* shutdown from completing. The orphan ran for 3+ hours writing
|
|
15
|
+
* 3.59M+ uncaught-exception rows to the DB at ~12k/sec, and
|
|
16
|
+
* polled the same Telegram bot token in parallel with the new
|
|
17
|
+
* daemon.
|
|
18
|
+
*
|
|
19
|
+
* This module provides three primitives. polygram.js wires them
|
|
20
|
+
* together at boot.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
'use strict';
|
|
24
|
+
|
|
25
|
+
const fs = require('fs');
|
|
26
|
+
|
|
27
|
+
/**
 * Boot-time orphan detection. Writes our PID to `pidPath`. If the
 * file already exists with a different live PID, kill it before
 * proceeding (SIGTERM, then SIGKILL after `sigtermWaitMs`). Without
 * this, two daemons can end up sharing the same Telegram bot token
 * and SQLite DB — the cascade that made the rc.50 incident
 * production-visible.
 *
 * The PID file is (re)written with our PID in every case, including
 * when the prior process could not be killed; callers should inspect
 * `priorAction` to decide whether it is safe to continue.
 *
 * @param {string} pidPath - Location of the PID file.
 * @param {object} [opts]
 * @param {object} [opts.logger=console] - Sink exposing an optional `log(msg)`.
 * @param {number} [opts.sigtermWaitMs=2000] - How long to wait after SIGTERM.
 * @param {number} [opts.sigkillWaitMs=1000] - How long to wait after SIGKILL.
 * @returns {{ priorPid: number|null, priorAction: string }}
 *   `priorAction` is one of `'no-prior'`, `'malformed-overwritten'`,
 *   `'self-skip'`, `'stale-overwritten'`, `'sigterm-killed'`,
 *   `'sigkill-killed'`, or `'kill-failed'` (the prior process was still
 *   alive after both signals, e.g. we lack permission to signal it).
 */
function claimPidFile(
  pidPath,
  { logger = console, sigtermWaitMs = 2000, sigkillWaitMs = 1000 } = {},
) {
  const ownPid = process.pid;
  let priorPid = null;
  let priorAction = 'no-prior';

  // Logging is best-effort: a missing logger or a broken stdout must
  // never abort boot before the PID file is claimed.
  const note = (line) => {
    try { logger?.log?.(line); } catch { /* swallow — broken stdout */ }
  };

  // Block (synchronously) until `pid` is gone or `budgetMs` elapses.
  // Boot has nothing else to do until the orphan is gone, and we don't
  // want to bind the bot token while it is still polling.
  // Returns true when the process is no longer alive.
  const waitForExit = (pid, budgetMs, pollMs) => {
    const startedAt = Date.now();
    while (Date.now() - startedAt < budgetMs && isAlive(pid)) {
      sleepSync(pollMs);
    }
    return !isAlive(pid);
  };

  if (fs.existsSync(pidPath)) {
    let raw = '';
    try {
      raw = fs.readFileSync(pidPath, 'utf8').trim();
    } catch { /* unreadable file is treated as malformed below */ }

    const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : null;

    if (!parsed) {
      // Covers non-numeric content AND "0": PID 0 must never be
      // signalled (it addresses our own process group).
      priorAction = 'malformed-overwritten';
    } else if (parsed === ownPid) {
      // Re-entrant call from same process — write but don't kill self.
      priorPid = parsed;
      priorAction = 'self-skip';
    } else {
      priorPid = parsed;
      if (!isAlive(parsed)) {
        priorAction = 'stale-overwritten';
      } else {
        note(`[orphan-guard] prior daemon PID ${parsed} still alive — sending SIGTERM`);
        try { process.kill(parsed, 'SIGTERM'); } catch { /* verified by polling below */ }

        if (waitForExit(parsed, sigtermWaitMs, 50)) {
          priorAction = 'sigterm-killed';
        } else {
          note(`[orphan-guard] PID ${parsed} ignored SIGTERM — escalating to SIGKILL`);
          try { process.kill(parsed, 'SIGKILL'); } catch { /* verified by polling below */ }

          // Poll for actual death — SIGKILL is delivered async, the
          // kernel may take a tick to reap (esp. for detached children).
          if (waitForExit(parsed, sigkillWaitMs, 20)) {
            priorAction = 'sigkill-killed';
          } else {
            // Do not claim a kill that did not happen: the process may
            // belong to another user (EPERM) or be stuck unreapable.
            priorAction = 'kill-failed';
            note(`[orphan-guard] PID ${parsed} still alive after SIGKILL — could not remove prior daemon`);
          }
        }
      }
    }
  }

  fs.writeFileSync(pidPath, String(ownPid) + '\n', { mode: 0o600 });
  return { priorPid, priorAction };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Best-effort removal of the PID file during clean shutdown.
 *
 * The file is unlinked only while it still names this process; if a
 * newer daemon has already rewritten it with its own PID, the file is
 * left untouched. Any filesystem error is deliberately ignored.
 *
 * @param {string} pidPath - Location of the PID file.
 * @returns {void}
 */
function releasePidFile(pidPath) {
  const ourPid = String(process.pid);
  if (fs.existsSync(pidPath)) {
    try {
      const recordedOwner = fs.readFileSync(pidPath, 'utf8').trim();
      // Someone else owns the file now — leaving it alone is correct.
      if (recordedOwner !== ourPid) return;
      fs.unlinkSync(pidPath);
    } catch {
      // Shutdown path: nothing useful can be done about a failed cleanup.
    }
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Build an uncaughtException handler that:
 *   1. Wraps `logger.error` AND `logEvent` in try/catch — neither
 *      can re-throw out of the handler. (Pre-rc.50 the bare
 *      console.error threw EIO and re-fired this same handler in
 *      an event-loop-hijacking loop.) Extracting the message and
 *      stack from the thrown value is guarded the same way, since
 *      anything can be thrown (null-prototype objects, values with
 *      hostile getters).
 *   2. Tracks repetitions of the same exception message in a sliding
 *      window. If the same message fires `eioThreshold` times within
 *      `eioWindowMs`, calls `panicExit(2)` so launchd restarts us
 *      cleanly. Without the circuit breaker, a stuck-stdout EIO
 *      cascade just keeps writing rows forever.
 *   3. Keeps the per-message bookkeeping bounded: once too many
 *      distinct messages are tracked, entries whose newest timestamp
 *      fell out of the window are dropped, then the oldest-inserted
 *      entries are evicted. (Eviction may reset the count of a live
 *      message, but only while hundreds of distinct messages are in
 *      flight within one window.)
 *
 * @param {object} opts
 * @param {object} opts.logger - { error(msg) } sink for human-readable logs.
 * @param {function(string, object)} opts.logEvent - DB persist sink.
 * @param {string} opts.botName
 * @param {number} [opts.eioThreshold=100]
 * @param {number} [opts.eioWindowMs=5000]
 * @param {function(number)} [opts.panicExit=process.exit]
 * @param {function(): number} [opts.now=Date.now]
 * @returns {function(Error)}
 */
function _makeUncaughtHandler({
  logger,
  logEvent,
  botName,
  eioThreshold = 100,
  eioWindowMs = 5000,
  panicExit = (code) => process.exit(code),
  now = Date.now,
} = {}) {
  // Upper bound on distinct messages tracked at once.
  const MAX_TRACKED_MESSAGES = 256;
  // Per-message sliding-window timestamps. Map<message, number[]>.
  const recent = new Map();
  let panicked = false;

  // Make room for a new message: drop idle entries first, then fall
  // back to evicting in insertion order (Map iterates oldest first).
  const makeRoom = (t) => {
    for (const [key, stamps] of recent) {
      if (stamps.length === 0 || t - stamps[stamps.length - 1] > eioWindowMs) {
        recent.delete(key);
      }
    }
    while (recent.size >= MAX_TRACKED_MESSAGES) {
      recent.delete(recent.keys().next().value);
    }
  };

  return function uncaughtHandler(err) {
    if (panicked) return; // bail — we're on our way out

    // 0. Describe the thrown value without trusting it.
    let msg = 'unknown';
    let stack = '';
    try {
      msg = String(err?.message || err || 'unknown').slice(0, 500);
      const rawStack = err?.stack;
      if (typeof rawStack === 'string') {
        stack = rawStack.split('\n').slice(0, 5).join('\n');
      }
    } catch { /* keep whatever was extracted before the failure */ }

    // 1. Log defensively. Stdout may be broken (the original incident);
    //    must not re-throw out of this handler.
    try {
      logger?.error?.(`[polygram] uncaughtException: ${msg}\n${stack}`);
    } catch { /* swallow — broken stdout */ }

    // 2. Persist defensively. DB might be closing during shutdown.
    try {
      logEvent?.('uncaught-exception', { message: msg, bot_name: botName });
    } catch { /* swallow */ }

    // 3. Storm circuit breaker: same message N times in window → exit.
    const t = now();
    let timestamps = recent.get(msg);
    if (!timestamps) {
      if (recent.size >= MAX_TRACKED_MESSAGES) makeRoom(t);
      timestamps = [];
      recent.set(msg, timestamps);
    }
    timestamps.push(t);
    // Drop expired.
    while (timestamps.length && t - timestamps[0] > eioWindowMs) timestamps.shift();
    if (timestamps.length >= eioThreshold) {
      panicked = true;
      try {
        logger?.error?.(`[polygram] uncaughtException circuit breaker: ${timestamps.length}× "${msg}" in ${eioWindowMs}ms — exit(2)`);
      } catch { /* swallow — broken stdout */ }
      try {
        logEvent?.('panic-exit', { message: msg, count: timestamps.length, window_ms: eioWindowMs, bot_name: botName });
      } catch { /* swallow */ }
      panicExit(2);
    }
  };
}
|
|
177
|
+
|
|
178
|
+
/**
 * Build a parallel handler for unhandledRejection: same defensive
 * posture, separate counter (rejections and exceptions can come
 * from different code paths and shouldn't share a budget).
 *
 * Events are persisted under the kind 'unhandled-rejection', except
 * 'panic-exit', which is passed through unchanged.
 *
 * @param {object} [opts] - Same options as `_makeUncaughtHandler`;
 *   may be omitted entirely.
 * @returns {function(*)} Listener taking the rejection reason. Non-Error
 *   reasons are wrapped in an Error before being handled.
 */
function _makeUnhandledRejectionHandler(opts = {}) {
  // `opts` may also be passed explicitly as null.
  const settings = opts ?? {};
  const persist = settings.logEvent;

  const inner = _makeUncaughtHandler({
    ...settings,
    // Override the 'kind' written to events table.
    logEvent: typeof persist === 'function'
      ? (kind, detail) => persist(kind === 'panic-exit' ? 'panic-exit' : 'unhandled-rejection', detail)
      : undefined,
  });

  return (reason /* , promise */) => {
    if (reason instanceof Error) {
      inner(reason);
      return;
    }
    // A promise can reject with anything, including values that cannot
    // be converted to a string; this listener must never throw itself.
    let text;
    try {
      text = String(reason);
    } catch {
      text = 'unstringifiable rejection reason';
    }
    inner(new Error(text));
  };
}
|
|
194
|
+
|
|
195
|
+
/**
 * Build the uncaughtException and unhandledRejection listeners from
 * the same options and attach both to `process`.
 *
 * @param {object} opts - Forwarded unchanged to both handler factories.
 * @returns {{ uninstall: function() }} Handle whose `uninstall()`
 *   detaches exactly the two listeners registered by this call.
 */
function installSafetyHandlers(opts) {
  // Build both listeners before attaching either one.
  const registrations = [
    ['uncaughtException', _makeUncaughtHandler(opts)],
    ['unhandledRejection', _makeUnhandledRejectionHandler(opts)],
  ];

  for (const [eventName, listener] of registrations) {
    process.on(eventName, listener);
  }

  return {
    uninstall() {
      for (const [eventName, listener] of registrations) {
        process.off(eventName, listener);
      }
    },
  };
}
|
|
211
|
+
|
|
212
|
+
// ─── helpers ─────────────────────────────────────────────────────────
|
|
213
|
+
|
|
214
|
+
/**
 * Probe whether a process with the given PID currently exists, using
 * signal 0 (existence check only; nothing is delivered).
 *
 * @param {number} pid - Candidate process id.
 * @returns {boolean} true when the process exists, including when we
 *   lack the rights to signal it; false otherwise or when `pid` is not
 *   a positive integer.
 */
function isAlive(pid) {
  // PID 0 and negative PIDs address process groups / every process,
  // not a single process — probing them would always look "alive".
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 0) return false;

  try {
    process.kill(target, 0);
    return true;
  } catch (err) {
    // ESRCH = no such process. EPERM = exists but we lack rights
    // (treat as alive — same UID typically; we will fail to kill it
    // but at least we know it's there).
    return err?.code === 'EPERM';
  }
}
|
|
226
|
+
|
|
227
|
+
/**
 * Block the calling thread for roughly `ms` milliseconds.
 *
 * Uses `Atomics.wait` on a private buffer that nobody notifies, so the
 * call simply times out; the thread sleeps rather than spinning.
 * Coarse granularity is fine for boot orphan-killing; we're not in a
 * hot path.
 *
 * @param {number} ms - Duration in milliseconds. Non-finite or
 *   non-positive values return immediately.
 * @returns {void}
 */
function sleepSync(ms) {
  const duration = Number(ms);
  // Atomics.wait treats a NaN timeout as Infinity, which would block
  // forever here because no other thread ever notifies this buffer.
  if (!Number.isFinite(duration) || duration <= 0) return;

  const cell = new Int32Array(new SharedArrayBuffer(4));
  Atomics.wait(cell, 0, 0, duration);
}
|
|
233
|
+
|
|
234
|
+
module.exports = {
|
|
235
|
+
claimPidFile,
|
|
236
|
+
releasePidFile,
|
|
237
|
+
installSafetyHandlers,
|
|
238
|
+
_makeUncaughtHandler,
|
|
239
|
+
_makeUnhandledRejectionHandler,
|
|
240
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "polygram",
|
|
3
|
-
"version": "0.8.0-rc.49",
|
|
3
|
+
"version": "0.8.0-rc.51",
|
|
4
4
|
"description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
|
|
5
5
|
"main": "lib/ipc-client.js",
|
|
6
6
|
"bin": {
|
package/polygram.js
CHANGED
|
@@ -20,6 +20,7 @@ const { Bot } = require('grammy');
|
|
|
20
20
|
const { spawn } = require('child_process');
|
|
21
21
|
const fs = require('fs');
|
|
22
22
|
const path = require('path');
|
|
23
|
+
const processGuard = require('./lib/process-guard');
|
|
23
24
|
const dbClient = require('./lib/db');
|
|
24
25
|
const { migrateJsonToDb, getClaudeSessionId } = require('./lib/sessions');
|
|
25
26
|
const { buildPrompt } = require('./lib/prompt');
|
|
@@ -85,6 +86,7 @@ const SESSIONS_JSON_PATH = path.join(DATA_DIR, 'sessions.json'); // legacy, impo
|
|
|
85
86
|
const DB_DIR = DATA_DIR;
|
|
86
87
|
// DB_PATH is resolved in main() from --db or <bot>.db default.
|
|
87
88
|
let DB_PATH = null;
|
|
89
|
+
let PID_PATH = null; // rc.50: orphan-detection PID file
|
|
88
90
|
const STICKERS_PATH = process.env.POLYGRAM_STICKERS
|
|
89
91
|
|| path.join(DATA_DIR, 'stickers.json');
|
|
90
92
|
const INBOX_DIR = process.env.POLYGRAM_INBOX || path.join(DATA_DIR, 'inbox');
|
|
@@ -3535,6 +3537,17 @@ async function main() {
|
|
|
3535
3537
|
DB_PATH = dbOverride || path.join(DB_DIR, `${BOT_NAME}.db`);
|
|
3536
3538
|
console.log(`[polygram] bot: ${BOT_NAME} (${Object.keys(config.chats).length} chats) db: ${DB_PATH}`);
|
|
3537
3539
|
|
|
3540
|
+
// rc.50: claim our PID file BEFORE binding the bot token. If a
|
|
3541
|
+
// prior daemon (orphan from a botched restart) is still running,
|
|
3542
|
+
// SIGTERM/SIGKILL it first. Two daemons sharing one Telegram bot
|
|
3543
|
+
// token + SQLite DB caused the rc.50 incident's user-visible
|
|
3544
|
+
// damage; this stops the cascade at boot.
|
|
3545
|
+
PID_PATH = path.join(DB_DIR, `${BOT_NAME}.pid`);
|
|
3546
|
+
const pidClaim = processGuard.claimPidFile(PID_PATH, { logger: console });
|
|
3547
|
+
if (pidClaim.priorAction !== 'no-prior') {
|
|
3548
|
+
console.log(`[orphan-guard] prior=${pidClaim.priorPid ?? '?'} action=${pidClaim.priorAction}`);
|
|
3549
|
+
}
|
|
3550
|
+
|
|
3538
3551
|
try {
|
|
3539
3552
|
db = dbClient.open(DB_PATH);
|
|
3540
3553
|
console.log(`[db] opened ${DB_PATH}`);
|
|
@@ -3560,38 +3573,28 @@ async function main() {
|
|
|
3560
3573
|
process.exit(1);
|
|
3561
3574
|
}
|
|
3562
3575
|
|
|
3563
|
-
// 0.8.0 Phase 1 step 11:
|
|
3564
|
-
//
|
|
3565
|
-
// SDK throws never leak — but if a
|
|
3566
|
-
// (canUseTool body, onResult
|
|
3567
|
-
//
|
|
3568
|
-
//
|
|
3569
|
-
//
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
// catch in pm's _runIteration, so user-visible UX is "their turn
|
|
3586
|
-
// failed", not "bot died").
|
|
3587
|
-
process.on('uncaughtException', (err) => {
|
|
3588
|
-
console.error(`[polygram] uncaughtException: ${err?.message}\n${err?.stack?.split('\n').slice(0, 5).join('\n')}`);
|
|
3589
|
-
try {
|
|
3590
|
-
db.logEvent('uncaught-exception', {
|
|
3591
|
-
message: String(err?.message || err).slice(0, 500),
|
|
3592
|
-
bot_name: BOT_NAME,
|
|
3593
|
-
});
|
|
3594
|
-
} catch { /* swallow */ }
|
|
3576
|
+
// 0.8.0 Phase 1 step 11 + rc.50: defensive uncaughtException +
|
|
3577
|
+
// unhandledRejection handlers. The new pm wraps every Query
|
|
3578
|
+
// iteration in try/catch so SDK throws never leak — but if a
|
|
3579
|
+
// callback ever does throw async (canUseTool body, onResult
|
|
3580
|
+
// handler, etc.) the rejection could escape. Node's default is
|
|
3581
|
+
// process exit; we log + persist + survive so other chats keep
|
|
3582
|
+
// running.
|
|
3583
|
+
//
|
|
3584
|
+
// rc.50 hardening (after the PID-6335 orphan-storm incident):
|
|
3585
|
+
// 1. Both handlers wrap their loggers in try/catch — pre-rc.50,
|
|
3586
|
+
// a bare console.error inside the uncaughtException handler
|
|
3587
|
+
// threw EIO when stdout was wired to a destroyed pty. That
|
|
3588
|
+
// re-fired the same handler infinitely, hijacking the event
|
|
3589
|
+
// loop and preventing the SIGHUP shutdown drain from running.
|
|
3590
|
+
// 2. Storm circuit breaker: same message firing >100× in 5s →
|
|
3591
|
+
// panic exit(2). Lets launchd restart cleanly instead of
|
|
3592
|
+
// letting the process zombie at ~12k EIO/sec writing to DB.
|
|
3593
|
+
// Lives in lib/process-guard.js.
|
|
3594
|
+
processGuard.installSafetyHandlers({
|
|
3595
|
+
logger: console,
|
|
3596
|
+
logEvent: (kind, detail) => { try { db.logEvent(kind, detail); } catch {} },
|
|
3597
|
+
botName: BOT_NAME,
|
|
3595
3598
|
});
|
|
3596
3599
|
|
|
3597
3600
|
const cap = config.maxWarmProcesses || DEFAULT_MAX_WARM_PROCS;
|
|
@@ -3913,6 +3916,11 @@ async function main() {
|
|
|
3913
3916
|
if (db) {
|
|
3914
3917
|
try { db.logEvent('polygram-stop'); db.raw.close(); } catch {}
|
|
3915
3918
|
}
|
|
3919
|
+
// rc.50: release our PID file claim so the next boot doesn't try
|
|
3920
|
+
// to kill us. releasePidFile is idempotent and only deletes the
|
|
3921
|
+
// file when its content matches our PID — a new daemon that
|
|
3922
|
+
// already claimed the slot is left alone.
|
|
3923
|
+
if (PID_PATH) processGuard.releasePidFile(PID_PATH);
|
|
3916
3924
|
setTimeout(() => process.exit(0), 100);
|
|
3917
3925
|
};
|
|
3918
3926
|
process.on('SIGINT', shutdown);
|
|
@@ -3969,8 +3977,19 @@ async function main() {
|
|
|
3969
3977
|
let replayed = 0;
|
|
3970
3978
|
let skipped = 0;
|
|
3971
3979
|
for (const row of candidates) {
|
|
3972
|
-
|
|
3973
|
-
|
|
3980
|
+
// rc.51: dedupe on turn_metrics (definitive turn completion),
|
|
3981
|
+
// NOT just on hasOutboundReplyTo. The latter trips on
|
|
3982
|
+
// intermediate ack-bubbles (e.g. "Catching up on history…",
|
|
3983
|
+
// "I'll write a quick inline script…") and silently skips the
|
|
3984
|
+
// replay even when the actual answer never arrived. The rc.50
|
|
3985
|
+
// EIO-orphan incident lost Ivan DM msg 12158 this way: an ack
|
|
3986
|
+
// bubble was sent at 13:20:36, the turn was killed mid-flight,
|
|
3987
|
+
// boot-replay saw the ack and assumed "answered."
|
|
3988
|
+
//
|
|
3989
|
+
// turn_metrics is only inserted by the SDK pm's onResult
|
|
3990
|
+
// callback, which fires only when the turn definitively
|
|
3991
|
+
// completes. No row → no completion → re-dispatch.
|
|
3992
|
+
if (db.hasCompletedTurnFor({ chat_id: row.chat_id, msg_id: row.msg_id })) {
|
|
3974
3993
|
db.setInboundHandlerStatus({
|
|
3975
3994
|
chat_id: row.chat_id, msg_id: row.msg_id, status: 'replied',
|
|
3976
3995
|
});
|