polygram 0.12.5 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/db/events-retention.js +200 -0
- package/package.json +1 -1
- package/polygram.js +38 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* events-table retention (#3, spec docs/0.13-events-retention-spec.md).
|
|
5
|
+
*
|
|
6
|
+
* `events` is append-only and grew unbounded fleet-wide (the table that
|
|
7
|
+
* ballooned shumabit.db to 4.4GB in the May-3 EIO storm). This caps it with:
|
|
8
|
+
* - time tiers: diagnostic kinds (high-frequency, short forensic value) pruned
|
|
9
|
+
* at `diagnosticDays`; everything else at `defaultDays`; a keep-forever set
|
|
10
|
+
* (lifecycle + errors) never pruned by time.
|
|
11
|
+
* - a UNIVERSAL per-kind row cap (`maxPerKind`) applied to EVERY kind incl.
|
|
12
|
+
* keep-forever — the real safety net so hardcoded tier lists aren't
|
|
13
|
+
* load-bearing and an incident-storm of one kind (12k/sec handler-error,
|
|
14
|
+
* the May-3 shape) can't balloon the table.
|
|
15
|
+
* - safety guards: `enabled` kill switch, `dryRun`, clock-backward skip, and a
|
|
16
|
+
* mass-delete fraction guard (refuse if a run would remove > a fraction of
|
|
17
|
+
* the table — the check that would have made May-3 a deliberate act).
|
|
18
|
+
*
|
|
19
|
+
* `pruneEvents` is PURE w.r.t. the events log: it deletes + returns counts but
|
|
20
|
+
* writes NO event rows. The caller (polygram boot) emits the `events-pruned` /
|
|
21
|
+
* `-preview` / `-skipped` audit event from the returned result (those kinds are
|
|
22
|
+
* in keepForever, so the audit trail survives its own prune).
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
// One day expressed in milliseconds; tier cutoffs are computed as `days * DAY_MS`.
const DAY_MS = 24 * 60 * 60 * 1000;

/**
 * Built-in retention policy. `resolveRetentionPolicy` layers any
 * `config.defaults.events_retention` override on top of these values.
 */
const DEFAULT_POLICY = {
  // Kill switch: when false, pruneEvents returns a `disabled` skip.
  enabled: true,
  // When true, pruneEvents only reports what it would delete.
  dryRun: false,

  // Time tiers, in days.
  diagnosticDays: 14,
  defaultDays: 90,

  // Kinds pruned at `diagnosticDays`.
  diagnosticKinds: [
    'reactor-state',
    'hook-lag-sample',
    'tool-result',
    'cli-ups-seen',
    // Dormant since 2026-05-25, but kept in the list in case they are re-enabled:
    'hook-event',
    'turn-phase-change',
  ],

  // Kinds never pruned by age (the per-kind row cap still applies to them).
  keepForeverKinds: [
    'polygram-start',
    'polygram-stop',
    'shutdown-drain',
    'handler-error',
    'auth-expired',
    'resume-fail',
    // The prune's own audit events, so the trail survives a prune (still capped):
    'events-pruned',
    'events-prune-preview',
    'events-prune-skipped',
  ],

  // Per-kind row cap, applied to every kind.
  maxPerKind: 50_000,
  // A run whose estimated deletions exceed this share of the table is refused.
  maxDeleteFraction: 0.5,
  // Rows removed per DELETE statement.
  batchSize: 5_000,

  // compact-* events drive findOrphanedCompactCommands. Their retention has to
  // stay above the 2h replay-window cap plus a margin; otherwise the rc.66
  // handled-/compact dedup can re-surface an old /compact to a partner.
  // validatePolicy enforces this floor rather than assuming it.
  compactKinds: [
    'compact-command',
    'compact-boundary',
    'compact-replay',
    'compact-failed-restart',
  ],
  minCompactRetentionMs: 3 * 60 * 60 * 1000, // 3h, above the 2h replay cap
};
|
|
52
|
+
|
|
53
|
+
/**
 * Build the effective retention policy: DEFAULT_POLICY with any
 * `config.defaults.events_retention` override layered on top.
 *
 * The merge is shallow. For the three kind lists a falsy override (missing,
 * null, …) falls back to the default list; a provided list replaces the
 * default wholesale.
 *
 * @param {object} [config] Loaded configuration; may be missing or partial.
 * @returns {object} A new policy object (default lists are shared by reference).
 */
function resolveRetentionPolicy(config) {
  const overrides = config?.defaults?.events_retention || {};
  const policy = { ...DEFAULT_POLICY, ...overrides };

  // Arrays are not deep-merged: take the override list if there is one,
  // otherwise the default list.
  for (const listKey of ['diagnosticKinds', 'keepForeverKinds', 'compactKinds']) {
    policy[listKey] = overrides[listKey] || DEFAULT_POLICY[listKey];
  }
  return policy;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Sanity-check a retention policy, throwing on the first problem found.
 * Checks, in order:
 *   1. every entry of the diagnostic and keep-forever lists is a non-empty string;
 *   2. no kind appears in both of those lists;
 *   3. every compact kind is retained at least `minCompactRetentionMs`.
 *
 * @param {object} policy Policy to check; missing lists are treated as empty.
 * @returns {true} when the policy passes every check.
 * @throws {Error} describing the first violation.
 */
function validatePolicy(policy) {
  const diagnosticList = policy.diagnosticKinds || [];
  const keepList = policy.keepForeverKinds || [];

  const hasBadKind = [...diagnosticList, ...keepList].some(
    (kind) => !kind || typeof kind !== 'string',
  );
  if (hasBadKind) {
    throw new Error('events_retention: null/empty kind in a tier list');
  }

  const keepForever = new Set(keepList);
  const overlap = [...diagnosticList].find((kind) => keepForever.has(kind));
  if (overlap !== undefined) {
    throw new Error(`events_retention: kind "${overlap}" is in both diagnostic and keep-forever (tiers must be disjoint)`);
  }

  const diagnostic = new Set(diagnosticList);
  // Age-based retention of a kind: unbounded for keep-forever, otherwise the
  // day count of whichever tier the kind falls into.
  const retentionMsFor = (kind) => {
    if (keepForever.has(kind)) return Infinity;
    const days = diagnostic.has(kind) ? policy.diagnosticDays : policy.defaultDays;
    return days * DAY_MS;
  };

  for (const kind of policy.compactKinds || []) {
    const retentionMs = retentionMsFor(kind);
    if (retentionMs < policy.minCompactRetentionMs) {
      throw new Error(`events_retention: compact kind "${kind}" retention (${retentionMs}ms) is below the replay-window floor (${policy.minCompactRetentionMs}ms) — would re-arm the rc.66 re-surface bug`);
    }
  }
  return true;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Run a `DELETE … LIMIT ?` statement repeatedly until a batch deletes fewer
 * rows than `batchSize`. In steady state that is a single batch.
 *
 * @param {object} rawDb     Handle exposing `prepare(sql)`; the prepared
 *                           statement's `run()` must return `{ changes }`.
 * @param {string} sql       DELETE statement whose last placeholder is the LIMIT.
 * @param {Array}  params    Values bound to the placeholders before the LIMIT.
 * @param {number} batchSize Maximum rows per statement; must be greater than 0.
 * @returns {number} Total rows deleted across all batches.
 * @throws {RangeError} When `batchSize` is not greater than 0.
 */
function batchedDelete(rawDb, sql, params, batchSize) {
  // The loop's only exit is `changes < batchSize`. With a zero, negative, NaN
  // or missing batchSize that comparison can never be true for a non-negative
  // change count, so the loop would spin forever on the calling thread.
  // Refuse such a value up front instead.
  if (!(batchSize > 0)) {
    throw new RangeError(`events_retention: batchSize must be a positive number (got ${String(batchSize)})`);
  }

  const stmt = rawDb.prepare(sql);
  let deleted = 0;
  for (;;) {
    const { changes } = stmt.run(...params, batchSize);
    deleted += changes;
    // A short batch means nothing matching is left.
    if (changes < batchSize) break;
  }
  return deleted;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Prune the `events` table according to `policy`. Returns one of:
 *   { skipped: true, reason }                     — disabled / clock-backward / mass-delete guard
 *   { dryRun: true, preview: {default,diagnostic,cap,total}, before }
 *   { deleted: {default,diagnostic,cap,total}, before, after }
 *
 * Never writes an event row; the caller logs the audit event from the result.
 *
 * @param {object} rawDb  Database handle; uses `prepare()` (statements are
 *                        driven with `.get()` / `.all()` / `.run()`) and `pragma()`.
 * @param {number} now    Current time. Cutoffs are `now - days * DAY_MS` and are
 *                        compared against `events.ts`, so both must be in
 *                        milliseconds on the same clock.
 * @param {object} policy Retention policy (see DEFAULT_POLICY for the fields).
 * @returns {object} One of the result shapes above.
 * @throws {Error} If validatePolicy rejects the policy or a statement fails.
 */
function pruneEvents(rawDb, now, policy) {
  if (!policy.enabled) return { skipped: true, reason: 'disabled' };
  validatePolicy(policy);

  const diagSet = new Set(policy.diagnosticKinds);
  const keepSet = new Set(policy.keepForeverKinds);

  const before = rawDb.prepare('SELECT count(*) c, max(ts) mx FROM events').get();
  const totalBefore = before.c;
  if (totalBefore === 0) {
    return { deleted: { default: 0, diagnostic: 0, cap: 0, total: 0 }, before: 0, after: 0 };
  }

  // Clock-backward guard: the newest row is in the future relative to `now`,
  // so the clock cannot be trusted — do not delete based on it.
  if (before.mx != null && now < before.mx) {
    return { skipped: true, reason: `clock-backward (now ${now} < max ts ${before.mx})` };
  }

  const diagCut = now - policy.diagnosticDays * DAY_MS;
  const defCut = now - policy.defaultDays * DAY_MS;

  // Default-bucket predicate: old AND not diagnostic AND not keep-forever.
  // One explicit ?-placeholder per excluded kind: better-sqlite3 does not
  // expand a JS array from a single parameter, and `NOT IN (…, NULL)` is a
  // three-valued-logic trap (validatePolicy rules out NULL members).
  const excluded = [...diagSet, ...keepSet];
  const ph = excluded.map(() => '?').join(',');
  const defWhere = `ts < ?${excluded.length ? ` AND kind NOT IN (${ph})` : ''}`;

  // ---- estimate (drives dryRun and the mass-delete guard) ----
  const estDefault = rawDb.prepare(`SELECT count(*) c FROM events WHERE ${defWhere}`).get(defCut, ...excluded).c;
  let estDiag = 0;
  const diagCountStmt = rawDb.prepare('SELECT count(*) c FROM events WHERE kind = ? AND ts < ?');
  for (const k of diagSet) estDiag += diagCountStmt.get(k, diagCut).c;
  const kinds = rawDb.prepare('SELECT kind, count(*) c FROM events GROUP BY kind').all();
  let estCap = 0;
  for (const { c } of kinds) if (c > policy.maxPerKind) estCap += c - policy.maxPerKind;
  const estTotal = estDefault + estDiag + estCap;

  // dryRun returns the preview regardless of the mass-delete guard (a would-be
  // mass delete should be visible). Clock-backward already returned above.
  if (policy.dryRun) {
    return {
      dryRun: true,
      preview: { default: estDefault, diagnostic: estDiag, cap: estCap, total: estTotal },
      before: totalBefore,
    };
  }

  // Mass-delete guard: refuse an anomalous run rather than execute it.
  // Written as "not within the limit" instead of "above the limit" so that a
  // non-numeric maxDeleteFraction (which compares as NaN) refuses the run
  // instead of silently switching the guard off. For numeric limits the two
  // forms are equivalent.
  const fraction = estTotal / totalBefore;
  if (estTotal > 0 && !(fraction <= policy.maxDeleteFraction)) {
    return {
      skipped: true,
      reason: `mass-delete-guard (${estTotal}/${totalBefore} = ${fraction.toFixed(2)} > ${policy.maxDeleteFraction})`,
    };
  }

  // ---- execute (batched; steady state is a single batch) ----
  const delDefault = batchedDelete(rawDb, `DELETE FROM events WHERE ${defWhere} LIMIT ?`, [defCut, ...excluded], policy.batchSize);
  let delDiag = 0;
  for (const k of diagSet) {
    delDiag += batchedDelete(rawDb, 'DELETE FROM events WHERE kind = ? AND ts < ? LIMIT ?', [k, diagCut], policy.batchSize);
  }

  // Universal cap: for each kind that was over the cap before this run, find
  // the id just past the maxPerKind most-recent rows and delete that row and
  // everything with a lower id. Applies to keep-forever kinds too.
  let delCap = 0;
  let thresholdStmt = null; // prepared once, and only if some kind is over the cap
  for (const { kind, c } of kinds) {
    if (c <= policy.maxPerKind) continue;
    if (!thresholdStmt) {
      thresholdStmt = rawDb.prepare('SELECT id FROM events WHERE kind = ? ORDER BY id DESC LIMIT 1 OFFSET ?');
    }
    const thr = thresholdStmt.get(kind, policy.maxPerKind);
    if (!thr) continue; // a time-delete already brought it under the cap
    delCap += batchedDelete(rawDb, 'DELETE FROM events WHERE kind = ? AND id <= ? LIMIT ?', [kind, thr.id], policy.batchSize);
  }

  const totalDeleted = delDefault + delDiag + delCap;
  // Reclaim WAL slack after a large prune (steady-state prunes are tiny — skip).
  if (totalDeleted > policy.batchSize) {
    try { rawDb.pragma('wal_checkpoint(TRUNCATE)'); } catch { /* best-effort */ }
  }

  return {
    deleted: { default: delDefault, diagnostic: delDiag, cap: delCap, total: totalDeleted },
    before: totalBefore,
    // Derived from this run's own counts; the table is not re-counted.
    after: totalBefore - totalDeleted,
  };
}
|
|
199
|
+
|
|
200
|
+
module.exports = { pruneEvents, resolveRetentionPolicy, validatePolicy, DEFAULT_POLICY };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "polygram",
|
|
3
|
-
"version": "0.12.5",
|
|
3
|
+
"version": "0.12.6",
|
|
4
4
|
"description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
|
|
5
5
|
"main": "lib/ipc/client.js",
|
|
6
6
|
"bin": {
|
package/polygram.js
CHANGED
|
@@ -110,6 +110,7 @@ const { applyReactionToMessages } = require('./lib/telegram/album-reactions');
|
|
|
110
110
|
const { classify: classifyError, detectWedgedSessionError, isTransientHttpError } = require('./lib/error/classify');
|
|
111
111
|
const { createAutoResumeTracker, isAutoResumable } = require('./lib/db/auto-resume');
|
|
112
112
|
const { resolveReplayWindowMs } = require('./lib/db/replay-window');
|
|
113
|
+
const { pruneEvents, resolveRetentionPolicy, validatePolicy } = require('./lib/db/events-retention');
|
|
113
114
|
// validateIpcFileParam moved with handleSendOverIpc to
|
|
114
115
|
// lib/handlers/ipc-send.js (commit 36).
|
|
115
116
|
const {
|
|
@@ -2195,6 +2196,43 @@ async function main() {
|
|
|
2195
2196
|
process.exit(1);
|
|
2196
2197
|
}
|
|
2197
2198
|
|
|
2199
|
+
// #3 events-table retention. Prune on boot (the primary path — daemons rarely
|
|
2200
|
+
// live to the 24h tick given deploy cadence) + a 24h .unref()'d interval as
|
|
2201
|
+
// insurance for long-uptime daemons. Validation failures DISABLE pruning and
|
|
2202
|
+
// log loud — a retention config typo must never take down the bot, so this
|
|
2203
|
+
// lives outside the DB-fatal try/catch above. pruneEvents writes no event
|
|
2204
|
+
// rows; we emit the audit event here from its result.
|
|
2205
|
+
let eventsRetentionPolicy = null;
|
|
2206
|
+
try {
|
|
2207
|
+
eventsRetentionPolicy = resolveRetentionPolicy(config);
|
|
2208
|
+
validatePolicy(eventsRetentionPolicy);
|
|
2209
|
+
} catch (err) {
|
|
2210
|
+
console.error(`[events-retention] invalid policy — pruning DISABLED: ${err.message}`);
|
|
2211
|
+
eventsRetentionPolicy = null;
|
|
2212
|
+
}
|
|
2213
|
+
// Run one prune pass and emit the matching audit event from its result.
// `trigger` is a label for what started the run; it is echoed into the log
// line and the audit event. Any failure is logged and swallowed here so it
// does not propagate to the caller.
const runEventsPrune = (trigger) => {
  // No usable policy (validation failed at load): pruning stays off.
  if (!eventsRetentionPolicy) return;

  try {
    const result = pruneEvents(db.raw, Date.now(), eventsRetentionPolicy);

    if (result.skipped) {
      const { reason } = result;
      console.log(`[events-retention] skipped (${trigger}): ${reason}`);
      db.logEvent('events-prune-skipped', { reason, trigger });
      return;
    }

    if (result.dryRun) {
      const { preview } = result;
      console.log(`[events-retention] DRY-RUN (${trigger}) would delete ${preview.total} (default ${preview.default}, diag ${preview.diagnostic}, cap ${preview.cap})`);
      db.logEvent('events-prune-preview', { ...preview, trigger });
      return;
    }

    // A real run: only report it when something was actually removed.
    const { deleted, before, after } = result;
    if (deleted.total > 0) {
      console.log(`[events-retention] pruned ${deleted.total} (default ${deleted.default}, diag ${deleted.diagnostic}, cap ${deleted.cap}) ${before}→${after}`);
      db.logEvent('events-pruned', { ...deleted, before, after, trigger });
    }
  } catch (err) {
    console.error(`[events-retention] prune failed (${trigger}): ${err.message}`);
  }
};
|
|
2231
|
+
if (eventsRetentionPolicy && eventsRetentionPolicy.enabled) {
|
|
2232
|
+
setImmediate(() => runEventsPrune('boot'));
|
|
2233
|
+
setInterval(() => runEventsPrune('interval'), 24 * 3_600_000).unref?.();
|
|
2234
|
+
}
|
|
2235
|
+
|
|
2198
2236
|
// 0.8.0 Phase 1 step 11 + rc.50: defensive uncaughtException +
|
|
2199
2237
|
// unhandledRejection handlers. The new pm wraps every Query
|
|
2200
2238
|
// iteration in try/catch so SDK throws never leak — but if a
|