polygram 0.12.0-rc.3 → 0.12.0-rc.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.example.json +5 -1
- package/lib/attachments.js +46 -2
- package/lib/claude-bin.js +8 -1
- package/lib/compaction-warn.js +59 -0
- package/lib/context-usage.js +93 -0
- package/lib/db.js +1 -1
- package/lib/error/classify.js +12 -0
- package/lib/handlers/abort.js +59 -9
- package/lib/handlers/config-callback.js +8 -2
- package/lib/handlers/config-ui.js +23 -9
- package/lib/handlers/dispatcher.js +83 -0
- package/lib/handlers/download.js +101 -58
- package/lib/handlers/questions.js +287 -0
- package/lib/ipc/file-validator.js +8 -1
- package/lib/process/channels-bridge-protocol.js +18 -1
- package/lib/process/channels-bridge.mjs +85 -1
- package/lib/process/channels-tool-dispatcher.js +20 -2
- package/lib/process/cli-process.js +596 -27
- package/lib/process/factory.js +0 -4
- package/lib/process/hook-event-tail.js +7 -0
- package/lib/process/hook-settings.js +7 -0
- package/lib/process-manager.js +25 -0
- package/lib/questions/questions.js +183 -0
- package/lib/questions/store.js +100 -0
- package/lib/sdk/callbacks.js +110 -0
- package/lib/telegram/album-reactions.js +50 -0
- package/lib/telegram/api.js +9 -0
- package/lib/telegram/input-file.js +76 -0
- package/lib/tmux/startup-gate.js +26 -10
- package/migrations/012-pending-questions.sql +30 -0
- package/package.json +1 -1
- package/polygram.js +109 -23
package/config.example.json
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
"bots": {
|
|
5
5
|
"admin-bot": {
|
|
6
6
|
"token": "REPLACE_WITH_BOT_TOKEN_FROM_BOTFATHER",
|
|
7
|
+
"_comment_apiRoot": "Optional. Point grammy at a self-hosted Telegram Bot API server (e.g. 'http://localhost:8082' from a local `telegram-bot-api --local` process) to raise file send/receive limits from cloud's 50MB-out / 20MB-in to 2GB both ways. Omit for cloud Telegram (default, unchanged). The server is a separate localhost-only companion daemon — see docs/0.12.0-file-send.md.",
|
|
7
8
|
"allowConfigCommands": true,
|
|
8
9
|
"_comment_adminChatId": "Required when allowConfigCommands is true for pairing commands (/pair-code, /pairings, /unpair) to work. These grant cross-chat trust and are gated to the admin chat only.",
|
|
9
10
|
"adminChatId": "123456789",
|
|
@@ -70,7 +71,10 @@
|
|
|
70
71
|
"model": "opus",
|
|
71
72
|
"effort": "medium",
|
|
72
73
|
"cwd": "/Users/you/admin-agent",
|
|
73
|
-
"timeout": 600
|
|
74
|
+
"timeout": 600,
|
|
75
|
+
"_comment_maxFileBytes": "OPTIONAL per-chat (or per-topic; topic wins) file-size cap in BYTES. There is NO fixed default — the default is backend-derived: cloud Telegram = 50MB send / 20MB receive; with a local Bot API server (bot.apiRoot set) = 2GB both ways. This key only LOWERS that ceiling for this chat (Telegram rejects anything above the backend limit regardless); omit it to use the full backend default. To set one, add e.g. \"maxFileBytes\": 104857600 (=100MB) — only meaningful when apiRoot is set, since cloud already clamps to 50/20MB.",
|
|
76
|
+
"_comment_compactionWarnings": "OPTIONAL per-chat (or per-topic; topic wins). CLI/channels backend (pm:'cli') only. Default OFF. When ON, polygram warns the chat as Claude's context fills so you can /compact on your terms BEFORE an auto-compaction interrupts a turn (auto-compaction detaches the channels MCP bridge mid-turn — see docs/0.12.0-compaction-warnings.md). Two forms: `true` (enable at the 75% default threshold) or `{ \"enabled\": true, \"thresholdPct\": 80 }` (custom 1-99 threshold). Proactive: at the threshold it posts 'context ~N% full, run /compact or /new at a break'. Reactive backstop: when claude auto-compacts anyway it posts 'compacting now, resend if quiet'. Manual /compact never warns. Requires the bot to allow /compact + /new commands.",
|
|
77
|
+
"compactionWarnings": true
|
|
74
78
|
},
|
|
75
79
|
|
|
76
80
|
"-1000000000001": {
|
package/lib/attachments.js
CHANGED
|
@@ -22,8 +22,48 @@
|
|
|
22
22
|
* extension — the fallback only kicks in when MIME is unhelpful.
|
|
23
23
|
*/
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
// Inbound (user → bot) per-file cap. Telegram's cloud Bot API hard-caps
|
|
26
|
+
// bot file DOWNLOADS (getFile) at 20 MB, so 20 MB is the real ceiling on
|
|
27
|
+
// cloud — raised from 10 MB so users can send larger tracks/docs. With a
|
|
28
|
+
// self-hosted Bot API server (config.bot.apiRoot) the Telegram limit rises
|
|
29
|
+
// to 2 GB; resolveFileCaps() raises the default accordingly.
|
|
30
|
+
const MAX_FILE_BYTES = 20 * 1024 * 1024;
|
|
31
|
+
const MAX_TOTAL_BYTES = 50 * 1024 * 1024;
|
|
32
|
+
|
|
33
|
+
// ─── Backend-derived file-size caps (cloud vs local Bot API server) ──
|
|
34
|
+
//
|
|
35
|
+
// These are the HARD ceilings Telegram itself enforces — a per-chat
|
|
36
|
+
// override can lower them but never exceed them (Telegram rejects beyond
|
|
37
|
+
// regardless). NOT "adaptive": there is no intermediate tier. Cloud is a
|
|
38
|
+
// flat 20 in / 50 out; a local `telegram-bot-api --local` server is a flat
|
|
39
|
+
// 2 GB both ways.
|
|
40
|
+
const CLOUD_MAX_IN_BYTES = 20 * 1024 * 1024; // getFile download limit
|
|
41
|
+
const CLOUD_MAX_OUT_BYTES = 50 * 1024 * 1024; // sendDocument upload limit
|
|
42
|
+
const LOCAL_MAX_BYTES = 2000 * 1024 * 1024; // --local server, both ways
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Resolve the effective per-file caps for a chat/topic.
|
|
46
|
+
*
|
|
47
|
+
* @param {object} opts
|
|
48
|
+
* @param {boolean} opts.localApi — true when config.bot.apiRoot is set
|
|
49
|
+
* (a local Bot API server is in use → 2 GB ceiling).
|
|
50
|
+
* @param {number|null} [opts.override] — per-chat/topic maxFileBytes (bytes).
|
|
51
|
+
* Resolved by the caller from topic → chat → undefined; clamped to the
|
|
52
|
+
* backend ceiling.
|
|
53
|
+
* @returns {{ inBytes:number, outBytes:number, ceiling:number, localApi:boolean }}
|
|
54
|
+
*/
|
|
55
|
+
function resolveFileCaps({ localApi = false, override = null } = {}) {
|
|
56
|
+
const ceiling = localApi ? LOCAL_MAX_BYTES : null;
|
|
57
|
+
const defIn = localApi ? LOCAL_MAX_BYTES : CLOUD_MAX_IN_BYTES;
|
|
58
|
+
const defOut = localApi ? LOCAL_MAX_BYTES : CLOUD_MAX_OUT_BYTES;
|
|
59
|
+
// A numeric override sets BOTH directions to the same value, clamped to
|
|
60
|
+
// the backend hard ceiling (cloud uses the per-direction default as the
|
|
61
|
+
// clamp so an override can't push past Telegram's own limit).
|
|
62
|
+
const ovr = (typeof override === 'number' && override > 0) ? override : null;
|
|
63
|
+
const inBytes = ovr ? (localApi ? Math.min(ovr, ceiling) : Math.min(ovr, CLOUD_MAX_IN_BYTES)) : defIn;
|
|
64
|
+
const outBytes = ovr ? (localApi ? Math.min(ovr, ceiling) : Math.min(ovr, CLOUD_MAX_OUT_BYTES)) : defOut;
|
|
65
|
+
return { inBytes, outBytes, ceiling: ceiling ?? CLOUD_MAX_OUT_BYTES, localApi };
|
|
66
|
+
}
|
|
27
67
|
const MIME_ALLOW = [
|
|
28
68
|
/^image\//, /^audio\//, /^video\//,
|
|
29
69
|
/^application\/pdf$/, /^text\/plain$/,
|
|
@@ -109,8 +149,12 @@ function filterAttachments(attachments, opts = {}) {
|
|
|
109
149
|
|
|
110
150
|
module.exports = {
|
|
111
151
|
filterAttachments,
|
|
152
|
+
resolveFileCaps,
|
|
112
153
|
MAX_FILE_BYTES,
|
|
113
154
|
MAX_TOTAL_BYTES,
|
|
155
|
+
CLOUD_MAX_IN_BYTES,
|
|
156
|
+
CLOUD_MAX_OUT_BYTES,
|
|
157
|
+
LOCAL_MAX_BYTES,
|
|
114
158
|
MIME_ALLOW,
|
|
115
159
|
EXTENSION_ALLOW,
|
|
116
160
|
FALLBACK_MIMES,
|
package/lib/claude-bin.js
CHANGED
|
@@ -7,7 +7,14 @@ const fs = require('fs');
|
|
|
7
7
|
// 0.12 Phase 4: moved from lib/process/tmux-process.js into the helper module
|
|
8
8
|
// that consumes it, so the constant survives TmuxProcess deletion. CliProcess
|
|
9
9
|
// + spike scripts + polygram boot all import from here now.
|
|
10
|
-
|
|
10
|
+
// 0.12.0-rc.18: bumped 2.1.142 → 2.1.158 (latest installed). The dev-channels
|
|
11
|
+
// inbound-message delivery has an intermittent channel-bind race (the bridge
|
|
12
|
+
// pushes user_msg before claude's channel subscription is active → message
|
|
13
|
+
// silently dropped → stuck turn; see docs/0.12.0-known-issues.md). Trying a
|
|
14
|
+
// newer claude to see if the research-preview channels reliability improved,
|
|
15
|
+
// before building polygram-side recovery. Re-validate the channel flow on each
|
|
16
|
+
// bump via tests/e2e-channels-real-claude.test.js.
|
|
17
|
+
const CLAUDE_CLI_PINNED_VERSION = '2.1.158';
|
|
11
18
|
|
|
12
19
|
/**
|
|
13
20
|
* Resolve + verify the pinned claude CLI binary.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* compaction-warn — per-chat config resolution + warn-once state for the
|
|
3
|
+
* compaction warning (0.12.0-rc.13).
|
|
4
|
+
*
|
|
5
|
+
* The warning is OFF by default. A chat (or topic) opts in via
|
|
6
|
+
* `compactionWarnings`:
|
|
7
|
+
* true → enabled, default threshold
|
|
8
|
+
* { enabled: true, thresholdPct: 80 } → enabled, custom threshold
|
|
9
|
+
* false / absent / object w/o enabled → off
|
|
10
|
+
*
|
|
11
|
+
* `thresholdPct` is the context-fill % at which the PROACTIVE warning fires
|
|
12
|
+
* (propose /compact before claude auto-compacts mid-turn). Default 75 — below
|
|
13
|
+
* claude's own auto-compact threshold so the user gets a window to act.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
'use strict';
|
|
17
|
+
|
|
18
|
+
const DEFAULT_THRESHOLD_PCT = 75;
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* @param {object|undefined} cfg resolved topic/chat config (getTopicConfig result)
|
|
22
|
+
* @returns {{enabled: boolean, thresholdPct: number}}
|
|
23
|
+
*/
|
|
24
|
+
function resolveCompactionWarnConfig(cfg) {
|
|
25
|
+
const raw = cfg?.compactionWarnings;
|
|
26
|
+
const off = { enabled: false, thresholdPct: DEFAULT_THRESHOLD_PCT };
|
|
27
|
+
|
|
28
|
+
if (raw === true) return { enabled: true, thresholdPct: DEFAULT_THRESHOLD_PCT };
|
|
29
|
+
if (raw && typeof raw === 'object' && raw.enabled === true) {
|
|
30
|
+
const t = Number(raw.thresholdPct);
|
|
31
|
+
const thresholdPct = (Number.isFinite(t) && t > 0 && t < 100) ? t : DEFAULT_THRESHOLD_PCT;
|
|
32
|
+
return { enabled: true, thresholdPct };
|
|
33
|
+
}
|
|
34
|
+
return off;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Per-session "have we already warned on this climb?" state. Warn ONCE per
|
|
39
|
+
* session until reset — without this the proactive warning would re-fire on
|
|
40
|
+
* every turn-end while the context stays high. Reset on a successful
|
|
41
|
+
* compaction (PostCompact → context dropped) or a fresh session so the next
|
|
42
|
+
* climb can warn again. Mirrors the autoResumeTracker shape.
|
|
43
|
+
*/
|
|
44
|
+
function createCompactionWarnTracker() {
|
|
45
|
+
const warned = new Set();
|
|
46
|
+
return {
|
|
47
|
+
shouldWarn(sessionKey) { return !warned.has(sessionKey); },
|
|
48
|
+
markWarned(sessionKey) { warned.add(sessionKey); },
|
|
49
|
+
reset(sessionKey) { warned.delete(sessionKey); },
|
|
50
|
+
resetAll() { warned.clear(); },
|
|
51
|
+
_size() { return warned.size; },
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
module.exports = {
|
|
56
|
+
resolveCompactionWarnConfig,
|
|
57
|
+
createCompactionWarnTracker,
|
|
58
|
+
DEFAULT_THRESHOLD_PCT,
|
|
59
|
+
};
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* context-usage — read live context occupancy from a Claude Code session
|
|
3
|
+
* transcript (JSONL).
|
|
4
|
+
*
|
|
5
|
+
* Used by the per-chat compaction warning (0.12.0-rc.13). polygram has no
|
|
6
|
+
* usage payload on the channels/CLI backend (hook events carry none — see
|
|
7
|
+
* the rc.13 spike), so the only source of "how full is the context" is the
|
|
8
|
+
* transcript itself. We read it ONCE per turn-end (Stop hook), not on a
|
|
9
|
+
* poll loop, so a single streamed pass is fine.
|
|
10
|
+
*
|
|
11
|
+
* What "occupancy" means: Claude's own context-% / auto-compact threshold is
|
|
12
|
+
* measured against what's fed INTO the model each turn —
|
|
13
|
+
* input_tokens + cache_read_input_tokens + cache_creation_input_tokens
|
|
14
|
+
* (cache_read dominates once the conversation is warm). output_tokens is the
|
|
15
|
+
* reply, not context, so it's excluded.
|
|
16
|
+
*
|
|
17
|
+
* We take the LAST main-thread (non-sidechain) assistant frame with a usage
|
|
18
|
+
* block. Subagents write to their own agent_transcript_path so sidechain
|
|
19
|
+
* frames don't normally appear here, but we skip them defensively: a format
|
|
20
|
+
* change that inlined a subagent's large usage would otherwise spike the
|
|
21
|
+
* parent's apparent context and trigger a false "you're full" warning.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
'use strict';
|
|
25
|
+
|
|
26
|
+
const fs = require('node:fs');
|
|
27
|
+
const readline = require('node:readline');
|
|
28
|
+
|
|
29
|
+
// Standard Claude context window (sonnet/opus, non-beta). The warning is a
|
|
30
|
+
// heuristic ("you're getting full"), so an approximate denominator is fine;
|
|
31
|
+
// callers can pass a different window for 1M-beta sessions.
|
|
32
|
+
const DEFAULT_WINDOW_TOKENS = 200_000;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* @param {string} transcriptPath
|
|
36
|
+
* @returns {Promise<{inputTokens:number, cacheReadTokens:number, cacheCreationTokens:number, total:number} | null>}
|
|
37
|
+
* null when the path is falsy/unreadable or no usable usage frame exists.
|
|
38
|
+
*/
|
|
39
|
+
async function readContextTokens(transcriptPath) {
|
|
40
|
+
if (!transcriptPath) return null;
|
|
41
|
+
|
|
42
|
+
let stream;
|
|
43
|
+
try {
|
|
44
|
+
stream = fs.createReadStream(transcriptPath, { encoding: 'utf8' });
|
|
45
|
+
} catch {
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
return new Promise((resolve) => {
|
|
50
|
+
let last = null;
|
|
51
|
+
// Resolve only once — error and close can both fire.
|
|
52
|
+
let done = false;
|
|
53
|
+
const finish = (v) => { if (!done) { done = true; resolve(v); } };
|
|
54
|
+
|
|
55
|
+
stream.on('error', () => finish(null));
|
|
56
|
+
|
|
57
|
+
const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
|
|
58
|
+
// readline forwards the input stream's 'error' (e.g. ENOENT on open) to
|
|
59
|
+
// the interface; without this handler that re-emit is unhandled and
|
|
60
|
+
// crashes the process even though we resolved null on the stream error.
|
|
61
|
+
rl.on('error', () => finish(null));
|
|
62
|
+
rl.on('line', (line) => {
|
|
63
|
+
if (!line) return;
|
|
64
|
+
let o;
|
|
65
|
+
try { o = JSON.parse(line); } catch { return; } // skip partial/non-JSON lines
|
|
66
|
+
if (!o || o.type !== 'assistant' || o.isSidechain === true) return;
|
|
67
|
+
const u = o.message?.usage;
|
|
68
|
+
if (!u) return;
|
|
69
|
+
const inputTokens = Number(u.input_tokens) || 0;
|
|
70
|
+
const cacheReadTokens = Number(u.cache_read_input_tokens) || 0;
|
|
71
|
+
const cacheCreationTokens = Number(u.cache_creation_input_tokens) || 0;
|
|
72
|
+
const total = inputTokens + cacheReadTokens + cacheCreationTokens;
|
|
73
|
+
if (total > 0) last = { inputTokens, cacheReadTokens, cacheCreationTokens, total };
|
|
74
|
+
});
|
|
75
|
+
rl.on('close', () => finish(last));
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Fraction of the context window currently occupied — nominally 0..1, but not capped above 1. Returns 0 on
|
|
81
|
+
* non-positive / non-finite inputs so callers never see NaN/Infinity.
|
|
82
|
+
*
|
|
83
|
+
* @param {number} totalTokens
|
|
84
|
+
* @param {number} [windowTokens=DEFAULT_WINDOW_TOKENS]
|
|
85
|
+
* @returns {number}
|
|
86
|
+
*/
|
|
87
|
+
function contextPct(totalTokens, windowTokens = DEFAULT_WINDOW_TOKENS) {
|
|
88
|
+
if (!Number.isFinite(totalTokens) || totalTokens <= 0) return 0;
|
|
89
|
+
if (!Number.isFinite(windowTokens) || windowTokens <= 0) return 0;
|
|
90
|
+
return totalTokens / windowTokens;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
module.exports = { readContextTokens, contextPct, DEFAULT_WINDOW_TOKENS };
|
package/lib/db.js
CHANGED
|
@@ -19,7 +19,7 @@ const Database = require('better-sqlite3');
|
|
|
19
19
|
// SCHEMA_VERSION; the early-return on line ~42 then skipped the
|
|
20
20
|
// migration loop on any DB already at user_version=8 → turn_metrics
|
|
21
21
|
// table never created → INSERT prepare at startup crashed polygram.
|
|
22
|
-
const SCHEMA_VERSION =
|
|
22
|
+
const SCHEMA_VERSION = 12;
|
|
23
23
|
|
|
24
24
|
// Sentinel `error` value for outbound rows whose API call may or may not
|
|
25
25
|
// have reached Telegram. markStalePending writes it; hasOutboundReplyTo
|
package/lib/error/classify.js
CHANGED
|
@@ -195,6 +195,18 @@ const CODES = {
|
|
|
195
195
|
isTransient: false,
|
|
196
196
|
autoRecover: null,
|
|
197
197
|
},
|
|
198
|
+
// TMUX_SESSION_GONE: claude exited during spawn so the tmux session vanished
|
|
199
|
+
// before the channel went live (the startup-gate's captureWide hit "can't
|
|
200
|
+
// find pane"). Usual cause: an unresumable aged session whose "Resume from
|
|
201
|
+
// summary?" /compact exits code 0. The dispatcher poison-clears the session
|
|
202
|
+
// on this code, so a resend genuinely starts fresh and works — hence the
|
|
203
|
+
// calm "send it again" copy instead of the old raw "[startup-gate]…" leak.
|
|
204
|
+
TMUX_SESSION_GONE: {
|
|
205
|
+
kind: 'tmuxSessionGone',
|
|
206
|
+
userMessage: '🔄 That chat got stuck starting up, so I reset it. Send your message again and I\'ll pick it up fresh.',
|
|
207
|
+
isTransient: false,
|
|
208
|
+
autoRecover: null,
|
|
209
|
+
},
|
|
198
210
|
// TURN_TIMEOUT: 10-min wall-clock cap on a single channels turn. Mirror
|
|
199
211
|
// of the tmux wall-clock ceiling — typically a runaway, not a wedge.
|
|
200
212
|
// Not transient (auto-retry would just runaway again).
|
package/lib/handlers/abort.js
CHANGED
|
@@ -42,13 +42,37 @@ function createHandleAbort({
|
|
|
42
42
|
const threadId = msg.message_thread_id?.toString();
|
|
43
43
|
const sessionKey = getSessionKey(chatId, threadId, chatConfig);
|
|
44
44
|
const proc = pm.has(sessionKey) ? pm.get(sessionKey) : null;
|
|
45
|
-
|
|
45
|
+
let hadActive = !!proc?.inFlight;
|
|
46
46
|
|
|
47
47
|
// Mark BEFORE killing: the 'close' event fires almost immediately
|
|
48
48
|
// after interrupt, and the surrounding handleMessage's catch
|
|
49
49
|
// needs to see the flag to skip the generic error-reply.
|
|
50
50
|
if (hadActive) markSessionAborted(sessionKey);
|
|
51
51
|
|
|
52
|
+
// "Stop" incident (shumorobot Music, 2026-05-31 13:08): on the
|
|
53
|
+
// CliProcess/channels backend a turn resolves on the quiet-window
|
|
54
|
+
// after claude's last reply tool call (inFlight → false), but claude
|
|
55
|
+
// can still be working (subagent, long Bash). Keying the ack on
|
|
56
|
+
// inFlight alone made "Stop" say "Nothing to stop" while a subagent
|
|
57
|
+
// download churned. probeBusyState() reads the TUI "esc to interrupt"
|
|
58
|
+
// hint — the truthful signal — so detection, the abort mark, and the
|
|
59
|
+
// ack all agree. The probe result is logged below (forensics) so the
|
|
60
|
+
// heuristic can be refined against real states later. Channels analog
|
|
61
|
+
// of the (deleted) tmux hasBackgroundShell branch; typeof-guarded so
|
|
62
|
+
// it's a no-op on backends without it.
|
|
63
|
+
let busyProbe = null;
|
|
64
|
+
if (!hadActive && proc && typeof proc.probeBusyState === 'function') {
|
|
65
|
+
try {
|
|
66
|
+
busyProbe = await proc.probeBusyState();
|
|
67
|
+
if (busyProbe?.busy) {
|
|
68
|
+
hadActive = true;
|
|
69
|
+
markSessionAborted(sessionKey);
|
|
70
|
+
}
|
|
71
|
+
} catch (err) {
|
|
72
|
+
logger.error?.(`[${botName}] busy-probe failed: ${err.message}`);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
52
76
|
// Bug 1 (incident 2026-05-18): "Stop" was turn-scoped — it only
|
|
53
77
|
// looked at an in-flight TURN. But the agent can leave a DETACHED
|
|
54
78
|
// background shell running (a `run_in_background:true` Bash) that
|
|
@@ -72,21 +96,47 @@ function createHandleAbort({
|
|
|
72
96
|
}
|
|
73
97
|
}
|
|
74
98
|
|
|
75
|
-
//
|
|
76
|
-
//
|
|
77
|
-
//
|
|
78
|
-
// drainQueue() rejects every queued pending with
|
|
79
|
-
// err.code='INTERRUPTED' so the abort-grace classifier
|
|
80
|
-
// suppresses error replies.
|
|
81
|
-
await pm.interrupt(sessionKey).catch((err) =>
|
|
82
|
-
logger.error?.(`[${botName}] interrupt failed: ${err.message}`));
|
|
99
|
+
// Reject queued pendings first (err.code='INTERRUPTED' → the abort-grace
|
|
100
|
+
// classifier suppresses their error replies AND each turn's finally clears
|
|
101
|
+
// its reactor + typing), THEN stop the live work.
|
|
83
102
|
pm.drainQueue(sessionKey, 'INTERRUPTED');
|
|
103
|
+
if (hadActive && proc && proc.backend === 'cli') {
|
|
104
|
+
// Channels HARD stop (user decision 2026-06-04: "/stop should stop
|
|
105
|
+
// everything including background, like the SDK backend"). A soft C-c
|
|
106
|
+
// interrupt leaves detached background shells + subagents running and
|
|
107
|
+
// can't clear a ghost (no-pending-turn) busy state — the symptom was
|
|
108
|
+
// "Stopped." with the reaction + typing still going. Kill the session: the
|
|
109
|
+
// whole process tree (claude + every subagent + all background shells)
|
|
110
|
+
// dies at once, the close drains the in-flight turn (clearing its
|
|
111
|
+
// reactor/typing), and the next message respawns fresh (--resume restores
|
|
112
|
+
// the conversation). This is what makes channels /stop "stop everything".
|
|
113
|
+
await pm.kill(sessionKey, 'abort').catch((err) =>
|
|
114
|
+
logger.error?.(`[${botName}] abort kill failed: ${err.message}`));
|
|
115
|
+
} else {
|
|
116
|
+
// SDK (or nothing active): non-destructive interrupt cancels the in-flight
|
|
117
|
+
// Query turn WITHOUT tearing down the Query (cheap to reuse next message).
|
|
118
|
+
await pm.interrupt(sessionKey).catch((err) =>
|
|
119
|
+
logger.error?.(`[${botName}] interrupt failed: ${err.message}`));
|
|
120
|
+
}
|
|
84
121
|
|
|
85
122
|
clearAutosteeredReactions(sessionKey).catch(() => {});
|
|
86
123
|
logEvent('abort-requested', {
|
|
87
124
|
chat_id: chatId, user_id: msg.from?.id || null,
|
|
88
125
|
had_active: hadActive,
|
|
89
126
|
killed_background_shell: killedBackgroundShell,
|
|
127
|
+
// "Stop" incident forensics: the raw busy-probe signals at decision
|
|
128
|
+
// time. Lets us query, across real aborts, where the esc-hint /
|
|
129
|
+
// inFlight / pending-turn signals agreed vs diverged and refine the
|
|
130
|
+
// heuristic later. null when no probe ran (turn was already inFlight,
|
|
131
|
+
// or the backend has no probeBusyState).
|
|
132
|
+
busy_probe: busyProbe ? {
|
|
133
|
+
busy: busyProbe.busy,
|
|
134
|
+
streaming: busyProbe.streaming,
|
|
135
|
+
in_flight: busyProbe.inFlight,
|
|
136
|
+
pending_turns: busyProbe.pendingTurns,
|
|
137
|
+
captured: busyProbe.captured,
|
|
138
|
+
pane_tail: busyProbe.paneTail,
|
|
139
|
+
} : null,
|
|
90
140
|
trigger: cleanText.slice(0, 40),
|
|
91
141
|
});
|
|
92
142
|
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
'use strict';
|
|
18
18
|
|
|
19
19
|
const { toTelegramHtml } = require('../telegram/format');
|
|
20
|
+
const { getTopicConfig } = require('../session-key');
|
|
20
21
|
|
|
21
22
|
const MODEL_OPTIONS = ['opus', 'sonnet', 'haiku'];
|
|
22
23
|
const EFFORT_OPTIONS = ['low', 'medium', 'high', 'xhigh', 'max'];
|
|
@@ -92,8 +93,13 @@ function createHandleConfigCallback({
|
|
|
92
93
|
// tapped into.
|
|
93
94
|
const existingRows = ctx.callbackQuery.message?.reply_markup?.inline_keyboard?.length || 0;
|
|
94
95
|
const showRow = existingRows >= 2 ? 'all' : setting;
|
|
95
|
-
|
|
96
|
-
|
|
96
|
+
// Re-render with per-topic overrides resolved (topic > chat), so the agent
|
|
97
|
+
// line doesn't flip back to the chat-level default after a button tap —
|
|
98
|
+
// mirrors the /model command card (polygram.js). getTopicConfig returns {}
|
|
99
|
+
// for the chat-level card.
|
|
100
|
+
const _cbTopicCfg = getTopicConfig(chatConfig, callbackThreadId);
|
|
101
|
+
const newInfo = formatConfigInfoText(chatConfig, showRow, chatId, _cbTopicCfg);
|
|
102
|
+
const newKeyboard = buildConfigKeyboard(chatConfig, showRow, _cbTopicCfg);
|
|
97
103
|
try {
|
|
98
104
|
const { text: html, parseMode } = toTelegramHtml(newInfo);
|
|
99
105
|
await ctx.editMessageText(html, {
|
|
@@ -31,19 +31,23 @@ const MODEL_VERSIONS_DESC = {
|
|
|
31
31
|
/**
|
|
32
32
|
* Build the inline keyboard for /model + /effort.
|
|
33
33
|
* show = 'model' | 'effort' | 'all'
|
|
34
|
-
* The current value gets a ✓ prefix.
|
|
34
|
+
* The current value gets a ✓ prefix. `topicConfig` (per-topic overrides, or
|
|
35
|
+
* null for the chat-level card) wins over chatConfig so the ✓ matches what a
|
|
36
|
+
* topic actually runs — mirrors the spawn-path precedence (topic > chat).
|
|
35
37
|
*/
|
|
36
|
-
function buildConfigKeyboard(chatConfig, show = 'all') {
|
|
38
|
+
function buildConfigKeyboard(chatConfig, show = 'all', topicConfig = null) {
|
|
39
|
+
const model = (topicConfig && topicConfig.model) || chatConfig.model;
|
|
40
|
+
const effort = (topicConfig && topicConfig.effort) || chatConfig.effort;
|
|
37
41
|
const rows = [];
|
|
38
42
|
if (show === 'model' || show === 'all') {
|
|
39
43
|
rows.push(MODEL_OPTIONS.map((m) => ({
|
|
40
|
-
text: m === chatConfig.model ? `✓ ${m}` : m,
|
|
44
|
+
text: m === model ? `✓ ${m}` : m,
|
|
41
45
|
callback_data: `cfg:model:${m}`,
|
|
42
46
|
})));
|
|
43
47
|
}
|
|
44
48
|
if (show === 'effort' || show === 'all') {
|
|
45
49
|
rows.push(EFFORT_OPTIONS.map((e) => ({
|
|
46
|
-
text: e === chatConfig.effort ? `✓ ${e}` : e,
|
|
50
|
+
text: e === effort ? `✓ ${e}` : e,
|
|
47
51
|
callback_data: `cfg:effort:${e}`,
|
|
48
52
|
})));
|
|
49
53
|
}
|
|
@@ -60,14 +64,24 @@ function buildConfigKeyboard(chatConfig, show = 'all') {
|
|
|
60
64
|
* @param {(db, sessionKey) => string|null} deps.getClaudeSessionId
|
|
61
65
|
*/
|
|
62
66
|
function createFormatConfigInfoText({ pm, db, getClaudeSessionId } = {}) {
|
|
63
|
-
return function formatConfigInfoText(chatConfig, show, sessionKey) {
|
|
67
|
+
return function formatConfigInfoText(chatConfig, show, sessionKey, topicConfig = null) {
|
|
64
68
|
const alive = pm.has(sessionKey) && !pm.get(sessionKey).closed;
|
|
65
|
-
|
|
69
|
+
// Per-topic overrides win over chat-level for the displayed values,
|
|
70
|
+
// mirroring the spawn path (polygram.js: topicConfig.agent ||
|
|
71
|
+
// chatConfig.agent). Pre-fix the card always read chat-level, so a topic's
|
|
72
|
+
// /model showed the WRONG agent — shumorobot Music topic (thread 3) showed
|
|
73
|
+
// "Agent: shumabit" instead of its music-curation:music-curator override
|
|
74
|
+
// (2026-06-03). topicConfig defaults to null (chat-level) for callers with
|
|
75
|
+
// no active topic.
|
|
76
|
+
const model = (topicConfig && topicConfig.model) || chatConfig.model;
|
|
77
|
+
const effort = (topicConfig && topicConfig.effort) || chatConfig.effort;
|
|
78
|
+
const agent = (topicConfig && topicConfig.agent) || chatConfig.agent;
|
|
79
|
+
const ver = MODEL_VERSIONS_DESC[model] || model;
|
|
66
80
|
const sess = getClaudeSessionId(db, sessionKey)?.slice(0, 8) || 'new';
|
|
67
81
|
const head =
|
|
68
|
-
`Model: ${chatConfig.model} (${ver})\n` +
|
|
69
|
-
`Effort: ${chatConfig.effort}\n` +
|
|
70
|
-
`Agent: ${chatConfig.agent}\n` +
|
|
82
|
+
`Model: ${model} (${ver})\n` +
|
|
83
|
+
`Effort: ${effort}\n` +
|
|
84
|
+
`Agent: ${agent}\n` +
|
|
71
85
|
`Process: ${alive ? 'warm' : 'cold'}\n` +
|
|
72
86
|
`Session: ${sess}`;
|
|
73
87
|
|
|
@@ -24,6 +24,13 @@
|
|
|
24
24
|
|
|
25
25
|
const CONCURRENT_WARN_THRESHOLD_DEFAULT = 20;
|
|
26
26
|
|
|
27
|
+
// Startup auto-retry (option a, 2026-06-04): a short breath before silently
|
|
28
|
+
// re-dispatching a message whose first attempt died in the dev-channels startup
|
|
29
|
+
// gate (TMUX_SESSION_GONE). Long enough that a host under momentary load isn't
|
|
30
|
+
// hammered with a back-to-back respawn, short enough that a transient flake
|
|
31
|
+
// still recovers fast enough to feel instant to the user.
|
|
32
|
+
const STARTUP_RETRY_DELAY_MS = 1500;
|
|
33
|
+
|
|
27
34
|
function createDispatcher({
|
|
28
35
|
config,
|
|
29
36
|
db,
|
|
@@ -48,6 +55,9 @@ function createDispatcher({
|
|
|
48
55
|
// the historic 4096 for back-compat in synthetic test runs that pass
|
|
49
56
|
// pre-formatted text.
|
|
50
57
|
chunkBudget = 4096,
|
|
58
|
+
// Delay before a silent startup auto-retry re-dispatches (TMUX_SESSION_GONE).
|
|
59
|
+
// Injected so tests can drive it to 0; production uses STARTUP_RETRY_DELAY_MS.
|
|
60
|
+
startupRetryDelayMs = STARTUP_RETRY_DELAY_MS,
|
|
51
61
|
// State accessors (need late binding because polygram.js mutates):
|
|
52
62
|
getIsShuttingDown, // () → boolean
|
|
53
63
|
logger = console,
|
|
@@ -178,6 +188,26 @@ function createDispatcher({
|
|
|
178
188
|
aborted: wasAborted || undefined,
|
|
179
189
|
replay: isReplay || undefined,
|
|
180
190
|
});
|
|
191
|
+
// Startup-gate death (claude exited during spawn / the dialog gate timed
|
|
192
|
+
// out) of a likely-aged RESUMED session — the persisted claude_session_id
|
|
193
|
+
// can't be resumed cleanly (shumorobot general chat 2026-06-01→03: a
|
|
194
|
+
// week-old session renders claude's "Resume from summary?" dialog whose
|
|
195
|
+
// /compact resume exits code 0 → TMUX_SESSION_GONE → the chat re-resumes
|
|
196
|
+
// the same dead id on every message, stuck for days). Poison-clear so the
|
|
197
|
+
// NEXT message spawns a FRESH session — same recovery the auto-resume path
|
|
198
|
+
// does for BRIDGE_DISCONNECTED below. clearSessionId is a no-op DELETE when
|
|
199
|
+
// there's no row (a genuine fresh-spawn failure), so this is safe; and
|
|
200
|
+
// unlike an in-process recursive retry it never reuses a closed instance.
|
|
201
|
+
if ((err.code === 'TMUX_SESSION_GONE' || err.code === 'CHANNELS_DIALOG_TIMEOUT')
|
|
202
|
+
&& typeof db.clearSessionId === 'function') {
|
|
203
|
+
dbWrite(
|
|
204
|
+
() => db.clearSessionId(sessionKey),
|
|
205
|
+
`clearSessionId: poisoned by ${err.code} on startup`,
|
|
206
|
+
);
|
|
207
|
+
logEvent('session-reset-after-startup-gate', {
|
|
208
|
+
chat_id: chatId, session_key: sessionKey, msg_id: msg?.message_id, code: err.code,
|
|
209
|
+
});
|
|
210
|
+
}
|
|
181
211
|
// rc.55: surface replay failures with a meaningful message.
|
|
182
212
|
// Pre-rc.55 any boot-replay turn that failed for ANY reason
|
|
183
213
|
// was silently dropped. The rc.51-onward boot-replay path is
|
|
@@ -197,6 +227,35 @@ function createDispatcher({
|
|
|
197
227
|
// - shutting down ("Process killed" isn't a real error),
|
|
198
228
|
// - user just /stop'd (already saw their abort ack).
|
|
199
229
|
if (!wasAborted && !isReplay && !isShuttingDown) {
|
|
230
|
+
// Startup auto-retry (option a, 2026-06-04). TMUX_SESSION_GONE = claude
|
|
231
|
+
// exited INSIDE the startup gate, before the dev-channels channel went
|
|
232
|
+
// live — so the user's message was NEVER delivered to claude. That makes
|
|
233
|
+
// a re-send idempotent BY CONSTRUCTION (unlike a mid-turn drop, where
|
|
234
|
+
// claude might still be slowly processing). The session_id was just
|
|
235
|
+
// poison-cleared above, so re-dispatching the SAME message spawns a FRESH
|
|
236
|
+
// session and delivers it. Silent: a transient startup flake (recurs
|
|
237
|
+
// ~once/9h on the channels backend) never reaches the user — instead of
|
|
238
|
+
// the "🔄 reset it, resend" papercut, polygram just retries. One-shot
|
|
239
|
+
// (_startupRetried) so a host that genuinely can't start claude surfaces
|
|
240
|
+
// the friendly reset reply (below) after EXACTLY one retry, never a loop.
|
|
241
|
+
// Scoped to TMUX_SESSION_GONE only: CHANNELS_DIALOG_TIMEOUT is a real
|
|
242
|
+
// blocking dialog (usage-limit / permission) a retry would just re-hit,
|
|
243
|
+
// so it keeps its "please resend" copy.
|
|
244
|
+
if (err.code === 'TMUX_SESSION_GONE' && !msg._startupRetried) {
|
|
245
|
+
logEvent('startup-auto-retry', {
|
|
246
|
+
chat_id: chatId, session_key: sessionKey, msg_id: msg?.message_id,
|
|
247
|
+
});
|
|
248
|
+
// Re-dispatch a COPY carrying the one-shot marker — never mutate the
|
|
249
|
+
// caller's msg (the boot-replay path shares/re-reads it). unref the
|
|
250
|
+
// best-effort timer so a pending retry can't pin the daemon alive
|
|
251
|
+
// (the Telegram long-poll already keeps the loop running).
|
|
252
|
+
const retryMsg = { ...msg, _startupRetried: true };
|
|
253
|
+
setTimeout(
|
|
254
|
+
() => dispatchHandleMessage(sessionKey, chatId, retryMsg, bot),
|
|
255
|
+
startupRetryDelayMs,
|
|
256
|
+
).unref?.();
|
|
257
|
+
return;
|
|
258
|
+
}
|
|
200
259
|
// rc.54: auto-resume on 300s no-activity timeout. The
|
|
201
260
|
// resume turn itself runs through sendToProcess directly
|
|
202
261
|
// (not handleMessage), so its errors don't re-enter this
|
|
@@ -224,6 +283,29 @@ function createDispatcher({
|
|
|
224
283
|
chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
|
|
225
284
|
error: resumeErr?.message?.slice(0, 200),
|
|
226
285
|
});
|
|
286
|
+
// Music topic incident (2026-06-01): a channels session whose
|
|
287
|
+
// context grew large enough to auto-/compact on resume loses its
|
|
288
|
+
// MCP bridge binding on EVERY resume ("no MCP server configured"),
|
|
289
|
+
// so the resumed turn re-detaches (BRIDGE_DISCONNECTED) and lands
|
|
290
|
+
// here. The persisted claude_session_id is then poisoned — every
|
|
291
|
+
// future message (manual resend OR post-cooldown auto-resume)
|
|
292
|
+
// re-resumes it and re-detaches, an endless "🔌 please resend"
|
|
293
|
+
// loop. Break it: drop the session row so the NEXT message spawns
|
|
294
|
+
// a FRESH session (no --resume). Gated on the ORIGINAL error being
|
|
295
|
+
// a bridge-detach AND auto-resume having failed — a one-off bridge
|
|
296
|
+
// crash that resumes cleanly takes the .then() path above and
|
|
297
|
+
// keeps its context; only a session that re-detaches on resume is
|
|
298
|
+
// treated as poison. We lose the poisoned conversation's history,
|
|
299
|
+
// but that session can't complete a turn anyway.
|
|
300
|
+
if (err.code === 'BRIDGE_DISCONNECTED' && typeof db.clearSessionId === 'function') {
|
|
301
|
+
dbWrite(
|
|
302
|
+
() => db.clearSessionId(sessionKey),
|
|
303
|
+
'clearSessionId: poisoned by bridge-detach on resume',
|
|
304
|
+
);
|
|
305
|
+
logEvent('session-reset-after-bridge-detach', {
|
|
306
|
+
chat_id: chatId, session_key: sessionKey, msg_id: msg.message_id,
|
|
307
|
+
});
|
|
308
|
+
}
|
|
227
309
|
const fallbackText = errorReplyText(err);
|
|
228
310
|
if (fallbackText) {
|
|
229
311
|
tg(bot, 'sendMessage', {
|
|
@@ -266,4 +348,5 @@ function createDispatcher({
|
|
|
266
348
|
module.exports = {
|
|
267
349
|
createDispatcher,
|
|
268
350
|
CONCURRENT_WARN_THRESHOLD_DEFAULT,
|
|
351
|
+
STARTUP_RETRY_DELAY_MS,
|
|
269
352
|
};
|