polygram 0.12.0-rc.1 → 0.12.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@
4
4
  "bots": {
5
5
  "admin-bot": {
6
6
  "token": "REPLACE_WITH_BOT_TOKEN_FROM_BOTFATHER",
7
+ "_comment_apiRoot": "Optional. Point grammy at a self-hosted Telegram Bot API server (e.g. 'http://localhost:8082' from a local `telegram-bot-api --local` process) to raise file send/receive limits from cloud's 50MB-out / 20MB-in to 2GB both ways. Omit for cloud Telegram (default, unchanged). The server is a separate localhost-only companion daemon — see docs/0.12.0-file-send.md.",
7
8
  "allowConfigCommands": true,
8
9
  "_comment_adminChatId": "Required when allowConfigCommands is true for pairing commands (/pair-code, /pairings, /unpair) to work. These grant cross-chat trust and are gated to the admin chat only.",
9
10
  "adminChatId": "123456789",
@@ -70,7 +71,8 @@
70
71
  "model": "opus",
71
72
  "effort": "medium",
72
73
  "cwd": "/Users/you/admin-agent",
73
- "timeout": 600
74
+ "timeout": 600,
75
+ "_comment_maxFileBytes": "OPTIONAL per-chat (or per-topic; topic wins) file-size cap in BYTES. There is NO fixed default — the default is backend-derived: cloud Telegram = 50MB send / 20MB receive; with a local Bot API server (bot.apiRoot set) = 2GB both ways. This key only LOWERS that ceiling for this chat (Telegram rejects anything above the backend limit regardless); omit it to use the full backend default. To set one, add e.g. \"maxFileBytes\": 104857600 (=100MB) — a value that large is only meaningful when apiRoot is set, since cloud already clamps to 50MB send / 20MB receive; on cloud, only values below those limits have any effect."
74
76
  },
75
77
 
76
78
  "-1000000000001": {
@@ -22,8 +22,48 @@
22
22
  * extension — the fallback only kicks in when MIME is unhelpful.
23
23
  */
24
24
 
25
- const MAX_FILE_BYTES = 10 * 1024 * 1024;
26
- const MAX_TOTAL_BYTES = 20 * 1024 * 1024;
25
+ // Inbound (user bot) per-file cap. Telegram's cloud Bot API hard-caps
26
+ // bot file DOWNLOADS (getFile) at 20 MB, so 20 MB is the real ceiling on
27
+ // cloud — raised from 10 MB so users can send larger tracks/docs. With a
28
+ // self-hosted Bot API server (config.bot.apiRoot) the Telegram limit rises
29
+ // to 2 GB; resolveFileCaps() raises the default accordingly.
30
+ const MAX_FILE_BYTES = 20 * 1024 * 1024;
31
+ const MAX_TOTAL_BYTES = 50 * 1024 * 1024;
32
+
33
+ // ─── Backend-derived file-size caps (cloud vs local Bot API server) ──
34
+ //
35
+ // These are the HARD ceilings Telegram itself enforces — a per-chat
36
+ // override can lower them but never exceed them (Telegram rejects beyond
37
+ // regardless). NOT "adaptive": there is no intermediate tier. Cloud is a
38
+ // flat 20 in / 50 out; a local `telegram-bot-api --local` server is a flat
39
+ // 2 GB both ways.
40
+ const CLOUD_MAX_IN_BYTES = 20 * 1024 * 1024; // getFile download limit
41
+ const CLOUD_MAX_OUT_BYTES = 50 * 1024 * 1024; // sendDocument upload limit
42
+ const LOCAL_MAX_BYTES = 2000 * 1024 * 1024; // --local server, both ways
43
+
44
+ /**
45
+ * Resolve the effective per-file caps for a chat/topic.
46
+ *
47
+ * @param {object} opts
48
+ * @param {boolean} opts.localApi — true when config.bot.apiRoot is set
49
+ * (a local Bot API server is in use → 2 GB ceiling).
50
+ * @param {number|null} [opts.override=null] — per-chat/topic maxFileBytes (bytes).
51
+ * Resolved by the caller from topic → chat → undefined; clamped to the
52
+ * backend ceiling.
53
+ * @returns {{ inBytes:number, outBytes:number, ceiling:number, localApi:boolean }}
54
+ */
55
+ function resolveFileCaps({ localApi = false, override = null } = {}) {
56
+ const ceiling = localApi ? LOCAL_MAX_BYTES : null;
57
+ const defIn = localApi ? LOCAL_MAX_BYTES : CLOUD_MAX_IN_BYTES;
58
+ const defOut = localApi ? LOCAL_MAX_BYTES : CLOUD_MAX_OUT_BYTES;
59
+ // A numeric override sets BOTH directions to the same value, clamped to
60
+ // the backend hard ceiling (cloud uses the per-direction default as the
61
+ // clamp so an override can't push past Telegram's own limit).
62
+ const ovr = (typeof override === 'number' && override > 0) ? override : null;
63
+ const inBytes = ovr ? (localApi ? Math.min(ovr, ceiling) : Math.min(ovr, CLOUD_MAX_IN_BYTES)) : defIn;
64
+ const outBytes = ovr ? (localApi ? Math.min(ovr, ceiling) : Math.min(ovr, CLOUD_MAX_OUT_BYTES)) : defOut;
65
+ return { inBytes, outBytes, ceiling: ceiling ?? CLOUD_MAX_OUT_BYTES, localApi };
66
+ }
27
67
  const MIME_ALLOW = [
28
68
  /^image\//, /^audio\//, /^video\//,
29
69
  /^application\/pdf$/, /^text\/plain$/,
@@ -109,8 +149,12 @@ function filterAttachments(attachments, opts = {}) {
109
149
 
110
150
  module.exports = {
111
151
  filterAttachments,
152
+ resolveFileCaps,
112
153
  MAX_FILE_BYTES,
113
154
  MAX_TOTAL_BYTES,
155
+ CLOUD_MAX_IN_BYTES,
156
+ CLOUD_MAX_OUT_BYTES,
157
+ LOCAL_MAX_BYTES,
114
158
  MIME_ALLOW,
115
159
  EXTENSION_ALLOW,
116
160
  FALLBACK_MIMES,
@@ -42,13 +42,37 @@ function createHandleAbort({
42
42
  const threadId = msg.message_thread_id?.toString();
43
43
  const sessionKey = getSessionKey(chatId, threadId, chatConfig);
44
44
  const proc = pm.has(sessionKey) ? pm.get(sessionKey) : null;
45
- const hadActive = !!proc?.inFlight;
45
+ let hadActive = !!proc?.inFlight;
46
46
 
47
47
  // Mark BEFORE killing: the 'close' event fires almost immediately
48
48
  // after interrupt, and the surrounding handleMessage's catch
49
49
  // needs to see the flag to skip the generic error-reply.
50
50
  if (hadActive) markSessionAborted(sessionKey);
51
51
 
52
+ // "Stop" incident (shumorobot Music, 2026-05-31 13:08): on the
53
+ // CliProcess/channels backend a turn resolves on the quiet-window
54
+ // after claude's last reply tool call (inFlight → false), but claude
55
+ // can still be working (subagent, long Bash). Keying the ack on
56
+ // inFlight alone made "Stop" say "Nothing to stop" while a subagent
57
+ // download churned. probeBusyState() reads the TUI "esc to interrupt"
58
+ // hint — the truthful signal — so detection, the abort mark, and the
59
+ // ack all agree. The probe result is logged below (forensics) so the
60
+ // heuristic can be refined against real states later. Channels analog
61
+ // of the (deleted) tmux hasBackgroundShell branch; typeof-guarded so
62
+ // it's a no-op on backends without it.
63
+ let busyProbe = null;
64
+ if (!hadActive && proc && typeof proc.probeBusyState === 'function') {
65
+ try {
66
+ busyProbe = await proc.probeBusyState();
67
+ if (busyProbe?.busy) {
68
+ hadActive = true;
69
+ markSessionAborted(sessionKey);
70
+ }
71
+ } catch (err) {
72
+ logger.error?.(`[${botName}] busy-probe failed: ${err.message}`);
73
+ }
74
+ }
75
+
52
76
  // Bug 1 (incident 2026-05-18): "Stop" was turn-scoped — it only
53
77
  // looked at an in-flight TURN. But the agent can leave a DETACHED
54
78
  // background shell running (a `run_in_background:true` Bash) that
@@ -87,6 +111,19 @@ function createHandleAbort({
87
111
  chat_id: chatId, user_id: msg.from?.id || null,
88
112
  had_active: hadActive,
89
113
  killed_background_shell: killedBackgroundShell,
114
+ // "Stop" incident forensics: the raw busy-probe signals at decision
115
+ // time. Lets us query, across real aborts, where the esc-hint /
116
+ // inFlight / pending-turn signals agreed vs diverged and refine the
117
+ // heuristic later. null when no probe ran (turn was already inFlight,
118
+ // or the backend has no probeBusyState).
119
+ busy_probe: busyProbe ? {
120
+ busy: busyProbe.busy,
121
+ streaming: busyProbe.streaming,
122
+ in_flight: busyProbe.inFlight,
123
+ pending_turns: busyProbe.pendingTurns,
124
+ captured: busyProbe.captured,
125
+ pane_tail: busyProbe.paneTail,
126
+ } : null,
90
127
  trigger: cleanText.slice(0, 40),
91
128
  });
92
129
 
@@ -50,7 +50,14 @@ function validateIpcFileParam(method, params = {}) {
50
50
  const fileParam = FILE_PARAM_BY_METHOD[method];
51
51
  if (!fileParam) return null;
52
52
  const val = params[fileParam];
53
- if (typeof val !== 'string') return null; // envelope/Buffer/etc — pass through
53
+ // { source: '/abs/path' } envelope — now coerced to a grammy InputFile in
54
+ // tg() (coerceFileParams). Validate it has a usable absolute source, else
55
+ // pass through (Buffer / stream / InputFile shapes).
56
+ if (val && typeof val === 'object' && typeof val.source === 'string') {
57
+ if (val.source.length === 0) return `polygram IPC: ${fileParam}.source is empty`;
58
+ return null;
59
+ }
60
+ if (typeof val !== 'string') return null; // Buffer/InputFile/etc — pass through
54
61
  if (val.length === 0) return `polygram IPC: ${fileParam} is empty`;
55
62
 
56
63
  const looksUrl = /^(https?|ftp):\/\//i.test(val);
@@ -125,7 +125,7 @@ function createChannelsToolDispatcher({
125
125
  || require('../telegram/process-agent-reply').processAndDeliverAgentText;
126
126
 
127
127
  return async function channelsToolDispatcher(call) {
128
- const { sessionKey, chatId, threadId, toolName, text, files, sourceMsgId } = call;
128
+ const { sessionKey, chatId, threadId, toolName, text, files, sourceMsgId, maxOutboundFileBytes } = call;
129
129
 
130
130
  if (toolName !== 'reply') {
131
131
  // 0.11.0 Phase 1 ships `reply` only — react and edit_message are
@@ -196,6 +196,21 @@ function createChannelsToolDispatcher({
196
196
  failedAttachments.push({ path: filePath, error: check.error });
197
197
  continue;
198
198
  }
199
+ // Backend/chat-derived upload cap. Reject oversize BEFORE upload with
200
+ // a clear error (vs Telegram's cryptic 413/"file is too big") so
201
+ // claude can convert/compress and retry. maxOutboundFileBytes is
202
+ // undefined for non-channels callers → no cap (Telegram still gates).
203
+ if (typeof maxOutboundFileBytes === 'number' && maxOutboundFileBytes > 0) {
204
+ let size = 0;
205
+ try { size = fs.statSync(check.resolved).size; } catch {}
206
+ if (size > maxOutboundFileBytes) {
207
+ const mb = (n) => (n / (1024 * 1024)).toFixed(1);
208
+ const err = `file too large to send: ${mb(size)}MB > ${mb(maxOutboundFileBytes)}MB limit`;
209
+ logger.warn?.(`[channels-tool-dispatcher] ${err} (${check.resolved})`);
210
+ failedAttachments.push({ path: filePath, error: err });
211
+ continue;
212
+ }
213
+ }
199
214
  try {
200
215
  const ext = path.extname(check.resolved).toLowerCase();
201
216
  const isImage = ['.jpg', '.jpeg', '.png', '.gif', '.webp'].includes(ext);
@@ -203,7 +218,10 @@ function createChannelsToolDispatcher({
203
218
  const fieldName = isImage ? 'photo' : 'document';
204
219
  const params = {
205
220
  chat_id: chatId,
206
- [fieldName]: { source: check.resolved },
221
+ // { source } envelope → grammy InputFile in tg()'s coerceFileParams.
222
+ // Pre-fix this bare object reached grammy unrecognized and every
223
+ // upload 400'd with "Wrong port number" (file-send never worked).
224
+ [fieldName]: { source: check.resolved, filename: path.basename(check.resolved) },
207
225
  };
208
226
  if (threadId) params.message_thread_id = threadId;
209
227
  await send(bot, method, params, { source: 'channels-tool-dispatcher', sessionKey });
@@ -48,6 +48,11 @@ const { Process, UnsupportedOperationError } = require('./process');
48
48
  const { ChannelsBridgeServer } = require('./channels-bridge-server');
49
49
  const { writeHookFiles, removeHookFiles } = require('./hook-settings');
50
50
  const { createHookTail } = require('./hook-event-tail');
51
+ // File-send staging: reuse the dispatcher's allowlist root so the dir we
52
+ // create exactly matches the realpath the validator accepts (no /tmp vs
53
+ // /private/tmp drift — one of the original Music-topic failures).
54
+ const { DEFAULT_ATTACHMENT_BASE } = require('./channels-tool-dispatcher');
55
+ const { resolveFileCaps } = require('../attachments');
51
56
  const { runStartupGate } = require('../tmux/startup-gate');
52
57
  const { POLYGRAM_DISPLAY_HINT } = require('../telegram/display-hint');
53
58
 
@@ -251,6 +256,10 @@ class CliProcess extends Process {
251
256
  // pending turn(s): turn_id → { resolve, reject, replies: [], quietTimer, hardTimer, startedAt }
252
257
  this.pendingTurns = new Map();
253
258
 
259
+ // File-send outbound cap (bot → user). Safe cloud default; overwritten in
260
+ // _spawnTmuxClaude with the backend/chat-resolved value before any turn.
261
+ this.maxOutboundFileBytes = resolveFileCaps({ localApi: false }).outBytes;
262
+
254
263
  // P1 security (review #8): track resolved permission request_ids so a
255
264
  // double-fire of respond() can't write a second perm_verdict for the same
256
265
  // request. TmuxProcess gates on _pendingApprovalId; this is the channels
@@ -297,6 +306,23 @@ class CliProcess extends Process {
297
306
  // permit files under the agent's workspace.
298
307
  this.sessionCwd = opts.cwd || null;
299
308
 
309
+ // File-send staging dir (2026-06 file-send feature). The dispatcher
310
+ // allowlist always permits <DEFAULT_ATTACHMENT_BASE>/<sessionKey>/, but
311
+ // nothing ever CREATED it — so claude's reply(files) attempts at
312
+ // /tmp/polygram-attachments failed (dir absent / realpath mismatch) and
313
+ // it flailed across other paths. Create it here and surface it to the
314
+ // prompt so claude has one blessed, always-allowed place to stage a file
315
+ // before sending. realpathSync so the stored path matches what the
316
+ // validator resolves (the /tmp ↔ /private/tmp fix).
317
+ try {
318
+ const dir = path.join(DEFAULT_ATTACHMENT_BASE, String(this.sessionKey));
319
+ fs.mkdirSync(dir, { recursive: true, mode: 0o700 });
320
+ this.attachmentStagingDir = fs.realpathSync(dir);
321
+ } catch (err) {
322
+ this.attachmentStagingDir = null;
323
+ this.logger.warn?.(`[${this.label}] channels: staging dir create failed: ${err.message}`);
324
+ }
325
+
300
326
  // Opaque random token for socket filename — do NOT leak sessionKey to /tmp.
301
327
  const socketToken = crypto.randomBytes(16).toString('hex');
302
328
  this.sockPath = path.join(os.tmpdir(), `polygram-${socketToken}.sock`);
@@ -416,28 +442,7 @@ class CliProcess extends Process {
416
442
 
417
443
  this.bridgeServer.on('bridge-message', msg => this._handleBridgeMessage(msg));
418
444
 
419
- this.bridgeServer.on('bridge-disconnected', () => {
420
- this.bridgeReady = false;
421
- this.mcpReady = false;
422
- if (!this.closed) {
423
- this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly`);
424
- // P1 #5: drain pendingTurns immediately so hardTimers don't run 10min.
425
- for (const [, pending] of this.pendingTurns) {
426
- if (pending.quietTimer) clearTimeout(pending.quietTimer);
427
- if (pending.hardTimer) clearTimeout(pending.hardTimer);
428
- if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
429
- if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
430
- const err = new Error('bridge disconnected');
431
- err.code = 'BRIDGE_DISCONNECTED';
432
- try { pending.reject(err); } catch {}
433
- }
434
- this.pendingTurns.clear();
435
- this.pendingQueue.length = 0;
436
- this.inFlight = false;
437
- this.emit('bridge-disconnected');
438
- this._logEvent('bridge-disconnected', { reason: 'socket-close' });
439
- }
440
- });
445
+ this.bridgeServer.on('bridge-disconnected', () => this._handleBridgeDisconnected());
441
446
 
442
447
  await this.bridgeServer.listen();
443
448
  }
@@ -493,6 +498,18 @@ class CliProcess extends Process {
493
498
  const effort = topicConfig?.effort || opts.chatConfig?.effort || opts.effort;
494
499
  const resolvedCwd = topicConfig?.cwd || opts.chatConfig?.cwd || opts.cwd;
495
500
 
501
+ // File-send outbound cap (bot → user). Backend-derived (cloud 50MB vs
502
+ // local Bot API server 2GB via opts.localApi) with per-topic/chat
503
+ // maxFileBytes override, clamped to the backend ceiling. Stored for the
504
+ // dispatcher (live size-check) and the system prompt (so claude states
505
+ // the right limit). Resolved here so it follows the same topic→chat
506
+ // precedence as cwd/agent above.
507
+ const _capOverride = topicConfig?.maxFileBytes ?? opts.chatConfig?.maxFileBytes ?? null;
508
+ this.maxOutboundFileBytes = resolveFileCaps({
509
+ localApi: !!opts.localApi,
510
+ override: _capOverride,
511
+ }).outBytes;
512
+
496
513
  // Parity audit P8 + rc.8 fs-guard (2026-05-26 shumorobot Music topic):
497
514
  // `--session-id <id>` creates a NEW claude session with that id;
498
515
  // `--resume <id>` resumes the EXISTING conversation. Lazy-respawn after
@@ -540,6 +557,9 @@ class CliProcess extends Process {
540
557
  );
541
558
  }
542
559
  }
560
+ // Finding 0.12-M2: record the resume decision so _armHookTail (run
561
+ // after spawn) skips the prior session's still-on-disk hook ndjson.
562
+ this._resumedSession = canResume;
543
563
  if (agent) claudeArgs.push('--agent', agent);
544
564
  if (model) claudeArgs.unshift('--model', model);
545
565
  if (effort) claudeArgs.push('--effort', effort);
@@ -616,6 +636,28 @@ class CliProcess extends Process {
616
636
  'Internal tool calls (Bash, Edit, Write, Read, etc.) are fine to use',
617
637
  'as normal — only the FINAL user-visible message needs to go through',
618
638
  'the reply tool.',
639
+ '',
640
+ '### Sending FILES (tracks, images, docs) to the user',
641
+ '',
642
+ 'The `mcp__polygram-bridge__reply` tool takes an optional `files` array of',
643
+ 'absolute paths. This is the ONLY way to send a file. Do NOT use Bash,',
644
+ 'curl, the Telegram Bot API, or polygram-ipc to send files — those fail.',
645
+ '',
646
+ ...(this.attachmentStagingDir ? [
647
+ `To send a file: COPY it into the staging dir \`${this.attachmentStagingDir}\`,`,
648
+ 'then call reply with its absolute path, e.g.:',
649
+ ` reply(chat_id="<id>", text="Here's the track", files=["${this.attachmentStagingDir}/track.flac"])`,
650
+ 'polygram auto-deletes staged files after the turn — you do not need to clean up.',
651
+ 'You may also send directly from the agent workspace (cwd); other paths are rejected.',
652
+ ] : [
653
+ 'Copy the file somewhere under your workspace (cwd) and pass its absolute',
654
+ 'path in `files`. Paths outside the workspace are rejected for safety.',
655
+ ]),
656
+ '',
657
+ `Max file size for sending: ${Math.round(this.maxOutboundFileBytes / (1024 * 1024))} MB. ` +
658
+ 'For larger lossless audio, convert to FLAC/MP3 under the limit first, ' +
659
+ 'or tell the user it exceeds the limit. Images go as photos; everything ' +
660
+ 'else as documents.',
619
661
  ].join('\n'));
620
662
 
621
663
  // Parity audit P6: honor isolateUserConfig — mirrors tmux pattern at
@@ -705,6 +747,20 @@ class CliProcess extends Process {
705
747
  ],
706
748
  readySignal: /Listening for channel messages from: server:polygram-bridge/i,
707
749
  timeoutCode: 'CHANNELS_DIALOG_TIMEOUT',
750
+ // Progress-aware gate (shumorobot General incident 2026-05-30): a
751
+ // cold spawn that's mid-download (runtime fetch, "24%" progress bar)
752
+ // is genuinely working and must NOT be killed by the blind 30s
753
+ // wall-clock. stallMs fails fast only when the pane is FROZEN; an
754
+ // actively-changing pane (download bar, dialog nav) keeps resetting
755
+ // the stall clock and rides out to the ready signal. deadlineMs stays
756
+ // the absolute backstop. stallMs of zero pane activity = genuinely wedged.
757
+ // Stall = pane rendered then went static (genuinely wedged). 60s, not
758
+ // 30s: some topics' TUIs cold-render slowly (Music ~45s, slow MCP
759
+ // startup) — 30s was too tight and false-aborted them. Blank panes
760
+ // don't arm the stall timer at all now (see runStartupGate), so this
761
+ // only bounds a TUI that rendered and then truly hung.
762
+ stallMs: this.startupGateStallMs ?? 60_000,
763
+ deadlineMs: this.startupGateDeadlineMs ?? 180_000,
708
764
  logger: this.logger,
709
765
  label: `${this.label}:startup-gate`,
710
766
  });
@@ -849,15 +905,18 @@ class CliProcess extends Process {
849
905
  // rate-limit / chat-id-mismatch path. Live shumorobot 2026-05-26 23:44
850
906
  // observed 3+ "Called polygram-bridge" entries in the TUI pane with
851
907
  // ZERO OUT messages delivered to TG and zero warn-level diagnostics —
852
- // need to see args.text / args.chat_id / args.turn_id to know whether
853
- // claude is calling reply with empty text, wrong chat_id, or something
854
- // else entirely.
855
- this.logger.warn?.(
908
+ // need to see args.chat_id / args.turn_id to know whether claude is
909
+ // calling reply with empty text, wrong chat_id, or something else.
910
+ // L13: root-caused — demoted to debug and DROPPED text_head. Logging
911
+ // the first 80 chars of every reply at warn level leaked private chat
912
+ // content / file excerpts / secrets into the default log sink,
913
+ // unconditionally. name/chat_id/turn_id/text_len diagnose dispatch
914
+ // without exposing message content.
915
+ this.logger.debug?.(
856
916
  `[${this.label}] channels: tool-call name=${msg.name} ` +
857
917
  `chat_id=${JSON.stringify(args.chat_id)} ` +
858
918
  `turn_id=${JSON.stringify(args.turn_id)} ` +
859
- `text_len=${typeof args.text === 'string' ? args.text.length : 'non-string'} ` +
860
- `text_head=${JSON.stringify((args.text || '').slice(0, 80))}`,
919
+ `text_len=${typeof args.text === 'string' ? args.text.length : 'non-string'}`,
861
920
  );
862
921
 
863
922
  // Review P1 #7: idempotency. If we've already ACK'd this tool_call_id,
@@ -948,6 +1007,7 @@ class CliProcess extends Process {
948
1007
  text: args.text,
949
1008
  files: args.files,
950
1009
  sessionCwd: this.sessionCwd, // P0 #2: dispatcher uses this to allowlist file roots
1010
+ maxOutboundFileBytes: this.maxOutboundFileBytes, // backend/chat-derived upload cap
951
1011
  });
952
1012
  } catch (err) {
953
1013
  this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: false, error: err.message });
@@ -1122,13 +1182,27 @@ class CliProcess extends Process {
1122
1182
  this._finalizeTurn(turnId);
1123
1183
  };
1124
1184
  const onStop = (info) => {
1125
- // Capture the fallback text; the actual finalize call below will pick
1126
- // it up via pending._stopHookData.
1185
+ // Finding 0.12-M1: the Stop hook carries NO turn_id, and a single
1186
+ // global 'stop-hook' emission fires EVERY per-turn onStop listener.
1187
+ // When more than one turn is in stop-grace we cannot attribute this
1188
+ // Stop (or its last_assistant_message) to a specific turn — the
1189
+ // pre-fix code let one Stop finalize all grace-pending turns and
1190
+ // cross-attribute one turn's text to another (the exact class the
1191
+ // F#3 reply routing prevents). Mirror that drop-rather-than-
1192
+ // misattribute discipline: only consume the Stop when exactly ONE
1193
+ // turn is in grace; otherwise ignore it and let each turn finalize
1194
+ // on its own grace timer (each keeps its own reply text).
1195
+ let graceCount = 0;
1196
+ for (const p of this.pendingTurns.values()) if (p._stopGracePending) graceCount++;
1197
+ if (graceCount !== 1) return;
1127
1198
  pending._stopHookData = info;
1128
1199
  clearTimeout(pending._stopGraceTimer);
1129
1200
  pending._stopGraceTimer = null;
1130
1201
  finalize();
1131
1202
  };
1203
+ // L5: stash the closure so teardown paths that bypass Process.kill()'s
1204
+ // removeAllListeners (bridge-disconnect drain, resetSession) can off it.
1205
+ pending._onStop = onStop;
1132
1206
  pending._stopGraceTimer = setTimeout(finalize, this.stopGraceMs);
1133
1207
  // unref so a never-fired grace doesn't pin the event loop. In tests
1134
1208
  // where a CliProcess is created, send() is called, then the test
@@ -1195,6 +1269,27 @@ class CliProcess extends Process {
1195
1269
  pending.resolve(result);
1196
1270
  this.emit('result', { subtype: 'success' }, { streamText: text });
1197
1271
  this.emit('idle');
1272
+ // File-send staging auto-purge (by design — claude is never asked to delete).
1273
+ // Once the LAST turn settles, wipe the staging dir's contents so files
1274
+ // claude copied in to send don't accumulate on disk across turns. Only
1275
+ // when fully idle, so a file staged for a still-pending concurrent turn
1276
+ // isn't yanked mid-send.
1277
+ if (this.pendingTurns.size === 0) this._purgeStagingDir();
1278
+ }
1279
+
1280
+ /**
1281
+ * Empty the per-session file-send staging dir (keep the dir itself).
1282
+ * Best-effort; never throws. Called when the session goes idle and on kill.
1283
+ */
1284
+ _purgeStagingDir() {
1285
+ if (!this.attachmentStagingDir) return;
1286
+ let entries;
1287
+ try { entries = fs.readdirSync(this.attachmentStagingDir); }
1288
+ catch { return; }
1289
+ for (const name of entries) {
1290
+ try { fs.rmSync(path.join(this.attachmentStagingDir, name), { recursive: true, force: true }); }
1291
+ catch { /* best-effort */ }
1292
+ }
1198
1293
  }
1199
1294
 
1200
1295
  // ─── public Process API ──────────────────────────────────────────
@@ -1386,6 +1481,63 @@ class CliProcess extends Process {
1386
1481
  this._interruptGraceTimer.unref?.();
1387
1482
  }
1388
1483
 
1484
+ /**
1485
+ * Is claude actually still working, regardless of the resolved-turn flag?
1486
+ *
1487
+ * "Stop" incident (shumorobot Music, 2026-05-31 13:08): the channels
1488
+ * backend resolves a turn on the quiet-window after claude's last reply
1489
+ * tool call (inFlight → false), but claude can keep working afterwards
1490
+ * (a subagent, a long Bash). The abort handler keyed its ack on inFlight
1491
+ * alone, so "Stop" said "Nothing to stop" one second after the bot said
1492
+ * "On it — downloading…" while a subagent churned.
1493
+ *
1494
+ * The TUI prints "esc to interrupt" (STREAMING_HINT_RE) continuously
1495
+ * whenever claude is busy — capture-pane is the truthful signal, the
1496
+ * channels analog of the (deleted) tmux hasBackgroundShell() probe.
1497
+ *
1498
+ * Returns a STRUCTURED probe (not just a boolean) so the abort path can
1499
+ * log the raw signals — pane tail + flags — to the events DB. That lets
1500
+ * us later characterize which states the heuristic gets right/wrong and
1501
+ * refine it (e.g. add signals beyond the esc-hint) without guessing.
1502
+ *
1503
+ * Never throws — a failed capture returns captured:false, busy:false.
1504
+ *
1505
+ * @returns {Promise<{busy:boolean, streaming:boolean, inFlight:boolean,
1506
+ * pendingTurns:number, captured:boolean, paneTail:(string|null)}>}
1507
+ */
1508
+ async probeBusyState() {
1509
+ const base = {
1510
+ busy: false, streaming: false,
1511
+ inFlight: this.inFlight, pendingTurns: this.pendingTurns.size,
1512
+ captured: false, paneTail: null,
1513
+ };
1514
+ if (this.closed || !this.tmuxSession || typeof this.runner?.captureWide !== 'function') {
1515
+ return base;
1516
+ }
1517
+ let pane;
1518
+ try {
1519
+ pane = await this.runner.captureWide(this.tmuxSession);
1520
+ } catch (err) {
1521
+ this.logger.warn?.(`[${this.label}] channels: probeBusyState captureWide failed: ${err.message}`);
1522
+ return base;
1523
+ }
1524
+ if (!pane) return base;
1525
+ const streaming = STREAMING_HINT_RE.test(pane);
1526
+ return {
1527
+ ...base,
1528
+ busy: streaming,
1529
+ streaming,
1530
+ captured: true,
1531
+ paneTail: pane.slice(-200),
1532
+ };
1533
+ }
1534
+
1535
+ /** Boolean shorthand for probeBusyState().busy (abort-path convenience). */
1536
+ async isBusy() {
1537
+ const { busy } = await this.probeBusyState();
1538
+ return busy;
1539
+ }
1540
+
1389
1541
  async kill(reason = 'kill') {
1390
1542
  if (this.closed) return;
1391
1543
  // Parity P19: re-entry guard for concurrent kill() calls. Mirrors
@@ -1415,17 +1567,18 @@ class CliProcess extends Process {
1415
1567
  this.logger.warn?.(`[${this.label}] _armHookTail: _hookNdjsonPath unset; hooks disabled. Phase 1.2 may have failed.`);
1416
1568
  return;
1417
1569
  }
1418
- // Fresh spawn: ndjson was just touched by writeHookFiles and is empty,
1419
- // so `skipExisting: false` (default) is correct. For lazy-respawn on
1420
- // existingSessionId, we currently re-run writeHookFiles which touches
1421
- // a NEW file with the same name (overwrite). If we ever switch to
1422
- // resume-without-touch, set skipExisting: true to avoid replaying
1423
- // stale events from the prior process — same pattern tmux uses on
1424
- // --resume per rc.42 #5.
1570
+ // Finding 0.12-M2: writeHookFiles opens the ndjson in APPEND mode
1571
+ // ('a') and never truncates, so on a --resume respawn the prior
1572
+ // session's hook lines are still on disk under the same path. Replaying
1573
+ // them re-drives the turn state machine from stale Stop/PreToolUse
1574
+ // events (a stale Stop can finalize the fresh turn). So skip existing
1575
+ // content when (and only when) this is a resumed session the same
1576
+ // discipline the JSONL tail uses on --resume. A fresh spawn's ndjson is
1577
+ // empty, so skipExisting:false is correct there.
1425
1578
  this._hookTail = createHookTail({
1426
1579
  path: this._hookNdjsonPath,
1427
1580
  logger: this.logger,
1428
- skipExisting: false,
1581
+ skipExisting: this._resumedSession === true,
1429
1582
  });
1430
1583
  this._hookTail.on('event', (ev) => {
1431
1584
  try {
@@ -1465,25 +1618,18 @@ class CliProcess extends Process {
1465
1618
  // gates tag-out on median < 2s and p99 < 5s across the events DB.
1466
1619
  if (Number.isFinite(ev.receivedAtMs)) {
1467
1620
  const lagMs = Date.now() - ev.receivedAtMs;
1621
+ // L10: emit ONLY — the onHookLagSample callback owns the DB write
1622
+ // (CALLBACK_TO_EVENT → callbacks.js). Previously this ALSO wrote
1623
+ // directly via this.db.logEvent, double-persisting every sample and
1624
+ // inflating the Phase 1.8 soak-gate row count. Consistent with how
1625
+ // tool-result / subagent-start / subagent-done are handled (emit,
1626
+ // don't double-write).
1468
1627
  this.emit('hook-lag-sample', {
1469
1628
  hookEventName: ev.type,
1470
1629
  lagMs,
1471
1630
  toolName: ev.toolName || null,
1472
1631
  backend: this.backend,
1473
1632
  });
1474
- // Log to events DB if wired. db is optional (factory injects when
1475
- // available) — same pattern as the other parity-P1 _logEvent calls.
1476
- if (this.db?.logEvent) {
1477
- try {
1478
- this.db.logEvent('hook-lag-sample', {
1479
- session_key: this.sessionKey,
1480
- backend: this.backend,
1481
- hook_event_name: ev.type,
1482
- tool_name: ev.toolName || null,
1483
- lag_ms: lagMs,
1484
- });
1485
- } catch {}
1486
- }
1487
1633
  }
1488
1634
 
1489
1635
  switch (ev.type) {
@@ -1503,6 +1649,16 @@ class CliProcess extends Process {
1503
1649
  const subagentType = ev.toolInput?.subagent_type
1504
1650
  || ev.toolInput?.agent_type
1505
1651
  || 'general-purpose';
1652
+ // Finding 0.12-M4: SubagentStop carries agent_id/agent_type but
1653
+ // NOT the originating Agent tool_use_id, so without help the
1654
+ // subagent-start/subagent-done rows share no JOIN key (the
1655
+ // documented soak query on $.tool_use_id returns zero rows).
1656
+ // Track the in-flight Agent tool_use_id keyed by subagent type so
1657
+ // the paired SubagentStop below can stamp it onto subagent-done.
1658
+ (this._pendingSubagentStarts ||= []).push({
1659
+ agentType: subagentType,
1660
+ toolUseId: ev.toolUseId,
1661
+ });
1506
1662
  this.emit('subagent-start', {
1507
1663
  agentType: subagentType,
1508
1664
  // PreToolUse for Agent carries no agent_id (set later on
@@ -1541,14 +1697,27 @@ class CliProcess extends Process {
1541
1697
  });
1542
1698
  return;
1543
1699
 
1544
- case 'SubagentStop':
1700
+ case 'SubagentStop': {
1701
+ // Finding 0.12-M4: recover the originating Agent tool_use_id so the
1702
+ // subagent-start/subagent-done pair is JOINable. Prefer a match on
1703
+ // agent type (correct for parallel subagents of different types);
1704
+ // fall back to the oldest pending start when types don't line up.
1705
+ let subagentToolUseId = null;
1706
+ const pendingStarts = this._pendingSubagentStarts;
1707
+ if (pendingStarts && pendingStarts.length) {
1708
+ let idx = pendingStarts.findIndex(s => s.agentType === ev.agentType);
1709
+ if (idx < 0) idx = 0;
1710
+ subagentToolUseId = pendingStarts.splice(idx, 1)[0]?.toolUseId ?? null;
1711
+ }
1545
1712
  this.emit('subagent-done', {
1546
1713
  agentType: ev.agentType,
1547
1714
  agentId: ev.agentId,
1548
1715
  durationMs: ev.durationMs,
1716
+ toolUseId: subagentToolUseId,
1549
1717
  backend: this.backend,
1550
1718
  });
1551
1719
  return;
1720
+ }
1552
1721
 
1553
1722
  case 'Stop':
1554
1723
  // Phase 1.7 (TODO) will use this as the authoritative turn-end
@@ -1665,6 +1834,50 @@ class CliProcess extends Process {
1665
1834
  }
1666
1835
  }
1667
1836
 
1837
+ /**
1838
+ * Drain on unexpected bridge socket loss (claude crash, bridge crash,
1839
+ * EOF). Extracted from the inline 'bridge-disconnected' handler so the
1840
+ * teardown is testable and consistent with _doKill.
1841
+ *
1842
+ * Findings 0.12-L5 + L6: in addition to clearing the per-turn timers
1843
+ * and rejecting pendings (the original P1 #5 behavior), this now also
1844
+ * (L5) removes each turn's stop-hook listener — this drain does NOT go
1845
+ * through Process.kill()'s blanket removeAllListeners, so a turn torn
1846
+ * down mid-stop-grace would otherwise leak its onStop closure — and
1847
+ * (L6) clears _interruptGraceTimer, matching _doKill (a /stop verdict
1848
+ * landing just before the disconnect would otherwise leave a stray
1849
+ * timer on the dead instance).
1850
+ */
1851
+ _handleBridgeDisconnected() {
1852
+ this.bridgeReady = false;
1853
+ this.mcpReady = false;
1854
+ if (this.closed) return;
1855
+ this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly`);
1856
+ // L6: clear the interrupt grace timer alongside the rest of the lifecycle.
1857
+ if (this._interruptGraceTimer) {
1858
+ clearTimeout(this._interruptGraceTimer);
1859
+ this._interruptGraceTimer = null;
1860
+ }
1861
+ // P1 #5: drain pendingTurns immediately so hardTimers don't run 10min.
1862
+ for (const [, pending] of this.pendingTurns) {
1863
+ if (pending.quietTimer) clearTimeout(pending.quietTimer);
1864
+ if (pending.hardTimer) clearTimeout(pending.hardTimer);
1865
+ if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1866
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1867
+ // L5: remove the per-turn stop-hook listener (this path bypasses
1868
+ // Process.kill()'s removeAllListeners).
1869
+ if (pending._onStop) this.off('stop-hook', pending._onStop);
1870
+ const err = new Error('bridge disconnected');
1871
+ err.code = 'BRIDGE_DISCONNECTED';
1872
+ try { pending.reject(err); } catch {}
1873
+ }
1874
+ this.pendingTurns.clear();
1875
+ this.pendingQueue.length = 0;
1876
+ this.inFlight = false;
1877
+ this.emit('bridge-disconnected');
1878
+ this._logEvent('bridge-disconnected', { reason: 'socket-close' });
1879
+ }
1880
+
1668
1881
  async _doKill(reason) {
1669
1882
  this.closed = true;
1670
1883
  this.inFlight = false;
@@ -1688,6 +1901,7 @@ class CliProcess extends Process {
1688
1901
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1689
1902
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1690
1903
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1904
+ if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1691
1905
  const err = new Error(`session killed: ${reason}`);
1692
1906
  err.code = 'KILLED';
1693
1907
  pending.reject(err);
@@ -1734,6 +1948,12 @@ class CliProcess extends Process {
1734
1948
  if (this.botName && this.claudeSessionId) {
1735
1949
  try { removeHookFiles({ botName: this.botName, sessionId: this.claudeSessionId }); } catch {}
1736
1950
  }
1951
+ // File-send staging: remove the whole per-session dir on kill (purge only
1952
+ // empties it between turns; kill is end-of-life so drop it entirely).
1953
+ if (this.attachmentStagingDir) {
1954
+ try { fs.rmSync(this.attachmentStagingDir, { recursive: true, force: true }); } catch {}
1955
+ this.attachmentStagingDir = null;
1956
+ }
1737
1957
 
1738
1958
  this.emit('close', 0);
1739
1959
  }
@@ -1876,6 +2096,8 @@ class CliProcess extends Process {
1876
2096
  if (pending.quietTimer) clearTimeout(pending.quietTimer);
1877
2097
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1878
2098
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
2099
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer); // L5
2100
+ if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1879
2101
  const err = new Error(`session reset: ${reason}`);
1880
2102
  err.code = 'RESET';
1881
2103
  try { pending.reject(err); } catch {}
@@ -91,10 +91,6 @@ function _maybeWarnR12Migration({ rawPm, canonical, chatId, threadId, chatCfg, t
91
91
  * @param {number} [opts.queryCloseTimeoutMs]
92
92
  * @param {object} [opts.tmuxRunner] — required when ANY chat routes to 'cli'
93
93
  * @param {string} [opts.botName] — required when ANY chat routes to 'cli'
94
- * @param {object} [opts.pollScheduler] — DEPRECATED in 0.12 — was used by the
95
- * removed tmux backend to share one setInterval across all chats; CliProcess's
96
- * per-session pongWatchdog handles its own cadence. Param kept for caller
97
- * back-compat; ignored. Will be removed in 0.13.
98
94
  * @param {Function} [opts.toolDispatcher] — required when ANY chat routes to 'cli'.
99
95
  * async ({sessionKey, chatId, threadId, toolName, text, files}) => {ok, error?}.
100
96
  * Called when Claude's reply (or react/edit_message) tool fires inside a
@@ -113,7 +109,6 @@ function createProcessFactory({
113
109
  queryCloseTimeoutMs,
114
110
  tmuxRunner = null,
115
111
  botName = null,
116
- pollScheduler = null,
117
112
  toolDispatcher = null,
118
113
  channelsClaudeBin = null,
119
114
  } = {}) {
@@ -123,6 +123,19 @@ const CALLBACK_TO_EVENT = {
123
123
  // menu auto-dismissed by `_waitForReady`. Surfacing the event so
124
124
  // soak can count how often aged-session resumes hit this path.
125
125
  onSessionAgePromptDismissed: 'session-age-prompt-dismissed',
126
+ // 0.12 CliProcess observability — typed hook events from cli-process.js
127
+ // _handleHookEvent. Each gets its own callback so polygram can persist
128
+ // structured rows to the events DB for soak-time aggregate queries.
129
+ // - hook-lag-sample: Phase 1.8 — per-event lag_ms (target: median<2s, p99<5s)
130
+ // - tool-result: Phase 1.3 — PostToolUse durationMs per tool
131
+ // - subagent-start / subagent-done: Phase 1.3 — typed subagent lifecycle
132
+ // (we DO get tool-use='Agent' via onToolUse, but agent_type + durationMs
133
+ // only fire on these typed events). SDK backend never emits — hooks
134
+ // are CliProcess-specific (and were tmux-specific in 0.10–0.11).
135
+ onHookLagSample: 'hook-lag-sample',
136
+ onToolResult: 'tool-result',
137
+ onSubagentStart: 'subagent-start',
138
+ onSubagentDone: 'subagent-done',
126
139
  };
127
140
 
128
141
  class ProcessManager {
@@ -464,7 +464,10 @@ function createSdkCallbacks({
464
464
  const detail = {
465
465
  chat_id: getChatIdFromKey(sessionKey),
466
466
  session_key: sessionKey,
467
- backend: 'tmux',
467
+ // Finding 0.12-M3: tmux backend was deleted in 0.12; these hook
468
+ // handlers only ever fire on the CLI driver now — default to 'cli'
469
+ // (honor an explicit payload.backend if a caller ever sets one).
470
+ backend: payload?.backend ?? 'cli',
468
471
  hook_type: payload?.type ?? null,
469
472
  claude_session_id: payload?.sessionId ?? null,
470
473
  tool_name: payload?.toolName ?? null,
@@ -555,7 +558,7 @@ function createSdkCallbacks({
555
558
  logEvent('turn-timeout', {
556
559
  chat_id: getChatIdFromKey(sessionKey),
557
560
  session_key: sessionKey,
558
- backend: 'tmux',
561
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3
559
562
  turn_id: payload?.turnId ?? null,
560
563
  reason: payload?.reason ?? null,
561
564
  idle_ms: payload?.idleMs ?? null,
@@ -578,7 +581,7 @@ function createSdkCallbacks({
578
581
  logEvent('hook-tail-error', {
579
582
  chat_id: getChatIdFromKey(sessionKey),
580
583
  session_key: sessionKey,
581
- backend: 'tmux',
584
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3 (fires on the CLI hook tail)
582
585
  message: (payload?.message || '').slice(0, 200),
583
586
  path: payload?.path ?? null,
584
587
  claude_session_id: payload?.sessionId ?? null,
@@ -596,7 +599,7 @@ function createSdkCallbacks({
596
599
  logEvent('stop-hook-resolved', {
597
600
  chat_id: getChatIdFromKey(sessionKey),
598
601
  session_key: sessionKey,
599
- backend: 'tmux',
602
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3
600
603
  turn_id: payload?.turnId ?? null,
601
604
  claude_session_id: payload?.sessionId ?? null,
602
605
  });
@@ -614,7 +617,7 @@ function createSdkCallbacks({
614
617
  logEvent('session-age-prompt-dismissed', {
615
618
  chat_id: getChatIdFromKey(sessionKey),
616
619
  session_key: sessionKey,
617
- backend: 'tmux',
620
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3
618
621
  claude_session_id: payload?.sessionId ?? null,
619
622
  });
620
623
  } catch (err) {
@@ -622,6 +625,108 @@ function createSdkCallbacks({
622
625
  }
623
626
  },
624
627
 
628
+ // 0.12 Phase 1.8 — hook-lag persistence for the soak gate (median<2s,
629
+ // p99<5s). Each row carries the hookEventName + lagMs so we can:
630
+ // SELECT json_extract(detail_json, '$.hook_event_name') AS evt,
631
+ // AVG(json_extract(detail_json, '$.lag_ms')) AS avg_lag,
632
+ // MAX(json_extract(detail_json, '$.lag_ms')) AS max_lag
633
+ // FROM events WHERE kind='hook-lag-sample' AND ts>...
634
+ // GROUP BY evt;
635
+ onHookLagSample: (sessionKey, payload /* , entry */) => {
636
+ try {
637
+ logEvent('hook-lag-sample', {
638
+ chat_id: getChatIdFromKey(sessionKey),
639
+ session_key: sessionKey,
640
+ backend: payload?.backend ?? 'cli',
641
+ hook_event_name: payload?.hookEventName ?? null,
642
+ lag_ms: payload?.lagMs ?? null,
643
+ tool_name: payload?.toolName ?? null,
644
+ });
645
+ } catch (err) {
646
+ logger.error?.(`[${botName}] hook-lag-sample handler: ${err.message}`);
647
+ }
648
+ },
649
+
650
+ // 0.12 Phase 1.3 — tool-result with durationMs. Pairs with the
651
+ // existing onToolUse row (which fires on PreToolUse) so the soak can
652
+ // compute per-tool average + p99 durations:
653
+ // SELECT json_extract(detail_json, '$.tool_name') AS tool,
654
+ // AVG(json_extract(detail_json, '$.duration_ms')) AS avg_ms,
655
+ // MAX(json_extract(detail_json, '$.duration_ms')) AS max_ms
656
+ // FROM events WHERE kind='tool-result' GROUP BY tool;
657
+ // isError captures the rare PostToolUse where the tool itself failed
658
+ // (vs the tool succeeding but claude deciding to retry).
659
+ onToolResult: (sessionKey, payload /* , entry */) => {
660
+ try {
661
+ logEvent('tool-result', {
662
+ chat_id: getChatIdFromKey(sessionKey),
663
+ session_key: sessionKey,
664
+ backend: payload?.backend ?? 'cli',
665
+ tool_name: payload?.name ?? null,
666
+ duration_ms: payload?.durationMs ?? null,
667
+ agent_id: payload?.agentId ?? null,
668
+ agent_type: payload?.agentType ?? null,
669
+ tool_use_id: payload?.toolUseId ?? null,
670
+ is_error: payload?.isError === true,
671
+ });
672
+ } catch (err) {
673
+ logger.error?.(`[${botName}] tool-result handler: ${err.message}`);
674
+ }
675
+ },
676
+
677
+ // 0.12 Phase 1.3 — subagent lifecycle. PreToolUse with name='Agent'
678
+ // synthesizes 'subagent-start' (no agent_id yet — claude doesn't
679
+ // hand one out until the inner SubagentStop). 'subagent-done' carries
680
+ // the agent_id + duration_ms, plus the tool_use_id recovered from the paired Agent PreToolUse, so a soak can correlate the pair:
681
+ // SELECT s.detail_json AS start, d.detail_json AS done
682
+ // FROM events s JOIN events d
683
+ // ON json_extract(s.detail_json, '$.tool_use_id') =
684
+ // json_extract(d.detail_json, '$.tool_use_id')
685
+ // WHERE s.kind='subagent-start' AND d.kind='subagent-done';
686
+ onSubagentStart: (sessionKey, payload, entry) => {
687
+ try {
688
+ logEvent('subagent-start', {
689
+ chat_id: getChatIdFromKey(sessionKey),
690
+ session_key: sessionKey,
691
+ backend: payload?.backend ?? 'cli',
692
+ agent_type: payload?.agentType ?? null,
693
+ tool_use_id: payload?.toolUseId ?? null,
694
+ });
695
+ // Findings L9/L14: drive the head reactor into the distinct SUBAGENT
696
+ // state so a running subagent shows 👾 rather than freezing on the
697
+ // prior tool's emoji. The plan promised this; previously the handler
698
+ // only persisted the DB row and never touched the reactor.
699
+ const r = entry?.pendingQueue?.[0]?.context?.reactor;
700
+ if (r) r.setState('SUBAGENT');
701
+ } catch (err) {
702
+ logger.error?.(`[${botName}] subagent-start handler: ${err.message}`);
703
+ }
704
+ },
705
+
706
+ onSubagentDone: (sessionKey, payload, entry) => {
707
+ try {
708
+ // L9/L14: heartbeat at subagent end so the cascade/stall clock
709
+ // resets; the next tool's PreToolUse sets the following state.
710
+ const r = entry?.pendingQueue?.[0]?.context?.reactor;
711
+ if (r && typeof r.heartbeat === 'function') r.heartbeat();
712
+ logEvent('subagent-done', {
713
+ chat_id: getChatIdFromKey(sessionKey),
714
+ session_key: sessionKey,
715
+ backend: payload?.backend ?? 'cli',
716
+ agent_type: payload?.agentType ?? null,
717
+ agent_id: payload?.agentId ?? null,
718
+ duration_ms: payload?.durationMs ?? null,
719
+ // Finding 0.12-M4: persist the originating Agent tool_use_id so the
720
+ // documented subagent-start/subagent-done soak JOIN on
721
+ // $.tool_use_id matches (subagent-done's tool_use_id is recovered
722
+ // in cli-process.js from the paired Agent PreToolUse).
723
+ tool_use_id: payload?.toolUseId ?? null,
724
+ });
725
+ } catch (err) {
726
+ logger.error?.(`[${botName}] subagent-done handler: ${err.message}`);
727
+ }
728
+ },
729
+
625
730
  onInjectFail: (sessionKey, payload /* , entry */) => {
626
731
  try {
627
732
  const msgId = payload?.msgId;
@@ -28,6 +28,7 @@ const {
28
28
  getRetryAfterMs,
29
29
  } = require('./format');
30
30
  const { isSafeToRetry, redactBotToken } = require('../error/net');
31
+ const { coerceFileParams } = require('./input-file');
31
32
 
32
33
  // Topic deletion race: a user can delete a forum topic while a turn is in
33
34
  // flight, turning a valid `message_thread_id` into a 404. Telegram's error
@@ -112,6 +113,14 @@ async function send({ bot, method, params, db = null, meta = {}, logger = consol
112
113
  const chatId = params.chat_id != null ? String(params.chat_id) : null;
113
114
  const threadId = params.message_thread_id != null ? String(params.message_thread_id) : null;
114
115
 
116
+ // File-upload bug fix (2026-05-31): coerce a `{ source: '/abs/path' }`
117
+ // file param into a grammy InputFile so local-file uploads actually work.
118
+ // grammy doesn't recognize the bare envelope → it failed every send with
119
+ // "Wrong port number". Single choke point: fixes channels reply(files)
120
+ // AND the IPC send path at once. No-op for non-file methods / file_id /
121
+ // URL strings / existing InputFile instances.
122
+ coerceFileParams(method, params);
123
+
115
124
  // 0.7.4: empty-text short-circuit. Pre-fix, an empty params.text on
116
125
  // sendMessage/editMessageText reached Telegram and 400'd with
117
126
  // "message text is empty"; the row was marked failed and propagated
@@ -0,0 +1,76 @@
1
+ /**
2
+ * input-file — coerce file-upload params into grammy InputFile instances.
3
+ *
4
+ * The bug (2026-05-31, shumorobot Music): callers passed a Telegraf-style
5
+ * `{ source: '/abs/path' }` envelope as the file param (document/photo/…).
6
+ * grammy 1.x does NOT recognize that shape — it's not an InputFile, so
7
+ * grammy serializes it as a plain object and Telegram tries to read it as
8
+ * a URL/file_id, failing with "invalid file HTTP URL: Wrong port number".
9
+ * Result: file-send NEVER worked (channels reply(files) AND the IPC path
10
+ * both produced this exact error). The existing dispatcher test used a fake
11
+ * `send` and only asserted the METHOD, so it couldn't catch the bad shape.
12
+ *
13
+ * grammy uploads a local file only when the param is `new InputFile(path)`.
14
+ * This helper normalizes, at the single send choke point (tg()), the
15
+ * `{ source: <abs path> }` envelope → `new InputFile(path)`, leaving every
16
+ * other shape untouched:
17
+ * - string file_id / https URL → pass through (Telegram resolves)
18
+ * - existing InputFile instance → pass through (already correct)
19
+ * - Buffer / stream → pass through (grammy handles)
20
+ *
21
+ * Only the explicit `{ source: string }` envelope is transformed — bare
22
+ * path strings are intentionally NOT coerced (a Telegram file_id is also a
23
+ * bare string; coercing would break sends-by-id).
24
+ */
25
+
26
+ 'use strict';
27
+
28
+ const { InputFile } = require('grammy');
29
+
30
+ // method → the params field that carries the file.
31
+ const FILE_FIELD_BY_METHOD = {
32
+ sendPhoto: 'photo',
33
+ sendDocument: 'document',
34
+ sendAudio: 'audio',
35
+ sendVideo: 'video',
36
+ sendAnimation: 'animation',
37
+ sendVoice: 'voice',
38
+ sendVideoNote: 'video_note',
39
+ };
40
+
41
+ /**
42
+ * Return a grammy-uploadable value for a single file param, or the original
43
+ * value unchanged if it's not the `{ source }` envelope we coerce.
44
+ */
45
+ function coerceFileValue(val) {
46
+ if (val && typeof val === 'object' && !(val instanceof InputFile)
47
+ && typeof val.source === 'string' && val.source.length > 0) {
48
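+ // { source: '/abs/path', filename? } → new InputFile(path, filename). NOTE(review): grammy appears to treat a string argument as a local file path, so an 'https://…' source would likely NOT be fetched as a URL — confirm before relying on it.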
49
+ return new InputFile(val.source, val.filename);
50
+ }
51
+ return val;
52
+ }
53
+
54
+ /**
55
+ * Mutate `params` in place so its file field (if any) is grammy-uploadable.
56
+ * No-op for non-file methods and for params with no file field set.
57
+ *
58
+ * @param {string} method
59
+ * @param {object} params
60
+ * @returns {object} the same params object (for chaining)
61
+ */
62
+ function coerceFileParams(method, params) {
63
+ if (!params || typeof params !== 'object') return params;
64
+ const field = FILE_FIELD_BY_METHOD[method];
65
+ if (!field) return params;
66
+ if (params[field] != null) {
67
+ params[field] = coerceFileValue(params[field]);
68
+ }
69
+ return params;
70
+ }
71
+
72
+ module.exports = {
73
+ coerceFileParams,
74
+ coerceFileValue,
75
+ FILE_FIELD_BY_METHOD,
76
+ };
@@ -55,6 +55,11 @@ const STATES = {
55
55
  // mid-turn user message is buffered for the next PostToolBatch
56
56
  // injection.
57
57
  AUTOSTEERED: { label: 'autosteered', chain: ['✍', '👀'] },
58
+ // 0.12 (Findings L9/L14): distinct in-progress reaction for a running
59
+ // subagent (Agent PreToolUse → SubagentStop). Driven by onSubagentStart.
60
+ // Preferred 👾 (NOT 🤖 — 🤖 is REACTION_INVALID for bots, same class as
61
+ // the rc.37 🧐 bug); falls back to 🔥 then 🤔, all bot-usable.
62
+ SUBAGENT: { label: 'subagent', chain: ['👾', '🔥', '🤔'] },
58
63
  DONE: { label: 'done', chain: ['👍'] },
59
64
  ERROR: { label: 'error', chain: ['🤯', '🤔'] },
60
65
  STALL: { label: 'stall', chain: ['🥱', '🤔'] },
@@ -42,6 +42,7 @@
42
42
  const EventEmitter = require('events');
43
43
  const fs = require('fs');
44
44
  const path = require('path');
45
+ const { StringDecoder } = require('string_decoder');
45
46
 
46
47
  const DEFAULT_INTERVAL_MS = 100;
47
48
  // Slow safety-net poll when fs.watch is active. Catches any events
@@ -91,6 +92,13 @@ class LogTail extends EventEmitter {
91
92
  this.fs = fsOverride || fs;
92
93
  this._offset = 0;
93
94
  this._buf = '';
95
+ // L8: decode bytes through a StringDecoder so a multibyte UTF-8 char
96
+ // split across two read chunks (the 64KB DEFAULT_CHUNK_BYTES boundary)
97
+ // isn't corrupted into U+FFFD. The decoder holds an incomplete trailing
98
+ // sequence until the continuation bytes arrive on the next read. The
99
+ // hook ndjson carries large non-ASCII tool payloads, so this is
100
+ // load-bearing on the CliProcess observability path.
101
+ this._decoder = new StringDecoder('utf8');
94
102
  this._closed = false;
95
103
  this._timer = null;
96
104
  this._watcher = null;
@@ -260,7 +268,9 @@ class LogTail extends EventEmitter {
260
268
  const readSize = Math.min(remaining, buffer.length);
261
269
  const { bytesRead } = await fd.read(buffer, 0, readSize, this._offset + totalRead);
262
270
  if (bytesRead === 0) break;
263
- this._buf += buffer.slice(0, bytesRead).toString('utf8');
271
+ // L8: StringDecoder.write instead of per-chunk toString('utf8') so a
272
+ // multibyte char straddling the read boundary survives intact.
273
+ this._buf += this._decoder.write(buffer.subarray(0, bytesRead));
264
274
  totalRead += bytesRead;
265
275
  }
266
276
  this._offset += totalRead;
@@ -17,6 +17,19 @@
17
17
  * - if `readySignal` regex matches the captured pane content, resolve
18
18
  * - if `Date.now()` exceeds the deadline, throw with `err.code = timeoutCode`
19
19
  *
20
+ * Progress-aware (stall) deadline — `stallMs`:
21
+ * The blind wall-clock `deadlineMs` can't tell "claude is mid-download
22
+ * (24% progress bar, genuinely working)" from "claude is wedged". The
23
+ * shumorobot General incident (2026-05-30) killed a cold-spawn that was
24
+ * actively downloading the runtime. When `stallMs` is set, the gate
25
+ * tracks pane ACTIVITY: any change in captured pane content — or a
26
+ * trigger key being sent — resets a stall clock. The gate fails early
27
+ * (with `timeoutCode`) only after `stallMs` elapses with NO activity,
28
+ * i.e. the pane is frozen. `deadlineMs` remains an absolute backstop so
29
+ * a pane that animates forever but never reaches `readySignal` still
30
+ * terminates. When `stallMs` is omitted (default), behavior is the pure
31
+ * `deadlineMs` wall-clock exactly as before.
32
+ *
20
33
  * Each trigger is one-shot per gate run (tracked by `name` in a Set).
21
34
  *
22
35
  * Caller supplies:
@@ -40,7 +53,10 @@ const DEFAULT_SETTLE_MS = 500;
40
53
  * @param {string} opts.tmuxName — tmux session name to poll
41
54
  * @param {Array<{name:string, regex:RegExp, key:string}>} opts.triggers
42
55
  * @param {RegExp} opts.readySignal — match → resolve
43
- * @param {number} [opts.deadlineMs=30000]
56
+ * @param {number} [opts.deadlineMs=30000] — absolute backstop
57
+ * @param {number} [opts.stallMs] — if set, fail after this much
58
+ * wall-clock with NO pane activity, counted only once the pane has rendered non-blank content (progress-aware). Omit for pure
59
+ * wall-clock behavior.
44
60
  * @param {number} [opts.pollMs=300]
45
61
  * @param {number} [opts.settleMs=500]
46
62
  * @param {string} [opts.timeoutCode='TUI_STARTUP_TIMEOUT']
@@ -54,6 +70,7 @@ async function runStartupGate({
54
70
  triggers = [],
55
71
  readySignal,
56
72
  deadlineMs = DEFAULT_DEADLINE_MS,
73
+ stallMs,
57
74
  pollMs = DEFAULT_POLL_MS,
58
75
  settleMs = DEFAULT_SETTLE_MS,
59
76
  timeoutCode = 'TUI_STARTUP_TIMEOUT',
@@ -70,6 +87,7 @@ async function runStartupGate({
70
87
 
71
88
  const startedAt = Date.now();
72
89
  const deadline = startedAt + deadlineMs;
90
+ const stallEnabled = Number.isFinite(stallMs) && stallMs > 0;
73
91
  const seen = new Set();
74
92
  const matchedTriggers = [];
75
93
  // rc.4: remember the most recent successful pane snapshot. If the gate
@@ -78,8 +96,37 @@ async function runStartupGate({
78
96
  // this, "claude exits code 0 after dev-channels Enter" surfaces as a
79
97
  // 30-second `can't find pane` spam with no diagnostic about WHY.
80
98
  let lastPane = null;
99
+ // Progress-aware gate: timestamp of the last observed pane CHANGE (or
100
+ // trigger send). Only consulted when stallEnabled.
101
+ let lastActivityAt = startedAt;
102
+ // Music incident (2026-06-01): the stall timer must NOT arm while the pane
103
+ // is still BLANK. A blank-and-unchanging pane means claude hasn't started
104
+ // rendering yet (slow cold-start), NOT that it wedged — the TUI for some
105
+ // topics takes 30-45s to first-render. Arming the stall timer on a blank
106
+ // pane killed a legitimate slow spawn at stallMs with a false "wedged".
107
+ // So the stall clock only runs once the pane has shown non-whitespace
108
+ // content; before that, only the absolute `deadlineMs` governs.
109
+ let sawContent = false;
81
110
 
82
111
  while (Date.now() < deadline) {
112
+ // Stall check (progress-aware): the pane RENDERED something and has then
113
+ // been static for stallMs → genuinely wedged. Gated on sawContent so a
114
+ // blank cold-start isn't mistaken for a wedge. Fires early so a truly
115
+ // hung TUI fails fast, while an actively-progressing one (download bar,
116
+ // dialog navigation) keeps resetting lastActivityAt below.
117
+ if (stallEnabled && sawContent && Date.now() - lastActivityAt >= stallMs) {
118
+ const err = new Error(
119
+ `[${label}] startup gate: pane rendered then went static for ${stallMs}ms for ${tmuxName} ` +
120
+ `(matched: ${matchedTriggers.length ? matchedTriggers.join(', ') : 'none'}). ` +
121
+ `Appears wedged. Last pane content:\n` +
122
+ _formatPaneTail(lastPane),
123
+ );
124
+ err.code = timeoutCode;
125
+ err.lastPane = lastPane;
126
+ err.matchedTriggers = matchedTriggers;
127
+ err.reason = 'stall';
128
+ throw err;
129
+ }
83
130
  let pane;
84
131
  try {
85
132
  pane = await runner.captureWide(tmuxName);
@@ -107,6 +154,19 @@ async function runStartupGate({
107
154
  await new Promise(r => setTimeout(r, settleMs));
108
155
  continue;
109
156
  }
157
+ // First non-whitespace content = the TUI has started rendering. Only
158
+ // from here does the stall timer become meaningful (before this, a blank
159
+ // pane is cold-start, governed by the absolute deadline). Seed
160
+ // lastActivityAt at the moment content first appears so the stall window
161
+ // is measured from "rendered", not from spawn.
162
+ if (!sawContent && pane && pane.trim().length > 0) {
163
+ sawContent = true;
164
+ lastActivityAt = Date.now();
165
+ }
166
+ // Progress signal: any change in pane content is activity → reset the
167
+ // stall clock. A captureWide that returns the SAME bytes is NOT
168
+ // activity (a frozen download bar at 24% reads identically each poll).
169
+ if (pane !== lastPane) lastActivityAt = Date.now();
110
170
  lastPane = pane;
111
171
 
112
172
  // Walk triggers in declaration order — first match (and not yet seen) wins
@@ -122,6 +182,10 @@ async function runStartupGate({
122
182
  seen.add(trigger.name);
123
183
  matchedTriggers.push(trigger.name);
124
184
  matched = true;
185
+ // Sending a key is activity — navigating the TUI counts as progress
186
+ // even if the pre-transition pane text was static (e.g. a dialog we
187
+ // just answered). Reset the stall clock so we don't fail mid-nav.
188
+ lastActivityAt = Date.now();
125
189
  // Settle window so the TUI transitions out of the dialog before next poll
126
190
  await new Promise(r => setTimeout(r, settleMs));
127
191
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polygram",
3
- "version": "0.12.0-rc.1",
3
+ "version": "0.12.0-rc.10",
4
4
  "description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
5
5
  "main": "lib/ipc/client.js",
6
6
  "bin": {
package/polygram.js CHANGED
@@ -28,7 +28,7 @@ const {
28
28
  migrateJsonToDb, getClaudeSessionId, resolveSessionForSpawn,
29
29
  } = require('./lib/db/sessions');
30
30
  const { buildPrompt } = require('./lib/prompt');
31
- const { filterAttachments } = require('./lib/attachments');
31
+ const { filterAttachments, resolveFileCaps, MAX_TOTAL_BYTES } = require('./lib/attachments');
32
32
  // 0.9.0: SDK ProcessManager is the only pm. CLI pm
33
33
  // (lib/process-manager.js) deleted in commit 6.
34
34
  // Both implementations expose the same public API (constructor +
@@ -51,7 +51,6 @@ const { extractAssistantText } = require('./lib/process/sdk-process');
51
51
  const { createChannelsToolDispatcher } = require('./lib/process/channels-tool-dispatcher');
52
52
  const { createTmuxRunner } = require('./lib/tmux/tmux-runner');
53
53
  const { sweepTmuxOrphans } = require('./lib/tmux/orphan-sweep');
54
- const { PollScheduler } = require('./lib/tmux/poll-scheduler');
55
54
  // rc.42: autosteer-buffer module deleted. Native SDK priority push
56
55
  // (pm.injectUserMessage) replaces the buffer + PostToolBatch detour.
57
56
  const { createAutosteeredRefs } = require('./lib/autosteered-refs');
@@ -462,6 +461,10 @@ function buildSpawnContext(sessionKey) {
462
461
  threadId: threadId || null,
463
462
  label: getSessionLabel(chatConfig, threadId),
464
463
  existingSessionId,
464
+ // File-send outbound cap inputs: localApi (bot-level) so CliProcess can
465
+ // resolve the per-chat/topic outbound cap (resolveFileCaps) the same way
466
+ // it resolves cwd/agent. Override itself lives in chatConfig/topic.
467
+ localApi: !!config.bot?.apiRoot,
465
468
  };
466
469
  }
467
470
 
@@ -755,7 +758,19 @@ async function handleMessage(sessionKey, chatId, msg, bot) {
755
758
  const sessionCtx = !pm.has(sessionKey) ? await readSessionContext(sessionKey, chatConfig.cwd) : '';
756
759
 
757
760
  const rawAtts = extractAttachments(msg);
758
- const { accepted, rejected } = filterAttachments(rawAtts);
761
+ // Backend-derived inbound cap with per-topic/chat override. Cloud → 20MB;
762
+ // a local Bot API server (config.bot.apiRoot) → 2GB; override via
763
+ // chats[id].maxFileBytes or topics[t].maxFileBytes, clamped to the
764
+ // backend ceiling. Bytes-valued config; resolveFileCaps does the clamp.
765
+ const _inTopicCfg = getTopicConfig(chatConfig, threadIdStr || null);
766
+ const _fileCaps = resolveFileCaps({
767
+ localApi: !!config.bot?.apiRoot,
768
+ override: _inTopicCfg.maxFileBytes ?? chatConfig.maxFileBytes ?? null,
769
+ });
770
+ const { accepted, rejected } = filterAttachments(rawAtts, {
771
+ maxFileBytes: _fileCaps.inBytes,
772
+ maxTotalBytes: Math.max(_fileCaps.inBytes, MAX_TOTAL_BYTES),
773
+ });
759
774
  for (const { att, reason } of rejected) {
760
775
  console.log(`[${label}] attachment skipped: ${att.name} (${reason})`);
761
776
  logEvent('attachment-skipped', { chat_id: chatId, msg_id: msg.message_id, name: att.name, reason });
@@ -1673,9 +1688,23 @@ function shouldHandle(msg, chatConfig, botUsername) {
1673
1688
  }
1674
1689
 
1675
1690
  function createBot(token) {
1691
+ // Optional self-hosted Telegram Bot API server. When config.bot.apiRoot is
1692
+ // set (e.g. "http://localhost:8081" from a local `telegram-bot-api`
1693
+ // process), grammy routes all Bot API calls there instead of
1694
+ // api.telegram.org — which lifts file send/receive from cloud's 50 MB-out /
1695
+ // 20 MB-in to 2 GB both ways. Omit it (default) → cloud Telegram, unchanged.
1696
+ // The local server is a separate companion daemon; this is just the knob
1697
+ // that points polygram at it. See docs/0.12.0-file-send.md.
1698
+ const apiRoot = config.bot?.apiRoot;
1676
1699
  const bot = new Bot(token, {
1677
- client: { timeoutSeconds: 60 },
1700
+ client: {
1701
+ timeoutSeconds: 60,
1702
+ ...(apiRoot ? { apiRoot } : {}),
1703
+ },
1678
1704
  });
1705
+ if (apiRoot) {
1706
+ console.log(`[polygram] using local Telegram Bot API server: ${apiRoot} (2GB file limit)`);
1707
+ }
1679
1708
  let botUsername = '';
1680
1709
  // Cached once @botUsername is known — was recompiling per inbound msg.
1681
1710
  let mentionRe = null;
@@ -2244,19 +2273,13 @@ async function main() {
2244
2273
  const binCheck = verifyPinnedClaudeBin(CLAUDE_CLI_PINNED_VERSION);
2245
2274
  if (binCheck.ok) {
2246
2275
  console.log(
2247
- `[polygram] tmux backend pinned to claude CLI v${CLAUDE_CLI_PINNED_VERSION}: ${binCheck.path}`,
2276
+ `[polygram] CliProcess pinned to claude CLI v${CLAUDE_CLI_PINNED_VERSION}: ${binCheck.path}`,
2248
2277
  );
2249
2278
  pinnedClaudeBin = binCheck.path;
2250
2279
  } else {
2251
2280
  console.warn(`[polygram] WARNING: ${binCheck.reason}`);
2252
2281
  }
2253
2282
  }
2254
- // O1 optimization: shared poll-tick scheduler. N TmuxProcess
2255
- // instances share ONE setInterval instead of spawning N independent
2256
- // setTimeout chains. Idle when no chats are in flight (zero timers
2257
- // running). Configurable via config.bot.tmuxPollIntervalMs.
2258
- const tmuxPollIntervalMs = config.bot?.tmuxPollIntervalMs || 250;
2259
- const pollScheduler = new PollScheduler({ intervalMs: tmuxPollIntervalMs });
2260
2283
  // 0.11.0: channels backend wiring. Used when a chat opts in via
2261
2284
  // `pm: 'channels'` config. Falls back to SDK gracefully if the pinned
2262
2285
  // claude binary isn't present (see factory.js — channelsClaudeBin
@@ -2282,7 +2305,6 @@ async function main() {
2282
2305
  logger: console,
2283
2306
  tmuxRunner,
2284
2307
  botName: BOT_NAME,
2285
- pollScheduler,
2286
2308
  // channels backend
2287
2309
  toolDispatcher: channelsToolDispatcher,
2288
2310
  channelsClaudeBin,