polygram 0.12.0-rc.2 → 0.12.0-rc.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -48,6 +48,13 @@ const { Process, UnsupportedOperationError } = require('./process');
48
48
  const { ChannelsBridgeServer } = require('./channels-bridge-server');
49
49
  const { writeHookFiles, removeHookFiles } = require('./hook-settings');
50
50
  const { createHookTail } = require('./hook-event-tail');
51
+ // File-send staging: reuse the dispatcher's allowlist root so the dir we
52
+ // create exactly matches the realpath the validator accepts (no /tmp vs
53
+ // /private/tmp drift — one of the original Music-topic failures).
54
+ const { DEFAULT_ATTACHMENT_BASE } = require('./channels-tool-dispatcher');
55
+ const { resolveFileCaps } = require('../attachments');
56
+ const { resolveCompactionWarnConfig } = require('../compaction-warn');
57
+ const { readContextTokens, contextPct } = require('../context-usage');
51
58
  const { runStartupGate } = require('../tmux/startup-gate');
52
59
  const { POLYGRAM_DISPLAY_HINT } = require('../telegram/display-hint');
53
60
 
@@ -113,6 +120,17 @@ const STREAMING_HINT_RE = /esc to interrupt/i;
113
120
  // — false positives surface as no-op telemetry, false negatives surface
114
121
  // as the idle-ceiling timeout (~10min).
115
122
  const UNKNOWN_PROMPT_HEURISTIC_RE = /(\?\s*$|\(y\/N\)|Yes\/No|❯\s|^\s*[12345]\.\s)/im;
123
+ // rc.14: a previous rc (rc.11) had a BRIDGE_DEAD_RE here that matched the pane
124
+ // line "server:polygram-bridge no MCP server configured with that name" and
125
+ // treated it as a dead bridge to recover from. That was a MISDIAGNOSIS: this
126
+ // line is a BENIGN, persistent banner that `--dangerously-load-development-
127
+ // channels` + `--strict-mcp-config` prints on EVERY healthy session — the
128
+ // channel still delivers messages and the reply tool still works (reproduced
129
+ // 2026-06-01 with a test MCP server that demonstrably functions). The pane
130
+ // matcher therefore false-fired ~5s into every channels turn and KILLED
131
+ // healthy sessions (the Music-topic "mid-turn detach" regression). Real bridge
132
+ // loss is caught by the socket-close path (bridgeServer 'bridge-disconnected'
133
+ // → _handleBridgeDisconnected). There is no reliable pane signal — removed.
116
134
  // Per-pattern rate limit so a dialog that lingers across multiple polls
117
135
  // doesn't spam sendControl/event emissions. Aligned with the 5s poll cadence.
118
136
  const MID_TURN_DEDUP_WINDOW_MS = 30_000;
@@ -251,6 +269,10 @@ class CliProcess extends Process {
251
269
  // pending turn(s): turn_id → { resolve, reject, replies: [], quietTimer, hardTimer, startedAt }
252
270
  this.pendingTurns = new Map();
253
271
 
272
+ // File-send outbound cap (bot → user). Safe cloud default; overwritten in
273
+ // _spawnTmuxClaude with the backend/chat-resolved value before any turn.
274
+ this.maxOutboundFileBytes = resolveFileCaps({ localApi: false }).outBytes;
275
+
254
276
  // P1 security (review #8): track resolved permission request_ids so a
255
277
  // double-fire of respond() can't write a second perm_verdict for the same
256
278
  // request. TmuxProcess gates on _pendingApprovalId; this is the channels
@@ -297,6 +319,23 @@ class CliProcess extends Process {
297
319
  // permit files under the agent's workspace.
298
320
  this.sessionCwd = opts.cwd || null;
299
321
 
322
+ // File-send staging dir (2026-06 file-send feature). The dispatcher
323
+ // allowlist always permits <DEFAULT_ATTACHMENT_BASE>/<sessionKey>/, but
324
+ // nothing ever CREATED it — so claude's reply(files) attempts at
325
+ // /tmp/polygram-attachments failed (dir absent / realpath mismatch) and
326
+ // it flailed across other paths. Create it here and surface it to the
327
+ // prompt so claude has one blessed, always-allowed place to stage a file
328
+ // before sending. realpathSync so the stored path matches what the
329
+ // validator resolves (the /tmp ↔ /private/tmp fix).
330
+ try {
331
+ const dir = path.join(DEFAULT_ATTACHMENT_BASE, String(this.sessionKey));
332
+ fs.mkdirSync(dir, { recursive: true, mode: 0o700 });
333
+ this.attachmentStagingDir = fs.realpathSync(dir);
334
+ } catch (err) {
335
+ this.attachmentStagingDir = null;
336
+ this.logger.warn?.(`[${this.label}] channels: staging dir create failed: ${err.message}`);
337
+ }
338
+
300
339
  // Opaque random token for socket filename — do NOT leak sessionKey to /tmp.
301
340
  const socketToken = crypto.randomBytes(16).toString('hex');
302
341
  this.sockPath = path.join(os.tmpdir(), `polygram-${socketToken}.sock`);
@@ -416,28 +455,7 @@ class CliProcess extends Process {
416
455
 
417
456
  this.bridgeServer.on('bridge-message', msg => this._handleBridgeMessage(msg));
418
457
 
419
- this.bridgeServer.on('bridge-disconnected', () => {
420
- this.bridgeReady = false;
421
- this.mcpReady = false;
422
- if (!this.closed) {
423
- this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly`);
424
- // P1 #5: drain pendingTurns immediately so hardTimers don't run 10min.
425
- for (const [, pending] of this.pendingTurns) {
426
- if (pending.quietTimer) clearTimeout(pending.quietTimer);
427
- if (pending.hardTimer) clearTimeout(pending.hardTimer);
428
- if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
429
- if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
430
- const err = new Error('bridge disconnected');
431
- err.code = 'BRIDGE_DISCONNECTED';
432
- try { pending.reject(err); } catch {}
433
- }
434
- this.pendingTurns.clear();
435
- this.pendingQueue.length = 0;
436
- this.inFlight = false;
437
- this.emit('bridge-disconnected');
438
- this._logEvent('bridge-disconnected', { reason: 'socket-close' });
439
- }
440
- });
458
+ this.bridgeServer.on('bridge-disconnected', () => this._handleBridgeDisconnected());
441
459
 
442
460
  await this.bridgeServer.listen();
443
461
  }
@@ -493,6 +511,26 @@ class CliProcess extends Process {
493
511
  const effort = topicConfig?.effort || opts.chatConfig?.effort || opts.effort;
494
512
  const resolvedCwd = topicConfig?.cwd || opts.chatConfig?.cwd || opts.cwd;
495
513
 
514
+ // File-send outbound cap (bot → user). Backend-derived (cloud 50MB vs
515
+ // local Bot API server 2GB via opts.localApi) with per-topic/chat
516
+ // maxFileBytes override, clamped to the backend ceiling. Stored for the
517
+ // dispatcher (live size-check) and the system prompt (so claude states
518
+ // the right limit). Resolved here so it follows the same topic→chat
519
+ // precedence as cwd/agent above.
520
+ const _capOverride = topicConfig?.maxFileBytes ?? opts.chatConfig?.maxFileBytes ?? null;
521
+ this.maxOutboundFileBytes = resolveFileCaps({
522
+ localApi: !!opts.localApi,
523
+ override: _capOverride,
524
+ }).outBytes;
525
+
526
+ // 0.12.0-rc.13: per-chat/topic compaction warning (default OFF). Same
527
+ // topic→chat precedence as the file cap above. When enabled, the channels
528
+ // backend warns the chat as context fills (propose /compact at a break)
529
+ // and on auto-compaction (the event that detaches the bridge mid-turn).
530
+ const _compactionWarnRaw = topicConfig?.compactionWarnings ?? opts.chatConfig?.compactionWarnings;
531
+ this.compactionWarn = resolveCompactionWarnConfig({ compactionWarnings: _compactionWarnRaw });
532
+ this._compactionWarned = false; // proactive warn-once per climb; reset on PostCompact
533
+
496
534
  // Parity audit P8 + rc.8 fs-guard (2026-05-26 shumorobot Music topic):
497
535
  // `--session-id <id>` creates a NEW claude session with that id;
498
536
  // `--resume <id>` resumes the EXISTING conversation. Lazy-respawn after
@@ -540,6 +578,9 @@ class CliProcess extends Process {
540
578
  );
541
579
  }
542
580
  }
581
+ // Finding 0.12-M2: record the resume decision so _armHookTail (run
582
+ // after spawn) skips the prior session's still-on-disk hook ndjson.
583
+ this._resumedSession = canResume;
543
584
  if (agent) claudeArgs.push('--agent', agent);
544
585
  if (model) claudeArgs.unshift('--model', model);
545
586
  if (effort) claudeArgs.push('--effort', effort);
@@ -616,6 +657,28 @@ class CliProcess extends Process {
616
657
  'Internal tool calls (Bash, Edit, Write, Read, etc.) are fine to use',
617
658
  'as normal — only the FINAL user-visible message needs to go through',
618
659
  'the reply tool.',
660
+ '',
661
+ '### Sending FILES (tracks, images, docs) to the user',
662
+ '',
663
+ 'The `mcp__polygram-bridge__reply` tool takes an optional `files` array of',
664
+ 'absolute paths. This is the ONLY way to send a file. Do NOT use Bash,',
665
+ 'curl, the Telegram Bot API, or polygram-ipc to send files — those fail.',
666
+ '',
667
+ ...(this.attachmentStagingDir ? [
668
+ `To send a file: COPY it into the staging dir \`${this.attachmentStagingDir}\`,`,
669
+ 'then call reply with its absolute path, e.g.:',
670
+ ` reply(chat_id="<id>", text="Here's the track", files=["${this.attachmentStagingDir}/track.flac"])`,
671
+ 'polygram auto-deletes staged files after the turn — you do not need to clean up.',
672
+ 'You may also send directly from the agent workspace (cwd); other paths are rejected.',
673
+ ] : [
674
+ 'Copy the file somewhere under your workspace (cwd) and pass its absolute',
675
+ 'path in `files`. Paths outside the workspace are rejected for safety.',
676
+ ]),
677
+ '',
678
+ `Max file size for sending: ${Math.round(this.maxOutboundFileBytes / (1024 * 1024))} MB. ` +
679
+ 'For larger lossless audio, convert to FLAC/MP3 under the limit first, ' +
680
+ 'or tell the user it exceeds the limit. Images go as photos; everything ' +
681
+ 'else as documents.',
619
682
  ].join('\n'));
620
683
 
621
684
  // Parity audit P6: honor isolateUserConfig — mirrors tmux pattern at
@@ -705,6 +768,20 @@ class CliProcess extends Process {
705
768
  ],
706
769
  readySignal: /Listening for channel messages from: server:polygram-bridge/i,
707
770
  timeoutCode: 'CHANNELS_DIALOG_TIMEOUT',
771
+ // Progress-aware gate (shumorobot General incident 2026-05-30): a
772
+ // cold spawn that's mid-download (runtime fetch, "24%" progress bar)
773
+ // is genuinely working and must NOT be killed by the blind 30s
774
+ // wall-clock. stallMs fails fast only when the pane is FROZEN; an
775
+ // actively-changing pane (download bar, dialog nav) keeps resetting
776
+ // the stall clock and rides out to the ready signal. deadlineMs stays
777
+ // the absolute backstop. stallMs of zero pane activity = genuinely wedged.
778
+ // Stall = pane rendered then went static (genuinely wedged). 60s, not
779
+ // 30s: some topics' TUIs cold-render slowly (Music ~45s, slow MCP
780
+ // startup) — 30s was too tight and false-aborted them. Blank panes
781
+ // don't arm the stall timer at all now (see runStartupGate), so this
782
+ // only bounds a TUI that rendered and then truly hung.
783
+ stallMs: this.startupGateStallMs ?? 60_000,
784
+ deadlineMs: this.startupGateDeadlineMs ?? 180_000,
708
785
  logger: this.logger,
709
786
  label: `${this.label}:startup-gate`,
710
787
  });
@@ -849,15 +926,18 @@ class CliProcess extends Process {
849
926
  // rate-limit / chat-id-mismatch path. Live shumorobot 2026-05-26 23:44
850
927
  // observed 3+ "Called polygram-bridge" entries in the TUI pane with
851
928
  // ZERO OUT messages delivered to TG and zero warn-level diagnostics —
852
- // need to see args.text / args.chat_id / args.turn_id to know whether
853
- // claude is calling reply with empty text, wrong chat_id, or something
854
- // else entirely.
855
- this.logger.warn?.(
929
+ // need to see args.chat_id / args.turn_id to know whether claude is
930
+ // calling reply with empty text, wrong chat_id, or something else.
931
+ // L13: root-caused — demoted to debug and DROPPED text_head. Logging
932
+ // the first 80 chars of every reply at warn level leaked private chat
933
+ // content / file excerpts / secrets into the default log sink,
934
+ // unconditionally. name/chat_id/turn_id/text_len diagnose dispatch
935
+ // without exposing message content.
936
+ this.logger.debug?.(
856
937
  `[${this.label}] channels: tool-call name=${msg.name} ` +
857
938
  `chat_id=${JSON.stringify(args.chat_id)} ` +
858
939
  `turn_id=${JSON.stringify(args.turn_id)} ` +
859
- `text_len=${typeof args.text === 'string' ? args.text.length : 'non-string'} ` +
860
- `text_head=${JSON.stringify((args.text || '').slice(0, 80))}`,
940
+ `text_len=${typeof args.text === 'string' ? args.text.length : 'non-string'}`,
861
941
  );
862
942
 
863
943
  // Review P1 #7: idempotency. If we've already ACK'd this tool_call_id,
@@ -948,6 +1028,7 @@ class CliProcess extends Process {
948
1028
  text: args.text,
949
1029
  files: args.files,
950
1030
  sessionCwd: this.sessionCwd, // P0 #2: dispatcher uses this to allowlist file roots
1031
+ maxOutboundFileBytes: this.maxOutboundFileBytes, // backend/chat-derived upload cap
951
1032
  });
952
1033
  } catch (err) {
953
1034
  this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: false, error: err.message });
@@ -1122,13 +1203,27 @@ class CliProcess extends Process {
1122
1203
  this._finalizeTurn(turnId);
1123
1204
  };
1124
1205
  const onStop = (info) => {
1125
- // Capture the fallback text; the actual finalize call below will pick
1126
- // it up via pending._stopHookData.
1206
+ // Finding 0.12-M1: the Stop hook carries NO turn_id, and a single
1207
+ // global 'stop-hook' emission fires EVERY per-turn onStop listener.
1208
+ // When more than one turn is in stop-grace we cannot attribute this
1209
+ // Stop (or its last_assistant_message) to a specific turn — the
1210
+ // pre-fix code let one Stop finalize all grace-pending turns and
1211
+ // cross-attribute one turn's text to another (the exact class the
1212
+ // F#3 reply routing prevents). Mirror that drop-rather-than-
1213
+ // misattribute discipline: only consume the Stop when exactly ONE
1214
+ // turn is in grace; otherwise ignore it and let each turn finalize
1215
+ // on its own grace timer (each keeps its own reply text).
1216
+ let graceCount = 0;
1217
+ for (const p of this.pendingTurns.values()) if (p._stopGracePending) graceCount++;
1218
+ if (graceCount !== 1) return;
1127
1219
  pending._stopHookData = info;
1128
1220
  clearTimeout(pending._stopGraceTimer);
1129
1221
  pending._stopGraceTimer = null;
1130
1222
  finalize();
1131
1223
  };
1224
+ // L5: stash the closure so teardown paths that bypass Process.kill()'s
1225
+ // removeAllListeners (bridge-disconnect drain, resetSession) can off it.
1226
+ pending._onStop = onStop;
1132
1227
  pending._stopGraceTimer = setTimeout(finalize, this.stopGraceMs);
1133
1228
  // unref so a never-fired grace doesn't pin the event loop. In tests
1134
1229
  // where a CliProcess is created, send() is called, then the test
@@ -1154,6 +1249,7 @@ class CliProcess extends Process {
1154
1249
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1155
1250
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1156
1251
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1252
+ const hadReplyToolCalls = pending.replies.length > 0;
1157
1253
  let text = pending.replies.join('\n\n');
1158
1254
  // 0.12 Phase 1.7 fallback: if no reply tool calls landed (claude ended
1159
1255
  // the turn without calling mcp__polygram-bridge__reply), use the Stop
@@ -1171,12 +1267,14 @@ class CliProcess extends Process {
1171
1267
  // to appear free in dashboards.
1172
1268
  const result = {
1173
1269
  text,
1174
- // Review F#2: dispatcher has ALREADY delivered text to Telegram on each
1175
- // reply tool call (incremental real-time UX is the channels delivery
1176
- // model). polygram.js's post-pm.send pipeline must short-circuit its
1177
- // streamer.finalize / deliverReplies branch otherwise every turn
1178
- // delivers twice. Logging + DB transcript still use result.text.
1179
- alreadyDelivered: true,
1270
+ // Review F#2: when claude used reply tool calls, the dispatcher ALREADY
1271
+ // delivered that text to Telegram incrementally polygram.js must
1272
+ // short-circuit its deliverReplies branch or every turn delivers twice.
1273
+ // BUT a turn finalized via the Stop fallback (no reply tool calls — the
1274
+ // stuck-turn case) has delivered NOTHING; marking it alreadyDelivered
1275
+ // would resolve the turn silently and the user still sees nothing. So
1276
+ // only claim already-delivered when reply tool calls actually fired.
1277
+ alreadyDelivered: hadReplyToolCalls,
1180
1278
  sessionId: this.claudeSessionId,
1181
1279
  cost: null, // Channels protocol doesn't expose per-turn cost
1182
1280
  duration,
@@ -1195,6 +1293,27 @@ class CliProcess extends Process {
1195
1293
  pending.resolve(result);
1196
1294
  this.emit('result', { subtype: 'success' }, { streamText: text });
1197
1295
  this.emit('idle');
1296
+ // File-send staging auto-purge (by design — no "claude must delete").
1297
+ // Once the LAST turn settles, wipe the staging dir's contents so files
1298
+ // claude copied in to send don't accumulate on disk across turns. Only
1299
+ // when fully idle, so a file staged for a still-pending concurrent turn
1300
+ // isn't yanked mid-send.
1301
+ if (this.pendingTurns.size === 0) this._purgeStagingDir();
1302
+ }
1303
+
1304
+ /**
1305
+ * Empty the per-session file-send staging dir (keep the dir itself).
1306
+ * Best-effort; never throws. Called when the session goes idle and on kill.
1307
+ */
1308
+ _purgeStagingDir() {
1309
+ if (!this.attachmentStagingDir) return;
1310
+ let entries;
1311
+ try { entries = fs.readdirSync(this.attachmentStagingDir); }
1312
+ catch { return; }
1313
+ for (const name of entries) {
1314
+ try { fs.rmSync(path.join(this.attachmentStagingDir, name), { recursive: true, force: true }); }
1315
+ catch { /* best-effort */ }
1316
+ }
1198
1317
  }
1199
1318
 
1200
1319
  // ─── public Process API ──────────────────────────────────────────
@@ -1386,6 +1505,63 @@ class CliProcess extends Process {
1386
1505
  this._interruptGraceTimer.unref?.();
1387
1506
  }
1388
1507
 
1508
+ /**
1509
+ * Is claude actually still working, regardless of the resolved-turn flag?
1510
+ *
1511
+ * "Stop" incident (shumorobot Music, 2026-05-31 13:08): the channels
1512
+ * backend resolves a turn on the quiet-window after claude's last reply
1513
+ * tool call (inFlight → false), but claude can keep working afterwards
1514
+ * (a subagent, a long Bash). The abort handler keyed its ack on inFlight
1515
+ * alone, so "Stop" said "Nothing to stop" one second after the bot said
1516
+ * "On it — downloading…" while a subagent churned.
1517
+ *
1518
+ * The TUI prints "esc to interrupt" (STREAMING_HINT_RE) continuously
1519
+ * whenever claude is busy — capture-pane is the truthful signal, the
1520
+ * channels analog of the (deleted) tmux hasBackgroundShell() probe.
1521
+ *
1522
+ * Returns a STRUCTURED probe (not just a boolean) so the abort path can
1523
+ * log the raw signals — pane tail + flags — to the events DB. That lets
1524
+ * us later characterize which states the heuristic gets right/wrong and
1525
+ * refine it (e.g. add signals beyond the esc-hint) without guessing.
1526
+ *
1527
+ * Never throws — a failed capture returns captured:false, busy:false.
1528
+ *
1529
+ * @returns {Promise<{busy:boolean, streaming:boolean, inFlight:boolean,
1530
+ * pendingTurns:number, captured:boolean, paneTail:(string|null)}>}
1531
+ */
1532
+ async probeBusyState() {
1533
+ const base = {
1534
+ busy: false, streaming: false,
1535
+ inFlight: this.inFlight, pendingTurns: this.pendingTurns.size,
1536
+ captured: false, paneTail: null,
1537
+ };
1538
+ if (this.closed || !this.tmuxSession || typeof this.runner?.captureWide !== 'function') {
1539
+ return base;
1540
+ }
1541
+ let pane;
1542
+ try {
1543
+ pane = await this.runner.captureWide(this.tmuxSession);
1544
+ } catch (err) {
1545
+ this.logger.warn?.(`[${this.label}] channels: probeBusyState captureWide failed: ${err.message}`);
1546
+ return base;
1547
+ }
1548
+ if (!pane) return base;
1549
+ const streaming = STREAMING_HINT_RE.test(pane);
1550
+ return {
1551
+ ...base,
1552
+ busy: streaming,
1553
+ streaming,
1554
+ captured: true,
1555
+ paneTail: pane.slice(-200),
1556
+ };
1557
+ }
1558
+
1559
+ /** Boolean shorthand for probeBusyState().busy (abort-path convenience). */
1560
+ async isBusy() {
1561
+ const { busy } = await this.probeBusyState();
1562
+ return busy;
1563
+ }
1564
+
1389
1565
  async kill(reason = 'kill') {
1390
1566
  if (this.closed) return;
1391
1567
  // Parity P19: re-entry guard for concurrent kill() calls. Mirrors
@@ -1415,17 +1591,18 @@ class CliProcess extends Process {
1415
1591
  this.logger.warn?.(`[${this.label}] _armHookTail: _hookNdjsonPath unset; hooks disabled. Phase 1.2 may have failed.`);
1416
1592
  return;
1417
1593
  }
1418
- // Fresh spawn: ndjson was just touched by writeHookFiles and is empty,
1419
- // so `skipExisting: false` (default) is correct. For lazy-respawn on
1420
- // existingSessionId, we currently re-run writeHookFiles which touches
1421
- // a NEW file with the same name (overwrite). If we ever switch to
1422
- // resume-without-touch, set skipExisting: true to avoid replaying
1423
- // stale events from the prior process same pattern tmux uses on
1424
- // --resume per rc.42 #5.
1594
+ // Finding 0.12-M2: writeHookFiles opens the ndjson in APPEND mode
1595
+ // ('a') and never truncates, so on a --resume respawn the prior
1596
+ // session's hook lines are still on disk under the same path. Replaying
1597
+ // them re-drives the turn state machine from stale Stop/PreToolUse
1598
+ // events (a stale Stop can finalize the fresh turn). So skip existing
1599
+ // content when (and only when) this is a resumed session — the same
1600
+ // discipline the JSONL tail uses on --resume. A fresh spawn's ndjson is
1601
+ // empty, so skipExisting:false is correct there.
1425
1602
  this._hookTail = createHookTail({
1426
1603
  path: this._hookNdjsonPath,
1427
1604
  logger: this.logger,
1428
- skipExisting: false,
1605
+ skipExisting: this._resumedSession === true,
1429
1606
  });
1430
1607
  this._hookTail.on('event', (ev) => {
1431
1608
  try {
@@ -1458,6 +1635,22 @@ class CliProcess extends Process {
1458
1635
  _handleHookEvent(ev) {
1459
1636
  if (!ev || typeof ev !== 'object') return;
1460
1637
 
1638
+ // rc.16 observability: emit once when the FIRST hook event arrives for
1639
+ // this session, confirming the claude→ndjson→tail pipeline is actually
1640
+ // flowing. The 2026-06-02 stuck turn had a session whose hook ndjson was
1641
+ // 0 bytes — claude emitted no hooks polygram could see, so no Stop ever
1642
+ // arrived to finalize the turn. Without this signal that's invisible: a
1643
+ // turn that hangs with NO `cli-hook-stream-live` for its session means the
1644
+ // hook pipeline is dead for it (distinct from "Stop fired but wasn't
1645
+ // acted on", which `cli-turn-resolved-by-stop` now covers).
1646
+ if (!this._sawHookStream) {
1647
+ this._sawHookStream = true;
1648
+ this._logEvent('cli-hook-stream-live', {
1649
+ session_id: this.claudeSessionId,
1650
+ first_event: ev.type,
1651
+ });
1652
+ }
1653
+
1461
1654
  // 0.12 Phase 1.8 (Finding 0.4.A): per-event lag measurement.
1462
1655
  // polygram_received_at_ms is stamped by the helper subprocess at write
1463
1656
  // time; subtracting from Date.now() gives the helper-write → tail-emit
@@ -1465,25 +1658,18 @@ class CliProcess extends Process {
1465
1658
  // gates tag-out on median < 2s and p99 < 5s across the events DB.
1466
1659
  if (Number.isFinite(ev.receivedAtMs)) {
1467
1660
  const lagMs = Date.now() - ev.receivedAtMs;
1661
+ // L10: emit ONLY — the onHookLagSample callback owns the DB write
1662
+ // (CALLBACK_TO_EVENT → callbacks.js). Previously this ALSO wrote
1663
+ // directly via this.db.logEvent, double-persisting every sample and
1664
+ // inflating the Phase 1.8 soak-gate row count. Consistent with how
1665
+ // tool-result / subagent-start / subagent-done are handled (emit,
1666
+ // don't double-write).
1468
1667
  this.emit('hook-lag-sample', {
1469
1668
  hookEventName: ev.type,
1470
1669
  lagMs,
1471
1670
  toolName: ev.toolName || null,
1472
1671
  backend: this.backend,
1473
1672
  });
1474
- // Log to events DB if wired. db is optional (factory injects when
1475
- // available) — same pattern as the other parity-P1 _logEvent calls.
1476
- if (this.db?.logEvent) {
1477
- try {
1478
- this.db.logEvent('hook-lag-sample', {
1479
- session_key: this.sessionKey,
1480
- backend: this.backend,
1481
- hook_event_name: ev.type,
1482
- tool_name: ev.toolName || null,
1483
- lag_ms: lagMs,
1484
- });
1485
- } catch {}
1486
- }
1487
1673
  }
1488
1674
 
1489
1675
  switch (ev.type) {
@@ -1503,6 +1689,16 @@ class CliProcess extends Process {
1503
1689
  const subagentType = ev.toolInput?.subagent_type
1504
1690
  || ev.toolInput?.agent_type
1505
1691
  || 'general-purpose';
1692
+ // Finding 0.12-M4: SubagentStop carries agent_id/agent_type but
1693
+ // NOT the originating Agent tool_use_id, so without help the
1694
+ // subagent-start/subagent-done rows share no JOIN key (the
1695
+ // documented soak query on $.tool_use_id returns zero rows).
1696
+ // Track the in-flight Agent tool_use_id keyed by subagent type so
1697
+ // the paired SubagentStop below can stamp it onto subagent-done.
1698
+ (this._pendingSubagentStarts ||= []).push({
1699
+ agentType: subagentType,
1700
+ toolUseId: ev.toolUseId,
1701
+ });
1506
1702
  this.emit('subagent-start', {
1507
1703
  agentType: subagentType,
1508
1704
  // PreToolUse for Agent carries no agent_id (set later on
@@ -1541,24 +1737,102 @@ class CliProcess extends Process {
1541
1737
  });
1542
1738
  return;
1543
1739
 
1544
- case 'SubagentStop':
1740
+ case 'SubagentStop': {
1741
+ // Finding 0.12-M4: recover the originating Agent tool_use_id so the
1742
+ // subagent-start/subagent-done pair is JOINable. Prefer a match on
1743
+ // agent type (correct for parallel subagents of different types);
1744
+ // fall back to the oldest pending start when types don't line up.
1745
+ let subagentToolUseId = null;
1746
+ const pendingStarts = this._pendingSubagentStarts;
1747
+ if (pendingStarts && pendingStarts.length) {
1748
+ let idx = pendingStarts.findIndex(s => s.agentType === ev.agentType);
1749
+ if (idx < 0) idx = 0;
1750
+ subagentToolUseId = pendingStarts.splice(idx, 1)[0]?.toolUseId ?? null;
1751
+ }
1545
1752
  this.emit('subagent-done', {
1546
1753
  agentType: ev.agentType,
1547
1754
  agentId: ev.agentId,
1548
1755
  durationMs: ev.durationMs,
1756
+ toolUseId: subagentToolUseId,
1549
1757
  backend: this.backend,
1550
1758
  });
1551
1759
  return;
1760
+ }
1552
1761
 
1553
- case 'Stop':
1554
- // Phase 1.7 (TODO) will use this as the authoritative turn-end
1555
- // signal with stopGraceMs. For now: pass through as 'stop-hook'
1556
- // event so the resolver in Phase 1.7 can subscribe.
1557
- this.emit('stop-hook', {
1762
+ case 'Stop': {
1763
+ // 0.12.0 Phase 1.7 (rc.16): Stop is the AUTHORITATIVE turn-end signal.
1764
+ const info = {
1558
1765
  stopHookActive: ev.stopHookActive,
1559
1766
  lastAssistantMessage: ev.lastAssistantMessage,
1560
1767
  backend: this.backend,
1561
- });
1768
+ };
1769
+ // Turns already resolving via a reply quiet-window consume this via
1770
+ // their per-turn onStop listener (the text-fallback rescue inside
1771
+ // _resolveTurn). Emit first so that path runs synchronously and any
1772
+ // grace-pending turn is finalized + removed before the check below.
1773
+ this.emit('stop-hook', info);
1774
+
1775
+ // THE FIX (2026-06-02 stuck-turn): a turn that ended WITHOUT a reply
1776
+ // tool call has no quiet-window to fire _resolveTurn — pre-fix it hung
1777
+ // until the 30-min wall-clock backstop while the unknown-prompt
1778
+ // watchdog spun. Stop IS the turn-end; resolve the single in-flight
1779
+ // turn now (reply text if any, else last_assistant_message). After the
1780
+ // emit above, a grace-pending turn is already gone, so this only fires
1781
+ // for the no-reply case. Gated on exactly one in-flight turn — Stop
1782
+ // carries no turn_id, so we cannot attribute it when turns are
1783
+ // concurrent (the M1 cross-attribution hazard).
1784
+ if (this.pendingTurns.size === 1) {
1785
+ const [turnId, p] = [...this.pendingTurns.entries()][0];
1786
+ if (!p._stopGracePending) {
1787
+ p._stopHookData = info;
1788
+ this._logEvent('cli-turn-resolved-by-stop', {
1789
+ turn_id: turnId,
1790
+ reply_count: p.replies?.length || 0,
1791
+ via_text_fallback: (p.replies?.length || 0) === 0,
1792
+ session_id: this.claudeSessionId,
1793
+ });
1794
+ this._finalizeTurn(turnId);
1795
+ }
1796
+ } else if (this.pendingTurns.size > 1) {
1797
+ // Can't attribute Stop to one of several concurrent turns — surface
1798
+ // it so a turn that waited for its grace timer (instead of resolving
1799
+ // on Stop) is explained in the events DB.
1800
+ this._logEvent('cli-stop-unattributed', { pending_count: this.pendingTurns.size });
1801
+ }
1802
+
1803
+ // 0.12.0-rc.13 proactive compaction warning: on turn-end, if enabled
1804
+ // for this chat and not already warned this climb, sample context
1805
+ // occupancy from the transcript and warn (propose /compact) BEFORE
1806
+ // claude auto-compacts mid-turn and detaches the bridge. Fire-and-
1807
+ // forget — transcript IO must never block the stop path.
1808
+ if (this.compactionWarn?.enabled && !this._compactionWarned && ev.transcriptPath) {
1809
+ this._maybeProactiveCompactionWarn(ev.transcriptPath);
1810
+ }
1811
+ return;
1812
+ }
1813
+
1814
+ case 'PreCompact':
1815
+ // 0.12.0-rc.13: auto-compaction is the event that detaches the
1816
+ // channels MCP bridge mid-turn. Record it; and on the dangerous AUTO
1817
+ // case (manual /compact is the user's own deliberate action — never
1818
+ // nag), emit a reactive warning the chat layer posts. The proactive
1819
+ // warning (on Stop) tries to PREVENT this; this is the backstop.
1820
+ this._logEvent('cli-compaction-imminent', { trigger: ev.trigger });
1821
+ if (this.compactionWarn?.enabled && ev.trigger === 'auto') {
1822
+ this.emit('compaction-warn', {
1823
+ kind: 'reactive',
1824
+ trigger: 'auto',
1825
+ sessionId: this.claudeSessionId,
1826
+ backend: this.backend,
1827
+ });
1828
+ }
1829
+ return;
1830
+
1831
+ case 'PostCompact':
1832
+ // Context just dropped — re-arm the proactive warn-once so the next
1833
+ // climb can warn again.
1834
+ this._compactionWarned = false;
1835
+ this._logEvent('cli-compaction-done', { trigger: ev.trigger });
1562
1836
  return;
1563
1837
 
1564
1838
  case 'Notification':
@@ -1597,15 +1871,22 @@ class CliProcess extends Process {
1597
1871
  {
1598
1872
  const requestId = ev.toolUseId || `hook-notification-${Date.now()}`;
1599
1873
  const toolName = ev.toolName;
1600
- const toolInput = this._formatToolInputForApproval(
1601
- ev.prompt || null,
1602
- // Use the structured tool_input as the "preview" it's
1603
- // already structured by claude rather than truncated to
1604
- // 200 chars like the channels bridge perm_req does.
1605
- typeof ev.toolInput === 'string'
1606
- ? ev.toolInput
1607
- : JSON.stringify(ev.toolInput || {}),
1608
- );
1874
+ // Finding #11 fix: pass the STRUCTURED tool_input through. makeCanUseTool
1875
+ // matches gated patterns via matchesAnyPattern, which reads
1876
+ // input.command (Bash) / input.url (WebFetch) — a formatted STRING
1877
+ // makes those undefined so a gated `Bash(rm *)` never matches and the
1878
+ // tool is allowed with NO approval card (silent gating bypass). The
1879
+ // hook Notification payload carries structured tool_input, so forward
1880
+ // it as-is; the approval card (approvalCardText) renders a structured
1881
+ // object fine — same shape the SDK canUseTool path already uses. Fall
1882
+ // back to the formatted-string preview only if claude sent no
1883
+ // structured tool_input (degenerate — tool needs perm but no input).
1884
+ const toolInput = (ev.toolInput && typeof ev.toolInput === 'object')
1885
+ ? ev.toolInput
1886
+ : this._formatToolInputForApproval(
1887
+ ev.prompt || null,
1888
+ typeof ev.toolInput === 'string' ? ev.toolInput : JSON.stringify(ev.toolInput || {}),
1889
+ );
1609
1890
  this.emit('approval-required', {
1610
1891
  id: requestId,
1611
1892
  toolName,
@@ -1665,6 +1946,50 @@ class CliProcess extends Process {
1665
1946
  }
1666
1947
  }
1667
1948
 
1949
+ /**
1950
+ * Drain on unexpected bridge socket loss (claude crash, bridge crash,
1951
+ * EOF). Extracted from the inline 'bridge-disconnected' handler so the
1952
+ * teardown is testable and consistent with _doKill.
1953
+ *
1954
+ * Findings 0.12-L5 + L6: in addition to clearing the per-turn timers
1955
+ * and rejecting pendings (the original P1 #5 behavior), this now also
1956
+ * (L5) removes each turn's stop-hook listener — this drain does NOT go
1957
+ * through Process.kill()'s blanket removeAllListeners, so a turn torn
1958
+ * down mid-stop-grace would otherwise leak its onStop closure — and
1959
+ * (L6) clears _interruptGraceTimer, matching _doKill (a /stop verdict
1960
+ * landing just before the disconnect would otherwise leave a stray
1961
+ * timer on the dead instance).
+ *
+ * Behavior as written: bridgeReady and mcpReady are cleared on every call;
+ * if this.closed is already set the method returns right after that and
+ * drains nothing. Otherwise each pending turn is rejected with an Error
+ * ('bridge disconnected', code 'BRIDGE_DISCONNECTED'; a throwing reject
+ * is ignored), pendingTurns and pendingQueue are emptied, inFlight is
+ * reset, and a payload-less 'bridge-disconnected' event is emitted before
+ * the event is logged with { reason }.
+ *
+ * NOTE(review): pendingQueue is truncated without touching its entries —
+ * confirm nothing queued there holds a promise that needs rejecting.
+ *
+ * @param {string} [reason='socket-close'] Short cause label; appears in
+ *   the warning line and in the logged 'bridge-disconnected' event.
+ * @returns {void}
1962
+ */
1963
+ _handleBridgeDisconnected(reason = 'socket-close') {
1964
+ this.bridgeReady = false;
1965
+ this.mcpReady = false;
1966
+ if (this.closed) return;
1967
+ this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly (${reason})`);
1968
+ // L6: clear the interrupt grace timer alongside the rest of the lifecycle.
1969
+ if (this._interruptGraceTimer) {
1970
+ clearTimeout(this._interruptGraceTimer);
1971
+ this._interruptGraceTimer = null;
1972
+ }
1973
+ // P1 #5: drain pendingTurns immediately so hardTimers don't run 10min.
1974
+ for (const [, pending] of this.pendingTurns) {
1975
+ if (pending.quietTimer) clearTimeout(pending.quietTimer);
1976
+ if (pending.hardTimer) clearTimeout(pending.hardTimer);
1977
+ if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1978
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1979
+ // L5: remove the per-turn stop-hook listener (this path bypasses
1980
+ // Process.kill()'s removeAllListeners).
1981
+ if (pending._onStop) this.off('stop-hook', pending._onStop);
1982
+ const err = new Error('bridge disconnected');
1983
+ err.code = 'BRIDGE_DISCONNECTED';
1984
+ try { pending.reject(err); } catch {}
1985
+ }
1986
+ this.pendingTurns.clear();
1987
+ this.pendingQueue.length = 0;
1988
+ this.inFlight = false;
1989
+ this.emit('bridge-disconnected');
1990
+ this._logEvent('bridge-disconnected', { reason });
1991
+ }
1992
+
1668
1993
  async _doKill(reason) {
1669
1994
  this.closed = true;
1670
1995
  this.inFlight = false;
@@ -1688,6 +2013,7 @@ class CliProcess extends Process {
1688
2013
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1689
2014
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1690
2015
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
2016
+ if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1691
2017
  const err = new Error(`session killed: ${reason}`);
1692
2018
  err.code = 'KILLED';
1693
2019
  pending.reject(err);
@@ -1734,6 +2060,12 @@ class CliProcess extends Process {
1734
2060
  if (this.botName && this.claudeSessionId) {
1735
2061
  try { removeHookFiles({ botName: this.botName, sessionId: this.claudeSessionId }); } catch {}
1736
2062
  }
2063
+ // File-send staging: remove the whole per-session dir on kill (purge only
2064
+ // empties it between turns; kill is end-of-life so drop it entirely).
2065
+ if (this.attachmentStagingDir) {
2066
+ try { fs.rmSync(this.attachmentStagingDir, { recursive: true, force: true }); } catch {}
2067
+ this.attachmentStagingDir = null;
2068
+ }
1737
2069
 
1738
2070
  this.emit('close', 0);
1739
2071
  }
@@ -1876,6 +2208,8 @@ class CliProcess extends Process {
1876
2208
  if (pending.quietTimer) clearTimeout(pending.quietTimer);
1877
2209
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1878
2210
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
2211
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer); // L5
2212
+ if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1879
2213
  const err = new Error(`session reset: ${reason}`);
1880
2214
  err.code = 'RESET';
1881
2215
  try { pending.reject(err); } catch {}
@@ -2088,6 +2422,37 @@ class CliProcess extends Process {
2088
2422
  * Extracted as a separate async method so unit tests can drive it
2089
2423
  * directly without waiting for the setInterval tick.
2090
2424
  */
2425
+ /**
2426
+ * 0.12.0-rc.13: proactive compaction warning. Read the transcript's current
2427
+ * context occupancy and, if past the per-chat threshold, emit a
2428
+ * 'compaction-warn' the chat layer turns into "you're ~N% full, run
2429
+ * /compact" — giving the user a window to compact on their terms BEFORE
2430
+ * claude auto-compacts mid-turn (which detaches the channels bridge). Warns
2431
+ * once per climb (this._compactionWarned), re-armed on PostCompact.
2432
+ * Fire-and-forget: swallows its own errors so transcript IO never breaks
2433
+ * the turn-end path.
+ *
+ * Steps: bail out unless this.compactionWarn.enabled is truthy and no
+ * warning is outstanding; await readContextTokens(transcriptPath) and
+ * bail on a falsy result; compute contextPct(usage.total) * 100 and bail
+ * when it is below this.compactionWarn.thresholdPct; otherwise set
+ * _compactionWarned and emit 'compaction-warn' with kind 'proactive', the
+ * rounded pct, totalTokens, sessionId and backend. Failures are logged
+ * through logger.warn (when present) and never rethrown.
+ *
+ * NOTE(review): presumably contextPct returns a 0..1 fraction (hence the
+ * multiplication by 100) — confirm against ../context-usage.
+ *
+ * NOTE(review): this method sits between _pollMidTurnDialogs and the doc
+ * comment that used to sit directly above it ("Extracted as a separate
+ * async method so unit tests can drive it directly…"), so that comment
+ * now precedes this one instead — confirm and move it back.
+ *
+ * @param {string} transcriptPath Transcript location handed to
+ *   readContextTokens — presumably a filesystem path; verify.
+ * @returns {Promise<void>} Settles once the sample is handled; does not
+ *   reject for errors thrown inside the try block.
2434
+ */
2435
+ async _maybeProactiveCompactionWarn(transcriptPath) {
2436
+ try {
2437
+ if (!this.compactionWarn?.enabled || this._compactionWarned) return;
2438
+ const usage = await readContextTokens(transcriptPath);
2439
+ if (!usage) return;
2440
+ const pct = contextPct(usage.total) * 100;
2441
+ if (pct < this.compactionWarn.thresholdPct) return;
2442
+ if (this._compactionWarned) return; // re-check: the flag may have been set while readContextTokens was awaited
2443
+ this._compactionWarned = true;
2444
+ this.emit('compaction-warn', {
2445
+ kind: 'proactive',
2446
+ pct: Math.round(pct),
2447
+ totalTokens: usage.total,
2448
+ sessionId: this.claudeSessionId,
2449
+ backend: this.backend,
2450
+ });
2451
+ } catch (err) {
2452
+ this.logger.warn?.(`[${this.label}] compaction-warn sample failed: ${err.message}`);
2453
+ }
2454
+ }
2455
+
2091
2456
  async _pollMidTurnDialogs() {
2092
2457
  if (this.closed) return;
2093
2458
  if (this.pendingTurns.size === 0) return; // no work to do when idle
@@ -2106,6 +2471,15 @@ class CliProcess extends Process {
2106
2471
  }
2107
2472
  if (!pane) return;
2108
2473
 
2474
+ // rc.14: removed the rc.11 pane-based "dead bridge" detection here. It
2475
+ // matched the BENIGN banner "server:polygram-bridge no MCP server
2476
+ // configured with that name" — a cosmetic line that
2477
+ // `--dangerously-load-development-channels` + `--strict-mcp-config` prints
2478
+ // on EVERY healthy session (channel still delivers; reply tool still
2479
+ // works). The matcher false-fired ~5s into every channels turn and killed
2480
+ // healthy sessions. Real bridge loss is the socket-close path
2481
+ // (_handleBridgeDisconnected), not anything observable in the pane.
2482
+
2109
2483
  const now = Date.now();
2110
2484
 
2111
2485
  // 0.12 Phase 3.2: liveness heartbeat. The TUI prints "esc to interrupt"