polygram 0.12.0-rc.8 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/config.example.json +4 -3
  2. package/lib/claude-bin.js +14 -1
  3. package/lib/compaction-warn.js +59 -0
  4. package/lib/context-usage.js +93 -0
  5. package/lib/db.js +1 -1
  6. package/lib/error/classify.js +33 -10
  7. package/lib/feedback/session-feedback.js +91 -0
  8. package/lib/handlers/abort.js +87 -40
  9. package/lib/handlers/autosteer.js +4 -0
  10. package/lib/handlers/config-callback.js +25 -6
  11. package/lib/handlers/config-ui.js +39 -10
  12. package/lib/handlers/dispatcher.js +83 -0
  13. package/lib/handlers/download.js +101 -58
  14. package/lib/handlers/drop-redeliver.js +69 -0
  15. package/lib/handlers/edit-correction.js +2 -0
  16. package/lib/handlers/edit-redelivery.js +136 -0
  17. package/lib/handlers/gate-inbound.js +188 -0
  18. package/lib/handlers/questions.js +289 -0
  19. package/lib/handlers/redeliver.js +122 -0
  20. package/lib/handlers/slash-commands.js +43 -30
  21. package/lib/history-preload.js +6 -0
  22. package/lib/history.js +7 -1
  23. package/lib/model-costs.js +4 -0
  24. package/lib/process/channels-bridge-protocol.js +22 -1
  25. package/lib/process/channels-bridge.mjs +128 -7
  26. package/lib/process/channels-tool-dispatcher.js +105 -12
  27. package/lib/process/cli-process.js +1277 -70
  28. package/lib/process/hook-event-tail.js +7 -0
  29. package/lib/process/hook-settings.js +7 -0
  30. package/lib/process/process.js +22 -0
  31. package/lib/process-guard.js +57 -1
  32. package/lib/process-manager.js +120 -35
  33. package/lib/questions/questions.js +187 -0
  34. package/lib/questions/store.js +105 -0
  35. package/lib/rewind/execute.js +89 -0
  36. package/lib/rewind/fork.js +112 -0
  37. package/lib/rewind/rewind.js +174 -0
  38. package/lib/sdk/callbacks.js +165 -167
  39. package/lib/session-key.js +29 -0
  40. package/lib/telegram/album-reactions.js +50 -0
  41. package/lib/telegram/parse.js +9 -2
  42. package/lib/telegram/typing.js +17 -2
  43. package/lib/tmux/startup-gate.js +44 -14
  44. package/migrations/012-pending-questions.sql +30 -0
  45. package/package.json +1 -1
  46. package/polygram.js +224 -78
@@ -53,6 +53,8 @@ const { createHookTail } = require('./hook-event-tail');
53
53
  // /private/tmp drift — one of the original Music-topic failures).
54
54
  const { DEFAULT_ATTACHMENT_BASE } = require('./channels-tool-dispatcher');
55
55
  const { resolveFileCaps } = require('../attachments');
56
+ const { resolveCompactionWarnConfig } = require('../compaction-warn');
57
+ const { readContextTokens, contextPct } = require('../context-usage');
56
58
  const { runStartupGate } = require('../tmux/startup-gate');
57
59
  const { POLYGRAM_DISPLAY_HINT } = require('../telegram/display-hint');
58
60
 
@@ -70,6 +72,30 @@ const DEFAULT_MCP_READY_TIMEOUT_MS = 5_000;
70
72
  // Mirrors rc.41 H4 stopGraceMs from tmux backend. 2s default = same as tmux.
71
73
  const DEFAULT_STOP_GRACE_MS = 2_000;
72
74
  const DEFAULT_TURN_QUIET_MS = 2_000; // after first reply, wait this long for more before resolving turn
75
+ // 0.13 D1 rung 2 (docs/0.13-channels-lifecycle-design.md §3 D1): once a turn has
76
+ // ≥1 delivered reply AND the hook stream is live, the turn finalizes when the
77
+ // session's whole ACTIVITY surface (hook events + the pane "esc to interrupt"
78
+ // thinking heartbeat + bridge tool calls + replies) goes quiet for this long.
79
+ // Calibrated against the busy-phase inter-activity gap: the pane heartbeat fires
80
+ // on the 5s pong tick while a turn is pending, so a live claude can never be
81
+ // "activity-quiet" — only a truly ended (or hook-and-pane-dead) tail is.
82
+ const DEFAULT_ACTIVITY_QUIET_MS = 18_000;
83
+ // 0.13 D2 (P3): InputLedger windows. dropConfirm = how long after the trigger
84
+ // cycle's end an unseen/unacked non-primary entry may still be picked up as a
85
+ // claude-side next cycle before it is declared dropped (late seen/ack cancels).
86
+ // deliveryWatchdog = the primary pickup window: a dispatched primary with no
87
+ // UPS and ZERO session activity gets one idempotent re-write, then (still
88
+ // nothing) a bridge teardown onto the existing recovery path.
89
+ const DEFAULT_DROP_CONFIRM_MS = 20_000;
90
+ const DEFAULT_DELIVERY_WATCHDOG_MS = 10_000;
91
+ const INPUT_LEDGER_CAP = 64;
92
+ // 0.13 D1 P1 seen-slice: parse the pickup turn_id out of the UserPromptSubmit
93
+ // prompt. Anchored on the RAW `<channel ` tag prefix — the bridge body-escape
94
+ // (channels-bridge.mjs escapeChannelBody) turns every user-authored `<` into
95
+ // `&lt;`, so a raw tag prefix is bridge-authored by construction and a pasted/
96
+ // spoofed `turn_id="…"` in message body text can never mark a pending seen.
97
+ // (Envelope shape verified from prod JSONL + the P0 spike — Q1.)
98
+ const UPS_ENVELOPE_TURN_ID_RE = /<channel\s[^>]*turn_id="([0-9a-f-]{36})"/g;
73
99
  const DEFAULT_TURN_TIMEOUT_MS = 600_000; // 10 min idle cap (resets on each reply — Review F#13)
74
100
  const DEFAULT_TURN_ABSOLUTE_MS = 1_800_000; // 30 min absolute wall-clock ceiling (no reset)
75
101
  const DEFAULT_INTERRUPT_GRACE_MS = 5_000; // after Ctrl-C, wait this long for Claude to ack before synthesizing 'interrupted'
@@ -96,7 +122,10 @@ const DEFAULT_QUEUE_CAP = 50; // Parity P2: match SDK/tmux pendin
96
122
  // catalog when new dialogs are observed in production.
97
123
  const SESSION_AGE_PROMPT_RE = /Resuming the full session[\s\S]*Resume from summary/i;
98
124
  const MID_TURN_PROMPTS = [
99
- { name: 'session-age', regex: SESSION_AGE_PROMPT_RE, action: 'enter' },
125
+ // Review F2 (resume-dialog fix): bare Enter selects the pre-selected
126
+ // "Resume from summary" — which literally runs /compact. Navigate to
127
+ // "Resume full session as-is" instead, same as the startup-gate trigger.
128
+ { name: 'session-age', regex: SESSION_AGE_PROMPT_RE, action: 'keys', keys: ['Down', 'Enter'] },
100
129
  ];
101
130
 
102
131
  // 0.12 Phase 3.2 (Finding 0.1.A): rc.45 esc-to-interrupt liveness heartbeat.
@@ -111,6 +140,27 @@ const MID_TURN_PROMPTS = [
111
140
  // hook process.
112
141
  const STREAMING_HINT_RE = /esc to interrupt/i;
113
142
 
143
+ // 0.12.0 background-work lifecycle: claude's TUI mode line shows a live
144
+ // background-shell COUNT while a `run_in_background:true` Bash outlives its turn,
145
+ // e.g. `⏵⏵ bypass permissions on · 1 shell · ← for agents · ↓ to manage`.
146
+ // Confirmed on claude 2.1.158 (P0 spike — docs/0.12.0-background-work-lifecycle-
147
+ // plan.md): the count is always-present in the viewport mode line while shells run
148
+ // and clears IN-PLACE within ~3s when they exit (no stale scrollback).
149
+ //
150
+ // MODE-INDEPENDENT (prod regression fix, 2026-06-04): the original regex anchored
151
+ // on "auto mode on", but EVERY shumorobot session runs "⏵⏵ bypass permissions on"
152
+ // — the spike happened to be captured in auto mode. So the detector never matched
153
+ // in prod and bg-work-status fired zero times. Anchor instead on the `⏵⏵` mode-
154
+ // line glyph (present in auto / bypass / accept-edits modes alike); only the mode
155
+ // label between it and `· N shell` varies. Still matched only against the captured
156
+ // TAIL so a scrolled-off history line never trips it. R1: re-validate on each
157
+ // pinned-claude bump (glyph + `N shell` wording).
158
+ const BACKGROUND_SHELL_RE = /⏵⏵[^\n]*·\s*(\d+)\s+shells?\b/i;
159
+ // How long a detached background shell may run AFTER its turn resolved (claude
160
+ // idle) before the stall-watchdog fires one read-only self-check. Override via
161
+ // the constructor (tests use a small value).
162
+ const DEFAULT_BG_WORK_STALL_MS = 600_000; // 10 min
163
+
114
164
  // 0.12 Phase 3.3 (Q1 resolution): heuristic for "looks like an unknown
115
165
  // interactive prompt." Match common prompt shapes that don't appear in
116
166
  // MID_TURN_PROMPTS — operator gets a telemetry event so they can decide
@@ -118,6 +168,17 @@ const STREAMING_HINT_RE = /esc to interrupt/i;
118
168
  // — false positives surface as no-op telemetry, false negatives surface
119
169
  // as the idle-ceiling timeout (~10min).
120
170
  const UNKNOWN_PROMPT_HEURISTIC_RE = /(\?\s*$|\(y\/N\)|Yes\/No|❯\s|^\s*[12345]\.\s)/im;
171
+ // rc.14: a previous rc (rc.11) had a BRIDGE_DEAD_RE here that matched the pane
172
+ // line "server:polygram-bridge no MCP server configured with that name" and
173
+ // treated it as a dead bridge to recover from. That was a MISDIAGNOSIS: this
174
+ // line is a BENIGN, persistent banner that `--dangerously-load-development-
175
+ // channels` + `--strict-mcp-config` prints on EVERY healthy session — the
176
+ // channel still delivers messages and the reply tool still works (reproduced
177
+ // 2026-06-01 with a test MCP server that demonstrably functions). The pane
178
+ // matcher therefore false-fired ~5s into every channels turn and KILLED
179
+ // healthy sessions (the Music-topic "mid-turn detach" regression). Real bridge
180
+ // loss is caught by the socket-close path (bridgeServer 'bridge-disconnected'
181
+ // → _handleBridgeDisconnected). There is no reliable pane signal — removed.
121
182
  // Per-pattern rate limit so a dialog that lingers across multiple polls
122
183
  // doesn't spam sendControl/event emissions. Aligned with the 5s poll cadence.
123
184
  const MID_TURN_DEDUP_WINDOW_MS = 30_000;
@@ -157,8 +218,12 @@ class CliProcess extends Process {
157
218
  mcpReadyTimeoutMs = DEFAULT_MCP_READY_TIMEOUT_MS,
158
219
  stopGraceMs = DEFAULT_STOP_GRACE_MS,
159
220
  turnQuietMs = DEFAULT_TURN_QUIET_MS,
221
+ activityQuietMs = DEFAULT_ACTIVITY_QUIET_MS,
222
+ dropConfirmMs = DEFAULT_DROP_CONFIRM_MS,
223
+ deliveryWatchdogMs = DEFAULT_DELIVERY_WATCHDOG_MS,
160
224
  turnTimeoutMs = DEFAULT_TURN_TIMEOUT_MS,
161
225
  turnAbsoluteMs = DEFAULT_TURN_ABSOLUTE_MS,
226
+ bgWorkStallMs = DEFAULT_BG_WORK_STALL_MS,
162
227
  interruptGraceMs = DEFAULT_INTERRUPT_GRACE_MS,
163
228
  maxRepliesPerTurn = DEFAULT_MAX_REPLIES_PER_TURN,
164
229
  queueCap = DEFAULT_QUEUE_CAP, // Parity P2
@@ -188,8 +253,12 @@ class CliProcess extends Process {
188
253
  this.mcpReadyTimeoutMs = mcpReadyTimeoutMs;
189
254
  this.stopGraceMs = stopGraceMs;
190
255
  this.turnQuietMs = turnQuietMs;
256
+ this.activityQuietMs = activityQuietMs;
257
+ this.dropConfirmMs = dropConfirmMs;
258
+ this.deliveryWatchdogMs = deliveryWatchdogMs;
191
259
  this.turnTimeoutMs = turnTimeoutMs;
192
260
  this.turnAbsoluteMs = turnAbsoluteMs;
261
+ this.bgWorkStallMs = bgWorkStallMs;
193
262
  this.interruptGraceMs = interruptGraceMs;
194
263
  this.maxRepliesPerTurn = maxRepliesPerTurn;
195
264
  this.queueCap = queueCap;
@@ -213,6 +282,15 @@ class CliProcess extends Process {
213
282
  // interval fires bridge-disconnected if too much time elapses.
214
283
  this.lastPongAt = 0;
215
284
  this.pongWatchdog = null;
285
+ // 0.12.0 background-work stall-watchdog state. `_bgWorkSince` = when a live
286
+ // background shell was first observed while idle (null = none); reset only
287
+ // when the shell count returns to 0. `_bgWorkEscalations` caps the watchdog
288
+ // at one read-only self-check per continuous background-work window.
289
+ this._bgWorkSince = null;
290
+ this._bgWorkEscalations = 0;
291
+ // Visibility (Use 3): whether a "⏳ working in background" status message is
292
+ // currently shown, so we emit exactly one running→cleared pair per window.
293
+ this._bgWorkStatusShown = false;
216
294
  // Review P2 ADV-6: token-bucket rate limit on Claude's reply tool calls.
217
295
  // Without this, a prompt-injected or runaway Claude can fire reply() 1000×
218
296
  // in a tight loop, flooding TG + saturating the daemon event loop.
@@ -236,6 +314,7 @@ class CliProcess extends Process {
236
314
  // doesn't re-invoke the dispatcher → duplicate TG send. Set is bounded
237
315
  // to RECENT_TOOL_CALL_LIMIT entries via FIFO eviction.
238
316
  this.recentToolCallIds = new Set();
317
+ this.recentToolCallResults = new Map(); // tool_call_id → message_id (0.13: replay on re-ACK)
239
318
  this.recentToolCallOrder = []; // FIFO bound
240
319
  // Review F#17: per-pattern last-fired timestamp for the mid-turn dialog
241
320
  // watchdog. Dedups within MID_TURN_DEDUP_WINDOW_MS so a lingering dialog
@@ -253,8 +332,32 @@ class CliProcess extends Process {
253
332
  this.recentContentHashes = new Map(); // key → expiryTs
254
333
  this.contentDedupWindowMs = 60_000;
255
334
 
256
- // pending turn(s): turn_id → { resolve, reject, replies: [], quietTimer, hardTimer, startedAt }
335
+ // pending turn(s): turn_id → { resolve, reject, replies: [], seen, quietTimer,
336
+ // hardTimer, absoluteTimer, _activityQuietTimer, startedAt }
257
337
  this.pendingTurns = new Map();
338
+ // 0.13 D1: activity bookkeeping for the finalizer ladder. _lastHookEventAt
339
+ // feeds the rung-2 telemetry (hook-stalled discrimination); _lastActivityAt
340
+ // is the broader surface (hooks + pane heartbeat + bridge tool calls).
341
+ this._lastHookEventAt = 0;
342
+ this._lastActivityAt = 0;
343
+ // 0.13 D2: the InputLedger — every user-shaped input written to the bridge
344
+ // gets an observable lifecycle: written → seen → resolved | dropped |
345
+ // superseded | fold-suspected. Pre-P3, injectUserMessage minted a turn_id
346
+ // that never escaped the function (fold/new-turn/drop indistinguishable —
347
+ // seam S4; the #14 msg-2385 drop was invisible by construction).
348
+ // turn_id → { turnId, source, msgId, chatId, writtenAt, state, _dropTimer,
349
+ // _watchdogTimer, _rewritten }
350
+ this.inputLedger = new Map();
351
+ // Set whenever a reply carried the consumed_turn_ids contract field —
352
+ // the Tier 2C "contract observed" discriminator (P0 spike: incidental
353
+ // echo is trigger-only; without the contract a fold is indistinguishable
354
+ // from a drop, and auto-redelivering folds double-answers the common case).
355
+ this._lastAckFieldAt = 0;
356
+ // 0.12 interactive questions: tool_call_ids of `ask` calls awaiting an answer.
357
+ // While non-empty, the keep-alive interval resets the turn's idle ceiling (an
358
+ // idle `ask` fires no tool hooks, so _extendQuietOnToolActivity wouldn't run).
359
+ this._openQuestions = new Set();
360
+ this._questionKeepAliveTimer = null;
258
361
 
259
362
  // File-send outbound cap (bot → user). Safe cloud default; overwritten in
260
363
  // _spawnTmuxClaude with the backend/chat-resolved value before any turn.
@@ -494,9 +597,15 @@ class CliProcess extends Process {
494
597
  // after this.
495
598
  const topicConfig = opts.threadId && opts.chatConfig?.topics?.[opts.threadId];
496
599
  const agent = topicConfig?.agent || opts.chatConfig?.agent || opts.agent;
497
- const model = topicConfig?.model || opts.chatConfig?.model || opts.model;
498
- const effort = topicConfig?.effort || opts.chatConfig?.effort || opts.effort;
600
+ const model = this._resolveModel(opts);
601
+ const effort = this._resolveEffort(opts);
499
602
  const resolvedCwd = topicConfig?.cwd || opts.chatConfig?.cwd || opts.cwd;
603
+ // Record the spawn-time model/effort. cli has no live model/effort swap
604
+ // (they are spawn-time --model / --effort flags), so getOrSpawn detects a
605
+ // /model or /effort drift against these and reloads — --resume preserves
606
+ // the conversation, the new flag takes effect. See wouldReloadFor.
607
+ this.model = model;
608
+ this.effort = effort;
500
609
 
501
610
  // File-send outbound cap (bot → user). Backend-derived (cloud 50MB vs
502
611
  // local Bot API server 2GB via opts.localApi) with per-topic/chat
@@ -510,6 +619,14 @@ class CliProcess extends Process {
510
619
  override: _capOverride,
511
620
  }).outBytes;
512
621
 
622
+ // 0.12.0-rc.13: per-chat/topic compaction warning (default OFF). Same
623
+ // topic→chat precedence as the file cap above. When enabled, the channels
624
+ // backend warns the chat as context fills (propose /compact at a break)
625
+ // and on auto-compaction (the event that detaches the bridge mid-turn).
626
+ const _compactionWarnRaw = topicConfig?.compactionWarnings ?? opts.chatConfig?.compactionWarnings;
627
+ this.compactionWarn = resolveCompactionWarnConfig({ compactionWarnings: _compactionWarnRaw });
628
+ this._compactionWarned = false; // proactive warn-once per climb; reset on PostCompact
629
+
513
630
  // Parity audit P8 + rc.8 fs-guard (2026-05-26 shumorobot Music topic):
514
631
  // `--session-id <id>` creates a NEW claude session with that id;
515
632
  // `--resume <id>` resumes the EXISTING conversation. Lazy-respawn after
@@ -637,6 +754,44 @@ class CliProcess extends Process {
637
754
  'as normal — only the FINAL user-visible message needs to go through',
638
755
  'the reply tool.',
639
756
  '',
757
+ 'When you call `reply`, ALWAYS set `consumed_turn_ids` to the turn_id',
758
+ 'attribute of EVERY <channel> message you are answering or have received',
759
+ 'since your last reply — including mid-turn follow-ups you absorbed into',
760
+ 'the current answer. polygram uses it to confirm follow-up delivery;',
761
+ 'omitting it can cause a follow-up to be re-sent to you.',
762
+ '',
763
+ '### Staying responsive on a long task',
764
+ '',
765
+ 'The user cannot see you working — no live typing reaches them. For any task',
766
+ 'that takes more than a few seconds, send a SHORT status first via `reply`',
767
+ '(it returns a `message_id`), then call `mcp__polygram-bridge__edit_message`',
768
+ 'with that `message_id` to update the SAME bubble as you make progress,',
769
+ 'finishing with the result. One evolving message beats silence or a flood of',
770
+ 'new ones.',
771
+ '',
772
+ 'Write status in PLAIN, friendly language about what you are doing FOR THE',
773
+ 'USER — never tool names or mechanics. Say "Checking your config now…", not',
774
+ '"Running Bash" or "Calling Read". If the final answer is long, send it as a',
775
+ 'fresh `reply` rather than an edit (an edit is one single message bubble).',
776
+ '',
777
+ // TEMPORARY mitigation (2026-06-08 Shumabit@UMI wedge): AskUserQuestion opens
778
+ // a blocking TUI selection widget the channel can't answer → the session
779
+ // parks until manually Esc'd. REMOVE this whole rule when the rich
780
+ // question→Telegram-keyboard feature ships (see docs design); claude should
781
+ // then use the native question tool again. Tracked so it isn't forgotten.
782
+ '### Asking the user a question / offering choices — HARD RULE',
783
+ '',
784
+ 'NEVER use the AskUserQuestion tool or any interactive menu / selection',
785
+ 'widget. They open a blocking terminal prompt the user on Telegram CANNOT',
786
+ 'see or navigate — it silently wedges the entire session until it is manually',
787
+ 'cleared. (Rich tap-to-answer choices are coming; until then this is a hard rule.)',
788
+ '',
789
+ 'To ask a multiple-choice question, a confirmation, or yes/no, call the',
790
+ '`mcp__polygram-bridge__ask` tool — it renders tap-to-answer inline buttons',
791
+ '(supports multiSelect via `multiSelect:true` and a free-text answer via',
792
+ '`allowOther:true`) and returns the user\'s selection(s) as the tool result.',
793
+ 'Prefer `ask` over a typed numbered list whenever you are offering choices.',
794
+ '',
640
795
  '### Sending FILES (tracks, images, docs) to the user',
641
796
  '',
642
797
  'The `mcp__polygram-bridge__reply` tool takes an optional `files` array of',
@@ -712,6 +867,20 @@ class CliProcess extends Process {
712
867
  cwd: resolvedCwd || opts.cwd || process.cwd(),
713
868
  command: this.claudeBin,
714
869
  args: claudeArgs,
870
+ envExtras: {
871
+ // Resume-dialog suppression (docs/0.13-resume-dialog-fix-spec.md B1):
872
+ // claude's session-age "resume-return" dialog fires when sessionAge ≥
873
+ // this many minutes AND est. tokens ≥ CLAUDE_CODE_RESUME_TOKEN_THRESHOLD
874
+ // (defaults 70 / 1e5, binary-verified on 2.1.158). Its pre-selected
875
+ // option literally runs /compact — silently compacting every aged
876
+ // --resume (and breaking the /model "conversation kept" guarantee).
877
+ // A huge threshold (1 year) means the dialog never triggers and resume
878
+ // is always full-session-as-is. Per-process env — the operator's own
879
+ // interactive claude is untouched. Belt-and-braces: the session-age
880
+ // gate trigger below still navigates to "full" if a future binary bump
881
+ // renames this var.
882
+ CLAUDE_CODE_RESUME_THRESHOLD_MINUTES: '525600',
883
+ },
715
884
  });
716
885
 
717
886
  // Dialog handling (Phase 0 finding) — poll capture-pane and Enter through:
@@ -728,24 +897,46 @@ class CliProcess extends Process {
728
897
  * lives in the shared helper.
729
898
  */
730
899
  async _handleStartupDialogs(tmuxName) {
731
- await runStartupGate({
900
+ const gateResult = await runStartupGate({
732
901
  runner: this.runner,
733
902
  tmuxName,
734
903
  triggers: [
735
904
  // Dev-channels confirmation — always fires under
736
905
  // --dangerously-load-development-channels.
737
906
  { name: 'dev-channels', regex: /WARNING: Loading development channels/i, key: 'Enter' },
738
- // Workspace trust prompt — fires on first-time cwd or untrusted.
739
- { name: 'trust', regex: /trust the files in this folder/i, key: 'Enter' },
740
- // Review F#12: session-age "Resume from summary?" prompt fires on
741
- // aged sessions (claude treats older session JSONLs differently).
742
- // Tmux backend dismisses with Enter at tmux-process.js:2637 onward;
743
- // mirror that here so an aged channels session doesn't hang the
744
- // handshake until CHANNELS_HANDSHAKE_TIMEOUT (15s) dead chat
745
- // requiring manual /reset.
746
- { name: 'session-age', regex: SESSION_AGE_PROMPT_RE, key: 'Enter' },
907
+ // Workspace trust prompt — fires on first-time cwd or untrusted. claude
908
+ // 2.1.158 renders "Quick safety check: Is this a project you created or
909
+ // one you trust? 1. Yes, I trust this folder" (Enter confirms the
910
+ // pre-selected "trust" option). The older "trust the files in this folder"
911
+ // wording is kept for back-compat; both anchor on "trust … this folder".
912
+ { name: 'trust', regex: /trust (?:the files in )?this folder/i, key: 'Enter' },
913
+ // Review F#12 + 2026-06-11 resume-dialog fix: session-age
914
+ // "resume-return" prompt on aged sessions. Bare Enter selects the
915
+ // pre-selected "Resume from summary" — which literally runs /compact
916
+ // on the resumed session (silent context degradation; the original
917
+ // F#12 dismissal compacted every aged resume). Navigate to option 2
918
+ // "Resume full session as-is" instead. This is the FALLBACK path:
919
+ // spawn env (CLAUDE_CODE_RESUME_THRESHOLD_MINUTES above) suppresses
920
+ // the dialog entirely; this trigger firing at all means suppression
921
+ // failed (upstream renamed the env var?) — surfaced via the
922
+ // session-age-dialog-fallback event below.
923
+ { name: 'session-age', regex: SESSION_AGE_PROMPT_RE, keys: ['Down', 'Enter'] },
747
924
  ],
748
- readySignal: /Listening for channel messages from: server:polygram-bridge/i,
925
+ // 2.1.173 reworked the channels UI banner (live-captured 2026-06-11):
926
+ // "Channels (experimental) messages from server:polygram-bridge inject
927
+ // directly in this session · …". Keep the 2.1.158 text too so a
928
+ // POLYGRAM_CLAUDE_BIN override to an older binary still gates correctly.
929
+ //
930
+ // 2026-06-12 (caught by the cancel-cheap E2E before prod): in 2.1.173
931
+ // the banner lives in a COLLAPSIBLE notice list — with ≥3 notices the
932
+ // pane shows "+N more · /status" and the banner is hidden, stalling a
933
+ // banner-only gate into a false CHANNELS_DIALOG_TIMEOUT. An interactive
934
+ // prompt footer ("(shift+tab to cycle)" / "? for shortcuts") with no
935
+ // pending dialog is equally READY: the gate's job is dialog navigation;
936
+ // channel liveness is separately guaranteed by mcp-ready (send() gate)
937
+ // + the delivery watchdog. Dialog panes render "Enter to confirm"
938
+ // instead of the footer, so the footer can't match mid-dialog.
939
+ readySignal: /(?:Listening for channel messages from:|Channels \(experimental\) messages from) server:polygram-bridge|shift\+tab to cycle|\? for shortcuts/i,
749
940
  timeoutCode: 'CHANNELS_DIALOG_TIMEOUT',
750
941
  // Progress-aware gate (shumorobot General incident 2026-05-30): a
751
942
  // cold spawn that's mid-download (runtime fetch, "24%" progress bar)
@@ -754,11 +945,32 @@ class CliProcess extends Process {
754
945
  // actively-changing pane (download bar, dialog nav) keeps resetting
755
946
  // the stall clock and rides out to the ready signal. deadlineMs stays
756
947
  // the absolute backstop. stallMs (default 60s) of zero pane activity = genuinely wedged.
757
- stallMs: this.startupGateStallMs ?? 30_000,
948
+ // Stall = pane rendered then went static (genuinely wedged). 60s, not
949
+ // 30s: some topics' TUIs cold-render slowly (Music ~45s, slow MCP
950
+ // startup) — 30s was too tight and false-aborted them. Blank panes
951
+ // don't arm the stall timer at all now (see runStartupGate), so this
952
+ // only bounds a TUI that rendered and then truly hung.
953
+ stallMs: this.startupGateStallMs ?? 60_000,
758
954
  deadlineMs: this.startupGateDeadlineMs ?? 180_000,
955
+ // Review F4: fire-time, NOT gate-resolution — the 2026-06-10 incident
956
+ // matched session-age and THEN died (TMUX_SESSION_GONE), which a
957
+ // success-path check would miss. The dialog appearing AT ALL means the
958
+ // env suppression (CLAUDE_CODE_RESUME_THRESHOLD_MINUTES in
959
+ // _spawnTmuxClaude) stopped working — almost certainly an upstream
960
+ // rename on a binary bump. The gate handles it (full resume picked);
961
+ // this makes the regression visible.
962
+ onTrigger: (name) => {
963
+ if (name !== 'session-age') return;
964
+ this.logger.warn?.(
965
+ `[${this.label}] channels: session-age resume dialog appeared despite env suppression — ` +
966
+ 'check CLAUDE_CODE_RESUME_THRESHOLD_MINUTES against the pinned claude binary',
967
+ );
968
+ this._logEvent('session-age-dialog-fallback', { tmux_name: tmuxName, phase: 'startup-gate' });
969
+ },
759
970
  logger: this.logger,
760
971
  label: `${this.label}:startup-gate`,
761
972
  });
973
+ return gateResult;
762
974
  }
763
975
 
764
976
  // 0.12 Phase 1.6: TWO-handshake gate. The original implementation only
@@ -922,7 +1134,61 @@ class CliProcess extends Process {
922
1134
  this.logger.warn?.(
923
1135
  `[${this.label}] channels: duplicate tool_call_id=${msg.tool_call_id} — re-ACKing without dispatch`,
924
1136
  );
925
- this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: true });
1137
+ // 0.13: replay the cached message_id so a retried reply keeps its edit handle
1138
+ // (re-ACKing without it would null the handle → progressive status silently breaks).
1139
+ this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: true, message_id: this.recentToolCallResults.get(msg.tool_call_id) ?? null });
1140
+ return;
1141
+ }
1142
+
1143
+ // 0.13 D1: any bridge tool call is same-session activity (the reply tool's
1144
+ // own delivery additionally notes activity via _recordReplyForPendingTurn,
1145
+ // but Pre/PostToolUse hook lag is 250ms–5s — the socket message is the
1146
+ // earliest truthful signal claude is working).
1147
+ this._noteActivity('bridge-tool');
1148
+
1149
+ // 0.13 D2 Tier 2C: the consumed_turn_ids contract field — claude
1150
+ // acknowledges every <channel> message this reply covers (incl. folds the
1151
+ // incidental turn_id echo can't express; the reply schema carries ONE
1152
+ // turn_id). Acked entries can never be declared dropped.
1153
+ //
1154
+ // SECURITY (review 2026-06-12): gate the ack on chat_id matching this
1155
+ // session. The chat_id check lives further down (after dedup/rate-limit);
1156
+ // without this guard a reply carrying a FOREIGN chat_id but naming the live
1157
+ // turn here would mark it resolved/_consumedAcked + arm the finalizer —
1158
+ // "delivered" though nothing reached this chat. The actual reject still
1159
+ // happens at the chat_id guard below.
1160
+ const chatIdMatches = this.chatId == null || String(args.chat_id) === String(this.chatId);
1161
+ if (chatIdMatches && Array.isArray(args.consumed_turn_ids) && args.consumed_turn_ids.length) {
1162
+ this._ledgerAckConsumed(args.consumed_turn_ids.filter((x) => typeof x === 'string'));
1163
+ } else if (chatIdMatches && msg.name === 'reply' && 'consumed_turn_ids' in args) {
1164
+ this._lastAckFieldAt = Date.now(); // field present but empty — contract observed
1165
+ }
1166
+
1167
+ // 0.12 interactive questions: `ask` is a BLOCKING tool whose answer rides back
1168
+ // on a `question_answer` message (NOT tool_ack). Skip the reply-only paths
1169
+ // (content-dedup, rate-limit, the reply dispatcher) — just guard chat_id and
1170
+ // emit so polygram renders the keyboard; the answer is written later via
1171
+ // writeQuestionAnswer(). claude is now idle waiting on the result, so start a
1172
+ // keep-alive that resets the turn's idle ceiling (no tool hooks fire meanwhile).
1173
+ if (msg.name === 'ask') {
1174
+ if (this.chatId != null && args.chat_id != null && String(args.chat_id) !== String(this.chatId)) {
1175
+ this._writeToBridge({ kind: 'question_answer', tool_call_id: msg.tool_call_id, result: { cancelled: true, error: 'chat_id mismatch' } });
1176
+ return;
1177
+ }
1178
+ this._openQuestions.add(msg.tool_call_id);
1179
+ this._startQuestionKeepAlive();
1180
+ // 0.13 D1: waiting-on-user — claude is legitimately silent, so the
1181
+ // activity-quiet finalize must not run down while the keyboard is up.
1182
+ this._suspendActivityQuiet();
1183
+ this.emit('question-asked', {
1184
+ sessionKey: this.sessionKey,
1185
+ chatId: this.chatId,
1186
+ threadId: this.threadId,
1187
+ turnId: args.turn_id || null,
1188
+ toolCallId: msg.tool_call_id,
1189
+ questions: Array.isArray(args.questions) ? args.questions : [],
1190
+ backend: this.backend,
1191
+ });
926
1192
  return;
927
1193
  }
928
1194
 
@@ -931,15 +1197,15 @@ class CliProcess extends Process {
931
1197
  // an isError ack). Window-based so legit repeat sends eventually pass.
932
1198
  if (msg.name === 'reply' && typeof args.text === 'string' && args.chat_id != null) {
933
1199
  const dedupKey = this._buildContentDedupKey(args.chat_id, args.text);
934
- const expiry = this.recentContentHashes.get(dedupKey);
1200
+ const entry = this.recentContentHashes.get(dedupKey); // { expiry, message_id }
935
1201
  const nowDedup = Date.now();
936
1202
  // Evict stale entries opportunistically (avoids unbounded growth).
937
1203
  if (this.recentContentHashes.size > 64) {
938
- for (const [k, ts] of this.recentContentHashes) {
939
- if (ts < nowDedup) this.recentContentHashes.delete(k);
1204
+ for (const [k, e] of this.recentContentHashes) {
1205
+ if (e.expiry < nowDedup) this.recentContentHashes.delete(k);
940
1206
  }
941
1207
  }
942
- if (expiry && expiry > nowDedup) {
1208
+ if (entry && entry.expiry > nowDedup) {
943
1209
  this.logger.warn?.(
944
1210
  `[${this.label}] channels: duplicate content within ${this.contentDedupWindowMs}ms ` +
945
1211
  `(new tool_call_id=${msg.tool_call_id}, hash=${dedupKey.slice(-12)}) — re-ACKing without dispatch`,
@@ -949,7 +1215,9 @@ class CliProcess extends Process {
949
1215
  chat_id: args.chat_id,
950
1216
  window_ms: this.contentDedupWindowMs,
951
1217
  });
952
- this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: true });
1218
+ // 0.13: replay the ORIGINAL bubble's message_id so a retried identical reply
1219
+ // keeps its edit handle (the slow-ack-retry case progressive status targets).
1220
+ this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: true, message_id: entry.message_id ?? null });
953
1221
  return;
954
1222
  }
955
1223
  }
@@ -992,6 +1260,34 @@ class CliProcess extends Process {
992
1260
  return;
993
1261
  }
994
1262
 
1263
+ // Dropped-"4" fix A2 (docs/0.13-resume-dialog-fix-spec.md): resolve the
1264
+ // reply's originating TG message so the dispatcher has a target for solo
1265
+ // reactions (and reply-quoting). Resolution order strictly mirrors
1266
+ // _recordReplyForPendingTurn so quote/reaction attribution can never
1267
+ // disagree with reply attribution: echoed turn_id → InputLedger entry's
1268
+ // msgId (registered at send/inject time); no echo → the single pending
1269
+ // turn's ledger entry. Anything else stays null — an unattributable
1270
+ // reply must never react to / quote an unrelated message.
1271
+ //
1272
+ // Review F1: quote only the FIRST delivered reply per turn. On SDK,
1273
+ // deliverReplies fires once per turn → one quote; the channels dispatcher
1274
+ // fires per reply tool call, and an N-reply turn must not produce N
1275
+ // bubbles all quoting the same user message.
1276
+ let sourceMsgId = null;
1277
+ let sourceEntry = null;
1278
+ if (args.turn_id && this.inputLedger.has(args.turn_id)) {
1279
+ sourceEntry = this.inputLedger.get(args.turn_id);
1280
+ } else if (this.pendingTurns.size === 1) {
1281
+ const [[onlyTurnId]] = this.pendingTurns;
1282
+ sourceEntry = this.inputLedger.get(onlyTurnId) || null;
1283
+ }
1284
+ if (sourceEntry && !sourceEntry._quoteUsed) {
1285
+ // Review F6: ledger stores msgId stringified; every other delivery call
1286
+ // site passes numeric message_id — coerce rather than lean on TG leniency.
1287
+ const n = Number(sourceEntry.msgId);
1288
+ sourceMsgId = Number.isFinite(n) && n > 0 ? n : null;
1289
+ }
1290
+
995
1291
  let result;
996
1292
  try {
997
1293
  result = await this.toolDispatcher({
@@ -1001,6 +1297,8 @@ class CliProcess extends Process {
1001
1297
  toolName: msg.name,
1002
1298
  text: args.text,
1003
1299
  files: args.files,
1300
+ messageId: args.message_id, // 0.13: edit_message target bubble
1301
+ sourceMsgId, // reaction/quote target (A2)
1004
1302
  sessionCwd: this.sessionCwd, // P0 #2: dispatcher uses this to allowlist file roots
1005
1303
  maxOutboundFileBytes: this.maxOutboundFileBytes, // backend/chat-derived upload cap
1006
1304
  });
@@ -1009,18 +1307,28 @@ class CliProcess extends Process {
1009
1307
  return;
1010
1308
  }
1011
1309
 
1012
- this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: !!result?.ok, error: result?.error });
1310
+ // Review F1: the quote target is spent once a reply actually delivered
1311
+ // with it. A FAILED delivery doesn't consume it — the retry still quotes.
1312
+ if (msg.name === 'reply' && result?.ok && sourceMsgId != null && sourceEntry) {
1313
+ sourceEntry._quoteUsed = true;
1314
+ }
1315
+
1316
+ // 0.13: carry the delivered message_id back so the bridge hands it to claude
1317
+ // (reply → edit_message progressive status).
1318
+ this._writeToBridge({ kind: 'tool_ack', tool_call_id: msg.tool_call_id, ok: !!result?.ok, error: result?.error, message_id: result?.message_id });
1013
1319
 
1014
1320
  // P1 #7: remember the tool_call_id so duplicates re-ACK without dispatch.
1015
1321
  // Only cache on SUCCESS — failed calls should be retryable (transient TG
1016
1322
  // outage etc).
1017
1323
  if (result?.ok && msg.tool_call_id) {
1018
1324
  this.recentToolCallIds.add(msg.tool_call_id);
1325
+ this.recentToolCallResults.set(msg.tool_call_id, result.message_id ?? null); // 0.13: for re-ACK replay
1019
1326
  this.recentToolCallOrder.push(msg.tool_call_id);
1020
1327
  // FIFO eviction at cap
1021
1328
  while (this.recentToolCallOrder.length > RECENT_TOOL_CALL_LIMIT) {
1022
1329
  const evicted = this.recentToolCallOrder.shift();
1023
1330
  this.recentToolCallIds.delete(evicted);
1331
+ this.recentToolCallResults.delete(evicted);
1024
1332
  }
1025
1333
  }
1026
1334
 
@@ -1028,7 +1336,9 @@ class CliProcess extends Process {
1028
1336
  // NEW tool_call_id still dedups. TTL-based via expiry timestamp.
1029
1337
  if (result?.ok && msg.name === 'reply' && typeof args.text === 'string' && args.chat_id != null) {
1030
1338
  const dedupKey = this._buildContentDedupKey(args.chat_id, args.text);
1031
- this.recentContentHashes.set(dedupKey, Date.now() + this.contentDedupWindowMs);
1339
+ // 0.13: store the delivered message_id alongside the expiry so a deduped retry
1340
+ // can replay it (keeps claude's edit handle for progressive status).
1341
+ this.recentContentHashes.set(dedupKey, { expiry: Date.now() + this.contentDedupWindowMs, message_id: result.message_id ?? null });
1032
1342
  }
1033
1343
 
1034
1344
  // Review #16 + C9: only record the reply for pending-turn resolution when
@@ -1050,6 +1360,24 @@ class CliProcess extends Process {
1050
1360
  * @param {string|undefined} replyTurnId — echoed from Claude's reply tool args
1051
1361
  */
1052
1362
  _recordReplyForPendingTurn(text, replyTurnId) {
1363
+ // 0.13 D2 (S5 tightening): a reply echoing a KNOWN ledgered turn_id that is
1364
+ // NOT the current pending is a LATE reply from an earlier cycle (post-
1365
+ // finalize tails, fireUserMessage cycles, ask wrap-ups). Pre-P3 the
1366
+ // ==1 fallback below bound it into whatever pending exists now — the live
1367
+ // misattribution path the design's §1.4 corollary names. Correlate it,
1368
+ // resolve its entry, and route it as already-delivered instead.
1369
+ if (replyTurnId && !this.pendingTurns.has(replyTurnId) && this.inputLedger.has(replyTurnId)) {
1370
+ const lEntry = this.inputLedger.get(replyTurnId);
1371
+ this._ledgerTransition(replyTurnId, 'resolved');
1372
+ this._logEvent('cli-late-reply-correlated', { turn_id: replyTurnId, source: lEntry.source });
1373
+ this.emit('autonomous-assistant-message', {
1374
+ text,
1375
+ sessionId: this.claudeSessionId,
1376
+ backend: this.backend,
1377
+ alreadyDelivered: true,
1378
+ });
1379
+ return;
1380
+ }
1053
1381
  let target = null;
1054
1382
  if (replyTurnId && this.pendingTurns.has(replyTurnId)) {
1055
1383
  // Canonical path: Claude echoed the turn_id we sent.
@@ -1116,6 +1444,26 @@ class CliProcess extends Process {
1116
1444
  }
1117
1445
 
1118
1446
  target.replies.push(text);
1447
+ target.replyCount = (target.replyCount || 0) + 1;
1448
+
1449
+ if (this._sawHookStream) {
1450
+ // 0.13 D1: a delivered reply is ACTIVITY — rung 2 (activity-quiet) owns
1451
+ // the finalize; the reply-quiet window never arms on hooks-live sessions.
1452
+ // The chatty-claude cap (Review P1 #12) no longer instant-resolves a turn
1453
+ // claude may still be working (that was seam S1's third premature-finalize
1454
+ // trigger); past the cap, rung 2 + the ceilings govern — and a ceiling on
1455
+ // a replied turn now RESOLVES with its replies (see fireTimeout).
1456
+ if (target.replyCount === this.maxRepliesPerTurn) {
1457
+ this.logger.warn?.(
1458
+ `[${this.label}] cli: ${target.replyCount} replies in single turn — deferring to activity-quiet (cap=${this.maxRepliesPerTurn})`,
1459
+ );
1460
+ this._logEvent('cli-reply-cap-noted', { reply_count: target.replyCount });
1461
+ }
1462
+ this._noteActivity('reply');
1463
+ return;
1464
+ }
1465
+
1466
+ // ── Legacy (rung 3, hook stream never came up): pre-D1 path, byte-identical ──
1119
1467
  // Review F#13: each reply is "activity" — reset the idle ceiling so a
1120
1468
  // 15-min legit turn (PDF analysis, multi-file refactor) replying every
1121
1469
  // minute doesn't get killed at the 10-min wall-clock. The absoluteTimer
@@ -1132,7 +1480,6 @@ class CliProcess extends Process {
1132
1480
  // hang. After N reply tool calls in a single turn, resolve immediately on
1133
1481
  // the NEXT reply without waiting for the quiet window. N defaults to 20
1134
1482
  // which is plenty for normal multi-message replies but caps runaway chains.
1135
- target.replyCount = (target.replyCount || 0) + 1;
1136
1483
  if (target.quietTimer) clearTimeout(target.quietTimer);
1137
1484
  if (target.replyCount >= this.maxRepliesPerTurn) {
1138
1485
  // Skip the quiet-window — resolve right away with whatever we've got.
@@ -1145,6 +1492,318 @@ class CliProcess extends Process {
1145
1492
  }
1146
1493
  }
1147
1494
 
1495
+ // ─── 0.13 D2: InputLedger ──────────────────────────────────────────
1496
+
1497
+ _ledgerAdd(turnId, { source, msgId = null } = {}) {
1498
+ this.inputLedger.set(turnId, {
1499
+ turnId,
1500
+ source,
1501
+ msgId: msgId != null ? String(msgId) : null,
1502
+ chatId: this.chatId,
1503
+ writtenAt: Date.now(),
1504
+ state: 'written',
1505
+ _dropTimer: null,
1506
+ _watchdogTimer: null,
1507
+ _rewritten: false,
1508
+ });
1509
+ // Bounded: prune terminal entries first, then the oldest.
1510
+ if (this.inputLedger.size > INPUT_LEDGER_CAP) {
1511
+ let victim = null;
1512
+ for (const [id, e] of this.inputLedger) {
1513
+ if (e.state !== 'written' && e.state !== 'seen') { victim = id; break; }
1514
+ if (!victim) victim = id;
1515
+ }
1516
+ if (victim) this._ledgerDelete(victim);
1517
+ }
1518
+ }
1519
+
1520
+ _ledgerDelete(turnId) {
1521
+ const e = this.inputLedger.get(turnId);
1522
+ if (!e) return;
1523
+ if (e._dropTimer) clearTimeout(e._dropTimer);
1524
+ if (e._watchdogTimer) clearTimeout(e._watchdogTimer);
1525
+ this.inputLedger.delete(turnId);
1526
+ }
1527
+
1528
+ /** Transition + cancel the entry's timers (a seen/resolved entry can never drop or re-write). */
1529
+ _ledgerTransition(turnId, state) {
1530
+ const e = this.inputLedger.get(turnId);
1531
+ if (!e) return;
1532
+ e.state = state;
1533
+ if (e._dropTimer) { clearTimeout(e._dropTimer); e._dropTimer = null; }
1534
+ if (e._watchdogTimer) { clearTimeout(e._watchdogTimer); e._watchdogTimer = null; }
1535
+ }
1536
+
1537
+ /** Tier 2C: a reply carried consumed_turn_ids — acknowledge every known id. */
1538
+ _ledgerAckConsumed(ids) {
1539
+ this._lastAckFieldAt = Date.now();
1540
+ for (const id of ids) {
1541
+ const e = this.inputLedger.get(id);
1542
+ if (e && e.state !== 'resolved') {
1543
+ this._ledgerTransition(id, 'resolved');
1544
+ this._logEvent('cli-input-acked', { turn_id: id, source: e.source });
1545
+ }
1546
+ // UMI 2026-06-11 19:49 false ⏱ timeout: when claude answers a
1547
+ // primary+fold in ONE reply but echoes the FOLD's turn_id, the reply
1548
+ // routes via late-reply correlation and the PRIMARY pending absorbs
1549
+ // nothing — yet this ack names the primary. Mark it consumed so the
1550
+ // finalizer rungs treat it as replied (resolve already-delivered)
1551
+ // instead of rejecting it at a ceiling AFTER the user got the answer.
1552
+ const pending = this.pendingTurns.get(id);
1553
+ if (pending) {
1554
+ pending._consumedAcked = true;
1555
+ // The ack itself flips rung-2 eligibility on — arm now. (The turn's
1556
+ // last _noteActivity ran BEFORE this flag was set, so without this
1557
+ // a quiet tail would never re-arm and the turn would sit until a
1558
+ // ceiling.)
1559
+ this._armActivityQuiet(id, pending);
1560
+ }
1561
+ }
1562
+ }
1563
+
1564
+ _clearLedgerTimers() {
1565
+ for (const e of this.inputLedger.values()) {
1566
+ if (e._dropTimer) { clearTimeout(e._dropTimer); e._dropTimer = null; }
1567
+ if (e._watchdogTimer) { clearTimeout(e._watchdogTimer); e._watchdogTimer = null; }
1568
+ }
1569
+ }
1570
+
1571
+ /**
1572
+ * D2 drop detection, armed at every cycle end for non-primary entries still
1573
+ * 'written'. The confirm window exists because a non-folded inject legally
1574
+ * queues claude-side and is picked up as the NEXT cycle (its UPS then
1575
+ * cancels this); only entries nobody ever picked up or acknowledged drop.
1576
+ */
1577
+ _armDropConfirmSweep() {
1578
+ for (const [id, entry] of this.inputLedger) {
1579
+ if (entry.state !== 'written') continue;
1580
+ if (entry.source === 'primary') continue; // pending lifecycle + delivery watchdog govern primaries
1581
+ if (entry._dropTimer) continue;
1582
+ entry._dropTimer = setTimeout(() => this._dropConfirmFire(id), this.dropConfirmMs);
1583
+ entry._dropTimer.unref?.();
1584
+ }
1585
+ }
1586
+
1587
+ _dropConfirmFire(turnId) {
1588
+ const entry = this.inputLedger.get(turnId);
1589
+ if (!entry || entry.state !== 'written') return;
1590
+ entry._dropTimer = null;
1591
+ // System/anonymous pushes are never auto-redelivered — resolve quietly.
1592
+ if (entry.source === 'system' || entry.source === 'inject') {
1593
+ this._ledgerTransition(turnId, 'resolved');
1594
+ this._logEvent('cli-input-unconfirmed', { turn_id: turnId, source: entry.source });
1595
+ return;
1596
+ }
1597
+ // Supersession: the user re-sent / moved on — a newer primary was picked
1598
+ // up after this entry was written. Redelivering the stale one would
1599
+ // double-answer the same intent.
1600
+ for (const e of this.inputLedger.values()) {
1601
+ if (e.source === 'primary' && e.writtenAt > entry.writtenAt
1602
+ && (e.state === 'seen' || e.state === 'resolved')) {
1603
+ this._ledgerTransition(turnId, 'superseded');
1604
+ this._logEvent('input-superseded', { turn_id: turnId, msg_id: entry.msgId });
1605
+ return;
1606
+ }
1607
+ }
1608
+ // Contract discriminator: if NO reply since this entry carried the
1609
+ // consumed_turn_ids field, the model ignored the contract this cycle — a
1610
+ // fold is then indistinguishable from a drop, and redelivering folds
1611
+ // double-answers the COMMON case (the inversion that killed the A1 spec).
1612
+ // Park as fold-suspected (telemetry; the soak's anomaly signal).
1613
+ if (!(this._lastAckFieldAt >= entry.writtenAt)) { // >= : same-ms ack still proves the contract mode
1614
+ this._ledgerTransition(turnId, 'fold-suspected');
1615
+ this._logEvent('input-fold-suspected', { turn_id: turnId, msg_id: entry.msgId, source: entry.source });
1616
+ return;
1617
+ }
1618
+ this._ledgerTransition(turnId, 'dropped');
1619
+ this._logEvent('input-dropped', { turn_id: turnId, msg_id: entry.msgId, source: entry.source });
1620
+ this.emit('input-dropped', {
1621
+ turnId, msgId: entry.msgId, chatId: entry.chatId, source: entry.source,
1622
+ });
1623
+ }
1624
+
1625
+ /**
1626
+ * D2 primary-delivery watchdog (KI-drop's missing half — the channel-bind
1627
+ * race drops a user_msg before claude's subscription is live). Fire logic:
1628
+ * - entry seen / turn settled → done (timer was already cancelled).
1629
+ * - ANY session activity since dispatch (hooks, pane heartbeat, bridge
1630
+ * tool calls) → claude is busy (likely a foreign cycle; the queued
1631
+ * pickup is legitimately deferred) → extend, NEVER re-write (round-2
1632
+ * panel: re-writes against a busy session double-prompt it).
1633
+ * - total silence → ONE re-write of the SAME envelope (idempotent:
1634
+ * never seen + zero activity ⇒ claude never had it — the rc.25
1635
+ * argument, properly scoped); still silence after that → bridge
1636
+ * teardown onto the existing bridge-disconnected recovery path.
1637
+ */
1638
+ _armDeliveryWatchdog(turnId, pending) {
1639
+ const entry = this.inputLedger.get(turnId);
1640
+ if (!entry) return;
1641
+ entry._watchdogTimer = setTimeout(() => this._deliveryWatchdogFire(turnId, pending), this.deliveryWatchdogMs);
1642
+ entry._watchdogTimer.unref?.();
1643
+ }
1644
+
1645
+ _deliveryWatchdogFire(turnId, pending) {
1646
+ const entry = this.inputLedger.get(turnId);
1647
+ if (!entry || entry.state !== 'written') return;
1648
+ if (!this.pendingTurns.has(turnId)) return; // settled some other way
1649
+ entry._watchdogTimer = null;
1650
+ const activitySince = Math.max(this._lastActivityAt, this._lastHookEventAt) >= entry.writtenAt
1651
+ && Math.max(this._lastActivityAt, this._lastHookEventAt) > 0;
1652
+ if (activitySince) {
1653
+ this._armDeliveryWatchdog(turnId, pending); // busy — extend the window
1654
+ return;
1655
+ }
1656
+ if (!entry._rewritten) {
1657
+ entry._rewritten = true;
1658
+ this._logEvent('cli-delivery-rewrite', { turn_id: turnId });
1659
+ if (pending._userMsgPayload) this._writeToBridge(pending._userMsgPayload);
1660
+ this._armDeliveryWatchdog(turnId, pending);
1661
+ return;
1662
+ }
1663
+ this._logEvent('cli-delivery-watchdog-escalate', { turn_id: turnId });
1664
+ if (this.bridgeServer?.destroyConnection) this.bridgeServer.destroyConnection();
1665
+ }
1666
+
1667
+ /**
1668
+ * 0.13 D1: note same-session activity — the heartbeat of the finalizer ladder
1669
+ * (docs/0.13-channels-lifecycle-design.md §3 D1). Supersedes the 0.12
1670
+ * `_extendQuietOnToolActivity` (the WA-topic point fix): instead of pushing a
1671
+ * 2s reply-quiet window around, activity now drives three things per pending:
1672
+ *
1673
+ * 1. The idle ceiling resets (pre-D1 semantics preserved — a long
1674
+ * tool-heavy turn isn't idle-killed).
1675
+ * 2. HOOKS-LIVE sessions: an attributed-Stop grace in flight is CANCELLED —
1676
+ * Stop arrives via the ndjson tail with 250ms–5s lag, so a foreign
1677
+ * cycle's lagged Stop can land after this turn's fast first pickup;
1678
+ * activity proves claude is still working and the Stop was stale. The
1679
+ * legacy reply-quiet timer (rung 3) is likewise superseded the moment
1680
+ * hooks go live mid-turn. The activity-quiet window (rung 2) re-arms.
1681
+ * 3. HOOK-NEVER-ALIVE sessions (rung 3): the pre-D1 reply-quiet re-arm,
1682
+ * byte-identical.
1683
+ *
1684
+ * Callers: every hook event except Stop, the pane "esc to interrupt"
1685
+ * thinking heartbeat, bridge tool calls, delivered replies, the question
1686
+ * keep-alive, and question answers.
1687
+ */
1688
+ _noteActivity(source = 'activity') {
1689
+ this._lastActivityAt = Date.now();
1690
+ for (const [turnId, pending] of this.pendingTurns) {
1691
+ // Idle ceiling: activity IS activity.
1692
+ if (pending.hardTimer) {
1693
+ clearTimeout(pending.hardTimer);
1694
+ pending.hardTimer = setTimeout(() => pending._fireTimeout?.('idle'), this.turnTimeoutMs);
1695
+ }
1696
+ if (this._sawHookStream) {
1697
+ if (pending._stopGracePending) this._cancelStopGrace(turnId, pending, source);
1698
+ if (pending.quietTimer) { clearTimeout(pending.quietTimer); pending.quietTimer = null; }
1699
+ this._armActivityQuiet(turnId, pending);
1700
+ } else if (pending._stopGracePending) {
1701
+ // Legacy grace (resolveTurn's wait-for-Stop) — never revived/cancelled
1702
+ // by activity; identical to pre-D1.
1703
+ continue;
1704
+ } else if (pending.quietTimer) {
1705
+ clearTimeout(pending.quietTimer);
1706
+ pending.quietTimer = setTimeout(() => this._resolveTurn(turnId), this.turnQuietMs);
1707
+ }
1708
+ }
1709
+ }
1710
+
1711
+ /**
1712
+ * D1 rung 2: arm/refresh the activity-quiet finalize for one pending.
1713
+ * Preconditions: hooks live, ≥1 delivered reply (a reply-less turn ends via
1714
+ * rung 1 or the ceilings), no open question (waiting-on-user suspends the
1715
+ * clock — claude is legitimately silent), and no rung-1 grace in flight.
1716
+ */
1717
+ _armActivityQuiet(turnId, pending) {
1718
+ if (!this._sawHookStream) return;
1719
+ // ≥1 reply, OR seen + consumed-acked (the answer rode a sibling turn_id —
1720
+ // fold-id echo; see _ledgerAckConsumed). Same eligibility as the fire site.
1721
+ if ((!pending.replies || pending.replies.length === 0)
1722
+ && !(pending.seen === true && pending._consumedAcked === true)) return;
1723
+ if (this._openQuestions.size > 0) return;
1724
+ if (pending._stopGracePending) return;
1725
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer);
1726
+ pending._activityQuietTimer = setTimeout(() => this._activityQuietFinalize(turnId), this.activityQuietMs);
1727
+ pending._activityQuietTimer.unref?.();
1728
+ }
1729
+
1730
+ /** D1: suspend rung 2 for all pendings (an `ask` just opened — waiting on the user). */
1731
+ _suspendActivityQuiet() {
1732
+ for (const [, pending] of this.pendingTurns) {
1733
+ if (pending._activityQuietTimer) {
1734
+ clearTimeout(pending._activityQuietTimer);
1735
+ pending._activityQuietTimer = null;
1736
+ }
1737
+ }
1738
+ }
1739
+
1740
+ /**
1741
+ * D1 rung 2 fire: the whole activity surface (hooks + pane heartbeat + bridge
1742
+ * tool calls) has been quiet for activityQuietMs on a replied turn — the tail
1743
+ * is over (Stop was lost, foreign, or the hook stream died mid-session; the
1744
+ * pre-D1 `_sawHookStream` one-way boolean left that last class with NO
1745
+ * finalizer until a 10-min TURN_TIMEOUT *rejection* after a delivered answer).
1746
+ */
1747
+ _activityQuietFinalize(turnId) {
1748
+ const pending = this.pendingTurns.get(turnId);
1749
+ if (!pending) return;
1750
+ if (pending._stopGracePending) return;
1751
+ if (this._openQuestions.size > 0) return; // re-check at fire time
1752
+ // Eligibility: ≥1 bound reply, OR seen + consumed-acked (the answer went
1753
+ // out under a sibling turn_id — fold-id echo; see _ledgerAckConsumed).
1754
+ const consumedAcked = pending.seen === true && pending._consumedAcked === true;
1755
+ if ((!pending.replies || pending.replies.length === 0) && !consumedAcked) return;
1756
+ const lastHookAgeMs = this._lastHookEventAt ? Date.now() - this._lastHookEventAt : null;
1757
+ this._logEvent('cli-activity-quiet-finalize', {
1758
+ turn_id: turnId,
1759
+ reply_count: pending.replies.length,
1760
+ consumed_acked: consumedAcked,
1761
+ last_hook_age_ms: lastHookAgeMs,
1762
+ had_stop: !!pending._stopHookData,
1763
+ });
1764
+ if (lastHookAgeMs != null && lastHookAgeMs >= this.activityQuietMs) {
1765
+ // A previously-live hook stream went quiet enough that rung 2 (not an
1766
+ // attributed Stop) ended the turn — the soak's mid-session-death signal.
1767
+ this._logEvent('cli-hook-stream-stalled', { turn_id: turnId, last_hook_age_ms: lastHookAgeMs });
1768
+ }
1769
+ this._finalizeTurn(turnId);
1770
+ }
1771
+
1772
+ /**
1773
+ * D1 rung 1: an attributed Stop (the pending was `seen` at pickup, or has
1774
+ * ≥1 turn_id-bound reply) finalizes through a short grace that any
1775
+ * subsequent same-session activity cancels (see _noteActivity #2).
1776
+ */
1777
+ _beginAttributedStopGrace(turnId, pending, info) {
1778
+ pending._stopHookData = info;
1779
+ pending._stopGracePending = true;
1780
+ if (pending._activityQuietTimer) {
1781
+ clearTimeout(pending._activityQuietTimer);
1782
+ pending._activityQuietTimer = null;
1783
+ }
1784
+ pending._stopGraceTimer = setTimeout(() => {
1785
+ pending._stopGraceTimer = null;
1786
+ pending._stopGracePending = false;
1787
+ this._logEvent('cli-turn-resolved-by-stop', {
1788
+ turn_id: turnId,
1789
+ reply_count: pending.replies?.length || 0,
1790
+ via_text_fallback: (pending.replies?.length || 0) === 0,
1791
+ attributed: pending.seen === true ? 'seen' : 'reply-bound',
1792
+ session_id: this.claudeSessionId,
1793
+ });
1794
+ this._finalizeTurn(turnId);
1795
+ }, this.stopGraceMs);
1796
+ pending._stopGraceTimer.unref?.();
1797
+ }
1798
+
1799
+ /** D1: cancel a stop-grace (rung 1 stale-Stop, or a superseded legacy grace). */
1800
+ _cancelStopGrace(turnId, pending, source) {
1801
+ if (pending._stopGraceTimer) { clearTimeout(pending._stopGraceTimer); pending._stopGraceTimer = null; }
1802
+ if (pending._onStop) { this.off('stop-hook', pending._onStop); pending._onStop = null; }
1803
+ pending._stopGracePending = false;
1804
+ this._logEvent('cli-stop-grace-cancelled', { turn_id: turnId, source });
1805
+ }
1806
+
1148
1807
  // 0.12 Phase 1.7 (Finding 0.1.A): two-step turn resolution.
1149
1808
  // _resolveTurn — entry point called by channel-result OR quiet-window
1150
1809
  // expiry. Schedules a stopGraceMs window during which
@@ -1223,6 +1882,9 @@ class CliProcess extends Process {
1223
1882
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1224
1883
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1225
1884
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1885
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer); // 0.13 D1
1886
+ if (pending._onStop) { this.off('stop-hook', pending._onStop); pending._onStop = null; }
1887
+ const hadReplyToolCalls = pending.replies.length > 0;
1226
1888
  let text = pending.replies.join('\n\n');
1227
1889
  // 0.12 Phase 1.7 fallback: if no reply tool calls landed (claude ended
1228
1890
  // the turn without calling mcp__polygram-bridge__reply), use the Stop
@@ -1240,12 +1902,17 @@ class CliProcess extends Process {
1240
1902
  // to appear free in dashboards.
1241
1903
  const result = {
1242
1904
  text,
1243
- // Review F#2: dispatcher has ALREADY delivered text to Telegram on each
1244
- // reply tool call (incremental real-time UX is the channels delivery
1245
- // model). polygram.js's post-pm.send pipeline must short-circuit its
1246
- // streamer.finalize / deliverReplies branch otherwise every turn
1247
- // delivers twice. Logging + DB transcript still use result.text.
1248
- alreadyDelivered: true,
1905
+ // Review F#2: when claude used reply tool calls, the dispatcher ALREADY
1906
+ // delivered that text to Telegram incrementally — polygram.js must
1907
+ // short-circuit its deliverReplies branch or every turn delivers twice.
1908
+ // BUT a turn finalized via the Stop fallback (no reply tool calls — the
1909
+ // stuck-turn case) has delivered NOTHING; marking it alreadyDelivered
1910
+ // would resolve the turn silently and the user still sees nothing. So
1911
+ // only claim already-delivered when reply tool calls actually fired —
1912
+ // or when claude ACKED consuming this turn in a sibling reply
1913
+ // (consumed_turn_ids; the fold-id-echo case): re-sending the Stop
1914
+ // fallback there would duplicate the delivered answer.
1915
+ alreadyDelivered: hadReplyToolCalls || pending._consumedAcked === true,
1249
1916
  sessionId: this.claudeSessionId,
1250
1917
  cost: null, // Channels protocol doesn't expose per-turn cost
1251
1918
  duration,
@@ -1261,6 +1928,12 @@ class CliProcess extends Process {
1261
1928
  },
1262
1929
  };
1263
1930
  this.inFlight = this.pendingTurns.size > 0;
1931
+ // 0.13 D2: the finalized cycle resolves its own ledger entry; any
1932
+ // non-primary entries still 'written' enter the drop-confirm window
1933
+ // (a late next-cycle pickup or ack cancels; otherwise dropped /
1934
+ // fold-suspected / superseded — see _dropConfirmFire).
1935
+ this._ledgerTransition(turnId, 'resolved');
1936
+ this._armDropConfirmSweep();
1264
1937
  pending.resolve(result);
1265
1938
  this.emit('result', { subtype: 'success' }, { streamText: text });
1266
1939
  this.emit('idle');
@@ -1310,6 +1983,9 @@ class CliProcess extends Process {
1310
1983
  if (oldest.quietTimer) clearTimeout(oldest.quietTimer);
1311
1984
  if (oldest.hardTimer) clearTimeout(oldest.hardTimer);
1312
1985
  if (oldest.absoluteTimer) clearTimeout(oldest.absoluteTimer);
1986
+ if (oldest._stopGraceTimer) clearTimeout(oldest._stopGraceTimer);
1987
+ if (oldest._activityQuietTimer) clearTimeout(oldest._activityQuietTimer); // 0.13 D1
1988
+ if (oldest._onStop) this.off('stop-hook', oldest._onStop);
1313
1989
  const dropErr = new Error('queue overflow — oldest pending evicted');
1314
1990
  dropErr.code = 'QUEUE_OVERFLOW';
1315
1991
  try { oldest.reject(dropErr); } catch {}
@@ -1348,6 +2024,15 @@ class CliProcess extends Process {
1348
2024
  const fireTimeout = (reason) => {
1349
2025
  if (!this.pendingTurns.has(turnId)) return;
1350
2026
  const pending = this.pendingTurns.get(turnId);
2027
+ // 0.13 D1 (S9): unblock any open ask FIRST — claude must never stay
2028
+ // hung on a question whose turn we are about to end. The card cleanup
2029
+ // stays with the question sweep; this only resolves the blocking tool.
2030
+ if (this._openQuestions.size > 0) {
2031
+ for (const tc of [...this._openQuestions]) {
2032
+ this._logEvent('cli-question-timedout-at-ceiling', { tool_call_id: tc, reason });
2033
+ try { this.writeQuestionAnswer(tc, { timedout: true }); } catch { /* best-effort */ }
2034
+ }
2035
+ }
1351
2036
  this.pendingTurns.delete(turnId);
1352
2037
  const idx = this.pendingQueue.findIndex(e => e.turnId === turnId);
1353
2038
  if (idx >= 0) this.pendingQueue.splice(idx, 1);
@@ -1355,8 +2040,44 @@ class CliProcess extends Process {
1355
2040
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1356
2041
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1357
2042
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
2043
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer);
2044
+ if (pending._onStop) this.off('stop-hook', pending._onStop);
1358
2045
  this.inFlight = this.pendingTurns.size > 0;
1359
2046
  const turnTimeoutMs = reason === 'absolute' ? this.turnAbsoluteMs : (opts.maxTurnMs || this.turnTimeoutMs);
2047
+
2048
+ // 0.13 D1 ceiling-resolve: a ceiling expiring on a turn with delivered
2049
+ // replies RESOLVES it — the user already has their answer; rejecting
2050
+ // would send a scary timeout error AFTER a successful reply (round-2
2051
+ // panel finding: the v2 soak gate contradicted the design's own
2052
+ // ask-timeout-then-ceiling path). TURN_TIMEOUT rejection is reserved
2053
+ // for turns with ZERO delivered replies. Consumed-acked counts as
2054
+ // replied: the answer rode a sibling turn_id (fold-id echo — the UMI
2055
+ // 2026-06-11 19:49 false ⏱; see _ledgerAckConsumed).
2056
+ if ((pending.replies?.length || 0) > 0
2057
+ || (pending.seen === true && pending._consumedAcked === true)) {
2058
+ this._logEvent('cli-turn-ceiling-resolved', {
2059
+ reason, turnTimeoutMs, reply_count: pending.replies?.length || 0,
2060
+ consumed_acked: pending._consumedAcked === true,
2061
+ });
2062
+ this.emit('idle');
2063
+ resolve({
2064
+ text: pending.replies.join('\n\n'),
2065
+ alreadyDelivered: true,
2066
+ sessionId: this.claudeSessionId,
2067
+ cost: null,
2068
+ duration: Date.now() - pending.startedAt,
2069
+ error: null,
2070
+ metrics: {
2071
+ inputTokens: null, outputTokens: null,
2072
+ cacheCreationTokens: null, cacheReadTokens: null,
2073
+ numAssistantMessages: pending.replies.length,
2074
+ numToolUses: null,
2075
+ resultSubtype: 'success',
2076
+ },
2077
+ });
2078
+ return;
2079
+ }
2080
+
1360
2081
  this.emit('turn-timeout', {
1361
2082
  reason,
1362
2083
  turnTimeoutMs,
@@ -1372,24 +2093,43 @@ class CliProcess extends Process {
1372
2093
  const pending = {
1373
2094
  resolve, reject,
1374
2095
  replies: [],
2096
+ // 0.13 D1: pickup marker — set when a UserPromptSubmit prompt carries
2097
+ // this turn's envelope (the seen-slice). Rung 1's Stop attribution.
2098
+ seen: false,
1375
2099
  quietTimer: null,
1376
- // hardTimer = idle ceiling. Resets on each reply in
1377
- // _recordReplyForPendingTurn so a chatty turn (replies every 60s)
1378
- // doesn't get killed at 10 min wall-clock.
2100
+ _activityQuietTimer: null,
2101
+ // hardTimer = idle ceiling. Resets on any activity (_noteActivity)
2102
+ // so a chatty or tool-heavy turn isn't killed at 10 min wall-clock.
1379
2103
  hardTimer: setTimeout(() => fireTimeout('idle'), opts.maxTurnMs || this.turnTimeoutMs),
1380
2104
  // absoluteTimer = wall-clock ceiling. Does NOT reset. Bounds true
1381
2105
  // runaways. 30 min default — high enough that legitimate
1382
2106
  // multi-step refactors complete, low enough to catch infinite
1383
2107
  // chatter.
1384
2108
  absoluteTimer: setTimeout(() => fireTimeout('absolute'), this.turnAbsoluteMs),
1385
- // Review F#13: attach fireTimeout so _recordReplyForPendingTurn can
1386
- // reset the idle timer (creates a fresh setTimeout with the same
1387
- // reject closure).
2109
+ // Review F#13: attach fireTimeout so activity can reset the idle
2110
+ // timer (creates a fresh setTimeout with the same closure).
1388
2111
  _fireTimeout: fireTimeout,
1389
2112
  startedAt: Date.now(),
1390
2113
  };
1391
2114
  this.pendingTurns.set(turnId, pending);
1392
2115
 
2116
+ // 0.13 D1 (§1.4): the single-active-cycle invariant is enforced by the
2117
+ // daemon's stdinLock (held across the full turn) — CliProcess can't see
2118
+ // the lock, so a second concurrent pending means a caller bypassed the
2119
+ // contract. Loud assertion telemetry; the drop-rather-than-misattribute
2120
+ // defenses (reply routing, Stop attribution) remain the failure mode.
2121
+ if (this.pendingTurns.size > 1) {
2122
+ this.logger.warn?.(
2123
+ `[${this.label}] cli: ${this.pendingTurns.size} concurrent pending turns — stdinLock contract violated upstream`,
2124
+ );
2125
+ this._logEvent('cli-multi-pending-assert', { pending_count: this.pendingTurns.size });
2126
+ }
2127
+
2128
+ // 0.13 D2: ledger the primary + keep the exact envelope for the delivery
2129
+ // watchdog's idempotent re-write (the pending owns it — no text in the
2130
+ // ledger, events stay content-free per L13).
2131
+ this._ledgerAdd(turnId, { source: 'primary', msgId: opts.context?.sourceMsgId });
2132
+
1393
2133
  // Review F#18: bridge-disconnect TOCTOU. The bridgeReady check at
1394
2134
  // top of send() can race the bridge socket close. If the bridge
1395
2135
  // dies between check and write, _writeToBridge silently no-ops (it
@@ -1397,14 +2137,16 @@ class CliProcess extends Process {
1397
2137
  // pending entry sits with no live bridge until hardTimer (10 min).
1398
2138
  // Pass the actual write result back and reject immediately on
1399
2139
  // failure so the caller sees a fast, code-tagged error.
1400
- const wrote = this._writeToBridge({
2140
+ pending._userMsgPayload = {
1401
2141
  kind: 'user_msg',
1402
2142
  turn_id: turnId,
1403
2143
  text: prompt,
1404
2144
  chat_id: this.chatId,
1405
2145
  user: opts.context?.user || '',
1406
2146
  msg_id: opts.context?.sourceMsgId || '',
1407
- });
2147
+ };
2148
+ const wrote = this._writeToBridge(pending._userMsgPayload);
2149
+ if (wrote) this._armDeliveryWatchdog(turnId, pending);
1408
2150
  if (!wrote) {
1409
2151
  this.pendingTurns.delete(turnId);
1410
2152
  const qIdx = this.pendingQueue.findIndex(e => e.turnId === turnId);
@@ -1423,6 +2165,13 @@ class CliProcess extends Process {
1423
2165
  async interrupt() {
1424
2166
  if (this.closed) return;
1425
2167
  if (!this.tmuxSession) return;
2168
+ // Cancel-cheap C2 (spec Finding 7): a cancel is already in flight — a
2169
+ // SECOND C-c would land at the now-idle prompt, which is claude's exit
2170
+ // chord ("press ctrl+c again to exit") and would convert the cheap cancel
2171
+ // into an accidental process exit. Also: resetting the grace timer would
2172
+ // DELAY the synthetic resolution for a user double-tapping "stop".
2173
+ // Idempotent no-op instead.
2174
+ if (this._interruptGraceTimer) return;
1426
2175
  // tmux SIGINT — hard interrupt for the running turn.
1427
2176
  try {
1428
2177
  await this.runner.sendControl(this.tmuxSession, 'C-c');
@@ -1433,18 +2182,47 @@ class CliProcess extends Process {
1433
2182
  this.emit('interrupt-applied', { backend: this.backend });
1434
2183
  this._logEvent('interrupt-applied', {});
1435
2184
 
2185
+ // Cancel-cheap C1 — the spec's O2 BLOCKER: the cancelled work's inputs
2186
+ // must never re-deliver. The grace below synthesizes the resolution
2187
+ // WITHOUT _finalizeTurn, so without this, an autosteer/fold entry stays
2188
+ // 'written' and a LATER cycle-end sweep declares it dropped →
2189
+ // drop-redeliver re-injects the user's CANCELLED message minutes later.
2190
+ // 'cancelled' is terminal: the sweep only targets 'written', and
2191
+ // _ledgerTransition clears the entry's drop/watchdog timers.
2192
+ for (const [id, e] of this.inputLedger) {
2193
+ if (e.state === 'written' || e.state === 'seen') {
2194
+ this._ledgerTransition(id, 'cancelled');
2195
+ this._logEvent('cli-input-cancelled', { turn_id: id, source: e.source });
2196
+ }
2197
+ }
2198
+
1436
2199
  // Review P3 C8: after Ctrl-C, Claude may or may not call reply with an
1437
2200
  // "I was interrupted" message. If it doesn't (5s grace), resolve pending
1438
2201
  // turns with subtype 'interrupted' instead of letting them wait the full
1439
- // 10-min hardTimer. The grace window is reset if a new interrupt fires.
1440
- if (this._interruptGraceTimer) clearTimeout(this._interruptGraceTimer);
2202
+ // 10-min hardTimer.
2203
+ //
2204
+ // C4 BLOCKER (review 2026-06-12): SNAPSHOT the turns that were in flight at
2205
+ // C-c time and resolve ONLY those. The cancelled turn often finalizes
2206
+ // cleanly DURING the grace (claude acks the C-c) and the user then starts a
2207
+ // NEW turn — the "stop, then redirect" flow cheap-cancel exists for. Without
2208
+ // the snapshot the stale grace iterated pendingTurns LIVE and silently
2209
+ // resolved that fresh turn as 'interrupted' (lost). send() doesn't clear the
2210
+ // grace, so the snapshot is the fix.
2211
+ const interruptedTurnIds = new Set(this.pendingTurns.keys());
1441
2212
  this._interruptGraceTimer = setTimeout(() => {
1442
2213
  let resolvedAny = false;
1443
2214
  for (const [turnId, pending] of this.pendingTurns) {
2215
+ if (!interruptedTurnIds.has(turnId)) continue; // only the turns in flight at C-c
1444
2216
  // Synthesize an interrupted resolution: empty text, 'interrupted' subtype.
2217
+ // Cancel-cheap C3: clear ALL per-pending machinery (mirrors
2218
+ // _finalizeTurn) — stray timers/listeners on the kept-warm proc are
2219
+ // exactly what the cheap-cancel design must not leak.
1445
2220
  if (pending.quietTimer) clearTimeout(pending.quietTimer);
1446
2221
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1447
- if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
2222
+ if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
2223
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
2224
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer);
2225
+ if (pending._onStop) { this.off('stop-hook', pending._onStop); pending._onStop = null; }
1448
2226
  this.pendingTurns.delete(turnId);
1449
2227
  const qIdx = this.pendingQueue.findIndex(e => e.turnId === turnId);
1450
2228
  if (qIdx >= 0) this.pendingQueue.splice(qIdx, 1);
@@ -1502,7 +2280,7 @@ class CliProcess extends Process {
1502
2280
  */
1503
2281
  async probeBusyState() {
1504
2282
  const base = {
1505
- busy: false, streaming: false,
2283
+ busy: false, streaming: false, backgroundShell: false, shellCount: 0,
1506
2284
  inFlight: this.inFlight, pendingTurns: this.pendingTurns.size,
1507
2285
  captured: false, paneTail: null,
1508
2286
  };
@@ -1518,10 +2296,23 @@ class CliProcess extends Process {
1518
2296
  }
1519
2297
  if (!pane) return base;
1520
2298
  const streaming = STREAMING_HINT_RE.test(pane);
2299
+ // Background-shell count from the TUI mode line. Match only the captured
2300
+ // TAIL (the mode line lives at the bottom of the viewport) so a `· N shell ·`
2301
+ // string scrolled off into history can't trip a stale false-positive — see
2302
+ // BACKGROUND_SHELL_RE. A detached `run_in_background` Bash that outlived its
2303
+ // turn shows here even while claude is idle and not streaming.
2304
+ const m = pane.slice(-400).match(BACKGROUND_SHELL_RE);
2305
+ const shellCount = m ? parseInt(m[1], 10) : 0;
2306
+ const backgroundShell = shellCount > 0;
1521
2307
  return {
1522
2308
  ...base,
2309
+ // `busy` stays streaming-only — it is the abort path's "is claude working a
2310
+ // turn" signal and must not change behaviour. Background-shell liveness is a
2311
+ // separate axis the stall-watchdog reads via `backgroundShell`/`shellCount`.
1523
2312
  busy: streaming,
1524
2313
  streaming,
2314
+ backgroundShell,
2315
+ shellCount,
1525
2316
  captured: true,
1526
2317
  paneTail: pane.slice(-200),
1527
2318
  };
@@ -1533,6 +2324,137 @@ class CliProcess extends Process {
1533
2324
  return busy;
1534
2325
  }
1535
2326
 
2327
+ /**
2328
+ * Does this session have a detached background shell running RIGHT NOW — a
2329
+ * `run_in_background` Bash that may have outlived its turn? Thin probe over
2330
+ * probeBusyState's background-shell signal; the stall-watchdog's input.
2331
+ * @returns {Promise<{live:boolean, count:number}>}
2332
+ */
2333
+ async hasLiveBackgroundWork() {
2334
+ const { backgroundShell, shellCount } = await this.probeBusyState();
2335
+ return { live: backgroundShell, count: shellCount };
2336
+ }
2337
+
2338
+ /**
2339
+ * LRU eviction pin (0.12.0 spec). Cached read of `_bgWorkSince` — the idle bg-work
2340
+ * watchdog state maintained by `_pollBackgroundWork` on the ≤5s pong tick. Non-null ⟺ a
2341
+ * detached background shell has been observed while idle. No time cap: a job that runs for
2342
+ * hours stays pinned (elapsed time can't tell "slow-but-progressing" from "stuck"). Cheap,
2343
+ * sync — safe to call from `_evictLRU`.
2344
+ * @returns {boolean}
2345
+ */
2346
+ hasActiveBackgroundWork() {
2347
+ return this._bgWorkSince !== null;
2348
+ }
2349
+
2350
+ /**
2351
+ * Resolve the model / effort for a spawn context using the topic→chat→
2352
+ * fallback precedence (mirrors the spawn path). Single source of truth shared
2353
+ * by start() (which records this.model / this.effort) and wouldReloadFor()
2354
+ * (which compares the current config to those spawn-time values).
2355
+ */
2356
+ _resolveModel(opts) {
2357
+ const topicConfig = opts.threadId && opts.chatConfig?.topics?.[opts.threadId];
2358
+ return topicConfig?.model || opts.chatConfig?.model || opts.model;
2359
+ }
2360
+
2361
+ _resolveEffort(opts) {
2362
+ const topicConfig = opts.threadId && opts.chatConfig?.topics?.[opts.threadId];
2363
+ return topicConfig?.effort || opts.chatConfig?.effort || opts.effort;
2364
+ }
2365
+
2366
+ /**
2367
+ * getOrSpawn calls this before reusing a warm proc. cli can't hot-swap model
2368
+ * or effort (spawn-time flags), so when the resolved config has drifted from
2369
+ * what we spawned with AND we are idle, the proc must be killed + cold-
2370
+ * respawned (--resume keeps the conversation; the new --model / --effort takes
2371
+ * effect). In-flight → false: fold the message into the running turn; the
2372
+ * drift reloads on the next idle dispatch. SDK procs apply model live and do
2373
+ * NOT implement this method, so process-manager only reloads when it exists.
2374
+ * @returns {boolean}
2375
+ */
2376
+ wouldReloadFor(spawnContext) {
2377
+ if (this.inFlight || this.closed) return false;
2378
+ return this._resolveModel(spawnContext) !== this.model
2379
+ || this._resolveEffort(spawnContext) !== this.effort;
2380
+ }
2381
+
2382
+ /**
2383
+ * 0.13 D1 (S9): LRU eviction pin — a session blocked on an open `ask` must
2384
+ * not be evicted (the question, and claude's blocked cycle, would die with
2385
+ * it). Belt-and-braces: with D1 the turn stays inFlight through the wait.
2386
+ */
2387
+ hasOpenQuestions() {
2388
+ return this._openQuestions.size > 0;
2389
+ }
2390
+
2391
+ /**
2392
+ * Stall-watchdog for detached background work (0.12.0 background-work
2393
+ * lifecycle, shumorobot Music 7h frozen-Chrome download). Runs on the
2394
+ * pongWatchdog 5s tick but ONLY while the session is IDLE (pendingTurns===0) —
2395
+ * the mirror of _pollMidTurnDialogs, which only runs DURING turns. When a
2396
+ * `run_in_background` Bash outlives its turn and keeps running while claude is
2397
+ * idle for > bgWorkStallMs, nothing tells the agent or user whether it's
2398
+ * progressing or stuck. One read-only self-check re-invokes the agent to
2399
+ * diagnose — via `fireUserMessage`, NOT `injectUserMessage` (which no-ops when
2400
+ * !inFlight, the exact idle state here). Read-only framing matters: the agent
2401
+ * runs bypassPermissions, so an open-ended "fix it" could background another
2402
+ * hung shell unattended.
2403
+ *
2404
+ * Exactly one self-check per continuous background-work window (capped by
2405
+ * `_bgWorkEscalations`); the window resets only when the shell count returns to
2406
+ * 0. Never throws — swallows its own errors so the pong watchdog stays clean.
2407
+ */
2408
+ async _pollBackgroundWork() {
2409
+ if (this.closed || !this.bridgeReady) return;
2410
+ // Only watch while idle. An active turn means the agent is engaged
2411
+ // (_pollMidTurnDialogs owns that path). Crucially we do NOT reset the clock
2412
+ // here — the same shell is still running, so the window persists across a
2413
+ // brief self-check turn rather than restarting and re-pinging every window.
2414
+ if (this.pendingTurns.size > 0) return;
2415
+ let live = false;
2416
+ let count = 0;
2417
+ try {
2418
+ ({ live, count } = await this.hasLiveBackgroundWork());
2419
+ } catch (err) {
2420
+ this.logger.warn?.(`[${this.label}] channels: bg-work probe failed: ${err.message}`);
2421
+ return;
2422
+ }
2423
+ if (!live) {
2424
+ if (this._bgWorkSince !== null) {
2425
+ this._logEvent('cli-bg-work-cleared', { idle_ms: Date.now() - this._bgWorkSince });
2426
+ // Visibility: tear down the status indicator once work clears.
2427
+ if (this._bgWorkStatusShown) {
2428
+ this.emit('bg-work-status', { state: 'cleared' });
2429
+ this._bgWorkStatusShown = false;
2430
+ }
2431
+ }
2432
+ this._bgWorkSince = null;
2433
+ this._bgWorkEscalations = 0;
2434
+ return;
2435
+ }
2436
+ if (this._bgWorkSince === null) {
2437
+ // First idle observation of a live background shell — start the clock AND
2438
+ // raise the visibility indicator so a long job reads as working, not stuck.
2439
+ this._bgWorkSince = Date.now();
2440
+ this._bgWorkEscalations = 0;
2441
+ this._logEvent('cli-bg-work-detected', { shell_count: count });
2442
+ this.emit('bg-work-status', { state: 'running', count });
2443
+ this._bgWorkStatusShown = true;
2444
+ return;
2445
+ }
2446
+ const idleMs = Date.now() - this._bgWorkSince;
2447
+ if (idleMs < this.bgWorkStallMs || this._bgWorkEscalations >= 1) return;
2448
+ const mins = Math.max(1, Math.round(idleMs / 60000));
2449
+ const prompt =
2450
+ `⏳ A background job has been running ~${mins} min with no update. `
2451
+ + `Check its status and report whether it's progressing or stuck. `
2452
+ + `Do NOT start new work, re-run it, or kill anything — report only.`;
2453
+ const fired = this.fireUserMessage(prompt);
2454
+ this._bgWorkEscalations = 1;
2455
+ this._logEvent('cli-bg-work-stall-selfcheck', { idle_ms: idleMs, shell_count: count, fired });
2456
+ }
2457
+
1536
2458
  async kill(reason = 'kill') {
1537
2459
  if (this.closed) return;
1538
2460
  // Parity P19: re-entry guard for concurrent kill() calls. Mirrors
@@ -1606,6 +2528,22 @@ class CliProcess extends Process {
1606
2528
  _handleHookEvent(ev) {
1607
2529
  if (!ev || typeof ev !== 'object') return;
1608
2530
 
2531
+ // rc.16 observability: emit once when the FIRST hook event arrives for
2532
+ // this session, confirming the claude→ndjson→tail pipeline is actually
2533
+ // flowing. The 2026-06-02 stuck turn had a session whose hook ndjson was
2534
+ // 0 bytes — claude emitted no hooks polygram could see, so no Stop ever
2535
+ // arrived to finalize the turn. Without this signal that's invisible: a
2536
+ // turn that hangs with NO `cli-hook-stream-live` for its session means the
2537
+ // hook pipeline is dead for it (distinct from "Stop fired but wasn't
2538
+ // acted on", which `cli-turn-resolved-by-stop` now covers).
2539
+ if (!this._sawHookStream) {
2540
+ this._sawHookStream = true;
2541
+ this._logEvent('cli-hook-stream-live', {
2542
+ session_id: this.claudeSessionId,
2543
+ first_event: ev.type,
2544
+ });
2545
+ }
2546
+
1609
2547
  // 0.12 Phase 1.8 (Finding 0.4.A): per-event lag measurement.
1610
2548
  // polygram_received_at_ms is stamped by the helper subprocess at write
1611
2549
  // time; subtracting from Date.now() gives the helper-write → tail-emit
@@ -1627,11 +2565,57 @@ class CliProcess extends Process {
1627
2565
  });
1628
2566
  }
1629
2567
 
2568
+ // 0.13 D1: every hook event is same-session ACTIVITY for the finalizer
2569
+ // ladder (generalizes the 2026-06-08 WA-topic fix, which only extended on
2570
+ // Pre/PostToolUse) — EXCEPT Stop, which is a terminal signal, not work:
2571
+ // noting it as activity would cancel its own attribution grace. parse-error
2572
+ // and unknown are excluded too (stream noise is not evidence of work).
2573
+ if (ev.type === 'Stop') {
2574
+ this._lastHookEventAt = Date.now();
2575
+ } else if (ev.type && ev.type !== 'parse-error' && ev.type !== 'unknown') {
2576
+ this._lastHookEventAt = Date.now();
2577
+ this._noteActivity(`hook:${ev.type}`);
2578
+ }
2579
+
1630
2580
  switch (ev.type) {
1631
2581
  case 'UserPromptSubmit':
2582
+ // 0.13 D1 seen-slice: the UPS prompt carries the bridge-authored
2583
+ // <channel turn_id="…"> envelope (P0 spike Q1) — parse it (anchored on
2584
+ // the raw tag prefix, see UPS_ENVELOPE_TURN_ID_RE) and mark the
2585
+ // matching pending as picked-up. `seen` is what lets rung 1 tell this
2586
+ // cycle's Stop from a foreign cycle's. Never log prompt content (L13).
2587
+ let anchorMsgId = null;
2588
+ if (typeof ev.prompt === 'string' && ev.prompt) {
2589
+ for (const m of ev.prompt.matchAll(UPS_ENVELOPE_TURN_ID_RE)) {
2590
+ const seenPending = this.pendingTurns.get(m[1]);
2591
+ if (seenPending && seenPending.seen !== true) {
2592
+ seenPending.seen = true;
2593
+ this._logEvent('cli-ups-seen', { turn_id: m[1] });
2594
+ }
2595
+ // 0.13 D2: pickup transitions the ledger entry too — for injected
2596
+ // (no-pending) inputs this is THE fold/next-cycle signal that
2597
+ // cancels drop detection; for primaries it cancels the delivery
2598
+ // watchdog. A late pickup (queued inject becoming the next cycle)
2599
+ // landing inside the drop-confirm window cancels it here.
2600
+ const lEntry = this.inputLedger.get(m[1]);
2601
+ if (lEntry) {
2602
+ if (lEntry.state === 'written' || lEntry.state === 'fold-suspected') {
2603
+ this._ledgerTransition(m[1], 'seen');
2604
+ if (!seenPending) this._logEvent('cli-ups-seen', { turn_id: m[1] });
2605
+ }
2606
+ // 0.13 D3: the picked-up message anchors the cycle's visuals.
2607
+ if (anchorMsgId == null && lEntry.msgId != null) anchorMsgId = lEntry.msgId;
2608
+ }
2609
+ }
2610
+ }
1632
2611
  this.emit('turn-start', {
1633
2612
  backend: this.backend,
1634
2613
  sessionId: this.claudeSessionId,
2614
+ // 0.13 D3: lets the session feedback controller distinguish a
2615
+ // normal turn (has pending — per-turn visuals own it) from an
2616
+ // autonomous/injected cycle (no pending — the controller's job).
2617
+ hasPending: this.pendingTurns.size > 0,
2618
+ anchorMsgId,
1635
2619
  });
1636
2620
  return;
1637
2621
 
@@ -1714,15 +2698,94 @@ class CliProcess extends Process {
1714
2698
  return;
1715
2699
  }
1716
2700
 
1717
- case 'Stop':
1718
- // Phase 1.7 (TODO) will use this as the authoritative turn-end
1719
- // signal with stopGraceMs. For now: pass through as 'stop-hook'
1720
- // event so the resolver in Phase 1.7 can subscribe.
1721
- this.emit('stop-hook', {
2701
+ case 'Stop': {
2702
+ // 0.13 D1 rung 1: Stop ends the turn ONLY when the ending cycle is
2703
+ // attributable to it. Stop carries no turn_id, and claude-side cycles
2704
+ // polygram never registered a pending for are routine (/compact +
2705
+ // bg-work self-checks via fireUserMessage, ScheduleWakeup cycles, a
2706
+ // non-folded inject running as its own cycle — the P0 spike confirmed
2707
+ // such cycles DO fire Stop). Pre-D1 the rc.16 branch finalized the
2708
+ // single pending on ANY Stop — a foreign cycle's Stop could close a
2709
+ // queued, never-picked-up user turn and deliver the FOREIGN cycle's
2710
+ // last_assistant_message as its answer (seam S5's Stop-identity gap).
2711
+ const info = {
1722
2712
  stopHookActive: ev.stopHookActive,
1723
2713
  lastAssistantMessage: ev.lastAssistantMessage,
1724
2714
  backend: this.backend,
1725
- });
2715
+ };
2716
+ // Legacy (rung 3) turns already resolving via a reply quiet-window
2717
+ // consume this via their per-turn onStop listener (the text-fallback
2718
+ // rescue inside _resolveTurn). Emit first so that path runs
2719
+ // synchronously before the attribution branch below.
2720
+ this.emit('stop-hook', info);
2721
+
2722
+ // A stop-hook-forced continuation means the cycle is, by definition,
2723
+ // NOT over — never finalize on it. (Unobserved in 30d of prod data;
2724
+ // cheap insurance per the design's round-2 review.)
2725
+ if (ev.stopHookActive === true) {
2726
+ this._logEvent('cli-stop-hook-active-ignored', { pending_count: this.pendingTurns.size });
2727
+ return;
2728
+ }
2729
+
2730
+ if (this.pendingTurns.size === 1) {
2731
+ const [turnId, p] = [...this.pendingTurns.entries()][0];
2732
+ if (!p._stopGracePending) {
2733
+ const attributed = p.seen === true || (p.replies?.length || 0) > 0;
2734
+ if (attributed) {
2735
+ // Finalize through a short grace; any same-session activity
2736
+ // inside it proves this Stop was stale/foreign (lagged ndjson
2737
+ // delivery) and cancels — the turn falls back to rung 2.
2738
+ this._beginAttributedStopGrace(turnId, p, info);
2739
+ } else {
2740
+ // Never picked up (no UPS-seen) and never replied — this Stop
2741
+ // belongs to a foreign cycle. Ignore it loudly; the pending
2742
+ // ends via its own pickup→Stop, rung 2, or the ceilings.
2743
+ this._logEvent('cli-stop-foreign', {
2744
+ turn_id: turnId,
2745
+ session_id: this.claudeSessionId,
2746
+ });
2747
+ }
2748
+ }
2749
+ } else if (this.pendingTurns.size > 1) {
2750
+ // Can't attribute Stop to one of several concurrent turns — surface
2751
+ // it so a turn that waited for its grace timer (instead of resolving
2752
+ // on Stop) is explained in the events DB.
2753
+ this._logEvent('cli-stop-unattributed', { pending_count: this.pendingTurns.size });
2754
+ }
2755
+
2756
+ // 0.12.0-rc.13 proactive compaction warning: on turn-end, if enabled
2757
+ // for this chat and not already warned this climb, sample context
2758
+ // occupancy from the transcript and warn (propose /compact) BEFORE
2759
+ // claude auto-compacts mid-turn and detaches the bridge. Fire-and-
2760
+ // forget — transcript IO must never block the stop path.
2761
+ if (this.compactionWarn?.enabled && !this._compactionWarned && ev.transcriptPath) {
2762
+ this._maybeProactiveCompactionWarn(ev.transcriptPath);
2763
+ }
2764
+ return;
2765
+ }
2766
+
2767
+ case 'PreCompact':
2768
+ // 0.12.0-rc.13: auto-compaction is the event that detaches the
2769
+ // channels MCP bridge mid-turn. Record it; and on the dangerous AUTO
2770
+ // case (manual /compact is the user's own deliberate action — never
2771
+ // nag), emit a reactive warning the chat layer posts. The proactive
2772
+ // warning (on Stop) tries to PREVENT this; this is the backstop.
2773
+ this._logEvent('cli-compaction-imminent', { trigger: ev.trigger });
2774
+ if (this.compactionWarn?.enabled && ev.trigger === 'auto') {
2775
+ this.emit('compaction-warn', {
2776
+ kind: 'reactive',
2777
+ trigger: 'auto',
2778
+ sessionId: this.claudeSessionId,
2779
+ backend: this.backend,
2780
+ });
2781
+ }
2782
+ return;
2783
+
2784
+ case 'PostCompact':
2785
+ // Context just dropped — re-arm the proactive warn-once so the next
2786
+ // climb can warn again.
2787
+ this._compactionWarned = false;
2788
+ this._logEvent('cli-compaction-done', { trigger: ev.trigger });
1726
2789
  return;
1727
2790
 
1728
2791
  case 'Notification':
@@ -1761,15 +2824,22 @@ class CliProcess extends Process {
1761
2824
  {
1762
2825
  const requestId = ev.toolUseId || `hook-notification-${Date.now()}`;
1763
2826
  const toolName = ev.toolName;
1764
- const toolInput = this._formatToolInputForApproval(
1765
- ev.prompt || null,
1766
- // Use the structured tool_input as the "preview" it's
1767
- // already structured by claude rather than truncated to
1768
- // 200 chars like the channels bridge perm_req does.
1769
- typeof ev.toolInput === 'string'
1770
- ? ev.toolInput
1771
- : JSON.stringify(ev.toolInput || {}),
1772
- );
2827
+ // Finding #11 fix: pass the STRUCTURED tool_input through. makeCanUseTool
2828
+ // matches gated patterns via matchesAnyPattern, which reads
2829
+ // input.command (Bash) / input.url (WebFetch) — a formatted STRING
2830
+ // makes those undefined so a gated `Bash(rm *)` never matches and the
2831
+ // tool is allowed with NO approval card (silent gating bypass). The
2832
+ // hook Notification payload carries structured tool_input, so forward
2833
+ // it as-is; the approval card (approvalCardText) renders a structured
2834
+ // object fine — same shape the SDK canUseTool path already uses. Fall
2835
+ // back to the formatted-string preview only if claude sent no
2836
+ // structured tool_input (degenerate — tool needs perm but no input).
2837
+ const toolInput = (ev.toolInput && typeof ev.toolInput === 'object')
2838
+ ? ev.toolInput
2839
+ : this._formatToolInputForApproval(
2840
+ ev.prompt || null,
2841
+ typeof ev.toolInput === 'string' ? ev.toolInput : JSON.stringify(ev.toolInput || {}),
2842
+ );
1773
2843
  this.emit('approval-required', {
1774
2844
  id: requestId,
1775
2845
  toolName,
@@ -1843,11 +2913,11 @@ class CliProcess extends Process {
1843
2913
  * landing just before the disconnect would otherwise leave a stray
1844
2914
  * timer on the dead instance).
1845
2915
  */
1846
- _handleBridgeDisconnected() {
2916
+ _handleBridgeDisconnected(reason = 'socket-close') {
1847
2917
  this.bridgeReady = false;
1848
2918
  this.mcpReady = false;
1849
2919
  if (this.closed) return;
1850
- this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly`);
2920
+ this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly (${reason})`);
1851
2921
  // L6: clear the interrupt grace timer alongside the rest of the lifecycle.
1852
2922
  if (this._interruptGraceTimer) {
1853
2923
  clearTimeout(this._interruptGraceTimer);
@@ -1859,6 +2929,7 @@ class CliProcess extends Process {
1859
2929
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1860
2930
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1861
2931
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
2932
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer); // 0.13 D1
1862
2933
  // L5: remove the per-turn stop-hook listener (this path bypasses
1863
2934
  // Process.kill()'s removeAllListeners).
1864
2935
  if (pending._onStop) this.off('stop-hook', pending._onStop);
@@ -1869,14 +2940,24 @@ class CliProcess extends Process {
1869
2940
  this.pendingTurns.clear();
1870
2941
  this.pendingQueue.length = 0;
1871
2942
  this.inFlight = false;
2943
+ // 0.12: drop the interactive-question keep-alive here too, for parity with
2944
+ // _doKill — pm reacts to 'bridge-disconnected' by killing us anyway, but don't
2945
+ // depend on that ordering to stop the 60s interval / clear the open set.
2946
+ this._stopQuestionKeepAlive();
2947
+ this._openQuestions.clear();
2948
+ this._clearLedgerTimers(); // 0.13 D2
1872
2949
  this.emit('bridge-disconnected');
1873
- this._logEvent('bridge-disconnected', { reason: 'socket-close' });
2950
+ this._logEvent('bridge-disconnected', { reason });
1874
2951
  }
1875
2952
 
1876
2953
  async _doKill(reason) {
1877
2954
  this.closed = true;
1878
2955
  this.inFlight = false;
1879
2956
 
2957
+ this._stopQuestionKeepAlive(); // 0.12: drop the interactive-question keep-alive
2958
+ this._openQuestions.clear();
2959
+ this._clearLedgerTimers(); // 0.13 D2
2960
+
1880
2961
  if (this.pingTimer) {
1881
2962
  clearInterval(this.pingTimer);
1882
2963
  this.pingTimer = null;
@@ -1896,6 +2977,7 @@ class CliProcess extends Process {
1896
2977
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1897
2978
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1898
2979
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
2980
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer); // 0.13 D1
1899
2981
  if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1900
2982
  const err = new Error(`session killed: ${reason}`);
1901
2983
  err.code = 'KILLED';
@@ -1995,9 +3077,15 @@ class CliProcess extends Process {
1995
3077
  * @param {string|number} [opts.msgId] — inbound Telegram msg_id, passed through to the bridge so claude's next reply can echo it via turn_id
1996
3078
  * @returns {boolean}
1997
3079
  */
1998
- injectUserMessage({ content, priority = 'next', shouldQuery, msgId } = {}) {
3080
+ injectUserMessage({ content, priority = 'next', shouldQuery, msgId, source = 'inject' } = {}) {
1999
3081
  if (this.closed) return false;
2000
3082
  if (!this.inFlight) return false; // base contract: no live turn → caller falls through
3083
+ // C5 (review 2026-06-12): a cancel is in flight (interrupt grace armed) —
3084
+ // inFlight is still true until the grace fires, but merging a follow-up into
3085
+ // work the user just stopped is wrong AND leaks a fresh 'written' ledger
3086
+ // entry the cancel-loop already passed (later re-delivery). Refuse so the
3087
+ // caller queues it as a fresh primary turn instead.
3088
+ if (this._interruptGraceTimer) return false;
2001
3089
  if (!this.bridgeReady) return false;
2002
3090
  if (typeof content !== 'string' || !content) return false;
2003
3091
 
@@ -2026,9 +3114,14 @@ class CliProcess extends Process {
2026
3114
  this.emit('inject-fail', { err: 'bridge write failed', source: 'inject' });
2027
3115
  return false;
2028
3116
  }
3117
+ // 0.13 D2: the injected turn_id is LEDGERED — pre-P3 it never escaped this
3118
+ // function, making fold/new-turn/drop indistinguishable (seam S4).
3119
+ this._ledgerAdd(turnId, { source, msgId });
2029
3120
  this._logEvent('inject-user-message', {
2030
3121
  session_key: this.sessionKey,
2031
3122
  chat_id: this.chatId,
3123
+ turn_id: turnId,
3124
+ source,
2032
3125
  priority: priority ?? null,
2033
3126
  should_query: shouldQuery ?? null,
2034
3127
  text_len: safeContent.length,
@@ -2045,7 +3138,8 @@ class CliProcess extends Process {
2045
3138
 
2046
3139
  /**
2047
3140
  * Review AC7: fire-and-forget user-message into the bridge. Polygram's
2048
- * slash-command paths (/compact, /reload) use this to push a user-shaped
3141
+ * /compact path, the boot-time compact-replay, and the bg-work stall
3142
+ * self-check use this to push a user-shaped
2049
3143
  * prompt without registering a pending turn. SDK/tmux implement this
2050
3144
  * differently per backend; channels just writes a user_msg to the bridge
2051
3145
  * with a fresh turn_id (which has no listener — so any reply Claude sends
@@ -2059,6 +3153,7 @@ class CliProcess extends Process {
2059
3153
  if (typeof text !== 'string' || text.length === 0) return false;
2060
3154
  if (this.closed || !this.bridgeReady) return false;
2061
3155
  const turnId = crypto.randomUUID();
3156
+ this._ledgerAdd(turnId, { source: 'system' }); // 0.13 D2: visible, never redelivered
2062
3157
  this._writeToBridge({
2063
3158
  kind: 'user_msg',
2064
3159
  turn_id: turnId,
@@ -2092,6 +3187,7 @@ class CliProcess extends Process {
2092
3187
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
2093
3188
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
2094
3189
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer); // L5
3190
+ if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer); // 0.13 D1
2095
3191
  if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
2096
3192
  const err = new Error(`session reset: ${reason}`);
2097
3193
  err.code = 'RESET';
@@ -2237,6 +3333,49 @@ class CliProcess extends Process {
2237
3333
  this._writeToBridge({ kind: 'perm_verdict', request_id: requestId, behavior });
2238
3334
  }
2239
3335
 
3336
+ // ─── interactive questions (0.12 ask) ─────────────────────────────
3337
+
3338
+ /**
3339
+ * Hand a question's answer back to the blocking `ask` tool call. `result` is
3340
+ * {answers:[...]} | {cancelled:true} | {timedout:true}. Stops the keep-alive
3341
+ * once no questions remain open. Called by pm.answerQuestion (from the handler).
3342
+ */
3343
+ writeQuestionAnswer(toolCallId, result) {
3344
+ this._openQuestions.delete(toolCallId);
3345
+ const noneLeft = this._openQuestions.size === 0;
3346
+ if (noneLeft) this._stopQuestionKeepAlive();
3347
+ const wrote = this._writeToBridge({ kind: 'question_answer', tool_call_id: toolCallId, result: result ?? {} });
3348
+ // Re-light progress: claude is about to resume working on the answer. The per-turn reactor
3349
+ // cleared when claude posted its reply + asked, and no tool hooks fired during the wait, so
3350
+ // it stayed cleared — the post-answer work was invisible ("why don't I see it working after
3351
+ // submit?", hire topic 2026-06-09). On a REAL answer (cancelled/timeout END the turn → let
3352
+ // the normal teardown clear), signal polygram to re-arm the turn's working reaction.
3353
+ if (noneLeft && result && !result.cancelled && !result.timedout) {
3354
+ this.emit('question-resumed');
3355
+ }
3356
+ // 0.13 D1: the wait is over either way — restart the activity clock so a
3357
+ // replied turn's rung-2 finalize resumes (real answer: claude works on;
3358
+ // cancelled/timedout: claude wraps up — rung 2 then ends the tail cleanly).
3359
+ if (noneLeft) this._noteActivity('question-answered');
3360
+ return wrote;
3361
+ }
3362
+
3363
+ _startQuestionKeepAlive() {
3364
+ if (this._questionKeepAliveTimer) return;
3365
+ this._questionKeepAliveTimer = setInterval(() => {
3366
+ if (this._openQuestions.size === 0) { this._stopQuestionKeepAlive(); return; }
3367
+ // claude is idle waiting on the answer → no tool hooks → reset the idle
3368
+ // ceiling so the turn isn't killed mid-question. (Rung 2 is suspended
3369
+ // while a question is open, so this only feeds the hardTimer.)
3370
+ this._noteActivity('question-keepalive');
3371
+ }, 60_000);
3372
+ this._questionKeepAliveTimer.unref?.();
3373
+ }
3374
+
3375
+ _stopQuestionKeepAlive() {
3376
+ if (this._questionKeepAliveTimer) { clearInterval(this._questionKeepAliveTimer); this._questionKeepAliveTimer = null; }
3377
+ }
3378
+
2240
3379
  // ─── socket plumbing ──────────────────────────────────────────────
2241
3380
 
2242
3381
  _writeToBridge(obj) {
@@ -2280,6 +3419,11 @@ class CliProcess extends Process {
2280
3419
  this._pollMidTurnDialogs().catch((err) => {
2281
3420
  this.logger.warn?.(`[${this.label}] channels: mid-turn poll failed: ${err.message}`);
2282
3421
  });
3422
+ // 0.12.0 background-work lifecycle: idle-side stall-watchdog, the mirror of
3423
+ // _pollMidTurnDialogs (which only runs during turns). Fire-and-forget.
3424
+ this._pollBackgroundWork().catch((err) => {
3425
+ this.logger.warn?.(`[${this.label}] channels: bg-work poll failed: ${err.message}`);
3426
+ });
2283
3427
  }, PONG_CHECK_INTERVAL_MS);
2284
3428
  this.pongWatchdog.unref?.();
2285
3429
  }
@@ -2305,9 +3449,46 @@ class CliProcess extends Process {
2305
3449
  * Extracted as a separate async method so unit tests can drive it
2306
3450
  * directly without waiting for the setInterval tick.
2307
3451
  */
3452
+ /**
3453
+ * 0.12.0-rc.13: proactive compaction warning. Read the transcript's current
3454
+ * context occupancy and, if past the per-chat threshold, emit a
3455
+ * 'compaction-warn' the chat layer turns into "you're ~N% full, run
3456
+ * /compact" — giving the user a window to compact on their terms BEFORE
3457
+ * claude auto-compacts mid-turn (which detaches the channels bridge). Warns
3458
+ * once per climb (this._compactionWarned), re-armed on PostCompact.
3459
+ * Fire-and-forget: swallows its own errors so transcript IO never breaks
3460
+ * the turn-end path.
3461
+ */
3462
+ async _maybeProactiveCompactionWarn(transcriptPath) {
3463
+ try {
3464
+ if (!this.compactionWarn?.enabled || this._compactionWarned) return;
3465
+ const usage = await readContextTokens(transcriptPath);
3466
+ if (!usage) return;
3467
+ const pct = contextPct(usage.total) * 100;
3468
+ if (pct < this.compactionWarn.thresholdPct) return;
3469
+ if (this._compactionWarned) return; // re-check after the async gap
3470
+ this._compactionWarned = true;
3471
+ this.emit('compaction-warn', {
3472
+ kind: 'proactive',
3473
+ pct: Math.round(pct),
3474
+ totalTokens: usage.total,
3475
+ sessionId: this.claudeSessionId,
3476
+ backend: this.backend,
3477
+ });
3478
+ } catch (err) {
3479
+ this.logger.warn?.(`[${this.label}] compaction-warn sample failed: ${err.message}`);
3480
+ }
3481
+ }
3482
+
2308
3483
  async _pollMidTurnDialogs() {
2309
3484
  if (this.closed) return;
2310
3485
  if (this.pendingTurns.size === 0) return; // no work to do when idle
3486
+ // 0.12 interactive questions: while an `ask` is open claude sits idle at the
3487
+ // prompt waiting on the tool result — so the pane shows no "esc to interrupt"
3488
+ // and the question's own echoed text (a "?"/numbered list/"Yes/No") would
3489
+ // false-trip the unknown-prompt heuristic + starve the STALL heartbeat. The
3490
+ // keyboard lives on Telegram; suppress the pane watchdog while a question is open.
3491
+ if (this._openQuestions.size > 0) return;
2311
3492
  if (!this.tmuxSession) return; // pre-spawn / post-kill
2312
3493
  if (typeof this.runner?.captureWide !== 'function') return;
2313
3494
 
@@ -2323,6 +3504,15 @@ class CliProcess extends Process {
2323
3504
  }
2324
3505
  if (!pane) return;
2325
3506
 
3507
+ // rc.14: removed the rc.11 pane-based "dead bridge" detection here. It
3508
+ // matched the BENIGN banner "server:polygram-bridge no MCP server
3509
+ // configured with that name" — a cosmetic line that
3510
+ // `--dangerously-load-development-channels` + `--strict-mcp-config` prints
3511
+ // on EVERY healthy session (channel still delivers; reply tool still
3512
+ // works). The matcher false-fired ~5s into every channels turn and killed
3513
+ // healthy sessions. Real bridge loss is the socket-close path
3514
+ // (_handleBridgeDisconnected), not anything observable in the pane.
3515
+
2326
3516
  const now = Date.now();
2327
3517
 
2328
3518
  // 0.12 Phase 3.2: liveness heartbeat. The TUI prints "esc to interrupt"
@@ -2333,6 +3523,11 @@ class CliProcess extends Process {
2333
3523
  // resets a timer; safe to fire on every poll while claude is busy.
2334
3524
  if (STREAMING_HINT_RE.test(pane)) {
2335
3525
  this.emit('thinking');
3526
+ // 0.13 D1: the pane heartbeat is ACTIVITY for the finalizer ladder —
3527
+ // pure-thinking stretches fire ZERO hooks for 45s+ (that is this
3528
+ // heartbeat's whole reason to exist), so a hook-only quiet clock would
3529
+ // finalize a replied turn mid-thought (round-2 panel finding).
3530
+ this._noteActivity('pane-thinking');
2336
3531
  }
2337
3532
 
2338
3533
  let matchedKnownPrompt = false;
@@ -2359,16 +3554,28 @@ class CliProcess extends Process {
2359
3554
  pending_count: this.pendingTurns.size,
2360
3555
  });
2361
3556
 
2362
- if (prompt.action === 'enter') {
2363
- try {
2364
- await this.runner.sendControl(this.tmuxSession, 'Enter');
2365
- } catch (err) {
2366
- this.logger.warn?.(
2367
- `[${this.label}] cli: mid-turn dismiss-Enter failed for ${prompt.name}: ${err.message}`,
2368
- );
3557
+ if (prompt.action === 'enter' || prompt.action === 'keys') {
3558
+ // 'keys' sends a navigation sequence (e.g. Down,Enter to pick a
3559
+ // non-default dialog option); 'enter' stays the single-key dismissal.
3560
+ const keySeq = prompt.action === 'keys' ? prompt.keys : ['Enter'];
3561
+ for (let ki = 0; ki < keySeq.length; ki++) {
3562
+ if (ki > 0) await new Promise(r => setTimeout(r, 120)); // Ink can swallow same-batch keys
3563
+ try {
3564
+ await this.runner.sendControl(this.tmuxSession, keySeq[ki]);
3565
+ } catch (err) {
3566
+ this.logger.warn?.(
3567
+ `[${this.label}] cli: mid-turn ${keySeq[ki]} failed for ${prompt.name}: ${err.message}`,
3568
+ );
3569
+ }
2369
3570
  }
2370
3571
  }
2371
3572
  // 'emit-only': telemetry-only; operator decides next step.
3573
+ // Resume-dialog fix: the session-age dialog escaping to MID-TURN means
3574
+ // env suppression failed AND the startup gate didn't see it — same
3575
+ // soak-queryable event kind as the startup-gate fallback.
3576
+ if (prompt.name === 'session-age') {
3577
+ this._logEvent('session-age-dialog-fallback', { tmux_name: this.tmuxSession, phase: 'mid-turn' });
3578
+ }
2372
3579
  }
2373
3580
 
2374
3581
  // 0.12 Phase 3.3 (Q1 resolution): unknown-prompt heuristic. If the pane