polygram 0.12.0-rc.1 → 0.12.0-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -416,28 +416,7 @@ class CliProcess extends Process {
416
416
 
417
417
  this.bridgeServer.on('bridge-message', msg => this._handleBridgeMessage(msg));
418
418
 
419
- this.bridgeServer.on('bridge-disconnected', () => {
420
- this.bridgeReady = false;
421
- this.mcpReady = false;
422
- if (!this.closed) {
423
- this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly`);
424
- // P1 #5: drain pendingTurns immediately so hardTimers don't run 10min.
425
- for (const [, pending] of this.pendingTurns) {
426
- if (pending.quietTimer) clearTimeout(pending.quietTimer);
427
- if (pending.hardTimer) clearTimeout(pending.hardTimer);
428
- if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
429
- if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
430
- const err = new Error('bridge disconnected');
431
- err.code = 'BRIDGE_DISCONNECTED';
432
- try { pending.reject(err); } catch {}
433
- }
434
- this.pendingTurns.clear();
435
- this.pendingQueue.length = 0;
436
- this.inFlight = false;
437
- this.emit('bridge-disconnected');
438
- this._logEvent('bridge-disconnected', { reason: 'socket-close' });
439
- }
440
- });
419
+ this.bridgeServer.on('bridge-disconnected', () => this._handleBridgeDisconnected());
441
420
 
442
421
  await this.bridgeServer.listen();
443
422
  }
@@ -540,6 +519,9 @@ class CliProcess extends Process {
540
519
  );
541
520
  }
542
521
  }
522
+ // Finding 0.12-M2: record the resume decision so _armHookTail (run
523
+ // after spawn) skips the prior session's still-on-disk hook ndjson.
524
+ this._resumedSession = canResume;
543
525
  if (agent) claudeArgs.push('--agent', agent);
544
526
  if (model) claudeArgs.unshift('--model', model);
545
527
  if (effort) claudeArgs.push('--effort', effort);
@@ -705,6 +687,15 @@ class CliProcess extends Process {
705
687
  ],
706
688
  readySignal: /Listening for channel messages from: server:polygram-bridge/i,
707
689
  timeoutCode: 'CHANNELS_DIALOG_TIMEOUT',
690
+ // Progress-aware gate (shumorobot General incident 2026-05-30): a
691
+ // cold spawn that's mid-download (runtime fetch, "24%" progress bar)
692
+ // is genuinely working and must NOT be killed by the blind 30s
693
+ // wall-clock. stallMs fails fast only when the pane is FROZEN; an
694
+ // actively-changing pane (download bar, dialog nav) keeps resetting
695
+ // the stall clock and rides out to the ready signal. deadlineMs stays
696
+ // the absolute backstop. 30s of zero pane activity = genuinely wedged.
697
+ stallMs: this.startupGateStallMs ?? 30_000,
698
+ deadlineMs: this.startupGateDeadlineMs ?? 180_000,
708
699
  logger: this.logger,
709
700
  label: `${this.label}:startup-gate`,
710
701
  });
@@ -849,15 +840,18 @@ class CliProcess extends Process {
849
840
  // rate-limit / chat-id-mismatch path. Live shumorobot 2026-05-26 23:44
850
841
  // observed 3+ "Called polygram-bridge" entries in the TUI pane with
851
842
  // ZERO OUT messages delivered to TG and zero warn-level diagnostics —
852
- // need to see args.text / args.chat_id / args.turn_id to know whether
853
- // claude is calling reply with empty text, wrong chat_id, or something
854
- // else entirely.
855
- this.logger.warn?.(
843
+ // need to see args.chat_id / args.turn_id to know whether claude is
844
+ // calling reply with empty text, wrong chat_id, or something else.
845
+ // L13: root-caused — demoted to debug and DROPPED text_head. Logging
846
+ // the first 80 chars of every reply at warn level leaked private chat
847
+ // content / file excerpts / secrets into the default log sink,
848
+ // unconditionally. name/chat_id/turn_id/text_len diagnose dispatch
849
+ // without exposing message content.
850
+ this.logger.debug?.(
856
851
  `[${this.label}] channels: tool-call name=${msg.name} ` +
857
852
  `chat_id=${JSON.stringify(args.chat_id)} ` +
858
853
  `turn_id=${JSON.stringify(args.turn_id)} ` +
859
- `text_len=${typeof args.text === 'string' ? args.text.length : 'non-string'} ` +
860
- `text_head=${JSON.stringify((args.text || '').slice(0, 80))}`,
854
+ `text_len=${typeof args.text === 'string' ? args.text.length : 'non-string'}`,
861
855
  );
862
856
 
863
857
  // Review P1 #7: idempotency. If we've already ACK'd this tool_call_id,
@@ -1122,13 +1116,27 @@ class CliProcess extends Process {
1122
1116
  this._finalizeTurn(turnId);
1123
1117
  };
1124
1118
  const onStop = (info) => {
1125
- // Capture the fallback text; the actual finalize call below will pick
1126
- // it up via pending._stopHookData.
1119
+ // Finding 0.12-M1: the Stop hook carries NO turn_id, and a single
1120
+ // global 'stop-hook' emission fires EVERY per-turn onStop listener.
1121
+ // When more than one turn is in stop-grace we cannot attribute this
1122
+ // Stop (or its last_assistant_message) to a specific turn — the
1123
+ // pre-fix code let one Stop finalize all grace-pending turns and
1124
+ // cross-attribute one turn's text to another (the exact class the
1125
+ // F#3 reply routing prevents). Mirror that drop-rather-than-
1126
+ // misattribute discipline: only consume the Stop when exactly ONE
1127
+ // turn is in grace; otherwise ignore it and let each turn finalize
1128
+ // on its own grace timer (each keeps its own reply text).
1129
+ let graceCount = 0;
1130
+ for (const p of this.pendingTurns.values()) if (p._stopGracePending) graceCount++;
1131
+ if (graceCount !== 1) return;
1127
1132
  pending._stopHookData = info;
1128
1133
  clearTimeout(pending._stopGraceTimer);
1129
1134
  pending._stopGraceTimer = null;
1130
1135
  finalize();
1131
1136
  };
1137
+ // L5: stash the closure so teardown paths that bypass Process.kill()'s
1138
+ // removeAllListeners (bridge-disconnect drain, resetSession) can off it.
1139
+ pending._onStop = onStop;
1132
1140
  pending._stopGraceTimer = setTimeout(finalize, this.stopGraceMs);
1133
1141
  // unref so a never-fired grace doesn't pin the event loop. In tests
1134
1142
  // where a CliProcess is created, send() is called, then the test
@@ -1415,17 +1423,18 @@ class CliProcess extends Process {
1415
1423
  this.logger.warn?.(`[${this.label}] _armHookTail: _hookNdjsonPath unset; hooks disabled. Phase 1.2 may have failed.`);
1416
1424
  return;
1417
1425
  }
1418
- // Fresh spawn: ndjson was just touched by writeHookFiles and is empty,
1419
- // so `skipExisting: false` (default) is correct. For lazy-respawn on
1420
- // existingSessionId, we currently re-run writeHookFiles which touches
1421
- // a NEW file with the same name (overwrite). If we ever switch to
1422
- // resume-without-touch, set skipExisting: true to avoid replaying
1423
- // stale events from the prior process — same pattern tmux uses on
1424
- // --resume per rc.42 #5.
1426
+ // Finding 0.12-M2: writeHookFiles opens the ndjson in APPEND mode
1427
+ // ('a') and never truncates, so on a --resume respawn the prior
1428
+ // session's hook lines are still on disk under the same path. Replaying
1429
+ // them re-drives the turn state machine from stale Stop/PreToolUse
1430
+ // events (a stale Stop can finalize the fresh turn). So skip existing
1431
+ // content when (and only when) this is a resumed session — the same
1432
+ // discipline the JSONL tail uses on --resume. A fresh spawn's ndjson is
1433
+ // empty, so skipExisting:false is correct there.
1425
1434
  this._hookTail = createHookTail({
1426
1435
  path: this._hookNdjsonPath,
1427
1436
  logger: this.logger,
1428
- skipExisting: false,
1437
+ skipExisting: this._resumedSession === true,
1429
1438
  });
1430
1439
  this._hookTail.on('event', (ev) => {
1431
1440
  try {
@@ -1465,25 +1474,18 @@ class CliProcess extends Process {
1465
1474
  // gates tag-out on median < 2s and p99 < 5s across the events DB.
1466
1475
  if (Number.isFinite(ev.receivedAtMs)) {
1467
1476
  const lagMs = Date.now() - ev.receivedAtMs;
1477
+ // L10: emit ONLY — the onHookLagSample callback owns the DB write
1478
+ // (CALLBACK_TO_EVENT → callbacks.js). Previously this ALSO wrote
1479
+ // directly via this.db.logEvent, double-persisting every sample and
1480
+ // inflating the Phase 1.8 soak-gate row count. Consistent with how
1481
+ // tool-result / subagent-start / subagent-done are handled (emit,
1482
+ // don't double-write).
1468
1483
  this.emit('hook-lag-sample', {
1469
1484
  hookEventName: ev.type,
1470
1485
  lagMs,
1471
1486
  toolName: ev.toolName || null,
1472
1487
  backend: this.backend,
1473
1488
  });
1474
- // Log to events DB if wired. db is optional (factory injects when
1475
- // available) — same pattern as the other parity-P1 _logEvent calls.
1476
- if (this.db?.logEvent) {
1477
- try {
1478
- this.db.logEvent('hook-lag-sample', {
1479
- session_key: this.sessionKey,
1480
- backend: this.backend,
1481
- hook_event_name: ev.type,
1482
- tool_name: ev.toolName || null,
1483
- lag_ms: lagMs,
1484
- });
1485
- } catch {}
1486
- }
1487
1489
  }
1488
1490
 
1489
1491
  switch (ev.type) {
@@ -1503,6 +1505,16 @@ class CliProcess extends Process {
1503
1505
  const subagentType = ev.toolInput?.subagent_type
1504
1506
  || ev.toolInput?.agent_type
1505
1507
  || 'general-purpose';
1508
+ // Finding 0.12-M4: SubagentStop carries agent_id/agent_type but
1509
+ // NOT the originating Agent tool_use_id, so without help the
1510
+ // subagent-start/subagent-done rows share no JOIN key (the
1511
+ // documented soak query on $.tool_use_id returns zero rows).
1512
+ // Track the in-flight Agent tool_use_id keyed by subagent type so
1513
+ // the paired SubagentStop below can stamp it onto subagent-done.
1514
+ (this._pendingSubagentStarts ||= []).push({
1515
+ agentType: subagentType,
1516
+ toolUseId: ev.toolUseId,
1517
+ });
1506
1518
  this.emit('subagent-start', {
1507
1519
  agentType: subagentType,
1508
1520
  // PreToolUse for Agent carries no agent_id (set later on
@@ -1541,14 +1553,27 @@ class CliProcess extends Process {
1541
1553
  });
1542
1554
  return;
1543
1555
 
1544
- case 'SubagentStop':
1556
+ case 'SubagentStop': {
1557
+ // Finding 0.12-M4: recover the originating Agent tool_use_id so the
1558
+ // subagent-start/subagent-done pair is JOINable. Prefer a match on
1559
+ // agent type (correct for parallel subagents of different types);
1560
+ // fall back to the oldest pending start when types don't line up.
1561
+ let subagentToolUseId = null;
1562
+ const pendingStarts = this._pendingSubagentStarts;
1563
+ if (pendingStarts && pendingStarts.length) {
1564
+ let idx = pendingStarts.findIndex(s => s.agentType === ev.agentType);
1565
+ if (idx < 0) idx = 0;
1566
+ subagentToolUseId = pendingStarts.splice(idx, 1)[0]?.toolUseId ?? null;
1567
+ }
1545
1568
  this.emit('subagent-done', {
1546
1569
  agentType: ev.agentType,
1547
1570
  agentId: ev.agentId,
1548
1571
  durationMs: ev.durationMs,
1572
+ toolUseId: subagentToolUseId,
1549
1573
  backend: this.backend,
1550
1574
  });
1551
1575
  return;
1576
+ }
1552
1577
 
1553
1578
  case 'Stop':
1554
1579
  // Phase 1.7 (TODO) will use this as the authoritative turn-end
@@ -1665,6 +1690,50 @@ class CliProcess extends Process {
1665
1690
  }
1666
1691
  }
1667
1692
 
1693
+ /**
1694
+ * Drain on unexpected bridge socket loss (claude crash, bridge crash,
1695
+ * EOF). Extracted from the inline 'bridge-disconnected' handler so the
1696
+ * teardown is testable and consistent with _doKill.
1697
+ *
1698
+ * Findings 0.12-L5 + L6: in addition to clearing the per-turn timers
1699
+ * and rejecting pendings (the original P1 #5 behavior), this now also
1700
+ * (L5) removes each turn's stop-hook listener — this drain does NOT go
1701
+ * through Process.kill()'s blanket removeAllListeners, so a turn torn
1702
+ * down mid-stop-grace would otherwise leak its onStop closure — and
1703
+ * (L6) clears _interruptGraceTimer, matching _doKill (a /stop verdict
1704
+ * landing just before the disconnect would otherwise leave a stray
1705
+ * timer on the dead instance).
1706
+ */
1707
+ _handleBridgeDisconnected() {
1708
+ this.bridgeReady = false;
1709
+ this.mcpReady = false;
1710
+ if (this.closed) return;
1711
+ this.logger.warn?.(`[${this.label}] channels: bridge disconnected unexpectedly`);
1712
+ // L6: clear the interrupt grace timer alongside the rest of the lifecycle.
1713
+ if (this._interruptGraceTimer) {
1714
+ clearTimeout(this._interruptGraceTimer);
1715
+ this._interruptGraceTimer = null;
1716
+ }
1717
+ // P1 #5: drain pendingTurns immediately so hardTimers don't run 10min.
1718
+ for (const [, pending] of this.pendingTurns) {
1719
+ if (pending.quietTimer) clearTimeout(pending.quietTimer);
1720
+ if (pending.hardTimer) clearTimeout(pending.hardTimer);
1721
+ if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1722
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1723
+ // L5: remove the per-turn stop-hook listener (this path bypasses
1724
+ // Process.kill()'s removeAllListeners).
1725
+ if (pending._onStop) this.off('stop-hook', pending._onStop);
1726
+ const err = new Error('bridge disconnected');
1727
+ err.code = 'BRIDGE_DISCONNECTED';
1728
+ try { pending.reject(err); } catch {}
1729
+ }
1730
+ this.pendingTurns.clear();
1731
+ this.pendingQueue.length = 0;
1732
+ this.inFlight = false;
1733
+ this.emit('bridge-disconnected');
1734
+ this._logEvent('bridge-disconnected', { reason: 'socket-close' });
1735
+ }
1736
+
1668
1737
  async _doKill(reason) {
1669
1738
  this.closed = true;
1670
1739
  this.inFlight = false;
@@ -1688,6 +1757,7 @@ class CliProcess extends Process {
1688
1757
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1689
1758
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1690
1759
  if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer);
1760
+ if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1691
1761
  const err = new Error(`session killed: ${reason}`);
1692
1762
  err.code = 'KILLED';
1693
1763
  pending.reject(err);
@@ -1876,6 +1946,8 @@ class CliProcess extends Process {
1876
1946
  if (pending.quietTimer) clearTimeout(pending.quietTimer);
1877
1947
  if (pending.hardTimer) clearTimeout(pending.hardTimer);
1878
1948
  if (pending.absoluteTimer) clearTimeout(pending.absoluteTimer);
1949
+ if (pending._stopGraceTimer) clearTimeout(pending._stopGraceTimer); // L5
1950
+ if (pending._onStop) this.off('stop-hook', pending._onStop); // L5
1879
1951
  const err = new Error(`session reset: ${reason}`);
1880
1952
  err.code = 'RESET';
1881
1953
  try { pending.reject(err); } catch {}
@@ -113,7 +113,6 @@ function createProcessFactory({
113
113
  queryCloseTimeoutMs,
114
114
  tmuxRunner = null,
115
115
  botName = null,
116
- pollScheduler = null,
117
116
  toolDispatcher = null,
118
117
  channelsClaudeBin = null,
119
118
  } = {}) {
@@ -123,6 +123,19 @@ const CALLBACK_TO_EVENT = {
123
123
  // menu auto-dismissed by `_waitForReady`. Surfacing the event so
124
124
  // soak can count how often aged-session resumes hit this path.
125
125
  onSessionAgePromptDismissed: 'session-age-prompt-dismissed',
126
+ // 0.12 CliProcess observability — typed hook events from cli-process.js
127
+ // _handleHookEvent. Each gets its own callback so polygram can persist
128
+ // structured rows to the events DB for soak-time aggregate queries.
129
+ // - hook-lag-sample: Phase 1.8 — per-event lag_ms (target: median<2s, p99<5s)
130
+ // - tool-result: Phase 1.3 — PostToolUse durationMs per tool
131
+ // - subagent-start / subagent-done: Phase 1.3 — typed subagent lifecycle
132
+ // (we DO get tool-use='Agent' via onToolUse, but agent_type + durationMs
133
+ // only fire on these typed events). SDK backend never emits — hooks
134
+ // are CliProcess-specific (and were tmux-specific in 0.10–0.11).
135
+ onHookLagSample: 'hook-lag-sample',
136
+ onToolResult: 'tool-result',
137
+ onSubagentStart: 'subagent-start',
138
+ onSubagentDone: 'subagent-done',
126
139
  };
127
140
 
128
141
  class ProcessManager {
@@ -464,7 +464,10 @@ function createSdkCallbacks({
464
464
  const detail = {
465
465
  chat_id: getChatIdFromKey(sessionKey),
466
466
  session_key: sessionKey,
467
- backend: 'tmux',
467
+ // Finding 0.12-M3: tmux backend was deleted in 0.12; these hook
468
+ // handlers only ever fire on the CLI driver now — default to 'cli'
469
+ // (honor an explicit payload.backend if a caller ever sets one).
470
+ backend: payload?.backend ?? 'cli',
468
471
  hook_type: payload?.type ?? null,
469
472
  claude_session_id: payload?.sessionId ?? null,
470
473
  tool_name: payload?.toolName ?? null,
@@ -555,7 +558,7 @@ function createSdkCallbacks({
555
558
  logEvent('turn-timeout', {
556
559
  chat_id: getChatIdFromKey(sessionKey),
557
560
  session_key: sessionKey,
558
- backend: 'tmux',
561
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3
559
562
  turn_id: payload?.turnId ?? null,
560
563
  reason: payload?.reason ?? null,
561
564
  idle_ms: payload?.idleMs ?? null,
@@ -578,7 +581,7 @@ function createSdkCallbacks({
578
581
  logEvent('hook-tail-error', {
579
582
  chat_id: getChatIdFromKey(sessionKey),
580
583
  session_key: sessionKey,
581
- backend: 'tmux',
584
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3 (fires on the CLI hook tail)
582
585
  message: (payload?.message || '').slice(0, 200),
583
586
  path: payload?.path ?? null,
584
587
  claude_session_id: payload?.sessionId ?? null,
@@ -596,7 +599,7 @@ function createSdkCallbacks({
596
599
  logEvent('stop-hook-resolved', {
597
600
  chat_id: getChatIdFromKey(sessionKey),
598
601
  session_key: sessionKey,
599
- backend: 'tmux',
602
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3
600
603
  turn_id: payload?.turnId ?? null,
601
604
  claude_session_id: payload?.sessionId ?? null,
602
605
  });
@@ -614,7 +617,7 @@ function createSdkCallbacks({
614
617
  logEvent('session-age-prompt-dismissed', {
615
618
  chat_id: getChatIdFromKey(sessionKey),
616
619
  session_key: sessionKey,
617
- backend: 'tmux',
620
+ backend: payload?.backend ?? 'cli', // Finding 0.12-M3
618
621
  claude_session_id: payload?.sessionId ?? null,
619
622
  });
620
623
  } catch (err) {
@@ -622,6 +625,108 @@ function createSdkCallbacks({
622
625
  }
623
626
  },
624
627
 
628
+ // 0.12 Phase 1.8 — hook-lag persistence for the soak gate (median<2s,
629
+ // p99<5s). Each row carries the hookEventName + lagMs so we can:
630
+ // SELECT json_extract(detail_json, '$.hook_event_name') AS evt,
631
+ // AVG(json_extract(detail_json, '$.lag_ms')) AS avg_lag,
632
+ // MAX(json_extract(detail_json, '$.lag_ms')) AS max_lag
633
+ // FROM events WHERE kind='hook-lag-sample' AND ts>...
634
+ // GROUP BY evt;
635
+ onHookLagSample: (sessionKey, payload /* , entry */) => {
636
+ try {
637
+ logEvent('hook-lag-sample', {
638
+ chat_id: getChatIdFromKey(sessionKey),
639
+ session_key: sessionKey,
640
+ backend: payload?.backend ?? 'cli',
641
+ hook_event_name: payload?.hookEventName ?? null,
642
+ lag_ms: payload?.lagMs ?? null,
643
+ tool_name: payload?.toolName ?? null,
644
+ });
645
+ } catch (err) {
646
+ logger.error?.(`[${botName}] hook-lag-sample handler: ${err.message}`);
647
+ }
648
+ },
649
+
650
+ // 0.12 Phase 1.3 — tool-result with durationMs. Pairs with the
651
+ // existing onToolUse row (which fires on PreToolUse) so the soak can
652
+ // compute per-tool average + p99 durations:
653
+ // SELECT json_extract(detail_json, '$.tool_name') AS tool,
654
+ // AVG(json_extract(detail_json, '$.duration_ms')) AS avg_ms,
655
+ // MAX(json_extract(detail_json, '$.duration_ms')) AS max_ms
656
+ // FROM events WHERE kind='tool-result' GROUP BY tool;
657
+ // isError captures the rare PostToolUse where the tool itself failed
658
+ // (vs the tool succeeding but claude deciding to retry).
659
+ onToolResult: (sessionKey, payload /* , entry */) => {
660
+ try {
661
+ logEvent('tool-result', {
662
+ chat_id: getChatIdFromKey(sessionKey),
663
+ session_key: sessionKey,
664
+ backend: payload?.backend ?? 'cli',
665
+ tool_name: payload?.name ?? null,
666
+ duration_ms: payload?.durationMs ?? null,
667
+ agent_id: payload?.agentId ?? null,
668
+ agent_type: payload?.agentType ?? null,
669
+ tool_use_id: payload?.toolUseId ?? null,
670
+ is_error: payload?.isError === true,
671
+ });
672
+ } catch (err) {
673
+ logger.error?.(`[${botName}] tool-result handler: ${err.message}`);
674
+ }
675
+ },
676
+
677
+ // 0.12 Phase 1.3 — subagent lifecycle. PreToolUse with name='Agent'
678
+ // synthesizes 'subagent-start' (no agent_id yet — claude doesn't
679
+ // hand one out until the inner SubagentStop). 'subagent-done' carries
680
+ // the agent_id + duration_ms so a soak can correlate the pair:
681
+ // SELECT s.detail_json AS start, d.detail_json AS done
682
+ // FROM events s JOIN events d
683
+ // ON json_extract(s.detail_json, '$.tool_use_id') =
684
+ // json_extract(d.detail_json, '$.tool_use_id')
685
+ // WHERE s.kind='subagent-start' AND d.kind='subagent-done';
686
+ onSubagentStart: (sessionKey, payload, entry) => {
687
+ try {
688
+ logEvent('subagent-start', {
689
+ chat_id: getChatIdFromKey(sessionKey),
690
+ session_key: sessionKey,
691
+ backend: payload?.backend ?? 'cli',
692
+ agent_type: payload?.agentType ?? null,
693
+ tool_use_id: payload?.toolUseId ?? null,
694
+ });
695
+ // Findings L9/L14: drive the head reactor into the distinct SUBAGENT
696
+ // state so a running subagent shows 👾 rather than freezing on the
697
+ // prior tool's emoji. The plan promised this; previously the handler
698
+ // only persisted the DB row and never touched the reactor.
699
+ const r = entry?.pendingQueue?.[0]?.context?.reactor;
700
+ if (r) r.setState('SUBAGENT');
701
+ } catch (err) {
702
+ logger.error?.(`[${botName}] subagent-start handler: ${err.message}`);
703
+ }
704
+ },
705
+
706
+ onSubagentDone: (sessionKey, payload, entry) => {
707
+ try {
708
+ // L9/L14: heartbeat at subagent end so the cascade/stall clock
709
+ // resets; the next tool's PreToolUse sets the following state.
710
+ const r = entry?.pendingQueue?.[0]?.context?.reactor;
711
+ if (r && typeof r.heartbeat === 'function') r.heartbeat();
712
+ logEvent('subagent-done', {
713
+ chat_id: getChatIdFromKey(sessionKey),
714
+ session_key: sessionKey,
715
+ backend: payload?.backend ?? 'cli',
716
+ agent_type: payload?.agentType ?? null,
717
+ agent_id: payload?.agentId ?? null,
718
+ duration_ms: payload?.durationMs ?? null,
719
+ // Finding 0.12-M4: persist the originating Agent tool_use_id so the
720
+ // documented subagent-start/subagent-done soak JOIN on
721
+ // $.tool_use_id matches (subagent-done's tool_use_id is recovered
722
+ // in cli-process.js from the paired Agent PreToolUse).
723
+ tool_use_id: payload?.toolUseId ?? null,
724
+ });
725
+ } catch (err) {
726
+ logger.error?.(`[${botName}] subagent-done handler: ${err.message}`);
727
+ }
728
+ },
729
+
625
730
  onInjectFail: (sessionKey, payload /* , entry */) => {
626
731
  try {
627
732
  const msgId = payload?.msgId;
@@ -55,6 +55,11 @@ const STATES = {
55
55
  // mid-turn user message is buffered for the next PostToolBatch
56
56
  // injection.
57
57
  AUTOSTEERED: { label: 'autosteered', chain: ['✍', '👀'] },
58
+ // 0.12 (Findings L9/L14): distinct in-progress reaction for a running
59
+ // subagent (Agent PreToolUse → SubagentStop). Driven by onSubagentStart.
60
+ // Preferred 👾 (NOT 🤖 — 🤖 is REACTION_INVALID for bots, same class as
61
+ // the rc.37 🧐 bug); falls back to 🔥 then 🤔, all bot-usable.
62
+ SUBAGENT: { label: 'subagent', chain: ['👾', '🔥', '🤔'] },
58
63
  DONE: { label: 'done', chain: ['👍'] },
59
64
  ERROR: { label: 'error', chain: ['🤯', '🤔'] },
60
65
  STALL: { label: 'stall', chain: ['🥱', '🤔'] },
@@ -42,6 +42,7 @@
42
42
  const EventEmitter = require('events');
43
43
  const fs = require('fs');
44
44
  const path = require('path');
45
+ const { StringDecoder } = require('string_decoder');
45
46
 
46
47
  const DEFAULT_INTERVAL_MS = 100;
47
48
  // Slow safety-net poll when fs.watch is active. Catches any events
@@ -91,6 +92,13 @@ class LogTail extends EventEmitter {
91
92
  this.fs = fsOverride || fs;
92
93
  this._offset = 0;
93
94
  this._buf = '';
95
+ // L8: decode bytes through a StringDecoder so a multibyte UTF-8 char
96
+ // split across two read chunks (the 64KB DEFAULT_CHUNK_BYTES boundary)
97
+ // isn't corrupted into U+FFFD. The decoder holds an incomplete trailing
98
+ // sequence until the continuation bytes arrive on the next read. The
99
+ // hook ndjson carries large non-ASCII tool payloads, so this is
100
+ // load-bearing on the CliProcess observability path.
101
+ this._decoder = new StringDecoder('utf8');
94
102
  this._closed = false;
95
103
  this._timer = null;
96
104
  this._watcher = null;
@@ -260,7 +268,9 @@ class LogTail extends EventEmitter {
260
268
  const readSize = Math.min(remaining, buffer.length);
261
269
  const { bytesRead } = await fd.read(buffer, 0, readSize, this._offset + totalRead);
262
270
  if (bytesRead === 0) break;
263
- this._buf += buffer.slice(0, bytesRead).toString('utf8');
271
+ // L8: StringDecoder.write instead of per-chunk toString('utf8') so a
272
+ // multibyte char straddling the read boundary survives intact.
273
+ this._buf += this._decoder.write(buffer.subarray(0, bytesRead));
264
274
  totalRead += bytesRead;
265
275
  }
266
276
  this._offset += totalRead;
@@ -17,6 +17,19 @@
17
17
  * - if `readySignal` regex matches the captured pane content, resolve
18
18
  * - if `Date.now()` exceeds the deadline, throw with `err.code = timeoutCode`
19
19
  *
20
+ * Progress-aware (stall) deadline — `stallMs`:
21
+ * The blind wall-clock `deadlineMs` can't tell "claude is mid-download
22
+ * (24% progress bar, genuinely working)" from "claude is wedged". The
23
+ * shumorobot General incident (2026-05-30) killed a cold-spawn that was
24
+ * actively downloading the runtime. When `stallMs` is set, the gate
25
+ * tracks pane ACTIVITY: any change in captured pane content — or a
26
+ * trigger key being sent — resets a stall clock. The gate fails early
27
+ * (with `timeoutCode`) only after `stallMs` elapses with NO activity,
28
+ * i.e. the pane is frozen. `deadlineMs` remains an absolute backstop so
29
+ * a pane that animates forever but never reaches `readySignal` still
30
+ * terminates. When `stallMs` is omitted (default), behavior is the pure
31
+ * `deadlineMs` wall-clock exactly as before.
32
+ *
20
33
  * Each trigger is one-shot per gate run (tracked by `name` in a Set).
21
34
  *
22
35
  * Caller supplies:
@@ -40,7 +53,10 @@ const DEFAULT_SETTLE_MS = 500;
40
53
  * @param {string} opts.tmuxName — tmux session name to poll
41
54
  * @param {Array<{name:string, regex:RegExp, key:string}>} opts.triggers
42
55
  * @param {RegExp} opts.readySignal — match → resolve
43
- * @param {number} [opts.deadlineMs=30000]
56
+ * @param {number} [opts.deadlineMs=30000] — absolute backstop
57
+ * @param {number} [opts.stallMs] — if set, fail after this much
58
+ * wall-clock with NO pane activity (progress-aware). Omit for pure
59
+ * wall-clock behavior.
44
60
  * @param {number} [opts.pollMs=300]
45
61
  * @param {number} [opts.settleMs=500]
46
62
  * @param {string} [opts.timeoutCode='TUI_STARTUP_TIMEOUT']
@@ -54,6 +70,7 @@ async function runStartupGate({
54
70
  triggers = [],
55
71
  readySignal,
56
72
  deadlineMs = DEFAULT_DEADLINE_MS,
73
+ stallMs,
57
74
  pollMs = DEFAULT_POLL_MS,
58
75
  settleMs = DEFAULT_SETTLE_MS,
59
76
  timeoutCode = 'TUI_STARTUP_TIMEOUT',
@@ -70,6 +87,7 @@ async function runStartupGate({
70
87
 
71
88
  const startedAt = Date.now();
72
89
  const deadline = startedAt + deadlineMs;
90
+ const stallEnabled = Number.isFinite(stallMs) && stallMs > 0;
73
91
  const seen = new Set();
74
92
  const matchedTriggers = [];
75
93
  // rc.4: remember the most recent successful pane snapshot. If the gate
@@ -78,8 +96,30 @@ async function runStartupGate({
78
96
  // this, "claude exits code 0 after dev-channels Enter" surfaces as a
79
97
  // 30-second `can't find pane` spam with no diagnostic about WHY.
80
98
  let lastPane = null;
99
+ // Progress-aware gate: timestamp of the last observed pane CHANGE (or
100
+ // trigger send). Seeded to startedAt so a pane that's frozen from the
101
+ // very first capture still trips stallMs. Only consulted when
102
+ // stallEnabled.
103
+ let lastActivityAt = startedAt;
81
104
 
82
105
  while (Date.now() < deadline) {
106
+ // Stall check (progress-aware): the pane has been doing nothing for
107
+ // stallMs. Distinct from the absolute deadline — fires early so a
108
+ // wedged TUI fails fast, while an actively-progressing one (download
109
+ // bar, dialog navigation) keeps resetting lastActivityAt below.
110
+ if (stallEnabled && Date.now() - lastActivityAt >= stallMs) {
111
+ const err = new Error(
112
+ `[${label}] startup gate saw no pane activity for ${stallMs}ms for ${tmuxName} ` +
113
+ `(matched: ${matchedTriggers.length ? matchedTriggers.join(', ') : 'none'}). ` +
114
+ `Pane appears wedged. Last pane content:\n` +
115
+ _formatPaneTail(lastPane),
116
+ );
117
+ err.code = timeoutCode;
118
+ err.lastPane = lastPane;
119
+ err.matchedTriggers = matchedTriggers;
120
+ err.reason = 'stall';
121
+ throw err;
122
+ }
83
123
  let pane;
84
124
  try {
85
125
  pane = await runner.captureWide(tmuxName);
@@ -107,6 +147,10 @@ async function runStartupGate({
107
147
  await new Promise(r => setTimeout(r, settleMs));
108
148
  continue;
109
149
  }
150
+ // Progress signal: any change in pane content is activity → reset the
151
+ // stall clock. A captureWide that returns the SAME bytes is NOT
152
+ // activity (a frozen download bar at 24% reads identically each poll).
153
+ if (pane !== lastPane) lastActivityAt = Date.now();
110
154
  lastPane = pane;
111
155
 
112
156
  // Walk triggers in declaration order — first match (and not yet seen) wins
@@ -122,6 +166,10 @@ async function runStartupGate({
122
166
  seen.add(trigger.name);
123
167
  matchedTriggers.push(trigger.name);
124
168
  matched = true;
169
+ // Sending a key is activity — navigating the TUI counts as progress
170
+ // even if the pre-transition pane text was static (e.g. a dialog we
171
+ // just answered). Reset the stall clock so we don't fail mid-nav.
172
+ lastActivityAt = Date.now();
125
173
  // Settle window so the TUI transitions out of the dialog before next poll
126
174
  await new Promise(r => setTimeout(r, settleMs));
127
175
  break;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "polygram",
3
- "version": "0.12.0-rc.1",
3
+ "version": "0.12.0-rc.3",
4
4
  "description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
5
5
  "main": "lib/ipc/client.js",
6
6
  "bin": {
package/polygram.js CHANGED
@@ -51,7 +51,6 @@ const { extractAssistantText } = require('./lib/process/sdk-process');
51
51
  const { createChannelsToolDispatcher } = require('./lib/process/channels-tool-dispatcher');
52
52
  const { createTmuxRunner } = require('./lib/tmux/tmux-runner');
53
53
  const { sweepTmuxOrphans } = require('./lib/tmux/orphan-sweep');
54
- const { PollScheduler } = require('./lib/tmux/poll-scheduler');
55
54
  // rc.42: autosteer-buffer module deleted. Native SDK priority push
56
55
  // (pm.injectUserMessage) replaces the buffer + PostToolBatch detour.
57
56
  const { createAutosteeredRefs } = require('./lib/autosteered-refs');