@yemi33/minions 0.1.2004 → 0.1.2005

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dashboard.js +82 -1
  2. package/package.json +1 -1
package/dashboard.js CHANGED
@@ -1937,6 +1937,21 @@ const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
1937
1937
  // edits aren't killed mid-stream and the backend timeout never beats the user's reading
1938
1938
  // time. The doc-chat handlers still abort on client disconnect.
1939
1939
  const DOC_CHAT_TIMEOUT_MS = 60 * 60 * 1000;
1940
+ // W-mpetru71000re5de — bound the SSE per-tab queue and force-close streams
1941
+ // whose consumer has been backpressured >30s. writeCcEvent used to log
1942
+ // [cc-sse-backpressure] and silently return true on res.write()===false;
1943
+ // Node's WritableState.buffered[] has no upper bound, so a backgrounded tab
1944
+ // whose socket is half-open (Windows default TCP keepalive 7200s) can
1945
+ // accumulate bytes forever — one tab can push the dashboard past V8's 4 GB
1946
+ // heap, where the process is silently OOM-killed. Smoking gun: a single CC stream sat open
1947
+ // 8.5 min with 45 tool events queued and 0 bytes flushed
1948
+ // ([cc-stream] reason=heartbeat-write-failed duration=511442ms chunks=0
1949
+ // tools=45 bytes=0). Shedding is safe because liveState.text/.tools are
1950
+ // populated BEFORE the writer({...}) call (dashboard.js:6753, 6760, 6854,
1951
+ // 6864), so dropped wire frames are fully recoverable via the
1952
+ // reconnect-replay protocol (dashboard.js:7048-7083).
1953
+ const SSE_MAX_QUEUE_BYTES = 4 * 1024 * 1024; // 4 MB per-tab — conservative, tunable
1954
+ const SSE_STUCK_KILL_MS = 30 * 1000; // 30s of continuous backpressure → res.destroy()
1940
1955
  function _releaseCCTab(tabId) { ccInFlightTabs.delete(tabId); ccInFlightAborts.delete(tabId); }
1941
1956
  function _getCcLiveStream(tabId) {
1942
1957
  return ccLiveStreams.get(tabId) || null;
@@ -6372,6 +6387,13 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6372
6387
  let _docStreamEnded = false;
6373
6388
  let _docHeartbeatTimer = null;
6374
6389
  const writeDocEvent = (payload) => {
6390
+ // TODO(W-mpetru71000re5de): doc-chat SSE has the same unbounded-queue
6391
+ // failure mode as CC's writeCcEvent — res.write() returning false from
6392
+ // backpressure silently queues bytes in Node's WritableState.buffered[].
6393
+ // Out of scope for this fix (task is scoped to CC only). When this is
6394
+ // addressed, mirror the SSE_MAX_QUEUE_BYTES shed + SSE_STUCK_KILL_MS
6395
+ // heartbeat force-close pattern from the writeCcEvent closure
6396
+ // (dashboard.js, search for SSE_MAX_QUEUE_BYTES).
6375
6397
  try {
6376
6398
  res.write('data: ' + JSON.stringify(payload) + '\n\n');
6377
6399
  return true;
@@ -7315,6 +7337,17 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7315
7337
  let _ccStreamEnded = false;
7316
7338
  let _ccHeartbeatTimer = null;
7317
7339
  let _ccLastHeartbeatAt = Date.now();
7340
+ // W-mpetru71000re5de — per-stream backpressure clock + approximate queued
7341
+ // bytes. _bpStartedAt timestamps the first write that returned false (and
7342
+ // is reset on 'drain'); _queuedBytes accumulates write sizes pushed past
7343
+ // the response stream's highWaterMark (best-effort — Node's internal buffer
7344
+ // is not read here; res.writableLength looks like the public equivalent — TODO confirm). Used by writeCcEvent (shed) + the heartbeat tick
7345
+ // (force-close stuck streams).
7346
+ let _bpStartedAt = null;
7347
+ let _queuedBytes = 0;
7348
+ try {
7349
+ res.on('drain', () => { _bpStartedAt = null; _queuedBytes = 0; });
7350
+ } catch { /* listener registration is best-effort */ }
7318
7351
  // W-mpdavudb000v8446 — SSE delivery telemetry. Previously writeCcEvent
7319
7352
  // swallowed all write failures (res.destroyed / res.write returning false
7320
7353
  // for backpressure / sync throw), and the [cc-timing] log only proved
@@ -7351,6 +7384,30 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7351
7384
  _logFail('json-serialize-failed', { error: String((err && err.message) || err).slice(0, 200) });
7352
7385
  return false;
7353
7386
  }
7387
+ // W-mpetru71000re5de — shed wire frames once the per-tab queue exceeds
7388
+ // the cap. Safe ONLY because liveState.text / liveState.tools are
7389
+ // populated BEFORE writer({...}) is called at every call site:
7390
+ // - dashboard.js:6753-6754 (legacy direct path: text → writer)
7391
+ // - dashboard.js:6760-6761 (legacy direct path: tools → writer)
7392
+ // - dashboard.js:6854-6855 (pool path: text → writer)
7393
+ // - dashboard.js:6864-6865 (pool path: tools → writer)
7394
+ // The reconnect path (dashboard.js:7048-7083) replays from liveState.*,
7395
+ // so dropped wire frames are fully recoverable on reattach. We still
7396
+ // bump _ccTelemetry counters so the [cc-stream] outcome log line stays
7397
+ // truthful about what the orchestrator produced — only the wire was
7398
+ // shed, the work happened.
7399
+ if (_queuedBytes > SSE_MAX_QUEUE_BYTES) {
7400
+ try {
7401
+ shared.log('warn', `[cc-sse-shed] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} queuedBytes=${_queuedBytes} wireBytes=${wire.length}`);
7402
+ } catch { /* telemetry is best-effort */ }
7403
+ if (type === 'chunk') {
7404
+ _ccTelemetry.chunks++;
7405
+ _ccTelemetry.bytes += Buffer.byteLength(String((payload && payload.text) || ''), 'utf8');
7406
+ } else if (type === 'tool') {
7407
+ _ccTelemetry.tools++;
7408
+ }
7409
+ return true;
7410
+ }
7354
7411
  let writeOk;
7355
7412
  try { writeOk = res.write(wire); }
7356
7413
  catch (err) {
@@ -7362,8 +7419,16 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7362
7419
  // The write IS still queued, so don't treat this as a failure, but
7363
7420
  // surface it so a slow consumer is visible in telemetry. Most CC
7364
7421
  // chunks are small enough that we never hit this in practice.
7422
+ // W-mpetru71000re5de — also start (or extend) the per-stream
7423
+ // backpressure clock and accumulate approximate queued bytes. The
7424
+ // heartbeat tick force-closes the stream once the clock exceeds
7425
+ // SSE_STUCK_KILL_MS; writeCcEvent above sheds further frames once
7426
+ // _queuedBytes exceeds SSE_MAX_QUEUE_BYTES. _bpStartedAt + _queuedBytes
7427
+ // are reset by the res.on('drain') listener registered above.
7428
+ if (_bpStartedAt == null) _bpStartedAt = Date.now();
7429
+ _queuedBytes += wire.length;
7365
7430
  try {
7366
- shared.log('warn', `[cc-sse-backpressure] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} bytes=${wire.length}`);
7431
+ shared.log('warn', `[cc-sse-backpressure] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} bytes=${wire.length} queuedBytes=${_queuedBytes} bpMs=${Date.now() - _bpStartedAt}`);
7367
7432
  } catch { /* telemetry is best-effort */ }
7368
7433
  }
7369
7434
  if (payload && payload.type === 'chunk') {
@@ -7405,6 +7470,22 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7405
7470
  stopCcHeartbeat();
7406
7471
  return;
7407
7472
  }
7473
+ // W-mpetru71000re5de — force-close streams stuck on backpressure.
7474
+ // res.destroy() fires req.on('close'), which triggers the existing
7475
+ // teardown path (detach writer → schedule abort → cleanup), so the
7476
+ // queued bytes are sheddable via GC and the LLM is aborted. Catch-22
7477
+ // workaround: _scheduleCcLiveAbort bails while state.writer is
7478
+ // non-null, and writer stays non-null until req's 'close' event fires; that's
7479
+ // exactly what res.destroy() triggers.
7480
+ if (_bpStartedAt && Date.now() - _bpStartedAt > SSE_STUCK_KILL_MS) {
7481
+ const stuckMs = Date.now() - _bpStartedAt;
7482
+ try {
7483
+ shared.log('warn', `[cc-sse-stuck-close] tab=${tabId || _ccTelemetry.tabId || 'unknown'} stuckMs=${stuckMs} queuedBytes=${_queuedBytes}`);
7484
+ } catch { /* telemetry is best-effort */ }
7485
+ stopCcHeartbeat();
7486
+ try { res.destroy(); } catch { /* swallow — req.on('close') will still fire */ }
7487
+ return;
7488
+ }
7408
7489
  _checkStall();
7409
7490
  if (!writeCcEvent({ type: 'heartbeat' })) {
7410
7491
  stopCcHeartbeat();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2004",
3
+ "version": "0.1.2005",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"