@yemi33/minions 0.1.2004 → 0.1.2005
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard.js +82 -1
- package/package.json +1 -1
package/dashboard.js
CHANGED
|
@@ -1937,6 +1937,21 @@ const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
|
|
|
1937
1937
|
// edits aren't killed mid-stream and the backend timeout never beats the user's reading
|
|
1938
1938
|
// time. The doc-chat handlers still abort on client disconnect.
|
|
1939
1939
|
const DOC_CHAT_TIMEOUT_MS = 60 * 60 * 1000;
|
|
1940
|
+
// W-mpetru71000re5de — bound the SSE per-tab queue and force-close streams
|
|
1941
|
+
// whose consumer has been backpressured >30s. writeCcEvent used to log
|
|
1942
|
+
// [cc-sse-backpressure] and silently return true on res.write()===false;
|
|
1943
|
+
// Node's WritableState.buffered[] has no upper bound, so a backgrounded tab
|
|
1944
|
+
// whose socket is half-open (Windows default TCP keepalive 7200s) can
|
|
1945
|
+
// accumulate bytes forever — one tab can push the dashboard past V8's 4 GB
|
|
1946
|
+
// heap and silently OOM-kill. Smoking gun: a single CC stream sat open
|
|
1947
|
+
// 8.5 min with 45 tool events queued and 0 bytes flushed
|
|
1948
|
+
// ([cc-stream] reason=heartbeat-write-failed duration=511442ms chunks=0
|
|
1949
|
+
// tools=45 bytes=0). Shedding is safe because liveState.text/.tools are
|
|
1950
|
+
// populated BEFORE the writer({...}) call (dashboard.js:6753, 6760, 6854,
|
|
1951
|
+
// 6864), so dropped wire frames are fully recoverable via the
|
|
1952
|
+
// reconnect-replay protocol (dashboard.js:7048-7083).
|
|
1953
|
+
const SSE_MAX_QUEUE_BYTES = 4 * 1024 * 1024; // 4 MB per-tab — conservative, tunable
|
|
1954
|
+
const SSE_STUCK_KILL_MS = 30 * 1000; // 30s of continuous backpressure → res.destroy()
|
|
1940
1955
|
/** Forget `tabId` in both in-flight registries: ccInFlightTabs first, then ccInFlightAborts. */
function _releaseCCTab(tabId) {
  for (const registry of [ccInFlightTabs, ccInFlightAborts]) {
    registry.delete(tabId);
  }
}
|
|
1941
1956
|
function _getCcLiveStream(tabId) {
|
|
1942
1957
|
return ccLiveStreams.get(tabId) || null;
|
|
@@ -6372,6 +6387,13 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6372
6387
|
let _docStreamEnded = false;
|
|
6373
6388
|
let _docHeartbeatTimer = null;
|
|
6374
6389
|
const writeDocEvent = (payload) => {
|
|
6390
|
+
// TODO(W-mpetru71000re5de): doc-chat SSE has the same unbounded-queue
|
|
6391
|
+
// failure mode as CC's writeCcEvent — res.write() returning false from
|
|
6392
|
+
// backpressure silently queues bytes in Node's WritableState.buffered[].
|
|
6393
|
+
// Out of scope for this fix (task is scoped to CC only). When this is
|
|
6394
|
+
// addressed, mirror the SSE_MAX_QUEUE_BYTES shed + SSE_STUCK_KILL_MS
|
|
6395
|
+
// heartbeat force-close pattern from the writeCcEvent closure
|
|
6396
|
+
// (dashboard.js, search for SSE_MAX_QUEUE_BYTES).
|
|
6375
6397
|
try {
|
|
6376
6398
|
res.write('data: ' + JSON.stringify(payload) + '\n\n');
|
|
6377
6399
|
return true;
|
|
@@ -7315,6 +7337,17 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7315
7337
|
let _ccStreamEnded = false;
|
|
7316
7338
|
let _ccHeartbeatTimer = null;
|
|
7317
7339
|
let _ccLastHeartbeatAt = Date.now();
|
|
7340
|
+
// W-mpetru71000re5de — per-stream backpressure clock + approximate queued
|
|
7341
|
+
// bytes. _bpStartedAt timestamps the first write that returned false (and
|
|
7342
|
+
// is reset on 'drain'); _queuedBytes accumulates write sizes pushed past
|
|
7343
|
+
// res.writable's highWaterMark (best-effort — Node's internal buffer
|
|
7344
|
+
// length is private). Used by writeCcEvent (shed) + the heartbeat tick
|
|
7345
|
+
// (force-close stuck streams).
|
|
7346
|
+
let _bpStartedAt = null;
|
|
7347
|
+
let _queuedBytes = 0;
|
|
7348
|
+
try {
|
|
7349
|
+
res.on('drain', () => { _bpStartedAt = null; _queuedBytes = 0; });
|
|
7350
|
+
} catch { /* listener registration is best-effort */ }
|
|
7318
7351
|
// W-mpdavudb000v8446 — SSE delivery telemetry. Previously writeCcEvent
|
|
7319
7352
|
// swallowed all write failures (res.destroyed / res.write returning false
|
|
7320
7353
|
// for backpressure / sync throw), and the [cc-timing] log only proved
|
|
@@ -7351,6 +7384,30 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7351
7384
|
_logFail('json-serialize-failed', { error: String((err && err.message) || err).slice(0, 200) });
|
|
7352
7385
|
return false;
|
|
7353
7386
|
}
|
|
7387
|
+
// W-mpetru71000re5de — shed wire frames once the per-tab queue exceeds
|
|
7388
|
+
// the cap. Safe ONLY because liveState.text / liveState.tools are
|
|
7389
|
+
// populated BEFORE writer({...}) is called at every call site:
|
|
7390
|
+
// - dashboard.js:6753-6754 (legacy direct path: text → writer)
|
|
7391
|
+
// - dashboard.js:6760-6761 (legacy direct path: tools → writer)
|
|
7392
|
+
// - dashboard.js:6854-6855 (pool path: text → writer)
|
|
7393
|
+
// - dashboard.js:6864-6865 (pool path: tools → writer)
|
|
7394
|
+
// The reconnect path (dashboard.js:7048-7083) replays from liveState.*,
|
|
7395
|
+
// so dropped wire frames are fully recoverable on reattach. We still
|
|
7396
|
+
// bump _ccTelemetry counters so the [cc-stream] outcome log line stays
|
|
7397
|
+
// truthful about what the orchestrator produced — only the wire was
|
|
7398
|
+
// shed, the work happened.
|
|
7399
|
+
if (_queuedBytes > SSE_MAX_QUEUE_BYTES) {
|
|
7400
|
+
try {
|
|
7401
|
+
shared.log('warn', `[cc-sse-shed] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} queuedBytes=${_queuedBytes} wireBytes=${wire.length}`);
|
|
7402
|
+
} catch { /* telemetry is best-effort */ }
|
|
7403
|
+
if (type === 'chunk') {
|
|
7404
|
+
_ccTelemetry.chunks++;
|
|
7405
|
+
_ccTelemetry.bytes += Buffer.byteLength(String((payload && payload.text) || ''), 'utf8');
|
|
7406
|
+
} else if (type === 'tool') {
|
|
7407
|
+
_ccTelemetry.tools++;
|
|
7408
|
+
}
|
|
7409
|
+
return true;
|
|
7410
|
+
}
|
|
7354
7411
|
let writeOk;
|
|
7355
7412
|
try { writeOk = res.write(wire); }
|
|
7356
7413
|
catch (err) {
|
|
@@ -7362,8 +7419,16 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7362
7419
|
// The write IS still queued, so don't treat this as a failure, but
|
|
7363
7420
|
// surface it so a slow consumer is visible in telemetry. Most CC
|
|
7364
7421
|
// chunks are small enough that we never hit this in practice.
|
|
7422
|
+
// W-mpetru71000re5de — also start (or extend) the per-stream
|
|
7423
|
+
// backpressure clock and accumulate approximate queued bytes. The
|
|
7424
|
+
// heartbeat tick force-closes the stream once the clock exceeds
|
|
7425
|
+
// SSE_STUCK_KILL_MS; writeCcEvent above sheds further frames once
|
|
7426
|
+
// _queuedBytes exceeds SSE_MAX_QUEUE_BYTES. _bpStartedAt + _queuedBytes
|
|
7427
|
+
// are reset by the res.on('drain') listener registered above.
|
|
7428
|
+
if (_bpStartedAt == null) _bpStartedAt = Date.now();
|
|
7429
|
+
_queuedBytes += wire.length;
|
|
7365
7430
|
try {
|
|
7366
|
-
shared.log('warn', `[cc-sse-backpressure] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} bytes=${wire.length}`);
|
|
7431
|
+
shared.log('warn', `[cc-sse-backpressure] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} bytes=${wire.length} queuedBytes=${_queuedBytes} bpMs=${Date.now() - _bpStartedAt}`);
|
|
7367
7432
|
} catch { /* telemetry is best-effort */ }
|
|
7368
7433
|
}
|
|
7369
7434
|
if (payload && payload.type === 'chunk') {
|
|
@@ -7405,6 +7470,22 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7405
7470
|
stopCcHeartbeat();
|
|
7406
7471
|
return;
|
|
7407
7472
|
}
|
|
7473
|
+
// W-mpetru71000re5de — force-close streams stuck on backpressure.
|
|
7474
|
+
// res.destroy() fires req.on('close'), which triggers the existing
|
|
7475
|
+
// teardown path (detach writer → schedule abort → cleanup), so the
|
|
7476
|
+
// queued bytes are sheddable via GC and the LLM is aborted. Catch-22
|
|
7477
|
+
// workaround: _scheduleCcLiveAbort bails while state.writer is
|
|
7478
|
+
// non-null and writer is non-null until req.close fires; that's
|
|
7479
|
+
// exactly what res.destroy() triggers.
|
|
7480
|
+
if (_bpStartedAt && Date.now() - _bpStartedAt > SSE_STUCK_KILL_MS) {
|
|
7481
|
+
const stuckMs = Date.now() - _bpStartedAt;
|
|
7482
|
+
try {
|
|
7483
|
+
shared.log('warn', `[cc-sse-stuck-close] tab=${tabId || _ccTelemetry.tabId || 'unknown'} stuckMs=${stuckMs} queuedBytes=${_queuedBytes}`);
|
|
7484
|
+
} catch { /* telemetry is best-effort */ }
|
|
7485
|
+
stopCcHeartbeat();
|
|
7486
|
+
try { res.destroy(); } catch { /* swallow — req.on('close') will still fire */ }
|
|
7487
|
+
return;
|
|
7488
|
+
}
|
|
7408
7489
|
_checkStall();
|
|
7409
7490
|
if (!writeCcEvent({ type: 'heartbeat' })) {
|
|
7410
7491
|
stopCcHeartbeat();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.2004",
|
|
3
|
+
"version": "0.1.2005",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|