@yemi33/minions 0.1.2004 → 0.1.2006

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dashboard.js CHANGED
@@ -1937,6 +1937,21 @@ const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
1937
1937
  // edits aren't killed mid-stream and the backend timeout never beats the user's reading
1938
1938
  // time. The doc-chat handlers still abort on client disconnect.
1939
1939
  const DOC_CHAT_TIMEOUT_MS = 60 * 60 * 1000;
1940
+ // W-mpetru71000re5de — bound the SSE per-tab queue and force-close streams
1941
+ // whose consumer has been backpressured >30s. writeCcEvent used to log
1942
+ // [cc-sse-backpressure] and silently return true on res.write()===false;
1943
+ // Node's WritableState.buffered[] has no upper bound, so a backgrounded tab
1944
+ // whose socket is half-open (Windows default TCP keepalive 7200s) can
1945
+ // accumulate bytes forever — one tab can push the dashboard past V8's 4 GB
1946
+ // heap limit and get it silently OOM-killed. Smoking gun: a single CC stream sat open
1947
+ // 8.5 min with 45 tool events queued and 0 bytes flushed
1948
+ // ([cc-stream] reason=heartbeat-write-failed duration=511442ms chunks=0
1949
+ // tools=45 bytes=0). Shedding is safe because liveState.text/.tools are
1950
+ // populated BEFORE the writer({...}) call (dashboard.js:6753, 6760, 6854,
1951
+ // 6864), so dropped wire frames are fully recoverable via the
1952
+ // reconnect-replay protocol (dashboard.js:7048-7083).
1953
+ const SSE_MAX_QUEUE_BYTES = 4 * 1024 * 1024; // 4 MB per-tab — conservative, tunable
1954
+ const SSE_STUCK_KILL_MS = 30 * 1000; // 30s of continuous backpressure → res.destroy()
1940
1955
  function _releaseCCTab(tabId) { ccInFlightTabs.delete(tabId); ccInFlightAborts.delete(tabId); }
1941
1956
  function _getCcLiveStream(tabId) {
1942
1957
  return ccLiveStreams.get(tabId) || null;
@@ -4600,6 +4615,10 @@ const server = http.createServer(async (req, res) => {
4600
4615
  d.meta?.item?.id === id
4601
4616
  );
4602
4617
 
4618
+ // (W-mpfsl2rw000m9469) Archiving removes a row from work-items.json, which
4619
+ // is part of /api/status fast-state. Match every other mutating handler
4620
+ // and invalidate so the dashboard reflects the archive immediately.
4621
+ invalidateStatusCache();
4603
4622
  return jsonReply(res, 200, { ok: true, id });
4604
4623
  } catch (e) { return jsonReply(res, 400, { error: e.message }); }
4605
4624
  }
@@ -4803,6 +4822,10 @@ const server = http.createServer(async (req, res) => {
4803
4822
  });
4804
4823
  }
4805
4824
  recordCcTurnIfPresent(req, { kind: 'work-item', id, title: item.title, project: item.project || null });
4825
+ // (W-mpfsl2rw000m9469) Invalidate so the next ≤4s SPA poll sees the
4826
+ // followup WI instead of being served a stale 304. Sibling mutating handlers
4827
+ // (delete/cancel/retry/reopen/notes/plan) all invalidate before reply.
4828
+ invalidateStatusCache();
4806
4829
  return jsonReply(res, 200, { ok: true, id });
4807
4830
  }
4808
4831
  const createResult = createWorkItemWithDedup(wiPath, item);
@@ -4811,6 +4834,10 @@ const server = http.createServer(async (req, res) => {
4811
4834
  return jsonReply(res, 200, { ok: true, id: duplicateId, duplicate: true, duplicateOf: duplicateId });
4812
4835
  }
4813
4836
  recordCcTurnIfPresent(req, { kind: 'work-item', id, title: item.title, project: item.project || null });
4837
+ // (W-mpfsl2rw000m9469) Mirror the followup-branch invalidation above so
4838
+ // the new WI shows up on the next dashboard poll instead of waiting for
4839
+ // the 10s periodic push.
4840
+ invalidateStatusCache();
4814
4841
  return jsonReply(res, 200, { ok: true, id });
4815
4842
  } catch (e) { return jsonReply(res, 400, { error: e.message }); }
4816
4843
  }
@@ -6372,6 +6399,13 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6372
6399
  let _docStreamEnded = false;
6373
6400
  let _docHeartbeatTimer = null;
6374
6401
  const writeDocEvent = (payload) => {
6402
+ // TODO(W-mpetru71000re5de): doc-chat SSE has the same unbounded-queue
6403
+ // failure mode as CC's writeCcEvent — res.write() returning false from
6404
+ // backpressure silently queues bytes in Node's WritableState.buffered[].
6405
+ // Out of scope for this fix (task is scoped to CC only). When this is
6406
+ // addressed, mirror the SSE_MAX_QUEUE_BYTES shed + SSE_STUCK_KILL_MS
6407
+ // heartbeat force-close pattern from the writeCcEvent closure
6408
+ // (dashboard.js, search for SSE_MAX_QUEUE_BYTES).
6375
6409
  try {
6376
6410
  res.write('data: ' + JSON.stringify(payload) + '\n\n');
6377
6411
  return true;
@@ -7315,6 +7349,17 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7315
7349
  let _ccStreamEnded = false;
7316
7350
  let _ccHeartbeatTimer = null;
7317
7351
  let _ccLastHeartbeatAt = Date.now();
7352
+ // W-mpetru71000re5de — per-stream backpressure clock + approximate queued
7353
+ // bytes. _bpStartedAt timestamps the first write that returned false (and
7354
+ // is reset on 'drain'); _queuedBytes accumulates write sizes pushed past
7355
+ // res.writable's highWaterMark (best-effort — Node's internal buffer
7356
+ // length is private). Used by writeCcEvent (shed) + the heartbeat tick
7357
+ // (force-close stuck streams).
7358
+ let _bpStartedAt = null;
7359
+ let _queuedBytes = 0;
7360
+ try {
7361
+ res.on('drain', () => { _bpStartedAt = null; _queuedBytes = 0; });
7362
+ } catch { /* listener registration is best-effort */ }
7318
7363
  // W-mpdavudb000v8446 — SSE delivery telemetry. Previously writeCcEvent
7319
7364
  // swallowed all write failures (res.destroyed / res.write returning false
7320
7365
  // for backpressure / sync throw), and the [cc-timing] log only proved
@@ -7351,6 +7396,30 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7351
7396
  _logFail('json-serialize-failed', { error: String((err && err.message) || err).slice(0, 200) });
7352
7397
  return false;
7353
7398
  }
7399
+ // W-mpetru71000re5de — shed wire frames once the per-tab queue exceeds
7400
+ // the cap. Safe ONLY because liveState.text / liveState.tools are
7401
+ // populated BEFORE writer({...}) is called at every call site:
7402
+ // - dashboard.js:6753-6754 (legacy direct path: text → writer)
7403
+ // - dashboard.js:6760-6761 (legacy direct path: tools → writer)
7404
+ // - dashboard.js:6854-6855 (pool path: text → writer)
7405
+ // - dashboard.js:6864-6865 (pool path: tools → writer)
7406
+ // The reconnect path (dashboard.js:7048-7083) replays from liveState.*,
7407
+ // so dropped wire frames are fully recoverable on reattach. We still
7408
+ // bump _ccTelemetry counters so the [cc-stream] outcome log line stays
7409
+ // truthful about what the orchestrator produced — only the wire was
7410
+ // shed, the work happened.
7411
+ if (_queuedBytes > SSE_MAX_QUEUE_BYTES) {
7412
+ try {
7413
+ shared.log('warn', `[cc-sse-shed] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} queuedBytes=${_queuedBytes} wireBytes=${wire.length}`);
7414
+ } catch { /* telemetry is best-effort */ }
7415
+ if (type === 'chunk') {
7416
+ _ccTelemetry.chunks++;
7417
+ _ccTelemetry.bytes += Buffer.byteLength(String((payload && payload.text) || ''), 'utf8');
7418
+ } else if (type === 'tool') {
7419
+ _ccTelemetry.tools++;
7420
+ }
7421
+ return true;
7422
+ }
7354
7423
  let writeOk;
7355
7424
  try { writeOk = res.write(wire); }
7356
7425
  catch (err) {
@@ -7362,8 +7431,16 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7362
7431
  // The write IS still queued, so don't treat this as a failure, but
7363
7432
  // surface it so a slow consumer is visible in telemetry. Most CC
7364
7433
  // chunks are small enough that we never hit this in practice.
7434
+ // W-mpetru71000re5de — also start (or extend) the per-stream
7435
+ // backpressure clock and accumulate approximate queued bytes. The
7436
+ // heartbeat tick force-closes the stream once the clock exceeds
7437
+ // SSE_STUCK_KILL_MS; writeCcEvent above sheds further frames once
7438
+ // _queuedBytes exceeds SSE_MAX_QUEUE_BYTES. _bpStartedAt + _queuedBytes
7439
+ // are reset by the res.on('drain') listener registered above.
7440
+ if (_bpStartedAt == null) _bpStartedAt = Date.now();
7441
+ _queuedBytes += wire.length;
7365
7442
  try {
7366
- shared.log('warn', `[cc-sse-backpressure] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} bytes=${wire.length}`);
7443
+ shared.log('warn', `[cc-sse-backpressure] tab=${tabId || _ccTelemetry.tabId || 'unknown'} type=${type} bytes=${wire.length} queuedBytes=${_queuedBytes} bpMs=${Date.now() - _bpStartedAt}`);
7367
7444
  } catch { /* telemetry is best-effort */ }
7368
7445
  }
7369
7446
  if (payload && payload.type === 'chunk') {
@@ -7405,6 +7482,22 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7405
7482
  stopCcHeartbeat();
7406
7483
  return;
7407
7484
  }
7485
+ // W-mpetru71000re5de — force-close streams stuck on backpressure.
7486
+ // res.destroy() fires req.on('close'), which triggers the existing
7487
+ // teardown path (detach writer → schedule abort → cleanup), so the
7488
+ // queued bytes are sheddable via GC and the LLM is aborted. Catch-22
7489
+ // workaround: _scheduleCcLiveAbort bails while state.writer is
7490
+ // non-null, and the writer stays non-null until req's 'close' event fires; that's
7491
+ // exactly what res.destroy() triggers.
7492
+ if (_bpStartedAt && Date.now() - _bpStartedAt > SSE_STUCK_KILL_MS) {
7493
+ const stuckMs = Date.now() - _bpStartedAt;
7494
+ try {
7495
+ shared.log('warn', `[cc-sse-stuck-close] tab=${tabId || _ccTelemetry.tabId || 'unknown'} stuckMs=${stuckMs} queuedBytes=${_queuedBytes}`);
7496
+ } catch { /* telemetry is best-effort */ }
7497
+ stopCcHeartbeat();
7498
+ try { res.destroy(); } catch { /* swallow — req.on('close') will still fire */ }
7499
+ return;
7500
+ }
7408
7501
  _checkStall();
7409
7502
  if (!writeCcEvent({ type: 'heartbeat' })) {
7410
7503
  stopCcHeartbeat();
package/docs/README.md CHANGED
@@ -21,7 +21,9 @@ Architecture, design proposals, and lifecycle references for people working on t
21
21
  - [kb-sweep.md](kb-sweep.md) — Knowledge-base consolidation sweep (hash dedup → LLM batch dedup/reclassify → per-entry compress) and the detached runner that keeps it alive across `minions restart`.
22
22
  - [managed-spawn.md](managed-spawn.md) — Engine-owned long-running services (managed-spawn primitive): sidecar schema, healthcheck examples, lifecycle, dashboard API, and the WI 1 (build) → WI 2 (test) chained-validation pattern.
23
23
  - [plan-lifecycle.md](plan-lifecycle.md) — Full plan pipeline from `/plan` through PRD materialization, dispatch with dependency gating, verify task, and human archive.
24
+ - [pr-comment-followup.md](pr-comment-followup.md) — PR-comment follow-up dispatch contract: fix/review agents may spin off a new WI via `POST /api/work-items` with `meta.pr_followup` instead of broadening the current PR or rebutting the comment.
24
25
  - [pr-review-fix-loop.md](pr-review-fix-loop.md) — How the engine moves a PR from creation through review, fix dispatch, and re-review, including stale-status guards.
26
+ - [qa-runbooks.md](qa-runbooks.md) — Per-project QA runbook schema, storage layout (`projects/<name>/runbooks/<id>.json`), CRUD endpoints, run-record lifecycle, and the `qa-validate` agent sidecar contract.
25
27
  - [rfc-completion-json.md](rfc-completion-json.md) — RFC for replacing stdout regex-scraping with a structured `completion.json` control-plane protocol.
26
28
  - [runtime-adapters.md](runtime-adapters.md) — Runtime adapter contract (`engine/runtimes/*`): how the engine talks to Claude Code, Copilot CLI, and future CLIs through a single capability-flagged interface.
27
29
  - [self-improvement.md](self-improvement.md) — The six self-improvement mechanisms (learnings inbox, per-agent history, review feedback, quality metrics, etc.) that form Minions' continuous feedback loop.
@@ -17,6 +17,7 @@ tick()
17
17
  2. consolidateInbox() Merge learnings into notes.md (Haiku-powered)
18
18
  2.5 runCleanup() Periodic cleanup (every 10 ticks ≈ 10min)
19
19
  2.52 sweepKeepProcesses() keep_processes TTL/dead-PID sweep (every 30 ticks)
20
+ 2.53 sweepManagedSpawn() managed_spawn TTL/dead-PID/log-rotate sweep (every 30 ticks)
20
21
  2.55 checkWatches() Persistent watch jobs (every 3 tick-equivalents)
21
22
  2.6 pollPrStatus() Poll ADO + GitHub for build, review, merge status (wall-clock cadence from prPollStatusEvery × tickInterval, default ≈ 12min)
22
23
  processPendingRebases() Run any rebase work queued from the previous tick
@@ -38,7 +38,7 @@ The sidecar lives at `<MINIONS_DIR>/agents/<agentId>/managed-spawn.json` and is
38
38
  "cwd": "D:/repos/constellation", // must be inside a real git worktree (requireGitWorkdir: true) — monorepo subdirs ok, ancestor walked up to gitWorktreeMaxParentDepth parents
39
39
  "env": { "CONSTELLATION_SERVER": "http://localhost:3000" }, // ≤32 keys; POSIX-shape + denylist enforced
40
40
  "ports": [3001], // 1024-65535; ≤20 per spec; advisory only (engine doesn't bind)
41
- "ttl_minutes": 240, // ≤1440 (24h hard cap); defaults to 240 (4h)
41
+ "ttl_minutes": 240, // ≤1440 (24h hard cap); defaults to 720 (12h)
42
42
  "attrs": { // opaque per-spec metadata, ≤2048 bytes serialized
43
43
  "base_url": "http://localhost:3001",
44
44
  "framework": "vite"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2004",
3
+ "version": "0.1.2006",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"