@yemi33/minions 0.1.2044 → 0.1.2045

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -194,6 +194,35 @@ function _ccIsNewDashboardInstance(preRestartDashId, newDashId, clickTimeMs) {
194
194
  return Number.isFinite(parsed) && parsed > clickTimeMs;
195
195
  }
196
196
 
197
+ // Shown when ccRestartMinions's POST to /api/dashboard/restart can't be
198
+ // delivered — i.e. the dashboard process itself is dead, not just stale. In
199
+ // that state the in-browser button cannot bring anything back up: a detached
200
+ // `minions restart` child is only spawned if the POST reaches the dashboard.
201
+ // The recovery path lives outside the browser. Surface that fact explicitly
202
+ // instead of polling for a new dashboardStartedAt that never arrives.
203
+ function _ccShowDashboardDeadFallback(btn, reason) {
204
+ var humanReason = reason || 'connection refused';
205
+ var msg = 'The dashboard process appears to be down — the Restart button can\'t reach it (' + humanReason + ').\n\nRun this in your terminal to recover:\n\n minions restart';
206
+ if (btn) {
207
+ try {
208
+ btn.disabled = false;
209
+ btn.textContent = 'Run `minions restart` in terminal';
210
+ } catch {}
211
+ }
212
+ var copyOk = function() {
213
+ if (typeof showToast === 'function') showToast('cmd-toast', '`minions restart` copied — paste in your terminal', true);
214
+ };
215
+ var copyFail = function() {
216
+ try { window.prompt('Copy this command, then run it in your terminal:', 'minions restart'); }
217
+ catch { try { alert(msg); } catch {} }
218
+ };
219
+ try {
220
+ if (navigator.clipboard && navigator.clipboard.writeText) {
221
+ navigator.clipboard.writeText('minions restart').then(copyOk).catch(copyFail);
222
+ } else { copyFail(); }
223
+ } catch { copyFail(); }
224
+ }
225
+
197
226
  // Triggered by the CC "Restart Minions" recovery button when a stale dashboard
198
227
  // connection is killing CC streams with "Failed to fetch". Spawns the same
199
228
  // `minions restart` flow as the CLI command (kills + respawns engine AND
@@ -242,14 +271,42 @@ async function ccRestartMinions(btn) {
242
271
  } catch { /* best-effort — clickTime fallback inside helper still covers us */ }
243
272
  }
244
273
 
245
- // Fire-and-forget the restart POST. We do NOT await it — the dashboard often
246
- // kills its own process before the response is flushed, so the fetch throws
247
- // even though the restart child (a detached `minions restart`) is happily
248
- // running. The polling loop is the source of truth for completion.
274
+ // Briefly await the POST so we can distinguish two failure shapes that look
275
+ // identical from a fire-and-forget callsite:
276
+ //
277
+ // (a) Dashboard alive when POST landed but died mid-response. The spawned
278
+ // restart child is running — polling /api/status WILL eventually see
279
+ // a new dashboardStartedAt and the existing reload path works.
280
+ //
281
+ // (b) Dashboard already dead when the user clicked. ECONNREFUSED before
282
+ // any byte hits the wire. No restart child was spawned, polling would
283
+ // wait the full 90 s and time out into a doomed reload (browser
284
+ // lands on a port-not-listening error). This is the case operators
285
+ // most often hit — the user just witnessed it today.
286
+ //
287
+ // 4 s is enough for the POST to deliver headers on a healthy box. If we
288
+ // saw EITHER res.ok OR a delayed-disconnect-with-headers-seen, assume the
289
+ // spawned child is on its way and start the polling loop. Otherwise fall
290
+ // back to the terminal-runnable command.
291
+ var postCtl = new AbortController();
292
+ var postTimer = setTimeout(function() { postCtl.abort(); }, 4000);
293
+ var postDelivered = false;
294
+ var postError = null;
249
295
  try {
250
- fetch('/api/dashboard/restart', { method: 'POST', headers: { 'Content-Type': 'application/json' } })
251
- .catch(function() { /* dashboard process likely killed mid-response — expected */ });
252
- } catch { /* network layer threw before fetch even queued — also expected */ }
296
+ var postRes = await fetch('/api/dashboard/restart', { method: 'POST', headers: { 'Content-Type': 'application/json' }, signal: postCtl.signal });
297
+ if (postRes && postRes.ok) postDelivered = true;
298
+ else postError = 'HTTP ' + (postRes ? postRes.status : '?');
299
+ } catch (e) {
300
+ postError = String((e && e.message) || e);
301
+ } finally {
302
+ clearTimeout(postTimer);
303
+ }
304
+
305
+ if (!postDelivered) {
306
+ _ccShowDashboardDeadFallback(btn, postError);
307
+ return;
308
+ }
309
+
253
310
  if (btn) { try { btn.textContent = 'Restarting Minions — waiting for new dashboard...'; } catch {} }
254
311
 
255
312
  var startedAt = Date.now();
@@ -71,7 +71,7 @@ const RENDER_VERSIONS = {
71
71
  projects: 1,
72
72
  notes: 1,
73
73
  prd: 1,
74
- prs: 1,
74
+ prs: 2,
75
75
  archivedPrds: 1,
76
76
  engine: 2,
77
77
  version: 1,
@@ -206,7 +206,16 @@ function _processStatusUpdate(data) {
206
206
 
207
207
 
208
208
  // Render only changed sections
209
- if (_changed('agents', data.agents)) { renderAgents(data.agents); cmdUpdateAgentList(data.agents); }
209
+ // Agents is exempt from the _changed gate: real-time status correctness on
210
+ // the Minions Members grid (status badge, running timer, Last-run line)
211
+ // beats the cost of re-rendering 5 cards every poll tick. The gate was
212
+ // causing visible staleness when the ref-eq / JSON-stringify short-circuit
213
+ // falsely matched across ticks (W-mpn7keq9000302c9). Still call _changed
214
+ // here so the _lastChangedFlags diag ring-buffer keeps recording whether
215
+ // the agents payload actually moved this tick.
216
+ _changed('agents', data.agents);
217
+ renderAgents(data.agents);
218
+ cmdUpdateAgentList(data.agents);
210
219
  if (_changed('prdProgress', data.prdProgress) || _changed('prdPrs', data.pullRequests?.length)) { renderPrdProgress(data.prdProgress); _cachePrdItems(data.prdProgress); }
211
220
  if (_changed('inbox', data.inbox)) renderInbox(data.inbox || []);
212
221
  if (_changed('projects', data.projects)) { cmdUpdateProjectList(data.projects || []); renderProjects(data.projects || []); }
@@ -343,6 +352,116 @@ let _lastStatusData = null;
343
352
  // fresh state anyway.
344
353
  let _refreshInFlight = false;
345
354
 
355
+ // ── Dashboard-unreachable detector ───────────────────────────────────────
356
+ // When the dashboard process dies, /api/status throws or 5xxs and the
357
+ // existing catch block just console.errors — the page keeps painting the
358
+ // last successful snapshot. Operators have reported the resulting symptom
359
+ // many times: badge says "running", CC POST throws "Failed to fetch", and
360
+ // it's not obvious the dashboard itself is dead (vs. wedged, vs. the
361
+ // engine being down).
362
+ //
363
+ // Trip conditions: 2 consecutive failed polls OR >12 s since the last
364
+ // success (3× the 4 s poll cadence, so a single flaky tick doesn't fire).
365
+ // On trip we show a sticky red banner with one recovery action:
366
+ // "Restart Minions" → ccRestartMinions (works when dashboard is
367
+ // alive-but-stale; falls through to copy-to-clipboard when the POST
368
+ // itself fails).
369
+ // That copy-to-clipboard fallback covers the dashboard-is-truly-dead
370
+ // case the in-browser button can't fix on its own.
371
+ // Also overrides the engine badge to UNKNOWN/muted so the misleading
372
+ // "RUNNING" pill stops showing while data is frozen.
373
+ let _lastStatusOkAt = Date.now();
374
+ let _consecutiveStatusFails = 0;
375
+ let _unreachableSince = 0; // 0 = currently reachable
376
+ let _unreachableAgeTimer = null;
377
+ const _UNREACHABLE_FAIL_THRESHOLD = 2;
378
+ const _UNREACHABLE_AGE_MS = 12000;
379
+
380
+ function _formatAge(ms) {
381
+ if (ms < 1000) return 'just now';
382
+ const s = Math.round(ms / 1000);
383
+ if (s < 60) return s + 's ago';
384
+ const m = Math.floor(s / 60);
385
+ const rem = s % 60;
386
+ return rem ? m + 'm ' + rem + 's ago' : m + 'm ago';
387
+ }
388
+
389
+ function _refreshUnreachableAgeText() {
390
+ if (!_unreachableSince) return;
391
+ // The age span is rendered into the shared #engine-alert element by
392
+ // _markDashboardUnreachable; lookup by ID survives any inner-HTML rebuild
393
+ // as long as the markup keeps the span around (it does — single source).
394
+ const el = document.getElementById('dashboard-unreachable-age');
395
+ if (el) el.textContent = _formatAge(Date.now() - _lastStatusOkAt);
396
+ }
397
+
398
+ function _markDashboardUnreachable(err) {
399
+ if (_unreachableSince) {
400
+ _refreshUnreachableAgeText();
401
+ return;
402
+ }
403
+ _unreachableSince = Date.now();
404
+ window._dashboardUnreachable = {
405
+ since: _unreachableSince,
406
+ lastSuccessAt: _lastStatusOkAt,
407
+ lastError: String(err && err.message || err || 'unknown'),
408
+ };
409
+ // Reuse the existing #engine-alert surface (red-tinted banner already wired
410
+ // for engine-stale, ado-throttle, gh-throttle) instead of introducing a
411
+ // second red banner. Engine-stale and dashboard-unreachable are mutually
412
+ // exclusive in practice: engine-stale needs a fresh heartbeat from a
413
+ // successful poll, and a successful poll means the dashboard IS reachable.
414
+ // When the dashboard recovers, _markDashboardReachable hides this element
415
+ // explicitly; the next renderEngineAlert pass (driven by the recovered
416
+ // poll's engine state) then takes over normally.
417
+ const el = document.getElementById('engine-alert');
418
+ if (el) {
419
+ el.innerHTML =
420
+ '<span class="engine-alert-msg">&#x26A0;&#xFE0F; Dashboard unreachable &mdash; stale <span id="dashboard-unreachable-age">just now</span></span>' +
421
+ '<span class="engine-alert-action" onclick="ccRestartMinions(this)">Restart Minions</span>';
422
+ el.style.display = 'flex';
423
+ }
424
+ _refreshUnreachableAgeText();
425
+ if (_unreachableAgeTimer) clearInterval(_unreachableAgeTimer);
426
+ _unreachableAgeTimer = setInterval(_refreshUnreachableAgeText, 1000);
427
+ // Override engine badge so the cached "RUNNING" doesn't keep misleading
428
+ // the user. renderEngineStatus reads engine state from the next
429
+ // successful poll — when the dashboard becomes reachable again, the override clears.
430
+ const badge = document.getElementById('engine-badge');
431
+ if (badge) {
432
+ badge.className = 'engine-badge stopped';
433
+ badge.textContent = 'UNKNOWN';
434
+ badge.title = 'Dashboard unreachable — engine state is unknown. UI data is stale.';
435
+ }
436
+ console.warn('Dashboard unreachable:', window._dashboardUnreachable.lastError);
437
+ }
438
+
439
+ function _markDashboardReachable() {
440
+ if (!_unreachableSince) return;
441
+ const downForMs = Date.now() - _unreachableSince;
442
+ _unreachableSince = 0;
443
+ delete window._dashboardUnreachable;
444
+ if (_unreachableAgeTimer) { clearInterval(_unreachableAgeTimer); _unreachableAgeTimer = null; }
445
+ // Hand #engine-alert back to renderEngineAlert. We can't leave our content
446
+ // sitting in there because the next renderEngineAlert call only runs when
447
+ // `_changed('engine', data.engine)` returns true — if engine state hasn't
448
+ // shifted, our stale banner would persist past recovery.
449
+ const el = document.getElementById('engine-alert');
450
+ if (el) { el.style.display = 'none'; el.innerHTML = ''; }
451
+ console.log('Dashboard recovered after', _formatAge(downForMs).replace(' ago', ''));
452
+ // Badge restoration happens automatically on the next renderEngineStatus
453
+ // call (triggered by the successful refresh that brought us here).
454
+ }
455
+
456
+ // Test seam — reset detector state between scenarios.
457
+ window._resetDashboardUnreachableForTest = function() {
458
+ _lastStatusOkAt = Date.now();
459
+ _consecutiveStatusFails = 0;
460
+ _unreachableSince = 0;
461
+ if (_unreachableAgeTimer) { clearInterval(_unreachableAgeTimer); _unreachableAgeTimer = null; }
462
+ delete window._dashboardUnreachable;
463
+ };
464
+
346
465
  // ── Refresh diagnostics (W-mphejzx100081972) ─────────────────────────────
347
466
  // Ring buffer capturing the last 50 /api/status poll cycles so a user
348
467
  // reporting "the dashboard didn't auto-update when X changed" can paste
@@ -418,6 +537,11 @@ async function refresh() {
418
537
  const headers = {};
419
538
  if (_lastStatusEtag) headers['If-None-Match'] = _lastStatusEtag;
420
539
  const res = await safeFetch('/api/status', { headers });
540
+ if (!res || (!res.ok && res.status !== 304)) {
541
+ // Dashboard responded but with an error status. Treat as a fail tick so
542
+ // a 5xx-storm trips the unreachable banner just like a network error.
543
+ throw new Error('HTTP ' + (res ? res.status : '?'));
544
+ }
421
545
  let data;
422
546
  if (res.status === 304 && _lastStatusData) {
423
547
  // Cache hit — reuse last payload, skip parsing entirely.
@@ -425,6 +549,12 @@ async function refresh() {
425
549
  if (_diagEntry) {
426
550
  _diagEntry.response_status = '304';
427
551
  _diagEntry.bytes_received = 0;
552
+ // D2: capture etag on 304 so the diag table can show whether the
553
+ // server's ETag advanced even when we're reusing the cached body.
554
+ // Without this the "etag↓" column is blank on every 304 row and the
555
+ // operator can't tell server-side advancement from a pinned cache.
556
+ const etag304 = res.headers && (res.headers.get ? res.headers.get('etag') : null);
557
+ if (etag304) _diagEntry.etag_received = etag304;
428
558
  }
429
559
  } else {
430
560
  data = await res.json();
@@ -458,6 +588,12 @@ async function refresh() {
458
588
  return;
459
589
  }
460
590
  if (buildId) _knownDashboardBuildId = buildId;
591
+ // Successful poll — clear unreachable state. Placed AFTER the reload
592
+ // guards above so a dashboard restart still triggers location.reload()
593
+ // instead of just dismissing the banner.
594
+ _lastStatusOkAt = Date.now();
595
+ _consecutiveStatusFails = 0;
596
+ if (_unreachableSince) _markDashboardReachable();
461
597
  const _renderStart = _diagOn ? Date.now() : 0;
462
598
  let _diagChanges = null;
463
599
  if (_diagOn) {
@@ -484,6 +620,11 @@ async function refresh() {
484
620
  _diagEntry.response_status = (e && e.name === 'AbortError') ? 'abort' : 'error';
485
621
  _diagEntry.error_message = String((e && e.message) || e);
486
622
  }
623
+ _consecutiveStatusFails++;
624
+ const ageMs = Date.now() - _lastStatusOkAt;
625
+ if (_consecutiveStatusFails >= _UNREACHABLE_FAIL_THRESHOLD || ageMs > _UNREACHABLE_AGE_MS) {
626
+ _markDashboardUnreachable(e);
627
+ }
487
628
  }
488
629
  finally {
489
630
  _refreshInFlight = false;
@@ -51,22 +51,56 @@ function prRow(pr) {
51
51
  var followupChip = followupCount > 0
52
52
  ? ' <span class="pr-badge draft" style="font-size:8px" title="' + followupCount + ' follow-up work item(s) dispatched from comments on this PR">+' + followupCount + ' follow-up' + (followupCount === 1 ? '' : 's') + '</span>'
53
53
  : '';
54
+ const titleText = pr.title || 'Untitled';
55
+ const agentText = pr.agent || '—';
56
+ const reviewerCell = sq.reviewer && sq.status !== 'waiting'
57
+ ? '<span class="pr-agent" title="' + escapeHtml(sq.note || sq.reviewer) + '">' + escapeHtml(sq.reviewer) + '</span>'
58
+ : sq.reviewer && sq.status === 'waiting'
59
+ ? '<span class="pr-agent" style="color:var(--muted)" title="Vote pending confirmation">' + escapeHtml(sq.reviewer) + '…</span>'
60
+ : pr.reviewedBy && pr.reviewedBy.length
61
+ ? '<span class="pr-agent" title="' + escapeHtml(pr.reviewedBy.join(', ')) + '">' + escapeHtml(pr.reviewedBy.join(', ')) + '</span>'
62
+ : '<span style="color:var(--muted);font-size:11px">—</span>';
63
+ const createdLabel = (pr.created || '—').slice(0, 16).replace('T', ' ');
64
+ // Title attrs live on the inner element (link/span/badge) so hovering the
65
+ // ellipsis-truncated content reveals the full text. Cell tags stay bare so
66
+ // the header-to-cell count assertion in test/unit.test.js continues to
67
+ // balance.
54
68
  return '<tr>' +
55
- '<td><span class="pr-id">' + escapeHtml(String(prId)) + '</span></td>' +
56
- '<td><a class="pr-title" href="' + escapeHtml(safeUrl(url)) + '" target="_blank" rel="noopener">' + escapeHtml(pr.title || 'Untitled') + '</a>' + followupChip + (pr.description ? '<div class="pr-desc">' + escapeHtml(pr.description.length > 120 ? pr.description.slice(0, 120) + '...' : pr.description) + '</div>' : '') + '</td>' +
57
- '<td><span class="pr-agent">' + escapeHtml(pr.agent || '') + '</span></td>' +
69
+ '<td><span class="pr-id" title="' + escapeHtml(String(prId)) + '">' + escapeHtml(String(prId)) + '</span></td>' +
70
+ '<td><a class="pr-title" title="' + escapeHtml(titleText) + '" href="' + escapeHtml(safeUrl(url)) + '" target="_blank" rel="noopener">' + escapeHtml(titleText) + '</a>' + followupChip + (pr.description ? '<div class="pr-desc" title="' + escapeHtml(pr.description) + '">' + escapeHtml(pr.description.length > 120 ? pr.description.slice(0, 120) + '...' : pr.description) + '</div>' : '') + '</td>' +
71
+ '<td><span class="pr-agent" title="' + escapeHtml(agentText) + '">' + escapeHtml(agentText) + '</span></td>' +
58
72
  '<td><span class="' + branchClass + '" title="' + escapeHtml(branchError || branchLabel) + '">' + escapeHtml(branchLabel) + '</span>' + pendingReasonHtml + '</td>' +
59
- '<td><span class="pr-badge ' + reviewClass + '"' + (reviewTitle ? ' title="' + escapeHtml(reviewTitle) + '"' : '') + '>' + escapeHtml(reviewLabel) + '</span></td>' +
60
- '<td>' + (sq.reviewer && sq.status !== 'waiting' ? '<span class="pr-agent" title="' + escapeHtml(sq.note || '') + '">' + escapeHtml(sq.reviewer) + '</span>' : sq.reviewer && sq.status === 'waiting' ? '<span class="pr-agent" style="color:var(--muted)" title="Vote pending confirmation">' + escapeHtml(sq.reviewer) + '…</span>' : pr.reviewedBy && pr.reviewedBy.length ? '<span class="pr-agent">' + escapeHtml(pr.reviewedBy.join(', ')) + '</span>' : '<span style="color:var(--muted);font-size:11px">—</span>') + '</td>' +
61
- '<td><span class="pr-badge ' + buildClass + '"' + (buildTitle ? ' title="' + escapeHtml(buildTitle) + '"' : '') + '>' + escapeHtml(buildLabel) + '</span></td>' +
62
- '<td><span class="pr-badge ' + statusClass + '">' + escapeHtml(statusLabel) + '</span></td>' +
63
- '<td><span class="pr-date">' + escapeHtml((pr.created || '').slice(0, 16).replace('T', ' ')) + '</span></td>' +
73
+ '<td><span class="pr-badge ' + reviewClass + '" title="' + escapeHtml(reviewTitle || reviewLabel) + '">' + escapeHtml(reviewLabel) + '</span></td>' +
74
+ '<td>' + reviewerCell + '</td>' +
75
+ '<td><span class="pr-badge ' + buildClass + '" title="' + escapeHtml(buildTitle || buildLabel) + '">' + escapeHtml(buildLabel) + '</span></td>' +
76
+ '<td><span class="pr-badge ' + statusClass + '" title="' + escapeHtml(statusLabel) + '">' + escapeHtml(statusLabel) + '</span></td>' +
77
+ '<td><span class="pr-date" title="' + escapeHtml(createdLabel) + '">' + escapeHtml(createdLabel) + '</span></td>' +
64
78
  '<td><button class="pr-pager-btn" style="font-size:9px;padding:1px 5px;color:var(--red);border-color:var(--red)" data-pr-id="' + escapeHtml(String(prId)) + '" onclick="event.stopPropagation();unlinkPr(this.dataset.prId)" title="Remove from tracking">x</button></td>' +
65
79
  '</tr>';
66
80
  }
67
81
 
82
+ // Explicit per-column widths keep the PR table from ballooning when titles or
83
+ // branches are long. Total ≈1420px → table grows past viewport on narrow
84
+ // windows and the .pr-table-wrap--prs container scrolls horizontally inside
85
+ // the viewport (sticky scrollbar — see styles.css).
86
+ const PRS_COLGROUP =
87
+ '<colgroup>' +
88
+ '<col style="width:75px">' + // PR id
89
+ '<col style="width:320px">' + // Title
90
+ '<col style="width:140px">' + // Agent
91
+ '<col style="width:200px">' + // Branch
92
+ '<col style="width:130px">' + // Review
93
+ '<col style="width:140px">' + // Signed Off By
94
+ '<col style="width:130px">' + // Build
95
+ '<col style="width:110px">' + // Status
96
+ '<col style="width:130px">' + // Created
97
+ '<col style="width:50px">' + // Actions
98
+ '</colgroup>';
99
+
68
100
  function prTableHtml(rows) {
69
- return '<div class="pr-table-wrap"><table class="pr-table"><thead><tr>' +
101
+ return '<div class="pr-table-wrap pr-table-wrap--prs"><table class="pr-table pr-table--prs">' +
102
+ PRS_COLGROUP +
103
+ '<thead><tr>' +
70
104
  '<th>PR</th><th>Title</th><th>Agent</th><th>Branch</th><th>Review</th><th>Signed Off By</th><th>Build</th><th>Status</th><th>Created</th><th></th>' +
71
105
  '</tr></thead><tbody>' + rows + '</tbody></table></div>';
72
106
  }
@@ -98,6 +98,7 @@ async function openSettings() {
98
98
  settingsToggle('Auto-decompose', 'set-autoDecompose', e.autoDecompose !== false, 'Large implement items are auto-split into sub-tasks') +
99
99
  settingsToggle('Allow Temp Agents', 'set-allowTempAgents', !!e.allowTempAgents, 'Spawn ephemeral agents when all permanent agents are busy') +
100
100
  settingsToggle('Auto-archive Plans', 'set-autoArchive', !!e.autoArchive, 'Automatically archive plans after verify completes (off = manual archive via dashboard)') +
101
+ settingsToggle('Auto-consolidate Memory', 'set-autoConsolidateMemory', !!e.autoConsolidateMemory, 'Periodically spawn the KB sweep (dedup + compress + normalize knowledge/) from the engine tick on a 4h cadence. Inbox→notes consolidation already runs every tick (gated by the Consolidation Threshold above); this toggle controls only the KB sweep that was previously dashboard-button-only.') +
101
102
  settingsToggle('Auto-complete PRs', 'set-autoCompletePrs', !!e.autoCompletePrs, 'Auto-merge PRs when builds pass and review is approved (opt-in)') +
102
103
  settingsToggle('CC Worker Pool', 'set-ccUseWorkerPool', (e.ccUseWorkerPool === undefined ? ((e.ccCli || e.defaultCli) === 'copilot') : !!e.ccUseWorkerPool), 'Route Command Center / doc-chat through a persistent copilot --acp worker per tab instead of spawning a fresh CLI per turn. Copilot-only (Agent Client Protocol transport); Claude does not implement ACP, so this toggle has no effect when CC runtime is Claude. Default ON for copilot (cold-spawn ~20s on Windows); forced OFF for non-copilot CC runtimes regardless of this toggle.') +
103
104
  '</div>' +
@@ -127,6 +128,7 @@ async function openSettings() {
127
128
  '<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;margin-bottom:16px">' +
128
129
  settingsField('Eval Max Cost', 'set-evalMaxCost', e.evalMaxCost === null || e.evalMaxCost === undefined ? '' : e.evalMaxCost, '$', 'USD ceiling per work item across all eval iterations (blank = no limit)') +
129
130
  settingsField('Agent Busy Reassign', 'set-agentBusyReassignMs', e.agentBusyReassignMs || 600000, 'ms', 'Reassign work to another agent after it waits this long on a busy agent') +
131
+ settingsField('Max Retries Per Agent', 'set-maxRetriesPerAgent', e.maxRetriesPerAgent ?? 2, '', 'After the same agent fails the same work item this many times, the next retry reassigns to a different eligible agent (consults routing.md + availability). Falls back to the same agent only when no alternate is available. Counted separately from total maxRetries (which still caps overall retries).') +
130
132
  settingsField('Version Check Interval', 'set-versionCheckInterval', e.versionCheckInterval || 3600000, 'ms', 'How often to check npm for updates (default: 1 hour)') +
131
133
  settingsField('Ignored Comment Authors', 'set-ignoredCommentAuthors', (e.ignoredCommentAuthors || []).join(', '), '', 'Comma-separated usernames — comments auto-closed, never trigger fixes') +
132
134
  '</div>' +
@@ -606,6 +608,7 @@ async function saveSettings() {
606
608
  autoDecompose: document.getElementById('set-autoDecompose').checked,
607
609
  allowTempAgents: document.getElementById('set-allowTempAgents').checked,
608
610
  autoArchive: document.getElementById('set-autoArchive').checked,
611
+ autoConsolidateMemory: document.getElementById('set-autoConsolidateMemory').checked,
609
612
  autoApplyReviewVote: document.getElementById('set-autoApplyReviewVote').checked,
610
613
  autoFixBuilds: document.getElementById('set-autoFixBuilds').checked,
611
614
  autoFixConflicts: document.getElementById('set-autoFixConflicts').checked,
@@ -621,6 +624,7 @@ async function saveSettings() {
621
624
  prPollCommentsEvery: document.getElementById('set-prPollCommentsEvery').value,
622
625
  evalMaxCost: document.getElementById('set-evalMaxCost').value || null,
623
626
  agentBusyReassignMs: document.getElementById('set-agentBusyReassignMs').value,
627
+ maxRetriesPerAgent: document.getElementById('set-maxRetriesPerAgent').value,
624
628
  ignoredCommentAuthors: document.getElementById('set-ignoredCommentAuthors').value,
625
629
  versionCheckInterval: document.getElementById('set-versionCheckInterval').value,
626
630
  // Runtime fleet (P-7a5c1f8e). Empty strings are intentional — they signal
@@ -260,6 +260,27 @@
260
260
  .pr-table-wrap { overflow-x: auto; }
261
261
  .pr-table { width: 100%; border-collapse: collapse; font-size: var(--text-md); table-layout: auto; }
262
262
  .pr-table th:last-child, .pr-table td:last-child { width: 36px; min-width: 36px; text-align: center; }
263
+
264
+ /* PR-page table variant (W-mpmwxn9h000bd2c2): fixed column widths with
265
+ ellipsis overflow, and the horizontal scrollbar is pinned inside the
266
+ viewport via a bounded-height container on the standalone /prs page so
267
+ it stays reachable without scrolling to the bottom of a tall table. */
268
+ .pr-table--prs { table-layout: fixed; width: 100%; min-width: 1420px; }
269
+ .pr-table--prs th, .pr-table--prs td { overflow: hidden; text-overflow: ellipsis; }
270
+ .pr-table--prs th:last-child, .pr-table--prs td:last-child { width: auto; min-width: 0; }
271
+ .pr-table--prs .pr-title { display: block; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
272
+ .pr-table--prs .pr-agent { display: inline-block; max-width: 100%; overflow: hidden; text-overflow: ellipsis; vertical-align: middle; white-space: nowrap; }
273
+ .pr-table--prs .pr-branch { max-width: 100%; }
274
+ .pr-table--prs .pr-desc { max-width: 100%; }
275
+ .pr-table--prs .pr-date { display: inline-block; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; vertical-align: middle; }
276
+
277
+ /* Standalone /prs page only: bound the table-wrap height so the horizontal
278
+ scrollbar (bottom of the wrap) and the table header (sticky inside the
279
+ wrap) stay visible while the user is on this page. The modal "see all"
280
+ view uses the same colgroup but is unaffected — modal-body handles its
281
+ own scrolling. */
282
+ #pr-content .pr-table-wrap--prs { max-height: calc(100vh - 200px); overflow: auto; }
283
+ #pr-content .pr-table--prs thead th { position: sticky; top: 0; background: var(--surface); z-index: 1; }
263
284
  .pr-table th { text-align: left; color: var(--muted); font-weight: 500; font-size: var(--text-base); text-transform: uppercase; letter-spacing: 0.5px; padding: var(--space-4) var(--space-5); border-bottom: 1px solid var(--border); }
264
285
  .pr-table td { padding: var(--space-5); border-bottom: 1px solid var(--border); vertical-align: middle; white-space: nowrap; }
265
286
  .pr-table tr:last-child td { border-bottom: none; }
package/dashboard.js CHANGED
@@ -2042,6 +2042,13 @@ function _markStatusCacheBuilt() {
2042
2042
  _statusCacheJson = null;
2043
2043
  _statusCacheGzip = null;
2044
2044
  _statusCacheVersion++;
2045
+ // A4: keep body.version.statusCacheVersion in sync with the ETag we're
2046
+ // about to return. Without this, the field lags by up to 60s because
2047
+ // `version` is built in slow-state, making refresh-diagnostics unable
2048
+ // to distinguish "server cache pinned" from "slow-state TTL not expired."
2049
+ if (_statusCache && _statusCache.version) {
2050
+ _statusCache.version.statusCacheVersion = _statusCacheVersion;
2051
+ }
2045
2052
  }
2046
2053
 
2047
2054
  function getStatus() {
@@ -5848,19 +5855,17 @@ const server = http.createServer(async (req, res) => {
5848
5855
  }
5849
5856
 
5850
5857
  async function handleKnowledgeSweep(req, res) {
5851
- // Source of truth = kb-sweep-state.json + PID liveness. The sweep now runs
5852
- // as a detached child (engine/kb-sweep-runner.js) so it survives
5853
- // `minions restart`; the in-memory `global._kbSweep*` flags from the old
5854
- // in-process implementation are gone.
5858
+ // Source of truth = kb-sweep-state.json + PID liveness. The sweep runs as
5859
+ // a detached child (engine/kb-sweep-runner.js) so it survives
5860
+ // `minions restart`. The actual spawn logic lives in
5861
+ // engine/kb-sweep.js::spawnSweepRunnerDetached, shared with the engine's
5862
+ // auto-sweep tick phase.
5855
5863
  const {
5856
- readSweepLiveness, staleGuardMs, KB_SWEEP_STATE_PATH, KB_SWEEP_LOG_PATH, KB_SWEEP_RUNNER_PATH,
5864
+ readSweepLiveness, staleGuardMs, spawnSweepRunnerDetached, KB_SWEEP_STATE_PATH,
5857
5865
  } = require('./engine/kb-sweep');
5858
5866
  const entryCount = ((await queries.getKnowledgeBaseEntries()) || []).length;
5859
5867
  const guardMs = staleGuardMs(entryCount);
5860
5868
 
5861
- // Synchronous pre-claim BEFORE awaiting the body so a concurrent POST
5862
- // arriving in the same tick sees in-flight state and can't double-spawn.
5863
- const sweepToken = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
5864
5869
  const liveness = readSweepLiveness({ entryCount });
5865
5870
  if (liveness.inFlight && liveness.stale) {
5866
5871
  const reason = !liveness.alive
@@ -5875,79 +5880,15 @@ const server = http.createServer(async (req, res) => {
5875
5880
  });
5876
5881
  }
5877
5882
 
5878
- // Claim the slot synchronously by writing a "starting" state. The runner
5879
- // will overwrite this with status:'in-flight' + its real pid once it boots.
5880
- // readSweepLiveness grants a 15s boot-grace to "starting" records with no pid.
5881
- const startedAt = Date.now();
5882
- try {
5883
- safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
5884
- status: 'starting', startedAt, startedAtIso: new Date().toISOString(),
5885
- sweepToken, pid: null,
5886
- }));
5887
- } catch (e) {
5888
- console.error(`[kb-sweep] failed to write starting state: ${e.message}`);
5889
- }
5890
-
5891
5883
  const body = await readBody(req).catch(() => ({}));
5892
-
5893
- // Persist body to a temp file so spawn doesn't have to serialize large
5894
- // pinnedKeys arrays via argv. Skip when body is empty.
5895
- let bodyFile = null;
5896
- if (body && (Array.isArray(body.pinnedKeys) || body.dryRun != null)) {
5897
- bodyFile = path.join(ENGINE_DIR, `tmp-kb-sweep-body-${sweepToken}.json`);
5898
- try { safeWrite(bodyFile, JSON.stringify(body)); }
5899
- catch (e) {
5900
- console.error(`[kb-sweep] failed to write body-file ${bodyFile}: ${e.message}`);
5901
- bodyFile = null;
5902
- }
5903
- }
5904
-
5905
- const { spawn: cpSpawn } = require('child_process');
5906
- // Open log fd in append mode so spawn can pipe stdio there. Child inherits
5907
- // the fd; parent closes its copy after spawn returns successfully.
5908
- let logFdNum = null;
5909
- let stdio = ['ignore', 'ignore', 'ignore'];
5910
- try {
5911
- logFdNum = fs.openSync(KB_SWEEP_LOG_PATH, 'a');
5912
- stdio = ['ignore', logFdNum, logFdNum];
5913
- } catch (e) {
5914
- console.error(`[kb-sweep] failed to open log ${KB_SWEEP_LOG_PATH}: ${e.message}`);
5915
- }
5916
-
5917
- const spawnArgs = ['--sweep-token', sweepToken];
5918
- if (bodyFile) spawnArgs.push('--body-file', bodyFile);
5919
-
5920
- let proc;
5921
- try {
5922
- proc = cpSpawn(process.execPath, [KB_SWEEP_RUNNER_PATH, ...spawnArgs], {
5923
- cwd: MINIONS_DIR, stdio, detached: true, windowsHide: true,
5924
- env: { ...process.env },
5925
- });
5926
- } catch (e) {
5927
- if (logFdNum != null) try { fs.closeSync(logFdNum); } catch { /* ignore */ }
5928
- if (bodyFile) try { fs.unlinkSync(bodyFile); } catch { /* ignore */ }
5929
- // Release the "starting" claim on synchronous spawn failure so the user
5930
- // can retry immediately.
5931
- try { shared.safeUnlink(KB_SWEEP_STATE_PATH); } catch { /* ignore */ }
5932
- return jsonReply(res, 500, { error: `spawn failed: ${e.message}` });
5884
+ const result = spawnSweepRunnerDetached({
5885
+ pinnedKeys: Array.isArray(body?.pinnedKeys) ? body.pinnedKeys : undefined,
5886
+ dryRun: body?.dryRun,
5887
+ });
5888
+ if (!result.ok) {
5889
+ return jsonReply(res, 500, { error: result.error || 'spawn failed' });
5933
5890
  }
5934
- if (logFdNum != null) try { fs.closeSync(logFdNum); } catch { /* ignore */ }
5935
-
5936
- // Conditional CAS: only update the state file from "starting" → "in-flight"
5937
- // if our sweepToken still owns it. If the (fast) runner already wrote
5938
- // "completed"/"failed" or its own "in-flight", leave that newer state alone.
5939
- try {
5940
- const current = safeJson(KB_SWEEP_STATE_PATH);
5941
- if (current && current.status === 'starting' && current.sweepToken === sweepToken) {
5942
- safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
5943
- status: 'in-flight', startedAt, startedAtIso: new Date().toISOString(),
5944
- sweepToken, pid: proc.pid,
5945
- }));
5946
- }
5947
- } catch { /* best-effort */ }
5948
-
5949
- proc.unref();
5950
- return jsonReply(res, 202, { ok: true, started: true, sweepToken });
5891
+ return jsonReply(res, 202, { ok: true, started: true, sweepToken: result.sweepToken });
5951
5892
  }
5952
5893
 
5953
5894
 
@@ -8613,6 +8554,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8613
8554
  versionCheckInterval: [60000],
8614
8555
  prPollStatusEvery: [1], prPollCommentsEvery: [1],
8615
8556
  agentBusyReassignMs: [0],
8557
+ maxRetriesPerAgent: [1, 20],
8616
8558
  };
8617
8559
  for (const [key, [min, max]] of Object.entries(numericFields)) {
8618
8560
  if (e[key] !== undefined) {
@@ -1,6 +1,6 @@
1
1
  # Auto-Discovery & Execution Pipeline
2
2
 
3
- > Last verified: 2026-05-22 against `engine.js` `tickInner()` (lines 6068-6425).
3
+ > Last verified: 2026-05-25 against `engine.js` `tickInner()` (lines 6293-6947) and `routing.md`.
4
4
 
5
5
  How the minions engine finds work and dispatches agents automatically.
6
6
 
@@ -199,6 +199,8 @@ routing.md table (see the file for the authoritative list):
199
199
  decompose → ripley (fallback: rebecca)
200
200
  meeting → ripley (fallback: lambert)
201
201
  docs → lambert (fallback: _any_)
202
+ setup → dallas (fallback: _any_)
203
+ qa-validate → dallas (fallback: ralph)
202
204
  ```
203
205
 
204
206
  Resolution order:
@@ -0,0 +1,71 @@
1
+ # QA runbook lifecycle (W-mpeiwz6k0005bf34)
2
+
3
+ Validation runbooks dispatched against live managed instances. Mirrors the
4
+ managed-spawn lifecycle (declare → engine spawns → healthcheck → observable)
5
+ but optimized for human/agent-driven smoke + E2E flows. Surfaced on the
6
+ `/qa` dashboard page (`dashboard/pages/qa.html`, `dashboard/js/qa.js`); full
7
+ live-process inventory remains on `/engine` (do NOT mirror it; see
8
+ W-mpdad3mq000m53bb).
9
+
10
+ ## Runbook location
11
+
12
+ `qa-runbooks.json` (engine state, JSON list keyed by `id`). Each entry:
13
+ `{ id, name, target, steps, expectedArtifacts, createdAt, createdBy }`.
14
+ CRUD via `GET/POST /api/qa/runbooks` (POST returns the new runbook with
15
+ engine-assigned `id`). `target` is a name from `/api/managed-processes` or
16
+ `/api/keep-processes` (deduped by `<project>::<name>`, managed wins).
17
+
18
+ ## Run-record path
19
+
20
+ `qa-runs.json` (newest-first, capped). Each run:
21
+ `{ id, runbookId, runbookName, target, status, startedAt, completedAt, workItemId, agentId, artifacts }`.
22
+ `status` ∈ `pending|dispatched|running|passed|failed|error`. A run is created by
23
+ `POST /api/qa/runbooks/run` (`{ id }`), which dispatches a `qa-validate`
24
+ work item and seeds the run with `dispatched`. Runs are read via
25
+ `GET /api/qa/runs?limit=50` — UI polls every 5s while the QA page is active
26
+ and clears the interval on page navigation via the `switchPage` wrapper in
27
+ `dashboard/js/qa.js` (matches `_stopPlanPoll`/`_stopMeetingPoll` pattern in
28
+ `dashboard/js/state.js`).
29
+
30
+ ## Artifact contract
31
+
32
+ `engine/qa-artifacts/<runId>/<file>`, served via
33
+ `GET /api/qa/artifacts/<runId>/<file>`. Files are agent-uploaded screenshots,
34
+ video recordings, and log captures listed in the run record's
35
+ `artifacts: [{ file, kind }]`. UI auto-detects:
36
+ `screenshot`/`png|jpg|jpeg|gif|webp|svg` → `<img>`;
37
+ `video`/`mp4|webm|ogg|mov` → `<video controls>`; everything else → log
38
+ preview (first 40 lines fetched lazily) with a `View full` link to the same
39
+ endpoint. **No direct filesystem paths are exposed** — every artifact URL
40
+ goes through `/api/qa/artifacts/` so path traversal is server-gated.
41
+ Optional config: `engine.qaArtifactsMaxBytes` caps per-file upload size;
42
+ when set, dashboard Settings exposes a matching toggle (CLAUDE.md
43
+ best-practice #15).
44
+
45
+ ## Agent sidecar shape
46
+
47
+ The `qa-validate` agent writes `agents/<id>/qa-run.json` before exit:
48
+
49
+ ```json
50
+ { "runId": "qa-run-<id>",
51
+ "status": "passed|failed|error",
52
+ "summary": "...",
53
+ "artifacts": [ { "file": "dashboard.png", "kind": "screenshot" },
54
+ { "file": "test.log", "kind": "log" } ],
55
+ "written_by": "<agentId>", "wi_id": "<workItemId>" }
56
+ ```
57
+
58
+ The engine reads the sidecar in `onAgentClose`, copies the listed files into
59
+ `engine/qa-artifacts/<runId>/`, and stamps the run record with `status`,
60
+ `completedAt`, and the recorded `artifacts`. Sidecar validation failure is
61
+ non-retryable (`failure_class: invalid-qa-run`); listed files outside the
62
+ agent worktree or larger than `qaArtifactsMaxBytes` are rejected without
63
+ stamping the run.
64
+
65
+ ## Entry point
66
+
67
+ `playbooks/qa-validate.md`. Dispatched by `POST /api/qa/runbooks/run`;
68
+ receives `target`, `steps`, `expectedArtifacts` as template vars; required
69
+ to write the sidecar above before exit. Routing line in `routing.md` maps
70
+ the synthetic `qa-validate` task-type to the playbook so manual dispatches
71
+ work too.