@yemi33/minions 0.1.2053 → 0.1.2055

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1064,7 +1064,18 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1064
1064
  activeTab._429retries = 0;
1065
1065
  var errText = await res.text();
1066
1066
  if (isReconnect && res.status === 409) return { interrupted: true, reconnectable: false, reason: errText || 'No live stream' };
1067
- throw new Error(errText || 'CC error');
1067
+ // W-mpmwxni2000c25c7-d — try to parse the canonical error envelope from
1068
+ // the non-2xx body. Backend wraps every pre-stream error in
1069
+ // _buildCcErrorEnvelope, so when the JSON parses we surface
1070
+ // envelope.message / envelope.code; otherwise fall back to raw text.
1071
+ var ccEnvelope = null;
1072
+ try {
1073
+ var parsed = JSON.parse(errText);
1074
+ if (parsed && parsed.type === 'error' && typeof parsed.message === 'string') ccEnvelope = parsed;
1075
+ } catch (_e) { /* not JSON — keep raw text */ }
1076
+ var thrown = new Error((ccEnvelope && ccEnvelope.message) || errText || 'CC error');
1077
+ if (ccEnvelope) thrown._ccErrorEnvelope = ccEnvelope;
1078
+ throw thrown;
1068
1079
  }
1069
1080
 
1070
1081
  activeTab._429retries = 0;
@@ -1075,6 +1086,11 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1075
1086
  var decoder = new TextDecoder();
1076
1087
  var buf = '';
1077
1088
  var terminalEventSeen = false;
1089
+ // W-mpmwxni2000c25c7-d — SSE spec: `event:` lines name the event type for
1090
+ // the data lines that follow, reset on a blank line. Tracked so backend
1091
+ // `event: error` frames are recognized even by clients that don't read
1092
+ // `data.type` (and so the integration test can assert wire format).
1093
+ var pendingEventName = '';
1078
1094
 
1079
1095
  async function _handleEvent(evt) {
1080
1096
  if (evt.type === 'chunk') {
@@ -1148,7 +1164,23 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1148
1164
  } else if (evt.type === 'error') {
1149
1165
  terminalEventSeen = true;
1150
1166
  _cleanupStreamDiv();
1151
- addMsg('assistant', '<span style="color:var(--red)">' + escHtml(evt.error) + '</span>');
1167
+ // W-mpmwxni2000c25c7-d — render the typed error envelope as an
1168
+ // accessible red bubble (role=alert) with a Retry button. We honor
1169
+ // `evt.message` (canonical envelope) and fall back to `evt.error` for
1170
+ // any pre-envelope frames still in flight from older backends.
1171
+ var ccErrMsg = (typeof evt.message === 'string' && evt.message) ? evt.message
1172
+ : (typeof evt.error === 'string' && evt.error) ? evt.error
1173
+ : 'Command Center reported an unknown error.';
1174
+ var ccErrCode = typeof evt.code === 'string' ? evt.code : '';
1175
+ var ccRetry = _ccStoreRetryRequest(activeTab, activeTabId, message);
1176
+ var codeChip = ccErrCode
1177
+ ? '<span style="display:inline-block;margin-left:6px;padding:1px 6px;font-size:9px;color:var(--muted);background:var(--surface2);border:1px solid var(--border);border-radius:3px;font-family:monospace">' + escHtml(ccErrCode) + '</span>'
1178
+ : '';
1179
+ var availList = Array.isArray(evt.availableModels) && evt.availableModels.length
1180
+ ? '<div style="font-size:10px;color:var(--muted);margin-top:6px">Available models: ' + escHtml(evt.availableModels.slice(0, 8).join(', ')) + (evt.availableModels.length > 8 ? '…' : '') + '</div>'
1181
+ : '';
1182
+ var errorBubble = '<div class="cc-error" role="alert" aria-live="assertive" style="padding:8px 12px;background:rgba(220,80,80,0.08);border-left:3px solid var(--red);border-radius:4px;color:var(--red);font-size:12px"><strong>Error</strong>' + codeChip + '<div style="margin-top:4px;color:var(--text)">' + escHtml(ccErrMsg) + '</div>' + availList + '</div>';
1183
+ addMsg('assistant', errorBubble + _ccRetryControls(ccRetry, '', false), false, { retryId: ccRetry.id });
1152
1184
  }
1153
1185
  }
1154
1186
 
@@ -1160,6 +1192,13 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1160
1192
  buf = lines.pop();
1161
1193
  for (var li = 0; li < lines.length; li++) {
1162
1194
  var line = lines[li];
1195
+ // W-mpmwxni2000c25c7-d — track SSE `event:` lines per spec. The
1196
+ // event-name applies to the next data line and resets on a blank
1197
+ // line. The backend emits `event: error\ndata: {...}` for errors;
1198
+ // listeners that prefer event-typed dispatch see them as named
1199
+ // events rather than having to sniff `data.type`.
1200
+ if (line === '') { pendingEventName = ''; continue; }
1201
+ if (line.startsWith('event: ')) { pendingEventName = line.slice(7).trim(); continue; }
1163
1202
  if (!line.startsWith('data: ')) continue;
1164
1203
  // W-mpdavudb000v8446 — these used to swallow ALL errors via `catch {}`,
1165
1204
  // hiding JSON.parse failures AND any DOM/render exception thrown by
@@ -1178,6 +1217,9 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1178
1217
  try { console.error('[cc-sse] parse-failed', { tab: activeTabId, len: rawJson.length, error: String(parseErr && parseErr.message || parseErr) }); } catch (_e) {}
1179
1218
  continue;
1180
1219
  }
1220
+ // If the server named the event but the payload didn't carry a `type`,
1221
+ // backfill from the event line so `_handleEvent` dispatch still works.
1222
+ if (pendingEventName && evt && typeof evt === 'object' && !evt.type) evt.type = pendingEventName;
1181
1223
  try { await _handleEvent(evt); }
1182
1224
  catch (handleErr) {
1183
1225
  try { console.error('[cc-sse] handle-failed', { tab: activeTabId, type: evt && evt.type, error: String(handleErr && handleErr.message || handleErr), stack: handleErr && handleErr.stack }); } catch (_e) {}
@@ -1188,6 +1230,8 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1188
1230
  var remainingLines = buf.split('\n');
1189
1231
  for (var ri = 0; ri < remainingLines.length; ri++) {
1190
1232
  var rline = remainingLines[ri];
1233
+ if (rline === '') { pendingEventName = ''; continue; }
1234
+ if (rline.startsWith('event: ')) { pendingEventName = rline.slice(7).trim(); continue; }
1191
1235
  if (!rline.startsWith('data: ')) continue;
1192
1236
  var trailRaw = rline.slice(6);
1193
1237
  var trailEvt;
@@ -1196,6 +1240,7 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1196
1240
  try { console.error('[cc-sse] parse-failed-trailing', { tab: activeTabId, len: trailRaw.length, error: String(parseErr && parseErr.message || parseErr) }); } catch (_e) {}
1197
1241
  continue;
1198
1242
  }
1243
+ if (pendingEventName && trailEvt && typeof trailEvt === 'object' && !trailEvt.type) trailEvt.type = pendingEventName;
1199
1244
  try { await _handleEvent(trailEvt); }
1200
1245
  catch (handleErr) {
1201
1246
  try { console.error('[cc-sse] handle-failed-trailing', { tab: activeTabId, type: trailEvt && trailEvt.type, error: String(handleErr && handleErr.message || handleErr), stack: handleErr && handleErr.stack }); } catch (_e) {}
@@ -1265,8 +1310,26 @@ async function _ccDoSend(message, skipUserMsg, forceTabId, intentMetadata) {
1265
1310
  : '<div style="font-size:10px;color:var(--muted);margin-top:4px">Dashboard connection lost. Restart Minions to reconnect.</div>';
1266
1311
  }
1267
1312
  var errorRetry = _ccStoreRetryRequest(activeTab, activeTabId, message);
1313
+ // W-mpmwxni2000c25c7-d — if the thrower attached a parsed CC error
1314
+ // envelope (non-2xx body with `{type:'error', message, code}` shape),
1315
+ // render the styled bubble + code chip + available-models hint to
1316
+ // match the SSE error path. Bare network errors keep the legacy red
1317
+ // "Error: <msg>" span so connection-loss UX is unchanged.
1318
+ var ccEnv = e && e._ccErrorEnvelope;
1319
+ var errorRendered;
1320
+ if (ccEnv) {
1321
+ var ccCodeChip = ccEnv.code
1322
+ ? '<span style="display:inline-block;margin-left:6px;padding:1px 6px;font-size:9px;color:var(--muted);background:var(--surface2);border:1px solid var(--border);border-radius:3px;font-family:monospace">' + escHtml(ccEnv.code) + '</span>'
1323
+ : '';
1324
+ var ccAvail = Array.isArray(ccEnv.availableModels) && ccEnv.availableModels.length
1325
+ ? '<div style="font-size:10px;color:var(--muted);margin-top:6px">Available models: ' + escHtml(ccEnv.availableModels.slice(0, 8).join(', ')) + (ccEnv.availableModels.length > 8 ? '…' : '') + '</div>'
1326
+ : '';
1327
+ errorRendered = '<div class="cc-error" role="alert" aria-live="assertive" style="padding:8px 12px;background:rgba(220,80,80,0.08);border-left:3px solid var(--red);border-radius:4px;color:var(--red);font-size:12px"><strong>Error</strong>' + ccCodeChip + '<div style="margin-top:4px;color:var(--text)">' + escHtml(ccEnv.message) + '</div>' + ccAvail + '</div>';
1328
+ } else {
1329
+ errorRendered = '<span style="color:var(--red)">Error: ' + escHtml(e.message) + '</span>';
1330
+ }
1268
1331
  addMsg('assistant', (streamedText ? renderMd(streamedText) + _ccElapsedFooter('Stream interrupted after {seconds}s') : '') +
1269
- '<span style="color:var(--red)">Error: ' + escHtml(e.message) + '</span>' +
1332
+ errorRendered +
1270
1333
  _ccRetryControls(errorRetry, connectionHint, isNetworkError && (!dashboardHealth.reachable || dashboardHealth.restarted)), false, { retryId: errorRetry.id });
1271
1334
  }
1272
1335
  } finally {
@@ -130,6 +130,11 @@ async function openSettings() {
130
130
  '<div style="font-size:9px;color:var(--muted);margin-top:1px">CC reasoning depth</div>' +
131
131
  '</div>' +
132
132
  '</div>' +
133
+ // W-mpmwxni2000c25c7-d — per-turn watchdog. Surfaced under CC overrides
134
+ // because it gates CC/doc-chat error visibility (not the agent fleet).
135
+ '<div style="display:grid;grid-template-columns:1fr;gap:8px;margin-top:8px">' +
136
+ settingsField('CC Turn Timeout', 'set-ccTurnTimeoutMs', e.ccTurnTimeoutMs || 300000, 'ms', 'Per-turn watchdog for CC + doc-chat. If no terminal SSE event arrives within this window the handler emits event: error with code: cc-turn-timeout, the spinner stops, and a Retry button is shown. Clamped to 10000–3600000 ms.') +
137
+ '</div>' +
133
138
  '</details>' +
134
139
  '</div>' +
135
140
  '<h4>Agents</h4>' +
@@ -833,6 +838,7 @@ async function saveSettings() {
833
838
  ccCli: (document.getElementById('set-ccCli')?.value ?? '').trim(),
834
839
  ccModel: (document.getElementById('set-ccModel')?.value ?? '').trim(),
835
840
  ccEffort: document.getElementById('set-ccEffort').value || null,
841
+ ccTurnTimeoutMs: document.getElementById('set-ccTurnTimeoutMs')?.value,
836
842
  claudeBareMode: !!document.getElementById('set-claudeBareMode')?.checked,
837
843
  claudeFallbackModel: (document.getElementById('set-claudeFallbackModel')?.value ?? '').trim(),
838
844
  copilotFallbackModel: (document.getElementById('set-copilotFallbackModel')?.value ?? '').trim(),
@@ -715,8 +715,13 @@
715
715
  vertical nav + per-tab pane. Search input filters control rows across all tabs
716
716
  by data-search attribute. .modal.modal-wide is added by openSettings() so the
717
717
  rail + content fit comfortably side-by-side. */
718
- .modal-body.settings-body { padding: 0; white-space: normal; font-size: var(--text-md); line-height: 1.45; color: var(--text); font-family: 'Segoe UI', system-ui, sans-serif; }
719
- .settings-layout { display: flex; min-height: 480px; max-height: calc(80vh - 64px); }
718
+ /* Lock the settings body so the dialog dimensions stay constant across tabs.
719
+ `overflow: hidden` on the body suppresses the inherited `.modal-body`
720
+ scroll-y; only `.settings-content` should ever show a scrollbar. The
721
+ layout is pinned to a single fixed height (was min/max range) so empty
722
+ tabs don't shrink the dialog and full tabs don't stretch it. */
723
+ .modal-body.settings-body { padding: 0; white-space: normal; font-size: var(--text-md); line-height: 1.45; color: var(--text); font-family: 'Segoe UI', system-ui, sans-serif; overflow: hidden; }
724
+ .settings-layout { display: flex; height: calc(80vh - 64px); }
720
725
  .settings-rail { width: 220px; min-width: 220px; background: var(--surface2); border-right: 1px solid var(--border); display: flex; flex-direction: column; overflow: hidden; }
721
726
  .settings-search-wrap { padding: var(--space-5) var(--space-5) var(--space-4); border-bottom: 1px solid var(--border); }
722
727
  .settings-search { width: 100%; padding: var(--space-3) var(--space-4); background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius-sm); color: var(--text); font-size: var(--text-md); font-family: inherit; }
@@ -959,3 +964,14 @@
959
964
  max-height: 160px; overflow: auto; white-space: pre;
960
965
  }
961
966
  .qa-artifact-log { max-width: 480px; }
967
+
968
+ /* W-mpmwxni2000c25c7-d - Command Center / doc-chat typed error bubble. */
969
+ /* Token-only styling so dark/light themes stay consistent; the inline */
970
+ /* styles emitted by command-center.js use the same vars and are kept */
971
+ /* for backward compat with existing render paths. */
972
+ .cc-error { padding: 8px 12px; background: rgba(248, 81, 73, 0.08);
973
+ border-left: 3px solid var(--red); border-radius: var(--radius-sm);
974
+ color: var(--red); font-size: var(--text-sm); }
975
+ .cc-error strong { color: var(--red); }
976
+ .cc-error code { font-family: monospace; font-size: var(--text-xs);
977
+ color: var(--muted); }
package/dashboard.js CHANGED
@@ -2400,6 +2400,37 @@ const DOC_CHAT_TIMEOUT_MS = 60 * 60 * 1000;
2400
2400
  // reconnect-replay protocol (dashboard.js:7048-7083).
2401
2401
  const SSE_MAX_QUEUE_BYTES = 4 * 1024 * 1024; // 4 MB per-tab — conservative, tunable
2402
2402
  const SSE_STUCK_KILL_MS = 30 * 1000; // 30s of continuous backpressure → res.destroy()
2403
+
2404
+ // W-mpmwxni2000c25c7-d — CC + doc-chat error envelope contract.
2405
+ // Canonical shape: `{ type: 'error', message, code, retryable, ...extra }`.
2406
+ // `code` is one of: 'model-unavailable', 'auth-failure', 'context-limit',
2407
+ // 'budget-exceeded', 'crash', 'cc-turn-timeout', 'worker-spawn-failed',
2408
+ // 'acp-handshake-failed', 'worker-died'. `retryable` tells the client whether
2409
+ // the same input has a chance of succeeding without operator intervention
2410
+ // (e.g. transient overload or a turn timeout can be retried; auth/budget failures can't). Extra fields
2411
+ // (`availableModels`, `runtime`) are envelope-shape-stable so the client can
2412
+ // surface them without sniffing types.
2413
+ const CC_ERROR_CODES = Object.freeze([
2414
+ 'model-unavailable',
2415
+ 'auth-failure',
2416
+ 'context-limit',
2417
+ 'budget-exceeded',
2418
+ 'crash',
2419
+ 'cc-turn-timeout',
2420
+ 'worker-spawn-failed',
2421
+ 'acp-handshake-failed',
2422
+ 'worker-died',
2423
+ ]);
2424
+ function _buildCcErrorEnvelope({ message, code, retryable, ...extra } = {}) {
2425
+ const normalizedCode = CC_ERROR_CODES.includes(code) ? code : 'crash';
2426
+ return {
2427
+ type: 'error',
2428
+ message: String(message == null ? '' : message) || 'Unknown error',
2429
+ code: normalizedCode,
2430
+ retryable: !!retryable,
2431
+ ...extra,
2432
+ };
2433
+ }
2403
2434
  function _releaseCCTab(tabId) { ccInFlightTabs.delete(tabId); ccInFlightAborts.delete(tabId); }
2404
2435
  function _getCcLiveStream(tabId) {
2405
2436
  return ccLiveStreams.get(tabId) || null;
@@ -6981,7 +7012,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6981
7012
  // heartbeat force-close pattern from the writeCcEvent closure
6982
7013
  // (dashboard.js, search for SSE_MAX_QUEUE_BYTES).
6983
7014
  try {
6984
- res.write('data: ' + JSON.stringify(payload) + '\n\n');
7015
+ const type = payload && payload.type;
7016
+ // W-mpmwxni2000c25c7-d — mirror the writeCcEvent change so doc-chat
7017
+ // also emits `event: error` for terminal errors. Same back-compat:
7018
+ // the JSON still carries `type: 'error'` for data-line parsers.
7019
+ const eventLine = (type === 'error') ? 'event: error\n' : '';
7020
+ res.write(eventLine + 'data: ' + JSON.stringify(payload) + '\n\n');
6985
7021
  return true;
6986
7022
  } catch {
6987
7023
  return false;
@@ -7117,9 +7153,11 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7117
7153
  if (!res.headersSent) {
7118
7154
  res.statusCode = e.statusCode || 500;
7119
7155
  res.setHeader('Content-Type', 'application/json');
7120
- try { res.end(JSON.stringify({ error: e.message })); } catch {}
7156
+ // W-mpmwxni2000c25c7-d — non-SSE error path mirrors the envelope shape
7157
+ // so the frontend's non-2xx branch can render the same red bubble.
7158
+ try { res.end(JSON.stringify(_buildCcErrorEnvelope({ message: e.message, code: e.code || 'crash', retryable: false }))); } catch {}
7121
7159
  } else {
7122
- writeDocEvent({ type: 'error', error: e.message });
7160
+ writeDocEvent(_buildCcErrorEnvelope({ message: e.message, code: e.code || 'crash', retryable: false }));
7123
7161
  _docStreamEnded = true;
7124
7162
  try { res.end(); } catch {}
7125
7163
  }
@@ -8031,7 +8069,16 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8031
8069
  return false;
8032
8070
  }
8033
8071
  let wire;
8034
- try { wire = 'data: ' + JSON.stringify(payload) + '\n\n'; }
8072
+ try {
8073
+ // W-mpmwxni2000c25c7-d — terminal error frames go out as `event: error`
8074
+ // so SSE consumers using addEventListener('error', …) and tests
8075
+ // matching the raw wire format can target them directly. The JSON
8076
+ // payload still carries `type: 'error'` so the existing
8077
+ // data-line parser (and any client code that only reads `data:`
8078
+ // lines) keeps working.
8079
+ const eventLine = (type === 'error') ? 'event: error\n' : '';
8080
+ wire = eventLine + 'data: ' + JSON.stringify(payload) + '\n\n';
8081
+ }
8035
8082
  catch (err) {
8036
8083
  _logFail('json-serialize-failed', { error: String((err && err.message) || err).slice(0, 200) });
8037
8084
  return false;
@@ -8312,13 +8359,48 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8312
8359
  const streamModel = CONFIG.engine?.ccModel || shared.ENGINE_DEFAULTS.ccModel;
8313
8360
  const streamEffort = CONFIG.engine?.ccEffort || shared.ENGINE_DEFAULTS.ccEffort;
8314
8361
  const ccMaxTurns = CONFIG.engine?.ccMaxTurns || shared.ENGINE_DEFAULTS.ccMaxTurns;
8362
+
8363
+ // W-mpmwxni2000c25c7-d — preflight model check inside the streaming
8364
+ // path. ccCall() runs this guard for the non-stream surface; the SSE
8365
+ // handler historically skipped it because the legacy "errorClass:
8366
+ // unknown-model" envelope wasn't surfaced through writeCcEvent. Now
8367
+ // that we have a typed error envelope, fail fast with
8368
+ // `code: 'model-unavailable'` and the runtime-discovered catalog in
8369
+ // `availableModels` / `message` — saves the user one round-trip into
8370
+ // the CLI process that we already know cannot run the requested model.
8371
+ const preflightFailure = await _preflightModelCheck({
8372
+ model: streamModel, engineConfig: CONFIG.engine,
8373
+ });
8374
+ if (preflightFailure) {
8375
+ const known = (preflightFailure.errorMessage || '').match(/known:\s*([^)]+)/);
8376
+ const availableModels = known ? known[1].replace(/[…\u2026]\s*$/, '').split(',').map(s => s.trim()).filter(Boolean) : [];
8377
+ const envelope = _buildCcErrorEnvelope({
8378
+ message: preflightFailure.errorMessage,
8379
+ code: 'model-unavailable',
8380
+ retryable: false,
8381
+ runtime: preflightFailure.runtime || null,
8382
+ availableModels,
8383
+ });
8384
+ writeCcEvent(envelope);
8385
+ liveState.donePayload = envelope;
8386
+ _ccStreamEnded = true;
8387
+ if (liveState.endResponse) liveState.endResponse();
8388
+ _scheduleCcLiveCleanup(tabId);
8389
+ _logCcStreamEnd(_ccTelemetry, 'error-preflight-model-unavailable', { runtime: preflightFailure.runtime });
8390
+ return;
8391
+ }
8392
+
8315
8393
  let toolUses = [];
8316
- // W-mpmwxni2000c25c7-b — turn-level watchdog. Wraps the initial
8394
+ // W-mpmwxni2000c25c7-b/-d — turn-level watchdog. Wraps the initial
8317
8395
  // _invokeCcStream PLUS the post-resume-fail retry so the wall clock
8318
8396
  // covers the entire CC turn (not just one underlying LLM call). On
8319
8397
  // expiry, whichever call is in flight is aborted; the watchdog
8320
8398
  // resolves with a synthetic `{ error: { code: 'cc-turn-timeout' } }`
8321
- // envelope so the SSE error path below kicks in.
8399
+ // envelope so the SSE error path below kicks in. The frontend
8400
+ // (dashboard/js/command-center.js) recognizes `cc-turn-timeout` as a
8401
+ // retryable typed error and offers a Retry affordance instead of
8402
+ // hanging the spinner. The per-turn cap is configurable via
8403
+ // `engine.ccTurnTimeoutMs` (Settings UI; clamped 10s..1h).
8322
8404
  const turnTimeoutMs = _resolveCcTurnTimeoutMs();
8323
8405
  const result = await withTimeout({
8324
8406
  timeoutMs: turnTimeoutMs, label: 'command-center-stream',
@@ -8465,10 +8547,13 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8465
8547
  if (!res.headersSent) {
8466
8548
  res.statusCode = e.statusCode || 500;
8467
8549
  res.setHeader('Content-Type', 'application/json');
8468
- try { res.end(JSON.stringify({ error: e.message })); } catch {}
8550
+ // W-mpmwxni2000c25c7-d — non-2xx response carries the same envelope
8551
+ // so the frontend's `if (!res.ok)` branch can render the red error
8552
+ // bubble with the same code/message/retryable surface.
8553
+ try { res.end(JSON.stringify(_buildCcErrorEnvelope({ message: e.message, code: e.code || 'crash', retryable: false }))); } catch {}
8469
8554
  _logCcStreamEnd(_ccTelemetry, 'error-pre-stream', { error: (e && e.message ? e.message.slice(0, CC_LOG_ERROR_MAX_LEN) : 'unknown') });
8470
8555
  } else {
8471
- writeCcEvent({ type: 'error', error: e.message });
8556
+ writeCcEvent(_buildCcErrorEnvelope({ message: e.message, code: e.code || 'crash', retryable: false }));
8472
8557
  _ccStreamEnded = true; try { res.end(); } catch {}
8473
8558
  _logCcStreamEnd(_ccTelemetry, 'error-mid-stream', { error: (e && e.message ? e.message.slice(0, CC_LOG_ERROR_MAX_LEN) : 'unknown') });
8474
8559
  }
@@ -8858,6 +8943,11 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8858
8943
  prPollStatusEvery: [1], prPollCommentsEvery: [1],
8859
8944
  agentBusyReassignMs: [0],
8860
8945
  maxRetriesPerAgent: [1, 20],
8946
+ // W-mpmwxni2000c25c7-d — per-turn CC/doc-chat watchdog. Min 10s
8947
+ // (anything shorter would fire on legitimate first-token latency for
8948
+ // larger models); max 1h (matches CC_CALL_TIMEOUT_MS so the watchdog
8949
+ // never outlives the outer abort).
8950
+ ccTurnTimeoutMs: [10000, 3600000],
8861
8951
  };
8862
8952
  for (const [key, [min, max]] of Object.entries(numericFields)) {
8863
8953
  if (e[key] !== undefined) {
@@ -2067,6 +2067,90 @@ function updatePrAfterFix(pr, project, source, options = {}, legacyDispatchId =
2067
2067
  delete next.fixedAt;
2068
2068
  target.minionsReview = next;
2069
2069
  };
2070
+ // W-mpoeirqx0007712a — Build-fix push verification. The agent may report
2071
+ // SUCCESS while the git push silently failed to advance the remote head
2072
+ // (stale-worktree push rejected non-fast-forward, agent ignores non-zero
2073
+ // `git push` exit, etc). detectPrFixBranchChange falls back to
2074
+ // local-head / worktree-diff evidence in those scenarios and returns
2075
+ // `changed: true` even though origin/<branch> never moved. Without a
2076
+ // guard here, the optimistic stamp + 10-min buildFixGracePeriod
2077
+ // suppresses re-dispatch against a still-failing build that was never
2078
+ // actually fixed (live repro: opg-microsoft/minions PR #57).
2079
+ //
2080
+ // Only `evidence: 'remote-head'` proves the push landed. For
2081
+ // BUILD_FAILURE with changed=true AND evidence explicitly set to one of
2082
+ // the unverified types, increment `_buildFixPushFailedCount`, write an
2083
+ // inbox alert, route through recordPrNoOpFixAttempt so the cause stays
2084
+ // unhandled, and never write `_buildFixPushedAt`. When the counter
2085
+ // reaches `engine.maxBuildFixRetries`, flip `_buildFixNeedsHumanRebase`
2086
+ // so the engine stops retrying.
2087
+ //
2088
+ // Note: callers that omit `branchChange.evidence` (legacy / tests
2089
+ // predating evidence plumbing) still hit the trusted-push path below to
2090
+ // preserve backward compatibility — only the explicitly unverified
2091
+ // evidence kinds trigger this guard.
2092
+ const _unverifiedPushEvidence = new Set(['local-head', 'worktree-diff']);
2093
+ if (cause === shared.PR_FIX_CAUSE.BUILD_FAILURE
2094
+ && explicitlyChangedBranch
2095
+ && options.branchChange?.changed === true
2096
+ && _unverifiedPushEvidence.has(options.branchChange?.evidence)) {
2097
+ const maxRetries = options.config?.engine?.maxBuildFixRetries
2098
+ ?? ENGINE_DEFAULTS.maxBuildFixRetries;
2099
+ target._buildFixPushFailedCount = (Number(target._buildFixPushFailedCount) || 0) + 1;
2100
+ const reachedCap = target._buildFixPushFailedCount >= maxRetries;
2101
+ if (reachedCap) {
2102
+ target._buildFixNeedsHumanRebase = ts();
2103
+ }
2104
+ const beforeHeadStr = String(options.branchChange?.beforeHead || '').slice(0, 40);
2105
+ const afterHeadStr = String(options.branchChange?.afterHead || '').slice(0, 40);
2106
+ const evidenceStr = String(options.branchChange?.evidence || 'unknown');
2107
+ try {
2108
+ const wiId = options.dispatchItem?.meta?.item?.id || null;
2109
+ const noteBody = `# Build-fix push not verified for ${pr.id}\n\n`
2110
+ + `**PR:** ${pr.url || pr.id}\n`
2111
+ + `**Branch:** ${pr.branch || '(unknown)'}\n`
2112
+ + `**Cause:** build-failure\n`
2113
+ + `**Pre-dispatch head:** ${beforeHeadStr || '(unknown)'}\n`
2114
+ + `**Post-completion head (live):** ${afterHeadStr || '(unknown)'}\n`
2115
+ + `**Branch-change evidence:** ${evidenceStr}\n`
2116
+ + `**Attempt:** ${target._buildFixPushFailedCount}/${maxRetries}\n\n`
2117
+ + (reachedCap
2118
+ ? `⚠️ **Reached \`engine.maxBuildFixRetries\` (${maxRetries}).** PR flagged \`_buildFixNeedsHumanRebase\` — engine will stop auto-retrying. Likely root cause: worktree stale vs origin/master, push rejected non-fast-forward, or branch protection blocks the engine identity.\n`
2119
+ : `_Engine will re-dispatch on the next \`discoverFromPrs\` pass (counter < cap)._\n`)
2120
+ + `\nThe agent reported SUCCESS but the remote head did not advance — the optimistic \`_buildFixPushedAt\` stamp was suppressed to avoid the ${(ENGINE_DEFAULTS.buildFixGracePeriod / 60000) | 0}-minute grace-period blackout.\n`;
2121
+ shared.writeToInbox(
2122
+ 'engine',
2123
+ `build-fix-push-unverified-${pr.prNumber || pr.id}`,
2124
+ noteBody,
2125
+ null,
2126
+ { wi: wiId, pr: pr.id, cause: shared.PR_FIX_CAUSE.BUILD_FAILURE }
2127
+ );
2128
+ } catch (err) {
2129
+ log('warn', `build-fix push-verify inbox alert for ${pr.id}: ${err.message}`);
2130
+ }
2131
+ // Route through the noop path so the cause stays unhandled, the noop
2132
+ // counter advances symmetrically with the genuine-noop case, and the
2133
+ // existing `delete target._buildFixPushedAt` cleanup (line ~2016) runs.
2134
+ const verifyBranchChange = {
2135
+ changed: false,
2136
+ beforeHead: options.branchChange?.beforeHead,
2137
+ afterHead: options.branchChange?.afterHead,
2138
+ evidence: 'push-unverified',
2139
+ };
2140
+ const noopReason = `build-fix push unverified (evidence: ${evidenceStr}); attempt ${target._buildFixPushFailedCount}/${maxRetries}${reachedCap ? ' — needs-human-rebase' : ''}`;
2141
+ const record = recordPrNoOpFixAttempt(target, cause, source, options.dispatchItem, verifyBranchChange, options.config, noopReason);
2142
+ result = {
2143
+ noOp: true,
2144
+ cause,
2145
+ paused: !!record.paused,
2146
+ count: record.count,
2147
+ pushUnverified: true,
2148
+ pushFailedCount: target._buildFixPushFailedCount,
2149
+ needsHumanRebase: reachedCap,
2150
+ };
2151
+ log('warn', `Updated ${pr.id} → build-fix push unverified (${target._buildFixPushFailedCount}/${maxRetries}, evidence=${evidenceStr})${reachedCap ? ' [needs-human-rebase]' : ''}; remote head ${beforeHeadStr.slice(0, 8)} did not advance — inbox alert written, cause left unhandled for re-dispatch`);
2152
+ return prs;
2153
+ }
2070
2154
  if (explicitlyChangedBranch && options.branchChange?.changed === false) {
2071
2155
  const record = recordPrNoOpFixAttempt(target, cause, source, options.dispatchItem, options.branchChange, options.config, options.noopReason);
2072
2156
  result = { noOp: true, cause, paused: !!record.paused, count: record.count };
@@ -2086,6 +2170,19 @@ function updatePrAfterFix(pr, project, source, options = {}, legacyDispatchId =
2086
2170
  return prs;
2087
2171
  }
2088
2172
  clearPrNoOpFixAttempt(target, cause);
2173
+ // W-mpoeirqx0007712a — verified-push stamping for BUILD_FAILURE. Reaching
2174
+ // this point with explicitlyChangedBranch=true means the unverified-push
2175
+ // guard above did NOT trigger, so either evidence === 'remote-head'
2176
+ // (live remote refs prove the branch advanced) OR no branchChange info
2177
+ // was supplied (legacy callers that didn't pass branchChange — keep
2178
+ // existing behavior of trusting the agent's branchChanged claim).
2179
+ // Clear the push-failure counter on confirmed success so future
2180
+ // regressions start fresh.
2181
+ if (cause === shared.PR_FIX_CAUSE.BUILD_FAILURE && explicitlyChangedBranch) {
2182
+ target._buildFixPushedAt = ts();
2183
+ delete target._buildFixPushFailedCount;
2184
+ delete target._buildFixNeedsHumanRebase;
2185
+ }
2089
2186
  if (source === 'pr-human-feedback') {
2090
2187
  const clearPendingFix = shouldClearHumanFeedbackPendingFix(target, pr, automationCauseKey);
2091
2188
  if (target.humanFeedback && clearPendingFix) target.humanFeedback.pendingFix = false;
package/engine/shared.js CHANGED
@@ -1800,7 +1800,15 @@ const ENGINE_DEFAULTS = {
1800
1800
  logBufferSize: 50, // flush immediately when buffer exceeds this many entries
1801
1801
  lockRetries: 0, // no retries — single 5s timeout window with 25ms polling (200 attempts) is sufficient; stale lock recovery at 60s handles crashes
1802
1802
  lockRetryBackoffMs: 500, // base backoff between lock retries (doubles each attempt: 500ms, 1s, 2s, ...)
1803
- buildFixGracePeriod: 600000, // 10min — wait for CI to run after build fix before re-dispatching
1803
+ buildFixGracePeriod: 600000, // 10min — wait for CI to run after a verified build-fix push before re-dispatching
1804
+ // W-mpoeirqx0007712a: cap re-dispatch attempts when build-fix pushes
1805
+ // silently fail to advance the remote head (stale-worktree push rejected,
1806
+ // agent ignores non-zero git push exit and reports SUCCESS, etc).
1807
+ // updatePrAfterFix increments `_buildFixPushFailedCount` whenever the
1808
+ // post-completion branchChange has non-remote-head evidence; when the
1809
+ // counter reaches this cap, the PR is flagged `_buildFixNeedsHumanRebase`
1810
+ // so the dispatcher stops auto-retrying and a human can rescue the branch.
1811
+ maxBuildFixRetries: 3,
1804
1812
  adoPollEnabled: true, // poll ADO PR status, comments, and reconciliation on each tick cycle
1805
1813
  ghPollEnabled: true, // poll GitHub PR status, comments, and reconciliation on each tick cycle
1806
1814
  prPollStatusEvery: 12, // poll PR build/review/merge status every N ticks for both ADO and GitHub (~12 min at default interval)
@@ -1879,7 +1887,7 @@ const ENGINE_DEFAULTS = {
1879
1887
  removeWorktreeFailureTtlMs: 24 * 60 * 60 * 1000, // stale failed paths are forgotten after a day
1880
1888
  removeWorktreeFailureMaxEntries: 1000, // bound failed-worktree retry suppression cache
1881
1889
  ccMaxTurns: 50, // max tool-use turns per CC/doc-chat call before CLI stops (per response, not per session)
1882
- ccTurnTimeoutMs: 300000, // W-mpmwxni2000c25c7-b: wall-clock cap per CC/doc-chat turn; on expiry the in-flight LLM call is aborted and the handler surfaces `{code:'cc-turn-timeout', retriable:true}` instead of hanging the UI
1890
+ ccTurnTimeoutMs: 300000, // W-mpmwxni2000c25c7-b/-d: 5min per-turn watchdog. Wall-clock cap per CC/doc-chat turn; on expiry the in-flight LLM call is aborted and the handler surfaces `{code:'cc-turn-timeout', retryable:true}` via the typed error envelope so the UI can stop the spinner and offer Retry. Clamped to [10000, 3600000] in the settings POST handler. Independent of CC_CALL_TIMEOUT_MS (the outer hour-long abort) — this is the visible-to-user no-progress cap.
1883
1891
  docSessionMaxEntries: 200, // cap doc-chat session map/disk store by least-recent activity (LRU; sessions are non-expiring otherwise)
1884
1892
  ccLiveStreamMaxAgeMs: 30 * 60 * 1000, // hard cap reconnect buffers if abort/cleanup stalls
1885
1893
  metricsFlushIntervalMs: 10000, // batch trackEngineUsage writes to metrics.json — flushed every 10s instead of per-call to cut lock contention and dashboard mtime churn
package/engine.js CHANGED
@@ -4929,15 +4929,16 @@ async function discoverFromPrs(config, project) {
4929
4929
  }, `Fix build failure on ${pr.id}: ${pr.title || ''}`, { dispatchKey: key, cooldownKey: key, automationCauseKey: buildCauseKey, source: 'pr', pr, branch: prBranch, project: projMeta });
4930
4930
  if (item) {
4931
4931
  newWork.push(item); fixDispatched = true;
4932
- try {
4933
- const prPath = projectPrPath(project);
4934
- mutatePullRequests(prPath, prs => {
4935
- const target = shared.findPrRecord(prs, pr, project);
4936
- if (target) {
4937
- target._buildFixPushedAt = ts();
4938
- }
4939
- });
4940
- } catch (e) { log('warn', 'mark build fix dispatched: ' + e.message); }
4932
+ // W-mpoeirqx0007712a — DO NOT stamp `_buildFixPushedAt` at dispatch
4933
+ // time. The optimistic stamp here used to suppress re-dispatch for
4934
+ // the buildFixGracePeriod window even when the agent never pushed
4935
+ // (stale-worktree push silently rejected, agent reported SUCCESS
4936
+ // anyway). `_buildFixPushedAt` is now written only by
4937
+ // lifecycle.updatePrAfterFix after the post-completion branchChange
4938
+ // confirms the remote head actually advanced (evidence ===
4939
+ // 'remote-head'; legacy callers that omit evidence are still trusted). In-flight dispatches are already deduplicated by
4940
+ // `isPrAutomationCausePending` + `isAlreadyDispatched` above, so no
4941
+ // race window opens by removing the optimistic stamp.
4941
4942
  }
4942
4943
 
4943
4944
  if (pr.agent && !pr._buildFailNotified) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2053",
3
+ "version": "0.1.2055",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"