@yemi33/minions 0.1.2045 → 0.1.2047

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dashboard.js CHANGED
@@ -855,6 +855,67 @@ function linkPullRequestForTracking({ url, title, project: projectName, autoObse
855
855
  return { ...result, prPath, targetProject, projectResolution, prNum };
856
856
  }
857
857
 
858
+ // W-mpmwxkzm0009ba0b — Per-row auto-observe toggle backing helper for
859
+ // POST /api/pull-requests/observe. Flips `_contextOnly` / `_autoObserve` on
860
+ // an existing tracked PR record under a lock (per CLAUDE.md mutate convention).
861
+ // Body shape: { host: 'github'|'ado', slug, number, observe: boolean }.
862
+ // Returns the updated record + the PR path that was touched. Throws an
863
+ // Error with `statusCode` for the route handler to map to an HTTP status.
864
/**
 * Flip the auto-observe flags on an already-tracked pull request record.
 * Backing helper for POST /api/pull-requests/observe.
 *
 * The record is looked up by its canonical id `${host}:${slug}#${number}` in
 * every project PR file first and in the central PR file last; the first file
 * that contains the record wins and later files are not touched.
 *
 * @param {object} [body]
 * @param {string} body.host - 'github' or 'ado' (trimmed, case-insensitive).
 * @param {string} body.slug - Repository slug (trimmed, must be non-empty).
 * @param {number|string} body.number - PR number; must be a positive integer
 *   written in plain decimal digits.
 * @param {boolean} body.observe - true enables auto-observe, false makes the
 *   record context-only.
 * @param {object} [config=CONFIG] - Passed to shared.getProjects.
 * @param {string} [minionsDir=MINIONS_DIR] - Passed to shared.centralPullRequestsPath.
 * @returns {{id: string, _contextOnly: boolean, _autoObserve: boolean, prPath: string}}
 *   The updated flags plus the path of the PR file that was modified.
 * @throws {Error} With `statusCode` 400 on invalid input, or 404 when no
 *   tracked record has the canonical id.
 */
function updatePullRequestObserveFlag({ host, slug, number, observe } = {}, config = CONFIG, minionsDir = MINIONS_DIR) {
  // Error carrying the HTTP status the route handler maps onto the response.
  const httpError = (statusCode, message) => Object.assign(new Error(message), { statusCode });

  const hostStr = String(host || '').trim().toLowerCase();
  if (hostStr !== 'github' && hostStr !== 'ado') {
    throw httpError(400, 'host must be "github" or "ado"');
  }

  const slugStr = String(slug || '').trim();
  if (!slugStr) {
    throw httpError(400, 'slug required');
  }

  // Strict parse: Number.parseInt would silently accept '12abc' (-> 12) and
  // 1.5 (-> 1) and then update a PR the caller never named. Only plain decimal
  // digits that fit in a safe integer are accepted.
  const numberText = typeof number === 'number' ? String(number) : String(number ?? '').trim();
  const numberInt = /^\d+$/.test(numberText) ? Number(numberText) : NaN;
  if (!Number.isSafeInteger(numberInt) || numberInt <= 0) {
    throw httpError(400, 'number must be a positive integer');
  }

  if (typeof observe !== 'boolean') {
    throw httpError(400, 'observe must be a boolean');
  }

  const canonicalId = `${hostStr}:${slugStr}#${numberInt}`;
  const prPaths = [
    ...shared.getProjects(config).map((project) => shared.projectPrPath(project)),
    shared.centralPullRequestsPath(minionsDir),
  ];

  let updated = null;
  for (const prPath of prPaths) {
    // All writes go through shared.mutatePullRequests (the file's mutate
    // convention); the callback always hands the list back, changed or not.
    shared.mutatePullRequests(prPath, (prs) => {
      const pr = Array.isArray(prs) ? prs.find((entry) => entry && entry.id === canonicalId) : undefined;
      if (pr) {
        pr._contextOnly = !observe;
        pr._autoObserve = observe;
        updated = { id: pr.id, _contextOnly: pr._contextOnly, _autoObserve: pr._autoObserve, prPath };
      }
      return prs;
    });
    if (updated) break; // first matching file wins
  }

  if (!updated) {
    throw httpError(404, `pull request ${canonicalId} not found`);
  }
  return updated;
}
918
+
858
919
  function _normalizeSkillDirForCompare(dir) {
859
920
  const resolved = path.resolve(String(dir || '').replace(/\//g, path.sep));
860
921
  return process.platform === 'win32' ? resolved.toLowerCase() : resolved;
@@ -1173,48 +1234,13 @@ function _getDashboardBrowserPresence(now = Date.now()) {
1173
1234
  return { active: activeTabs.length > 0, activeTabs: activeTabs.length, maxAgeMs: DASHBOARD_BROWSER_PRESENCE_MAX_AGE_MS };
1174
1235
  }
1175
1236
 
1176
- // Hot-reload: watch dashboard/ directory for changes, rebuild, and push reload to browsers
1177
- const _hotReloadClients = new Set();
1178
-
1179
- function rebuildDashboardHtml() {
1180
- try {
1181
- const newRaw = buildDashboardHtml();
1182
- if (newRaw === HTML_RAW) return; // no changes
1183
- HTML_RAW = newRaw;
1184
- HTML = HTML_RAW;
1185
- HTML_GZ = zlib.gzipSync(HTML);
1186
- HTML_ETAG = '"' + require('crypto').createHash('md5').update(HTML).digest('hex') + '"';
1187
- // Bust the /api/status cache so the new dashboardBuildId propagates on the
1188
- // next poll — refresh.js compares it against its first-observed value and
1189
- // hard-reloads on mismatch (R3, W-mpgb0xgc000hf1d3).
1190
- try { invalidateStatusCache(); } catch { /* status cache may not be initialized yet */ }
1191
- console.log(' Dashboard hot-reloaded');
1192
- // Push reload to all connected browsers via status-stream (saves a connection)
1193
- for (const res of _statusStreamClients) {
1194
- try { res.write('event: reload\ndata: reload\n\n'); } catch { _removeSseClient(_statusStreamClients, res); }
1195
- }
1196
- // Legacy hot-reload clients
1197
- for (const res of _hotReloadClients) {
1198
- try { res.write('data: reload\n\n'); } catch { _removeSseClient(_hotReloadClients, res); }
1199
- }
1200
- } catch (e) { console.error(' Hot-reload error:', e.message); }
1201
- }
1202
-
1203
- const dashDir = path.join(MINIONS_DIR, 'dashboard');
1204
- if (fs.existsSync(dashDir)) {
1205
- let _reloadTimer = null;
1206
- const scheduleReload = () => {
1207
- if (_reloadTimer) clearTimeout(_reloadTimer);
1208
- _reloadTimer = setTimeout(rebuildDashboardHtml, 300); // debounce 300ms
1209
- };
1210
- // Watch top-level files (styles.css, layout.html)
1211
- try { fs.watch(dashDir, scheduleReload); } catch { /* optional */ }
1212
- // Watch subdirectories (pages/, js/)
1213
- for (const sub of ['pages', 'js']) {
1214
- const subDir = path.join(dashDir, sub);
1215
- if (fs.existsSync(subDir)) try { fs.watch(subDir, scheduleReload); } catch { /* optional */ }
1216
- }
1217
- }
1237
+ // Hot-reload removed (2026-05-26): file-watch-driven rebuilds force-reloaded
1238
+ // every connected browser on any change under dashboard/, which surprised
1239
+ // users when parallel agent merges touched dashboard.js or dashboard/js/*.
1240
+ // The dashboard HTML is now assembled once at process boot. Code changes on
1241
+ // disk land in the running process only after an explicit `minions restart`;
1242
+ // the buildId/startedAt mismatch in refresh.js then triggers a one-shot
1243
+ // browser reload on that next poll — driven by user-initiated restart only.
1218
1244
 
1219
1245
  // -- Data Collectors (most moved to engine/queries.js) --
1220
1246
 
@@ -2337,6 +2363,17 @@ const CC_LOG_ERROR_MAX_LEN = 80; // truncate exception messages in [cc-stream] l
2337
2363
  const CC_STREAM_REATTACH_GRACE_MS = 60000; // keep CC job alive briefly after disconnect so the UI can reattach
2338
2364
  const CC_STREAM_DONE_RETENTION_MS = 30000; // retain final payload briefly so reconnect can still receive it
2339
2365
  const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
2366
+ // W-mpmwxni2000c25c7-b — CC/doc-chat turn watchdog. Resolves per-call from
2367
+ // CONFIG.engine.ccTurnTimeoutMs (defaults to ENGINE_DEFAULTS.ccTurnTimeoutMs)
2368
+ // so an operator can shorten/lengthen the wall-clock cap without a code
2369
+ // change. callLLM's own `timeout` opt only kills the spawned child after a
2370
+ // long idle stretch; this turn-level cap kills WHICHEVER LLM call is in
2371
+ // flight inside ccCall/ccCallStreaming (resume → fresh → final retry).
2372
/**
 * Resolve the wall-clock cap (in ms) for one CC / doc-chat turn.
 *
 * Reads CONFIG.engine.ccTurnTimeoutMs when it is a finite number, otherwise
 * falls back to shared.ENGINE_DEFAULTS.ccTurnTimeoutMs.
 *
 * @returns {number} A positive timeout in ms, or 0 when the resolved value is
 *   not a finite positive number.
 */
function _resolveCcTurnTimeoutMs() {
  const engineCfg = CONFIG && CONFIG.engine;
  const configured = engineCfg ? engineCfg.ccTurnTimeoutMs : undefined;
  // The default is only read when the configured value is unusable.
  const timeoutMs = Number.isFinite(configured)
    ? configured
    : shared.ENGINE_DEFAULTS.ccTurnTimeoutMs;
  if (Number.isFinite(timeoutMs) && timeoutMs > 0) {
    return timeoutMs;
  }
  return 0;
}
2340
2377
  // Doc-chat is interactive — long-doc edits with multi-step Read+Write tool use can run
2341
2378
  // well past 5 min on `canEdit:true` paths. Bumped to 1 hour (matching CC) so legitimate
2342
2379
  // edits aren't killed mid-stream and the backend timeout never beats the user's reading
@@ -3391,6 +3428,22 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3391
3428
  let timeoutTimer = null;
3392
3429
  let resolveResult;
3393
3430
  const promise = new Promise((resolve) => { resolveResult = resolve; });
3431
+ // W-mpmwxni2000c25c7-c — build a single failure envelope shape from a
3432
+ // typed Error (or a plain Error). Reads `.code` / `.retriable` if the
3433
+ // pool stamped them; falls back to safe defaults otherwise so callers
3434
+ // see a consistent `{ ..., errorCode, errorRetriable }` shape. Sub-item
3435
+ // b's SSE writer consumes these to render a structured error event
3436
+ // instead of grepping the stderr string for a code.
3437
+ const _failureEnvelope = (err, defaultCode) => ({
3438
+ text: accumulated,
3439
+ sessionId: sessionHandle ? sessionHandle.sessionId : null,
3440
+ code: 1,
3441
+ usage: {},
3442
+ raw: accumulated,
3443
+ stderr: String((err && err.message) || err || 'cc-worker-pool failure'),
3444
+ errorCode: (err && err.code) || defaultCode || null,
3445
+ errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
3446
+ });
3394
3447
  const finalize = (envelope) => {
3395
3448
  if (settled) return;
3396
3449
  settled = true;
@@ -3415,14 +3468,18 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3415
3468
  timeoutTimer = setTimeout(() => {
3416
3469
  try { sessionHandle && sessionHandle.cancel(); } catch { /* swallow */ }
3417
3470
  try { ccWorkerPool.closeTab(tabKey); } catch { /* swallow */ }
3418
- finalize({
3419
- text: accumulated,
3420
- sessionId: sessionHandle ? sessionHandle.sessionId : null,
3421
- code: 1,
3422
- usage: {},
3423
- raw: accumulated,
3424
- stderr: `doc-chat-pool: timeout after ${timeoutMs}ms`,
3425
- });
3471
+ // W-mpmwxni2000c25c7-c — convert the legacy synthesized
3472
+ // `{ code: 1, stderr: 'doc-chat-pool: timeout after Xms' }` shape into
3473
+ // a typed-error envelope so the SSE writer can render the same
3474
+ // structured error event for timeouts as for spawn/handshake/exit
3475
+ // failures. The error code carries `cc-turn-timeout`; consumers
3476
+ // grep on that instead of parsing the stderr string.
3477
+ const timeoutErr = ccWorkerPool._typedError(
3478
+ `doc-chat-pool: timeout after ${timeoutMs}ms`,
3479
+ ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT,
3480
+ true
3481
+ );
3482
+ finalize(_failureEnvelope(timeoutErr, ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT));
3426
3483
  }, timeoutMs);
3427
3484
  if (typeof timeoutTimer.unref === 'function') timeoutTimer.unref();
3428
3485
  }
@@ -3436,14 +3493,10 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3436
3493
  systemPromptHash: _docChatPromptHash,
3437
3494
  });
3438
3495
  } catch (err) {
3439
- return finalize({
3440
- text: '',
3441
- sessionId: null,
3442
- code: 1,
3443
- usage: {},
3444
- raw: '',
3445
- stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
3446
- });
3496
+ // Pool stamps `.code` (worker-spawn-failed / acp-handshake-failed) on
3497
+ // every error from getSession; fall back to worker-spawn-failed if
3498
+ // the error is a plain Error from somewhere unexpected.
3499
+ return finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED));
3447
3500
  }
3448
3501
  if (cancelled) {
3449
3502
  try { sessionHandle.cancel(); } catch { /* swallow */ }
@@ -3471,14 +3524,15 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3471
3524
  finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
3472
3525
  },
3473
3526
  onError: (err) => {
3474
- finalize({
3475
- text: accumulated,
3476
- sessionId: sessionHandle.sessionId,
3477
- code: cancelled ? 0 : 1,
3478
- usage: {},
3479
- raw: accumulated,
3480
- stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
3481
- });
3527
+ if (cancelled) {
3528
+ // User-driven cancel — not a real error, treat as a clean exit.
3529
+ finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
3530
+ return;
3531
+ }
3532
+ // Pool stamps `.code` (worker-died for mid-stream proc exit).
3533
+ // Fallback default is worker-died because the stream onError is
3534
+ // overwhelmingly fired from the post-handshake exit handler.
3535
+ finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_DIED));
3482
3536
  },
3483
3537
  });
3484
3538
  })();
@@ -3975,12 +4029,65 @@ async function _retryDocChatAfterResumeFailure({ result, initialPass, freshSessi
3975
4029
  // Shape the per-failure debug envelope (raw stderr + classification metadata)
3976
4030
  // shared by hard failures and partial recoveries — keeps the wire shape in lockstep.
3977
4031
  function _buildDocChatErrorEnvelope(result) {
4032
+ // W-mpmwxni2000c25c7-b — also surface the typed `error: {message, code,
4033
+ // retriable}` envelope when llm.callLLM* produced one, so doc-chat clients
4034
+ // get the same shape Command Center handlers emit.
4035
+ const typed = result && result.error;
3978
4036
  return {
3979
4037
  code: result.code ?? null,
3980
4038
  stderr: String(result.stderr || '').slice(-2048),
3981
4039
  errorClass: result.errorClass || null,
3982
4040
  errorMessage: result.errorMessage || null,
3983
4041
  runtime: result.runtime || null,
4042
+ ...(typed ? {
4043
+ typedCode: typed.code || null,
4044
+ typedMessage: typed.message || null,
4045
+ retriable: typed.retriable !== false,
4046
+ } : {}),
4047
+ };
4048
+ }
4049
+
4050
+ // W-mpmwxni2000c25c7-b — race a ccDocCall* promise against a wall-clock turn
4051
+ // timer. On expiry, fires `abortFn` (killing the in-flight CLI) and resolves
4052
+ // with a doc-chat-shaped failure payload that flows through the existing
4053
+ // _docChatFailureResponse / SSE error event paths. timeoutMs <= 0 disables
4054
+ // the watchdog (passthrough).
4055
/**
 * Race a doc-chat call promise against a wall-clock turn timer.
 *
 * @param {Promise<object>} callPromise - The in-flight doc-chat call.
 * @param {number} timeoutMs - Wall-clock cap in ms; a falsy or non-positive
 *   value disables the watchdog and returns `callPromise` unchanged.
 * @param {Function} [abortFn] - Invoked once when the timer expires; anything
 *   it throws is swallowed.
 * @param {string} [label] - Prefix for the timeout message (default 'doc-chat').
 * @returns {Promise<object>} The call's own result when it settles first, or
 *   a synthetic `{ answer, toolUses, error }` payload whose error code is
 *   'cc-turn-timeout' when the timer wins.
 * @throws Whatever `callPromise` rejects with, when it rejects before the timer fires.
 */
async function _raceCcDocChatTimeout(callPromise, timeoutMs, abortFn, label) {
  if (!timeoutMs || timeoutMs <= 0) return callPromise;

  const pending = Promise.resolve(callPromise);
  let timer = null;
  let timedOut = false;
  const timeoutPromise = new Promise((resolve) => {
    // Deliberately not unref'd (kept from the original): the timer is what
    // holds the event loop open while the call is awaited. It is cleared as
    // soon as the race settles.
    timer = setTimeout(() => {
      timedOut = true;
      try { if (abortFn) abortFn(); } catch { /* swallow */ }
      resolve(null);
    }, timeoutMs);
  });

  let winner;
  try {
    winner = await Promise.race([pending, timeoutPromise]);
  } finally {
    // Clear on EVERY exit, including rejection. Previously a rejected call
    // skipped clearTimeout, so the timer fired later and ran abortFn for a
    // turn that had already ended.
    clearTimeout(timer);
  }
  if (!timedOut) return winner;

  // Drain the in-flight call so its cleanup runs before the synthetic
  // envelope is handed back.
  // NOTE(review): this wait is unbounded — if abortFn does not make the call
  // settle, the watchdog itself never returns. Confirm every abort hook does.
  await pending.catch(() => null);

  const message = `${label || 'doc-chat'} turn timed out after ${timeoutMs}ms`;
  return {
    answer: 'Document chat request timed out — try again.',
    toolUses: [],
    error: {
      code: 'cc-turn-timeout',
      stderr: '',
      errorClass: 'cc-turn-timeout',
      errorMessage: message,
      runtime: null,
      typedCode: 'cc-turn-timeout',
      typedMessage: message,
      retriable: true,
    },
  };
}
3986
4093
 
@@ -4014,7 +4121,13 @@ function _recoverPartialDocChatResponse(result, sessionKey) {
4014
4121
 
4015
4122
  function _shouldSuppressDocChatPostPatchError(ccError, finalize) {
4016
4123
  if (!finalize || finalize.edited !== true) return false;
4017
- if (!ccError || ccError.errorClass !== 'unknown-model') return false;
4124
+ if (!ccError) return false;
4125
+ // W-mpmwxni2000c25c7-a — accept both the legacy 'unknown-model' errorClass
4126
+ // (still emitted by the dashboard preflight at _preflightModelCheck) and
4127
+ // the new 'model-unavailable' code returned by copilot.parseError for
4128
+ // invalid-model responses. Suppress in either case — the edit already
4129
+ // landed, so the stale model error shouldn't surface to the user.
4130
+ if (ccError.errorClass !== 'unknown-model' && ccError.errorClass !== 'model-unavailable') return false;
4018
4131
  return String(ccError.runtime || '').toLowerCase() === 'copilot';
4019
4132
  }
4020
4133
 
@@ -6729,7 +6842,14 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6729
6842
  const ccTurnId = 'cct-' + shared.uid();
6730
6843
  const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
6731
6844
 
6732
- let { answer, partial, warning, toolUses, error: ccError } = await ccDocCall({
6845
+ // W-mpmwxni2000c25c7-b wall-clock turn watchdog. The doc-chat call
6846
+ // can internally spawn resume + fresh + final-retry LLM calls; we want
6847
+ // ONE wall-clock cap that covers the whole turn so a runtime stuck
6848
+ // mid-stream can't outlive ccTurnTimeoutMs. On expiry the watchdog
6849
+ // calls _docAbort (kills the in-flight CLI) and the synthesized payload
6850
+ // below flows through the existing _docChatFailureResponse path.
6851
+ const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
6852
+ const _docCallPromise = ccDocCall({
6733
6853
  message: body.message, document: currentContent, title: body.title,
6734
6854
  filePath: body.filePath, selection: body.selection, canEdit, isJson,
6735
6855
  model: body.model || undefined,
@@ -6739,6 +6859,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6739
6859
  systemPrompt: turnSystemPrompt,
6740
6860
  turnId: ccTurnId,
6741
6861
  });
6862
+ const _docCallResult = await _raceCcDocChatTimeout(_docCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat');
6863
+ let { answer, partial, warning, toolUses, error: ccError } = _docCallResult;
6742
6864
  const finalize = _finalizeDocChatEdit({
6743
6865
  filePath: body.filePath, fullPath, isJson, canEdit,
6744
6866
  originalContent: currentContent, delimiterContent: null,
@@ -6752,6 +6874,25 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6752
6874
  ccError, partial, warning, toolUses, finalize,
6753
6875
  });
6754
6876
  _docDone = true;
6877
+ // W-mpmwxni2000c25c7-b — track every surfaced doc-chat error code so
6878
+ // /api/metrics reflects silent-error regressions. Hard failures (no
6879
+ // partial recovery, no edited file) graduate to 5xx so the client can
6880
+ // render a real error UI instead of treating the polite "Failed to
6881
+ // process request" string as a successful turn.
6882
+ if (ccError) {
6883
+ const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
6884
+ llm.trackEngineError('doc-chat', errCode);
6885
+ const isHardFailure = !partial && !(finalize && finalize.edited);
6886
+ if (isHardFailure) {
6887
+ const status = errCode === shared.FAILURE_CLASS.CONFIG_ERROR ? 503 : 502;
6888
+ return jsonReply(res, status, {
6889
+ ...payload,
6890
+ error: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
6891
+ code: errCode,
6892
+ retriable: ccError.retriable !== false,
6893
+ });
6894
+ }
6895
+ }
6755
6896
  return jsonReply(res, 200, payload);
6756
6897
  } finally { _docAbort = null; _docDone = true; docChatInFlight.delete(docKey); }
6757
6898
  } catch (e) { return jsonReply(res, e.statusCode || 500, { error: e.message }); }
@@ -6840,7 +6981,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6840
6981
  const ccTurnId = 'cct-' + shared.uid();
6841
6982
  const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
6842
6983
 
6843
- let { answer, partial, warning, toolUses, error: ccError } = await ccDocCallStreaming({
6984
+ // W-mpmwxni2000c25c7-b wall-clock turn watchdog (mirrors the
6985
+ // non-stream handleDocChat path). On expiry _docAbort kills the
6986
+ // in-flight LLM and the synthesized payload below flows through the
6987
+ // SSE done frame the client already expects with `error` set.
6988
+ const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
6989
+ const _docStreamCallPromise = ccDocCallStreaming({
6844
6990
  message: body.message, document: currentContent, title: body.title,
6845
6991
  filePath: body.filePath, selection: body.selection, canEdit, isJson,
6846
6992
  model: body.model || undefined,
@@ -6853,6 +6999,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6853
6999
  systemPrompt: turnSystemPrompt,
6854
7000
  turnId: ccTurnId,
6855
7001
  });
7002
+ const _docStreamResult = await _raceCcDocChatTimeout(_docStreamCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat-stream');
7003
+ let { answer, partial, warning, toolUses, error: ccError } = _docStreamResult;
6856
7004
  const finalize = _finalizeDocChatEdit({
6857
7005
  filePath: body.filePath, fullPath, isJson, canEdit,
6858
7006
  originalContent: currentContent, delimiterContent: null,
@@ -6865,6 +7013,23 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6865
7013
  actionFeedback: null, actionParseError: null,
6866
7014
  ccError, partial, warning, toolUses, finalize,
6867
7015
  });
7016
+ // W-mpmwxni2000c25c7-b — track surfaced doc-chat error codes for
7017
+ // /api/metrics and emit a named SSE `event: error` frame so the
7018
+ // client can render a typed error instead of treating the polite
7019
+ // fallback string as a normal completion.
7020
+ if (ccError) {
7021
+ const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
7022
+ llm.trackEngineError('doc-chat', errCode);
7023
+ const isHardFailure = !partial && !(finalize && finalize.edited);
7024
+ if (isHardFailure) {
7025
+ const errPayload = {
7026
+ message: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
7027
+ code: errCode,
7028
+ retriable: ccError.retriable !== false,
7029
+ };
7030
+ try { res.write(`event: error\ndata: ${JSON.stringify(errPayload)}\n\n`); } catch {}
7031
+ }
7032
+ }
6868
7033
  const { answer: finalAnswer, ...donePayload } = payload;
6869
7034
  writeDocEvent({
6870
7035
  type: 'done',
@@ -7461,21 +7626,40 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7461
7626
  // confirmation chips in the assistant reply.
7462
7627
  const ccTurnId = 'cct-' + shared.uid();
7463
7628
  const turnSystemPrompt = renderCcSystemPromptForTurn(ccTurnId);
7464
- const result = await ccCall(body.message, { store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId });
7465
-
7466
- // Non-zero exit with text = max_turns or partial success — still usable
7467
- if (!result.text) {
7629
+ // W-mpmwxni2000c25c7-b wall-clock turn watchdog. On expiry the
7630
+ // in-flight LLM call is aborted and ccCall returns a synthetic
7631
+ // envelope with error.code === 'cc-turn-timeout'.
7632
+ const turnTimeoutMs = _resolveCcTurnTimeoutMs();
7633
+ const result = await llm.withCcTurnTimeout({
7634
+ timeoutMs: turnTimeoutMs, label: 'command-center',
7635
+ }, (registerAbort) => ccCall(body.message, {
7636
+ store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId,
7637
+ onAbortReady: registerAbort,
7638
+ }));
7639
+
7640
+ // W-mpmwxni2000c25c7-b — typed-error envelope path. Any failure that
7641
+ // produced no usable text is surfaced to the client as 5xx JSON
7642
+ // `{ error, code, retriable }` instead of a polite 200 "I had trouble
7643
+ // processing that" string that silently halves CC retry signal.
7644
+ if (!result.text || result.error) {
7645
+ const errEnvelope = result.error || (result.errorMessage
7646
+ ? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
7647
+ : { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
7648
+ llm.trackEngineError('command-center', errEnvelope.code);
7468
7649
  const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
7469
7650
  const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-5).join(' | ');
7470
- console.error(`[CC] LLM failed after retries ${debugInfo}: ${stderrTail}`);
7471
- try { shared.log('warn', `CC failed ${debugInfo}: ${stderrTail.slice(0, 300)}`); } catch {}
7472
- const hasSession = !!ccSession.sessionId;
7473
- const retryHint = hasSession
7474
- ? 'Your session is still active — just send your message again to retry.'
7475
- : 'Try clicking **New Session** and sending your message again.';
7476
- return jsonReply(res, 200, {
7477
- text: `I had trouble processing that ${debugInfo}. ${stderrTail ? 'Detail: ' + stderrTail : ''}\n\n${retryHint}`,
7478
- actions: [], sessionId: ccSession.sessionId
7651
+ console.error(`[CC] LLM failed after retries ${debugInfo} code=${errEnvelope.code}: ${stderrTail}`);
7652
+ try { shared.log('warn', `CC failed ${debugInfo} code=${errEnvelope.code}: ${stderrTail.slice(0, 300)}`); } catch {}
7653
+ // Missing-runtime is a 503 (service config); auth-failure also 503; other classes 502.
7654
+ const status = result.missingRuntime ? 503
7655
+ : errEnvelope.code === 'auth-failure' ? 503
7656
+ : 502;
7657
+ return jsonReply(res, status, {
7658
+ error: errEnvelope.message,
7659
+ code: errEnvelope.code,
7660
+ retriable: !!errEnvelope.retriable,
7661
+ sessionId: ccSession.sessionId || null,
7662
+ ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}),
7479
7663
  });
7480
7664
  }
7481
7665
 
@@ -7496,7 +7680,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7496
7680
  } finally {
7497
7681
  _releaseCCTab(tabId);
7498
7682
  }
7499
- } catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message }); }
7683
+ } catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message, code: 'handler-exception', retriable: false }); }
7500
7684
  }
7501
7685
 
7502
7686
  /** Build a lightweight input object for SSE tool events — keeps only the fields formatToolSummary needs, with truncated string values. */
@@ -7618,6 +7802,11 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7618
7802
  });
7619
7803
  } catch (err) {
7620
7804
  _emitTimingLog(null, null, Date.now(), 'spawn-failed');
7805
+ // W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
7806
+ // (`code`, `retriable`) onto the envelope so the SSE writer can
7807
+ // render a structured error event instead of grepping the stderr
7808
+ // string. Pool stamps `.code` (worker-spawn-failed or
7809
+ // acp-handshake-failed) on every getSession rejection.
7621
7810
  return resolveResult({
7622
7811
  text: '',
7623
7812
  sessionId: null,
@@ -7625,6 +7814,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7625
7814
  usage: {},
7626
7815
  raw: '',
7627
7816
  stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
7817
+ errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED,
7818
+ errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
7628
7819
  });
7629
7820
  }
7630
7821
  const _tSessionReady = Date.now();
@@ -7671,13 +7862,29 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7671
7862
  },
7672
7863
  onError: (err) => {
7673
7864
  _emitTimingLog(_lifecycle, _tSessionReady, Date.now(), cancelled ? 'cancelled' : 'error');
7865
+ if (cancelled) {
7866
+ resolveResult({
7867
+ text: accumulated,
7868
+ sessionId: sessionHandle.sessionId,
7869
+ code: 0,
7870
+ usage: {},
7871
+ raw: accumulated,
7872
+ stderr: '',
7873
+ });
7874
+ return;
7875
+ }
7876
+ // W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
7877
+ // through. mid-stream worker exits stamp `.code = 'worker-died'`
7878
+ // on the Error before invoking onError.
7674
7879
  resolveResult({
7675
7880
  text: accumulated,
7676
7881
  sessionId: sessionHandle.sessionId,
7677
- code: cancelled ? 0 : 1,
7882
+ code: 1,
7678
7883
  usage: {},
7679
7884
  raw: accumulated,
7680
7885
  stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
7886
+ errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_DIED,
7887
+ errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
7681
7888
  });
7682
7889
  },
7683
7890
  });
@@ -8032,73 +8239,100 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8032
8239
  : '';
8033
8240
  const prompt = _joinCcPromptParts(preamble, resumeGuard, carryover, turnHeader, projectContextPart, body.message);
8034
8241
 
8035
- const { trackEngineUsage: trackUsage } = require('./engine/llm');
8242
+ const { trackEngineUsage: trackUsage, trackEngineError: trackErr, withCcTurnTimeout: withTimeout } = require('./engine/llm');
8036
8243
  const streamModel = CONFIG.engine?.ccModel || shared.ENGINE_DEFAULTS.ccModel;
8037
8244
  const streamEffort = CONFIG.engine?.ccEffort || shared.ENGINE_DEFAULTS.ccEffort;
8038
8245
  const ccMaxTurns = CONFIG.engine?.ccMaxTurns || shared.ENGINE_DEFAULTS.ccMaxTurns;
8039
8246
  let toolUses = [];
8040
- const llmPromise = _invokeCcStream({
8041
- prompt, sessionId, liveState, toolUses,
8042
- model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
8043
- engineConfig: CONFIG.engine,
8044
- systemPrompt: turnSystemPrompt,
8045
- tabId,
8046
- });
8047
- _ccStreamAbort = llmPromise.abort;
8048
- liveState.abortFn = _ccStreamAbort;
8049
- ccInFlightAborts.set(tabId, _ccStreamAbort);
8050
- const result = await llmPromise;
8051
- trackUsage('command-center', result.usage);
8052
-
8053
- if (result.missingRuntime) {
8054
- finishMissingRuntime(result, liveState);
8055
- return;
8056
- }
8057
-
8058
- // Handle failure — non-zero exit with text = max_turns or partial success, still usable
8059
- if (!result.text && wasResume && result.code !== 0 && !req.destroyed) {
8060
- // Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
8061
- console.log(`[CC-stream] Resume failed (code=${result.code}) — retrying fresh`);
8062
- const freshPreamble = buildCCStatePreamble();
8063
- const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
8064
- const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
8065
- toolUses = []; // discard stale metadata from the failed resume attempt
8066
- const retryPromise = _invokeCcStream({
8067
- prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
8247
+ // W-mpmwxni2000c25c7-b — turn-level watchdog. Wraps the initial
8248
+ // _invokeCcStream PLUS the post-resume-fail retry so the wall clock
8249
+ // covers the entire CC turn (not just one underlying LLM call). On
8250
+ // expiry, whichever call is in flight is aborted; the watchdog
8251
+ // resolves with a synthetic `{ error: { code: 'cc-turn-timeout' } }`
8252
+ // envelope so the SSE error path below kicks in.
8253
+ const turnTimeoutMs = _resolveCcTurnTimeoutMs();
8254
+ const result = await withTimeout({
8255
+ timeoutMs: turnTimeoutMs, label: 'command-center-stream',
8256
+ }, async (registerAbort) => {
8257
+ const llmPromise = _invokeCcStream({
8258
+ prompt, sessionId, liveState, toolUses,
8068
8259
  model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
8069
8260
  engineConfig: CONFIG.engine,
8070
8261
  systemPrompt: turnSystemPrompt,
8071
8262
  tabId,
8072
8263
  });
8073
- _ccStreamAbort = retryPromise.abort;
8264
+ _ccStreamAbort = llmPromise.abort;
8074
8265
  liveState.abortFn = _ccStreamAbort;
8075
8266
  ccInFlightAborts.set(tabId, _ccStreamAbort);
8076
- const retryResult = await retryPromise;
8077
- trackUsage('command-center', retryResult.usage);
8078
- if (retryResult.text) {
8079
- // Fresh session succeeded — use retryResult from here
8080
- Object.assign(result, retryResult);
8267
+ registerAbort(_ccStreamAbort);
8268
+ const initial = await llmPromise;
8269
+ trackUsage('command-center', initial.usage);
8270
+
8271
+ if (initial.missingRuntime) return initial;
8272
+
8273
+ // Handle failure — non-zero exit with text = max_turns or partial success, still usable
8274
+ if (!initial.text && wasResume && initial.code !== 0 && !req.destroyed) {
8275
+ // Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
8276
+ console.log(`[CC-stream] Resume failed (code=${initial.code}) — retrying fresh`);
8277
+ const freshPreamble = buildCCStatePreamble();
8278
+ const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
8279
+ const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
8280
+ toolUses = []; // discard stale metadata from the failed resume attempt
8281
+ const retryPromise = _invokeCcStream({
8282
+ prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
8283
+ model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
8284
+ engineConfig: CONFIG.engine,
8285
+ systemPrompt: turnSystemPrompt,
8286
+ tabId,
8287
+ });
8288
+ _ccStreamAbort = retryPromise.abort;
8289
+ liveState.abortFn = _ccStreamAbort;
8290
+ ccInFlightAborts.set(tabId, _ccStreamAbort);
8291
+ registerAbort(_ccStreamAbort);
8292
+ const retryResult = await retryPromise;
8293
+ trackUsage('command-center', retryResult.usage);
8294
+ if (retryResult.text) {
8295
+ // Fresh session succeeded — use retryResult from here
8296
+ Object.assign(initial, retryResult);
8297
+ // Clear the error envelope inherited from the failed first attempt
8298
+ // so the success path below doesn't misclassify a recovered turn.
8299
+ if (retryResult.text) { initial.error = null; initial.ok = true; }
8300
+ } else if (retryResult.error) {
8301
+ initial.error = retryResult.error;
8302
+ }
8081
8303
  }
8082
- }
8304
+ return initial;
8305
+ });
8083
8306
  if (result.missingRuntime) {
8084
8307
  finishMissingRuntime(result, liveState);
8085
8308
  return;
8086
8309
  }
8087
- if (!result.text) {
8310
+ if (!result.text || result.error) {
8088
8311
  if (req.destroyed) {
8089
8312
  _ccStreamEnded = true;
8090
8313
  _logCcStreamEnd(_ccTelemetry, 'llm-empty-client-gone', { code: result.code });
8091
8314
  return;
8092
8315
  }
8093
- const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
8316
+ // W-mpmwxni2000c25c7-b — surface the typed error envelope as a
8317
+ // distinct SSE `event: error` frame so the client renders a real
8318
+ // error UI (with a retry hint derived from `retriable`) instead of
8319
+ // swallowing a polite 200 "I had trouble processing that" string.
8320
+ const envelope = result.error || (result.errorMessage
8321
+ ? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
8322
+ : { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
8323
+ trackErr('command-center', envelope.code);
8094
8324
  const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-3).join(' | ');
8095
- console.error(`[CC-stream] Failed: code=${result.code}, stderr=${(result.stderr || '').slice(0, 500)}, stdout_tail=${(result.raw || '').slice(-500)}`);
8096
- const retryHint = 'Send your message again to retry.';
8097
- liveState.donePayload = { type: 'done', text: `I had trouble processing that ${debugInfo}. ${stderrTail ? 'Detail: ' + stderrTail : ''}\n\n${retryHint}`, actions: [], sessionId: null };
8325
+ console.error(`[CC-stream] Failed code=${envelope.code} retriable=${envelope.retriable}: ${(result.stderr || '').slice(0, 500)}; stdout_tail=${(result.raw || '').slice(-500)}`);
8326
+ // Emit `event: error` (named SSE frame), then a terminal `type: 'error'` frame
8327
+ // for clients that only handle the default message channel, then
8328
+ // close cleanly so the EventSource exits its read loop without
8329
+ // throwing a connection-reset.
8330
+ try { res.write(`event: error\ndata: ${JSON.stringify({ message: envelope.message, code: envelope.code, retriable: !!envelope.retriable, ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}) })}\n\n`); } catch {}
8331
+ liveState.donePayload = { type: 'error', error: envelope.message, code: envelope.code, retriable: !!envelope.retriable, sessionId: null };
8098
8332
  if (liveState.writer) liveState.writer(liveState.donePayload);
8099
8333
  if (liveState.endResponse) liveState.endResponse();
8100
8334
  _scheduleCcLiveCleanup(tabId);
8101
- _logCcStreamEnd(_ccTelemetry, 'llm-failed-fallback-sent', { code: result.code });
8335
+ _logCcStreamEnd(_ccTelemetry, 'llm-failed-error-envelope-sent', { code: result.code, errorCode: envelope.code });
8102
8336
  return;
8103
8337
  }
8104
8338
 
@@ -8670,7 +8904,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8670
8904
  if (_isClear(e.defaultModel)) _deleteEngineConfig('defaultModel');
8671
8905
  else {
8672
8906
  const candidate = String(e.defaultModel);
8673
- const resolvedCli = config.engine.defaultCli || 'claude';
8907
+ const resolvedCli = config.engine.defaultCli || 'copilot';
8674
8908
  const rejection = await _validateFleetModel(candidate, resolvedCli);
8675
8909
  if (rejection) _clamped.push(`engine.defaultModel: "${candidate}" ${rejection} — kept previous value`);
8676
8910
  else _setEngineConfig('defaultModel', candidate);
@@ -8680,7 +8914,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8680
8914
  if (_isClear(e.ccModel)) _deleteEngineConfig('ccModel');
8681
8915
  else {
8682
8916
  const candidate = String(e.ccModel);
8683
- const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'claude';
8917
+ const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'copilot';
8684
8918
  const rejection = await _validateFleetModel(candidate, resolvedCli);
8685
8919
  if (rejection) _clamped.push(`engine.ccModel: "${candidate}" ${rejection} — kept previous value`);
8686
8920
  else _setEngineConfig('ccModel', candidate);
@@ -8798,7 +9032,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8798
9032
  if (updates.model === '' || updates.model === null) delete config.agents[id].model;
8799
9033
  else {
8800
9034
  const candidate = String(updates.model);
8801
- const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'claude';
9035
+ const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'copilot';
8802
9036
  const runtimeModelStr = _resolveModelForRuntime(candidate, resolvedCli);
8803
9037
  const knownModels = await _modelsFor(resolvedCli);
8804
9038
  // Two validation paths:
@@ -9792,12 +10026,6 @@ What would you like to discuss or change? When you're happy, say "approve" and I
9792
10026
  { method: 'GET', path: '/api/qa/runs', desc: 'List QA validation runs (newest first). Optional ?limit=N and ?status=pending|running|passed|failed|errored filters.', handler: handleQaRunsList },
9793
10027
  { method: 'GET', path: /^\/api\/qa\/runs\/([^/?]+)$/, template: '/api/qa/runs/<id>', desc: 'Fetch a single QA run record by id.', handler: handleQaRunsById },
9794
10028
  { method: 'GET', path: /^\/api\/qa\/artifacts\/([^/?]+)\/([^?]+)$/, template: '/api/qa/artifacts/<runId>/<file>', desc: 'Serve a QA artifact file (image/video/log). Sandboxed to engine/qa-artifacts/; rejects path traversal with 403.', handler: handleQaArtifact },
9795
- { method: 'GET', path: '/api/hot-reload', desc: 'SSE stream for dashboard hot-reload notifications', handler: (req, res) => {
9796
- res.writeHead(200, { 'Content-Type': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive' });
9797
- res.write('data: connected\n\n');
9798
- _trackSseClient(_hotReloadClients, req, res);
9799
- }},
9800
-
9801
10029
  // QA Runbooks (W-mpeiwz6k0005bf34-a) — per-project test plans stored at
9802
10030
  // <MINIONS_DIR>/projects/<name>/runbooks/<id>.json. Pure persistence —
9803
10031
  // dispatch + run records + UI live in follow-up plan items.
@@ -10020,6 +10248,18 @@ What would you like to discuss or change? When you're happy, say "approve" and I
10020
10248
  })();
10021
10249
  }},
10022
10250
 
10251
+ { method: 'POST', path: '/api/pull-requests/observe', desc: 'Toggle auto-observe (_contextOnly flag) on a tracked PR', params: 'host (github|ado), slug, number, observe (boolean)', handler: async (req, res) => {
  // Route handler: reads the request body, calls reloadConfig(), then hands
  // the body and CONFIG to updatePullRequestObserveFlag.
10252
+ const body = await readBody(req);
10253
+ reloadConfig();
  // NOTE(review): readBody and reloadConfig run outside the try below, so a
  // failure in either is not converted into the JSON error reply — confirm
  // the route dispatcher handles rejected handlers.
10254
+ try {
10255
+ const result = updatePullRequestObserveFlag(body, CONFIG);
10256
+ invalidateStatusCache();
10257
+ return jsonReply(res, 200, { ok: true, ...result, observe: !result._contextOnly });
  // `observe` in the reply is the negation of `result._contextOnly`.
10258
+ } catch (e) {
10259
+ return jsonReply(res, e.statusCode || 400, { error: e.message });
  // Errors without a `statusCode` are reported as 400.
  // NOTE(review): that includes any unexpected non-validation failure thrown
  // by the helper — confirm 400 (rather than 5xx) is intended for those.
10260
+ }
10261
+ }},
10262
+
10023
10263
  { method: 'POST', path: '/api/pull-requests/delete', desc: 'Remove a PR from tracking', params: 'id, project?', handler: async (req, res) => {
10024
10264
  const body = await readBody(req);
10025
10265
  const { id } = body;
@@ -10629,6 +10869,7 @@ module.exports = {
10629
10869
  _buildDocChatResponsePayload,
10630
10870
  _inferDocChatProject,
10631
10871
  _linkPullRequestForTracking: linkPullRequestForTracking,
10872
+ _updatePullRequestObserveFlag: updatePullRequestObserveFlag,
10632
10873
  _resolveSkillReadPath,
10633
10874
  // Per-CC-turn correlation surface
10634
10875
  _ccTurnCreations,