npm - @yemi33/minions - Versions diffs - 0.1.2045 → 0.1.2047 - Mend

@yemi33/minions 0.1.2045 → 0.1.2047

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/README.md +2 -2
package/dashboard/js/fre.js +3 -2
package/dashboard/js/render-prs.js +82 -2
package/dashboard/js/settings.js +5 -5
package/dashboard/styles.css +11 -0
package/dashboard.js +376 -135
package/docs/copilot-cli-schema.md +2 -1
package/docs/runtime-adapters.md +9 -4
package/engine/cc-worker-pool.js +87 -11
package/engine/llm.js +148 -2
package/engine/preflight.js +5 -5
package/engine/queries.js +75 -35
package/engine/runtimes/claude.js +41 -0
package/engine/runtimes/copilot.js +97 -3
package/engine/shared.js +4 -3
package/package.json +1 -1

package/dashboard.js CHANGED Viewed

@@ -855,6 +855,67 @@ function linkPullRequestForTracking({ url, title, project: projectName, autoObse
   return { ...result, prPath, targetProject, projectResolution, prNum };
 }
+// W-mpmwxkzm0009ba0b — Per-row auto-observe toggle backing helper for
+// POST /api/pull-requests/observe. Flips `_contextOnly` / `_autoObserve` on
+// an existing tracked PR record under a lock (per CLAUDE.md mutate convention).
+// Body shape: { host: 'github'|'ado', slug, number, observe: boolean }.
+// Returns { id, _contextOnly, _autoObserve } for the updated record plus the
+// PR path that was touched. Throws an Error with `statusCode` for the route
+// handler to map to an HTTP status.
+function updatePullRequestObserveFlag({ host, slug, number, observe } = {}, config = CONFIG, minionsDir = MINIONS_DIR) { // validates the body, then flips the observe flags on the first tracked PR whose id matches
+  const hostStr = String(host || '').trim().toLowerCase(); // host is compared case-insensitively
+  const slugStr = String(slug || '').trim();
+  const numberInt = Number.parseInt(number, 10); // NOTE(review): parseInt accepts a numeric prefix (12abc parses as 12, 1.5 as 1) - confirm that leniency is intended
+  if (!hostStr || (hostStr !== 'github' && hostStr !== 'ado')) { // each validation failure below throws an Error carrying statusCode 400
+    const err = new Error('host must be "github" or "ado"');
+    err.statusCode = 400;
+    throw err;
+  }
+  if (!slugStr) {
+    const err = new Error('slug required');
+    err.statusCode = 400;
+    throw err;
+  }
+  if (!Number.isFinite(numberInt) || numberInt <= 0) { // rejects NaN (unparseable input), zero and negatives
+    const err = new Error('number must be a positive integer');
+    err.statusCode = 400;
+    throw err;
+  }
+  if (typeof observe !== 'boolean') { // strict type check: truthy strings or numbers are rejected
+    const err = new Error('observe must be a boolean');
+    err.statusCode = 400;
+    throw err;
+  }
+  const canonicalId = `${hostStr}:${slugStr}#${numberInt}`; // records are matched on this host:slug#number id
+  const projects = shared.getProjects(config);
+  const prPaths = [ // search order: every per-project PR file first, the central file last
+    ...projects.map(p => shared.projectPrPath(p)),
+    shared.centralPullRequestsPath(minionsDir),
+  ];
+  let updated = null;
+  let updatedPath = null;
+  for (const prPath of prPaths) {
+    if (updated) break; // stop at the first file that held the record
+    shared.mutatePullRequests(prPath, (prs) => { // NOTE(review): the loop and the 404 check read `updated` right after this call - assumes the callback runs synchronously; helper is defined outside this view, confirm
+      const pr = prs.find(p => p && p.id === canonicalId); // null or undefined entries are skipped
+      if (!pr) return prs; // no match in this file: hand the list back unchanged
+      pr._contextOnly = !observe; // the two flags are always written as inverses of each other
+      pr._autoObserve = !!observe;
+      updated = { id: pr.id, _contextOnly: pr._contextOnly, _autoObserve: pr._autoObserve }; // only these three fields are reported back, not the whole record
+      updatedPath = prPath;
+      return prs;
+    });
+  }
+  if (!updated) { // none of the searched files contained canonicalId
+    const err = new Error(`pull request ${canonicalId} not found`);
+    err.statusCode = 404;
+    throw err;
+  }
+  return { ...updated, prPath: updatedPath }; // summary fields plus the file that was modified
+}
 function _normalizeSkillDirForCompare(dir) {
   const resolved = path.resolve(String(dir || '').replace(/\//g, path.sep));
   return process.platform === 'win32' ? resolved.toLowerCase() : resolved;
@@ -1173,48 +1234,13 @@ function _getDashboardBrowserPresence(now = Date.now()) {
   return { active: activeTabs.length > 0, activeTabs: activeTabs.length, maxAgeMs: DASHBOARD_BROWSER_PRESENCE_MAX_AGE_MS };
 }
-// Hot-reload: watch dashboard/ directory for changes, rebuild, and push reload to browsers
-const _hotReloadClients = new Set();
-function rebuildDashboardHtml() {
-  try {
-    const newRaw = buildDashboardHtml();
-    if (newRaw === HTML_RAW) return; // no changes
-    HTML_RAW = newRaw;
-    HTML = HTML_RAW;
-    HTML_GZ = zlib.gzipSync(HTML);
-    HTML_ETAG = '"' + require('crypto').createHash('md5').update(HTML).digest('hex') + '"';
-    // Bust the /api/status cache so the new dashboardBuildId propagates on the
-    // next poll — refresh.js compares it against its first-observed value and
-    // hard-reloads on mismatch (R3, W-mpgb0xgc000hf1d3).
-    try { invalidateStatusCache(); } catch { /* status cache may not be initialized yet */ }
-    console.log('  Dashboard hot-reloaded');
-    // Push reload to all connected browsers via status-stream (saves a connection)
-    for (const res of _statusStreamClients) {
-      try { res.write('event: reload\ndata: reload\n\n'); } catch { _removeSseClient(_statusStreamClients, res); }
-    }
-    // Legacy hot-reload clients
-    for (const res of _hotReloadClients) {
-      try { res.write('data: reload\n\n'); } catch { _removeSseClient(_hotReloadClients, res); }
-    }
-  } catch (e) { console.error('  Hot-reload error:', e.message); }
-}
-const dashDir = path.join(MINIONS_DIR, 'dashboard');
-if (fs.existsSync(dashDir)) {
-  let _reloadTimer = null;
-  const scheduleReload = () => {
-    if (_reloadTimer) clearTimeout(_reloadTimer);
-    _reloadTimer = setTimeout(rebuildDashboardHtml, 300); // debounce 300ms
-  };
-  // Watch top-level files (styles.css, layout.html)
-  try { fs.watch(dashDir, scheduleReload); } catch { /* optional */ }
-  // Watch subdirectories (pages/, js/)
-  for (const sub of ['pages', 'js']) {
-    const subDir = path.join(dashDir, sub);
-    if (fs.existsSync(subDir)) try { fs.watch(subDir, scheduleReload); } catch { /* optional */ }
-  }
-}
+// Hot-reload removed (2026-05-26): file-watch-driven rebuilds force-reloaded
+// every connected browser on any change under dashboard/, which surprised
+// users when parallel agent merges touched dashboard.js or dashboard/js/*.
+// The dashboard HTML is now assembled once at process boot. Code changes on
+// disk land in the running process only after an explicit `minions restart`;
+// the buildId/startedAt mismatch in refresh.js then triggers a one-shot
+// browser reload on the next poll — driven by user-initiated restart only.
 // -- Data Collectors (most moved to engine/queries.js) --
@@ -2337,6 +2363,17 @@ const CC_LOG_ERROR_MAX_LEN = 80; // truncate exception messages in [cc-stream] l
 const CC_STREAM_REATTACH_GRACE_MS = 60000; // keep CC job alive briefly after disconnect so the UI can reattach
 const CC_STREAM_DONE_RETENTION_MS = 30000; // retain final payload briefly so reconnect can still receive it
 const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
+// W-mpmwxni2000c25c7-b — CC/doc-chat turn watchdog. Resolves per-call from
+// CONFIG.engine.ccTurnTimeoutMs (defaults to ENGINE_DEFAULTS.ccTurnTimeoutMs)
+// so an operator can shorten/lengthen the wall-clock cap without a code
+// change. callLLM's own `timeout` opt only kills the spawned child after a
+// long idle stretch; this turn-level cap kills WHICHEVER LLM call is in
+// flight inside ccCall/ccCallStreaming (resume → fresh → final retry).
+function _resolveCcTurnTimeoutMs() { // returns the turn cap in milliseconds, or 0 when no positive finite value is available
+  const cfg = CONFIG && CONFIG.engine; // tolerate a missing CONFIG or a missing engine section
+  const candidate = cfg && Number.isFinite(cfg.ccTurnTimeoutMs) ? cfg.ccTurnTimeoutMs : shared.ENGINE_DEFAULTS.ccTurnTimeoutMs; // a missing or non-finite override falls back to the engine default
+  return Number.isFinite(candidate) && candidate > 0 ? candidate : 0; // a configured 0 or negative value yields 0 (not the default); _raceCcDocChatTimeout treats a non-positive timeout as watchdog disabled
+}
 // Doc-chat is interactive — long-doc edits with multi-step Read+Write tool use can run
 // well past 5 min on `canEdit:true` paths. Bumped to 1 hour (matching CC) so legitimate
 // edits aren't killed mid-stream and the backend timeout never beats the user's reading
@@ -3391,6 +3428,22 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
   let timeoutTimer = null;
   let resolveResult;
   const promise = new Promise((resolve) => { resolveResult = resolve; });
+  // W-mpmwxni2000c25c7-c — build a single failure envelope shape from a
+  // typed Error (or a plain Error). Reads `.code` / `.retriable` if the
+  // pool stamped them; falls back to safe defaults otherwise so callers
+  // see a consistent `{ ..., errorCode, errorRetriable }` shape. Sub-item
+  // b's SSE writer consumes these to render a structured error event
+  // instead of grepping the stderr string for a code.
+  const _failureEnvelope = (err, defaultCode) => ({ // builds the failure result object that the call sites in this function pass to finalize()
+    text: accumulated, // accumulated is declared outside this view - presumably the output streamed so far; confirm
+    sessionId: sessionHandle ? sessionHandle.sessionId : null, // sessionHandle can still be unset when the failure happens before a session exists
+    code: 1, // failures always report a non-zero code
+    usage: {},
+    raw: accumulated,
+    stderr: String((err && err.message) || err || 'cc-worker-pool failure'), // message of an Error, else the value itself, else a fixed fallback string
+    errorCode: (err && err.code) || defaultCode || null, // a code stamped on the error wins over the default supplied by the call site
+    errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true, // retriable unless the error explicitly carries a retriable value
+  });
   const finalize = (envelope) => {
     if (settled) return;
     settled = true;
@@ -3415,14 +3468,18 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
     timeoutTimer = setTimeout(() => {
       try { sessionHandle && sessionHandle.cancel(); } catch { /* swallow */ }
       try { ccWorkerPool.closeTab(tabKey); } catch { /* swallow */ }
-      finalize({
-        text: accumulated,
-        sessionId: sessionHandle ? sessionHandle.sessionId : null,
-        code: 1,
-        usage: {},
-        raw: accumulated,
-        stderr: `doc-chat-pool: timeout after ${timeoutMs}ms`,
-      });
+      // W-mpmwxni2000c25c7-c — convert the legacy synthesized
+      // `{ code: 1, stderr: 'doc-chat-pool: timeout after Xms' }` shape into
+      // a typed-error envelope so the SSE writer can render the same
+      // structured error event for timeouts as for spawn/handshake/exit
+      // failures. The error code carries `cc-turn-timeout`; consumers
+      // grep on that instead of parsing the stderr string.
+      const timeoutErr = ccWorkerPool._typedError(
+        `doc-chat-pool: timeout after ${timeoutMs}ms`,
+        ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT,
+        true
+      );
+      finalize(_failureEnvelope(timeoutErr, ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT));
     }, timeoutMs);
     if (typeof timeoutTimer.unref === 'function') timeoutTimer.unref();
   }
@@ -3436,14 +3493,10 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
         systemPromptHash: _docChatPromptHash,
       });
     } catch (err) {
-      return finalize({
-        text: '',
-        sessionId: null,
-        code: 1,
-        usage: {},
-        raw: '',
-        stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
-      });
+      // Pool stamps `.code` (worker-spawn-failed / acp-handshake-failed) on
+      // every error from getSession; fall back to worker-spawn-failed if
+      // the error is a plain Error from somewhere unexpected.
+      return finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED));
     }
     if (cancelled) {
       try { sessionHandle.cancel(); } catch { /* swallow */ }
@@ -3471,14 +3524,15 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
         finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
       },
       onError: (err) => {
-        finalize({
-          text: accumulated,
-          sessionId: sessionHandle.sessionId,
-          code: cancelled ? 0 : 1,
-          usage: {},
-          raw: accumulated,
-          stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
-        });
+        if (cancelled) {
+          // User-driven cancel — not a real error, treat as a clean exit.
+          finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
+          return;
+        }
+        // Pool stamps `.code` (worker-died for mid-stream proc exit).
+        // Fallback default is worker-died because the stream onError is
+        // overwhelmingly fired from the post-handshake exit handler.
+        finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_DIED));
       },
     });
   })();
@@ -3975,12 +4029,65 @@ async function _retryDocChatAfterResumeFailure({ result, initialPass, freshSessi
 // Shape the per-failure debug envelope (raw stderr + classification metadata)
 // shared by hard failures and partial recoveries — keeps the wire shape in lockstep.
 function _buildDocChatErrorEnvelope(result) {
+  // W-mpmwxni2000c25c7-b — also surface the typed `error: {message, code,
+  // retriable}` envelope when llm.callLLM* produced one, so doc-chat clients
+  // get the same shape Command Center handlers emit.
+  const typed = result && result.error; // NOTE(review): result is null-guarded here but dereferenced unguarded below (result.code) - a null result would still throw
   return {
     code: result.code ?? null,
     stderr: String(result.stderr || '').slice(-2048), // keep only the last 2048 characters of stderr
     errorClass: result.errorClass || null,
     errorMessage: result.errorMessage || null,
     runtime: result.runtime || null,
+    ...(typed ? { // the three typed fields are added only when result.error is truthy
+      typedCode: typed.code || null,
+      typedMessage: typed.message || null,
+      retriable: typed.retriable !== false, // anything other than an explicit false counts as retriable
+    } : {}),
+  };
+}
+// W-mpmwxni2000c25c7-b — race a ccDocCall* promise against a wall-clock turn
+// timer. On expiry, fires `abortFn` (killing the in-flight CLI) and resolves
+// with a doc-chat-shaped failure payload that flows through the existing
+// _docChatFailureResponse / SSE error event paths. timeoutMs <= 0 disables
+// the watchdog (passthrough).
+async function _raceCcDocChatTimeout(callPromise, timeoutMs, abortFn, label) { // resolves with the call result, or with a synthetic cc-turn-timeout payload once timeoutMs elapses
+  if (!timeoutMs || timeoutMs <= 0) return callPromise; // falsy or non-positive timeout: watchdog disabled, pass the call through
+  let timer = null;
+  let timedOut = false;
+  const timeoutPromise = new Promise((resolve) => {
+    timer = setTimeout(() => {
+      timedOut = true; // set before resolving so the check after the race can tell which side won
+      try { if (abortFn) abortFn(); } catch { /* swallow */ }
+      resolve(null);
+    }, timeoutMs);
+    // NOTE: do NOT unref — Node would exit the event loop while awaiting the
+    // call promise (Promises don't keep the loop open; timers/I/O do). Cleared
+    // immediately on the success path below.
+  });
+  const winner = await Promise.race([callPromise, timeoutPromise]); // NOTE(review): if callPromise rejects first, this await throws and the clearTimeout below is skipped - the timer stays armed and still fires abortFn after timeoutMs
+  if (!timedOut) {
+    clearTimeout(timer);
+    return winner;
+  }
+  // Drain the in-flight call so its cleanup runs before we hand back the
+  // synthetic envelope.
+  await callPromise.catch(() => null); // NOTE(review): this wait has no bound - if abortFn does not actually end the call, the turn never returns despite the timeout
+  const message = `${label || 'doc-chat'} turn timed out after ${timeoutMs}ms`;
+  return { // synthetic result: the real call outcome is discarded once the timeout has fired
+    answer: 'Document chat request timed out — try again.',
+    toolUses: [],
+    error: { // carries both the errorClass/errorMessage fields and the typed* fields so either reader sees cc-turn-timeout
+      code: 'cc-turn-timeout',
+      stderr: '',
+      errorClass: 'cc-turn-timeout',
+      errorMessage: message,
+      runtime: null,
+      typedCode: 'cc-turn-timeout',
+      typedMessage: message,
+      retriable: true,
+    },
   };
 }
@@ -4014,7 +4121,13 @@ function _recoverPartialDocChatResponse(result, sessionKey) {
 function _shouldSuppressDocChatPostPatchError(ccError, finalize) {
   if (!finalize || finalize.edited !== true) return false; // only consider suppression when the edit is confirmed as applied
-  if (!ccError || ccError.errorClass !== 'unknown-model') return false;
+  if (!ccError) return false; // nothing to suppress
+  // W-mpmwxni2000c25c7-a — accept both the legacy 'unknown-model' errorClass
+  // (still emitted by the dashboard preflight at _preflightModelCheck) and
+  // the new 'model-unavailable' code returned by copilot.parseError for
+  // invalid-model responses. Suppress in either case — the edit already
+  // landed, so the stale model error shouldn't surface to the user.
+  if (ccError.errorClass !== 'unknown-model' && ccError.errorClass !== 'model-unavailable') return false; // any other error class is never suppressed
   return String(ccError.runtime || '').toLowerCase() === 'copilot'; // suppression applies to the copilot runtime only, compared case-insensitively
 }
@@ -6729,7 +6842,14 @@ What would you like to discuss or change? When you're happy, say "approve" and I
       const ccTurnId = 'cct-' + shared.uid();
       const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
-      let { answer, partial, warning, toolUses, error: ccError } = await ccDocCall({
+      // W-mpmwxni2000c25c7-b — wall-clock turn watchdog. The doc-chat call
+      // can internally spawn resume + fresh + final-retry LLM calls; we want
+      // ONE wall-clock cap that covers the whole turn so a runtime stuck
+      // mid-stream can't outlive ccTurnTimeoutMs. On expiry the watchdog
+      // calls _docAbort (kills the in-flight CLI) and the synthesized payload
+      // below flows through the existing _docChatFailureResponse path.
+      const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
+      const _docCallPromise = ccDocCall({
         message: body.message, document: currentContent, title: body.title,
         filePath: body.filePath, selection: body.selection, canEdit, isJson,
         model: body.model || undefined,
@@ -6739,6 +6859,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
         systemPrompt: turnSystemPrompt,
         turnId: ccTurnId,
       });
+      const _docCallResult = await _raceCcDocChatTimeout(_docCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat');
+      let { answer, partial, warning, toolUses, error: ccError } = _docCallResult;
       const finalize = _finalizeDocChatEdit({
         filePath: body.filePath, fullPath, isJson, canEdit,
         originalContent: currentContent, delimiterContent: null,
@@ -6752,6 +6874,25 @@ What would you like to discuss or change? When you're happy, say "approve" and I
         ccError, partial, warning, toolUses, finalize,
       });
       _docDone = true;
+      // W-mpmwxni2000c25c7-b — track every surfaced doc-chat error code so
+      // /api/metrics reflects silent-error regressions. Hard failures (no
+      // partial recovery, no edited file) graduate to 5xx so the client can
+      // render a real error UI instead of treating the polite "Failed to
+      // process request" string as a successful turn.
+      if (ccError) {
+        const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
+        llm.trackEngineError('doc-chat', errCode);
+        const isHardFailure = !partial && !(finalize && finalize.edited);
+        if (isHardFailure) {
+          const status = errCode === shared.FAILURE_CLASS.CONFIG_ERROR ? 503 : 502;
+          return jsonReply(res, status, {
+            ...payload,
+            error: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
+            code: errCode,
+            retriable: ccError.retriable !== false,
+          });
+        }
+      }
       return jsonReply(res, 200, payload);
       } finally { _docAbort = null; _docDone = true; docChatInFlight.delete(docKey); }
     } catch (e) { return jsonReply(res, e.statusCode || 500, { error: e.message }); }
@@ -6840,7 +6981,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
         const ccTurnId = 'cct-' + shared.uid();
         const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
-        let { answer, partial, warning, toolUses, error: ccError } = await ccDocCallStreaming({
+        // W-mpmwxni2000c25c7-b — wall-clock turn watchdog (mirrors the
+        // non-stream handleDocChat path). On expiry _docAbort kills the
+        // in-flight LLM and the synthesized payload below flows through the
+        // SSE done frame the client already expects with `error` set.
+        const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
+        const _docStreamCallPromise = ccDocCallStreaming({
           message: body.message, document: currentContent, title: body.title,
           filePath: body.filePath, selection: body.selection, canEdit, isJson,
           model: body.model || undefined,
@@ -6853,6 +6999,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           systemPrompt: turnSystemPrompt,
           turnId: ccTurnId,
         });
+        const _docStreamResult = await _raceCcDocChatTimeout(_docStreamCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat-stream');
+        let { answer, partial, warning, toolUses, error: ccError } = _docStreamResult;
         const finalize = _finalizeDocChatEdit({
           filePath: body.filePath, fullPath, isJson, canEdit,
           originalContent: currentContent, delimiterContent: null,
@@ -6865,6 +7013,23 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           actionFeedback: null, actionParseError: null,
           ccError, partial, warning, toolUses, finalize,
         });
+        // W-mpmwxni2000c25c7-b — track surfaced doc-chat error codes for
+        // /api/metrics and emit a named SSE `event: error` frame so the
+        // client can render a typed error instead of treating the polite
+        // fallback string as a normal completion.
+        if (ccError) {
+          const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
+          llm.trackEngineError('doc-chat', errCode);
+          const isHardFailure = !partial && !(finalize && finalize.edited);
+          if (isHardFailure) {
+            const errPayload = {
+              message: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
+              code: errCode,
+              retriable: ccError.retriable !== false,
+            };
+            try { res.write(`event: error\ndata: ${JSON.stringify(errPayload)}\n\n`); } catch {}
+          }
+        }
         const { answer: finalAnswer, ...donePayload } = payload;
         writeDocEvent({
           type: 'done',
@@ -7461,21 +7626,40 @@ What would you like to discuss or change? When you're happy, say "approve" and I
         // confirmation chips in the assistant reply.
         const ccTurnId = 'cct-' + shared.uid();
         const turnSystemPrompt = renderCcSystemPromptForTurn(ccTurnId);
-        const result = await ccCall(body.message, { store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId });
-        // Non-zero exit with text = max_turns or partial success — still usable
-        if (!result.text) {
+        // W-mpmwxni2000c25c7-b — wall-clock turn watchdog. On expiry the
+        // in-flight LLM call is aborted and ccCall returns a synthetic
+        // envelope with error.code === 'cc-turn-timeout'.
+        const turnTimeoutMs = _resolveCcTurnTimeoutMs();
+        const result = await llm.withCcTurnTimeout({
+          timeoutMs: turnTimeoutMs, label: 'command-center',
+        }, (registerAbort) => ccCall(body.message, {
+          store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId,
+          onAbortReady: registerAbort,
+        }));
+        // W-mpmwxni2000c25c7-b — typed-error envelope path. Any failure that
+        // produced no usable text is surfaced to the client as 5xx JSON
+        // `{ error, code, retriable }` instead of a polite 200 "I had trouble
+        // processing that" string that silently halves CC retry signal.
+        if (!result.text || result.error) {
+          const errEnvelope = result.error || (result.errorMessage
+            ? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
+            : { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
+          llm.trackEngineError('command-center', errEnvelope.code);
           const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
           const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-5).join(' | ');
-          console.error(`[CC] LLM failed after retries ${debugInfo}: ${stderrTail}`);
-          try { shared.log('warn', `CC failed ${debugInfo}: ${stderrTail.slice(0, 300)}`); } catch {}
-          const hasSession = !!ccSession.sessionId;
-          const retryHint = hasSession
-            ? 'Your session is still active — just send your message again to retry.'
-            : 'Try clicking **New Session** and sending your message again.';
-          return jsonReply(res, 200, {
-            text: `I had trouble processing that ${debugInfo}. ${stderrTail ? 'Detail: ' + stderrTail : ''}\n\n${retryHint}`,
-            actions: [], sessionId: ccSession.sessionId
+          console.error(`[CC] LLM failed after retries ${debugInfo} code=${errEnvelope.code}: ${stderrTail}`);
+          try { shared.log('warn', `CC failed ${debugInfo} code=${errEnvelope.code}: ${stderrTail.slice(0, 300)}`); } catch {}
+          // Missing-runtime is a 503 (service config); auth-failure also 503; other classes 502.
+          const status = result.missingRuntime ? 503
+            : errEnvelope.code === 'auth-failure' ? 503
+              : 502;
+          return jsonReply(res, status, {
+            error: errEnvelope.message,
+            code: errEnvelope.code,
+            retriable: !!errEnvelope.retriable,
+            sessionId: ccSession.sessionId || null,
+            ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}),
           });
         }
@@ -7496,7 +7680,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
       } finally {
         _releaseCCTab(tabId);
       }
-    } catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message }); }
+    } catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message, code: 'handler-exception', retriable: false }); }
   }
   /** Build a lightweight input object for SSE tool events — keeps only the fields formatToolSummary needs, with truncated string values. */
@@ -7618,6 +7802,11 @@ What would you like to discuss or change? When you're happy, say "approve" and I
         });
       } catch (err) {
         _emitTimingLog(null, null, Date.now(), 'spawn-failed');
+        // W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
+        // (`code`, `retriable`) onto the envelope so the SSE writer can
+        // render a structured error event instead of grepping the stderr
+        // string. Pool stamps `.code` (worker-spawn-failed or
+        // acp-handshake-failed) on every getSession rejection.
         return resolveResult({
           text: '',
           sessionId: null,
@@ -7625,6 +7814,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           usage: {},
           raw: '',
           stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
+          errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED,
+          errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
         });
       }
       const _tSessionReady = Date.now();
@@ -7671,13 +7862,29 @@ What would you like to discuss or change? When you're happy, say "approve" and I
         },
         onError: (err) => {
           _emitTimingLog(_lifecycle, _tSessionReady, Date.now(), cancelled ? 'cancelled' : 'error');
+          if (cancelled) {
+            resolveResult({
+              text: accumulated,
+              sessionId: sessionHandle.sessionId,
+              code: 0,
+              usage: {},
+              raw: accumulated,
+              stderr: '',
+            });
+            return;
+          }
+          // W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
+          // through. mid-stream worker exits stamp `.code = 'worker-died'`
+          // on the Error before invoking onError.
           resolveResult({
             text: accumulated,
             sessionId: sessionHandle.sessionId,
-            code: cancelled ? 0 : 1,
+            code: 1,
             usage: {},
             raw: accumulated,
             stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
+            errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_DIED,
+            errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
           });
         },
       });
@@ -8032,73 +8239,100 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           : '';
         const prompt = _joinCcPromptParts(preamble, resumeGuard, carryover, turnHeader, projectContextPart, body.message);
-        const { trackEngineUsage: trackUsage } = require('./engine/llm');
+        const { trackEngineUsage: trackUsage, trackEngineError: trackErr, withCcTurnTimeout: withTimeout } = require('./engine/llm');
         const streamModel = CONFIG.engine?.ccModel || shared.ENGINE_DEFAULTS.ccModel;
         const streamEffort = CONFIG.engine?.ccEffort || shared.ENGINE_DEFAULTS.ccEffort;
         const ccMaxTurns = CONFIG.engine?.ccMaxTurns || shared.ENGINE_DEFAULTS.ccMaxTurns;
         let toolUses = [];
-        const llmPromise = _invokeCcStream({
-          prompt, sessionId, liveState, toolUses,
-          model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
-          engineConfig: CONFIG.engine,
-          systemPrompt: turnSystemPrompt,
-          tabId,
-        });
-        _ccStreamAbort = llmPromise.abort;
-        liveState.abortFn = _ccStreamAbort;
-        ccInFlightAborts.set(tabId, _ccStreamAbort);
-        const result = await llmPromise;
-        trackUsage('command-center', result.usage);
-        if (result.missingRuntime) {
-          finishMissingRuntime(result, liveState);
-          return;
-        }
-        // Handle failure — non-zero exit with text = max_turns or partial success, still usable
-        if (!result.text && wasResume && result.code !== 0 && !req.destroyed) {
-          // Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
-          console.log(`[CC-stream] Resume failed (code=${result.code}) — retrying fresh`);
-          const freshPreamble = buildCCStatePreamble();
-          const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
-          const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
-          toolUses = []; // discard stale metadata from the failed resume attempt
-          const retryPromise = _invokeCcStream({
-            prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
+        // W-mpmwxni2000c25c7-b — turn-level watchdog. Wraps the initial
+        // _invokeCcStream PLUS the post-resume-fail retry so the wall clock
+        // covers the entire CC turn (not just one underlying LLM call). On
+        // expiry, whichever call is in flight is aborted; the watchdog
+        // resolves with a synthetic `{ error: { code: 'cc-turn-timeout' } }`
+        // envelope so the SSE error path below kicks in.
+        const turnTimeoutMs = _resolveCcTurnTimeoutMs();
+        const result = await withTimeout({
+          timeoutMs: turnTimeoutMs, label: 'command-center-stream',
+        }, async (registerAbort) => { // Callback runs the first attempt and, if a resume fails, one fresh-session retry.
+          const llmPromise = _invokeCcStream({
+            prompt, sessionId, liveState, toolUses,
             model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
             engineConfig: CONFIG.engine,
             systemPrompt: turnSystemPrompt,
             tabId,
           });
-          _ccStreamAbort = retryPromise.abort;
+          _ccStreamAbort = llmPromise.abort;
           liveState.abortFn = _ccStreamAbort;
           ccInFlightAborts.set(tabId, _ccStreamAbort);
-          const retryResult = await retryPromise;
-          trackUsage('command-center', retryResult.usage);
-          if (retryResult.text) {
-            // Fresh session succeeded — use retryResult from here
-            Object.assign(result, retryResult);
+          registerAbort(_ccStreamAbort); // hand the in-flight call's abort function to the watchdog
+          const initial = await llmPromise;
+          trackUsage('command-center', initial.usage);
+          if (initial.missingRuntime) return initial; // returned untouched; the missingRuntime check after the watchdog handles it
+          // Handle failure — non-zero exit with text = max_turns or partial success, still usable
+          if (!initial.text && wasResume && initial.code !== 0 && !req.destroyed) {
+            // Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
+            console.log(`[CC-stream] Resume failed (code=${initial.code}) — retrying fresh`);
+            const freshPreamble = buildCCStatePreamble();
+            const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
+            const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
+            toolUses = []; // discard stale metadata from the failed resume attempt
+            const retryPromise = _invokeCcStream({
+              prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
+              model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
+              engineConfig: CONFIG.engine,
+              systemPrompt: turnSystemPrompt,
+              tabId,
+            });
+            _ccStreamAbort = retryPromise.abort;
+            liveState.abortFn = _ccStreamAbort;
+            ccInFlightAborts.set(tabId, _ccStreamAbort);
+            registerAbort(_ccStreamAbort); // re-register so a timeout aborts the retry, which is now the call in flight
+            const retryResult = await retryPromise;
+            trackUsage('command-center', retryResult.usage);
+            if (retryResult.text) {
+              // Fresh session succeeded — use retryResult from here
+              Object.assign(initial, retryResult);
+              // Clear the error envelope inherited from the failed first attempt
+              // so the success path below doesn't misclassify a recovered turn.
+              if (retryResult.text) { initial.error = null; initial.ok = true; } // NOTE(review): repeats the enclosing `if (retryResult.text)`; always true here
+            } else if (retryResult.error) { // retry failed too: report the retry's error envelope instead of the first attempt's
+              initial.error = retryResult.error;
+            }
           }
-        }
+          return initial;
+        });
         if (result.missingRuntime) {
           finishMissingRuntime(result, liveState);
           return;
         }
-        if (!result.text) {
+        if (!result.text || result.error) {
           if (req.destroyed) {
             _ccStreamEnded = true;
             _logCcStreamEnd(_ccTelemetry, 'llm-empty-client-gone', { code: result.code });
             return;
           }
-          const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
+          // W-mpmwxni2000c25c7-b — surface the typed error envelope as a
+          // distinct SSE `event: error` frame so the client renders a real
+          // error UI (with a retry hint derived from `retriable`) instead of
+          // swallowing a polite 200 "I had trouble processing that" string.
+          const envelope = result.error || (result.errorMessage
+            ? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
+            : { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
+          trackErr('command-center', envelope.code);
           const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-3).join(' | ');
-          console.error(`[CC-stream] Failed: code=${result.code}, stderr=${(result.stderr || '').slice(0, 500)}, stdout_tail=${(result.raw || '').slice(-500)}`);
-          const retryHint = 'Send your message again to retry.';
-          liveState.donePayload = { type: 'done', text: `I had trouble processing that ${debugInfo}. ${stderrTail ? 'Detail: ' + stderrTail : ''}\n\n${retryHint}`, actions: [], sessionId: null };
+          console.error(`[CC-stream] Failed code=${envelope.code} retriable=${envelope.retriable}: ${(result.stderr || '').slice(0, 500)}; stdout_tail=${(result.raw || '').slice(-500)}`);
+          // Emit `event: error` (named SSE frame), then a `done`-style frame
+          // for clients that only handle the default message channel, then
+          // close cleanly so the EventSource exits its read loop without
+          // throwing a connection-reset.
+          try { res.write(`event: error\ndata: ${JSON.stringify({ message: envelope.message, code: envelope.code, retriable: !!envelope.retriable, ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}) })}\n\n`); } catch {}
+          liveState.donePayload = { type: 'error', error: envelope.message, code: envelope.code, retriable: !!envelope.retriable, sessionId: null };
           if (liveState.writer) liveState.writer(liveState.donePayload);
           if (liveState.endResponse) liveState.endResponse();
           _scheduleCcLiveCleanup(tabId);
-          _logCcStreamEnd(_ccTelemetry, 'llm-failed-fallback-sent', { code: result.code });
+          _logCcStreamEnd(_ccTelemetry, 'llm-failed-error-envelope-sent', { code: result.code, errorCode: envelope.code });
           return;
         }
@@ -8670,7 +8904,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           if (_isClear(e.defaultModel)) _deleteEngineConfig('defaultModel');
           else {
             const candidate = String(e.defaultModel);
-            const resolvedCli = config.engine.defaultCli || 'claude';
+            const resolvedCli = config.engine.defaultCli || 'copilot';
             const rejection = await _validateFleetModel(candidate, resolvedCli);
             if (rejection) _clamped.push(`engine.defaultModel: "${candidate}" ${rejection} — kept previous value`);
             else _setEngineConfig('defaultModel', candidate);
@@ -8680,7 +8914,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
           if (_isClear(e.ccModel)) _deleteEngineConfig('ccModel');
           else {
             const candidate = String(e.ccModel);
-            const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'claude';
+            const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'copilot';
             const rejection = await _validateFleetModel(candidate, resolvedCli);
             if (rejection) _clamped.push(`engine.ccModel: "${candidate}" ${rejection} — kept previous value`);
             else _setEngineConfig('ccModel', candidate);
@@ -8798,7 +9032,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
             if (updates.model === '' || updates.model === null) delete config.agents[id].model;
             else {
               const candidate = String(updates.model);
-              const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'claude';
+              const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'copilot';
               const runtimeModelStr = _resolveModelForRuntime(candidate, resolvedCli);
               const knownModels = await _modelsFor(resolvedCli);
               // Two validation paths:
@@ -9792,12 +10026,6 @@ What would you like to discuss or change? When you're happy, say "approve" and I
     { method: 'GET', path: '/api/qa/runs', desc: 'List QA validation runs (newest first). Optional ?limit=N and ?status=pending|running|passed|failed|errored filters.', handler: handleQaRunsList },
     { method: 'GET', path: /^\/api\/qa\/runs\/([^/?]+)$/, template: '/api/qa/runs/<id>', desc: 'Fetch a single QA run record by id.', handler: handleQaRunsById },
     { method: 'GET', path: /^\/api\/qa\/artifacts\/([^/?]+)\/([^?]+)$/, template: '/api/qa/artifacts/<runId>/<file>', desc: 'Serve a QA artifact file (image/video/log). Sandboxed to engine/qa-artifacts/; rejects path traversal with 403.', handler: handleQaArtifact },
-    { method: 'GET', path: '/api/hot-reload', desc: 'SSE stream for dashboard hot-reload notifications', handler: (req, res) => {
-      res.writeHead(200, { 'Content-Type': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive' });
-      res.write('data: connected\n\n');
-      _trackSseClient(_hotReloadClients, req, res);
-    }},
     // QA Runbooks (W-mpeiwz6k0005bf34-a) — per-project test plans stored at
     // <MINIONS_DIR>/projects/<name>/runbooks/<id>.json. Pure persistence —
     // dispatch + run records + UI live in follow-up plan items.
@@ -10020,6 +10248,18 @@ What would you like to discuss or change? When you're happy, say "approve" and I
       })();
     }},
+    { method: 'POST', path: '/api/pull-requests/observe', desc: 'Toggle auto-observe (_contextOnly flag) on a tracked PR', params: 'host (github|ado), slug, number, observe (boolean)', handler: async (req, res) => {
+      const body = await readBody(req); // parsed request body is passed unchanged to updatePullRequestObserveFlag
+      reloadConfig(); // NOTE(review): presumably refreshes CONFIG before it is handed to the helper — confirm
+      try {
+        const result = updatePullRequestObserveFlag(body, CONFIG);
+        invalidateStatusCache(); // reached only when the helper did not throw
+        return jsonReply(res, 200, { ok: true, ...result, observe: !result._contextOnly }); // `observe` is the inverse of `_contextOnly`; NOTE(review): assumes that flag is a top-level field of the helper's result — confirm
+      } catch (e) {
+        return jsonReply(res, e.statusCode || 400, { error: e.message }); // errors without a `statusCode` are reported as 400
+      }
+    }},
     { method: 'POST', path: '/api/pull-requests/delete', desc: 'Remove a PR from tracking', params: 'id, project?', handler: async (req, res) => {
       const body = await readBody(req);
       const { id } = body;
@@ -10629,6 +10869,7 @@ module.exports = {
   _buildDocChatResponsePayload,
   _inferDocChatProject,
   _linkPullRequestForTracking: linkPullRequestForTracking,
+  _updatePullRequestObserveFlag: updatePullRequestObserveFlag,
   _resolveSkillReadPath,
   // Per-CC-turn correlation surface
   _ccTurnCreations,