@yemi33/minions 0.1.2044 → 0.1.2046

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dashboard.js CHANGED
@@ -2042,6 +2042,13 @@ function _markStatusCacheBuilt() {
2042
2042
  _statusCacheJson = null;
2043
2043
  _statusCacheGzip = null;
2044
2044
  _statusCacheVersion++;
2045
+ // A4: keep body.version.statusCacheVersion in sync with the ETag we're
2046
+ // about to return. Without this, the field lags by up to 60s because
2047
+ // `version` is built in slow-state, making refresh-diagnostics unable
2048
+ // to distinguish "server cache pinned" from "slow-state TTL not expired."
2049
+ if (_statusCache && _statusCache.version) {
2050
+ _statusCache.version.statusCacheVersion = _statusCacheVersion;
2051
+ }
2045
2052
  }
2046
2053
 
2047
2054
  function getStatus() {
@@ -2330,6 +2337,17 @@ const CC_LOG_ERROR_MAX_LEN = 80; // truncate exception messages in [cc-stream] l
2330
2337
  const CC_STREAM_REATTACH_GRACE_MS = 60000; // keep CC job alive briefly after disconnect so the UI can reattach
2331
2338
  const CC_STREAM_DONE_RETENTION_MS = 30000; // retain final payload briefly so reconnect can still receive it
2332
2339
  const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
2340
/**
 * W-mpmwxni2000c25c7-b — resolve the wall-clock cap (ms) for one CC/doc-chat turn.
 *
 * Evaluated on every call rather than cached, so a changed
 * `CONFIG.engine.ccTurnTimeoutMs` is picked up without a code change.
 * Resolution order:
 *   1. `CONFIG.engine.ccTurnTimeoutMs`, when it is a finite number;
 *   2. otherwise `shared.ENGINE_DEFAULTS.ccTurnTimeoutMs`.
 *
 * A non-numeric config value (e.g. a numeric string) is ignored in favour of
 * the default, because `Number.isFinite` does not coerce.
 *
 * @returns {number} The resolved timeout in milliseconds, or `0` when the
 *   resolved value is non-finite, zero, or negative. `_raceCcDocChatTimeout`
 *   treats a value <= 0 as "watchdog disabled".
 */
function _resolveCcTurnTimeoutMs() {
  const engineConfig = CONFIG ? CONFIG.engine : undefined;

  let timeoutMs;
  if (engineConfig && Number.isFinite(engineConfig.ccTurnTimeoutMs)) {
    timeoutMs = engineConfig.ccTurnTimeoutMs;
  } else {
    timeoutMs = shared.ENGINE_DEFAULTS.ccTurnTimeoutMs;
  }

  // Anything that is not a strictly positive finite number collapses to 0.
  if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) return 0;
  return timeoutMs;
}
2333
2351
  // Doc-chat is interactive — long-doc edits with multi-step Read+Write tool use can run
2334
2352
  // well past 5 min on `canEdit:true` paths. Bumped to 1 hour (matching CC) so legitimate
2335
2353
  // edits aren't killed mid-stream and the backend timeout never beats the user's reading
@@ -3384,6 +3402,22 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3384
3402
  let timeoutTimer = null;
3385
3403
  let resolveResult;
3386
3404
  const promise = new Promise((resolve) => { resolveResult = resolve; });
3405
// W-mpmwxni2000c25c7-c — build the single failure-envelope shape used by the
// failure paths of this pool invocation (timeout, getSession rejection,
// stream onError).
//
//   err         — an Error (optionally stamped with `.code` / `.retriable`),
//                 a plain value, or nothing.
//   defaultCode — error code used when `err` carries no truthy `.code`.
//
// The result always has `code: 1` and carries whatever text was accumulated
// so far in both `text` and `raw`. `errorCode` falls back to `defaultCode`,
// then to null; `errorRetriable` defaults to true unless `err.retriable` is
// explicitly defined.
const _failureEnvelope = (err, defaultCode) => {
  const partialText = accumulated;
  const carriesRetriableHint = Boolean(err) && err.retriable !== undefined;
  const stderrText = String((err && err.message) || err || 'cc-worker-pool failure');
  const resolvedCode = (err && err.code) || defaultCode || null;

  return {
    text: partialText,
    sessionId: sessionHandle ? sessionHandle.sessionId : null,
    code: 1,
    usage: {},
    raw: partialText,
    stderr: stderrText,
    errorCode: resolvedCode,
    errorRetriable: carriesRetriableHint ? err.retriable : true,
  };
};
3387
3421
  const finalize = (envelope) => {
3388
3422
  if (settled) return;
3389
3423
  settled = true;
@@ -3408,14 +3442,18 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3408
3442
  timeoutTimer = setTimeout(() => {
3409
3443
  try { sessionHandle && sessionHandle.cancel(); } catch { /* swallow */ }
3410
3444
  try { ccWorkerPool.closeTab(tabKey); } catch { /* swallow */ }
3411
- finalize({
3412
- text: accumulated,
3413
- sessionId: sessionHandle ? sessionHandle.sessionId : null,
3414
- code: 1,
3415
- usage: {},
3416
- raw: accumulated,
3417
- stderr: `doc-chat-pool: timeout after ${timeoutMs}ms`,
3418
- });
3445
+ // W-mpmwxni2000c25c7-c — convert the legacy synthesized
3446
+ // `{ code: 1, stderr: 'doc-chat-pool: timeout after Xms' }` shape into
3447
+ // a typed-error envelope so the SSE writer can render the same
3448
+ // structured error event for timeouts as for spawn/handshake/exit
3449
+ // failures. The error code carries `cc-turn-timeout`; consumers
3450
+ // grep on that instead of parsing the stderr string.
3451
+ const timeoutErr = ccWorkerPool._typedError(
3452
+ `doc-chat-pool: timeout after ${timeoutMs}ms`,
3453
+ ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT,
3454
+ true
3455
+ );
3456
+ finalize(_failureEnvelope(timeoutErr, ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT));
3419
3457
  }, timeoutMs);
3420
3458
  if (typeof timeoutTimer.unref === 'function') timeoutTimer.unref();
3421
3459
  }
@@ -3429,14 +3467,10 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3429
3467
  systemPromptHash: _docChatPromptHash,
3430
3468
  });
3431
3469
  } catch (err) {
3432
- return finalize({
3433
- text: '',
3434
- sessionId: null,
3435
- code: 1,
3436
- usage: {},
3437
- raw: '',
3438
- stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
3439
- });
3470
+ // Pool stamps `.code` (worker-spawn-failed / acp-handshake-failed) on
3471
+ // every error from getSession; fall back to worker-spawn-failed if
3472
+ // the error is a plain Error from somewhere unexpected.
3473
+ return finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED));
3440
3474
  }
3441
3475
  if (cancelled) {
3442
3476
  try { sessionHandle.cancel(); } catch { /* swallow */ }
@@ -3464,14 +3498,15 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
3464
3498
  finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
3465
3499
  },
3466
3500
  onError: (err) => {
3467
- finalize({
3468
- text: accumulated,
3469
- sessionId: sessionHandle.sessionId,
3470
- code: cancelled ? 0 : 1,
3471
- usage: {},
3472
- raw: accumulated,
3473
- stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
3474
- });
3501
+ if (cancelled) {
3502
+ // User-driven cancel — not a real error, treat as a clean exit.
3503
+ finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
3504
+ return;
3505
+ }
3506
+ // Pool stamps `.code` (worker-died for mid-stream proc exit).
3507
+ // Fallback default is worker-died because the stream onError is
3508
+ // overwhelmingly fired from the post-handshake exit handler.
3509
+ finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_DIED));
3475
3510
  },
3476
3511
  });
3477
3512
  })();
@@ -3968,12 +4003,65 @@ async function _retryDocChatAfterResumeFailure({ result, initialPass, freshSessi
3968
4003
  // Shape the per-failure debug envelope (raw stderr + classification metadata)
3969
4004
  // shared by hard failures and partial recoveries — keeps the wire shape in lockstep.
3970
4005
  function _buildDocChatErrorEnvelope(result) {
4006
+ // W-mpmwxni2000c25c7-b — also surface the typed `error: {message, code,
4007
+ // retriable}` envelope when llm.callLLM* produced one, so doc-chat clients
4008
+ // get the same shape Command Center handlers emit.
4009
+ const typed = result && result.error;
3971
4010
  return {
3972
4011
  code: result.code ?? null,
3973
4012
  stderr: String(result.stderr || '').slice(-2048),
3974
4013
  errorClass: result.errorClass || null,
3975
4014
  errorMessage: result.errorMessage || null,
3976
4015
  runtime: result.runtime || null,
4016
+ ...(typed ? {
4017
+ typedCode: typed.code || null,
4018
+ typedMessage: typed.message || null,
4019
+ retriable: typed.retriable !== false,
4020
+ } : {}),
4021
+ };
4022
+ }
4023
+
4024
/**
 * W-mpmwxni2000c25c7-b — race a ccDocCall* promise against a wall-clock turn
 * timer.
 *
 * If the call settles first, its outcome is passed through unchanged
 * (fulfilment value or rejection). If the timer fires first, `abortFn` is
 * invoked, the in-flight call is drained, and a synthetic doc-chat-shaped
 * failure payload with code `cc-turn-timeout` is returned.
 *
 * @param {Promise<*>} callPromise The in-flight doc-chat call.
 * @param {number} timeoutMs Wall-clock cap in ms; a falsy or <= 0 value
 *   disables the watchdog (pure passthrough).
 * @param {Function} [abortFn] Called once on expiry to cancel the in-flight
 *   work. Anything it throws is swallowed.
 * @param {string} [label] Prefix for the timeout message; defaults to
 *   'doc-chat'.
 * @returns {Promise<*>} The call's own result, or the synthetic timeout
 *   payload `{ answer, toolUses, error }`.
 */
async function _raceCcDocChatTimeout(callPromise, timeoutMs, abortFn, label) {
  if (!timeoutMs || timeoutMs <= 0) return callPromise;

  let timer = null;
  let timedOut = false;
  const timeoutPromise = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      // Best-effort abort: a throwing abortFn must not break the watchdog.
      try { if (abortFn) abortFn(); } catch { /* swallow */ }
      resolve(null);
    }, timeoutMs);
    // NOTE: do NOT unref — Node would exit the event loop while awaiting the
    // call promise (Promises don't keep the loop open; timers/I/O do). The
    // timer is cleared as soon as the race settles, below.
  });

  let winner;
  try {
    winner = await Promise.race([callPromise, timeoutPromise]);
  } finally {
    // Clear on EVERY exit path. Previously a rejected callPromise skipped the
    // clear, leaving a live timer that kept the event loop open and later
    // fired abortFn for a turn that had already ended.
    clearTimeout(timer);
  }
  if (!timedOut) return winner;

  // Drain the in-flight call so its cleanup runs before we hand back the
  // synthetic envelope; its late result or rejection is deliberately dropped.
  // NOTE(review): if abortFn is missing or does not make callPromise settle,
  // this await never returns — confirm every caller passes a working abort.
  await Promise.resolve(callPromise).catch(() => null);

  const message = `${label || 'doc-chat'} turn timed out after ${timeoutMs}ms`;
  return {
    answer: 'Document chat request timed out — try again.',
    toolUses: [],
    error: {
      code: 'cc-turn-timeout',
      stderr: '',
      errorClass: 'cc-turn-timeout',
      errorMessage: message,
      runtime: null,
      typedCode: 'cc-turn-timeout',
      typedMessage: message,
      retriable: true,
    },
  };
}
3979
4067
 
@@ -5848,19 +5936,17 @@ const server = http.createServer(async (req, res) => {
5848
5936
  }
5849
5937
 
5850
5938
  async function handleKnowledgeSweep(req, res) {
5851
- // Source of truth = kb-sweep-state.json + PID liveness. The sweep now runs
5852
- // as a detached child (engine/kb-sweep-runner.js) so it survives
5853
- // `minions restart`; the in-memory `global._kbSweep*` flags from the old
5854
- // in-process implementation are gone.
5939
+ // Source of truth = kb-sweep-state.json + PID liveness. The sweep runs as
5940
+ // a detached child (engine/kb-sweep-runner.js) so it survives
5941
+ // `minions restart`. The actual spawn logic lives in
5942
+ // engine/kb-sweep.js::spawnSweepRunnerDetached shared with the engine's
5943
+ // auto-sweep tick phase.
5855
5944
  const {
5856
- readSweepLiveness, staleGuardMs, KB_SWEEP_STATE_PATH, KB_SWEEP_LOG_PATH, KB_SWEEP_RUNNER_PATH,
5945
+ readSweepLiveness, staleGuardMs, spawnSweepRunnerDetached, KB_SWEEP_STATE_PATH,
5857
5946
  } = require('./engine/kb-sweep');
5858
5947
  const entryCount = ((await queries.getKnowledgeBaseEntries()) || []).length;
5859
5948
  const guardMs = staleGuardMs(entryCount);
5860
5949
 
5861
- // Synchronous pre-claim BEFORE awaiting the body so a concurrent POST
5862
- // arriving in the same tick sees in-flight state and can't double-spawn.
5863
- const sweepToken = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
5864
5950
  const liveness = readSweepLiveness({ entryCount });
5865
5951
  if (liveness.inFlight && liveness.stale) {
5866
5952
  const reason = !liveness.alive
@@ -5875,79 +5961,15 @@ const server = http.createServer(async (req, res) => {
5875
5961
  });
5876
5962
  }
5877
5963
 
5878
- // Claim the slot synchronously by writing a "starting" state. The runner
5879
- // will overwrite this with status:'in-flight' + its real pid once it boots.
5880
- // readSweepLiveness grants a 15s boot-grace to "starting" records with no pid.
5881
- const startedAt = Date.now();
5882
- try {
5883
- safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
5884
- status: 'starting', startedAt, startedAtIso: new Date().toISOString(),
5885
- sweepToken, pid: null,
5886
- }));
5887
- } catch (e) {
5888
- console.error(`[kb-sweep] failed to write starting state: ${e.message}`);
5889
- }
5890
-
5891
5964
  const body = await readBody(req).catch(() => ({}));
5892
-
5893
- // Persist body to a temp file so spawn doesn't have to serialize large
5894
- // pinnedKeys arrays via argv. Skip when body is empty.
5895
- let bodyFile = null;
5896
- if (body && (Array.isArray(body.pinnedKeys) || body.dryRun != null)) {
5897
- bodyFile = path.join(ENGINE_DIR, `tmp-kb-sweep-body-${sweepToken}.json`);
5898
- try { safeWrite(bodyFile, JSON.stringify(body)); }
5899
- catch (e) {
5900
- console.error(`[kb-sweep] failed to write body-file ${bodyFile}: ${e.message}`);
5901
- bodyFile = null;
5902
- }
5903
- }
5904
-
5905
- const { spawn: cpSpawn } = require('child_process');
5906
- // Open log fd in append mode so spawn can pipe stdio there. Child inherits
5907
- // the fd; parent closes its copy after spawn returns successfully.
5908
- let logFdNum = null;
5909
- let stdio = ['ignore', 'ignore', 'ignore'];
5910
- try {
5911
- logFdNum = fs.openSync(KB_SWEEP_LOG_PATH, 'a');
5912
- stdio = ['ignore', logFdNum, logFdNum];
5913
- } catch (e) {
5914
- console.error(`[kb-sweep] failed to open log ${KB_SWEEP_LOG_PATH}: ${e.message}`);
5915
- }
5916
-
5917
- const spawnArgs = ['--sweep-token', sweepToken];
5918
- if (bodyFile) spawnArgs.push('--body-file', bodyFile);
5919
-
5920
- let proc;
5921
- try {
5922
- proc = cpSpawn(process.execPath, [KB_SWEEP_RUNNER_PATH, ...spawnArgs], {
5923
- cwd: MINIONS_DIR, stdio, detached: true, windowsHide: true,
5924
- env: { ...process.env },
5925
- });
5926
- } catch (e) {
5927
- if (logFdNum != null) try { fs.closeSync(logFdNum); } catch { /* ignore */ }
5928
- if (bodyFile) try { fs.unlinkSync(bodyFile); } catch { /* ignore */ }
5929
- // Release the "starting" claim on synchronous spawn failure so the user
5930
- // can retry immediately.
5931
- try { shared.safeUnlink(KB_SWEEP_STATE_PATH); } catch { /* ignore */ }
5932
- return jsonReply(res, 500, { error: `spawn failed: ${e.message}` });
5965
+ const result = spawnSweepRunnerDetached({
5966
+ pinnedKeys: Array.isArray(body?.pinnedKeys) ? body.pinnedKeys : undefined,
5967
+ dryRun: body?.dryRun,
5968
+ });
5969
+ if (!result.ok) {
5970
+ return jsonReply(res, 500, { error: result.error || 'spawn failed' });
5933
5971
  }
5934
- if (logFdNum != null) try { fs.closeSync(logFdNum); } catch { /* ignore */ }
5935
-
5936
- // Conditional CAS: only update the state file from "starting" → "in-flight"
5937
- // if our sweepToken still owns it. If the (fast) runner already wrote
5938
- // "completed"/"failed" or its own "in-flight", leave that newer state alone.
5939
- try {
5940
- const current = safeJson(KB_SWEEP_STATE_PATH);
5941
- if (current && current.status === 'starting' && current.sweepToken === sweepToken) {
5942
- safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
5943
- status: 'in-flight', startedAt, startedAtIso: new Date().toISOString(),
5944
- sweepToken, pid: proc.pid,
5945
- }));
5946
- }
5947
- } catch { /* best-effort */ }
5948
-
5949
- proc.unref();
5950
- return jsonReply(res, 202, { ok: true, started: true, sweepToken });
5972
+ return jsonReply(res, 202, { ok: true, started: true, sweepToken: result.sweepToken });
5951
5973
  }
5952
5974
 
5953
5975
 
@@ -6788,7 +6810,14 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6788
6810
  const ccTurnId = 'cct-' + shared.uid();
6789
6811
  const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
6790
6812
 
6791
- let { answer, partial, warning, toolUses, error: ccError } = await ccDocCall({
6813
+ // W-mpmwxni2000c25c7-b wall-clock turn watchdog. The doc-chat call
6814
+ // can internally spawn resume + fresh + final-retry LLM calls; we want
6815
+ // ONE wall-clock cap that covers the whole turn so a runtime stuck
6816
+ // mid-stream can't outlive ccTurnTimeoutMs. On expiry the watchdog
6817
+ // calls _docAbort (kills the in-flight CLI) and the synthesized payload
6818
+ // below flows through the existing _docChatFailureResponse path.
6819
+ const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
6820
+ const _docCallPromise = ccDocCall({
6792
6821
  message: body.message, document: currentContent, title: body.title,
6793
6822
  filePath: body.filePath, selection: body.selection, canEdit, isJson,
6794
6823
  model: body.model || undefined,
@@ -6798,6 +6827,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6798
6827
  systemPrompt: turnSystemPrompt,
6799
6828
  turnId: ccTurnId,
6800
6829
  });
6830
+ const _docCallResult = await _raceCcDocChatTimeout(_docCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat');
6831
+ let { answer, partial, warning, toolUses, error: ccError } = _docCallResult;
6801
6832
  const finalize = _finalizeDocChatEdit({
6802
6833
  filePath: body.filePath, fullPath, isJson, canEdit,
6803
6834
  originalContent: currentContent, delimiterContent: null,
@@ -6811,6 +6842,25 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6811
6842
  ccError, partial, warning, toolUses, finalize,
6812
6843
  });
6813
6844
  _docDone = true;
6845
+ // W-mpmwxni2000c25c7-b — track every surfaced doc-chat error code so
6846
+ // /api/metrics reflects silent-error regressions. Hard failures (no
6847
+ // partial recovery, no edited file) graduate to 5xx so the client can
6848
+ // render a real error UI instead of treating the polite "Failed to
6849
+ // process request" string as a successful turn.
6850
+ if (ccError) {
6851
+ const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
6852
+ llm.trackEngineError('doc-chat', errCode);
6853
+ const isHardFailure = !partial && !(finalize && finalize.edited);
6854
+ if (isHardFailure) {
6855
+ const status = errCode === shared.FAILURE_CLASS.CONFIG_ERROR ? 503 : 502;
6856
+ return jsonReply(res, status, {
6857
+ ...payload,
6858
+ error: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
6859
+ code: errCode,
6860
+ retriable: ccError.retriable !== false,
6861
+ });
6862
+ }
6863
+ }
6814
6864
  return jsonReply(res, 200, payload);
6815
6865
  } finally { _docAbort = null; _docDone = true; docChatInFlight.delete(docKey); }
6816
6866
  } catch (e) { return jsonReply(res, e.statusCode || 500, { error: e.message }); }
@@ -6899,7 +6949,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6899
6949
  const ccTurnId = 'cct-' + shared.uid();
6900
6950
  const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
6901
6951
 
6902
- let { answer, partial, warning, toolUses, error: ccError } = await ccDocCallStreaming({
6952
+ // W-mpmwxni2000c25c7-b wall-clock turn watchdog (mirrors the
6953
+ // non-stream handleDocChat path). On expiry _docAbort kills the
6954
+ // in-flight LLM and the synthesized payload below flows through the
6955
+ // SSE done frame the client already expects with `error` set.
6956
+ const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
6957
+ const _docStreamCallPromise = ccDocCallStreaming({
6903
6958
  message: body.message, document: currentContent, title: body.title,
6904
6959
  filePath: body.filePath, selection: body.selection, canEdit, isJson,
6905
6960
  model: body.model || undefined,
@@ -6912,6 +6967,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6912
6967
  systemPrompt: turnSystemPrompt,
6913
6968
  turnId: ccTurnId,
6914
6969
  });
6970
+ const _docStreamResult = await _raceCcDocChatTimeout(_docStreamCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat-stream');
6971
+ let { answer, partial, warning, toolUses, error: ccError } = _docStreamResult;
6915
6972
  const finalize = _finalizeDocChatEdit({
6916
6973
  filePath: body.filePath, fullPath, isJson, canEdit,
6917
6974
  originalContent: currentContent, delimiterContent: null,
@@ -6924,6 +6981,23 @@ What would you like to discuss or change? When you're happy, say "approve" and I
6924
6981
  actionFeedback: null, actionParseError: null,
6925
6982
  ccError, partial, warning, toolUses, finalize,
6926
6983
  });
6984
+ // W-mpmwxni2000c25c7-b — track surfaced doc-chat error codes for
6985
+ // /api/metrics and emit a named SSE `event: error` frame so the
6986
+ // client can render a typed error instead of treating the polite
6987
+ // fallback string as a normal completion.
6988
+ if (ccError) {
6989
+ const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
6990
+ llm.trackEngineError('doc-chat', errCode);
6991
+ const isHardFailure = !partial && !(finalize && finalize.edited);
6992
+ if (isHardFailure) {
6993
+ const errPayload = {
6994
+ message: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
6995
+ code: errCode,
6996
+ retriable: ccError.retriable !== false,
6997
+ };
6998
+ try { res.write(`event: error\ndata: ${JSON.stringify(errPayload)}\n\n`); } catch {}
6999
+ }
7000
+ }
6927
7001
  const { answer: finalAnswer, ...donePayload } = payload;
6928
7002
  writeDocEvent({
6929
7003
  type: 'done',
@@ -7520,21 +7594,40 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7520
7594
  // confirmation chips in the assistant reply.
7521
7595
  const ccTurnId = 'cct-' + shared.uid();
7522
7596
  const turnSystemPrompt = renderCcSystemPromptForTurn(ccTurnId);
7523
- const result = await ccCall(body.message, { store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId });
7597
+ // W-mpmwxni2000c25c7-b wall-clock turn watchdog. On expiry the
7598
+ // in-flight LLM call is aborted and ccCall returns a synthetic
7599
+ // envelope with error.code === 'cc-turn-timeout'.
7600
+ const turnTimeoutMs = _resolveCcTurnTimeoutMs();
7601
+ const result = await llm.withCcTurnTimeout({
7602
+ timeoutMs: turnTimeoutMs, label: 'command-center',
7603
+ }, (registerAbort) => ccCall(body.message, {
7604
+ store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId,
7605
+ onAbortReady: registerAbort,
7606
+ }));
7524
7607
 
7525
- // Non-zero exit with text = max_turns or partial success — still usable
7526
- if (!result.text) {
7608
+ // W-mpmwxni2000c25c7-b typed-error envelope path. Any failure that
7609
+ // produced no usable text is surfaced to the client as 5xx JSON
7610
+ // `{ error, code, retriable }` instead of a polite 200 "I had trouble
7611
+ // processing that" string that silently halves CC retry signal.
7612
+ if (!result.text || result.error) {
7613
+ const errEnvelope = result.error || (result.errorMessage
7614
+ ? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
7615
+ : { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
7616
+ llm.trackEngineError('command-center', errEnvelope.code);
7527
7617
  const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
7528
7618
  const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-5).join(' | ');
7529
- console.error(`[CC] LLM failed after retries ${debugInfo}: ${stderrTail}`);
7530
- try { shared.log('warn', `CC failed ${debugInfo}: ${stderrTail.slice(0, 300)}`); } catch {}
7531
- const hasSession = !!ccSession.sessionId;
7532
- const retryHint = hasSession
7533
- ? 'Your session is still active — just send your message again to retry.'
7534
- : 'Try clicking **New Session** and sending your message again.';
7535
- return jsonReply(res, 200, {
7536
- text: `I had trouble processing that ${debugInfo}. ${stderrTail ? 'Detail: ' + stderrTail : ''}\n\n${retryHint}`,
7537
- actions: [], sessionId: ccSession.sessionId
7619
+ console.error(`[CC] LLM failed after retries ${debugInfo} code=${errEnvelope.code}: ${stderrTail}`);
7620
+ try { shared.log('warn', `CC failed ${debugInfo} code=${errEnvelope.code}: ${stderrTail.slice(0, 300)}`); } catch {}
7621
+ // Missing-runtime is a 503 (service config); auth-failure also 503; other classes 502.
7622
+ const status = result.missingRuntime ? 503
7623
+ : errEnvelope.code === 'auth-failure' ? 503
7624
+ : 502;
7625
+ return jsonReply(res, status, {
7626
+ error: errEnvelope.message,
7627
+ code: errEnvelope.code,
7628
+ retriable: !!errEnvelope.retriable,
7629
+ sessionId: ccSession.sessionId || null,
7630
+ ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}),
7538
7631
  });
7539
7632
  }
7540
7633
 
@@ -7555,7 +7648,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7555
7648
  } finally {
7556
7649
  _releaseCCTab(tabId);
7557
7650
  }
7558
- } catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message }); }
7651
+ } catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message, code: 'handler-exception', retriable: false }); }
7559
7652
  }
7560
7653
 
7561
7654
  /** Build a lightweight input object for SSE tool events — keeps only the fields formatToolSummary needs, with truncated string values. */
@@ -7677,6 +7770,11 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7677
7770
  });
7678
7771
  } catch (err) {
7679
7772
  _emitTimingLog(null, null, Date.now(), 'spawn-failed');
7773
+ // W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
7774
+ // (`code`, `retriable`) onto the envelope so the SSE writer can
7775
+ // render a structured error event instead of grepping the stderr
7776
+ // string. Pool stamps `.code` (worker-spawn-failed or
7777
+ // acp-handshake-failed) on every getSession rejection.
7680
7778
  return resolveResult({
7681
7779
  text: '',
7682
7780
  sessionId: null,
@@ -7684,6 +7782,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7684
7782
  usage: {},
7685
7783
  raw: '',
7686
7784
  stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
7785
+ errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED,
7786
+ errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
7687
7787
  });
7688
7788
  }
7689
7789
  const _tSessionReady = Date.now();
@@ -7730,13 +7830,29 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7730
7830
  },
7731
7831
  onError: (err) => {
7732
7832
  _emitTimingLog(_lifecycle, _tSessionReady, Date.now(), cancelled ? 'cancelled' : 'error');
7833
+ if (cancelled) {
7834
+ resolveResult({
7835
+ text: accumulated,
7836
+ sessionId: sessionHandle.sessionId,
7837
+ code: 0,
7838
+ usage: {},
7839
+ raw: accumulated,
7840
+ stderr: '',
7841
+ });
7842
+ return;
7843
+ }
7844
+ // W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
7845
+ // through. mid-stream worker exits stamp `.code = 'worker-died'`
7846
+ // on the Error before invoking onError.
7733
7847
  resolveResult({
7734
7848
  text: accumulated,
7735
7849
  sessionId: sessionHandle.sessionId,
7736
- code: cancelled ? 0 : 1,
7850
+ code: 1,
7737
7851
  usage: {},
7738
7852
  raw: accumulated,
7739
7853
  stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
7854
+ errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_DIED,
7855
+ errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
7740
7856
  });
7741
7857
  },
7742
7858
  });
@@ -8091,73 +8207,100 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8091
8207
  : '';
8092
8208
  const prompt = _joinCcPromptParts(preamble, resumeGuard, carryover, turnHeader, projectContextPart, body.message);
8093
8209
 
8094
- const { trackEngineUsage: trackUsage } = require('./engine/llm');
8210
+ const { trackEngineUsage: trackUsage, trackEngineError: trackErr, withCcTurnTimeout: withTimeout } = require('./engine/llm');
8095
8211
  const streamModel = CONFIG.engine?.ccModel || shared.ENGINE_DEFAULTS.ccModel;
8096
8212
  const streamEffort = CONFIG.engine?.ccEffort || shared.ENGINE_DEFAULTS.ccEffort;
8097
8213
  const ccMaxTurns = CONFIG.engine?.ccMaxTurns || shared.ENGINE_DEFAULTS.ccMaxTurns;
8098
8214
  let toolUses = [];
8099
- const llmPromise = _invokeCcStream({
8100
- prompt, sessionId, liveState, toolUses,
8101
- model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
8102
- engineConfig: CONFIG.engine,
8103
- systemPrompt: turnSystemPrompt,
8104
- tabId,
8105
- });
8106
- _ccStreamAbort = llmPromise.abort;
8107
- liveState.abortFn = _ccStreamAbort;
8108
- ccInFlightAborts.set(tabId, _ccStreamAbort);
8109
- const result = await llmPromise;
8110
- trackUsage('command-center', result.usage);
8111
-
8112
- if (result.missingRuntime) {
8113
- finishMissingRuntime(result, liveState);
8114
- return;
8115
- }
8116
-
8117
- // Handle failure — non-zero exit with text = max_turns or partial success, still usable
8118
- if (!result.text && wasResume && result.code !== 0 && !req.destroyed) {
8119
- // Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
8120
- console.log(`[CC-stream] Resume failed (code=${result.code}) — retrying fresh`);
8121
- const freshPreamble = buildCCStatePreamble();
8122
- const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
8123
- const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
8124
- toolUses = []; // discard stale metadata from the failed resume attempt
8125
- const retryPromise = _invokeCcStream({
8126
- prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
8215
+ // W-mpmwxni2000c25c7-b turn-level watchdog. Wraps the initial
8216
+ // _invokeCcStream PLUS the post-resume-fail retry so the wall clock
8217
+ // covers the entire CC turn (not just one underlying LLM call). On
8218
+ // expiry, whichever call is in flight is aborted; the watchdog
8219
+ // resolves with a synthetic `{ error: { code: 'cc-turn-timeout' } }`
8220
+ // envelope so the SSE error path below kicks in.
8221
+ const turnTimeoutMs = _resolveCcTurnTimeoutMs();
8222
+ const result = await withTimeout({
8223
+ timeoutMs: turnTimeoutMs, label: 'command-center-stream',
8224
+ }, async (registerAbort) => {
8225
+ const llmPromise = _invokeCcStream({
8226
+ prompt, sessionId, liveState, toolUses,
8127
8227
  model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
8128
8228
  engineConfig: CONFIG.engine,
8129
8229
  systemPrompt: turnSystemPrompt,
8130
8230
  tabId,
8131
8231
  });
8132
- _ccStreamAbort = retryPromise.abort;
8232
+ _ccStreamAbort = llmPromise.abort;
8133
8233
  liveState.abortFn = _ccStreamAbort;
8134
8234
  ccInFlightAborts.set(tabId, _ccStreamAbort);
8135
- const retryResult = await retryPromise;
8136
- trackUsage('command-center', retryResult.usage);
8137
- if (retryResult.text) {
8138
- // Fresh session succeeded — use retryResult from here
8139
- Object.assign(result, retryResult);
8235
+ registerAbort(_ccStreamAbort);
8236
+ const initial = await llmPromise;
8237
+ trackUsage('command-center', initial.usage);
8238
+
8239
+ if (initial.missingRuntime) return initial;
8240
+
8241
+ // Handle failure — non-zero exit with text = max_turns or partial success, still usable
8242
+ if (!initial.text && wasResume && initial.code !== 0 && !req.destroyed) {
8243
+ // Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
8244
+ console.log(`[CC-stream] Resume failed (code=${initial.code}) — retrying fresh`);
8245
+ const freshPreamble = buildCCStatePreamble();
8246
+ const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
8247
+ const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
8248
+ toolUses = []; // discard stale metadata from the failed resume attempt
8249
+ const retryPromise = _invokeCcStream({
8250
+ prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
8251
+ model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
8252
+ engineConfig: CONFIG.engine,
8253
+ systemPrompt: turnSystemPrompt,
8254
+ tabId,
8255
+ });
8256
+ _ccStreamAbort = retryPromise.abort;
8257
+ liveState.abortFn = _ccStreamAbort;
8258
+ ccInFlightAborts.set(tabId, _ccStreamAbort);
8259
+ registerAbort(_ccStreamAbort);
8260
+ const retryResult = await retryPromise;
8261
+ trackUsage('command-center', retryResult.usage);
8262
+ if (retryResult.text) {
8263
+ // Fresh session succeeded — use retryResult from here
8264
+ Object.assign(initial, retryResult);
8265
+ // Clear the error envelope inherited from the failed first attempt
8266
+ // so the success path below doesn't misclassify a recovered turn.
8267
+ if (retryResult.text) { initial.error = null; initial.ok = true; }
8268
+ } else if (retryResult.error) {
8269
+ initial.error = retryResult.error;
8270
+ }
8140
8271
  }
8141
- }
8272
+ return initial;
8273
+ });
8142
8274
  if (result.missingRuntime) {
8143
8275
  finishMissingRuntime(result, liveState);
8144
8276
  return;
8145
8277
  }
8146
- if (!result.text) {
8278
+ if (!result.text || result.error) {
8147
8279
  if (req.destroyed) {
8148
8280
  _ccStreamEnded = true;
8149
8281
  _logCcStreamEnd(_ccTelemetry, 'llm-empty-client-gone', { code: result.code });
8150
8282
  return;
8151
8283
  }
8152
- const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
8284
+ // W-mpmwxni2000c25c7-b — surface the typed error envelope as a
8285
+ // distinct SSE `event: error` frame so the client renders a real
8286
+ // error UI (with a retry hint derived from `retriable`) instead of
8287
+ // swallowing a polite 200 "I had trouble processing that" string.
8288
+ const envelope = result.error || (result.errorMessage
8289
+ ? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
8290
+ : { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
8291
+ trackErr('command-center', envelope.code);
8153
8292
  const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-3).join(' | ');
8154
- console.error(`[CC-stream] Failed: code=${result.code}, stderr=${(result.stderr || '').slice(0, 500)}, stdout_tail=${(result.raw || '').slice(-500)}`);
8155
- const retryHint = 'Send your message again to retry.';
8156
- liveState.donePayload = { type: 'done', text: `I had trouble processing that ${debugInfo}. ${stderrTail ? 'Detail: ' + stderrTail : ''}\n\n${retryHint}`, actions: [], sessionId: null };
8293
+ console.error(`[CC-stream] Failed code=${envelope.code} retriable=${envelope.retriable}: ${(result.stderr || '').slice(0, 500)}; stdout_tail=${(result.raw || '').slice(-500)}`);
8294
+ // Emit `event: error` (named SSE frame), then a `done`-style frame
8295
+ // for clients that only handle the default message channel, then
8296
+ // close cleanly so the EventSource exits its read loop without
8297
+ // throwing a connection-reset.
8298
+ try { res.write(`event: error\ndata: ${JSON.stringify({ message: envelope.message, code: envelope.code, retriable: !!envelope.retriable, ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}) })}\n\n`); } catch {}
8299
+ liveState.donePayload = { type: 'error', error: envelope.message, code: envelope.code, retriable: !!envelope.retriable, sessionId: null };
8157
8300
  if (liveState.writer) liveState.writer(liveState.donePayload);
8158
8301
  if (liveState.endResponse) liveState.endResponse();
8159
8302
  _scheduleCcLiveCleanup(tabId);
8160
- _logCcStreamEnd(_ccTelemetry, 'llm-failed-fallback-sent', { code: result.code });
8303
+ _logCcStreamEnd(_ccTelemetry, 'llm-failed-error-envelope-sent', { code: result.code, errorCode: envelope.code });
8161
8304
  return;
8162
8305
  }
8163
8306
 
@@ -8613,6 +8756,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8613
8756
  versionCheckInterval: [60000],
8614
8757
  prPollStatusEvery: [1], prPollCommentsEvery: [1],
8615
8758
  agentBusyReassignMs: [0],
8759
+ maxRetriesPerAgent: [1, 20],
8616
8760
  };
8617
8761
  for (const [key, [min, max]] of Object.entries(numericFields)) {
8618
8762
  if (e[key] !== undefined) {
@@ -8728,7 +8872,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8728
8872
  if (_isClear(e.defaultModel)) _deleteEngineConfig('defaultModel');
8729
8873
  else {
8730
8874
  const candidate = String(e.defaultModel);
8731
- const resolvedCli = config.engine.defaultCli || 'claude';
8875
+ const resolvedCli = config.engine.defaultCli || 'copilot';
8732
8876
  const rejection = await _validateFleetModel(candidate, resolvedCli);
8733
8877
  if (rejection) _clamped.push(`engine.defaultModel: "${candidate}" ${rejection} — kept previous value`);
8734
8878
  else _setEngineConfig('defaultModel', candidate);
@@ -8738,7 +8882,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8738
8882
  if (_isClear(e.ccModel)) _deleteEngineConfig('ccModel');
8739
8883
  else {
8740
8884
  const candidate = String(e.ccModel);
8741
- const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'claude';
8885
+ const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'copilot';
8742
8886
  const rejection = await _validateFleetModel(candidate, resolvedCli);
8743
8887
  if (rejection) _clamped.push(`engine.ccModel: "${candidate}" ${rejection} — kept previous value`);
8744
8888
  else _setEngineConfig('ccModel', candidate);
@@ -8856,7 +9000,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8856
9000
  if (updates.model === '' || updates.model === null) delete config.agents[id].model;
8857
9001
  else {
8858
9002
  const candidate = String(updates.model);
8859
- const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'claude';
9003
+ const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'copilot';
8860
9004
  const runtimeModelStr = _resolveModelForRuntime(candidate, resolvedCli);
8861
9005
  const knownModels = await _modelsFor(resolvedCli);
8862
9006
  // Two validation paths: