@yemi33/minions 0.1.2045 → 0.1.2046
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dashboard/js/fre.js +3 -2
- package/dashboard/js/settings.js +5 -5
- package/dashboard.js +288 -86
- package/docs/runtime-adapters.md +8 -3
- package/engine/cc-worker-pool.js +87 -11
- package/engine/llm.js +148 -2
- package/engine/preflight.js +5 -5
- package/engine/queries.js +61 -32
- package/engine/shared.js +4 -3
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -11,8 +11,8 @@ Inspired by and initially scaffolded from [Brady Gaster's Squad](https://bradyga
|
|
|
11
11
|
## Prerequisites
|
|
12
12
|
|
|
13
13
|
- **Node.js** 18+ (LTS recommended)
|
|
14
|
-
- **
|
|
15
|
-
- **
|
|
14
|
+
- **A supported runtime CLI** — Minions defaults to GitHub Copilot CLI (`npm install -g @github/copilot`). Claude Code CLI (`npm install -g @anthropic-ai/claude-code`) is also supported; switch with `minions config set-cli claude` or per-agent `cli` overrides.
|
|
15
|
+
- **Auth for your runtime** — GitHub Copilot subscription (Copilot CLI handles its own auth) or an Anthropic API key / Claude Max subscription
|
|
16
16
|
- **Git** — agents create worktrees for all code changes
|
|
17
17
|
|
|
18
18
|
> **Note:** you do **not** need to configure your CLI for "autopilot" / "bypass permissions" / "dangerous mode". Minions passes the right bypass flag per spawn (`--dangerously-skip-permissions` for Claude; `--autopilot --allow-all --no-ask-user` for Copilot), independent of your global CLI config. Run `minions doctor` to verify your installed CLI accepts those flags.
|
package/dashboard/js/fre.js
CHANGED
|
@@ -80,9 +80,10 @@ function renderFre(statusOrProjects) {
|
|
|
80
80
|
|
|
81
81
|
// Resolve the currently-configured runtime CLI for the explainer copy.
|
|
82
82
|
// /api/status surfaces this as autoMode.defaultCli (resolveAgentCli(null, engine)).
|
|
83
|
-
// Fall back to autoMode.ccCli (also defaultCli-derived when ccCli unset) then '
|
|
83
|
+
// Fall back to autoMode.ccCli (also defaultCli-derived when ccCli unset) then 'copilot'
|
|
84
|
+
// (matches ENGINE_DEFAULTS.defaultCli — W-mpmwxkk40007c995).
|
|
84
85
|
const auto = (status && status.autoMode) || {};
|
|
85
|
-
const runtimeCli = String(auto.defaultCli || auto.ccCli || '
|
|
86
|
+
const runtimeCli = String(auto.defaultCli || auto.ccCli || 'copilot');
|
|
86
87
|
|
|
87
88
|
const cardStyle = [
|
|
88
89
|
'margin:12px 24px',
|
package/dashboard/js/settings.js
CHANGED
|
@@ -49,7 +49,7 @@ async function openSettings() {
|
|
|
49
49
|
// Per-agent override placeholders surface the inherited fleet defaults as
|
|
50
50
|
// muted text — operators see exactly what each agent will resolve to without
|
|
51
51
|
// chasing config files. Empty input clears the override → re-inherit fleet.
|
|
52
|
-
const fleetCliLabel = e.defaultCli || '
|
|
52
|
+
const fleetCliLabel = e.defaultCli || 'copilot';
|
|
53
53
|
const fleetModelLabel = e.defaultModel ? String(e.defaultModel) : 'CLI default';
|
|
54
54
|
const agentRows = Object.entries(agents).map(function([id, a]) {
|
|
55
55
|
return '<tr>' +
|
|
@@ -406,10 +406,10 @@ async function initRuntimeFleetUI(engineCfg, agentsCfg) {
|
|
|
406
406
|
runtimes = Array.isArray(d.runtimes) ? d.runtimes : [];
|
|
407
407
|
} catch { /* ignore — we'll surface a free-text-only path below */ }
|
|
408
408
|
|
|
409
|
-
// Always include '
|
|
409
|
+
// Always include 'copilot' as a fallback option even if /api/runtimes is empty;
|
|
410
410
|
// legacy installs without the registry endpoint should still see something pickable.
|
|
411
|
-
const names = runtimes.length ? runtimes.map(rt => rt.name) : ['
|
|
412
|
-
const currentDefault = engineCfg.defaultCli || '
|
|
411
|
+
const names = runtimes.length ? runtimes.map(rt => rt.name) : ['copilot'];
|
|
412
|
+
const currentDefault = engineCfg.defaultCli || 'copilot';
|
|
413
413
|
const currentCc = engineCfg.ccCli || '';
|
|
414
414
|
cliSelect.innerHTML = names.map(n =>
|
|
415
415
|
'<option value="' + escHtml(n) + '"' + (n === currentDefault ? ' selected' : '') + '>' + escHtml(n) + '</option>'
|
|
@@ -440,7 +440,7 @@ async function initRuntimeFleetUI(engineCfg, agentsCfg) {
|
|
|
440
440
|
// this the input was free-text and a user could (and did) save an agent
|
|
441
441
|
// with cli=claude + model=<some gpt> — invalid combination that crashed
|
|
442
442
|
// dispatch. Refreshing on CLI change clears stale model values.
|
|
443
|
-
const fleetDefaultCli = engineCfg.defaultCli || '
|
|
443
|
+
const fleetDefaultCli = engineCfg.defaultCli || 'copilot';
|
|
444
444
|
for (const cell of cliCells) {
|
|
445
445
|
const agentId = cell.getAttribute('data-runtime-cli');
|
|
446
446
|
const agent = (agentsCfg || {})[agentId] || {};
|
package/dashboard.js
CHANGED
|
@@ -2337,6 +2337,17 @@ const CC_LOG_ERROR_MAX_LEN = 80; // truncate exception messages in [cc-stream] l
|
|
|
2337
2337
|
const CC_STREAM_REATTACH_GRACE_MS = 60000; // keep CC job alive briefly after disconnect so the UI can reattach
|
|
2338
2338
|
const CC_STREAM_DONE_RETENTION_MS = 30000; // retain final payload briefly so reconnect can still receive it
|
|
2339
2339
|
const CC_LIVE_STREAM_MAX_AGE_MS = shared.ENGINE_DEFAULTS.ccLiveStreamMaxAgeMs;
|
|
2340
|
+
// W-mpmwxni2000c25c7-b — CC/doc-chat turn watchdog. Resolves per-call from
|
|
2341
|
+
// CONFIG.engine.ccTurnTimeoutMs (defaults to ENGINE_DEFAULTS.ccTurnTimeoutMs)
|
|
2342
|
+
// so an operator can shorten/lengthen the wall-clock cap without a code
|
|
2343
|
+
// change. callLLM's own `timeout` opt only kills the spawned child after a
|
|
2344
|
+
// long idle stretch; this turn-level cap kills WHICHEVER LLM call is in
|
|
2345
|
+
// flight inside ccCall/ccCallStreaming (resume → fresh → final retry).
|
|
2346
|
+
/**
 * Resolve the wall-clock cap (ms) applied to a CC / doc-chat turn.
 *
 * Reads `CONFIG.engine.ccTurnTimeoutMs` when it is a finite number and
 * otherwise falls back to `shared.ENGINE_DEFAULTS.ccTurnTimeoutMs`.
 *
 * @returns {number} The chosen timeout when it is finite and positive;
 *   `0` for anything else (non-finite, zero or negative).
 */
function _resolveCcTurnTimeoutMs() {
  const configuredMs = CONFIG?.engine?.ccTurnTimeoutMs;
  // The default is only consulted when the configured value is not a finite number.
  const chosenMs = Number.isFinite(configuredMs)
    ? configuredMs
    : shared.ENGINE_DEFAULTS.ccTurnTimeoutMs;
  if (!Number.isFinite(chosenMs) || chosenMs <= 0) return 0;
  return chosenMs;
}
|
|
2340
2351
|
// Doc-chat is interactive — long-doc edits with multi-step Read+Write tool use can run
|
|
2341
2352
|
// well past 5 min on `canEdit:true` paths. Bumped to 1 hour (matching CC) so legitimate
|
|
2342
2353
|
// edits aren't killed mid-stream and the backend timeout never beats the user's reading
|
|
@@ -3391,6 +3402,22 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
|
|
|
3391
3402
|
let timeoutTimer = null;
|
|
3392
3403
|
let resolveResult;
|
|
3393
3404
|
const promise = new Promise((resolve) => { resolveResult = resolve; });
|
|
3405
|
+
// W-mpmwxni2000c25c7-c — build a single failure envelope shape from a
|
|
3406
|
+
// typed Error (or a plain Error). Reads `.code` / `.retriable` if the
|
|
3407
|
+
// pool stamped them; falls back to safe defaults otherwise so callers
|
|
3408
|
+
// see a consistent `{ ..., errorCode, errorRetriable }` shape. Sub-item
|
|
3409
|
+
// b's SSE writer consumes these to render a structured error event
|
|
3410
|
+
// instead of grepping the stderr string for a code.
|
|
3411
|
+
const _failureEnvelope = (err, defaultCode) => ({
|
|
3412
|
+
text: accumulated,
|
|
3413
|
+
sessionId: sessionHandle ? sessionHandle.sessionId : null,
|
|
3414
|
+
code: 1,
|
|
3415
|
+
usage: {},
|
|
3416
|
+
raw: accumulated,
|
|
3417
|
+
stderr: String((err && err.message) || err || 'cc-worker-pool failure'),
|
|
3418
|
+
errorCode: (err && err.code) || defaultCode || null,
|
|
3419
|
+
errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
|
|
3420
|
+
});
|
|
3394
3421
|
const finalize = (envelope) => {
|
|
3395
3422
|
if (settled) return;
|
|
3396
3423
|
settled = true;
|
|
@@ -3415,14 +3442,18 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
|
|
|
3415
3442
|
timeoutTimer = setTimeout(() => {
|
|
3416
3443
|
try { sessionHandle && sessionHandle.cancel(); } catch { /* swallow */ }
|
|
3417
3444
|
try { ccWorkerPool.closeTab(tabKey); } catch { /* swallow */ }
|
|
3418
|
-
|
|
3419
|
-
|
|
3420
|
-
|
|
3421
|
-
|
|
3422
|
-
|
|
3423
|
-
|
|
3424
|
-
|
|
3425
|
-
|
|
3445
|
+
// W-mpmwxni2000c25c7-c — convert the legacy synthesized
|
|
3446
|
+
// `{ code: 1, stderr: 'doc-chat-pool: timeout after Xms' }` shape into
|
|
3447
|
+
// a typed-error envelope so the SSE writer can render the same
|
|
3448
|
+
// structured error event for timeouts as for spawn/handshake/exit
|
|
3449
|
+
// failures. The error code carries `cc-turn-timeout`; consumers
|
|
3450
|
+
// grep on that instead of parsing the stderr string.
|
|
3451
|
+
const timeoutErr = ccWorkerPool._typedError(
|
|
3452
|
+
`doc-chat-pool: timeout after ${timeoutMs}ms`,
|
|
3453
|
+
ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT,
|
|
3454
|
+
true
|
|
3455
|
+
);
|
|
3456
|
+
finalize(_failureEnvelope(timeoutErr, ccWorkerPool.ERROR_CODES.CC_TURN_TIMEOUT));
|
|
3426
3457
|
}, timeoutMs);
|
|
3427
3458
|
if (typeof timeoutTimer.unref === 'function') timeoutTimer.unref();
|
|
3428
3459
|
}
|
|
@@ -3436,14 +3467,10 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
|
|
|
3436
3467
|
systemPromptHash: _docChatPromptHash,
|
|
3437
3468
|
});
|
|
3438
3469
|
} catch (err) {
|
|
3439
|
-
|
|
3440
|
-
|
|
3441
|
-
|
|
3442
|
-
|
|
3443
|
-
usage: {},
|
|
3444
|
-
raw: '',
|
|
3445
|
-
stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
|
|
3446
|
-
});
|
|
3470
|
+
// Pool stamps `.code` (worker-spawn-failed / acp-handshake-failed) on
|
|
3471
|
+
// every error from getSession; fall back to worker-spawn-failed if
|
|
3472
|
+
// the error is a plain Error from somewhere unexpected.
|
|
3473
|
+
return finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED));
|
|
3447
3474
|
}
|
|
3448
3475
|
if (cancelled) {
|
|
3449
3476
|
try { sessionHandle.cancel(); } catch { /* swallow */ }
|
|
@@ -3471,14 +3498,15 @@ function _invokeDocChatViaPool({ prompt, model, effort, engineConfig, systemProm
|
|
|
3471
3498
|
finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
|
|
3472
3499
|
},
|
|
3473
3500
|
onError: (err) => {
|
|
3474
|
-
|
|
3475
|
-
|
|
3476
|
-
sessionId: sessionHandle.sessionId,
|
|
3477
|
-
|
|
3478
|
-
|
|
3479
|
-
|
|
3480
|
-
|
|
3481
|
-
|
|
3501
|
+
if (cancelled) {
|
|
3502
|
+
// User-driven cancel — not a real error, treat as a clean exit.
|
|
3503
|
+
finalize({ text: accumulated, sessionId: sessionHandle.sessionId, code: 0, usage: {}, raw: accumulated, stderr: '' });
|
|
3504
|
+
return;
|
|
3505
|
+
}
|
|
3506
|
+
// Pool stamps `.code` (worker-died for mid-stream proc exit).
|
|
3507
|
+
// Fallback default is worker-died because the stream onError is
|
|
3508
|
+
// overwhelmingly fired from the post-handshake exit handler.
|
|
3509
|
+
finalize(_failureEnvelope(err, ccWorkerPool.ERROR_CODES.WORKER_DIED));
|
|
3482
3510
|
},
|
|
3483
3511
|
});
|
|
3484
3512
|
})();
|
|
@@ -3975,12 +4003,65 @@ async function _retryDocChatAfterResumeFailure({ result, initialPass, freshSessi
|
|
|
3975
4003
|
// Shape the per-failure debug envelope (raw stderr + classification metadata)
|
|
3976
4004
|
// shared by hard failures and partial recoveries — keeps the wire shape in lockstep.
|
|
3977
4005
|
/**
 * Build the per-failure debug envelope for a doc-chat LLM result.
 *
 * Always emits `code`, `stderr` (last 2048 characters), `errorClass`,
 * `errorMessage` and `runtime`. When the result carries a typed
 * `error: { message, code, retriable }` object, its fields are surfaced as
 * `typedCode`, `typedMessage` and `retriable`.
 *
 * @param {object|null|undefined} result - LLM call result. A missing result
 *   yields an envelope of null / empty fields instead of throwing.
 * @returns {object} Plain envelope object.
 */
function _buildDocChatErrorEnvelope(result) {
  // Normalise once so every field read below is safe. The typed-error lookup
  // already tolerated a missing result; the plain field reads did not.
  const source = result || {};
  const typed = source.error;
  const envelope = {
    code: source.code ?? null,
    // Keep only the tail: the most recent stderr output is the useful part.
    stderr: String(source.stderr || '').slice(-2048),
    errorClass: source.errorClass || null,
    errorMessage: source.errorMessage || null,
    runtime: source.runtime || null,
  };
  if (typed) {
    envelope.typedCode = typed.code || null;
    envelope.typedMessage = typed.message || null;
    // Retriable unless the typed error explicitly says otherwise.
    envelope.retriable = typed.retriable !== false;
  }
  return envelope;
}
|
|
4023
|
+
|
|
4024
|
+
// W-mpmwxni2000c25c7-b — race a ccDocCall* promise against a wall-clock turn
|
|
4025
|
+
// timer. On expiry, fires `abortFn` (killing the in-flight CLI) and resolves
|
|
4026
|
+
// with a doc-chat-shaped failure payload that flows through the existing
|
|
4027
|
+
// _docChatFailureResponse / SSE error event paths. timeoutMs <= 0 disables
|
|
4028
|
+
// the watchdog (passthrough).
|
|
4029
|
+
/**
 * Race a doc-chat call promise against a wall-clock turn timer.
 *
 * @param {Promise<object>} callPromise - The in-flight doc-chat call.
 * @param {number} timeoutMs - Wall-clock cap in ms; a falsy or non-positive
 *   value disables the watchdog and returns `callPromise` untouched.
 * @param {Function|null|undefined} abortFn - Invoked once on expiry to stop the
 *   in-flight call; anything it throws is swallowed.
 * @param {string} [label] - Prefix for the timeout message (default 'doc-chat').
 * @returns {Promise<object>} The call's own result when it settles first,
 *   otherwise a synthetic failure payload whose `error.code` is
 *   'cc-turn-timeout'. A rejection of `callPromise` that happens before the
 *   timer fires propagates to the caller.
 */
async function _raceCcDocChatTimeout(callPromise, timeoutMs, abortFn, label) {
  if (!timeoutMs || timeoutMs <= 0) return callPromise;

  let timer = null;
  let timedOut = false;
  const timeoutPromise = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      try { if (abortFn) abortFn(); } catch { /* swallow */ }
      resolve(null);
    }, timeoutMs);
    // NOTE: do NOT unref — Node would exit the event loop while awaiting the
    // call promise (Promises don't keep the loop open; timers/I/O do).
  });

  let winner;
  try {
    winner = await Promise.race([callPromise, timeoutPromise]);
  } finally {
    // Always disarm the watchdog, including when callPromise rejects. Otherwise
    // the still-ref'd timer stays armed for the full timeoutMs and later fires
    // abortFn for a turn that has already failed.
    clearTimeout(timer);
  }
  if (!timedOut) return winner;

  // Drain the in-flight call so its cleanup runs before we hand back the
  // synthetic envelope.
  // NOTE(review): this wait is unbounded — it relies on abortFn actually
  // causing callPromise to settle. Confirm every abort path guarantees that.
  await callPromise.catch(() => null);

  const message = `${label || 'doc-chat'} turn timed out after ${timeoutMs}ms`;
  return {
    answer: 'Document chat request timed out — try again.',
    toolUses: [],
    error: {
      code: 'cc-turn-timeout',
      stderr: '',
      errorClass: 'cc-turn-timeout',
      errorMessage: message,
      runtime: null,
      typedCode: 'cc-turn-timeout',
      typedMessage: message,
      retriable: true,
    },
  };
}
|
|
3986
4067
|
|
|
@@ -6729,7 +6810,14 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6729
6810
|
const ccTurnId = 'cct-' + shared.uid();
|
|
6730
6811
|
const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
|
|
6731
6812
|
|
|
6732
|
-
|
|
6813
|
+
// W-mpmwxni2000c25c7-b — wall-clock turn watchdog. The doc-chat call
|
|
6814
|
+
// can internally spawn resume + fresh + final-retry LLM calls; we want
|
|
6815
|
+
// ONE wall-clock cap that covers the whole turn so a runtime stuck
|
|
6816
|
+
// mid-stream can't outlive ccTurnTimeoutMs. On expiry the watchdog
|
|
6817
|
+
// calls _docAbort (kills the in-flight CLI) and the synthesized payload
|
|
6818
|
+
// below flows through the existing _docChatFailureResponse path.
|
|
6819
|
+
const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
|
|
6820
|
+
const _docCallPromise = ccDocCall({
|
|
6733
6821
|
message: body.message, document: currentContent, title: body.title,
|
|
6734
6822
|
filePath: body.filePath, selection: body.selection, canEdit, isJson,
|
|
6735
6823
|
model: body.model || undefined,
|
|
@@ -6739,6 +6827,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6739
6827
|
systemPrompt: turnSystemPrompt,
|
|
6740
6828
|
turnId: ccTurnId,
|
|
6741
6829
|
});
|
|
6830
|
+
const _docCallResult = await _raceCcDocChatTimeout(_docCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat');
|
|
6831
|
+
let { answer, partial, warning, toolUses, error: ccError } = _docCallResult;
|
|
6742
6832
|
const finalize = _finalizeDocChatEdit({
|
|
6743
6833
|
filePath: body.filePath, fullPath, isJson, canEdit,
|
|
6744
6834
|
originalContent: currentContent, delimiterContent: null,
|
|
@@ -6752,6 +6842,25 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6752
6842
|
ccError, partial, warning, toolUses, finalize,
|
|
6753
6843
|
});
|
|
6754
6844
|
_docDone = true;
|
|
6845
|
+
// W-mpmwxni2000c25c7-b — track every surfaced doc-chat error code so
|
|
6846
|
+
// /api/metrics reflects silent-error regressions. Hard failures (no
|
|
6847
|
+
// partial recovery, no edited file) graduate to 5xx so the client can
|
|
6848
|
+
// render a real error UI instead of treating the polite "Failed to
|
|
6849
|
+
// process request" string as a successful turn.
|
|
6850
|
+
if (ccError) {
|
|
6851
|
+
const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
|
|
6852
|
+
llm.trackEngineError('doc-chat', errCode);
|
|
6853
|
+
const isHardFailure = !partial && !(finalize && finalize.edited);
|
|
6854
|
+
if (isHardFailure) {
|
|
6855
|
+
const status = errCode === shared.FAILURE_CLASS.CONFIG_ERROR ? 503 : 502;
|
|
6856
|
+
return jsonReply(res, status, {
|
|
6857
|
+
...payload,
|
|
6858
|
+
error: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
|
|
6859
|
+
code: errCode,
|
|
6860
|
+
retriable: ccError.retriable !== false,
|
|
6861
|
+
});
|
|
6862
|
+
}
|
|
6863
|
+
}
|
|
6755
6864
|
return jsonReply(res, 200, payload);
|
|
6756
6865
|
} finally { _docAbort = null; _docDone = true; docChatInFlight.delete(docKey); }
|
|
6757
6866
|
} catch (e) { return jsonReply(res, e.statusCode || 500, { error: e.message }); }
|
|
@@ -6840,7 +6949,12 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6840
6949
|
const ccTurnId = 'cct-' + shared.uid();
|
|
6841
6950
|
const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
|
|
6842
6951
|
|
|
6843
|
-
|
|
6952
|
+
// W-mpmwxni2000c25c7-b — wall-clock turn watchdog (mirrors the
|
|
6953
|
+
// non-stream handleDocChat path). On expiry _docAbort kills the
|
|
6954
|
+
// in-flight LLM and the synthesized payload below flows through the
|
|
6955
|
+
// SSE done frame the client already expects with `error` set.
|
|
6956
|
+
const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
|
|
6957
|
+
const _docStreamCallPromise = ccDocCallStreaming({
|
|
6844
6958
|
message: body.message, document: currentContent, title: body.title,
|
|
6845
6959
|
filePath: body.filePath, selection: body.selection, canEdit, isJson,
|
|
6846
6960
|
model: body.model || undefined,
|
|
@@ -6853,6 +6967,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6853
6967
|
systemPrompt: turnSystemPrompt,
|
|
6854
6968
|
turnId: ccTurnId,
|
|
6855
6969
|
});
|
|
6970
|
+
const _docStreamResult = await _raceCcDocChatTimeout(_docStreamCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat-stream');
|
|
6971
|
+
let { answer, partial, warning, toolUses, error: ccError } = _docStreamResult;
|
|
6856
6972
|
const finalize = _finalizeDocChatEdit({
|
|
6857
6973
|
filePath: body.filePath, fullPath, isJson, canEdit,
|
|
6858
6974
|
originalContent: currentContent, delimiterContent: null,
|
|
@@ -6865,6 +6981,23 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
6865
6981
|
actionFeedback: null, actionParseError: null,
|
|
6866
6982
|
ccError, partial, warning, toolUses, finalize,
|
|
6867
6983
|
});
|
|
6984
|
+
// W-mpmwxni2000c25c7-b — track surfaced doc-chat error codes for
|
|
6985
|
+
// /api/metrics and emit a named SSE `event: error` frame so the
|
|
6986
|
+
// client can render a typed error instead of treating the polite
|
|
6987
|
+
// fallback string as a normal completion.
|
|
6988
|
+
if (ccError) {
|
|
6989
|
+
const errCode = ccError.typedCode || ccError.errorClass || ccError.code || 'unknown';
|
|
6990
|
+
llm.trackEngineError('doc-chat', errCode);
|
|
6991
|
+
const isHardFailure = !partial && !(finalize && finalize.edited);
|
|
6992
|
+
if (isHardFailure) {
|
|
6993
|
+
const errPayload = {
|
|
6994
|
+
message: ccError.typedMessage || ccError.errorMessage || 'Document chat failed',
|
|
6995
|
+
code: errCode,
|
|
6996
|
+
retriable: ccError.retriable !== false,
|
|
6997
|
+
};
|
|
6998
|
+
try { res.write(`event: error\ndata: ${JSON.stringify(errPayload)}\n\n`); } catch {}
|
|
6999
|
+
}
|
|
7000
|
+
}
|
|
6868
7001
|
const { answer: finalAnswer, ...donePayload } = payload;
|
|
6869
7002
|
writeDocEvent({
|
|
6870
7003
|
type: 'done',
|
|
@@ -7461,21 +7594,40 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7461
7594
|
// confirmation chips in the assistant reply.
|
|
7462
7595
|
const ccTurnId = 'cct-' + shared.uid();
|
|
7463
7596
|
const turnSystemPrompt = renderCcSystemPromptForTurn(ccTurnId);
|
|
7464
|
-
|
|
7465
|
-
|
|
7466
|
-
//
|
|
7467
|
-
|
|
7597
|
+
// W-mpmwxni2000c25c7-b — wall-clock turn watchdog. On expiry the
|
|
7598
|
+
// in-flight LLM call is aborted and ccCall returns a synthetic
|
|
7599
|
+
// envelope with error.code === 'cc-turn-timeout'.
|
|
7600
|
+
const turnTimeoutMs = _resolveCcTurnTimeoutMs();
|
|
7601
|
+
const result = await llm.withCcTurnTimeout({
|
|
7602
|
+
timeoutMs: turnTimeoutMs, label: 'command-center',
|
|
7603
|
+
}, (registerAbort) => ccCall(body.message, {
|
|
7604
|
+
store: 'cc', transcript: body.transcript, systemPrompt: turnSystemPrompt, turnId: ccTurnId,
|
|
7605
|
+
onAbortReady: registerAbort,
|
|
7606
|
+
}));
|
|
7607
|
+
|
|
7608
|
+
// W-mpmwxni2000c25c7-b — typed-error envelope path. Any failure that
|
|
7609
|
+
// produced no usable text is surfaced to the client as 5xx JSON
|
|
7610
|
+
// `{ error, code, retriable }` instead of a polite 200 "I had trouble
|
|
7611
|
+
// processing that" string that silently halves CC retry signal.
|
|
7612
|
+
if (!result.text || result.error) {
|
|
7613
|
+
const errEnvelope = result.error || (result.errorMessage
|
|
7614
|
+
? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
|
|
7615
|
+
: { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
|
|
7616
|
+
llm.trackEngineError('command-center', errEnvelope.code);
|
|
7468
7617
|
const debugInfo = result.code !== 0 ? `(exit code ${result.code})` : '(empty response)';
|
|
7469
7618
|
const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-5).join(' | ');
|
|
7470
|
-
console.error(`[CC] LLM failed after retries ${debugInfo}: ${stderrTail}`);
|
|
7471
|
-
try { shared.log('warn', `CC failed ${debugInfo}: ${stderrTail.slice(0, 300)}`); } catch {}
|
|
7472
|
-
|
|
7473
|
-
const
|
|
7474
|
-
|
|
7475
|
-
|
|
7476
|
-
return jsonReply(res,
|
|
7477
|
-
|
|
7478
|
-
|
|
7619
|
+
console.error(`[CC] LLM failed after retries ${debugInfo} code=${errEnvelope.code}: ${stderrTail}`);
|
|
7620
|
+
try { shared.log('warn', `CC failed ${debugInfo} code=${errEnvelope.code}: ${stderrTail.slice(0, 300)}`); } catch {}
|
|
7621
|
+
// Missing-runtime is a 503 (service config); auth-failure also 503; other classes 502.
|
|
7622
|
+
const status = result.missingRuntime ? 503
|
|
7623
|
+
: errEnvelope.code === 'auth-failure' ? 503
|
|
7624
|
+
: 502;
|
|
7625
|
+
return jsonReply(res, status, {
|
|
7626
|
+
error: errEnvelope.message,
|
|
7627
|
+
code: errEnvelope.code,
|
|
7628
|
+
retriable: !!errEnvelope.retriable,
|
|
7629
|
+
sessionId: ccSession.sessionId || null,
|
|
7630
|
+
...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}),
|
|
7479
7631
|
});
|
|
7480
7632
|
}
|
|
7481
7633
|
|
|
@@ -7496,7 +7648,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7496
7648
|
} finally {
|
|
7497
7649
|
_releaseCCTab(tabId);
|
|
7498
7650
|
}
|
|
7499
|
-
} catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message }); }
|
|
7651
|
+
} catch (e) { _releaseCCTab(tabId); return jsonReply(res, e.statusCode || 500, { error: e.message, code: 'handler-exception', retriable: false }); }
|
|
7500
7652
|
}
|
|
7501
7653
|
|
|
7502
7654
|
/** Build a lightweight input object for SSE tool events — keeps only the fields formatToolSummary needs, with truncated string values. */
|
|
@@ -7618,6 +7770,11 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7618
7770
|
});
|
|
7619
7771
|
} catch (err) {
|
|
7620
7772
|
_emitTimingLog(null, null, Date.now(), 'spawn-failed');
|
|
7773
|
+
// W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
|
|
7774
|
+
// (`code`, `retriable`) onto the envelope so the SSE writer can
|
|
7775
|
+
// render a structured error event instead of grepping the stderr
|
|
7776
|
+
// string. Pool stamps `.code` (worker-spawn-failed or
|
|
7777
|
+
// acp-handshake-failed) on every getSession rejection.
|
|
7621
7778
|
return resolveResult({
|
|
7622
7779
|
text: '',
|
|
7623
7780
|
sessionId: null,
|
|
@@ -7625,6 +7782,8 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7625
7782
|
usage: {},
|
|
7626
7783
|
raw: '',
|
|
7627
7784
|
stderr: String((err && err.message) || err || 'cc-worker-pool spawn failed'),
|
|
7785
|
+
errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
7786
|
+
errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
|
|
7628
7787
|
});
|
|
7629
7788
|
}
|
|
7630
7789
|
const _tSessionReady = Date.now();
|
|
@@ -7671,13 +7830,29 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
7671
7830
|
},
|
|
7672
7831
|
onError: (err) => {
|
|
7673
7832
|
_emitTimingLog(_lifecycle, _tSessionReady, Date.now(), cancelled ? 'cancelled' : 'error');
|
|
7833
|
+
if (cancelled) {
|
|
7834
|
+
resolveResult({
|
|
7835
|
+
text: accumulated,
|
|
7836
|
+
sessionId: sessionHandle.sessionId,
|
|
7837
|
+
code: 0,
|
|
7838
|
+
usage: {},
|
|
7839
|
+
raw: accumulated,
|
|
7840
|
+
stderr: '',
|
|
7841
|
+
});
|
|
7842
|
+
return;
|
|
7843
|
+
}
|
|
7844
|
+
// W-mpmwxni2000c25c7-c — pipe the pool's typed-error fields
|
|
7845
|
+
// through. mid-stream worker exits stamp `.code = 'worker-died'`
|
|
7846
|
+
// on the Error before invoking onError.
|
|
7674
7847
|
resolveResult({
|
|
7675
7848
|
text: accumulated,
|
|
7676
7849
|
sessionId: sessionHandle.sessionId,
|
|
7677
|
-
code:
|
|
7850
|
+
code: 1,
|
|
7678
7851
|
usage: {},
|
|
7679
7852
|
raw: accumulated,
|
|
7680
7853
|
stderr: String((err && err.message) || err || 'cc-worker-pool stream error'),
|
|
7854
|
+
errorCode: (err && err.code) || ccWorkerPool.ERROR_CODES.WORKER_DIED,
|
|
7855
|
+
errorRetriable: (err && err.retriable !== undefined) ? err.retriable : true,
|
|
7681
7856
|
});
|
|
7682
7857
|
},
|
|
7683
7858
|
});
|
|
@@ -8032,73 +8207,100 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
8032
8207
|
: '';
|
|
8033
8208
|
const prompt = _joinCcPromptParts(preamble, resumeGuard, carryover, turnHeader, projectContextPart, body.message);
|
|
8034
8209
|
|
|
8035
|
-
const { trackEngineUsage: trackUsage } = require('./engine/llm');
|
|
8210
|
+
const { trackEngineUsage: trackUsage, trackEngineError: trackErr, withCcTurnTimeout: withTimeout } = require('./engine/llm');
|
|
8036
8211
|
const streamModel = CONFIG.engine?.ccModel || shared.ENGINE_DEFAULTS.ccModel;
|
|
8037
8212
|
const streamEffort = CONFIG.engine?.ccEffort || shared.ENGINE_DEFAULTS.ccEffort;
|
|
8038
8213
|
const ccMaxTurns = CONFIG.engine?.ccMaxTurns || shared.ENGINE_DEFAULTS.ccMaxTurns;
|
|
8039
8214
|
let toolUses = [];
|
|
8040
|
-
|
|
8041
|
-
|
|
8042
|
-
|
|
8043
|
-
|
|
8044
|
-
|
|
8045
|
-
|
|
8046
|
-
|
|
8047
|
-
|
|
8048
|
-
|
|
8049
|
-
|
|
8050
|
-
|
|
8051
|
-
|
|
8052
|
-
|
|
8053
|
-
if (result.missingRuntime) {
|
|
8054
|
-
finishMissingRuntime(result, liveState);
|
|
8055
|
-
return;
|
|
8056
|
-
}
|
|
8057
|
-
|
|
8058
|
-
// Handle failure — non-zero exit with text = max_turns or partial success, still usable
|
|
8059
|
-
if (!result.text && wasResume && result.code !== 0 && !req.destroyed) {
|
|
8060
|
-
// Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
|
|
8061
|
-
console.log(`[CC-stream] Resume failed (code=${result.code}) — retrying fresh`);
|
|
8062
|
-
const freshPreamble = buildCCStatePreamble();
|
|
8063
|
-
const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
|
|
8064
|
-
const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
|
|
8065
|
-
toolUses = []; // discard stale metadata from the failed resume attempt
|
|
8066
|
-
const retryPromise = _invokeCcStream({
|
|
8067
|
-
prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
|
|
8215
|
+
// W-mpmwxni2000c25c7-b — turn-level watchdog. Wraps the initial
|
|
8216
|
+
// _invokeCcStream PLUS the post-resume-fail retry so the wall clock
|
|
8217
|
+
// covers the entire CC turn (not just one underlying LLM call). On
|
|
8218
|
+
// expiry, whichever call is in flight is aborted; the watchdog
|
|
8219
|
+
// resolves with a synthetic `{ error: { code: 'cc-turn-timeout' } }`
|
|
8220
|
+
// envelope so the SSE error path below kicks in.
|
|
8221
|
+
const turnTimeoutMs = _resolveCcTurnTimeoutMs();
|
|
8222
|
+
const result = await withTimeout({
|
|
8223
|
+
timeoutMs: turnTimeoutMs, label: 'command-center-stream',
|
|
8224
|
+
}, async (registerAbort) => {
|
|
8225
|
+
const llmPromise = _invokeCcStream({
|
|
8226
|
+
prompt, sessionId, liveState, toolUses,
|
|
8068
8227
|
model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
|
|
8069
8228
|
engineConfig: CONFIG.engine,
|
|
8070
8229
|
systemPrompt: turnSystemPrompt,
|
|
8071
8230
|
tabId,
|
|
8072
8231
|
});
|
|
8073
|
-
_ccStreamAbort =
|
|
8232
|
+
_ccStreamAbort = llmPromise.abort;
|
|
8074
8233
|
liveState.abortFn = _ccStreamAbort;
|
|
8075
8234
|
ccInFlightAborts.set(tabId, _ccStreamAbort);
|
|
8076
|
-
|
|
8077
|
-
|
|
8078
|
-
|
|
8079
|
-
|
|
8080
|
-
|
|
8235
|
+
registerAbort(_ccStreamAbort);
|
|
8236
|
+
const initial = await llmPromise;
|
|
8237
|
+
trackUsage('command-center', initial.usage);
|
|
8238
|
+
|
|
8239
|
+
if (initial.missingRuntime) return initial;
|
|
8240
|
+
|
|
8241
|
+
// Handle failure — non-zero exit with text = max_turns or partial success, still usable
|
|
8242
|
+
if (!initial.text && wasResume && initial.code !== 0 && !req.destroyed) {
|
|
8243
|
+
// Resume failed (stale/expired session) — auto-retry as fresh session (skip if client already disconnected)
|
|
8244
|
+
console.log(`[CC-stream] Resume failed (code=${initial.code}) — retrying fresh`);
|
|
8245
|
+
const freshPreamble = buildCCStatePreamble();
|
|
8246
|
+
const freshCarryover = _buildTranscriptCarryover(body.transcript, { currentMessage: body.message });
|
|
8247
|
+
const freshPrompt = _joinCcPromptParts(freshPreamble, freshCarryover, turnHeader, projectContextPart, body.message);
|
|
8248
|
+
toolUses = []; // discard stale metadata from the failed resume attempt
|
|
8249
|
+
const retryPromise = _invokeCcStream({
|
|
8250
|
+
prompt: freshPrompt, sessionId: undefined, liveState, toolUses,
|
|
8251
|
+
model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
|
|
8252
|
+
engineConfig: CONFIG.engine,
|
|
8253
|
+
systemPrompt: turnSystemPrompt,
|
|
8254
|
+
tabId,
|
|
8255
|
+
});
|
|
8256
|
+
_ccStreamAbort = retryPromise.abort;
|
|
8257
|
+
liveState.abortFn = _ccStreamAbort;
|
|
8258
|
+
ccInFlightAborts.set(tabId, _ccStreamAbort);
|
|
8259
|
+
registerAbort(_ccStreamAbort);
|
|
8260
|
+
const retryResult = await retryPromise;
|
|
8261
|
+
trackUsage('command-center', retryResult.usage);
|
|
8262
|
+
if (retryResult.text) {
|
|
8263
|
+
// Fresh session succeeded — use retryResult from here
|
|
8264
|
+
Object.assign(initial, retryResult);
|
|
8265
|
+
// Clear the error envelope inherited from the failed first attempt
|
|
8266
|
+
// so the success path below doesn't misclassify a recovered turn.
|
|
8267
|
+
if (retryResult.text) { initial.error = null; initial.ok = true; }
|
|
8268
|
+
} else if (retryResult.error) {
|
|
8269
|
+
initial.error = retryResult.error;
|
|
8270
|
+
}
|
|
8081
8271
|
}
|
|
8082
|
-
|
|
8272
|
+
return initial;
|
|
8273
|
+
});
|
|
8083
8274
|
if (result.missingRuntime) {
|
|
8084
8275
|
finishMissingRuntime(result, liveState);
|
|
8085
8276
|
return;
|
|
8086
8277
|
}
|
|
8087
|
-
if (!result.text) {
|
|
8278
|
+
if (!result.text || result.error) {
|
|
8088
8279
|
if (req.destroyed) {
|
|
8089
8280
|
_ccStreamEnded = true;
|
|
8090
8281
|
_logCcStreamEnd(_ccTelemetry, 'llm-empty-client-gone', { code: result.code });
|
|
8091
8282
|
return;
|
|
8092
8283
|
}
|
|
8093
|
-
|
|
8284
|
+
// W-mpmwxni2000c25c7-b — surface the typed error envelope as a
|
|
8285
|
+
// distinct SSE `event: error` frame so the client renders a real
|
|
8286
|
+
// error UI (with a retry hint derived from `retriable`) instead of
|
|
8287
|
+
// swallowing a polite 200 "I had trouble processing that" string.
|
|
8288
|
+
const envelope = result.error || (result.errorMessage
|
|
8289
|
+
? { message: result.errorMessage, code: result.errorClass || 'unknown', retriable: true }
|
|
8290
|
+
: { message: 'Command Center returned no output', code: 'empty-output', retriable: true });
|
|
8291
|
+
trackErr('command-center', envelope.code);
|
|
8094
8292
|
const stderrTail = (result.stderr || '').trim().split('\n').filter(Boolean).slice(-3).join(' | ');
|
|
8095
|
-
console.error(`[CC-stream] Failed
|
|
8096
|
-
|
|
8097
|
-
|
|
8293
|
+
console.error(`[CC-stream] Failed code=${envelope.code} retriable=${envelope.retriable}: ${(result.stderr || '').slice(0, 500)}; stdout_tail=${(result.raw || '').slice(-500)}`);
|
|
8294
|
+
// Emit `event: error` (named SSE frame), then a `done`-style frame
|
|
8295
|
+
// for clients that only handle the default message channel, then
|
|
8296
|
+
// close cleanly so the EventSource exits its read loop without
|
|
8297
|
+
// throwing a connection-reset.
|
|
8298
|
+
try { res.write(`event: error\ndata: ${JSON.stringify({ message: envelope.message, code: envelope.code, retriable: !!envelope.retriable, ...(stderrTail ? { stderr: stderrTail.slice(0, 500) } : {}) })}\n\n`); } catch {}
|
|
8299
|
+
liveState.donePayload = { type: 'error', error: envelope.message, code: envelope.code, retriable: !!envelope.retriable, sessionId: null };
|
|
8098
8300
|
if (liveState.writer) liveState.writer(liveState.donePayload);
|
|
8099
8301
|
if (liveState.endResponse) liveState.endResponse();
|
|
8100
8302
|
_scheduleCcLiveCleanup(tabId);
|
|
8101
|
-
_logCcStreamEnd(_ccTelemetry, 'llm-failed-
|
|
8303
|
+
_logCcStreamEnd(_ccTelemetry, 'llm-failed-error-envelope-sent', { code: result.code, errorCode: envelope.code });
|
|
8102
8304
|
return;
|
|
8103
8305
|
}
|
|
8104
8306
|
|
|
@@ -8670,7 +8872,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
8670
8872
|
if (_isClear(e.defaultModel)) _deleteEngineConfig('defaultModel');
|
|
8671
8873
|
else {
|
|
8672
8874
|
const candidate = String(e.defaultModel);
|
|
8673
|
-
const resolvedCli = config.engine.defaultCli || '
|
|
8875
|
+
const resolvedCli = config.engine.defaultCli || 'copilot';
|
|
8674
8876
|
const rejection = await _validateFleetModel(candidate, resolvedCli);
|
|
8675
8877
|
if (rejection) _clamped.push(`engine.defaultModel: "${candidate}" ${rejection} — kept previous value`);
|
|
8676
8878
|
else _setEngineConfig('defaultModel', candidate);
|
|
@@ -8680,7 +8882,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
8680
8882
|
if (_isClear(e.ccModel)) _deleteEngineConfig('ccModel');
|
|
8681
8883
|
else {
|
|
8682
8884
|
const candidate = String(e.ccModel);
|
|
8683
|
-
const resolvedCli = config.engine.ccCli || config.engine.defaultCli || '
|
|
8885
|
+
const resolvedCli = config.engine.ccCli || config.engine.defaultCli || 'copilot';
|
|
8684
8886
|
const rejection = await _validateFleetModel(candidate, resolvedCli);
|
|
8685
8887
|
if (rejection) _clamped.push(`engine.ccModel: "${candidate}" ${rejection} — kept previous value`);
|
|
8686
8888
|
else _setEngineConfig('ccModel', candidate);
|
|
@@ -8798,7 +9000,7 @@ What would you like to discuss or change? When you're happy, say "approve" and I
|
|
|
8798
9000
|
if (updates.model === '' || updates.model === null) delete config.agents[id].model;
|
|
8799
9001
|
else {
|
|
8800
9002
|
const candidate = String(updates.model);
|
|
8801
|
-
const resolvedCli = config.agents[id].cli || config.engine.defaultCli || '
|
|
9003
|
+
const resolvedCli = config.agents[id].cli || config.engine.defaultCli || 'copilot';
|
|
8802
9004
|
const runtimeModelStr = _resolveModelForRuntime(candidate, resolvedCli);
|
|
8803
9005
|
const knownModels = await _modelsFor(resolvedCli);
|
|
8804
9006
|
// Two validation paths:
|
package/docs/runtime-adapters.md
CHANGED
|
@@ -14,7 +14,12 @@ behavior is hidden behind an adapter object resolved through `resolveRuntime()`.
|
|
|
14
14
|
|
|
15
15
|
`resolveRuntime(name)` throws when `name` is unknown so misconfigurations surface
|
|
16
16
|
at dispatch time instead of producing silent fallbacks deep inside spawn logic.
|
|
17
|
-
|
|
17
|
+
When `name` is `null`/omitted, `resolveRuntime()` falls back to `'claude'` for
|
|
18
|
+
parser-routing compatibility (Copilot's `parseOutput` cannot consume the Claude
|
|
19
|
+
JSONL `{type:"result",result:"..."}` shape — see W-mpmwxkk40007c995). The fleet
|
|
20
|
+
default that determines which runtime *new spawns* use is separate:
|
|
21
|
+
`ENGINE_DEFAULTS.defaultCli` (also in W-mpmwxkk40007c995) is now `'copilot'`, so
|
|
22
|
+
operators with no explicit `engine.defaultCli` get Copilot on dispatch.
|
|
18
23
|
|
|
19
24
|
## Adapter Interface
|
|
20
25
|
|
|
@@ -93,8 +98,8 @@ directly.
|
|
|
93
98
|
|
|
94
99
|
| Helper | Chain |
|
|
95
100
|
|--------|-------|
|
|
96
|
-
| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'
|
|
97
|
-
| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'
|
|
101
|
+
| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'copilot'` |
|
|
102
|
+
| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'copilot'` |
|
|
98
103
|
| `resolveAgentModel(agent, engine)` | `agent.model` → `engine.defaultModel` → undefined |
|
|
99
104
|
| `resolveCcModel(engine)` | `engine.ccModel` → `engine.defaultModel` → undefined |
|
|
100
105
|
| `resolveAgentMaxBudget(agent, engine)` | `agent.maxBudgetUsd` → `engine.maxBudgetUsd`. Honors literal `0`. |
|
package/engine/cc-worker-pool.js
CHANGED
|
@@ -54,6 +54,45 @@
|
|
|
54
54
|
const { spawn } = require('child_process');
|
|
55
55
|
const crypto = require('crypto');
|
|
56
56
|
|
|
57
|
+
// W-mpmwxni2000c25c7-c — typed error codes the pool emits through every
|
|
58
|
+
// failure exit so the consumer (CC streaming handler / doc-chat pool
|
|
59
|
+
// wrapper / SSE writer) can render a structured error envelope instead of
|
|
60
|
+
// parsing the stderr string. Matches the `{ message, code, retriable }`
|
|
61
|
+
// shape sub-item b standardized on for the dashboard's SSE envelope and
|
|
62
|
+
// the runtime adapter parseError() contract (engine/runtimes/*.js).
|
|
63
|
+
const ERROR_CODES = Object.freeze({
|
|
64
|
+
// spawn() threw synchronously OR the child process emitted an 'error'
|
|
65
|
+
// event (binary missing on PATH, exec failed, EPERM, etc.). Retriable
|
|
66
|
+
// because a transient PATH / fs glitch may recover.
|
|
67
|
+
WORKER_SPAWN_FAILED: 'worker-spawn-failed',
|
|
68
|
+
// The worker process exited DURING the ACP handshake (initialize or
|
|
69
|
+
// session/new) — usually `copilot login` is incomplete or the CLI
|
|
70
|
+
// version is too old. Also fires when session/new returns no
|
|
71
|
+
// sessionId. Retriable: the engine swaps to a fallback model / a re-auth
|
|
72
|
+
// may unblock the next attempt.
|
|
73
|
+
ACP_HANDSHAKE_FAILED: 'acp-handshake-failed',
|
|
74
|
+
// The worker process exited AFTER a successful handshake (the daemon
|
|
75
|
+
// died mid-turn). Retriable — the next call cold-spawns a fresh worker.
|
|
76
|
+
WORKER_DIED: 'worker-died',
|
|
77
|
+
// The consumer's per-turn timeout fired before the ACP session/prompt
|
|
78
|
+
// resolved. Owned by the dashboard pool wrappers (cc-worker-pool itself
|
|
79
|
+
// has no turn timeout) but exported here so all callers stringify the
|
|
80
|
+
// same constant. Retriable — most timeouts are transient.
|
|
81
|
+
CC_TURN_TIMEOUT: 'cc-turn-timeout',
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
// Build a typed Error carrying the `{ message, code, retriable }` envelope
|
|
85
|
+
// fields the consumer expects. Plain Errors flow through unchanged; the
|
|
86
|
+
// helper only stamps the extra metadata. Keep retriable defaulting to
|
|
87
|
+
// `true` so a caller that forgets to set it still gets the safe default
|
|
88
|
+
// (the legacy pre-typed-error code path treated every failure as retriable).
|
|
89
|
+
function _typedError(message, code, retriable = true) {
|
|
90
|
+
const err = new Error(message);
|
|
91
|
+
err.code = code;
|
|
92
|
+
err.retriable = retriable;
|
|
93
|
+
return err;
|
|
94
|
+
}
|
|
95
|
+
|
|
57
96
|
// 10 minutes — matches the work-item spec.
|
|
58
97
|
const IDLE_REAPER_MS = 10 * 60 * 1000;
|
|
59
98
|
// Reaper sweep cadence. Not exposed as ENGINE_DEFAULTS to keep the pool
|
|
@@ -176,8 +215,13 @@ class Worker {
|
|
|
176
215
|
try {
|
|
177
216
|
proc = _internals.spawnAcp({ cwd: this.cwd });
|
|
178
217
|
} catch (err) {
|
|
179
|
-
|
|
180
|
-
|
|
218
|
+
// spawn() threw synchronously — typically ENOENT (copilot binary not
|
|
219
|
+
// on PATH) or EACCES. Surface as worker-spawn-failed so the consumer
|
|
220
|
+
// can show "install the CLI / fix PATH" guidance.
|
|
221
|
+
throw _typedError(
|
|
222
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
|
|
223
|
+
ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
224
|
+
true
|
|
181
225
|
);
|
|
182
226
|
}
|
|
183
227
|
this.proc = proc;
|
|
@@ -193,8 +237,13 @@ class Worker {
|
|
|
193
237
|
const earlyExitPromise = new Promise((_, reject) => {
|
|
194
238
|
earlyExitReject = (code) => {
|
|
195
239
|
this.killed = true;
|
|
196
|
-
|
|
197
|
-
|
|
240
|
+
// Early exit DURING the handshake = acp-handshake-failed (almost
|
|
241
|
+
// always missing `copilot login`, stale CLI, or daemon crash on
|
|
242
|
+
// boot). Retriable so re-auth or a CLI upgrade can recover.
|
|
243
|
+
const err = _typedError(
|
|
244
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`,
|
|
245
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
246
|
+
true
|
|
198
247
|
);
|
|
199
248
|
this.spawnError = err;
|
|
200
249
|
this._failAllPending(err);
|
|
@@ -205,8 +254,13 @@ class Worker {
|
|
|
205
254
|
proc.once('exit', earlyExitHandler);
|
|
206
255
|
|
|
207
256
|
const errorHandler = (err) => {
|
|
208
|
-
|
|
209
|
-
|
|
257
|
+
// proc 'error' event fires when the OS can't actually start the child
|
|
258
|
+
// (ENOENT after a successful spawn() call, etc.). Treat as a spawn
|
|
259
|
+
// failure even though we made it past the synchronous spawn() above.
|
|
260
|
+
const wrapped = _typedError(
|
|
261
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
|
|
262
|
+
ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
263
|
+
true
|
|
210
264
|
);
|
|
211
265
|
this.spawnError = wrapped;
|
|
212
266
|
this.killed = true;
|
|
@@ -227,7 +281,13 @@ class Worker {
|
|
|
227
281
|
]);
|
|
228
282
|
this.sessionId = result && result.sessionId;
|
|
229
283
|
if (!this.sessionId) {
|
|
230
|
-
|
|
284
|
+
// Handshake completed without an error but the daemon didn't hand
|
|
285
|
+
// back a sessionId — protocol violation or partial init failure.
|
|
286
|
+
throw _typedError(
|
|
287
|
+
'copilot --acp failed -- session/new returned no sessionId',
|
|
288
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
289
|
+
true
|
|
290
|
+
);
|
|
231
291
|
}
|
|
232
292
|
} finally {
|
|
233
293
|
// Either the handshake finished (swap to a persistent exit handler that
|
|
@@ -236,7 +296,13 @@ class Worker {
|
|
|
236
296
|
}
|
|
237
297
|
proc.on('exit', () => {
|
|
238
298
|
this.killed = true;
|
|
239
|
-
|
|
299
|
+
// Post-handshake exit = the daemon died mid-conversation. Retriable
|
|
300
|
+
// because the next call will cold-spawn a fresh worker.
|
|
301
|
+
const err = _typedError(
|
|
302
|
+
'copilot --acp process exited',
|
|
303
|
+
ERROR_CODES.WORKER_DIED,
|
|
304
|
+
true
|
|
305
|
+
);
|
|
240
306
|
this._failAllPending(err);
|
|
241
307
|
// Settle inflight too if it's still hanging
|
|
242
308
|
if (this.inflight && !this.inflight.settled) {
|
|
@@ -656,9 +722,13 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
|
|
|
656
722
|
// This is the bug class the ab141995 fix closed; if it ever recurs the
|
|
657
723
|
// engine should fail loudly rather than hand back a half-initialized
|
|
658
724
|
// handle. Throwing here lets the dashboard surface spawn-failed instead
|
|
659
|
-
// of the silent thinking-dots-forever symptom.
|
|
660
|
-
|
|
661
|
-
|
|
725
|
+
// of the silent thinking-dots-forever symptom. Mark non-retriable —
|
|
726
|
+
// this is a real engine bug, not a transient pool failure; the next
|
|
727
|
+
// attempt would hit the same race.
|
|
728
|
+
throw _typedError(
|
|
729
|
+
`cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`,
|
|
730
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
731
|
+
false
|
|
662
732
|
);
|
|
663
733
|
}
|
|
664
734
|
|
|
@@ -766,4 +836,10 @@ module.exports = {
|
|
|
766
836
|
IDLE_REAPER_MS,
|
|
767
837
|
REAPER_INTERVAL_MS,
|
|
768
838
|
WARM_MAX_CONCURRENT,
|
|
839
|
+
// W-mpmwxni2000c25c7-c — typed-error envelope contract. Exported so the
|
|
840
|
+
// dashboard pool wrappers (and their tests) reference the same string
|
|
841
|
+
// constants and so the doc-chat timeout path can stamp the same
|
|
842
|
+
// `{ message, code, retriable }` shape the pool itself emits.
|
|
843
|
+
ERROR_CODES,
|
|
844
|
+
_typedError,
|
|
769
845
|
};
|
package/engine/llm.js
CHANGED
|
@@ -82,6 +82,21 @@ function trackEngineUsage(category, usage) {
|
|
|
82
82
|
_ensureFlushTimer();
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
+
// W-mpmwxni2000c25c7-b — silent-error regression counter. Every CC/doc-chat
|
|
86
|
+
// error surfaced through the handlers bumps `_engine[category].errorsByCode[code]`
|
|
87
|
+
// so /api/metrics reflects new error codes (cc-turn-timeout, empty-output, …)
|
|
88
|
+
// without polluting cost/tokens. Counters flush on the same timer as
|
|
89
|
+
// trackEngineUsage so the dashboard's fast-state mtime gate isn't bypassed.
|
|
90
|
+
function trackEngineError(category, errorCode) {
|
|
91
|
+
if (!category || !errorCode) return;
|
|
92
|
+
if (category.startsWith('_test') || category.startsWith('test-')) return;
|
|
93
|
+
if (!_pendingMetrics.engine[category]) _pendingMetrics.engine[category] = _emptyEngineDelta();
|
|
94
|
+
const cat = _pendingMetrics.engine[category];
|
|
95
|
+
if (!cat.errorsByCode) cat.errorsByCode = Object.create(null);
|
|
96
|
+
cat.errorsByCode[errorCode] = (cat.errorsByCode[errorCode] || 0) + 1;
|
|
97
|
+
_ensureFlushTimer();
|
|
98
|
+
}
|
|
99
|
+
|
|
85
100
|
function flushMetricsBuffer() {
|
|
86
101
|
const pending = _pendingMetrics;
|
|
87
102
|
if (!Object.keys(pending.engine).length && !Object.keys(pending.daily).length) return;
|
|
@@ -106,6 +121,12 @@ function flushMetricsBuffer() {
|
|
|
106
121
|
cat.totalDurationMs = (cat.totalDurationMs || 0) + delta.totalDurationMs;
|
|
107
122
|
cat.timedCalls = (cat.timedCalls || 0) + delta.timedCalls;
|
|
108
123
|
}
|
|
124
|
+
if (delta.errorsByCode) {
|
|
125
|
+
if (!cat.errorsByCode) cat.errorsByCode = {};
|
|
126
|
+
for (const [code, count] of Object.entries(delta.errorsByCode)) {
|
|
127
|
+
cat.errorsByCode[code] = (cat.errorsByCode[code] || 0) + count;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
109
130
|
}
|
|
110
131
|
if (!metrics._daily) metrics._daily = {};
|
|
111
132
|
for (const [day, delta] of Object.entries(pending.daily)) {
|
|
@@ -129,6 +150,12 @@ function flushMetricsBuffer() {
|
|
|
129
150
|
c.inputTokens += delta.inputTokens; c.outputTokens += delta.outputTokens;
|
|
130
151
|
c.cacheRead += delta.cacheRead; c.cacheCreation += delta.cacheCreation;
|
|
131
152
|
c.totalDurationMs += delta.totalDurationMs; c.timedCalls += delta.timedCalls;
|
|
153
|
+
if (delta.errorsByCode) {
|
|
154
|
+
if (!c.errorsByCode) c.errorsByCode = Object.create(null);
|
|
155
|
+
for (const [code, count] of Object.entries(delta.errorsByCode)) {
|
|
156
|
+
c.errorsByCode[code] = (c.errorsByCode[code] || 0) + count;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
132
159
|
}
|
|
133
160
|
for (const [day, delta] of Object.entries(pending.daily)) {
|
|
134
161
|
if (!_pendingMetrics.daily[day]) _pendingMetrics.daily[day] = _emptyDailyDelta();
|
|
@@ -233,6 +260,8 @@ function _missingRuntimeResult(runtimeName, runtime, reason) {
|
|
|
233
260
|
errorClass: shared.FAILURE_CLASS.CONFIG_ERROR,
|
|
234
261
|
errorMessage: message,
|
|
235
262
|
missingRuntime: true,
|
|
263
|
+
error: { message, code: shared.FAILURE_CLASS.CONFIG_ERROR, retriable: false },
|
|
264
|
+
ok: false,
|
|
236
265
|
};
|
|
237
266
|
}
|
|
238
267
|
|
|
@@ -245,7 +274,7 @@ function _resolvedCallResult(result) {
|
|
|
245
274
|
function _resolveRuntimeNameFor(callOpts = {}) {
|
|
246
275
|
let runtimeName = callOpts.cli;
|
|
247
276
|
if (!runtimeName && callOpts.engineConfig) runtimeName = resolveCcCli(callOpts.engineConfig);
|
|
248
|
-
return runtimeName || '
|
|
277
|
+
return runtimeName || 'copilot';
|
|
249
278
|
}
|
|
250
279
|
|
|
251
280
|
function _runtimeUnavailableResult(callOpts = {}) {
|
|
@@ -566,7 +595,7 @@ function _createStreamAccumulator({
|
|
|
566
595
|
|
|
567
596
|
function _resolveRuntimeFor(callOpts) {
|
|
568
597
|
// Explicit `cli` opt wins; otherwise fall to `engineConfig` resolution;
|
|
569
|
-
// otherwise default to
|
|
598
|
+
// otherwise default to copilot (fleet default as of W-mpmwxkk40007c995).
|
|
570
599
|
return resolveRuntime(_resolveRuntimeNameFor(callOpts));
|
|
571
600
|
}
|
|
572
601
|
|
|
@@ -599,6 +628,52 @@ function _resolveRuntimeFeatureOpts({
|
|
|
599
628
|
|
|
600
629
|
// ─── Core LLM Call ───────────────────────────────────────────────────────────
|
|
601
630
|
|
|
631
|
+
// W-mpmwxni2000c25c7-b — typed-error envelope helper. callLLM /
|
|
632
|
+
// callLLMStreaming attach `error: { message, code, retriable }` to every
|
|
633
|
+
// failure resolution so dashboard CC/doc-chat handlers can surface a
|
|
634
|
+
// structured 5xx JSON or SSE `event: error` instead of returning an empty
|
|
635
|
+
// reply that hangs the UI. The shape mirrors the existing `runtime.parseError`
|
|
636
|
+
// contract from sub-item (a) so adapter classifications (auth-failure,
|
|
637
|
+
// context-limit, budget-exceeded, crash, model-unavailable) propagate
|
|
638
|
+
// verbatim. Engine codes added here:
|
|
639
|
+
// - 'spawn-error' runFile/proc.on('error') failure (binary missing,
|
|
640
|
+
// EACCES, fork bomb, ...)
|
|
641
|
+
// - 'runtime-exit' non-zero exit code with no parseError signal
|
|
642
|
+
// - 'empty-output' zero exit but no parsed text — runtime returned
|
|
643
|
+
// nothing useful (CLI bug or silent timeout)
|
|
644
|
+
// - 'unparseable-output' bytes streamed but accumulator extracted no text
|
|
645
|
+
// (malformed JSONL or unknown event shape)
|
|
646
|
+
//
|
|
647
|
+
// Existing `errorClass` / `errorMessage` fields stay populated for callers
|
|
648
|
+
// that haven't moved to the typed envelope yet.
|
|
649
|
+
function _buildErrorEnvelope(errInfo, code, parsed, fallback) {
|
|
650
|
+
if (errInfo && errInfo.code) {
|
|
651
|
+
return { message: errInfo.message || fallback || 'LLM call failed', code: errInfo.code, retriable: errInfo.retriable !== false };
|
|
652
|
+
}
|
|
653
|
+
if (code !== 0 && code !== null) {
|
|
654
|
+
const stderrTail = parsed && parsed.stderr ? String(parsed.stderr).trim().split('\n').slice(-3).join(' | ').slice(0, 500) : '';
|
|
655
|
+
return {
|
|
656
|
+
message: stderrTail ? `Runtime exited with code ${code}: ${stderrTail}` : `Runtime exited with code ${code}`,
|
|
657
|
+
code: 'runtime-exit',
|
|
658
|
+
retriable: true,
|
|
659
|
+
};
|
|
660
|
+
}
|
|
661
|
+
if (parsed && parsed.text) return null;
|
|
662
|
+
const rawLen = parsed && parsed.raw ? String(parsed.raw).length : 0;
|
|
663
|
+
if (rawLen > 0) {
|
|
664
|
+
return {
|
|
665
|
+
message: 'Runtime produced output the adapter could not parse',
|
|
666
|
+
code: 'unparseable-output',
|
|
667
|
+
retriable: true,
|
|
668
|
+
};
|
|
669
|
+
}
|
|
670
|
+
return {
|
|
671
|
+
message: fallback || 'Runtime returned no output',
|
|
672
|
+
code: 'empty-output',
|
|
673
|
+
retriable: true,
|
|
674
|
+
};
|
|
675
|
+
}
|
|
676
|
+
|
|
602
677
|
function callLLM(promptText, sysPromptText, opts = {}) {
|
|
603
678
|
const {
|
|
604
679
|
timeout = 120000, label = 'llm', maxTurns = 1, allowedTools = '',
|
|
@@ -670,6 +745,7 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
670
745
|
const errInfo = code !== 0
|
|
671
746
|
? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
|
|
672
747
|
: { message: '', code: null, retriable: true };
|
|
748
|
+
const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
|
|
673
749
|
resolve({
|
|
674
750
|
text: parsed.text || '',
|
|
675
751
|
usage,
|
|
@@ -681,6 +757,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
681
757
|
runtime: runtime.name,
|
|
682
758
|
errorClass: errInfo.code,
|
|
683
759
|
errorMessage: errInfo.message || null,
|
|
760
|
+
error: errorEnvelope,
|
|
761
|
+
ok: !errorEnvelope,
|
|
684
762
|
});
|
|
685
763
|
};
|
|
686
764
|
|
|
@@ -704,6 +782,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
704
782
|
text: '', usage: null, sessionId: null, code: 1,
|
|
705
783
|
stderr: err.message, raw: '', toolUses: [],
|
|
706
784
|
runtime: runtime.name, errorClass: null, errorMessage: null,
|
|
785
|
+
error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
|
|
786
|
+
ok: false,
|
|
707
787
|
});
|
|
708
788
|
});
|
|
709
789
|
});
|
|
@@ -784,6 +864,7 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
784
864
|
const errInfo = code !== 0
|
|
785
865
|
? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
|
|
786
866
|
: { message: '', code: null, retriable: true };
|
|
867
|
+
const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
|
|
787
868
|
resolve({
|
|
788
869
|
text: parsed.text || '',
|
|
789
870
|
usage,
|
|
@@ -795,6 +876,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
795
876
|
runtime: runtime.name,
|
|
796
877
|
errorClass: errInfo.code,
|
|
797
878
|
errorMessage: errInfo.message || null,
|
|
879
|
+
error: errorEnvelope,
|
|
880
|
+
ok: !errorEnvelope,
|
|
798
881
|
});
|
|
799
882
|
};
|
|
800
883
|
|
|
@@ -818,6 +901,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
818
901
|
text: '', usage: null, sessionId: null, code: 1,
|
|
819
902
|
stderr: err.message, raw: '', toolUses: [],
|
|
820
903
|
runtime: runtime.name, errorClass: null, errorMessage: null,
|
|
904
|
+
error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
|
|
905
|
+
ok: false,
|
|
821
906
|
});
|
|
822
907
|
});
|
|
823
908
|
});
|
|
@@ -825,13 +910,74 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
825
910
|
return promise;
|
|
826
911
|
}
|
|
827
912
|
|
|
913
|
+
// ─── CC turn watchdog ────────────────────────────────────────────────────────
|
|
914
|
+
//
|
|
915
|
+
// W-mpmwxni2000c25c7-b — wall-clock cap for a single CC/doc-chat turn. CC turns
|
|
916
|
+
// are a higher-level concept than the per-LLM-call `timeout` opt: a turn can
|
|
917
|
+
// internally retry (resume → fresh → final retry) and each retry has its own
|
|
918
|
+
// per-call timer. Without a turn-level watchdog, a runtime stuck mid-stream
|
|
919
|
+
// (no exit, no chunks, no errors) leaves the SSE handler waiting for the
|
|
920
|
+
// per-call timer to fire and the user staring at the typing dots.
|
|
921
|
+
//
|
|
922
|
+
// Usage: `result = await withCcTurnTimeout({ timeoutMs, label, onAbortReady }, (registerAbort) => callerThatReturnsResultPromise(registerAbort))`.
|
|
923
|
+
// The caller plumbs `registerAbort(abortFn)` into every nested LLM call's
|
|
924
|
+
// `onAbortReady` so the watchdog can kill whichever attempt is in flight on
|
|
925
|
+
// expiry. Returns the original result on success or a synthetic envelope
|
|
926
|
+
// `{ text:'', error:{ code:'cc-turn-timeout', retriable:true } }` on expiry.
|
|
927
|
+
async function withCcTurnTimeout({ timeoutMs, label = 'cc-turn', onAbortReady } = {}, callFn) {
|
|
928
|
+
if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}));
|
|
929
|
+
let currentAbort = null;
|
|
930
|
+
let timedOut = false;
|
|
931
|
+
let timer = null;
|
|
932
|
+
const registerAbort = (abort) => {
|
|
933
|
+
currentAbort = abort;
|
|
934
|
+
if (onAbortReady) onAbortReady(abort);
|
|
935
|
+
};
|
|
936
|
+
const inflight = Promise.resolve().then(() => callFn(registerAbort));
|
|
937
|
+
const timeoutPromise = new Promise((resolve) => {
|
|
938
|
+
timer = setTimeout(() => {
|
|
939
|
+
timedOut = true;
|
|
940
|
+
try { if (currentAbort) currentAbort(); } catch { /* swallow */ }
|
|
941
|
+
resolve(null);
|
|
942
|
+
}, timeoutMs);
|
|
943
|
+
// NOTE: do NOT unref this timer. If we did, Node would exit the event
|
|
944
|
+
// loop while waiting on the inflight promise (Promises themselves don't
|
|
945
|
+
// hold the loop open — only timers/I/O do). The race below clears the
|
|
946
|
+
// timer immediately on success, so a still-armed timer never leaks past
|
|
947
|
+
// the resolution.
|
|
948
|
+
});
|
|
949
|
+
const winner = await Promise.race([inflight, timeoutPromise]);
|
|
950
|
+
if (!timedOut) {
|
|
951
|
+
clearTimeout(timer);
|
|
952
|
+
return winner;
|
|
953
|
+
}
|
|
954
|
+
// Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
|
|
955
|
+
// sweeps) actually runs before we hand a synthetic envelope to the caller.
|
|
956
|
+
const settled = await inflight.catch((err) => ({
|
|
957
|
+
text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
|
|
958
|
+
}));
|
|
959
|
+
const message = `CC turn ${label} timed out after ${timeoutMs}ms`;
|
|
960
|
+
return {
|
|
961
|
+
...settled,
|
|
962
|
+
text: '',
|
|
963
|
+
code: settled?.code || 1,
|
|
964
|
+
errorClass: 'cc-turn-timeout',
|
|
965
|
+
errorMessage: message,
|
|
966
|
+
error: { message, code: 'cc-turn-timeout', retriable: true },
|
|
967
|
+
ok: false,
|
|
968
|
+
};
|
|
969
|
+
}
|
|
970
|
+
|
|
828
971
|
module.exports = {
|
|
829
972
|
callLLM,
|
|
830
973
|
callLLMStreaming,
|
|
831
974
|
trackEngineUsage,
|
|
975
|
+
trackEngineError,
|
|
832
976
|
flushMetricsBuffer,
|
|
977
|
+
withCcTurnTimeout,
|
|
833
978
|
// Exposed for unit tests — engine code MUST use the runtime adapter contract.
|
|
834
979
|
_buildSpawnAgentFlags,
|
|
980
|
+
_buildErrorEnvelope,
|
|
835
981
|
_resolveBin,
|
|
836
982
|
_resetBinCache,
|
|
837
983
|
_resetMetricsBufferForTest,
|
package/engine/preflight.js
CHANGED
|
@@ -87,17 +87,17 @@ function findClaudeBinary() {
|
|
|
87
87
|
* `shared.runtimeConfigWarnings` so unknown-CLI warnings and binary checks
|
|
88
88
|
* always cover the same surface.
|
|
89
89
|
*
|
|
90
|
-
* Without a config (legacy callers), returns just `['
|
|
91
|
-
*
|
|
90
|
+
* Without a config (legacy callers), returns just `['copilot']` — matches
|
|
91
|
+
* `ENGINE_DEFAULTS.defaultCli` (W-mpmwxkk40007c995).
|
|
92
92
|
*/
|
|
93
93
|
function _distinctRuntimes(config) {
|
|
94
94
|
const set = new Set();
|
|
95
95
|
if (!config || typeof config !== 'object') {
|
|
96
|
-
set.add('
|
|
96
|
+
set.add('copilot');
|
|
97
97
|
return Array.from(set);
|
|
98
98
|
}
|
|
99
99
|
const engine = config.engine || {};
|
|
100
|
-
set.add(engine.defaultCli ? String(engine.defaultCli) : '
|
|
100
|
+
set.add(engine.defaultCli ? String(engine.defaultCli) : 'copilot');
|
|
101
101
|
if (engine.ccCli) set.add(String(engine.ccCli));
|
|
102
102
|
for (const agent of Object.values(config.agents || {})) {
|
|
103
103
|
if (agent && agent.cli) set.add(String(agent.cli));
|
|
@@ -355,7 +355,7 @@ function _fleetSummaryResults(config) {
|
|
|
355
355
|
const results = [];
|
|
356
356
|
if (!config || typeof config !== 'object') return results;
|
|
357
357
|
const engine = config.engine || {};
|
|
358
|
-
const defaultCli = engine.defaultCli ? String(engine.defaultCli) : '
|
|
358
|
+
const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'copilot';
|
|
359
359
|
const defaultModel = engine.defaultModel ? String(engine.defaultModel) : '(runtime default)';
|
|
360
360
|
results.push({ name: 'Fleet', ok: true, message: `defaultCli=${defaultCli} defaultModel=${defaultModel}` });
|
|
361
361
|
|
package/engine/queries.js
CHANGED
|
@@ -528,7 +528,7 @@ function getAgents(config) {
|
|
|
528
528
|
|
|
529
529
|
return roster.map(a => {
|
|
530
530
|
// Resolve which CLI runtime this agent dispatches to: per-agent override
|
|
531
|
-
// → engine.defaultCli → '
|
|
531
|
+
// → engine.defaultCli → 'copilot'. Surfaced so the dashboard can show a
|
|
532
532
|
// runtime tag next to the agent name.
|
|
533
533
|
const runtime = shared.resolveAgentCli(a, config.engine || {});
|
|
534
534
|
const inboxFiles = allInboxFiles.filter(f => f.includes(a.id));
|
|
@@ -1770,19 +1770,18 @@ function _projectGitStatusEqual(a, b) {
|
|
|
1770
1770
|
function _scheduleProjectGitStatusRefresh(localPath, key, configuredMainBranch) {
|
|
1771
1771
|
const existing = _projectGitStatusCache.get(key);
|
|
1772
1772
|
if (existing && existing.promise) return existing.promise;
|
|
1773
|
-
const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null };
|
|
1773
|
+
const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null, refMtimes: null };
|
|
1774
1774
|
const prevValue = entry.value;
|
|
1775
|
-
//
|
|
1776
|
-
//
|
|
1777
|
-
//
|
|
1778
|
-
//
|
|
1779
|
-
//
|
|
1780
|
-
// cache spuriously on the very next read. Probe-START is the safer
|
|
1781
|
-
// anchor — any file with `mtimeMs > probeStartTs` legitimately changed
|
|
1782
|
-
// at-or-after the probe, so re-probing is correct.
|
|
1775
|
+
// Snapshot ref mtimes BEFORE the probe so the next call compares against
|
|
1776
|
+
// an exact baseline rather than a Date.now() timestamp. On Windows
|
|
1777
|
+
// Date.now() can have ~15ms granularity while NTFS mtime is sub-ms, so
|
|
1778
|
+
// a file written shortly before the probe could appear `mtimeMs > ts`
|
|
1779
|
+
// even when nothing actually changed.
|
|
1783
1780
|
const probeStartTs = Date.now();
|
|
1781
|
+
const probeStartRefMtimes = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
|
|
1784
1782
|
entry.promise = _probeProjectGitStatus(localPath, configuredMainBranch).then(value => {
|
|
1785
1783
|
entry.ts = probeStartTs;
|
|
1784
|
+
entry.refMtimes = probeStartRefMtimes;
|
|
1786
1785
|
entry.value = value;
|
|
1787
1786
|
entry.promise = null;
|
|
1788
1787
|
if (_onProjectGitStatusChanged && !_projectGitStatusEqual(prevValue, value)) {
|
|
@@ -1857,35 +1856,65 @@ function _resolveCommonGitDir(gitDir) {
|
|
|
1857
1856
|
return path.isAbsolute(raw) ? raw : path.resolve(gitDir, raw);
|
|
1858
1857
|
}
|
|
1859
1858
|
|
|
1860
|
-
//
|
|
1861
|
-
//
|
|
1862
|
-
//
|
|
1863
|
-
//
|
|
1864
|
-
|
|
1865
|
-
// (W-mphdmr8c00030124). Tolerates ENOENT on FETCH_HEAD / refs (never-
|
|
1866
|
-
// fetched repos simply haven't moved those files yet). Cost ≤3 statSync
|
|
1867
|
-
// per project per /api/status build — well under the 'cheap' budget
|
|
1868
|
-
// called out in getStatusFastStateMtimePaths's docstring.
|
|
1869
|
-
function _projectGitRefsAdvancedSince(localPath, cachedTs, configuredMainBranch) {
|
|
1859
|
+
// Enumerate the per-project git ref files we watch for cache-busting:
|
|
1860
|
+
// logs/HEAD (per-worktree gitdir), FETCH_HEAD + refs/remotes/origin/<configuredMainBranch> when set (common
|
|
1861
|
+
// gitdir for linked worktrees). Same paths as the fast-state mtime tracker
|
|
1862
|
+
// so callers see a coherent view across surfaces.
|
|
1863
|
+
function _projectGitRefFiles(localPath, configuredMainBranch) {
|
|
1870
1864
|
const gitDir = _resolveGitDir(localPath);
|
|
1871
|
-
if (!gitDir) return
|
|
1872
|
-
// logs/HEAD is per-worktree; FETCH_HEAD + refs/remotes/origin/* live in
|
|
1873
|
-
// the COMMON gitdir for linked worktrees. For the main worktree both
|
|
1874
|
-
// resolve to the same place, so this is a no-op there.
|
|
1865
|
+
if (!gitDir) return null;
|
|
1875
1866
|
const commonGitDir = _resolveCommonGitDir(gitDir);
|
|
1876
|
-
const
|
|
1867
|
+
const files = [
|
|
1877
1868
|
path.join(gitDir, 'logs', 'HEAD'),
|
|
1878
1869
|
path.join(commonGitDir, 'FETCH_HEAD'),
|
|
1879
1870
|
];
|
|
1880
1871
|
const comparator = configuredMainBranch && String(configuredMainBranch).trim();
|
|
1881
1872
|
if (comparator) {
|
|
1882
|
-
|
|
1873
|
+
files.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
|
|
1883
1874
|
}
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1875
|
+
return files;
|
|
1876
|
+
}
|
|
1877
|
+
|
|
1878
|
+
// Snapshot mtimeMs for each ref file. Missing files record `null`. Used as
|
|
1879
|
+
// the baseline that the next `getProjectGitStatus` call compares against —
|
|
1880
|
+
// inequality, not threshold-vs-timestamp, so the result is precision-
|
|
1881
|
+
// independent (Windows `Date.now()` can be 15ms coarse while NTFS mtime is
|
|
1882
|
+
// sub-millisecond, which used to make threshold checks fire spuriously on
|
|
1883
|
+
// freshly-written files).
|
|
1884
|
+
function _snapshotProjectGitRefMtimes(localPath, configuredMainBranch) {
|
|
1885
|
+
const files = _projectGitRefFiles(localPath, configuredMainBranch);
|
|
1886
|
+
if (!files) return null;
|
|
1887
|
+
const out = Object.create(null);
|
|
1888
|
+
for (const f of files) {
|
|
1889
|
+
try { out[f] = fs.statSync(f).mtimeMs; }
|
|
1890
|
+
catch { out[f] = null; /* ENOENT recorded as null — flipping to present must bust */ }
|
|
1891
|
+
}
|
|
1892
|
+
return out;
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
// Return true when ANY tracked ref file's mtime (or existence) differs from
|
|
1896
|
+
// the snapshot captured during the last probe. Replaces the older threshold-
|
|
1897
|
+
// vs-cachedTs check that suffered from `Date.now()`/`mtimeMs` resolution
|
|
1898
|
+
// races on Windows. Lets `getProjectGitStatus` bypass its 15s TTL after
|
|
1899
|
+
// `git pull`, `git fetch`, `git checkout`, etc. so the next /api/status
|
|
1900
|
+
// reflects the new HEAD / ahead-behind within one SPA poll instead of
|
|
1901
|
+
// waiting out the TTL (W-mphdmr8c00030124). Cost: 2-3 statSync per call —
|
|
1902
|
+
// well under the 'cheap' budget.
|
|
1903
|
+
function _projectGitRefsAdvancedSince(localPath, configuredMainBranch, snapshot) {
|
|
1904
|
+
// No snapshot yet (legacy entry shape OR first call) — preserve the
|
|
1905
|
+
// current cached value so the TTL-only fast-path still works. A real
|
|
1906
|
+
// change still surfaces on the next /api/status because the fast-state
|
|
1907
|
+
// mtime tracker watches the same files and will bust the upstream cache.
|
|
1908
|
+
if (!snapshot) return false;
|
|
1909
|
+
const current = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
|
|
1910
|
+
if (!current) return false;
|
|
1911
|
+
for (const file of Object.keys(snapshot)) {
|
|
1912
|
+
if (current[file] !== snapshot[file]) return true;
|
|
1913
|
+
}
|
|
1914
|
+
// Also catch a tracked path that is absent from the snapshot's key set
|
|
1915
|
+
// (the watched-file list changed); a file that merely appeared was recorded as null and is caught above.
|
|
1916
|
+
for (const file of Object.keys(current)) {
|
|
1917
|
+
if (!(file in snapshot)) return true;
|
|
1889
1918
|
}
|
|
1890
1919
|
return false;
|
|
1891
1920
|
}
|
|
@@ -1902,7 +1931,7 @@ function getProjectGitStatus(localPath, configuredMainBranch = null) {
|
|
|
1902
1931
|
// user-visible lag) because the rebuilt fast-state still hits this
|
|
1903
1932
|
// cache and never schedules a refresh until the TTL itself expires.
|
|
1904
1933
|
if (cached && cached.ts && (now - cached.ts) < PROJECT_GIT_STATUS_TTL
|
|
1905
|
-
&& !_projectGitRefsAdvancedSince(localPath, cached.
|
|
1934
|
+
&& !_projectGitRefsAdvancedSince(localPath, configuredMainBranch, cached.refMtimes)) {
|
|
1906
1935
|
return cached.value;
|
|
1907
1936
|
}
|
|
1908
1937
|
// Cheap synchronous existsSync — short-circuits a path that just disappeared
|
package/engine/shared.js
CHANGED
|
@@ -1856,7 +1856,7 @@ const ENGINE_DEFAULTS = {
|
|
|
1856
1856
|
// Engine code MUST go through the resolveAgent*/resolveCc* helpers below;
|
|
1857
1857
|
// never read these fields directly. New runtimes are added by registering
|
|
1858
1858
|
// an adapter in engine/runtimes/index.js — these defaults stay stable.
|
|
1859
|
-
defaultCli: '
|
|
1859
|
+
defaultCli: 'copilot', // fleet-wide CLI runtime (must be a key in engine/runtimes/index.js); flipped from 'claude' in W-mpmwxkk40007c995 — Copilot is now the primary runtime, Claude remains supported as an opt-in
|
|
1860
1860
|
defaultModel: undefined, // fleet-wide model; undefined = let the runtime adapter pick its own default
|
|
1861
1861
|
ccCli: undefined, // CC/doc-chat CLI override; undefined = inherit defaultCli (independent of agent path)
|
|
1862
1862
|
ccModel: undefined, // CC/doc-chat model override; undefined = inherit defaultModel
|
|
@@ -1879,6 +1879,7 @@ const ENGINE_DEFAULTS = {
|
|
|
1879
1879
|
removeWorktreeFailureTtlMs: 24 * 60 * 60 * 1000, // stale failed paths are forgotten after a day
|
|
1880
1880
|
removeWorktreeFailureMaxEntries: 1000, // bound failed-worktree retry suppression cache
|
|
1881
1881
|
ccMaxTurns: 50, // max tool-use turns per CC/doc-chat call before CLI stops (per response, not per session)
|
|
1882
|
+
ccTurnTimeoutMs: 300000, // W-mpmwxni2000c25c7-b: wall-clock cap per CC/doc-chat turn; on expiry the in-flight LLM call is aborted and the handler surfaces `{code:'cc-turn-timeout', retriable:true}` instead of hanging the UI
|
|
1882
1883
|
docSessionMaxEntries: 200, // cap doc-chat session map/disk store by least-recent activity (LRU; sessions are non-expiring otherwise)
|
|
1883
1884
|
ccLiveStreamMaxAgeMs: 30 * 60 * 1000, // hard cap reconnect buffers if abort/cleanup stalls
|
|
1884
1885
|
metricsFlushIntervalMs: 10000, // batch trackEngineUsage writes to metrics.json — flushed every 10s instead of per-call to cut lock contention and dashboard mtime churn
|
|
@@ -2084,7 +2085,7 @@ function _isMeaningful(v) {
|
|
|
2084
2085
|
* Resolve the CLI runtime for a per-agent spawn. Priority:
|
|
2085
2086
|
* 1. `agent.cli` — per-agent override
|
|
2086
2087
|
* 2. `engine.defaultCli` — fleet default
|
|
2087
|
-
* 3. `ENGINE_DEFAULTS.defaultCli` ('
|
|
2088
|
+
* 3. `ENGINE_DEFAULTS.defaultCli` ('copilot') — hardcoded fallback
|
|
2088
2089
|
*
|
|
2089
2090
|
* Does NOT fall through to `engine.ccCli`. CC and agents are independent paths.
|
|
2090
2091
|
*/
|
|
@@ -2098,7 +2099,7 @@ function resolveAgentCli(agent, engine) {
|
|
|
2098
2099
|
* Resolve the CLI runtime for the Command Center / doc-chat. Priority:
|
|
2099
2100
|
* 1. `engine.ccCli` — CC-only override
|
|
2100
2101
|
* 2. `engine.defaultCli` — fleet default
|
|
2101
|
-
* 3. `ENGINE_DEFAULTS.defaultCli` ('
|
|
2102
|
+
* 3. `ENGINE_DEFAULTS.defaultCli` ('copilot') — hardcoded fallback
|
|
2102
2103
|
*
|
|
2103
2104
|
* Does NOT inspect any agent overrides. CC has no notion of "which agent" —
|
|
2104
2105
|
* it's a fleet-wide singleton.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.2046",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|