@yemi33/minions 0.1.2044 → 0.1.2046
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dashboard/js/command-center.js +64 -7
- package/dashboard/js/fre.js +3 -2
- package/dashboard/js/refresh.js +143 -2
- package/dashboard/js/render-prs.js +43 -9
- package/dashboard/js/settings.js +9 -5
- package/dashboard/styles.css +21 -0
- package/dashboard.js +308 -164
- package/docs/auto-discovery.md +3 -1
- package/docs/qa-runbook-lifecycle.md +71 -0
- package/docs/qa-runbooks.md +6 -5
- package/docs/runtime-adapters.md +9 -4
- package/docs/security.md +2 -1
- package/docs/watches.md +19 -19
- package/engine/cc-worker-pool.js +87 -11
- package/engine/cleanup.js +84 -2
- package/engine/dispatch.js +6 -0
- package/engine/kb-sweep.js +127 -0
- package/engine/lifecycle.js +18 -0
- package/engine/llm.js +148 -2
- package/engine/preflight.js +5 -5
- package/engine/queries.js +133 -27
- package/engine/shared.js +40 -3
- package/engine/timeout.js +4 -0
- package/engine.js +240 -11
- package/package.json +1 -1
package/engine/kb-sweep.js
CHANGED
|
@@ -23,6 +23,8 @@ const KB_SWEEP_STATE_PATH = path.join(ENGINE_DIR, 'kb-sweep-state.json');
|
|
|
23
23
|
const KB_SWEEP_LOG_PATH = path.join(ENGINE_DIR, 'kb-sweep.log');
|
|
24
24
|
const KB_SWEEP_RUNNER_PATH = path.join(__dirname, 'kb-sweep-runner.js');
|
|
25
25
|
const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
|
|
26
|
+
const AUTO_SWEEP_INTERVAL_MS = 4 * 60 * 60 * 1000;
|
|
27
|
+
const KB_SWEPT_PATH = path.join(ENGINE_DIR, 'kb-swept.json');
|
|
26
28
|
const COMPRESS_THRESHOLD_BYTES = 5000;
|
|
27
29
|
const LLM_BATCH_SIZE = 30;
|
|
28
30
|
const NORMALIZE_CONCURRENCY = 5;
|
|
@@ -555,6 +557,127 @@ async function _runKbSweepImpl(opts = {}) {
|
|
|
555
557
|
return summary;
|
|
556
558
|
}
|
|
557
559
|
|
|
560
|
+
/**
 * Spawn the KB sweep runner (`engine/kb-sweep-runner.js`) as a detached child.
 * Shared between dashboard's POST /api/knowledge/sweep handler and the engine
 * tick's auto-sweep phase. Performs the synchronous "starting" → "in-flight"
 * state-file handoff: a `starting` record carrying a fresh sweep token is
 * written first, and it is only promoted to `in-flight` (with the child pid)
 * if the state file still holds that same token after the spawn.
 *
 * Callers are responsible for the in-flight / stale-guard check BEFORE calling
 * (so they can return distinct HTTP responses or log levels).
 *
 * Failure handling: a spawn can fail synchronously (throw) or asynchronously
 * (the child emits 'error' and never receives a pid, e.g. when `cwd` does not
 * exist). Both are reported as `ok:false`; an 'error' listener is always
 * attached so an asynchronous failure cannot surface as an unhandled 'error'
 * event in the calling process.
 *
 * @param {object} opts
 * @param {string[]} [opts.pinnedKeys] - extra pinned KB keys to skip in the sweep
 * @param {boolean} [opts.dryRun] - dry-run mode for the runner
 * @param {string} [opts.cwd=MINIONS_DIR] - working directory for the spawned runner
 * @param {(level:string,msg:string)=>void} [opts.log] - logger (defaults to console)
 * @returns {{ sweepToken:string, pid:number|null, bodyFile:string|null,
 *            ok:boolean, error?:string }}
 *   ok=false + error on spawn failure; the "starting" claim is released so the
 *   caller can retry immediately.
 */
function spawnSweepRunnerDetached(opts = {}) {
  const fsLocal = require('fs');
  const { spawn: cpSpawn } = require('child_process');
  const logFn = typeof opts.log === 'function'
    ? opts.log
    : (level, msg) => { (level === 'error' ? console.error : console.log)(`[kb-sweep] ${msg}`); };
  const cwd = opts.cwd || require('./queries').MINIONS_DIR;
  const startedAt = Date.now();
  // Derive the ISO stamp from startedAt so both state writes agree with it.
  const startedAtIso = new Date(startedAt).toISOString();
  const sweepToken = `${startedAt}-${Math.random().toString(36).slice(2, 8)}`;

  // Phase 1: claim the sweep with a "starting" record (best-effort).
  try {
    safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
      status: 'starting', startedAt, startedAtIso,
      sweepToken, pid: null,
    }));
  } catch (e) {
    logFn('error', `failed to write starting state: ${e.message}`);
  }

  // Optional request body is handed to the runner through a temp file.
  let bodyFile = null;
  const hasBody = (Array.isArray(opts.pinnedKeys) && opts.pinnedKeys.length > 0)
    || opts.dryRun != null;
  if (hasBody) {
    bodyFile = path.join(ENGINE_DIR, `tmp-kb-sweep-body-${sweepToken}.json`);
    try {
      safeWrite(bodyFile, JSON.stringify({
        pinnedKeys: Array.isArray(opts.pinnedKeys) ? opts.pinnedKeys : undefined,
        dryRun: opts.dryRun != null ? !!opts.dryRun : undefined,
      }));
    } catch (e) {
      logFn('error', `failed to write body-file ${bodyFile}: ${e.message}`);
      bodyFile = null;
    }
  }

  // Route the child's stdout/stderr to the sweep log; fall back to 'ignore'
  // when the log cannot be opened.
  let logFdNum = null;
  let stdio = ['ignore', 'ignore', 'ignore'];
  try {
    logFdNum = fsLocal.openSync(KB_SWEEP_LOG_PATH, 'a');
    stdio = ['ignore', logFdNum, logFdNum];
  } catch (e) {
    logFn('error', `failed to open log ${KB_SWEEP_LOG_PATH}: ${e.message}`);
  }

  const closeLogFd = () => {
    if (logFdNum != null) {
      try { fsLocal.closeSync(logFdNum); } catch { /* ignore */ }
    }
  };
  // Undo the body file and the "starting" claim after a failed launch.
  const releaseClaim = () => {
    if (bodyFile) {
      try { fsLocal.unlinkSync(bodyFile); } catch { /* ignore */ }
    }
    try { shared.safeUnlink(KB_SWEEP_STATE_PATH); } catch { /* ignore */ }
  };
  const failure = (error) => ({ ok: false, error, sweepToken, pid: null, bodyFile: null });

  const spawnArgs = ['--sweep-token', sweepToken];
  if (bodyFile) spawnArgs.push('--body-file', bodyFile);

  let proc;
  try {
    proc = cpSpawn(process.execPath, [KB_SWEEP_RUNNER_PATH, ...spawnArgs], {
      cwd, stdio, detached: true, windowsHide: true,
      env: { ...process.env },
    });
  } catch (e) {
    closeLogFd();
    releaseClaim();
    return failure(`spawn failed: ${e.message}`);
  }

  // Must be attached before the next tick: an asynchronous spawn failure is
  // delivered as an 'error' event, which throws if nobody is listening.
  proc.on('error', (err) => {
    logFn('error', `sweep runner process error: ${err && err.message ? err.message : err}`);
  });
  // The parent's copy of the log fd is no longer needed once spawn returned.
  closeLogFd();

  // No pid means the child never started (the 'error' event carries the cause).
  if (proc.pid == null) {
    releaseClaim();
    return failure('spawn failed: runner process did not start');
  }

  // Phase 2: promote "starting" → "in-flight", but only if the state file
  // still carries this call's token.
  try {
    const current = safeJson(KB_SWEEP_STATE_PATH);
    if (current && current.status === 'starting' && current.sweepToken === sweepToken) {
      safeWrite(KB_SWEEP_STATE_PATH, JSON.stringify({
        status: 'in-flight', startedAt, startedAtIso,
        sweepToken, pid: proc.pid,
      }));
    }
  } catch { /* best-effort */ }

  // Let the parent exit without waiting for the detached child.
  proc.unref();
  return { ok: true, sweepToken, pid: proc.pid, bodyFile };
}
|
|
653
|
+
|
|
654
|
+
/**
 * Tell the engine tick whether an automatic KB sweep should be started now.
 * Read-only: it consults the sweep liveness and the `kb-swept.json` record
 * but writes nothing.
 *
 * Decision order:
 *   1. a live, non-stale in-flight sweep         → do not spawn ('sweep-in-flight')
 *   2. no parseable `timestamp` in the record     → spawn ('no-prior-sweep')
 *   3. last completion younger than `intervalMs`  → do not spawn ('within-interval')
 *   4. otherwise                                  → spawn ('interval-elapsed')
 *
 * @param {object} [opts]
 * @param {number} [opts.now=Date.now()]   injectable clock (tests)
 * @param {number} [opts.intervalMs=AUTO_SWEEP_INTERVAL_MS]
 * @param {object} [opts.liveness]         pre-computed liveness (optional)
 * @returns {{ shouldSpawn:boolean, reason:string, lastCompletedAt:number|null }}
 */
function shouldAutoSweep(opts = {}) {
  const now = Number(opts.now) || Date.now();
  const intervalMs = Number(opts.intervalMs) || AUTO_SWEEP_INTERVAL_MS;
  const liveness = opts.liveness || readSweepLiveness({ entryCount: opts.entryCount || 0, now });

  const sweepRunning = liveness.inFlight && liveness.alive && !liveness.stale;
  if (sweepRunning) {
    return { shouldSpawn: false, reason: 'sweep-in-flight', lastCompletedAt: null };
  }

  // Resolve the last completion time; anything unparseable counts as "never".
  const record = safeJson(KB_SWEPT_PATH);
  let lastCompletedAt = null;
  if (record && record.timestamp) {
    const parsedTs = Date.parse(record.timestamp);
    if (Number.isFinite(parsedTs)) lastCompletedAt = parsedTs;
  }

  if (lastCompletedAt === null) {
    return { shouldSpawn: true, reason: 'no-prior-sweep', lastCompletedAt };
  }

  const elapsedMs = now - lastCompletedAt;
  if (elapsedMs < intervalMs) {
    return { shouldSpawn: false, reason: 'within-interval', lastCompletedAt };
  }
  return { shouldSpawn: true, reason: 'interval-elapsed', lastCompletedAt };
}
|
|
680
|
+
|
|
558
681
|
/** Compute a dynamic stale-guard timeout based on KB size. */
|
|
559
682
|
function staleGuardMs(entryCount) {
|
|
560
683
|
// 30 minutes minimum, plus 1 second per entry (for the rewrite pass)
|
|
@@ -566,6 +689,10 @@ module.exports = {
|
|
|
566
689
|
staleGuardMs,
|
|
567
690
|
readSweepLiveness,
|
|
568
691
|
reconcileSweepStateOnBoot,
|
|
692
|
+
spawnSweepRunnerDetached,
|
|
693
|
+
shouldAutoSweep,
|
|
694
|
+
AUTO_SWEEP_INTERVAL_MS,
|
|
695
|
+
KB_SWEPT_PATH,
|
|
569
696
|
KB_SWEEP_STATE_PATH,
|
|
570
697
|
KB_SWEEP_LOG_PATH,
|
|
571
698
|
KB_SWEEP_RUNNER_PATH,
|
package/engine/lifecycle.js
CHANGED
|
@@ -595,6 +595,7 @@ function updateWorkItemStatus(meta, status, reason) {
|
|
|
595
595
|
delete target.failReason;
|
|
596
596
|
delete target.failedAt;
|
|
597
597
|
delete target._retryCount;
|
|
598
|
+
delete target._retriesByAgent;
|
|
598
599
|
target.completedAgents = Object.entries(target.agentResults)
|
|
599
600
|
.filter(([, r]) => r.status === WI_STATUS.DONE)
|
|
600
601
|
.map(([a]) => a);
|
|
@@ -611,6 +612,7 @@ function updateWorkItemStatus(meta, status, reason) {
|
|
|
611
612
|
delete target.failReason;
|
|
612
613
|
delete target.failedAt;
|
|
613
614
|
delete target._retryCount;
|
|
615
|
+
delete target._retriesByAgent;
|
|
614
616
|
// P-e0b4f7a5 — successful completion (including a phantom-retry
|
|
615
617
|
// succeeding) clears the phantom markers so cleanup can reap the
|
|
616
618
|
// worktree on the next sweep.
|
|
@@ -3218,6 +3220,14 @@ function _deferRetryWithCounter(meta, detection, counterField, maxCount, pending
|
|
|
3218
3220
|
w._lastRetryAt = ts();
|
|
3219
3221
|
w._lastRetryReason = reason;
|
|
3220
3222
|
w._pendingReason = pendingReason;
|
|
3223
|
+
// W-mpmwxn1j — only the standard PR-attachment / nonterminal counter
|
|
3224
|
+
// (_retryCount) participates in per-agent reassignment. Phantom
|
|
3225
|
+
// retries (runtime crashes before any work product) are not
|
|
3226
|
+
// agent-specific failures, so we don't bump _retriesByAgent for them.
|
|
3227
|
+
if (counterField === '_retryCount') {
|
|
3228
|
+
const failedAgent = meta?._agentId || w.dispatched_to;
|
|
3229
|
+
if (failedAgent) shared.bumpAgentRetryCount(w, failedAgent);
|
|
3230
|
+
}
|
|
3221
3231
|
// P-e0b4f7a5 — phantom-retry path stamps _phantomCompletion +
|
|
3222
3232
|
// _phantomBranch so cleanup.js can preserve the worktree across the
|
|
3223
3233
|
// re-dispatch window. Only set for the phantom counter; nonterminal
|
|
@@ -4018,6 +4028,10 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
4018
4028
|
w._retryCount = retries + 1;
|
|
4019
4029
|
w._lastRetryAt = ts();
|
|
4020
4030
|
w._lastRetryReason = 'no review verdict';
|
|
4031
|
+
// W-mpmwxn1j — bump per-agent counter so a reviewer who never
|
|
4032
|
+
// emits a verdict gets reassigned after maxRetriesPerAgent hits.
|
|
4033
|
+
const failedAgent = meta?._agentId || w.dispatched_to;
|
|
4034
|
+
if (failedAgent) shared.bumpAgentRetryCount(w, failedAgent);
|
|
4021
4035
|
delete w.dispatched_at;
|
|
4022
4036
|
delete w.completedAt;
|
|
4023
4037
|
delete w._pendingReason;
|
|
@@ -4125,6 +4139,10 @@ async function runPostCompletionHooks(dispatchItem, agentId, code, stdout, confi
|
|
|
4125
4139
|
if (retries < ENGINE_DEFAULTS.maxRetries) {
|
|
4126
4140
|
w.status = WI_STATUS.PENDING;
|
|
4127
4141
|
w._retryCount = retries + 1;
|
|
4142
|
+
// W-mpmwxn1j — bump per-agent counter so a planner that never
|
|
4143
|
+
// writes the PRD gets reassigned after maxRetriesPerAgent hits.
|
|
4144
|
+
const failedAgent = meta?._agentId || w.dispatched_to;
|
|
4145
|
+
if (failedAgent) shared.bumpAgentRetryCount(w, failedAgent);
|
|
4128
4146
|
delete w.dispatched_at;
|
|
4129
4147
|
delete w.completedAt;
|
|
4130
4148
|
log('warn', `plan-to-prd ${meta.item.id} completed without PRD file — auto-retry ${retries + 1}/${ENGINE_DEFAULTS.maxRetries}`);
|
package/engine/llm.js
CHANGED
|
@@ -82,6 +82,21 @@ function trackEngineUsage(category, usage) {
|
|
|
82
82
|
_ensureFlushTimer();
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
+
// W-mpmwxni2000c25c7-b — silent-error regression counter. Every CC/doc-chat
// error surfaced through the handlers bumps `_engine[category].errorsByCode[code]`
// so /api/metrics reflects new error codes (cc-turn-timeout, empty-output, …)
// without polluting cost/tokens. Counters flush on the same timer as
// trackEngineUsage so the dashboard's fast-state mtime gate isn't bypassed.
//
// `category` must be a non-empty string; anything else is ignored rather than
// thrown on, because this runs on error-handling paths where a TypeError here
// would mask the error being recorded. Categories starting with `_test` or
// `test-` are skipped. `errorCode` is stringified before being used as the
// counter key.
function trackEngineError(category, errorCode) {
  if (typeof category !== 'string' || !category || !errorCode) return;
  if (category.startsWith('_test') || category.startsWith('test-')) return;
  if (!_pendingMetrics.engine[category]) _pendingMetrics.engine[category] = _emptyEngineDelta();
  const cat = _pendingMetrics.engine[category];
  // Prototype-less map: error codes are arbitrary strings used as keys.
  if (!cat.errorsByCode) cat.errorsByCode = Object.create(null);
  const codeKey = String(errorCode);
  cat.errorsByCode[codeKey] = (cat.errorsByCode[codeKey] || 0) + 1;
  _ensureFlushTimer();
}
|
|
99
|
+
|
|
85
100
|
function flushMetricsBuffer() {
|
|
86
101
|
const pending = _pendingMetrics;
|
|
87
102
|
if (!Object.keys(pending.engine).length && !Object.keys(pending.daily).length) return;
|
|
@@ -106,6 +121,12 @@ function flushMetricsBuffer() {
|
|
|
106
121
|
cat.totalDurationMs = (cat.totalDurationMs || 0) + delta.totalDurationMs;
|
|
107
122
|
cat.timedCalls = (cat.timedCalls || 0) + delta.timedCalls;
|
|
108
123
|
}
|
|
124
|
+
if (delta.errorsByCode) {
|
|
125
|
+
if (!cat.errorsByCode) cat.errorsByCode = {};
|
|
126
|
+
for (const [code, count] of Object.entries(delta.errorsByCode)) {
|
|
127
|
+
cat.errorsByCode[code] = (cat.errorsByCode[code] || 0) + count;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
109
130
|
}
|
|
110
131
|
if (!metrics._daily) metrics._daily = {};
|
|
111
132
|
for (const [day, delta] of Object.entries(pending.daily)) {
|
|
@@ -129,6 +150,12 @@ function flushMetricsBuffer() {
|
|
|
129
150
|
c.inputTokens += delta.inputTokens; c.outputTokens += delta.outputTokens;
|
|
130
151
|
c.cacheRead += delta.cacheRead; c.cacheCreation += delta.cacheCreation;
|
|
131
152
|
c.totalDurationMs += delta.totalDurationMs; c.timedCalls += delta.timedCalls;
|
|
153
|
+
if (delta.errorsByCode) {
|
|
154
|
+
if (!c.errorsByCode) c.errorsByCode = Object.create(null);
|
|
155
|
+
for (const [code, count] of Object.entries(delta.errorsByCode)) {
|
|
156
|
+
c.errorsByCode[code] = (c.errorsByCode[code] || 0) + count;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
132
159
|
}
|
|
133
160
|
for (const [day, delta] of Object.entries(pending.daily)) {
|
|
134
161
|
if (!_pendingMetrics.daily[day]) _pendingMetrics.daily[day] = _emptyDailyDelta();
|
|
@@ -233,6 +260,8 @@ function _missingRuntimeResult(runtimeName, runtime, reason) {
|
|
|
233
260
|
errorClass: shared.FAILURE_CLASS.CONFIG_ERROR,
|
|
234
261
|
errorMessage: message,
|
|
235
262
|
missingRuntime: true,
|
|
263
|
+
error: { message, code: shared.FAILURE_CLASS.CONFIG_ERROR, retriable: false },
|
|
264
|
+
ok: false,
|
|
236
265
|
};
|
|
237
266
|
}
|
|
238
267
|
|
|
@@ -245,7 +274,7 @@ function _resolvedCallResult(result) {
|
|
|
245
274
|
/**
 * Pick the runtime (CLI) name for an LLM call.
 * Precedence: explicit `callOpts.cli`, then whatever `resolveCcCli` derives
 * from `callOpts.engineConfig`, then the literal default 'copilot'.
 */
function _resolveRuntimeNameFor(callOpts = {}) {
  const { cli, engineConfig } = callOpts;
  if (cli) return cli;
  const fromConfig = engineConfig ? resolveCcCli(engineConfig) : undefined;
  return fromConfig || 'copilot';
}
|
|
250
279
|
|
|
251
280
|
function _runtimeUnavailableResult(callOpts = {}) {
|
|
@@ -566,7 +595,7 @@ function _createStreamAccumulator({
|
|
|
566
595
|
|
|
567
596
|
/**
 * Resolve the runtime adapter for an LLM call.
 * The name comes from `_resolveRuntimeNameFor`: an explicit `cli` opt wins,
 * then `engineConfig` resolution, then the copilot default (fleet default as
 * of W-mpmwxkk40007c995). That name is then passed to `resolveRuntime`.
 */
function _resolveRuntimeFor(callOpts) {
  const runtimeName = _resolveRuntimeNameFor(callOpts);
  return resolveRuntime(runtimeName);
}
|
|
572
601
|
|
|
@@ -599,6 +628,52 @@ function _resolveRuntimeFeatureOpts({
|
|
|
599
628
|
|
|
600
629
|
// ─── Core LLM Call ───────────────────────────────────────────────────────────
|
|
601
630
|
|
|
631
|
+
// W-mpmwxni2000c25c7-b — typed-error envelope helper. callLLM /
// callLLMStreaming attach `error: { message, code, retriable }` to every
// failure resolution so dashboard CC/doc-chat handlers can surface a
// structured 5xx JSON or SSE `event: error` instead of returning an empty
// reply that hangs the UI. The shape mirrors the existing `runtime.parseError`
// contract from sub-item (a) so adapter classifications (auth-failure,
// context-limit, budget-exceeded, crash, model-unavailable) propagate
// verbatim. Engine codes added here:
//   - 'runtime-exit'       non-zero exit code with no parseError signal
//   - 'empty-output'       zero exit but no parsed text — runtime returned
//                          nothing useful (CLI bug or silent timeout)
//   - 'unparseable-output' bytes streamed but accumulator extracted no text
//                          (malformed JSONL or unknown event shape)
// ('spawn-error' is attached directly by the callers' spawn-failure paths,
// not by this helper.)
//
// Existing `errorClass` / `errorMessage` fields stay populated for callers
// that haven't moved to the typed envelope yet.
//
// Parameters:
//   errInfo  - adapter classification `{ message, code, retriable }`; used
//              verbatim when it carries a `code`
//   code     - process exit code (`null` is not treated as a non-zero exit)
//   parsed   - accumulator output; `text`, `raw` and `stderr` are read
//   fallback - optional message used when no more specific one is available
// Returns the envelope `{ message, code, retriable }`, or `null` when the call
// produced text and nothing classified it as a failure.
function _buildErrorEnvelope(errInfo, code, parsed, fallback) {
  // 1. The adapter already classified the failure — pass it through.
  if (errInfo && errInfo.code) {
    return { message: errInfo.message || fallback || 'LLM call failed', code: errInfo.code, retriable: errInfo.retriable !== false };
  }
  // 2. Unclassified non-zero exit: quote the last few stderr lines.
  if (code !== 0 && code !== null) {
    // Split on LF or CRLF and drop blank lines so Windows output does not
    // leak stray '\r' characters or waste the three-line tail on empties.
    const stderrTail = parsed && parsed.stderr
      ? String(parsed.stderr)
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter(Boolean)
        .slice(-3)
        .join(' | ')
        .slice(0, 500)
      : '';
    return {
      message: stderrTail ? `Runtime exited with code ${code}: ${stderrTail}` : `Runtime exited with code ${code}`,
      code: 'runtime-exit',
      retriable: true,
    };
  }
  // 3. Text was extracted — not an error.
  if (parsed && parsed.text) return null;
  // 4. Bytes arrived but nothing could be extracted from them.
  const rawLen = parsed && parsed.raw ? String(parsed.raw).length : 0;
  if (rawLen > 0) {
    return {
      message: 'Runtime produced output the adapter could not parse',
      code: 'unparseable-output',
      retriable: true,
    };
  }
  // 5. Nothing at all came back.
  return {
    message: fallback || 'Runtime returned no output',
    code: 'empty-output',
    retriable: true,
  };
}
|
|
676
|
+
|
|
602
677
|
function callLLM(promptText, sysPromptText, opts = {}) {
|
|
603
678
|
const {
|
|
604
679
|
timeout = 120000, label = 'llm', maxTurns = 1, allowedTools = '',
|
|
@@ -670,6 +745,7 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
670
745
|
const errInfo = code !== 0
|
|
671
746
|
? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
|
|
672
747
|
: { message: '', code: null, retriable: true };
|
|
748
|
+
const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
|
|
673
749
|
resolve({
|
|
674
750
|
text: parsed.text || '',
|
|
675
751
|
usage,
|
|
@@ -681,6 +757,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
681
757
|
runtime: runtime.name,
|
|
682
758
|
errorClass: errInfo.code,
|
|
683
759
|
errorMessage: errInfo.message || null,
|
|
760
|
+
error: errorEnvelope,
|
|
761
|
+
ok: !errorEnvelope,
|
|
684
762
|
});
|
|
685
763
|
};
|
|
686
764
|
|
|
@@ -704,6 +782,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
704
782
|
text: '', usage: null, sessionId: null, code: 1,
|
|
705
783
|
stderr: err.message, raw: '', toolUses: [],
|
|
706
784
|
runtime: runtime.name, errorClass: null, errorMessage: null,
|
|
785
|
+
error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
|
|
786
|
+
ok: false,
|
|
707
787
|
});
|
|
708
788
|
});
|
|
709
789
|
});
|
|
@@ -784,6 +864,7 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
784
864
|
const errInfo = code !== 0
|
|
785
865
|
? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
|
|
786
866
|
: { message: '', code: null, retriable: true };
|
|
867
|
+
const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
|
|
787
868
|
resolve({
|
|
788
869
|
text: parsed.text || '',
|
|
789
870
|
usage,
|
|
@@ -795,6 +876,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
795
876
|
runtime: runtime.name,
|
|
796
877
|
errorClass: errInfo.code,
|
|
797
878
|
errorMessage: errInfo.message || null,
|
|
879
|
+
error: errorEnvelope,
|
|
880
|
+
ok: !errorEnvelope,
|
|
798
881
|
});
|
|
799
882
|
};
|
|
800
883
|
|
|
@@ -818,6 +901,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
818
901
|
text: '', usage: null, sessionId: null, code: 1,
|
|
819
902
|
stderr: err.message, raw: '', toolUses: [],
|
|
820
903
|
runtime: runtime.name, errorClass: null, errorMessage: null,
|
|
904
|
+
error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
|
|
905
|
+
ok: false,
|
|
821
906
|
});
|
|
822
907
|
});
|
|
823
908
|
});
|
|
@@ -825,13 +910,74 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
825
910
|
return promise;
|
|
826
911
|
}
|
|
827
912
|
|
|
913
|
+
// ─── CC turn watchdog ────────────────────────────────────────────────────────
//
// W-mpmwxni2000c25c7-b — wall-clock cap for a single CC/doc-chat turn. CC turns
// are a higher-level concept than the per-LLM-call `timeout` opt: a turn can
// internally retry (resume → fresh → final retry) and each retry has its own
// per-call timer. Without a turn-level watchdog, a runtime stuck mid-stream
// (no exit, no chunks, no errors) leaves the SSE handler waiting for the
// per-call timer to fire and the user staring at the typing dots.
//
// Usage: `result = await withCcTurnTimeout({ timeoutMs, label, onAbortReady }, (registerAbort) => callerThatReturnsResultPromise(registerAbort))`.
// The caller plumbs `registerAbort(abortFn)` into every nested LLM call's
// `onAbortReady` so the watchdog can kill whichever attempt is in flight on
// expiry. Returns the original result on success or a synthetic envelope
// `{ text:'', error:{ code:'cc-turn-timeout', retriable:true } }` on expiry.
//
// Behavior notes:
//   - `timeoutMs` missing or <= 0 disables the watchdog; `callFn` is invoked
//     directly with `onAbortReady` (or a no-op).
//   - Once the turn has expired, any abort registered afterwards (i.e. the
//     next internal retry) is aborted as well, so the retry chain cannot
//     outlive the watchdog.
//   - The timer is cleared on every exit path, including a rejection from
//     `callFn` (which is re-thrown to the caller unchanged).
async function withCcTurnTimeout({ timeoutMs, label = 'cc-turn', onAbortReady } = {}, callFn) {
  if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}));
  let currentAbort = null;
  let timedOut = false;
  let timer = null;
  const abortCurrent = () => {
    try { if (currentAbort) currentAbort(); } catch { /* swallow */ }
  };
  const registerAbort = (abort) => {
    currentAbort = abort;
    if (onAbortReady) onAbortReady(abort);
    // Registered after expiry: this is a retry started once the previous
    // attempt was killed. Abort it too, deferred by a microtask so the code
    // that called registerAbort can finish its synchronous setup first.
    if (timedOut) Promise.resolve().then(abortCurrent);
  };
  const inflight = Promise.resolve().then(() => callFn(registerAbort));
  const timeoutPromise = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      abortCurrent();
      resolve(null);
    }, timeoutMs);
    // NOTE: do NOT unref this timer. If we did, Node could exit the event
    // loop while waiting on the inflight promise (Promises themselves don't
    // hold the loop open — only timers/I/O do).
  });
  let winner;
  try {
    winner = await Promise.race([inflight, timeoutPromise]);
  } finally {
    // Runs on success, expiry and rejection, so an armed timer never leaks
    // past the resolution or fires an abort at an already-finished call.
    clearTimeout(timer);
  }
  if (!timedOut) return winner;
  // Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
  // sweeps) actually runs before we hand a synthetic envelope to the caller.
  const settled = await inflight.catch((err) => ({
    text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
  }));
  const message = `CC turn ${label} timed out after ${timeoutMs}ms`;
  return {
    ...settled,
    text: '',
    code: settled?.code || 1,
    errorClass: 'cc-turn-timeout',
    errorMessage: message,
    error: { message, code: 'cc-turn-timeout', retriable: true },
    ok: false,
  };
}
|
|
970
|
+
|
|
828
971
|
module.exports = {
|
|
829
972
|
callLLM,
|
|
830
973
|
callLLMStreaming,
|
|
831
974
|
trackEngineUsage,
|
|
975
|
+
trackEngineError,
|
|
832
976
|
flushMetricsBuffer,
|
|
977
|
+
withCcTurnTimeout,
|
|
833
978
|
// Exposed for unit tests — engine code MUST use the runtime adapter contract.
|
|
834
979
|
_buildSpawnAgentFlags,
|
|
980
|
+
_buildErrorEnvelope,
|
|
835
981
|
_resolveBin,
|
|
836
982
|
_resetBinCache,
|
|
837
983
|
_resetMetricsBufferForTest,
|
package/engine/preflight.js
CHANGED
|
@@ -87,17 +87,17 @@ function findClaudeBinary() {
|
|
|
87
87
|
* `shared.runtimeConfigWarnings` so unknown-CLI warnings and binary checks
|
|
88
88
|
* always cover the same surface.
|
|
89
89
|
*
|
|
90
|
-
* Without a config (legacy callers), returns just `['
|
|
91
|
-
*
|
|
90
|
+
* Without a config (legacy callers), returns just `['copilot']` — matches
|
|
91
|
+
* `ENGINE_DEFAULTS.defaultCli` (W-mpmwxkk40007c995).
|
|
92
92
|
*/
|
|
93
93
|
function _distinctRuntimes(config) {
|
|
94
94
|
const set = new Set();
|
|
95
95
|
if (!config || typeof config !== 'object') {
|
|
96
|
-
set.add('
|
|
96
|
+
set.add('copilot');
|
|
97
97
|
return Array.from(set);
|
|
98
98
|
}
|
|
99
99
|
const engine = config.engine || {};
|
|
100
|
-
set.add(engine.defaultCli ? String(engine.defaultCli) : '
|
|
100
|
+
set.add(engine.defaultCli ? String(engine.defaultCli) : 'copilot');
|
|
101
101
|
if (engine.ccCli) set.add(String(engine.ccCli));
|
|
102
102
|
for (const agent of Object.values(config.agents || {})) {
|
|
103
103
|
if (agent && agent.cli) set.add(String(agent.cli));
|
|
@@ -355,7 +355,7 @@ function _fleetSummaryResults(config) {
|
|
|
355
355
|
const results = [];
|
|
356
356
|
if (!config || typeof config !== 'object') return results;
|
|
357
357
|
const engine = config.engine || {};
|
|
358
|
-
const defaultCli = engine.defaultCli ? String(engine.defaultCli) : '
|
|
358
|
+
const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'copilot';
|
|
359
359
|
const defaultModel = engine.defaultModel ? String(engine.defaultModel) : '(runtime default)';
|
|
360
360
|
results.push({ name: 'Fleet', ok: true, message: `defaultCli=${defaultCli} defaultModel=${defaultModel}` });
|
|
361
361
|
|