@yemi33/minions 0.1.2045 → 0.1.2047
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dashboard/js/fre.js +3 -2
- package/dashboard/js/render-prs.js +82 -2
- package/dashboard/js/settings.js +5 -5
- package/dashboard/styles.css +11 -0
- package/dashboard.js +376 -135
- package/docs/copilot-cli-schema.md +2 -1
- package/docs/runtime-adapters.md +9 -4
- package/engine/cc-worker-pool.js +87 -11
- package/engine/llm.js +148 -2
- package/engine/preflight.js +5 -5
- package/engine/queries.js +75 -35
- package/engine/runtimes/claude.js +41 -0
- package/engine/runtimes/copilot.js +97 -3
- package/engine/shared.js +4 -3
- package/package.json +1 -1
|
@@ -614,8 +614,9 @@ When implementing `engine/runtimes/copilot.js`:
|
|
|
614
614
|
should still parse cleanly — let the consumer decide to ignore.
|
|
615
615
|
9. `parseError(rawOutput)` patterns:
|
|
616
616
|
- `auth-failure`: `/not authenticated|copilot login|401|403/i`
|
|
617
|
+
- `model-unavailable` (retriable=false): `/unknown model|model not found|invalid model|model_not_found|400.*model/i` — message includes the cached model catalog (`_warmModelCache` populates `_modelDiscoveryResults` from `listModels()` ahead of time so the error path stays sync). Falls back to "Configure a valid model in Settings → Engine." when the cache is empty.
|
|
618
|
+
- `model-unavailable` (retriable=true): `/overloaded_error|service_unavailable|503|temporarily unavailable/i` — engine retries with `engine.copilotFallbackModel`.
|
|
617
619
|
- `rate-limit`: `/rate limit|too many requests|429/i`
|
|
618
|
-
- `unknown-model`: `/unknown model|model not found|model.*invalid/i`
|
|
619
620
|
- `crash`: `/internal error|panic|uncaught/i`
|
|
620
621
|
10. `listModels()` per §6 — return `null` on any failure (network, parse, auth).
|
|
621
622
|
`modelsCache` path: `engine/copilot-models.json`.
|
package/docs/runtime-adapters.md
CHANGED
|
@@ -14,7 +14,12 @@ behavior is hidden behind an adapter object resolved through `resolveRuntime()`.
|
|
|
14
14
|
|
|
15
15
|
`resolveRuntime(name)` throws when `name` is unknown so misconfigurations surface
|
|
16
16
|
at dispatch time instead of producing silent fallbacks deep inside spawn logic.
|
|
17
|
-
|
|
17
|
+
When `name` is `null`/omitted, `resolveRuntime()` falls back to `'claude'` for
|
|
18
|
+
parser-routing compatibility (Copilot's `parseOutput` cannot consume the Claude
|
|
19
|
+
JSONL `{type:"result",result:"..."}` shape — see W-mpmwxkk40007c995). The fleet
|
|
20
|
+
default that determines which runtime *new spawns* use is separate:
|
|
21
|
+
`ENGINE_DEFAULTS.defaultCli` (also in W-mpmwxkk40007c995) is now `'copilot'`, so
|
|
22
|
+
operators with no explicit `engine.defaultCli` get Copilot on dispatch.
|
|
18
23
|
|
|
19
24
|
## Adapter Interface
|
|
20
25
|
|
|
@@ -44,7 +49,7 @@ methods that genuinely differ.
|
|
|
44
49
|
| `modelLooksFamiliar(model)` | boolean | Heuristic powering the preflight "stale model after CLI switch" warning. |
|
|
45
50
|
| `parseOutput(raw)` | `{ text, usage, sessionId, model }` | Final-event parser. |
|
|
46
51
|
| `parseStreamChunk(line)` | event object or null | Single JSONL line → typed event. |
|
|
47
|
-
| `parseError(rawOutput)` | `{ message, code, retriable }` | Codes: `auth-failure`, `context-limit`, `budget-exceeded`, `crash`, null. |
|
|
52
|
+
| `parseError(rawOutput)` | `{ message, code, retriable }` | Codes: `auth-failure`, `context-limit`, `budget-exceeded`, `model-unavailable` (retriable=true for upstream overload/503; retriable=false for invalid/typo'd model id — Copilot enriches the message via `_warmModelCache()` so it lists the available models), `crash`, null. |
|
|
48
53
|
| `createStreamConsumer(ctx)` | consumer object | Stream accumulator used by `engine/llm.js`. |
|
|
49
54
|
| `detectPermissionGate`, `getPromptDeliveryMode`, `usesSystemPromptFile`, `classifyFailure` | misc | Adapter-owned policy that engine code reads through accessors instead of branching on `runtime.name`. |
|
|
50
55
|
|
|
@@ -93,8 +98,8 @@ directly.
|
|
|
93
98
|
|
|
94
99
|
| Helper | Chain |
|
|
95
100
|
|--------|-------|
|
|
96
|
-
| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'claude'` |
|
|
97
|
-
| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'claude'` |
|
|
101
|
+
| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'copilot'` |
|
|
102
|
+
| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'copilot'` |
|
|
98
103
|
| `resolveAgentModel(agent, engine)` | `agent.model` → `engine.defaultModel` → undefined |
|
|
99
104
|
| `resolveCcModel(engine)` | `engine.ccModel` → `engine.defaultModel` → undefined |
|
|
100
105
|
| `resolveAgentMaxBudget(agent, engine)` | `agent.maxBudgetUsd` → `engine.maxBudgetUsd`. Honors literal `0`. |
|
package/engine/cc-worker-pool.js
CHANGED
|
@@ -54,6 +54,45 @@
|
|
|
54
54
|
const { spawn } = require('child_process');
|
|
55
55
|
const crypto = require('crypto');
|
|
56
56
|
|
|
57
|
+
/**
 * W-mpmwxni2000c25c7-c — typed error codes stamped on every failure the pool
 * emits. Consumers (CC streaming handler, doc-chat pool wrapper, SSE writer)
 * read these instead of parsing the stderr string, so they can render a
 * structured error envelope. The values pair with the
 * `{ message, code, retriable }` shape that sub-item b standardized on for
 * the dashboard's SSE envelope and the runtime adapter parseError() contract
 * (engine/runtimes/*.js).
 */
const ERROR_CODES = Object.freeze({
  // spawn() threw synchronously, or the child emitted an 'error' event
  // (binary missing on PATH, exec failed, EPERM, ...). Retriable: a
  // transient PATH / fs glitch may recover.
  WORKER_SPAWN_FAILED: 'worker-spawn-failed',

  // The worker exited DURING the ACP handshake (initialize or session/new),
  // usually because `copilot login` is incomplete or the CLI is too old.
  // Also used when session/new returns no sessionId. Retriable: the engine
  // swaps to a fallback model, or a re-auth may unblock the next attempt.
  ACP_HANDSHAKE_FAILED: 'acp-handshake-failed',

  // The worker exited AFTER a successful handshake (daemon died mid-turn).
  // Retriable: the next call cold-spawns a fresh worker.
  WORKER_DIED: 'worker-died',

  // The consumer's per-turn timeout fired before ACP session/prompt
  // resolved. The dashboard pool wrappers own this one (cc-worker-pool has
  // no turn timeout of its own); it lives here so every caller stringifies
  // the same constant. Retriable: most timeouts are transient.
  CC_TURN_TIMEOUT: 'cc-turn-timeout',
});
|
|
83
|
+
|
|
84
|
+
/**
 * Create an Error carrying the `{ message, code, retriable }` envelope
 * fields the consumer expects. The result is an ordinary Error with two
 * extra own properties stamped on it.
 *
 * @param {string} message - Human-readable failure description.
 * @param {string} code - Typed error code to stamp on the error.
 * @param {boolean} [retriable=true] - Defaults to `true` so a caller that
 *   forgets to pass it keeps the legacy behaviour (the pre-typed-error code
 *   path treated every failure as retriable).
 * @returns {Error} The stamped error.
 */
function _typedError(message, code, retriable = true) {
  return Object.assign(new Error(message), { code, retriable });
}
|
|
95
|
+
|
|
57
96
|
// 10 minutes — matches the work-item spec.
|
|
58
97
|
const IDLE_REAPER_MS = 10 * 60 * 1000;
|
|
59
98
|
// Reaper sweep cadence. Not exposed as ENGINE_DEFAULTS to keep the pool
|
|
@@ -176,8 +215,13 @@ class Worker {
|
|
|
176
215
|
try {
|
|
177
216
|
proc = _internals.spawnAcp({ cwd: this.cwd });
|
|
178
217
|
} catch (err) {
|
|
179
|
-
|
|
180
|
-
|
|
218
|
+
// spawn() threw synchronously — typically ENOENT (copilot binary not
|
|
219
|
+
// on PATH) or EACCES. Surface as worker-spawn-failed so the consumer
|
|
220
|
+
// can show "install the CLI / fix PATH" guidance.
|
|
221
|
+
throw _typedError(
|
|
222
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
|
|
223
|
+
ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
224
|
+
true
|
|
181
225
|
);
|
|
182
226
|
}
|
|
183
227
|
this.proc = proc;
|
|
@@ -193,8 +237,13 @@ class Worker {
|
|
|
193
237
|
const earlyExitPromise = new Promise((_, reject) => {
|
|
194
238
|
earlyExitReject = (code) => {
|
|
195
239
|
this.killed = true;
|
|
196
|
-
|
|
197
|
-
|
|
240
|
+
// Early exit DURING the handshake = acp-handshake-failed (almost
|
|
241
|
+
// always missing `copilot login`, stale CLI, or daemon crash on
|
|
242
|
+
// boot). Retriable so re-auth or a CLI upgrade can recover.
|
|
243
|
+
const err = _typedError(
|
|
244
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`,
|
|
245
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
246
|
+
true
|
|
198
247
|
);
|
|
199
248
|
this.spawnError = err;
|
|
200
249
|
this._failAllPending(err);
|
|
@@ -205,8 +254,13 @@ class Worker {
|
|
|
205
254
|
proc.once('exit', earlyExitHandler);
|
|
206
255
|
|
|
207
256
|
const errorHandler = (err) => {
|
|
208
|
-
|
|
209
|
-
|
|
257
|
+
// proc 'error' event fires when the OS can't actually start the child
|
|
258
|
+
// (ENOENT after a successful spawn() call, etc.). Treat as a spawn
|
|
259
|
+
// failure even though we made it past the synchronous spawn() above.
|
|
260
|
+
const wrapped = _typedError(
|
|
261
|
+
`copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
|
|
262
|
+
ERROR_CODES.WORKER_SPAWN_FAILED,
|
|
263
|
+
true
|
|
210
264
|
);
|
|
211
265
|
this.spawnError = wrapped;
|
|
212
266
|
this.killed = true;
|
|
@@ -227,7 +281,13 @@ class Worker {
|
|
|
227
281
|
]);
|
|
228
282
|
this.sessionId = result && result.sessionId;
|
|
229
283
|
if (!this.sessionId) {
|
|
230
|
-
|
|
284
|
+
// Handshake completed without an error but the daemon didn't hand
|
|
285
|
+
// back a sessionId — protocol violation or partial init failure.
|
|
286
|
+
throw _typedError(
|
|
287
|
+
'copilot --acp failed -- session/new returned no sessionId',
|
|
288
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
289
|
+
true
|
|
290
|
+
);
|
|
231
291
|
}
|
|
232
292
|
} finally {
|
|
233
293
|
// Either the handshake finished (swap to a persistent exit handler that
|
|
@@ -236,7 +296,13 @@ class Worker {
|
|
|
236
296
|
}
|
|
237
297
|
proc.on('exit', () => {
|
|
238
298
|
this.killed = true;
|
|
239
|
-
|
|
299
|
+
// Post-handshake exit = the daemon died mid-conversation. Retriable
|
|
300
|
+
// because the next call will cold-spawn a fresh worker.
|
|
301
|
+
const err = _typedError(
|
|
302
|
+
'copilot --acp process exited',
|
|
303
|
+
ERROR_CODES.WORKER_DIED,
|
|
304
|
+
true
|
|
305
|
+
);
|
|
240
306
|
this._failAllPending(err);
|
|
241
307
|
// Settle inflight too if it's still hanging
|
|
242
308
|
if (this.inflight && !this.inflight.settled) {
|
|
@@ -656,9 +722,13 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
|
|
|
656
722
|
// This is the bug class the ab141995 fix closed; if it ever recurs the
|
|
657
723
|
// engine should fail loudly rather than hand back a half-initialized
|
|
658
724
|
// handle. Throwing here lets the dashboard surface spawn-failed instead
|
|
659
|
-
// of the silent thinking-dots-forever symptom.
|
|
660
|
-
|
|
661
|
-
|
|
725
|
+
// of the silent thinking-dots-forever symptom. Mark non-retriable —
|
|
726
|
+
// this is a real engine bug, not a transient pool failure; the next
|
|
727
|
+
// attempt would hit the same race.
|
|
728
|
+
throw _typedError(
|
|
729
|
+
`cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`,
|
|
730
|
+
ERROR_CODES.ACP_HANDSHAKE_FAILED,
|
|
731
|
+
false
|
|
662
732
|
);
|
|
663
733
|
}
|
|
664
734
|
|
|
@@ -766,4 +836,10 @@ module.exports = {
|
|
|
766
836
|
IDLE_REAPER_MS,
|
|
767
837
|
REAPER_INTERVAL_MS,
|
|
768
838
|
WARM_MAX_CONCURRENT,
|
|
839
|
+
// W-mpmwxni2000c25c7-c — typed-error envelope contract. Exported so the
|
|
840
|
+
// dashboard pool wrappers (and their tests) reference the same string
|
|
841
|
+
// constants and so the doc-chat timeout path can stamp the same
|
|
842
|
+
// `{ message, code, retriable }` shape the pool itself emits.
|
|
843
|
+
ERROR_CODES,
|
|
844
|
+
_typedError,
|
|
769
845
|
};
|
package/engine/llm.js
CHANGED
|
@@ -82,6 +82,21 @@ function trackEngineUsage(category, usage) {
|
|
|
82
82
|
_ensureFlushTimer();
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
+
/**
 * W-mpmwxni2000c25c7-b — silent-error regression counter.
 *
 * Bumps the pending `errorsByCode[errorCode]` counter for `category` so that
 * error codes (cc-turn-timeout, empty-output, ...) are counted separately
 * from cost/token usage. The increment is buffered in `_pendingMetrics` and
 * flushed on the same timer as trackEngineUsage.
 *
 * @param {string} category - Engine metrics category; calls with a falsy
 *   category, or one starting with `_test` / `test-`, are ignored.
 * @param {string} errorCode - Error code to count; falsy codes are ignored.
 * @returns {void}
 */
function trackEngineError(category, errorCode) {
  if (!category || !errorCode) return;

  // Test categories never reach the persisted metrics.
  const isTestCategory = ['_test', 'test-'].some((prefix) => category.startsWith(prefix));
  if (isTestCategory) return;

  if (!_pendingMetrics.engine[category]) {
    _pendingMetrics.engine[category] = _emptyEngineDelta();
  }
  const bucket = _pendingMetrics.engine[category];

  // Prototype-less map: error codes are arbitrary strings used as keys.
  if (!bucket.errorsByCode) {
    bucket.errorsByCode = Object.create(null);
  }
  const seenSoFar = bucket.errorsByCode[errorCode] || 0;
  bucket.errorsByCode[errorCode] = seenSoFar + 1;

  _ensureFlushTimer();
}
|
|
99
|
+
|
|
85
100
|
function flushMetricsBuffer() {
|
|
86
101
|
const pending = _pendingMetrics;
|
|
87
102
|
if (!Object.keys(pending.engine).length && !Object.keys(pending.daily).length) return;
|
|
@@ -106,6 +121,12 @@ function flushMetricsBuffer() {
|
|
|
106
121
|
cat.totalDurationMs = (cat.totalDurationMs || 0) + delta.totalDurationMs;
|
|
107
122
|
cat.timedCalls = (cat.timedCalls || 0) + delta.timedCalls;
|
|
108
123
|
}
|
|
124
|
+
if (delta.errorsByCode) {
|
|
125
|
+
if (!cat.errorsByCode) cat.errorsByCode = {};
|
|
126
|
+
for (const [code, count] of Object.entries(delta.errorsByCode)) {
|
|
127
|
+
cat.errorsByCode[code] = (cat.errorsByCode[code] || 0) + count;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
109
130
|
}
|
|
110
131
|
if (!metrics._daily) metrics._daily = {};
|
|
111
132
|
for (const [day, delta] of Object.entries(pending.daily)) {
|
|
@@ -129,6 +150,12 @@ function flushMetricsBuffer() {
|
|
|
129
150
|
c.inputTokens += delta.inputTokens; c.outputTokens += delta.outputTokens;
|
|
130
151
|
c.cacheRead += delta.cacheRead; c.cacheCreation += delta.cacheCreation;
|
|
131
152
|
c.totalDurationMs += delta.totalDurationMs; c.timedCalls += delta.timedCalls;
|
|
153
|
+
if (delta.errorsByCode) {
|
|
154
|
+
if (!c.errorsByCode) c.errorsByCode = Object.create(null);
|
|
155
|
+
for (const [code, count] of Object.entries(delta.errorsByCode)) {
|
|
156
|
+
c.errorsByCode[code] = (c.errorsByCode[code] || 0) + count;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
132
159
|
}
|
|
133
160
|
for (const [day, delta] of Object.entries(pending.daily)) {
|
|
134
161
|
if (!_pendingMetrics.daily[day]) _pendingMetrics.daily[day] = _emptyDailyDelta();
|
|
@@ -233,6 +260,8 @@ function _missingRuntimeResult(runtimeName, runtime, reason) {
|
|
|
233
260
|
errorClass: shared.FAILURE_CLASS.CONFIG_ERROR,
|
|
234
261
|
errorMessage: message,
|
|
235
262
|
missingRuntime: true,
|
|
263
|
+
error: { message, code: shared.FAILURE_CLASS.CONFIG_ERROR, retriable: false },
|
|
264
|
+
ok: false,
|
|
236
265
|
};
|
|
237
266
|
}
|
|
238
267
|
|
|
@@ -245,7 +274,7 @@ function _resolvedCallResult(result) {
|
|
|
245
274
|
function _resolveRuntimeNameFor(callOpts = {}) {
|
|
246
275
|
let runtimeName = callOpts.cli;
|
|
247
276
|
if (!runtimeName && callOpts.engineConfig) runtimeName = resolveCcCli(callOpts.engineConfig);
|
|
248
|
-
return runtimeName || '
|
|
277
|
+
return runtimeName || 'copilot';
|
|
249
278
|
}
|
|
250
279
|
|
|
251
280
|
function _runtimeUnavailableResult(callOpts = {}) {
|
|
@@ -566,7 +595,7 @@ function _createStreamAccumulator({
|
|
|
566
595
|
|
|
567
596
|
function _resolveRuntimeFor(callOpts) {
|
|
568
597
|
// Explicit `cli` opt wins; otherwise fall to `engineConfig` resolution;
|
|
569
|
-
// otherwise default to
|
|
598
|
+
// otherwise default to copilot (fleet default as of W-mpmwxkk40007c995).
|
|
570
599
|
return resolveRuntime(_resolveRuntimeNameFor(callOpts));
|
|
571
600
|
}
|
|
572
601
|
|
|
@@ -599,6 +628,52 @@ function _resolveRuntimeFeatureOpts({
|
|
|
599
628
|
|
|
600
629
|
// ─── Core LLM Call ───────────────────────────────────────────────────────────
|
|
601
630
|
|
|
631
|
+
/**
 * W-mpmwxni2000c25c7-b — typed-error envelope helper.
 *
 * callLLM / callLLMStreaming attach `error: { message, code, retriable }` to
 * every failure resolution so dashboard CC/doc-chat handlers can surface a
 * structured 5xx JSON or SSE `event: error` instead of returning an empty
 * reply that hangs the UI. The shape mirrors the `runtime.parseError`
 * contract so adapter classifications (auth-failure, context-limit,
 * budget-exceeded, crash, model-unavailable) propagate verbatim.
 *
 * Engine codes produced here:
 *   - 'runtime-exit'        non-zero exit code with no parseError signal
 *   - 'unparseable-output'  bytes streamed but no text was extracted
 *                           (malformed JSONL or unknown event shape)
 *   - 'empty-output'        no text and no raw output at all (CLI bug or
 *                           silent timeout)
 * ('spawn-error' belongs to the same envelope family but is stamped directly
 * by the spawn-failure handlers, not by this helper.)
 *
 * Existing `errorClass` / `errorMessage` fields stay populated by the callers
 * for consumers that haven't moved to the typed envelope yet.
 *
 * @param {{message?: string, code?: string|null, retriable?: boolean}|null} errInfo
 *   Adapter classification; used verbatim when it carries a truthy `code`.
 * @param {number|null} code - Process exit code (`null` = no exit code).
 * @param {{text?: string, raw?: string, stderr?: string}|null} parsed
 *   Parsed runtime output.
 * @param {string|null} fallback - Message used when nothing more specific is
 *   available.
 * @returns {{message: string, code: string, retriable: boolean}|null}
 *   The envelope, or `null` when the call succeeded (text was produced).
 */
function _buildErrorEnvelope(errInfo, code, parsed, fallback) {
  // 1. The adapter already classified the failure — pass it through.
  //    `retriable` is true unless the adapter explicitly said false.
  if (errInfo && errInfo.code) {
    return {
      message: errInfo.message || fallback || 'LLM call failed',
      code: errInfo.code,
      retriable: errInfo.retriable !== false,
    };
  }

  // 2. Unclassified non-zero exit: report the exit code plus the last three
  //    stderr lines (capped at 500 chars) for context.
  if (code !== 0 && code !== null) {
    let stderrTail = '';
    if (parsed && parsed.stderr) {
      // Split on LF or CRLF so Windows stderr doesn't leave stray '\r'
      // characters embedded in the joined message.
      stderrTail = String(parsed.stderr)
        .trim()
        .split(/\r?\n/)
        .slice(-3)
        .join(' | ')
        .slice(0, 500);
    }
    return {
      message: stderrTail
        ? `Runtime exited with code ${code}: ${stderrTail}`
        : `Runtime exited with code ${code}`,
      code: 'runtime-exit',
      retriable: true,
    };
  }

  // 3. Clean exit with text — success, no envelope.
  if (parsed && parsed.text) return null;

  // 4. Clean exit without text: distinguish "bytes we couldn't parse" from
  //    "nothing at all".
  const rawLen = parsed && parsed.raw ? String(parsed.raw).length : 0;
  if (rawLen > 0) {
    return {
      message: 'Runtime produced output the adapter could not parse',
      code: 'unparseable-output',
      retriable: true,
    };
  }
  return {
    message: fallback || 'Runtime returned no output',
    code: 'empty-output',
    retriable: true,
  };
}
|
|
676
|
+
|
|
602
677
|
function callLLM(promptText, sysPromptText, opts = {}) {
|
|
603
678
|
const {
|
|
604
679
|
timeout = 120000, label = 'llm', maxTurns = 1, allowedTools = '',
|
|
@@ -670,6 +745,7 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
670
745
|
const errInfo = code !== 0
|
|
671
746
|
? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
|
|
672
747
|
: { message: '', code: null, retriable: true };
|
|
748
|
+
const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
|
|
673
749
|
resolve({
|
|
674
750
|
text: parsed.text || '',
|
|
675
751
|
usage,
|
|
@@ -681,6 +757,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
681
757
|
runtime: runtime.name,
|
|
682
758
|
errorClass: errInfo.code,
|
|
683
759
|
errorMessage: errInfo.message || null,
|
|
760
|
+
error: errorEnvelope,
|
|
761
|
+
ok: !errorEnvelope,
|
|
684
762
|
});
|
|
685
763
|
};
|
|
686
764
|
|
|
@@ -704,6 +782,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
|
|
|
704
782
|
text: '', usage: null, sessionId: null, code: 1,
|
|
705
783
|
stderr: err.message, raw: '', toolUses: [],
|
|
706
784
|
runtime: runtime.name, errorClass: null, errorMessage: null,
|
|
785
|
+
error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
|
|
786
|
+
ok: false,
|
|
707
787
|
});
|
|
708
788
|
});
|
|
709
789
|
});
|
|
@@ -784,6 +864,7 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
784
864
|
const errInfo = code !== 0
|
|
785
865
|
? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
|
|
786
866
|
: { message: '', code: null, retriable: true };
|
|
867
|
+
const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
|
|
787
868
|
resolve({
|
|
788
869
|
text: parsed.text || '',
|
|
789
870
|
usage,
|
|
@@ -795,6 +876,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
795
876
|
runtime: runtime.name,
|
|
796
877
|
errorClass: errInfo.code,
|
|
797
878
|
errorMessage: errInfo.message || null,
|
|
879
|
+
error: errorEnvelope,
|
|
880
|
+
ok: !errorEnvelope,
|
|
798
881
|
});
|
|
799
882
|
};
|
|
800
883
|
|
|
@@ -818,6 +901,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
818
901
|
text: '', usage: null, sessionId: null, code: 1,
|
|
819
902
|
stderr: err.message, raw: '', toolUses: [],
|
|
820
903
|
runtime: runtime.name, errorClass: null, errorMessage: null,
|
|
904
|
+
error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
|
|
905
|
+
ok: false,
|
|
821
906
|
});
|
|
822
907
|
});
|
|
823
908
|
});
|
|
@@ -825,13 +910,74 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
|
|
|
825
910
|
return promise;
|
|
826
911
|
}
|
|
827
912
|
|
|
913
|
+
// ─── CC turn watchdog ────────────────────────────────────────────────────────
|
|
914
|
+
//
|
|
915
|
+
/**
 * W-mpmwxni2000c25c7-b — wall-clock cap for a single CC/doc-chat turn.
 *
 * A CC turn is a higher-level concept than the per-LLM-call `timeout` opt: a
 * turn can internally retry (resume → fresh → final retry) and each retry has
 * its own per-call timer. Without a turn-level watchdog, a runtime stuck
 * mid-stream (no exit, no chunks, no errors) leaves the SSE handler waiting
 * for the per-call timer and the user staring at the typing dots.
 *
 * Usage:
 *   result = await withCcTurnTimeout(
 *     { timeoutMs, label, onAbortReady },
 *     (registerAbort) => callerThatReturnsResultPromise(registerAbort));
 *
 * The caller plumbs `registerAbort(abortFn)` into every nested LLM call's
 * `onAbortReady` so the watchdog can kill whichever attempt is in flight on
 * expiry. An abort handle registered AFTER the watchdog has fired (for
 * example a retry the caller starts once the first attempt was aborted) is
 * invoked immediately, so a late attempt cannot outlive the turn cap.
 *
 * @param {object} [options]
 * @param {number} [options.timeoutMs] - Turn cap in ms. Falsy or <= 0
 *   disables the watchdog and `callFn` is invoked directly.
 * @param {string} [options.label='cc-turn'] - Label used in the timeout message.
 * @param {(abort: Function) => void} [options.onAbortReady] - Forwarded every
 *   abort handle the caller registers.
 * @param {(registerAbort: (abort: Function) => void) => any} callFn - Runs
 *   the turn; may return a value or a promise.
 * @returns {Promise<any>} `callFn`'s result when it settles first (a
 *   rejection propagates unchanged), otherwise the settled result overlaid
 *   with `{ text: '', errorClass: 'cc-turn-timeout', error: { code:
 *   'cc-turn-timeout', retriable: true }, ok: false }`.
 */
async function withCcTurnTimeout({ timeoutMs, label = 'cc-turn', onAbortReady } = {}, callFn) {
  if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}));

  let currentAbort = null;
  let timedOut = false;
  let timer = null;

  // Abort is best-effort: a throwing or missing handle must not break the
  // watchdog itself.
  const invokeAbort = (abort) => {
    try { if (abort) abort(); } catch { /* swallow */ }
  };

  const registerAbort = (abort) => {
    currentAbort = abort;
    if (onAbortReady) onAbortReady(abort);
    // The turn already expired: kill this late attempt right away instead of
    // letting it run to its own per-call timeout.
    if (timedOut) invokeAbort(abort);
  };

  const inflight = Promise.resolve().then(() => callFn(registerAbort));
  const timeoutPromise = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      invokeAbort(currentAbort);
      resolve(null);
    }, timeoutMs);
    // NOTE: do NOT unref this timer. If we did, Node would exit the event
    // loop while waiting on the inflight promise (Promises themselves don't
    // hold the loop open — only timers/I/O do). The `finally` below clears
    // the timer as soon as the race settles, so it never leaks past the
    // resolution.
  });

  let winner;
  try {
    winner = await Promise.race([inflight, timeoutPromise]);
  } finally {
    // Clear on success AND on rejection; otherwise a failed turn would keep
    // the timer armed and later fire a stale abort.
    clearTimeout(timer);
  }
  if (!timedOut) return winner;

  // Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
  // sweeps) actually runs before we hand a synthetic envelope to the caller.
  const settled = await inflight.catch((err) => ({
    text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
  }));
  const message = `CC turn ${label} timed out after ${timeoutMs}ms`;
  return {
    ...settled,
    text: '',
    code: settled?.code || 1,
    errorClass: 'cc-turn-timeout',
    errorMessage: message,
    error: { message, code: 'cc-turn-timeout', retriable: true },
    ok: false,
  };
}
|
|
970
|
+
|
|
828
971
|
module.exports = {
|
|
829
972
|
callLLM,
|
|
830
973
|
callLLMStreaming,
|
|
831
974
|
trackEngineUsage,
|
|
975
|
+
trackEngineError,
|
|
832
976
|
flushMetricsBuffer,
|
|
977
|
+
withCcTurnTimeout,
|
|
833
978
|
// Exposed for unit tests — engine code MUST use the runtime adapter contract.
|
|
834
979
|
_buildSpawnAgentFlags,
|
|
980
|
+
_buildErrorEnvelope,
|
|
835
981
|
_resolveBin,
|
|
836
982
|
_resetBinCache,
|
|
837
983
|
_resetMetricsBufferForTest,
|
package/engine/preflight.js
CHANGED
|
@@ -87,17 +87,17 @@ function findClaudeBinary() {
|
|
|
87
87
|
* `shared.runtimeConfigWarnings` so unknown-CLI warnings and binary checks
|
|
88
88
|
* always cover the same surface.
|
|
89
89
|
*
|
|
90
|
-
* Without a config (legacy callers), returns just `['
|
|
91
|
-
*
|
|
90
|
+
* Without a config (legacy callers), returns just `['copilot']` — matches
|
|
91
|
+
* `ENGINE_DEFAULTS.defaultCli` (W-mpmwxkk40007c995).
|
|
92
92
|
*/
|
|
93
93
|
function _distinctRuntimes(config) {
|
|
94
94
|
const set = new Set();
|
|
95
95
|
if (!config || typeof config !== 'object') {
|
|
96
|
-
set.add('
|
|
96
|
+
set.add('copilot');
|
|
97
97
|
return Array.from(set);
|
|
98
98
|
}
|
|
99
99
|
const engine = config.engine || {};
|
|
100
|
-
set.add(engine.defaultCli ? String(engine.defaultCli) : '
|
|
100
|
+
set.add(engine.defaultCli ? String(engine.defaultCli) : 'copilot');
|
|
101
101
|
if (engine.ccCli) set.add(String(engine.ccCli));
|
|
102
102
|
for (const agent of Object.values(config.agents || {})) {
|
|
103
103
|
if (agent && agent.cli) set.add(String(agent.cli));
|
|
@@ -355,7 +355,7 @@ function _fleetSummaryResults(config) {
|
|
|
355
355
|
const results = [];
|
|
356
356
|
if (!config || typeof config !== 'object') return results;
|
|
357
357
|
const engine = config.engine || {};
|
|
358
|
-
const defaultCli = engine.defaultCli ? String(engine.defaultCli) : '
|
|
358
|
+
const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'copilot';
|
|
359
359
|
const defaultModel = engine.defaultModel ? String(engine.defaultModel) : '(runtime default)';
|
|
360
360
|
results.push({ name: 'Fleet', ok: true, message: `defaultCli=${defaultCli} defaultModel=${defaultModel}` });
|
|
361
361
|
|
package/engine/queries.js
CHANGED
|
@@ -528,7 +528,7 @@ function getAgents(config) {
|
|
|
528
528
|
|
|
529
529
|
return roster.map(a => {
|
|
530
530
|
// Resolve which CLI runtime this agent dispatches to: per-agent override
|
|
531
|
-
// → engine.defaultCli → '
|
|
531
|
+
// → engine.defaultCli → 'copilot'. Surfaced so the dashboard can show a
|
|
532
532
|
// runtime tag next to the agent name.
|
|
533
533
|
const runtime = shared.resolveAgentCli(a, config.engine || {});
|
|
534
534
|
const inboxFiles = allInboxFiles.filter(f => f.includes(a.id));
|
|
@@ -1770,19 +1770,18 @@ function _projectGitStatusEqual(a, b) {
|
|
|
1770
1770
|
function _scheduleProjectGitStatusRefresh(localPath, key, configuredMainBranch) {
|
|
1771
1771
|
const existing = _projectGitStatusCache.get(key);
|
|
1772
1772
|
if (existing && existing.promise) return existing.promise;
|
|
1773
|
-
const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null };
|
|
1773
|
+
const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null, refMtimes: null };
|
|
1774
1774
|
const prevValue = entry.value;
|
|
1775
|
-
//
|
|
1776
|
-
//
|
|
1777
|
-
//
|
|
1778
|
-
//
|
|
1779
|
-
//
|
|
1780
|
-
// cache spuriously on the very next read. Probe-START is the safer
|
|
1781
|
-
// anchor — any file with `mtimeMs > probeStartTs` legitimately changed
|
|
1782
|
-
// at-or-after the probe, so re-probing is correct.
|
|
1775
|
+
// Snapshot ref mtimes BEFORE the probe so the next call compares against
|
|
1776
|
+
// an exact baseline rather than a Date.now() timestamp. On Windows
|
|
1777
|
+
// Date.now() can have ~15ms granularity while NTFS mtime is sub-ms, so
|
|
1778
|
+
// a file written shortly before the probe could appear `mtimeMs > ts`
|
|
1779
|
+
// even when nothing actually changed.
|
|
1783
1780
|
const probeStartTs = Date.now();
|
|
1781
|
+
const probeStartRefMtimes = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
|
|
1784
1782
|
entry.promise = _probeProjectGitStatus(localPath, configuredMainBranch).then(value => {
|
|
1785
1783
|
entry.ts = probeStartTs;
|
|
1784
|
+
entry.refMtimes = probeStartRefMtimes;
|
|
1786
1785
|
entry.value = value;
|
|
1787
1786
|
entry.promise = null;
|
|
1788
1787
|
if (_onProjectGitStatusChanged && !_projectGitStatusEqual(prevValue, value)) {
|
|
@@ -1857,35 +1856,65 @@ function _resolveCommonGitDir(gitDir) {
|
|
|
1857
1856
|
return path.isAbsolute(raw) ? raw : path.resolve(gitDir, raw);
|
|
1858
1857
|
}
|
|
1859
1858
|
|
|
1860
|
-
//
|
|
1861
|
-
//
|
|
1862
|
-
//
|
|
1863
|
-
//
|
|
1864
|
-
|
|
1865
|
-
// (W-mphdmr8c00030124). Tolerates ENOENT on FETCH_HEAD / refs (never-
|
|
1866
|
-
// fetched repos simply haven't moved those files yet). Cost ≤3 statSync
|
|
1867
|
-
// per project per /api/status build — well under the 'cheap' budget
|
|
1868
|
-
// called out in getStatusFastStateMtimePaths's docstring.
|
|
1869
|
-
function _projectGitRefsAdvancedSince(localPath, cachedTs, configuredMainBranch) {
|
|
1859
|
+
// Enumerate the per-project git ref files we watch for cache-busting:
|
|
1860
|
+
// logs/HEAD (per-worktree gitdir), FETCH_HEAD + refs/remotes/origin/* (common
|
|
1861
|
+
// gitdir for linked worktrees). Same paths as the fast-state mtime tracker
|
|
1862
|
+
// so callers see a coherent view across surfaces.
|
|
1863
|
+
function _projectGitRefFiles(localPath, configuredMainBranch) {
|
|
1870
1864
|
const gitDir = _resolveGitDir(localPath);
|
|
1871
|
-
if (!gitDir) return
|
|
1872
|
-
// logs/HEAD is per-worktree; FETCH_HEAD + refs/remotes/origin/* live in
|
|
1873
|
-
// the COMMON gitdir for linked worktrees. For the main worktree both
|
|
1874
|
-
// resolve to the same place, so this is a no-op there.
|
|
1865
|
+
if (!gitDir) return null;
|
|
1875
1866
|
const commonGitDir = _resolveCommonGitDir(gitDir);
|
|
1876
|
-
const
|
|
1867
|
+
const files = [
|
|
1877
1868
|
path.join(gitDir, 'logs', 'HEAD'),
|
|
1878
1869
|
path.join(commonGitDir, 'FETCH_HEAD'),
|
|
1879
1870
|
];
|
|
1880
1871
|
const comparator = configuredMainBranch && String(configuredMainBranch).trim();
|
|
1881
1872
|
if (comparator) {
|
|
1882
|
-
|
|
1873
|
+
files.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
|
|
1883
1874
|
}
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1875
|
+
return files;
|
|
1876
|
+
}
|
|
1877
|
+
|
|
1878
|
+
// Snapshot mtimeMs for each ref file. Missing files record `null`. Used as
|
|
1879
|
+
// the baseline that the next `getProjectGitStatus` call compares against —
|
|
1880
|
+
// inequality, not threshold-vs-timestamp, so the result is precision-
|
|
1881
|
+
// independent (Windows `Date.now()` can be 15ms coarse while NTFS mtime is
|
|
1882
|
+
// sub-millisecond, which used to make threshold checks fire spuriously on
|
|
1883
|
+
// freshly-written files).
|
|
1884
|
+
function _snapshotProjectGitRefMtimes(localPath, configuredMainBranch) {
|
|
1885
|
+
const files = _projectGitRefFiles(localPath, configuredMainBranch);
|
|
1886
|
+
if (!files) return null;
|
|
1887
|
+
const out = Object.create(null);
|
|
1888
|
+
for (const f of files) {
|
|
1889
|
+
try { out[f] = fs.statSync(f).mtimeMs; }
|
|
1890
|
+
catch { out[f] = null; /* ENOENT recorded as null — flipping to present must bust */ }
|
|
1891
|
+
}
|
|
1892
|
+
return out;
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
// Return true when ANY tracked ref file's mtime (or existence) differs from
|
|
1896
|
+
// the snapshot captured during the last probe. Replaces the older threshold-
|
|
1897
|
+
// vs-cachedTs check that suffered from `Date.now()`/`mtimeMs` resolution
|
|
1898
|
+
// races on Windows. Lets `getProjectGitStatus` bypass its 15s TTL after
|
|
1899
|
+
// `git pull`, `git fetch`, `git checkout`, etc. so the next /api/status
|
|
1900
|
+
// reflects the new HEAD / ahead-behind within one SPA poll instead of
|
|
1901
|
+
// waiting out the TTL (W-mphdmr8c00030124). Cost: 2-3 statSync per call —
|
|
1902
|
+
// well under the 'cheap' budget.
|
|
1903
|
+
function _projectGitRefsAdvancedSince(localPath, configuredMainBranch, snapshot) {
|
|
1904
|
+
// No snapshot yet (legacy entry shape OR first call) — preserve the
|
|
1905
|
+
// current cached value so the TTL-only fast-path still works. A real
|
|
1906
|
+
// change still surfaces on the next /api/status because the fast-state
|
|
1907
|
+
// mtime tracker watches the same files and will bust the upstream cache.
|
|
1908
|
+
if (!snapshot) return false;
|
|
1909
|
+
const current = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
|
|
1910
|
+
if (!current) return false;
|
|
1911
|
+
for (const file of Object.keys(snapshot)) {
|
|
1912
|
+
if (current[file] !== snapshot[file]) return true;
|
|
1913
|
+
}
|
|
1914
|
+
// Also catch a file that appeared since the snapshot (e.g. first `git
|
|
1915
|
+
// fetch` materialises FETCH_HEAD).
|
|
1916
|
+
for (const file of Object.keys(current)) {
|
|
1917
|
+
if (!(file in snapshot)) return true;
|
|
1889
1918
|
}
|
|
1890
1919
|
return false;
|
|
1891
1920
|
}
|
|
@@ -1901,14 +1930,25 @@ function getProjectGitStatus(localPath, configuredMainBranch = null) {
|
|
|
1901
1930
|
// the pre-pull ahead/behind counts for up to 15s + one SPA poll (~19s
|
|
1902
1931
|
// user-visible lag) because the rebuilt fast-state still hits this
|
|
1903
1932
|
// cache and never schedules a refresh until the TTL itself expires.
|
|
1904
|
-
|
|
1905
|
-
|
|
1933
|
+
// Revalidate a cached MISSING value via a cheap existsSync. The snapshot-
|
|
1934
|
+
// based freshness check below can't detect "directory came back" because
|
|
1935
|
+
// there was no `.git` to snapshot when we wrote MISSING — without this
|
|
1936
|
+
// gate the cache pins MISSING for the full 15s TTL after the path is
|
|
1937
|
+
// recreated.
|
|
1938
|
+
const cachedIsMissing = cached && cached.value === PROJECT_GIT_STATUS_MISSING;
|
|
1939
|
+
if (cachedIsMissing && fs.existsSync(localPath)) {
|
|
1940
|
+
// Path came back — fall through to schedule a fresh probe.
|
|
1941
|
+
} else if (cached && cached.ts && (now - cached.ts) < PROJECT_GIT_STATUS_TTL
|
|
1942
|
+
&& !_projectGitRefsAdvancedSince(localPath, configuredMainBranch, cached.refMtimes)) {
|
|
1906
1943
|
return cached.value;
|
|
1907
1944
|
}
|
|
1908
1945
|
// Cheap synchronous existsSync — short-circuits a path that just disappeared
|
|
1909
|
-
// (project removed) without scheduling a useless git probe.
|
|
1946
|
+
// (project removed) without scheduling a useless git probe. `refMtimes: null`
|
|
1947
|
+
// keeps the entry shape uniform with entries produced by
|
|
1948
|
+
// `_scheduleProjectGitStatusRefresh` so the freshness check above always
|
|
1949
|
+
// sees a defined field.
|
|
1910
1950
|
if (!fs.existsSync(localPath)) {
|
|
1911
|
-
_projectGitStatusCache.set(key, { ts: now, value: PROJECT_GIT_STATUS_MISSING, promise: null });
|
|
1951
|
+
_projectGitStatusCache.set(key, { ts: now, value: PROJECT_GIT_STATUS_MISSING, promise: null, refMtimes: null });
|
|
1912
1952
|
return PROJECT_GIT_STATUS_MISSING;
|
|
1913
1953
|
}
|
|
1914
1954
|
// Stale or never-populated — kick off a background refresh and return the
|