@yemi33/minions 0.1.2045 → 0.1.2047

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -614,8 +614,9 @@ When implementing `engine/runtimes/copilot.js`:
614
614
  should still parse cleanly — let the consumer decide to ignore.
615
615
  9. `parseError(rawOutput)` patterns:
616
616
  - `auth-failure`: `/not authenticated|copilot login|401|403/i`
617
+ - `model-unavailable` (retriable=false): `/unknown model|model not found|invalid model|model_not_found|400.*model/i` — message includes the cached model catalog (`_warmModelCache` populates `_modelDiscoveryResults` from `listModels()` ahead of time so the error path stays sync). Falls back to "Configure a valid model in Settings → Engine." when the cache is empty.
618
+ - `model-unavailable` (retriable=true): `/overloaded_error|service_unavailable|503|temporarily unavailable/i` — engine retries with `engine.copilotFallbackModel`.
617
619
  - `rate-limit`: `/rate limit|too many requests|429/i`
618
- - `unknown-model`: `/unknown model|model not found|model.*invalid/i`
619
620
  - `crash`: `/internal error|panic|uncaught/i`
620
621
  10. `listModels()` per §6 — return `null` on any failure (network, parse, auth).
621
622
  `modelsCache` path: `engine/copilot-models.json`.
@@ -14,7 +14,12 @@ behavior is hidden behind an adapter object resolved through `resolveRuntime()`.
14
14
 
15
15
  `resolveRuntime(name)` throws when `name` is unknown so misconfigurations surface
16
16
  at dispatch time instead of producing silent fallbacks deep inside spawn logic.
17
- The default name is `'claude'` (matches `engine.defaultCli` fallback).
17
+ When `name` is `null`/omitted, `resolveRuntime()` falls back to `'claude'` for
18
+ parser-routing compatibility (Copilot's `parseOutput` cannot consume the Claude
19
+ JSONL `{type:"result",result:"..."}` shape — see W-mpmwxkk40007c995). The fleet
20
+ default that determines which runtime *new spawns* use is separate:
21
+ `ENGINE_DEFAULTS.defaultCli` (also in W-mpmwxkk40007c995) is now `'copilot'`, so
22
+ operators with no explicit `engine.defaultCli` get Copilot on dispatch.
18
23
 
19
24
  ## Adapter Interface
20
25
 
@@ -44,7 +49,7 @@ methods that genuinely differ.
44
49
  | `modelLooksFamiliar(model)` | boolean | Heuristic powering the preflight "stale model after CLI switch" warning. |
45
50
  | `parseOutput(raw)` | `{ text, usage, sessionId, model }` | Final-event parser. |
46
51
  | `parseStreamChunk(line)` | event object or null | Single JSONL line → typed event. |
47
- | `parseError(rawOutput)` | `{ message, code, retriable }` | Codes: `auth-failure`, `context-limit`, `budget-exceeded`, `crash`, null. |
52
+ | `parseError(rawOutput)` | `{ message, code, retriable }` | Codes: `auth-failure`, `context-limit`, `budget-exceeded`, `model-unavailable` (retriable=true for upstream overload/503; retriable=false for invalid/typo'd model id — Copilot enriches the message via `_warmModelCache()` so it lists the available models), `crash`, null. |
48
53
  | `createStreamConsumer(ctx)` | consumer object | Stream accumulator used by `engine/llm.js`. |
49
54
  | `detectPermissionGate`, `getPromptDeliveryMode`, `usesSystemPromptFile`, `classifyFailure` | misc | Adapter-owned policy that engine code reads through accessors instead of branching on `runtime.name`. |
50
55
 
@@ -93,8 +98,8 @@ directly.
93
98
 
94
99
  | Helper | Chain |
95
100
  |--------|-------|
96
- | `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'claude'` |
97
- | `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'claude'` |
101
+ | `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'copilot'` |
102
+ | `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'copilot'` |
98
103
  | `resolveAgentModel(agent, engine)` | `agent.model` → `engine.defaultModel` → undefined |
99
104
  | `resolveCcModel(engine)` | `engine.ccModel` → `engine.defaultModel` → undefined |
100
105
  | `resolveAgentMaxBudget(agent, engine)` | `agent.maxBudgetUsd` → `engine.maxBudgetUsd`. Honors literal `0`. |
@@ -54,6 +54,45 @@
54
54
  const { spawn } = require('child_process');
55
55
  const crypto = require('crypto');
56
56
 
// W-mpmwxni2000c25c7-c — string codes stamped onto every failure the pool
// surfaces, so a consumer (CC streaming handler / doc-chat pool wrapper /
// SSE writer) can render a structured error envelope instead of parsing
// stderr text. The envelope shape is `{ message, code, retriable }`, the
// same one used by the dashboard's SSE envelope and by the runtime adapter
// parseError() contract (engine/runtimes/*.js).
const ERROR_CODES = Object.freeze({
  /**
   * spawn() threw synchronously, or the child process emitted an 'error'
   * event (binary missing on PATH, exec failed, EPERM, ...). Treated as
   * retriable because a transient PATH / filesystem glitch may recover.
   */
  WORKER_SPAWN_FAILED: 'worker-spawn-failed',

  /**
   * The worker exited during the ACP handshake (initialize or session/new),
   * or session/new returned no sessionId. Usually an incomplete
   * `copilot login` or a CLI that is too old. Normally retriable; individual
   * call sites may still stamp `retriable: false` when they know the next
   * attempt cannot succeed.
   */
  ACP_HANDSHAKE_FAILED: 'acp-handshake-failed',

  /**
   * The worker exited after a successful handshake (the daemon died
   * mid-turn). Retriable: the next call cold-spawns a fresh worker.
   */
  WORKER_DIED: 'worker-died',

  /**
   * The consumer's per-turn timeout fired before the ACP session/prompt
   * resolved. The pool itself has no turn timeout; the code lives here so
   * every caller stringifies the same constant. Retriable.
   */
  CC_TURN_TIMEOUT: 'cc-turn-timeout',
});
83
+
/**
 * Create an Error that also carries the `{ message, code, retriable }`
 * envelope fields consumers read. The result is an ordinary Error; the
 * helper only stamps the two extra properties onto it.
 *
 * @param {string} message - Human-readable failure description.
 * @param {string} code - Machine-readable error code (e.g. an ERROR_CODES value).
 * @param {boolean} [retriable=true] - Whether a retry may succeed. Defaults
 *   to `true` so a caller that omits it gets the same behavior as the legacy
 *   untyped path, which treated every failure as retriable.
 * @returns {Error} The annotated error, ready to throw or reject with.
 */
function _typedError(message, code, retriable = true) {
  return Object.assign(new Error(message), { code, retriable });
}
95
+
57
96
  // 10 minutes — matches the work-item spec.
58
97
  const IDLE_REAPER_MS = 10 * 60 * 1000;
59
98
  // Reaper sweep cadence. Not exposed as ENGINE_DEFAULTS to keep the pool
@@ -176,8 +215,13 @@ class Worker {
176
215
  try {
177
216
  proc = _internals.spawnAcp({ cwd: this.cwd });
178
217
  } catch (err) {
179
- throw new Error(
180
- `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`
218
+ // spawn() threw synchronously — typically ENOENT (copilot binary not
219
+ // on PATH) or EACCES. Surface as worker-spawn-failed so the consumer
220
+ // can show "install the CLI / fix PATH" guidance.
221
+ throw _typedError(
222
+ `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
223
+ ERROR_CODES.WORKER_SPAWN_FAILED,
224
+ true
181
225
  );
182
226
  }
183
227
  this.proc = proc;
@@ -193,8 +237,13 @@ class Worker {
193
237
  const earlyExitPromise = new Promise((_, reject) => {
194
238
  earlyExitReject = (code) => {
195
239
  this.killed = true;
196
- const err = new Error(
197
- `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`
240
+ // Early exit DURING the handshake = acp-handshake-failed (almost
241
+ // always missing `copilot login`, stale CLI, or daemon crash on
242
+ // boot). Retriable so re-auth or a CLI upgrade can recover.
243
+ const err = _typedError(
244
+ `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`,
245
+ ERROR_CODES.ACP_HANDSHAKE_FAILED,
246
+ true
198
247
  );
199
248
  this.spawnError = err;
200
249
  this._failAllPending(err);
@@ -205,8 +254,13 @@ class Worker {
205
254
  proc.once('exit', earlyExitHandler);
206
255
 
207
256
  const errorHandler = (err) => {
208
- const wrapped = new Error(
209
- `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`
257
+ // proc 'error' event fires when the OS can't actually start the child
258
+ // (ENOENT after a successful spawn() call, etc.). Treat as a spawn
259
+ // failure even though we made it past the synchronous spawn() above.
260
+ const wrapped = _typedError(
261
+ `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
262
+ ERROR_CODES.WORKER_SPAWN_FAILED,
263
+ true
210
264
  );
211
265
  this.spawnError = wrapped;
212
266
  this.killed = true;
@@ -227,7 +281,13 @@ class Worker {
227
281
  ]);
228
282
  this.sessionId = result && result.sessionId;
229
283
  if (!this.sessionId) {
230
- throw new Error('copilot --acp failed -- session/new returned no sessionId');
284
+ // Handshake completed without an error but the daemon didn't hand
285
+ // back a sessionId — protocol violation or partial init failure.
286
+ throw _typedError(
287
+ 'copilot --acp failed -- session/new returned no sessionId',
288
+ ERROR_CODES.ACP_HANDSHAKE_FAILED,
289
+ true
290
+ );
231
291
  }
232
292
  } finally {
233
293
  // Either the handshake finished (swap to a persistent exit handler that
@@ -236,7 +296,13 @@ class Worker {
236
296
  }
237
297
  proc.on('exit', () => {
238
298
  this.killed = true;
239
- const err = new Error('copilot --acp process exited');
299
+ // Post-handshake exit = the daemon died mid-conversation. Retriable
300
+ // because the next call will cold-spawn a fresh worker.
301
+ const err = _typedError(
302
+ 'copilot --acp process exited',
303
+ ERROR_CODES.WORKER_DIED,
304
+ true
305
+ );
240
306
  this._failAllPending(err);
241
307
  // Settle inflight too if it's still hanging
242
308
  if (this.inflight && !this.inflight.settled) {
@@ -656,9 +722,13 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
656
722
  // This is the bug class the ab141995 fix closed; if it ever recurs the
657
723
  // engine should fail loudly rather than hand back a half-initialized
658
724
  // handle. Throwing here lets the dashboard surface spawn-failed instead
659
- // of the silent thinking-dots-forever symptom.
660
- throw new Error(
661
- `cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`
725
+ // of the silent thinking-dots-forever symptom. Mark non-retriable —
726
+ // this is a real engine bug, not a transient pool failure; the next
727
+ // attempt would hit the same race.
728
+ throw _typedError(
729
+ `cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`,
730
+ ERROR_CODES.ACP_HANDSHAKE_FAILED,
731
+ false
662
732
  );
663
733
  }
664
734
 
@@ -766,4 +836,10 @@ module.exports = {
766
836
  IDLE_REAPER_MS,
767
837
  REAPER_INTERVAL_MS,
768
838
  WARM_MAX_CONCURRENT,
839
+ // W-mpmwxni2000c25c7-c — typed-error envelope contract. Exported so the
840
+ // dashboard pool wrappers (and their tests) reference the same string
841
+ // constants and so the doc-chat timeout path can stamp the same
842
+ // `{ message, code, retriable }` shape the pool itself emits.
843
+ ERROR_CODES,
844
+ _typedError,
769
845
  };
package/engine/llm.js CHANGED
@@ -82,6 +82,21 @@ function trackEngineUsage(category, usage) {
82
82
  _ensureFlushTimer();
83
83
  }
84
84
 
// W-mpmwxni2000c25c7-b — silent-error regression counter. Each CC/doc-chat
// error reported by the handlers increments
// `_engine[category].errorsByCode[code]`, so /api/metrics picks up new error
// codes (cc-turn-timeout, empty-output, …) without touching cost/token
// totals. The increment is buffered and flushed on the same timer that
// trackEngineUsage uses, so the dashboard's fast-state mtime gate isn't
// bypassed.
//
// Calls with a missing category/code, or with a test-only category
// (`_test*` / `test-*`), are ignored.
function trackEngineError(category, errorCode) {
  if (!category || !errorCode) return;

  const isTestCategory = category.startsWith('_test') || category.startsWith('test-');
  if (isTestCategory) return;

  // Lazily create the per-category delta bucket on first use.
  const engineDeltas = _pendingMetrics.engine;
  const bucket = engineDeltas[category] || (engineDeltas[category] = _emptyEngineDelta());

  // Prototype-less map: error codes are used directly as keys.
  if (!bucket.errorsByCode) bucket.errorsByCode = Object.create(null);
  const previousCount = bucket.errorsByCode[errorCode] || 0;
  bucket.errorsByCode[errorCode] = previousCount + 1;

  _ensureFlushTimer();
}
99
+
85
100
  function flushMetricsBuffer() {
86
101
  const pending = _pendingMetrics;
87
102
  if (!Object.keys(pending.engine).length && !Object.keys(pending.daily).length) return;
@@ -106,6 +121,12 @@ function flushMetricsBuffer() {
106
121
  cat.totalDurationMs = (cat.totalDurationMs || 0) + delta.totalDurationMs;
107
122
  cat.timedCalls = (cat.timedCalls || 0) + delta.timedCalls;
108
123
  }
124
+ if (delta.errorsByCode) {
125
+ if (!cat.errorsByCode) cat.errorsByCode = {};
126
+ for (const [code, count] of Object.entries(delta.errorsByCode)) {
127
+ cat.errorsByCode[code] = (cat.errorsByCode[code] || 0) + count;
128
+ }
129
+ }
109
130
  }
110
131
  if (!metrics._daily) metrics._daily = {};
111
132
  for (const [day, delta] of Object.entries(pending.daily)) {
@@ -129,6 +150,12 @@ function flushMetricsBuffer() {
129
150
  c.inputTokens += delta.inputTokens; c.outputTokens += delta.outputTokens;
130
151
  c.cacheRead += delta.cacheRead; c.cacheCreation += delta.cacheCreation;
131
152
  c.totalDurationMs += delta.totalDurationMs; c.timedCalls += delta.timedCalls;
153
+ if (delta.errorsByCode) {
154
+ if (!c.errorsByCode) c.errorsByCode = Object.create(null);
155
+ for (const [code, count] of Object.entries(delta.errorsByCode)) {
156
+ c.errorsByCode[code] = (c.errorsByCode[code] || 0) + count;
157
+ }
158
+ }
132
159
  }
133
160
  for (const [day, delta] of Object.entries(pending.daily)) {
134
161
  if (!_pendingMetrics.daily[day]) _pendingMetrics.daily[day] = _emptyDailyDelta();
@@ -233,6 +260,8 @@ function _missingRuntimeResult(runtimeName, runtime, reason) {
233
260
  errorClass: shared.FAILURE_CLASS.CONFIG_ERROR,
234
261
  errorMessage: message,
235
262
  missingRuntime: true,
263
+ error: { message, code: shared.FAILURE_CLASS.CONFIG_ERROR, retriable: false },
264
+ ok: false,
236
265
  };
237
266
  }
238
267
 
@@ -245,7 +274,7 @@ function _resolvedCallResult(result) {
245
274
  function _resolveRuntimeNameFor(callOpts = {}) {
246
275
  let runtimeName = callOpts.cli;
247
276
  if (!runtimeName && callOpts.engineConfig) runtimeName = resolveCcCli(callOpts.engineConfig);
248
- return runtimeName || 'claude';
277
+ return runtimeName || 'copilot';
249
278
  }
250
279
 
251
280
  function _runtimeUnavailableResult(callOpts = {}) {
@@ -566,7 +595,7 @@ function _createStreamAccumulator({
566
595
 
567
596
  function _resolveRuntimeFor(callOpts) {
568
597
  // Explicit `cli` opt wins; otherwise fall to `engineConfig` resolution;
569
- // otherwise default to claude (the historical behavior).
598
+ // otherwise default to copilot (fleet default as of W-mpmwxkk40007c995).
570
599
  return resolveRuntime(_resolveRuntimeNameFor(callOpts));
571
600
  }
572
601
 
@@ -599,6 +628,52 @@ function _resolveRuntimeFeatureOpts({
599
628
 
600
629
  // ─── Core LLM Call ───────────────────────────────────────────────────────────
601
630
 
// W-mpmwxni2000c25c7-b — typed-error envelope helper. callLLM /
// callLLMStreaming attach `error: { message, code, retriable }` to every
// failure resolution so dashboard CC/doc-chat handlers can surface a
// structured 5xx JSON or SSE `event: error` instead of returning an empty
// reply that hangs the UI. The shape mirrors the `runtime.parseError`
// contract, so adapter classifications (auth-failure, context-limit,
// budget-exceeded, crash, model-unavailable) propagate verbatim.
//
// Engine codes produced by this helper:
//   - 'runtime-exit'        non-zero exit code with no parseError signal
//   - 'unparseable-output'  bytes were produced but no text was extracted
//                           (malformed JSONL or unknown event shape)
//   - 'empty-output'        no failing exit code and nothing parsed at all
//                           (CLI bug or silent timeout)
// ('spawn-error' belongs to the same family but is stamped directly by the
// spawn failure handlers, not here.)
//
// Existing `errorClass` / `errorMessage` fields stay populated by the
// callers for consumers that haven't moved to the typed envelope yet.

/**
 * Classify the outcome of a finished runtime call.
 *
 * @param {{message?: string, code?: string|null, retriable?: boolean}|null} errInfo
 *   Result of `runtime.parseError(...)`; wins whenever it carries a `code`.
 * @param {number|null|undefined} code
 *   Process exit code. `null`/`undefined` mean "no exit code available" and
 *   are not treated as a failing exit on their own.
 * @param {{text?: string, raw?: string, stderr?: string}|null} parsed
 *   Parsed runtime output.
 * @param {string|null} fallback
 *   Message used when nothing more specific is available.
 * @returns {{message: string, code: string, retriable: boolean}|null}
 *   The error envelope, or `null` when the call succeeded.
 */
function _buildErrorEnvelope(errInfo, code, parsed, fallback) {
  // 1. An adapter classification always wins. `retriable` is true unless the
  //    adapter explicitly said `false`.
  if (errInfo && errInfo.code) {
    return {
      message: errInfo.message || fallback || 'LLM call failed',
      code: errInfo.code,
      retriable: errInfo.retriable !== false,
    };
  }

  // 2. Failing exit code without a classification. `!= null` deliberately
  //    matches both null and undefined so a missing code never renders as
  //    "Runtime exited with code undefined".
  if (code != null && code !== 0) {
    // Last three non-blank stderr lines, capped at 500 chars. Splitting on
    // /\r?\n/ keeps stray carriage returns (CRLF output) out of the message.
    const stderrTail = parsed && parsed.stderr
      ? String(parsed.stderr)
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter(Boolean)
        .slice(-3)
        .join(' | ')
        .slice(0, 500)
      : '';
    return {
      message: stderrTail ? `Runtime exited with code ${code}: ${stderrTail}` : `Runtime exited with code ${code}`,
      code: 'runtime-exit',
      retriable: true,
    };
  }

  // 3. No failing exit code and we have text: success, no envelope.
  if (parsed && parsed.text) return null;

  // 4. Output existed but nothing could be extracted from it.
  const rawLen = parsed && parsed.raw ? String(parsed.raw).length : 0;
  if (rawLen > 0) {
    return {
      message: 'Runtime produced output the adapter could not parse',
      code: 'unparseable-output',
      retriable: true,
    };
  }

  // 5. Nothing at all came back.
  return {
    message: fallback || 'Runtime returned no output',
    code: 'empty-output',
    retriable: true,
  };
}
676
+
602
677
  function callLLM(promptText, sysPromptText, opts = {}) {
603
678
  const {
604
679
  timeout = 120000, label = 'llm', maxTurns = 1, allowedTools = '',
@@ -670,6 +745,7 @@ function callLLM(promptText, sysPromptText, opts = {}) {
670
745
  const errInfo = code !== 0
671
746
  ? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
672
747
  : { message: '', code: null, retriable: true };
748
+ const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
673
749
  resolve({
674
750
  text: parsed.text || '',
675
751
  usage,
@@ -681,6 +757,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
681
757
  runtime: runtime.name,
682
758
  errorClass: errInfo.code,
683
759
  errorMessage: errInfo.message || null,
760
+ error: errorEnvelope,
761
+ ok: !errorEnvelope,
684
762
  });
685
763
  };
686
764
 
@@ -704,6 +782,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
704
782
  text: '', usage: null, sessionId: null, code: 1,
705
783
  stderr: err.message, raw: '', toolUses: [],
706
784
  runtime: runtime.name, errorClass: null, errorMessage: null,
785
+ error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
786
+ ok: false,
707
787
  });
708
788
  });
709
789
  });
@@ -784,6 +864,7 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
784
864
  const errInfo = code !== 0
785
865
  ? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
786
866
  : { message: '', code: null, retriable: true };
867
+ const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
787
868
  resolve({
788
869
  text: parsed.text || '',
789
870
  usage,
@@ -795,6 +876,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
795
876
  runtime: runtime.name,
796
877
  errorClass: errInfo.code,
797
878
  errorMessage: errInfo.message || null,
879
+ error: errorEnvelope,
880
+ ok: !errorEnvelope,
798
881
  });
799
882
  };
800
883
 
@@ -818,6 +901,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
818
901
  text: '', usage: null, sessionId: null, code: 1,
819
902
  stderr: err.message, raw: '', toolUses: [],
820
903
  runtime: runtime.name, errorClass: null, errorMessage: null,
904
+ error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
905
+ ok: false,
821
906
  });
822
907
  });
823
908
  });
@@ -825,13 +910,74 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
825
910
  return promise;
826
911
  }
827
912
 
// ─── CC turn watchdog ────────────────────────────────────────────────────────
//
// W-mpmwxni2000c25c7-b — wall-clock cap for a single CC/doc-chat turn. CC turns
// are a higher-level concept than the per-LLM-call `timeout` opt: a turn can
// internally retry (resume → fresh → final retry) and each retry has its own
// per-call timer. Without a turn-level watchdog, a runtime stuck mid-stream
// (no exit, no chunks, no errors) leaves the SSE handler waiting for the
// per-call timer to fire and the user staring at the typing dots.
//
// Usage: `result = await withCcTurnTimeout({ timeoutMs, label, onAbortReady }, (registerAbort) => callerThatReturnsResultPromise(registerAbort))`.
// The caller plumbs `registerAbort(abortFn)` into every nested LLM call's
// `onAbortReady` so the watchdog can kill whichever attempt is in flight on
// expiry.

/**
 * Run `callFn` under a turn-level deadline.
 *
 * @param {object} [opts]
 * @param {number} [opts.timeoutMs] - Deadline in ms. Falsy or <= 0 disables
 *   the watchdog and `callFn` is invoked directly.
 * @param {string} [opts.label='cc-turn'] - Name used in the timeout message.
 * @param {(abort: Function) => void} [opts.onAbortReady] - Also notified of
 *   every abort function the call registers.
 * @param {(registerAbort: (abort: Function) => void) => any} callFn - The
 *   work to run; may return a value or a promise.
 * @returns {Promise<any>} The result of `callFn` when it settles before the
 *   deadline (rejections propagate unchanged). On expiry, the settled result
 *   overlaid with `text: ''`, a truthy `code`, `errorClass`/`errorMessage`,
 *   `error: { message, code: 'cc-turn-timeout', retriable: true }` and
 *   `ok: false`.
 */
async function withCcTurnTimeout({ timeoutMs, label = 'cc-turn', onAbortReady } = {}, callFn) {
  if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}));

  // Node coerces delays above the signed 32-bit limit down to 1ms, which would
  // turn a very large deadline into an immediate timeout. Clamp instead.
  const MAX_TIMER_DELAY_MS = 2147483647;
  const delayMs = Math.min(timeoutMs, MAX_TIMER_DELAY_MS);

  let currentAbort = null;
  let timedOut = false;
  let timer = null;
  const registerAbort = (abort) => {
    // Only the most recently registered abort is kept: it belongs to the
    // attempt currently in flight.
    currentAbort = abort;
    if (onAbortReady) onAbortReady(abort);
  };

  // Promise.resolve().then(...) turns a synchronous throw from callFn into a
  // rejection of `inflight`.
  const inflight = Promise.resolve().then(() => callFn(registerAbort));
  const timeoutPromise = new Promise((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      try { if (currentAbort) currentAbort(); } catch { /* swallow */ }
      resolve(null);
    }, delayMs);
    // NOTE: do NOT unref this timer. If we did, Node could exit the event
    // loop while waiting on the inflight promise (Promises themselves don't
    // hold the loop open — only timers/I/O do).
  });

  let winner;
  try {
    winner = await Promise.race([inflight, timeoutPromise]);
  } finally {
    // Disarm on EVERY exit from the race, including a rejection of
    // `inflight`. Otherwise the timer stays armed after the turn has already
    // failed: it keeps the event loop alive for the rest of the deadline and
    // then invokes an abort function for a call that no longer exists.
    clearTimeout(timer);
  }
  if (!timedOut) return winner;

  // Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
  // sweeps) actually runs before we hand a synthetic envelope to the caller.
  // NOTE(review): this waits for as long as the call takes to settle; if no
  // abort was registered, or the abort does not end the call, the wait is
  // unbounded. Confirm every caller registers a working abort.
  const settled = await inflight.catch((err) => ({
    text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
  }));
  const message = `CC turn ${label} timed out after ${timeoutMs}ms`;
  return {
    ...settled,
    text: '',
    code: settled?.code || 1,
    errorClass: 'cc-turn-timeout',
    errorMessage: message,
    error: { message, code: 'cc-turn-timeout', retriable: true },
    ok: false,
  };
}
970
+
828
971
  module.exports = {
829
972
  callLLM,
830
973
  callLLMStreaming,
831
974
  trackEngineUsage,
975
+ trackEngineError,
832
976
  flushMetricsBuffer,
977
+ withCcTurnTimeout,
833
978
  // Exposed for unit tests — engine code MUST use the runtime adapter contract.
834
979
  _buildSpawnAgentFlags,
980
+ _buildErrorEnvelope,
835
981
  _resolveBin,
836
982
  _resetBinCache,
837
983
  _resetMetricsBufferForTest,
@@ -87,17 +87,17 @@ function findClaudeBinary() {
87
87
  * `shared.runtimeConfigWarnings` so unknown-CLI warnings and binary checks
88
88
  * always cover the same surface.
89
89
  *
90
- * Without a config (legacy callers), returns just `['claude']` — the
91
- * historical default.
90
+ * Without a config (legacy callers), returns just `['copilot']` — matches
91
+ * `ENGINE_DEFAULTS.defaultCli` (W-mpmwxkk40007c995).
92
92
  */
93
93
  function _distinctRuntimes(config) {
94
94
  const set = new Set();
95
95
  if (!config || typeof config !== 'object') {
96
- set.add('claude');
96
+ set.add('copilot');
97
97
  return Array.from(set);
98
98
  }
99
99
  const engine = config.engine || {};
100
- set.add(engine.defaultCli ? String(engine.defaultCli) : 'claude');
100
+ set.add(engine.defaultCli ? String(engine.defaultCli) : 'copilot');
101
101
  if (engine.ccCli) set.add(String(engine.ccCli));
102
102
  for (const agent of Object.values(config.agents || {})) {
103
103
  if (agent && agent.cli) set.add(String(agent.cli));
@@ -355,7 +355,7 @@ function _fleetSummaryResults(config) {
355
355
  const results = [];
356
356
  if (!config || typeof config !== 'object') return results;
357
357
  const engine = config.engine || {};
358
- const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'claude';
358
+ const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'copilot';
359
359
  const defaultModel = engine.defaultModel ? String(engine.defaultModel) : '(runtime default)';
360
360
  results.push({ name: 'Fleet', ok: true, message: `defaultCli=${defaultCli} defaultModel=${defaultModel}` });
361
361
 
package/engine/queries.js CHANGED
@@ -528,7 +528,7 @@ function getAgents(config) {
528
528
 
529
529
  return roster.map(a => {
530
530
  // Resolve which CLI runtime this agent dispatches to: per-agent override
531
- // → engine.defaultCli → 'claude'. Surfaced so the dashboard can show a
531
+ // → engine.defaultCli → 'copilot'. Surfaced so the dashboard can show a
532
532
  // runtime tag next to the agent name.
533
533
  const runtime = shared.resolveAgentCli(a, config.engine || {});
534
534
  const inboxFiles = allInboxFiles.filter(f => f.includes(a.id));
@@ -1770,19 +1770,18 @@ function _projectGitStatusEqual(a, b) {
1770
1770
  function _scheduleProjectGitStatusRefresh(localPath, key, configuredMainBranch) {
1771
1771
  const existing = _projectGitStatusCache.get(key);
1772
1772
  if (existing && existing.promise) return existing.promise;
1773
- const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null };
1773
+ const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null, refMtimes: null };
1774
1774
  const prevValue = entry.value;
1775
- // Capture probe-start time BEFORE running git, not after. Used as the
1776
- // baseline for `_projectGitRefsAdvancedSince` on the next call. If we
1777
- // captured probe-END time, a file written just before the probe started
1778
- // could end up with `mtimeMs >= entry.ts` on a fast filesystem (NTFS
1779
- // mtime granularity vs millisecond-precise Date.now()), busting the
1780
- // cache spuriously on the very next read. Probe-START is the safer
1781
- // anchor — any file with `mtimeMs > probeStartTs` legitimately changed
1782
- // at-or-after the probe, so re-probing is correct.
1775
+ // Snapshot ref mtimes BEFORE the probe so the next call compares against
1776
+ // an exact baseline rather than a Date.now() timestamp. On Windows
1777
+ // Date.now() can have ~15ms granularity while NTFS mtime is sub-ms, so
1778
+ // a file written shortly before the probe could appear `mtimeMs > ts`
1779
+ // even when nothing actually changed.
1783
1780
  const probeStartTs = Date.now();
1781
+ const probeStartRefMtimes = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
1784
1782
  entry.promise = _probeProjectGitStatus(localPath, configuredMainBranch).then(value => {
1785
1783
  entry.ts = probeStartTs;
1784
+ entry.refMtimes = probeStartRefMtimes;
1786
1785
  entry.value = value;
1787
1786
  entry.promise = null;
1788
1787
  if (_onProjectGitStatusChanged && !_projectGitStatusEqual(prevValue, value)) {
@@ -1857,35 +1856,65 @@ function _resolveCommonGitDir(gitDir) {
1857
1856
  return path.isAbsolute(raw) ? raw : path.resolve(gitDir, raw);
1858
1857
  }
1859
1858
 
1860
- // Return true when any of the per-project git ref files (logs/HEAD,
1861
- // FETCH_HEAD, refs/remotes/origin/<comparator>) have mtimeMs > cachedTs.
1862
- // Lets `getProjectGitStatus` bypass its 15s TTL after `git pull`, `git
1863
- // fetch`, `git checkout`, etc. so the next /api/status reflects the new
1864
- // HEAD / ahead-behind within one SPA poll instead of waiting out the TTL
1865
- // (W-mphdmr8c00030124). Tolerates ENOENT on FETCH_HEAD / refs (never-
1866
- // fetched repos simply haven't moved those files yet). Cost ≤3 statSync
1867
- // per project per /api/status build — well under the 'cheap' budget
1868
- // called out in getStatusFastStateMtimePaths's docstring.
1869
- function _projectGitRefsAdvancedSince(localPath, cachedTs, configuredMainBranch) {
1859
+ // Enumerate the per-project git ref files we watch for cache-busting:
1860
+ // logs/HEAD (per-worktree gitdir), FETCH_HEAD + refs/remotes/origin/* (common
1861
+ // gitdir for linked worktrees). Same paths as the fast-state mtime tracker
1862
+ // so callers see a coherent view across surfaces.
1863
+ function _projectGitRefFiles(localPath, configuredMainBranch) {
1870
1864
  const gitDir = _resolveGitDir(localPath);
1871
- if (!gitDir) return false;
1872
- // logs/HEAD is per-worktree; FETCH_HEAD + refs/remotes/origin/* live in
1873
- // the COMMON gitdir for linked worktrees. For the main worktree both
1874
- // resolve to the same place, so this is a no-op there.
1865
+ if (!gitDir) return null;
1875
1866
  const commonGitDir = _resolveCommonGitDir(gitDir);
1876
- const candidates = [
1867
+ const files = [
1877
1868
  path.join(gitDir, 'logs', 'HEAD'),
1878
1869
  path.join(commonGitDir, 'FETCH_HEAD'),
1879
1870
  ];
1880
1871
  const comparator = configuredMainBranch && String(configuredMainBranch).trim();
1881
1872
  if (comparator) {
1882
- candidates.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
1873
+ files.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
1883
1874
  }
1884
- for (const file of candidates) {
1885
- try {
1886
- const st = fs.statSync(file);
1887
- if (st.mtimeMs > cachedTs) return true;
1888
- } catch { /* ENOENT / EPERM file just hasn't moved */ }
1875
+ return files;
1876
+ }
1877
+
// Record the current mtimeMs of every tracked ref file, keyed by file path.
// A file that cannot be stat'ed is recorded as `null`, so a later
// appearance of that file shows up as a difference. Returns `null` when
// there are no ref files to track for this path.
//
// The snapshot is the baseline the next `getProjectGitStatus` call compares
// against using strict inequality rather than a threshold against a
// timestamp. That makes the check independent of clock precision (Windows
// `Date.now()` can be 15ms coarse while NTFS mtime is sub-millisecond, which
// used to make threshold checks fire spuriously on freshly-written files).
function _snapshotProjectGitRefMtimes(localPath, configuredMainBranch) {
  const refFiles = _projectGitRefFiles(localPath, configuredMainBranch);
  if (!refFiles) return null;

  const readMtime = (file) => {
    try {
      return fs.statSync(file).mtimeMs;
    } catch {
      // ENOENT recorded as null — flipping to present must bust
      return null;
    }
  };

  // Prototype-less accumulator: keys are arbitrary file paths.
  return refFiles.reduce((snapshot, file) => {
    snapshot[file] = readMtime(file);
    return snapshot;
  }, Object.create(null));
}
1894
+
// Report whether ANY tracked ref file's mtime (or existence) now differs
// from `snapshot`, the baseline captured during the last probe. This
// replaces the older threshold-vs-cachedTs check, which suffered from
// `Date.now()`/`mtimeMs` resolution races on Windows. It lets
// `getProjectGitStatus` bypass its 15s TTL after `git pull`, `git fetch`,
// `git checkout`, etc. so the next /api/status reflects the new HEAD /
// ahead-behind within one SPA poll instead of waiting out the TTL
// (W-mphdmr8c00030124). Cost: 2-3 statSync per call.
//
// Returns false when there is no baseline (legacy entry shape or first
// call) or when the current state cannot be snapshotted; in both cases the
// cached value is kept and the TTL alone decides freshness. Per the original
// note, a real change still surfaces on the next /api/status because the
// fast-state mtime tracker watches the same files.
function _projectGitRefsAdvancedSince(localPath, configuredMainBranch, snapshot) {
  if (!snapshot) return false;

  const current = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
  if (!current) return false;

  // A baseline file whose mtime changed, or which appeared/disappeared.
  const baselineChanged = Object.keys(snapshot).some((file) => current[file] !== snapshot[file]);
  if (baselineChanged) return true;

  // A file tracked now that the baseline never recorded (e.g. first `git
  // fetch` materialises FETCH_HEAD).
  return Object.keys(current).some((file) => !(file in snapshot));
}
@@ -1901,14 +1930,25 @@ function getProjectGitStatus(localPath, configuredMainBranch = null) {
1901
1930
  // the pre-pull ahead/behind counts for up to 15s + one SPA poll (~19s
1902
1931
  // user-visible lag) because the rebuilt fast-state still hits this
1903
1932
  // cache and never schedules a refresh until the TTL itself expires.
1904
- if (cached && cached.ts && (now - cached.ts) < PROJECT_GIT_STATUS_TTL
1905
- && !_projectGitRefsAdvancedSince(localPath, cached.ts, configuredMainBranch)) {
1933
+ // Revalidate a cached MISSING value via a cheap existsSync. The snapshot-
1934
+ // based freshness check below can't detect "directory came back" because
1935
+ // there was no `.git` to snapshot when we wrote MISSING — without this
1936
+ // gate the cache pins MISSING for the full 15s TTL after the path is
1937
+ // recreated.
1938
+ const cachedIsMissing = cached && cached.value === PROJECT_GIT_STATUS_MISSING;
1939
+ if (cachedIsMissing && fs.existsSync(localPath)) {
1940
+ // Path came back — fall through to schedule a fresh probe.
1941
+ } else if (cached && cached.ts && (now - cached.ts) < PROJECT_GIT_STATUS_TTL
1942
+ && !_projectGitRefsAdvancedSince(localPath, configuredMainBranch, cached.refMtimes)) {
1906
1943
  return cached.value;
1907
1944
  }
1908
1945
  // Cheap synchronous existsSync — short-circuits a path that just disappeared
1909
- // (project removed) without scheduling a useless git probe.
1946
+ // (project removed) without scheduling a useless git probe. `refMtimes: null`
1947
+ // keeps the entry shape uniform with entries produced by
1948
+ // `_scheduleProjectGitStatusRefresh` so the freshness check above always
1949
+ // sees a defined field.
1910
1950
  if (!fs.existsSync(localPath)) {
1911
- _projectGitStatusCache.set(key, { ts: now, value: PROJECT_GIT_STATUS_MISSING, promise: null });
1951
+ _projectGitStatusCache.set(key, { ts: now, value: PROJECT_GIT_STATUS_MISSING, promise: null, refMtimes: null });
1912
1952
  return PROJECT_GIT_STATUS_MISSING;
1913
1953
  }
1914
1954
  // Stale or never-populated — kick off a background refresh and return the