npm - @yemi33/minions - Versions diffs - 0.1.2045 → 0.1.2047 - Mend

@yemi33/minions 0.1.2045 → 0.1.2047

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +2 -2
package/dashboard/js/fre.js +3 -2
package/dashboard/js/render-prs.js +82 -2
package/dashboard/js/settings.js +5 -5
package/dashboard/styles.css +11 -0
package/dashboard.js +376 -135
package/docs/copilot-cli-schema.md +2 -1
package/docs/runtime-adapters.md +9 -4
package/engine/cc-worker-pool.js +87 -11
package/engine/llm.js +148 -2
package/engine/preflight.js +5 -5
package/engine/queries.js +75 -35
package/engine/runtimes/claude.js +41 -0
package/engine/runtimes/copilot.js +97 -3
package/engine/shared.js +4 -3
package/package.json +1 -1

package/docs/copilot-cli-schema.md CHANGED Viewed

@@ -614,8 +614,9 @@ When implementing `engine/runtimes/copilot.js`:
    should still parse cleanly — let the consumer decide to ignore.
 9. `parseError(rawOutput)` patterns:
    - `auth-failure`: `/not authenticated|copilot login|401|403/i`
+   - `model-unavailable` (retriable=false): `/unknown model|model not found|invalid model|model_not_found|400.*model/i` — message includes the cached model catalog (`_warmModelCache` populates `_modelDiscoveryResults` from `listModels()` ahead of time so the error path stays sync). Falls back to "Configure a valid model in Settings → Engine." when the cache is empty.
+   - `model-unavailable` (retriable=true): `/overloaded_error|service_unavailable|503|temporarily unavailable/i` — engine retries with `engine.copilotFallbackModel`.
    - `rate-limit`: `/rate limit|too many requests|429/i`
-   - `unknown-model`: `/unknown model|model not found|model.*invalid/i`
    - `crash`: `/internal error|panic|uncaught/i`
 10. `listModels()` per §6 — return `null` on any failure (network, parse, auth).
     `modelsCache` path: `engine/copilot-models.json`.

package/docs/runtime-adapters.md CHANGED Viewed

@@ -14,7 +14,12 @@ behavior is hidden behind an adapter object resolved through `resolveRuntime()`.
 `resolveRuntime(name)` throws when `name` is unknown so misconfigurations surface
 at dispatch time instead of producing silent fallbacks deep inside spawn logic.
-The default name is `'claude'` (matches `engine.defaultCli` fallback).
+When `name` is `null`/omitted, `resolveRuntime()` falls back to `'claude'` for
+parser-routing compatibility (Copilot's `parseOutput` cannot consume the Claude
+JSONL `{type:"result",result:"..."}` shape — see W-mpmwxkk40007c995). The fleet
+default that determines which runtime *new spawns* use is separate:
+`ENGINE_DEFAULTS.defaultCli` (also in W-mpmwxkk40007c995) is now `'copilot'`, so
+operators with no explicit `engine.defaultCli` get Copilot on dispatch.
 ## Adapter Interface
@@ -44,7 +49,7 @@ methods that genuinely differ.
 | `modelLooksFamiliar(model)` | boolean | Heuristic powering the preflight "stale model after CLI switch" warning. |
 | `parseOutput(raw)` | `{ text, usage, sessionId, model }` | Final-event parser. |
 | `parseStreamChunk(line)` | event object or null | Single JSONL line → typed event. |
-| `parseError(rawOutput)` | `{ message, code, retriable }` | Codes: `auth-failure`, `context-limit`, `budget-exceeded`, `crash`, null. |
+| `parseError(rawOutput)` | `{ message, code, retriable }` | Codes: `auth-failure`, `context-limit`, `budget-exceeded`, `model-unavailable` (retriable=true for upstream overload/503; retriable=false for invalid/typo'd model id — Copilot enriches the message via `_warmModelCache()` so it lists the available models), `crash`, null. |
 | `createStreamConsumer(ctx)` | consumer object | Stream accumulator used by `engine/llm.js`. |
 | `detectPermissionGate`, `getPromptDeliveryMode`, `usesSystemPromptFile`, `classifyFailure` | misc | Adapter-owned policy that engine code reads through accessors instead of branching on `runtime.name`. |
@@ -93,8 +98,8 @@ directly.
 | Helper | Chain |
 |--------|-------|
-| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'claude'` |
-| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'claude'` |
+| `resolveAgentCli(agent, engine)` | `agent.cli` → `engine.defaultCli` → `'copilot'` |
+| `resolveCcCli(engine)` | `engine.ccCli` → `engine.defaultCli` → `'copilot'` |
 | `resolveAgentModel(agent, engine)` | `agent.model` → `engine.defaultModel` → undefined |
 | `resolveCcModel(engine)` | `engine.ccModel` → `engine.defaultModel` → undefined |
 | `resolveAgentMaxBudget(agent, engine)` | `agent.maxBudgetUsd` → `engine.maxBudgetUsd`. Honors literal `0`. |

package/engine/cc-worker-pool.js CHANGED Viewed

@@ -54,6 +54,45 @@
 const { spawn } = require('child_process');
 const crypto = require('crypto');
+// W-mpmwxni2000c25c7-c — typed error codes the pool emits through every
+// failure exit so the consumer (CC streaming handler / doc-chat pool
+// wrapper / SSE writer) can render a structured error envelope instead of
+// parsing the stderr string. Matches the `{ message, code, retriable }`
+// shape sub-item b standardized on for the dashboard's SSE envelope and
+// the runtime adapter parseError() contract (engine/runtimes/*.js).
+const ERROR_CODES = Object.freeze({
+  // spawn() threw synchronously OR the child process emitted an 'error'
+  // event (binary missing on PATH, exec failed, EPERM, etc.). Retriable
+  // because a transient PATH / fs glitch may recover.
+  WORKER_SPAWN_FAILED: 'worker-spawn-failed',
+  // The worker process exited DURING the ACP handshake (initialize or
+  // session/new) — usually `copilot login` is incomplete or the CLI
+  // version is too old. Also fires when session/new returns no sessionId
+  // (retriable: re-auth or a CLI upgrade can unblock the next attempt) and
+  // from getSession()'s null-sessionId guard, which stamps retriable=false.
+  ACP_HANDSHAKE_FAILED: 'acp-handshake-failed',
+  // The worker process exited AFTER a successful handshake (the daemon
+  // died mid-turn). Retriable — the next call cold-spawns a fresh worker.
+  WORKER_DIED: 'worker-died',
+  // The consumer's per-turn timeout fired before the ACP session/prompt
+  // resolved. Owned by the dashboard pool wrappers (cc-worker-pool itself
+  // has no turn timeout) but exported here so all callers stringify the
+  // same constant. Retriable — most timeouts are transient.
+  CC_TURN_TIMEOUT: 'cc-turn-timeout',
+});
+// Build a typed Error carrying the `{ message, code, retriable }` envelope
+// fields the consumer expects. The helper always constructs a fresh Error
+// from `message` and stamps `code` / `retriable` onto it. Keep retriable
+// defaulting to `true` so a caller that forgets to set it still gets the safe
+// default (the legacy pre-typed-error code path treated every failure as retriable).
+function _typedError(message, code, retriable = true) {
+  const err = new Error(message);
+  err.code = code;
+  err.retriable = retriable;
+  return err;
+}
 // 10 minutes — matches the work-item spec.
 const IDLE_REAPER_MS = 10 * 60 * 1000;
 // Reaper sweep cadence. Not exposed as ENGINE_DEFAULTS to keep the pool
@@ -176,8 +215,13 @@ class Worker {
     try {
       proc = _internals.spawnAcp({ cwd: this.cwd });
     } catch (err) {
-      throw new Error(
-        `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`
+      // spawn() threw synchronously — typically ENOENT (copilot binary not
+      // on PATH) or EACCES. Surface as worker-spawn-failed so the consumer
+      // can show "install the CLI / fix PATH" guidance.
+      throw _typedError(
+        `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
+        ERROR_CODES.WORKER_SPAWN_FAILED,
+        true
       );
     }
     this.proc = proc;
@@ -193,8 +237,13 @@ class Worker {
     const earlyExitPromise = new Promise((_, reject) => {
       earlyExitReject = (code) => {
         this.killed = true;
-        const err = new Error(
-          `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`
+        // Early exit DURING the handshake = acp-handshake-failed (almost
+        // always missing `copilot login`, stale CLI, or daemon crash on
+        // boot). Retriable so re-auth or a CLI upgrade can recover.
+        const err = _typedError(
+          `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (exit ${code})`,
+          ERROR_CODES.ACP_HANDSHAKE_FAILED,
+          true
         );
         this.spawnError = err;
         this._failAllPending(err);
@@ -205,8 +254,13 @@ class Worker {
     proc.once('exit', earlyExitHandler);
     const errorHandler = (err) => {
-      const wrapped = new Error(
-        `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`
+      // proc 'error' event fires when the OS can't actually start the child
+      // (ENOENT after a successful spawn() call, etc.). Treat as a spawn
+      // failure even though we made it past the synchronous spawn() above.
+      const wrapped = _typedError(
+        `copilot --acp failed -- ensure copilot CLI >=1.0.46 and copilot login is complete (${err.message})`,
+        ERROR_CODES.WORKER_SPAWN_FAILED,
+        true
       );
       this.spawnError = wrapped;
       this.killed = true;
@@ -227,7 +281,13 @@ class Worker {
       ]);
       this.sessionId = result && result.sessionId;
       if (!this.sessionId) {
-        throw new Error('copilot --acp failed -- session/new returned no sessionId');
+        // Handshake completed without an error but the daemon didn't hand
+        // back a sessionId — protocol violation or partial init failure.
+        throw _typedError(
+          'copilot --acp failed -- session/new returned no sessionId',
+          ERROR_CODES.ACP_HANDSHAKE_FAILED,
+          true
+        );
       }
     } finally {
       // Either the handshake finished (swap to a persistent exit handler that
@@ -236,7 +296,13 @@ class Worker {
     }
     proc.on('exit', () => {
       this.killed = true;
-      const err = new Error('copilot --acp process exited');
+      // Post-handshake exit = the daemon died mid-conversation. Retriable
+      // because the next call will cold-spawn a fresh worker.
+      const err = _typedError(
+        'copilot --acp process exited',
+        ERROR_CODES.WORKER_DIED,
+        true
+      );
       this._failAllPending(err);
       // Settle inflight too if it's still hanging
       if (this.inflight && !this.inflight.settled) {
@@ -656,9 +722,13 @@ async function getSession({ tabId, model, effort, mcpServers, systemPromptHash,
     // This is the bug class the ab141995 fix closed; if it ever recurs the
     // engine should fail loudly rather than hand back a half-initialized
     // handle. Throwing here lets the dashboard surface spawn-failed instead
-    // of the silent thinking-dots-forever symptom.
-    throw new Error(
-      `cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`
+    // of the silent thinking-dots-forever symptom. Mark non-retriable —
+    // this is a real engine bug, not a transient pool failure; the next
+    // attempt would hit the same race.
+    throw _typedError(
+      `cc-worker-pool: getSession returning handle with null sessionId (tab=${tabId} lifecycle=${lifecycle}) — engine race regression, see W-mpd45blx00072f04 / W-mpdavudb000v8446`,
+      ERROR_CODES.ACP_HANDSHAKE_FAILED,
+      false
     );
   }
@@ -766,4 +836,10 @@ module.exports = {
   IDLE_REAPER_MS,
   REAPER_INTERVAL_MS,
   WARM_MAX_CONCURRENT,
+  // W-mpmwxni2000c25c7-c — typed-error envelope contract. Exported so the
+  // dashboard pool wrappers (and their tests) reference the same string
+  // constants and so the doc-chat timeout path can stamp the same
+  // `{ message, code, retriable }` shape the pool itself emits.
+  ERROR_CODES,
+  _typedError,
 };

package/engine/llm.js CHANGED Viewed

@@ -82,6 +82,21 @@ function trackEngineUsage(category, usage) {
   _ensureFlushTimer();
 }
+// W-mpmwxni2000c25c7-b — silent-error regression counter. Every CC/doc-chat
+// error surfaced through the handlers bumps `_engine[category].errorsByCode[code]`
+// so /api/metrics reflects new error codes (cc-turn-timeout, empty-output, …)
+// without polluting cost/tokens. Counters flush on the same timer as
+// trackEngineUsage so the dashboard's fast-state mtime gate isn't bypassed.
+function trackEngineError(category, errorCode) {
+  if (!category || !errorCode) return;
+  if (category.startsWith('_test') || category.startsWith('test-')) return;
+  if (!_pendingMetrics.engine[category]) _pendingMetrics.engine[category] = _emptyEngineDelta();
+  const cat = _pendingMetrics.engine[category];
+  if (!cat.errorsByCode) cat.errorsByCode = Object.create(null);
+  cat.errorsByCode[errorCode] = (cat.errorsByCode[errorCode] || 0) + 1;
+  _ensureFlushTimer();
+}
 function flushMetricsBuffer() {
   const pending = _pendingMetrics;
   if (!Object.keys(pending.engine).length && !Object.keys(pending.daily).length) return;
@@ -106,6 +121,12 @@ function flushMetricsBuffer() {
           cat.totalDurationMs = (cat.totalDurationMs || 0) + delta.totalDurationMs;
           cat.timedCalls = (cat.timedCalls || 0) + delta.timedCalls;
         }
+        if (delta.errorsByCode) {
+          if (!cat.errorsByCode) cat.errorsByCode = {};
+          for (const [code, count] of Object.entries(delta.errorsByCode)) {
+            cat.errorsByCode[code] = (cat.errorsByCode[code] || 0) + count;
+          }
+        }
       }
       if (!metrics._daily) metrics._daily = {};
       for (const [day, delta] of Object.entries(pending.daily)) {
@@ -129,6 +150,12 @@ function flushMetricsBuffer() {
       c.inputTokens += delta.inputTokens; c.outputTokens += delta.outputTokens;
       c.cacheRead += delta.cacheRead; c.cacheCreation += delta.cacheCreation;
       c.totalDurationMs += delta.totalDurationMs; c.timedCalls += delta.timedCalls;
+      if (delta.errorsByCode) {
+        if (!c.errorsByCode) c.errorsByCode = Object.create(null);
+        for (const [code, count] of Object.entries(delta.errorsByCode)) {
+          c.errorsByCode[code] = (c.errorsByCode[code] || 0) + count;
+        }
+      }
     }
     for (const [day, delta] of Object.entries(pending.daily)) {
       if (!_pendingMetrics.daily[day]) _pendingMetrics.daily[day] = _emptyDailyDelta();
@@ -233,6 +260,8 @@ function _missingRuntimeResult(runtimeName, runtime, reason) {
     errorClass: shared.FAILURE_CLASS.CONFIG_ERROR,
     errorMessage: message,
     missingRuntime: true,
+    error: { message, code: shared.FAILURE_CLASS.CONFIG_ERROR, retriable: false },
+    ok: false,
   };
 }
@@ -245,7 +274,7 @@ function _resolvedCallResult(result) {
 function _resolveRuntimeNameFor(callOpts = {}) {
   let runtimeName = callOpts.cli;
   if (!runtimeName && callOpts.engineConfig) runtimeName = resolveCcCli(callOpts.engineConfig);
-  return runtimeName || 'claude';
+  return runtimeName || 'copilot';
 }
 function _runtimeUnavailableResult(callOpts = {}) {
@@ -566,7 +595,7 @@ function _createStreamAccumulator({
 function _resolveRuntimeFor(callOpts) {
   // Explicit `cli` opt wins; otherwise fall to `engineConfig` resolution;
-  // otherwise default to claude (the historical behavior).
+  // otherwise default to copilot (fleet default as of W-mpmwxkk40007c995).
   return resolveRuntime(_resolveRuntimeNameFor(callOpts));
 }
@@ -599,6 +628,52 @@ function _resolveRuntimeFeatureOpts({
 // ─── Core LLM Call ───────────────────────────────────────────────────────────
+// W-mpmwxni2000c25c7-b — typed-error envelope helper. callLLM /
+// callLLMStreaming attach `error: { message, code, retriable }` to every
+// failure resolution so dashboard CC/doc-chat handlers can surface a
+// structured 5xx JSON or SSE `event: error` instead of returning an empty
+// reply that hangs the UI. The shape mirrors the existing `runtime.parseError`
+// contract from sub-item (a) so adapter classifications (auth-failure,
+// context-limit, budget-exceeded, crash, model-unavailable) propagate
+// verbatim. Engine codes added here:
+//   - 'spawn-error'        runFile/proc.on('error') failure (binary missing,
+//                          EACCES, fork bomb, ...)
+//   - 'runtime-exit'       non-zero exit code with no parseError signal
+//   - 'empty-output'       zero exit but no parsed text — runtime returned
+//                          nothing useful (CLI bug or silent timeout)
+//   - 'unparseable-output' bytes streamed but accumulator extracted no text
+//                          (malformed JSONL or unknown event shape)
+//
+// Existing `errorClass` / `errorMessage` fields stay populated for callers
+// that haven't moved to the typed envelope yet.
+function _buildErrorEnvelope(errInfo, code, parsed, fallback) {
+  if (errInfo && errInfo.code) {
+    return { message: errInfo.message || fallback || 'LLM call failed', code: errInfo.code, retriable: errInfo.retriable !== false };
+  }
+  if (code !== 0 && code !== null) {
+    const stderrTail = parsed && parsed.stderr ? String(parsed.stderr).trim().split('\n').slice(-3).join(' | ').slice(0, 500) : '';
+    return {
+      message: stderrTail ? `Runtime exited with code ${code}: ${stderrTail}` : `Runtime exited with code ${code}`,
+      code: 'runtime-exit',
+      retriable: true,
+    };
+  }
+  if (parsed && parsed.text) return null;
+  const rawLen = parsed && parsed.raw ? String(parsed.raw).length : 0;
+  if (rawLen > 0) {
+    return {
+      message: 'Runtime produced output the adapter could not parse',
+      code: 'unparseable-output',
+      retriable: true,
+    };
+  }
+  return {
+    message: fallback || 'Runtime returned no output',
+    code: 'empty-output',
+    retriable: true,
+  };
+}
 function callLLM(promptText, sysPromptText, opts = {}) {
   const {
     timeout = 120000, label = 'llm', maxTurns = 1, allowedTools = '',
@@ -670,6 +745,7 @@ function callLLM(promptText, sysPromptText, opts = {}) {
       const errInfo = code !== 0
         ? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
         : { message: '', code: null, retriable: true };
+      const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
       resolve({
         text: parsed.text || '',
         usage,
@@ -681,6 +757,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
         runtime: runtime.name,
         errorClass: errInfo.code,
         errorMessage: errInfo.message || null,
+        error: errorEnvelope,
+        ok: !errorEnvelope,
       });
     };
@@ -704,6 +782,8 @@ function callLLM(promptText, sysPromptText, opts = {}) {
         text: '', usage: null, sessionId: null, code: 1,
         stderr: err.message, raw: '', toolUses: [],
         runtime: runtime.name, errorClass: null, errorMessage: null,
+        error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
+        ok: false,
       });
     });
   });
@@ -784,6 +864,7 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
       const errInfo = code !== 0
         ? runtime.parseError([parsed.raw, parsed.stderr].filter(Boolean).join('\n'))
         : { message: '', code: null, retriable: true };
+      const errorEnvelope = _buildErrorEnvelope(errInfo, code, parsed, null);
       resolve({
         text: parsed.text || '',
         usage,
@@ -795,6 +876,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
         runtime: runtime.name,
         errorClass: errInfo.code,
         errorMessage: errInfo.message || null,
+        error: errorEnvelope,
+        ok: !errorEnvelope,
       });
     };
@@ -818,6 +901,8 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
         text: '', usage: null, sessionId: null, code: 1,
         stderr: err.message, raw: '', toolUses: [],
         runtime: runtime.name, errorClass: null, errorMessage: null,
+        error: { message: `Runtime spawn failed: ${err.message}`, code: 'spawn-error', retriable: true },
+        ok: false,
       });
     });
   });
@@ -825,13 +910,74 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
   return promise;
 }
+// ─── CC turn watchdog ────────────────────────────────────────────────────────
+//
+// W-mpmwxni2000c25c7-b — wall-clock cap for a single CC/doc-chat turn. CC turns
+// are a higher-level concept than the per-LLM-call `timeout` opt: a turn can
+// internally retry (resume → fresh → final retry) and each retry has its own
+// per-call timer. Without a turn-level watchdog, a runtime stuck mid-stream
+// (no exit, no chunks, no errors) leaves the SSE handler waiting for the
+// per-call timer to fire and the user staring at the typing dots.
+//
+// Usage: `result = await withCcTurnTimeout({ timeoutMs, label, onAbortReady }, (registerAbort) => callerThatReturnsResultPromise(registerAbort))`.
+// The caller plumbs `registerAbort(abortFn)` into every nested LLM call's
+// `onAbortReady` so the watchdog can kill whichever attempt is in flight on
+// expiry. Returns the original result on success or a synthetic envelope
+// `{ text:'', error:{ code:'cc-turn-timeout', retriable:true } }` on expiry.
+async function withCcTurnTimeout({ timeoutMs, label = 'cc-turn', onAbortReady } = {}, callFn) {
+  if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}));
+  let currentAbort = null;
+  let timedOut = false;
+  let timer = null;
+  const registerAbort = (abort) => {
+    currentAbort = abort;
+    if (onAbortReady) onAbortReady(abort);
+  };
+  const inflight = Promise.resolve().then(() => callFn(registerAbort));
+  const timeoutPromise = new Promise((resolve) => {
+    timer = setTimeout(() => {
+      timedOut = true;
+      try { if (currentAbort) currentAbort(); } catch { /* swallow */ }
+      resolve(null);
+    }, timeoutMs);
+    // NOTE: do NOT unref this timer. If we did, Node would exit the event
+    // loop while waiting on the inflight promise (Promises themselves don't
+    // hold the loop open — only timers/I/O do). The race below clears the
+    // timer immediately on success, so a still-armed timer never leaks past
+    // the resolution.
+  });
+  const winner = await Promise.race([inflight, timeoutPromise]);
+  if (!timedOut) {
+    clearTimeout(timer);
+    return winner;
+  }
+  // Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
+  // sweeps) actually runs before we hand a synthetic envelope to the caller.
+  const settled = await inflight.catch((err) => ({
+    text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
+  }));
+  const message = `CC turn ${label} timed out after ${timeoutMs}ms`;
+  return {
+    ...settled,
+    text: '',
+    code: settled?.code || 1,
+    errorClass: 'cc-turn-timeout',
+    errorMessage: message,
+    error: { message, code: 'cc-turn-timeout', retriable: true },
+    ok: false,
+  };
+}
 module.exports = {
   callLLM,
   callLLMStreaming,
   trackEngineUsage,
+  trackEngineError,
   flushMetricsBuffer,
+  withCcTurnTimeout,
   // Exposed for unit tests — engine code MUST use the runtime adapter contract.
   _buildSpawnAgentFlags,
+  _buildErrorEnvelope,
   _resolveBin,
   _resetBinCache,
   _resetMetricsBufferForTest,

package/engine/preflight.js CHANGED Viewed

@@ -87,17 +87,17 @@ function findClaudeBinary() {
  * `shared.runtimeConfigWarnings` so unknown-CLI warnings and binary checks
  * always cover the same surface.
  *
- * Without a config (legacy callers), returns just `['claude']` — the
- * historical default.
+ * Without a config (legacy callers), returns just `['copilot']` — matches
+ * `ENGINE_DEFAULTS.defaultCli` (W-mpmwxkk40007c995).
  */
 function _distinctRuntimes(config) {
   const set = new Set();
   if (!config || typeof config !== 'object') {
-    set.add('claude');
+    set.add('copilot');
     return Array.from(set);
   }
   const engine = config.engine || {};
-  set.add(engine.defaultCli ? String(engine.defaultCli) : 'claude');
+  set.add(engine.defaultCli ? String(engine.defaultCli) : 'copilot');
   if (engine.ccCli) set.add(String(engine.ccCli));
   for (const agent of Object.values(config.agents || {})) {
     if (agent && agent.cli) set.add(String(agent.cli));
@@ -355,7 +355,7 @@ function _fleetSummaryResults(config) {
   const results = [];
   if (!config || typeof config !== 'object') return results;
   const engine = config.engine || {};
-  const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'claude';
+  const defaultCli = engine.defaultCli ? String(engine.defaultCli) : 'copilot';
   const defaultModel = engine.defaultModel ? String(engine.defaultModel) : '(runtime default)';
   results.push({ name: 'Fleet', ok: true, message: `defaultCli=${defaultCli}  defaultModel=${defaultModel}` });

package/engine/queries.js CHANGED Viewed

@@ -528,7 +528,7 @@ function getAgents(config) {
   return roster.map(a => {
     // Resolve which CLI runtime this agent dispatches to: per-agent override
-    // → engine.defaultCli → 'claude'. Surfaced so the dashboard can show a
+    // → engine.defaultCli → 'copilot'. Surfaced so the dashboard can show a
     // runtime tag next to the agent name.
     const runtime = shared.resolveAgentCli(a, config.engine || {});
     const inboxFiles = allInboxFiles.filter(f => f.includes(a.id));
@@ -1770,19 +1770,18 @@ function _projectGitStatusEqual(a, b) {
 function _scheduleProjectGitStatusRefresh(localPath, key, configuredMainBranch) {
   const existing = _projectGitStatusCache.get(key);
   if (existing && existing.promise) return existing.promise;
-  const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null };
+  const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null, refMtimes: null };
   const prevValue = entry.value;
-  // Capture probe-start time BEFORE running git, not after. Used as the
-  // baseline for `_projectGitRefsAdvancedSince` on the next call. If we
-  // captured probe-END time, a file written just before the probe started
-  // could end up with `mtimeMs >= entry.ts` on a fast filesystem (NTFS
-  // mtime granularity vs millisecond-precise Date.now()), busting the
-  // cache spuriously on the very next read. Probe-START is the safer
-  // anchor — any file with `mtimeMs > probeStartTs` legitimately changed
-  // at-or-after the probe, so re-probing is correct.
+  // Snapshot ref mtimes BEFORE the probe so the next call compares against
+  // an exact baseline rather than a Date.now() timestamp. On Windows
+  // Date.now() can have ~15ms granularity while NTFS mtime is sub-ms, so
+  // a file written shortly before the probe could appear `mtimeMs > ts`
+  // even when nothing actually changed.
   const probeStartTs = Date.now();
+  const probeStartRefMtimes = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
   entry.promise = _probeProjectGitStatus(localPath, configuredMainBranch).then(value => {
     entry.ts = probeStartTs;
+    entry.refMtimes = probeStartRefMtimes;
     entry.value = value;
     entry.promise = null;
     if (_onProjectGitStatusChanged && !_projectGitStatusEqual(prevValue, value)) {
@@ -1857,35 +1856,65 @@ function _resolveCommonGitDir(gitDir) {
   return path.isAbsolute(raw) ? raw : path.resolve(gitDir, raw);
 }
-// Return true when any of the per-project git ref files (logs/HEAD,
-// FETCH_HEAD, refs/remotes/origin/<comparator>) have mtimeMs > cachedTs.
-// Lets `getProjectGitStatus` bypass its 15s TTL after `git pull`, `git
-// fetch`, `git checkout`, etc. so the next /api/status reflects the new
-// HEAD / ahead-behind within one SPA poll instead of waiting out the TTL
-// (W-mphdmr8c00030124). Tolerates ENOENT on FETCH_HEAD / refs (never-
-// fetched repos simply haven't moved those files yet). Cost ≤3 statSync
-// per project per /api/status build — well under the 'cheap' budget
-// called out in getStatusFastStateMtimePaths's docstring.
-function _projectGitRefsAdvancedSince(localPath, cachedTs, configuredMainBranch) {
+// Enumerate the per-project git ref files we watch for cache-busting:
+// logs/HEAD (per-worktree gitdir), FETCH_HEAD + refs/remotes/origin/* (common
+// gitdir for linked worktrees). Same paths as the fast-state mtime tracker
+// so callers see a coherent view across surfaces.
+function _projectGitRefFiles(localPath, configuredMainBranch) {
   const gitDir = _resolveGitDir(localPath);
-  if (!gitDir) return false;
-  // logs/HEAD is per-worktree; FETCH_HEAD + refs/remotes/origin/* live in
-  // the COMMON gitdir for linked worktrees. For the main worktree both
-  // resolve to the same place, so this is a no-op there.
+  if (!gitDir) return null;
   const commonGitDir = _resolveCommonGitDir(gitDir);
-  const candidates = [
+  const files = [
     path.join(gitDir, 'logs', 'HEAD'),
     path.join(commonGitDir, 'FETCH_HEAD'),
   ];
   const comparator = configuredMainBranch && String(configuredMainBranch).trim();
   if (comparator) {
-    candidates.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
+    files.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
   }
-  for (const file of candidates) {
-    try {
-      const st = fs.statSync(file);
-      if (st.mtimeMs > cachedTs) return true;
-    } catch { /* ENOENT / EPERM — file just hasn't moved */ }
+  return files;
+}
+// Snapshot mtimeMs for each ref file. Missing files record `null`. Used as
+// the baseline that the next `getProjectGitStatus` call compares against —
+// inequality, not threshold-vs-timestamp, so the result is precision-
+// independent (Windows `Date.now()` can be 15ms coarse while NTFS mtime is
+// sub-millisecond, which used to make threshold checks fire spuriously on
+// freshly-written files).
+function _snapshotProjectGitRefMtimes(localPath, configuredMainBranch) {
+  const files = _projectGitRefFiles(localPath, configuredMainBranch);
+  if (!files) return null;
+  const out = Object.create(null);
+  for (const f of files) {
+    try { out[f] = fs.statSync(f).mtimeMs; }
+    catch { out[f] = null; /* ENOENT recorded as null — flipping to present must bust */ }
+  }
+  return out;
+}
+// Return true when ANY tracked ref file's mtime (or existence) differs from
+// the snapshot captured during the last probe. Replaces the older threshold-
+// vs-cachedTs check that suffered from `Date.now()`/`mtimeMs` resolution
+// races on Windows. Lets `getProjectGitStatus` bypass its 15s TTL after
+// `git pull`, `git fetch`, `git checkout`, etc. so the next /api/status
+// reflects the new HEAD / ahead-behind within one SPA poll instead of
+// waiting out the TTL (W-mphdmr8c00030124). Cost: 2-3 statSync per call —
+// well under the 'cheap' budget.
+function _projectGitRefsAdvancedSince(localPath, configuredMainBranch, snapshot) {
+  // No snapshot yet (legacy entry shape OR first call) — preserve the
+  // current cached value so the TTL-only fast-path still works. A real
+  // change still surfaces on the next /api/status because the fast-state
+  // mtime tracker watches the same files and will bust the upstream cache.
+  if (!snapshot) return false;
+  const current = _snapshotProjectGitRefMtimes(localPath, configuredMainBranch);
+  if (!current) return false;
+  for (const file of Object.keys(snapshot)) {
+    if (current[file] !== snapshot[file]) return true;
+  }
+  // Also catch a file that appeared since the snapshot (e.g. first `git
+  // fetch` materialises FETCH_HEAD).
+  for (const file of Object.keys(current)) {
+    if (!(file in snapshot)) return true;
   }
   return false;
 }
@@ -1901,14 +1930,25 @@ function getProjectGitStatus(localPath, configuredMainBranch = null) {
   // the pre-pull ahead/behind counts for up to 15s + one SPA poll (~19s
   // user-visible lag) because the rebuilt fast-state still hits this
   // cache and never schedules a refresh until the TTL itself expires.
-  if (cached && cached.ts && (now - cached.ts) < PROJECT_GIT_STATUS_TTL
-      && !_projectGitRefsAdvancedSince(localPath, cached.ts, configuredMainBranch)) {
+  // Revalidate a cached MISSING value via a cheap existsSync. The snapshot-
+  // based freshness check below can't detect "directory came back" because
+  // there was no `.git` to snapshot when we wrote MISSING — without this
+  // gate the cache pins MISSING for the full 15s TTL after the path is
+  // recreated.
+  const cachedIsMissing = cached && cached.value === PROJECT_GIT_STATUS_MISSING;
+  if (cachedIsMissing && fs.existsSync(localPath)) {
+    // Path came back — fall through to schedule a fresh probe.
+  } else if (cached && cached.ts && (now - cached.ts) < PROJECT_GIT_STATUS_TTL
+      && !_projectGitRefsAdvancedSince(localPath, configuredMainBranch, cached.refMtimes)) {
     return cached.value;
   }
   // Cheap synchronous existsSync — short-circuits a path that just disappeared
-  // (project removed) without scheduling a useless git probe.
+  // (project removed) without scheduling a useless git probe. `refMtimes: null`
+  // keeps the entry shape uniform with entries produced by
+  // `_scheduleProjectGitStatusRefresh` so the freshness check above always
+  // sees a defined field.
   if (!fs.existsSync(localPath)) {
-    _projectGitStatusCache.set(key, { ts: now, value: PROJECT_GIT_STATUS_MISSING, promise: null });
+    _projectGitStatusCache.set(key, { ts: now, value: PROJECT_GIT_STATUS_MISSING, promise: null, refMtimes: null });
     return PROJECT_GIT_STATUS_MISSING;
   }
   // Stale or never-populated — kick off a background refresh and return the