@yemi33/minions 0.1.2098 → 0.1.2100

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,7 +145,7 @@ async function openSettings() {
145
145
  // W-mpmwxni2000c25c7-d — per-turn watchdog. Surfaced under CC overrides
146
146
  // because it gates CC/doc-chat error visibility (not the agent fleet).
147
147
  '<div style="display:grid;grid-template-columns:1fr;gap:8px;margin-top:8px">' +
148
- settingsField('CC Turn Timeout', 'set-ccTurnTimeoutMs', e.ccTurnTimeoutMs || 300000, 'ms', 'Per-turn watchdog for CC + doc-chat. If no terminal SSE event arrives within this window the handler emits event: error with code: cc-turn-timeout, the spinner stops, and a Retry button is shown. Clamped to 10000–3600000 ms.') +
148
+ settingsField('CC Turn Timeout', 'set-ccTurnTimeoutMs', e.ccTurnTimeoutMs || 300000, 'ms', 'Per-turn no-progress watchdog for CC + doc-chat. Resets on every liveness signal (token chunk, tool call, tool update) so an actively-streaming turn — long shell command, deep search, sub-agent loop — survives indefinitely up to the outer 1h hard ceiling. Only true silence past this window fires event: error with code: cc-turn-timeout, stops the spinner, and offers Retry. Clamped to 10000–3600000 ms.') +
149
149
  '</div>' +
150
150
  '</details>' +
151
151
  '</div>' +
package/dashboard.js CHANGED
@@ -2566,7 +2566,19 @@ function _getCcLiveStream(tabId) {
2566
2566
  return ccLiveStreams.get(tabId) || null;
2567
2567
  }
2568
2568
  function _touchCcLiveStream(state) {
2569
- if (state) state.updatedAt = Date.now();
2569
+ if (!state) return;
2570
+ state.updatedAt = Date.now();
2571
+ // W-mpmwxni2000c25c7-b — every CC streaming progress event (onChunk,
2572
+ // onToolUse, onToolUpdate in both _invokeCcStream paths) already routes
2573
+ // through here for the stall detector. Piggy-back the per-turn watchdog's
2574
+ // bumpTimer on the same heartbeat so a turn that's actively streaming
2575
+ // tokens or running long tools can't be killed by stale inactivity. The
2576
+ // watchdog installs `_bumpTimer` for the duration of the turn and removes
2577
+ // it in `finally`, so late progress callbacks delivered after resolution
2578
+ // are a no-op.
2579
+ if (typeof state._bumpTimer === 'function') {
2580
+ try { state._bumpTimer(); } catch { /* swallow */ }
2581
+ }
2570
2582
  }
2571
2583
  function _clearCcLiveTimers(tabId) {
2572
2584
  const state = _getCcLiveStream(tabId);
@@ -4225,43 +4237,71 @@ function _buildDocChatErrorEnvelope(result) {
4225
4237
  // with a doc-chat-shaped failure payload that flows through the existing
4226
4238
  // _docChatFailureResponse / SSE error event paths. timeoutMs <= 0 disables
4227
4239
  // the watchdog (passthrough).
4228
- async function _raceCcDocChatTimeout(callPromise, timeoutMs, abortFn, label) {
4229
- if (!timeoutMs || timeoutMs <= 0) return callPromise;
4240
+ async function _raceCcDocChatTimeout(callPromiseOrFactory, timeoutMs, abortFn, label) {
4241
+ // W-mpmwxni2000c25c7-b no-progress semantics. Callers that want a
4242
+ // self-resetting watchdog pass `callPromiseOrFactory` as a function
4243
+ // `(bumpTimer) => Promise<result>`. The factory is invoked on the next
4244
+ // microtask, after the timer is armed, so any onChunk/onToolUse closures defined inside
4245
+ // can capture `bumpTimer` and reset the timer on each liveness signal.
4246
+ // Legacy callers that pass a pre-started Promise get the old wall-clock
4247
+ // behavior — kept for the non-streaming doc-chat path which has no
4248
+ // progress hooks to wire bump into.
4249
+ const isFactory = typeof callPromiseOrFactory === 'function';
4250
+ if (!timeoutMs || timeoutMs <= 0) {
4251
+ return isFactory ? callPromiseOrFactory(() => {}) : callPromiseOrFactory;
4252
+ }
4230
4253
  let timer = null;
4231
4254
  let timedOut = false;
4255
+ let done = false;
4256
+ let timeoutResolve = null;
4257
+ const fire = () => {
4258
+ if (done) return;
4259
+ timedOut = true;
4260
+ try { if (abortFn) abortFn(); } catch { /* swallow */ }
4261
+ if (timeoutResolve) timeoutResolve(null);
4262
+ };
4263
+ const bumpTimer = () => {
4264
+ if (timedOut || done) return;
4265
+ if (timer) clearTimeout(timer);
4266
+ timer = setTimeout(fire, timeoutMs);
4267
+ };
4232
4268
  const timeoutPromise = new Promise((resolve) => {
4233
- timer = setTimeout(() => {
4234
- timedOut = true;
4235
- try { if (abortFn) abortFn(); } catch { /* swallow */ }
4236
- resolve(null);
4237
- }, timeoutMs);
4269
+ timeoutResolve = resolve;
4270
+ timer = setTimeout(fire, timeoutMs);
4238
4271
  // NOTE: do NOT unref — Node would exit the event loop while awaiting the
4239
4272
  // call promise (Promises don't keep the loop open; timers/I/O do). Cleared
4240
4273
  // immediately on the success path below.
4241
4274
  });
4242
- const winner = await Promise.race([callPromise, timeoutPromise]);
4243
- if (!timedOut) {
4244
- clearTimeout(timer);
4245
- return winner;
4246
- }
4247
- // Drain the in-flight call so its cleanup runs before we hand back the
4248
- // synthetic envelope.
4249
- await callPromise.catch(() => null);
4250
- const message = `${label || 'doc-chat'} turn timed out after ${timeoutMs}ms`;
4251
- return {
4252
- answer: 'Document chat request timed out — try again.',
4253
- toolUses: [],
4254
- error: {
4255
- code: 'cc-turn-timeout',
4256
- stderr: '',
4257
- errorClass: 'cc-turn-timeout',
4258
- errorMessage: message,
4259
- runtime: null,
4260
- typedCode: 'cc-turn-timeout',
4261
- typedMessage: message,
4262
- retriable: true,
4263
- },
4264
- };
4275
+ // Factory branch: invoked on the next microtask (Promise.resolve().then), after
4276
+ // the timer is armed, so a synchronous throw becomes a rejection. Promise branch: legacy passthrough.
4277
+ const callPromise = isFactory
4278
+ ? Promise.resolve().then(() => callPromiseOrFactory(bumpTimer))
4279
+ : callPromiseOrFactory;
4280
+ try {
4281
+ const winner = await Promise.race([callPromise, timeoutPromise]);
4282
+ if (!timedOut) return winner;
4283
+ // Drain the in-flight call so its cleanup runs before we hand back the
4284
+ // synthetic envelope.
4285
+ await callPromise.catch(() => null);
4286
+ const message = `${label || 'doc-chat'} turn timed out after ${timeoutMs}ms with no progress`;
4287
+ return {
4288
+ answer: 'Document chat request timed out — try again.',
4289
+ toolUses: [],
4290
+ error: {
4291
+ code: 'cc-turn-timeout',
4292
+ stderr: '',
4293
+ errorClass: 'cc-turn-timeout',
4294
+ errorMessage: message,
4295
+ runtime: null,
4296
+ typedCode: 'cc-turn-timeout',
4297
+ typedMessage: message,
4298
+ retriable: true,
4299
+ },
4300
+ };
4301
+ } finally {
4302
+ done = true;
4303
+ if (timer) { clearTimeout(timer); timer = null; }
4304
+ }
4265
4305
  }
4266
4306
 
4267
4307
  function _docChatFailureResponse(label, filePath, result, sessionPreserved = false) {
@@ -7104,12 +7144,17 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7104
7144
  const ccTurnId = 'cct-' + shared.uid();
7105
7145
  const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
7106
7146
 
7107
- // W-mpmwxni2000c25c7-b — wall-clock turn watchdog. The doc-chat call
7108
- // can internally spawn resume + fresh + final-retry LLM calls; we want
7109
- // ONE wall-clock cap that covers the whole turn so a runtime stuck
7147
+ // W-mpmwxni2000c25c7-b — wall-clock turn watchdog. The non-streaming
7148
+ // doc-chat path has no progress callbacks to wire `bumpTimer` into,
7149
+ // so this caller intentionally uses the legacy promise shape of
7150
+ // _raceCcDocChatTimeout (wall-clock cap, not no-progress). The
7151
+ // doc-chat call can internally spawn resume + fresh + final-retry LLM
7152
+ // calls; one wall-clock cap covers the whole turn so a runtime stuck
7110
7153
  // mid-stream can't outlive ccTurnTimeoutMs. On expiry the watchdog
7111
- // calls _docAbort (kills the in-flight CLI) and the synthesized payload
7112
- // below flows through the existing _docChatFailureResponse path.
7154
+ // calls _docAbort (kills the in-flight CLI) and the synthesized
7155
+ // payload below flows through the existing _docChatFailureResponse
7156
+ // path. Streaming consumers (handleDocChatStream) get the proper
7157
+ // no-progress behavior via the factory shape.
7113
7158
  const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
7114
7159
  const _docCallPromise = ccDocCall({
7115
7160
  message: body.message, document: currentContent, title: body.title,
@@ -7248,25 +7293,27 @@ What would you like to discuss or change? When you're happy, say "approve" and I
7248
7293
  const ccTurnId = 'cct-' + shared.uid();
7249
7294
  const turnSystemPrompt = renderDocChatSystemPromptForTurn(ccTurnId);
7250
7295
 
7251
- // W-mpmwxni2000c25c7-b — wall-clock turn watchdog (mirrors the
7252
- // non-stream handleDocChat path). On expiry _docAbort kills the
7253
- // in-flight LLM and the synthesized payload below flows through the
7254
- // SSE done frame the client already expects with `error` set.
7296
+ // W-mpmwxni2000c25c7-b — no-progress turn watchdog (unlike the
7297
+ // non-stream handleDocChat path, which keeps the wall-clock cap). The factory shape lets the
7298
+ // onChunk/onToolUse/onRetry callbacks close over `bumpTimer` so each chunk,
7299
+ // tool event, or retry resets the timer; only true silence past
7300
+ // `_docTurnTimeoutMs` (no chunks, no tools, no retries) fires `_docAbort` and
7301
+ // surfaces the synthesized payload below through the SSE done frame
7302
+ // the client already expects with `error` set.
7255
7303
  const _docTurnTimeoutMs = _resolveCcTurnTimeoutMs();
7256
- const _docStreamCallPromise = ccDocCallStreaming({
7304
+ const _docStreamResult = await _raceCcDocChatTimeout((bumpTimer) => ccDocCallStreaming({
7257
7305
  message: body.message, document: currentContent, title: body.title,
7258
7306
  filePath: body.filePath, selection: body.selection, canEdit, isJson,
7259
7307
  model: body.model || undefined,
7260
7308
  freshSession: !!body.freshSession,
7261
7309
  transcript: body.transcript,
7262
7310
  onAbortReady: (abort) => { _docAbort = abort; },
7263
- onChunk: (text) => { writeDocEvent({ type: 'chunk', text }); },
7264
- onToolUse: (name, input) => { writeDocEvent({ type: 'tool', name, input: _lightToolInput(input) }); },
7265
- onRetry: (attempt) => { writeDocEvent({ type: 'progress', attempt }); },
7311
+ onChunk: (text) => { bumpTimer(); writeDocEvent({ type: 'chunk', text }); },
7312
+ onToolUse: (name, input) => { bumpTimer(); writeDocEvent({ type: 'tool', name, input: _lightToolInput(input) }); },
7313
+ onRetry: (attempt) => { bumpTimer(); writeDocEvent({ type: 'progress', attempt }); },
7266
7314
  systemPrompt: turnSystemPrompt,
7267
7315
  turnId: ccTurnId,
7268
- });
7269
- const _docStreamResult = await _raceCcDocChatTimeout(_docStreamCallPromise, _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat-stream');
7316
+ }), _docTurnTimeoutMs, () => _docAbort && _docAbort(), 'doc-chat-stream');
7270
7317
  let { answer, partial, warning, toolUses, error: ccError } = _docStreamResult;
7271
7318
  const finalize = _finalizeDocChatEdit({
7272
7319
  filePath: body.filePath, fullPath, isJson, canEdit,
@@ -8565,7 +8612,17 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8565
8612
  const turnTimeoutMs = _resolveCcTurnTimeoutMs();
8566
8613
  const result = await withTimeout({
8567
8614
  timeoutMs: turnTimeoutMs, label: 'command-center-stream',
8568
- }, async (registerAbort) => {
8615
+ }, async (registerAbort, bumpTimer) => {
8616
+ // W-mpmwxni2000c25c7-b — no-progress semantics: install bumpTimer
8617
+ // on liveState so _touchCcLiveStream (called from every onChunk /
8618
+ // onToolUse / onToolUpdate in both _invokeCcStream paths) resets
8619
+ // the watchdog on every liveness signal. A turn that's actively
8620
+ // streaming tokens or running long tools survives indefinitely
8621
+ // up to the outer CC_CALL_TIMEOUT_MS ceiling. Cleared in
8622
+ // `finally` so a late progress event after resolution can't
8623
+ // re-arm against a stale abort target.
8624
+ liveState._bumpTimer = bumpTimer;
8625
+ try {
8569
8626
  const llmPromise = _invokeCcStream({
8570
8627
  prompt, sessionId, liveState, toolUses,
8571
8628
  model: streamModel, effort: streamEffort, maxTurns: ccMaxTurns,
@@ -8614,6 +8671,13 @@ What would you like to discuss or change? When you're happy, say "approve" and I
8614
8671
  }
8615
8672
  }
8616
8673
  return initial;
8674
+ } finally {
8675
+ // Only clear if this turn's bumpTimer is still installed —
8676
+ // belt-and-suspenders for any future code that might swap a
8677
+ // newer bumpTimer in mid-turn (shouldn't happen today, but
8678
+ // protects against silent overwrite).
8679
+ if (liveState._bumpTimer === bumpTimer) delete liveState._bumpTimer;
8680
+ }
8617
8681
  });
8618
8682
  if (result.missingRuntime) {
8619
8683
  finishMissingRuntime(result, liveState);
@@ -27,7 +27,7 @@ Canonical envelope (`_buildCcErrorEnvelope` in `dashboard.js`):
27
27
 
28
28
  `code` is clamped to the allowlist (`CC_ERROR_CODES` constant); unknown codes collapse to `crash`. `retryable: true` is informational — there is **no auto-retry**; the dashboard always offers a manual Retry button instead. Auto-retrying these errors is a footgun because most are operator-fix categories (auth, budget, missing model) where re-spawning makes no progress.
29
29
 
30
- **Watchdog (`engine.ccTurnTimeoutMs`, default 5 min, clamped 10s–1h).** Each turn arms a `setTimeout` that fires `event: error` with `code: 'cc-turn-timeout'`, aborts the in-flight LLM call, and ends the stream when no terminal event (`done`/`error`) arrives in time. Independent of `CC_CALL_TIMEOUT_MS` (the outer 1h hard ceiling); the watchdog is the *visible-to-user* no-progress cap. Surfaced in Settings → CC overrides.
30
+ **Watchdog (`engine.ccTurnTimeoutMs`, default 5 min, clamped 10s–1h).** Per-turn **no-progress** cap. Each turn arms a `setTimeout` that fires `event: error` with `code: 'cc-turn-timeout'`, aborts the in-flight LLM call, and ends the stream when no liveness signal arrives within the window. The timer **resets on every progress event** — token chunk, tool-call notification, tool-update — so a turn that's actively streaming or running long tools (e.g. `Bash` running `npm test`, `WebFetch`/`WebSearch` against slow targets, agent sub-dispatches) survives indefinitely up to the outer 1h `CC_CALL_TIMEOUT_MS` hard ceiling. Wired via `liveState._bumpTimer` (CC stream path) and the factory shape of `_raceCcDocChatTimeout` (doc-chat stream path); the non-streaming doc-chat path has no progress hooks and retains wall-clock semantics. Surfaced in Settings → CC overrides.
31
31
 
32
32
  **No auto-retry policy.** The backend never re-spawns the LLM after an error envelope. The client never silently resends the user's turn. Retry is a single-click manual action — guards against silent budget burn on `budget-exceeded`, infinite loops on `auth-failure`, and accidental re-charges on `context-limit`. The 429 + reconnect paths (rate-limited fetch retry, SSE reconnect-after-disconnect) remain — those are transport-level, not error-envelope-level.
33
33
 
@@ -63,14 +63,22 @@ The `gh-copilot` extension is documented at
63
63
  <https://docs.github.com/en/copilot/github-copilot-in-the-cli>. On this test
64
64
  machine `gh extension list` returned empty, so this path was **not exercised
65
65
  empirically**. The adapter contract still needs to support it for hosts without
66
- the WinGet standalone install:
66
+ the WinGet standalone install.
67
+
68
+ **Detection note (#2965):** `gh ≥ ~2.90` ships Copilot as a built-in preview
69
+ subcommand, NOT as an extension — `gh extension list` does not list it, and
70
+ `gh extension install github/gh-copilot` is rejected because the command is
71
+ already provided. The adapter therefore probes with `gh copilot --help` (with
72
+ `--no-update-notifier`); exit 0 means the subcommand is available via either
73
+ the modern built-in OR a legacy installed extension, and exit non-zero means
74
+ neither path is present. See `engine/runtimes/copilot.js#_findGhCopilotPath`.
67
75
 
68
76
  ```text
69
- gh extension install github/gh-copilot
70
- gh copilot --help # subcommand of gh
77
+ gh extension install github/gh-copilot # only needed for gh < 2.90
78
+ gh copilot --help # detection probe (built-in OR extension)
71
79
  ```
72
80
 
73
- When falling back to the extension form, the adapter must return:
81
+ When falling back to the gh-hosted form, the adapter must return:
74
82
 
75
83
  ```js
76
84
  { bin: '<path-to-gh.exe>', native: true, leadingArgs: ['copilot'] }
@@ -584,8 +592,11 @@ When implementing `engine/runtimes/copilot.js`:
584
592
  2. `resolveBinary()`:
585
593
  - PATH → standalone first; cache to `engine/copilot-caps.json` with
586
594
  `{ copilotBin, copilotIsNative, leadingArgs: [] }`.
587
- - `gh extension list | grep gh-copilot` → fallback with
588
- `leadingArgs: ['copilot']`. Mark the result as `bestEffort: true` so
595
+ - `gh copilot --help` (exit 0) → fallback with
596
+ `leadingArgs: ['copilot']`. This covers BOTH the legacy `gh-copilot`
597
+ extension AND the modern built-in (gh ≥ ~2.90 ships `copilot` as a
598
+ built-in subcommand, not an extension — so `gh extension list` no longer
599
+ surfaces it; #2965). Mark the result as `bestEffort: true` so
589
600
  preflight can warn.
590
601
  - **Never** probe npm. Document this in the file header.
591
602
  3. `buildArgs(opts)` always emits:
@@ -65,6 +65,22 @@ The engine does not cap review→fix cycles or build-fix attempts. Each trigger
65
65
  - Routes to the PR author to resolve target-branch conflicts
66
66
  - Runs after review, human, and build triggers; if any earlier trigger enqueued a fix for this PR, the conflict fix waits for a later discovery pass
67
67
 
68
+ ### E. Paused causes (`_noOpFixes[cause].paused`)
69
+
70
+ Each fix cause (build-failure, review-feedback, human-feedback, merge-conflict, pr-fix) tracks repeated no-op outcomes in `target._noOpFixes[cause]`. When `recordPrNoOpFixAttempt` flips `paused` from false to true (typically after multiple no-op dispatches with the same evidence fingerprint), the engine:
71
+
72
+ - Writes a per-PR per-cause inbox alert with PR URL, branch, attempt count, before/after heads, the underlying reason, and three recovery paths.
73
+ - Enriches the PR record returned from `queries.getPullRequests()` with `_pausedCauses` (the list of currently-paused cause keys).
74
+ - Renders a red `_pausedCauses` chip per cause in the dashboard PR list (`dashboard/js/render-prs.js`).
75
+
76
+ A paused cause **suppresses further auto-dispatch for that cause** until cleared. Recovery paths (any one):
77
+
78
+ 1. **Push a new SHA to the PR branch** — the next poll refreshes `pr.headSha`, the evidence fingerprint shifts, and `clearPrNoOpFixAttempt` runs on the next non-noop completion.
79
+ 2. **Click the red chip on the dashboard** — confirms, then `POST /api/pull-requests/clear-paused-cause` (`dashboard.js`, issue #2969) validates the cause against `shared.PR_FIX_CAUSE`, locates the PR via `mutatePullRequests` across project/central files, and calls `clearPrNoOpFixAttempt` to wipe the cause record.
80
+ 3. **Direct API call** — `POST /api/pull-requests/clear-paused-cause` with `{ prId, cause }`.
81
+
82
+ The exported `recordPrNoOpFixAttempt` / `clearPrNoOpFixAttempt` helpers in `engine/lifecycle.js` are the only sanctioned entry points; the dispatch evaluator in `engine/lifecycle.js` (search `_noOpFixes[cause]`) gates re-dispatch on `record.paused === true`.
83
+
68
84
  ## 5. Fix completes
69
85
 
70
86
  - `updatePrAfterFix()` (lifecycle.js) sets `reviewStatus = 'waiting'` + `fixedAt = ts()`
@@ -128,6 +144,8 @@ The engine does not cap review→fix cycles or build-fix attempts. Each trigger
128
144
  | `lastReviewedAt` | `updatePrAfterReview()` | Prevents re-dispatch if reviewed |
129
145
  | `minionsReview` | Post-completion hooks | `{ reviewer, reviewedAt, note, fixedAt }` |
130
146
  | `humanFeedback` | `pollPrHumanComments()` | `{ pendingFix, feedbackContent, lastProcessedCommentDate }` |
147
+ | `_noOpFixes[cause]` | `recordPrNoOpFixAttempt()` | Per-cause record `{ count, paused, fingerprint, beforeHead, afterHead }` driving the issue #2969 pause loop |
148
+ | `_pausedCauses` | `queries.getPullRequests()` enrichment | Read-only list of currently-paused cause keys for UI rendering (computed from `_noOpFixes`) |
131
149
 
132
150
  ## Platform differences
133
151
 
package/engine/llm.js CHANGED
@@ -913,60 +913,88 @@ function callLLMStreaming(promptText, sysPromptText, opts = {}) {
913
913
 
914
914
  // ─── CC turn watchdog ────────────────────────────────────────────────────────
915
915
  //
916
- // W-mpmwxni2000c25c7-b — wall-clock cap for a single CC/doc-chat turn. CC turns
916
+ // W-mpmwxni2000c25c7-b — no-progress cap for a single CC/doc-chat turn. CC turns
917
917
  // are a higher-level concept than the per-LLM-call `timeout` opt: a turn can
918
918
  // internally retry (resume → fresh → final retry) and each retry has its own
919
919
  // per-call timer. Without a turn-level watchdog, a runtime stuck mid-stream
920
920
  // (no exit, no chunks, no errors) leaves the SSE handler waiting for the
921
921
  // per-call timer to fire and the user staring at the typing dots.
922
922
  //
923
- // Usage: `result = await withCcTurnTimeout({ timeoutMs, label, onAbortReady }, (registerAbort) => callerThatReturnsResultPromise(registerAbort))`.
923
+ // Semantics: `timeoutMs` is a *no-progress* window, not a wall-clock cap.
924
+ // Callers reset it by invoking `bumpTimer()` whenever a liveness signal
925
+ // arrives (token chunk, tool-call notification, tool-update). A long but
926
+ // actively-streaming turn survives indefinitely up to the outer call-level
927
+ // hard ceiling (CC_CALL_TIMEOUT_MS, ~1h). Only true silence past `timeoutMs`
928
+ // — no chunks, no tool events, no exit — fires the cancel. Without
929
+ // `bumpTimer()` calls the behavior degrades to the old wall-clock cap.
930
+ //
931
+ // Usage: `result = await withCcTurnTimeout({ timeoutMs, label, onAbortReady },
932
+ // (registerAbort, bumpTimer) => callerThatReturnsResultPromise(registerAbort, bumpTimer))`.
924
933
  // The caller plumbs `registerAbort(abortFn)` into every nested LLM call's
925
934
  // `onAbortReady` so the watchdog can kill whichever attempt is in flight on
926
- // expiry. Returns the original result on success or a synthetic envelope
935
+ // expiry, and invokes `bumpTimer()` from every progress callback so a still-
936
+ // productive turn doesn't get killed by stale inactivity. Returns the original
937
+ // result on success or a synthetic envelope
927
938
  // `{ text:'', error:{ code:'cc-turn-timeout', retriable:true } }` on expiry.
928
939
  async function withCcTurnTimeout({ timeoutMs, label = 'cc-turn', onAbortReady } = {}, callFn) {
929
- if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}));
940
+ if (!timeoutMs || timeoutMs <= 0) return callFn(onAbortReady || (() => {}), () => {});
930
941
  let currentAbort = null;
931
942
  let timedOut = false;
943
+ let done = false;
932
944
  let timer = null;
945
+ let timeoutResolve = null;
933
946
  const registerAbort = (abort) => {
934
947
  currentAbort = abort;
935
948
  if (onAbortReady) onAbortReady(abort);
936
949
  };
937
- const inflight = Promise.resolve().then(() => callFn(registerAbort));
950
+ const fire = () => {
951
+ if (done) return;
952
+ timedOut = true;
953
+ try { if (currentAbort) currentAbort(); } catch { /* swallow */ }
954
+ if (timeoutResolve) timeoutResolve(null);
955
+ };
956
+ // bumpTimer is a no-op after the turn settles (done=true) so any late
957
+ // progress callback that fires post-resolution (e.g. a queued onChunk
958
+ // delivered after the SSE finished) cannot re-arm the watchdog and
959
+ // accidentally cancel a *future* unrelated abort target via a stale
960
+ // currentAbort reference.
961
+ const bumpTimer = () => {
962
+ if (timedOut || done) return;
963
+ if (timer) clearTimeout(timer);
964
+ timer = setTimeout(fire, timeoutMs);
965
+ };
938
966
  const timeoutPromise = new Promise((resolve) => {
939
- timer = setTimeout(() => {
940
- timedOut = true;
941
- try { if (currentAbort) currentAbort(); } catch { /* swallow */ }
942
- resolve(null);
943
- }, timeoutMs);
967
+ timeoutResolve = resolve;
968
+ timer = setTimeout(fire, timeoutMs);
944
969
  // NOTE: do NOT unref this timer. If we did, Node would exit the event
945
970
  // loop while waiting on the inflight promise (Promises themselves don't
946
971
  // hold the loop open — only timers/I/O do). The race below clears the
947
972
  // timer immediately on success, so a still-armed timer never leaks past
948
973
  // the resolution.
949
974
  });
950
- const winner = await Promise.race([inflight, timeoutPromise]);
951
- if (!timedOut) {
952
- clearTimeout(timer);
953
- return winner;
975
+ const inflight = Promise.resolve().then(() => callFn(registerAbort, bumpTimer));
976
+ try {
977
+ const winner = await Promise.race([inflight, timeoutPromise]);
978
+ if (!timedOut) return winner;
979
+ // Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
980
+ // sweeps) actually runs before we hand a synthetic envelope to the caller.
981
+ const settled = await inflight.catch((err) => ({
982
+ text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
983
+ }));
984
+ const message = `CC turn ${label} timed out after ${timeoutMs}ms with no progress`;
985
+ return {
986
+ ...settled,
987
+ text: '',
988
+ code: settled?.code || 1,
989
+ errorClass: 'cc-turn-timeout',
990
+ errorMessage: message,
991
+ error: { message, code: 'cc-turn-timeout', retriable: true },
992
+ ok: false,
993
+ };
994
+ } finally {
995
+ done = true;
996
+ if (timer) { clearTimeout(timer); timer = null; }
954
997
  }
955
- // Let the in-flight call settle so its cleanup (cleanupFiles/Dirs, kill
956
- // sweeps) actually runs before we hand a synthetic envelope to the caller.
957
- const settled = await inflight.catch((err) => ({
958
- text: '', usage: null, sessionId: null, code: 1, stderr: String(err && err.message || err), raw: '', toolUses: [],
959
- }));
960
- const message = `CC turn ${label} timed out after ${timeoutMs}ms`;
961
- return {
962
- ...settled,
963
- text: '',
964
- code: settled?.code || 1,
965
- errorClass: 'cc-turn-timeout',
966
- errorMessage: message,
967
- error: { message, code: 'cc-turn-timeout', retriable: true },
968
- ok: false,
969
- };
970
998
  }
971
999
 
972
1000
  module.exports = {
package/engine/shared.js CHANGED
@@ -2072,7 +2072,7 @@ const ENGINE_DEFAULTS = {
2072
2072
  removeWorktreeFailureTtlMs: 24 * 60 * 60 * 1000, // stale failed paths are forgotten after a day
2073
2073
  removeWorktreeFailureMaxEntries: 1000, // bound failed-worktree retry suppression cache
2074
2074
  ccMaxTurns: 50, // max tool-use turns per CC/doc-chat call before CLI stops (per response, not per session)
2075
- ccTurnTimeoutMs: 300000, // W-mpmwxni2000c25c7-b/-d: 5min per-turn watchdog. Wall-clock cap per CC/doc-chat turn; on expiry the in-flight LLM call is aborted and the handler surfaces `{code:'cc-turn-timeout', retryable:true}` via the typed error envelope so the UI can stop the spinner and offer Retry. Clamped to [10000, 3600000] in the settings POST handler. Independent of CC_CALL_TIMEOUT_MS (the outer hour-long abort) this is the visible-to-user no-progress cap.
2075
+ ccTurnTimeoutMs: 300000, // W-mpmwxni2000c25c7-b/-d: 5min per-turn no-progress watchdog. The window resets on every liveness signal — token chunk, tool-call notification, tool-update — so an actively-streaming CC/doc-chat turn (long shell command, deep search, sub-agent loop) survives indefinitely up to the outer CC_CALL_TIMEOUT_MS (~1h) ceiling. Only true silence past this window with no progress fires the cancel: the in-flight LLM call is aborted and the handler surfaces `{code:'cc-turn-timeout', retryable:true}` via the typed error envelope so the UI can stop the spinner and offer Retry. Clamped to [10000, 3600000] in the settings POST handler. Independent of CC_CALL_TIMEOUT_MS. Non-streaming doc-chat is the lone wall-clock exception (no progress hooks); see _raceCcDocChatTimeout in dashboard.js for the dual factory/promise shape.
2076
2076
  docSessionMaxEntries: 200, // cap doc-chat session map/disk store by least-recent activity (LRU; sessions are non-expiring otherwise)
2077
2077
  ccLiveStreamMaxAgeMs: 30 * 60 * 1000, // hard cap reconnect buffers if abort/cleanup stalls
2078
2078
  metricsFlushIntervalMs: 10000, // batch trackEngineUsage writes to metrics.json — flushed every 10s instead of per-call to cut lock contention and dashboard mtime churn
@@ -5202,6 +5202,14 @@ function getPrFixAutomationCause({ dispatchKey = '', source = '', task = '' } =
5202
5202
  return PR_FIX_CAUSE.UNKNOWN;
5203
5203
  }
5204
5204
 
5205
+ // Source-branch head SHA, normalized across hosts. GitHub PRs carry
5206
+ // `headSha`/`headRefOid` (engine/github.js:718-742 keeps both in sync); ADO PRs
5207
+ // carry `_adoSourceCommit`/`headRefOid` (engine/ado.js:1083-1129) and a legacy
5208
+ // `_adoHeadCommit`. Mirrors engine/lifecycle.js:1849 getPrFixBaselineHead.
5209
+ function _prHeadSha(pr) {
5210
+ return String(pr?.headRefOid || pr?.headSha || pr?._adoSourceCommit || pr?._adoHeadCommit || '').trim();
5211
+ }
5212
+
5205
5213
  function prFixEvidenceFingerprint(pr, cause = PR_FIX_CAUSE.UNKNOWN) {
5206
5214
  const review = pr?.minionsReview || {};
5207
5215
  const feedback = pr?.humanFeedback || {};
@@ -5214,6 +5222,13 @@ function prFixEvidenceFingerprint(pr, cause = PR_FIX_CAUSE.UNKNOWN) {
5214
5222
  evidence.buildFailReason = pr?.buildFailReason || '';
5215
5223
  evidence.buildErrorLog = pr?.buildErrorLog || '';
5216
5224
  evidence.buildStatusDetail = pr?._buildStatusDetail || '';
5225
+ // #2979 — head SHA + lastPushedAt are the only fingerprint components that
5226
+ // change across a rebase + force-push. Without them, a no-op-fix pause was
5227
+ // sticky forever because failing pipeline name / fail reason / error log
5228
+ // are unchanged across the push. Existing paused records re-fingerprint on
5229
+ // the next poll and clear naturally when the head moves.
5230
+ evidence.headRefOid = _prHeadSha(pr);
5231
+ evidence.lastPushedAt = pr?.lastPushedAt || '';
5217
5232
  } else if (cause === PR_FIX_CAUSE.MERGE_CONFLICT) {
5218
5233
  evidence.mergeConflict = !!pr?._mergeConflict;
5219
5234
  evidence.mergeStatus = pr?.mergeStatus || '';
@@ -5223,6 +5238,13 @@ function prFixEvidenceFingerprint(pr, cause = PR_FIX_CAUSE.UNKNOWN) {
5223
5238
  evidence.lastReviewedAt = pr?.lastReviewedAt || '';
5224
5239
  evidence.reviewedAt = review.reviewedAt || '';
5225
5240
  evidence.reviewNote = review.note || pr?.reviewNote || '';
5241
+ // #2979 — same rationale as BUILD_FAILURE: review feedback fingerprints
5242
+ // were sticky across force-push because reviewStatus / reviewedAt /
5243
+ // reviewNote don't change when the author rebases. Adding the head SHA
5244
+ // gives REVIEW_FEEDBACK the same natural-unsticking property HUMAN_FEEDBACK
5245
+ // already has via lastProcessedCommentDate.
5246
+ evidence.headRefOid = _prHeadSha(pr);
5247
+ evidence.lastPushedAt = pr?.lastPushedAt || '';
5226
5248
  }
5227
5249
  return crypto.createHash('sha1').update(JSON.stringify(evidence)).digest('hex').slice(0, 16);
5228
5250
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2098",
3
+ "version": "0.1.2100",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"