npm - @link-assistant/hive-mind - Versions diffs - 1.34.2 → 1.34.4 - Mend

@link-assistant/hive-mind 1.34.2 → 1.34.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/CHANGELOG.md +22 -0
package/package.json +1 -1
package/src/claude.lib.mjs +65 -52
package/src/config.lib.mjs +8 -2
package/src/solve.auto-merge.lib.mjs +68 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,27 @@
 # @link-assistant/hive-mind
+## 1.34.4
+### Patch Changes
+- c3806b5: Fix missing log upload on tool failure and make HTTP 529 overload error retryable (Issue #1439)
+  Two fixes:
+  1. When `--attach-logs` is enabled and the tool execution fails during an auto-restart session, the failure log was not being uploaded to GitHub. Now the log is attached before stopping on both tool execution failure paths.
+  2. HTTP 529 (Anthropic "Overloaded") errors were not recognized as transient/retryable by the outer retry loop. The code only matched `API Error: 500` + `Overloaded`, but 529 uses `API Error: 529` + `overloaded_error`. Now both 500 and 529 overload errors trigger the retry logic with exponential backoff.
+## 1.34.3
+### Patch Changes
+- 22a8868: Fail fast when API signals x-should-retry: false and retries make no progress (Issue #1437). Increase minimum retry delay to 2 minutes.
+  When the Anthropic API returns HTTP 500 with `x-should-retry: false` AND subsequent retries immediately fail with `num_turns <= 1`, the outer retry loop now exits early instead of waiting through up to 10 retries with exponential backoff. This prevents stuck sessions where recovery is impossible.
+  Two new signals are tracked: (1) `apiMarkedNotRetryable` — set when `ANTHROPIC_LOG=debug` stderr contains `"error; not retryable"` or `x-should-retry: false`; (2) `resultNumTurns` — captured from the result event to detect sessions that failed immediately on resume. If both conditions still hold once `HIVE_MIND_MAX_NOT_RETRYABLE_ATTEMPTS` (default: 5) retry attempts have been made, the loop fails fast with a clear error message instead of working through the remaining retries.
+  The minimum retry delay for transient API errors (Overloaded, 503, Internal Server Error) is increased from 1 minute to 2 minutes (`HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS`), giving the API more time to recover between retries.
 ## 1.34.2
 ### Patch Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@link-assistant/hive-mind",
-  "version": "1.34.2",
+  "version": "1.34.4",
   "description": "AI-powered issue solver and hive mind for collaborative problem solving",
   "main": "src/hive.mjs",
   "type": "module",

package/src/claude.lib.mjs CHANGED Viewed

@@ -129,8 +129,8 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
         return null;
       };
       const jsonError = checkForJsonError(stdout) || checkForJsonError(stderr);
-      // Check for API overload error pattern
-      const isOverloadError = (stdout.includes('API Error: 500') && stdout.includes('Overloaded')) || (stderr.includes('API Error: 500') && stderr.includes('Overloaded')) || (jsonError && jsonError.type === 'api_error' && jsonError.message === 'Overloaded');
+      // Check for API overload error pattern (Issue #1439: also detect 529 overloaded_error)
+      const isOverloadError = (stdout.includes('API Error: 500') && stdout.includes('Overloaded')) || (stdout.includes('API Error: 529') && stdout.includes('Overloaded')) || (stderr.includes('API Error: 500') && stderr.includes('Overloaded')) || (stderr.includes('API Error: 529') && stderr.includes('Overloaded')) || (jsonError && (jsonError.type === 'api_error' || jsonError.type === 'overloaded_error') && jsonError.message === 'Overloaded');
       // Handle overload errors with retry
       if (isOverloadError) {
         if (retryCount < maxRetries) {
@@ -168,7 +168,7 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
       }
       // Check for error patterns in successful response
       if (jsonError) {
-        if (jsonError.type === 'api_error' && jsonError.message === 'Overloaded') {
+        if ((jsonError.type === 'api_error' || jsonError.type === 'overloaded_error') && jsonError.message === 'Overloaded') {
           if (retryCount < maxRetries) {
             const delay = baseDelay * Math.pow(2, retryCount);
             await log(`⚠️ API overload error in response. Retrying in ${delay / 1000} seconds...`, {
@@ -193,7 +193,7 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
       return true;
     } catch (error) {
       const errorStr = error.message || error.toString();
-      if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded'))) {
+      if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('API Error: 529') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || (errorStr.includes('overloaded_error') && errorStr.includes('Overloaded'))) {
         if (retryCount < maxRetries) {
           const delay = baseDelay * Math.pow(2, retryCount);
           await log(`⚠️ API overload error during validation. Retrying in ${delay / 1000} seconds...`, {
@@ -841,6 +841,8 @@ export const executeClaudeCommand = async params => {
     let is503Error = false;
     let isInternalServerError = false; // Issue #1331: Track 500 Internal server error
     let isRequestTimeout = false; // Issue #1353: Track "Request timed out" from Claude CLI
+    let apiMarkedNotRetryable = false; // Issue #1437: Track when API explicitly signals x-should-retry: false
+    let resultNumTurns = 0; // Issue #1437: Track num_turns from result event to detect stuck retries
     let stderrErrors = [];
     let resultSuccessReceived = false; // Issue #1354: Track if result success event was received
     let anthropicTotalCostUSD = null; // Capture Anthropic's official total_cost_usd from result
@@ -881,14 +883,10 @@ export const executeClaudeCommand = async params => {
     try {
       // Resolve thinking settings (see issue #1146)
       const { thinkingBudget: resolvedThinkingBudget, thinkLevel, isNewVersion, maxBudget } = await resolveThinkingSettings(argv, log);
-      // Set CLAUDE_CODE_MAX_OUTPUT_TOKENS (see issue #1076), MAX_THINKING_TOKENS (see issue #1146),
-      // MCP timeout configurations (see issue #1066), and CLAUDE_CODE_EFFORT_LEVEL for Opus 4.6 (Issue #1238)
-      // Pass model for model-specific max output tokens (Issue #1221)
-      // Pass thinkLevel and maxBudget for Opus 4.6 effort level conversion (Issue #1238)
+      // Set CLAUDE_CODE_MAX_OUTPUT_TOKENS (#1076; model-specific via `model`, #1221), MAX_THINKING_TOKENS (#1146),
+      // MCP timeouts (#1066), and CLAUDE_CODE_EFFORT_LEVEL derived from thinkLevel/maxBudget (#1238)
       const claudeEnv = getClaudeEnv({ thinkingBudget: resolvedThinkingBudget, model: mappedModel, thinkLevel, maxBudget });
-      // Issue #1337: Enable ANTHROPIC_LOG=debug in --verbose mode to diagnose slow API requests.
-      // The BashTool pre-flight check suggests "Run with ANTHROPIC_LOG=debug to check for failed or slow API requests."
-      // When --verbose is enabled, we propagate ANTHROPIC_LOG=debug so users can see detailed API request info.
+      // Issue #1337: Enable ANTHROPIC_LOG=debug in --verbose mode for detailed API request diagnostics.
       if (argv.verbose) {
         claudeEnv.ANTHROPIC_LOG = 'debug';
       }
@@ -923,14 +921,9 @@ export const executeClaudeCommand = async params => {
       // Issue #1183: Line buffer for NDJSON stream parsing - accumulate incomplete lines across chunks
       // Long JSON messages (e.g., result with total_cost_usd) may be split across multiple stdout chunks
       let stdoutLineBuffer = '';
-      // Issue #1280: Track result event and timeout for hung processes
-      // Root cause: command-stream's stream() async iterator waits for BOTH process exit AND
-      // stdout/stderr pipe close before emitting 'end'. If the CLI process keeps stdout open after
-      // sending the result event, pumpReadable() hangs → finish() never fires → stream never ends.
-      // Additionally, command-stream v0.9.4 does NOT yield {type:'exit'} chunks from stream(),
-      // so the exit code detection via chunk.type==='exit' below is dead code.
-      // Workaround: after receiving the result event, start a timeout to force-kill the process.
-      // See: https://github.com/link-foundation/command-stream/issues/155
+      // Issue #1280: Track result event and timeout for hung processes.
+      // command-stream's stream() waits for BOTH process exit AND stdout/stderr pipe close; if the CLI keeps
+      // stdout open, the stream never ends. Workaround: force-kill after the result event. See link-foundation/command-stream issue #155
       let resultEventReceived = false;
       let resultTimeoutId = null;
       let forceExitTriggered = false;
@@ -1025,12 +1018,16 @@ export const executeClaudeCommand = async params => {
                 } else if (data.total_cost_usd !== undefined && data.total_cost_usd !== null) {
                   await log(`💰 Anthropic cost from ${data.subtype || 'unknown'} result ignored: $${data.total_cost_usd.toFixed(6)}`, { verbose: true });
                 }
-                // Issue #1263: Extract result summary for --attach-solution-summary and --auto-attach-solution-summary
-                // The result field contains the AI's summary of the work done
+                // Issue #1263: Extract result summary (AI's summary of work done) for --attach-solution-summary
                 if (data.subtype === 'success' && data.result && typeof data.result === 'string') {
                   resultSummary = data.result;
                   await log('📝 Captured result summary from Claude output', { verbose: true });
                 }
+                // Issue #1437: Capture num_turns to detect stuck retries (degrading turn count signals non-recovery)
+                if (data.num_turns !== undefined) {
+                  resultNumTurns = data.num_turns;
+                  await log(`📊 Session num_turns: ${resultNumTurns}`, { verbose: true });
+                }
                 if (data.is_error === true) {
                   lastMessage = data.result || JSON.stringify(data);
                   const subtype = data.subtype || 'unknown';
@@ -1070,11 +1067,11 @@ export const executeClaudeCommand = async params => {
                 const content = Array.isArray(data.message.content) ? data.message.content : [data.message.content];
                 for (const item of content) {
                   if (item.type === 'text' && item.text) {
-                    // Check for the specific 500 overload error pattern
-                    if (item.text.includes('API Error: 500') && item.text.includes('api_error') && item.text.includes('Overloaded')) {
+                    // Check for the specific 500/529 overload error pattern (Issue #1439: 529 is also an overload)
+                    if ((item.text.includes('API Error: 500') || item.text.includes('API Error: 529')) && (item.text.includes('api_error') || item.text.includes('overloaded_error')) && item.text.includes('Overloaded')) {
                       isOverloadError = true;
                       lastMessage = item.text;
-                      await log('⚠️ Detected API overload error', { verbose: true });
+                      await log(`⚠️ Detected API overload error${item.text.includes('529') ? ' (529)' : ' (500)'}`, { verbose: true });
                     }
                     if (item.text.includes('API Error: 500') && item.text.includes('Internal server error') && !item.text.includes('Overloaded')) {
                       isInternalServerError = true;
@@ -1111,10 +1108,7 @@ export const executeClaudeCommand = async params => {
                 await log(line, { stream: 'raw' });
                 lastMessage = line;
-                // Detect Claude Code terms acceptance message (Issue #1015)
-                // When Claude CLI requires terms acceptance, it outputs a non-JSON message like:
-                // "[ACTION REQUIRED] An update to our Consumer Terms and Privacy Policy has taken effect..."
-                // This should be treated as an error requiring human intervention, not success
+                // Issue #1015: Detect terms acceptance prompt (non-JSON "[ACTION REQUIRED]..." message)
                 const termsAcceptancePattern = /\[ACTION REQUIRED\].*terms|must run.*claude.*review.*terms/i;
                 if (termsAcceptancePattern.test(line)) {
                   commandFailed = true;
@@ -1129,11 +1123,16 @@ export const executeClaudeCommand = async params => {
           // Log stderr immediately
           if (errorOutput) {
             await log(errorOutput, { stream: 'stderr' });
-            // Issue #1354: Split multi-line stderr chunks and check each line individually.
-            // A single chunk may contain multiple newline-separated JSON messages (e.g. two
-            // consecutive {"level":"warn",...} lines). Passing the whole chunk to isStderrError()
-            // causes JSON.parse() to fail (multi-object is not valid JSON), falling through to
-            // keyword matching and producing false positives on words like "failed".
+            // Issue #1437: Detect the "not retryable" / x-should-retry: false markers in ANTHROPIC_LOG=debug
+            // output — they signal a non-transient error; used to fail fast instead of blindly retrying.
+            if (errorOutput.includes('not retryable') || errorOutput.includes("'x-should-retry': 'false'") || errorOutput.includes('"x-should-retry": "false"')) {
+              if (!apiMarkedNotRetryable) {
+                apiMarkedNotRetryable = true;
+                await log('⚠️ API signaled error is not retryable (x-should-retry: false)', { verbose: true });
+              }
+            }
+            // Issue #1354: Split multi-line chunks — a chunk may contain multiple JSON messages;
+            // passing the whole chunk to isStderrError() causes JSON.parse() to fail.
             for (const line of errorOutput.split('\n')) {
               if (isStderrError(line)) {
                 stderrErrors.push(line.trim());
@@ -1141,9 +1140,7 @@ export const executeClaudeCommand = async params => {
             }
           }
         } else if (chunk.type === 'exit') {
-          // Note: command-stream v0.9.4 stream() does NOT yield exit chunks (Issue #1280).
-          // Exit code is obtained from execCommand.result.code after the loop.
-          // This branch is kept for forward-compatibility if command-stream adds exit chunks.
+          // Note: command-stream v0.9.4 stream() does NOT yield exit chunks (Issue #1280) — kept for forward-compat.
           exitCode = chunk.code;
           if (chunk.code !== 0) {
             commandFailed = true;
@@ -1172,9 +1169,7 @@ export const executeClaudeCommand = async params => {
           await log('✅ Stream closed normally after result event', { verbose: true });
         }
       }
-      // Issue #1165: Check actual exit code from command result for more reliable detection
-      // The .stream() method may not emit 'exit' chunks, but the command object still tracks the exit code
-      // Exit code 127 is the standard Unix convention for "command not found"
+      // Issue #1165: Check actual exit code from command result (stream() may not emit 'exit' chunks)
       if (execCommand.result && typeof execCommand.result.code === 'number') {
         const resultExitCode = execCommand.result.code;
         if (exitCode === 0 && resultExitCode !== 0) {
@@ -1197,20 +1192,39 @@ export const executeClaudeCommand = async params => {
         }
       }
-      // Issue #1331: Unified handler for all transient API errors (Overloaded, 503, Internal Server Error)
-      // Issue #1353: Also handle "Request timed out" — Claude CLI times out after exhausting its own retries
-      // All use exponential backoff with session preservation via --resume
-      const isTransientError = isOverloadError || isInternalServerError || is503Error || isRequestTimeout || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure'))) || lastMessage === 'Request timed out' || lastMessage.includes('Request timed out');
+      // Issues #1331, #1353: Unified handler for transient API errors (Overloaded, 503, Internal Server Error,
+      // Request timed out). All use exponential backoff with session preservation via --resume.
+      const isTransientError = isOverloadError || isInternalServerError || is503Error || isRequestTimeout || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('API Error: 529') && (lastMessage.includes('overloaded_error') || lastMessage.includes('Overloaded'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || (lastMessage.includes('overloaded_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure'))) || lastMessage === 'Request timed out' || lastMessage.includes('Request timed out');
       if ((commandFailed || isTransientError) && isTransientError) {
-        // Issue #1353: Use timeout-specific backoff params (5min–1hr) vs general transient params (1min–30min)
-        // Timeouts indicate network instability — Claude CLI already exhausted its own retries, so we need longer waits
+        // Issue #1353: Timeouts use longer backoff (5min–1hr) vs general transient (2min–30min)
         const maxRetries = isRequestTimeout ? retryLimits.maxRequestTimeoutRetries : retryLimits.maxTransientErrorRetries;
         const initialDelay = isRequestTimeout ? retryLimits.initialRequestTimeoutDelayMs : retryLimits.initialTransientErrorDelayMs;
         const maxDelay = isRequestTimeout ? retryLimits.maxRequestTimeoutDelayMs : retryLimits.maxTransientErrorDelayMs;
+        // Issue #1437: Fail fast when API signals x-should-retry: false AND session made no progress
+        // (num_turns <= 1). Allow maxNotRetryableAttempts before giving up (signal can be wrong sometimes).
+        const isStuckRetry = apiMarkedNotRetryable && retryCount >= retryLimits.maxNotRetryableAttempts && resultNumTurns <= 1;
+        if (isStuckRetry) {
+          await log(`\n\n❌ API explicitly marked error as not retryable (x-should-retry: false) and session made no progress (num_turns=${resultNumTurns}) after ${retryCount} attempt(s)`, { level: 'error' });
+          await log(`   This error is not recoverable. Failing fast to avoid a stuck retry loop (Issue #1437).`, { level: 'error' });
+          await log(`   Check https://status.anthropic.com/ for API status.`, { level: 'error' });
+          return {
+            success: false,
+            sessionId,
+            limitReached: false,
+            limitResetTime: null,
+            limitTimezone: null,
+            messageCount,
+            toolUseCount,
+            is503Error,
+            anthropicTotalCostUSD,
+            resultSummary,
+          };
+        }
         if (retryCount < maxRetries) {
           const delay = Math.min(initialDelay * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), maxDelay);
-          const errorLabel = isRequestTimeout ? 'Request timeout' : isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) ? 'API overload (500)' : isInternalServerError || lastMessage.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
-          await log(`\n⚠️ ${errorLabel} detected. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
+          const errorLabel = isRequestTimeout ? 'Request timeout' : isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) || (lastMessage.includes('API Error: 529') && lastMessage.includes('Overloaded')) ? `API overload (${lastMessage.includes('529') ? '529' : '500'})` : isInternalServerError || lastMessage.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
+          const notRetryableHint = apiMarkedNotRetryable ? ' (API says not retryable — will stop early if no progress)' : '';
+          await log(`\n⚠️ ${errorLabel} detected. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)${notRetryableHint}...`, { level: 'warning' });
           await log(`   Error: ${lastMessage.substring(0, 200)}`, { verbose: true });
           if (sessionId && !argv.resume) argv.resume = sessionId; // preserve session for resume
           await waitWithCountdown(delay, log);
@@ -1263,9 +1277,8 @@ export const executeClaudeCommand = async params => {
           }
         }
       }
-      // Additional failure detection: silent failures (no messages + stderr errors).
-      // E.g., sudo timeout causing "kill EPERM" → stderr error but exit code 0.
-      // Issue #1354: Skip if result event confirmed success (definitive proof regardless of messageCount).
+      // Issue #1354: Detect silent failures (no messages + stderr errors, e.g. "kill EPERM" with exit 0).
+      // Skip if result event confirmed success (definitive proof regardless of messageCount).
       if (!commandFailed && !resultSuccessReceived && stderrErrors.length > 0 && messageCount === 0 && toolUseCount === 0) {
         commandFailed = true;
         const errorsPreview = stderrErrors
@@ -1380,7 +1393,7 @@ export const executeClaudeCommand = async params => {
       // Issue #1353: Also handle "Request timed out" in exception block
       // (Overloaded, 503, Internal Server Error, Request timed out) - all with session preservation
       const isTimeoutException = errorStr === 'Request timed out' || errorStr.includes('Request timed out');
-      const isTransientException = isTimeoutException || (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
+      const isTransientException = isTimeoutException || (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('API Error: 529') && (errorStr.includes('overloaded_error') || errorStr.includes('Overloaded'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || (errorStr.includes('overloaded_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
       if (isTransientException) {
         // Issue #1353: Use timeout-specific backoff for request timeouts
         const maxRetries = isTimeoutException ? retryLimits.maxRequestTimeoutRetries : retryLimits.maxTransientErrorRetries;
@@ -1388,7 +1401,7 @@ export const executeClaudeCommand = async params => {
         const maxDelay = isTimeoutException ? retryLimits.maxRequestTimeoutDelayMs : retryLimits.maxTransientErrorDelayMs;
         if (retryCount < maxRetries) {
           const delay = Math.min(initialDelay * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), maxDelay);
-          const errorLabel = isTimeoutException ? 'Request timeout' : errorStr.includes('Overloaded') ? 'API overload (500)' : errorStr.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
+          const errorLabel = isTimeoutException ? 'Request timeout' : errorStr.includes('Overloaded') ? `API overload (${errorStr.includes('529') ? '529' : '500'})` : errorStr.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
           await log(`\n⚠️ ${errorLabel} in exception. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
           if (sessionId && !argv.resume) argv.resume = sessionId;
           await waitWithCountdown(delay, log);

package/src/config.lib.mjs CHANGED Viewed

@@ -95,7 +95,7 @@ export const systemLimits = {
 // Retry configurations
 // Issue #1331: All API error types use unified retry parameters:
-// 10 max retries, 1 minute initial delay, 30 minute max delay (exponential backoff), session preserved
+// 10 max retries, 2 minute initial delay, 30 minute max delay (exponential backoff), session preserved
 export const retryLimits = {
   maxForkRetries: parseIntWithDefault('HIVE_MIND_MAX_FORK_RETRIES', 5),
   maxVerifyRetries: parseIntWithDefault('HIVE_MIND_MAX_VERIFY_RETRIES', 5),
@@ -103,13 +103,19 @@ export const retryLimits = {
   retryBackoffMultiplier: parseFloatWithDefault('HIVE_MIND_RETRY_BACKOFF_MULTIPLIER', 2),
   // Unified retry config for all transient API errors (Overloaded, 503, Internal Server Error)
   maxTransientErrorRetries: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_RETRIES', 10),
-  initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 60 * 1000), // 1 minute
+  initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 2 * 60 * 1000), // 2 minutes
   maxTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_DELAY_MS', 30 * 60 * 1000), // 30 minutes
   // Request timeout retry configuration (Issue #1353)
   // Network timeouts need longer waits than API errors — Claude CLI already exhausted its own retries
   maxRequestTimeoutRetries: parseIntWithDefault('HIVE_MIND_MAX_REQUEST_TIMEOUT_RETRIES', 10),
   initialRequestTimeoutDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_REQUEST_TIMEOUT_DELAY_MS', 5 * 60 * 1000), // 5 minutes
   maxRequestTimeoutDelayMs: parseIntWithDefault('HIVE_MIND_MAX_REQUEST_TIMEOUT_DELAY_MS', 60 * 60 * 1000), // 1 hour
+  // Not-retryable error fail-fast configuration (Issue #1437)
+  // When the API sends x-should-retry: false AND retries make no progress (num_turns <= 1),
+  // stop retrying after this many attempts to avoid a stuck loop with no recovery prospects.
+  // Default: 5 — retry generously even when API signals not retryable, since the signal can be wrong
+  // for transient backend glitches (e.g. overloaded errors observed as non-retryable 500s).
+  maxNotRetryableAttempts: parseIntWithDefault('HIVE_MIND_MAX_NOT_RETRYABLE_ATTEMPTS', 5),
 };
 // Claude Code CLI configurations

package/src/solve.auto-merge.lib.mjs CHANGED Viewed

@@ -739,6 +739,40 @@ Once the billing issue is resolved, you can re-run the CI checks or push a new c
                 await log('');
                 await log(formatAligned('❌', `${argv.tool.toUpperCase()} RESUME FAILED`, ''));
                 await log(formatAligned('', 'Action:', 'Stopping auto-restart — tool execution failed after limit reset', 2));
+                // Issue #1439: Attach failure log before stopping, so user can see what happened
+                const shouldAttachLogsOnResumeFail = argv.attachLogs || argv['attach-logs'];
+                if (prNumber && shouldAttachLogsOnResumeFail) {
+                  try {
+                    const logFile = getLogFile();
+                    if (logFile) {
+                      await attachLogToGitHub({
+                        logFile,
+                        targetType: 'pr',
+                        targetNumber: prNumber,
+                        owner,
+                        repo,
+                        $,
+                        log,
+                        sanitizeLogContent,
+                        verbose: argv.verbose,
+                        errorMessage: `${argv.tool.toUpperCase()} execution failed after limit reset`,
+                        sessionId: latestSessionId,
+                        tempDir,
+                        requestedModel: argv.model,
+                        tool: argv.tool || 'claude',
+                      });
+                    }
+                  } catch (logUploadError) {
+                    reportError(logUploadError, {
+                      context: 'attach_auto_restart_failure_log',
+                      prNumber,
+                      owner,
+                      repo,
+                      operation: 'upload_failure_log',
+                    });
+                    await log(formatAligned('', `⚠️  Failure log upload error: ${cleanErrorMessage(logUploadError)}`, '', 2));
+                  }
+                }
                 return { success: false, reason: 'tool_failure_after_resume', latestSessionId, latestAnthropicCost };
               }
             } else {
@@ -755,6 +789,40 @@ Once the billing issue is resolved, you can re-run the CI checks or push a new c
           await log('');
           await log(formatAligned('❌', `${argv.tool.toUpperCase()} EXECUTION FAILED`, ''));
           await log(formatAligned('', 'Action:', 'Stopping auto-restart — tool execution failed', 2));
+          // Issue #1439: Attach failure log before stopping, so user can see what happened
+          const shouldAttachLogsOnFail = argv.attachLogs || argv['attach-logs'];
+          if (prNumber && shouldAttachLogsOnFail) {
+            try {
+              const logFile = getLogFile();
+              if (logFile) {
+                await attachLogToGitHub({
+                  logFile,
+                  targetType: 'pr',
+                  targetNumber: prNumber,
+                  owner,
+                  repo,
+                  $,
+                  log,
+                  sanitizeLogContent,
+                  verbose: argv.verbose,
+                  errorMessage: `${argv.tool.toUpperCase()} execution failed`,
+                  sessionId: latestSessionId,
+                  tempDir,
+                  requestedModel: argv.model,
+                  tool: argv.tool || 'claude',
+                });
+              }
+            } catch (logUploadError) {
+              reportError(logUploadError, {
+                context: 'attach_auto_restart_failure_log',
+                prNumber,
+                owner,
+                repo,
+                operation: 'upload_failure_log',
+              });
+              await log(formatAligned('', `⚠️  Failure log upload error: ${cleanErrorMessage(logUploadError)}`, '', 2));
+            }
+          }
           return { success: false, reason: 'tool_failure', latestSessionId, latestAnthropicCost };
         } else {
           // Success - capture latest session data