@link-assistant/hive-mind 1.34.2 → 1.34.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
1
1
  # @link-assistant/hive-mind
2
2
 
3
+ ## 1.34.4
4
+
5
+ ### Patch Changes
6
+
7
+ - c3806b5: Fix missing log upload on tool failure and make HTTP 529 overload error retryable (Issue #1439)
8
+
9
+ Two fixes:
10
+ 1. When `--attach-logs` is enabled and the tool execution fails during an auto-restart session, the failure log was not being uploaded to GitHub. Now the log is attached before stopping on both tool execution failure paths.
11
+ 2. HTTP 529 (Anthropic "Overloaded") errors were not recognized as transient/retryable by the outer retry loop. The code only matched `API Error: 500` + `Overloaded`, but 529 uses `API Error: 529` + `overloaded_error`. Now both 500 and 529 overload errors trigger the retry logic with exponential backoff.
12
+
13
+ ## 1.34.3
14
+
15
+ ### Patch Changes
16
+
17
+ - 22a8868: Fail fast when API signals x-should-retry: false and retries make no progress (Issue #1437). Increase minimum retry delay to 2 minutes.
18
+
19
+ When the Anthropic API returns HTTP 500 with `x-should-retry: false` AND subsequent retries immediately fail with `num_turns <= 1`, the outer retry loop now exits early instead of waiting through up to 10 retries with exponential backoff. This prevents stuck sessions where recovery is impossible.
20
+
21
+ Two new signals are tracked: (1) `apiMarkedNotRetryable` — set when `ANTHROPIC_LOG=debug` stderr contains `"error; not retryable"` or `x-should-retry: false`; (2) `resultNumTurns` — captured from the result event to detect sessions that failed immediately on resume. If the API has marked the error as not retryable and `resultNumTurns <= 1` once `HIVE_MIND_MAX_NOT_RETRYABLE_ATTEMPTS` (default: 5) retry attempts have been made, the loop fails fast with a clear error message instead of working through the remaining retries.
22
+
23
+ The minimum retry delay for transient API errors (Overloaded, 503, Internal Server Error) is increased from 1 minute to 2 minutes (`HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS`), giving the API more time to recover between retries.
24
+
3
25
  ## 1.34.2
4
26
 
5
27
  ### Patch Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@link-assistant/hive-mind",
3
- "version": "1.34.2",
3
+ "version": "1.34.4",
4
4
  "description": "AI-powered issue solver and hive mind for collaborative problem solving",
5
5
  "main": "src/hive.mjs",
6
6
  "type": "module",
@@ -129,8 +129,8 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
129
129
  return null;
130
130
  };
131
131
  const jsonError = checkForJsonError(stdout) || checkForJsonError(stderr);
132
- // Check for API overload error pattern
133
- const isOverloadError = (stdout.includes('API Error: 500') && stdout.includes('Overloaded')) || (stderr.includes('API Error: 500') && stderr.includes('Overloaded')) || (jsonError && jsonError.type === 'api_error' && jsonError.message === 'Overloaded');
132
+ // Check for API overload error pattern (Issue #1439: also detect 529 overloaded_error)
133
+ const isOverloadError = (stdout.includes('API Error: 500') && stdout.includes('Overloaded')) || (stdout.includes('API Error: 529') && stdout.includes('Overloaded')) || (stderr.includes('API Error: 500') && stderr.includes('Overloaded')) || (stderr.includes('API Error: 529') && stderr.includes('Overloaded')) || (jsonError && (jsonError.type === 'api_error' || jsonError.type === 'overloaded_error') && jsonError.message === 'Overloaded');
134
134
  // Handle overload errors with retry
135
135
  if (isOverloadError) {
136
136
  if (retryCount < maxRetries) {
@@ -168,7 +168,7 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
168
168
  }
169
169
  // Check for error patterns in successful response
170
170
  if (jsonError) {
171
- if (jsonError.type === 'api_error' && jsonError.message === 'Overloaded') {
171
+ if ((jsonError.type === 'api_error' || jsonError.type === 'overloaded_error') && jsonError.message === 'Overloaded') {
172
172
  if (retryCount < maxRetries) {
173
173
  const delay = baseDelay * Math.pow(2, retryCount);
174
174
  await log(`⚠️ API overload error in response. Retrying in ${delay / 1000} seconds...`, {
@@ -193,7 +193,7 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
193
193
  return true;
194
194
  } catch (error) {
195
195
  const errorStr = error.message || error.toString();
196
- if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded'))) {
196
+ if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('API Error: 529') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || (errorStr.includes('overloaded_error') && errorStr.includes('Overloaded'))) {
197
197
  if (retryCount < maxRetries) {
198
198
  const delay = baseDelay * Math.pow(2, retryCount);
199
199
  await log(`⚠️ API overload error during validation. Retrying in ${delay / 1000} seconds...`, {
@@ -841,6 +841,8 @@ export const executeClaudeCommand = async params => {
841
841
  let is503Error = false;
842
842
  let isInternalServerError = false; // Issue #1331: Track 500 Internal server error
843
843
  let isRequestTimeout = false; // Issue #1353: Track "Request timed out" from Claude CLI
844
+ let apiMarkedNotRetryable = false; // Issue #1437: Track when API explicitly signals x-should-retry: false
845
+ let resultNumTurns = 0; // Issue #1437: Track num_turns from result event to detect stuck retries
844
846
  let stderrErrors = [];
845
847
  let resultSuccessReceived = false; // Issue #1354: Track if result success event was received
846
848
  let anthropicTotalCostUSD = null; // Capture Anthropic's official total_cost_usd from result
@@ -881,14 +883,10 @@ export const executeClaudeCommand = async params => {
881
883
  try {
882
884
  // Resolve thinking settings (see issue #1146)
883
885
  const { thinkingBudget: resolvedThinkingBudget, thinkLevel, isNewVersion, maxBudget } = await resolveThinkingSettings(argv, log);
884
- // Set CLAUDE_CODE_MAX_OUTPUT_TOKENS (see issue #1076), MAX_THINKING_TOKENS (see issue #1146),
885
- // MCP timeout configurations (see issue #1066), and CLAUDE_CODE_EFFORT_LEVEL for Opus 4.6 (Issue #1238)
886
- // Pass model for model-specific max output tokens (Issue #1221)
887
- // Pass thinkLevel and maxBudget for Opus 4.6 effort level conversion (Issue #1238)
886
+ // Set CLAUDE_CODE_MAX_OUTPUT_TOKENS (#1076), MAX_THINKING_TOKENS (#1146), MCP timeout (#1066),
887
+ // CLAUDE_CODE_EFFORT_LEVEL (#1238), model/thinkLevel/maxBudget for effort conversion (#1221, #1238)
888
888
  const claudeEnv = getClaudeEnv({ thinkingBudget: resolvedThinkingBudget, model: mappedModel, thinkLevel, maxBudget });
889
- // Issue #1337: Enable ANTHROPIC_LOG=debug in --verbose mode to diagnose slow API requests.
890
- // The BashTool pre-flight check suggests "Run with ANTHROPIC_LOG=debug to check for failed or slow API requests."
891
- // When --verbose is enabled, we propagate ANTHROPIC_LOG=debug so users can see detailed API request info.
889
+ // Issue #1337: Enable ANTHROPIC_LOG=debug in --verbose mode for detailed API request diagnostics.
892
890
  if (argv.verbose) {
893
891
  claudeEnv.ANTHROPIC_LOG = 'debug';
894
892
  }
@@ -923,14 +921,9 @@ export const executeClaudeCommand = async params => {
923
921
  // Issue #1183: Line buffer for NDJSON stream parsing - accumulate incomplete lines across chunks
924
922
  // Long JSON messages (e.g., result with total_cost_usd) may be split across multiple stdout chunks
925
923
  let stdoutLineBuffer = '';
926
- // Issue #1280: Track result event and timeout for hung processes
927
- // Root cause: command-stream's stream() async iterator waits for BOTH process exit AND
928
- // stdout/stderr pipe close before emitting 'end'. If the CLI process keeps stdout open after
929
- // sending the result event, pumpReadable() hangs → finish() never fires → stream never ends.
930
- // Additionally, command-stream v0.9.4 does NOT yield {type:'exit'} chunks from stream(),
931
- // so the exit code detection via chunk.type==='exit' below is dead code.
932
- // Workaround: after receiving the result event, start a timeout to force-kill the process.
933
- // See: https://github.com/link-foundation/command-stream/issues/155
924
+ // Issue #1280: Track result event and timeout for hung processes.
925
+ // command-stream's stream() waits for BOTH process exit AND stdout pipe close; if stdout stays open
926
+ // the stream hangs. Workaround: force-kill after result event. See command-stream/issues/155
934
927
  let resultEventReceived = false;
935
928
  let resultTimeoutId = null;
936
929
  let forceExitTriggered = false;
@@ -1025,12 +1018,16 @@ export const executeClaudeCommand = async params => {
1025
1018
  } else if (data.total_cost_usd !== undefined && data.total_cost_usd !== null) {
1026
1019
  await log(`💰 Anthropic cost from ${data.subtype || 'unknown'} result ignored: $${data.total_cost_usd.toFixed(6)}`, { verbose: true });
1027
1020
  }
1028
- // Issue #1263: Extract result summary for --attach-solution-summary and --auto-attach-solution-summary
1029
- // The result field contains the AI's summary of the work done
1021
+ // Issue #1263: Extract result summary (AI's summary of work done) for --attach-solution-summary
1030
1022
  if (data.subtype === 'success' && data.result && typeof data.result === 'string') {
1031
1023
  resultSummary = data.result;
1032
1024
  await log('📝 Captured result summary from Claude output', { verbose: true });
1033
1025
  }
1026
+ // Issue #1437: Capture num_turns to detect stuck retries (degrading turn count signals non-recovery)
1027
+ if (data.num_turns !== undefined) {
1028
+ resultNumTurns = data.num_turns;
1029
+ await log(`📊 Session num_turns: ${resultNumTurns}`, { verbose: true });
1030
+ }
1034
1031
  if (data.is_error === true) {
1035
1032
  lastMessage = data.result || JSON.stringify(data);
1036
1033
  const subtype = data.subtype || 'unknown';
@@ -1070,11 +1067,11 @@ export const executeClaudeCommand = async params => {
1070
1067
  const content = Array.isArray(data.message.content) ? data.message.content : [data.message.content];
1071
1068
  for (const item of content) {
1072
1069
  if (item.type === 'text' && item.text) {
1073
- // Check for the specific 500 overload error pattern
1074
- if (item.text.includes('API Error: 500') && item.text.includes('api_error') && item.text.includes('Overloaded')) {
1070
+ // Check for the specific 500/529 overload error pattern (Issue #1439: 529 is also an overload)
1071
+ if ((item.text.includes('API Error: 500') || item.text.includes('API Error: 529')) && (item.text.includes('api_error') || item.text.includes('overloaded_error')) && item.text.includes('Overloaded')) {
1075
1072
  isOverloadError = true;
1076
1073
  lastMessage = item.text;
1077
- await log('⚠️ Detected API overload error', { verbose: true });
1074
+ await log(`⚠️ Detected API overload error${item.text.includes('529') ? ' (529)' : ' (500)'}`, { verbose: true });
1078
1075
  }
1079
1076
  if (item.text.includes('API Error: 500') && item.text.includes('Internal server error') && !item.text.includes('Overloaded')) {
1080
1077
  isInternalServerError = true;
@@ -1111,10 +1108,7 @@ export const executeClaudeCommand = async params => {
1111
1108
  await log(line, { stream: 'raw' });
1112
1109
  lastMessage = line;
1113
1110
 
1114
- // Detect Claude Code terms acceptance message (Issue #1015)
1115
- // When Claude CLI requires terms acceptance, it outputs a non-JSON message like:
1116
- // "[ACTION REQUIRED] An update to our Consumer Terms and Privacy Policy has taken effect..."
1117
- // This should be treated as an error requiring human intervention, not success
1111
+ // Issue #1015: Detect terms acceptance prompt (non-JSON "[ACTION REQUIRED]..." message)
1118
1112
  const termsAcceptancePattern = /\[ACTION REQUIRED\].*terms|must run.*claude.*review.*terms/i;
1119
1113
  if (termsAcceptancePattern.test(line)) {
1120
1114
  commandFailed = true;
@@ -1129,11 +1123,16 @@ export const executeClaudeCommand = async params => {
1129
1123
  // Log stderr immediately
1130
1124
  if (errorOutput) {
1131
1125
  await log(errorOutput, { stream: 'stderr' });
1132
- // Issue #1354: Split multi-line stderr chunks and check each line individually.
1133
- // A single chunk may contain multiple newline-separated JSON messages (e.g. two
1134
- // consecutive {"level":"warn",...} lines). Passing the whole chunk to isStderrError()
1135
- // causes JSON.parse() to fail (multi-object is not valid JSON), falling through to
1136
- // keyword matching and producing false positives on words like "failed".
1126
+ // Issue #1437: Detect x-should-retry: false in ANTHROPIC_LOG=debug output, which signals
1127
+ // a non-transient error; record it so the retry logic can fail fast instead of blindly retrying.
1128
+ if (errorOutput.includes('not retryable') || errorOutput.includes("'x-should-retry': 'false'") || errorOutput.includes('"x-should-retry": "false"')) {
1129
+ if (!apiMarkedNotRetryable) {
1130
+ apiMarkedNotRetryable = true;
1131
+ await log('⚠️ API signaled error is not retryable (x-should-retry: false)', { verbose: true });
1132
+ }
1133
+ }
1134
+ // Issue #1354: Split multi-line chunks — a chunk may contain multiple JSON messages;
1135
+ // passing the whole chunk to isStderrError() causes JSON.parse() to fail.
1137
1136
  for (const line of errorOutput.split('\n')) {
1138
1137
  if (isStderrError(line)) {
1139
1138
  stderrErrors.push(line.trim());
@@ -1141,9 +1140,7 @@ export const executeClaudeCommand = async params => {
1141
1140
  }
1142
1141
  }
1143
1142
  } else if (chunk.type === 'exit') {
1144
- // Note: command-stream v0.9.4 stream() does NOT yield exit chunks (Issue #1280).
1145
- // Exit code is obtained from execCommand.result.code after the loop.
1146
- // This branch is kept for forward-compatibility if command-stream adds exit chunks.
1143
+ // Note: command-stream v0.9.4 stream() does NOT yield exit chunks (Issue #1280) — kept for forward-compat.
1147
1144
  exitCode = chunk.code;
1148
1145
  if (chunk.code !== 0) {
1149
1146
  commandFailed = true;
@@ -1172,9 +1169,7 @@ export const executeClaudeCommand = async params => {
1172
1169
  await log('✅ Stream closed normally after result event', { verbose: true });
1173
1170
  }
1174
1171
  }
1175
- // Issue #1165: Check actual exit code from command result for more reliable detection
1176
- // The .stream() method may not emit 'exit' chunks, but the command object still tracks the exit code
1177
- // Exit code 127 is the standard Unix convention for "command not found"
1172
+ // Issue #1165: Check actual exit code from command result (stream() may not emit 'exit' chunks)
1178
1173
  if (execCommand.result && typeof execCommand.result.code === 'number') {
1179
1174
  const resultExitCode = execCommand.result.code;
1180
1175
  if (exitCode === 0 && resultExitCode !== 0) {
@@ -1197,20 +1192,39 @@ export const executeClaudeCommand = async params => {
1197
1192
  }
1198
1193
  }
1199
1194
 
1200
- // Issue #1331: Unified handler for all transient API errors (Overloaded, 503, Internal Server Error)
1201
- // Issue #1353: Also handle "Request timed out" Claude CLI times out after exhausting its own retries
1202
- // All use exponential backoff with session preservation via --resume
1203
- const isTransientError = isOverloadError || isInternalServerError || is503Error || isRequestTimeout || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure'))) || lastMessage === 'Request timed out' || lastMessage.includes('Request timed out');
1195
+ // Issues #1331, #1353: Unified handler for transient API errors (Overloaded, 503, Internal Server Error,
1196
+ // Request timed out). All use exponential backoff with session preservation via --resume.
1197
+ const isTransientError = isOverloadError || isInternalServerError || is503Error || isRequestTimeout || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('API Error: 529') && (lastMessage.includes('overloaded_error') || lastMessage.includes('Overloaded'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || (lastMessage.includes('overloaded_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure'))) || lastMessage === 'Request timed out' || lastMessage.includes('Request timed out');
1204
1198
  if ((commandFailed || isTransientError) && isTransientError) {
1205
- // Issue #1353: Use timeout-specific backoff params (5min–1hr) vs general transient params (1min–30min)
1206
- // Timeouts indicate network instability — Claude CLI already exhausted its own retries, so we need longer waits
1199
+ // Issue #1353: Timeouts use longer backoff (5min–1hr) vs general transient (2min–30min)
1207
1200
  const maxRetries = isRequestTimeout ? retryLimits.maxRequestTimeoutRetries : retryLimits.maxTransientErrorRetries;
1208
1201
  const initialDelay = isRequestTimeout ? retryLimits.initialRequestTimeoutDelayMs : retryLimits.initialTransientErrorDelayMs;
1209
1202
  const maxDelay = isRequestTimeout ? retryLimits.maxRequestTimeoutDelayMs : retryLimits.maxTransientErrorDelayMs;
1203
+ // Issue #1437: Fail fast when API signals x-should-retry: false AND session made no progress
1204
+ // (num_turns <= 1). Allow maxNotRetryableAttempts before giving up (signal can be wrong sometimes).
1205
+ const isStuckRetry = apiMarkedNotRetryable && retryCount >= retryLimits.maxNotRetryableAttempts && resultNumTurns <= 1;
1206
+ if (isStuckRetry) {
1207
+ await log(`\n\n❌ API explicitly marked error as not retryable (x-should-retry: false) and session made no progress (num_turns=${resultNumTurns}) after ${retryCount} attempt(s)`, { level: 'error' });
1208
+ await log(` This error is not recoverable. Failing fast to avoid a stuck retry loop (Issue #1437).`, { level: 'error' });
1209
+ await log(` Check https://status.anthropic.com/ for API status.`, { level: 'error' });
1210
+ return {
1211
+ success: false,
1212
+ sessionId,
1213
+ limitReached: false,
1214
+ limitResetTime: null,
1215
+ limitTimezone: null,
1216
+ messageCount,
1217
+ toolUseCount,
1218
+ is503Error,
1219
+ anthropicTotalCostUSD,
1220
+ resultSummary,
1221
+ };
1222
+ }
1210
1223
  if (retryCount < maxRetries) {
1211
1224
  const delay = Math.min(initialDelay * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), maxDelay);
1212
- const errorLabel = isRequestTimeout ? 'Request timeout' : isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) ? 'API overload (500)' : isInternalServerError || lastMessage.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
1213
- await log(`\n⚠️ ${errorLabel} detected. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
1225
+ const errorLabel = isRequestTimeout ? 'Request timeout' : isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) || (lastMessage.includes('API Error: 529') && lastMessage.includes('Overloaded')) ? `API overload (${lastMessage.includes('529') ? '529' : '500'})` : isInternalServerError || lastMessage.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
1226
+ const notRetryableHint = apiMarkedNotRetryable ? ' (API says not retryable will stop early if no progress)' : '';
1227
+ await log(`\n⚠️ ${errorLabel} detected. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)${notRetryableHint}...`, { level: 'warning' });
1214
1228
  await log(` Error: ${lastMessage.substring(0, 200)}`, { verbose: true });
1215
1229
  if (sessionId && !argv.resume) argv.resume = sessionId; // preserve session for resume
1216
1230
  await waitWithCountdown(delay, log);
@@ -1263,9 +1277,8 @@ export const executeClaudeCommand = async params => {
1263
1277
  }
1264
1278
  }
1265
1279
  }
1266
- // Additional failure detection: silent failures (no messages + stderr errors).
1267
- // E.g., sudo timeout causing "kill EPERM" stderr error but exit code 0.
1268
- // Issue #1354: Skip if result event confirmed success (definitive proof regardless of messageCount).
1280
+ // Issue #1354: Detect silent failures (no messages + stderr errors, e.g. "kill EPERM" with exit 0).
1281
+ // Skip if result event confirmed success (definitive proof regardless of messageCount).
1269
1282
  if (!commandFailed && !resultSuccessReceived && stderrErrors.length > 0 && messageCount === 0 && toolUseCount === 0) {
1270
1283
  commandFailed = true;
1271
1284
  const errorsPreview = stderrErrors
@@ -1380,7 +1393,7 @@ export const executeClaudeCommand = async params => {
1380
1393
  // Issue #1353: Also handle "Request timed out" in exception block
1381
1394
  // (Overloaded, 503, Internal Server Error, Request timed out) - all with session preservation
1382
1395
  const isTimeoutException = errorStr === 'Request timed out' || errorStr.includes('Request timed out');
1383
- const isTransientException = isTimeoutException || (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
1396
+ const isTransientException = isTimeoutException || (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('API Error: 529') && (errorStr.includes('overloaded_error') || errorStr.includes('Overloaded'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || (errorStr.includes('overloaded_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
1384
1397
  if (isTransientException) {
1385
1398
  // Issue #1353: Use timeout-specific backoff for request timeouts
1386
1399
  const maxRetries = isTimeoutException ? retryLimits.maxRequestTimeoutRetries : retryLimits.maxTransientErrorRetries;
@@ -1388,7 +1401,7 @@ export const executeClaudeCommand = async params => {
1388
1401
  const maxDelay = isTimeoutException ? retryLimits.maxRequestTimeoutDelayMs : retryLimits.maxTransientErrorDelayMs;
1389
1402
  if (retryCount < maxRetries) {
1390
1403
  const delay = Math.min(initialDelay * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), maxDelay);
1391
- const errorLabel = isTimeoutException ? 'Request timeout' : errorStr.includes('Overloaded') ? 'API overload (500)' : errorStr.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
1404
+ const errorLabel = isTimeoutException ? 'Request timeout' : errorStr.includes('Overloaded') ? `API overload (${errorStr.includes('529') ? '529' : '500'})` : errorStr.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
1392
1405
  await log(`\n⚠️ ${errorLabel} in exception. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
1393
1406
  if (sessionId && !argv.resume) argv.resume = sessionId;
1394
1407
  await waitWithCountdown(delay, log);
@@ -95,7 +95,7 @@ export const systemLimits = {
95
95
 
96
96
  // Retry configurations
97
97
  // Issue #1331: All API error types use unified retry parameters:
98
- // 10 max retries, 1 minute initial delay, 30 minute max delay (exponential backoff), session preserved
98
+ // 10 max retries, 2 minute initial delay, 30 minute max delay (exponential backoff), session preserved
99
99
  export const retryLimits = {
100
100
  maxForkRetries: parseIntWithDefault('HIVE_MIND_MAX_FORK_RETRIES', 5),
101
101
  maxVerifyRetries: parseIntWithDefault('HIVE_MIND_MAX_VERIFY_RETRIES', 5),
@@ -103,13 +103,19 @@ export const retryLimits = {
103
103
  retryBackoffMultiplier: parseFloatWithDefault('HIVE_MIND_RETRY_BACKOFF_MULTIPLIER', 2),
104
104
  // Unified retry config for all transient API errors (Overloaded, 503, Internal Server Error)
105
105
  maxTransientErrorRetries: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_RETRIES', 10),
106
- initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 60 * 1000), // 1 minute
106
+ initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 2 * 60 * 1000), // 2 minutes
107
107
  maxTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_DELAY_MS', 30 * 60 * 1000), // 30 minutes
108
108
  // Request timeout retry configuration (Issue #1353)
109
109
  // Network timeouts need longer waits than API errors — Claude CLI already exhausted its own retries
110
110
  maxRequestTimeoutRetries: parseIntWithDefault('HIVE_MIND_MAX_REQUEST_TIMEOUT_RETRIES', 10),
111
111
  initialRequestTimeoutDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_REQUEST_TIMEOUT_DELAY_MS', 5 * 60 * 1000), // 5 minutes
112
112
  maxRequestTimeoutDelayMs: parseIntWithDefault('HIVE_MIND_MAX_REQUEST_TIMEOUT_DELAY_MS', 60 * 60 * 1000), // 1 hour
113
+ // Not-retryable error fail-fast configuration (Issue #1437)
114
+ // When the API sends x-should-retry: false AND retries make no progress (num_turns <= 1),
115
+ // stop retrying after this many attempts to avoid a stuck loop with no recovery prospects.
116
+ // Default: 5 — retry generously even when API signals not retryable, since the signal can be wrong
117
+ // for transient backend glitches (e.g. overloaded errors observed as non-retryable 500s).
118
+ maxNotRetryableAttempts: parseIntWithDefault('HIVE_MIND_MAX_NOT_RETRYABLE_ATTEMPTS', 5),
113
119
  };
114
120
 
115
121
  // Claude Code CLI configurations
@@ -739,6 +739,40 @@ Once the billing issue is resolved, you can re-run the CI checks or push a new c
739
739
  await log('');
740
740
  await log(formatAligned('❌', `${argv.tool.toUpperCase()} RESUME FAILED`, ''));
741
741
  await log(formatAligned('', 'Action:', 'Stopping auto-restart — tool execution failed after limit reset', 2));
742
+ // Issue #1439: Attach failure log before stopping, so user can see what happened
743
+ const shouldAttachLogsOnResumeFail = argv.attachLogs || argv['attach-logs'];
744
+ if (prNumber && shouldAttachLogsOnResumeFail) {
745
+ try {
746
+ const logFile = getLogFile();
747
+ if (logFile) {
748
+ await attachLogToGitHub({
749
+ logFile,
750
+ targetType: 'pr',
751
+ targetNumber: prNumber,
752
+ owner,
753
+ repo,
754
+ $,
755
+ log,
756
+ sanitizeLogContent,
757
+ verbose: argv.verbose,
758
+ errorMessage: `${argv.tool.toUpperCase()} execution failed after limit reset`,
759
+ sessionId: latestSessionId,
760
+ tempDir,
761
+ requestedModel: argv.model,
762
+ tool: argv.tool || 'claude',
763
+ });
764
+ }
765
+ } catch (logUploadError) {
766
+ reportError(logUploadError, {
767
+ context: 'attach_auto_restart_failure_log',
768
+ prNumber,
769
+ owner,
770
+ repo,
771
+ operation: 'upload_failure_log',
772
+ });
773
+ await log(formatAligned('', `⚠️ Failure log upload error: ${cleanErrorMessage(logUploadError)}`, '', 2));
774
+ }
775
+ }
742
776
  return { success: false, reason: 'tool_failure_after_resume', latestSessionId, latestAnthropicCost };
743
777
  }
744
778
  } else {
@@ -755,6 +789,40 @@ Once the billing issue is resolved, you can re-run the CI checks or push a new c
755
789
  await log('');
756
790
  await log(formatAligned('❌', `${argv.tool.toUpperCase()} EXECUTION FAILED`, ''));
757
791
  await log(formatAligned('', 'Action:', 'Stopping auto-restart — tool execution failed', 2));
792
+ // Issue #1439: Attach failure log before stopping, so user can see what happened
793
+ const shouldAttachLogsOnFail = argv.attachLogs || argv['attach-logs'];
794
+ if (prNumber && shouldAttachLogsOnFail) {
795
+ try {
796
+ const logFile = getLogFile();
797
+ if (logFile) {
798
+ await attachLogToGitHub({
799
+ logFile,
800
+ targetType: 'pr',
801
+ targetNumber: prNumber,
802
+ owner,
803
+ repo,
804
+ $,
805
+ log,
806
+ sanitizeLogContent,
807
+ verbose: argv.verbose,
808
+ errorMessage: `${argv.tool.toUpperCase()} execution failed`,
809
+ sessionId: latestSessionId,
810
+ tempDir,
811
+ requestedModel: argv.model,
812
+ tool: argv.tool || 'claude',
813
+ });
814
+ }
815
+ } catch (logUploadError) {
816
+ reportError(logUploadError, {
817
+ context: 'attach_auto_restart_failure_log',
818
+ prNumber,
819
+ owner,
820
+ repo,
821
+ operation: 'upload_failure_log',
822
+ });
823
+ await log(formatAligned('', `⚠️ Failure log upload error: ${cleanErrorMessage(logUploadError)}`, '', 2));
824
+ }
825
+ }
758
826
  return { success: false, reason: 'tool_failure', latestSessionId, latestAnthropicCost };
759
827
  } else {
760
828
  // Success - capture latest session data