@link-assistant/hive-mind 1.24.1 → 1.24.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # @link-assistant/hive-mind
2
2
 
3
+ ## 1.24.2
4
+
5
+ ### Patch Changes
6
+
7
+ - a74e10c: fix: add auto-resume with session preservation on Internal Server Error (Issue #1331)
8
+
9
+ When the Claude tool returns `API Error: 500 Internal server error`, the command is automatically retried with exponential backoff: the delay starts at 1 minute and is capped at 30 minutes per retry, with up to 10 retries. The session ID is preserved so Claude Code can resume from where it left off using `--resume <sessionId>`.
10
+
3
11
  ## 1.24.1
4
12
 
5
13
  ### Patch Changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@link-assistant/hive-mind",
3
- "version": "1.24.1",
3
+ "version": "1.24.2",
4
4
  "description": "AI-powered issue solver and hive mind for collaborative problem solving",
5
5
  "main": "src/hive.mjs",
6
6
  "type": "module",
@@ -747,17 +747,30 @@ export const executeClaudeCommand = async params => {
747
747
  repo,
748
748
  prNumber,
749
749
  } = params;
750
- // Retry configuration for API overload errors
751
- const maxRetries = 3;
752
- const baseDelay = timeouts.retryBaseDelay;
750
+ // Issue #1331: Unified retry configuration for all transient API errors
751
+ // (Overloaded, 503 Network Error, Internal Server Error) - same params, all with session preservation
753
752
  let retryCount = 0;
753
+ // Helper: wait with per-minute countdown for delays >1 minute (Issue #1331)
754
+ const waitWithCountdown = async (delayMs, log) => {
755
+ if (delayMs <= 60000) {
756
+ await new Promise(resolve => setTimeout(resolve, delayMs));
757
+ return;
758
+ }
759
+ let remaining = delayMs;
760
+ const timer = setInterval(async () => {
761
+ remaining -= 60000;
762
+ if (remaining > 0) await log(`⏳ ${Math.round(remaining / 60000)} min remaining...`);
763
+ }, 60000);
764
+ await new Promise(resolve => setTimeout(resolve, delayMs));
765
+ clearInterval(timer);
766
+ };
754
767
  // Function to execute with retry logic
755
768
  const executeWithRetry = async () => {
756
769
  // Execute claude command from the cloned repository directory
757
770
  if (retryCount === 0) {
758
771
  await log(`\n${formatAligned('🤖', 'Executing Claude:', argv.model.toUpperCase())}`);
759
772
  } else {
760
- await log(`\n${formatAligned('🔄', 'Retry attempt:', `${retryCount}/${maxRetries}`)}`);
773
+ await log(`\n${formatAligned('🔄', 'Retry attempt:', `${retryCount}/${retryLimits.maxTransientErrorRetries}`)}`);
761
774
  }
762
775
  if (argv.verbose) {
763
776
  // Output the actual model being used
@@ -789,6 +802,7 @@ export const executeClaudeCommand = async params => {
789
802
  let lastMessage = '';
790
803
  let isOverloadError = false;
791
804
  let is503Error = false;
805
+ let isInternalServerError = false; // Issue #1331: Track 500 Internal server error
792
806
  let stderrErrors = [];
793
807
  let anthropicTotalCostUSD = null; // Capture Anthropic's official total_cost_usd from result
794
808
  let errorDuringExecution = false; // Issue #1088: Track if error_during_execution subtype occurred
@@ -979,6 +993,9 @@ export const executeClaudeCommand = async params => {
979
993
  limitReached = true;
980
994
  await log('⚠️ Detected session limit in result', { verbose: true });
981
995
  }
996
+ if (lastMessage.includes('Internal server error') && !lastMessage.includes('Overloaded')) {
997
+ isInternalServerError = true;
998
+ }
982
999
  }
983
1000
  }
984
1001
  // Store last message for error detection
@@ -986,6 +1003,9 @@ export const executeClaudeCommand = async params => {
986
1003
  lastMessage = data.text;
987
1004
  } else if (data.type === 'error') {
988
1005
  lastMessage = data.error || JSON.stringify(data);
1006
+ if (lastMessage.includes('Internal server error')) {
1007
+ isInternalServerError = true;
1008
+ }
989
1009
  }
990
1010
  // Check for API overload error and 503 errors
991
1011
  if (data.type === 'assistant' && data.message && data.message.content) {
@@ -998,6 +1018,10 @@ export const executeClaudeCommand = async params => {
998
1018
  lastMessage = item.text;
999
1019
  await log('⚠️ Detected API overload error', { verbose: true });
1000
1020
  }
1021
+ if (item.text.includes('API Error: 500') && item.text.includes('Internal server error') && !item.text.includes('Overloaded')) {
1022
+ isInternalServerError = true;
1023
+ lastMessage = item.text;
1024
+ }
1001
1025
  // Check for 503 errors
1002
1026
  if (item.text.includes('API Error: 503') || (item.text.includes('503') && item.text.includes('upstream connect error')) || (item.text.includes('503') && item.text.includes('remote connection failure'))) {
1003
1027
  is503Error = true;
@@ -1110,64 +1134,22 @@ export const executeClaudeCommand = async params => {
1110
1134
  }
1111
1135
  }
1112
1136
 
1113
- if ((commandFailed || isOverloadError) && (isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')))) {
1114
- if (retryCount < maxRetries) {
1115
- // Calculate exponential backoff delay
1116
- const delay = baseDelay * Math.pow(2, retryCount);
1117
- await log(`\n⚠️ API overload error detected. Retrying in ${delay / 1000} seconds...`, { level: 'warning' });
1118
- await log(` Error: ${lastMessage.substring(0, 200)}`, { verbose: true });
1119
- // Wait before retrying
1120
- await new Promise(resolve => setTimeout(resolve, delay));
1121
- // Increment retry count and retry
1122
- retryCount++;
1123
- return await executeWithRetry();
1124
- } else {
1125
- await log(`\n\n❌ API overload error persisted after ${maxRetries} retries\n The API appears to be heavily loaded. Please try again later.`, { level: 'error' });
1126
- return {
1127
- success: false,
1128
- sessionId,
1129
- limitReached: false,
1130
- limitResetTime: null,
1131
- limitTimezone: null,
1132
- messageCount,
1133
- toolUseCount,
1134
- anthropicTotalCostUSD, // Issue #1104: Include cost even on failure
1135
- resultSummary, // Issue #1263: Include result summary
1136
- };
1137
- }
1138
- }
1139
- if ((commandFailed || is503Error) && argv.autoResumeOnErrors && (is503Error || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && lastMessage.includes('upstream connect error')) || (lastMessage.includes('503') && lastMessage.includes('remote connection failure')))) {
1140
- if (retryCount < retryLimits.max503Retries) {
1141
- // Calculate exponential backoff delay starting from 5 minutes
1142
- const delay = retryLimits.initial503RetryDelayMs * Math.pow(retryLimits.retryBackoffMultiplier, retryCount);
1143
- const delayMinutes = Math.round(delay / (1000 * 60));
1144
- await log(`\n⚠️ 503 network error detected. Retrying in ${delayMinutes} minutes...`, { level: 'warning' });
1137
+ // Issue #1331: Unified handler for all transient API errors (Overloaded, 503, Internal Server Error)
1138
+ // All use same params: 10 retries, 1min initial, 30min max, exponential backoff, session preserved
1139
+ const isTransientError = isOverloadError || isInternalServerError || is503Error || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure')));
1140
+ if ((commandFailed || isTransientError) && isTransientError) {
1141
+ if (retryCount < retryLimits.maxTransientErrorRetries) {
1142
+ const delay = Math.min(retryLimits.initialTransientErrorDelayMs * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), retryLimits.maxTransientErrorDelayMs);
1143
+ const errorLabel = isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) ? 'API overload (500)' : isInternalServerError || lastMessage.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
1144
+ await log(`\n⚠️ ${errorLabel} detected. Retry ${retryCount + 1}/${retryLimits.maxTransientErrorRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
1145
1145
  await log(` Error: ${lastMessage.substring(0, 200)}`, { verbose: true });
1146
- await log(` Retry ${retryCount + 1}/${retryLimits.max503Retries}`, { verbose: true });
1147
- // Show countdown for long waits
1148
- if (delay > 60000) {
1149
- const countdownInterval = 60000; // Every minute
1150
- let remainingMs = delay;
1151
- const countdownTimer = setInterval(async () => {
1152
- remainingMs -= countdownInterval;
1153
- if (remainingMs > 0) {
1154
- const remainingMinutes = Math.round(remainingMs / (1000 * 60));
1155
- await log(`⏳ ${remainingMinutes} minutes remaining until retry...`);
1156
- }
1157
- }, countdownInterval);
1158
- // Wait before retrying
1159
- await new Promise(resolve => setTimeout(resolve, delay));
1160
- clearInterval(countdownTimer);
1161
- } else {
1162
- // Wait before retrying
1163
- await new Promise(resolve => setTimeout(resolve, delay));
1164
- }
1146
+ if (sessionId && !argv.resume) argv.resume = sessionId; // preserve session for resume
1147
+ await waitWithCountdown(delay, log);
1165
1148
  await log('\n🔄 Retrying now...');
1166
- // Increment retry count and retry
1167
1149
  retryCount++;
1168
1150
  return await executeWithRetry();
1169
1151
  } else {
1170
- await log(`\n\n❌ 503 network error persisted after ${retryLimits.max503Retries} retries\n The Anthropic API appears to be experiencing network issues.\n Please try again later or check https://status.anthropic.com/`, { level: 'error' });
1152
+ await log(`\n\n❌ Transient API error persisted after ${retryLimits.maxTransientErrorRetries} retries\n Please try again later or check https://status.anthropic.com/`, { level: 'error' });
1171
1153
  return {
1172
1154
  success: false,
1173
1155
  sessionId,
@@ -1176,7 +1158,7 @@ export const executeClaudeCommand = async params => {
1176
1158
  limitTimezone: null,
1177
1159
  messageCount,
1178
1160
  toolUseCount,
1179
- is503Error: true,
1161
+ is503Error, // preserve for callers that check this
1180
1162
  anthropicTotalCostUSD, // Issue #1104: Include cost even on failure
1181
1163
  resultSummary, // Issue #1263: Include result summary
1182
1164
  };
@@ -1338,31 +1320,17 @@ export const executeClaudeCommand = async params => {
1338
1320
  operation: 'run_claude_command',
1339
1321
  });
1340
1322
  const errorStr = error.message || error.toString();
1341
- if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded'))) {
1342
- if (retryCount < maxRetries) {
1343
- // Calculate exponential backoff delay
1344
- const delay = baseDelay * Math.pow(2, retryCount);
1345
- await log(`\n⚠️ API overload error in exception. Retrying in ${delay / 1000} seconds...`, {
1346
- level: 'warning',
1347
- });
1348
- // Wait before retrying
1349
- await new Promise(resolve => setTimeout(resolve, delay));
1350
- // Increment retry count and retry
1351
- retryCount++;
1352
- return await executeWithRetry();
1353
- }
1354
- }
1355
- if (argv.autoResumeOnErrors && (errorStr.includes('API Error: 503') || (errorStr.includes('503') && errorStr.includes('upstream connect error')) || (errorStr.includes('503') && errorStr.includes('remote connection failure')))) {
1356
- if (retryCount < retryLimits.max503Retries) {
1357
- // Calculate exponential backoff delay starting from 5 minutes
1358
- const delay = retryLimits.initial503RetryDelayMs * Math.pow(retryLimits.retryBackoffMultiplier, retryCount);
1359
- const delayMinutes = Math.round(delay / (1000 * 60));
1360
- await log(`\n⚠️ 503 network error in exception. Retrying in ${delayMinutes} minutes...`, {
1361
- level: 'warning',
1362
- });
1363
- // Wait before retrying
1364
- await new Promise(resolve => setTimeout(resolve, delay));
1365
- // Increment retry count and retry
1323
+ // Issue #1331: Unified handler for all transient API errors in exception block
1324
+ // (Overloaded, 503, Internal Server Error) - same params, all with session preservation
1325
+ const isTransientException = (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
1326
+ if (isTransientException) {
1327
+ if (retryCount < retryLimits.maxTransientErrorRetries) {
1328
+ const delay = Math.min(retryLimits.initialTransientErrorDelayMs * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), retryLimits.maxTransientErrorDelayMs);
1329
+ const errorLabel = errorStr.includes('Overloaded') ? 'API overload (500)' : errorStr.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
1330
+ await log(`\n⚠️ ${errorLabel} in exception. Retry ${retryCount + 1}/${retryLimits.maxTransientErrorRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
1331
+ if (sessionId && !argv.resume) argv.resume = sessionId;
1332
+ await waitWithCountdown(delay, log);
1333
+ await log('\n🔄 Retrying now...');
1366
1334
  retryCount++;
1367
1335
  return await executeWithRetry();
1368
1336
  }
@@ -92,13 +92,17 @@ export const systemLimits = {
92
92
  };
93
93
 
94
94
  // Retry configurations
95
+ // Issue #1331: All API error types use unified retry parameters:
96
+ // 10 max retries, 1 minute initial delay, 30 minute max delay (exponential backoff), session preserved
95
97
  export const retryLimits = {
96
98
  maxForkRetries: parseIntWithDefault('HIVE_MIND_MAX_FORK_RETRIES', 5),
97
99
  maxVerifyRetries: parseIntWithDefault('HIVE_MIND_MAX_VERIFY_RETRIES', 5),
98
100
  maxApiRetries: parseIntWithDefault('HIVE_MIND_MAX_API_RETRIES', 3),
99
101
  retryBackoffMultiplier: parseFloatWithDefault('HIVE_MIND_RETRY_BACKOFF_MULTIPLIER', 2),
100
- max503Retries: parseIntWithDefault('HIVE_MIND_MAX_503_RETRIES', 3),
101
- initial503RetryDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_503_RETRY_DELAY_MS', 5 * 60 * 1000), // 5 minutes
102
+ // Unified retry config for all transient API errors (Overloaded, 503, Internal Server Error)
103
+ maxTransientErrorRetries: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_RETRIES', 10),
104
+ initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 60 * 1000), // 1 minute
105
+ maxTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_DELAY_MS', 30 * 60 * 1000), // 30 minutes
102
106
  };
103
107
 
104
108
  // Claude Code CLI configurations