@link-assistant/hive-mind 1.34.2 → 1.34.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/package.json +1 -1
- package/src/claude.lib.mjs +65 -52
- package/src/config.lib.mjs +8 -2
- package/src/solve.auto-merge.lib.mjs +68 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,27 @@
|
|
|
1
1
|
# @link-assistant/hive-mind
|
|
2
2
|
|
|
3
|
+
## 1.34.4
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- c3806b5: Fix missing log upload on tool failure and make HTTP 529 overload error retryable (Issue #1439)
|
|
8
|
+
|
|
9
|
+
Two fixes:
|
|
10
|
+
1. When `--attach-logs` was enabled and tool execution failed during an auto-restart session, the failure log was not uploaded to GitHub. The log is now attached to the pull request before stopping on both tool-execution failure paths (failure after a limit-reset resume, and ordinary execution failure).
|
|
11
|
+
2. HTTP 529 (Anthropic "Overloaded") errors were not recognized as transient/retryable by the outer retry loop. The code only matched `API Error: 500` + `Overloaded`, but 529 uses `API Error: 529` + `overloaded_error`. Now both 500 and 529 overload errors trigger the retry logic with exponential backoff.
|
|
12
|
+
|
|
13
|
+
## 1.34.3
|
|
14
|
+
|
|
15
|
+
### Patch Changes
|
|
16
|
+
|
|
17
|
+
- 22a8868: Fail fast when API signals x-should-retry: false and retries make no progress (Issue #1437). Increase minimum retry delay to 2 minutes.
|
|
18
|
+
|
|
19
|
+
When the Anthropic API returns HTTP 500 with `x-should-retry: false` AND subsequent retries immediately fail with `num_turns <= 1`, the outer retry loop now exits early instead of waiting through up to 10 retries with exponential backoff. This prevents stuck sessions where recovery is impossible.
|
|
20
|
+
|
|
21
|
+
Two new signals are tracked: (1) `apiMarkedNotRetryable` — set when `ANTHROPIC_LOG=debug` stderr contains `"error; not retryable"` or `x-should-retry: false`; (2) `resultNumTurns` — captured from the result event to detect sessions that failed immediately on resume. If both conditions are met after `HIVE_MIND_MAX_NOT_RETRYABLE_ATTEMPTS` (default: 5) retry attempts, the loop fails fast with a clear error message instead of waiting through the remaining retries.
|
|
22
|
+
|
|
23
|
+
The initial (and therefore minimum) retry delay for transient API errors (Overloaded, 503, Internal Server Error) is increased from 1 minute to 2 minutes (`HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS`), giving the API more time to recover between retries. Subsequent delays still grow by exponential backoff up to the 30-minute maximum.
|
|
24
|
+
|
|
3
25
|
## 1.34.2
|
|
4
26
|
|
|
5
27
|
### Patch Changes
|
package/package.json
CHANGED
package/src/claude.lib.mjs
CHANGED
|
@@ -129,8 +129,8 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
|
|
|
129
129
|
return null;
|
|
130
130
|
};
|
|
131
131
|
const jsonError = checkForJsonError(stdout) || checkForJsonError(stderr);
|
|
132
|
-
// Check for API overload error pattern
|
|
133
|
-
const isOverloadError = (stdout.includes('API Error: 500') && stdout.includes('Overloaded')) || (stderr.includes('API Error: 500') && stderr.includes('Overloaded')) || (jsonError && jsonError.type === 'api_error' && jsonError.message === 'Overloaded');
|
|
132
|
+
// Check for API overload error pattern (Issue #1439: also detect 529 overloaded_error)
|
|
133
|
+
const isOverloadError = (stdout.includes('API Error: 500') && stdout.includes('Overloaded')) || (stdout.includes('API Error: 529') && stdout.includes('Overloaded')) || (stderr.includes('API Error: 500') && stderr.includes('Overloaded')) || (stderr.includes('API Error: 529') && stderr.includes('Overloaded')) || (jsonError && (jsonError.type === 'api_error' || jsonError.type === 'overloaded_error') && jsonError.message === 'Overloaded');
|
|
134
134
|
// Handle overload errors with retry
|
|
135
135
|
if (isOverloadError) {
|
|
136
136
|
if (retryCount < maxRetries) {
|
|
@@ -168,7 +168,7 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
|
|
|
168
168
|
}
|
|
169
169
|
// Check for error patterns in successful response
|
|
170
170
|
if (jsonError) {
|
|
171
|
-
if (jsonError.type === 'api_error' && jsonError.message === 'Overloaded') {
|
|
171
|
+
if ((jsonError.type === 'api_error' || jsonError.type === 'overloaded_error') && jsonError.message === 'Overloaded') {
|
|
172
172
|
if (retryCount < maxRetries) {
|
|
173
173
|
const delay = baseDelay * Math.pow(2, retryCount);
|
|
174
174
|
await log(`⚠️ API overload error in response. Retrying in ${delay / 1000} seconds...`, {
|
|
@@ -193,7 +193,7 @@ export const validateClaudeConnection = async (model = 'haiku-3') => {
|
|
|
193
193
|
return true;
|
|
194
194
|
} catch (error) {
|
|
195
195
|
const errorStr = error.message || error.toString();
|
|
196
|
-
if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded'))) {
|
|
196
|
+
if ((errorStr.includes('API Error: 500') && errorStr.includes('Overloaded')) || (errorStr.includes('API Error: 529') && errorStr.includes('Overloaded')) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || (errorStr.includes('overloaded_error') && errorStr.includes('Overloaded'))) {
|
|
197
197
|
if (retryCount < maxRetries) {
|
|
198
198
|
const delay = baseDelay * Math.pow(2, retryCount);
|
|
199
199
|
await log(`⚠️ API overload error during validation. Retrying in ${delay / 1000} seconds...`, {
|
|
@@ -841,6 +841,8 @@ export const executeClaudeCommand = async params => {
|
|
|
841
841
|
let is503Error = false;
|
|
842
842
|
let isInternalServerError = false; // Issue #1331: Track 500 Internal server error
|
|
843
843
|
let isRequestTimeout = false; // Issue #1353: Track "Request timed out" from Claude CLI
|
|
844
|
+
let apiMarkedNotRetryable = false; // Issue #1437: Track when API explicitly signals x-should-retry: false
|
|
845
|
+
let resultNumTurns = 0; // Issue #1437: Track num_turns from result event to detect stuck retries
|
|
844
846
|
let stderrErrors = [];
|
|
845
847
|
let resultSuccessReceived = false; // Issue #1354: Track if result success event was received
|
|
846
848
|
let anthropicTotalCostUSD = null; // Capture Anthropic's official total_cost_usd from result
|
|
@@ -881,14 +883,10 @@ export const executeClaudeCommand = async params => {
|
|
|
881
883
|
try {
|
|
882
884
|
// Resolve thinking settings (see issue #1146)
|
|
883
885
|
const { thinkingBudget: resolvedThinkingBudget, thinkLevel, isNewVersion, maxBudget } = await resolveThinkingSettings(argv, log);
|
|
884
|
-
// Set CLAUDE_CODE_MAX_OUTPUT_TOKENS (
|
|
885
|
-
//
|
|
886
|
-
// Pass model for model-specific max output tokens (Issue #1221)
|
|
887
|
-
// Pass thinkLevel and maxBudget for Opus 4.6 effort level conversion (Issue #1238)
|
|
886
|
+
// Set CLAUDE_CODE_MAX_OUTPUT_TOKENS (#1076), MAX_THINKING_TOKENS (#1146), MCP timeout (#1066),
|
|
887
|
+
// CLAUDE_CODE_EFFORT_LEVEL (#1238), model/thinkLevel/maxBudget for effort conversion (#1221, #1238)
|
|
888
888
|
const claudeEnv = getClaudeEnv({ thinkingBudget: resolvedThinkingBudget, model: mappedModel, thinkLevel, maxBudget });
|
|
889
|
-
// Issue #1337: Enable ANTHROPIC_LOG=debug in --verbose mode
|
|
890
|
-
// The BashTool pre-flight check suggests "Run with ANTHROPIC_LOG=debug to check for failed or slow API requests."
|
|
891
|
-
// When --verbose is enabled, we propagate ANTHROPIC_LOG=debug so users can see detailed API request info.
|
|
889
|
+
// Issue #1337: Enable ANTHROPIC_LOG=debug in --verbose mode for detailed API request diagnostics.
|
|
892
890
|
if (argv.verbose) {
|
|
893
891
|
claudeEnv.ANTHROPIC_LOG = 'debug';
|
|
894
892
|
}
|
|
@@ -923,14 +921,9 @@ export const executeClaudeCommand = async params => {
|
|
|
923
921
|
// Issue #1183: Line buffer for NDJSON stream parsing - accumulate incomplete lines across chunks
|
|
924
922
|
// Long JSON messages (e.g., result with total_cost_usd) may be split across multiple stdout chunks
|
|
925
923
|
let stdoutLineBuffer = '';
|
|
926
|
-
// Issue #1280: Track result event and timeout for hung processes
|
|
927
|
-
//
|
|
928
|
-
//
|
|
929
|
-
// sending the result event, pumpReadable() hangs → finish() never fires → stream never ends.
|
|
930
|
-
// Additionally, command-stream v0.9.4 does NOT yield {type:'exit'} chunks from stream(),
|
|
931
|
-
// so the exit code detection via chunk.type==='exit' below is dead code.
|
|
932
|
-
// Workaround: after receiving the result event, start a timeout to force-kill the process.
|
|
933
|
-
// See: https://github.com/link-foundation/command-stream/issues/155
|
|
924
|
+
// Issue #1280: Track result event and timeout for hung processes.
|
|
925
|
+
// command-stream's stream() waits for BOTH process exit AND stdout pipe close; if stdout stays open
|
|
926
|
+
// the stream hangs. Workaround: force-kill after result event. See command-stream/issues/155
|
|
934
927
|
let resultEventReceived = false;
|
|
935
928
|
let resultTimeoutId = null;
|
|
936
929
|
let forceExitTriggered = false;
|
|
@@ -1025,12 +1018,16 @@ export const executeClaudeCommand = async params => {
|
|
|
1025
1018
|
} else if (data.total_cost_usd !== undefined && data.total_cost_usd !== null) {
|
|
1026
1019
|
await log(`💰 Anthropic cost from ${data.subtype || 'unknown'} result ignored: $${data.total_cost_usd.toFixed(6)}`, { verbose: true });
|
|
1027
1020
|
}
|
|
1028
|
-
// Issue #1263: Extract result summary
|
|
1029
|
-
// The result field contains the AI's summary of the work done
|
|
1021
|
+
// Issue #1263: Extract result summary (AI's summary of work done) for --attach-solution-summary
|
|
1030
1022
|
if (data.subtype === 'success' && data.result && typeof data.result === 'string') {
|
|
1031
1023
|
resultSummary = data.result;
|
|
1032
1024
|
await log('📝 Captured result summary from Claude output', { verbose: true });
|
|
1033
1025
|
}
|
|
1026
|
+
// Issue #1437: Capture num_turns to detect stuck retries (degrading turn count signals non-recovery)
|
|
1027
|
+
if (data.num_turns !== undefined) {
|
|
1028
|
+
resultNumTurns = data.num_turns;
|
|
1029
|
+
await log(`📊 Session num_turns: ${resultNumTurns}`, { verbose: true });
|
|
1030
|
+
}
|
|
1034
1031
|
if (data.is_error === true) {
|
|
1035
1032
|
lastMessage = data.result || JSON.stringify(data);
|
|
1036
1033
|
const subtype = data.subtype || 'unknown';
|
|
@@ -1070,11 +1067,11 @@ export const executeClaudeCommand = async params => {
|
|
|
1070
1067
|
const content = Array.isArray(data.message.content) ? data.message.content : [data.message.content];
|
|
1071
1068
|
for (const item of content) {
|
|
1072
1069
|
if (item.type === 'text' && item.text) {
|
|
1073
|
-
// Check for the specific 500 overload error pattern
|
|
1074
|
-
if (item.text.includes('API Error: 500') && item.text.includes('api_error') && item.text.includes('Overloaded')) {
|
|
1070
|
+
// Check for the specific 500/529 overload error pattern (Issue #1439: 529 is also an overload)
|
|
1071
|
+
if ((item.text.includes('API Error: 500') || item.text.includes('API Error: 529')) && (item.text.includes('api_error') || item.text.includes('overloaded_error')) && item.text.includes('Overloaded')) {
|
|
1075
1072
|
isOverloadError = true;
|
|
1076
1073
|
lastMessage = item.text;
|
|
1077
|
-
await log(
|
|
1074
|
+
await log(`⚠️ Detected API overload error${item.text.includes('529') ? ' (529)' : ' (500)'}`, { verbose: true });
|
|
1078
1075
|
}
|
|
1079
1076
|
if (item.text.includes('API Error: 500') && item.text.includes('Internal server error') && !item.text.includes('Overloaded')) {
|
|
1080
1077
|
isInternalServerError = true;
|
|
@@ -1111,10 +1108,7 @@ export const executeClaudeCommand = async params => {
|
|
|
1111
1108
|
await log(line, { stream: 'raw' });
|
|
1112
1109
|
lastMessage = line;
|
|
1113
1110
|
|
|
1114
|
-
//
|
|
1115
|
-
// When Claude CLI requires terms acceptance, it outputs a non-JSON message like:
|
|
1116
|
-
// "[ACTION REQUIRED] An update to our Consumer Terms and Privacy Policy has taken effect..."
|
|
1117
|
-
// This should be treated as an error requiring human intervention, not success
|
|
1111
|
+
// Issue #1015: Detect terms acceptance prompt (non-JSON "[ACTION REQUIRED]..." message)
|
|
1118
1112
|
const termsAcceptancePattern = /\[ACTION REQUIRED\].*terms|must run.*claude.*review.*terms/i;
|
|
1119
1113
|
if (termsAcceptancePattern.test(line)) {
|
|
1120
1114
|
commandFailed = true;
|
|
@@ -1129,11 +1123,16 @@ export const executeClaudeCommand = async params => {
|
|
|
1129
1123
|
// Log stderr immediately
|
|
1130
1124
|
if (errorOutput) {
|
|
1131
1125
|
await log(errorOutput, { stream: 'stderr' });
|
|
1132
|
-
// Issue #
|
|
1133
|
-
//
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1126
|
+
// Issue #1437: Detect x-should-retry: false in ANTHROPIC_LOG=debug output — signals
|
|
1127
|
+
// a non-transient error; fail fast instead of blindly retrying.
|
|
1128
|
+
if (errorOutput.includes('not retryable') || errorOutput.includes("'x-should-retry': 'false'") || errorOutput.includes('"x-should-retry": "false"')) {
|
|
1129
|
+
if (!apiMarkedNotRetryable) {
|
|
1130
|
+
apiMarkedNotRetryable = true;
|
|
1131
|
+
await log('⚠️ API signaled error is not retryable (x-should-retry: false)', { verbose: true });
|
|
1132
|
+
}
|
|
1133
|
+
}
|
|
1134
|
+
// Issue #1354: Split multi-line chunks — a chunk may contain multiple JSON messages;
|
|
1135
|
+
// passing the whole chunk to isStderrError() causes JSON.parse() to fail.
|
|
1137
1136
|
for (const line of errorOutput.split('\n')) {
|
|
1138
1137
|
if (isStderrError(line)) {
|
|
1139
1138
|
stderrErrors.push(line.trim());
|
|
@@ -1141,9 +1140,7 @@ export const executeClaudeCommand = async params => {
|
|
|
1141
1140
|
}
|
|
1142
1141
|
}
|
|
1143
1142
|
} else if (chunk.type === 'exit') {
|
|
1144
|
-
// Note: command-stream v0.9.4 stream() does NOT yield exit chunks (Issue #1280).
|
|
1145
|
-
// Exit code is obtained from execCommand.result.code after the loop.
|
|
1146
|
-
// This branch is kept for forward-compatibility if command-stream adds exit chunks.
|
|
1143
|
+
// Note: command-stream v0.9.4 stream() does NOT yield exit chunks (Issue #1280) — kept for forward-compat.
|
|
1147
1144
|
exitCode = chunk.code;
|
|
1148
1145
|
if (chunk.code !== 0) {
|
|
1149
1146
|
commandFailed = true;
|
|
@@ -1172,9 +1169,7 @@ export const executeClaudeCommand = async params => {
|
|
|
1172
1169
|
await log('✅ Stream closed normally after result event', { verbose: true });
|
|
1173
1170
|
}
|
|
1174
1171
|
}
|
|
1175
|
-
// Issue #1165: Check actual exit code from command result
|
|
1176
|
-
// The .stream() method may not emit 'exit' chunks, but the command object still tracks the exit code
|
|
1177
|
-
// Exit code 127 is the standard Unix convention for "command not found"
|
|
1172
|
+
// Issue #1165: Check actual exit code from command result (stream() may not emit 'exit' chunks)
|
|
1178
1173
|
if (execCommand.result && typeof execCommand.result.code === 'number') {
|
|
1179
1174
|
const resultExitCode = execCommand.result.code;
|
|
1180
1175
|
if (exitCode === 0 && resultExitCode !== 0) {
|
|
@@ -1197,20 +1192,39 @@ export const executeClaudeCommand = async params => {
|
|
|
1197
1192
|
}
|
|
1198
1193
|
}
|
|
1199
1194
|
|
|
1200
|
-
//
|
|
1201
|
-
//
|
|
1202
|
-
|
|
1203
|
-
const isTransientError = isOverloadError || isInternalServerError || is503Error || isRequestTimeout || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure'))) || lastMessage === 'Request timed out' || lastMessage.includes('Request timed out');
|
|
1195
|
+
// Issues #1331, #1353: Unified handler for transient API errors (Overloaded, 503, Internal Server Error,
|
|
1196
|
+
// Request timed out). All use exponential backoff with session preservation via --resume.
|
|
1197
|
+
const isTransientError = isOverloadError || isInternalServerError || is503Error || isRequestTimeout || (lastMessage.includes('API Error: 500') && (lastMessage.includes('Overloaded') || lastMessage.includes('Internal server error'))) || (lastMessage.includes('API Error: 529') && (lastMessage.includes('overloaded_error') || lastMessage.includes('Overloaded'))) || (lastMessage.includes('api_error') && lastMessage.includes('Overloaded')) || (lastMessage.includes('overloaded_error') && lastMessage.includes('Overloaded')) || lastMessage.includes('API Error: 503') || (lastMessage.includes('503') && (lastMessage.includes('upstream connect error') || lastMessage.includes('remote connection failure'))) || lastMessage === 'Request timed out' || lastMessage.includes('Request timed out');
|
|
1204
1198
|
if ((commandFailed || isTransientError) && isTransientError) {
|
|
1205
|
-
// Issue #1353:
|
|
1206
|
-
// Timeouts indicate network instability — Claude CLI already exhausted its own retries, so we need longer waits
|
|
1199
|
+
// Issue #1353: Timeouts use longer backoff (5min–1hr) vs general transient (2min–30min)
|
|
1207
1200
|
const maxRetries = isRequestTimeout ? retryLimits.maxRequestTimeoutRetries : retryLimits.maxTransientErrorRetries;
|
|
1208
1201
|
const initialDelay = isRequestTimeout ? retryLimits.initialRequestTimeoutDelayMs : retryLimits.initialTransientErrorDelayMs;
|
|
1209
1202
|
const maxDelay = isRequestTimeout ? retryLimits.maxRequestTimeoutDelayMs : retryLimits.maxTransientErrorDelayMs;
|
|
1203
|
+
// Issue #1437: Fail fast when API signals x-should-retry: false AND session made no progress
|
|
1204
|
+
// (num_turns <= 1). Allow maxNotRetryableAttempts before giving up (signal can be wrong sometimes).
|
|
1205
|
+
const isStuckRetry = apiMarkedNotRetryable && retryCount >= retryLimits.maxNotRetryableAttempts && resultNumTurns <= 1;
|
|
1206
|
+
if (isStuckRetry) {
|
|
1207
|
+
await log(`\n\n❌ API explicitly marked error as not retryable (x-should-retry: false) and session made no progress (num_turns=${resultNumTurns}) after ${retryCount} attempt(s)`, { level: 'error' });
|
|
1208
|
+
await log(` This error is not recoverable. Failing fast to avoid a stuck retry loop (Issue #1437).`, { level: 'error' });
|
|
1209
|
+
await log(` Check https://status.anthropic.com/ for API status.`, { level: 'error' });
|
|
1210
|
+
return {
|
|
1211
|
+
success: false,
|
|
1212
|
+
sessionId,
|
|
1213
|
+
limitReached: false,
|
|
1214
|
+
limitResetTime: null,
|
|
1215
|
+
limitTimezone: null,
|
|
1216
|
+
messageCount,
|
|
1217
|
+
toolUseCount,
|
|
1218
|
+
is503Error,
|
|
1219
|
+
anthropicTotalCostUSD,
|
|
1220
|
+
resultSummary,
|
|
1221
|
+
};
|
|
1222
|
+
}
|
|
1210
1223
|
if (retryCount < maxRetries) {
|
|
1211
1224
|
const delay = Math.min(initialDelay * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), maxDelay);
|
|
1212
|
-
const errorLabel = isRequestTimeout ? 'Request timeout' : isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded'))
|
|
1213
|
-
|
|
1225
|
+
const errorLabel = isRequestTimeout ? 'Request timeout' : isOverloadError || (lastMessage.includes('API Error: 500') && lastMessage.includes('Overloaded')) || (lastMessage.includes('API Error: 529') && lastMessage.includes('Overloaded')) ? `API overload (${lastMessage.includes('529') ? '529' : '500'})` : isInternalServerError || lastMessage.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
|
|
1226
|
+
const notRetryableHint = apiMarkedNotRetryable ? ' (API says not retryable — will stop early if no progress)' : '';
|
|
1227
|
+
await log(`\n⚠️ ${errorLabel} detected. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)${notRetryableHint}...`, { level: 'warning' });
|
|
1214
1228
|
await log(` Error: ${lastMessage.substring(0, 200)}`, { verbose: true });
|
|
1215
1229
|
if (sessionId && !argv.resume) argv.resume = sessionId; // preserve session for resume
|
|
1216
1230
|
await waitWithCountdown(delay, log);
|
|
@@ -1263,9 +1277,8 @@ export const executeClaudeCommand = async params => {
|
|
|
1263
1277
|
}
|
|
1264
1278
|
}
|
|
1265
1279
|
}
|
|
1266
|
-
//
|
|
1267
|
-
//
|
|
1268
|
-
// Issue #1354: Skip if result event confirmed success (definitive proof regardless of messageCount).
|
|
1280
|
+
// Issue #1354: Detect silent failures (no messages + stderr errors, e.g. "kill EPERM" with exit 0).
|
|
1281
|
+
// Skip if result event confirmed success (definitive proof regardless of messageCount).
|
|
1269
1282
|
if (!commandFailed && !resultSuccessReceived && stderrErrors.length > 0 && messageCount === 0 && toolUseCount === 0) {
|
|
1270
1283
|
commandFailed = true;
|
|
1271
1284
|
const errorsPreview = stderrErrors
|
|
@@ -1380,7 +1393,7 @@ export const executeClaudeCommand = async params => {
|
|
|
1380
1393
|
// Issue #1353: Also handle "Request timed out" in exception block
|
|
1381
1394
|
// (Overloaded, 503, Internal Server Error, Request timed out) - all with session preservation
|
|
1382
1395
|
const isTimeoutException = errorStr === 'Request timed out' || errorStr.includes('Request timed out');
|
|
1383
|
-
const isTransientException = isTimeoutException || (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
|
|
1396
|
+
const isTransientException = isTimeoutException || (errorStr.includes('API Error: 500') && (errorStr.includes('Overloaded') || errorStr.includes('Internal server error'))) || (errorStr.includes('API Error: 529') && (errorStr.includes('overloaded_error') || errorStr.includes('Overloaded'))) || (errorStr.includes('api_error') && errorStr.includes('Overloaded')) || (errorStr.includes('overloaded_error') && errorStr.includes('Overloaded')) || errorStr.includes('API Error: 503') || (errorStr.includes('503') && (errorStr.includes('upstream connect error') || errorStr.includes('remote connection failure')));
|
|
1384
1397
|
if (isTransientException) {
|
|
1385
1398
|
// Issue #1353: Use timeout-specific backoff for request timeouts
|
|
1386
1399
|
const maxRetries = isTimeoutException ? retryLimits.maxRequestTimeoutRetries : retryLimits.maxTransientErrorRetries;
|
|
@@ -1388,7 +1401,7 @@ export const executeClaudeCommand = async params => {
|
|
|
1388
1401
|
const maxDelay = isTimeoutException ? retryLimits.maxRequestTimeoutDelayMs : retryLimits.maxTransientErrorDelayMs;
|
|
1389
1402
|
if (retryCount < maxRetries) {
|
|
1390
1403
|
const delay = Math.min(initialDelay * Math.pow(retryLimits.retryBackoffMultiplier, retryCount), maxDelay);
|
|
1391
|
-
const errorLabel = isTimeoutException ? 'Request timeout' : errorStr.includes('Overloaded') ?
|
|
1404
|
+
const errorLabel = isTimeoutException ? 'Request timeout' : errorStr.includes('Overloaded') ? `API overload (${errorStr.includes('529') ? '529' : '500'})` : errorStr.includes('Internal server error') ? 'Internal server error (500)' : '503 network error';
|
|
1392
1405
|
await log(`\n⚠️ ${errorLabel} in exception. Retry ${retryCount + 1}/${maxRetries} in ${Math.round(delay / 60000)} min (session preserved)...`, { level: 'warning' });
|
|
1393
1406
|
if (sessionId && !argv.resume) argv.resume = sessionId;
|
|
1394
1407
|
await waitWithCountdown(delay, log);
|
package/src/config.lib.mjs
CHANGED
|
@@ -95,7 +95,7 @@ export const systemLimits = {
|
|
|
95
95
|
|
|
96
96
|
// Retry configurations
|
|
97
97
|
// Issue #1331: All API error types use unified retry parameters:
|
|
98
|
-
// 10 max retries, 1 minute initial delay, 30 minute max delay (exponential backoff), session preserved
|
|
98
|
+
// 10 max retries, 2 minute initial delay, 30 minute max delay (exponential backoff), session preserved
|
|
99
99
|
export const retryLimits = {
|
|
100
100
|
maxForkRetries: parseIntWithDefault('HIVE_MIND_MAX_FORK_RETRIES', 5),
|
|
101
101
|
maxVerifyRetries: parseIntWithDefault('HIVE_MIND_MAX_VERIFY_RETRIES', 5),
|
|
@@ -103,13 +103,19 @@ export const retryLimits = {
|
|
|
103
103
|
retryBackoffMultiplier: parseFloatWithDefault('HIVE_MIND_RETRY_BACKOFF_MULTIPLIER', 2),
|
|
104
104
|
// Unified retry config for all transient API errors (Overloaded, 503, Internal Server Error)
|
|
105
105
|
maxTransientErrorRetries: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_RETRIES', 10),
|
|
106
|
-
initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 60 * 1000), //
|
|
106
|
+
initialTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_TRANSIENT_ERROR_DELAY_MS', 2 * 60 * 1000), // 2 minutes
|
|
107
107
|
maxTransientErrorDelayMs: parseIntWithDefault('HIVE_MIND_MAX_TRANSIENT_ERROR_DELAY_MS', 30 * 60 * 1000), // 30 minutes
|
|
108
108
|
// Request timeout retry configuration (Issue #1353)
|
|
109
109
|
// Network timeouts need longer waits than API errors — Claude CLI already exhausted its own retries
|
|
110
110
|
maxRequestTimeoutRetries: parseIntWithDefault('HIVE_MIND_MAX_REQUEST_TIMEOUT_RETRIES', 10),
|
|
111
111
|
initialRequestTimeoutDelayMs: parseIntWithDefault('HIVE_MIND_INITIAL_REQUEST_TIMEOUT_DELAY_MS', 5 * 60 * 1000), // 5 minutes
|
|
112
112
|
maxRequestTimeoutDelayMs: parseIntWithDefault('HIVE_MIND_MAX_REQUEST_TIMEOUT_DELAY_MS', 60 * 60 * 1000), // 1 hour
|
|
113
|
+
// Not-retryable error fail-fast configuration (Issue #1437)
|
|
114
|
+
// When the API sends x-should-retry: false AND retries make no progress (num_turns <= 1),
|
|
115
|
+
// stop retrying after this many attempts to avoid a stuck loop with no recovery prospects.
|
|
116
|
+
// Default: 5 — retry generously even when API signals not retryable, since the signal can be wrong
|
|
117
|
+
// for transient backend glitches (e.g. overloaded errors observed as non-retryable 500s).
|
|
118
|
+
maxNotRetryableAttempts: parseIntWithDefault('HIVE_MIND_MAX_NOT_RETRYABLE_ATTEMPTS', 5),
|
|
113
119
|
};
|
|
114
120
|
|
|
115
121
|
// Claude Code CLI configurations
|
|
@@ -739,6 +739,40 @@ Once the billing issue is resolved, you can re-run the CI checks or push a new c
|
|
|
739
739
|
await log('');
|
|
740
740
|
await log(formatAligned('❌', `${argv.tool.toUpperCase()} RESUME FAILED`, ''));
|
|
741
741
|
await log(formatAligned('', 'Action:', 'Stopping auto-restart — tool execution failed after limit reset', 2));
|
|
742
|
+
// Issue #1439: Attach failure log before stopping, so user can see what happened
|
|
743
|
+
const shouldAttachLogsOnResumeFail = argv.attachLogs || argv['attach-logs'];
|
|
744
|
+
if (prNumber && shouldAttachLogsOnResumeFail) {
|
|
745
|
+
try {
|
|
746
|
+
const logFile = getLogFile();
|
|
747
|
+
if (logFile) {
|
|
748
|
+
await attachLogToGitHub({
|
|
749
|
+
logFile,
|
|
750
|
+
targetType: 'pr',
|
|
751
|
+
targetNumber: prNumber,
|
|
752
|
+
owner,
|
|
753
|
+
repo,
|
|
754
|
+
$,
|
|
755
|
+
log,
|
|
756
|
+
sanitizeLogContent,
|
|
757
|
+
verbose: argv.verbose,
|
|
758
|
+
errorMessage: `${argv.tool.toUpperCase()} execution failed after limit reset`,
|
|
759
|
+
sessionId: latestSessionId,
|
|
760
|
+
tempDir,
|
|
761
|
+
requestedModel: argv.model,
|
|
762
|
+
tool: argv.tool || 'claude',
|
|
763
|
+
});
|
|
764
|
+
}
|
|
765
|
+
} catch (logUploadError) {
|
|
766
|
+
reportError(logUploadError, {
|
|
767
|
+
context: 'attach_auto_restart_failure_log',
|
|
768
|
+
prNumber,
|
|
769
|
+
owner,
|
|
770
|
+
repo,
|
|
771
|
+
operation: 'upload_failure_log',
|
|
772
|
+
});
|
|
773
|
+
await log(formatAligned('', `⚠️ Failure log upload error: ${cleanErrorMessage(logUploadError)}`, '', 2));
|
|
774
|
+
}
|
|
775
|
+
}
|
|
742
776
|
return { success: false, reason: 'tool_failure_after_resume', latestSessionId, latestAnthropicCost };
|
|
743
777
|
}
|
|
744
778
|
} else {
|
|
@@ -755,6 +789,40 @@ Once the billing issue is resolved, you can re-run the CI checks or push a new c
|
|
|
755
789
|
await log('');
|
|
756
790
|
await log(formatAligned('❌', `${argv.tool.toUpperCase()} EXECUTION FAILED`, ''));
|
|
757
791
|
await log(formatAligned('', 'Action:', 'Stopping auto-restart — tool execution failed', 2));
|
|
792
|
+
// Issue #1439: Attach failure log before stopping, so user can see what happened
|
|
793
|
+
const shouldAttachLogsOnFail = argv.attachLogs || argv['attach-logs'];
|
|
794
|
+
if (prNumber && shouldAttachLogsOnFail) {
|
|
795
|
+
try {
|
|
796
|
+
const logFile = getLogFile();
|
|
797
|
+
if (logFile) {
|
|
798
|
+
await attachLogToGitHub({
|
|
799
|
+
logFile,
|
|
800
|
+
targetType: 'pr',
|
|
801
|
+
targetNumber: prNumber,
|
|
802
|
+
owner,
|
|
803
|
+
repo,
|
|
804
|
+
$,
|
|
805
|
+
log,
|
|
806
|
+
sanitizeLogContent,
|
|
807
|
+
verbose: argv.verbose,
|
|
808
|
+
errorMessage: `${argv.tool.toUpperCase()} execution failed`,
|
|
809
|
+
sessionId: latestSessionId,
|
|
810
|
+
tempDir,
|
|
811
|
+
requestedModel: argv.model,
|
|
812
|
+
tool: argv.tool || 'claude',
|
|
813
|
+
});
|
|
814
|
+
}
|
|
815
|
+
} catch (logUploadError) {
|
|
816
|
+
reportError(logUploadError, {
|
|
817
|
+
context: 'attach_auto_restart_failure_log',
|
|
818
|
+
prNumber,
|
|
819
|
+
owner,
|
|
820
|
+
repo,
|
|
821
|
+
operation: 'upload_failure_log',
|
|
822
|
+
});
|
|
823
|
+
await log(formatAligned('', `⚠️ Failure log upload error: ${cleanErrorMessage(logUploadError)}`, '', 2));
|
|
824
|
+
}
|
|
825
|
+
}
|
|
758
826
|
return { success: false, reason: 'tool_failure', latestSessionId, latestAnthropicCost };
|
|
759
827
|
} else {
|
|
760
828
|
// Success - capture latest session data
|