opencode-swarm-plugin 0.12.27 → 0.12.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -14
- package/bin/swarm.ts +100 -50
- package/package.json +1 -1
- package/global-skills/agent-patterns/SKILL.md +0 -682
- package/global-skills/code-review/SKILL.md +0 -166
- package/global-skills/debugging/SKILL.md +0 -150
- package/global-skills/resilience-patterns/SKILL.md +0 -648
- package/global-skills/tacit-knowledge-extraction/SKILL.md +0 -387
- package/global-skills/testing-strategies/SKILL.md +0 -558
- package/global-skills/zod-validation/SKILL.md +0 -763
@@ -1,648 +0,0 @@

---
name: resilience-patterns
description: Error recovery, retry strategies, and graceful degradation patterns. Use when handling failures, implementing retries, designing fallback strategies, or building fault-tolerant systems. Covers exponential backoff, circuit breakers, and backend fallbacks.
---

# Resilience Patterns

Error recovery, retry strategies, and graceful degradation for fault-tolerant systems.

## Core Principle

**Design for failure, not perfection.** Reduce the probability that faults become system failures by recovering gracefully, degrading functionality intelligently, and learning from patterns of failure.

## Error Classification

Classify errors before deciding on a recovery strategy.

### Retryable Errors (Transient)

Network and server issues that typically resolve on retry:

- Connection refused/reset (`ECONNREFUSED`, `ECONNRESET`)
- Timeouts (network, socket, aborted requests)
- Server overload (502, 503, 504 HTTP codes)
- Temporary unavailability ("unexpected error" from dependencies)

### Non-Retryable Errors (Permanent)

Logic bugs, validation failures, resource constraints:

- Authentication failures (401, 403)
- Not found errors (404)
- Validation errors (400)
- Server errors from logic bugs (500)
- Resource not found (project/agent/entity missing)

### Detection Pattern

```typescript
function isRetryableError(error: unknown): boolean {
  if (error instanceof Error) {
    const message = error.message.toLowerCase();

    // Network errors
    if (
      message.includes("econnrefused") ||
      message.includes("econnreset") ||
      message.includes("timeout") ||
      message.includes("network") ||
      message.includes("socket") ||
      message.includes("aborted")
    ) {
      return true;
    }

    // Server errors (but not 500 - usually logic bugs)
    if (error instanceof CustomError && error.code) {
      return error.code === 502 || error.code === 503 || error.code === 504;
    }

    // Recoverable unexpected errors
    if (message.includes("unexpected error")) {
      return true;
    }
  }

  return false;
}
```

## Retry Strategies

### Exponential Backoff with Jitter

Prevents a thundering herd and spreads retry load over time.

**Formula**: `delay = min(baseDelay * 2^(attempt-1), maxDelay) ± jitter`

**Configuration**:

- `baseDelay`: Starting delay (e.g., 100ms)
- `maxDelay`: Cap on delay growth (e.g., 5000ms)
- `maxRetries`: Retry limit (e.g., 3)
- `jitterPercent`: Randomness range (e.g., 20%)

**Implementation**:

```typescript
function calculateBackoffDelay(attempt: number): number {
  if (attempt === 0) return 0;

  const exponentialDelay = baseDelayMs * Math.pow(2, attempt - 1);
  const cappedDelay = Math.min(exponentialDelay, maxDelayMs);

  // Add jitter (±jitterPercent%)
  const jitterRange = cappedDelay * (jitterPercent / 100);
  const jitter = (Math.random() * 2 - 1) * jitterRange;

  return Math.round(cappedDelay + jitter);
}
```

**Example Delays** (base=100ms, max=5000ms, jitter=20%):

- Attempt 1: ~100ms ± 20ms
- Attempt 2: ~200ms ± 40ms
- Attempt 3: ~400ms ± 80ms
- Attempt 4: ~800ms ± 160ms
- Attempt 5: ~1600ms ± 320ms
- Attempt 6: ~3200ms ± 640ms
- Attempt 7+: ~5000ms ± 1000ms (capped)

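
A quick way to sanity-check this schedule is to print the computed delays; a minimal sketch, assuming the module-level `baseDelayMs = 100`, `maxDelayMs = 5000`, and `jitterPercent = 20` that `calculateBackoffDelay` above reads:

```typescript
// Assumed config values matching the schedule above
const baseDelayMs = 100;
const maxDelayMs = 5000;
const jitterPercent = 20;

for (let attempt = 1; attempt <= 8; attempt++) {
  // Jitter makes each run differ by up to ±20% of the capped delay
  console.log(`attempt ${attempt}: ${calculateBackoffDelay(attempt)}ms`);
}
```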

### Retry Loop Pattern

```typescript
async function callWithRetry<T>(
  operation: () => Promise<T>,
  maxRetries: number = 3,
): Promise<T> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    // Apply backoff delay (except first attempt)
    if (attempt > 0) {
      const delay = calculateBackoffDelay(attempt);
      console.warn(`Retry ${attempt}/${maxRetries} after ${delay}ms`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }

    try {
      const result = await operation();
      return result; // Success - reset failure tracking
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));

      // Check if error is retryable
      if (!isRetryableError(error)) {
        console.warn(`Non-retryable error: ${lastError.message}`);
        throw lastError;
      }

      // Last retry exhausted
      if (attempt === maxRetries) {
        console.error(`All ${maxRetries} retries exhausted`);
        throw lastError;
      }
    }
  }

  throw lastError || new Error("Unknown error in retry loop");
}
```
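
Typical usage wraps a single network call; a minimal sketch (the URL and response handling are illustrative):

```typescript
// Network-level failures (ECONNRESET, timeouts) are retried by callWithRetry;
// whether HTTP-level failures retry depends on how the thrown error is classified
// by isRetryableError.
const profile = await callWithRetry(async () => {
  const response = await fetch("https://example.com/api/profile");
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }
  return response.json();
}, 3);
```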

## Circuit Breaker Pattern

Stop retrying when repeated failures indicate a systemic issue.

### State Machine

Track consecutive failures to decide when to stop:

```typescript
let consecutiveFailures = 0;
const failureThreshold = 3; // Circuit opens after 3 failures

// On success
consecutiveFailures = 0;

// On retryable error
consecutiveFailures++;
if (consecutiveFailures >= failureThreshold) {
  // Circuit open - attempt recovery instead of retry
  await attemptRecovery();
}
```
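
The counter above is the minimal form; the full pattern usually adds an open/half-open/closed state machine with a cooldown before probing again. A sketch of that shape (class name and timings are illustrative, not from this codebase):

```typescript
type CircuitState = "closed" | "open" | "half-open";

class CircuitBreaker {
  private state: CircuitState = "closed";
  private failures = 0;
  private openedAt = 0;

  constructor(
    private readonly failureThreshold = 3,
    private readonly resetTimeoutMs = 30_000,
  ) {}

  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === "open") {
      if (Date.now() - this.openedAt < this.resetTimeoutMs) {
        throw new Error("Circuit open - failing fast");
      }
      this.state = "half-open"; // Allow one probe request through
    }

    try {
      const result = await operation();
      this.state = "closed"; // Probe (or normal call) succeeded
      this.failures = 0;
      return result;
    } catch (error) {
      this.failures++;
      if (this.state === "half-open" || this.failures >= this.failureThreshold) {
        this.state = "open";
        this.openedAt = Date.now();
      }
      throw error;
    }
  }
}
```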

### Health Checks

Probe service availability before opening the circuit:

```typescript
async function isServiceHealthy(): Promise<boolean> {
  try {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 3000);

    const response = await fetch(`${SERVICE_URL}/health`, {
      signal: controller.signal,
    });
    clearTimeout(timeout);

    return response.ok;
  } catch {
    return false;
  }
}
```

### Recovery Actions

When the circuit opens, attempt automated recovery:

```typescript
// Check health before attempting recovery
const healthy = await isServiceHealthy();
if (!healthy) {
  await restartService();
  // Reset failure counter on successful restart
  consecutiveFailures = 0;
}
```

## Graceful Degradation

Reduce functionality instead of failing completely.

### Backend Fallback Pattern

Primary/secondary backend with automatic switching.

**Redis → SQLite Example**:

```typescript
async function createWithFallback(): Promise<Client> {
  const maxRetries = 3;
  const retryDelays = [100, 500, 1000]; // Exponential backoff

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const redis = new Redis(redisUrl, {
        connectTimeout: 2000,
        maxRetriesPerRequest: 1,
        retryStrategy: () => null, // Don't retry internally
        lazyConnect: true,
      });

      // Test connection
      await redis.connect();
      await redis.ping();

      return new RedisClient(redis);
    } catch (error) {
      const isLastAttempt = attempt === maxRetries - 1;

      if (isLastAttempt) {
        // All retries exhausted - fall back to SQLite
        console.warn(
          `Redis connection failed after ${maxRetries} attempts, falling back to SQLite`,
        );
        return new SqliteClient(sqlitePath);
      }

      // Wait before retrying
      await new Promise((resolve) => setTimeout(resolve, retryDelays[attempt]));
    }
  }

  // Unreachable due to return in last attempt, but satisfies TypeScript
  return new SqliteClient(sqlitePath);
}
```

### Warn Once Pattern

Avoid log spam when degraded:

```typescript
let hasWarnedAboutFallback = false;

if (!hasWarnedAboutFallback) {
  console.warn(`Primary backend unavailable, using fallback`);
  hasWarnedAboutFallback = true;
}
```

### Feature Toggles

Disable non-essential features when degraded:

```typescript
const available = await checkPrimaryBackend();
if (!available) {
  return {
    error: "Primary backend not available",
    available: false,
    hint: "Start primary backend or continue with reduced functionality",
    fallback: "Operating in degraded mode - some features unavailable",
  };
}
```

## Server Recovery

Automated restart for unrecoverable states.

### Recovery State Machine

```typescript
let consecutiveFailures = 0;
let lastRestartAttempt = 0;
let isRestarting = false;

const RECOVERY_CONFIG = {
  failureThreshold: 1, // Restart after 1 "unexpected error"
  restartCooldownMs: 10000, // 10 second cooldown between restarts
  enabled: true, // Can disable via env var
};
```

### Restart Cooldown

Prevent restart loops:

```typescript
async function restartServer(): Promise<boolean> {
  // Prevent concurrent restarts
  if (isRestarting) {
    console.warn("Restart already in progress");
    return false;
  }

  // Respect cooldown
  const now = Date.now();
  if (now - lastRestartAttempt < RECOVERY_CONFIG.restartCooldownMs) {
    const waitSec = Math.ceil(
      (RECOVERY_CONFIG.restartCooldownMs - (now - lastRestartAttempt)) / 1000,
    );
    console.warn(`Restart cooldown active, wait ${waitSec}s`);
    return false;
  }

  isRestarting = true;
  lastRestartAttempt = now;

  try {
    // Kill existing process
    // Start new process
    // Wait for health check
    consecutiveFailures = 0;
    return true;
  } catch (error) {
    console.error("Restart failed:", error);
    return false;
  } finally {
    isRestarting = false;
  }
}
```

### Aggressive Recovery

Restart immediately on specific error patterns:

```typescript
// Runs inside the retry loop's catch block, where `attempt`,
// `errorMessage`, and `restartAttempted` are in scope
const isUnexpectedError = errorMessage.includes("unexpected error");
if (isUnexpectedError && !restartAttempted && RECOVERY_CONFIG.enabled) {
  console.warn("Unexpected error detected, restarting server immediately...");
  restartAttempted = true;
  const restarted = await restartServer();

  if (restarted) {
    // Clear caches
    availabilityCache = null;
    consecutiveFailures = 0;

    // Small delay for server to stabilize
    await new Promise((resolve) => setTimeout(resolve, 1000));

    // Don't count this as a retry attempt - retry immediately
    attempt--;
    continue;
  }
}
```

## Self-Healing Patterns

Automatic re-registration after server restarts.

### Not Found Detection

Server restart loses in-memory state:

```typescript
function isProjectNotFoundError(error: unknown): boolean {
  if (error instanceof Error) {
    const message = error.message.toLowerCase();
    return (
      message.includes("project") &&
      (message.includes("not found") || message.includes("does not exist"))
    );
  }
  return false;
}

function isAgentNotFoundError(error: unknown): boolean {
  if (error instanceof Error) {
    const message = error.message.toLowerCase();
    return (
      message.includes("agent") &&
      (message.includes("not found") || message.includes("does not exist"))
    );
  }
  return false;
}
```

### Auto-Init Wrapper

Re-register after detecting lost state:

```typescript
async function callWithAutoInit<T>(
  toolName: string,
  args: { project_key: string; agent_name?: string },
  options?: { taskDescription?: string; maxReregistrationAttempts?: number },
): Promise<T> {
  const maxAttempts = options?.maxReregistrationAttempts ?? 1;
  let reregistrationAttempts = 0;

  while (true) {
    try {
      return await call<T>(toolName, args);
    } catch (error) {
      const isProjectError = isProjectNotFoundError(error);
      const isAgentError = isAgentNotFoundError(error);

      if (!isProjectError && !isAgentError) {
        throw error; // Not recoverable
      }

      if (reregistrationAttempts >= maxAttempts) {
        console.error(`Exhausted ${maxAttempts} re-registration attempt(s)`);
        throw error;
      }

      reregistrationAttempts++;
      console.warn(
        `Detected "not found", re-registering (attempt ${reregistrationAttempts})...`,
      );

      // Re-register project first (always needed)
      await reRegisterProject(args.project_key);

      // Re-register agent if needed
      if (args.agent_name) {
        await reRegisterAgent(
          args.project_key,
          args.agent_name,
          options?.taskDescription,
        );
      }

      console.warn(`Retrying ${toolName} after re-registration...`);
      // Loop continues to retry
    }
  }
}
```

## Rate Limiting Resilience

Handle rate limit errors with informative feedback.

### Rate Limit Detection

```typescript
class RateLimitExceededError extends Error {
  constructor(
    public readonly endpoint: string,
    public readonly remaining: number,
    public readonly resetAt: number,
  ) {
    const resetDate = new Date(resetAt);
    const waitMs = Math.max(0, resetAt - Date.now());
    const waitSec = Math.ceil(waitMs / 1000);

    super(
      `Rate limit exceeded for ${endpoint}. ` +
        `${remaining} remaining. ` +
        `Retry in ${waitSec}s (at ${resetDate.toISOString()})`,
    );
    this.name = "RateLimitExceededError";
  }
}
```

### Pre-Check Pattern

Check rate limit before making request:

```typescript
async function checkRateLimit(agent: string, endpoint: string): Promise<void> {
  const result = await rateLimiter.checkLimit(agent, endpoint);
  if (!result.allowed) {
    throw new RateLimitExceededError(
      endpoint,
      result.remaining,
      result.resetAt,
    );
  }
}

// Record after successful request
await recordRateLimitedRequest(agent, endpoint);
```

## Timeout Handling

Abort long-running operations.

### AbortController Pattern

```typescript
async function callWithTimeout<T>(
  operation: (signal: AbortSignal) => Promise<T>,
  timeoutMs: number,
): Promise<T> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    // The operation receives the signal so it can abort its own I/O
    return await operation(controller.signal);
  } finally {
    // Always clear the timer, on success and on error
    clearTimeout(timeout);
  }
}
```
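
For example, a fetch call wired to the timeout signal (the URL is illustrative):

```typescript
// Aborts the request if it takes longer than 5 seconds
const response = await callWithTimeout(
  (signal) => fetch("https://example.com/api/status", { signal }),
  5000,
);
```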

## Context Preservation

Prevent context exhaustion from retries/errors.

### Hard Caps

Enforce limits on unbounded operations:

```typescript
const MAX_INBOX_LIMIT = 5; // HARD CAP
const limit = Math.min(args.limit || MAX_INBOX_LIMIT, MAX_INBOX_LIMIT);
```

### Prefer Summaries

Avoid fetching full content when headers suffice:

```typescript
// ALWAYS use include_bodies: false for inbox
const messages = await call<MessageHeader[]>("fetch_inbox", {
  project_key: state.projectKey,
  agent_name: state.agentName,
  limit: 5,
  include_bodies: false, // MANDATORY - never include bodies
});

// Use dedicated endpoint for single message bodies
await call("get_message", { message_id });

// Use summarization instead of fetching all messages
const summary = await call<ThreadSummary>("summarize_thread", {
  project_key: state.projectKey,
  thread_id: args.thread_id,
  include_examples: false,
});
```

## Anti-Patterns

### Avoid

- **Infinite retries** - Always set max retry limit
- **No backoff** - Immediate retries cause thundering herd
- **Retrying non-retryable errors** - Wastes time, delays failure detection
- **Silent degradation** - Always log when falling back
- **No circuit breaker** - Retrying when system is down compounds issues
- **Restart loops** - Use cooldown to prevent rapid restart cycles
- **Ignoring timeout errors** - Timeouts are retryable, handle them
- **Fixed delays** - Use exponential backoff, not fixed intervals
- **Missing jitter** - Synchronized retries create load spikes

## Configuration

Make resilience configurable via environment variables:

```typescript
const RETRY_CONFIG = {
  maxRetries: parseInt(process.env.MAX_RETRIES || "3"),
  baseDelayMs: parseInt(process.env.BASE_DELAY_MS || "100"),
  maxDelayMs: parseInt(process.env.MAX_DELAY_MS || "5000"),
  timeoutMs: parseInt(process.env.TIMEOUT_MS || "10000"),
  jitterPercent: 20,
};

const RECOVERY_CONFIG = {
  failureThreshold: 1,
  restartCooldownMs: 10000,
  enabled: process.env.AUTO_RESTART !== "false",
};
```

## Testing Resilience

### Fault Injection

Test error handling by simulating failures:

```typescript
// Simulate network errors
if (Math.random() < 0.3) {
  throw new Error("ECONNRESET");
}

// Simulate rate limiting
if (requestCount > limit) {
  throw new RateLimitExceededError(endpoint, 0, Date.now() + 60000);
}

// Simulate server restart (lost state)
if (simulateRestart) {
  throw new Error("Project not found");
}
```
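
Fault injection pairs naturally with the retry loop above; a minimal sketch using plain `node:assert` (the flaky operation and failure count are illustrative):

```typescript
import assert from "node:assert/strict";

// A flaky operation: fails twice with a retryable error, then succeeds
let calls = 0;
async function flakyOperation(): Promise<string> {
  calls++;
  if (calls < 3) {
    throw new Error("ECONNRESET");
  }
  return "ok";
}

const result = await callWithRetry(flakyOperation, 3);
assert.equal(result, "ok");
assert.equal(calls, 3); // Two failures were retried, the third call succeeded
```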

### Reset Test State

Provide reset functions for clean test isolation:

```typescript
export function resetRecoveryState(): void {
  consecutiveFailures = 0;
  lastRestartAttempt = 0;
  isRestarting = false;
}

export function resetFallbackWarning(): void {
  hasWarnedAboutFallback = false;
}
```

## References

- Agent Mail module (`agent-mail.ts`): Retry logic, server recovery, auto-init
- Rate Limiter module (`rate-limiter.ts`): Backend fallback (Redis → SQLite)
- Storage module (`storage.ts`): Storage fallback (semantic-memory → in-memory)
- _Designing Data-Intensive Applications_ by Martin Kleppmann: Fault tolerance, exactly-once semantics