opencode-swarm-plugin 0.12.24 → 0.12.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
New file: the `resilience-patterns` skill (648 lines added).
---
name: resilience-patterns
description: Error recovery, retry strategies, and graceful degradation patterns. Use when handling failures, implementing retries, designing fallback strategies, or building fault-tolerant systems. Covers exponential backoff, circuit breakers, and backend fallbacks.
---

# Resilience Patterns

Error recovery, retry strategies, and graceful degradation for fault-tolerant systems.

## Core Principle

**Design for failure, not perfection.** Reduce the probability that faults become system failures by recovering gracefully, degrading functionality intelligently, and learning from failure patterns.

## Error Classification

Classify errors before deciding on a recovery strategy.

### Retryable Errors (Transient)

Network and server issues that typically resolve on retry:

- Connection refused/reset (`ECONNREFUSED`, `ECONNRESET`)
- Timeouts (network, socket, aborted requests)
- Server overload (502, 503, 504 HTTP codes)
- Temporary unavailability ("unexpected error" from dependencies)

### Non-Retryable Errors (Permanent)

Logic bugs, validation failures, resource constraints:

- Authentication failures (401, 403)
- Not found errors (404)
- Validation errors (400)
- Server errors from logic bugs (500)
- Resource not found (project/agent/entity missing)

### Detection Pattern

```typescript
function isRetryableError(error: unknown): boolean {
  if (error instanceof Error) {
    const message = error.message.toLowerCase();

    // Network errors
    if (
      message.includes("econnrefused") ||
      message.includes("econnreset") ||
      message.includes("timeout") ||
      message.includes("network") ||
      message.includes("socket") ||
      message.includes("aborted")
    ) {
      return true;
    }

    // Server errors (but not 500 - usually logic bugs)
    // CustomError is a stand-in for an error type carrying an HTTP status code
    if (error instanceof CustomError && error.code) {
      return error.code === 502 || error.code === 503 || error.code === 504;
    }

    // Recoverable unexpected errors
    if (message.includes("unexpected error")) {
      return true;
    }
  }

  return false;
}
```

## Retry Strategies

### Exponential Backoff with Jitter

Prevents thundering herd, spreads retry load.

**Formula**: `delay = min(baseDelay * 2^(attempt-1), maxDelay) ± jitter`

**Configuration**:

- `baseDelay`: Starting delay (e.g., 100ms)
- `maxDelay`: Cap on delay growth (e.g., 5000ms)
- `maxRetries`: Retry limit (e.g., 3)
- `jitterPercent`: Randomness range (e.g., 20%)

**Implementation**:

```typescript
// baseDelayMs, maxDelayMs, and jitterPercent are module-level settings
// (see the Configuration section below)
function calculateBackoffDelay(attempt: number): number {
  if (attempt === 0) return 0;

  const exponentialDelay = baseDelayMs * Math.pow(2, attempt - 1);
  const cappedDelay = Math.min(exponentialDelay, maxDelayMs);

  // Add jitter (±jitterPercent%)
  const jitterRange = cappedDelay * (jitterPercent / 100);
  const jitter = (Math.random() * 2 - 1) * jitterRange;

  return Math.round(cappedDelay + jitter);
}
```

**Example Delays** (base=100ms, max=5000ms, jitter=20%):

- Attempt 1: ~100ms ± 20ms
- Attempt 2: ~200ms ± 40ms
- Attempt 3: ~400ms ± 80ms
- Attempt 4: ~800ms ± 160ms
- Attempt 5: ~1600ms ± 320ms
- Attempt 6: ~3200ms ± 640ms
- Attempt 7+: ~5000ms ± 1000ms (capped)

### Retry Loop Pattern

```typescript
async function callWithRetry<T>(
  operation: () => Promise<T>,
  maxRetries: number = 3,
): Promise<T> {
  let lastError: Error | null = null;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    // Apply backoff delay (except first attempt)
    if (attempt > 0) {
      const delay = calculateBackoffDelay(attempt);
      console.warn(`Retry ${attempt}/${maxRetries} after ${delay}ms`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }

    try {
      const result = await operation();
      return result; // Success - reset failure tracking
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));

      // Check if error is retryable
      if (!isRetryableError(error)) {
        console.warn(`Non-retryable error: ${lastError.message}`);
        throw lastError;
      }

      // Last retry exhausted
      if (attempt === maxRetries) {
        console.error(`All ${maxRetries} retries exhausted`);
        throw lastError;
      }
    }
  }

  throw lastError || new Error("Unknown error in retry loop");
}
```
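
Usage is a one-line wrap around any async operation. A minimal sketch (the URL is hypothetical): network-level failures reject the promise and are classified by `isRetryableError`, while HTTP error statuses resolve normally and remain the caller's responsibility.

```typescript
// Sketch: retry a flaky internal call up to 3 times with backoff + jitter
const response = await callWithRetry(() =>
  fetch("https://internal.example/api/health"),
);
```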

## Circuit Breaker Pattern

Stop retrying when repeated failures indicate a systemic issue.

### State Machine

Track consecutive failures to decide when to stop:

```typescript
let consecutiveFailures = 0;
const failureThreshold = 3; // Circuit opens after 3 failures

// On success
consecutiveFailures = 0;

// On retryable error
consecutiveFailures++;
if (consecutiveFailures >= failureThreshold) {
  // Circuit open - attempt recovery instead of retry
  await attemptRecovery();
}
```
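
The sections below use health checks and restarts as the recovery action. For comparison, here is a self-contained sketch of the classic closed → open → half-open cycle; the `CircuitBreaker` class and its parameter names are illustrative, not part of this plugin.

```typescript
type CircuitState = "closed" | "open" | "half-open";

class CircuitBreaker {
  private state: CircuitState = "closed";
  private failures = 0;
  private openedAt = 0;

  constructor(
    private readonly failureThreshold = 3,
    private readonly cooldownMs = 10_000,
  ) {}

  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === "open") {
      if (Date.now() - this.openedAt < this.cooldownMs) {
        // Fail fast while the circuit is open
        throw new Error("Circuit open - failing fast");
      }
      // Cooldown elapsed: allow a single trial request
      this.state = "half-open";
    }

    try {
      const result = await operation();
      // Success closes the circuit and clears the failure count
      this.state = "closed";
      this.failures = 0;
      return result;
    } catch (error) {
      this.failures++;
      if (this.state === "half-open" || this.failures >= this.failureThreshold) {
        this.state = "open";
        this.openedAt = Date.now();
      }
      throw error;
    }
  }
}
```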

### Health Checks

Probe service availability before opening the circuit:

```typescript
async function isServiceHealthy(): Promise<boolean> {
  try {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 3000);

    const response = await fetch(`${SERVICE_URL}/health`, {
      signal: controller.signal,
    });
    clearTimeout(timeout);

    return response.ok;
  } catch {
    return false;
  }
}
```

### Recovery Actions

When the circuit opens, attempt automated recovery:

```typescript
// Check health before attempting recovery
const healthy = await isServiceHealthy();
if (!healthy) {
  await restartService();
  // Reset failure counter on successful restart
  consecutiveFailures = 0;
}
```

## Graceful Degradation

Reduce functionality rather than fail outright.

### Backend Fallback Pattern

Primary/secondary backend with automatic switching.

**Redis → SQLite Example**:

```typescript
async function createWithFallback(): Promise<Client> {
  const maxRetries = 3;
  const retryDelays = [100, 500, 1000]; // Increasing delay between attempts

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const redis = new Redis(redisUrl, {
        connectTimeout: 2000,
        maxRetriesPerRequest: 1,
        retryStrategy: () => null, // Don't retry internally
        lazyConnect: true,
      });

      // Test connection
      await redis.connect();
      await redis.ping();

      return new RedisClient(redis);
    } catch (error) {
      const isLastAttempt = attempt === maxRetries - 1;

      if (isLastAttempt) {
        // All retries exhausted - fall back to SQLite
        console.warn(
          `Redis connection failed after ${maxRetries} attempts, falling back to SQLite`,
        );
        return new SqliteClient(sqlitePath);
      }

      // Wait before retrying
      await new Promise((resolve) => setTimeout(resolve, retryDelays[attempt]));
    }
  }

  // Unreachable due to return in last attempt, but satisfies TypeScript
  return new SqliteClient(sqlitePath);
}
```

### Warn Once Pattern

Avoid log spam when degraded:

```typescript
let hasWarnedAboutFallback = false;

if (!hasWarnedAboutFallback) {
  console.warn(`Primary backend unavailable, using fallback`);
  hasWarnedAboutFallback = true;
}
```

### Feature Toggles

Disable non-essential features when degraded:

```typescript
const available = await checkPrimaryBackend();
if (!available) {
  return {
    error: "Primary backend not available",
    available: false,
    hint: "Start primary backend or continue with reduced functionality",
    fallback: "Operating in degraded mode - some features unavailable",
  };
}
```

## Server Recovery

Automated restart for unrecoverable states.

### Recovery State Machine

```typescript
let consecutiveFailures = 0;
let lastRestartAttempt = 0;
let isRestarting = false;

const RECOVERY_CONFIG = {
  failureThreshold: 1, // Restart after 1 "unexpected error"
  restartCooldownMs: 10000, // 10 second cooldown between restarts
  enabled: true, // Can disable via env var
};
```

### Restart Cooldown

Prevent restart loops:

```typescript
async function restartServer(): Promise<boolean> {
  // Prevent concurrent restarts
  if (isRestarting) {
    console.warn("Restart already in progress");
    return false;
  }

  // Respect cooldown
  const now = Date.now();
  if (now - lastRestartAttempt < RECOVERY_CONFIG.restartCooldownMs) {
    const waitSec = Math.ceil(
      (RECOVERY_CONFIG.restartCooldownMs - (now - lastRestartAttempt)) / 1000,
    );
    console.warn(`Restart cooldown active, wait ${waitSec}s`);
    return false;
  }

  isRestarting = true;
  lastRestartAttempt = now;

  try {
    // Kill existing process
    // Start new process
    // Wait for health check
    consecutiveFailures = 0;
    return true;
  } catch (error) {
    console.error("Restart failed:", error);
    return false;
  } finally {
    isRestarting = false;
  }
}
```

### Aggressive Recovery

Restart immediately on specific error patterns:

```typescript
// Runs inside the retry loop's catch block; `attempt` and `continue`
// refer to that loop
const isUnexpectedError = errorMessage.includes("unexpected error");
if (isUnexpectedError && !restartAttempted && RECOVERY_CONFIG.enabled) {
  console.warn("Unexpected error detected, restarting server immediately...");
  restartAttempted = true;
  const restarted = await restartServer();

  if (restarted) {
    // Clear caches
    availabilityCache = null;
    consecutiveFailures = 0;

    // Small delay for server to stabilize
    await new Promise((resolve) => setTimeout(resolve, 1000));

    // Don't count this as a retry attempt - retry immediately
    attempt--;
    continue;
  }
}
```

## Self-Healing Patterns

Automatic re-registration after server restarts.

### Not Found Detection

A server restart loses in-memory state, so "not found" errors can signal lost registrations rather than genuinely missing entities:

```typescript
function isProjectNotFoundError(error: unknown): boolean {
  if (error instanceof Error) {
    const message = error.message.toLowerCase();
    return (
      message.includes("project") &&
      (message.includes("not found") || message.includes("does not exist"))
    );
  }
  return false;
}

function isAgentNotFoundError(error: unknown): boolean {
  if (error instanceof Error) {
    const message = error.message.toLowerCase();
    return (
      message.includes("agent") &&
      (message.includes("not found") || message.includes("does not exist"))
    );
  }
  return false;
}
```

### Auto-Init Wrapper

Re-register after detecting lost state:

```typescript
async function callWithAutoInit<T>(
  toolName: string,
  args: { project_key: string; agent_name?: string },
  options?: { taskDescription?: string; maxReregistrationAttempts?: number },
): Promise<T> {
  const maxAttempts = options?.maxReregistrationAttempts ?? 1;
  let reregistrationAttempts = 0;

  while (true) {
    try {
      return await call<T>(toolName, args);
    } catch (error) {
      const isProjectError = isProjectNotFoundError(error);
      const isAgentError = isAgentNotFoundError(error);

      if (!isProjectError && !isAgentError) {
        throw error; // Not recoverable
      }

      if (reregistrationAttempts >= maxAttempts) {
        console.error(`Exhausted ${maxAttempts} re-registration attempt(s)`);
        throw error;
      }

      reregistrationAttempts++;
      console.warn(
        `Detected "not found", re-registering (attempt ${reregistrationAttempts})...`,
      );

      // Re-register project first (always needed)
      await reRegisterProject(args.project_key);

      // Re-register agent if needed
      if (args.agent_name) {
        await reRegisterAgent(
          args.project_key,
          args.agent_name,
          options?.taskDescription,
        );
      }

      console.warn(`Retrying ${toolName} after re-registration...`);
      // Loop continues to retry
    }
  }
}
```
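
For reference, a usage sketch (the tool name and argument values are hypothetical): any project-scoped call can be wrapped so that a restarted server is transparently re-initialized before the call is retried.

```typescript
// Sketch: the wrapper re-registers the project/agent on "not found" errors
const inbox = await callWithAutoInit<MessageHeader[]>("fetch_inbox", {
  project_key: "my-project",
  agent_name: "worker-1",
});
```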

## Rate Limiting Resilience

Handle rate limit errors with informative feedback.

### Rate Limit Detection

```typescript
class RateLimitExceededError extends Error {
  constructor(
    public readonly endpoint: string,
    public readonly remaining: number,
    public readonly resetAt: number,
  ) {
    const resetDate = new Date(resetAt);
    const waitMs = Math.max(0, resetAt - Date.now());
    const waitSec = Math.ceil(waitMs / 1000);

    super(
      `Rate limit exceeded for ${endpoint}. ` +
        `${remaining} remaining. ` +
        `Retry in ${waitSec}s (at ${resetDate.toISOString()})`,
    );
    this.name = "RateLimitExceededError";
  }
}
```

### Pre-Check Pattern

Check rate limit before making request:

```typescript
async function checkRateLimit(agent: string, endpoint: string): Promise<void> {
  const result = await rateLimiter.checkLimit(agent, endpoint);
  if (!result.allowed) {
    throw new RateLimitExceededError(
      endpoint,
      result.remaining,
      result.resetAt,
    );
  }
}

// Record after successful request
await recordRateLimitedRequest(agent, endpoint);
```
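
A caller can use the `resetAt` carried by the error to wait out the window and try again. A sketch, assuming a single wait-and-retry is an acceptable policy for the endpoint (the wrapper name is illustrative):

```typescript
async function callRespectingRateLimit<T>(
  agent: string,
  endpoint: string,
  operation: () => Promise<T>,
): Promise<T> {
  try {
    await checkRateLimit(agent, endpoint);
    return await operation();
  } catch (error) {
    if (error instanceof RateLimitExceededError) {
      // Wait until the reported reset time, then retry once
      const waitMs = Math.max(0, error.resetAt - Date.now());
      console.warn(`Rate limited on ${endpoint}, waiting ${waitMs}ms`);
      await new Promise((resolve) => setTimeout(resolve, waitMs));
      await checkRateLimit(agent, endpoint);
      return await operation();
    }
    throw error;
  }
}
```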

## Timeout Handling

Abort long-running operations.

### AbortController Pattern

```typescript
async function callWithTimeout<T>(
  operation: (signal: AbortSignal) => Promise<T>,
  timeoutMs: number,
): Promise<T> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    // The operation receives the signal and passes it to its I/O call,
    // e.g. fetch(url, { signal })
    return await operation(controller.signal);
  } finally {
    clearTimeout(timeout);
  }
}
```
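
Usage sketch (the URL is hypothetical):

```typescript
// Abort the request if it takes longer than 5 seconds
const response = await callWithTimeout(
  (signal) => fetch("https://internal.example/api/health", { signal }),
  5000,
);
```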

## Context Preservation

Prevent context exhaustion from retries/errors.

### Hard Caps

Enforce limits on unbounded operations:

```typescript
const MAX_INBOX_LIMIT = 5; // HARD CAP
const limit = Math.min(args.limit || MAX_INBOX_LIMIT, MAX_INBOX_LIMIT);
```

### Prefer Summaries

Avoid fetching full content when headers suffice:

```typescript
// ALWAYS use include_bodies: false for inbox
const messages = await call<MessageHeader[]>("fetch_inbox", {
  project_key: state.projectKey,
  agent_name: state.agentName,
  limit: 5,
  include_bodies: false, // MANDATORY - never include bodies
});

// Use dedicated endpoint for single message bodies
await call("get_message", { message_id });

// Use summarization instead of fetching all messages
const summary = await call<ThreadSummary>("summarize_thread", {
  project_key: state.projectKey,
  thread_id: args.thread_id,
  include_examples: false,
});
```

## Anti-Patterns

### Avoid

- **Infinite retries** - Always set max retry limit
- **No backoff** - Immediate retries cause thundering herd
- **Retrying non-retryable errors** - Wastes time, delays failure detection
- **Silent degradation** - Always log when falling back
- **No circuit breaker** - Retrying when system is down compounds issues
- **Restart loops** - Use cooldown to prevent rapid restart cycles
- **Ignoring timeout errors** - Timeouts are retryable, handle them
- **Fixed delays** - Use exponential backoff, not fixed intervals
- **Missing jitter** - Synchronized retries create load spikes

## Configuration

Make resilience configurable via environment variables:

```typescript
const RETRY_CONFIG = {
  maxRetries: parseInt(process.env.MAX_RETRIES || "3", 10),
  baseDelayMs: parseInt(process.env.BASE_DELAY_MS || "100", 10),
  maxDelayMs: parseInt(process.env.MAX_DELAY_MS || "5000", 10),
  timeoutMs: parseInt(process.env.TIMEOUT_MS || "10000", 10),
  jitterPercent: 20,
};

const RECOVERY_CONFIG = {
  failureThreshold: 1,
  restartCooldownMs: 10000,
  enabled: process.env.AUTO_RESTART !== "false",
};
```

## Testing Resilience

### Fault Injection

Test error handling by simulating failures:

```typescript
// Simulate network errors
if (Math.random() < 0.3) {
  throw new Error("ECONNRESET");
}

// Simulate rate limiting
if (requestCount > limit) {
  throw new RateLimitExceededError(endpoint, 0, Date.now() + 60000);
}

// Simulate server restart (lost state)
if (simulateRestart) {
  throw new Error("Project not found");
}
```
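
Injected faults pair naturally with the retry helpers above. A test sketch, assuming a vitest/bun-style `test`/`expect` API:

```typescript
import { expect, test } from "vitest"; // or "bun:test"

test("callWithRetry retries transient failures", async () => {
  let calls = 0;
  const flaky = async () => {
    calls++;
    if (calls < 3) throw new Error("ECONNRESET"); // retryable per isRetryableError
    return "ok";
  };

  await expect(callWithRetry(flaky, 3)).resolves.toBe("ok");
  expect(calls).toBe(3);
});
```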

### Reset Test State

Provide reset functions for clean test isolation:

```typescript
export function resetRecoveryState(): void {
  consecutiveFailures = 0;
  lastRestartAttempt = 0;
  isRestarting = false;
}

export function resetFallbackWarning(): void {
  hasWarnedAboutFallback = false;
}
```

## References

- Agent Mail module (`agent-mail.ts`): Retry logic, server recovery, auto-init
- Rate Limiter module (`rate-limiter.ts`): Backend fallback (Redis → SQLite)
- Storage module (`storage.ts`): Storage fallback (semantic-memory → in-memory)
- _Designing Data-Intensive Applications_ by Martin Kleppmann: Fault tolerance, exactly-once semantics