@cheapestinference/openclaw-ratelimit-retry 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,604 @@
1
+ # retry-on-error Plugin Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Build an OpenClaw plugin that automatically retries agent conversations that fail due to provider rate limits (429), resuming them when the budget window resets.
6
+
7
+ **Architecture:** OpenClaw plugin with an `agent_end` hook for error detection, a persistent JSON queue on disk, and a background service that periodically checks for due retries and sends `chat.send` over WebSocket to the local gateway. Uses the Node.js built-in WebSocket client (Node 22+) and authenticates with the gateway token or, when no token is configured, the gateway password.
8
+
9
+ **Tech Stack:** TypeScript (ESM via Jiti), OpenClaw Plugin SDK, Node.js built-in WebSocket, fs/promises
10
+
11
+ **Spec:** `docs/superpowers/specs/2026-03-12-retry-on-error-plugin-design.md`
12
+
13
+ ---
14
+
15
+ ## Chunk 1: Project scaffolding and plugin manifest
16
+
17
+ ### Task 1: Create plugin manifest and package.json
18
+
19
+ **Files:**
20
+ - Create: `openclaw.plugin.json`
21
+ - Create: `package.json`
22
+
23
+ - [ ] **Step 1: Create openclaw.plugin.json**
24
+
25
+ ```json
26
+ {
27
+ "id": "retry-on-error",
28
+ "configSchema": {
29
+ "type": "object",
30
+ "additionalProperties": false,
31
+ "properties": {
32
+ "budgetWindowHours": {
33
+ "type": "number",
34
+ "default": 5,
35
+ "description": "Budget reset window in hours (aligned to UTC clock boundaries)"
36
+ },
37
+ "maxRetryAttempts": {
38
+ "type": "number",
39
+ "default": 3,
40
+ "description": "Maximum retry attempts per session before abandoning"
41
+ },
42
+ "checkIntervalMinutes": {
43
+ "type": "number",
44
+ "default": 5,
45
+ "description": "How often to check for pending retries (minutes)"
46
+ },
47
+ "retryMessage": {
48
+ "type": "string",
49
+ "default": "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
50
+ "description": "Message sent to the session to resume the conversation"
51
+ }
52
+ }
53
+ }
54
+ }
55
+ ```
56
+
57
+ - [ ] **Step 2: Create package.json**
58
+
59
+ ```json
60
+ {
61
+ "name": "@openclaw/retry-on-error",
62
+ "version": "1.0.0",
63
+ "description": "Automatically retry agent conversations that fail due to provider rate limits",
64
+ "type": "module",
65
+ "license": "MIT",
66
+ "repository": {
67
+ "type": "git",
68
+ "url": "https://github.com/cheapestinference/openclaw-plugin-retry-on-error"
69
+ },
70
+ "keywords": ["openclaw", "plugin", "retry", "rate-limit", "429"],
71
+ "openclaw": {
72
+ "extensions": ["./index.ts"]
73
+ }
74
+ }
75
+ ```
76
+
77
+ - [ ] **Step 3: Commit**
78
+
79
+ ```bash
80
+ git add openclaw.plugin.json package.json
81
+ git commit -m "feat: add plugin manifest and package.json"
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Chunk 2: Core implementation
87
+
88
+ ### Task 2: Create the retry service
89
+
90
+ **Files:**
91
+ - Create: `src/service.ts`
92
+
93
+ This is the main module. Contains:
94
+ - Error pattern matching (`isRetriableError`)
95
+ - Queue management (`loadQueue`, `saveQueue`, `addToQueue`)
96
+ - Next reset time calculation (`nextResetTime`)
97
+ - Minimal WebSocket client for `chat.send` (`sendRetryMessage`)
98
+ - Background service lifecycle (`createRetryService`)
99
+
100
+ - [ ] **Step 1: Create `src/service.ts` (error detection, reset-time calculation, queue, WebSocket client, and service lifecycle)**
101
+
102
+ ```typescript
103
import { mkdir, readFile, rename, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import type { OpenClawPluginService } from "openclaw/plugin-sdk";
106
+
107
+ // --- Types ---
108
+
109
/** One failed session waiting to be resumed. */
interface QueueEntry {
  /** Key of the session to resume; the queue keeps at most one entry per key. */
  sessionKey: string;
  /** Epoch milliseconds at which the failure was recorded. */
  errorTime: number;
  /** Epoch milliseconds; the entry is eligible for retry once this time has passed. */
  retryAfter: number;
  /** Error text reported for the failed run. */
  errorMessage: string;
  /** Attempt counter stored with the entry (0 for a first-time failure). */
  attempts: number;
}

/** Settings used by the retry service: plugin options plus gateway connection details. */
interface RetryConfig {
  /** Length of the budget window, in hours, used to compute the next retry time. */
  budgetWindowHours: number;
  /** Attempt count at which a session is dropped instead of re-queued. */
  maxRetryAttempts: number;
  /** Minutes between queue checks. */
  checkIntervalMinutes: number;
  /** Message sent to the session to resume it. */
  retryMessage: string;
  /** Port of the local gateway WebSocket endpoint. */
  gatewayPort: number;
  /** Gateway auth token; preferred over the password when both are set. */
  gatewayToken: string | undefined;
  /** Gateway auth password; used only when no token is set. */
  gatewayPassword: string | undefined;
}
126
+
127
+ // --- Error Detection ---
128
+
129
/**
 * Error-text patterns that mark a failure as a rate-limit / quota problem.
 *
 * The status code is matched with digit lookarounds so that it only counts as
 * a standalone number: "HTTP 429" matches, "trace id 14290" does not.
 */
const RETRIABLE_PATTERNS: readonly RegExp[] = [
  /(?<!\d)429(?!\d)/,
  /rate[_ ]?limit/i,
  /too many requests/i,
  /budget/i,
  /quota[_ ]?exceeded/i,
  /resource[_ ]?(exhausted|has been exhausted)/i,
  /tokens? per minute/i,
  /\btpm\b/i,
];

/**
 * Error-text patterns that veto a retry even when a retriable pattern also
 * matches. Status codes 401-403 use the same standalone-number rule as above,
 * so an incidental id such as "84013" no longer vetoes a real rate-limit error.
 */
const NON_RETRIABLE_PATTERNS: readonly RegExp[] = [
  /(?<!\d)40[1-3](?!\d)/,
  /invalid api key/i,
  /unauthorized/i,
  /context[_ ]?(length|overflow)/i,
  /prompt too (large|long)/i,
  /model not found/i,
  /insufficient[_ ]?credits/i,
  /malformed/i,
];

/**
 * Decides whether an error message describes a failure worth retrying later.
 *
 * @param error Error text of the failed run; `undefined`, empty, or non-string
 *   values are never retriable.
 * @returns `true` when no non-retriable pattern matches and at least one
 *   retriable pattern does.
 */
export function isRetriableError(error: string | undefined): boolean {
  // Runtime guard: callers obtain `error` through a cast, so it may not be a string.
  if (typeof error !== "string" || error === "") return false;
  if (NON_RETRIABLE_PATTERNS.some((pattern) => pattern.test(error))) return false;
  return RETRIABLE_PATTERNS.some((pattern) => pattern.test(error));
}
161
+
162
+ // --- Reset Time Calculation ---
163
+
164
/**
 * Computes when a rate-limited session may be retried: one minute past the
 * next budget-window boundary after `now`, in UTC.
 *
 * Boundaries are counted from 00:00 UTC of the current day in steps of
 * `windowHours`; a step that runs past midnight simply continues into the next
 * day (e.g. 22:15 with a 5 h window yields 01:01 the following day). A time
 * that lies exactly on a boundary is pushed to the following boundary.
 *
 * @param now Reference time; not mutated.
 * @param windowHours Window length in hours. Fractional values are floored;
 *   values that are not finite or are below 1 fall back to a 1-hour window so
 *   the result is always a valid timestamp in the future.
 * @returns Epoch milliseconds of the retry time.
 */
export function nextResetTime(now: Date, windowHours: number): number {
  // An unvalidated 0 / NaN here would produce an Invalid Date and a NaN
  // timestamp, which never compares as "due" and strands the queue entry.
  const windowSize =
    Number.isFinite(windowHours) && windowHours >= 1 ? Math.floor(windowHours) : 1;

  const hour = now.getUTCHours();
  const nextBoundary = hour - (hour % windowSize) + windowSize;

  const result = new Date(now.getTime());
  // setUTCHours rolls over into the next day(s) when the hour is >= 24.
  result.setUTCHours(nextBoundary, 1, 0, 0); // +1 minute margin
  return result.getTime();
}
189
+
190
+ // --- Queue Management ---
191
+
192
/** Maximum number of entries kept in the queue. */
const MAX_QUEUE_SIZE = 100;

/** Runtime check that a parsed JSON value has the full shape of a queue entry. */
function isQueueEntry(value: unknown): value is QueueEntry {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.sessionKey === "string" &&
    candidate.sessionKey.length > 0 &&
    typeof candidate.errorTime === "number" &&
    Number.isFinite(candidate.errorTime) &&
    typeof candidate.retryAfter === "number" &&
    Number.isFinite(candidate.retryAfter) &&
    typeof candidate.errorMessage === "string" &&
    typeof candidate.attempts === "number" &&
    Number.isInteger(candidate.attempts) &&
    candidate.attempts >= 0
  );
}

/**
 * Reads the persisted retry queue.
 *
 * Loading is best-effort: a missing or unreadable file, invalid JSON, or a
 * non-array document all yield an empty queue. Array elements that are not
 * well-formed entries (e.g. `null`, or a `retryAfter` that was serialized as
 * `null`) are dropped so later code can rely on every field being present.
 *
 * @param queuePath Path of the JSON queue file.
 * @returns The valid entries found in the file, in file order.
 */
async function loadQueue(queuePath: string): Promise<QueueEntry[]> {
  let raw: string;
  try {
    raw = await readFile(queuePath, "utf-8");
  } catch {
    return [];
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return [];
  }

  return Array.isArray(parsed) ? parsed.filter(isQueueEntry) : [];
}
204
+
205
/**
 * Persists the queue as pretty-printed JSON.
 *
 * The data is written to `<queuePath>.tmp` and then renamed over the target so
 * a crash mid-write cannot leave a truncated queue file behind. The parent
 * directory is created if needed; it is derived with `dirname` so that
 * platform-specific separators and bare filenames are handled.
 *
 * NOTE(review): every call shares the same temporary path, so two overlapping
 * saves of the same queue can race on it.
 *
 * @param queuePath Destination path of the queue file.
 * @param queue Entries to write.
 * @throws Rejects with the underlying file-system error if any step fails.
 */
async function saveQueue(queuePath: string, queue: QueueEntry[]): Promise<void> {
  await mkdir(dirname(queuePath), { recursive: true });
  const tmpPath = `${queuePath}.tmp`;
  await writeFile(tmpPath, JSON.stringify(queue, null, 2), "utf-8");
  await rename(tmpPath, queuePath);
}
212
+
213
/**
 * Returns a new queue containing `entry`, replacing any existing entry for the
 * same session and keeping at most `MAX_QUEUE_SIZE` entries.
 *
 * When the limit is exceeded the entries with the smallest `errorTime` are
 * discarded; in that case the returned array is ordered by `errorTime`.
 * The input array is not modified.
 */
function addToQueue(queue: QueueEntry[], entry: QueueEntry): QueueEntry[] {
  const next = [...queue.filter((queued) => queued.sessionKey !== entry.sessionKey), entry];
  if (next.length <= MAX_QUEUE_SIZE) return next;

  // Over capacity: order oldest-first and keep only the newest tail.
  next.sort((left, right) => left.errorTime - right.errorTime);
  return next.slice(next.length - MAX_QUEUE_SIZE);
}
224
+
225
+ // --- WebSocket Chat Client ---
226
+
227
/** Outcome of one attempt to deliver the retry message to the gateway. */
interface ChatSendResult {
  ok: boolean;
  error?: string;
}

/**
 * Opens a WebSocket to the local gateway, answers the `connect.challenge`
 * event with a `connect` request, and on success issues a single `chat.send`
 * for the given session.
 *
 * The returned promise always settles exactly once and never rejects. It
 * resolves with `ok: false` when the socket cannot be created, errors, is
 * closed before the `chat.send` response arrives, the `connect` or `chat.send`
 * request is rejected, or nothing conclusive happens within 30 seconds. The
 * socket is closed once the result is known.
 *
 * @param port Gateway port on 127.0.0.1.
 * @param token Auth token; used when set.
 * @param password Auth password; used only when no token is set.
 * @param sessionKey Session that should receive the message.
 * @param message Text to send.
 */
async function sendRetryMessage(
  port: number,
  token: string | undefined,
  password: string | undefined,
  sessionKey: string,
  message: string,
): Promise<ChatSendResult> {
  /** Subset of the gateway frame fields this client inspects. */
  interface GatewayFrame {
    type?: unknown;
    event?: unknown;
    id?: unknown;
    ok?: unknown;
    error?: { message?: unknown } | null;
  }

  let socket: WebSocket;
  try {
    socket = new WebSocket(`ws://127.0.0.1:${port}`);
  } catch (err) {
    return { ok: false, error: `WebSocket could not be created: ${String(err)}` };
  }
  const ws = socket;

  return new Promise<ChatSendResult>((resolve) => {
    let settled = false;
    let requestId = 0;
    // Ids of the requests we sent, so responses are matched to the right step
    // even if the challenge event is delivered more than once.
    let connectId: number | undefined;
    let chatId: number | undefined;

    const finish = (result: ChatSendResult): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timeout);
      resolve(result);
      try {
        ws.close();
      } catch {
        // Socket already closed or never opened; nothing left to release.
      }
    };

    const timeout = setTimeout(() => {
      finish({ ok: false, error: "Connection timeout" });
    }, 30_000);

    const describe = (frame: GatewayFrame, fallback: string): string => {
      const detail = frame.error?.message;
      return typeof detail === "string" && detail !== "" ? detail : fallback;
    };

    ws.addEventListener("error", () => {
      finish({ ok: false, error: "WebSocket connection error" });
    });

    // Without this the promise would never settle when the gateway closes the
    // connection (e.g. after rejecting auth) before answering chat.send.
    ws.addEventListener("close", () => {
      finish({ ok: false, error: "WebSocket closed before chat.send completed" });
    });

    ws.addEventListener("message", (event) => {
      let frame: GatewayFrame;
      try {
        const parsed: unknown = JSON.parse(String(event.data));
        if (typeof parsed !== "object" || parsed === null) return;
        frame = parsed as GatewayFrame;
      } catch {
        // Ignore unparseable frames
        return;
      }

      try {
        // Handle connect.challenge → send connect
        if (frame.type === "event" && frame.event === "connect.challenge") {
          const params: Record<string, unknown> = {
            minProtocol: 1,
            maxProtocol: 1,
            client: {
              name: "retry-on-error",
              displayName: "Retry on Error Plugin",
              version: "1.0.0",
              mode: "backend",
            },
            role: "operator",
            scopes: ["operator.admin"],
          };

          // Add auth
          if (token) {
            params.auth = { token };
          } else if (password) {
            params.auth = { password };
          }

          connectId = ++requestId;
          ws.send(JSON.stringify({ type: "req", id: connectId, method: "connect", params }));
          return;
        }

        if (frame.type !== "res") return;

        // Handle connect response (HelloOk) → send chat.send
        if (connectId !== undefined && frame.id === connectId) {
          if (!frame.ok) {
            finish({ ok: false, error: describe(frame, "connect failed") });
            return;
          }
          chatId = ++requestId;
          ws.send(
            JSON.stringify({
              type: "req",
              id: chatId,
              method: "chat.send",
              params: {
                sessionKey,
                message,
                idempotencyKey: `retry:${sessionKey}:${Date.now()}`,
              },
            }),
          );
          return;
        }

        // Handle chat.send response → done
        if (chatId !== undefined && frame.id === chatId) {
          finish(frame.ok ? { ok: true } : { ok: false, error: describe(frame, "chat.send failed") });
        }
      } catch (err) {
        // ws.send can throw if the socket is no longer open.
        finish({ ok: false, error: `WebSocket send failed: ${String(err)}` });
      }
    });
  });
}
325
+
326
+ // --- Service ---
327
+
328
/**
 * Builds the background retry service and the function used to enqueue failed
 * sessions.
 *
 * The service loads the persisted queue on start, checks it on a fixed
 * interval, sends the retry message to every session whose `retryAfter` has
 * passed, and persists the queue after each change.
 *
 * Attempt counting: once a retry has been sent the entry leaves the queue, so
 * the attempt number is remembered in memory. If the same session fails again
 * within one budget window of that send, the failure counts as the next
 * attempt; when the count reaches `maxRetryAttempts` the session is dropped.
 * This memory is not persisted and is lost on restart.
 *
 * @param addEntry Kept for interface compatibility; it is never called.
 *   Callers must obtain the real enqueue function from `getAddEntry()`.
 * @returns `service` (start/stop lifecycle) and `getAddEntry`, which returns
 *   the enqueue function.
 */
export function createRetryService(
  addEntry: (sessionKey: string, errorMessage: string, config: RetryConfig) => void,
): {
  service: OpenClawPluginService;
  getAddEntry: () => typeof addEntry;
} {
  type ServiceLogger = {
    info: (msg: string) => void;
    warn: (msg: string) => void;
    error: (msg: string) => void;
  };

  let queue: QueueEntry[] = [];
  let queuePath = "";
  let timer: ReturnType<typeof setInterval> | null = null;
  let retryInProgress = false;
  let logger: ServiceLogger | null = null;
  let config: RetryConfig = {
    budgetWindowHours: 5,
    maxRetryAttempts: 3,
    checkIntervalMinutes: 5,
    retryMessage: "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
    gatewayPort: 18789,
    gatewayToken: undefined,
    gatewayPassword: undefined,
  };

  // sessionKey → attempt number of the retry that was last sent, and when.
  const sentRetries = new Map<string, { attempts: number; sentAt: number }>();

  const windowMs = (): number => config.budgetWindowHours * 60 * 60 * 1000;

  /** Forgets sent retries that are older than one budget window. */
  const pruneSentRetries = (nowMs: number): void => {
    const limit = windowMs();
    for (const [key, sent] of sentRetries) {
      if (!(nowMs - sent.sentAt <= limit)) sentRetries.delete(key);
    }
  };

  /** Fire-and-forget save that reports failures instead of hiding them. */
  const persistQueue = (): void => {
    if (!queuePath) return; // service not started yet: there is no file to write
    saveQueue(queuePath, queue).catch((err: unknown) => {
      logger?.warn(`retry-on-error: failed to persist queue: ${String(err)}`);
    });
  };

  const addEntryImpl = (sessionKey: string, errorMessage: string, cfg: RetryConfig): void => {
    config = cfg;
    const now = new Date();

    const queued = queue.find((e) => e.sessionKey === sessionKey);
    const sent = sentRetries.get(sessionKey);
    sentRetries.delete(sessionKey);

    let previousAttempts: number | undefined;
    if (queued) {
      previousAttempts = queued.attempts;
    } else if (sent !== undefined && now.getTime() - sent.sentAt <= windowMs()) {
      // The retry we sent for this session failed again: continue its count.
      previousAttempts = sent.attempts;
    }
    const attempts = previousAttempts === undefined ? 0 : previousAttempts + 1;

    if (attempts >= config.maxRetryAttempts) {
      queue = queue.filter((e) => e.sessionKey !== sessionKey);
      logger?.warn(`retry-on-error: giving up on ${sessionKey} after ${attempts} attempt(s)`);
      persistQueue();
      return;
    }

    const entry: QueueEntry = {
      sessionKey,
      errorTime: now.getTime(),
      retryAfter: nextResetTime(now, config.budgetWindowHours),
      errorMessage,
      attempts,
    };

    queue = addToQueue(queue, entry);
    persistQueue();
  };

  const processTick = async (log: ServiceLogger): Promise<void> => {
    pruneSentRetries(Date.now());
    if (retryInProgress || queue.length === 0) return;
    retryInProgress = true;

    try {
      const now = Date.now();
      const ready = queue.filter((e) => e.retryAfter <= now);
      if (ready.length === 0) return;

      log.info(`retry-on-error: ${ready.length} session(s) ready for retry`);

      for (const entry of ready) {
        log.info(`retry-on-error: retrying session ${entry.sessionKey} (attempt ${entry.attempts + 1})`);

        const result = await sendRetryMessage(
          config.gatewayPort,
          config.gatewayToken,
          config.gatewayPassword,
          entry.sessionKey,
          config.retryMessage,
        );

        if (result.ok) {
          // Remove from queue — if it fails again, agent_end will re-queue and
          // the remembered attempt number lets the count continue.
          queue = queue.filter((e) => e.sessionKey !== entry.sessionKey);
          sentRetries.set(entry.sessionKey, { attempts: entry.attempts, sentAt: Date.now() });
          log.info(`retry-on-error: sent retry to ${entry.sessionKey}`);
        } else {
          log.warn(`retry-on-error: failed to send retry to ${entry.sessionKey}: ${result.error}`);
          // Leave in queue for next tick
        }
      }

      await saveQueue(queuePath, queue);
    } finally {
      retryInProgress = false;
    }
  };

  const service: OpenClawPluginService = {
    id: "retry-on-error",

    async start(ctx) {
      logger = ctx.logger;
      queuePath = join(ctx.stateDir, "retry-on-error", "queue.json");

      const gateway = (ctx.config as Record<string, any>).gateway;
      config = {
        ...config,
        gatewayPort: gateway?.port ?? 18789,
        gatewayToken: gateway?.auth?.token,
        gatewayPassword: gateway?.auth?.password,
      };

      queue = await loadQueue(queuePath);

      if (queue.length > 0) {
        ctx.logger.info(`retry-on-error: loaded ${queue.length} pending retry(s) from disk`);
      }

      // NOTE(review): at this point `config` still holds the built-in defaults
      // for the plugin options; overrides only arrive with the first enqueue
      // call, so the interval below is always the default. Confirm whether the
      // start context exposes the plugin config.
      const intervalMs = config.checkIntervalMinutes * 60 * 1000;
      if (timer) clearInterval(timer); // a repeated start must not leak the old timer
      timer = setInterval(() => {
        processTick(ctx.logger).catch((err) => {
          ctx.logger.error(`retry-on-error: tick failed: ${err}`);
        });
      }, intervalMs);

      ctx.logger.info(
        `retry-on-error: service started (window=${config.budgetWindowHours}h, check=${config.checkIntervalMinutes}min, maxAttempts=${config.maxRetryAttempts})`,
      );
    },

    async stop(ctx) {
      if (timer) {
        clearInterval(timer);
        timer = null;
      }
      if (queuePath && queue.length > 0) {
        await saveQueue(queuePath, queue);
      }
      ctx.logger.info("retry-on-error: service stopped");
    },
  };

  return { service, getAddEntry: () => addEntryImpl };
}
459
+ ```
460
+
461
+ - [ ] **Step 2: Commit**
462
+
463
+ ```bash
464
+ git add src/service.ts
465
+ git commit -m "feat: add retry service with error detection, queue, and WS client"
466
+ ```
467
+
468
+ ### Task 3: Create the plugin entry point
469
+
470
+ **Files:**
471
+ - Create: `index.ts`
472
+
473
+ - [ ] **Step 1: Create `index.ts`**
474
+
475
+ ```typescript
476
+ import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
477
+ import { createRetryService, isRetriableError } from "./src/service.js";
478
+
479
/** User-facing plugin options; every field is optional and falls back to `DEFAULT_CONFIG`. */
interface PluginConfig {
  budgetWindowHours?: number;
  maxRetryAttempts?: number;
  checkIntervalMinutes?: number;
  retryMessage?: string;
}

/** Values used for any option the user did not set. */
const DEFAULT_CONFIG: Required<PluginConfig> = {
  budgetWindowHours: 5,
  maxRetryAttempts: 3,
  checkIntervalMinutes: 5,
  retryMessage: "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
};

// No-op required by createRetryService's signature. It is never replaced or
// invoked; the working enqueue function is obtained through getAddEntry().
const addEntry: Parameters<typeof createRetryService>[0] = () => {};

const { service, getAddEntry } = createRetryService(addEntry);
497
+
498
/**
 * Turns the `error` field of an `agent_end` event into text.
 * Strings pass through, `Error` objects and objects with a string `message`
 * yield that message, anything else is stringified.
 */
function describeAgentError(raw: unknown): string {
  if (typeof raw === "string") return raw;
  if (raw instanceof Error) return raw.message;
  if (typeof raw === "object" && raw !== null) {
    const message = (raw as Record<string, unknown>).message;
    if (typeof message === "string") return message;
  }
  return String(raw);
}

/**
 * Plugin definition: registers an `agent_end` hook that queues rate-limited
 * sessions for a later retry, and registers the background retry service.
 */
const plugin = {
  id: "retry-on-error",
  name: "Retry on Error",
  description: "Automatically retry agent conversations that fail due to provider rate limits",

  register(api: OpenClawPluginApi) {
    const cfg = {
      ...DEFAULT_CONFIG,
      ...(api.pluginConfig as PluginConfig | undefined),
    };

    // Register the agent_end hook
    api.on("agent_end", (event, ctx) => {
      const { error: rawError, success } = event as Record<string, unknown>;

      // Only process failures
      if (success || !rawError) return;

      // The payload is not guaranteed to be a string at runtime; normalise it
      // before calling string methods on it.
      const error = describeAgentError(rawError);
      const sessionKey = (ctx as Record<string, unknown>).sessionKey;

      // Only process retriable errors
      if (!isRetriableError(error)) {
        api.logger.debug?.(`retry-on-error: non-retriable error on ${String(sessionKey)}: ${error.slice(0, 100)}`);
        return;
      }

      if (typeof sessionKey !== "string" || sessionKey === "") return;

      api.logger.info(`retry-on-error: queuing retry for ${sessionKey} (error: ${error.slice(0, 100)})`);

      const gateway = (api.config as Record<string, any>).gateway;
      const resolvedConfig = {
        ...cfg,
        gatewayPort: gateway?.port ?? 18789,
        gatewayToken: gateway?.auth?.token,
        gatewayPassword: gateway?.auth?.password,
      };

      getAddEntry()(sessionKey, error, resolvedConfig);
    });

    // Register the background service
    api.registerService(service);

    api.logger.info("retry-on-error: plugin registered");
  },
};

export default plugin;
546
+ ```
547
+
548
+ - [ ] **Step 2: Commit**
549
+
550
+ ```bash
551
+ git add index.ts
552
+ git commit -m "feat: add plugin entry point with agent_end hook"
553
+ ```
554
+
555
+ ---
556
+
557
+ ## Chunk 3: README and repository setup
558
+
559
+ ### Task 4: Create production README
560
+
561
+ **Files:**
562
+ - Create: `README.md`
563
+ - Create: `LICENSE`
564
+ - Create: `.gitignore`
565
+
566
+ - [ ] **Step 1: Create README.md**
567
+
568
+ Full production README with:
569
+ - Clear problem statement
570
+ - Installation instructions
571
+ - Configuration reference
572
+ - How it works (architecture diagram)
573
+ - Edge cases and limitations
574
+ - Contributing section
575
+
576
+ - [ ] **Step 2: Create LICENSE (MIT)**
577
+
578
+ - [ ] **Step 3: Create .gitignore**
579
+
580
+ ```
581
+ node_modules/
582
+ *.tmp
583
+ ```
584
+
585
+ - [ ] **Step 4: Commit**
586
+
587
+ ```bash
588
+ git add README.md LICENSE .gitignore
589
+ git commit -m "docs: add production README, LICENSE, and .gitignore"
590
+ ```
591
+
592
+ ### Task 5: Create GitHub repository and push
593
+
594
+ - [ ] **Step 1: Create private repo on GitHub**
595
+
596
+ ```bash
597
+ gh repo create cheapestinference/openclaw-plugin-retry-on-error --private --source=. --push
598
+ ```
599
+
600
+ - [ ] **Step 2: Verify**
601
+
602
+ ```bash
603
+ gh repo view cheapestinference/openclaw-plugin-retry-on-error
604
+ ```