@cheapestinference/openclaw-ratelimit-retry 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ # OpenClaw Plugin: retry-on-error
2
+
3
+ ## Problem
4
+
5
+ When the inference provider (CheapestInference via LiteLLM) returns 429 rate limit errors due to budget exhaustion (5-hour fixed window), all running agent tasks and conversations stop. When the budget resets, nothing resumes automatically. Users must manually re-trigger each conversation, and if the dashboard is closed, there is no way to resume at all.
6
+
7
+ ## Solution
8
+
9
+ An OpenClaw plugin (`retry-on-error`) that:
10
+
11
+ 1. Detects retriable provider errors via the `agent_end` hook
12
+ 2. Parks failed sessions in a persistent queue on disk
13
+ 3. Runs a background service that retries parked sessions when the budget window resets
14
+ 4. Uses OpenClaw's internal `GatewayClient` to send `chat.send` to the local gateway, resuming conversations with their full transcript context
15
+
16
+ ## Architecture
17
+
18
+ ### Plugin Structure
19
+
20
+ ```
21
+ ~/.openclaw/extensions/retry-on-error/
22
+ ├── openclaw.plugin.json # Plugin manifest
23
+ ├── package.json # NPM metadata
24
+ ├── index.ts # Entry point: registers hook + service
25
+ └── src/
26
+ └── service.ts # Background retry service
27
+ ```
28
+
29
+ ### Components
30
+
31
+ #### 1. Error Detection (hook: `agent_end`)
32
+
33
+ The `agent_end` hook receives `{ messages, success, error, durationMs }` with `PluginHookAgentContext` which includes `sessionKey`.
34
+
35
+ The `error` field is a **plain string** (not an HTTP status code). Error strings from the CheapestInference/LiteLLM stack arrive in formats like `"Error code: 429 - ..."` or `"RateLimitError: ..."`.
36
+
37
+ **Retriable error patterns** (string matching, case-insensitive):
38
+ - `"429"` (catches `"Error code: 429"`)
39
+ - `"rate limit"` / `"rate_limit"` / `"too many requests"`
40
+ - `"budget"` / `"quota exceeded"`
41
+ - `"resource_exhausted"` / `"resource has been exhausted"`
42
+
43
+ **Non-retriable errors** (ignored):
44
+ - Auth errors ("invalid api key", "unauthorized", "401", "403")
45
+ - Format errors ("invalid request", "malformed")
46
+ - Model not found ("404", "model not found")
47
+ - Context overflow ("context length", "prompt too large")
48
+ - Billing errors ("402", "insufficient credits") — require user action
49
+
50
+ #### 2. Persistent Queue
51
+
52
+ Path: `path.join(ctx.stateDir, 'retry-on-error', 'queue.json')` (resolves to `~/.openclaw/retry-on-error/queue.json`).
53
+
54
+ ```json
55
+ [
56
+ {
57
+ "sessionKey": "agent:myagent:main",
58
+ "errorTime": 1710000000000,
59
+ "retryAfter": 1710018000000,
60
+ "errorMessage": "Error code: 429 - rate limit exceeded",
61
+ "attempts": 0
62
+ }
63
+ ]
64
+ ```
65
+
66
+ **Deduplication**: Only one entry per `sessionKey`. If the same session errors again, the existing entry is updated with incremented `attempts` and recalculated `retryAfter`.
67
+
68
+ **Retry time calculation**: Computes the next 5-hour budget window boundary aligned to midnight (00:00, 05:00, 10:00, 15:00, 20:00 UTC) plus 1 minute margin. LiteLLM uses UTC-aligned boundaries per its `get_next_standardized_reset_time()` implementation.
69
+
70
+ ```
71
+ function nextResetTime(now, windowHours):
72
+ currentHour = now.getUTCHours()
73
+ nextBoundary = currentHour + windowHours - (currentHour % windowHours)
74
+ if currentHour % windowHours == 0 and minutes == 0:
75
+ nextBoundary = currentHour + windowHours
76
+ // Handle day overflow (nextBoundary >= 24)
77
+ return date at nextBoundary:01:00 UTC
78
+ ```
79
+
80
+ **Atomic writes**: Write to a temp file, then rename (atomic rename) to prevent corruption on crashes.
81
+
82
+ **Max queue size**: Capped at 100 entries. Oldest entries evicted when full.
83
+
84
+ #### 3. Background Service (`registerService`)
85
+
86
+ **`start(ctx)`**:
87
+ 1. Read gateway port: `ctx.config.gateway?.port ?? 18789`
88
+ 2. Capture actual port via `gateway_start` hook if available
89
+ 3. Load `queue.json` from `ctx.stateDir` (recovers state after restarts)
90
+ 4. Start interval timer (every `checkIntervalMinutes`, default 5 minutes)
91
+ 5. On each tick (guarded by `retryInProgress` flag to prevent overlapping batches):
92
+ - Filter queue items where `retryAfter < Date.now()`
93
+ - If items ready: connect `GatewayClient` to local gateway
94
+ - Send `chat.send` for each ready session (fire-and-forget, see Response Model)
95
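+ - On ack success: keep the entry (so `attempts` is preserved), push `retryAfter` to the next window boundary, and log; the entry is removed once `agent_end` later reports success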
96
+ - On connection/send error: leave in queue and push `retryAfter` to the next window boundary, so an unreachable gateway is not retried on every tick
97
+ - Save updated `queue.json` to disk (atomic write)
98
+
99
+ **`stop(ctx)`**:
100
+ 1. Clear interval timer
101
+ 2. Disconnect `GatewayClient` if connected
102
+ 3. Save queue to disk
103
+
104
+ #### 4. Gateway Client (authentication)
105
+
106
+ Uses OpenClaw's internal `GatewayClient` class instead of a custom WebSocket implementation. This handles:
107
+ - `connect.challenge` handshake
108
+ - Device identity authentication (`loadOrCreateDeviceIdentity`, `buildDeviceAuthPayloadV3`)
109
+ - Reconnection with exponential backoff
110
+ - Protocol negotiation
111
+
112
+ The `GatewayClient` is imported from `openclaw` internals (resolved via Jiti at runtime). Connection is ephemeral: opens when retries are pending, closes after processing.
113
+
114
+ Auth token can alternatively be read from `ctx.config.gateway?.auth?.token` if device identity is not available.
115
+
116
+ #### 5. Response Model (fire-and-forget with re-detection)
117
+
118
+ `chat.send` returns an immediate ack `{ ok: true, runId, status: "started" }`, NOT the final result. The plugin does NOT wait for the agent run to complete.
119
+
120
+ **If the retry succeeds**: The agent processes the message normally. No further action needed.
121
+
122
+ **If the retry fails again with 429**: The `agent_end` hook fires again with the same `sessionKey`. The deduplication logic updates the existing queue entry: increments `attempts`, recalculates `retryAfter` to the next 5h window. This natural loop continues until the retry succeeds or `attempts >= maxRetryAttempts`.
123
+
124
+ **Idempotency key generation**: Each retry uses `retry:${sessionKey}:${Date.now()}` to prevent replay/deduplication conflicts with previous messages.
125
+
126
+ ### Configuration
127
+
128
+ Plugin config in OpenClaw settings:
129
+
130
+ ```yaml
131
+ plugins:
132
+ retry-on-error:
133
+ budgetWindowHours: 5 # Budget reset window (hours)
134
+ maxRetryAttempts: 3 # Max retries per session before abandoning
135
+ checkIntervalMinutes: 5 # How often to check for pending retries
136
+ retryMessage: "Continue where you left off. The previous attempt failed due to a rate limit that has now reset."
137
+ ```
138
+
139
+ Config schema (in `openclaw.plugin.json`):
140
+
141
+ ```json
142
+ {
143
+ "id": "retry-on-error",
144
+ "configSchema": {
145
+ "type": "object",
146
+ "additionalProperties": false,
147
+ "properties": {
148
+ "budgetWindowHours": {
149
+ "type": "number",
150
+ "default": 5,
151
+ "description": "Budget reset window in hours"
152
+ },
153
+ "maxRetryAttempts": {
154
+ "type": "number",
155
+ "default": 3,
156
+ "description": "Maximum retry attempts per session"
157
+ },
158
+ "checkIntervalMinutes": {
159
+ "type": "number",
160
+ "default": 5,
161
+ "description": "Interval between retry checks in minutes"
162
+ },
163
+ "retryMessage": {
164
+ "type": "string",
165
+ "default": "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
166
+ "description": "Message sent to resume the conversation"
167
+ }
168
+ }
169
+ }
170
+ }
171
+ ```
172
+
173
+ ### Edge Cases
174
+
175
+ | Scenario | Behavior |
176
+ |----------|----------|
177
+ | Server restarts | `start()` reloads `queue.json` from disk |
178
+ | Multiple errors same session | Deduplicate by `sessionKey` (update existing entry) |
179
+ | Retry also fails with 429 | `agent_end` hook fires again → re-queues with incremented attempts |
180
+ | Gateway unreachable during retry | Catch connection error, leave in queue, retry after the next window boundary |
181
+ | `attempts >= maxRetryAttempts` | Remove from queue, log warning |
182
+ | 24 not divisible by windowHours | Handle day overflow (hour >= 24 wraps to next day) |
183
+ | Sub-agent session error | Same treatment — sessionKey format `agent:X:subagent:Y` handled identically |
184
+ | Timer fires during active retry | `retryInProgress` guard prevents overlapping batches |
185
+ | Queue file corrupted | Catch JSON parse error, start with empty queue, log warning |
186
+ | Queue exceeds 100 entries | Evict oldest entries |
187
+
188
+ ### Why `chat.send` (not `/hooks` endpoint)
189
+
190
+ The `/hooks` endpoint creates "isolated agent turns" (cron-like). Using `chat.send` with the original `sessionKey` is equivalent to a user sending a message manually — the gateway loads the complete JSONL transcript and the agent resumes with full context. This is the correct behavior for conversation resumption.
191
+
192
+ ### Dependencies
193
+
194
+ Runtime: None (zero runtime dependencies).
195
+
196
+ Dev/type-only:
197
+ - `openclaw` — devDependency for types, resolved at runtime via Jiti alias
198
+ - `GatewayClient` — imported from `openclaw` internals at runtime
199
+
200
+ ### Installation
201
+
202
+ ```bash
203
+ # Copy to global extensions directory
204
+ cp -r retry-on-error ~/.openclaw/extensions/
205
+
206
+ # Enable in OpenClaw config
207
+ openclaw config set plugins.retry-on-error.budgetWindowHours 5
208
+ openclaw config set plugins.retry-on-error.maxRetryAttempts 3
209
+ ```
210
+
211
+ No `npm install` needed (zero runtime dependencies).
package/index.ts ADDED
@@ -0,0 +1,67 @@
1
+ import type { OpenClawPluginApi } from "openclaw/plugin-sdk";
2
+ import { createRetryService, isRetriableError } from "./src/service.js";
3
+
4
/** User-tunable plugin settings; every field is optional and falls back to `DEFAULT_CONFIG`. */
interface PluginConfig {
  budgetWindowHours?: number;
  maxRetryAttempts?: number;
  checkIntervalMinutes?: number;
  retryMessage?: string;
}

/** Values used for any setting the user did not provide (or provided with an unusable value). */
const DEFAULT_CONFIG: Required<PluginConfig> = {
  budgetWindowHours: 5,
  maxRetryAttempts: 3,
  checkIntervalMinutes: 5,
  retryMessage: "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
};

/**
 * Builds the effective plugin configuration from the raw plugin config object.
 *
 * A plain object spread would let an explicit `undefined` (or a non-numeric /
 * non-positive value) overwrite a default, which later breaks comparisons such
 * as `attempts >= maxRetryAttempts`. Each field is therefore validated and
 * falls back to its default individually.
 *
 * @param raw - Whatever the host exposes as the plugin config; may be absent.
 * @returns A fully populated configuration.
 */
function resolvePluginConfig(raw: unknown): Required<PluginConfig> {
  const source: Record<string, unknown> =
    typeof raw === "object" && raw !== null ? (raw as Record<string, unknown>) : {};

  const positiveNumber = (value: unknown, fallback: number): number =>
    typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;

  const message = source.retryMessage;

  return {
    budgetWindowHours: positiveNumber(source.budgetWindowHours, DEFAULT_CONFIG.budgetWindowHours),
    maxRetryAttempts: positiveNumber(source.maxRetryAttempts, DEFAULT_CONFIG.maxRetryAttempts),
    checkIntervalMinutes: positiveNumber(source.checkIntervalMinutes, DEFAULT_CONFIG.checkIntervalMinutes),
    retryMessage: typeof message === "string" && message.trim() !== "" ? message : DEFAULT_CONFIG.retryMessage,
  };
}

/**
 * Normalises the `error` field of an `agent_end` event to a string.
 *
 * The handler slices and pattern-matches the error, so a non-string value
 * (an `Error` instance, an object) must be converted rather than cast.
 *
 * @returns The error text, or `undefined` when no error was reported.
 */
function errorToText(raw: unknown): string | undefined {
  if (raw === undefined || raw === null) return undefined;
  if (typeof raw === "string") return raw;
  if (raw instanceof Error) return raw.message;
  return String(raw);
}

// A single service instance is shared by the hook (producer) and the background timer (consumer).
const { service, addEntry, removeEntry } = createRetryService();

const plugin = {
  id: "ratelimit-retry",
  name: "Ratelimit Retry",
  description: "Automatically retry agent conversations that fail due to provider rate limits",

  /**
   * Wires the plugin into the host: subscribes to `agent_end` to queue or
   * clear retries, and registers the background retry service.
   */
  register(api: OpenClawPluginApi) {
    const cfg = resolvePluginConfig(api.pluginConfig);

    api.on("agent_end", (event, ctx) => {
      const fields = event as Record<string, unknown>;
      const error = errorToText(fields.error);
      const success = fields.success === true;
      const sessionKey = (ctx as Record<string, unknown>).sessionKey;
      if (typeof sessionKey !== "string" || sessionKey === "") return;

      // On success (or when no error was reported), drop any pending retry for this session.
      if (success || !error) {
        removeEntry(sessionKey);
        return;
      }

      // Ignore non-retriable errors
      if (!isRetriableError(error)) {
        api.logger.debug?.(`ratelimit-retry: non-retriable error on ${sessionKey}: ${error.slice(0, 100)}`);
        return;
      }

      api.logger.info(`ratelimit-retry: queuing retry for ${sessionKey} (error: ${error.slice(0, 100)})`);

      // Gateway settings are read at event time so the service always receives the current values.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- host config shape is not typed here
      const gateway = (api.config as Record<string, any>).gateway;
      const resolvedConfig = {
        ...cfg,
        gatewayPort: gateway?.port ?? 18789,
        gatewayToken: gateway?.auth?.token,
        gatewayPassword: gateway?.auth?.password,
      };

      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- host logger type differs from the service's Logger
      addEntry(sessionKey, error, resolvedConfig, api.logger as any);
    });

    api.registerService(service);

    api.logger.info("ratelimit-retry: plugin registered");
  },
};

export default plugin;
@@ -0,0 +1,32 @@
1
+ {
2
+ "id": "ratelimit-retry",
3
+ "configSchema": {
4
+ "type": "object",
5
+ "additionalProperties": false,
6
+ "properties": {
7
+ "budgetWindowHours": {
8
+ "type": "number",
9
+ "default": 5,
10
+ "minimum": 1,
11
+ "description": "Budget reset window in hours (aligned to UTC clock boundaries)"
12
+ },
13
+ "maxRetryAttempts": {
14
+ "type": "number",
15
+ "default": 3,
16
+ "minimum": 1,
17
+ "description": "Maximum retry attempts per session before abandoning"
18
+ },
19
+ "checkIntervalMinutes": {
20
+ "type": "number",
21
+ "default": 5,
22
+ "minimum": 1,
23
+ "description": "How often to check for pending retries (minutes)"
24
+ },
25
+ "retryMessage": {
26
+ "type": "string",
27
+ "default": "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
28
+ "description": "Message sent to the session to resume the conversation"
29
+ }
30
+ }
31
+ }
32
+ }
package/package.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "@cheapestinference/openclaw-ratelimit-retry",
3
+ "version": "1.0.0",
4
+ "description": "Automatically retry agent conversations that fail due to provider rate limits",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "https://github.com/cheapestinference/openclaw-plugin-ratelimit-retry"
10
+ },
11
+ "keywords": ["openclaw", "plugin", "retry", "rate-limit", "429", "budget", "ratelimit"],
12
+ "openclaw": {
13
+ "extensions": ["./index.ts"]
14
+ }
15
+ }
package/src/service.ts ADDED
@@ -0,0 +1,377 @@
1
+ import { writeFile, readFile, mkdir, rename } from "node:fs/promises";
2
+ import { join, dirname } from "node:path";
3
+ import type { OpenClawPluginService } from "openclaw/plugin-sdk";
4
+
5
+ // --- Types ---
6
+
7
/** One parked session waiting to be resumed; at most one entry exists per `sessionKey`. */
interface QueueEntry {
  /** Session identifier the retry message is sent to. */
  sessionKey: string;
  /** Epoch milliseconds at which the entry was (re)queued. */
  errorTime: number;
  /** Epoch milliseconds before which the entry is not retried. */
  retryAfter: number;
  /** Error text that caused the session to be queued. */
  errorMessage: string;
  /** 0 when first queued; incremented each time the same session is queued again. */
  attempts: number;
}
14
+
15
/** Effective settings used by the retry service: plugin options plus gateway connection details. */
interface RetryConfig {
  /** Length of the budget window in hours; drives the computed `retryAfter`. */
  budgetWindowHours: number;
  /** A session is abandoned once its attempt counter reaches this value. */
  maxRetryAttempts: number;
  /** Minutes between two scans of the queue. */
  checkIntervalMinutes: number;
  /** Text sent via `chat.send` to resume a parked session. */
  retryMessage: string;
  /** Local gateway port the WebSocket connects to. */
  gatewayPort: number;
  /** Gateway auth token; takes precedence over the password when both are set. */
  gatewayToken: string | undefined;
  /** Gateway auth password; used only when no token is configured. */
  gatewayPassword: string | undefined;
}
24
+
25
+ // --- Error Detection ---
26
+
27
/** Error fragments that indicate a rate-limit / budget condition worth retrying. */
const RETRIABLE_PATTERNS = [
  /\b429\b/i,
  /rate[_ ]?limit/i,
  /too many requests/i,
  /budget/i,
  /quota[_ ]?exceeded/i,
  /resource[_ ]?(exhausted|has been exhausted)/i,
  /tokens? per minute/i,
  /\btpm\b/i,
];

/** Error fragments that veto a retry even when a retriable fragment is also present. */
const NON_RETRIABLE_PATTERNS = [
  /\b40[1-4]\b/i,
  /invalid api key/i,
  /unauthorized/i,
  /invalid request/i,
  /context[_ ]?(length|overflow)/i,
  /prompt too (large|long)/i,
  /model not found/i,
  /insufficient[_ ]?credits/i,
  /malformed/i,
];

/**
 * Classifies a provider error string.
 *
 * @param error - Raw error text; `undefined` or empty means "no error".
 * @returns `true` only when no non-retriable pattern matches and at least one
 *          retriable pattern does.
 */
export function isRetriableError(error: string | undefined): boolean {
  if (!error) return false;

  const matches = (pattern: RegExp): boolean => pattern.test(error);

  // Deny-list wins: a 401 that also mentions "rate limit" is still not retried.
  if (NON_RETRIABLE_PATTERNS.some(matches)) return false;
  return RETRIABLE_PATTERNS.some(matches);
}
60
+
61
+ // --- Reset Time Calculation ---
62
+
63
/**
 * Computes when a session parked at `now` may be retried.
 *
 * The result is the first multiple of `windowHours` (counted from 00:00 UTC)
 * strictly after the current UTC hour, at minute 01 (one-minute safety margin),
 * with seconds and milliseconds zeroed. When that hour is 24 or more it lands
 * on the following day at hour − 24 (e.g. 22:15 with a 5 h window yields 01:01
 * the next day).
 *
 * @param now - Reference instant.
 * @param windowHours - Window length in hours; falsy or non-positive values fall back to 5.
 * @returns Epoch milliseconds of the retry time.
 */
export function nextResetTime(now: Date, windowHours: number): number {
  const span = !windowHours || windowHours <= 0 ? 5 : windowHours;

  const hourNow = now.getUTCHours();
  const boundaryHour = Math.floor(hourNow + span - (hourNow % span));

  // setUTCHours rolls values >= 24 over into the following day(s), so no
  // separate day-overflow branch is required.
  const target = new Date(now.getTime());
  target.setUTCHours(boundaryHour, 1, 0, 0);
  return target.getTime();
}
81
+
82
+ // --- Queue Management ---
83
+
84
/** Upper bound on the number of parked sessions kept in the queue. */
const MAX_QUEUE_SIZE = 100;

/**
 * Runtime check that a value parsed from disk has the shape of a queue entry.
 * Guards against hand-edited or partially written files whose elements would
 * otherwise crash later property accesses such as `e.sessionKey`.
 */
function isQueueEntry(value: unknown): value is QueueEntry {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  const isFiniteNumber = (n: unknown): boolean => typeof n === "number" && Number.isFinite(n);
  return (
    typeof candidate.sessionKey === "string" &&
    candidate.sessionKey !== "" &&
    typeof candidate.errorMessage === "string" &&
    isFiniteNumber(candidate.errorTime) &&
    isFiniteNumber(candidate.retryAfter) &&
    isFiniteNumber(candidate.attempts)
  );
}

/**
 * Reads the persisted retry queue.
 *
 * Best-effort by design: a missing file, unreadable file, invalid JSON, or a
 * non-array payload all yield an empty queue. Array elements that are not
 * well-formed entries are dropped instead of being returned to callers.
 *
 * @param queuePath - Absolute path of `queue.json`.
 * @returns The valid entries found on disk (possibly empty). Never rejects.
 */
async function loadQueue(queuePath: string): Promise<QueueEntry[]> {
  try {
    const data = await readFile(queuePath, "utf-8");
    const parsed: unknown = JSON.parse(data);
    if (!Array.isArray(parsed)) return [];
    return parsed.filter(isQueueEntry);
  } catch {
    // Missing or corrupt file: start from an empty queue.
    return [];
  }
}
96
+
97
/**
 * Persists the queue as pretty-printed JSON.
 *
 * The payload is first written to a uniquely named sibling file and then
 * renamed over `queuePath`, so readers never observe a half-written file.
 * The parent directory is created when missing.
 *
 * @param queuePath - Destination path of `queue.json`.
 * @param queue - Entries to serialise.
 * @throws Rejects with the underlying fs error when any step fails.
 */
async function saveQueue(queuePath: string, queue: QueueEntry[]): Promise<void> {
  const parentDir = dirname(queuePath);
  await mkdir(parentDir, { recursive: true });

  // Timestamp + random suffix keeps concurrent writers from sharing a scratch file.
  const stamp = Date.now();
  const nonce = Math.random().toString(36).slice(2, 8);
  const scratchPath = `${queuePath}.tmp.${stamp}.${nonce}`;

  const serialized = JSON.stringify(queue, null, 2);
  await writeFile(scratchPath, serialized, "utf-8");
  await rename(scratchPath, queuePath);
}
103
+
104
/**
 * Returns a new queue containing `entry`, replacing any existing entry with
 * the same `sessionKey`. The input array is not modified.
 *
 * When the result would exceed `MAX_QUEUE_SIZE`, it is ordered by `errorTime`
 * and only the most recent `MAX_QUEUE_SIZE` entries are kept.
 *
 * @param queue - Current queue.
 * @param entry - Entry to insert or replace.
 * @returns The updated queue.
 */
function addToQueue(queue: QueueEntry[], entry: QueueEntry): QueueEntry[] {
  const next: QueueEntry[] = [];
  for (const item of queue) {
    if (item.sessionKey !== entry.sessionKey) next.push(item);
  }
  next.push(entry);

  if (next.length <= MAX_QUEUE_SIZE) return next;

  // Over capacity: evict the oldest entries.
  next.sort((older, newer) => older.errorTime - newer.errorTime);
  return next.slice(next.length - MAX_QUEUE_SIZE);
}
113
+
114
+ // --- WebSocket Chat Client ---
115
+
116
/** Outcome of one attempt to deliver a retry message to the gateway. */
interface ChatSendResult {
  ok: boolean;
  error?: string;
}

/**
 * Opens a short-lived WebSocket to the local gateway, answers the
 * `connect.challenge` event with a `connect` request, and on success sends a
 * single `chat.send` for `sessionKey`.
 *
 * The returned promise always resolves (never rejects): every failure mode —
 * socket construction error, connection error, unexpected close, rejected
 * `connect`/`chat.send`, send failure, or the 30 s timeout — is reported as
 * `{ ok: false, error }`. `{ ok: true }` means the gateway acknowledged
 * `chat.send`; it says nothing about the outcome of the resulting agent run.
 *
 * @param port - Gateway port on 127.0.0.1.
 * @param token - Auth token; used in preference to `password` when set.
 * @param password - Auth password; used only when no token is given.
 * @param sessionKey - Session to resume.
 * @param message - Text delivered to the session.
 */
async function sendRetryMessage(
  port: number,
  token: string | undefined,
  password: string | undefined,
  sessionKey: string,
  message: string,
): Promise<ChatSendResult> {
  return new Promise<ChatSendResult>((outerResolve) => {
    const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

    // Construct the socket first: the constructor can throw synchronously
    // (e.g. no global WebSocket in this runtime), and that must surface as a
    // normal failure result rather than a rejected promise plus a dangling timer.
    let socket: WebSocket;
    try {
      socket = new WebSocket(`ws://127.0.0.1:${port}`);
    } catch (err: unknown) {
      outerResolve({ ok: false, error: `WebSocket unavailable: ${describe(err)}` });
      return;
    }

    let settled = false;

    /** Resolves once, clears the watchdog, and (optionally) closes the socket. */
    const finish = (val: ChatSendResult, closeSocket = true): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timeout);
      outerResolve(val);
      if (closeSocket) {
        try {
          socket.close();
        } catch {
          // Socket already closing/closed; nothing left to release.
        }
      }
    };

    const timeout = setTimeout(() => {
      finish({ ok: false, error: "Connection timeout" });
    }, 30_000);

    /** Sends a request frame; a failing send ends the attempt instead of waiting for the timeout. */
    const sendFrame = (frame: unknown): void => {
      try {
        socket.send(JSON.stringify(frame));
      } catch (err: unknown) {
        finish({ ok: false, error: `Failed to send frame: ${describe(err)}` });
      }
    };

    const errorMessageOf = (frame: Record<string, any>, fallback: string): string => {
      const text = frame.error?.message;
      return typeof text === "string" && text !== "" ? text : fallback;
    };

    let requestId = 0;
    // Ids of the requests we are waiting on, so responses are matched to the
    // request that produced them rather than to fixed numbers.
    let connectId: number | undefined;
    let chatId: number | undefined;

    socket.addEventListener("error", () => {
      finish({ ok: false, error: "WebSocket connection error" });
    });

    socket.addEventListener("close", () => {
      finish({ ok: false, error: "Connection closed unexpectedly" }, false);
    });

    socket.addEventListener("message", (event) => {
      let frame: Record<string, any>;
      try {
        const parsed: unknown = JSON.parse(String(event.data));
        if (typeof parsed !== "object" || parsed === null) return;
        frame = parsed as Record<string, any>;
      } catch {
        // Ignore unparseable frames
        return;
      }

      if (frame.type === "event" && frame.event === "connect.challenge") {
        connectId = ++requestId;
        const params: Record<string, unknown> = {
          minProtocol: 1,
          maxProtocol: 1,
          client: {
            name: "ratelimit-retry",
            displayName: "Ratelimit Retry Plugin",
            version: "1.0.0",
            mode: "backend",
          },
          role: "operator",
          scopes: ["operator.admin"],
        };

        if (token) {
          params.auth = { token };
        } else if (password) {
          params.auth = { password };
        }

        sendFrame({ type: "req", id: connectId, method: "connect", params });
        return;
      }

      if (frame.type !== "res") return;

      if (connectId !== undefined && frame.id === connectId) {
        if (!frame.ok) {
          finish({ ok: false, error: errorMessageOf(frame, "Gateway authentication failed") });
          return;
        }
        chatId = ++requestId;
        sendFrame({
          type: "req",
          id: chatId,
          method: "chat.send",
          params: {
            sessionKey,
            message,
            // Unique per attempt so the gateway does not treat it as a replay.
            idempotencyKey: `retry:${sessionKey}:${Date.now()}`,
          },
        });
        return;
      }

      if (chatId !== undefined && frame.id === chatId) {
        finish(frame.ok ? { ok: true } : { ok: false, error: errorMessageOf(frame, "chat.send failed") });
      }
    });
  });
}
225
+
226
+ // --- Service ---
227
+
228
/** Minimal logging surface the service needs from the host. */
interface Logger {
  info: (msg: string) => void;
  warn: (msg: string) => void;
  error: (msg: string) => void;
}

/**
 * Creates the retry service together with the two functions the `agent_end`
 * hook uses to feed it.
 *
 * All state (queue, config, timer) lives in this closure, so each call yields
 * an independent instance.
 *
 * NOTE(review): until `addEntry` is called for the first time, `config` holds
 * the built-in defaults below (only the gateway fields are refreshed in
 * `start`). The interval timer is therefore created from the default
 * `checkIntervalMinutes` unless an entry was queued before `start` ran.
 *
 * @returns
 *  - `service`: object with `start`/`stop` for `registerService`.
 *  - `addEntry`: queue (or re-queue) a failed session; abandons it once the
 *    attempt counter reaches `maxRetryAttempts`.
 *  - `removeEntry`: drop a session from the queue, if present.
 */
export function createRetryService(): {
  service: OpenClawPluginService;
  addEntry: (sessionKey: string, errorMessage: string, config: RetryConfig, logger?: Logger) => void;
  removeEntry: (sessionKey: string) => void;
} {
  let queue: QueueEntry[] = [];
  let queuePath = "";
  let timer: ReturnType<typeof setInterval> | null = null;
  let retryInProgress = false;
  let serviceLogger: Logger | undefined;
  // Tail of the write chain; each save starts only after the previous one finished.
  let lastWrite: Promise<void> = Promise.resolve();
  let config: RetryConfig = {
    budgetWindowHours: 5,
    maxRetryAttempts: 3,
    checkIntervalMinutes: 5,
    retryMessage: "Continue where you left off. The previous attempt failed due to a rate limit that has now reset.",
    gatewayPort: 18789,
    gatewayToken: undefined,
    gatewayPassword: undefined,
  };

  /**
   * Writes the current queue to disk after any earlier write has completed.
   * Serialising the writes guarantees the file ends up reflecting the newest
   * state; independent concurrent renames could finish in any order. The
   * queue is read when the write actually starts, so it is always the latest.
   */
  const persist = (): Promise<void> => {
    const target = queuePath;
    const write = lastWrite.then(() => saveQueue(target, queue));
    // Keep the chain usable after a failed write.
    lastWrite = write.catch(() => undefined);
    return write;
  };

  /** Best-effort save for synchronous callers: failures are logged, not thrown. */
  const persistInBackground = (logger?: Logger): void => {
    if (!queuePath) return; // service not started yet; start() merges in-memory entries
    persist().catch((err: unknown) => {
      (logger ?? serviceLogger)?.warn(`ratelimit-retry: failed to persist queue: ${String(err)}`);
    });
  };

  const addEntry = (sessionKey: string, errorMessage: string, cfg: RetryConfig, logger?: Logger) => {
    config = cfg;
    const now = new Date();
    const existing = queue.find((e) => e.sessionKey === sessionKey);
    const attempts = existing ? existing.attempts + 1 : 0;

    if (attempts >= config.maxRetryAttempts) {
      logger?.warn(`ratelimit-retry: max attempts (${config.maxRetryAttempts}) reached for ${sessionKey}, abandoning`);
      queue = queue.filter((e) => e.sessionKey !== sessionKey);
      persistInBackground(logger);
      return;
    }

    const entry: QueueEntry = {
      sessionKey,
      errorTime: now.getTime(),
      retryAfter: nextResetTime(now, config.budgetWindowHours),
      errorMessage,
      attempts,
    };

    queue = addToQueue(queue, entry);
    persistInBackground(logger);
  };

  const removeEntry = (sessionKey: string) => {
    const existed = queue.some((e) => e.sessionKey === sessionKey);
    if (existed) {
      queue = queue.filter((e) => e.sessionKey !== sessionKey);
      persistInBackground();
    }
  };

  /** One scan of the queue: sends the retry message to every entry whose `retryAfter` has passed. */
  const processTick = async (logger: Logger) => {
    if (retryInProgress || queue.length === 0) return;
    retryInProgress = true;

    try {
      const now = Date.now();
      const ready = queue.filter((e) => e.retryAfter <= now);
      if (ready.length === 0) return;

      logger.info(`ratelimit-retry: ${ready.length} session(s) ready for retry`);

      for (const entry of ready) {
        // The queue can change while earlier sends are awaited (agent_end may
        // remove or replace an entry). Only act on entries that are still queued.
        if (!queue.includes(entry)) continue;

        logger.info(`ratelimit-retry: retrying session ${entry.sessionKey} (attempt ${entry.attempts + 1})`);

        const result = await sendRetryMessage(
          config.gatewayPort,
          config.gatewayToken,
          config.gatewayPassword,
          entry.sessionKey,
          config.retryMessage,
        );

        // In both outcomes the entry stays queued and is deferred to the next window:
        //  - sent:   keeps the attempts counter; agent_end removes the entry on
        //            success or re-queues it with attempts + 1 on another failure.
        //  - failed: avoids hammering an unreachable gateway on every tick.
        entry.retryAfter = nextResetTime(new Date(), config.budgetWindowHours);

        if (result.ok) {
          logger.info(`ratelimit-retry: sent retry to ${entry.sessionKey}`);
        } else {
          logger.warn(`ratelimit-retry: failed to send retry to ${entry.sessionKey}: ${result.error}`);
        }
      }

      await persist();
    } finally {
      retryInProgress = false;
    }
  };

  const service: OpenClawPluginService = {
    id: "ratelimit-retry",

    async start(ctx) {
      serviceLogger = ctx.logger;

      const stateDir = join(ctx.stateDir, "ratelimit-retry");
      queuePath = join(stateDir, "queue.json");

      config = {
        ...config,
        gatewayPort: (ctx.config as Record<string, any>).gateway?.port ?? 18789,
        gatewayToken: (ctx.config as Record<string, any>).gateway?.auth?.token,
        gatewayPassword: (ctx.config as Record<string, any>).gateway?.auth?.password,
      };

      const loaded = await loadQueue(queuePath);
      const hadPreStartEntries = queue.length > 0;

      // Merge: disk entries + any in-memory entries added between register() and start()
      if (loaded.length > 0) {
        const loadedKeys = new Set(loaded.map((e) => e.sessionKey));
        const preStartEntries = queue.filter((e) => !loadedKeys.has(e.sessionKey));
        queue = [...loaded, ...preStartEntries];
        ctx.logger.info(`ratelimit-retry: loaded ${loaded.length} pending retry(s) from disk`);
      }

      // Entries queued before start() could not be written (no path yet); save them now.
      if (hadPreStartEntries) persistInBackground(ctx.logger);

      // A zero, negative or non-finite interval would make setInterval fire continuously.
      const minutes =
        Number.isFinite(config.checkIntervalMinutes) && config.checkIntervalMinutes > 0
          ? config.checkIntervalMinutes
          : 5;
      const intervalMs = minutes * 60 * 1000;

      // Guard against a repeated start() leaking the previous timer.
      if (timer) clearInterval(timer);
      timer = setInterval(() => {
        processTick(ctx.logger).catch((err) => {
          ctx.logger.error(`ratelimit-retry: tick failed: ${err}`);
        });
      }, intervalMs);

      ctx.logger.info(
        `ratelimit-retry: service started (window=${config.budgetWindowHours}h, check=${minutes}min, maxAttempts=${config.maxRetryAttempts})`,
      );
    },

    async stop(ctx) {
      if (timer) {
        clearInterval(timer);
        timer = null;
      }
      if (queuePath && queue.length > 0) {
        await persist();
      } else {
        // Nothing new to write, but let any in-flight background save finish.
        await lastWrite;
      }
      ctx.logger.info("ratelimit-retry: service stopped");
    },
  };

  return { service, addEntry, removeEntry };
}