ai-sdk-rate-limiter 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -214,6 +214,53 @@ const result = await generateText({ model, prompt })
214
214
 
215
215
  ---
216
216
 
217
+ ## Multi-instance Redis store
218
+
219
+ By default, rate limit state is kept in memory, per process. In multi-instance deployments — serverless functions, multiple pods, workers — each instance therefore has its own counters, so together they can exceed the provider's limits. To share state across all instances, install a Redis client and use the Redis store:
220
+
221
+ ```
222
+ npm install ioredis
223
+ ```
224
+
225
+ ```typescript
226
+ import { createRateLimiter } from 'ai-sdk-rate-limiter'
227
+ import { RedisStore } from 'ai-sdk-rate-limiter/redis'
228
+ import Redis from 'ioredis'
229
+
230
+ const limiter = createRateLimiter({
231
+ store: new RedisStore(new Redis(process.env.REDIS_URL)),
232
+ // ... rest of your config
233
+ })
234
+ ```
235
+
236
+ That's the entire change. All APIs — `wrap()`, `rawProxy()`, events, cost reports — work identically. The Redis store enforces rate limits across all instances collectively, so their combined traffic cannot exceed the API limits.
237
+
238
+ **How it works:**
239
+
240
+ Each request atomically runs a Lua script that:
241
+ 1. Removes entries older than 60 seconds from a sorted set (`ZREMRANGEBYSCORE`)
242
+ 2. Counts remaining requests and sums input tokens
243
+ 3. Checks against RPM and ITPM limits
244
+ 4. If allowed: reserves the slot (`ZADD`) and returns immediately
245
+ 5. If blocked: returns the timestamp when the next slot opens
246
+
247
+ The local queue (priority ordering, drain timer, timeout handling) stays in-memory per instance — only the window counters are shared.
248
+
249
+ **Options:**
250
+
251
+ ```typescript
252
+ new RedisStore(redis, {
253
+ keyPrefix: 'rl:myapp:', // namespace if multiple apps share Redis
254
+ windowMs: 60_000, // window size; match your provider's limit window
255
+ })
256
+ ```
257
+
258
+ **Compatible clients** — any Redis client with `eval()`, `get()`, and `set()` works: `ioredis`, `node-redis`, Upstash Redis.
259
+
260
+ **Single-instance deployments:** the default `InMemoryStore` is zero-config, needs no network round-trips, and is more accurate (true sliding window). Only switch to `RedisStore` when you actually need cross-instance coordination.
261
+
262
+ ---
263
+
217
264
  ## Raw SDK proxy
218
265
 
219
266
  If you're using the OpenAI, Anthropic, Groq, Mistral, or Cohere SDK directly — without the Vercel AI SDK — use `limiter.rawProxy()` to add rate limiting as a transparent drop-in:
package/dist/index.cjs CHANGED
@@ -69,6 +69,130 @@ Caused by: ${cause.stack}`;
69
69
  }
70
70
  };
71
71
 
72
+ // src/store/in-memory-store.ts
73
+ var WINDOW_MS = 6e4;
74
+ var InMemoryStore = class {
75
+ constructor() {
76
+ this.windows = /* @__PURE__ */ new Map();
77
+ this.backoffs = /* @__PURE__ */ new Map();
78
+ }
79
+ // -------------------------------------------------------------------------
80
+ // RateLimitStore implementation
81
+ // -------------------------------------------------------------------------
82
+ async checkAndRecord(key, estimatedInputTokens, limits) {
83
+ const now = Date.now();
84
+ const backoffUntil = this.backoffs.get(key) ?? 0;
85
+ if (now < backoffUntil) return backoffUntil;
86
+ const window = this.getOrCreate(key);
87
+ this.evict(window, now);
88
+ if (window.length >= limits.rpm) {
89
+ return (window[0]?.timestamp ?? now) + WINDOW_MS + 1;
90
+ }
91
+ if (limits.itpm !== void 0) {
92
+ const usedInput = sumInput(window);
93
+ if (usedInput + estimatedInputTokens > limits.itpm) {
94
+ return this.itpmNextSlot(window, limits.itpm, estimatedInputTokens, now);
95
+ }
96
+ }
97
+ window.push({ timestamp: now, inputTokens: estimatedInputTokens, outputTokens: 0 });
98
+ return 0;
99
+ }
100
+ async reconcile(key, actualInputTokens, actualOutputTokens) {
101
+ const window = this.windows.get(key);
102
+ if (!window) return;
103
+ for (let i = window.length - 1; i >= 0; i--) {
104
+ const entry = window[i];
105
+ if (entry.outputTokens === 0 && entry.inputTokens > 0) {
106
+ entry.inputTokens = actualInputTokens;
107
+ entry.outputTokens = actualOutputTokens;
108
+ return;
109
+ }
110
+ }
111
+ }
112
+ async setBackoff(key, untilMs) {
113
+ const current = this.backoffs.get(key) ?? 0;
114
+ if (untilMs > current) this.backoffs.set(key, untilMs);
115
+ }
116
+ async getBackoff(key) {
117
+ return this.backoffs.get(key) ?? 0;
118
+ }
119
+ async nextSlotMs(key, limits, estimatedInputTokens = 0) {
120
+ const now = Date.now();
121
+ const backoffUntil = this.backoffs.get(key) ?? 0;
122
+ if (now < backoffUntil) return backoffUntil;
123
+ const window = this.windows.get(key) ?? [];
124
+ this.evict(window, now);
125
+ if (window.length < limits.rpm) {
126
+ if (limits.itpm === void 0 || sumInput(window) + estimatedInputTokens <= limits.itpm) {
127
+ return 0;
128
+ }
129
+ }
130
+ let nextSlot = now;
131
+ if (window.length >= limits.rpm && window[0]) {
132
+ nextSlot = Math.max(nextSlot, window[0].timestamp + WINDOW_MS + 1);
133
+ }
134
+ if (limits.itpm !== void 0) {
135
+ let usedInput = sumInput(window);
136
+ if (usedInput + estimatedInputTokens > limits.itpm) {
137
+ for (const entry of window) {
138
+ usedInput -= entry.inputTokens;
139
+ if (usedInput + estimatedInputTokens <= limits.itpm) {
140
+ nextSlot = Math.max(nextSlot, entry.timestamp + WINDOW_MS + 1);
141
+ break;
142
+ }
143
+ }
144
+ }
145
+ }
146
+ return nextSlot;
147
+ }
148
+ // -------------------------------------------------------------------------
149
+ // Snapshot helpers (used by engine for status reporting)
150
+ // -------------------------------------------------------------------------
151
+ snapshot(key) {
152
+ const window = this.windows.get(key) ?? [];
153
+ this.evict(window, Date.now());
154
+ return {
155
+ requests: window.length,
156
+ inputTokens: sumInput(window),
157
+ outputTokens: window.reduce((s, e) => s + e.outputTokens, 0)
158
+ };
159
+ }
160
+ currentBackoff(key) {
161
+ const until = this.backoffs.get(key) ?? 0;
162
+ return Date.now() < until ? until : null;
163
+ }
164
+ // -------------------------------------------------------------------------
165
+ // Private helpers
166
+ // -------------------------------------------------------------------------
167
+ getOrCreate(key) {
168
+ let w = this.windows.get(key);
169
+ if (!w) {
170
+ w = [];
171
+ this.windows.set(key, w);
172
+ }
173
+ return w;
174
+ }
175
+ evict(window, now) {
176
+ const cutoff = now - WINDOW_MS;
177
+ let i = 0;
178
+ while (i < window.length && (window[i]?.timestamp ?? 0) <= cutoff) i++;
179
+ if (i > 0) window.splice(0, i);
180
+ }
181
+ itpmNextSlot(window, itpmLimit, estimatedInputTokens, now) {
182
+ let usedInput = sumInput(window);
183
+ for (const entry of window) {
184
+ usedInput -= entry.inputTokens;
185
+ if (usedInput + estimatedInputTokens <= itpmLimit) {
186
+ return entry.timestamp + WINDOW_MS + 1;
187
+ }
188
+ }
189
+ return now + WINDOW_MS + 1;
190
+ }
191
+ };
192
+ function sumInput(window) {
193
+ return window.reduce((s, e) => s + e.inputTokens, 0);
194
+ }
195
+
72
196
  // src/core/rate-limit-engine.ts
73
197
  var PRIORITY_RANK = {
74
198
  high: 0,
@@ -90,11 +214,14 @@ function insertWaiter(waiters, waiter) {
90
214
  }
91
215
  waiters.splice(lo, 0, waiter);
92
216
  }
93
- var WINDOW_MS = 6e4;
94
217
  var RateLimitEngine = class {
95
- constructor({ maxQueueSize = 500 } = {}) {
96
- this.states = /* @__PURE__ */ new Map();
218
+ constructor({
219
+ maxQueueSize = 500,
220
+ store
221
+ } = {}) {
222
+ this.localStates = /* @__PURE__ */ new Map();
97
223
  this.maxQueueSize = maxQueueSize;
224
+ this.store = store ?? new InMemoryStore();
98
225
  }
99
226
  // -------------------------------------------------------------------------
100
227
  // Public API
@@ -102,30 +229,30 @@ var RateLimitEngine = class {
102
229
  /**
103
230
  * Acquire a slot for the given model.
104
231
  *
105
- * - If capacity is available: records the request in the sliding window and
106
- * resolves immediately.
107
- * - If at capacity: enqueues the request (sorted by priority) and resolves
108
- * when a slot opens.
109
- * - If the queue is full: throws QueueFullError immediately.
110
- * - If the request waits longer than `timeoutMs`: throws QueueTimeoutError.
232
+ * - If capacity is available: records the request in the window and resolves.
233
+ * - If at capacity: enqueues (sorted by priority) and resolves when a slot opens.
234
+ * - If queue is full: throws QueueFullError immediately.
235
+ * - If waiting exceeds timeoutMs: throws QueueTimeoutError.
111
236
  */
112
237
  async acquire(key, opts) {
113
- const state = this.getOrCreate(key);
114
- if (this.canProceed(state, opts.limits, opts.estimatedInputTokens)) {
115
- this.record(state, opts.estimatedInputTokens, 0);
116
- return;
117
- }
118
- if (state.waiters.length >= this.maxQueueSize) {
238
+ const local = this.getOrCreate(key);
239
+ const nextSlotAtMs = await this.store.checkAndRecord(
240
+ key,
241
+ opts.estimatedInputTokens,
242
+ opts.limits
243
+ );
244
+ if (nextSlotAtMs <= Date.now()) return;
245
+ if (local.waiters.length >= this.maxQueueSize) {
119
246
  throw new QueueFullError(key, this.maxQueueSize);
120
247
  }
121
- const estimatedWaitMs = this.estimatedWaitMs(key, opts.limits, opts.estimatedInputTokens);
122
- opts.onQueued?.(state.waiters.length, estimatedWaitMs);
248
+ const estimatedWaitMs = Math.max(0, nextSlotAtMs - Date.now());
249
+ opts.onQueued?.(local.waiters.length, estimatedWaitMs);
123
250
  return new Promise((resolve, reject) => {
124
251
  const enqueuedAt = Date.now();
125
252
  const timeoutHandle = setTimeout(() => {
126
- const idx = state.waiters.indexOf(waiter);
127
- if (idx !== -1) state.waiters.splice(idx, 1);
128
- reject(new QueueTimeoutError(key, Date.now() - enqueuedAt, state.waiters.length));
253
+ const idx = local.waiters.indexOf(waiter);
254
+ if (idx !== -1) local.waiters.splice(idx, 1);
255
+ reject(new QueueTimeoutError(key, Date.now() - enqueuedAt, local.waiters.length));
129
256
  }, opts.timeoutMs);
130
257
  const waiter = {
131
258
  resolve: () => {
@@ -138,157 +265,93 @@ var RateLimitEngine = class {
138
265
  estimatedInputTokens: opts.estimatedInputTokens,
139
266
  timeoutHandle
140
267
  };
141
- insertWaiter(state.waiters, waiter);
142
- this.scheduleDrain(key, opts.limits);
268
+ insertWaiter(local.waiters, waiter);
269
+ this.scheduleDrain(key, opts.limits, nextSlotAtMs);
143
270
  });
144
271
  }
145
272
  /**
146
273
  * Record actual token usage after a request completes.
147
- * Replaces the estimated token count with the real values.
274
+ * Best-effort reconciliation with the estimate recorded during acquire().
148
275
  */
149
276
  recordActualUsage(key, inputTokens, outputTokens) {
150
- const state = this.states.get(key);
151
- if (!state) return;
152
- for (let i = state.window.length - 1; i >= 0; i--) {
153
- const entry = state.window[i];
154
- if (entry.outputTokens === 0 && entry.inputTokens > 0) {
155
- entry.inputTokens = inputTokens;
156
- entry.outputTokens = outputTokens;
157
- break;
158
- }
159
- }
277
+ void this.store.reconcile(key, inputTokens, outputTokens);
160
278
  }
161
279
  /**
162
- * Apply a backoff delay to a model key.
163
- * While a backoff is active, no new requests will be allowed through — they
164
- * will queue and wait until backoffUntil, then drain in priority order.
165
- *
166
- * Called when a remote 429 comes back with a Retry-After header.
280
+ * Apply a backoff delay from a Retry-After header.
281
+ * Propagated to the store so all instances respect it (Redis) or
282
+ * queued requests on this instance wait (in-memory).
167
283
  */
168
284
  applyBackoff(key, delayMs) {
169
- const state = this.getOrCreate(key);
170
- const newUntil = Date.now() + delayMs;
171
- if (newUntil > state.backoffUntil) {
172
- state.backoffUntil = newUntil;
173
- }
285
+ void this.store.setBackoff(key, Date.now() + delayMs);
174
286
  }
175
287
  /**
176
- * Estimated time in ms before the next slot opens for this model/priority.
177
- * Returns 0 if a slot is available right now.
288
+ * Estimated wait time in ms before the next slot opens.
289
+ * Returns 0 if immediately available. With RedisStore this is async
290
+ * so we return a Promise; callers that need the value should await it.
178
291
  */
179
- estimatedWaitMs(key, limits, estimatedTokens = 0) {
180
- const state = this.states.get(key);
181
- if (!state) return 0;
182
- if (this.canProceed(state, limits, estimatedTokens)) return 0;
183
- return this.nextSlotAt(state, limits, estimatedTokens) - Date.now();
292
+ async estimatedWaitMs(key, limits, estimatedTokens = 0) {
293
+ if (!this.store.nextSlotMs) return 0;
294
+ const nextSlot = await this.store.nextSlotMs(key, limits, estimatedTokens);
295
+ return Math.max(0, nextSlot - Date.now());
184
296
  }
185
297
  /** Current queue depth for a model */
186
298
  queueDepth(key) {
187
- return this.states.get(key)?.waiters.length ?? 0;
299
+ return this.localStates.get(key)?.waiters.length ?? 0;
188
300
  }
189
- /** Snapshot of the current window state for a model */
301
+ /** Snapshot of the current window (delegates to store where supported) */
190
302
  windowSnapshot(key) {
191
- const state = this.states.get(key);
192
- if (!state) return { requests: 0, inputTokens: 0, outputTokens: 0 };
193
- this.evict(state);
194
- return {
195
- requests: state.window.length,
196
- inputTokens: state.window.reduce((s, e) => s + e.inputTokens, 0),
197
- outputTokens: state.window.reduce((s, e) => s + e.outputTokens, 0)
198
- };
303
+ if (this.store instanceof InMemoryStore) {
304
+ return this.store.snapshot(key);
305
+ }
306
+ return { requests: 0, inputTokens: 0, outputTokens: 0 };
199
307
  }
200
308
  backoffUntil(key) {
201
- const state = this.states.get(key);
202
- if (!state || Date.now() >= state.backoffUntil) return null;
203
- return state.backoffUntil;
309
+ if (this.store instanceof InMemoryStore) {
310
+ return this.store.currentBackoff(key);
311
+ }
312
+ return null;
204
313
  }
205
314
  // -------------------------------------------------------------------------
206
- // Internal helpers
315
+ // Private helpers
207
316
  // -------------------------------------------------------------------------
208
317
  getOrCreate(key) {
209
- let state = this.states.get(key);
318
+ let state = this.localStates.get(key);
210
319
  if (!state) {
211
- state = { window: [], waiters: [], backoffUntil: 0, drainScheduled: false };
212
- this.states.set(key, state);
320
+ state = { waiters: [], drainScheduled: false };
321
+ this.localStates.set(key, state);
213
322
  }
214
323
  return state;
215
324
  }
216
- evict(state) {
217
- const cutoff = Date.now() - WINDOW_MS;
218
- let i = 0;
219
- while (i < state.window.length && (state.window[i]?.timestamp ?? 0) <= cutoff) i++;
220
- if (i > 0) state.window.splice(0, i);
221
- }
222
- canProceed(state, limits, estimatedInputTokens) {
223
- const now = Date.now();
224
- if (now < state.backoffUntil) return false;
225
- this.evict(state);
226
- if (state.window.length >= limits.rpm) return false;
227
- if (limits.itpm !== void 0) {
228
- const usedInput = state.window.reduce((s, e) => s + e.inputTokens, 0);
229
- if (usedInput + estimatedInputTokens > limits.itpm) return false;
230
- }
231
- return true;
232
- }
233
- record(state, inputTokens, outputTokens) {
234
- state.window.push({ timestamp: Date.now(), inputTokens, outputTokens });
235
- }
236
- /**
237
- * Returns the timestamp (ms) at which the next slot will open.
238
- */
239
- nextSlotAt(state, limits, estimatedInputTokens) {
240
- const now = Date.now();
241
- if (now < state.backoffUntil) return state.backoffUntil;
242
- this.evict(state);
243
- let nextSlot = now;
244
- if (state.window.length >= limits.rpm && state.window[0]) {
245
- nextSlot = Math.max(nextSlot, state.window[0].timestamp + WINDOW_MS + 1);
246
- }
247
- if (limits.itpm !== void 0) {
248
- let usedInput = state.window.reduce((s, e) => s + e.inputTokens, 0);
249
- if (usedInput + estimatedInputTokens > limits.itpm) {
250
- for (const entry of state.window) {
251
- usedInput -= entry.inputTokens;
252
- if (usedInput + estimatedInputTokens <= limits.itpm) {
253
- nextSlot = Math.max(nextSlot, entry.timestamp + WINDOW_MS + 1);
254
- break;
255
- }
256
- }
257
- }
258
- }
259
- return nextSlot;
260
- }
261
- /**
262
- * Schedule a drain of the waiters queue for the given model.
263
- * Only one drain timer is active at a time per model.
264
- */
265
- scheduleDrain(key, limits) {
266
- const state = this.states.get(key);
267
- if (!state || state.drainScheduled) return;
268
- state.drainScheduled = true;
269
- const delay = Math.max(0, this.nextSlotAt(state, limits, 0) - Date.now());
325
+ scheduleDrain(key, limits, nextSlotAtMs) {
326
+ const local = this.localStates.get(key);
327
+ if (!local || local.drainScheduled) return;
328
+ local.drainScheduled = true;
329
+ const delay = Math.max(0, nextSlotAtMs - Date.now());
270
330
  setTimeout(() => {
271
- state.drainScheduled = false;
272
- this.drain(key, limits);
331
+ local.drainScheduled = false;
332
+ void this.drain(key, limits);
273
333
  }, delay);
274
334
  }
275
- /**
276
- * Process as many waiters as possible. Reschedule if there are still waiters
277
- * but no capacity yet.
278
- */
279
- drain(key, limits) {
280
- const state = this.states.get(key);
281
- if (!state || state.waiters.length === 0) return;
282
- while (state.waiters.length > 0) {
283
- const next = state.waiters[0];
284
- if (!this.canProceed(state, limits, next.estimatedInputTokens)) break;
285
- state.waiters.shift();
286
- clearTimeout(next.timeoutHandle);
287
- this.record(state, next.estimatedInputTokens, 0);
288
- next.resolve();
289
- }
290
- if (state.waiters.length > 0) {
291
- this.scheduleDrain(key, limits);
335
+ async drain(key, limits) {
336
+ const local = this.localStates.get(key);
337
+ if (!local) return;
338
+ while (local.waiters.length > 0) {
339
+ const waiter = local.waiters[0];
340
+ const nextSlotAtMs = await this.store.checkAndRecord(
341
+ key,
342
+ waiter.estimatedInputTokens,
343
+ limits
344
+ );
345
+ if (nextSlotAtMs > Date.now()) {
346
+ this.scheduleDrain(key, limits, nextSlotAtMs);
347
+ return;
348
+ }
349
+ if (local.waiters[0] !== waiter) {
350
+ continue;
351
+ }
352
+ local.waiters.shift();
353
+ clearTimeout(waiter.timeoutHandle);
354
+ waiter.resolve();
292
355
  }
293
356
  }
294
357
  };
@@ -1292,7 +1355,8 @@ var Pipeline = class {
1292
1355
  constructor(config) {
1293
1356
  this.config = config;
1294
1357
  this.engine = new RateLimitEngine({
1295
- maxQueueSize: config.queue?.maxSize ?? 500
1358
+ maxQueueSize: config.queue?.maxSize ?? 500,
1359
+ ...config.store !== void 0 && { store: config.store }
1296
1360
  });
1297
1361
  this.costTracker = new CostTracker();
1298
1362
  this.emitter = new Emitter();
@@ -1461,7 +1525,7 @@ var Pipeline = class {
1461
1525
  const models = [];
1462
1526
  return { models, totalQueueDepth: 0 };
1463
1527
  }
1464
- estimatedWait(modelId, provider, priority = "normal") {
1528
+ async estimatedWait(modelId, provider, priority = "normal") {
1465
1529
  const key = `${provider}:${modelId}`;
1466
1530
  const limits = this.resolveModelLimits(modelId, provider);
1467
1531
  return this.engine.estimatedWaitMs(key, limits);