ai-sdk-rate-limiter 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -0
- package/dist/index.cjs +206 -142
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -320
- package/dist/index.d.ts +3 -320
- package/dist/index.js +206 -142
- package/dist/index.js.map +1 -1
- package/dist/redis.cjs +209 -0
- package/dist/redis.cjs.map +1 -0
- package/dist/redis.d.cts +54 -0
- package/dist/redis.d.ts +54 -0
- package/dist/redis.js +207 -0
- package/dist/redis.js.map +1 -0
- package/dist/types-CgePLtmQ.d.cts +385 -0
- package/dist/types-CgePLtmQ.d.ts +385 -0
- package/package.json +16 -2
package/README.md
CHANGED
|
@@ -214,6 +214,53 @@ const result = await generateText({ model, prompt })
|
|
|
214
214
|
|
|
215
215
|
---
|
|
216
216
|
|
|
217
|
+
## Multi-instance Redis store
|
|
218
|
+
|
|
219
|
+
By default, rate limit state is in-memory (per-process). In multi-instance deployments — serverless functions, multiple pods, workers — each instance has its own counters. Install the Redis store to share state across all instances:
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
npm install ioredis
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
```typescript
|
|
226
|
+
import { createRateLimiter } from 'ai-sdk-rate-limiter'
|
|
227
|
+
import { RedisStore } from 'ai-sdk-rate-limiter/redis'
|
|
228
|
+
import Redis from 'ioredis'
|
|
229
|
+
|
|
230
|
+
const limiter = createRateLimiter({
|
|
231
|
+
store: new RedisStore(new Redis(process.env.REDIS_URL)),
|
|
232
|
+
// ... rest of your config
|
|
233
|
+
})
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
That's the entire change. All APIs — `wrap()`, `rawProxy()`, events, cost reports — work identically. The Redis store enforces rate limits collectively so no two instances can jointly exceed the API limits.
|
|
237
|
+
|
|
238
|
+
**How it works:**
|
|
239
|
+
|
|
240
|
+
Each request atomically runs a Lua script that:
|
|
241
|
+
1. Removes entries older than 60 seconds from a sorted set (`ZREMRANGEBYSCORE`)
|
|
242
|
+
2. Counts remaining requests and sums input tokens
|
|
243
|
+
3. Checks against RPM and ITPM limits
|
|
244
|
+
4. If allowed: reserves the slot (`ZADD`) and returns immediately
|
|
245
|
+
5. If blocked: returns the timestamp when the next slot opens
|
|
246
|
+
|
|
247
|
+
The local queue (priority ordering, drain timer, timeout handling) stays in-memory per instance — only the window counters are shared.
|
|
248
|
+
|
|
249
|
+
**Options:**
|
|
250
|
+
|
|
251
|
+
```typescript
|
|
252
|
+
new RedisStore(redis, {
|
|
253
|
+
keyPrefix: 'rl:myapp:', // namespace if multiple apps share Redis
|
|
254
|
+
windowMs: 60_000, // window size; match your provider's limit window
|
|
255
|
+
})
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
**Compatible clients** — any Redis client with `eval()`, `get()`, and `set()` works: `ioredis`, `node-redis`, Upstash Redis.
|
|
259
|
+
|
|
260
|
+
**Single-instance deployments:** the default `InMemoryStore` is more accurate (true sliding window, no network round-trips) and zero-config. Only switch to `RedisStore` when you actually need cross-instance coordination.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
217
264
|
## Raw SDK proxy
|
|
218
265
|
|
|
219
266
|
If you're using the OpenAI, Anthropic, Groq, Mistral, or Cohere SDK directly — without the Vercel AI SDK — use `limiter.rawProxy()` to add rate limiting as a transparent drop-in:
|
package/dist/index.cjs
CHANGED
|
@@ -69,6 +69,130 @@ Caused by: ${cause.stack}`;
|
|
|
69
69
|
}
|
|
70
70
|
};
|
|
71
71
|
|
|
72
|
+
// src/store/in-memory-store.ts
|
|
73
|
+
/** Length of the sliding rate-limit window in milliseconds (60 seconds). */
var WINDOW_MS = 6e4;

/**
 * Per-process rate-limit store backed by plain Maps.
 *
 * For every key it keeps a time-ordered list of request entries
 * (`{ timestamp, inputTokens, outputTokens }`) covering the last WINDOW_MS
 * milliseconds, plus an optional "backoff until" timestamp.
 *
 * All "next slot" values returned by this class are absolute epoch-millisecond
 * timestamps; `0` means "capacity is available right now".
 */
var InMemoryStore = class {
  constructor() {
    // key -> array of window entries, oldest first
    this.windows = /* @__PURE__ */ new Map();
    // key -> epoch ms until which the key is backed off
    this.backoffs = /* @__PURE__ */ new Map();
  }

  // -------------------------------------------------------------------------
  // RateLimitStore implementation
  // -------------------------------------------------------------------------

  /**
   * Try to reserve a slot for one request.
   *
   * @param {string} key - Rate-limit bucket identifier.
   * @param {number} estimatedInputTokens - Input tokens the request is expected to use.
   * @param {{ rpm: number, itpm?: number }} limits - Requests / input tokens allowed per window.
   * @returns {Promise<number>} `0` when the request was recorded, otherwise the
   *   epoch-ms timestamp at which the caller should try again.
   */
  async checkAndRecord(key, estimatedInputTokens, limits) {
    const now = Date.now();
    const backoffUntil = this.backoffs.get(key) ?? 0;
    if (now < backoffUntil) return backoffUntil;

    const window = this.getOrCreate(key);
    this.evict(window, now);

    if (window.length >= limits.rpm) {
      // The oldest entry leaves the window 1 ms after it turns WINDOW_MS old.
      return (window[0]?.timestamp ?? now) + WINDOW_MS + 1;
    }
    if (limits.itpm !== void 0) {
      const usedInput = sumInput(window);
      if (usedInput + estimatedInputTokens > limits.itpm) {
        return this.itpmNextSlot(window, limits.itpm, estimatedInputTokens, now);
      }
    }

    // Output tokens are unknown until the request completes; see reconcile().
    window.push({ timestamp: now, inputTokens: estimatedInputTokens, outputTokens: 0 });
    return 0;
  }

  /**
   * Replace the estimate of the most recent not-yet-reconciled entry with the
   * actual token usage. An entry counts as not reconciled while it has
   * `outputTokens === 0` and `inputTokens > 0`. Does nothing when the key has
   * no window or no such entry.
   *
   * @param {string} key
   * @param {number} actualInputTokens
   * @param {number} actualOutputTokens
   * @returns {Promise<void>}
   */
  async reconcile(key, actualInputTokens, actualOutputTokens) {
    const window = this.windows.get(key);
    if (!window) return;
    for (let i = window.length - 1; i >= 0; i--) {
      const entry = window[i];
      if (entry.outputTokens === 0 && entry.inputTokens > 0) {
        entry.inputTokens = actualInputTokens;
        entry.outputTokens = actualOutputTokens;
        return;
      }
    }
  }

  /**
   * Back the key off until `untilMs`. A backoff is only ever extended, never
   * shortened.
   *
   * @param {string} key
   * @param {number} untilMs - Epoch ms.
   * @returns {Promise<void>}
   */
  async setBackoff(key, untilMs) {
    const current = this.backoffs.get(key) ?? 0;
    if (untilMs > current) this.backoffs.set(key, untilMs);
  }

  /**
   * @param {string} key
   * @returns {Promise<number>} The stored backoff timestamp (possibly already
   *   in the past), or `0` when none was ever set.
   */
  async getBackoff(key) {
    return this.backoffs.get(key) ?? 0;
  }

  /**
   * Read-only counterpart of checkAndRecord(): report when a request of the
   * given size could be admitted, without recording anything.
   *
   * The answer agrees with what checkAndRecord() would report for the same
   * state. In particular, a request that can never fit under `itpm` within the
   * current window is reported as blocked for a full window rather than as
   * immediately available. When both the RPM and the ITPM limit block, the
   * later of the two timestamps is returned.
   *
   * @param {string} key
   * @param {{ rpm: number, itpm?: number }} limits
   * @param {number} [estimatedInputTokens=0]
   * @returns {Promise<number>} `0` when a slot is available now, otherwise the
   *   epoch-ms timestamp at which one is expected to open.
   */
  async nextSlotMs(key, limits, estimatedInputTokens = 0) {
    const now = Date.now();
    const backoffUntil = this.backoffs.get(key) ?? 0;
    if (now < backoffUntil) return backoffUntil;

    const window = this.windows.get(key) ?? [];
    this.evict(window, now);

    const rpmBlocked = window.length >= limits.rpm;
    const itpmBlocked =
      limits.itpm !== void 0 && sumInput(window) + estimatedInputTokens > limits.itpm;
    if (!rpmBlocked && !itpmBlocked) return 0;

    let nextSlot = now;
    if (rpmBlocked) {
      // Same fallback as checkAndRecord() when the window is empty (rpm of 0).
      nextSlot = Math.max(nextSlot, (window[0]?.timestamp ?? now) + WINDOW_MS + 1);
    }
    if (itpmBlocked) {
      nextSlot = Math.max(
        nextSlot,
        this.itpmNextSlot(window, limits.itpm, estimatedInputTokens, now)
      );
    }
    return nextSlot;
  }

  // -------------------------------------------------------------------------
  // Snapshot helpers (used by engine for status reporting)
  // -------------------------------------------------------------------------

  /**
   * Totals for the entries currently inside the window (expired entries are
   * evicted first).
   *
   * @param {string} key
   * @returns {{ requests: number, inputTokens: number, outputTokens: number }}
   */
  snapshot(key) {
    const window = this.windows.get(key) ?? [];
    this.evict(window, Date.now());
    return {
      requests: window.length,
      inputTokens: sumInput(window),
      outputTokens: window.reduce((s, e) => s + e.outputTokens, 0)
    };
  }

  /**
   * @param {string} key
   * @returns {number | null} The backoff timestamp while it is still in the
   *   future, otherwise `null`.
   */
  currentBackoff(key) {
    const until = this.backoffs.get(key) ?? 0;
    return Date.now() < until ? until : null;
  }

  // -------------------------------------------------------------------------
  // Private helpers
  // -------------------------------------------------------------------------

  /** Return the window array for `key`, creating and storing an empty one if needed. */
  getOrCreate(key) {
    let w = this.windows.get(key);
    if (!w) {
      w = [];
      this.windows.set(key, w);
    }
    return w;
  }

  /**
   * Drop, in place, the leading entries whose timestamp is at or before
   * `now - WINDOW_MS`. Relies on entries being ordered oldest first.
   */
  evict(window, now) {
    const cutoff = now - WINDOW_MS;
    let i = 0;
    while (i < window.length && (window[i]?.timestamp ?? 0) <= cutoff) i++;
    if (i > 0) window.splice(0, i);
  }

  /**
   * Walk the window oldest-first and return the moment the first entry expires
   * whose removal (together with all older ones) leaves room for
   * `estimatedInputTokens` under `itpmLimit`. If no amount of expiry makes
   * room, returns one full window from `now`.
   */
  itpmNextSlot(window, itpmLimit, estimatedInputTokens, now) {
    let usedInput = sumInput(window);
    for (const entry of window) {
      usedInput -= entry.inputTokens;
      if (usedInput + estimatedInputTokens <= itpmLimit) {
        return entry.timestamp + WINDOW_MS + 1;
      }
    }
    return now + WINDOW_MS + 1;
  }
};

/** Sum of `inputTokens` over all entries of a window array. */
function sumInput(window) {
  return window.reduce((s, e) => s + e.inputTokens, 0);
}
|
|
195
|
+
|
|
72
196
|
// src/core/rate-limit-engine.ts
|
|
73
197
|
var PRIORITY_RANK = {
|
|
74
198
|
high: 0,
|
|
@@ -90,11 +214,14 @@ function insertWaiter(waiters, waiter) {
|
|
|
90
214
|
}
|
|
91
215
|
waiters.splice(lo, 0, waiter);
|
|
92
216
|
}
|
|
93
|
-
var WINDOW_MS = 6e4;
|
|
94
217
|
var RateLimitEngine = class {
|
|
95
|
-
constructor({
|
|
96
|
-
|
|
218
|
+
constructor({
|
|
219
|
+
maxQueueSize = 500,
|
|
220
|
+
store
|
|
221
|
+
} = {}) {
|
|
222
|
+
this.localStates = /* @__PURE__ */ new Map();
|
|
97
223
|
this.maxQueueSize = maxQueueSize;
|
|
224
|
+
this.store = store ?? new InMemoryStore();
|
|
98
225
|
}
|
|
99
226
|
// -------------------------------------------------------------------------
|
|
100
227
|
// Public API
|
|
@@ -102,30 +229,30 @@ var RateLimitEngine = class {
|
|
|
102
229
|
/**
|
|
103
230
|
* Acquire a slot for the given model.
|
|
104
231
|
*
|
|
105
|
-
* - If capacity is available: records the request in the
|
|
106
|
-
*
|
|
107
|
-
* - If
|
|
108
|
-
*
|
|
109
|
-
* - If the queue is full: throws QueueFullError immediately.
|
|
110
|
-
* - If the request waits longer than `timeoutMs`: throws QueueTimeoutError.
|
|
232
|
+
* - If capacity is available: records the request in the window and resolves.
|
|
233
|
+
* - If at capacity: enqueues (sorted by priority) and resolves when a slot opens.
|
|
234
|
+
* - If queue is full: throws QueueFullError immediately.
|
|
235
|
+
* - If waiting exceeds timeoutMs: throws QueueTimeoutError.
|
|
111
236
|
*/
|
|
112
237
|
async acquire(key, opts) {
|
|
113
|
-
const
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
238
|
+
const local = this.getOrCreate(key);
|
|
239
|
+
const nextSlotAtMs = await this.store.checkAndRecord(
|
|
240
|
+
key,
|
|
241
|
+
opts.estimatedInputTokens,
|
|
242
|
+
opts.limits
|
|
243
|
+
);
|
|
244
|
+
if (nextSlotAtMs <= Date.now()) return;
|
|
245
|
+
if (local.waiters.length >= this.maxQueueSize) {
|
|
119
246
|
throw new QueueFullError(key, this.maxQueueSize);
|
|
120
247
|
}
|
|
121
|
-
const estimatedWaitMs =
|
|
122
|
-
opts.onQueued?.(
|
|
248
|
+
const estimatedWaitMs = Math.max(0, nextSlotAtMs - Date.now());
|
|
249
|
+
opts.onQueued?.(local.waiters.length, estimatedWaitMs);
|
|
123
250
|
return new Promise((resolve, reject) => {
|
|
124
251
|
const enqueuedAt = Date.now();
|
|
125
252
|
const timeoutHandle = setTimeout(() => {
|
|
126
|
-
const idx =
|
|
127
|
-
if (idx !== -1)
|
|
128
|
-
reject(new QueueTimeoutError(key, Date.now() - enqueuedAt,
|
|
253
|
+
const idx = local.waiters.indexOf(waiter);
|
|
254
|
+
if (idx !== -1) local.waiters.splice(idx, 1);
|
|
255
|
+
reject(new QueueTimeoutError(key, Date.now() - enqueuedAt, local.waiters.length));
|
|
129
256
|
}, opts.timeoutMs);
|
|
130
257
|
const waiter = {
|
|
131
258
|
resolve: () => {
|
|
@@ -138,157 +265,93 @@ var RateLimitEngine = class {
|
|
|
138
265
|
estimatedInputTokens: opts.estimatedInputTokens,
|
|
139
266
|
timeoutHandle
|
|
140
267
|
};
|
|
141
|
-
insertWaiter(
|
|
142
|
-
this.scheduleDrain(key, opts.limits);
|
|
268
|
+
insertWaiter(local.waiters, waiter);
|
|
269
|
+
this.scheduleDrain(key, opts.limits, nextSlotAtMs);
|
|
143
270
|
});
|
|
144
271
|
}
|
|
145
272
|
/**
|
|
146
273
|
* Record actual token usage after a request completes.
|
|
147
|
-
*
|
|
274
|
+
* Best-effort reconciliation with the estimate recorded during acquire().
|
|
148
275
|
*/
|
|
149
276
|
recordActualUsage(key, inputTokens, outputTokens) {
|
|
150
|
-
|
|
151
|
-
if (!state) return;
|
|
152
|
-
for (let i = state.window.length - 1; i >= 0; i--) {
|
|
153
|
-
const entry = state.window[i];
|
|
154
|
-
if (entry.outputTokens === 0 && entry.inputTokens > 0) {
|
|
155
|
-
entry.inputTokens = inputTokens;
|
|
156
|
-
entry.outputTokens = outputTokens;
|
|
157
|
-
break;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
277
|
+
void this.store.reconcile(key, inputTokens, outputTokens);
|
|
160
278
|
}
|
|
161
279
|
/**
|
|
162
|
-
* Apply a backoff delay
|
|
163
|
-
*
|
|
164
|
-
*
|
|
165
|
-
*
|
|
166
|
-
* Called when a remote 429 comes back with a Retry-After header.
|
|
280
|
+
* Apply a backoff delay from a Retry-After header.
|
|
281
|
+
* Propagated to the store so all instances respect it (Redis) or
|
|
282
|
+
* queued requests on this instance wait (in-memory).
|
|
167
283
|
*/
|
|
168
284
|
applyBackoff(key, delayMs) {
|
|
169
|
-
|
|
170
|
-
const newUntil = Date.now() + delayMs;
|
|
171
|
-
if (newUntil > state.backoffUntil) {
|
|
172
|
-
state.backoffUntil = newUntil;
|
|
173
|
-
}
|
|
285
|
+
void this.store.setBackoff(key, Date.now() + delayMs);
|
|
174
286
|
}
|
|
175
287
|
/**
|
|
176
|
-
* Estimated time in ms before the next slot opens
|
|
177
|
-
* Returns 0 if
|
|
288
|
+
* Estimated wait time in ms before the next slot opens.
|
|
289
|
+
* Returns 0 if immediately available. With RedisStore this is async
|
|
290
|
+
* so we return a Promise; callers that need the value should await it.
|
|
178
291
|
*/
|
|
179
|
-
estimatedWaitMs(key, limits, estimatedTokens = 0) {
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
return this.nextSlotAt(state, limits, estimatedTokens) - Date.now();
|
|
292
|
+
async estimatedWaitMs(key, limits, estimatedTokens = 0) {
|
|
293
|
+
if (!this.store.nextSlotMs) return 0;
|
|
294
|
+
const nextSlot = await this.store.nextSlotMs(key, limits, estimatedTokens);
|
|
295
|
+
return Math.max(0, nextSlot - Date.now());
|
|
184
296
|
}
|
|
185
297
|
/** Current queue depth for a model */
|
|
186
298
|
queueDepth(key) {
|
|
187
|
-
return this.
|
|
299
|
+
return this.localStates.get(key)?.waiters.length ?? 0;
|
|
188
300
|
}
|
|
189
|
-
/** Snapshot of the current window
|
|
301
|
+
/** Snapshot of the current window (delegates to store where supported) */
|
|
190
302
|
windowSnapshot(key) {
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
return {
|
|
195
|
-
requests: state.window.length,
|
|
196
|
-
inputTokens: state.window.reduce((s, e) => s + e.inputTokens, 0),
|
|
197
|
-
outputTokens: state.window.reduce((s, e) => s + e.outputTokens, 0)
|
|
198
|
-
};
|
|
303
|
+
if (this.store instanceof InMemoryStore) {
|
|
304
|
+
return this.store.snapshot(key);
|
|
305
|
+
}
|
|
306
|
+
return { requests: 0, inputTokens: 0, outputTokens: 0 };
|
|
199
307
|
}
|
|
200
308
|
backoffUntil(key) {
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
309
|
+
if (this.store instanceof InMemoryStore) {
|
|
310
|
+
return this.store.currentBackoff(key);
|
|
311
|
+
}
|
|
312
|
+
return null;
|
|
204
313
|
}
|
|
205
314
|
// -------------------------------------------------------------------------
|
|
206
|
-
//
|
|
315
|
+
// Private helpers
|
|
207
316
|
// -------------------------------------------------------------------------
|
|
208
317
|
getOrCreate(key) {
|
|
209
|
-
let state = this.
|
|
318
|
+
let state = this.localStates.get(key);
|
|
210
319
|
if (!state) {
|
|
211
|
-
state = {
|
|
212
|
-
this.
|
|
320
|
+
state = { waiters: [], drainScheduled: false };
|
|
321
|
+
this.localStates.set(key, state);
|
|
213
322
|
}
|
|
214
323
|
return state;
|
|
215
324
|
}
|
|
216
|
-
|
|
217
|
-
const
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
}
|
|
222
|
-
canProceed(state, limits, estimatedInputTokens) {
|
|
223
|
-
const now = Date.now();
|
|
224
|
-
if (now < state.backoffUntil) return false;
|
|
225
|
-
this.evict(state);
|
|
226
|
-
if (state.window.length >= limits.rpm) return false;
|
|
227
|
-
if (limits.itpm !== void 0) {
|
|
228
|
-
const usedInput = state.window.reduce((s, e) => s + e.inputTokens, 0);
|
|
229
|
-
if (usedInput + estimatedInputTokens > limits.itpm) return false;
|
|
230
|
-
}
|
|
231
|
-
return true;
|
|
232
|
-
}
|
|
233
|
-
record(state, inputTokens, outputTokens) {
|
|
234
|
-
state.window.push({ timestamp: Date.now(), inputTokens, outputTokens });
|
|
235
|
-
}
|
|
236
|
-
/**
|
|
237
|
-
* Returns the timestamp (ms) at which the next slot will open.
|
|
238
|
-
*/
|
|
239
|
-
nextSlotAt(state, limits, estimatedInputTokens) {
|
|
240
|
-
const now = Date.now();
|
|
241
|
-
if (now < state.backoffUntil) return state.backoffUntil;
|
|
242
|
-
this.evict(state);
|
|
243
|
-
let nextSlot = now;
|
|
244
|
-
if (state.window.length >= limits.rpm && state.window[0]) {
|
|
245
|
-
nextSlot = Math.max(nextSlot, state.window[0].timestamp + WINDOW_MS + 1);
|
|
246
|
-
}
|
|
247
|
-
if (limits.itpm !== void 0) {
|
|
248
|
-
let usedInput = state.window.reduce((s, e) => s + e.inputTokens, 0);
|
|
249
|
-
if (usedInput + estimatedInputTokens > limits.itpm) {
|
|
250
|
-
for (const entry of state.window) {
|
|
251
|
-
usedInput -= entry.inputTokens;
|
|
252
|
-
if (usedInput + estimatedInputTokens <= limits.itpm) {
|
|
253
|
-
nextSlot = Math.max(nextSlot, entry.timestamp + WINDOW_MS + 1);
|
|
254
|
-
break;
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
return nextSlot;
|
|
260
|
-
}
|
|
261
|
-
/**
|
|
262
|
-
* Schedule a drain of the waiters queue for the given model.
|
|
263
|
-
* Only one drain timer is active at a time per model.
|
|
264
|
-
*/
|
|
265
|
-
scheduleDrain(key, limits) {
|
|
266
|
-
const state = this.states.get(key);
|
|
267
|
-
if (!state || state.drainScheduled) return;
|
|
268
|
-
state.drainScheduled = true;
|
|
269
|
-
const delay = Math.max(0, this.nextSlotAt(state, limits, 0) - Date.now());
|
|
325
|
+
scheduleDrain(key, limits, nextSlotAtMs) {
|
|
326
|
+
const local = this.localStates.get(key);
|
|
327
|
+
if (!local || local.drainScheduled) return;
|
|
328
|
+
local.drainScheduled = true;
|
|
329
|
+
const delay = Math.max(0, nextSlotAtMs - Date.now());
|
|
270
330
|
setTimeout(() => {
|
|
271
|
-
|
|
272
|
-
this.drain(key, limits);
|
|
331
|
+
local.drainScheduled = false;
|
|
332
|
+
void this.drain(key, limits);
|
|
273
333
|
}, delay);
|
|
274
334
|
}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
335
|
+
async drain(key, limits) {
|
|
336
|
+
const local = this.localStates.get(key);
|
|
337
|
+
if (!local) return;
|
|
338
|
+
while (local.waiters.length > 0) {
|
|
339
|
+
const waiter = local.waiters[0];
|
|
340
|
+
const nextSlotAtMs = await this.store.checkAndRecord(
|
|
341
|
+
key,
|
|
342
|
+
waiter.estimatedInputTokens,
|
|
343
|
+
limits
|
|
344
|
+
);
|
|
345
|
+
if (nextSlotAtMs > Date.now()) {
|
|
346
|
+
this.scheduleDrain(key, limits, nextSlotAtMs);
|
|
347
|
+
return;
|
|
348
|
+
}
|
|
349
|
+
if (local.waiters[0] !== waiter) {
|
|
350
|
+
continue;
|
|
351
|
+
}
|
|
352
|
+
local.waiters.shift();
|
|
353
|
+
clearTimeout(waiter.timeoutHandle);
|
|
354
|
+
waiter.resolve();
|
|
292
355
|
}
|
|
293
356
|
}
|
|
294
357
|
};
|
|
@@ -1292,7 +1355,8 @@ var Pipeline = class {
|
|
|
1292
1355
|
constructor(config) {
|
|
1293
1356
|
this.config = config;
|
|
1294
1357
|
this.engine = new RateLimitEngine({
|
|
1295
|
-
maxQueueSize: config.queue?.maxSize ?? 500
|
|
1358
|
+
maxQueueSize: config.queue?.maxSize ?? 500,
|
|
1359
|
+
...config.store !== void 0 && { store: config.store }
|
|
1296
1360
|
});
|
|
1297
1361
|
this.costTracker = new CostTracker();
|
|
1298
1362
|
this.emitter = new Emitter();
|
|
@@ -1461,7 +1525,7 @@ var Pipeline = class {
|
|
|
1461
1525
|
const models = [];
|
|
1462
1526
|
return { models, totalQueueDepth: 0 };
|
|
1463
1527
|
}
|
|
1464
|
-
estimatedWait(modelId, provider, priority = "normal") {
|
|
1528
|
+
async estimatedWait(modelId, provider, priority = "normal") {
|
|
1465
1529
|
const key = `${provider}:${modelId}`;
|
|
1466
1530
|
const limits = this.resolveModelLimits(modelId, provider);
|
|
1467
1531
|
return this.engine.estimatedWaitMs(key, limits);
|