ai-sdk-rate-limiter 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +254 -0
- package/dist/index.cjs +162 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +38 -3
- package/dist/index.d.ts +38 -3
- package/dist/index.js +162 -5
- package/dist/index.js.map +1 -1
- package/dist/middleware.d.cts +1 -1
- package/dist/middleware.d.ts +1 -1
- package/dist/otel.d.cts +1 -1
- package/dist/otel.d.ts +1 -1
- package/dist/prometheus.d.cts +1 -1
- package/dist/prometheus.d.ts +1 -1
- package/dist/redis.d.cts +1 -1
- package/dist/redis.d.ts +1 -1
- package/dist/statsd.d.cts +1 -1
- package/dist/statsd.d.ts +1 -1
- package/dist/testing.cjs +129 -4
- package/dist/testing.cjs.map +1 -1
- package/dist/testing.d.cts +1 -1
- package/dist/testing.d.ts +1 -1
- package/dist/testing.js +129 -4
- package/dist/testing.js.map +1 -1
- package/dist/{types-CUPpMRPE.d.cts → types-DKIz0MLZ.d.cts} +56 -1
- package/dist/{types-CUPpMRPE.d.ts → types-DKIz0MLZ.d.ts} +56 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
Smart rate limiting, queuing, and cost tracking for AI API calls. Works across providers. Zero required dependencies.
|
|
4
4
|
|
|
5
5
|
[](https://www.npmjs.com/package/ai-sdk-rate-limiter)
|
|
6
|
+
[](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml)
|
|
7
|
+
[](https://www.npmjs.com/package/ai-sdk-rate-limiter)
|
|
6
8
|
|
|
7
9
|
```
|
|
8
10
|
npm install ai-sdk-rate-limiter
|
|
@@ -83,6 +85,8 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
|
|
|
83
85
|
|
|
84
86
|
**Fallback chains** — `fallback` now accepts an array of models. On `BudgetExceededError`, the chain is walked in order until one succeeds.
|
|
85
87
|
|
|
88
|
+
**Express / Hono middleware** — `createRateLimiterMiddleware()` (from `ai-sdk-rate-limiter/middleware`) attaches `req.rateLimiter` to every request and converts rate-limiter errors to proper HTTP responses at the middleware layer — no per-route boilerplate.
|
|
89
|
+
|
|
86
90
|
**OpenTelemetry** — Drop-in OTel plugin that emits GenAI-spec spans for every request. Works with any OTel-compatible tracer.
|
|
87
91
|
|
|
88
92
|
**Testing utilities** — `createTestLimiter()` records every completed call so you can assert on model usage, token counts, and costs in unit tests.
|
|
@@ -109,6 +113,7 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
|
|
|
109
113
|
- [Graceful shutdown](#graceful-shutdown)
|
|
110
114
|
- [Prometheus metrics](#prometheus-metrics)
|
|
111
115
|
- [StatsD metrics](#statsd-metrics)
|
|
116
|
+
- [Express / Hono middleware](#express--hono-middleware)
|
|
112
117
|
- [Events](#events)
|
|
113
118
|
- [Backpressure](#backpressure)
|
|
114
119
|
- [Error handling](#error-handling)
|
|
@@ -552,6 +557,27 @@ Costs are based on **actual token counts** from API responses — not estimates.
|
|
|
552
557
|
|
|
553
558
|
`byScope` is populated automatically when requests carry a `scope` (either set on `limiter.wrap()` or via `providerOptions.rateLimiter.scope`). Unscoped requests don't appear in `byScope`.
|
|
554
559
|
|
|
560
|
+
### Cost forecasting
|
|
561
|
+
|
|
562
|
+
`getCostForecast()` projects your end-of-period spend based on the current hourly rate. Useful for alerting before a budget cap is hit:
|
|
563
|
+
|
|
564
|
+
```typescript
|
|
565
|
+
const forecast = limiter.getCostForecast()
|
|
566
|
+
|
|
567
|
+
console.log(forecast)
|
|
568
|
+
// {
|
|
569
|
+
// hour: { spentUsd: 1.20, projectedUsd: 1.20, ratePerHourUsd: 1.20 },
|
|
570
|
+
// day: { spentUsd: 3.50, projectedUsd: 28.80, ratePerHourUsd: 1.20 },
|
|
571
|
+
// month: { spentUsd: 8.10, projectedUsd: 864, ratePerHourUsd: 1.20 },
|
|
572
|
+
// }
|
|
573
|
+
|
|
574
|
+
if (forecast.day.projectedUsd > 40) {
|
|
575
|
+
console.warn(`Heads up — on track to spend $${forecast.day.projectedUsd.toFixed(2)} today`)
|
|
576
|
+
}
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
`projectedUsd` = current hourly rate × hours in the period (24 for a day, 24 × 30 for a month). It is based on the **last 60 minutes** of spend, so it responds quickly to usage spikes.
|
|
580
|
+
|
|
555
581
|
---
|
|
556
582
|
|
|
557
583
|
## Budget fallback routing
|
|
@@ -858,6 +884,121 @@ const client: StatsDClient = {
|
|
|
858
884
|
|
|
859
885
|
---
|
|
860
886
|
|
|
887
|
+
## Express / Hono middleware
|
|
888
|
+
|
|
889
|
+
The `ai-sdk-rate-limiter/middleware` entry point eliminates per-route boilerplate. Scope extraction, priority assignment, and rate-limiter error handling all move to the middleware layer — route handlers just pass `req.rateLimiter` through.
|
|
890
|
+
|
|
891
|
+
### Express
|
|
892
|
+
|
|
893
|
+
```typescript
|
|
894
|
+
import { createRateLimiterMiddleware } from 'ai-sdk-rate-limiter/middleware'
|
|
895
|
+
|
|
896
|
+
const { middleware, errorHandler } = createRateLimiterMiddleware(limiter, {
|
|
897
|
+
// Extract scope from the request — stored in req.rateLimiter.scope
|
|
898
|
+
scope: (req) => {
|
|
899
|
+
const plan = req.headers['x-user-plan'] ?? 'free'
|
|
900
|
+
const id = req.headers['x-user-id']
|
|
901
|
+
return id ? `user:${plan}:${id}` : undefined
|
|
902
|
+
},
|
|
903
|
+
|
|
904
|
+
// Derive queue priority per-request
|
|
905
|
+
priority: (req) => req.headers['x-user-plan'] === 'pro' ? 'normal' : 'low',
|
|
906
|
+
|
|
907
|
+
// Add X-RateLimit-* informational headers to every response
|
|
908
|
+
injectHeaders: 'gpt-4o-mini',
|
|
909
|
+
})
|
|
910
|
+
|
|
911
|
+
app.use(middleware) // BEFORE routes
|
|
912
|
+
|
|
913
|
+
app.post('/chat', async (req, res) => {
|
|
914
|
+
const { text } = await generateText({
|
|
915
|
+
model,
|
|
916
|
+
prompt: req.body.message,
|
|
917
|
+
// req.rateLimiter already has scope + priority — just pass it through
|
|
918
|
+
providerOptions: { rateLimiter: req.rateLimiter },
|
|
919
|
+
})
|
|
920
|
+
res.json({ text })
|
|
921
|
+
})
|
|
922
|
+
|
|
923
|
+
app.use(errorHandler) // AFTER routes
|
|
924
|
+
```
|
|
925
|
+
|
|
926
|
+
The `errorHandler` converts every `RateLimiterError` to a typed HTTP response automatically — no try/catch needed in route handlers:
|
|
927
|
+
|
|
928
|
+
| Error | HTTP status | `code` |
|
|
929
|
+
|---|---|---|
|
|
930
|
+
| `QueueTimeoutError` | 503 | `QUEUE_TIMEOUT` |
|
|
931
|
+
| `QueueFullError` | 503 | `QUEUE_FULL` |
|
|
932
|
+
| `CircuitOpenError` | 503 | `CIRCUIT_OPEN` |
|
|
933
|
+
| `ShutdownError` | 503 | `SHUTDOWN` |
|
|
934
|
+
| `BudgetExceededError` | 402 | `BUDGET_EXCEEDED` |
|
|
935
|
+
| `RateLimiterError` (generic) | 429 | `RATE_LIMITED` |
|
|
936
|
+
|
|
937
|
+
Non-rate-limiter errors are passed to the next error handler unchanged.
|
|
938
|
+
|
|
939
|
+
### Hono
|
|
940
|
+
|
|
941
|
+
```typescript
|
|
942
|
+
import { createHonoMiddleware } from 'ai-sdk-rate-limiter/middleware'
|
|
943
|
+
|
|
944
|
+
app.use(createHonoMiddleware(limiter, {
|
|
945
|
+
scope: (c) => c.req.header('x-user-id'),
|
|
946
|
+
priority: (c) => c.req.header('x-plan') === 'pro' ? 'normal' : 'low',
|
|
947
|
+
}))
|
|
948
|
+
|
|
949
|
+
app.post('/chat', async (c) => {
|
|
950
|
+
const { text } = await generateText({
|
|
951
|
+
model,
|
|
952
|
+
prompt: await c.req.text(),
|
|
953
|
+
providerOptions: { rateLimiter: c.var.rateLimiter },
|
|
954
|
+
})
|
|
955
|
+
return c.json({ text })
|
|
956
|
+
})
|
|
957
|
+
```
|
|
958
|
+
|
|
959
|
+
`createHonoMiddleware` wraps the `next()` call in a try/catch, so any `RateLimiterError` thrown inside a route handler is caught and returned as a JSON response automatically.
|
|
960
|
+
|
|
961
|
+
### Standalone error handler
|
|
962
|
+
|
|
963
|
+
If you only need error handling without scope injection:
|
|
964
|
+
|
|
965
|
+
```typescript
|
|
966
|
+
import { createRateLimiterErrorHandler } from 'ai-sdk-rate-limiter/middleware'
|
|
967
|
+
|
|
968
|
+
app.use(createRateLimiterErrorHandler({
|
|
969
|
+
includeDetails: false, // omit retryAfter, period, limitUsd from response body
|
|
970
|
+
}))
|
|
971
|
+
```
|
|
972
|
+
|
|
973
|
+
### Custom framework (Fastify, etc.)
|
|
974
|
+
|
|
975
|
+
`mapErrorToResponse` is exported for frameworks that don't use the `(req, res, next)` convention:
|
|
976
|
+
|
|
977
|
+
```typescript
|
|
978
|
+
import { mapErrorToResponse } from 'ai-sdk-rate-limiter/middleware'
|
|
979
|
+
import { RateLimiterError } from 'ai-sdk-rate-limiter'
|
|
980
|
+
|
|
981
|
+
// Fastify error handler (setErrorHandler)
|
|
982
|
+
fastify.setErrorHandler((err, request, reply) => {
|
|
983
|
+
if (err instanceof RateLimiterError) {
|
|
984
|
+
const { status, body } = mapErrorToResponse(err)
|
|
985
|
+
return reply.status(status).send(body)
|
|
986
|
+
}
|
|
987
|
+
reply.send(err)
|
|
988
|
+
})
|
|
989
|
+
```
|
|
990
|
+
|
|
991
|
+
### `req.rateLimiter` TypeScript type
|
|
992
|
+
|
|
993
|
+
The middleware augments `http.IncomingMessage` so `req.rateLimiter` is typed in Express and Fastify without any additional setup:
|
|
994
|
+
|
|
995
|
+
```typescript
|
|
996
|
+
import type { RateLimiterRequestContext } from 'ai-sdk-rate-limiter/middleware'
|
|
997
|
+
// req.rateLimiter is automatically typed as RateLimiterRequestContext | undefined
|
|
998
|
+
```
|
|
999
|
+
|
|
1000
|
+
---
|
|
1001
|
+
|
|
861
1002
|
## Events
|
|
862
1003
|
|
|
863
1004
|
All events are typed. Register handlers at creation time or dynamically:
|
|
@@ -1336,6 +1477,57 @@ class MyStore implements RateLimitStore {
|
|
|
1336
1477
|
const limiter = createRateLimiter({ store: new MyStore() })
|
|
1337
1478
|
```
|
|
1338
1479
|
|
|
1480
|
+
### Load balancing across API keys
|
|
1481
|
+
|
|
1482
|
+
`createModelPool()` distributes requests round-robin across multiple model instances — useful when you have more than one API key:
|
|
1483
|
+
|
|
1484
|
+
```typescript
|
|
1485
|
+
import { createRateLimiter, createModelPool } from 'ai-sdk-rate-limiter'
|
|
1486
|
+
import { createOpenAI } from '@ai-sdk/openai'
|
|
1487
|
+
|
|
1488
|
+
// Two API keys, each with their own limiter tracking separate RPM limits
|
|
1489
|
+
const limiter1 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
|
|
1490
|
+
const limiter2 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
|
|
1491
|
+
|
|
1492
|
+
const openai1 = createOpenAI({ apiKey: process.env.OPENAI_KEY_1 })
|
|
1493
|
+
const openai2 = createOpenAI({ apiKey: process.env.OPENAI_KEY_2 })
|
|
1494
|
+
|
|
1495
|
+
const pool = createModelPool([
|
|
1496
|
+
limiter1.wrap(openai1('gpt-4o')),
|
|
1497
|
+
limiter2.wrap(openai2('gpt-4o')),
|
|
1498
|
+
])
|
|
1499
|
+
|
|
1500
|
+
// Use exactly like a regular model — calls alternate between the two keys
|
|
1501
|
+
const { text } = await generateText({ model: pool, prompt: 'Hello!' })
|
|
1502
|
+
```
|
|
1503
|
+
|
|
1504
|
+
Pass `{ strategy: 'random' }` for random selection instead of round-robin.
|
|
1505
|
+
|
|
1506
|
+
### Request deduplication
|
|
1507
|
+
|
|
1508
|
+
When multiple concurrent requests carry the same `dedupKey`, only one API call is made and all callers receive the same result. Useful for FAQ-style workloads where many users ask the same question simultaneously:
|
|
1509
|
+
|
|
1510
|
+
```typescript
|
|
1511
|
+
const model = limiter.wrap(openai('gpt-4o'))
|
|
1512
|
+
|
|
1513
|
+
// Server handler — two simultaneous identical requests share one API call
|
|
1514
|
+
async function handleRequest(questionId: string) {
|
|
1515
|
+
const { text } = await generateText({
|
|
1516
|
+
model,
|
|
1517
|
+
prompt: questions[questionId],
|
|
1518
|
+
providerOptions: {
|
|
1519
|
+
rateLimiter: { dedupKey: `faq:${questionId}` },
|
|
1520
|
+
},
|
|
1521
|
+
})
|
|
1522
|
+
return text
|
|
1523
|
+
}
|
|
1524
|
+
|
|
1525
|
+
// If 50 users hit the same FAQ item at the same time → 1 API call, not 50
|
|
1526
|
+
const results = await Promise.all(users.map(() => handleRequest('faq-42')))
|
|
1527
|
+
```
|
|
1528
|
+
|
|
1529
|
+
The dedup entry is removed once the request completes (success or error), so subsequent requests always make a fresh call.
|
|
1530
|
+
|
|
1339
1531
|
---
|
|
1340
1532
|
|
|
1341
1533
|
## How it works
|
|
@@ -1358,6 +1550,42 @@ const limiter = createRateLimiter({ store: new MyStore() })
|
|
|
1358
1550
|
|
|
1359
1551
|
---
|
|
1360
1552
|
|
|
1553
|
+
## Debug mode
|
|
1554
|
+
|
|
1555
|
+
Set `debug: true` to enable structured console logging for every rate-limit decision, queue entry/exit, slot acquisition, circuit breaker transition, and completed call cost:
|
|
1556
|
+
|
|
1557
|
+
```typescript
|
|
1558
|
+
const limiter = createRateLimiter({ debug: true })
|
|
1559
|
+
```
|
|
1560
|
+
|
|
1561
|
+
Sample output:
|
|
1562
|
+
|
|
1563
|
+
```
|
|
1564
|
+
[ai-sdk-rate-limiter] gpt-4o: execute (provider="openai" priority="normal")
|
|
1565
|
+
[ai-sdk-rate-limiter] gpt-4o: queuing (queueDepth=3 estimatedWaitMs=1200 priority="normal")
|
|
1566
|
+
[ai-sdk-rate-limiter] gpt-4o: dequeued (waitedMs=1187 priority="normal")
|
|
1567
|
+
[ai-sdk-rate-limiter] gpt-4o: completed (tokens=342+87 costUsd=0.000021 latencyMs=1343 streaming=false)
|
|
1568
|
+
```
|
|
1569
|
+
|
|
1570
|
+
When debug logging is disabled, the logger returns immediately — no message formatting and no `JSON.stringify` — so the only remaining cost is building the arguments at each call site.
|
|
1571
|
+
|
|
1572
|
+
---
|
|
1573
|
+
|
|
1574
|
+
## Config validation
|
|
1575
|
+
|
|
1576
|
+
`createRateLimiter()` validates your configuration at construction time. If it spots a likely misconfiguration, it logs a `console.warn` (it never throws). The checks currently covered:
|
|
1577
|
+
|
|
1578
|
+
| Issue | Warning |
|
|
1579
|
+
|---|---|
|
|
1580
|
+
| `cost.store` is set | Reminds you to call `warmUp()` at startup |
|
|
1581
|
+
| `circuit.failureThreshold < 3` | Too sensitive — risks false trips on transient errors |
|
|
1582
|
+
| `retry.retryOn` excludes 429 | Rate-limit errors won't be retried |
|
|
1583
|
+
| `queue.timeout < 3000ms` | Requests will time out before they can be served |
|
|
1584
|
+
| `cost.budget` set without `onExceeded` | Silent default is `'throw'` — may want `'queue'` or `'fallback'` |
|
|
1585
|
+
| `cost.onExceeded: 'fallback'` | Reminds you to pass a `fallback` model to `limiter.wrap()` |
|
|
1586
|
+
|
|
1587
|
+
---
|
|
1588
|
+
|
|
1361
1589
|
## Comparison
|
|
1362
1590
|
|
|
1363
1591
|
| | ai-sdk-rate-limiter | bottleneck | p-limit | SDK built-in retry | LangChain |
|
|
@@ -1382,6 +1610,7 @@ const limiter = createRateLimiter({ store: new MyStore() })
|
|
|
1382
1610
|
| Backoff propagation | yes | no | no | no | no |
|
|
1383
1611
|
| Prometheus metrics | yes | no | no | no | no |
|
|
1384
1612
|
| StatsD metrics | yes | no | no | no | no |
|
|
1613
|
+
| Express/Hono middleware | yes | no | no | no | no |
|
|
1385
1614
|
| OpenTelemetry | yes | no | no | no | partial |
|
|
1386
1615
|
| Testing utilities | yes | no | no | no | no |
|
|
1387
1616
|
| CLI audit | yes | no | no | no | no |
|
|
@@ -1425,6 +1654,14 @@ import type {
|
|
|
1425
1654
|
} from 'ai-sdk-rate-limiter/redis'
|
|
1426
1655
|
|
|
1427
1656
|
import type { StatsDClient } from 'ai-sdk-rate-limiter/statsd'
|
|
1657
|
+
|
|
1658
|
+
import type {
|
|
1659
|
+
RateLimiterRequestContext,
|
|
1660
|
+
RateLimiterMiddlewareOptions,
|
|
1661
|
+
ErrorHandlerOptions,
|
|
1662
|
+
HonoMiddlewareOptions,
|
|
1663
|
+
HonoContext,
|
|
1664
|
+
} from 'ai-sdk-rate-limiter/middleware'
|
|
1428
1665
|
```
|
|
1429
1666
|
|
|
1430
1667
|
---
|
|
@@ -1442,6 +1679,23 @@ Four runnable examples are included, each with its own README:
|
|
|
1442
1679
|
|
|
1443
1680
|
---
|
|
1444
1681
|
|
|
1682
|
+
## Bundle sizes
|
|
1683
|
+
|
|
1684
|
+
Each entry point is independently tree-shakeable. Importing `ai-sdk-rate-limiter` never pulls in Redis, Prometheus, OTel, or StatsD.
|
|
1685
|
+
|
|
1686
|
+
| Entry point | Size (minified) | Size (gzip) |
|
|
1687
|
+
|---|---|---|
|
|
1688
|
+
| `ai-sdk-rate-limiter` | ~80 KB | ~22 KB |
|
|
1689
|
+
| `ai-sdk-rate-limiter/redis` | ~12 KB | ~4 KB |
|
|
1690
|
+
| `ai-sdk-rate-limiter/middleware` | ~8 KB | ~2.5 KB |
|
|
1691
|
+
| `ai-sdk-rate-limiter/prometheus` | ~8 KB | ~2.5 KB |
|
|
1692
|
+
| `ai-sdk-rate-limiter/otel` | ~4 KB | ~1.5 KB |
|
|
1693
|
+
| `ai-sdk-rate-limiter/statsd` | ~4 KB | ~1.2 KB |
|
|
1694
|
+
|
|
1695
|
+
The core package is self-contained. Optional peer deps (`ioredis`, `@opentelemetry/api`) are only loaded when you import the corresponding entry point.
|
|
1696
|
+
|
|
1697
|
+
---
|
|
1698
|
+
|
|
1445
1699
|
## Requirements
|
|
1446
1700
|
|
|
1447
1701
|
- Node.js 18+ / Bun / Deno
|
package/dist/index.cjs
CHANGED
|
@@ -668,6 +668,30 @@ var CostTracker = class {
|
|
|
668
668
|
estimateCost(inputTokens, outputTokens, inputPricePerMillion, outputPricePerMillion) {
|
|
669
669
|
return inputTokens / 1e6 * inputPricePerMillion + outputTokens / 1e6 * outputPricePerMillion;
|
|
670
670
|
}
|
|
671
|
+
getForecast() {
|
|
672
|
+
const now = Date.now();
|
|
673
|
+
this.evict(now);
|
|
674
|
+
const hourlyRate = this.entries.filter((e) => e.timestamp > now - HOUR_MS).reduce((s, e) => s + e.costUsd, 0);
|
|
675
|
+
const daySpent = this.entries.filter((e) => e.timestamp > now - DAY_MS2).reduce((s, e) => s + e.costUsd, 0);
|
|
676
|
+
const monthSpent = this.entries.filter((e) => e.timestamp > now - MONTH_MS).reduce((s, e) => s + e.costUsd, 0);
|
|
677
|
+
return {
|
|
678
|
+
hour: {
|
|
679
|
+
spentUsd: hourlyRate,
|
|
680
|
+
projectedUsd: hourlyRate,
|
|
681
|
+
ratePerHourUsd: hourlyRate
|
|
682
|
+
},
|
|
683
|
+
day: {
|
|
684
|
+
spentUsd: daySpent,
|
|
685
|
+
projectedUsd: hourlyRate * 24,
|
|
686
|
+
ratePerHourUsd: hourlyRate
|
|
687
|
+
},
|
|
688
|
+
month: {
|
|
689
|
+
spentUsd: monthSpent,
|
|
690
|
+
projectedUsd: hourlyRate * 24 * 30,
|
|
691
|
+
ratePerHourUsd: hourlyRate
|
|
692
|
+
}
|
|
693
|
+
};
|
|
694
|
+
}
|
|
671
695
|
getReport() {
|
|
672
696
|
const now = Date.now();
|
|
673
697
|
this.evict(now);
|
|
@@ -1678,6 +1702,23 @@ var CircuitBreaker = class {
|
|
|
1678
1702
|
}
|
|
1679
1703
|
};
|
|
1680
1704
|
|
|
1705
|
+
// src/core/debug-logger.ts
|
|
1706
|
+
var PREFIX = "[ai-sdk-rate-limiter]";
|
|
1707
|
+
var DebugLogger = class {
|
|
1708
|
+
constructor(enabled) {
|
|
1709
|
+
this.enabled = enabled;
|
|
1710
|
+
}
|
|
1711
|
+
log(model, message, details) {
|
|
1712
|
+
if (!this.enabled) return;
|
|
1713
|
+
if (details && Object.keys(details).length > 0) {
|
|
1714
|
+
const parts = Object.entries(details).map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(" ");
|
|
1715
|
+
console.log(`${PREFIX} ${model}: ${message} (${parts})`);
|
|
1716
|
+
} else {
|
|
1717
|
+
console.log(`${PREFIX} ${model}: ${message}`);
|
|
1718
|
+
}
|
|
1719
|
+
}
|
|
1720
|
+
};
|
|
1721
|
+
|
|
1681
1722
|
// src/core/pipeline.ts
|
|
1682
1723
|
function resolveRetryConfig(config) {
|
|
1683
1724
|
const r = config.retry ?? {};
|
|
@@ -1717,9 +1758,12 @@ var Pipeline = class {
|
|
|
1717
1758
|
this.circuits = /* @__PURE__ */ new Map();
|
|
1718
1759
|
/** Limits detected from provider response headers (lower priority than user config) */
|
|
1719
1760
|
this.detectedLimits = /* @__PURE__ */ new Map();
|
|
1761
|
+
/** In-flight promises indexed by dedup key, shared across identical concurrent requests */
|
|
1762
|
+
this.dedupMap = /* @__PURE__ */ new Map();
|
|
1720
1763
|
/** Set to true after shutdown() is called */
|
|
1721
1764
|
this.shutdownRequested = false;
|
|
1722
1765
|
this.config = config;
|
|
1766
|
+
this.log = new DebugLogger(config.debug === true);
|
|
1723
1767
|
this.engine = new RateLimitEngine({
|
|
1724
1768
|
maxQueueSize: config.queue?.maxSize ?? 500,
|
|
1725
1769
|
...config.store !== void 0 && { store: config.store }
|
|
@@ -1761,6 +1805,20 @@ var Pipeline = class {
|
|
|
1761
1805
|
* recordUsage() once they have actual token counts from the API response.
|
|
1762
1806
|
*/
|
|
1763
1807
|
async execute(modelId, provider, prompt, fn, opts) {
|
|
1808
|
+
this.log.log(modelId, "execute", { provider, priority: opts.priority, ...opts.scope !== void 0 && { scope: opts.scope } });
|
|
1809
|
+
if (opts.dedupKey !== void 0) {
|
|
1810
|
+
const existing = this.dedupMap.get(opts.dedupKey);
|
|
1811
|
+
if (existing !== void 0) {
|
|
1812
|
+
this.log.log(modelId, "dedup hit", { dedupKey: opts.dedupKey });
|
|
1813
|
+
return existing;
|
|
1814
|
+
}
|
|
1815
|
+
const { dedupKey, ...optsWithoutDedup } = opts;
|
|
1816
|
+
const promise = this.execute(modelId, provider, prompt, fn, optsWithoutDedup).finally(() => {
|
|
1817
|
+
this.dedupMap.delete(dedupKey);
|
|
1818
|
+
});
|
|
1819
|
+
this.dedupMap.set(dedupKey, promise);
|
|
1820
|
+
return promise;
|
|
1821
|
+
}
|
|
1764
1822
|
if (this.shutdownRequested) {
|
|
1765
1823
|
this.emitter.emit("dropped", {
|
|
1766
1824
|
model: modelId,
|
|
@@ -1850,10 +1908,12 @@ var Pipeline = class {
|
|
|
1850
1908
|
timeoutMs: opts.timeoutMs,
|
|
1851
1909
|
...opts.signal !== void 0 && { signal: opts.signal },
|
|
1852
1910
|
onQueued: (queueDepth, estimatedWaitMs) => {
|
|
1911
|
+
this.log.log(modelId, "queuing", { queueDepth, estimatedWaitMs, priority: opts.priority });
|
|
1853
1912
|
this.emitter.emit("queued", { model: modelId, provider, priority: opts.priority, queueDepth, estimatedWaitMs });
|
|
1854
1913
|
this.emitter.emit("rateLimited", { source: "local", model: modelId, provider, limitType: "rpm", resetAt: Date.now() + estimatedWaitMs });
|
|
1855
1914
|
},
|
|
1856
1915
|
onDequeued: (waitedMs) => {
|
|
1916
|
+
this.log.log(modelId, "dequeued", { waitedMs, priority: opts.priority });
|
|
1857
1917
|
this.emitter.emit("dequeued", { model: modelId, provider, waitedMs, priority: opts.priority });
|
|
1858
1918
|
}
|
|
1859
1919
|
});
|
|
@@ -1905,7 +1965,10 @@ var Pipeline = class {
|
|
|
1905
1965
|
});
|
|
1906
1966
|
if (circuit) {
|
|
1907
1967
|
const justClosed = circuit.recordSuccess();
|
|
1908
|
-
if (justClosed)
|
|
1968
|
+
if (justClosed) {
|
|
1969
|
+
this.log.log(modelId, "circuit closed \u2014 upstream recovered");
|
|
1970
|
+
this.emitter.emit("circuitClosed", { model: modelId, provider });
|
|
1971
|
+
}
|
|
1909
1972
|
}
|
|
1910
1973
|
return result;
|
|
1911
1974
|
} catch (error) {
|
|
@@ -1915,6 +1978,7 @@ var Pipeline = class {
|
|
|
1915
1978
|
if (shouldTrip) {
|
|
1916
1979
|
const justOpened = circuit.recordFailure();
|
|
1917
1980
|
if (justOpened) {
|
|
1981
|
+
this.log.log(modelId, "circuit OPEN", { status, cooldownMs: this.config.circuit?.cooldownMs ?? 6e4 });
|
|
1918
1982
|
this.emitter.emit("circuitOpen", {
|
|
1919
1983
|
model: modelId,
|
|
1920
1984
|
provider,
|
|
@@ -1945,6 +2009,13 @@ var Pipeline = class {
|
|
|
1945
2009
|
limits.outputPricePerMillion,
|
|
1946
2010
|
scope
|
|
1947
2011
|
);
|
|
2012
|
+
this.log.log(modelId, "completed", {
|
|
2013
|
+
tokens: `${usage.inputTokens}+${usage.outputTokens}`,
|
|
2014
|
+
costUsd: costUsd.toFixed(6),
|
|
2015
|
+
latencyMs,
|
|
2016
|
+
streaming,
|
|
2017
|
+
...scope !== void 0 && { scope }
|
|
2018
|
+
});
|
|
1948
2019
|
this.emitter.emit("completed", {
|
|
1949
2020
|
model: modelId,
|
|
1950
2021
|
provider,
|
|
@@ -1962,6 +2033,9 @@ var Pipeline = class {
|
|
|
1962
2033
|
getCostReport() {
|
|
1963
2034
|
return this.costTracker.getReport();
|
|
1964
2035
|
}
|
|
2036
|
+
getCostForecast() {
|
|
2037
|
+
return this.costTracker.getForecast();
|
|
2038
|
+
}
|
|
1965
2039
|
getStatus() {
|
|
1966
2040
|
const models = [];
|
|
1967
2041
|
let totalQueueDepth = 0;
|
|
@@ -2090,7 +2164,8 @@ function getPerRequestOptions(params, queueTimeout) {
|
|
|
2090
2164
|
metadata: raw?.metadata ?? {},
|
|
2091
2165
|
skipBudgetCheck: raw?._skipBudgetCheck ?? false,
|
|
2092
2166
|
scope: raw?.scope,
|
|
2093
|
-
callTimeout: raw?.callTimeout
|
|
2167
|
+
callTimeout: raw?.callTimeout,
|
|
2168
|
+
dedupKey: raw?.dedupKey
|
|
2094
2169
|
};
|
|
2095
2170
|
}
|
|
2096
2171
|
function extractTokenUsage(usage) {
|
|
@@ -2106,7 +2181,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2106
2181
|
// wrapGenerate — non-streaming
|
|
2107
2182
|
// -----------------------------------------------------------------------
|
|
2108
2183
|
async wrapGenerate({ doGenerate, params, model }) {
|
|
2109
|
-
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
|
|
2184
|
+
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
|
|
2110
2185
|
const modelId = model.modelId;
|
|
2111
2186
|
const provider = model.provider;
|
|
2112
2187
|
const startMs = Date.now();
|
|
@@ -2122,6 +2197,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2122
2197
|
skipBudgetCheck,
|
|
2123
2198
|
...scope !== void 0 && { scope },
|
|
2124
2199
|
...callTimeout !== void 0 && { callTimeout },
|
|
2200
|
+
...dedupKey !== void 0 && { dedupKey },
|
|
2125
2201
|
...params.abortSignal !== void 0 && { signal: params.abortSignal }
|
|
2126
2202
|
}
|
|
2127
2203
|
);
|
|
@@ -2135,7 +2211,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2135
2211
|
// wrapStream — streaming
|
|
2136
2212
|
// -----------------------------------------------------------------------
|
|
2137
2213
|
async wrapStream({ doStream, params, model }) {
|
|
2138
|
-
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
|
|
2214
|
+
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
|
|
2139
2215
|
const modelId = model.modelId;
|
|
2140
2216
|
const provider = model.provider;
|
|
2141
2217
|
const startMs = Date.now();
|
|
@@ -2151,6 +2227,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2151
2227
|
skipBudgetCheck,
|
|
2152
2228
|
...scope !== void 0 && { scope },
|
|
2153
2229
|
...callTimeout !== void 0 && { callTimeout },
|
|
2230
|
+
...dedupKey !== void 0 && { dedupKey },
|
|
2154
2231
|
...params.abortSignal !== void 0 && { signal: params.abortSignal }
|
|
2155
2232
|
}
|
|
2156
2233
|
);
|
|
@@ -2416,8 +2493,53 @@ function rateLimited(client, options = {}) {
|
|
|
2416
2493
|
});
|
|
2417
2494
|
}
|
|
2418
2495
|
|
|
2496
|
+
// src/core/config-validator.ts
|
|
2497
|
+
var PREFIX2 = "\x1B[33m\u26A0 ai-sdk-rate-limiter\x1B[0m";
|
|
2498
|
+
var RESET = "\x1B[0m";
|
|
2499
|
+
function validateConfig(config) {
|
|
2500
|
+
const warnings = [];
|
|
2501
|
+
if (config.cost?.store !== void 0) {
|
|
2502
|
+
warnings.push(
|
|
2503
|
+
"cost.store is configured \u2014 call `await limiter.warmUp()` at startup.\n Without it, budget caps won't account for spend from previous process runs."
|
|
2504
|
+
);
|
|
2505
|
+
}
|
|
2506
|
+
const threshold = config.circuit?.failureThreshold;
|
|
2507
|
+
if (threshold !== void 0 && threshold < 3) {
|
|
2508
|
+
warnings.push(
|
|
2509
|
+
`circuit.failureThreshold is ${threshold} \u2014 very low. The circuit will open after nearly every error. Consider a value of 5 or higher for typical production workloads.`
|
|
2510
|
+
);
|
|
2511
|
+
}
|
|
2512
|
+
if (config.retry?.retryOn !== void 0 && !config.retry.retryOn.includes(429)) {
|
|
2513
|
+
warnings.push(
|
|
2514
|
+
"retry.retryOn does not include 429. Rate limit errors from the API will not be retried. Add 429 to retry.retryOn, or remove the override to use the default."
|
|
2515
|
+
);
|
|
2516
|
+
}
|
|
2517
|
+
const queueTimeout = config.queue?.timeout;
|
|
2518
|
+
if (queueTimeout !== void 0 && queueTimeout < 3e3) {
|
|
2519
|
+
warnings.push(
|
|
2520
|
+
`queue.timeout is ${queueTimeout}ms \u2014 less than 3 seconds. Requests may time out before the rate limit window resets (typically 60s). Consider 30_000ms (30s) or higher.`
|
|
2521
|
+
);
|
|
2522
|
+
}
|
|
2523
|
+
if (config.cost?.onExceeded === "fallback") {
|
|
2524
|
+
warnings.push(
|
|
2525
|
+
"cost.onExceeded is 'fallback' but fallback models are configured per-model in limiter.wrap(model, { fallback: cheaperModel }). If no fallback is set on a wrapped model, BudgetExceededError will still be thrown."
|
|
2526
|
+
);
|
|
2527
|
+
}
|
|
2528
|
+
if (config.cost?.budget !== void 0 && config.cost.onExceeded === void 0) {
|
|
2529
|
+
warnings.push(
|
|
2530
|
+
"cost.budget is set but cost.onExceeded is not. Defaulting to 'throw' \u2014 requests will throw BudgetExceededError when the cap is hit. Set onExceeded: 'queue' or 'fallback' to change this behavior."
|
|
2531
|
+
);
|
|
2532
|
+
}
|
|
2533
|
+
for (const warning of warnings) {
|
|
2534
|
+
const formatted = warning.replace(/\n/g, `
|
|
2535
|
+
`);
|
|
2536
|
+
console.warn(`${PREFIX2}: ${formatted}${RESET}`);
|
|
2537
|
+
}
|
|
2538
|
+
}
|
|
2539
|
+
|
|
2419
2540
|
// src/create-rate-limiter.ts
|
|
2420
2541
|
function createRateLimiter(config = {}) {
|
|
2542
|
+
validateConfig(config);
|
|
2421
2543
|
const pipeline = new Pipeline(config);
|
|
2422
2544
|
const queueTimeout = config.queue?.timeout ?? 3e4;
|
|
2423
2545
|
const middleware = createMiddleware(pipeline, queueTimeout);
|
|
@@ -2431,6 +2553,9 @@ function createRateLimiter(config = {}) {
|
|
|
2431
2553
|
getCostReport() {
|
|
2432
2554
|
return pipeline.getCostReport();
|
|
2433
2555
|
},
|
|
2556
|
+
getCostForecast() {
|
|
2557
|
+
return pipeline.getCostForecast();
|
|
2558
|
+
},
|
|
2434
2559
|
getStatus() {
|
|
2435
2560
|
return pipeline.getStatus();
|
|
2436
2561
|
},
|
|
@@ -2455,6 +2580,38 @@ function createRateLimiter(config = {}) {
|
|
|
2455
2580
|
};
|
|
2456
2581
|
}
|
|
2457
2582
|
|
|
2583
|
+
// src/adapters/model-pool.ts
|
|
2584
|
+
function createModelPool(models, options) {
|
|
2585
|
+
if (models.length === 0) {
|
|
2586
|
+
throw new Error("createModelPool: at least one model is required");
|
|
2587
|
+
}
|
|
2588
|
+
const strategy = options?.strategy ?? "round-robin";
|
|
2589
|
+
let index = 0;
|
|
2590
|
+
function pick() {
|
|
2591
|
+
if (strategy === "random") {
|
|
2592
|
+
return models[Math.floor(Math.random() * models.length)];
|
|
2593
|
+
}
|
|
2594
|
+
const model = models[index % models.length];
|
|
2595
|
+
index = (index + 1) % models.length;
|
|
2596
|
+
return model;
|
|
2597
|
+
}
|
|
2598
|
+
const primary = models[0];
|
|
2599
|
+
return {
|
|
2600
|
+
get modelId() {
|
|
2601
|
+
return primary.modelId;
|
|
2602
|
+
},
|
|
2603
|
+
get provider() {
|
|
2604
|
+
return primary.provider;
|
|
2605
|
+
},
|
|
2606
|
+
doGenerate(params) {
|
|
2607
|
+
return pick().doGenerate(params);
|
|
2608
|
+
},
|
|
2609
|
+
doStream(params) {
|
|
2610
|
+
return pick().doStream(params);
|
|
2611
|
+
}
|
|
2612
|
+
};
|
|
2613
|
+
}
|
|
2614
|
+
|
|
2458
2615
|
exports.ANTHROPIC_MODELS = ANTHROPIC_MODELS;
|
|
2459
2616
|
exports.BudgetExceededError = BudgetExceededError;
|
|
2460
2617
|
exports.COHERE_MODELS = COHERE_MODELS;
|
|
@@ -2469,6 +2626,7 @@ exports.RateLimitExceededError = RateLimitExceededError;
|
|
|
2469
2626
|
exports.RateLimiterError = RateLimiterError;
|
|
2470
2627
|
exports.RetryExhaustedError = RetryExhaustedError;
|
|
2471
2628
|
exports.ShutdownError = ShutdownError;
|
|
2629
|
+
exports.createModelPool = createModelPool;
|
|
2472
2630
|
exports.createRateLimiter = createRateLimiter;
|
|
2473
2631
|
exports.isKnownModel = isKnownModel;
|
|
2474
2632
|
exports.rateLimited = rateLimited;
|