ai-sdk-rate-limiter 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -12
- package/dist/index.cjs +626 -155
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -3
- package/dist/index.d.ts +17 -3
- package/dist/index.js +625 -156
- package/dist/index.js.map +1 -1
- package/dist/otel.d.cts +1 -1
- package/dist/otel.d.ts +1 -1
- package/dist/prometheus.cjs +133 -0
- package/dist/prometheus.cjs.map +1 -0
- package/dist/prometheus.d.cts +37 -0
- package/dist/prometheus.d.ts +37 -0
- package/dist/prometheus.js +131 -0
- package/dist/prometheus.js.map +1 -0
- package/dist/redis.cjs +160 -19
- package/dist/redis.cjs.map +1 -1
- package/dist/redis.d.cts +39 -2
- package/dist/redis.d.ts +39 -2
- package/dist/redis.js +160 -20
- package/dist/redis.js.map +1 -1
- package/dist/statsd.cjs +67 -0
- package/dist/statsd.cjs.map +1 -0
- package/dist/statsd.d.cts +46 -0
- package/dist/statsd.d.ts +46 -0
- package/dist/statsd.js +65 -0
- package/dist/statsd.js.map +1 -0
- package/dist/testing.cjs +624 -155
- package/dist/testing.cjs.map +1 -1
- package/dist/testing.d.cts +1 -1
- package/dist/testing.d.ts +1 -1
- package/dist/testing.js +624 -155
- package/dist/testing.js.map +1 -1
- package/dist/{types-D7qskXNw.d.cts → types-CUPpMRPE.d.cts} +146 -4
- package/dist/{types-D7qskXNw.d.ts → types-CUPpMRPE.d.ts} +146 -4
- package/package.json +21 -1
package/README.md
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
Smart rate limiting, queuing, and cost tracking for AI API calls. Works across providers. Zero required dependencies.
|
|
4
4
|
|
|
5
|
+
[ai-sdk-rate-limiter on npm](https://www.npmjs.com/package/ai-sdk-rate-limiter)
|
|
6
|
+
|
|
5
7
|
```
|
|
6
8
|
npm install ai-sdk-rate-limiter
|
|
7
9
|
```
|
|
@@ -63,6 +65,24 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
|
|
|
63
65
|
|
|
64
66
|
**Raw SDK support** — Works with the native OpenAI, Anthropic, Groq, Mistral, and Cohere SDKs directly via a transparent JavaScript Proxy. No Vercel AI SDK required.
|
|
65
67
|
|
|
68
|
+
**Circuit breaker** — Automatically opens on repeated 5xx failures, blocking requests until the upstream recovers. Transitions to half-open state to probe recovery, then closes once healthy.
|
|
69
|
+
|
|
70
|
+
**Graceful shutdown** — `limiter.shutdown()` drains in-flight requests before the process exits. New requests received during shutdown are rejected with `ShutdownError`.
|
|
71
|
+
|
|
72
|
+
**Persistent cost tracking** — `RedisCostStore` (from `ai-sdk-rate-limiter/redis`) survives process restarts so budget caps remain accurate. `warmUp()` pre-loads historical spend from the store on startup.
|
|
73
|
+
|
|
74
|
+
**Per-scope cost attribution** — `getCostReport()` includes a `byScope` breakdown so you can see exact spend per user, org, or tenant.
|
|
75
|
+
|
|
76
|
+
**Auto-detected limits** — Parses `x-ratelimit-limit-*` headers from every response and tightens the local windows automatically. Your config always wins; detected values fill in where you haven't overridden.
|
|
77
|
+
|
|
78
|
+
**Prometheus metrics** — `createPrometheusPlugin()` (from `ai-sdk-rate-limiter/prometheus`) exports counters, gauges, and latency summaries covering every request, token, cost, retry, and queue event.
|
|
79
|
+
|
|
80
|
+
**StatsD / DogStatsD** — `createStatsDPlugin()` (from `ai-sdk-rate-limiter/statsd`) bridges all events to any StatsD-compatible client.
|
|
81
|
+
|
|
82
|
+
**Call timeout** — The `callTimeout` option kills a hung AI call after N milliseconds via `Promise.race()`, independent of the Vercel AI SDK `abortSignal`.
|
|
83
|
+
|
|
84
|
+
**Fallback chains** — `fallback` now accepts an array of models. On `BudgetExceededError`, the chain is walked in order until one succeeds.
|
|
85
|
+
|
|
66
86
|
**OpenTelemetry** — Drop-in OTel plugin that emits GenAI-spec spans for every request. Works with any OTel-compatible tracer.
|
|
67
87
|
|
|
68
88
|
**Testing utilities** — `createTestLimiter()` records every completed call so you can assert on model usage, token counts, and costs in unit tests.
|
|
@@ -80,9 +100,15 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
|
|
|
80
100
|
- [Multi-tenant scoped limits](#multi-tenant-scoped-limits)
|
|
81
101
|
- [Concurrency limits](#concurrency-limits)
|
|
82
102
|
- [AbortSignal support](#abortsignal-support)
|
|
103
|
+
- [Call timeout](#call-timeout)
|
|
83
104
|
- [Cost tracking](#cost-tracking)
|
|
84
105
|
- [Budget fallback routing](#budget-fallback-routing)
|
|
106
|
+
- [Persistent cost tracking](#persistent-cost-tracking)
|
|
85
107
|
- [Multi-instance Redis store](#multi-instance-redis-store)
|
|
108
|
+
- [Circuit breaker](#circuit-breaker)
|
|
109
|
+
- [Graceful shutdown](#graceful-shutdown)
|
|
110
|
+
- [Prometheus metrics](#prometheus-metrics)
|
|
111
|
+
- [StatsD metrics](#statsd-metrics)
|
|
86
112
|
- [Events](#events)
|
|
87
113
|
- [Backpressure](#backpressure)
|
|
88
114
|
- [Error handling](#error-handling)
|
|
@@ -224,6 +250,9 @@ const limiter = createRateLimiter({
|
|
|
224
250
|
limits: {
|
|
225
251
|
'gpt-4o': { rpm: 500, itpm: 30_000, maxConcurrent: 20 },
|
|
226
252
|
'claude-opus-4-6': { rpm: 50, itpm: 20_000 },
|
|
253
|
+
// rpd — requests per day (enforced in a rolling 24-hour window)
|
|
254
|
+
// otpm — output tokens per minute (based on actuals from completed requests)
|
|
255
|
+
'gemini-1.5-flash': { rpm: 15, rpd: 1_500, otpm: 500_000 },
|
|
227
256
|
},
|
|
228
257
|
|
|
229
258
|
// Cost budgets and behavior when exceeded
|
|
@@ -234,6 +263,14 @@ const limiter = createRateLimiter({
|
|
|
234
263
|
monthly: 500, // USD — hard cap per month
|
|
235
264
|
},
|
|
236
265
|
onExceeded: 'throw', // 'throw' | 'queue' | 'fallback'
|
|
266
|
+
store: new RedisCostStore(redis), // persist cost history across restarts (optional)
|
|
267
|
+
},
|
|
268
|
+
|
|
269
|
+
// Circuit breaker — open on repeated 5xx failures, probe recovery automatically
|
|
270
|
+
circuit: {
|
|
271
|
+
failureThreshold: 5, // consecutive failures before opening
|
|
272
|
+
cooldownMs: 30_000, // how long to stay open before probing
|
|
273
|
+
tripOn: [500, 502, 503, 504], // which status codes count as failures
|
|
237
274
|
},
|
|
238
275
|
|
|
239
276
|
// Queue behavior
|
|
@@ -252,6 +289,7 @@ const limiter = createRateLimiter({
|
|
|
252
289
|
jitter: true, // ±30% randomness (prevents thundering herd)
|
|
253
290
|
parseRetryAfter: true, // honor Retry-After header from 429 responses
|
|
254
291
|
retryOn: [429, 500, 502, 503, 504],
|
|
292
|
+
callTimeout: 30_000, // ms — kill hung AI calls via Promise.race()
|
|
255
293
|
},
|
|
256
294
|
|
|
257
295
|
// Per-scope rate limit overrides for multi-tenant use cases
|
|
@@ -290,8 +328,9 @@ await generateText({
|
|
|
290
328
|
prompt: 'Urgent user request...',
|
|
291
329
|
providerOptions: {
|
|
292
330
|
rateLimiter: {
|
|
293
|
-
priority:
|
|
294
|
-
timeout:
|
|
331
|
+
priority: 'high', // 'high' | 'normal' | 'low'
|
|
332
|
+
timeout: 10_000, // override the default queue timeout for this request
|
|
333
|
+
callTimeout: 15_000, // kill the AI call itself if it hangs beyond 15s
|
|
295
334
|
},
|
|
296
335
|
},
|
|
297
336
|
})
|
|
@@ -373,7 +412,17 @@ await generateText({
|
|
|
373
412
|
})
|
|
374
413
|
```
|
|
375
414
|
|
|
376
|
-
**
|
|
415
|
+
**Model limit fields:**
|
|
416
|
+
|
|
417
|
+
| Field | Description |
|
|
418
|
+
|---|---|
|
|
419
|
+
| `rpm` | Max requests per minute |
|
|
420
|
+
| `itpm` | Max input tokens per minute |
|
|
421
|
+
| `otpm` | Max output tokens per minute (based on completed request actuals) |
|
|
422
|
+
| `rpd` | Max requests per day (rolling 24-hour window) |
|
|
423
|
+
| `maxConcurrent` | Max concurrent in-flight requests |
|
|
424
|
+
|
|
425
|
+
**Scope fields (`config.scopes`):**
|
|
377
426
|
|
|
378
427
|
| Field | Description |
|
|
379
428
|
|---|---|
|
|
@@ -453,6 +502,30 @@ The signal threads through both the rate-limit queue and the concurrency queue.
|
|
|
453
502
|
|
|
454
503
|
---
|
|
455
504
|
|
|
505
|
+
## Call timeout
|
|
506
|
+
|
|
507
|
+
`callTimeout` kills a hung AI API call after N milliseconds using `Promise.race()`. This is distinct from the queue `timeout` (which fires if a request waits too long to _start_) — `callTimeout` fires if the request is already executing but the API hasn't responded.
|
|
508
|
+
|
|
509
|
+
```typescript
|
|
510
|
+
// Global default for all requests
|
|
511
|
+
const limiter = createRateLimiter({
|
|
512
|
+
retry: { callTimeout: 30_000 }, // abort any call that takes longer than 30s
|
|
513
|
+
})
|
|
514
|
+
|
|
515
|
+
// Per-request override
|
|
516
|
+
await generateText({
|
|
517
|
+
model,
|
|
518
|
+
prompt: '...',
|
|
519
|
+
providerOptions: {
|
|
520
|
+
rateLimiter: { callTimeout: 10_000 }, // stricter timeout for this request
|
|
521
|
+
},
|
|
522
|
+
})
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
When a `callTimeout` fires, the request throws a `TimeoutError` (native `DOMException` with `name: 'TimeoutError'`). The retry logic treats it as a retryable failure if the status code is in `retryOn`. Set `callTimeout` to `undefined` (the default) to disable it.
|
|
526
|
+
|
|
527
|
+
---
|
|
528
|
+
|
|
456
529
|
## Cost tracking
|
|
457
530
|
|
|
458
531
|
```typescript
|
|
@@ -467,12 +540,18 @@ console.log(report)
|
|
|
467
540
|
// byModel: {
|
|
468
541
|
// 'gpt-4o': { requests: 120, inputTokens: 240_000, outputTokens: 60_000, costUsd: 1.20 },
|
|
469
542
|
// 'gpt-4o-mini': { requests: 198, inputTokens: 380_000, outputTokens: 95_000, costUsd: 0.91 },
|
|
543
|
+
// },
|
|
544
|
+
// byScope: {
|
|
545
|
+
// 'user:alice': { requests: 15, inputTokens: 30_000, outputTokens: 7_500, costUsd: 0.15 },
|
|
546
|
+
// 'user:bob': { requests: 8, inputTokens: 12_000, outputTokens: 3_000, costUsd: 0.06 },
|
|
470
547
|
// }
|
|
471
548
|
// }
|
|
472
549
|
```
|
|
473
550
|
|
|
474
551
|
Costs are based on **actual token counts** from API responses — not estimates. The report uses rolling windows, so `hour` always means "the last 60 minutes."
|
|
475
552
|
|
|
553
|
+
`byScope` is populated automatically when requests carry a `scope` (either set on `limiter.wrap()` or via `providerOptions.rateLimiter.scope`). Unscoped requests don't appear in `byScope`.
|
|
554
|
+
|
|
476
555
|
---
|
|
477
556
|
|
|
478
557
|
## Budget fallback routing
|
|
@@ -501,19 +580,73 @@ const model = limiter.wrap(
|
|
|
501
580
|
const result = await generateText({ model, prompt })
|
|
502
581
|
```
|
|
503
582
|
|
|
583
|
+
**Fallback chains** — pass an array to walk multiple fallbacks in order:
|
|
584
|
+
|
|
585
|
+
```typescript
|
|
586
|
+
const model = limiter.wrap(openai('gpt-4o'), {
|
|
587
|
+
fallback: [
|
|
588
|
+
openai('gpt-4o-mini'), // try first
|
|
589
|
+
openai('gpt-3.5-turbo'), // try second if gpt-4o-mini is also over budget
|
|
590
|
+
],
|
|
591
|
+
})
|
|
592
|
+
```
|
|
593
|
+
|
|
594
|
+
Each model in the chain is tried in order. If all are over budget, `BudgetExceededError` is thrown.
|
|
595
|
+
|
|
504
596
|
**Behavior matrix:**
|
|
505
597
|
|
|
506
598
|
| `onExceeded` | `fallback` configured | Outcome |
|
|
507
599
|
|---|---|---|
|
|
508
600
|
| `'throw'` | any | Throws `BudgetExceededError` |
|
|
509
|
-
| `'fallback'` | yes | Transparently
|
|
601
|
+
| `'fallback'` | yes | Transparently walks the fallback chain |
|
|
510
602
|
| `'fallback'` | no | Throws `BudgetExceededError` |
|
|
511
|
-
| `'queue'` | any |
|
|
603
|
+
| `'queue'` | any | Holds the request until the rolling window clears enough spend; throws `QueueTimeoutError` if `queue.timeout` elapses |
|
|
512
604
|
|
|
513
605
|
Fallback usage is tracked under the fallback model's ID in `getCostReport()`.
|
|
514
606
|
|
|
515
607
|
---
|
|
516
608
|
|
|
609
|
+
## Persistent cost tracking
|
|
610
|
+
|
|
611
|
+
By default, cost history lives in memory and resets on restart. If your process restarts frequently (serverless, rolling deploys), budget caps could be bypassed because the new instance starts with $0 spend.
|
|
612
|
+
|
|
613
|
+
`RedisCostStore` persists every cost entry to a Redis sorted set so budget caps survive restarts:
|
|
614
|
+
|
|
615
|
+
```typescript
|
|
616
|
+
import { createRateLimiter } from 'ai-sdk-rate-limiter'
|
|
617
|
+
import { RedisCostStore } from 'ai-sdk-rate-limiter/redis'
|
|
618
|
+
import Redis from 'ioredis'
|
|
619
|
+
|
|
620
|
+
const redis = new Redis(process.env.REDIS_URL)
|
|
621
|
+
|
|
622
|
+
const limiter = createRateLimiter({
|
|
623
|
+
cost: {
|
|
624
|
+
budget: { daily: 50 },
|
|
625
|
+
onExceeded: 'throw',
|
|
626
|
+
store: new RedisCostStore(redis), // persist entries to Redis
|
|
627
|
+
},
|
|
628
|
+
})
|
|
629
|
+
|
|
630
|
+
// On startup — pre-load the last 30 days of spend so the in-memory
|
|
631
|
+
// window is accurate immediately (before any new requests)
|
|
632
|
+
await limiter.warmUp()
|
|
633
|
+
```
|
|
634
|
+
|
|
635
|
+
**`warmUp()`** — loads the last 30 days of entries from the store into the in-memory cost tracker. Call it once after `createRateLimiter()`. Without it the limiter works correctly for new requests, but budget checks won't account for spend from previous process runs until the first request arrives.
|
|
636
|
+
|
|
637
|
+
`RedisCostStore` options:
|
|
638
|
+
|
|
639
|
+
```typescript
|
|
640
|
+
new RedisCostStore(redis, {
|
|
641
|
+
keyPrefix: 'cost:myapp:', // namespace key (default: 'airl:cost:')
|
|
642
|
+
ttlMs: 30 * 86_400_000, // TTL for the sorted set (default: 30 days)
|
|
643
|
+
})
|
|
644
|
+
```
|
|
645
|
+
|
|
646
|
+
Errors from the cost store (connection failures, etc.) are swallowed silently — cost persistence is best-effort and never blocks request execution.
|
|
647
|
+
|
|
648
|
+
---
|
|
649
|
+
|
|
517
650
|
## Multi-instance Redis store
|
|
518
651
|
|
|
519
652
|
By default, rate limit state is in-memory (per-process). For multi-instance deployments — multiple pods, serverless replicas, workers — each instance has its own counters. Install the Redis store to share state:
|
|
@@ -546,7 +679,9 @@ new RedisStore(redis, {
|
|
|
546
679
|
|
|
547
680
|
**How it works internally:**
|
|
548
681
|
|
|
549
|
-
Each request runs a Lua script atomically that: removes stale entries from a sorted set, counts requests and tokens in the current window, checks against RPM and
|
|
682
|
+
Each request runs a Lua script atomically that: removes stale entries from a sorted set, counts requests and tokens in the current window, checks against RPM, ITPM, OTPM, and RPD limits, and either reserves the slot or returns when the next slot opens. The local queue (priority ordering, drain timer, timeout handling) stays in-memory per instance — only the window counters are shared via Redis.
|
|
683
|
+
|
|
684
|
+
**Failover** — If Redis is unreachable (connection error, timeout), the store fails open: rate limit enforcement is suspended for that call and the request proceeds normally. Enforcement resumes as soon as the store recovers. This means AI calls never block on Redis availability — you trade enforcement precision for reliability during outages.
|
|
550
685
|
|
|
551
686
|
**Compatible clients** — any client with `eval()`, `get()`, and `set()` works: `ioredis`, `node-redis`, Upstash Redis.
|
|
552
687
|
|
|
@@ -554,6 +689,175 @@ Use the default `InMemoryStore` for single-instance deployments — it's more ac
|
|
|
554
689
|
|
|
555
690
|
---
|
|
556
691
|
|
|
692
|
+
## Circuit breaker
|
|
693
|
+
|
|
694
|
+
The circuit breaker protects against cascading failures when an upstream AI API is degrading. After N consecutive 5xx failures, the circuit opens and subsequent requests fail immediately with `CircuitOpenError` rather than piling up and timing out.
|
|
695
|
+
|
|
696
|
+
```typescript
|
|
697
|
+
import { createRateLimiter, CircuitOpenError } from 'ai-sdk-rate-limiter'
|
|
698
|
+
|
|
699
|
+
const limiter = createRateLimiter({
|
|
700
|
+
circuit: {
|
|
701
|
+
failureThreshold: 5, // open after 5 consecutive failures
|
|
702
|
+
cooldownMs: 30_000, // stay open for 30s, then probe
|
|
703
|
+
tripOn: [500, 502, 503, 504], // which HTTP status codes trip the circuit
|
|
704
|
+
},
|
|
705
|
+
on: {
|
|
706
|
+
circuitOpen: ({ model, openUntilMs }) =>
|
|
707
|
+
console.error(`Circuit open for ${model} until ${new Date(openUntilMs).toISOString()}`),
|
|
708
|
+
circuitClosed: ({ model }) =>
|
|
709
|
+
console.log(`Circuit closed for ${model} — upstream recovered`),
|
|
710
|
+
},
|
|
711
|
+
})
|
|
712
|
+
|
|
713
|
+
try {
|
|
714
|
+
const result = await generateText({ model, prompt })
|
|
715
|
+
} catch (err) {
|
|
716
|
+
if (err instanceof CircuitOpenError) {
|
|
717
|
+
// Fail fast — upstream is degraded, don't pile on
|
|
718
|
+
return res.status(503).json({
|
|
719
|
+
error: 'AI service temporarily unavailable',
|
|
720
|
+
retryAfter: Math.ceil((err.openUntilMs - Date.now()) / 1000),
|
|
721
|
+
})
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
```
|
|
725
|
+
|
|
726
|
+
**State machine:**
|
|
727
|
+
|
|
728
|
+
- `CLOSED` (normal) — requests pass through; failures are counted
|
|
729
|
+
- `OPEN` — requests fail immediately with `CircuitOpenError`; after `cooldownMs`, transitions to HALF_OPEN
|
|
730
|
+
- `HALF_OPEN` — one probe request is allowed; success → CLOSED, failure → OPEN (resets cooldown)
|
|
731
|
+
|
|
732
|
+
The circuit is per-model. A failing `gpt-4o` doesn't affect `gpt-4o-mini`. 429 rate-limit errors do **not** trip the circuit — only 5xx errors (or whatever you configure in `tripOn`) count as failures.
|
|
733
|
+
|
|
734
|
+
---
|
|
735
|
+
|
|
736
|
+
## Graceful shutdown
|
|
737
|
+
|
|
738
|
+
```typescript
|
|
739
|
+
// On SIGTERM / process exit
|
|
740
|
+
process.on('SIGTERM', async () => {
|
|
741
|
+
// Stop accepting new requests, wait up to 30s for in-flight ones to finish
|
|
742
|
+
await limiter.shutdown({ drainMs: 30_000 })
|
|
743
|
+
process.exit(0)
|
|
744
|
+
})
|
|
745
|
+
```
|
|
746
|
+
|
|
747
|
+
After `shutdown()` is called:
|
|
748
|
+
- New requests throw `ShutdownError` immediately
|
|
749
|
+
- In-flight requests complete normally (up to `drainMs`)
|
|
750
|
+
- The returned promise resolves when the queue drains or `drainMs` elapses
|
|
751
|
+
|
|
752
|
+
```typescript
|
|
753
|
+
import { ShutdownError } from 'ai-sdk-rate-limiter'
|
|
754
|
+
|
|
755
|
+
try {
|
|
756
|
+
const result = await generateText({ model, prompt })
|
|
757
|
+
} catch (err) {
|
|
758
|
+
if (err instanceof ShutdownError) {
|
|
759
|
+
// Process is shutting down — expected, not an error
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
```
|
|
763
|
+
|
|
764
|
+
---
|
|
765
|
+
|
|
766
|
+
## Prometheus metrics
|
|
767
|
+
|
|
768
|
+
```
|
|
769
|
+
npm install ai-sdk-rate-limiter
|
|
770
|
+
```
|
|
771
|
+
|
|
772
|
+
The `ai-sdk-rate-limiter/prometheus` entry point provides in-process Prometheus metrics with no external dependencies. Metrics are accumulated in memory and rendered to the text exposition format on demand.
|
|
773
|
+
|
|
774
|
+
```typescript
|
|
775
|
+
import { createRateLimiter } from 'ai-sdk-rate-limiter'
|
|
776
|
+
import { createPrometheusPlugin } from 'ai-sdk-rate-limiter/prometheus'
|
|
777
|
+
|
|
778
|
+
const prometheus = createPrometheusPlugin()
|
|
779
|
+
|
|
780
|
+
const limiter = createRateLimiter({
|
|
781
|
+
on: prometheus,
|
|
782
|
+
})
|
|
783
|
+
|
|
784
|
+
// Expose /metrics endpoint (Express example)
|
|
785
|
+
app.get('/metrics', (req, res) => {
|
|
786
|
+
res.set('Content-Type', 'text/plain; version=0.0.4')
|
|
787
|
+
res.send(prometheus.collect())
|
|
788
|
+
})
|
|
789
|
+
```
|
|
790
|
+
|
|
791
|
+
**Metrics exported:**
|
|
792
|
+
|
|
793
|
+
| Metric | Type | Description |
|
|
794
|
+
|---|---|---|
|
|
795
|
+
| `ai_requests_total` | counter | Total requests, labelled by `model`, `provider`, `status` |
|
|
796
|
+
| `ai_tokens_input_total` | counter | Total input tokens, labelled by `model`, `provider` |
|
|
797
|
+
| `ai_tokens_output_total` | counter | Total output tokens, labelled by `model`, `provider` |
|
|
798
|
+
| `ai_cost_usd_total` | counter | Total cost in USD, labelled by `model`, `provider` |
|
|
799
|
+
| `ai_request_duration_ms` | summary | Request latency (p50, p90, p99), labelled by `model` |
|
|
800
|
+
| `ai_retries_total` | counter | Total retry attempts, labelled by `model` |
|
|
801
|
+
| `ai_rate_limited_total` | counter | Rate-limit hits, labelled by `model`, `source` |
|
|
802
|
+
| `ai_budget_exceeded_total` | counter | Budget exceeded events, labelled by `model`, `period` |
|
|
803
|
+
| `ai_queue_depth` | gauge | Current queue depth, labelled by `model` |
|
|
804
|
+
|
|
805
|
+
```typescript
|
|
806
|
+
// Custom metric prefix
|
|
807
|
+
const prometheus = createPrometheusPlugin({ prefix: 'myapp_' })
|
|
808
|
+
// → myapp_requests_total, myapp_tokens_input_total, ...
|
|
809
|
+
|
|
810
|
+
// Reset counters (e.g. for tests)
|
|
811
|
+
prometheus.reset()
|
|
812
|
+
```
|
|
813
|
+
|
|
814
|
+
---
|
|
815
|
+
|
|
816
|
+
## StatsD metrics
|
|
817
|
+
|
|
818
|
+
```typescript
|
|
819
|
+
import { createRateLimiter } from 'ai-sdk-rate-limiter'
|
|
820
|
+
import { createStatsDPlugin } from 'ai-sdk-rate-limiter/statsd'
|
|
821
|
+
import StatsD from 'hot-shots' // or node-statsd, dogstatsd-client, etc.
|
|
822
|
+
|
|
823
|
+
const statsd = new StatsD({ host: 'localhost', port: 8125 })
|
|
824
|
+
|
|
825
|
+
const limiter = createRateLimiter({
|
|
826
|
+
on: createStatsDPlugin(statsd, {
|
|
827
|
+
prefix: 'myapp.ai.', // default: 'ai.'
|
|
828
|
+
globalTags: ['env:prod'], // appended to every metric
|
|
829
|
+
}),
|
|
830
|
+
})
|
|
831
|
+
```
|
|
832
|
+
|
|
833
|
+
Any client that implements the `StatsDClient` interface works — `hot-shots`, `node-statsd`, `datadog-metrics`, or a custom implementation:
|
|
834
|
+
|
|
835
|
+
```typescript
|
|
836
|
+
import type { StatsDClient } from 'ai-sdk-rate-limiter/statsd'
|
|
837
|
+
|
|
838
|
+
const client: StatsDClient = {
|
|
839
|
+
increment(metric, value, tags) { /* ... */ },
|
|
840
|
+
gauge(metric, value, tags) { /* ... */ },
|
|
841
|
+
timing(metric, value, tags) { /* ... */ },
|
|
842
|
+
}
|
|
843
|
+
```
|
|
844
|
+
|
|
845
|
+
**Metrics emitted** (same set as Prometheus, DogStatsD tag format `['key:value']`):
|
|
846
|
+
|
|
847
|
+
| Metric | Type | Tags |
|
|
848
|
+
|---|---|---|
|
|
849
|
+
| `ai.requests` | increment | `model:*`, `provider:*`, `status:completed\|dropped` |
|
|
850
|
+
| `ai.tokens.input` | increment | `model:*`, `provider:*` |
|
|
851
|
+
| `ai.tokens.output` | increment | `model:*`, `provider:*` |
|
|
852
|
+
| `ai.cost_usd` | timing (gauge) | `model:*`, `provider:*` |
|
|
853
|
+
| `ai.latency_ms` | timing | `model:*` |
|
|
854
|
+
| `ai.retries` | increment | `model:*` |
|
|
855
|
+
| `ai.rate_limited` | increment | `model:*`, `source:local\|remote` |
|
|
856
|
+
| `ai.budget_exceeded` | increment | `model:*`, `period:hourly\|daily\|monthly` |
|
|
857
|
+
| `ai.queue_depth` | gauge | `model:*` |
|
|
858
|
+
|
|
859
|
+
---
|
|
860
|
+
|
|
557
861
|
## Events
|
|
558
862
|
|
|
559
863
|
All events are typed. Register handlers at creation time or dynamically:
|
|
@@ -579,8 +883,20 @@ limiter.off('queued', handler)
|
|
|
579
883
|
| `retrying` | A failed request is about to retry | `model`, `provider`, `attempt`, `maxAttempts`, `delayMs`, `error` |
|
|
580
884
|
| `rateLimited` | Limit hit (local or remote 429) | `model`, `provider`, `source`, `limitType`, `resetAt` |
|
|
581
885
|
| `budgetHit` | Cost budget exceeded | `model`, `provider`, `currentCostUsd`, `limitUsd`, `period`, `usingFallback` |
|
|
582
|
-
| `dropped` | Request rejected
|
|
583
|
-
| `completed` | Request finished successfully | `model`, `provider`, `inputTokens`, `outputTokens`, `costUsd`, `latencyMs`, `streaming` |
|
|
886
|
+
| `dropped` | Request rejected | `model`, `provider`, `reason`, `waitedMs?`, `queueDepth?`, `scope?`, `metadata?` |
|
|
887
|
+
| `completed` | Request finished successfully | `model`, `provider`, `inputTokens`, `outputTokens`, `costUsd`, `latencyMs`, `streaming`, `scope?` |
|
|
888
|
+
| `circuitOpen` | Circuit breaker opened | `model`, `provider`, `openUntilMs`, `failureCount` |
|
|
889
|
+
| `circuitClosed` | Circuit breaker closed (upstream recovered) | `model`, `provider` |
|
|
890
|
+
| `limitsDetected` | Limits auto-updated from response headers | `model`, `provider`, `detectedLimits` |
|
|
891
|
+
|
|
892
|
+
**`dropped` reason values:**
|
|
893
|
+
|
|
894
|
+
| Reason | Cause |
|
|
895
|
+
|---|---|
|
|
896
|
+
| `'queue-full'` | Queue at `maxSize` capacity |
|
|
897
|
+
| `'queue-timeout'` | Request waited longer than `queue.timeout` |
|
|
898
|
+
| `'circuit-open'` | Circuit breaker is open |
|
|
899
|
+
| `'shutdown'` | Limiter is shutting down |
|
|
584
900
|
|
|
585
901
|
The `source` on `rateLimited` distinguishes between requests we blocked locally (`'local'`) vs. requests the API rejected with a 429 (`'remote'`). Local blocks are expected and free. Frequent remote blocks mean your configured limits are too high for your tier — run `npx ai-sdk-rate-limiter audit` to get accurate numbers.
|
|
586
902
|
|
|
@@ -618,6 +934,8 @@ import {
|
|
|
618
934
|
QueueFullError,
|
|
619
935
|
BudgetExceededError,
|
|
620
936
|
RetryExhaustedError,
|
|
937
|
+
CircuitOpenError,
|
|
938
|
+
ShutdownError,
|
|
621
939
|
RateLimiterError,
|
|
622
940
|
} from 'ai-sdk-rate-limiter'
|
|
623
941
|
|
|
@@ -630,6 +948,13 @@ try {
|
|
|
630
948
|
} else if (error instanceof BudgetExceededError) {
|
|
631
949
|
// Cost budget hit and onExceeded is 'throw' or no fallback configured
|
|
632
950
|
console.error(`Budget exceeded: $${error.currentCostUsd} of $${error.limitUsd} ${error.period}`)
|
|
951
|
+
} else if (error instanceof CircuitOpenError) {
|
|
952
|
+
// Circuit breaker is open — upstream is degraded
|
|
953
|
+
const retryAfterSec = Math.ceil((error.openUntilMs - Date.now()) / 1000)
|
|
954
|
+
res.status(503).json({ error: 'AI service temporarily unavailable', retryAfter: retryAfterSec })
|
|
955
|
+
} else if (error instanceof ShutdownError) {
|
|
956
|
+
// Limiter is shutting down — process is exiting
|
|
957
|
+
console.log('Limiter shutting down, request rejected')
|
|
633
958
|
} else if (error instanceof RetryExhaustedError) {
|
|
634
959
|
// All retry attempts failed
|
|
635
960
|
console.error(`All ${error.attempts} retries exhausted`, error.cause)
|
|
@@ -654,6 +979,8 @@ All errors extend `RateLimiterError`, so a single `instanceof RateLimiterError`
|
|
|
654
979
|
|---|---|
|
|
655
980
|
| `QueueTimeoutError` | `model`, `waitedMs`, `queueDepth` |
|
|
656
981
|
| `BudgetExceededError` | `model`, `currentCostUsd`, `limitUsd`, `period` |
|
|
982
|
+
| `CircuitOpenError` | `model`, `openUntilMs` |
|
|
983
|
+
| `ShutdownError` | — |
|
|
657
984
|
| `RetryExhaustedError` | `model`, `attempts`, `cause` |
|
|
658
985
|
| `QueueFullError` | `model`, `maxSize` |
|
|
659
986
|
| `RateLimitExceededError` | `model`, `limitType`, `limit`, `resetAt` |
|
|
@@ -999,9 +1326,11 @@ Implement `RateLimitStore` to use any backend (DynamoDB, Postgres, etc.):
|
|
|
999
1326
|
import type { RateLimitStore } from 'ai-sdk-rate-limiter'
|
|
1000
1327
|
|
|
1001
1328
|
class MyStore implements RateLimitStore {
|
|
1002
|
-
async
|
|
1003
|
-
async
|
|
1329
|
+
async checkAndRecord(key, estimatedInputTokens, limits) { /* ... */ }
|
|
1330
|
+
async reconcile(key, actualInputTokens, actualOutputTokens) { /* ... */ }
|
|
1331
|
+
async setBackoff(key, untilMs) { /* ... */ }
|
|
1004
1332
|
async getBackoff(key) { /* ... */ }
|
|
1333
|
+
async nextSlotMs(key, limits, estimatedInputTokens) { /* ... */ }
|
|
1005
1334
|
}
|
|
1006
1335
|
|
|
1007
1336
|
const limiter = createRateLimiter({ store: new MyStore() })
|
|
@@ -1011,7 +1340,7 @@ const limiter = createRateLimiter({ store: new MyStore() })
|
|
|
1011
1340
|
|
|
1012
1341
|
## How it works
|
|
1013
1342
|
|
|
1014
|
-
**Rate limiting** — Sliding window counter per model. Each model tracks a list of `{timestamp, tokens}` entries for the past 60 seconds. On every request, stale entries are evicted and the window is checked against RPM and
|
|
1343
|
+
**Rate limiting** — Sliding window counter per model. Each model tracks a list of `{timestamp, tokens}` entries for the past 60 seconds. On every request, stale entries are evicted and the window is checked against RPM, ITPM, and OTPM limits simultaneously. RPD uses a separate 24-hour rolling window. OTPM is based on actual output token counts from completed requests.
|
|
1015
1344
|
|
|
1016
1345
|
**Queue** — A sorted priority queue per model, ordered by `priority` then enqueue time (FIFO within same priority). A drain timer fires when the oldest window entry expires, processing as many waiters as possible before rescheduling.
|
|
1017
1346
|
|
|
@@ -1023,7 +1352,7 @@ const limiter = createRateLimiter({ store: new MyStore() })
|
|
|
1023
1352
|
|
|
1024
1353
|
**Retry-After propagation** — When a remote 429 arrives with a `Retry-After` header, the backoff is applied to the entire model key in the engine, not just the failing request. All requests queued behind it pause until the backoff clears. This prevents the common thundering-herd failure where you retry one request while 10 others immediately follow and all get 429s.
|
|
1025
1354
|
|
|
1026
|
-
**Token estimation** — Before a request fires, tokens are estimated from the prompt text (~4 chars/token) and reserved in the window. After the response, actual usage from the API replaces the estimate. For streaming, actual counts come from the `finish` chunk (Vercel AI SDK) or the final usage chunk (raw proxy).
|
|
1355
|
+
**Token estimation** — Before a request fires, tokens are estimated from the prompt text (~4 chars/token) and reserved in the window. After the response, actual usage from the API replaces the estimate. For streaming, actual counts come from the `finish` chunk (Vercel AI SDK) or the final usage chunk (raw proxy). If a stream ends without a usage chunk (some error paths), the window is updated with zeros rather than leaving the estimate in place.
|
|
1027
1356
|
|
|
1028
1357
|
**Zero dependencies** — The Vercel AI SDK middleware interface is implemented structurally — `@ai-sdk/provider` types are used for type checking only and not required at runtime. No `ioredis`, no `bottleneck`, no tokenizer libraries in the core.
|
|
1029
1358
|
|
|
@@ -1040,10 +1369,19 @@ const limiter = createRateLimiter({ store: new MyStore() })
|
|
|
1040
1369
|
| Priority queue | yes | yes | no | no | no |
|
|
1041
1370
|
| Concurrency limits | yes | yes | yes | no | no |
|
|
1042
1371
|
| Cost tracking + budgets | yes | no | no | no | no |
|
|
1372
|
+
| Persistent cost store | yes | no | no | no | no |
|
|
1373
|
+
| Per-scope cost attribution | yes | no | no | no | no |
|
|
1374
|
+
| Budget fallback chains | yes | no | no | no | no |
|
|
1375
|
+
| Circuit breaker | yes | no | no | no | no |
|
|
1376
|
+
| Graceful shutdown | yes | no | no | no | no |
|
|
1377
|
+
| Auto-detected limits | yes | no | no | no | no |
|
|
1043
1378
|
| Multi-tenant scoped limits | yes | no | no | no | no |
|
|
1044
1379
|
| AbortSignal propagation | yes | no | no | no | no |
|
|
1380
|
+
| Call timeout | yes | no | no | no | no |
|
|
1045
1381
|
| Retry-After header | yes | no | no | partial | partial |
|
|
1046
1382
|
| Backoff propagation | yes | no | no | no | no |
|
|
1383
|
+
| Prometheus metrics | yes | no | no | no | no |
|
|
1384
|
+
| StatsD metrics | yes | no | no | no | no |
|
|
1047
1385
|
| OpenTelemetry | yes | no | no | no | partial |
|
|
1048
1386
|
| Testing utilities | yes | no | no | no | no |
|
|
1049
1387
|
| CLI audit | yes | no | no | no | no |
|
|
@@ -1071,9 +1409,22 @@ import type {
|
|
|
1071
1409
|
EventMap,
|
|
1072
1410
|
QueuedEvent,
|
|
1073
1411
|
Priority,
|
|
1412
|
+
CircuitBreakerConfig,
|
|
1413
|
+
CircuitOpenEvent,
|
|
1414
|
+
CircuitClosedEvent,
|
|
1415
|
+
LimitsDetectedEvent,
|
|
1416
|
+
DroppedEvent,
|
|
1417
|
+
CompletedEvent,
|
|
1074
1418
|
} from 'ai-sdk-rate-limiter'
|
|
1075
1419
|
|
|
1076
1420
|
import type { CallRecord } from 'ai-sdk-rate-limiter/testing'
|
|
1421
|
+
|
|
1422
|
+
import type {
|
|
1423
|
+
CostStore,
|
|
1424
|
+
PersistedCostEntry,
|
|
1425
|
+
} from 'ai-sdk-rate-limiter/redis'
|
|
1426
|
+
|
|
1427
|
+
import type { StatsDClient } from 'ai-sdk-rate-limiter/statsd'
|
|
1077
1428
|
```
|
|
1078
1429
|
|
|
1079
1430
|
---
|