ai-sdk-rate-limiter 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +195 -0
- package/LICENSE +21 -0
- package/README.md +76 -0
- package/dist/cli.js +1 -1
- package/dist/index.cjs +180 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +38 -3
- package/dist/index.d.ts +38 -3
- package/dist/index.js +180 -12
- package/dist/index.js.map +1 -1
- package/dist/middleware.d.cts +1 -1
- package/dist/middleware.d.ts +1 -1
- package/dist/otel.d.cts +1 -1
- package/dist/otel.d.ts +1 -1
- package/dist/prometheus.d.cts +1 -1
- package/dist/prometheus.d.ts +1 -1
- package/dist/redis.d.cts +1 -1
- package/dist/redis.d.ts +1 -1
- package/dist/statsd.d.cts +1 -1
- package/dist/statsd.d.ts +1 -1
- package/dist/testing.cjs +150 -11
- package/dist/testing.cjs.map +1 -1
- package/dist/testing.d.cts +1 -1
- package/dist/testing.d.ts +1 -1
- package/dist/testing.js +150 -11
- package/dist/testing.js.map +1 -1
- package/dist/{types-CMevWGWK.d.cts → types-DtVjHfbF.d.cts} +54 -1
- package/dist/{types-CMevWGWK.d.ts → types-DtVjHfbF.d.ts} +54 -1
- package/package.json +27 -6
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## [Unreleased]
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## [0.13.0] - 2026-03-13
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
- **`limiter.reset()`** — clear all rate-limit, queue, cost, and circuit-breaker state without recreating the instance. Queued requests are rejected with `ShutdownError`. Primarily useful in tests to reset between cases with a shared limiter instance.
|
|
18
|
+
- **`queue.onFull: 'drop-low'`** fully implemented — when the queue is at capacity and a `high` or `normal` priority request arrives, the tail `low`-priority waiter is evicted (rejected with `QueueFullError`) to make room. Useful for mixed workloads where background batch jobs should never block user-facing requests.
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
- Unknown models with zero pricing now emit a one-time `console.warn` on first use, pointing at the exact `config.limits` fix needed to enable cost tracking. Known registry models and models with user-supplied pricing are silent.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## [0.12.0] - 2026-03-12
|
|
26
|
+
|
|
27
|
+
### Added
|
|
28
|
+
- **`getCostForecast()`** — project end-of-period spend based on the current hourly rate. Returns `{ hour, day, month }` each with `spentUsd`, `projectedUsd`, and `ratePerHourUsd`. Useful for alerting before a budget cap is hit.
|
|
29
|
+
- **`createModelPool(models, options?)`** — round-robin (or random) load balancer across multiple wrapped model instances. Distributes calls evenly across API keys or model variants. Import from `ai-sdk-rate-limiter`.
|
|
30
|
+
- **Request deduplication** — pass `dedupKey` in `providerOptions.rateLimiter` to make concurrent identical requests share a single API call. All callers receive the same result; the dedup entry is cleared on completion so the next request always makes a fresh call.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## [0.11.0] - 2026-03-12
|
|
35
|
+
|
|
36
|
+
### Added
|
|
37
|
+
- **Debug mode** — set `debug: true` on `createRateLimiter()` to enable structured console logging for every rate-limit decision, queue entry/exit, slot acquisition, circuit breaker state change, and cost recording. Zero overhead when disabled.
|
|
38
|
+
- **Config validation** — `createRateLimiter()` now validates your config at construction time and emits `console.warn` for common misconfigurations:
|
|
39
|
+
- `cost.store` is set (reminder to call `warmUp()` on startup)
|
|
40
|
+
- `circuit.failureThreshold < 3` (too sensitive, risks false trips)
|
|
41
|
+
- `retry.retryOn` explicitly excludes 429 (defeats rate-limit retry)
|
|
42
|
+
- `queue.timeout < 3000ms` (too short, requests will time out before being served)
|
|
43
|
+
- `cost.budget` set without `onExceeded` (uses silent default `'throw'`)
|
|
44
|
+
- `cost.onExceeded: 'fallback'` reminder to configure fallback model
|
|
45
|
+
- GitHub Actions CI workflow (Node 18 / 20 / 22 matrix)
|
|
46
|
+
- `CHANGELOG.md` with retroactive entries from v0.1.0
|
|
47
|
+
|
|
48
|
+
### Fixed
|
|
49
|
+
- `DebugLogger` details serialization: empty objects no longer emit trailing `()`
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## [0.10.0] - 2026-03-12
|
|
54
|
+
|
|
55
|
+
### Added
|
|
56
|
+
- `ai-sdk-rate-limiter/middleware` entry point
|
|
57
|
+
- `createRateLimiterMiddleware(limiter, opts)` — returns `{ middleware, errorHandler }` for Express
|
|
58
|
+
- `createRateLimiterErrorHandler(opts)` — standalone 4-arg Express error handler
|
|
59
|
+
- `createHonoMiddleware(limiter, opts)` — Hono middleware with `c.var.rateLimiter`
|
|
60
|
+
- `mapErrorToResponse(err)` — utility for Fastify and custom frameworks
|
|
61
|
+
- `RateLimiterRequestContext` type + `http.IncomingMessage` augmentation for `req.rateLimiter`
|
|
62
|
+
- Automatic error → HTTP mapping: `QueueTimeoutError` → 503, `BudgetExceededError` → 402, etc.
|
|
63
|
+
- `injectHeaders` option: adds `X-RateLimit-*` informational headers to responses
|
|
64
|
+
|
|
65
|
+
### Added (examples)
|
|
66
|
+
- `examples/multi-tenant-express/` — Express API with per-user scoped rate limits (free/pro tiers)
|
|
67
|
+
- `examples/batch-processing/` — Concurrent batch jobs with priority queuing + graceful shutdown
|
|
68
|
+
- `examples/budget-alerts/` — Slack/webhook budget alerts with per-scope spend breakdown
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## [0.9.0] - 2026-03-12
|
|
73
|
+
|
|
74
|
+
### Added
|
|
75
|
+
- **Circuit breaker** — auto-opens on repeated 5xx failures, half-open probe, configurable thresholds
|
|
76
|
+
- `CircuitBreakerConfig` in `RateLimiterConfig.circuit`
|
|
77
|
+
- `CircuitOpenError` thrown when circuit is open
|
|
78
|
+
- `circuitOpen` / `circuitClosed` events
|
|
79
|
+
- **Graceful shutdown** — `limiter.shutdown({ drainMs })`, `ShutdownError`
|
|
80
|
+
- **Persistent cost tracking** — `CostStore` interface + `RedisCostStore` in `ai-sdk-rate-limiter/redis`
|
|
81
|
+
- `limiter.warmUp()` pre-loads historical spend on startup
|
|
82
|
+
- **Per-scope cost attribution** — `getCostReport().byScope` breakdown per user/org/tenant
|
|
83
|
+
- **Fallback chains** — `fallback` accepts `WrappableModel[]`, walked in order on `BudgetExceededError`
|
|
84
|
+
- **Call timeout** — `callTimeout` in retry config and per-request options (uses `Promise.race`)
|
|
85
|
+
- **Auto-detected limits** — parses `x-ratelimit-limit-*` response headers, user config wins
|
|
86
|
+
- `limitsDetected` event
|
|
87
|
+
- **Prometheus metrics** — `createPrometheusPlugin()` in `ai-sdk-rate-limiter/prometheus`
|
|
88
|
+
- **StatsD / DogStatsD** — `createStatsDPlugin(client)` in `ai-sdk-rate-limiter/statsd`
|
|
89
|
+
- **Drop hooks** — `DroppedEvent` now includes `reason`, `waitedMs`, `queueDepth`, `scope`, `metadata`
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## [0.8.0] - 2026-03-11
|
|
94
|
+
|
|
95
|
+
### Added
|
|
96
|
+
- Redis store for multi-instance rate limiting (`ai-sdk-rate-limiter/redis`)
|
|
97
|
+
- `RedisStore` — Lua-script-based atomic sliding window shared across instances
|
|
98
|
+
- Fail-open on Redis errors (enforcement suspended, never blocks requests)
|
|
99
|
+
- Compatible with ioredis, node-redis, Upstash Redis
|
|
100
|
+
- `rpd` (requests per day) limit support — rolling 24-hour window
|
|
101
|
+
- `otpm` (output tokens per minute) limit support — based on completed request actuals
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## [0.7.1] - 2026-03-10
|
|
106
|
+
|
|
107
|
+
### Fixed
|
|
108
|
+
- `Retry-After` header parsing: correctly handles duration strings like `"6m30s"` (previously parsed as 6s)
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## [0.7.0] - 2026-03-10
|
|
113
|
+
|
|
114
|
+
### Added
|
|
115
|
+
- Raw SDK proxy — `limiter.rawProxy(client)` wraps native OpenAI/Anthropic/Groq/Mistral/Cohere clients
|
|
116
|
+
- Transparent `Proxy`-based drop-in with no API changes
|
|
117
|
+
- Streaming support via `AsyncIterable` wrapping for usage chunk capture
|
|
118
|
+
- `rateLimited(client, opts)` standalone factory
|
|
119
|
+
- Budget fallback routing — `onExceeded: 'fallback'` transparently reroutes to a cheaper model
|
|
120
|
+
- `limiter.wrap(model, { fallback: cheaperModel })`
|
|
121
|
+
- `usingFallback` field on `budgetHit` event
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## [0.6.0] - 2026-03-09
|
|
126
|
+
|
|
127
|
+
### Added
|
|
128
|
+
- OpenTelemetry plugin (`ai-sdk-rate-limiter/otel`)
|
|
129
|
+
- `createOtelPlugin(tracer)` — emits GenAI-spec spans for every request
|
|
130
|
+
- No hard dependency on `@opentelemetry/api` (structural typing)
|
|
131
|
+
- Span duration reconstructed from `latencyMs` for accurate wall-clock timing
|
|
132
|
+
- Testing utilities (`ai-sdk-rate-limiter/testing`)
|
|
133
|
+
- `createTestLimiter()` — records all completed calls for assertions
|
|
134
|
+
- `limiter.getCalls()` / `limiter.reset()`
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## [0.5.0] - 2026-03-08
|
|
139
|
+
|
|
140
|
+
### Added
|
|
141
|
+
- Concurrency limits — `maxConcurrent` per model, enforced as a semaphore
|
|
142
|
+
- Multi-tenant scoped limits — `config.scopes` with `*` wildcard patterns
|
|
143
|
+
- Each scope gets its own independent sliding window
|
|
144
|
+
- Per-request scope via `providerOptions.rateLimiter.scope`
|
|
145
|
+
- `queue.onFull: 'drop-low'` — evict lowest-priority requests before throwing `QueueFullError`
|
|
146
|
+
- `AbortSignal` propagation through both rate-limit and concurrency queues
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## [0.4.0] - 2026-03-07
|
|
151
|
+
|
|
152
|
+
### Added
|
|
153
|
+
- Priority queue — `high` / `normal` / `low` priorities; FIFO within same priority
|
|
154
|
+
- Per-request options via `providerOptions.rateLimiter` (priority, timeout, scope)
|
|
155
|
+
- `limiter.estimatedWait(modelId)` — returns ms until next available slot
|
|
156
|
+
- `QueueFullError` when queue is at `maxSize` capacity
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## [0.3.0] - 2026-03-06
|
|
161
|
+
|
|
162
|
+
### Added
|
|
163
|
+
- Cost tracking — records actual token usage per request
|
|
164
|
+
- `getCostReport()` with hourly / daily / monthly rolling windows
|
|
165
|
+
- `byModel` breakdown in cost report
|
|
166
|
+
- Budget caps — `cost.budget` with `hourly`, `daily`, `monthly` limits
|
|
167
|
+
- `onExceeded: 'throw' | 'queue'` behavior
|
|
168
|
+
- `BudgetExceededError` with period, current spend, and limit
|
|
169
|
+
- `nextBudgetClearMs()` used internally for queue-mode budget holds
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## [0.2.0] - 2026-03-05
|
|
174
|
+
|
|
175
|
+
### Added
|
|
176
|
+
- Model registry expanded to include Groq, Mistral, and Cohere
|
|
177
|
+
- `GROQ_MODELS`, `MISTRAL_MODELS`, `COHERE_MODELS`
|
|
178
|
+
- `isKnownModel(modelId, provider)` utility
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## [0.1.0] - 2026-03-04
|
|
183
|
+
|
|
184
|
+
### Added
|
|
185
|
+
- Initial release
|
|
186
|
+
- Sliding window rate limiting (RPM + ITPM) for OpenAI, Anthropic, Google Gemini models
|
|
187
|
+
- Priority queue with drain timer (`scheduleDrain` per model)
|
|
188
|
+
- Exponential backoff retry with jitter, `Retry-After` header support
|
|
189
|
+
- Vercel AI SDK `.wrap()` adapter via `LanguageModelV4Middleware`
|
|
190
|
+
- In-memory store (default, zero config)
|
|
191
|
+
- `createRateLimiter()` factory
|
|
192
|
+
- `limiter.getStatus()` — queue depths and window state per model
|
|
193
|
+
- Event system: `queued`, `dequeued`, `retrying`, `rateLimited`, `budgetHit`, `dropped`, `completed`
|
|
194
|
+
- `RateLimiterError` hierarchy: `RateLimitExceededError`, `QueueTimeoutError`, `RetryExhaustedError`
|
|
195
|
+
- Built-in model registry for OpenAI and Anthropic with pricing data
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 ai-sdk-rate-limiter contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -4,12 +4,16 @@ Smart rate limiting, queuing, and cost tracking for AI API calls. Works across p
|
|
|
4
4
|
|
|
5
5
|
[npm package](https://www.npmjs.com/package/ai-sdk-rate-limiter)
|
|
6
6
|
[CI status](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml)
|
|
7
|
+
[Code coverage](https://codecov.io/gh/piyushgupta344/ai-sdk-rate-limiter)
|
|
7
8
|
[npm package](https://www.npmjs.com/package/ai-sdk-rate-limiter)
|
|
9
|
+
[Documentation](https://piyushgupta344.github.io/ai-sdk-rate-limiter/)
|
|
8
10
|
|
|
9
11
|
```
|
|
10
12
|
npm install ai-sdk-rate-limiter
|
|
11
13
|
```
|
|
12
14
|
|
|
15
|
+
**[Documentation →](https://piyushgupta344.github.io/ai-sdk-rate-limiter/)**
|
|
16
|
+
|
|
13
17
|
---
|
|
14
18
|
|
|
15
19
|
## The problem
|
|
@@ -557,6 +561,27 @@ Costs are based on **actual token counts** from API responses — not estimates.
|
|
|
557
561
|
|
|
558
562
|
`byScope` is populated automatically when requests carry a `scope` (either set on `limiter.wrap()` or via `providerOptions.rateLimiter.scope`). Unscoped requests don't appear in `byScope`.
|
|
559
563
|
|
|
564
|
+
### Cost forecasting
|
|
565
|
+
|
|
566
|
+
`getCostForecast()` projects your end-of-period spend based on the current hourly rate. Useful for alerting before a budget cap is hit:
|
|
567
|
+
|
|
568
|
+
```typescript
|
|
569
|
+
const forecast = limiter.getCostForecast()
|
|
570
|
+
|
|
571
|
+
console.log(forecast)
|
|
572
|
+
// {
|
|
573
|
+
// hour: { spentUsd: 1.20, projectedUsd: 1.20, ratePerHourUsd: 1.20 },
|
|
574
|
+
// day: { spentUsd: 3.50, projectedUsd: 28.80, ratePerHourUsd: 1.20 },
|
|
575
|
+
// month: { spentUsd: 8.10, projectedUsd: 864, ratePerHourUsd: 1.20 },
|
|
576
|
+
// }
|
|
577
|
+
|
|
578
|
+
if (forecast.day.projectedUsd > 40) {
|
|
579
|
+
console.warn(`Heads up — on track to spend $${forecast.day.projectedUsd.toFixed(2)} today`)
|
|
580
|
+
}
|
|
581
|
+
```
|
|
582
|
+
|
|
583
|
+
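`projectedUsd` = current hourly rate × hours in the period (1 for `hour`, 24 for `day`, 720 for `month` — a month is treated as 30 days). The rate is the total spend over the **last 60 minutes**, so the projection responds quickly to usage spikes; it is a run-rate figure and is computed independently of `spentUsd`.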
|
|
584
|
+
|
|
560
585
|
---
|
|
561
586
|
|
|
562
587
|
## Budget fallback routing
|
|
@@ -1456,6 +1481,57 @@ class MyStore implements RateLimitStore {
|
|
|
1456
1481
|
const limiter = createRateLimiter({ store: new MyStore() })
|
|
1457
1482
|
```
|
|
1458
1483
|
|
|
1484
|
+
### Load balancing across API keys
|
|
1485
|
+
|
|
1486
|
+
`createModelPool()` distributes requests round-robin across multiple model instances — useful when you have more than one API key:
|
|
1487
|
+
|
|
1488
|
+
```typescript
|
|
1489
|
+
import { createRateLimiter, createModelPool } from 'ai-sdk-rate-limiter'
|
|
1490
|
+
import { createOpenAI } from '@ai-sdk/openai'
|
|
1491
|
+
|
|
1492
|
+
// Two API keys, each with their own limiter tracking separate RPM limits
|
|
1493
|
+
const limiter1 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
|
|
1494
|
+
const limiter2 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
|
|
1495
|
+
|
|
1496
|
+
const openai1 = createOpenAI({ apiKey: process.env.OPENAI_KEY_1 })
|
|
1497
|
+
const openai2 = createOpenAI({ apiKey: process.env.OPENAI_KEY_2 })
|
|
1498
|
+
|
|
1499
|
+
const pool = createModelPool([
|
|
1500
|
+
limiter1.wrap(openai1('gpt-4o')),
|
|
1501
|
+
limiter2.wrap(openai2('gpt-4o')),
|
|
1502
|
+
])
|
|
1503
|
+
|
|
1504
|
+
// Use exactly like a regular model — calls alternate between the two keys
|
|
1505
|
+
const { text } = await generateText({ model: pool, prompt: 'Hello!' })
|
|
1506
|
+
```
|
|
1507
|
+
|
|
1508
|
+
Pass `{ strategy: 'random' }` for random selection instead of round-robin.
|
|
1509
|
+
|
|
1510
|
+
### Request deduplication
|
|
1511
|
+
|
|
1512
|
+
When multiple concurrent requests carry the same `dedupKey`, only one API call is made and all callers receive the same result. Useful for FAQ-style workloads where many users ask the same question simultaneously:
|
|
1513
|
+
|
|
1514
|
+
```typescript
|
|
1515
|
+
const model = limiter.wrap(openai('gpt-4o'))
|
|
1516
|
+
|
|
1517
|
+
// Server handler — two simultaneous identical requests share one API call
|
|
1518
|
+
async function handleRequest(questionId: string) {
|
|
1519
|
+
const { text } = await generateText({
|
|
1520
|
+
model,
|
|
1521
|
+
prompt: questions[questionId],
|
|
1522
|
+
providerOptions: {
|
|
1523
|
+
rateLimiter: { dedupKey: `faq:${questionId}` },
|
|
1524
|
+
},
|
|
1525
|
+
})
|
|
1526
|
+
return text
|
|
1527
|
+
}
|
|
1528
|
+
|
|
1529
|
+
// If 50 users hit the same FAQ item at the same time → 1 API call, not 50
|
|
1530
|
+
const results = await Promise.all(users.map(() => handleRequest('faq-42')))
|
|
1531
|
+
```
|
|
1532
|
+
|
|
1533
|
+
The dedup entry is removed once the request completes (success or error), so subsequent requests always make a fresh call.
|
|
1534
|
+
|
|
1459
1535
|
---
|
|
1460
1536
|
|
|
1461
1537
|
## How it works
|
package/dist/cli.js
CHANGED
|
@@ -1021,7 +1021,7 @@ async function main() {
|
|
|
1021
1021
|
Run with --help for usage.`);
|
|
1022
1022
|
process.exit(1);
|
|
1023
1023
|
}
|
|
1024
|
-
await runAudit({ provider, json });
|
|
1024
|
+
await runAudit({ ...provider !== void 0 && { provider }, json });
|
|
1025
1025
|
}
|
|
1026
1026
|
main().catch((err) => {
|
|
1027
1027
|
console.error(err instanceof Error ? err.message : String(err));
|
package/dist/index.cjs
CHANGED
|
@@ -271,6 +271,13 @@ var InMemoryStore = class {
|
|
|
271
271
|
}
|
|
272
272
|
return now + WINDOW_MS + 1;
|
|
273
273
|
}
|
|
274
|
+
/** Clear all state — useful in tests to reset between runs. */
|
|
275
|
+
reset() {
|
|
276
|
+
this.windows.clear();
|
|
277
|
+
this.dailyWindows.clear();
|
|
278
|
+
this.backoffs.clear();
|
|
279
|
+
this.pruneCounter = 0;
|
|
280
|
+
}
|
|
274
281
|
/**
|
|
275
282
|
* Periodically sweep stale entries from the scope/model maps.
|
|
276
283
|
* Runs every 200 checkAndRecord calls to avoid per-request overhead.
|
|
@@ -360,7 +367,18 @@ var RateLimitEngine = class {
|
|
|
360
367
|
}
|
|
361
368
|
if (nextSlotAtMs > Date.now()) {
|
|
362
369
|
if (local.waiters.length >= this.maxQueueSize) {
|
|
363
|
-
|
|
370
|
+
if (opts.onFull === "drop-low" && opts.priority !== "low") {
|
|
371
|
+
const lastIdx = local.waiters.length - 1;
|
|
372
|
+
const victim = local.waiters[lastIdx];
|
|
373
|
+
if (victim !== void 0 && victim.priority === "low") {
|
|
374
|
+
local.waiters.splice(lastIdx, 1);
|
|
375
|
+
victim.reject(new QueueFullError(key, this.maxQueueSize));
|
|
376
|
+
} else {
|
|
377
|
+
throw new QueueFullError(key, this.maxQueueSize);
|
|
378
|
+
}
|
|
379
|
+
} else {
|
|
380
|
+
throw new QueueFullError(key, this.maxQueueSize);
|
|
381
|
+
}
|
|
364
382
|
}
|
|
365
383
|
const estimatedWaitMs = Math.max(0, nextSlotAtMs - Date.now());
|
|
366
384
|
opts.onQueued?.(local.waiters.length, estimatedWaitMs);
|
|
@@ -481,6 +499,18 @@ var RateLimitEngine = class {
|
|
|
481
499
|
}
|
|
482
500
|
return total;
|
|
483
501
|
}
|
|
502
|
+
/**
|
|
503
|
+
* Clear all rate-limit state — sliding windows, queues, backoffs.
|
|
504
|
+
* Queued requests are rejected with ShutdownError before clearing.
|
|
505
|
+
* Useful in tests to reset between runs without recreating the instance.
|
|
506
|
+
*/
|
|
507
|
+
reset() {
|
|
508
|
+
this.shutdown();
|
|
509
|
+
this.localStates.clear();
|
|
510
|
+
if (this.store instanceof InMemoryStore) {
|
|
511
|
+
this.store.reset();
|
|
512
|
+
}
|
|
513
|
+
}
|
|
484
514
|
/**
|
|
485
515
|
* Immediately reject all queued and concurrency-waiting requests with
|
|
486
516
|
* ShutdownError. Called by Pipeline.shutdown() before draining.
|
|
@@ -668,6 +698,30 @@ var CostTracker = class {
|
|
|
668
698
|
estimateCost(inputTokens, outputTokens, inputPricePerMillion, outputPricePerMillion) {
|
|
669
699
|
return inputTokens / 1e6 * inputPricePerMillion + outputTokens / 1e6 * outputPricePerMillion;
|
|
670
700
|
}
|
|
701
|
+
getForecast() {
|
|
702
|
+
const now = Date.now();
|
|
703
|
+
this.evict(now);
|
|
704
|
+
const hourlyRate = this.entries.filter((e) => e.timestamp > now - HOUR_MS).reduce((s, e) => s + e.costUsd, 0);
|
|
705
|
+
const daySpent = this.entries.filter((e) => e.timestamp > now - DAY_MS2).reduce((s, e) => s + e.costUsd, 0);
|
|
706
|
+
const monthSpent = this.entries.filter((e) => e.timestamp > now - MONTH_MS).reduce((s, e) => s + e.costUsd, 0);
|
|
707
|
+
return {
|
|
708
|
+
hour: {
|
|
709
|
+
spentUsd: hourlyRate,
|
|
710
|
+
projectedUsd: hourlyRate,
|
|
711
|
+
ratePerHourUsd: hourlyRate
|
|
712
|
+
},
|
|
713
|
+
day: {
|
|
714
|
+
spentUsd: daySpent,
|
|
715
|
+
projectedUsd: hourlyRate * 24,
|
|
716
|
+
ratePerHourUsd: hourlyRate
|
|
717
|
+
},
|
|
718
|
+
month: {
|
|
719
|
+
spentUsd: monthSpent,
|
|
720
|
+
projectedUsd: hourlyRate * 24 * 30,
|
|
721
|
+
ratePerHourUsd: hourlyRate
|
|
722
|
+
}
|
|
723
|
+
};
|
|
724
|
+
}
|
|
671
725
|
getReport() {
|
|
672
726
|
const now = Date.now();
|
|
673
727
|
this.evict(now);
|
|
@@ -700,6 +754,10 @@ var CostTracker = class {
|
|
|
700
754
|
}
|
|
701
755
|
return { hour, day, month, byModel, byScope };
|
|
702
756
|
}
|
|
757
|
+
/** Clear all in-memory cost entries. Useful in tests to reset between runs. */
|
|
758
|
+
reset() {
|
|
759
|
+
this.entries = [];
|
|
760
|
+
}
|
|
703
761
|
// -------------------------------------------------------------------------
|
|
704
762
|
// Private helpers
|
|
705
763
|
// -------------------------------------------------------------------------
|
|
@@ -1696,6 +1754,8 @@ var DebugLogger = class {
|
|
|
1696
1754
|
};
|
|
1697
1755
|
|
|
1698
1756
|
// src/core/pipeline.ts
|
|
1757
|
+
var WARN_PREFIX = "\x1B[33m\u26A0 ai-sdk-rate-limiter\x1B[0m";
|
|
1758
|
+
var WARN_RESET = "\x1B[0m";
|
|
1699
1759
|
function resolveRetryConfig(config) {
|
|
1700
1760
|
const r = config.retry ?? {};
|
|
1701
1761
|
return {
|
|
@@ -1734,8 +1794,14 @@ var Pipeline = class {
|
|
|
1734
1794
|
this.circuits = /* @__PURE__ */ new Map();
|
|
1735
1795
|
/** Limits detected from provider response headers (lower priority than user config) */
|
|
1736
1796
|
this.detectedLimits = /* @__PURE__ */ new Map();
|
|
1797
|
+
/** In-flight promises indexed by dedup key, shared across identical concurrent requests */
|
|
1798
|
+
this.dedupMap = /* @__PURE__ */ new Map();
|
|
1799
|
+
/** Counter used to trigger periodic keyMeta GC without a setInterval */
|
|
1800
|
+
this.executeCount = 0;
|
|
1737
1801
|
/** Set to true after shutdown() is called */
|
|
1738
1802
|
this.shutdownRequested = false;
|
|
1803
|
+
/** Models that have already received an "unknown model" warning (dedup) */
|
|
1804
|
+
this.warnedModels = /* @__PURE__ */ new Set();
|
|
1739
1805
|
this.config = config;
|
|
1740
1806
|
this.log = new DebugLogger(config.debug === true);
|
|
1741
1807
|
this.engine = new RateLimitEngine({
|
|
@@ -1780,6 +1846,19 @@ var Pipeline = class {
|
|
|
1780
1846
|
*/
|
|
1781
1847
|
async execute(modelId, provider, prompt, fn, opts) {
|
|
1782
1848
|
this.log.log(modelId, "execute", { provider, priority: opts.priority, ...opts.scope !== void 0 && { scope: opts.scope } });
|
|
1849
|
+
if (opts.dedupKey !== void 0) {
|
|
1850
|
+
const existing = this.dedupMap.get(opts.dedupKey);
|
|
1851
|
+
if (existing !== void 0) {
|
|
1852
|
+
this.log.log(modelId, "dedup hit", { dedupKey: opts.dedupKey });
|
|
1853
|
+
return existing;
|
|
1854
|
+
}
|
|
1855
|
+
const { dedupKey, ...optsWithoutDedup } = opts;
|
|
1856
|
+
const promise = this.execute(modelId, provider, prompt, fn, optsWithoutDedup).finally(() => {
|
|
1857
|
+
this.dedupMap.delete(dedupKey);
|
|
1858
|
+
});
|
|
1859
|
+
this.dedupMap.set(dedupKey, promise);
|
|
1860
|
+
return promise;
|
|
1861
|
+
}
|
|
1783
1862
|
if (this.shutdownRequested) {
|
|
1784
1863
|
this.emitter.emit("dropped", {
|
|
1785
1864
|
model: modelId,
|
|
@@ -1868,6 +1947,7 @@ var Pipeline = class {
|
|
|
1868
1947
|
priority: opts.priority,
|
|
1869
1948
|
timeoutMs: opts.timeoutMs,
|
|
1870
1949
|
...opts.signal !== void 0 && { signal: opts.signal },
|
|
1950
|
+
...this.config.queue?.onFull !== void 0 && { onFull: this.config.queue.onFull },
|
|
1871
1951
|
onQueued: (queueDepth, estimatedWaitMs) => {
|
|
1872
1952
|
this.log.log(modelId, "queuing", { queueDepth, estimatedWaitMs, priority: opts.priority });
|
|
1873
1953
|
this.emitter.emit("queued", { model: modelId, provider, priority: opts.priority, queueDepth, estimatedWaitMs });
|
|
@@ -1878,6 +1958,7 @@ var Pipeline = class {
|
|
|
1878
1958
|
this.emitter.emit("dequeued", { model: modelId, provider, waitedMs, priority: opts.priority });
|
|
1879
1959
|
}
|
|
1880
1960
|
});
|
|
1961
|
+
if (++this.executeCount % 1e3 === 0) this.pruneKeyMeta();
|
|
1881
1962
|
} catch (acquireErr) {
|
|
1882
1963
|
if (acquireErr instanceof QueueFullError) {
|
|
1883
1964
|
const maxSize = this.config.queue?.maxSize;
|
|
@@ -1994,6 +2075,9 @@ var Pipeline = class {
|
|
|
1994
2075
|
getCostReport() {
|
|
1995
2076
|
return this.costTracker.getReport();
|
|
1996
2077
|
}
|
|
2078
|
+
getCostForecast() {
|
|
2079
|
+
return this.costTracker.getForecast();
|
|
2080
|
+
}
|
|
1997
2081
|
getStatus() {
|
|
1998
2082
|
const models = [];
|
|
1999
2083
|
let totalQueueDepth = 0;
|
|
@@ -2004,6 +2088,7 @@ var Pipeline = class {
|
|
|
2004
2088
|
const snapshot = this.engine.windowSnapshot(key);
|
|
2005
2089
|
const queueDepth = this.engine.queueDepth(key);
|
|
2006
2090
|
const backoffUntil = this.engine.backoffUntil(key);
|
|
2091
|
+
if (snapshot.requests === 0 && queueDepth === 0 && backoffUntil === null) continue;
|
|
2007
2092
|
totalQueueDepth += queueDepth;
|
|
2008
2093
|
models.push({
|
|
2009
2094
|
modelId,
|
|
@@ -2057,6 +2142,22 @@ var Pipeline = class {
|
|
|
2057
2142
|
await new Promise((resolve) => setTimeout(resolve, 50));
|
|
2058
2143
|
}
|
|
2059
2144
|
}
|
|
2145
|
+
/**
|
|
2146
|
+
* Clear all rate-limit, queue, cost, and circuit-breaker state.
|
|
2147
|
+
* Any currently queued requests are rejected with ShutdownError.
|
|
2148
|
+
* Useful in tests to reset between runs without recreating the limiter.
|
|
2149
|
+
*/
|
|
2150
|
+
reset() {
|
|
2151
|
+
this.engine.reset();
|
|
2152
|
+
this.costTracker.reset();
|
|
2153
|
+
this.keyMeta.clear();
|
|
2154
|
+
this.circuits.clear();
|
|
2155
|
+
this.detectedLimits.clear();
|
|
2156
|
+
this.dedupMap.clear();
|
|
2157
|
+
this.warnedModels.clear();
|
|
2158
|
+
this.executeCount = 0;
|
|
2159
|
+
this.shutdownRequested = false;
|
|
2160
|
+
}
|
|
2060
2161
|
/** Pre-load historical cost data from the persistent cost store. */
|
|
2061
2162
|
async warmUp() {
|
|
2062
2163
|
if (this.config.cost?.store) {
|
|
@@ -2092,6 +2193,21 @@ var Pipeline = class {
|
|
|
2092
2193
|
// -------------------------------------------------------------------------
|
|
2093
2194
|
// Private helpers
|
|
2094
2195
|
// -------------------------------------------------------------------------
|
|
2196
|
+
/**
|
|
2197
|
+
* Remove keyMeta entries for keys that have no recent activity.
|
|
2198
|
+
* A key is safe to evict when its sliding window is empty (all entries
|
|
2199
|
+
* older than 60s) and it has no queued or in-flight requests.
|
|
2200
|
+
* Called every 1000 executions — no setInterval, no GC interference.
|
|
2201
|
+
*/
|
|
2202
|
+
pruneKeyMeta() {
|
|
2203
|
+
for (const key of this.keyMeta.keys()) {
|
|
2204
|
+
const snapshot = this.engine.windowSnapshot(key);
|
|
2205
|
+
const queueDepth = this.engine.queueDepth(key);
|
|
2206
|
+
if (snapshot.requests === 0 && queueDepth === 0) {
|
|
2207
|
+
this.keyMeta.delete(key);
|
|
2208
|
+
}
|
|
2209
|
+
}
|
|
2210
|
+
}
|
|
2095
2211
|
getOrCreateCircuit(key) {
|
|
2096
2212
|
let cb = this.circuits.get(key);
|
|
2097
2213
|
if (!cb) {
|
|
@@ -2103,13 +2219,24 @@ var Pipeline = class {
|
|
|
2103
2219
|
resolveModelLimits(modelId, provider) {
|
|
2104
2220
|
const base = resolveModelLimits(modelId, provider, this.config.limits ?? {});
|
|
2105
2221
|
const detected = this.detectedLimits.get(`${provider}:${modelId}`);
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2222
|
+
const limits = detected ? (() => {
|
|
2223
|
+
const userOverride = this.config.limits?.[modelId] ?? {};
|
|
2224
|
+
return {
|
|
2225
|
+
...base,
|
|
2226
|
+
...!("rpm" in userOverride) && detected.rpm !== void 0 && { rpm: detected.rpm },
|
|
2227
|
+
...!("itpm" in userOverride) && detected.itpm !== void 0 && { itpm: detected.itpm }
|
|
2228
|
+
};
|
|
2229
|
+
})() : base;
|
|
2230
|
+
const warnKey = `${provider}:${modelId}`;
|
|
2231
|
+
if (!this.warnedModels.has(warnKey) && !isKnownModel(modelId, provider) && limits.inputPricePerMillion === 0 && limits.outputPricePerMillion === 0 && !(this.config.limits?.[modelId]?.inputPricePerMillion !== void 0 || this.config.limits?.[modelId]?.outputPricePerMillion !== void 0)) {
|
|
2232
|
+
this.warnedModels.add(warnKey);
|
|
2233
|
+
console.warn(
|
|
2234
|
+
`${WARN_PREFIX}: Unknown model '${modelId}' (provider: '${provider}'). Using fallback limits (${limits.rpm} RPM). Cost tracking is disabled.
|
|
2235
|
+
Add pricing to config.limits to enable it:
|
|
2236
|
+
limits: { '${modelId}': { inputPricePerMillion: <n>, outputPricePerMillion: <n> } }` + WARN_RESET
|
|
2237
|
+
);
|
|
2238
|
+
}
|
|
2239
|
+
return limits;
|
|
2113
2240
|
}
|
|
2114
2241
|
};
|
|
2115
2242
|
|
|
@@ -2122,7 +2249,8 @@ function getPerRequestOptions(params, queueTimeout) {
|
|
|
2122
2249
|
metadata: raw?.metadata ?? {},
|
|
2123
2250
|
skipBudgetCheck: raw?._skipBudgetCheck ?? false,
|
|
2124
2251
|
scope: raw?.scope,
|
|
2125
|
-
callTimeout: raw?.callTimeout
|
|
2252
|
+
callTimeout: raw?.callTimeout,
|
|
2253
|
+
dedupKey: raw?.dedupKey
|
|
2126
2254
|
};
|
|
2127
2255
|
}
|
|
2128
2256
|
function extractTokenUsage(usage) {
|
|
@@ -2138,7 +2266,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2138
2266
|
// wrapGenerate — non-streaming
|
|
2139
2267
|
// -----------------------------------------------------------------------
|
|
2140
2268
|
async wrapGenerate({ doGenerate, params, model }) {
|
|
2141
|
-
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
|
|
2269
|
+
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
|
|
2142
2270
|
const modelId = model.modelId;
|
|
2143
2271
|
const provider = model.provider;
|
|
2144
2272
|
const startMs = Date.now();
|
|
@@ -2154,6 +2282,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2154
2282
|
skipBudgetCheck,
|
|
2155
2283
|
...scope !== void 0 && { scope },
|
|
2156
2284
|
...callTimeout !== void 0 && { callTimeout },
|
|
2285
|
+
...dedupKey !== void 0 && { dedupKey },
|
|
2157
2286
|
...params.abortSignal !== void 0 && { signal: params.abortSignal }
|
|
2158
2287
|
}
|
|
2159
2288
|
);
|
|
@@ -2167,7 +2296,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2167
2296
|
// wrapStream — streaming
|
|
2168
2297
|
// -----------------------------------------------------------------------
|
|
2169
2298
|
async wrapStream({ doStream, params, model }) {
|
|
2170
|
-
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
|
|
2299
|
+
const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
|
|
2171
2300
|
const modelId = model.modelId;
|
|
2172
2301
|
const provider = model.provider;
|
|
2173
2302
|
const startMs = Date.now();
|
|
@@ -2183,6 +2312,7 @@ function createMiddleware(pipeline, queueTimeout) {
|
|
|
2183
2312
|
skipBudgetCheck,
|
|
2184
2313
|
...scope !== void 0 && { scope },
|
|
2185
2314
|
...callTimeout !== void 0 && { callTimeout },
|
|
2315
|
+
...dedupKey !== void 0 && { dedupKey },
|
|
2186
2316
|
...params.abortSignal !== void 0 && { signal: params.abortSignal }
|
|
2187
2317
|
}
|
|
2188
2318
|
);
|
|
@@ -2508,6 +2638,9 @@ function createRateLimiter(config = {}) {
|
|
|
2508
2638
|
getCostReport() {
|
|
2509
2639
|
return pipeline.getCostReport();
|
|
2510
2640
|
},
|
|
2641
|
+
getCostForecast() {
|
|
2642
|
+
return pipeline.getCostForecast();
|
|
2643
|
+
},
|
|
2511
2644
|
getStatus() {
|
|
2512
2645
|
return pipeline.getStatus();
|
|
2513
2646
|
},
|
|
@@ -2528,6 +2661,41 @@ function createRateLimiter(config = {}) {
|
|
|
2528
2661
|
},
|
|
2529
2662
|
warmUp() {
|
|
2530
2663
|
return pipeline.warmUp();
|
|
2664
|
+
},
|
|
2665
|
+
reset() {
|
|
2666
|
+
pipeline.reset();
|
|
2667
|
+
}
|
|
2668
|
+
};
|
|
2669
|
+
}
|
|
2670
|
+
|
|
2671
|
+
// src/adapters/model-pool.ts
|
|
2672
|
+
/**
 * Builds a single model-shaped object that spreads calls across several models.
 *
 * The returned object reports the `modelId` and `provider` of the first model
 * in the pool, and forwards every `doGenerate` / `doStream` call to one pool
 * member chosen by the configured strategy.
 *
 * @param {ArrayLike<object>|Iterable<object>} models - Models to pool. Each
 *   entry must expose `modelId`, `provider`, `doGenerate(params)` and
 *   `doStream(params)`.
 * @param {{ strategy?: "round-robin" | "random" }} [options] - Selection
 *   options. `strategy` defaults to `"round-robin"`; any value other than
 *   `"random"` is treated as round-robin.
 * @returns {{ readonly modelId: *, readonly provider: *, doGenerate: Function, doStream: Function }}
 *   The pooled model facade.
 * @throws {Error} If `models` is missing or contains no entries.
 */
function createModelPool(models, options) {
  // Snapshot the input so later mutation of the caller's array cannot change
  // the rotation or empty the pool out from under in-flight callers.
  const pool = Array.from(models ?? []);
  if (pool.length === 0) {
    throw new Error("createModelPool: at least one model is required");
  }
  const strategy = options?.strategy ?? "round-robin";
  // Cursor for round-robin selection; always kept within [0, pool.length).
  let index = 0;
  function pick() {
    if (strategy === "random") {
      return pool[Math.floor(Math.random() * pool.length)];
    }
    const model = pool[index];
    index = (index + 1) % pool.length;
    return model;
  }
  // The first model supplies the identity reported by the pool.
  const primary = pool[0];
  return {
    get modelId() {
      return primary.modelId;
    },
    get provider() {
      return primary.provider;
    },
    doGenerate(params) {
      return pick().doGenerate(params);
    },
    doStream(params) {
      return pick().doStream(params);
    }
  };
}
|
|
@@ -2546,6 +2714,7 @@ exports.RateLimitExceededError = RateLimitExceededError;
|
|
|
2546
2714
|
exports.RateLimiterError = RateLimiterError;
|
|
2547
2715
|
exports.RetryExhaustedError = RetryExhaustedError;
|
|
2548
2716
|
exports.ShutdownError = ShutdownError;
|
|
2717
|
+
exports.createModelPool = createModelPool;
|
|
2549
2718
|
exports.createRateLimiter = createRateLimiter;
|
|
2550
2719
|
exports.isKnownModel = isKnownModel;
|
|
2551
2720
|
exports.rateLimited = rateLimited;
|