ai-sdk-rate-limiter 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,195 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ---
9
+
10
+ ## [Unreleased]
11
+
12
+ ---
13
+
14
+ ## [0.13.0] - 2026-03-13
15
+
16
+ ### Added
17
+ - **`limiter.reset()`** — clear all rate-limit, queue, cost, and circuit-breaker state without recreating the instance. Queued requests are rejected with `ShutdownError`. Primarily useful in tests to reset between cases with a shared limiter instance.
18
+ - **`queue.onFull: 'drop-low'`** fully implemented — when the queue is at capacity and a `high` or `normal` priority request arrives, the tail `low`-priority waiter is evicted (rejected with `QueueFullError`) to make room. Useful for mixed workloads where background batch jobs should never block user-facing requests.
19
+
20
+ ### Changed
21
+ - Unknown models with zero pricing now emit a one-time `console.warn` on first use, pointing at the exact `config.limits` fix needed to enable cost tracking. Known registry models and models with user-supplied pricing are silent.
22
+
23
+ ---
24
+
25
+ ## [0.12.0] - 2026-03-12
26
+
27
+ ### Added
28
+ - **`getCostForecast()`** — project end-of-period spend based on the current hourly rate. Returns `{ hour, day, month }` each with `spentUsd`, `projectedUsd`, and `ratePerHourUsd`. Useful for alerting before a budget cap is hit.
29
+ - **`createModelPool(models, options?)`** — round-robin (or random) load balancer across multiple wrapped model instances. Distributes calls evenly across API keys or model variants. Import from `ai-sdk-rate-limiter`.
30
+ - **Request deduplication** — pass `dedupKey` in `providerOptions.rateLimiter` to make concurrent identical requests share a single API call. All callers receive the same result; the dedup entry is cleared on completion so the next request always makes a fresh call.
31
+
32
+ ---
33
+
34
+ ## [0.11.0] - 2026-03-12
35
+
36
+ ### Added
37
+ - **Debug mode** — set `debug: true` on `createRateLimiter()` to enable structured console logging for every rate-limit decision, queue entry/exit, slot acquisition, circuit breaker state change, and cost recording. Zero overhead when disabled.
38
+ - **Config validation** — `createRateLimiter()` now validates your config at construction time and emits `console.warn` for common misconfigurations:
39
+ - `cost.store` is set — reminder to call `warmUp()` so historical spend is pre-loaded
40
+ - `circuit.failureThreshold < 3` (too sensitive, risks false trips)
41
+ - `retry.retryOn` explicitly excludes 429 (defeats rate-limit retry)
42
+ - `queue.timeout < 3000ms` (too short, requests will time out before being served)
43
+ - `cost.budget` set without `onExceeded` (uses silent default `'throw'`)
44
+ - `cost.onExceeded: 'fallback'` — reminder to configure a fallback model
45
+ - GitHub Actions CI workflow (Node 18 / 20 / 22 matrix)
46
+ - `CHANGELOG.md` with retroactive entries from v0.1.0
47
+
48
+ ### Fixed
49
+ - `DebugLogger` details serialization: empty objects no longer emit trailing `()`
50
+
51
+ ---
52
+
53
+ ## [0.10.0] - 2026-03-12
54
+
55
+ ### Added
56
+ - `ai-sdk-rate-limiter/middleware` entry point
57
+ - `createRateLimiterMiddleware(limiter, opts)` — returns `{ middleware, errorHandler }` for Express
58
+ - `createRateLimiterErrorHandler(opts)` — standalone 4-arg Express error handler
59
+ - `createHonoMiddleware(limiter, opts)` — Hono middleware with `c.var.rateLimiter`
60
+ - `mapErrorToResponse(err)` — utility for Fastify and custom frameworks
61
+ - `RateLimiterRequestContext` type + `http.IncomingMessage` augmentation for `req.rateLimiter`
62
+ - Automatic error → HTTP mapping: `QueueTimeoutError` → 503, `BudgetExceededError` → 402, etc.
63
+ - `injectHeaders` option: adds `X-RateLimit-*` informational headers to responses
64
+
65
+ ### Added (examples)
66
+ - `examples/multi-tenant-express/` — Express API with per-user scoped rate limits (free/pro tiers)
67
+ - `examples/batch-processing/` — Concurrent batch jobs with priority queuing + graceful shutdown
68
+ - `examples/budget-alerts/` — Slack/webhook budget alerts with per-scope spend breakdown
69
+
70
+ ---
71
+
72
+ ## [0.9.0] - 2026-03-12
73
+
74
+ ### Added
75
+ - **Circuit breaker** — auto-opens on repeated 5xx failures, half-open probe, configurable thresholds
76
+ - `CircuitBreakerConfig` in `RateLimiterConfig.circuit`
77
+ - `CircuitOpenError` thrown when circuit is open
78
+ - `circuitOpen` / `circuitClosed` events
79
+ - **Graceful shutdown** — `limiter.shutdown({ drainMs })`, `ShutdownError`
80
+ - **Persistent cost tracking** — `CostStore` interface + `RedisCostStore` in `ai-sdk-rate-limiter/redis`
81
+ - `limiter.warmUp()` pre-loads historical spend on startup
82
+ - **Per-scope cost attribution** — `getCostReport().byScope` breakdown per user/org/tenant
83
+ - **Fallback chains** — `fallback` accepts `WrappableModel[]`, walked in order on `BudgetExceededError`
84
+ - **Call timeout** — `callTimeout` in retry config and per-request options (uses `Promise.race`)
85
+ - **Auto-detected limits** — parses `x-ratelimit-limit-*` response headers, user config wins
86
+ - `limitsDetected` event
87
+ - **Prometheus metrics** — `createPrometheusPlugin()` in `ai-sdk-rate-limiter/prometheus`
88
+ - **StatsD / DogStatsD** — `createStatsDPlugin(client)` in `ai-sdk-rate-limiter/statsd`
89
+ - **Drop hooks** — `DroppedEvent` now includes `reason`, `waitedMs`, `queueDepth`, `scope`, `metadata`
90
+
91
+ ---
92
+
93
+ ## [0.8.0] - 2026-03-11
94
+
95
+ ### Added
96
+ - Redis store for multi-instance rate limiting (`ai-sdk-rate-limiter/redis`)
97
+ - `RedisStore` — Lua-script-based atomic sliding window shared across instances
98
+ - Fail-open on Redis errors (enforcement suspended, never blocks requests)
99
+ - Compatible with ioredis, node-redis, Upstash Redis
100
+ - `rpd` (requests per day) limit support — rolling 24-hour window
101
+ - `otpm` (output tokens per minute) limit support — based on completed request actuals
102
+
103
+ ---
104
+
105
+ ## [0.7.1] - 2026-03-10
106
+
107
+ ### Fixed
108
+ - `Retry-After` header parsing: correctly handles duration strings like `"6m30s"` (previously parsed as 6s)
109
+
110
+ ---
111
+
112
+ ## [0.7.0] - 2026-03-10
113
+
114
+ ### Added
115
+ - Raw SDK proxy — `limiter.rawProxy(client)` wraps native OpenAI/Anthropic/Groq/Mistral/Cohere clients
116
+ - Transparent `Proxy`-based drop-in with no API changes
117
+ - Streaming support via `AsyncIterable` wrapping for usage chunk capture
118
+ - `rateLimited(client, opts)` standalone factory
119
+ - Budget fallback routing — `onExceeded: 'fallback'` transparently reroutes to a cheaper model
120
+ - `limiter.wrap(model, { fallback: cheaperModel })`
121
+ - `usingFallback` field on `budgetHit` event
122
+
123
+ ---
124
+
125
+ ## [0.6.0] - 2026-03-09
126
+
127
+ ### Added
128
+ - OpenTelemetry plugin (`ai-sdk-rate-limiter/otel`)
129
+ - `createOtelPlugin(tracer)` — emits GenAI-spec spans for every request
130
+ - No hard dependency on `@opentelemetry/api` (structural typing)
131
+ - Span duration reconstructed from `latencyMs` for accurate wall-clock timing
132
+ - Testing utilities (`ai-sdk-rate-limiter/testing`)
133
+ - `createTestLimiter()` — records all completed calls for assertions
134
+ - `limiter.getCalls()` / `limiter.reset()`
135
+
136
+ ---
137
+
138
+ ## [0.5.0] - 2026-03-08
139
+
140
+ ### Added
141
+ - Concurrency limits — `maxConcurrent` per model, enforced as a semaphore
142
+ - Multi-tenant scoped limits — `config.scopes` with `*` wildcard patterns
143
+ - Each scope gets its own independent sliding window
144
+ - Per-request scope via `providerOptions.rateLimiter.scope`
145
+ - `queue.onFull: 'drop-low'` — evict lowest-priority requests before throwing `QueueFullError`
146
+ - `AbortSignal` propagation through both rate-limit and concurrency queues
147
+
148
+ ---
149
+
150
+ ## [0.4.0] - 2026-03-07
151
+
152
+ ### Added
153
+ - Priority queue — `high` / `normal` / `low` priorities; FIFO within same priority
154
+ - Per-request options via `providerOptions.rateLimiter` (priority, timeout, scope)
155
+ - `limiter.estimatedWait(modelId)` — returns ms until next available slot
156
+ - `QueueFullError` when queue is at `maxSize` capacity
157
+
158
+ ---
159
+
160
+ ## [0.3.0] - 2026-03-06
161
+
162
+ ### Added
163
+ - Cost tracking — records actual token usage per request
164
+ - `getCostReport()` with hourly / daily / monthly rolling windows
165
+ - `byModel` breakdown in cost report
166
+ - Budget caps — `cost.budget` with `hourly`, `daily`, `monthly` limits
167
+ - `onExceeded: 'throw' | 'queue'` behavior
168
+ - `BudgetExceededError` with period, current spend, and limit
169
+ - `nextBudgetClearMs()` used internally for queue-mode budget holds
170
+
171
+ ---
172
+
173
+ ## [0.2.0] - 2026-03-05
174
+
175
+ ### Added
176
+ - Model registry expanded to include Groq, Mistral, and Cohere
177
+ - `GROQ_MODELS`, `MISTRAL_MODELS`, `COHERE_MODELS`
178
+ - `isKnownModel(modelId, provider)` utility
179
+
180
+ ---
181
+
182
+ ## [0.1.0] - 2026-03-04
183
+
184
+ ### Added
185
+ - Initial release
186
+ - Sliding window rate limiting (RPM + ITPM) for OpenAI, Anthropic, Google Gemini models
187
+ - Priority queue with drain timer (`scheduleDrain` per model)
188
+ - Exponential backoff retry with jitter, `Retry-After` header support
189
+ - Vercel AI SDK `.wrap()` adapter via `LanguageModelV4Middleware`
190
+ - In-memory store (default, zero config)
191
+ - `createRateLimiter()` factory
192
+ - `limiter.getStatus()` — queue depths and window state per model
193
+ - Event system: `queued`, `dequeued`, `retrying`, `rateLimited`, `budgetHit`, `dropped`, `completed`
194
+ - `RateLimiterError` hierarchy: `RateLimitExceededError`, `QueueTimeoutError`, `RetryExhaustedError`
195
+ - Built-in model registry for OpenAI and Anthropic with pricing data
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 ai-sdk-rate-limiter contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -4,12 +4,16 @@ Smart rate limiting, queuing, and cost tracking for AI API calls. Works across p
4
4
 
5
5
  [![npm](https://img.shields.io/npm/v/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
6
6
  [![CI](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml/badge.svg)](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml)
7
+ [![codecov](https://codecov.io/gh/piyushgupta344/ai-sdk-rate-limiter/graph/badge.svg)](https://codecov.io/gh/piyushgupta344/ai-sdk-rate-limiter)
7
8
  [![npm downloads](https://img.shields.io/npm/dm/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
9
+ [![docs](https://img.shields.io/badge/docs-piyushgupta344.github.io-blue)](https://piyushgupta344.github.io/ai-sdk-rate-limiter/)
8
10
 
9
11
  ```
10
12
  npm install ai-sdk-rate-limiter
11
13
  ```
12
14
 
15
+ **[Documentation →](https://piyushgupta344.github.io/ai-sdk-rate-limiter/)**
16
+
13
17
  ---
14
18
 
15
19
  ## The problem
@@ -557,6 +561,27 @@ Costs are based on **actual token counts** from API responses — not estimates.
557
561
 
558
562
  `byScope` is populated automatically when requests carry a `scope` (either set on `limiter.wrap()` or via `providerOptions.rateLimiter.scope`). Unscoped requests don't appear in `byScope`.
559
563
 
564
+ ### Cost forecasting
565
+
566
+ `getCostForecast()` projects your end-of-period spend based on the current hourly rate. Useful for alerting before a budget cap is hit:
567
+
568
+ ```typescript
569
+ const forecast = limiter.getCostForecast()
570
+
571
+ console.log(forecast)
572
+ // {
573
+ // hour: { spentUsd: 1.20, projectedUsd: 1.20, ratePerHourUsd: 1.20 },
574
+ // day: { spentUsd: 3.50, projectedUsd: 28.80, ratePerHourUsd: 1.20 },
575
+ // month: { spentUsd: 8.10, projectedUsd: 864, ratePerHourUsd: 1.20 },
576
+ // }
577
+
578
+ if (forecast.day.projectedUsd > 40) {
579
+ console.warn(`Heads up — on track to spend $${forecast.day.projectedUsd.toFixed(2)} today`)
580
+ }
581
+ ```
582
+
583
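+ `projectedUsd` = current hourly rate × hours in the period (1 for `hour`, 24 for `day`, 720 for `month` — a month is counted as 30 days). The hourly rate is the total spend over the **last 60 minutes**, so the projection responds quickly to usage spikes; it is computed independently of `spentUsd`, which reports what has actually been spent in each rolling window.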
584
+
560
585
  ---
561
586
 
562
587
  ## Budget fallback routing
@@ -1456,6 +1481,57 @@ class MyStore implements RateLimitStore {
1456
1481
  const limiter = createRateLimiter({ store: new MyStore() })
1457
1482
  ```
1458
1483
 
1484
+ ### Load balancing across API keys
1485
+
1486
+ `createModelPool()` distributes requests round-robin across multiple model instances — useful when you have more than one API key:
1487
+
1488
+ ```typescript
1489
+ import { createRateLimiter, createModelPool } from 'ai-sdk-rate-limiter'
1490
+ import { createOpenAI } from '@ai-sdk/openai'
1491
+
1492
+ // Two API keys, each with their own limiter tracking separate RPM limits
1493
+ const limiter1 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
1494
+ const limiter2 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
1495
+
1496
+ const openai1 = createOpenAI({ apiKey: process.env.OPENAI_KEY_1 })
1497
+ const openai2 = createOpenAI({ apiKey: process.env.OPENAI_KEY_2 })
1498
+
1499
+ const pool = createModelPool([
1500
+ limiter1.wrap(openai1('gpt-4o')),
1501
+ limiter2.wrap(openai2('gpt-4o')),
1502
+ ])
1503
+
1504
+ // Use exactly like a regular model — calls alternate between the two keys
1505
+ const { text } = await generateText({ model: pool, prompt: 'Hello!' })
1506
+ ```
1507
+
1508
+ Pass `{ strategy: 'random' }` for random selection instead of round-robin.
1509
+
1510
+ ### Request deduplication
1511
+
1512
+ When multiple concurrent requests carry the same `dedupKey`, only one API call is made and all callers receive the same result. Useful for FAQ-style workloads where many users ask the same question simultaneously:
1513
+
1514
+ ```typescript
1515
+ const model = limiter.wrap(openai('gpt-4o'))
1516
+
1517
+ // Server handler — two simultaneous identical requests share one API call
1518
+ async function handleRequest(questionId: string) {
1519
+ const { text } = await generateText({
1520
+ model,
1521
+ prompt: questions[questionId],
1522
+ providerOptions: {
1523
+ rateLimiter: { dedupKey: `faq:${questionId}` },
1524
+ },
1525
+ })
1526
+ return text
1527
+ }
1528
+
1529
+ // If 50 users hit the same FAQ item at the same time → 1 API call, not 50
1530
+ const results = await Promise.all(users.map(() => handleRequest('faq-42')))
1531
+ ```
1532
+
1533
+ The dedup entry is removed once the request completes (success or error), so subsequent requests always make a fresh call.
1534
+
1459
1535
  ---
1460
1536
 
1461
1537
  ## How it works
package/dist/cli.js CHANGED
@@ -1021,7 +1021,7 @@ async function main() {
1021
1021
  Run with --help for usage.`);
1022
1022
  process.exit(1);
1023
1023
  }
1024
- await runAudit({ provider, json });
1024
+ await runAudit({ ...provider !== void 0 && { provider }, json });
1025
1025
  }
1026
1026
  main().catch((err) => {
1027
1027
  console.error(err instanceof Error ? err.message : String(err));
package/dist/index.cjs CHANGED
@@ -271,6 +271,13 @@ var InMemoryStore = class {
271
271
  }
272
272
  return now + WINDOW_MS + 1;
273
273
  }
274
+ /** Clear all state — useful in tests to reset between runs. */
275
+ reset() {
276
+ this.windows.clear();
277
+ this.dailyWindows.clear();
278
+ this.backoffs.clear();
279
+ this.pruneCounter = 0;
280
+ }
274
281
  /**
275
282
  * Periodically sweep stale entries from the scope/model maps.
276
283
  * Runs every 200 checkAndRecord calls to avoid per-request overhead.
@@ -360,7 +367,18 @@ var RateLimitEngine = class {
360
367
  }
361
368
  if (nextSlotAtMs > Date.now()) {
362
369
  if (local.waiters.length >= this.maxQueueSize) {
363
- throw new QueueFullError(key, this.maxQueueSize);
370
+ if (opts.onFull === "drop-low" && opts.priority !== "low") {
371
+ const lastIdx = local.waiters.length - 1;
372
+ const victim = local.waiters[lastIdx];
373
+ if (victim !== void 0 && victim.priority === "low") {
374
+ local.waiters.splice(lastIdx, 1);
375
+ victim.reject(new QueueFullError(key, this.maxQueueSize));
376
+ } else {
377
+ throw new QueueFullError(key, this.maxQueueSize);
378
+ }
379
+ } else {
380
+ throw new QueueFullError(key, this.maxQueueSize);
381
+ }
364
382
  }
365
383
  const estimatedWaitMs = Math.max(0, nextSlotAtMs - Date.now());
366
384
  opts.onQueued?.(local.waiters.length, estimatedWaitMs);
@@ -481,6 +499,18 @@ var RateLimitEngine = class {
481
499
  }
482
500
  return total;
483
501
  }
502
+ /**
503
+ * Clear all rate-limit state — sliding windows, queues, backoffs.
504
+ * Queued requests are rejected with ShutdownError before clearing.
505
+ * Useful in tests to reset between runs without recreating the instance.
506
+ */
507
+ reset() {
508
+ this.shutdown();
509
+ this.localStates.clear();
510
+ if (this.store instanceof InMemoryStore) {
511
+ this.store.reset();
512
+ }
513
+ }
484
514
  /**
485
515
  * Immediately reject all queued and concurrency-waiting requests with
486
516
  * ShutdownError. Called by Pipeline.shutdown() before draining.
@@ -668,6 +698,30 @@ var CostTracker = class {
668
698
  estimateCost(inputTokens, outputTokens, inputPricePerMillion, outputPricePerMillion) {
669
699
  return inputTokens / 1e6 * inputPricePerMillion + outputTokens / 1e6 * outputPricePerMillion;
670
700
  }
701
+ getForecast() {
702
+ const now = Date.now();
703
+ this.evict(now);
704
+ const hourlyRate = this.entries.filter((e) => e.timestamp > now - HOUR_MS).reduce((s, e) => s + e.costUsd, 0);
705
+ const daySpent = this.entries.filter((e) => e.timestamp > now - DAY_MS2).reduce((s, e) => s + e.costUsd, 0);
706
+ const monthSpent = this.entries.filter((e) => e.timestamp > now - MONTH_MS).reduce((s, e) => s + e.costUsd, 0);
707
+ return {
708
+ hour: {
709
+ spentUsd: hourlyRate,
710
+ projectedUsd: hourlyRate,
711
+ ratePerHourUsd: hourlyRate
712
+ },
713
+ day: {
714
+ spentUsd: daySpent,
715
+ projectedUsd: hourlyRate * 24,
716
+ ratePerHourUsd: hourlyRate
717
+ },
718
+ month: {
719
+ spentUsd: monthSpent,
720
+ projectedUsd: hourlyRate * 24 * 30,
721
+ ratePerHourUsd: hourlyRate
722
+ }
723
+ };
724
+ }
671
725
  getReport() {
672
726
  const now = Date.now();
673
727
  this.evict(now);
@@ -700,6 +754,10 @@ var CostTracker = class {
700
754
  }
701
755
  return { hour, day, month, byModel, byScope };
702
756
  }
757
+ /** Clear all in-memory cost entries. Useful in tests to reset between runs. */
758
+ reset() {
759
+ this.entries = [];
760
+ }
703
761
  // -------------------------------------------------------------------------
704
762
  // Private helpers
705
763
  // -------------------------------------------------------------------------
@@ -1696,6 +1754,8 @@ var DebugLogger = class {
1696
1754
  };
1697
1755
 
1698
1756
  // src/core/pipeline.ts
1757
+ var WARN_PREFIX = "\x1B[33m\u26A0 ai-sdk-rate-limiter\x1B[0m";
1758
+ var WARN_RESET = "\x1B[0m";
1699
1759
  function resolveRetryConfig(config) {
1700
1760
  const r = config.retry ?? {};
1701
1761
  return {
@@ -1734,8 +1794,14 @@ var Pipeline = class {
1734
1794
  this.circuits = /* @__PURE__ */ new Map();
1735
1795
  /** Limits detected from provider response headers (lower priority than user config) */
1736
1796
  this.detectedLimits = /* @__PURE__ */ new Map();
1797
+ /** In-flight promises indexed by dedup key, shared across identical concurrent requests */
1798
+ this.dedupMap = /* @__PURE__ */ new Map();
1799
+ /** Counter used to trigger periodic keyMeta GC without a setInterval */
1800
+ this.executeCount = 0;
1737
1801
  /** Set to true after shutdown() is called */
1738
1802
  this.shutdownRequested = false;
1803
+ /** Models that have already received an "unknown model" warning (dedup) */
1804
+ this.warnedModels = /* @__PURE__ */ new Set();
1739
1805
  this.config = config;
1740
1806
  this.log = new DebugLogger(config.debug === true);
1741
1807
  this.engine = new RateLimitEngine({
@@ -1780,6 +1846,19 @@ var Pipeline = class {
1780
1846
  */
1781
1847
  async execute(modelId, provider, prompt, fn, opts) {
1782
1848
  this.log.log(modelId, "execute", { provider, priority: opts.priority, ...opts.scope !== void 0 && { scope: opts.scope } });
1849
+ if (opts.dedupKey !== void 0) {
1850
+ const existing = this.dedupMap.get(opts.dedupKey);
1851
+ if (existing !== void 0) {
1852
+ this.log.log(modelId, "dedup hit", { dedupKey: opts.dedupKey });
1853
+ return existing;
1854
+ }
1855
+ const { dedupKey, ...optsWithoutDedup } = opts;
1856
+ const promise = this.execute(modelId, provider, prompt, fn, optsWithoutDedup).finally(() => {
1857
+ this.dedupMap.delete(dedupKey);
1858
+ });
1859
+ this.dedupMap.set(dedupKey, promise);
1860
+ return promise;
1861
+ }
1783
1862
  if (this.shutdownRequested) {
1784
1863
  this.emitter.emit("dropped", {
1785
1864
  model: modelId,
@@ -1868,6 +1947,7 @@ var Pipeline = class {
1868
1947
  priority: opts.priority,
1869
1948
  timeoutMs: opts.timeoutMs,
1870
1949
  ...opts.signal !== void 0 && { signal: opts.signal },
1950
+ ...this.config.queue?.onFull !== void 0 && { onFull: this.config.queue.onFull },
1871
1951
  onQueued: (queueDepth, estimatedWaitMs) => {
1872
1952
  this.log.log(modelId, "queuing", { queueDepth, estimatedWaitMs, priority: opts.priority });
1873
1953
  this.emitter.emit("queued", { model: modelId, provider, priority: opts.priority, queueDepth, estimatedWaitMs });
@@ -1878,6 +1958,7 @@ var Pipeline = class {
1878
1958
  this.emitter.emit("dequeued", { model: modelId, provider, waitedMs, priority: opts.priority });
1879
1959
  }
1880
1960
  });
1961
+ if (++this.executeCount % 1e3 === 0) this.pruneKeyMeta();
1881
1962
  } catch (acquireErr) {
1882
1963
  if (acquireErr instanceof QueueFullError) {
1883
1964
  const maxSize = this.config.queue?.maxSize;
@@ -1994,6 +2075,9 @@ var Pipeline = class {
1994
2075
  getCostReport() {
1995
2076
  return this.costTracker.getReport();
1996
2077
  }
2078
+ getCostForecast() {
2079
+ return this.costTracker.getForecast();
2080
+ }
1997
2081
  getStatus() {
1998
2082
  const models = [];
1999
2083
  let totalQueueDepth = 0;
@@ -2004,6 +2088,7 @@ var Pipeline = class {
2004
2088
  const snapshot = this.engine.windowSnapshot(key);
2005
2089
  const queueDepth = this.engine.queueDepth(key);
2006
2090
  const backoffUntil = this.engine.backoffUntil(key);
2091
+ if (snapshot.requests === 0 && queueDepth === 0 && backoffUntil === null) continue;
2007
2092
  totalQueueDepth += queueDepth;
2008
2093
  models.push({
2009
2094
  modelId,
@@ -2057,6 +2142,22 @@ var Pipeline = class {
2057
2142
  await new Promise((resolve) => setTimeout(resolve, 50));
2058
2143
  }
2059
2144
  }
2145
+ /**
2146
+ * Clear all rate-limit, queue, cost, and circuit-breaker state.
2147
+ * Any currently queued requests are rejected with ShutdownError.
2148
+ * Useful in tests to reset between runs without recreating the limiter.
2149
+ */
2150
+ reset() {
2151
+ this.engine.reset();
2152
+ this.costTracker.reset();
2153
+ this.keyMeta.clear();
2154
+ this.circuits.clear();
2155
+ this.detectedLimits.clear();
2156
+ this.dedupMap.clear();
2157
+ this.warnedModels.clear();
2158
+ this.executeCount = 0;
2159
+ this.shutdownRequested = false;
2160
+ }
2060
2161
  /** Pre-load historical cost data from the persistent cost store. */
2061
2162
  async warmUp() {
2062
2163
  if (this.config.cost?.store) {
@@ -2092,6 +2193,21 @@ var Pipeline = class {
2092
2193
  // -------------------------------------------------------------------------
2093
2194
  // Private helpers
2094
2195
  // -------------------------------------------------------------------------
2196
+ /**
2197
+ * Remove keyMeta entries for keys that have no recent activity.
2198
+ * A key is safe to evict when its sliding window is empty (all entries
2199
+ * older than 60s) and it has no queued or in-flight requests.
2200
+ * Called every 1000 executions — no setInterval, no GC interference.
2201
+ */
2202
+ pruneKeyMeta() {
2203
+ for (const key of this.keyMeta.keys()) {
2204
+ const snapshot = this.engine.windowSnapshot(key);
2205
+ const queueDepth = this.engine.queueDepth(key);
2206
+ if (snapshot.requests === 0 && queueDepth === 0) {
2207
+ this.keyMeta.delete(key);
2208
+ }
2209
+ }
2210
+ }
2095
2211
  getOrCreateCircuit(key) {
2096
2212
  let cb = this.circuits.get(key);
2097
2213
  if (!cb) {
@@ -2103,13 +2219,24 @@ var Pipeline = class {
2103
2219
  resolveModelLimits(modelId, provider) {
2104
2220
  const base = resolveModelLimits(modelId, provider, this.config.limits ?? {});
2105
2221
  const detected = this.detectedLimits.get(`${provider}:${modelId}`);
2106
- if (!detected) return base;
2107
- const userOverride = this.config.limits?.[modelId] ?? {};
2108
- return {
2109
- ...base,
2110
- ...!("rpm" in userOverride) && detected.rpm !== void 0 && { rpm: detected.rpm },
2111
- ...!("itpm" in userOverride) && detected.itpm !== void 0 && { itpm: detected.itpm }
2112
- };
2222
+ const limits = detected ? (() => {
2223
+ const userOverride = this.config.limits?.[modelId] ?? {};
2224
+ return {
2225
+ ...base,
2226
+ ...!("rpm" in userOverride) && detected.rpm !== void 0 && { rpm: detected.rpm },
2227
+ ...!("itpm" in userOverride) && detected.itpm !== void 0 && { itpm: detected.itpm }
2228
+ };
2229
+ })() : base;
2230
+ const warnKey = `${provider}:${modelId}`;
2231
+ if (!this.warnedModels.has(warnKey) && !isKnownModel(modelId, provider) && limits.inputPricePerMillion === 0 && limits.outputPricePerMillion === 0 && !(this.config.limits?.[modelId]?.inputPricePerMillion !== void 0 || this.config.limits?.[modelId]?.outputPricePerMillion !== void 0)) {
2232
+ this.warnedModels.add(warnKey);
2233
+ console.warn(
2234
+ `${WARN_PREFIX}: Unknown model '${modelId}' (provider: '${provider}'). Using fallback limits (${limits.rpm} RPM). Cost tracking is disabled.
2235
+ Add pricing to config.limits to enable it:
2236
+ limits: { '${modelId}': { inputPricePerMillion: <n>, outputPricePerMillion: <n> } }` + WARN_RESET
2237
+ );
2238
+ }
2239
+ return limits;
2113
2240
  }
2114
2241
  };
2115
2242
 
@@ -2122,7 +2249,8 @@ function getPerRequestOptions(params, queueTimeout) {
2122
2249
  metadata: raw?.metadata ?? {},
2123
2250
  skipBudgetCheck: raw?._skipBudgetCheck ?? false,
2124
2251
  scope: raw?.scope,
2125
- callTimeout: raw?.callTimeout
2252
+ callTimeout: raw?.callTimeout,
2253
+ dedupKey: raw?.dedupKey
2126
2254
  };
2127
2255
  }
2128
2256
  function extractTokenUsage(usage) {
@@ -2138,7 +2266,7 @@ function createMiddleware(pipeline, queueTimeout) {
2138
2266
  // wrapGenerate — non-streaming
2139
2267
  // -----------------------------------------------------------------------
2140
2268
  async wrapGenerate({ doGenerate, params, model }) {
2141
- const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
2269
+ const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
2142
2270
  const modelId = model.modelId;
2143
2271
  const provider = model.provider;
2144
2272
  const startMs = Date.now();
@@ -2154,6 +2282,7 @@ function createMiddleware(pipeline, queueTimeout) {
2154
2282
  skipBudgetCheck,
2155
2283
  ...scope !== void 0 && { scope },
2156
2284
  ...callTimeout !== void 0 && { callTimeout },
2285
+ ...dedupKey !== void 0 && { dedupKey },
2157
2286
  ...params.abortSignal !== void 0 && { signal: params.abortSignal }
2158
2287
  }
2159
2288
  );
@@ -2167,7 +2296,7 @@ function createMiddleware(pipeline, queueTimeout) {
2167
2296
  // wrapStream — streaming
2168
2297
  // -----------------------------------------------------------------------
2169
2298
  async wrapStream({ doStream, params, model }) {
2170
- const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
2299
+ const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
2171
2300
  const modelId = model.modelId;
2172
2301
  const provider = model.provider;
2173
2302
  const startMs = Date.now();
@@ -2183,6 +2312,7 @@ function createMiddleware(pipeline, queueTimeout) {
2183
2312
  skipBudgetCheck,
2184
2313
  ...scope !== void 0 && { scope },
2185
2314
  ...callTimeout !== void 0 && { callTimeout },
2315
+ ...dedupKey !== void 0 && { dedupKey },
2186
2316
  ...params.abortSignal !== void 0 && { signal: params.abortSignal }
2187
2317
  }
2188
2318
  );
@@ -2508,6 +2638,9 @@ function createRateLimiter(config = {}) {
2508
2638
  getCostReport() {
2509
2639
  return pipeline.getCostReport();
2510
2640
  },
2641
+ getCostForecast() {
2642
+ return pipeline.getCostForecast();
2643
+ },
2511
2644
  getStatus() {
2512
2645
  return pipeline.getStatus();
2513
2646
  },
@@ -2528,6 +2661,41 @@ function createRateLimiter(config = {}) {
2528
2661
  },
2529
2662
  warmUp() {
2530
2663
  return pipeline.warmUp();
2664
+ },
2665
+ reset() {
2666
+ pipeline.reset();
2667
+ }
2668
+ };
2669
+ }
2670
+
2671
+ // src/adapters/model-pool.ts
2672
+ function createModelPool(models, options) {
2673
+ if (models.length === 0) {
2674
+ throw new Error("createModelPool: at least one model is required");
2675
+ }
2676
+ const strategy = options?.strategy ?? "round-robin";
2677
+ let index = 0;
2678
+ function pick() {
2679
+ if (strategy === "random") {
2680
+ return models[Math.floor(Math.random() * models.length)];
2681
+ }
2682
+ const model = models[index % models.length];
2683
+ index = (index + 1) % models.length;
2684
+ return model;
2685
+ }
2686
+ const primary = models[0];
2687
+ return {
2688
+ get modelId() {
2689
+ return primary.modelId;
2690
+ },
2691
+ get provider() {
2692
+ return primary.provider;
2693
+ },
2694
+ doGenerate(params) {
2695
+ return pick().doGenerate(params);
2696
+ },
2697
+ doStream(params) {
2698
+ return pick().doStream(params);
2531
2699
  }
2532
2700
  };
2533
2701
  }
@@ -2546,6 +2714,7 @@ exports.RateLimitExceededError = RateLimitExceededError;
2546
2714
  exports.RateLimiterError = RateLimiterError;
2547
2715
  exports.RetryExhaustedError = RetryExhaustedError;
2548
2716
  exports.ShutdownError = ShutdownError;
2717
+ exports.createModelPool = createModelPool;
2549
2718
  exports.createRateLimiter = createRateLimiter;
2550
2719
  exports.isKnownModel = isKnownModel;
2551
2720
  exports.rateLimited = rateLimited;