ai-sdk-rate-limiter 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -3,6 +3,8 @@
3
3
  Smart rate limiting, queuing, and cost tracking for AI API calls. Works across providers. Zero required dependencies.
4
4
 
5
5
  [![npm](https://img.shields.io/npm/v/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
6
+ [![CI](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml/badge.svg)](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml)
7
+ [![npm downloads](https://img.shields.io/npm/dm/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
6
8
 
7
9
  ```
8
10
  npm install ai-sdk-rate-limiter
@@ -83,6 +85,8 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
83
85
 
84
86
  **Fallback chains** — `fallback` now accepts an array of models. On `BudgetExceededError`, the chain is walked in order until one succeeds.
85
87
 
88
+ **Express / Hono middleware** — `createRateLimiterMiddleware()` (from `ai-sdk-rate-limiter/middleware`) attaches `req.rateLimiter` to every request and converts rate-limiter errors to proper HTTP responses at the middleware layer — no per-route boilerplate.
89
+
86
90
  **OpenTelemetry** — Drop-in OTel plugin that emits GenAI-spec spans for every request. Works with any OTel-compatible tracer.
87
91
 
88
92
  **Testing utilities** — `createTestLimiter()` records every completed call so you can assert on model usage, token counts, and costs in unit tests.
@@ -109,6 +113,7 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
109
113
  - [Graceful shutdown](#graceful-shutdown)
110
114
  - [Prometheus metrics](#prometheus-metrics)
111
115
  - [StatsD metrics](#statsd-metrics)
116
+ - [Express / Hono middleware](#express--hono-middleware)
112
117
  - [Events](#events)
113
118
  - [Backpressure](#backpressure)
114
119
  - [Error handling](#error-handling)
@@ -552,6 +557,27 @@ Costs are based on **actual token counts** from API responses — not estimates.
552
557
 
553
558
  `byScope` is populated automatically when requests carry a `scope` (either set on `limiter.wrap()` or via `providerOptions.rateLimiter.scope`). Unscoped requests don't appear in `byScope`.
554
559
 
560
+ ### Cost forecasting
561
+
562
+ `getCostForecast()` projects your end-of-period spend based on the current hourly rate. Useful for alerting before a budget cap is hit:
563
+
564
+ ```typescript
565
+ const forecast = limiter.getCostForecast()
566
+
567
+ console.log(forecast)
568
+ // {
569
+ // hour: { spentUsd: 1.20, projectedUsd: 1.20, ratePerHourUsd: 1.20 },
570
+ // day: { spentUsd: 3.50, projectedUsd: 28.80, ratePerHourUsd: 1.20 },
571
+ // month: { spentUsd: 8.10, projectedUsd: 864, ratePerHourUsd: 1.20 },
572
+ // }
573
+
574
+ if (forecast.day.projectedUsd > 40) {
575
+ console.warn(`Heads up — on track to spend $${forecast.day.projectedUsd.toFixed(2)} today`)
576
+ }
577
+ ```
578
+
579
+ `projectedUsd` = current hourly rate × hours in the period (1 for `hour`, 24 for `day`, 720 for `month`, i.e. 30 days). The hourly rate is the total spend over the **last 60 minutes**, so the projection responds quickly to usage spikes.
580
+
555
581
  ---
556
582
 
557
583
  ## Budget fallback routing
@@ -858,6 +884,121 @@ const client: StatsDClient = {
858
884
 
859
885
  ---
860
886
 
887
+ ## Express / Hono middleware
888
+
889
+ The `ai-sdk-rate-limiter/middleware` entry point eliminates per-route boilerplate. Scope extraction, priority assignment, and rate-limiter error handling all move to the middleware layer — route handlers just pass `req.rateLimiter` through.
890
+
891
+ ### Express
892
+
893
+ ```typescript
894
+ import { createRateLimiterMiddleware } from 'ai-sdk-rate-limiter/middleware'
895
+
896
+ const { middleware, errorHandler } = createRateLimiterMiddleware(limiter, {
897
+ // Extract scope from the request — stored in req.rateLimiter.scope
898
+ scope: (req) => {
899
+ const plan = req.headers['x-user-plan'] ?? 'free'
900
+ const id = req.headers['x-user-id']
901
+ return id ? `user:${plan}:${id}` : undefined
902
+ },
903
+
904
+ // Derive queue priority per-request
905
+ priority: (req) => req.headers['x-user-plan'] === 'pro' ? 'normal' : 'low',
906
+
907
+ // Add X-RateLimit-* informational headers to every response
908
+ injectHeaders: 'gpt-4o-mini',
909
+ })
910
+
911
+ app.use(middleware) // BEFORE routes
912
+
913
+ app.post('/chat', async (req, res) => {
914
+ const { text } = await generateText({
915
+ model,
916
+ prompt: req.body.message,
917
+ // req.rateLimiter already has scope + priority — just pass it through
918
+ providerOptions: { rateLimiter: req.rateLimiter },
919
+ })
920
+ res.json({ text })
921
+ })
922
+
923
+ app.use(errorHandler) // AFTER routes
924
+ ```
925
+
926
+ The `errorHandler` converts every `RateLimiterError` to a typed HTTP response automatically — no try/catch needed in route handlers:
927
+
928
+ | Error | HTTP status | `code` |
929
+ |---|---|---|
930
+ | `QueueTimeoutError` | 503 | `QUEUE_TIMEOUT` |
931
+ | `QueueFullError` | 503 | `QUEUE_FULL` |
932
+ | `CircuitOpenError` | 503 | `CIRCUIT_OPEN` |
933
+ | `ShutdownError` | 503 | `SHUTDOWN` |
934
+ | `BudgetExceededError` | 402 | `BUDGET_EXCEEDED` |
935
+ | `RateLimiterError` (generic) | 429 | `RATE_LIMITED` |
936
+
937
+ Non-rate-limiter errors are passed to the next error handler unchanged.
938
+
939
+ ### Hono
940
+
941
+ ```typescript
942
+ import { createHonoMiddleware } from 'ai-sdk-rate-limiter/middleware'
943
+
944
+ app.use(createHonoMiddleware(limiter, {
945
+ scope: (c) => c.req.header('x-user-id'),
946
+ priority: (c) => c.req.header('x-plan') === 'pro' ? 'normal' : 'low',
947
+ }))
948
+
949
+ app.post('/chat', async (c) => {
950
+ const { text } = await generateText({
951
+ model,
952
+ prompt: await c.req.text(),
953
+ providerOptions: { rateLimiter: c.var.rateLimiter },
954
+ })
955
+ return c.json({ text })
956
+ })
957
+ ```
958
+
959
+ `createHonoMiddleware` wraps the `next()` call in a try/catch, so any `RateLimiterError` thrown inside a route handler is caught and returned as a JSON response automatically.
960
+
961
+ ### Standalone error handler
962
+
963
+ If you only need error handling without scope injection:
964
+
965
+ ```typescript
966
+ import { createRateLimiterErrorHandler } from 'ai-sdk-rate-limiter/middleware'
967
+
968
+ app.use(createRateLimiterErrorHandler({
969
+ includeDetails: false, // omit retryAfter, period, limitUsd from response body
970
+ }))
971
+ ```
972
+
973
+ ### Custom framework (Fastify, etc.)
974
+
975
+ `mapErrorToResponse` is exported for frameworks that don't use the `(req, res, next)` convention:
976
+
977
+ ```typescript
978
+ import { mapErrorToResponse } from 'ai-sdk-rate-limiter/middleware'
979
+ import { RateLimiterError } from 'ai-sdk-rate-limiter'
980
+
981
+ // Fastify error handler (registered via setErrorHandler)
982
+ fastify.setErrorHandler((err, request, reply) => {
983
+ if (err instanceof RateLimiterError) {
984
+ const { status, body } = mapErrorToResponse(err)
985
+ return reply.status(status).send(body)
986
+ }
987
+ reply.send(err)
988
+ })
989
+ ```
990
+
991
+ ### `req.rateLimiter` TypeScript type
992
+
993
+ The middleware augments `http.IncomingMessage` so `req.rateLimiter` is typed in Express and Fastify without any additional setup:
994
+
995
+ ```typescript
996
+ import type { RateLimiterRequestContext } from 'ai-sdk-rate-limiter/middleware'
997
+ // req.rateLimiter is automatically typed as RateLimiterRequestContext | undefined
998
+ ```
999
+
1000
+ ---
1001
+
861
1002
  ## Events
862
1003
 
863
1004
  All events are typed. Register handlers at creation time or dynamically:
@@ -1336,6 +1477,57 @@ class MyStore implements RateLimitStore {
1336
1477
  const limiter = createRateLimiter({ store: new MyStore() })
1337
1478
  ```
1338
1479
 
1480
+ ### Load balancing across API keys
1481
+
1482
+ `createModelPool()` distributes requests round-robin across multiple model instances — useful when you have more than one API key:
1483
+
1484
+ ```typescript
1485
+ import { createRateLimiter, createModelPool } from 'ai-sdk-rate-limiter'
1486
+ import { createOpenAI } from '@ai-sdk/openai'
1487
+
1488
+ // Two API keys, each with their own limiter tracking separate RPM limits
1489
+ const limiter1 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
1490
+ const limiter2 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
1491
+
1492
+ const openai1 = createOpenAI({ apiKey: process.env.OPENAI_KEY_1 })
1493
+ const openai2 = createOpenAI({ apiKey: process.env.OPENAI_KEY_2 })
1494
+
1495
+ const pool = createModelPool([
1496
+ limiter1.wrap(openai1('gpt-4o')),
1497
+ limiter2.wrap(openai2('gpt-4o')),
1498
+ ])
1499
+
1500
+ // Use exactly like a regular model — calls alternate between the two keys
1501
+ const { text } = await generateText({ model: pool, prompt: 'Hello!' })
1502
+ ```
1503
+
1504
+ Pass `{ strategy: 'random' }` for random selection instead of round-robin.
1505
+
1506
+ ### Request deduplication
1507
+
1508
+ When multiple concurrent requests carry the same `dedupKey`, only one API call is made and all callers receive the same result. Useful for FAQ-style workloads where many users ask the same question simultaneously:
1509
+
1510
+ ```typescript
1511
+ const model = limiter.wrap(openai('gpt-4o'))
1512
+
1513
+ // Server handler — two simultaneous identical requests share one API call
1514
+ async function handleRequest(questionId: string) {
1515
+ const { text } = await generateText({
1516
+ model,
1517
+ prompt: questions[questionId],
1518
+ providerOptions: {
1519
+ rateLimiter: { dedupKey: `faq:${questionId}` },
1520
+ },
1521
+ })
1522
+ return text
1523
+ }
1524
+
1525
+ // If 50 users hit the same FAQ item at the same time → 1 API call, not 50
1526
+ const results = await Promise.all(users.map(() => handleRequest('faq-42')))
1527
+ ```
1528
+
1529
+ The dedup entry is removed once the request completes (success or error), so subsequent requests always make a fresh call.
1530
+
1339
1531
  ---
1340
1532
 
1341
1533
  ## How it works
@@ -1358,6 +1550,42 @@ const limiter = createRateLimiter({ store: new MyStore() })
1358
1550
 
1359
1551
  ---
1360
1552
 
1553
+ ## Debug mode
1554
+
1555
+ Set `debug: true` to enable structured console logging for every rate-limit decision, queue entry/exit, slot acquisition, circuit breaker transition, and completed call cost:
1556
+
1557
+ ```typescript
1558
+ const limiter = createRateLimiter({ debug: true })
1559
+ ```
1560
+
1561
+ Sample output:
1562
+
1563
+ ```
1564
+ [ai-sdk-rate-limiter] gpt-4o: execute (provider="openai" priority="normal")
1565
+ [ai-sdk-rate-limiter] gpt-4o: queuing (queueDepth=3 estimatedWaitMs=1200 priority="normal")
1566
+ [ai-sdk-rate-limiter] gpt-4o: dequeued (waitedMs=1187 priority="normal")
1567
+ [ai-sdk-rate-limiter] gpt-4o: completed (tokens=342+87 costUsd=0.000021 latencyMs=1343 streaming=false)
1568
+ ```
1569
+
1570
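+ Debug logging is effectively free when disabled — the logger returns immediately, so no log line is formatted, `JSON.stringify` is never called, and nothing is written to the console. The only remaining cost is building the small details object at each call site.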
1571
+
1572
+ ---
1573
+
1574
+ ## Config validation
1575
+
1576
+ `createRateLimiter()` validates your configuration at construction time. If it spots a likely misconfiguration, it logs a `console.warn` (it never throws). The checks currently covered:
1577
+
1578
+ | Issue | Warning |
1579
+ |---|---|
1580
+ | `cost.store` is set | Reminds you to call `await limiter.warmUp()` at startup (shown whenever a cost store is configured) |
1581
+ | `circuit.failureThreshold < 3` | Too sensitive — risks false trips on transient errors |
1582
+ | `retry.retryOn` excludes 429 | Rate-limit errors won't be retried |
1583
+ | `queue.timeout < 3000ms` | Requests will time out before they can be served |
1584
+ | `cost.budget` set without `onExceeded` | Silent default is `'throw'` — may want `'queue'` or `'fallback'` |
1585
+ | `cost.onExceeded: 'fallback'` | Reminds you to pass a `fallback` model to `limiter.wrap()` |
1586
+
1587
+ ---
1588
+
1361
1589
  ## Comparison
1362
1590
 
1363
1591
  | | ai-sdk-rate-limiter | bottleneck | p-limit | SDK built-in retry | LangChain |
@@ -1382,6 +1610,7 @@ const limiter = createRateLimiter({ store: new MyStore() })
1382
1610
  | Backoff propagation | yes | no | no | no | no |
1383
1611
  | Prometheus metrics | yes | no | no | no | no |
1384
1612
  | StatsD metrics | yes | no | no | no | no |
1613
+ | Express/Hono middleware | yes | no | no | no | no |
1385
1614
  | OpenTelemetry | yes | no | no | no | partial |
1386
1615
  | Testing utilities | yes | no | no | no | no |
1387
1616
  | CLI audit | yes | no | no | no | no |
@@ -1425,6 +1654,14 @@ import type {
1425
1654
  } from 'ai-sdk-rate-limiter/redis'
1426
1655
 
1427
1656
  import type { StatsDClient } from 'ai-sdk-rate-limiter/statsd'
1657
+
1658
+ import type {
1659
+ RateLimiterRequestContext,
1660
+ RateLimiterMiddlewareOptions,
1661
+ ErrorHandlerOptions,
1662
+ HonoMiddlewareOptions,
1663
+ HonoContext,
1664
+ } from 'ai-sdk-rate-limiter/middleware'
1428
1665
  ```
1429
1666
 
1430
1667
  ---
@@ -1442,6 +1679,23 @@ Four runnable examples are included, each with its own README:
1442
1679
 
1443
1680
  ---
1444
1681
 
1682
+ ## Bundle sizes
1683
+
1684
+ Each entry point is independently tree-shakeable. Importing `ai-sdk-rate-limiter` never pulls in Redis, Prometheus, OTel, or StatsD.
1685
+
1686
+ | Entry point | Size (minified) | Size (gzip) |
1687
+ |---|---|---|
1688
+ | `ai-sdk-rate-limiter` | ~80 KB | ~22 KB |
1689
+ | `ai-sdk-rate-limiter/redis` | ~12 KB | ~4 KB |
1690
+ | `ai-sdk-rate-limiter/middleware` | ~8 KB | ~2.5 KB |
1691
+ | `ai-sdk-rate-limiter/prometheus` | ~8 KB | ~2.5 KB |
1692
+ | `ai-sdk-rate-limiter/otel` | ~4 KB | ~1.5 KB |
1693
+ | `ai-sdk-rate-limiter/statsd` | ~4 KB | ~1.2 KB |
1694
+
1695
+ The core package is self-contained. Optional peer deps (`ioredis`, `@opentelemetry/api`) are only loaded when you import the corresponding entry point.
1696
+
1697
+ ---
1698
+
1445
1699
  ## Requirements
1446
1700
 
1447
1701
  - Node.js 18+ / Bun / Deno
package/dist/index.cjs CHANGED
@@ -668,6 +668,30 @@ var CostTracker = class {
668
668
  estimateCost(inputTokens, outputTokens, inputPricePerMillion, outputPricePerMillion) {
669
669
  return inputTokens / 1e6 * inputPricePerMillion + outputTokens / 1e6 * outputPricePerMillion;
670
670
  }
671
+ getForecast() {
672
+ const now = Date.now();
673
+ this.evict(now);
674
+ const hourlyRate = this.entries.filter((e) => e.timestamp > now - HOUR_MS).reduce((s, e) => s + e.costUsd, 0);
675
+ const daySpent = this.entries.filter((e) => e.timestamp > now - DAY_MS2).reduce((s, e) => s + e.costUsd, 0);
676
+ const monthSpent = this.entries.filter((e) => e.timestamp > now - MONTH_MS).reduce((s, e) => s + e.costUsd, 0);
677
+ return {
678
+ hour: {
679
+ spentUsd: hourlyRate,
680
+ projectedUsd: hourlyRate,
681
+ ratePerHourUsd: hourlyRate
682
+ },
683
+ day: {
684
+ spentUsd: daySpent,
685
+ projectedUsd: hourlyRate * 24,
686
+ ratePerHourUsd: hourlyRate
687
+ },
688
+ month: {
689
+ spentUsd: monthSpent,
690
+ projectedUsd: hourlyRate * 24 * 30,
691
+ ratePerHourUsd: hourlyRate
692
+ }
693
+ };
694
+ }
671
695
  getReport() {
672
696
  const now = Date.now();
673
697
  this.evict(now);
@@ -1678,6 +1702,23 @@ var CircuitBreaker = class {
1678
1702
  }
1679
1703
  };
1680
1704
 
1705
+ // src/core/debug-logger.ts
1706
+ var PREFIX = "[ai-sdk-rate-limiter]";
1707
+ var DebugLogger = class {
1708
+ constructor(enabled) {
1709
+ this.enabled = enabled;
1710
+ }
1711
+ log(model, message, details) {
1712
+ if (!this.enabled) return;
1713
+ if (details && Object.keys(details).length > 0) {
1714
+ const parts = Object.entries(details).map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(" ");
1715
+ console.log(`${PREFIX} ${model}: ${message} (${parts})`);
1716
+ } else {
1717
+ console.log(`${PREFIX} ${model}: ${message}`);
1718
+ }
1719
+ }
1720
+ };
1721
+
1681
1722
  // src/core/pipeline.ts
1682
1723
  function resolveRetryConfig(config) {
1683
1724
  const r = config.retry ?? {};
@@ -1717,9 +1758,12 @@ var Pipeline = class {
1717
1758
  this.circuits = /* @__PURE__ */ new Map();
1718
1759
  /** Limits detected from provider response headers (lower priority than user config) */
1719
1760
  this.detectedLimits = /* @__PURE__ */ new Map();
1761
+ /** In-flight promises indexed by dedup key, shared across identical concurrent requests */
1762
+ this.dedupMap = /* @__PURE__ */ new Map();
1720
1763
  /** Set to true after shutdown() is called */
1721
1764
  this.shutdownRequested = false;
1722
1765
  this.config = config;
1766
+ this.log = new DebugLogger(config.debug === true);
1723
1767
  this.engine = new RateLimitEngine({
1724
1768
  maxQueueSize: config.queue?.maxSize ?? 500,
1725
1769
  ...config.store !== void 0 && { store: config.store }
@@ -1761,6 +1805,20 @@ var Pipeline = class {
1761
1805
  * recordUsage() once they have actual token counts from the API response.
1762
1806
  */
1763
1807
  async execute(modelId, provider, prompt, fn, opts) {
1808
+ this.log.log(modelId, "execute", { provider, priority: opts.priority, ...opts.scope !== void 0 && { scope: opts.scope } });
1809
+ if (opts.dedupKey !== void 0) {
1810
+ const existing = this.dedupMap.get(opts.dedupKey);
1811
+ if (existing !== void 0) {
1812
+ this.log.log(modelId, "dedup hit", { dedupKey: opts.dedupKey });
1813
+ return existing;
1814
+ }
1815
+ const { dedupKey, ...optsWithoutDedup } = opts;
1816
+ const promise = this.execute(modelId, provider, prompt, fn, optsWithoutDedup).finally(() => {
1817
+ this.dedupMap.delete(dedupKey);
1818
+ });
1819
+ this.dedupMap.set(dedupKey, promise);
1820
+ return promise;
1821
+ }
1764
1822
  if (this.shutdownRequested) {
1765
1823
  this.emitter.emit("dropped", {
1766
1824
  model: modelId,
@@ -1850,10 +1908,12 @@ var Pipeline = class {
1850
1908
  timeoutMs: opts.timeoutMs,
1851
1909
  ...opts.signal !== void 0 && { signal: opts.signal },
1852
1910
  onQueued: (queueDepth, estimatedWaitMs) => {
1911
+ this.log.log(modelId, "queuing", { queueDepth, estimatedWaitMs, priority: opts.priority });
1853
1912
  this.emitter.emit("queued", { model: modelId, provider, priority: opts.priority, queueDepth, estimatedWaitMs });
1854
1913
  this.emitter.emit("rateLimited", { source: "local", model: modelId, provider, limitType: "rpm", resetAt: Date.now() + estimatedWaitMs });
1855
1914
  },
1856
1915
  onDequeued: (waitedMs) => {
1916
+ this.log.log(modelId, "dequeued", { waitedMs, priority: opts.priority });
1857
1917
  this.emitter.emit("dequeued", { model: modelId, provider, waitedMs, priority: opts.priority });
1858
1918
  }
1859
1919
  });
@@ -1905,7 +1965,10 @@ var Pipeline = class {
1905
1965
  });
1906
1966
  if (circuit) {
1907
1967
  const justClosed = circuit.recordSuccess();
1908
- if (justClosed) this.emitter.emit("circuitClosed", { model: modelId, provider });
1968
+ if (justClosed) {
1969
+ this.log.log(modelId, "circuit closed \u2014 upstream recovered");
1970
+ this.emitter.emit("circuitClosed", { model: modelId, provider });
1971
+ }
1909
1972
  }
1910
1973
  return result;
1911
1974
  } catch (error) {
@@ -1915,6 +1978,7 @@ var Pipeline = class {
1915
1978
  if (shouldTrip) {
1916
1979
  const justOpened = circuit.recordFailure();
1917
1980
  if (justOpened) {
1981
+ this.log.log(modelId, "circuit OPEN", { status, cooldownMs: this.config.circuit?.cooldownMs ?? 6e4 });
1918
1982
  this.emitter.emit("circuitOpen", {
1919
1983
  model: modelId,
1920
1984
  provider,
@@ -1945,6 +2009,13 @@ var Pipeline = class {
1945
2009
  limits.outputPricePerMillion,
1946
2010
  scope
1947
2011
  );
2012
+ this.log.log(modelId, "completed", {
2013
+ tokens: `${usage.inputTokens}+${usage.outputTokens}`,
2014
+ costUsd: costUsd.toFixed(6),
2015
+ latencyMs,
2016
+ streaming,
2017
+ ...scope !== void 0 && { scope }
2018
+ });
1948
2019
  this.emitter.emit("completed", {
1949
2020
  model: modelId,
1950
2021
  provider,
@@ -1962,6 +2033,9 @@ var Pipeline = class {
1962
2033
  getCostReport() {
1963
2034
  return this.costTracker.getReport();
1964
2035
  }
2036
+ getCostForecast() {
2037
+ return this.costTracker.getForecast();
2038
+ }
1965
2039
  getStatus() {
1966
2040
  const models = [];
1967
2041
  let totalQueueDepth = 0;
@@ -2090,7 +2164,8 @@ function getPerRequestOptions(params, queueTimeout) {
2090
2164
  metadata: raw?.metadata ?? {},
2091
2165
  skipBudgetCheck: raw?._skipBudgetCheck ?? false,
2092
2166
  scope: raw?.scope,
2093
- callTimeout: raw?.callTimeout
2167
+ callTimeout: raw?.callTimeout,
2168
+ dedupKey: raw?.dedupKey
2094
2169
  };
2095
2170
  }
2096
2171
  function extractTokenUsage(usage) {
@@ -2106,7 +2181,7 @@ function createMiddleware(pipeline, queueTimeout) {
2106
2181
  // wrapGenerate — non-streaming
2107
2182
  // -----------------------------------------------------------------------
2108
2183
  async wrapGenerate({ doGenerate, params, model }) {
2109
- const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
2184
+ const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
2110
2185
  const modelId = model.modelId;
2111
2186
  const provider = model.provider;
2112
2187
  const startMs = Date.now();
@@ -2122,6 +2197,7 @@ function createMiddleware(pipeline, queueTimeout) {
2122
2197
  skipBudgetCheck,
2123
2198
  ...scope !== void 0 && { scope },
2124
2199
  ...callTimeout !== void 0 && { callTimeout },
2200
+ ...dedupKey !== void 0 && { dedupKey },
2125
2201
  ...params.abortSignal !== void 0 && { signal: params.abortSignal }
2126
2202
  }
2127
2203
  );
@@ -2135,7 +2211,7 @@ function createMiddleware(pipeline, queueTimeout) {
2135
2211
  // wrapStream — streaming
2136
2212
  // -----------------------------------------------------------------------
2137
2213
  async wrapStream({ doStream, params, model }) {
2138
- const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
2214
+ const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
2139
2215
  const modelId = model.modelId;
2140
2216
  const provider = model.provider;
2141
2217
  const startMs = Date.now();
@@ -2151,6 +2227,7 @@ function createMiddleware(pipeline, queueTimeout) {
2151
2227
  skipBudgetCheck,
2152
2228
  ...scope !== void 0 && { scope },
2153
2229
  ...callTimeout !== void 0 && { callTimeout },
2230
+ ...dedupKey !== void 0 && { dedupKey },
2154
2231
  ...params.abortSignal !== void 0 && { signal: params.abortSignal }
2155
2232
  }
2156
2233
  );
@@ -2416,8 +2493,53 @@ function rateLimited(client, options = {}) {
2416
2493
  });
2417
2494
  }
2418
2495
 
2496
+ // src/core/config-validator.ts
2497
+ var PREFIX2 = "\x1B[33m\u26A0 ai-sdk-rate-limiter\x1B[0m";
2498
+ var RESET = "\x1B[0m";
2499
+ function validateConfig(config) {
2500
+ const warnings = [];
2501
+ if (config.cost?.store !== void 0) {
2502
+ warnings.push(
2503
+ "cost.store is configured \u2014 call `await limiter.warmUp()` at startup.\n Without it, budget caps won't account for spend from previous process runs."
2504
+ );
2505
+ }
2506
+ const threshold = config.circuit?.failureThreshold;
2507
+ if (threshold !== void 0 && threshold < 3) {
2508
+ warnings.push(
2509
+ `circuit.failureThreshold is ${threshold} \u2014 very low. The circuit will open after nearly every error. Consider a value of 5 or higher for typical production workloads.`
2510
+ );
2511
+ }
2512
+ if (config.retry?.retryOn !== void 0 && !config.retry.retryOn.includes(429)) {
2513
+ warnings.push(
2514
+ "retry.retryOn does not include 429. Rate limit errors from the API will not be retried. Add 429 to retry.retryOn, or remove the override to use the default."
2515
+ );
2516
+ }
2517
+ const queueTimeout = config.queue?.timeout;
2518
+ if (queueTimeout !== void 0 && queueTimeout < 3e3) {
2519
+ warnings.push(
2520
+ `queue.timeout is ${queueTimeout}ms \u2014 less than 3 seconds. Requests may time out before the rate limit window resets (typically 60s). Consider 30_000ms (30s) or higher.`
2521
+ );
2522
+ }
2523
+ if (config.cost?.onExceeded === "fallback") {
2524
+ warnings.push(
2525
+ "cost.onExceeded is 'fallback' but fallback models are configured per-model in limiter.wrap(model, { fallback: cheaperModel }). If no fallback is set on a wrapped model, BudgetExceededError will still be thrown."
2526
+ );
2527
+ }
2528
+ if (config.cost?.budget !== void 0 && config.cost.onExceeded === void 0) {
2529
+ warnings.push(
2530
+ "cost.budget is set but cost.onExceeded is not. Defaulting to 'throw' \u2014 requests will throw BudgetExceededError when the cap is hit. Set onExceeded: 'queue' or 'fallback' to change this behavior."
2531
+ );
2532
+ }
2533
+ for (const warning of warnings) {
2534
+ const formatted = warning.replace(/\n/g, `
2535
+ `);
2536
+ console.warn(`${PREFIX2}: ${formatted}${RESET}`);
2537
+ }
2538
+ }
2539
+
2419
2540
  // src/create-rate-limiter.ts
2420
2541
  function createRateLimiter(config = {}) {
2542
+ validateConfig(config);
2421
2543
  const pipeline = new Pipeline(config);
2422
2544
  const queueTimeout = config.queue?.timeout ?? 3e4;
2423
2545
  const middleware = createMiddleware(pipeline, queueTimeout);
@@ -2431,6 +2553,9 @@ function createRateLimiter(config = {}) {
2431
2553
  getCostReport() {
2432
2554
  return pipeline.getCostReport();
2433
2555
  },
2556
+ getCostForecast() {
2557
+ return pipeline.getCostForecast();
2558
+ },
2434
2559
  getStatus() {
2435
2560
  return pipeline.getStatus();
2436
2561
  },
@@ -2455,6 +2580,38 @@ function createRateLimiter(config = {}) {
2455
2580
  };
2456
2581
  }
2457
2582
 
2583
+ // src/adapters/model-pool.ts
2584
+ function createModelPool(models, options) {
2585
+ if (models.length === 0) {
2586
+ throw new Error("createModelPool: at least one model is required");
2587
+ }
2588
+ const strategy = options?.strategy ?? "round-robin";
2589
+ let index = 0;
2590
+ function pick() {
2591
+ if (strategy === "random") {
2592
+ return models[Math.floor(Math.random() * models.length)];
2593
+ }
2594
+ const model = models[index % models.length];
2595
+ index = (index + 1) % models.length;
2596
+ return model;
2597
+ }
2598
+ const primary = models[0];
2599
+ return {
2600
+ get modelId() {
2601
+ return primary.modelId;
2602
+ },
2603
+ get provider() {
2604
+ return primary.provider;
2605
+ },
2606
+ doGenerate(params) {
2607
+ return pick().doGenerate(params);
2608
+ },
2609
+ doStream(params) {
2610
+ return pick().doStream(params);
2611
+ }
2612
+ };
2613
+ }
2614
+
2458
2615
  exports.ANTHROPIC_MODELS = ANTHROPIC_MODELS;
2459
2616
  exports.BudgetExceededError = BudgetExceededError;
2460
2617
  exports.COHERE_MODELS = COHERE_MODELS;
@@ -2469,6 +2626,7 @@ exports.RateLimitExceededError = RateLimitExceededError;
2469
2626
  exports.RateLimiterError = RateLimiterError;
2470
2627
  exports.RetryExhaustedError = RetryExhaustedError;
2471
2628
  exports.ShutdownError = ShutdownError;
2629
+ exports.createModelPool = createModelPool;
2472
2630
  exports.createRateLimiter = createRateLimiter;
2473
2631
  exports.isKnownModel = isKnownModel;
2474
2632
  exports.rateLimited = rateLimited;