ai-sdk-rate-limiter 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -3,6 +3,8 @@
3
3
  Smart rate limiting, queuing, and cost tracking for AI API calls. Works across providers. Zero required dependencies.
4
4
 
5
5
  [![npm](https://img.shields.io/npm/v/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
6
+ [![CI](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml/badge.svg)](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml)
7
+ [![npm downloads](https://img.shields.io/npm/dm/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
6
8
 
7
9
  ```
8
10
  npm install ai-sdk-rate-limiter
@@ -83,6 +85,8 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
83
85
 
84
86
  **Fallback chains** — `fallback` now accepts an array of models. On `BudgetExceededError`, the chain is walked in order until one succeeds.
85
87
 
88
+ **Express / Hono middleware** — `createRateLimiterMiddleware()` (from `ai-sdk-rate-limiter/middleware`) attaches `req.rateLimiter` to every request and converts rate-limiter errors to proper HTTP responses at the middleware layer — no per-route boilerplate.
89
+
86
90
  **OpenTelemetry** — Drop-in OTel plugin that emits GenAI-spec spans for every request. Works with any OTel-compatible tracer.
87
91
 
88
92
  **Testing utilities** — `createTestLimiter()` records every completed call so you can assert on model usage, token counts, and costs in unit tests.
@@ -109,6 +113,7 @@ The wrapped model is a drop-in replacement. Every Vercel AI SDK feature — stre
109
113
  - [Graceful shutdown](#graceful-shutdown)
110
114
  - [Prometheus metrics](#prometheus-metrics)
111
115
  - [StatsD metrics](#statsD-metrics)
116
+ - [Express / Hono middleware](#express--hono-middleware)
112
117
  - [Events](#events)
113
118
  - [Backpressure](#backpressure)
114
119
  - [Error handling](#error-handling)
@@ -858,6 +863,121 @@ const client: StatsDClient = {
858
863
 
859
864
  ---
860
865
 
866
+ ## Express / Hono middleware
867
+
868
+ The `ai-sdk-rate-limiter/middleware` entry point eliminates per-route boilerplate. Scope extraction, priority assignment, and rate-limiter error handling all move to the middleware layer — route handlers just pass `req.rateLimiter` through.
869
+
870
+ ### Express
871
+
872
+ ```typescript
873
+ import { createRateLimiterMiddleware } from 'ai-sdk-rate-limiter/middleware'
874
+
875
+ const { middleware, errorHandler } = createRateLimiterMiddleware(limiter, {
876
+ // Extract scope from the request — stored in req.rateLimiter.scope
877
+ scope: (req) => {
878
+ const plan = req.headers['x-user-plan'] ?? 'free'
879
+ const id = req.headers['x-user-id']
880
+ return id ? `user:${plan}:${id}` : undefined
881
+ },
882
+
883
+ // Derive queue priority per-request
884
+ priority: (req) => req.headers['x-user-plan'] === 'pro' ? 'normal' : 'low',
885
+
886
+ // Add X-RateLimit-* informational headers to every response
887
+ injectHeaders: 'gpt-4o-mini',
888
+ })
889
+
890
+ app.use(middleware) // BEFORE routes
891
+
892
+ app.post('/chat', async (req, res) => {
893
+ const { text } = await generateText({
894
+ model,
895
+ prompt: req.body.message,
896
+ // req.rateLimiter already has scope + priority — just pass it through
897
+ providerOptions: { rateLimiter: req.rateLimiter },
898
+ })
899
+ res.json({ text })
900
+ })
901
+
902
+ app.use(errorHandler) // AFTER routes
903
+ ```
904
+
905
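+ The `errorHandler` converts every `RateLimiterError` that reaches it to a typed HTTP response automatically — no try/catch needed in route handlers on Express 5, which forwards rejected promises from `async` handlers to error middleware (on Express 4, forward the error yourself with `next(err)` or an async wrapper):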
906
+
907
+ | Error | HTTP status | `code` |
908
+ |---|---|---|
909
+ | `QueueTimeoutError` | 503 | `QUEUE_TIMEOUT` |
910
+ | `QueueFullError` | 503 | `QUEUE_FULL` |
911
+ | `CircuitOpenError` | 503 | `CIRCUIT_OPEN` |
912
+ | `ShutdownError` | 503 | `SHUTDOWN` |
913
+ | `BudgetExceededError` | 402 | `BUDGET_EXCEEDED` |
914
+ | `RateLimiterError` (generic) | 429 | `RATE_LIMITED` |
915
+
916
+ Non-rate-limiter errors are passed to the next error handler unchanged.
917
+
918
+ ### Hono
919
+
920
+ ```typescript
921
+ import { createHonoMiddleware } from 'ai-sdk-rate-limiter/middleware'
922
+
923
+ app.use(createHonoMiddleware(limiter, {
924
+ scope: (c) => c.req.header('x-user-id'),
925
+ priority: (c) => c.req.header('x-plan') === 'pro' ? 'normal' : 'low',
926
+ }))
927
+
928
+ app.post('/chat', async (c) => {
929
+ const { text } = await generateText({
930
+ model,
931
+ prompt: await c.req.text(),
932
+ providerOptions: { rateLimiter: c.var.rateLimiter },
933
+ })
934
+ return c.json({ text })
935
+ })
936
+ ```
937
+
938
+ `createHonoMiddleware` wraps the `next()` call in a try/catch, so any `RateLimiterError` thrown inside a route handler is caught and returned as a JSON response automatically.
939
+
940
+ ### Standalone error handler
941
+
942
+ If you only need error handling without scope injection:
943
+
944
+ ```typescript
945
+ import { createRateLimiterErrorHandler } from 'ai-sdk-rate-limiter/middleware'
946
+
947
+ app.use(createRateLimiterErrorHandler({
948
+ includeDetails: false, // omit retryAfter, period, limitUsd from response body
949
+ }))
950
+ ```
951
+
952
+ ### Custom framework (Fastify, etc.)
953
+
954
+ `mapErrorToResponse` is exported for frameworks that don't use the `(req, res, next)` convention:
955
+
956
+ ```typescript
957
+ import { mapErrorToResponse } from 'ai-sdk-rate-limiter/middleware'
958
+ import { RateLimiterError } from 'ai-sdk-rate-limiter'
959
+
960
+ // Fastify error handler (setErrorHandler)
961
+ fastify.setErrorHandler((err, request, reply) => {
962
+ if (err instanceof RateLimiterError) {
963
+ const { status, body } = mapErrorToResponse(err)
964
+ return reply.status(status).send(body)
965
+ }
966
+ reply.send(err)
967
+ })
968
+ ```
969
+
970
+ ### `req.rateLimiter` TypeScript type
971
+
972
+ The middleware augments `http.IncomingMessage` so `req.rateLimiter` is typed in Express and Fastify without any additional setup:
973
+
974
+ ```typescript
975
+ import type { RateLimiterRequestContext } from 'ai-sdk-rate-limiter/middleware'
976
+ // req.rateLimiter is automatically typed as RateLimiterRequestContext | undefined
977
+ ```
978
+
979
+ ---
980
+
861
981
  ## Events
862
982
 
863
983
  All events are typed. Register handlers at creation time or dynamically:
@@ -1358,6 +1478,42 @@ const limiter = createRateLimiter({ store: new MyStore() })
1358
1478
 
1359
1479
  ---
1360
1480
 
1481
+ ## Debug mode
1482
+
1483
+ Set `debug: true` to enable structured console logging for every rate-limit decision, queue entry/exit, slot acquisition, circuit breaker transition, and completed call cost:
1484
+
1485
+ ```typescript
1486
+ const limiter = createRateLimiter({ debug: true })
1487
+ ```
1488
+
1489
+ Sample output:
1490
+
1491
+ ```
1492
+ [ai-sdk-rate-limiter] gpt-4o: execute (provider="openai" priority="normal")
1493
+ [ai-sdk-rate-limiter] gpt-4o: queuing (queueDepth=3 estimatedWaitMs=1200 priority="normal")
1494
+ [ai-sdk-rate-limiter] gpt-4o: dequeued (waitedMs=1187 priority="normal")
1495
+ [ai-sdk-rate-limiter] gpt-4o: completed (tokens=342+87 costUsd=0.000021 latencyMs=1343 streaming=false)
1496
+ ```
1497
+
1498
+ Debug logging is near-zero-overhead when disabled — `log()` returns immediately, so there is no string building and no `JSON.stringify`; only the small details object passed at each call site is still created.
1499
+
1500
+ ---
1501
+
1502
+ ## Config validation
1503
+
1504
+ `createRateLimiter()` validates your configuration at construction time. If it spots a likely misconfiguration it logs a `console.warn` (it never throws). The following checks are performed:
1505
+
1506
+ | Issue | Warning |
1507
+ |---|---|
1508
+ | `cost.store` set | Reminds you to call `await limiter.warmUp()` at startup so budget caps account for spend from previous process runs |
1509
+ | `circuit.failureThreshold < 3` | Too sensitive — risks false trips on transient errors |
1510
+ | `retry.retryOn` excludes 429 | Rate-limit errors won't be retried |
1511
+ | `queue.timeout < 3000ms` | Requests will time out before they can be served |
1512
+ | `cost.budget` set without `onExceeded` | Silent default is `'throw'` — may want `'queue'` or `'fallback'` |
1513
+ | `cost.onExceeded: 'fallback'` | Reminds you to pass a `fallback` model to `limiter.wrap()` |
1514
+
1515
+ ---
1516
+
1361
1517
  ## Comparison
1362
1518
 
1363
1519
  | | ai-sdk-rate-limiter | bottleneck | p-limit | SDK built-in retry | LangChain |
@@ -1382,6 +1538,7 @@ const limiter = createRateLimiter({ store: new MyStore() })
1382
1538
  | Backoff propagation | yes | no | no | no | no |
1383
1539
  | Prometheus metrics | yes | no | no | no | no |
1384
1540
  | StatsD metrics | yes | no | no | no | no |
1541
+ | Express/Hono middleware | yes | no | no | no | no |
1385
1542
  | OpenTelemetry | yes | no | no | no | partial |
1386
1543
  | Testing utilities | yes | no | no | no | no |
1387
1544
  | CLI audit | yes | no | no | no | no |
@@ -1425,6 +1582,14 @@ import type {
1425
1582
  } from 'ai-sdk-rate-limiter/redis'
1426
1583
 
1427
1584
  import type { StatsDClient } from 'ai-sdk-rate-limiter/statsd'
1585
+
1586
+ import type {
1587
+ RateLimiterRequestContext,
1588
+ RateLimiterMiddlewareOptions,
1589
+ ErrorHandlerOptions,
1590
+ HonoMiddlewareOptions,
1591
+ HonoContext,
1592
+ } from 'ai-sdk-rate-limiter/middleware'
1428
1593
  ```
1429
1594
 
1430
1595
  ---
@@ -1442,6 +1607,23 @@ Four runnable examples are included, each with its own README:
1442
1607
 
1443
1608
  ---
1444
1609
 
1610
+ ## Bundle sizes
1611
+
1612
+ Each entry point is independently tree-shakeable. Importing `ai-sdk-rate-limiter` never pulls in Redis, Prometheus, OTel, or StatsD.
1613
+
1614
+ | Entry point | Size (minified) | Size (gzip) |
1615
+ |---|---|---|
1616
+ | `ai-sdk-rate-limiter` | ~80 KB | ~22 KB |
1617
+ | `ai-sdk-rate-limiter/redis` | ~12 KB | ~4 KB |
1618
+ | `ai-sdk-rate-limiter/middleware` | ~8 KB | ~2.5 KB |
1619
+ | `ai-sdk-rate-limiter/prometheus` | ~8 KB | ~2.5 KB |
1620
+ | `ai-sdk-rate-limiter/otel` | ~4 KB | ~1.5 KB |
1621
+ | `ai-sdk-rate-limiter/statsd` | ~4 KB | ~1.2 KB |
1622
+
1623
+ The core package is self-contained. Optional peer deps (`ioredis`, `@opentelemetry/api`) are only loaded when you import the corresponding entry point.
1624
+
1625
+ ---
1626
+
1445
1627
  ## Requirements
1446
1628
 
1447
1629
  - Node.js 18+ / Bun / Deno
package/dist/index.cjs CHANGED
@@ -1678,6 +1678,23 @@ var CircuitBreaker = class {
1678
1678
  }
1679
1679
  };
1680
1680
 
1681
+ // src/core/debug-logger.ts
1682
+ var PREFIX = "[ai-sdk-rate-limiter]";
1683
+ var DebugLogger = class {
1684
+ constructor(enabled) {
1685
+ this.enabled = enabled;
1686
+ }
1687
+ log(model, message, details) {
1688
+ if (!this.enabled) return;
1689
+ if (details && Object.keys(details).length > 0) {
1690
+ const parts = Object.entries(details).map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(" ");
1691
+ console.log(`${PREFIX} ${model}: ${message} (${parts})`);
1692
+ } else {
1693
+ console.log(`${PREFIX} ${model}: ${message}`);
1694
+ }
1695
+ }
1696
+ };
1697
+
1681
1698
  // src/core/pipeline.ts
1682
1699
  function resolveRetryConfig(config) {
1683
1700
  const r = config.retry ?? {};
@@ -1720,6 +1737,7 @@ var Pipeline = class {
1720
1737
  /** Set to true after shutdown() is called */
1721
1738
  this.shutdownRequested = false;
1722
1739
  this.config = config;
1740
+ this.log = new DebugLogger(config.debug === true);
1723
1741
  this.engine = new RateLimitEngine({
1724
1742
  maxQueueSize: config.queue?.maxSize ?? 500,
1725
1743
  ...config.store !== void 0 && { store: config.store }
@@ -1761,6 +1779,7 @@ var Pipeline = class {
1761
1779
  * recordUsage() once they have actual token counts from the API response.
1762
1780
  */
1763
1781
  async execute(modelId, provider, prompt, fn, opts) {
1782
+ this.log.log(modelId, "execute", { provider, priority: opts.priority, ...opts.scope !== void 0 && { scope: opts.scope } });
1764
1783
  if (this.shutdownRequested) {
1765
1784
  this.emitter.emit("dropped", {
1766
1785
  model: modelId,
@@ -1850,10 +1869,12 @@ var Pipeline = class {
1850
1869
  timeoutMs: opts.timeoutMs,
1851
1870
  ...opts.signal !== void 0 && { signal: opts.signal },
1852
1871
  onQueued: (queueDepth, estimatedWaitMs) => {
1872
+ this.log.log(modelId, "queuing", { queueDepth, estimatedWaitMs, priority: opts.priority });
1853
1873
  this.emitter.emit("queued", { model: modelId, provider, priority: opts.priority, queueDepth, estimatedWaitMs });
1854
1874
  this.emitter.emit("rateLimited", { source: "local", model: modelId, provider, limitType: "rpm", resetAt: Date.now() + estimatedWaitMs });
1855
1875
  },
1856
1876
  onDequeued: (waitedMs) => {
1877
+ this.log.log(modelId, "dequeued", { waitedMs, priority: opts.priority });
1857
1878
  this.emitter.emit("dequeued", { model: modelId, provider, waitedMs, priority: opts.priority });
1858
1879
  }
1859
1880
  });
@@ -1905,7 +1926,10 @@ var Pipeline = class {
1905
1926
  });
1906
1927
  if (circuit) {
1907
1928
  const justClosed = circuit.recordSuccess();
1908
- if (justClosed) this.emitter.emit("circuitClosed", { model: modelId, provider });
1929
+ if (justClosed) {
1930
+ this.log.log(modelId, "circuit closed \u2014 upstream recovered");
1931
+ this.emitter.emit("circuitClosed", { model: modelId, provider });
1932
+ }
1909
1933
  }
1910
1934
  return result;
1911
1935
  } catch (error) {
@@ -1915,6 +1939,7 @@ var Pipeline = class {
1915
1939
  if (shouldTrip) {
1916
1940
  const justOpened = circuit.recordFailure();
1917
1941
  if (justOpened) {
1942
+ this.log.log(modelId, "circuit OPEN", { status, cooldownMs: this.config.circuit?.cooldownMs ?? 6e4 });
1918
1943
  this.emitter.emit("circuitOpen", {
1919
1944
  model: modelId,
1920
1945
  provider,
@@ -1945,6 +1970,13 @@ var Pipeline = class {
1945
1970
  limits.outputPricePerMillion,
1946
1971
  scope
1947
1972
  );
1973
+ this.log.log(modelId, "completed", {
1974
+ tokens: `${usage.inputTokens}+${usage.outputTokens}`,
1975
+ costUsd: costUsd.toFixed(6),
1976
+ latencyMs,
1977
+ streaming,
1978
+ ...scope !== void 0 && { scope }
1979
+ });
1948
1980
  this.emitter.emit("completed", {
1949
1981
  model: modelId,
1950
1982
  provider,
@@ -2416,8 +2448,53 @@ function rateLimited(client, options = {}) {
2416
2448
  });
2417
2449
  }
2418
2450
 
2451
+ // src/core/config-validator.ts
2452
+ var PREFIX2 = "\x1B[33m\u26A0 ai-sdk-rate-limiter\x1B[0m";
2453
+ var RESET = "\x1B[0m";
2454
+ function validateConfig(config) {
2455
+ const warnings = [];
2456
+ if (config.cost?.store !== void 0) {
2457
+ warnings.push(
2458
+ "cost.store is configured \u2014 call `await limiter.warmUp()` at startup.\n Without it, budget caps won't account for spend from previous process runs."
2459
+ );
2460
+ }
2461
+ const threshold = config.circuit?.failureThreshold;
2462
+ if (threshold !== void 0 && threshold < 3) {
2463
+ warnings.push(
2464
+ `circuit.failureThreshold is ${threshold} \u2014 very low. The circuit will open after nearly every error. Consider a value of 5 or higher for typical production workloads.`
2465
+ );
2466
+ }
2467
+ if (config.retry?.retryOn !== void 0 && !config.retry.retryOn.includes(429)) {
2468
+ warnings.push(
2469
+ "retry.retryOn does not include 429. Rate limit errors from the API will not be retried. Add 429 to retry.retryOn, or remove the override to use the default."
2470
+ );
2471
+ }
2472
+ const queueTimeout = config.queue?.timeout;
2473
+ if (queueTimeout !== void 0 && queueTimeout < 3e3) {
2474
+ warnings.push(
2475
+ `queue.timeout is ${queueTimeout}ms \u2014 less than 3 seconds. Requests may time out before the rate limit window resets (typically 60s). Consider 30_000ms (30s) or higher.`
2476
+ );
2477
+ }
2478
+ if (config.cost?.onExceeded === "fallback") {
2479
+ warnings.push(
2480
+ "cost.onExceeded is 'fallback' but fallback models are configured per-model in limiter.wrap(model, { fallback: cheaperModel }). If no fallback is set on a wrapped model, BudgetExceededError will still be thrown."
2481
+ );
2482
+ }
2483
+ if (config.cost?.budget !== void 0 && config.cost.onExceeded === void 0) {
2484
+ warnings.push(
2485
+ "cost.budget is set but cost.onExceeded is not. Defaulting to 'throw' \u2014 requests will throw BudgetExceededError when the cap is hit. Set onExceeded: 'queue' or 'fallback' to change this behavior."
2486
+ );
2487
+ }
2488
+ for (const warning of warnings) {
2489
+ const formatted = warning.replace(/\n/g, `
2490
+ `);
2491
+ console.warn(`${PREFIX2}: ${formatted}${RESET}`);
2492
+ }
2493
+ }
2494
+
2419
2495
  // src/create-rate-limiter.ts
2420
2496
  function createRateLimiter(config = {}) {
2497
+ validateConfig(config);
2421
2498
  const pipeline = new Pipeline(config);
2422
2499
  const queueTimeout = config.queue?.timeout ?? 3e4;
2423
2500
  const middleware = createMiddleware(pipeline, queueTimeout);