npm - ai-sdk-rate-limiter - Versions diffs - 0.11.0 → 0.13.0 - Mend

ai-sdk-rate-limiter 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/CHANGELOG.md +195 -0
package/LICENSE +21 -0
package/README.md +76 -0
package/dist/cli.js +1 -1
package/dist/index.cjs +180 -11
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +38 -3
package/dist/index.d.ts +38 -3
package/dist/index.js +180 -12
package/dist/index.js.map +1 -1
package/dist/middleware.d.cts +1 -1
package/dist/middleware.d.ts +1 -1
package/dist/otel.d.cts +1 -1
package/dist/otel.d.ts +1 -1
package/dist/prometheus.d.cts +1 -1
package/dist/prometheus.d.ts +1 -1
package/dist/redis.d.cts +1 -1
package/dist/redis.d.ts +1 -1
package/dist/statsd.d.cts +1 -1
package/dist/statsd.d.ts +1 -1
package/dist/testing.cjs +150 -11
package/dist/testing.cjs.map +1 -1
package/dist/testing.d.cts +1 -1
package/dist/testing.d.ts +1 -1
package/dist/testing.js +150 -11
package/dist/testing.js.map +1 -1
package/dist/{types-CMevWGWK.d.cts → types-DtVjHfbF.d.cts} +54 -1
package/dist/{types-CMevWGWK.d.ts → types-DtVjHfbF.d.ts} +54 -1
package/package.json +27 -6

package/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,195 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+---
+## [Unreleased]
+---
+## [0.13.0] - 2026-03-13
+### Added
+- **`limiter.reset()`** — clear all rate-limit, queue, cost, and circuit-breaker state without recreating the instance. Queued requests are rejected with `ShutdownError`. Primarily useful in tests to reset between cases with a shared limiter instance.
+- **`queue.onFull: 'drop-low'`** fully implemented — when the queue is at capacity and a `high` or `normal` priority request arrives, the tail `low`-priority waiter is evicted (rejected with `QueueFullError`) to make room. Useful for mixed workloads where background batch jobs should never block user-facing requests.
+### Changed
+- Unknown models with zero pricing now emit a one-time `console.warn` on first use, pointing at the exact `config.limits` fix needed to enable cost tracking. Known registry models and models with user-supplied pricing are silent.
+---
+## [0.12.0] - 2026-03-12
+### Added
+- **`getCostForecast()`** — project end-of-period spend based on the current hourly rate. Returns `{ hour, day, month }` each with `spentUsd`, `projectedUsd`, and `ratePerHourUsd`. Useful for alerting before a budget cap is hit.
+- **`createModelPool(models, options?)`** — round-robin (or random) load balancer across multiple wrapped model instances. Distributes calls evenly across API keys or model variants. Import from `ai-sdk-rate-limiter`.
+- **Request deduplication** — pass `dedupKey` in `providerOptions.rateLimiter` to make concurrent identical requests share a single API call. All callers receive the same result; the dedup entry is cleared on completion so the next request always makes a fresh call.
+---
+## [0.11.0] - 2026-03-12
+### Added
+- **Debug mode** — set `debug: true` on `createRateLimiter()` to enable structured console logging for every rate-limit decision, queue entry/exit, slot acquisition, circuit breaker state change, and cost recording. Zero overhead when disabled.
+- **Config validation** — `createRateLimiter()` now validates your config at construction time and emits `console.warn` for common misconfigurations:
+  - `cost.store` is set — reminder to call `warmUp()` so historical spend is loaded on startup
+  - `circuit.failureThreshold < 3` (too sensitive, risks false trips)
+  - `retry.retryOn` explicitly excludes 429 (defeats rate-limit retry)
+  - `queue.timeout < 3000ms` (too short, requests will time out before serving)
+  - `cost.budget` set without `onExceeded` (uses silent default `'throw'`)
+  - `cost.onExceeded: 'fallback'` reminder to configure fallback model
+- GitHub Actions CI workflow (Node 18 / 20 / 22 matrix)
+- `CHANGELOG.md` with retroactive entries from v0.1.0
+### Fixed
+- `DebugLogger` details serialization: empty objects no longer emit trailing `()`
+---
+## [0.10.0] - 2026-03-12
+### Added
+- `ai-sdk-rate-limiter/middleware` entry point
+  - `createRateLimiterMiddleware(limiter, opts)` — returns `{ middleware, errorHandler }` for Express
+  - `createRateLimiterErrorHandler(opts)` — standalone 4-arg Express error handler
+  - `createHonoMiddleware(limiter, opts)` — Hono middleware with `c.var.rateLimiter`
+  - `mapErrorToResponse(err)` — utility for Fastify and custom frameworks
+  - `RateLimiterRequestContext` type + `http.IncomingMessage` augmentation for `req.rateLimiter`
+  - Automatic error → HTTP mapping: `QueueTimeoutError` → 503, `BudgetExceededError` → 402, etc.
+  - `injectHeaders` option: adds `X-RateLimit-*` informational headers to responses
+### Added (examples)
+- `examples/multi-tenant-express/` — Express API with per-user scoped rate limits (free/pro tiers)
+- `examples/batch-processing/` — Concurrent batch jobs with priority queuing + graceful shutdown
+- `examples/budget-alerts/` — Slack/webhook budget alerts with per-scope spend breakdown
+---
+## [0.9.0] - 2026-03-12
+### Added
+- **Circuit breaker** — auto-opens on repeated 5xx failures, half-open probe, configurable thresholds
+  - `CircuitBreakerConfig` in `RateLimiterConfig.circuit`
+  - `CircuitOpenError` thrown when circuit is open
+  - `circuitOpen` / `circuitClosed` events
+- **Graceful shutdown** — `limiter.shutdown({ drainMs })`, `ShutdownError`
+- **Persistent cost tracking** — `CostStore` interface + `RedisCostStore` in `ai-sdk-rate-limiter/redis`
+  - `limiter.warmUp()` pre-loads historical spend on startup
+- **Per-scope cost attribution** — `getCostReport().byScope` breakdown per user/org/tenant
+- **Fallback chains** — `fallback` accepts `WrappableModel[]`, walked in order on `BudgetExceededError`
+- **Call timeout** — `callTimeout` in retry config and per-request options (uses `Promise.race`)
+- **Auto-detected limits** — parses `x-ratelimit-limit-*` response headers, user config wins
+  - `limitsDetected` event
+- **Prometheus metrics** — `createPrometheusPlugin()` in `ai-sdk-rate-limiter/prometheus`
+- **StatsD / DogStatsD** — `createStatsDPlugin(client)` in `ai-sdk-rate-limiter/statsd`
+- **Drop hooks** — `DroppedEvent` now includes `reason`, `waitedMs`, `queueDepth`, `scope`, `metadata`
+---
+## [0.8.0] - 2026-03-11
+### Added
+- Redis store for multi-instance rate limiting (`ai-sdk-rate-limiter/redis`)
+  - `RedisStore` — Lua-script-based atomic sliding window shared across instances
+  - Fail-open on Redis errors (enforcement suspended, never blocks requests)
+  - Compatible with ioredis, node-redis, Upstash Redis
+- `rpd` (requests per day) limit support — rolling 24-hour window
+- `otpm` (output tokens per minute) limit support — based on completed request actuals
+---
+## [0.7.1] - 2026-03-10
+### Fixed
+- `Retry-After` header parsing: correctly handles duration strings like `"6m30s"` (previously parsed as 6s)
+---
+## [0.7.0] - 2026-03-10
+### Added
+- Raw SDK proxy — `limiter.rawProxy(client)` wraps native OpenAI/Anthropic/Groq/Mistral/Cohere clients
+  - Transparent `Proxy`-based drop-in with no API changes
+  - Streaming support via `AsyncIterable` wrapping for usage chunk capture
+  - `rateLimited(client, opts)` standalone factory
+- Budget fallback routing — `onExceeded: 'fallback'` transparently reroutes to a cheaper model
+  - `limiter.wrap(model, { fallback: cheaperModel })`
+  - `usingFallback` field on `budgetHit` event
+---
+## [0.6.0] - 2026-03-09
+### Added
+- OpenTelemetry plugin (`ai-sdk-rate-limiter/otel`)
+  - `createOtelPlugin(tracer)` — emits GenAI-spec spans for every request
+  - No hard dependency on `@opentelemetry/api` (structural typing)
+  - Span duration reconstructed from `latencyMs` for accurate wall-clock timing
+- Testing utilities (`ai-sdk-rate-limiter/testing`)
+  - `createTestLimiter()` — records all completed calls for assertions
+  - `limiter.getCalls()` / `limiter.reset()`
+---
+## [0.5.0] - 2026-03-08
+### Added
+- Concurrency limits — `maxConcurrent` per model, enforced as a semaphore
+- Multi-tenant scoped limits — `config.scopes` with `*` wildcard patterns
+  - Each scope gets its own independent sliding window
+  - Per-request scope via `providerOptions.rateLimiter.scope`
+- `queue.onFull: 'drop-low'` — evict lowest-priority requests before throwing `QueueFullError`
+- `AbortSignal` propagation through both rate-limit and concurrency queues
+---
+## [0.4.0] - 2026-03-07
+### Added
+- Priority queue — `high` / `normal` / `low` priorities; FIFO within same priority
+- Per-request options via `providerOptions.rateLimiter` (priority, timeout, scope)
+- `limiter.estimatedWait(modelId)` — returns ms until next available slot
+- `QueueFullError` when queue is at `maxSize` capacity
+---
+## [0.3.0] - 2026-03-06
+### Added
+- Cost tracking — records actual token usage per request
+  - `getCostReport()` with hourly / daily / monthly rolling windows
+  - `byModel` breakdown in cost report
+- Budget caps — `cost.budget` with `hourly`, `daily`, `monthly` limits
+  - `onExceeded: 'throw' | 'queue'` behavior
+  - `BudgetExceededError` with period, current spend, and limit
+- `nextBudgetClearMs()` used internally for queue-mode budget holds
+---
+## [0.2.0] - 2026-03-05
+### Added
+- Model registry expanded to include Groq, Mistral, and Cohere
+  - `GROQ_MODELS`, `MISTRAL_MODELS`, `COHERE_MODELS`
+  - `isKnownModel(modelId, provider)` utility
+---
+## [0.1.0] - 2026-03-04
+### Added
+- Initial release
+- Sliding window rate limiting (RPM + ITPM) for OpenAI, Anthropic, Google Gemini models
+- Priority queue with drain timer (`scheduleDrain` per model)
+- Exponential backoff retry with jitter, `Retry-After` header support
+- Vercel AI SDK `.wrap()` adapter via `LanguageModelV4Middleware`
+- In-memory store (default, zero config)
+- `createRateLimiter()` factory
+- `limiter.getStatus()` — queue depths and window state per model
+- Event system: `queued`, `dequeued`, `retrying`, `rateLimited`, `budgetHit`, `dropped`, `completed`
+- `RateLimiterError` hierarchy: `RateLimitExceededError`, `QueueTimeoutError`, `RetryExhaustedError`
+- Built-in model registry for OpenAI and Anthropic with pricing data

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 ai-sdk-rate-limiter contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md CHANGED Viewed

@@ -4,12 +4,16 @@ Smart rate limiting, queuing, and cost tracking for AI API calls. Works across p
 [![npm](https://img.shields.io/npm/v/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
 [![CI](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml/badge.svg)](https://github.com/piyushgupta344/ai-sdk-rate-limiter/actions/workflows/ci.yml)
+[![codecov](https://codecov.io/gh/piyushgupta344/ai-sdk-rate-limiter/graph/badge.svg)](https://codecov.io/gh/piyushgupta344/ai-sdk-rate-limiter)
 [![npm downloads](https://img.shields.io/npm/dm/ai-sdk-rate-limiter)](https://www.npmjs.com/package/ai-sdk-rate-limiter)
+[![docs](https://img.shields.io/badge/docs-piyushgupta344.github.io-blue)](https://piyushgupta344.github.io/ai-sdk-rate-limiter/)
 ```
 npm install ai-sdk-rate-limiter
 ```
+**[Documentation →](https://piyushgupta344.github.io/ai-sdk-rate-limiter/)**
 ---
 ## The problem
@@ -557,6 +561,27 @@ Costs are based on **actual token counts** from API responses — not estimates.
 `byScope` is populated automatically when requests carry a `scope` (either set on `limiter.wrap()` or via `providerOptions.rateLimiter.scope`). Unscoped requests don't appear in `byScope`.
+### Cost forecasting
+`getCostForecast()` projects your end-of-period spend based on the current hourly rate. Useful for alerting before a budget cap is hit:
+```typescript
+const forecast = limiter.getCostForecast()
+console.log(forecast)
+// {
+//   hour:  { spentUsd: 1.20, projectedUsd: 1.20,  ratePerHourUsd: 1.20 },
+//   day:   { spentUsd: 3.50, projectedUsd: 28.80, ratePerHourUsd: 1.20 },
+//   month: { spentUsd: 8.10, projectedUsd: 864,   ratePerHourUsd: 1.20 },
+// }
+if (forecast.day.projectedUsd > 40) {
+  console.warn(`Heads up — on track to spend $${forecast.day.projectedUsd.toFixed(2)} today`)
+}
+```
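+`projectedUsd` = current hourly rate × hours in the period (1 for the hour, 24 for the day, 720 for the month, i.e. a 30-day month). The rate is the spend over the **last 60 minutes**, so the projection responds quickly to usage spikes; it does not add the amount already spent in the period.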
 ---
 ## Budget fallback routing
@@ -1456,6 +1481,57 @@ class MyStore implements RateLimitStore {
 const limiter = createRateLimiter({ store: new MyStore() })
 ```
+### Load balancing across API keys
+`createModelPool()` distributes requests round-robin across multiple model instances — useful when you have more than one API key:
+```typescript
+import { createRateLimiter, createModelPool } from 'ai-sdk-rate-limiter'
+import { createOpenAI } from '@ai-sdk/openai'
+// Two API keys, each with their own limiter tracking separate RPM limits
+const limiter1 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
+const limiter2 = createRateLimiter({ limits: { 'gpt-4o': { rpm: 500, itpm: 2_000_000 } } })
+const openai1 = createOpenAI({ apiKey: process.env.OPENAI_KEY_1 })
+const openai2 = createOpenAI({ apiKey: process.env.OPENAI_KEY_2 })
+const pool = createModelPool([
+  limiter1.wrap(openai1('gpt-4o')),
+  limiter2.wrap(openai2('gpt-4o')),
+])
+// Use exactly like a regular model — calls alternate between the two keys
+const { text } = await generateText({ model: pool, prompt: 'Hello!' })
+```
+Pass `{ strategy: 'random' }` for random selection instead of round-robin.
+### Request deduplication
+When multiple concurrent requests carry the same `dedupKey`, only one API call is made and all callers receive the same result. Useful for FAQ-style workloads where many users ask the same question simultaneously:
+```typescript
+const model = limiter.wrap(openai('gpt-4o'))
+// Server handler — two simultaneous identical requests share one API call
+async function handleRequest(questionId: string) {
+  const { text } = await generateText({
+    model,
+    prompt: questions[questionId],
+    providerOptions: {
+      rateLimiter: { dedupKey: `faq:${questionId}` },
+    },
+  })
+  return text
+}
+// If 50 users hit the same FAQ item at the same time → 1 API call, not 50
+const results = await Promise.all(users.map(() => handleRequest('faq-42')))
+```
+The dedup entry is removed once the request completes (success or error), so subsequent requests always make a fresh call.
 ---
 ## How it works

package/dist/cli.js CHANGED Viewed

@@ -1021,7 +1021,7 @@ async function main() {
 Run with --help for usage.`);
     process.exit(1);
   }
-  await runAudit({ provider, json });
+  await runAudit({ ...provider !== void 0 && { provider }, json });
 }
 main().catch((err) => {
   console.error(err instanceof Error ? err.message : String(err));

package/dist/index.cjs CHANGED Viewed

@@ -271,6 +271,13 @@ var InMemoryStore = class {
     }
     return now + WINDOW_MS + 1;
   }
+  /**
+   * Clear all state held by this store — useful in tests to reset between runs.
+   * Empties the `windows`, `dailyWindows` and `backoffs` collections and
+   * restarts `pruneCounter` at 0.
+   */
+  reset() {
+    this.windows.clear();
+    this.dailyWindows.clear();
+    this.backoffs.clear();
+    this.pruneCounter = 0;
+  }
   /**
    * Periodically sweep stale entries from the scope/model maps.
    * Runs every 200 checkAndRecord calls to avoid per-request overhead.
@@ -360,7 +367,18 @@ var RateLimitEngine = class {
     }
     if (nextSlotAtMs > Date.now()) {
       if (local.waiters.length >= this.maxQueueSize) {
-        throw new QueueFullError(key, this.maxQueueSize);
+        if (opts.onFull === "drop-low" && opts.priority !== "low") {
+          const lastIdx = local.waiters.length - 1;
+          const victim = local.waiters[lastIdx];
+          if (victim !== void 0 && victim.priority === "low") {
+            local.waiters.splice(lastIdx, 1);
+            victim.reject(new QueueFullError(key, this.maxQueueSize));
+          } else {
+            throw new QueueFullError(key, this.maxQueueSize);
+          }
+        } else {
+          throw new QueueFullError(key, this.maxQueueSize);
+        }
       }
       const estimatedWaitMs = Math.max(0, nextSlotAtMs - Date.now());
       opts.onQueued?.(local.waiters.length, estimatedWaitMs);
@@ -481,6 +499,18 @@ var RateLimitEngine = class {
     }
     return total;
   }
+  /**
+   * Clear all rate-limit state — sliding windows, queues, backoffs.
+   * Calls shutdown() first, so queued requests are rejected with
+   * ShutdownError before the per-key local state is dropped.
+   * Useful in tests to reset between runs without recreating the instance.
+   *
+   * Only an `InMemoryStore` is reset here; any other store implementation is
+   * left untouched, so whatever state it holds survives this call.
+   */
+  reset() {
+    this.shutdown();
+    this.localStates.clear();
+    if (this.store instanceof InMemoryStore) {
+      this.store.reset();
+    }
+  }
   /**
    * Immediately reject all queued and concurrency-waiting requests with
    * ShutdownError. Called by Pipeline.shutdown() before draining.
@@ -668,6 +698,30 @@ var CostTracker = class {
   estimateCost(inputTokens, outputTokens, inputPricePerMillion, outputPricePerMillion) {
     return inputTokens / 1e6 * inputPricePerMillion + outputTokens / 1e6 * outputPricePerMillion;
   }
+  getForecast() { // Returns { hour, day, month }, each with spentUsd, projectedUsd and ratePerHourUsd, extrapolated from the trailing-hour spend.
+    const now = Date.now();
+    this.evict(now);
+    const hourlyRate = this.entries.filter((e) => e.timestamp > now - HOUR_MS).reduce((s, e) => s + e.costUsd, 0); // total cost of entries newer than now - HOUR_MS; doubles as the per-hour rate
+    const daySpent = this.entries.filter((e) => e.timestamp > now - DAY_MS2).reduce((s, e) => s + e.costUsd, 0);
+    const monthSpent = this.entries.filter((e) => e.timestamp > now - MONTH_MS).reduce((s, e) => s + e.costUsd, 0);
+    return { // NOTE(review): projectedUsd ignores spentUsd, so it can be lower than spentUsd after a quiet hour — confirm intended
+      hour: {
+        spentUsd: hourlyRate,
+        projectedUsd: hourlyRate,
+        ratePerHourUsd: hourlyRate
+      },
+      day: {
+        spentUsd: daySpent,
+        projectedUsd: hourlyRate * 24, // 24 hours at the current rate
+        ratePerHourUsd: hourlyRate
+      },
+      month: {
+        spentUsd: monthSpent,
+        projectedUsd: hourlyRate * 24 * 30, // fixed 30-day month at the current rate
+        ratePerHourUsd: hourlyRate
+      }
+    };
+  }
   getReport() {
     const now = Date.now();
     this.evict(now);
@@ -700,6 +754,10 @@ var CostTracker = class {
     }
     return { hour, day, month, byModel, byScope };
   }
+  /**
+   * Clear all in-memory cost entries by replacing `entries` with a fresh
+   * empty array. Useful in tests to reset between runs.
+   */
+  reset() {
+    this.entries = [];
+  }
   // -------------------------------------------------------------------------
   // Private helpers
   // -------------------------------------------------------------------------
@@ -1696,6 +1754,8 @@ var DebugLogger = class {
 };
 // src/core/pipeline.ts
+var WARN_PREFIX = "\x1B[33m\u26A0 ai-sdk-rate-limiter\x1B[0m"; // ANSI yellow (ESC[33m) + warning sign (U+26A0) + package name, then ANSI reset (ESC[0m)
+var WARN_RESET = "\x1B[0m"; // ANSI "reset all attributes" sequence, appended after the warning text
 function resolveRetryConfig(config) {
   const r = config.retry ?? {};
   return {
@@ -1734,8 +1794,14 @@ var Pipeline = class {
     this.circuits = /* @__PURE__ */ new Map();
     /** Limits detected from provider response headers (lower priority than user config) */
     this.detectedLimits = /* @__PURE__ */ new Map();
+    /** In-flight promises indexed by dedup key, shared across identical concurrent requests */
+    this.dedupMap = /* @__PURE__ */ new Map();
+    /** Counter used to trigger periodic keyMeta GC without a setInterval */
+    this.executeCount = 0;
     /** Set to true after shutdown() is called */
     this.shutdownRequested = false;
+    /** Models that have already received an "unknown model" warning (dedup) */
+    this.warnedModels = /* @__PURE__ */ new Set();
     this.config = config;
     this.log = new DebugLogger(config.debug === true);
     this.engine = new RateLimitEngine({
@@ -1780,6 +1846,19 @@ var Pipeline = class {
    */
   async execute(modelId, provider, prompt, fn, opts) {
     this.log.log(modelId, "execute", { provider, priority: opts.priority, ...opts.scope !== void 0 && { scope: opts.scope } });
+    if (opts.dedupKey !== void 0) {
+      const existing = this.dedupMap.get(opts.dedupKey);
+      if (existing !== void 0) {
+        this.log.log(modelId, "dedup hit", { dedupKey: opts.dedupKey });
+        return existing;
+      }
+      const { dedupKey, ...optsWithoutDedup } = opts;
+      const promise = this.execute(modelId, provider, prompt, fn, optsWithoutDedup).finally(() => {
+        this.dedupMap.delete(dedupKey);
+      });
+      this.dedupMap.set(dedupKey, promise);
+      return promise;
+    }
     if (this.shutdownRequested) {
       this.emitter.emit("dropped", {
         model: modelId,
@@ -1868,6 +1947,7 @@ var Pipeline = class {
         priority: opts.priority,
         timeoutMs: opts.timeoutMs,
         ...opts.signal !== void 0 && { signal: opts.signal },
+        ...this.config.queue?.onFull !== void 0 && { onFull: this.config.queue.onFull },
         onQueued: (queueDepth, estimatedWaitMs) => {
           this.log.log(modelId, "queuing", { queueDepth, estimatedWaitMs, priority: opts.priority });
           this.emitter.emit("queued", { model: modelId, provider, priority: opts.priority, queueDepth, estimatedWaitMs });
@@ -1878,6 +1958,7 @@ var Pipeline = class {
           this.emitter.emit("dequeued", { model: modelId, provider, waitedMs, priority: opts.priority });
         }
       });
+      if (++this.executeCount % 1e3 === 0) this.pruneKeyMeta();
     } catch (acquireErr) {
       if (acquireErr instanceof QueueFullError) {
         const maxSize = this.config.queue?.maxSize;
@@ -1994,6 +2075,9 @@ var Pipeline = class {
   getCostReport() {
     return this.costTracker.getReport();
   }
+  getCostForecast() { // Spend forecast as produced by the cost tracker's getForecast(); returned unchanged.
+    return this.costTracker.getForecast();
+  }
   getStatus() {
     const models = [];
     let totalQueueDepth = 0;
@@ -2004,6 +2088,7 @@ var Pipeline = class {
       const snapshot = this.engine.windowSnapshot(key);
       const queueDepth = this.engine.queueDepth(key);
       const backoffUntil = this.engine.backoffUntil(key);
+      if (snapshot.requests === 0 && queueDepth === 0 && backoffUntil === null) continue;
       totalQueueDepth += queueDepth;
       models.push({
         modelId,
@@ -2057,6 +2142,22 @@ var Pipeline = class {
       await new Promise((resolve) => setTimeout(resolve, 50));
     }
   }
+  /**
+   * Clear all rate-limit, queue, cost, and circuit-breaker state.
+   * Any currently queued requests are rejected with ShutdownError.
+   * Useful in tests to reset between runs without recreating the limiter.
+   *
+   * Also drops detected limits, dedup entries and the set of models that
+   * were already warned about, restarts the execute counter, and clears
+   * `shutdownRequested`, so execute() no longer takes its shutdown branch
+   * after a reset.
+   */
+  reset() {
+    this.engine.reset();
+    this.costTracker.reset();
+    this.keyMeta.clear();
+    this.circuits.clear();
+    this.detectedLimits.clear();
+    this.dedupMap.clear(); // NOTE(review): in-flight deduped promises are not cancelled; their .finally cleanup in execute() still deletes the key later — confirm it cannot remove an entry registered after the reset
+    this.warnedModels.clear();
+    this.executeCount = 0;
+    this.shutdownRequested = false;
+  }
   /** Pre-load historical cost data from the persistent cost store. */
   async warmUp() {
     if (this.config.cost?.store) {
@@ -2092,6 +2193,21 @@ var Pipeline = class {
   // -------------------------------------------------------------------------
   // Private helpers
   // -------------------------------------------------------------------------
+  /**
+   * Remove keyMeta entries for keys that have no recent activity.
+   * A key is evicted when its window snapshot reports zero requests and
+   * its queue depth is zero.
+   * Triggered once every 1000 increments of executeCount — no setInterval.
+   * NOTE(review): unlike getStatus(), this does not consult backoffUntil, and
+   * no in-flight count is checked here — confirm evicting such keys is safe.
+   */
+  pruneKeyMeta() {
+    for (const key of this.keyMeta.keys()) {
+      const snapshot = this.engine.windowSnapshot(key);
+      const queueDepth = this.engine.queueDepth(key);
+      if (snapshot.requests === 0 && queueDepth === 0) {
+        this.keyMeta.delete(key);
+      }
+    }
+  }
   getOrCreateCircuit(key) {
     let cb = this.circuits.get(key);
     if (!cb) {
@@ -2103,13 +2219,24 @@ var Pipeline = class {
   resolveModelLimits(modelId, provider) {
     const base = resolveModelLimits(modelId, provider, this.config.limits ?? {});
     const detected = this.detectedLimits.get(`${provider}:${modelId}`);
-    if (!detected) return base;
-    const userOverride = this.config.limits?.[modelId] ?? {};
-    return {
-      ...base,
-      ...!("rpm" in userOverride) && detected.rpm !== void 0 && { rpm: detected.rpm },
-      ...!("itpm" in userOverride) && detected.itpm !== void 0 && { itpm: detected.itpm }
-    };
+    const limits = detected ? (() => { // detected rpm/itpm apply only to keys the user config for this model does not define
+      const userOverride = this.config.limits?.[modelId] ?? {};
+      return {
+        ...base,
+        ...!("rpm" in userOverride) && detected.rpm !== void 0 && { rpm: detected.rpm },
+        ...!("itpm" in userOverride) && detected.itpm !== void 0 && { itpm: detected.itpm }
+      };
+    })() : base;
+    const warnKey = `${provider}:${modelId}`; // warnings are de-duplicated per provider:model pair
+    if (!this.warnedModels.has(warnKey) && !isKnownModel(modelId, provider) && limits.inputPricePerMillion === 0 && limits.outputPricePerMillion === 0 && !(this.config.limits?.[modelId]?.inputPricePerMillion !== void 0 || this.config.limits?.[modelId]?.outputPricePerMillion !== void 0)) { // warn once, only when isKnownModel() is false, both resolved prices are 0, and config.limits sets neither price for this model
+      this.warnedModels.add(warnKey);
+      console.warn(
+        `${WARN_PREFIX}: Unknown model '${modelId}' (provider: '${provider}'). Using fallback limits (${limits.rpm} RPM). Cost tracking is disabled.
+  Add pricing to config.limits to enable it:
+  limits: { '${modelId}': { inputPricePerMillion: <n>, outputPricePerMillion: <n> } }` + WARN_RESET
+      );
+    }
+    return limits;
   }
 };
@@ -2122,7 +2249,8 @@ function getPerRequestOptions(params, queueTimeout) {
     metadata: raw?.metadata ?? {},
     skipBudgetCheck: raw?._skipBudgetCheck ?? false,
     scope: raw?.scope,
-    callTimeout: raw?.callTimeout
+    callTimeout: raw?.callTimeout,
+    dedupKey: raw?.dedupKey
   };
 }
 function extractTokenUsage(usage) {
@@ -2138,7 +2266,7 @@ function createMiddleware(pipeline, queueTimeout) {
     // wrapGenerate — non-streaming
     // -----------------------------------------------------------------------
     async wrapGenerate({ doGenerate, params, model }) {
-      const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
+      const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
       const modelId = model.modelId;
       const provider = model.provider;
       const startMs = Date.now();
@@ -2154,6 +2282,7 @@ function createMiddleware(pipeline, queueTimeout) {
           skipBudgetCheck,
           ...scope !== void 0 && { scope },
           ...callTimeout !== void 0 && { callTimeout },
+          ...dedupKey !== void 0 && { dedupKey },
           ...params.abortSignal !== void 0 && { signal: params.abortSignal }
         }
       );
@@ -2167,7 +2296,7 @@ function createMiddleware(pipeline, queueTimeout) {
     // wrapStream — streaming
     // -----------------------------------------------------------------------
     async wrapStream({ doStream, params, model }) {
-      const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout } = getPerRequestOptions(params, queueTimeout);
+      const { priority, timeoutMs, skipBudgetCheck, scope, callTimeout, dedupKey } = getPerRequestOptions(params, queueTimeout);
       const modelId = model.modelId;
       const provider = model.provider;
       const startMs = Date.now();
@@ -2183,6 +2312,7 @@ function createMiddleware(pipeline, queueTimeout) {
           skipBudgetCheck,
           ...scope !== void 0 && { scope },
           ...callTimeout !== void 0 && { callTimeout },
+          ...dedupKey !== void 0 && { dedupKey },
           ...params.abortSignal !== void 0 && { signal: params.abortSignal }
         }
       );
@@ -2508,6 +2638,9 @@ function createRateLimiter(config = {}) {
     getCostReport() {
       return pipeline.getCostReport();
     },
+    getCostForecast() { // Public limiter method; forwards to pipeline.getCostForecast() and returns its result.
+      return pipeline.getCostForecast();
+    },
     getStatus() {
       return pipeline.getStatus();
     },
@@ -2528,6 +2661,41 @@ function createRateLimiter(config = {}) {
     },
     warmUp() {
       return pipeline.warmUp();
+    },
+    reset() { // Public limiter method; forwards to pipeline.reset() and returns nothing.
+      pipeline.reset();
+    }
+  };
+}
+// src/adapters/model-pool.ts
+function createModelPool(models, options) { // Build one model-like object that forwards each doGenerate/doStream call to one of `models`; throws Error when `models` is empty.
+  if (models.length === 0) {
+    throw new Error("createModelPool: at least one model is required");
+  }
+  const strategy = options?.strategy ?? "round-robin"; // any value other than "random" ends up on the round-robin path in pick()
+  let index = 0; // next round-robin position; wraps back to 0 after the last model
+  function pick() { // choose the model that serves the next call
+    if (strategy === "random") {
+      return models[Math.floor(Math.random() * models.length)];
+    }
+    const model = models[index % models.length];
+    index = (index + 1) % models.length;
+    return model;
+  }
+  const primary = models[0]; // modelId/provider are always reported from the first model, whichever model serves a call
+  return { // NOTE(review): only modelId, provider, doGenerate and doStream are exposed; other members of the pooled models are not forwarded — confirm consumers need nothing else
+    get modelId() {
+      return primary.modelId;
+    },
+    get provider() {
+      return primary.provider;
+    },
+    doGenerate(params) {
+      return pick().doGenerate(params);
+    },
+    doStream(params) {
+      return pick().doStream(params);
     }
   };
 }
@@ -2546,6 +2714,7 @@ exports.RateLimitExceededError = RateLimitExceededError;
 exports.RateLimiterError = RateLimiterError;
 exports.RetryExhaustedError = RetryExhaustedError;
 exports.ShutdownError = ShutdownError;
+exports.createModelPool = createModelPool;
 exports.createRateLimiter = createRateLimiter;
 exports.isKnownModel = isKnownModel;
 exports.rateLimited = rateLimited;