@eidentic/model 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
1
+ import { LanguageModel, generateText, embed } from 'ai';
2
+ import { ModelPort, ModelRequest, ModelResponse, ModelStreamPart, EmbeddingPort, PriceTable } from '@eidentic/types';
3
+
4
+ type ModelResolver = (modelStr?: string) => LanguageModel | Promise<LanguageModel>;
5
+ type GenParams = Parameters<typeof generateText>[0];
6
+ /**
7
+ * Keys of the AI SDK's `generateText` parameters that are generation settings (sampling,
8
+ * limits, provider passthrough) — as opposed to prompt/tools/telemetry/internal options.
9
+ * Listing a name that the installed AI SDK does not accept is a **compile error** (`Pick`
10
+ * requires the key to exist), so this list can never silently drift out of sync.
11
+ */
12
+ type GenerationSettingKey = "temperature" | "maxOutputTokens" | "topP" | "topK" | "presencePenalty" | "frequencyPenalty" | "stopSequences" | "seed" | "maxRetries" | "providerOptions" | "headers";
13
+ /**
14
+ * Generation settings forwarded to every model call, derived directly from the AI SDK's
15
+ * `generateText` signature via `Pick`. Names and types always match the installed SDK — a
16
+ * renamed or removed setting becomes a compile error here rather than being silently ignored
17
+ * at runtime. All keys are optional (they are optional on the SDK type).
18
+ *
19
+ * @example new AIModel(anthropic("claude-sonnet-4-5"), { temperature: 0.2, maxOutputTokens: 1024 })
20
+ */
21
+ type AIModelOptions = Pick<GenParams, GenerationSettingKey>;
22
+ /**
23
+  * An Eidentic `ModelPort` backed by Vercel AI SDK v6.
24
+ * Pass a concrete AI SDK `LanguageModel` (e.g. `anthropic("claude-...")`) or a resolver
25
+ * that turns the request's `model` string into a `LanguageModel`. The optional second
26
+ * argument sets generation defaults (temperature, maxOutputTokens, …) applied to every call.
27
+ */
28
+ declare class AIModel implements ModelPort {
29
+ private readonly resolve;
30
+ private readonly settings;
31
+ /** The model's own identifier, sourced from the AI SDK LanguageModel when a static model is passed. */
32
+ readonly modelId: string | undefined;
33
+ constructor(model: LanguageModel | ModelResolver, options?: AIModelOptions);
34
+ complete(request: ModelRequest): Promise<ModelResponse>;
35
+ stream(request: ModelRequest): AsyncIterable<ModelStreamPart>;
36
+ }
37
+
38
+ /** The embedding-model type accepted by AI SDK v6 `embed`, minus the bare-string branch. */
39
+ type AIEmbeddingModel = Exclude<Parameters<typeof embed>[0]["model"], string>;
40
+ /**
41
+ * Provider-agnostic hosted embedder over AI SDK v6. Bring your own provider + key + model:
42
+ * const embedder = await AIEmbedder.create(openai.embedding("text-embedding-3-small"));
43
+ * Works with any `@ai-sdk/*` embedding model (OpenAI, Cohere, Google, Mistral, ...).
44
+ * A first-class peer to the local `@eidentic/transformers` embedder; pick whichever fits.
45
+ */
46
+ declare class AIEmbedder implements EmbeddingPort {
47
+ private readonly model;
48
+ readonly dim: number;
49
+ private constructor();
50
+ /** Construct an embedder, probing the model once to discover its output dimension. */
51
+ static create(model: AIEmbeddingModel): Promise<AIEmbedder>;
52
+ embed(text: string): Promise<number[]>;
53
+ /**
54
+ * Batch embedding via AI SDK v6 `embedMany({ model, values })` → `{ embeddings: number[][] }`.
55
+ * Embeds all texts in a single provider call (fewer round-trips on the ingest hot path).
56
+ * Each returned vector is validated to have length === `this.dim`.
57
+ */
58
+ embedBatch(texts: string[]): Promise<number[][]>;
59
+ }
60
+
61
+ declare const pricesUpdatedAt = "2026-06-08T00:00:00.000Z";
62
+ declare const defaultPrices: PriceTable;
63
+
64
+ declare const LITELLM_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json";
65
+ /**
66
+ * Map a raw LiteLLM model_prices_and_context_window.json object to a PriceTable.
67
+ * Pure function — usable in tests without any network calls.
68
+ */
69
+ declare function mapLiteLLM(raw: Record<string, unknown>): PriceTable;
70
+ /**
71
+ * Fetch the latest prices live from LiteLLM (opt-in — schedule it yourself;
72
+ * the library never auto-fetches at runtime).
73
+ */
74
+ declare function fetchLatestPrices(opts?: {
75
+ fetchImpl?: typeof fetch;
76
+ url?: string;
77
+ }): Promise<PriceTable>;
78
+
79
+ /**
80
+ * Options for `createOllamaModel`.
81
+ */
82
+ interface OllamaModelOptions {
83
+ /**
84
+ * Base URL of the Ollama server (defaults to `http://localhost:11434/api`).
85
+ * Override when Ollama runs on a non-default port or a remote host.
86
+ */
87
+ baseURL?: string;
88
+ /**
89
+ * Injectable provider factory for testing. When provided, the `ollama-ai-provider`
90
+ * peer dependency is NOT resolved — the factory is called directly. In production
91
+ * code, leave this unset.
92
+ * @internal
93
+ */
94
+ _factory?: OllamaProviderFactory;
95
+ }
96
+ /**
97
+ * Minimal type describing what we need from `ollama-ai-provider`.
98
+ * @internal
99
+ */
100
+ interface OllamaProvider {
101
+ (modelId: string): LanguageModel;
102
+ }
103
+ /** @internal */
104
+ interface OllamaProviderFactory {
105
+ createOllama(opts?: {
106
+ baseURL?: string;
107
+ }): OllamaProvider;
108
+ }
109
+ /**
110
+  * Create a Vercel AI SDK `LanguageModel` backed by an Ollama instance (local by default).
111
+ *
112
+ * `ollama-ai-provider` is an **optional peer dependency** — install it separately:
113
+ * ```sh
114
+ * npm install ollama-ai-provider
115
+ * # or
116
+ * pnpm add ollama-ai-provider
117
+ * ```
118
+ *
119
+ * **Usage:**
120
+ * ```ts
121
+ * import { AIModel, createOllamaModel } from "@eidentic/model";
122
+ *
123
+ * // Default: connects to http://localhost:11434/api
124
+ * const model = new AIModel(createOllamaModel("llama3.2"));
125
+ *
126
+ * // Multimodal (vision-capable) model:
127
+ * const visionModel = new AIModel(createOllamaModel("llava"));
128
+ *
129
+ * // Custom server URL:
130
+ * const remoteModel = new AIModel(
131
+ * createOllamaModel("mistral", { baseURL: "http://192.168.1.10:11434/api" }),
132
+ * );
133
+ * ```
134
+ *
135
+  * No API key required — works entirely offline when the Ollama server runs locally.
136
+ *
137
+ * @param modelId - Ollama model identifier, e.g. `"llama3.2"`, `"mistral"`, `"llava"`.
138
+ * @param opts - Optional configuration (baseURL + optional test factory).
139
+  * @returns A Vercel AI SDK `LanguageModel` that routes calls to the configured Ollama server.
140
+ * @throws Error if `ollama-ai-provider` is not installed and no `_factory` is provided.
141
+ */
142
+ declare function createOllamaModel(modelId: string, opts?: OllamaModelOptions): LanguageModel;
143
+
144
+ interface WithFallbackOptions {
145
+ /**
146
+ * Custom predicate to decide whether an error should trigger fallback.
147
+ * Called with the thrown error. Return `true` to try the next model.
148
+ * Default: always fall back (except for AbortError — see below).
149
+ */
150
+ shouldFallback?: (err: unknown) => boolean;
151
+ /**
152
+ * Called each time a fallback transition happens.
153
+ * Useful for alerting/metrics — e.g. log to your observability stack.
154
+ */
155
+ onFallback?: (err: unknown, fromIndex: number, toIndex: number) => void;
156
+ }
157
+ /**
158
+ * Wraps a primary `ModelPort` with one or more fallback models.
159
+ *
160
+  * When `complete()` or `stream()` fails (e.g. network error, provider 5xx/429) and
161
+  * `shouldFallback` approves (by default it always does), the next model in the chain is tried.
162
+ *
163
+ * **Stream caveat**: fallback is only attempted when the failed stream produced
164
+ * **zero** text deltas. If any delta was already yielded to the caller, the
165
+ * output would be corrupted by a mid-stream provider switch, so the error is
166
+ * re-thrown instead.
167
+ *
168
+ * **AbortError**: never triggers a fallback — the caller's cancellation intent
169
+ * is always respected.
170
+ *
171
+ * **Cost-optimization recipe**: pair a cheaper, faster tier as the primary with
172
+ * a slower but more-capable tier as fallback. Under normal conditions you pay
173
+ * the cheap rate; spikes or outages automatically route to the reliable tier
174
+  * without changing any call sites. Cost reductions of 55-65 % have been documented
175
+  * in production systems that apply this pattern.
176
+ *
177
+ * @example
178
+ * const model = withFallback(cheap, [premium], {
179
+ * onFallback: (err, from, to) => console.warn(`model[${from}] failed, trying [${to}]`, err),
180
+ * });
181
+ * // Drop into any AgentConfig.model unchanged — it is still a ModelPort.
182
+ */
183
+ declare function withFallback(primary: ModelPort, fallbacks: ModelPort[], opts?: WithFallbackOptions): ModelPort;
184
+
185
+ /**
186
+ * A threshold entry for `byTokenEstimate`.
187
+ * The selector routes to `tier` when the estimated token count is `<= upTo`.
188
+ * Entries are evaluated in ascending `upTo` order; the first match wins.
189
+ */
190
+ interface TokenThreshold {
191
+ upTo: number;
192
+ tier: string;
193
+ }
194
+ /**
195
+ * Returns a `routeModel` selector that maps requests to tier names based on
196
+ * estimated prompt token count. Thresholds are evaluated in ascending `upTo`
197
+ * order; the first whose `upTo >= estimatedTokens` wins. Falls back to
198
+ * `fallbackTier` when no threshold matches.
199
+ *
200
+ * @example
201
+ * const sel = byTokenEstimate(
202
+ * [{ upTo: 4_000, tier: "small" }, { upTo: 32_000, tier: "medium" }],
203
+ * "large",
204
+ * );
205
+ * const model = routeModel(sel, { small: cheapModel, medium: midModel, large: bigModel });
206
+ */
207
+ declare function byTokenEstimate(thresholds: TokenThreshold[], fallbackTier: string): (req: ModelRequest) => string;
208
+ /**
209
+ * Routes each request to one of a named set of `ModelPort` implementations
210
+ * according to a caller-supplied `selector` function.
211
+ *
212
+ * The selector receives the full `ModelRequest` and returns a tier name.
213
+ * Use `byTokenEstimate` as a drop-in selector for token-count-based routing,
214
+ * or write your own (e.g. inspect a custom tag on the messages array).
215
+ *
216
+  * A tier name that is not a key of `tiers` causes an immediate throw with a clear
217
+  * diagnostic message rather than silently failing or falling through.
218
+ *
219
+ * **Cost-optimization recipe**: assign a cheap fast model to the "small" tier
220
+ * and a powerful model only to the "large" tier. Short requests (most of them
221
+ * in practice) pay the cheap rate; only requests that genuinely need the
222
+ * bigger model incur the premium cost. This alone yields 40-60 % spend
223
+ * reductions in production workloads.
224
+ *
225
+ * @example
226
+ * const model = routeModel(
227
+ * byTokenEstimate([{ upTo: 4_000, tier: "small" }], "large"),
228
+ * { small: cheapModel, large: premiumModel },
229
+ * );
230
+ * agent.run({ model, ... }); // works as a plain ModelPort
231
+ */
232
+ declare function routeModel(selector: (req: ModelRequest) => string, tiers: Record<string, ModelPort>): ModelPort;
233
+
234
+ /** Pluggable backing store for `cachedModel`. Implement this to share the
235
+ * cache across workers / survive restarts. */
236
+ interface CacheStore {
237
+ get(key: string): Promise<ModelResponse | undefined> | ModelResponse | undefined;
238
+ set(key: string, value: ModelResponse, ttlMs?: number): Promise<void> | void;
239
+ }
240
+ interface CachedModelOptions {
241
+ /** How long a cached response is valid. Default: 5 minutes. */
242
+ ttlMs?: number;
243
+ /** Maximum number of entries in the in-process LRU. Default: 500. */
244
+ maxEntries?: number;
245
+ /**
246
+ * Custom key derivation function. Receives the full request; return a
247
+ * string that is unique for semantically-distinct requests.
248
+ * Default: stable-JSON hash of messages + tools + model + outputSchema.
249
+ */
250
+ keyFn?: (req: ModelRequest) => string;
251
+ /**
252
+ * External cache store (Redis, Memcached, …). When provided, the LRU is
253
+ * used as a local write-through layer and the store is consulted on misses.
254
+ */
255
+ store?: CacheStore;
256
+ /**
257
+ * Decide whether a successful result should be cached.
258
+ * Default: cache all non-error results (always `true` on the happy path).
259
+ *
260
+ * **Tool-call responses are cached** at the model layer — tool definitions
261
+ * are schemas only and carry no side-effects from the model's perspective.
262
+ * This is desirable for replay scenarios. Override here if you need
263
+ * per-request exclusions.
264
+ */
265
+ shouldCache?: (req: ModelRequest, result: ModelResponse) => boolean;
266
+ /** Called on every cache hit (key, cached response). Use for metrics. */
267
+ onCacheHit?: (key: string, result: ModelResponse) => void;
268
+ /**
269
+ * Clock used for TTL bookkeeping. Defaults to `Date.now`.
270
+ * Inject a fake clock in tests for deterministic expiry behaviour.
271
+ */
272
+ clock?: () => number;
273
+ }
274
+ /** Runtime statistics exposed by `cachedModel`. */
275
+ interface CacheStats {
276
+ /** Number of requests served from cache. */
277
+ hits: number;
278
+ /** Number of requests that required a live model call. */
279
+ misses: number;
280
+ /** Current number of entries in the in-process LRU. */
281
+ size: number;
282
+ }
283
+ /**
284
+ * Wraps a `ModelPort` with an exact-match response cache for `complete()`.
285
+ *
286
+ * - **Streaming calls are never cached** and always pass through to the
287
+ * underlying model.
288
+ * - The default key is a stable JSON hash of messages + tools + model +
289
+ * outputSchema. Provide `keyFn` to override.
290
+ * - In-memory LRU (default 500 entries, 5-minute TTL) with optional
291
+ * external `store` for cross-process sharing.
292
+ * - Inspect runtime behaviour via the returned `stats()` method.
293
+ *
294
+ * **Cost-optimization recipe**: enable caching for deterministic agent
295
+ * sub-tasks — system prompts, classification prompts, or any prompt that
296
+ * recurs verbatim within a session. Even a 20 % cache-hit rate on a
297
+ * high-volume workload reduces spend by 20 % with zero changes elsewhere.
298
+  * Savings of 55-65 % have been documented when it is used together with
299
+  * `withFallback` + `routeModel`.
300
+ *
301
+ * @example
302
+ * const model = cachedModel(baseModel, { ttlMs: 10 * 60_000, maxEntries: 1000 });
303
+ * const { hits, misses, size } = model.stats();
304
+ */
305
+ declare function cachedModel(model: ModelPort, opts?: CachedModelOptions): ModelPort & {
306
+ stats(): CacheStats;
307
+ };
308
+
309
+ export { AIEmbedder, AIModel, type AIModelOptions, type CacheStats, type CacheStore, type CachedModelOptions, LITELLM_URL, type ModelResolver, type OllamaModelOptions, type TokenThreshold, type WithFallbackOptions, byTokenEstimate, cachedModel, createOllamaModel, defaultPrices, fetchLatestPrices, mapLiteLLM, pricesUpdatedAt, routeModel, withFallback };
@@ -0,0 +1,309 @@
1
+ import { LanguageModel, generateText, embed } from 'ai';
2
+ import { ModelPort, ModelRequest, ModelResponse, ModelStreamPart, EmbeddingPort, PriceTable } from '@eidentic/types';
3
+
4
+ type ModelResolver = (modelStr?: string) => LanguageModel | Promise<LanguageModel>;
5
+ type GenParams = Parameters<typeof generateText>[0];
6
+ /**
7
+ * Keys of the AI SDK's `generateText` parameters that are generation settings (sampling,
8
+ * limits, provider passthrough) — as opposed to prompt/tools/telemetry/internal options.
9
+ * Listing a name that the installed AI SDK does not accept is a **compile error** (`Pick`
10
+ * requires the key to exist), so this list can never silently drift out of sync.
11
+ */
12
+ type GenerationSettingKey = "temperature" | "maxOutputTokens" | "topP" | "topK" | "presencePenalty" | "frequencyPenalty" | "stopSequences" | "seed" | "maxRetries" | "providerOptions" | "headers";
13
+ /**
14
+ * Generation settings forwarded to every model call, derived directly from the AI SDK's
15
+ * `generateText` signature via `Pick`. Names and types always match the installed SDK — a
16
+ * renamed or removed setting becomes a compile error here rather than being silently ignored
17
+ * at runtime. All keys are optional (they are optional on the SDK type).
18
+ *
19
+ * @example new AIModel(anthropic("claude-sonnet-4-5"), { temperature: 0.2, maxOutputTokens: 1024 })
20
+ */
21
+ type AIModelOptions = Pick<GenParams, GenerationSettingKey>;
22
+ /**
23
+  * An Eidentic `ModelPort` backed by Vercel AI SDK v6.
24
+ * Pass a concrete AI SDK `LanguageModel` (e.g. `anthropic("claude-...")`) or a resolver
25
+ * that turns the request's `model` string into a `LanguageModel`. The optional second
26
+ * argument sets generation defaults (temperature, maxOutputTokens, …) applied to every call.
27
+ */
28
+ declare class AIModel implements ModelPort {
29
+ private readonly resolve;
30
+ private readonly settings;
31
+ /** The model's own identifier, sourced from the AI SDK LanguageModel when a static model is passed. */
32
+ readonly modelId: string | undefined;
33
+ constructor(model: LanguageModel | ModelResolver, options?: AIModelOptions);
34
+ complete(request: ModelRequest): Promise<ModelResponse>;
35
+ stream(request: ModelRequest): AsyncIterable<ModelStreamPart>;
36
+ }
37
+
38
+ /** The embedding-model type accepted by AI SDK v6 `embed`, minus the bare-string branch. */
39
+ type AIEmbeddingModel = Exclude<Parameters<typeof embed>[0]["model"], string>;
40
+ /**
41
+ * Provider-agnostic hosted embedder over AI SDK v6. Bring your own provider + key + model:
42
+ * const embedder = await AIEmbedder.create(openai.embedding("text-embedding-3-small"));
43
+ * Works with any `@ai-sdk/*` embedding model (OpenAI, Cohere, Google, Mistral, ...).
44
+ * A first-class peer to the local `@eidentic/transformers` embedder; pick whichever fits.
45
+ */
46
+ declare class AIEmbedder implements EmbeddingPort {
47
+ private readonly model;
48
+ readonly dim: number;
49
+ private constructor();
50
+ /** Construct an embedder, probing the model once to discover its output dimension. */
51
+ static create(model: AIEmbeddingModel): Promise<AIEmbedder>;
52
+ embed(text: string): Promise<number[]>;
53
+ /**
54
+ * Batch embedding via AI SDK v6 `embedMany({ model, values })` → `{ embeddings: number[][] }`.
55
+ * Embeds all texts in a single provider call (fewer round-trips on the ingest hot path).
56
+ * Each returned vector is validated to have length === `this.dim`.
57
+ */
58
+ embedBatch(texts: string[]): Promise<number[][]>;
59
+ }
60
+
61
+ declare const pricesUpdatedAt = "2026-06-08T00:00:00.000Z";
62
+ declare const defaultPrices: PriceTable;
63
+
64
+ declare const LITELLM_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json";
65
+ /**
66
+ * Map a raw LiteLLM model_prices_and_context_window.json object to a PriceTable.
67
+ * Pure function — usable in tests without any network calls.
68
+ */
69
+ declare function mapLiteLLM(raw: Record<string, unknown>): PriceTable;
70
+ /**
71
+ * Fetch the latest prices live from LiteLLM (opt-in — schedule it yourself;
72
+ * the library never auto-fetches at runtime).
73
+ */
74
+ declare function fetchLatestPrices(opts?: {
75
+ fetchImpl?: typeof fetch;
76
+ url?: string;
77
+ }): Promise<PriceTable>;
78
+
79
+ /**
80
+ * Options for `createOllamaModel`.
81
+ */
82
+ interface OllamaModelOptions {
83
+ /**
84
+ * Base URL of the Ollama server (defaults to `http://localhost:11434/api`).
85
+ * Override when Ollama runs on a non-default port or a remote host.
86
+ */
87
+ baseURL?: string;
88
+ /**
89
+ * Injectable provider factory for testing. When provided, the `ollama-ai-provider`
90
+ * peer dependency is NOT resolved — the factory is called directly. In production
91
+ * code, leave this unset.
92
+ * @internal
93
+ */
94
+ _factory?: OllamaProviderFactory;
95
+ }
96
+ /**
97
+ * Minimal type describing what we need from `ollama-ai-provider`.
98
+ * @internal
99
+ */
100
+ interface OllamaProvider {
101
+ (modelId: string): LanguageModel;
102
+ }
103
+ /** @internal */
104
+ interface OllamaProviderFactory {
105
+ createOllama(opts?: {
106
+ baseURL?: string;
107
+ }): OllamaProvider;
108
+ }
109
+ /**
110
+  * Create a Vercel AI SDK `LanguageModel` backed by an Ollama instance (local by default).
111
+ *
112
+ * `ollama-ai-provider` is an **optional peer dependency** — install it separately:
113
+ * ```sh
114
+ * npm install ollama-ai-provider
115
+ * # or
116
+ * pnpm add ollama-ai-provider
117
+ * ```
118
+ *
119
+ * **Usage:**
120
+ * ```ts
121
+ * import { AIModel, createOllamaModel } from "@eidentic/model";
122
+ *
123
+ * // Default: connects to http://localhost:11434/api
124
+ * const model = new AIModel(createOllamaModel("llama3.2"));
125
+ *
126
+ * // Multimodal (vision-capable) model:
127
+ * const visionModel = new AIModel(createOllamaModel("llava"));
128
+ *
129
+ * // Custom server URL:
130
+ * const remoteModel = new AIModel(
131
+ * createOllamaModel("mistral", { baseURL: "http://192.168.1.10:11434/api" }),
132
+ * );
133
+ * ```
134
+ *
135
+  * No API key required — works entirely offline when the Ollama server runs locally.
136
+ *
137
+ * @param modelId - Ollama model identifier, e.g. `"llama3.2"`, `"mistral"`, `"llava"`.
138
+ * @param opts - Optional configuration (baseURL + optional test factory).
139
+  * @returns A Vercel AI SDK `LanguageModel` that routes calls to the configured Ollama server.
140
+ * @throws Error if `ollama-ai-provider` is not installed and no `_factory` is provided.
141
+ */
142
+ declare function createOllamaModel(modelId: string, opts?: OllamaModelOptions): LanguageModel;
143
+
144
+ interface WithFallbackOptions {
145
+ /**
146
+ * Custom predicate to decide whether an error should trigger fallback.
147
+ * Called with the thrown error. Return `true` to try the next model.
148
+ * Default: always fall back (except for AbortError — see below).
149
+ */
150
+ shouldFallback?: (err: unknown) => boolean;
151
+ /**
152
+ * Called each time a fallback transition happens.
153
+ * Useful for alerting/metrics — e.g. log to your observability stack.
154
+ */
155
+ onFallback?: (err: unknown, fromIndex: number, toIndex: number) => void;
156
+ }
157
+ /**
158
+ * Wraps a primary `ModelPort` with one or more fallback models.
159
+ *
160
+  * When `complete()` or `stream()` fails (e.g. network error, provider 5xx/429) and
161
+  * `shouldFallback` approves (by default it always does), the next model in the chain is tried.
162
+ *
163
+ * **Stream caveat**: fallback is only attempted when the failed stream produced
164
+ * **zero** text deltas. If any delta was already yielded to the caller, the
165
+ * output would be corrupted by a mid-stream provider switch, so the error is
166
+ * re-thrown instead.
167
+ *
168
+ * **AbortError**: never triggers a fallback — the caller's cancellation intent
169
+ * is always respected.
170
+ *
171
+ * **Cost-optimization recipe**: pair a cheaper, faster tier as the primary with
172
+ * a slower but more-capable tier as fallback. Under normal conditions you pay
173
+ * the cheap rate; spikes or outages automatically route to the reliable tier
174
+  * without changing any call sites. Cost reductions of 55-65 % have been documented
175
+  * in production systems that apply this pattern.
176
+ *
177
+ * @example
178
+ * const model = withFallback(cheap, [premium], {
179
+ * onFallback: (err, from, to) => console.warn(`model[${from}] failed, trying [${to}]`, err),
180
+ * });
181
+ * // Drop into any AgentConfig.model unchanged — it is still a ModelPort.
182
+ */
183
+ declare function withFallback(primary: ModelPort, fallbacks: ModelPort[], opts?: WithFallbackOptions): ModelPort;
184
+
185
+ /**
186
+ * A threshold entry for `byTokenEstimate`.
187
+ * The selector routes to `tier` when the estimated token count is `<= upTo`.
188
+ * Entries are evaluated in ascending `upTo` order; the first match wins.
189
+ */
190
+ interface TokenThreshold {
191
+ upTo: number;
192
+ tier: string;
193
+ }
194
+ /**
195
+ * Returns a `routeModel` selector that maps requests to tier names based on
196
+ * estimated prompt token count. Thresholds are evaluated in ascending `upTo`
197
+ * order; the first whose `upTo >= estimatedTokens` wins. Falls back to
198
+ * `fallbackTier` when no threshold matches.
199
+ *
200
+ * @example
201
+ * const sel = byTokenEstimate(
202
+ * [{ upTo: 4_000, tier: "small" }, { upTo: 32_000, tier: "medium" }],
203
+ * "large",
204
+ * );
205
+ * const model = routeModel(sel, { small: cheapModel, medium: midModel, large: bigModel });
206
+ */
207
+ declare function byTokenEstimate(thresholds: TokenThreshold[], fallbackTier: string): (req: ModelRequest) => string;
208
+ /**
209
+ * Routes each request to one of a named set of `ModelPort` implementations
210
+ * according to a caller-supplied `selector` function.
211
+ *
212
+ * The selector receives the full `ModelRequest` and returns a tier name.
213
+ * Use `byTokenEstimate` as a drop-in selector for token-count-based routing,
214
+ * or write your own (e.g. inspect a custom tag on the messages array).
215
+ *
216
+  * A tier name that is not a key of `tiers` causes an immediate throw with a clear
217
+  * diagnostic message rather than silently failing or falling through.
218
+ *
219
+ * **Cost-optimization recipe**: assign a cheap fast model to the "small" tier
220
+ * and a powerful model only to the "large" tier. Short requests (most of them
221
+ * in practice) pay the cheap rate; only requests that genuinely need the
222
+ * bigger model incur the premium cost. This alone yields 40-60 % spend
223
+ * reductions in production workloads.
224
+ *
225
+ * @example
226
+ * const model = routeModel(
227
+ * byTokenEstimate([{ upTo: 4_000, tier: "small" }], "large"),
228
+ * { small: cheapModel, large: premiumModel },
229
+ * );
230
+ * agent.run({ model, ... }); // works as a plain ModelPort
231
+ */
232
+ declare function routeModel(selector: (req: ModelRequest) => string, tiers: Record<string, ModelPort>): ModelPort;
233
+
234
+ /** Pluggable backing store for `cachedModel`. Implement this to share the
235
+ * cache across workers / survive restarts. */
236
+ interface CacheStore {
237
+ get(key: string): Promise<ModelResponse | undefined> | ModelResponse | undefined;
238
+ set(key: string, value: ModelResponse, ttlMs?: number): Promise<void> | void;
239
+ }
240
+ interface CachedModelOptions {
241
+ /** How long a cached response is valid. Default: 5 minutes. */
242
+ ttlMs?: number;
243
+ /** Maximum number of entries in the in-process LRU. Default: 500. */
244
+ maxEntries?: number;
245
+ /**
246
+ * Custom key derivation function. Receives the full request; return a
247
+ * string that is unique for semantically-distinct requests.
248
+ * Default: stable-JSON hash of messages + tools + model + outputSchema.
249
+ */
250
+ keyFn?: (req: ModelRequest) => string;
251
+ /**
252
+ * External cache store (Redis, Memcached, …). When provided, the LRU is
253
+ * used as a local write-through layer and the store is consulted on misses.
254
+ */
255
+ store?: CacheStore;
256
+ /**
257
+ * Decide whether a successful result should be cached.
258
+ * Default: cache all non-error results (always `true` on the happy path).
259
+ *
260
+ * **Tool-call responses are cached** at the model layer — tool definitions
261
+ * are schemas only and carry no side-effects from the model's perspective.
262
+ * This is desirable for replay scenarios. Override here if you need
263
+ * per-request exclusions.
264
+ */
265
+ shouldCache?: (req: ModelRequest, result: ModelResponse) => boolean;
266
+ /** Called on every cache hit (key, cached response). Use for metrics. */
267
+ onCacheHit?: (key: string, result: ModelResponse) => void;
268
+ /**
269
+ * Clock used for TTL bookkeeping. Defaults to `Date.now`.
270
+ * Inject a fake clock in tests for deterministic expiry behaviour.
271
+ */
272
+ clock?: () => number;
273
+ }
274
+ /** Runtime statistics exposed by `cachedModel`. */
275
+ interface CacheStats {
276
+ /** Number of requests served from cache. */
277
+ hits: number;
278
+ /** Number of requests that required a live model call. */
279
+ misses: number;
280
+ /** Current number of entries in the in-process LRU. */
281
+ size: number;
282
+ }
283
+ /**
284
+ * Wraps a `ModelPort` with an exact-match response cache for `complete()`.
285
+ *
286
+ * - **Streaming calls are never cached** and always pass through to the
287
+ * underlying model.
288
+ * - The default key is a stable JSON hash of messages + tools + model +
289
+ * outputSchema. Provide `keyFn` to override.
290
+ * - In-memory LRU (default 500 entries, 5-minute TTL) with optional
291
+ * external `store` for cross-process sharing.
292
+ * - Inspect runtime behaviour via the returned `stats()` method.
293
+ *
294
+ * **Cost-optimization recipe**: enable caching for deterministic agent
295
+ * sub-tasks — system prompts, classification prompts, or any prompt that
296
+ * recurs verbatim within a session. Even a 20 % cache-hit rate on a
297
+ * high-volume workload reduces spend by 20 % with zero changes elsewhere.
298
+  * Savings of 55-65 % have been documented when it is used together with
299
+  * `withFallback` + `routeModel`.
300
+ *
301
+ * @example
302
+ * const model = cachedModel(baseModel, { ttlMs: 10 * 60_000, maxEntries: 1000 });
303
+ * const { hits, misses, size } = model.stats();
304
+ */
305
+ declare function cachedModel(model: ModelPort, opts?: CachedModelOptions): ModelPort & {
306
+ stats(): CacheStats;
307
+ };
308
+
309
+ export { AIEmbedder, AIModel, type AIModelOptions, type CacheStats, type CacheStore, type CachedModelOptions, LITELLM_URL, type ModelResolver, type OllamaModelOptions, type TokenThreshold, type WithFallbackOptions, byTokenEstimate, cachedModel, createOllamaModel, defaultPrices, fetchLatestPrices, mapLiteLLM, pricesUpdatedAt, routeModel, withFallback };