npm - universal-llm-client - Versions diffs - 4.2.0 → 4.5.0 - Mend

universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)

package/CHANGELOG.md +142 -103
package/LICENSE +21 -21
package/README.md +640 -591
package/dist/ai-model.d.ts +12 -1
package/dist/ai-model.d.ts.map +1 -1
package/dist/ai-model.js +36 -1
package/dist/ai-model.js.map +1 -1
package/dist/gemma-channel.d.ts +14 -0
package/dist/gemma-channel.d.ts.map +1 -0
package/dist/gemma-channel.js +38 -0
package/dist/gemma-channel.js.map +1 -0
package/dist/gemma-diffusion.d.ts +49 -0
package/dist/gemma-diffusion.d.ts.map +1 -0
package/dist/gemma-diffusion.js +147 -0
package/dist/gemma-diffusion.js.map +1 -0
package/dist/http.d.ts +4 -0
package/dist/http.d.ts.map +1 -1
package/dist/http.js +14 -1
package/dist/http.js.map +1 -1
package/dist/index.d.ts +2 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +4 -0
package/dist/index.js.map +1 -1
package/dist/interfaces.d.ts +183 -7
package/dist/interfaces.d.ts.map +1 -1
package/dist/interfaces.js.map +1 -1
package/dist/providers/anthropic.d.ts.map +1 -1
package/dist/providers/anthropic.js +28 -3
package/dist/providers/anthropic.js.map +1 -1
package/dist/providers/google.d.ts +22 -1
package/dist/providers/google.d.ts.map +1 -1
package/dist/providers/google.js +225 -13
package/dist/providers/google.js.map +1 -1
package/dist/providers/ollama.d.ts +2 -0
package/dist/providers/ollama.d.ts.map +1 -1
package/dist/providers/ollama.js +59 -30
package/dist/providers/ollama.js.map +1 -1
package/dist/providers/openai.d.ts +14 -0
package/dist/providers/openai.d.ts.map +1 -1
package/dist/providers/openai.js +200 -22
package/dist/providers/openai.js.map +1 -1
package/dist/router.d.ts +2 -0
package/dist/router.d.ts.map +1 -1
package/dist/router.js +4 -0
package/dist/router.js.map +1 -1
package/dist/stream-decoder.d.ts +12 -0
package/dist/stream-decoder.d.ts.map +1 -1
package/dist/stream-decoder.js +182 -5
package/dist/stream-decoder.js.map +1 -1
package/dist/thinking.d.ts +36 -0
package/dist/thinking.d.ts.map +1 -0
package/dist/thinking.js +52 -0
package/dist/thinking.js.map +1 -0
package/package.json +118 -116
package/src/ai-model.ts +400 -350
package/src/auditor.ts +213 -213
package/src/client.ts +402 -402
package/src/debug/debug-google-streaming.ts +1 -1
package/src/demos/basic/universal-llm-examples.ts +3 -3
package/src/demos/diffusion-gemma/.env +29 -0
package/src/demos/diffusion-gemma/.env.example +27 -0
package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
package/src/demos/diffusion-gemma/README.md +59 -0
package/src/demos/diffusion-gemma/canvas.ts +1606 -0
package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
package/src/demos/diffusion-gemma/server.ts +1205 -0
package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
package/src/gemma-channel.ts +47 -0
package/src/gemma-diffusion.ts +167 -0
package/src/http.ts +261 -247
package/src/index.ts +180 -161
package/src/interfaces.ts +843 -657
package/src/mcp.ts +345 -345
package/src/providers/anthropic.ts +796 -762
package/src/providers/google.ts +840 -620
package/src/providers/index.ts +8 -8
package/src/providers/ollama.ts +503 -469
package/src/providers/openai.ts +587 -392
package/src/router.ts +785 -780
package/src/stream-decoder.ts +535 -361
package/src/structured-output.ts +759 -759
package/src/test-scripts/test-google-deep-research.ts +33 -0
package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
package/src/test-scripts/test-google-streaming.ts +1 -1
package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
package/src/test-scripts/test-google-thinking.ts +46 -0
package/src/test-scripts/test-system-message-positions.ts +163 -163
package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
package/src/test-scripts/test-vllm-qwen36.ts +256 -0
package/src/tests/ai-model.test.ts +1614 -1614
package/src/tests/auditor.test.ts +224 -224
package/src/tests/gemma-diffusion.test.ts +115 -0
package/src/tests/http.test.ts +200 -200
package/src/tests/interfaces.test.ts +117 -117
package/src/tests/providers/anthropic.test.ts +118 -0
package/src/tests/providers/google.test.ts +841 -660
package/src/tests/providers/ollama.test.ts +1034 -954
package/src/tests/providers/openai.test.ts +1511 -1122
package/src/tests/router.test.ts +254 -254
package/src/tests/stream-decoder.test.ts +263 -179
package/src/tests/structured-output.test.ts +1450 -1450
package/src/tests/thinking.test.ts +65 -0
package/src/tests/tools.test.ts +175 -175
package/src/thinking.ts +73 -0
package/src/tools.ts +246 -246
package/src/zod-adapter.ts +72 -72

package/README.md CHANGED

@@ -1,591 +1,640 @@
-# universal-llm-client
-A universal LLM client for JavaScript/TypeScript with **transparent provider failover**, streaming tool execution, pluggable reasoning strategies, and native observability.
-```typescript
-import { AIModel } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    providers: [
-        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
-        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
-        { type: 'ollama' },
-    ],
-});
-const response = await model.chat([
-    { role: 'user', content: 'Hello!' },
-]);
-```
-> **One model, multiple backends.** If Google fails, it transparently fails over to OpenRouter, then to local Ollama. Your code never knows the difference.
----
-## Features
-- 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
-- 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
-- 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
-- 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
-- 🧠 **Reasoning** — Native `<think>` tag parsing, interleaved reasoning, and model thinking support
-- 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
-- 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
-- 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
-- 📊 **Embeddings** — Single and batch embedding generation
-## Supported Providers
-| Provider | Type | Notes |
-|---|---|---|
-| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
-| **OpenAI** | `openai` | GPT-4o, o3, etc. Also works with OpenRouter, Groq, LM Studio, vLLM |
-| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
-| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints and Bearer tokens |
-| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances |
----
-## Installation
-```bash
-bun add universal-llm-client
-# or
-npm install universal-llm-client
-```
-**Optional**: For MCP integration:
-```bash
-bun add @modelcontextprotocol/sdk
-```
----
-## Quick Start
-### Basic Chat
-```typescript
-import { AIModel } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'qwen3:4b',
-    providers: [{ type: 'ollama' }],
-});
-const response = await model.chat([
-    { role: 'system', content: 'You are a helpful assistant.' },
-    { role: 'user', content: 'What is the capital of France?' },
-]);
-console.log(response.message.content);
-// "The capital of France is Paris."
-```
-### Streaming
-```typescript
-for await (const event of model.chatStream([
-    { role: 'user', content: 'Write a haiku about code.' },
-])) {
-    if (event.type === 'text') {
-        process.stdout.write(event.content);
-    } else if (event.type === 'thinking') {
-        // Model reasoning (when supported)
-        console.log('[thinking]', event.content);
-    }
-}
-```
-### Tool Calling
-```typescript
-model.registerTool(
-    'get_weather',
-    'Get current weather for a location',
-    {
-        type: 'object',
-        properties: {
-            city: { type: 'string', description: 'City name' },
-        },
-        required: ['city'],
-    },
-    async (args) => {
-        const { city } = args as { city: string };
-        return { temperature: 22, condition: 'sunny', city };
-    },
-);
-// Autonomous tool execution — the model calls tools and loops until done
-const response = await model.chatWithTools([
-    { role: 'user', content: "What's the weather in Tokyo?" },
-]);
-console.log(response.message.content);
-// "The weather in Tokyo is 22°C and sunny."
-console.log(response.toolTrace);
-// [{ name: 'get_weather', args: { city: 'Tokyo' }, result: {...}, duration: 5 }]
-```
-### Provider Failover
-```typescript
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    retries: 2,        // retries per provider before failover
-    timeout: 30000,    // request timeout in ms
-    providers: [
-        { type: 'google', apiKey: process.env.GOOGLE_KEY, priority: 0 },
-        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY, priority: 1 },
-        { type: 'ollama', url: 'http://localhost:11434', priority: 2 },
-    ],
-});
-// If Google returns 500, retries twice, then seamlessly tries OpenRouter.
-// If OpenRouter also fails, falls back to local Ollama.
-// Your code sees a single response.
-const response = await model.chat([{ role: 'user', content: 'Hello' }]);
-// Check provider health at any time
-console.log(model.getProviderStatus());
-// [{ id: 'google-0', healthy: true }, { id: 'openai-1', healthy: true }, ...]
-```
-### Multimodal (Vision)
-```typescript
-import { AIModel, multimodalMessage } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    providers: [{ type: 'google', apiKey: process.env.GOOGLE_KEY }],
-});
-const response = await model.chat([
-    multimodalMessage('What do you see in this image?', [
-        'https://example.com/photo.jpg',
-    ]),
-]);
-```
-### Embeddings
-```typescript
-const embedModel = new AIModel({
-    model: 'nomic-embed-text-v2-moe:latest',
-    providers: [{ type: 'ollama' }],
-});
-const vector = await embedModel.embed('Hello world');
-// [0.006, 0.026, -0.009, ...]
-const vectors = await embedModel.embedArray(['Hello', 'World']);
-// [[0.006, ...], [0.012, ...]]
-```
-### Structured Output
-Get typed, validated JSON responses from any LLM using Zod schemas:
-```typescript
-import { AIModel } from 'universal-llm-client';
-import { z } from 'zod';
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    providers: [
-        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
-        { type: 'ollama' },
-    ],
-});
-// Define your schema
-const UserSchema = z.object({
-    name: z.string(),
-    age: z.number(),
-    email: z.string().email(),
-    interests: z.array(z.string()),
-});
-// Method 1: generateStructured (throws on validation failure)
-const user = await model.generateStructured(UserSchema, [
-    { role: 'user', content: 'Generate a user profile for a software developer' },
-]);
-console.log(user.name);     // TypeScript knows this is string
-console.log(user.age);      // TypeScript knows this is number
-console.log(user.email);    // TypeScript knows this is string
-console.log(user.interests); // TypeScript knows this is string[]
-```
-**Non-throwing variant:**
-```typescript
-// Method 2: tryParseStructured (returns result object, never throws)
-const result = await model.tryParseStructured(UserSchema, messages);
-if (result.ok) {
-    console.log('User:', result.value.name);
-} else {
-    console.log('Error:', result.error.message);
-    console.log('Raw LLM output:', result.rawOutput);
-}
-```
-**Via chat options:**
-```typescript
-// Method 3: chat with output parameter
-const response = await model.chat(messages, {
-    output: { schema: UserSchema },
-});
-// response.structured is typed as { name: string, age: number, ... }
-if (response.structured) {
-    console.log(response.structured.name);
-}
-```
-**Streaming structured output:**
-```typescript
-// Stream partial validated objects as JSON generates
-for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
-    console.log('Partial:', partial);
-    // Partial: { name: 'Alice' }
-    // Partial: { name: 'Alice', age: 30 }
-    // Partial: { name: 'Alice', age: 30, email: 'alice@example.com' }
-}
-```
-**Raw JSON Schema (without Zod):**
-```typescript
-const response = await model.chat(messages, {
-    jsonSchema: {
-        type: 'object',
-        properties: {
-            name: { type: 'string' },
-            age: { type: 'number' },
-        },
-        required: ['name', 'age'],
-    },
-    name: 'Person',  // Optional, used for LLM guidance
-});
-```
-**Separate module import (tree-shaking):**
-```typescript
-// Import only structured output types if you don't need the full client
-import {
-    StructuredOutputError,
-    type StructuredOutputResult,
-    type StructuredOutputOptions,
-    parseStructured,
-    tryParseStructured,
-    zodToJsonSchema,
-} from 'universal-llm-client/structured-output';
-```
-**Vision with structured output:**
-```typescript
-const ImageAnalysisSchema = z.object({
-    objects: z.array(z.string()),
-    scene: z.string(),
-    mood: z.string(),
-});
-const response = await model.generateStructured(ImageAnalysisSchema, [
-    multimodalMessage('Analyze this image', ['https://example.com/photo.jpg']),
-]);
-```
-**Provider compatibility:**
-| Provider | Method | Notes |
-|----------|--------|-------|
-| OpenAI | `response_format.json_schema` | Strict mode enabled |
-| Ollama | `format: { schema }` | Model must support grammar |
-| Google | `responseMimeType + responseSchema` | Some features stripped |
-### Observability
-```typescript
-import { AIModel, ConsoleAuditor, BufferedAuditor } from 'universal-llm-client';
-// Simple console logging
-const model = new AIModel({
-    model: 'qwen3:4b',
-    providers: [{ type: 'ollama' }],
-    auditor: new ConsoleAuditor('[LLM]'),
-});
-// [LLM] REQUEST [ollama] (qwen3:4b) →
-// [LLM] RESPONSE [ollama] (qwen3:4b) 1200ms 68 tokens
-// Buffered for custom sinks (OpenTelemetry, DB, etc.)
-const auditor = new BufferedAuditor({
-    maxBufferSize: 100,
-    onFlush: async (events) => {
-        await sendToOpenTelemetry(events);
-    },
-});
-```
-### MCP Integration
-```typescript
-import { AIModel, MCPToolBridge } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'qwen3:4b',
-    providers: [{ type: 'ollama' }],
-});
-const mcp = new MCPToolBridge({
-    servers: {
-        filesystem: {
-            command: 'npx',
-            args: ['-y', '@modelcontextprotocol/server-filesystem', './'],
-        },
-        weather: {
-            url: 'https://mcp.example.com/weather',
-        },
-    },
-});
-await mcp.connect();
-await mcp.registerTools(model);
-// MCP tools are now callable via chatWithTools
-const response = await model.chatWithTools([
-    { role: 'user', content: 'List files in the current directory' },
-]);
-await mcp.disconnect();
-```
-### Stream Decoders
-```typescript
-import { AIModel, createDecoder } from 'universal-llm-client';
-// Passthrough — raw text, no parsing
-// Standard Chat — text + native reasoning + tool calls
-// Interleaved Reasoning — parses <think> and <progress> tags from text streams
-const decoder = createDecoder('interleaved-reasoning', (event) => {
-    switch (event.type) {
-        case 'text': console.log(event.content); break;
-        case 'thinking': console.log('[think]', event.content); break;
-        case 'progress': console.log('[progress]', event.content); break;
-        case 'tool_call': console.log('[tool]', event.calls); break;
-    }
-});
-decoder.push('<think>Let me analyze this</think>The answer is 42');
-decoder.flush();
-console.log(decoder.getCleanContent());  // "The answer is 42"
-console.log(decoder.getReasoning());      // "Let me analyze this"
-```
----
-## API Reference
-### `AIModel`
-The universal client. One class, multiple backends.
-```typescript
-new AIModel(config: AIModelConfig)
-```
-**Config:**
-| Property | Type | Default | Description |
-|---|---|---|---|
-| `model` | `string` | — | Model name (e.g., `'gemini-2.5-flash'`) |
-| `providers` | `ProviderConfig[]` | — | Ordered list of provider backends |
-| `retries` | `number` | `2` | Retries per provider before failover |
-| `timeout` | `number` | `30000` | Request timeout in ms |
-| `auditor` | `Auditor` | `NoopAuditor` | Observability sink |
-| `thinking` | `boolean` | `false` | Enable model thinking/reasoning |
-| `debug` | `boolean` | `false` | Debug logging |
-| `defaultParameters` | `object` | — | Default parameters for all requests |
-**Provider Config:**
-| Property | Type | Description |
-|---|---|---|
-| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
-| `url` | `string` | Provider URL (has sensible defaults) |
-| `apiKey` | `string` | API key or Bearer token |
-| `priority` | `number` | Lower = tried first (defaults to array index) |
-| `model` | `string` | Override model name for this provider |
-| `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
-| `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
-**Methods:**
-| Method | Returns | Description |
-|---|---|---|
-| `chat(messages, options?)` | `Promise<LLMChatResponse>` | Send chat request |
-| `chatWithTools(messages, options?)` | `Promise<LLMChatResponse>` | Chat with autonomous tool execution |
-| `chatStream(messages, options?)` | `AsyncGenerator<DecodedEvent>` | Stream chat response |
-| `generateStructured(schema, messages, options?)` | `Promise<T>` | Generate typed JSON validated against Zod schema |
-| `tryParseStructured(schema, messages, options?)` | `Promise<StructuredOutputResult<T>>` | Non-throwing variant returning result object |
-| `generateStructuredStream(schema, messages, options?)` | `AsyncGenerator<T, T>` | Stream partial validated objects as JSON generates |
-| `embed(text)` | `Promise<number[]>` | Generate single embedding |
-| `embedArray(texts)` | `Promise<number[][]>` | Generate batch embeddings |
-| `registerTool(name, desc, params, handler)` | `void` | Register a callable tool |
-| `registerTools(tools)` | `void` | Register multiple tools |
-| `getModels()` | `Promise<string[]>` | List available models |
-| `getModelInfo()` | `Promise<ModelMetadata>` | Get model metadata |
-| `getProviderStatus()` | `ProviderStatus[]` | Check provider health |
-| `setModel(name)` | `void` | Switch model at runtime |
-| `dispose()` | `Promise<void>` | Clean shutdown |
-### Structured Output
-```typescript
-import { z } from 'zod';
-// Define your schema
-const UserSchema = z.object({
-    name: z.string(),
-    age: z.number(),
-    email: z.string().email(),
-});
-// Generate typed JSON
-const user = await model.generateStructured(UserSchema, messages);
-// TypeScript infers: { name: string; age: number; email: string }
-// Non-throwing variant
-const result = await model.tryParseStructured(UserSchema, messages);
-if (result.ok) {
-    console.log(result.value.name);  // Fully typed
-} else {
-    console.log(result.error.message);
-}
-// Stream partial objects
-for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
-    console.log(partial);  // Partial validated objects
-}
-```
-**Separate module import (tree-shaking):**
-```typescript
-import {
-    StructuredOutputError,
-    type StructuredOutputResult,
-    parseStructured,
-    tryParseStructured,
-    zodToJsonSchema,
-} from 'universal-llm-client/structured-output';
-// Use without importing the full client
-const schema = z.object({ name: z.string() });
-const jsonSchema = zodToJsonSchema(schema);
-```
-### `ToolBuilder` / `ToolExecutor`
-```typescript
-import { ToolBuilder, ToolExecutor } from 'universal-llm-client';
-// Fluent builder
-const tool = new ToolBuilder('search')
-    .description('Search the web')
-    .addParameter('query', 'string', 'Search query', true)
-    .addParameter('limit', 'number', 'Max results', false)
-    .build();
-// Execution wrappers
-const safeHandler = ToolExecutor.compose(
-    myHandler,
-    h => ToolExecutor.withTimeout(h, 5000),
-    h => ToolExecutor.safe(h),
-    h => ToolExecutor.withValidation(h, ['query']),
-);
-```
-### Auditor Interface
-Implement custom observability by providing an `Auditor`:
-```typescript
-interface Auditor {
-    record(event: AuditEvent): void;
-    flush?(): Promise<void>;
-}
-```
-**Built-in implementations:**
-- `NoopAuditor` — Zero overhead (default)
-- `ConsoleAuditor` — Structured console logging
-- `BufferedAuditor` — Collects events for custom sinks
----
-## Architecture
-```
-universal-llm-client
-├── AIModel          ← Public API (the only class you import)
-├── Router           ← Internal failover engine
-├── BaseLLMClient    ← Abstract client with tool execution
-├── Providers
-│   ├── OllamaClient
-│   ├── OpenAICompatibleClient  (OpenAI, OpenRouter, Groq, LM Studio, vLLM, LlamaCpp)
-│   └── GoogleClient            (AI Studio + Vertex AI)
-├── StreamDecoder    ← Pluggable reasoning strategies
-├── Auditor          ← Observability interface
-├── MCPToolBridge    ← MCP server integration
-└── HTTP Utilities   ← Universal fetch-based transport
-```
-### Design Principles
-1. **Single import** — `AIModel` is the only class users need
-2. **Provider agnostic** — Same code works with any backend
-3. **Transparent failover** — Health tracking and cooldowns happen behind the scenes
-4. **Zero dependencies** — Core library depends only on native `fetch`
-5. **Agent-ready** — Stateless, composable instances designed as foundation for agent frameworks
-6. **Observable** — Every request, response, tool call, retry, and failover is auditable
----
-## Runtime Support
-| Runtime | Version | Status |
-|---|---|---|
-| **Node.js** | 22+ | ✅ Full support |
-| **Bun** | 1.0+ | ✅ Full support |
-| **Deno** | 2.0+ | ✅ Full support |
-| **Browsers** | Modern | ✅ No stdio MCP, HTTP transport only |
----
-## For Agent Framework Authors
-`AIModel` is designed as the transport layer for agentic systems:
-- **Stateless** — No conversation history stored. Your framework manages memory
-- **Composable** — Create separate instances for chat, embeddings, vision
-- **Tool tracing** — `chatWithTools()` returns full execution trace
-- **Context budget** — `getModelInfo()` exposes `contextLength`
-- **Auditor as system bus** — Inject custom sinks for cost tracking, behavioral scoring
-- **StreamDecoder as UI bridge** — Select decoder strategy per-call
----
-## License
-MIT
+# universal-llm-client
+A universal LLM client for JavaScript/TypeScript with **transparent provider failover** and a **provider-agnostic reasoning API** — one set of code across OpenAI, Anthropic, Google Gemini, Ollama, vLLM, and any OpenAI-compatible endpoint. Streaming tool execution, structured output, generation stats, and native observability included.
+```typescript
+import { AIModel } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'gemini-3.5-flash',
+    providers: [
+        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
+        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
+        { type: 'ollama' },
+    ],
+});
+const response = await model.chat([
+    { role: 'user', content: 'Hello!' },
+]);
+```
+> **One model, multiple backends.** If Google fails, it transparently fails over to OpenRouter, then to local Ollama. Your code never knows the difference.
+---
+## Features
+- 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
+- 🧠 **Unified Reasoning** — One `thinking` flag (`true`/`false` or a level: `'minimal' | 'low' | 'medium' | 'high'`) mapped to each backend's native control; chain-of-thought surfaced as `response.reasoning` + streaming `thinking` events (with `<think>`-tag parsing as a fallback)
+- 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
+- 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
+- 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
+- 🔬 **Deep Research** — Drive Google Gemini's agentic Deep Research (background interactions with polling + streaming)
+- 📈 **Generation Stats** — `usage.tokensPerSecond` and `durationMs` reported across providers
+- 🔌 **Flexible Transport** — Custom headers, query params, auth header/prefix, and base path for Azure OpenAI and gateways
+- 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
+- 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
+- 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
+- 📊 **Embeddings** — Single and batch embedding generation
+## Supported Providers
+| Provider | Type | Notes |
+|---|---|---|
+| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal, native thinking |
+| **OpenAI + Compat** | `openai` | GPT series, o-series + **any OpenAI-compatible endpoint**: xAI/Grok, Mistral, DeepSeek, Cohere Compatibility, Groq, Together, Fireworks, OpenRouter, Perplexity Sonar, vLLM, LM Studio, TGI, most self-hosted servers |
+| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal, native thinking + grounding |
+| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints, Bearer tokens, service tiers (flex/priority) |
+| **Anthropic (Claude)** | `anthropic` | Claude 3.5/4 models via native Messages API. Excellent tool use, extended thinking with signatures, strong prompt caching |
+| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances (OpenAI-compatible under the hood) |
+**Most of the world** is reachable via `type: 'openai'` + a `url` override. We only maintain dedicated clients for fundamentally different protocols (Anthropic Messages, Google Gemini) that offer unique high-value capabilities, plus Ollama for local developer experience. See `docs/guide/providers.md` and the research survey in `docs/research/provider-api-landscape-2026.md`.
+---
+## Installation
+```bash
+bun add universal-llm-client
+# or
+npm install universal-llm-client
+```
+**Optional**: For MCP integration:
+```bash
+bun add @modelcontextprotocol/sdk
+```
+---
+## Quick Start
+### Basic Chat
+```typescript
+import { AIModel } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'qwen3:4b',
+    providers: [{ type: 'ollama' }],
+});
+const response = await model.chat([
+    { role: 'system', content: 'You are a helpful assistant.' },
+    { role: 'user', content: 'What is the capital of France?' },
+]);
+console.log(response.message.content);
+// "The capital of France is Paris."
+```
+### Streaming
+```typescript
+for await (const event of model.chatStream([
+    { role: 'user', content: 'Write a haiku about code.' },
+])) {
+    if (event.type === 'text') {
+        process.stdout.write(event.content);
+    } else if (event.type === 'thinking') {
+        // Model reasoning (when supported)
+        console.log('[thinking]', event.content);
+    }
+}
+```
+### Thinking & Reasoning
+Set one `thinking` value — `true`/`false` or a level (`'minimal' | 'low' | 'medium' | 'high'`) —
+and it maps to each provider's native control (Gemini `thinkingLevel`/`thinkingBudget`, OpenAI
+`reasoning_effort`, vLLM `enable_thinking`, Anthropic `budget_tokens`, Ollama `think`):
+```typescript
+const model = new AIModel({
+    model: 'gemini-3.5-flash',
+    thinking: 'high', // true | false | 'minimal' | 'low' | 'medium' | 'high'
+    providers: [{ type: 'google', apiKey: process.env.GOOGLE_API_KEY }],
+});
+const res = await model.chat([{ role: 'user', content: 'Solve this step by step: ...' }]);
+console.log(res.message.content); // final answer (clean)
+console.log(res.reasoning);       // chain-of-thought, when the model exposes it
+// Per-call override (e.g. turn thinking off for structured output)
+await model.chat(messages, { thinking: false });
+```
+### Deep Research (Gemini)
+Run Google's agentic Deep Research, which creates a background interaction and polls it to completion:
+```typescript
+const result = await model.deepResearch('Research the history of Google TPUs.', {
+    tools: ['google_search', 'url_context'],
+});
+console.log(result.status, result.report);
+// Or stream intermediate thoughts and steps as they arrive:
+for await (const ev of model.deepResearchStream('Compare RISC-V vs ARM in 2026.')) {
+    if (ev.type === 'thought') console.log('[thinking]', ev.content);
+    else if (ev.type === 'text') process.stdout.write(ev.content);
+}
+```
+### Tool Calling
+```typescript
+model.registerTool(
+    'get_weather',
+    'Get current weather for a location',
+    {
+        type: 'object',
+        properties: {
+            city: { type: 'string', description: 'City name' },
+        },
+        required: ['city'],
+    },
+    async (args) => {
+        const { city } = args as { city: string };
+        return { temperature: 22, condition: 'sunny', city };
+    },
+);
+// Autonomous tool execution — the model calls tools and loops until done
+const response = await model.chatWithTools([
+    { role: 'user', content: "What's the weather in Tokyo?" },
+]);
+console.log(response.message.content);
+// "The weather in Tokyo is 22°C and sunny."
+console.log(response.toolExecutions);
+// [{ tool_call_id: 'call_abc', output: { temperature: 22, condition: 'sunny', city: 'Tokyo' }, duration: 5 }]
+```
+### Provider Failover
+```typescript
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    retries: 2,        // retries per provider before failover
+    timeout: 30000,    // request timeout in ms
+    providers: [
+        { type: 'google', apiKey: process.env.GOOGLE_KEY, priority: 0 },
+        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY, priority: 1 },
+        { type: 'ollama', url: 'http://localhost:11434', priority: 2 },
+    ],
+});
+// If Google returns 500, retries twice, then seamlessly tries OpenRouter.
+// If OpenRouter also fails, falls back to local Ollama.
+// Your code sees a single response.
+const response = await model.chat([{ role: 'user', content: 'Hello' }]);
+// Check provider health at any time
+console.log(model.getProviderStatus());
+// [{ id: 'google-0', healthy: true }, { id: 'openai-1', healthy: true }, ...]
+```
+### Multimodal (Vision)
+```typescript
+import { AIModel, multimodalMessage } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    providers: [{ type: 'google', apiKey: process.env.GOOGLE_KEY }],
+});
+const response = await model.chat([
+    multimodalMessage('What do you see in this image?', [
+        'https://example.com/photo.jpg',
+    ]),
+]);
+```
+### Embeddings
+```typescript
+const embedModel = new AIModel({
+    model: 'nomic-embed-text-v2-moe:latest',
+    providers: [{ type: 'ollama' }],
+});
+const vector = await embedModel.embed('Hello world');
+// [0.006, 0.026, -0.009, ...]
+const vectors = await embedModel.embedArray(['Hello', 'World']);
+// [[0.006, ...], [0.012, ...]]
+```
+### Structured Output
+Get typed, validated JSON responses from any LLM using Zod schemas:
+```typescript
+import { AIModel } from 'universal-llm-client';
+import { z } from 'zod';
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    providers: [
+        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
+        { type: 'ollama' },
+    ],
+});
+// Define your schema
+const UserSchema = z.object({
+    name: z.string(),
+    age: z.number(),
+    email: z.string().email(),
+    interests: z.array(z.string()),
+});
+// Method 1: generateStructured (throws on validation failure)
+const user = await model.generateStructured(UserSchema, [
+    { role: 'user', content: 'Generate a user profile for a software developer' },
+]);
+console.log(user.name);     // TypeScript knows this is string
+console.log(user.age);      // TypeScript knows this is number
+console.log(user.email);    // TypeScript knows this is string
+console.log(user.interests); // TypeScript knows this is string[]
+```
+**Non-throwing variant:**
+```typescript
+// Method 2: tryParseStructured (returns result object, never throws)
+const result = await model.tryParseStructured(UserSchema, messages);
+if (result.ok) {
+    console.log('User:', result.value.name);
+} else {
+    console.log('Error:', result.error.message);
+    console.log('Raw LLM output:', result.rawOutput);
+}
+```
+**Via chat options:**
+```typescript
+// Method 3: chat with output parameter
+const response = await model.chat(messages, {
+    output: { schema: UserSchema },
+});
+// response.structured is typed as { name: string, age: number, ... }
+if (response.structured) {
+    console.log(response.structured.name);
+}
+```
+**Streaming structured output:**
+```typescript
+// Stream partial, validated objects as the JSON is generated
+for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
+    console.log('Partial:', partial);
+    // Partial: { name: 'Alice' }
+    // Partial: { name: 'Alice', age: 30 }
+    // Partial: { name: 'Alice', age: 30, email: 'alice@example.com' }
+}
+```
+**Raw JSON Schema (without Zod):**
+```typescript
+const response = await model.chat(messages, {
+    jsonSchema: {
+        type: 'object',
+        properties: {
+            name: { type: 'string' },
+            age: { type: 'number' },
+        },
+        required: ['name', 'age'],
+    },
+    name: 'Person',  // Optional, used for LLM guidance
+});
+```
+**Separate module import (tree-shaking):**
+```typescript
+// Import only the structured-output helpers and types if you don't need the full client
+import {
+    StructuredOutputError,
+    type StructuredOutputResult,
+    type StructuredOutputOptions,
+    parseStructured,
+    tryParseStructured,
+    zodToJsonSchema,
+} from 'universal-llm-client/structured-output';
+```
+**Vision with structured output:**
+```typescript
+const ImageAnalysisSchema = z.object({
+    objects: z.array(z.string()),
+    scene: z.string(),
+    mood: z.string(),
+});
+const response = await model.generateStructured(ImageAnalysisSchema, [
+    multimodalMessage('Analyze this image', ['https://example.com/photo.jpg']),
+]);
+```
+**Provider compatibility:**
+| Provider | Method | Notes |
+|----------|--------|-------|
+| OpenAI | `response_format.json_schema` | Strict mode enabled |
+| Ollama | `format: { schema }` | Model must support grammar |
+| Google | `responseMimeType + responseSchema` | Unsupported JSON Schema features are stripped |
+### Observability
+```typescript
+import { AIModel, ConsoleAuditor, BufferedAuditor } from 'universal-llm-client';
+// Simple console logging
+const model = new AIModel({
+    model: 'qwen3:4b',
+    providers: [{ type: 'ollama' }],
+    auditor: new ConsoleAuditor('[LLM]'),
+});
+// [LLM] REQUEST [ollama] (qwen3:4b) →
+// [LLM] RESPONSE [ollama] (qwen3:4b) 1200ms 68 tokens
+// Buffered for custom sinks (OpenTelemetry, DB, etc.)
+const auditor = new BufferedAuditor({
+    maxBufferSize: 100,
+    onFlush: async (events) => {
+        await sendToOpenTelemetry(events);
+    },
+});
+```
+### MCP Integration
+```typescript
+import { AIModel, MCPToolBridge } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'qwen3:4b',
+    providers: [{ type: 'ollama' }],
+});
+const mcp = new MCPToolBridge({
+    servers: {
+        filesystem: {
+            command: 'npx',
+            args: ['-y', '@modelcontextprotocol/server-filesystem', './'],
+        },
+        weather: {
+            url: 'https://mcp.example.com/weather',
+        },
+    },
+});
+await mcp.connect();
+await mcp.registerTools(model);
+// MCP tools are now callable via chatWithTools
+const response = await model.chatWithTools([
+    { role: 'user', content: 'List files in the current directory' },
+]);
+await mcp.disconnect();
+```
+### Stream Decoders
+```typescript
+import { AIModel, createDecoder } from 'universal-llm-client';
+// Passthrough — raw text, no parsing
+// Standard Chat — text + native reasoning + tool calls
+// Interleaved Reasoning — parses <think> and <progress> tags from text streams
+const decoder = createDecoder('interleaved-reasoning', (event) => {
+    switch (event.type) {
+        case 'text': console.log(event.content); break;
+        case 'thinking': console.log('[think]', event.content); break;
+        case 'progress': console.log('[progress]', event.content); break;
+        case 'tool_call': console.log('[tool]', event.calls); break;
+    }
+});
+decoder.push('<think>Let me analyze this</think>The answer is 42');
+decoder.flush();
+console.log(decoder.getCleanContent());  // "The answer is 42"
+console.log(decoder.getReasoning());      // "Let me analyze this"
+```
+---
+## API Reference
+### `AIModel`
+The universal client. One class, multiple backends.
+```typescript
+new AIModel(config: AIModelConfig)
+```
+**Config:**
+| Property | Type | Default | Description |
+|---|---|---|---|
+| `model` | `string` | — | Model name (e.g., `'gemini-2.5-flash'`) |
+| `providers` | `ProviderConfig[]` | — | Ordered list of provider backends |
+| `retries` | `number` | `2` | Retries per provider before failover |
+| `timeout` | `number` | `30000` | Request timeout in ms |
+| `auditor` | `Auditor` | `NoopAuditor` | Observability sink |
+| `thinking` | `boolean` | `false` | Enable model thinking/reasoning |
+| `debug` | `boolean` | `false` | Debug logging |
+| `defaultParameters` | `object` | — | Default parameters for all requests |
+**Provider Config:**
+| Property | Type | Description |
+|---|---|---|
+| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'`, `'anthropic'` |
+| `url` | `string` | Provider URL (has sensible defaults) |
+| `apiKey` | `string` | API key or Bearer token |
+| `priority` | `number` | Lower = tried first (defaults to array index) |
+| `model` | `string` | Override model name for this provider |
+| `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
+| `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
+| `headers` | `Record<string,string>` | Extra headers merged into requests — OpenAI-compatible & Ollama (Azure `api-key`, gateways) |
+| `queryParams` | `Record<string,string>` | Query params appended to URLs — OpenAI-compatible only (e.g. Azure `api-version`) |
+| `authHeader` | `string` | Header name for the key — OpenAI-compatible & Ollama (e.g. `'api-key'`) |
+| `authPrefix` | `string` | Prefix before the key value — OpenAI-compatible & Ollama (e.g. `''` for api-key style) |
+| `apiBasePath` | `string` | OpenAI-compatible only: override or disable the `/v1` suffix (use `''` for full Azure deployment URLs) |
+**Methods:**
+| Method | Returns | Description |
+|---|---|---|
+| `chat(messages, options?)` | `Promise<LLMChatResponse>` | Send chat request |
+| `chatWithTools(messages, options?)` | `Promise<LLMChatResponse>` | Chat with autonomous tool execution |
+| `chatStream(messages, options?)` | `AsyncGenerator<DecodedEvent>` | Stream chat response |
+| `generateStructured(schema, messages, options?)` | `Promise<T>` | Generate typed JSON validated against Zod schema |
+| `tryParseStructured(schema, messages, options?)` | `Promise<StructuredOutputResult<T>>` | Non-throwing variant returning result object |
+| `generateStructuredStream(schema, messages, options?)` | `AsyncGenerator<T, T>` | Stream partial, validated objects as the JSON is generated |
+| `embed(text)` | `Promise<number[]>` | Generate single embedding |
+| `embedArray(texts)` | `Promise<number[][]>` | Generate batch embeddings |
+| `registerTool(name, desc, params, handler)` | `void` | Register a callable tool |
+| `registerTools(tools)` | `void` | Register multiple tools |
+| `getModels()` | `Promise<string[]>` | List available models |
+| `getModelInfo()` | `Promise<ModelMetadata>` | Get model metadata |
+| `getProviderStatus()` | `ProviderStatus[]` | Check provider health |
+| `setModel(name)` | `void` | Switch model at runtime |
+| `dispose()` | `Promise<void>` | Clean shutdown |
+### Structured Output
+```typescript
+import { z } from 'zod';
+// Define your schema
+const UserSchema = z.object({
+    name: z.string(),
+    age: z.number(),
+    email: z.string().email(),
+});
+// Generate typed JSON
+const user = await model.generateStructured(UserSchema, messages);
+// TypeScript infers: { name: string; age: number; email: string }
+// Non-throwing variant
+const result = await model.tryParseStructured(UserSchema, messages);
+if (result.ok) {
+    console.log(result.value.name);  // Fully typed
+} else {
+    console.log(result.error.message);
+}
+// Stream partial objects
+for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
+    console.log(partial);  // Partial validated objects
+}
+```
+**Separate module import (tree-shaking):**
+```typescript
+import {
+    StructuredOutputError,
+    type StructuredOutputResult,
+    parseStructured,
+    tryParseStructured,
+    zodToJsonSchema,
+} from 'universal-llm-client/structured-output';
+// Use without importing the full client
+const schema = z.object({ name: z.string() });
+const jsonSchema = zodToJsonSchema(schema);
+```
+### `ToolBuilder` / `ToolExecutor`
+```typescript
+import { ToolBuilder, ToolExecutor } from 'universal-llm-client';
+// Fluent builder
+const tool = new ToolBuilder('search')
+    .description('Search the web')
+    .addParameter('query', 'string', 'Search query', true)
+    .addParameter('limit', 'number', 'Max results', false)
+    .build();
+// Execution wrappers
+const safeHandler = ToolExecutor.compose(
+    myHandler,
+    h => ToolExecutor.withTimeout(h, 5000),
+    h => ToolExecutor.safe(h),
+    h => ToolExecutor.withValidation(h, ['query']),
+);
+```
+### Auditor Interface
+Implement custom observability by providing an `Auditor`:
+```typescript
+interface Auditor {
+    record(event: AuditEvent): void;
+    flush?(): Promise<void>;
+}
+```
+**Built-in implementations:**
+- `NoopAuditor` — Zero overhead (default)
+- `ConsoleAuditor` — Structured console logging
+- `BufferedAuditor` — Collects events for custom sinks
+---
+## Architecture
+```
+universal-llm-client
+├── AIModel          ← Public API (the primary class you import)
+├── Router           ← Internal failover engine
+├── BaseLLMClient    ← Abstract client with tool execution
+├── Providers
+│   ├── OllamaClient
+│   ├── OpenAICompatibleClient  (OpenAI, OpenRouter, Groq, LM Studio, vLLM, LlamaCpp)
+│   └── GoogleClient            (AI Studio + Vertex AI)
+├── StreamDecoder    ← Pluggable reasoning strategies
+├── Auditor          ← Observability interface
+├── MCPToolBridge    ← MCP server integration
+└── HTTP Utilities   ← Universal fetch-based transport
+```
+### Design Principles
+1. **Single import** — `AIModel` is the only class users need
+2. **Provider agnostic** — Same code works with any backend
+3. **Transparent failover** — Health tracking and cooldowns happen behind the scenes
+4. **Zero dependencies** — Core library depends only on native `fetch`
+5. **Agent-ready** — Stateless, composable instances designed as foundation for agent frameworks
+6. **Observable** — Every request, response, tool call, retry, and failover is auditable
+---
+## Runtime Support
+| Runtime | Version | Status |
+|---|---|---|
+| **Node.js** | 22+ | ✅ Full support |
+| **Bun** | 1.0+ | ✅ Full support |
+| **Deno** | 2.0+ | ✅ Full support |
+| **Browsers** | Modern | ✅ Supported (MCP over HTTP transport only; no stdio) |
+---
+## For Agent Framework Authors
+`AIModel` is designed as the transport layer for agentic systems:
+- **Stateless** — No conversation history is stored; your framework manages memory
+- **Composable** — Create separate instances for chat, embeddings, vision
+- **Tool tracing** — `chatWithTools()` returns full execution trace
+- **Context budget** — `getModelInfo()` exposes `contextLength`
+- **Auditor as system bus** — Inject custom sinks for cost tracking, behavioral scoring
+- **StreamDecoder as UI bridge** — Select decoder strategy per-call
+---
+## License
+MIT