npm - universal-llm-client - Versions diffs - 4.1.0 → 4.3.0 - Mend

universal-llm-client 4.1.0 → 4.3.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103)

package/CHANGELOG.md +139 -103
package/LICENSE +21 -21
package/README.md +591 -591
package/dist/ai-model.js.map +1 -1
package/dist/auditor.js.map +1 -1
package/dist/client.js.map +1 -1
package/dist/http.js.map +1 -1
package/dist/index.d.ts +1 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +1 -1
package/dist/index.js.map +1 -1
package/dist/interfaces.d.ts +20 -0
package/dist/interfaces.d.ts.map +1 -1
package/dist/interfaces.js.map +1 -1
package/dist/mcp.js.map +1 -1
package/dist/providers/anthropic.js.map +1 -1
package/dist/providers/google.d.ts.map +1 -1
package/dist/providers/google.js +2 -0
package/dist/providers/google.js.map +1 -1
package/dist/providers/index.js.map +1 -1
package/dist/providers/ollama.js.map +1 -1
package/dist/providers/openai.js.map +1 -1
package/dist/router.js.map +1 -1
package/dist/stream-decoder.js.map +1 -1
package/dist/structured-output.d.ts +24 -1
package/dist/structured-output.d.ts.map +1 -1
package/dist/structured-output.js +58 -5
package/dist/structured-output.js.map +1 -1
package/dist/tools.js.map +1 -1
package/dist/zod-adapter.js.map +1 -1
package/package.json +115 -116
package/src/ai-model.ts +0 -350
package/src/auditor.ts +0 -213
package/src/client.ts +0 -402
package/src/debug/debug-google-streaming.ts +0 -97
package/src/debug/debug-tool-execution.ts +0 -86
package/src/debug/test-lmstudio-tools.ts +0 -155
package/src/demos/README.md +0 -47
package/src/demos/basic/universal-llm-examples.ts +0 -161
package/src/demos/mcp/astrid-memory-demo.ts +0 -295
package/src/demos/mcp/astrid-persona-memory.ts +0 -357
package/src/demos/mcp/mcp-mongodb-demo.ts +0 -275
package/src/demos/mcp/simple-astrid-memory.ts +0 -148
package/src/demos/mcp/simple-mcp-demo.ts +0 -68
package/src/demos/mcp/working-mcp-demo.ts +0 -62
package/src/demos/model-alias-demo.ts +0 -0
package/src/demos/tools/RAG_MEMORY_INTEGRATION.md +0 -267
package/src/demos/tools/astrid-memory-demo.ts +0 -270
package/src/demos/tools/astrid-production-memory-clean.ts +0 -785
package/src/demos/tools/astrid-production-memory.ts +0 -558
package/src/demos/tools/basic-translation-test.ts +0 -66
package/src/demos/tools/chromadb-similarity-tuning.ts +0 -390
package/src/demos/tools/clean-multilingual-conversation.ts +0 -209
package/src/demos/tools/clean-translation-test.ts +0 -119
package/src/demos/tools/clean-universal-multilingual-test.ts +0 -131
package/src/demos/tools/complete-rag-demo.ts +0 -369
package/src/demos/tools/complete-tool-demo.ts +0 -132
package/src/demos/tools/demo-tool-calling.ts +0 -124
package/src/demos/tools/dynamic-language-switching-test.ts +0 -251
package/src/demos/tools/hybrid-thinking-test.ts +0 -154
package/src/demos/tools/memory-integration-test.ts +0 -420
package/src/demos/tools/multilingual-memory-system.ts +0 -802
package/src/demos/tools/ondemand-translation-demo.ts +0 -655
package/src/demos/tools/production-tool-demo.ts +0 -245
package/src/demos/tools/revolutionary-multilingual-test.ts +0 -151
package/src/demos/tools/rigorous-language-analysis.ts +0 -218
package/src/demos/tools/test-universal-memory-system.ts +0 -126
package/src/demos/tools/translation-integration-guide.ts +0 -346
package/src/demos/tools/universal-memory-system.ts +0 -560
package/src/http.ts +0 -247
package/src/index.ts +0 -160
package/src/interfaces.ts +0 -657
package/src/mcp.ts +0 -345
package/src/providers/anthropic.ts +0 -762
package/src/providers/google.ts +0 -620
package/src/providers/index.ts +0 -8
package/src/providers/ollama.ts +0 -469
package/src/providers/openai.ts +0 -392
package/src/router.ts +0 -780
package/src/stream-decoder.ts +0 -361
package/src/structured-output.ts +0 -702
package/src/test-scripts/test-advanced-tools.ts +0 -310
package/src/test-scripts/test-google-streaming-enhanced.ts +0 -147
package/src/test-scripts/test-google-streaming.ts +0 -63
package/src/test-scripts/test-google-system-prompt-comprehensive.ts +0 -189
package/src/test-scripts/test-mcp-config.ts +0 -28
package/src/test-scripts/test-mcp-connection.ts +0 -29
package/src/test-scripts/test-system-message-positions.ts +0 -163
package/src/test-scripts/test-system-prompt-improvement-demo.ts +0 -83
package/src/test-scripts/test-tool-calling.ts +0 -231
package/src/tests/ai-model.test.ts +0 -1614
package/src/tests/auditor.test.ts +0 -224
package/src/tests/http.test.ts +0 -200
package/src/tests/interfaces.test.ts +0 -117
package/src/tests/providers/google.test.ts +0 -660
package/src/tests/providers/ollama.test.ts +0 -954
package/src/tests/providers/openai.test.ts +0 -1122
package/src/tests/router.test.ts +0 -254
package/src/tests/stream-decoder.test.ts +0 -179
package/src/tests/structured-output.test.ts +0 -1340
package/src/tests/tools.test.ts +0 -175
package/src/tools.ts +0 -246
package/src/zod-adapter.ts +0 -72

package/README.md (changed)

@@ -1,591 +1,591 @@
-# universal-llm-client
-A universal LLM client for JavaScript/TypeScript with **transparent provider failover**, streaming tool execution, pluggable reasoning strategies, and native observability.
-```typescript
-import { AIModel } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    providers: [
-        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
-        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
-        { type: 'ollama' },
-    ],
-});
-const response = await model.chat([
-    { role: 'user', content: 'Hello!' },
-]);
-```
-> **One model, multiple backends.** If Google fails, it transparently fails over to OpenRouter, then to local Ollama. Your code never knows the difference.
----
-## Features
-- 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
-- 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
-- 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
-- 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
-- 🧠 **Reasoning** — Native `<think>` tag parsing, interleaved reasoning, and model thinking support
-- 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
-- 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
-- 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
-- 📊 **Embeddings** — Single and batch embedding generation
-## Supported Providers
-| Provider | Type | Notes |
-|---|---|---|
-| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
-| **OpenAI** | `openai` | GPT-4o, o3, etc. Also works with OpenRouter, Groq, LM Studio, vLLM |
-| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
-| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints and Bearer tokens |
-| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances |
----
-## Installation
-```bash
-bun add universal-llm-client
-# or
-npm install universal-llm-client
-```
-**Optional**: For MCP integration:
-```bash
-bun add @modelcontextprotocol/sdk
-```
----
-## Quick Start
-### Basic Chat
-```typescript
-import { AIModel } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'qwen3:4b',
-    providers: [{ type: 'ollama' }],
-});
-const response = await model.chat([
-    { role: 'system', content: 'You are a helpful assistant.' },
-    { role: 'user', content: 'What is the capital of France?' },
-]);
-console.log(response.message.content);
-// "The capital of France is Paris."
-```
-### Streaming
-```typescript
-for await (const event of model.chatStream([
-    { role: 'user', content: 'Write a haiku about code.' },
-])) {
-    if (event.type === 'text') {
-        process.stdout.write(event.content);
-    } else if (event.type === 'thinking') {
-        // Model reasoning (when supported)
-        console.log('[thinking]', event.content);
-    }
-}
-```
-### Tool Calling
-```typescript
-model.registerTool(
-    'get_weather',
-    'Get current weather for a location',
-    {
-        type: 'object',
-        properties: {
-            city: { type: 'string', description: 'City name' },
-        },
-        required: ['city'],
-    },
-    async (args) => {
-        const { city } = args as { city: string };
-        return { temperature: 22, condition: 'sunny', city };
-    },
-);
-// Autonomous tool execution — the model calls tools and loops until done
-const response = await model.chatWithTools([
-    { role: 'user', content: "What's the weather in Tokyo?" },
-]);
-console.log(response.message.content);
-// "The weather in Tokyo is 22°C and sunny."
-console.log(response.toolTrace);
-// [{ name: 'get_weather', args: { city: 'Tokyo' }, result: {...}, duration: 5 }]
-```
-### Provider Failover
-```typescript
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    retries: 2,        // retries per provider before failover
-    timeout: 30000,    // request timeout in ms
-    providers: [
-        { type: 'google', apiKey: process.env.GOOGLE_KEY, priority: 0 },
-        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY, priority: 1 },
-        { type: 'ollama', url: 'http://localhost:11434', priority: 2 },
-    ],
-});
-// If Google returns 500, retries twice, then seamlessly tries OpenRouter.
-// If OpenRouter also fails, falls back to local Ollama.
-// Your code sees a single response.
-const response = await model.chat([{ role: 'user', content: 'Hello' }]);
-// Check provider health at any time
-console.log(model.getProviderStatus());
-// [{ id: 'google-0', healthy: true }, { id: 'openai-1', healthy: true }, ...]
-```
-### Multimodal (Vision)
-```typescript
-import { AIModel, multimodalMessage } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    providers: [{ type: 'google', apiKey: process.env.GOOGLE_KEY }],
-});
-const response = await model.chat([
-    multimodalMessage('What do you see in this image?', [
-        'https://example.com/photo.jpg',
-    ]),
-]);
-```
-### Embeddings
-```typescript
-const embedModel = new AIModel({
-    model: 'nomic-embed-text-v2-moe:latest',
-    providers: [{ type: 'ollama' }],
-});
-const vector = await embedModel.embed('Hello world');
-// [0.006, 0.026, -0.009, ...]
-const vectors = await embedModel.embedArray(['Hello', 'World']);
-// [[0.006, ...], [0.012, ...]]
-```
-### Structured Output
-Get typed, validated JSON responses from any LLM using Zod schemas:
-```typescript
-import { AIModel } from 'universal-llm-client';
-import { z } from 'zod';
-const model = new AIModel({
-    model: 'gemini-2.5-flash',
-    providers: [
-        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
-        { type: 'ollama' },
-    ],
-});
-// Define your schema
-const UserSchema = z.object({
-    name: z.string(),
-    age: z.number(),
-    email: z.string().email(),
-    interests: z.array(z.string()),
-});
-// Method 1: generateStructured (throws on validation failure)
-const user = await model.generateStructured(UserSchema, [
-    { role: 'user', content: 'Generate a user profile for a software developer' },
-]);
-console.log(user.name);     // TypeScript knows this is string
-console.log(user.age);      // TypeScript knows this is number
-console.log(user.email);    // TypeScript knows this is string
-console.log(user.interests); // TypeScript knows this is string[]
-```
-**Non-throwing variant:**
-```typescript
-// Method 2: tryParseStructured (returns result object, never throws)
-const result = await model.tryParseStructured(UserSchema, messages);
-if (result.ok) {
-    console.log('User:', result.value.name);
-} else {
-    console.log('Error:', result.error.message);
-    console.log('Raw LLM output:', result.rawOutput);
-}
-```
-**Via chat options:**
-```typescript
-// Method 3: chat with output parameter
-const response = await model.chat(messages, {
-    output: { schema: UserSchema },
-});
-// response.structured is typed as { name: string, age: number, ... }
-if (response.structured) {
-    console.log(response.structured.name);
-}
-```
-**Streaming structured output:**
-```typescript
-// Stream partial validated objects as JSON generates
-for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
-    console.log('Partial:', partial);
-    // Partial: { name: 'Alice' }
-    // Partial: { name: 'Alice', age: 30 }
-    // Partial: { name: 'Alice', age: 30, email: 'alice@example.com' }
-}
-```
-**Raw JSON Schema (without Zod):**
-```typescript
-const response = await model.chat(messages, {
-    jsonSchema: {
-        type: 'object',
-        properties: {
-            name: { type: 'string' },
-            age: { type: 'number' },
-        },
-        required: ['name', 'age'],
-    },
-    name: 'Person',  // Optional, used for LLM guidance
-});
-```
-**Separate module import (tree-shaking):**
-```typescript
-// Import only structured output types if you don't need the full client
-import {
-    StructuredOutputError,
-    type StructuredOutputResult,
-    type StructuredOutputOptions,
-    parseStructured,
-    tryParseStructured,
-    zodToJsonSchema,
-} from 'universal-llm-client/structured-output';
-```
-**Vision with structured output:**
-```typescript
-const ImageAnalysisSchema = z.object({
-    objects: z.array(z.string()),
-    scene: z.string(),
-    mood: z.string(),
-});
-const response = await model.generateStructured(ImageAnalysisSchema, [
-    multimodalMessage('Analyze this image', ['https://example.com/photo.jpg']),
-]);
-```
-**Provider compatibility:**
-| Provider | Method | Notes |
-|----------|--------|-------|
-| OpenAI | `response_format.json_schema` | Strict mode enabled |
-| Ollama | `format: { schema }` | Model must support grammar |
-| Google | `responseMimeType + responseSchema` | Some features stripped |
-### Observability
-```typescript
-import { AIModel, ConsoleAuditor, BufferedAuditor } from 'universal-llm-client';
-// Simple console logging
-const model = new AIModel({
-    model: 'qwen3:4b',
-    providers: [{ type: 'ollama' }],
-    auditor: new ConsoleAuditor('[LLM]'),
-});
-// [LLM] REQUEST [ollama] (qwen3:4b) →
-// [LLM] RESPONSE [ollama] (qwen3:4b) 1200ms 68 tokens
-// Buffered for custom sinks (OpenTelemetry, DB, etc.)
-const auditor = new BufferedAuditor({
-    maxBufferSize: 100,
-    onFlush: async (events) => {
-        await sendToOpenTelemetry(events);
-    },
-});
-```
-### MCP Integration
-```typescript
-import { AIModel, MCPToolBridge } from 'universal-llm-client';
-const model = new AIModel({
-    model: 'qwen3:4b',
-    providers: [{ type: 'ollama' }],
-});
-const mcp = new MCPToolBridge({
-    servers: {
-        filesystem: {
-            command: 'npx',
-            args: ['-y', '@modelcontextprotocol/server-filesystem', './'],
-        },
-        weather: {
-            url: 'https://mcp.example.com/weather',
-        },
-    },
-});
-await mcp.connect();
-await mcp.registerTools(model);
-// MCP tools are now callable via chatWithTools
-const response = await model.chatWithTools([
-    { role: 'user', content: 'List files in the current directory' },
-]);
-await mcp.disconnect();
-```
-### Stream Decoders
-```typescript
-import { AIModel, createDecoder } from 'universal-llm-client';
-// Passthrough — raw text, no parsing
-// Standard Chat — text + native reasoning + tool calls
-// Interleaved Reasoning — parses <think> and <progress> tags from text streams
-const decoder = createDecoder('interleaved-reasoning', (event) => {
-    switch (event.type) {
-        case 'text': console.log(event.content); break;
-        case 'thinking': console.log('[think]', event.content); break;
-        case 'progress': console.log('[progress]', event.content); break;
-        case 'tool_call': console.log('[tool]', event.calls); break;
-    }
-});
-decoder.push('<think>Let me analyze this</think>The answer is 42');
-decoder.flush();
-console.log(decoder.getCleanContent());  // "The answer is 42"
-console.log(decoder.getReasoning());      // "Let me analyze this"
-```
----
-## API Reference
-### `AIModel`
-The universal client. One class, multiple backends.
-```typescript
-new AIModel(config: AIModelConfig)
-```
-**Config:**
-| Property | Type | Default | Description |
-|---|---|---|---|
-| `model` | `string` | — | Model name (e.g., `'gemini-2.5-flash'`) |
-| `providers` | `ProviderConfig[]` | — | Ordered list of provider backends |
-| `retries` | `number` | `2` | Retries per provider before failover |
-| `timeout` | `number` | `30000` | Request timeout in ms |
-| `auditor` | `Auditor` | `NoopAuditor` | Observability sink |
-| `thinking` | `boolean` | `false` | Enable model thinking/reasoning |
-| `debug` | `boolean` | `false` | Debug logging |
-| `defaultParameters` | `object` | — | Default parameters for all requests |
-**Provider Config:**
-| Property | Type | Description |
-|---|---|---|
-| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
-| `url` | `string` | Provider URL (has sensible defaults) |
-| `apiKey` | `string` | API key or Bearer token |
-| `priority` | `number` | Lower = tried first (defaults to array index) |
-| `model` | `string` | Override model name for this provider |
-| `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
-| `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
-**Methods:**
-| Method | Returns | Description |
-|---|---|---|
-| `chat(messages, options?)` | `Promise<LLMChatResponse>` | Send chat request |
-| `chatWithTools(messages, options?)` | `Promise<LLMChatResponse>` | Chat with autonomous tool execution |
-| `chatStream(messages, options?)` | `AsyncGenerator<DecodedEvent>` | Stream chat response |
-| `generateStructured(schema, messages, options?)` | `Promise<T>` | Generate typed JSON validated against Zod schema |
-| `tryParseStructured(schema, messages, options?)` | `Promise<StructuredOutputResult<T>>` | Non-throwing variant returning result object |
-| `generateStructuredStream(schema, messages, options?)` | `AsyncGenerator<T, T>` | Stream partial validated objects as JSON generates |
-| `embed(text)` | `Promise<number[]>` | Generate single embedding |
-| `embedArray(texts)` | `Promise<number[][]>` | Generate batch embeddings |
-| `registerTool(name, desc, params, handler)` | `void` | Register a callable tool |
-| `registerTools(tools)` | `void` | Register multiple tools |
-| `getModels()` | `Promise<string[]>` | List available models |
-| `getModelInfo()` | `Promise<ModelMetadata>` | Get model metadata |
-| `getProviderStatus()` | `ProviderStatus[]` | Check provider health |
-| `setModel(name)` | `void` | Switch model at runtime |
-| `dispose()` | `Promise<void>` | Clean shutdown |
-### Structured Output
-```typescript
-import { z } from 'zod';
-// Define your schema
-const UserSchema = z.object({
-    name: z.string(),
-    age: z.number(),
-    email: z.string().email(),
-});
-// Generate typed JSON
-const user = await model.generateStructured(UserSchema, messages);
-// TypeScript infers: { name: string; age: number; email: string }
-// Non-throwing variant
-const result = await model.tryParseStructured(UserSchema, messages);
-if (result.ok) {
-    console.log(result.value.name);  // Fully typed
-} else {
-    console.log(result.error.message);
-}
-// Stream partial objects
-for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
-    console.log(partial);  // Partial validated objects
-}
-```
-**Separate module import (tree-shaking):**
-```typescript
-import {
-    StructuredOutputError,
-    type StructuredOutputResult,
-    parseStructured,
-    tryParseStructured,
-    zodToJsonSchema,
-} from 'universal-llm-client/structured-output';
-// Use without importing the full client
-const schema = z.object({ name: z.string() });
-const jsonSchema = zodToJsonSchema(schema);
-```
-### `ToolBuilder` / `ToolExecutor`
-```typescript
-import { ToolBuilder, ToolExecutor } from 'universal-llm-client';
-// Fluent builder
-const tool = new ToolBuilder('search')
-    .description('Search the web')
-    .addParameter('query', 'string', 'Search query', true)
-    .addParameter('limit', 'number', 'Max results', false)
-    .build();
-// Execution wrappers
-const safeHandler = ToolExecutor.compose(
-    myHandler,
-    h => ToolExecutor.withTimeout(h, 5000),
-    h => ToolExecutor.safe(h),
-    h => ToolExecutor.withValidation(h, ['query']),
-);
-```
-### Auditor Interface
-Implement custom observability by providing an `Auditor`:
-```typescript
-interface Auditor {
-    record(event: AuditEvent): void;
-    flush?(): Promise<void>;
-}
-```
-**Built-in implementations:**
-- `NoopAuditor` — Zero overhead (default)
-- `ConsoleAuditor` — Structured console logging
-- `BufferedAuditor` — Collects events for custom sinks
----
-## Architecture
-```
-universal-llm-client
-├── AIModel          ← Public API (the only class you import)
-├── Router           ← Internal failover engine
-├── BaseLLMClient    ← Abstract client with tool execution
-├── Providers
-│   ├── OllamaClient
-│   ├── OpenAICompatibleClient  (OpenAI, OpenRouter, Groq, LM Studio, vLLM, LlamaCpp)
-│   └── GoogleClient            (AI Studio + Vertex AI)
-├── StreamDecoder    ← Pluggable reasoning strategies
-├── Auditor          ← Observability interface
-├── MCPToolBridge    ← MCP server integration
-└── HTTP Utilities   ← Universal fetch-based transport
-```
-### Design Principles
-1. **Single import** — `AIModel` is the only class users need
-2. **Provider agnostic** — Same code works with any backend
-3. **Transparent failover** — Health tracking and cooldowns happen behind the scenes
-4. **Zero dependencies** — Core library depends only on native `fetch`
-5. **Agent-ready** — Stateless, composable instances designed as foundation for agent frameworks
-6. **Observable** — Every request, response, tool call, retry, and failover is auditable
----
-## Runtime Support
-| Runtime | Version | Status |
-|---|---|---|
-| **Node.js** | 22+ | ✅ Full support |
-| **Bun** | 1.0+ | ✅ Full support |
-| **Deno** | 2.0+ | ✅ Full support |
-| **Browsers** | Modern | ✅ No stdio MCP, HTTP transport only |
----
-## For Agent Framework Authors
-`AIModel` is designed as the transport layer for agentic systems:
-- **Stateless** — No conversation history stored. Your framework manages memory
-- **Composable** — Create separate instances for chat, embeddings, vision
-- **Tool tracing** — `chatWithTools()` returns full execution trace
-- **Context budget** — `getModelInfo()` exposes `contextLength`
-- **Auditor as system bus** — Inject custom sinks for cost tracking, behavioral scoring
-- **StreamDecoder as UI bridge** — Select decoder strategy per-call
----
-## License
-MIT
+# universal-llm-client
+A universal LLM client for JavaScript/TypeScript with **transparent provider failover**, streaming tool execution, pluggable reasoning strategies, and native observability.
+```typescript
+import { AIModel } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    providers: [
+        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
+        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
+        { type: 'ollama' },
+    ],
+});
+const response = await model.chat([
+    { role: 'user', content: 'Hello!' },
+]);
+```
+> **One model, multiple backends.** If Google fails, it transparently fails over to OpenRouter, then to local Ollama. Your code never knows the difference.
+---
+## Features
+- 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
+- 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
+- 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
+- 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
+- 🧠 **Reasoning** — Native `<think>` tag parsing, interleaved reasoning, and model thinking support
+- 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
+- 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
+- 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
+- 📊 **Embeddings** — Single and batch embedding generation
+## Supported Providers
+| Provider | Type | Notes |
+|---|---|---|
+| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
+| **OpenAI** | `openai` | GPT-4o, o3, etc. Also works with OpenRouter, Groq, LM Studio, vLLM |
+| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
+| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints and Bearer tokens |
+| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances |
+---
+## Installation
+```bash
+bun add universal-llm-client
+# or
+npm install universal-llm-client
+```
+**Optional**: For MCP integration:
+```bash
+bun add @modelcontextprotocol/sdk
+```
+---
+## Quick Start
+### Basic Chat
+```typescript
+import { AIModel } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'qwen3:4b',
+    providers: [{ type: 'ollama' }],
+});
+const response = await model.chat([
+    { role: 'system', content: 'You are a helpful assistant.' },
+    { role: 'user', content: 'What is the capital of France?' },
+]);
+console.log(response.message.content);
+// "The capital of France is Paris."
+```
+### Streaming
+```typescript
+for await (const event of model.chatStream([
+    { role: 'user', content: 'Write a haiku about code.' },
+])) {
+    if (event.type === 'text') {
+        process.stdout.write(event.content);
+    } else if (event.type === 'thinking') {
+        // Model reasoning (when supported)
+        console.log('[thinking]', event.content);
+    }
+}
+```
+### Tool Calling
+```typescript
+model.registerTool(
+    'get_weather',
+    'Get current weather for a location',
+    {
+        type: 'object',
+        properties: {
+            city: { type: 'string', description: 'City name' },
+        },
+        required: ['city'],
+    },
+    async (args) => {
+        const { city } = args as { city: string };
+        return { temperature: 22, condition: 'sunny', city };
+    },
+);
+// Autonomous tool execution — the model calls tools and loops until done
+const response = await model.chatWithTools([
+    { role: 'user', content: "What's the weather in Tokyo?" },
+]);
+console.log(response.message.content);
+// "The weather in Tokyo is 22°C and sunny."
+console.log(response.toolTrace);
+// [{ name: 'get_weather', args: { city: 'Tokyo' }, result: {...}, duration: 5 }]
+```
+### Provider Failover
+```typescript
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    retries: 2,        // retries per provider before failover
+    timeout: 30000,    // request timeout in ms
+    providers: [
+        { type: 'google', apiKey: process.env.GOOGLE_KEY, priority: 0 },
+        { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY, priority: 1 },
+        { type: 'ollama', url: 'http://localhost:11434', priority: 2 },
+    ],
+});
+// If Google returns 500, retries twice, then seamlessly tries OpenRouter.
+// If OpenRouter also fails, falls back to local Ollama.
+// Your code sees a single response.
+const response = await model.chat([{ role: 'user', content: 'Hello' }]);
+// Check provider health at any time
+console.log(model.getProviderStatus());
+// [{ id: 'google-0', healthy: true }, { id: 'openai-1', healthy: true }, ...]
+```
+### Multimodal (Vision)
+```typescript
+import { AIModel, multimodalMessage } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    providers: [{ type: 'google', apiKey: process.env.GOOGLE_KEY }],
+});
+const response = await model.chat([
+    multimodalMessage('What do you see in this image?', [
+        'https://example.com/photo.jpg',
+    ]),
+]);
+```
+### Embeddings
+```typescript
+const embedModel = new AIModel({
+    model: 'nomic-embed-text-v2-moe:latest',
+    providers: [{ type: 'ollama' }],
+});
+const vector = await embedModel.embed('Hello world');
+// [0.006, 0.026, -0.009, ...]
+const vectors = await embedModel.embedArray(['Hello', 'World']);
+// [[0.006, ...], [0.012, ...]]
+```
+### Structured Output
+Get typed, validated JSON responses from any LLM using Zod schemas:
+```typescript
+import { AIModel } from 'universal-llm-client';
+import { z } from 'zod';
+const model = new AIModel({
+    model: 'gemini-2.5-flash',
+    providers: [
+        { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
+        { type: 'ollama' },
+    ],
+});
+// Define your schema
+const UserSchema = z.object({
+    name: z.string(),
+    age: z.number(),
+    email: z.string().email(),
+    interests: z.array(z.string()),
+});
+// Method 1: generateStructured (throws on validation failure)
+const user = await model.generateStructured(UserSchema, [
+    { role: 'user', content: 'Generate a user profile for a software developer' },
+]);
+console.log(user.name);     // TypeScript knows this is string
+console.log(user.age);      // TypeScript knows this is number
+console.log(user.email);    // TypeScript knows this is string
+console.log(user.interests); // TypeScript knows this is string[]
+```
+**Non-throwing variant:**
+```typescript
+// Method 2: tryParseStructured (returns result object, never throws)
+const result = await model.tryParseStructured(UserSchema, messages);
+if (result.ok) {
+    console.log('User:', result.value.name);
+} else {
+    console.log('Error:', result.error.message);
+    console.log('Raw LLM output:', result.rawOutput);
+}
+```
+**Via chat options:**
+```typescript
+// Method 3: chat with output parameter
+const response = await model.chat(messages, {
+    output: { schema: UserSchema },
+});
+// response.structured is typed as { name: string, age: number, ... }
+if (response.structured) {
+    console.log(response.structured.name);
+}
+```
+**Streaming structured output:**
+```typescript
+// Stream partial validated objects as JSON generates
+for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
+    console.log('Partial:', partial);
+    // Partial: { name: 'Alice' }
+    // Partial: { name: 'Alice', age: 30 }
+    // Partial: { name: 'Alice', age: 30, email: 'alice@example.com' }
+}
+```
+**Raw JSON Schema (without Zod):**
+```typescript
+const response = await model.chat(messages, {
+    jsonSchema: {
+        type: 'object',
+        properties: {
+            name: { type: 'string' },
+            age: { type: 'number' },
+        },
+        required: ['name', 'age'],
+    },
+    name: 'Person',  // Optional, used for LLM guidance
+});
+```
+**Separate module import (tree-shaking):**
+```typescript
+// Import only structured output types if you don't need the full client
+import {
+    StructuredOutputError,
+    type StructuredOutputResult,
+    type StructuredOutputOptions,
+    parseStructured,
+    tryParseStructured,
+    zodToJsonSchema,
+} from 'universal-llm-client/structured-output';
+```
+**Vision with structured output:**
+```typescript
+const ImageAnalysisSchema = z.object({
+    objects: z.array(z.string()),
+    scene: z.string(),
+    mood: z.string(),
+});
+const response = await model.generateStructured(ImageAnalysisSchema, [
+    multimodalMessage('Analyze this image', ['https://example.com/photo.jpg']),
+]);
+```
+**Provider compatibility:**
+| Provider | Method | Notes |
+|----------|--------|-------|
+| OpenAI | `response_format.json_schema` | Strict mode enabled |
+| Ollama | `format: { schema }` | Model must support grammar |
+| Google | `responseMimeType + responseSchema` | Some schema features are stripped |
+### Observability
+```typescript
+import { AIModel, ConsoleAuditor, BufferedAuditor } from 'universal-llm-client';
+// Simple console logging
+const model = new AIModel({
+    model: 'qwen3:4b',
+    providers: [{ type: 'ollama' }],
+    auditor: new ConsoleAuditor('[LLM]'),
+});
+// [LLM] REQUEST [ollama] (qwen3:4b) →
+// [LLM] RESPONSE [ollama] (qwen3:4b) 1200ms 68 tokens
+// Buffered for custom sinks (OpenTelemetry, DB, etc.)
+const auditor = new BufferedAuditor({
+    maxBufferSize: 100,
+    onFlush: async (events) => {
+        await sendToOpenTelemetry(events);
+    },
+});
+```
+### MCP Integration
+```typescript
+import { AIModel, MCPToolBridge } from 'universal-llm-client';
+const model = new AIModel({
+    model: 'qwen3:4b',
+    providers: [{ type: 'ollama' }],
+});
+const mcp = new MCPToolBridge({
+    servers: {
+        filesystem: {
+            command: 'npx',
+            args: ['-y', '@modelcontextprotocol/server-filesystem', './'],
+        },
+        weather: {
+            url: 'https://mcp.example.com/weather',
+        },
+    },
+});
+await mcp.connect();
+await mcp.registerTools(model);
+// MCP tools are now callable via chatWithTools
+const response = await model.chatWithTools([
+    { role: 'user', content: 'List files in the current directory' },
+]);
+await mcp.disconnect();
+```
+### Stream Decoders
+```typescript
+import { AIModel, createDecoder } from 'universal-llm-client';
+// Passthrough — raw text, no parsing
+// Standard Chat — text + native reasoning + tool calls
+// Interleaved Reasoning — parses <think> and <progress> tags from text streams
+const decoder = createDecoder('interleaved-reasoning', (event) => {
+    switch (event.type) {
+        case 'text': console.log(event.content); break;
+        case 'thinking': console.log('[think]', event.content); break;
+        case 'progress': console.log('[progress]', event.content); break;
+        case 'tool_call': console.log('[tool]', event.calls); break;
+    }
+});
+decoder.push('<think>Let me analyze this</think>The answer is 42');
+decoder.flush();
+console.log(decoder.getCleanContent());  // "The answer is 42"
+console.log(decoder.getReasoning());      // "Let me analyze this"
+```
+---
+## API Reference
+### `AIModel`
+The universal client. One class, multiple backends.
+```typescript
+new AIModel(config: AIModelConfig)
+```
+**Config:**
+| Property | Type | Default | Description |
+|---|---|---|---|
+| `model` | `string` | — | Model name (e.g., `'gemini-2.5-flash'`) |
+| `providers` | `ProviderConfig[]` | — | Ordered list of provider backends |
+| `retries` | `number` | `2` | Retries per provider before failover |
+| `timeout` | `number` | `30000` | Request timeout in ms |
+| `auditor` | `Auditor` | `NoopAuditor` | Observability sink |
+| `thinking` | `boolean` | `false` | Enable model thinking/reasoning |
+| `debug` | `boolean` | `false` | Debug logging |
+| `defaultParameters` | `object` | — | Default parameters for all requests |
+**Provider Config:**
+| Property | Type | Description |
+|---|---|---|
+| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
+| `url` | `string` | Provider URL (has sensible defaults) |
+| `apiKey` | `string` | API key or Bearer token |
+| `priority` | `number` | Lower = tried first (defaults to array index) |
+| `model` | `string` | Override model name for this provider |
+| `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
+| `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
+**Methods:**
+| Method | Returns | Description |
+|---|---|---|
+| `chat(messages, options?)` | `Promise<LLMChatResponse>` | Send chat request |
+| `chatWithTools(messages, options?)` | `Promise<LLMChatResponse>` | Chat with autonomous tool execution |
+| `chatStream(messages, options?)` | `AsyncGenerator<DecodedEvent>` | Stream chat response |
+| `generateStructured(schema, messages, options?)` | `Promise<T>` | Generate typed JSON validated against Zod schema |
+| `tryParseStructured(schema, messages, options?)` | `Promise<StructuredOutputResult<T>>` | Non-throwing variant returning result object |
+| `generateStructuredStream(schema, messages, options?)` | `AsyncGenerator<T, T>` | Stream validated partial objects as the JSON is generated |
+| `embed(text)` | `Promise<number[]>` | Generate single embedding |
+| `embedArray(texts)` | `Promise<number[][]>` | Generate batch embeddings |
+| `registerTool(name, desc, params, handler)` | `void` | Register a callable tool |
+| `registerTools(tools)` | `void` | Register multiple tools |
+| `getModels()` | `Promise<string[]>` | List available models |
+| `getModelInfo()` | `Promise<ModelMetadata>` | Get model metadata |
+| `getProviderStatus()` | `ProviderStatus[]` | Check provider health |
+| `setModel(name)` | `void` | Switch model at runtime |
+| `dispose()` | `Promise<void>` | Clean shutdown |
+### Structured Output
+```typescript
+import { z } from 'zod';
+// Define your schema
+const UserSchema = z.object({
+    name: z.string(),
+    age: z.number(),
+    email: z.string().email(),
+});
+// Generate typed JSON
+const user = await model.generateStructured(UserSchema, messages);
+// TypeScript infers: { name: string; age: number; email: string }
+// Non-throwing variant
+const result = await model.tryParseStructured(UserSchema, messages);
+if (result.ok) {
+    console.log(result.value.name);  // Fully typed
+} else {
+    console.log(result.error.message);
+}
+// Stream partial objects
+for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
+    console.log(partial);  // Partial validated objects
+}
+```
+**Separate module import (tree-shaking):**
+```typescript
+import {
+    StructuredOutputError,
+    type StructuredOutputResult,
+    parseStructured,
+    tryParseStructured,
+    zodToJsonSchema,
+} from 'universal-llm-client/structured-output';
+// Use without importing the full client
+const schema = z.object({ name: z.string() });
+const jsonSchema = zodToJsonSchema(schema);
+```
+### `ToolBuilder` / `ToolExecutor`
+```typescript
+import { ToolBuilder, ToolExecutor } from 'universal-llm-client';
+// Fluent builder
+const tool = new ToolBuilder('search')
+    .description('Search the web')
+    .addParameter('query', 'string', 'Search query', true)
+    .addParameter('limit', 'number', 'Max results', false)
+    .build();
+// Execution wrappers
+const safeHandler = ToolExecutor.compose(
+    myHandler,
+    h => ToolExecutor.withTimeout(h, 5000),
+    h => ToolExecutor.safe(h),
+    h => ToolExecutor.withValidation(h, ['query']),
+);
+```
+### Auditor Interface
+Implement custom observability by providing an `Auditor`:
+```typescript
+interface Auditor {
+    record(event: AuditEvent): void;
+    flush?(): Promise<void>;
+}
+```
+**Built-in implementations:**
+- `NoopAuditor` — Zero overhead (default)
+- `ConsoleAuditor` — Structured console logging
+- `BufferedAuditor` — Collects events for custom sinks
+---
+## Architecture
+```
+universal-llm-client
+├── AIModel          ← Public API (the main class you import)
+├── Router           ← Internal failover engine
+├── BaseLLMClient    ← Abstract client with tool execution
+├── Providers
+│   ├── OllamaClient
+│   ├── OpenAICompatibleClient  (OpenAI, OpenRouter, Groq, LM Studio, vLLM, LlamaCpp)
+│   └── GoogleClient            (AI Studio + Vertex AI)
+├── StreamDecoder    ← Pluggable reasoning strategies
+├── Auditor          ← Observability interface
+├── MCPToolBridge    ← MCP server integration
+└── HTTP Utilities   ← Universal fetch-based transport
+```
+### Design Principles
+1. **Single import** — `AIModel` is the only class users need
+2. **Provider agnostic** — Same code works with any backend
+3. **Transparent failover** — Health tracking and cooldowns happen behind the scenes
+4. **Zero dependencies** — Core library depends only on native `fetch`
+5. **Agent-ready** — Stateless, composable instances designed as foundation for agent frameworks
+6. **Observable** — Every request, response, tool call, retry, and failover is auditable
+---
+## Runtime Support
+| Runtime | Version | Status |
+|---|---|---|
+| **Node.js** | 22+ | ✅ Full support |
+| **Bun** | 1.0+ | ✅ Full support |
+| **Deno** | 2.0+ | ✅ Full support |
+| **Browsers** | Modern | ✅ Supported (MCP over HTTP transport only; no stdio) |
+---
+## For Agent Framework Authors
+`AIModel` is designed as the transport layer for agentic systems:
+- **Stateless** — No conversation history is stored; your framework manages memory
+- **Composable** — Create separate instances for chat, embeddings, vision
+- **Tool tracing** — `chatWithTools()` returns the full execution trace
+- **Context budget** — `getModelInfo()` exposes `contextLength`
+- **Auditor as system bus** — Inject custom sinks for cost tracking, behavioral scoring
+- **StreamDecoder as UI bridge** — Select decoder strategy per-call
+---
+## License
+MIT