npm - universal-llm-client - Versions diffs - 4.2.0 → 4.5.0 - Mend

universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)

package/CHANGELOG.md +142 -103
package/LICENSE +21 -21
package/README.md +640 -591
package/dist/ai-model.d.ts +12 -1
package/dist/ai-model.d.ts.map +1 -1
package/dist/ai-model.js +36 -1
package/dist/ai-model.js.map +1 -1
package/dist/gemma-channel.d.ts +14 -0
package/dist/gemma-channel.d.ts.map +1 -0
package/dist/gemma-channel.js +38 -0
package/dist/gemma-channel.js.map +1 -0
package/dist/gemma-diffusion.d.ts +49 -0
package/dist/gemma-diffusion.d.ts.map +1 -0
package/dist/gemma-diffusion.js +147 -0
package/dist/gemma-diffusion.js.map +1 -0
package/dist/http.d.ts +4 -0
package/dist/http.d.ts.map +1 -1
package/dist/http.js +14 -1
package/dist/http.js.map +1 -1
package/dist/index.d.ts +2 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +4 -0
package/dist/index.js.map +1 -1
package/dist/interfaces.d.ts +183 -7
package/dist/interfaces.d.ts.map +1 -1
package/dist/interfaces.js.map +1 -1
package/dist/providers/anthropic.d.ts.map +1 -1
package/dist/providers/anthropic.js +28 -3
package/dist/providers/anthropic.js.map +1 -1
package/dist/providers/google.d.ts +22 -1
package/dist/providers/google.d.ts.map +1 -1
package/dist/providers/google.js +225 -13
package/dist/providers/google.js.map +1 -1
package/dist/providers/ollama.d.ts +2 -0
package/dist/providers/ollama.d.ts.map +1 -1
package/dist/providers/ollama.js +59 -30
package/dist/providers/ollama.js.map +1 -1
package/dist/providers/openai.d.ts +14 -0
package/dist/providers/openai.d.ts.map +1 -1
package/dist/providers/openai.js +200 -22
package/dist/providers/openai.js.map +1 -1
package/dist/router.d.ts +2 -0
package/dist/router.d.ts.map +1 -1
package/dist/router.js +4 -0
package/dist/router.js.map +1 -1
package/dist/stream-decoder.d.ts +12 -0
package/dist/stream-decoder.d.ts.map +1 -1
package/dist/stream-decoder.js +182 -5
package/dist/stream-decoder.js.map +1 -1
package/dist/thinking.d.ts +36 -0
package/dist/thinking.d.ts.map +1 -0
package/dist/thinking.js +52 -0
package/dist/thinking.js.map +1 -0
package/package.json +118 -116
package/src/ai-model.ts +400 -350
package/src/auditor.ts +213 -213
package/src/client.ts +402 -402
package/src/debug/debug-google-streaming.ts +1 -1
package/src/demos/basic/universal-llm-examples.ts +3 -3
package/src/demos/diffusion-gemma/.env +29 -0
package/src/demos/diffusion-gemma/.env.example +27 -0
package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
package/src/demos/diffusion-gemma/README.md +59 -0
package/src/demos/diffusion-gemma/canvas.ts +1606 -0
package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
package/src/demos/diffusion-gemma/server.ts +1205 -0
package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
package/src/gemma-channel.ts +47 -0
package/src/gemma-diffusion.ts +167 -0
package/src/http.ts +261 -247
package/src/index.ts +180 -161
package/src/interfaces.ts +843 -657
package/src/mcp.ts +345 -345
package/src/providers/anthropic.ts +796 -762
package/src/providers/google.ts +840 -620
package/src/providers/index.ts +8 -8
package/src/providers/ollama.ts +503 -469
package/src/providers/openai.ts +587 -392
package/src/router.ts +785 -780
package/src/stream-decoder.ts +535 -361
package/src/structured-output.ts +759 -759
package/src/test-scripts/test-google-deep-research.ts +33 -0
package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
package/src/test-scripts/test-google-streaming.ts +1 -1
package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
package/src/test-scripts/test-google-thinking.ts +46 -0
package/src/test-scripts/test-system-message-positions.ts +163 -163
package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
package/src/test-scripts/test-vllm-qwen36.ts +256 -0
package/src/tests/ai-model.test.ts +1614 -1614
package/src/tests/auditor.test.ts +224 -224
package/src/tests/gemma-diffusion.test.ts +115 -0
package/src/tests/http.test.ts +200 -200
package/src/tests/interfaces.test.ts +117 -117
package/src/tests/providers/anthropic.test.ts +118 -0
package/src/tests/providers/google.test.ts +841 -660
package/src/tests/providers/ollama.test.ts +1034 -954
package/src/tests/providers/openai.test.ts +1511 -1122
package/src/tests/router.test.ts +254 -254
package/src/tests/stream-decoder.test.ts +263 -179
package/src/tests/structured-output.test.ts +1450 -1450
package/src/tests/thinking.test.ts +65 -0
package/src/tests/tools.test.ts +175 -175
package/src/thinking.ts +73 -0
package/src/tools.ts +246 -246
package/src/zod-adapter.ts +72 -72

package/src/providers/ollama.ts (changed)

@@ -1,469 +1,503 @@
-/**
- * Universal LLM Client v3 — Ollama Provider
- *
- * Implements BaseLLMClient for Ollama's native API.
- * Supports chat, streaming (NDJSON), embeddings, model discovery,
- * context length detection via /api/show, and structured output.
- *
- * Structured Output Assertions:
- * - VAL-PROVIDER-OLLAMA-001: format parameter with JSON Schema
- * - VAL-PROVIDER-OLLAMA-003: Vision with base64 extraction alongside format
- * - VAL-PROVIDER-OLLAMA-004: format "json" vs schema modes
- */
-import { BaseLLMClient } from '../client.js';
-import { httpRequest, httpStream, parseNDJSON, buildHeaders } from '../http.js';
-import { StandardChatDecoder } from '../stream-decoder.js';
-import {
-    normalizeJsonSchema,
-    getJsonSchemaFromConfig,
-} from '../structured-output.js';
-import type {
-    LLMClientOptions,
-    LLMChatMessage,
-    LLMChatResponse,
-    ChatOptions,
-    ModelMetadata,
-    OllamaResponse,
-    OllamaModelInfo,
-    LLMToolDefinition,
-    TokenUsageInfo,
-} from '../interfaces.js';
-import type { DecodedEvent } from '../stream-decoder.js';
-import type { Auditor } from '../auditor.js';
-export class OllamaClient extends BaseLLMClient {
-    constructor(options: LLMClientOptions, auditor?: Auditor) {
-        super({
-            ...options,
-            url: (options.url || 'http://localhost:11434').replace(/\/+$/, ''),
-        }, auditor);
-    }
-    // ========================================================================
-    // Chat
-    // ========================================================================
-    async chat(
-        messages: LLMChatMessage[],
-        options?: ChatOptions,
-    ): Promise<LLMChatResponse> {
-        // Structured output and tools can now be used together.\n        // The provider sends both format and tools in the request.\n        // The Router handles skipping validation when the response contains tool calls.
-        const url = `${this.options.url}/api/chat`;
-        const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
-        const body: Record<string, unknown> = {
-            model: this.options.model,
-            messages: this.convertMessages(messages),
-            stream: false,
-            options: this.buildOllamaOptions(options),
-        };
-        if (tools?.length) {
-            body['tools'] = this.convertToolsToOllama(tools);
-        }
-        // Enable native thinking by default — thinking models produce better
-        // tool selections and reasoning when allowed to think before acting.
-        body['think'] = this.options.thinking ?? true;
-        // Handle structured output via format parameter
-        const schemaOptions = this.extractSchemaOptions(options);
-        if (schemaOptions) {
-            body['format'] = this.buildFormatParameter(schemaOptions);
-        } else if (options?.responseFormat) {
-            // Legacy json_object mode - map to Ollama's "json" format
-            body['format'] = 'json';
-        }
-        const start = Date.now();
-        this.auditor.record({
-            timestamp: start,
-            type: 'request',
-            provider: 'ollama',
-            model: this.options.model,
-        });
-        const response = await httpRequest<OllamaResponse>(url, {
-            method: 'POST',
-            headers: buildHeaders(this.options),
-            body,
-            timeout: this.options.timeout ?? 30000,
-        });
-        const data = response.data;
-        const usage: TokenUsageInfo | undefined = (data.prompt_eval_count || data.eval_count)
-            ? {
-                inputTokens: data.prompt_eval_count ?? 0,
-                outputTokens: data.eval_count ?? 0,
-                totalTokens: (data.prompt_eval_count ?? 0) + (data.eval_count ?? 0),
-            }
-            : undefined;
-        // Normalize tool call IDs (Ollama sometimes omits them)
-        const toolCalls = data.message.tool_calls?.map(tc => ({
-            ...tc,
-            id: tc.id || this.generateToolCallId(),
-            function: {
-                ...tc.function,
-                arguments: typeof tc.function.arguments === 'string'
-                    ? tc.function.arguments
-                    : JSON.stringify(tc.function.arguments),
-            },
-        }));
-        // Get content, handling potential null
-        const content = data.message.content || data.message.thinking || '';
-        const result: LLMChatResponse = {
-            message: {
-                role: 'assistant',
-                content,
-                tool_calls: toolCalls,
-            },
-            reasoning: data.message.content ? data.message.thinking : undefined,
-            usage,
-            provider: 'ollama',
-        };
-        this.auditor.record({
-            timestamp: Date.now(),
-            type: 'response',
-            provider: 'ollama',
-            model: this.options.model,
-            duration: Date.now() - start,
-            usage,
-        });
-        return result;
-    }
-    // ========================================================================
-    // Streaming
-    // ========================================================================
-    async *chatStream(
-        messages: LLMChatMessage[],
-        options?: ChatOptions,
-    ): AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown> {
-        const url = `${this.options.url}/api/chat`;
-        const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
-        const body: Record<string, unknown> = {
-            model: this.options.model,
-            messages: this.convertMessages(messages),
-            stream: true,
-            options: this.buildOllamaOptions(options),
-        };
-        if (tools?.length) {
-            body['tools'] = this.convertToolsToOllama(tools);
-        }
-        body['think'] = this.options.thinking ?? true;
-        const start = Date.now();
-        this.auditor.record({
-            timestamp: start,
-            type: 'stream_start',
-            provider: 'ollama',
-            model: this.options.model,
-        });
-        const decoder = new StandardChatDecoder(() => {});
-        let lastResponse: OllamaResponse | undefined;
-        const streamedToolCalls: import('../interfaces.js').LLMToolCall[] = [];
-        // Stream idle timeout: thinking models can pause for minutes between chunks.
-        // Ensure at least 5 minutes regardless of the base request timeout.
-        const streamTimeout = Math.max(this.options.timeout ?? 300000, 300000);
-        const stream = httpStream(url, {
-            method: 'POST',
-            headers: buildHeaders(this.options),
-            body,
-            timeout: streamTimeout,
-        });
-        for await (const chunk of parseNDJSON<OllamaResponse>(stream)) {
-            lastResponse = chunk;
-            if (chunk.message?.thinking) {
-                decoder.pushReasoning(chunk.message.thinking);
-                yield { type: 'thinking', content: chunk.message.thinking };
-            }
-            if (chunk.message?.content) {
-                decoder.push(chunk.message.content);
-                yield { type: 'text', content: chunk.message.content };
-            }
-            if (chunk.message?.tool_calls?.length) {
-                const normalized = chunk.message.tool_calls.map(tc => ({
-                    ...tc,
-                    id: tc.id || this.generateToolCallId(),
-                    function: {
-                        ...tc.function,
-                        arguments: typeof tc.function.arguments === 'string'
-                            ? tc.function.arguments
-                            : JSON.stringify(tc.function.arguments),
-                    },
-                }));
-                streamedToolCalls.push(...normalized);
-                yield { type: 'tool_call', calls: normalized };
-            }
-        }
-        decoder.flush();
-        const usage: TokenUsageInfo | undefined = lastResponse?.prompt_eval_count
-            ? {
-                inputTokens: lastResponse.prompt_eval_count ?? 0,
-                outputTokens: lastResponse.eval_count ?? 0,
-                totalTokens: (lastResponse.prompt_eval_count ?? 0) + (lastResponse.eval_count ?? 0),
-            }
-            : undefined;
-        this.auditor.record({
-            timestamp: Date.now(),
-            type: 'stream_end',
-            provider: 'ollama',
-            model: this.options.model,
-            duration: Date.now() - start,
-            usage,
-        });
-        return {
-            message: {
-                role: 'assistant',
-                content: decoder.getCleanContent(),
-                tool_calls: streamedToolCalls.length > 0 ? streamedToolCalls : undefined,
-            },
-            reasoning: decoder.getReasoning(),
-            usage,
-            provider: 'ollama',
-        };
-    }
-    // ========================================================================
-    // Embeddings
-    // ========================================================================
-    async embed(text: string): Promise<number[]> {
-        const url = `${this.options.url}/api/embed`;
-        const response = await httpRequest<{ embeddings: number[][] }>(url, {
-            method: 'POST',
-            headers: buildHeaders(this.options),
-            body: { model: this.options.model, input: text },
-            timeout: this.options.timeout ?? 30000,
-        });
-        return response.data.embeddings[0] ?? [];
-    }
-    override async embedArray(texts: string[]): Promise<number[][]> {
-        const url = `${this.options.url}/api/embed`;
-        const response = await httpRequest<{ embeddings: number[][] }>(url, {
-            method: 'POST',
-            headers: buildHeaders(this.options),
-            body: { model: this.options.model, input: texts },
-            timeout: this.options.timeout ?? 30000,
-        });
-        return response.data.embeddings;
-    }
-    // ========================================================================
-    // Model Discovery
-    // ========================================================================
-    async getModels(): Promise<string[]> {
-        const url = `${this.options.url}/api/tags`;
-        const response = await httpRequest<{ models: OllamaModelInfo[] }>(url, {
-            timeout: 5000,
-        });
-        return response.data.models.map(m => m.name);
-    }
-    override async getModelInfo(modelName?: string): Promise<ModelMetadata> {
-        const url = `${this.options.url}/api/show`;
-        try {
-            const targetModel = modelName ?? this.options.model;
-            const response = await httpRequest<Record<string, unknown>>(url, {
-                method: 'POST',
-                body: { name: targetModel },
-                timeout: 5000,
-            });
-            const modelInfo = response.data['model_info'] as Record<string, unknown> | undefined;
-            if (!modelInfo) return { contextLength: 8192 };
-            // Extract architecture-specific context length
-            const arch = modelInfo['general.architecture'] as string | undefined;
-            let contextLength = 8192;
-            if (arch) {
-                const ctxKey = `${arch}.context_length`;
-                const ctxValue = modelInfo[ctxKey] as number | undefined;
-                if (ctxValue) contextLength = ctxValue;
-            }
-            // Prefer the live deployment context when available. /api/show reports
-            // the trained maximum; /api/ps reports what the daemon has actually loaded.
-            try {
-                const psResponse = await httpRequest<{ models?: Array<{ name?: string; context_length?: number }> }>(
-                    `${this.options.url}/api/ps`,
-                    { timeout: 5000 },
-                );
-                const liveModel = psResponse.data.models?.find(
-                    model => model.name?.toLowerCase() === targetModel.toLowerCase(),
-                );
-                if (liveModel?.context_length && liveModel.context_length > 0) {
-                    contextLength = Math.min(contextLength, liveModel.context_length);
-                }
-            } catch {
-                // Ignore /api/ps failures — /api/show is still a valid fallback
-            }
-            const paramCountRaw = modelInfo['general.parameter_count'] as number | undefined;
-            const capabilities = response.data['capabilities'] as string[] | undefined;
-            return {
-                model: targetModel,
-                contextLength,
-                architecture: arch,
-                parameterCount: paramCountRaw,
-                capabilities,
-            };
-        } catch {
-            return { contextLength: 8192 };
-        }
-    }
-    // ========================================================================
-    // Readiness
-    // ========================================================================
-    /** Ensure model is available, pull if missing */
-    async ensureReady(): Promise<void> {
-        try {
-            await this.getModelInfo();
-        } catch {
-            // Try pulling the model
-            this.debugLog(`Model not found, attempting pull: ${this.options.model}`);
-            await httpRequest(`${this.options.url}/api/pull`, {
-                method: 'POST',
-                body: { name: this.options.model },
-                timeout: 300000, // 5 min for pull
-            });
-        }
-    }
-    // ========================================================================
-    // Internals
-    // ========================================================================
-    private convertMessages(messages: LLMChatMessage[]): Record<string, unknown>[] {
-        return messages.map(msg => {
-            const converted: Record<string, unknown> = { role: msg.role };
-            // Handle multimodal content (array of text + image parts)
-            if (Array.isArray(msg.content)) {
-                const textParts: string[] = [];
-                const images: string[] = [];
-                for (const part of msg.content) {
-                    if (part.type === 'text') {
-                        textParts.push(part.text);
-                    } else if (part.type === 'audio') {
-                        this.debugLog('Ollama: skipping audio content (not supported)');
-                    } else if (part.type === 'image_url' && part.image_url?.url) {
-                        // Extract base64 data from data URL or use raw base64
-                        const url = part.image_url.url;
-                        if (url.startsWith('data:')) {
-                            // data:image/jpeg;base64,XXXX → extract XXXX
-                            const base64Data = url.split(',')[1];
-                            if (base64Data) images.push(base64Data);
-                        } else if (url.startsWith('http')) {
-                            // Ollama doesn't support URLs directly — skip
-                            // (caller should download and convert to base64)
-                            this.debugLog('Ollama vision: skipping URL image, use base64 instead');
-                        } else {
-                            // Assume raw base64
-                            images.push(url);
-                        }
-                    }
-                }
-                converted['content'] = textParts.join('\n');
-                if (images.length > 0) {
-                    converted['images'] = images;
-                }
-            } else {
-                converted['content'] = msg.content ?? '';
-            }
-            // Ollama needs tool call arguments as objects, not strings
-            if (msg.tool_calls?.length) {
-                converted['tool_calls'] = msg.tool_calls.map(tc => ({
-                    ...tc,
-                    function: {
-                        ...tc.function,
-                        arguments: typeof tc.function.arguments === 'string'
-                            ? (() => { try { return JSON.parse(tc.function.arguments); } catch { return tc.function.arguments; } })()
-                            : tc.function.arguments,
-                    },
-                }));
-            }
-            // Preserve tool_call_id for tool result messages
-            if (msg.tool_call_id) {
-                converted['tool_call_id'] = msg.tool_call_id;
-            }
-            return converted;
-        });
-    }
-    private convertToolsToOllama(tools: LLMToolDefinition[]): unknown[] {
-        return tools.map(t => ({
-            type: 'function',
-            function: {
-                name: t.function.name,
-                description: t.function.description,
-                parameters: t.function.parameters,
-            },
-        }));
-    }
-    private buildOllamaOptions(options?: ChatOptions): Record<string, unknown> {
-        const params: Record<string, unknown> = {
-            ...this.options.defaultParameters,
-            ...options?.parameters,
-        };
-        if (options?.temperature !== undefined) params['temperature'] = options.temperature;
-        if (options?.maxTokens !== undefined) params['num_predict'] = options.maxTokens;
-        return params;
-    }
-    // ========================================================================
-    // Structured Output Helpers
-    // ========================================================================
-    /**
-     * Build Ollama format parameter from schema options.
-     * Ollama accepts:
-     * - format: "json" for simple JSON mode
-     * - format: { ...schema } for structured output with JSON Schema
-     */
-    private buildFormatParameter(options: { schemaConfig?: import('../structured-output.js').SchemaConfig<unknown>, jsonSchema?: import('../structured-output.js').JSONSchema }): string | import('../structured-output.js').JSONSchema {
-        if (options.jsonSchema) {
-            return normalizeJsonSchema(options.jsonSchema);
-        }
-        if (options.schemaConfig) {
-            return getJsonSchemaFromConfig(options.schemaConfig);
-        }
-        return 'json';
-    }
-}
+/**
+ * Universal LLM Client v3 — Ollama Provider
+ *
+ * Implements BaseLLMClient for Ollama's native API.
+ * Supports chat, streaming (NDJSON), embeddings, model discovery,
+ * context length detection via /api/show, and structured output.
+ *
+ * Structured Output Assertions:
+ * - VAL-PROVIDER-OLLAMA-001: format parameter with JSON Schema
+ * - VAL-PROVIDER-OLLAMA-003: Vision with base64 extraction alongside format
+ * - VAL-PROVIDER-OLLAMA-004: format "json" vs schema modes
+ */
+import { BaseLLMClient } from '../client.js';
+import { resolveThinking } from '../thinking.js';
+import { httpRequest, httpStream, parseNDJSON, buildHeaders } from '../http.js';
+import { StandardChatDecoder } from '../stream-decoder.js';
+import {
+    normalizeJsonSchema,
+    getJsonSchemaFromConfig,
+} from '../structured-output.js';
+import { extractGemmaThoughtChannels } from '../gemma-channel.js';
+import type {
+    LLMClientOptions,
+    LLMChatMessage,
+    LLMChatResponse,
+    ChatOptions,
+    ModelMetadata,
+    OllamaResponse,
+    OllamaModelInfo,
+    LLMToolDefinition,
+    LLMToolCall,
+    TokenUsageInfo,
+} from '../interfaces.js';
+import type { DecodedEvent } from '../stream-decoder.js';
+import type { Auditor } from '../auditor.js';
+export class OllamaClient extends BaseLLMClient {
+    constructor(options: LLMClientOptions, auditor?: Auditor) {
+        super({
+            ...options,
+            url: (options.url || 'http://localhost:11434').replace(/\/+$/, ''),
+        }, auditor);
+    }
+    // ========================================================================
+    // Chat
+    // ========================================================================
+    async chat(
+        messages: LLMChatMessage[],
+        options?: ChatOptions,
+    ): Promise<LLMChatResponse> {
+        // Structured output and tools can now be used together.\n        // The provider sends both format and tools in the request.\n        // The Router handles skipping validation when the response contains tool calls.
+        const url = `${this.options.url}/api/chat`;
+        const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
+        const body: Record<string, unknown> = {
+            model: this.options.model,
+            messages: this.convertMessages(messages),
+            stream: false,
+            options: this.buildOllamaOptions(options),
+        };
+        if (tools?.length) {
+            body['tools'] = this.convertToolsToOllama(tools);
+        }
+        // Enable native thinking by default — thinking models produce better
+        // tool selections and reasoning when allowed to think before acting.
+        // Ollama `think` is on/off (no levels); default on for thinking models.
+        body['think'] = resolveThinking(options?.thinking, this.options.thinking)?.enabled ?? true;
+        // Handle structured output via format parameter
+        const schemaOptions = this.extractSchemaOptions(options);
+        if (schemaOptions) {
+            body['format'] = this.buildFormatParameter(schemaOptions);
+        } else if (options?.responseFormat) {
+            // Legacy json_object mode - map to Ollama's "json" format
+            body['format'] = 'json';
+        }
+        const start = Date.now();
+        this.auditor.record({
+            timestamp: start,
+            type: 'request',
+            provider: 'ollama',
+            model: this.options.model,
+        });
+        const response = await httpRequest<OllamaResponse>(url, {
+            method: 'POST',
+            headers: buildHeaders(this.options),
+            body,
+            timeout: this.options.timeout ?? 30000,
+        });
+        const data = response.data;
+        const usage: TokenUsageInfo | undefined = (data.prompt_eval_count || data.eval_count)
+            ? {
+                inputTokens: data.prompt_eval_count ?? 0,
+                outputTokens: data.eval_count ?? 0,
+                totalTokens: (data.prompt_eval_count ?? 0) + (data.eval_count ?? 0),
+                // Ollama reports server-precise timing in nanoseconds.
+                durationMs: data.total_duration ? data.total_duration / 1e6 : undefined,
+                tokensPerSecond: data.eval_duration && data.eval_count
+                    ? data.eval_count / (data.eval_duration / 1e9)
+                    : undefined,
+            }
+            : undefined;
+        // Normalize tool calls (Ollama sometimes omits IDs and empty args).
+        const toolCalls = data.message.tool_calls?.map(tc => this.normalizeToolCall(tc));
+        const gemmaContent = extractGemmaThoughtChannels(data.message.content || '');
+        const reasoning = [data.message.thinking, gemmaContent.reasoning].filter(Boolean).join('\n\n') || undefined;
+        const result: LLMChatResponse = {
+            message: {
+                role: 'assistant',
+                content: gemmaContent.content,
+                tool_calls: toolCalls,
+            },
+            finishReason: data.done_reason,
+            reasoning,
+            usage,
+            provider: 'ollama',
+        };
+        this.auditor.record({
+            timestamp: Date.now(),
+            type: 'response',
+            provider: 'ollama',
+            model: this.options.model,
+            duration: Date.now() - start,
+            usage,
+        });
+        return result;
+    }
+    // ========================================================================
+    // Streaming
+    // ========================================================================
+    async *chatStream(
+        messages: LLMChatMessage[],
+        options?: ChatOptions,
+    ): AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown> {
+        const url = `${this.options.url}/api/chat`;
+        const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
+        const body: Record<string, unknown> = {
+            model: this.options.model,
+            messages: this.convertMessages(messages),
+            stream: true,
+            options: this.buildOllamaOptions(options),
+        };
+        if (tools?.length) {
+            body['tools'] = this.convertToolsToOllama(tools);
+        }
+        // Ollama `think` is on/off (no levels); default on for thinking models.
+        body['think'] = resolveThinking(options?.thinking, this.options.thinking)?.enabled ?? true;
+        const start = Date.now();
+        this.auditor.record({
+            timestamp: start,
+            type: 'stream_start',
+            provider: 'ollama',
+            model: this.options.model,
+        });
+        const decoderEvents: DecodedEvent[] = [];
+        const decoder = new StandardChatDecoder(event => decoderEvents.push(event));
+        let lastResponse: OllamaResponse | undefined;
+        const streamedToolCalls: import('../interfaces.js').LLMToolCall[] = [];
+        // Stream idle timeout: thinking models can pause for minutes between chunks.
+        // Ensure at least 5 minutes regardless of the base request timeout.
+        const streamTimeout = Math.max(this.options.timeout ?? 300000, 300000);
+        const stream = httpStream(url, {
+            method: 'POST',
+            headers: buildHeaders(this.options),
+            body,
+            timeout: streamTimeout,
+        });
+        for await (const chunk of parseNDJSON<OllamaResponse>(stream)) {
+            lastResponse = chunk;
+            if (chunk.message?.thinking) {
+                decoder.pushReasoning(chunk.message.thinking);
+                const pending = decoderEvents.splice(0);
+                for (const event of pending) {
+                    yield event;
+                }
+            }
+            if (chunk.message?.content) {
+                decoder.push(chunk.message.content);
+                const pending = decoderEvents.splice(0);
+                for (const event of pending) {
+                    yield event;
+                }
+            }
+            if (chunk.message?.tool_calls?.length) {
+                const normalized = chunk.message.tool_calls.map(tc => this.normalizeToolCall(tc));
+                streamedToolCalls.push(...normalized);
+                yield { type: 'tool_call', calls: normalized };
+            }
+        }
+        decoder.flush();
+        const pending = decoderEvents.splice(0);
+        for (const event of pending) {
+            yield event;
+        }
+        const usage: TokenUsageInfo | undefined = lastResponse?.prompt_eval_count
+            ? {
+                inputTokens: lastResponse.prompt_eval_count ?? 0,
+                outputTokens: lastResponse.eval_count ?? 0,
+                totalTokens: (lastResponse.prompt_eval_count ?? 0) + (lastResponse.eval_count ?? 0),
+                durationMs: lastResponse.total_duration ? lastResponse.total_duration / 1e6 : undefined,
+                tokensPerSecond: lastResponse.eval_duration && lastResponse.eval_count
+                    ? lastResponse.eval_count / (lastResponse.eval_duration / 1e9)
+                    : undefined,
+            }
+            : undefined;
+        this.auditor.record({
+            timestamp: Date.now(),
+            type: 'stream_end',
+            provider: 'ollama',
+            model: this.options.model,
+            duration: Date.now() - start,
+            usage,
+        });
+        return {
+            message: {
+                role: 'assistant',
+                content: decoder.getCleanContent(),
+                tool_calls: streamedToolCalls.length > 0 ? streamedToolCalls : undefined,
+            },
+            finishReason: lastResponse?.done_reason,
+            reasoning: decoder.getReasoning(),
+            usage,
+            provider: 'ollama',
+        };
+    }
+    /**
+     * Turn a possibly incomplete tool call into a complete `LLMToolCall`.
+     *
+     * Extra properties on the input (and on its `function` object) are carried
+     * over. A missing/empty `id` is replaced with a generated one, `type` is
+     * forced to `'function'`, a missing/empty function name becomes `''`, and
+     * the arguments are passed through `normalizeToolArguments`.
+     */
+    private normalizeToolCall(
+        toolCall: Partial<LLMToolCall> & { function?: Partial<LLMToolCall['function']> },
+    ): LLMToolCall {
+        const fn = toolCall.function;
+        const id = toolCall.id || this.generateToolCallId();
+        const normalizedFunction = {
+            ...fn,
+            name: fn?.name || '',
+            arguments: this.normalizeToolArguments(fn?.arguments),
+        };
+        return { ...toolCall, id, type: 'function', function: normalizedFunction };
+    }
+    /**
+     * Produce the string form of tool-call arguments.
+     *
+     * - `null` / `undefined` and blank strings become `'{}'`.
+     * - Non-blank strings are returned unchanged (they are not validated as JSON).
+     * - Anything else is serialized with `JSON.stringify`, falling back to `'{}'`
+     *   when serialization yields `undefined`.
+     */
+    private normalizeToolArguments(args: unknown): string {
+        const emptyObject = '{}';
+        if (args === null || args === undefined) {
+            return emptyObject;
+        }
+        if (typeof args !== 'string') {
+            const serialized: string | undefined = JSON.stringify(args);
+            return serialized ?? emptyObject;
+        }
+        return args.trim() === '' ? emptyObject : args;
+    }
+    // ========================================================================
+    // Embeddings
+    // ========================================================================
+    /**
+     * Request an embedding for a single text from the `/api/embed` endpoint.
+     *
+     * @param text Input to embed.
+     * @returns The first embedding vector of the response, or an empty array
+     *          when the response contains no vectors.
+     */
+    async embed(text: string): Promise<number[]> {
+        const { data } = await httpRequest<{ embeddings: number[][] }>(
+            `${this.options.url}/api/embed`,
+            {
+                method: 'POST',
+                headers: buildHeaders(this.options),
+                body: { model: this.options.model, input: text },
+                // Fall back to 30 s when no request timeout is configured.
+                timeout: this.options.timeout ?? 30000,
+            },
+        );
+        const firstVector = data.embeddings[0];
+        return firstVector ?? [];
+    }
+    /**
+     * Request embeddings for several texts with one call to `/api/embed`.
+     *
+     * @param texts Inputs to embed; sent together as the `input` field.
+     * @returns The `embeddings` array of the response, as received.
+     */
+    override async embedArray(texts: string[]): Promise<number[][]> {
+        const endpoint = `${this.options.url}/api/embed`;
+        const payload = { model: this.options.model, input: texts };
+        const { data } = await httpRequest<{ embeddings: number[][] }>(endpoint, {
+            method: 'POST',
+            headers: buildHeaders(this.options),
+            body: payload,
+            // Fall back to 30 s when no request timeout is configured.
+            timeout: this.options.timeout ?? 30000,
+        });
+        return data.embeddings;
+    }
+    // ========================================================================
+    // Model Discovery
+    // ========================================================================
+    /**
+     * List the names of the models reported by the `/api/tags` endpoint.
+     *
+     * @returns The model names; an empty array when the response carries no
+     *          `models` list.
+     */
+    async getModels(): Promise<string[]> {
+        const url = `${this.options.url}/api/tags`;
+        const response = await httpRequest<{ models?: OllamaModelInfo[] }>(url, {
+            // Send the configured headers, as the chat and embedding requests do.
+            // NOTE(review): buildHeaders presumably carries any configured
+            // credentials, which discovery needs behind an authenticating proxy — confirm.
+            headers: buildHeaders(this.options),
+            timeout: 5000,
+        });
+        // Tolerate a response without a `models` field instead of throwing a TypeError.
+        return (response.data.models ?? []).map(m => m.name);
+    }
+    /**
+     * Look up metadata for a model.
+     *
+     * `/api/show` supplies the architecture, parameter count, capabilities and
+     * the architecture-specific context length. `/api/ps` is then consulted on a
+     * best-effort basis for the context length of the currently loaded model.
+     *
+     * @param modelName Model to inspect; defaults to the configured model.
+     * @returns The collected metadata. When `/api/show` fails or returns no
+     *          `model_info`, only `{ contextLength: 8192 }` is returned.
+     *          This method never rejects.
+     */
+    override async getModelInfo(modelName?: string): Promise<ModelMetadata> {
+        const DEFAULT_CONTEXT_LENGTH = 8192;
+        // Ollama treats an untagged name as `<name>:latest`, and /api/ps lists
+        // models with their tag. Compare canonical, lower-cased names so that
+        // "llama3" matches "llama3:latest". Only the last path segment is checked
+        // for a tag so a registry host with a port ("host:5000/model") is not
+        // mistaken for a tagged name.
+        const canonicalName = (name: string): string => {
+            const lower = name.toLowerCase();
+            const lastSegment = lower.slice(lower.lastIndexOf('/') + 1);
+            return lastSegment.includes(':') ? lower : `${lower}:latest`;
+        };
+        try {
+            const targetModel = modelName ?? this.options.model;
+            const response = await httpRequest<Record<string, unknown>>(`${this.options.url}/api/show`, {
+                method: 'POST',
+                // Same headers as the chat and embedding requests.
+                headers: buildHeaders(this.options),
+                body: { name: targetModel },
+                timeout: 5000,
+            });
+            const modelInfo = response.data['model_info'] as Record<string, unknown> | undefined;
+            if (!modelInfo) return { contextLength: DEFAULT_CONTEXT_LENGTH };
+            // Extract architecture-specific context length
+            const arch = modelInfo['general.architecture'] as string | undefined;
+            let reportedContext: number | undefined;
+            if (arch) {
+                const ctxValue = modelInfo[`${arch}.context_length`];
+                if (typeof ctxValue === 'number' && ctxValue > 0) reportedContext = ctxValue;
+            }
+            let contextLength = reportedContext ?? DEFAULT_CONTEXT_LENGTH;
+            // Prefer the live deployment context when available. /api/show reports
+            // the trained maximum; /api/ps reports what the daemon has actually loaded.
+            try {
+                const psResponse = await httpRequest<{
+                    models?: Array<{ name?: string; model?: string; context_length?: number }>;
+                }>(
+                    `${this.options.url}/api/ps`,
+                    { headers: buildHeaders(this.options), timeout: 5000 },
+                );
+                const wanted = canonicalName(targetModel);
+                const liveModel = psResponse.data.models?.find(entry =>
+                    [entry.name, entry.model].some(
+                        candidate => candidate !== undefined && canonicalName(candidate) === wanted,
+                    ),
+                );
+                const liveContext = liveModel?.context_length;
+                if (liveContext && liveContext > 0) {
+                    // Clamp to the value from /api/show only when that value is real;
+                    // the 8192 placeholder must not cap a genuine live context length.
+                    contextLength = reportedContext !== undefined
+                        ? Math.min(reportedContext, liveContext)
+                        : liveContext;
+                }
+            } catch {
+                // Ignore /api/ps failures — /api/show is still a valid fallback
+            }
+            const paramCountRaw = modelInfo['general.parameter_count'] as number | undefined;
+            const capabilities = response.data['capabilities'] as string[] | undefined;
+            return {
+                model: targetModel,
+                contextLength,
+                architecture: arch,
+                parameterCount: paramCountRaw,
+                capabilities,
+            };
+        } catch {
+            // Best effort: any failure degrades to the default context length.
+            return { contextLength: DEFAULT_CONTEXT_LENGTH };
+        }
+    }
+    // ========================================================================
+    // Readiness
+    // ========================================================================
+    /**
+     * Ensure the configured model is available, pulling it if it is missing.
+     *
+     * The model is probed with a direct `/api/show` request rather than through
+     * `getModelInfo()`: that method swallows every error and returns defaults,
+     * so a `catch` around it can never run and the pull would never start.
+     *
+     * Rejects when the pull request itself fails.
+     */
+    async ensureReady(): Promise<void> {
+        try {
+            // NOTE(review): relies on httpRequest rejecting for a failed request
+            // (e.g. unknown model) — confirm against its implementation.
+            await httpRequest(`${this.options.url}/api/show`, {
+                method: 'POST',
+                headers: buildHeaders(this.options),
+                body: { name: this.options.model },
+                timeout: 5000,
+            });
+        } catch {
+            // Try pulling the model
+            this.debugLog(`Model not found, attempting pull: ${this.options.model}`);
+            await httpRequest(`${this.options.url}/api/pull`, {
+                method: 'POST',
+                headers: buildHeaders(this.options),
+                // `stream: false` asks for one final JSON object instead of a
+                // stream of progress lines, matching a plain request/response call.
+                body: { name: this.options.model, stream: false },
+                timeout: 300000, // 5 min for pull
+            });
+        }
+    }
+    // ========================================================================
+    // Internals
+    // ========================================================================
+    /**
+     * Translate chat messages into the record shape sent to Ollama.
+     *
+     * - String (or missing) content is copied as-is, defaulting to `''`.
+     * - Array content is split: text parts are joined with newlines into
+     *   `content`, image parts are collected as base64 strings into `images`,
+     *   audio parts and http(s) image URLs are skipped with a debug log.
+     * - Tool-call arguments given as strings are JSON-parsed into values; a
+     *   string that fails to parse is kept unchanged.
+     * - `tool_call_id` is carried over when present.
+     */
+    private convertMessages(messages: LLMChatMessage[]): Record<string, unknown>[] {
+        const result: Record<string, unknown>[] = [];
+        for (const msg of messages) {
+            const wire: Record<string, unknown> = { role: msg.role };
+            if (!Array.isArray(msg.content)) {
+                wire['content'] = msg.content ?? '';
+            } else {
+                // Multimodal content: separate the text from the image payloads.
+                const texts: string[] = [];
+                const base64Images: string[] = [];
+                for (const part of msg.content) {
+                    switch (part.type) {
+                        case 'text':
+                            texts.push(part.text);
+                            break;
+                        case 'audio':
+                            this.debugLog('Ollama: skipping audio content (not supported)');
+                            break;
+                        case 'image_url': {
+                            const ref = part.image_url?.url;
+                            if (!ref) break;
+                            const isDataUrl = ref.startsWith('data:');
+                            if (!isDataUrl && ref.startsWith('http')) {
+                                // Remote URLs are not forwarded; the caller is expected
+                                // to download the image and supply base64 instead.
+                                this.debugLog('Ollama vision: skipping URL image, use base64 instead');
+                                break;
+                            }
+                            // data:image/jpeg;base64,XXXX → XXXX; anything else is
+                            // assumed to already be raw base64.
+                            const payload = isDataUrl ? ref.split(',')[1] : ref;
+                            if (payload) base64Images.push(payload);
+                            break;
+                        }
+                    }
+                }
+                wire['content'] = texts.join('\n');
+                if (base64Images.length > 0) {
+                    wire['images'] = base64Images;
+                }
+            }
+            // Ollama needs tool call arguments as objects, not strings
+            if (msg.tool_calls?.length) {
+                wire['tool_calls'] = msg.tool_calls.map(call => {
+                    let args: unknown = call.function.arguments;
+                    if (typeof args === 'string') {
+                        try {
+                            args = JSON.parse(args);
+                        } catch {
+                            // Not valid JSON: keep the original string.
+                        }
+                    }
+                    return { ...call, function: { ...call.function, arguments: args } };
+                });
+            }
+            // Preserve tool_call_id for tool result messages
+            if (msg.tool_call_id) {
+                wire['tool_call_id'] = msg.tool_call_id;
+            }
+            result.push(wire);
+        }
+        return result;
+    }
+    /**
+     * Map tool definitions to the tool entries sent to Ollama: each entry has
+     * `type: 'function'` and a `function` object holding only the name,
+     * description and parameters of the definition.
+     */
+    private convertToolsToOllama(tools: LLMToolDefinition[]): unknown[] {
+        const converted: unknown[] = [];
+        for (const tool of tools) {
+            const { name, description, parameters } = tool.function;
+            converted.push({
+                type: 'function',
+                function: { name, description, parameters },
+            });
+        }
+        return converted;
+    }
+    /**
+     * Assemble the model parameters for a request.
+     *
+     * Precedence, lowest to highest: the provider's default parameters, the
+     * per-call `parameters`, then the explicit `temperature` and `maxTokens`
+     * options (the latter is sent as `num_predict`).
+     */
+    private buildOllamaOptions(options?: ChatOptions): Record<string, unknown> {
+        const explicit: Record<string, unknown> = {};
+        const temperature = options?.temperature;
+        if (temperature !== undefined) {
+            explicit['temperature'] = temperature;
+        }
+        const maxTokens = options?.maxTokens;
+        if (maxTokens !== undefined) {
+            explicit['num_predict'] = maxTokens;
+        }
+        return {
+            ...this.options.defaultParameters,
+            ...options?.parameters,
+            ...explicit,
+        };
+    }
+    // ========================================================================
+    // Structured Output Helpers
+    // ========================================================================
+    /**
+     * Compute the value of Ollama's `format` request field.
+     *
+     * Ollama accepts either the string "json" (plain JSON mode) or a JSON
+     * Schema object (structured output). Resolution order:
+     * 1. `jsonSchema`, normalized, when provided;
+     * 2. otherwise the schema derived from `schemaConfig`, when provided;
+     * 3. otherwise the literal "json".
+     */
+    private buildFormatParameter(options: { schemaConfig?: import('../structured-output.js').SchemaConfig<unknown>, jsonSchema?: import('../structured-output.js').JSONSchema }): string | import('../structured-output.js').JSONSchema {
+        const { jsonSchema, schemaConfig } = options;
+        if (jsonSchema) {
+            return normalizeJsonSchema(jsonSchema);
+        }
+        return schemaConfig ? getJsonSchemaFromConfig(schemaConfig) : 'json';
+    }
+}