npm - universal-llm-client - Versions diffs - 4.3.0 → 4.5.1 - Mend

universal-llm-client 4.3.0 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

package/CHANGELOG.md +34 -19
package/README.md +62 -11
package/dist/ai-model.d.ts +12 -2
package/dist/ai-model.js +36 -2
package/dist/auditor.d.ts +0 -1
package/dist/auditor.js +0 -1
package/dist/client.d.ts +0 -1
package/dist/client.js +0 -1
package/dist/gemma-channel.d.ts +13 -0
package/dist/gemma-channel.js +37 -0
package/dist/gemma-diffusion.d.ts +48 -0
package/dist/gemma-diffusion.js +146 -0
package/dist/http.d.ts +4 -1
package/dist/http.js +14 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +4 -1
package/dist/interfaces.d.ts +163 -8
package/dist/interfaces.js +0 -1
package/dist/mcp.d.ts +0 -1
package/dist/mcp.js +0 -1
package/dist/providers/anthropic.d.ts +0 -1
package/dist/providers/anthropic.js +28 -4
package/dist/providers/google.d.ts +22 -2
package/dist/providers/google.js +223 -14
package/dist/providers/index.d.ts +0 -1
package/dist/providers/index.js +0 -1
package/dist/providers/ollama.d.ts +2 -1
package/dist/providers/ollama.js +59 -31
package/dist/providers/openai.d.ts +16 -1
package/dist/providers/openai.js +488 -81
package/dist/router.d.ts +2 -1
package/dist/router.js +4 -1
package/dist/stream-decoder.d.ts +12 -1
package/dist/stream-decoder.js +182 -6
package/dist/structured-output.d.ts +0 -1
package/dist/structured-output.js +0 -1
package/dist/thinking.d.ts +35 -0
package/dist/thinking.js +51 -0
package/dist/tools.d.ts +0 -1
package/dist/tools.js +0 -1
package/dist/zod-adapter.d.ts +0 -1
package/dist/zod-adapter.js +0 -1
package/package.json +3 -1
package/dist/ai-model.d.ts.map +0 -1
package/dist/ai-model.js.map +0 -1
package/dist/auditor.d.ts.map +0 -1
package/dist/auditor.js.map +0 -1
package/dist/client.d.ts.map +0 -1
package/dist/client.js.map +0 -1
package/dist/http.d.ts.map +0 -1
package/dist/http.js.map +0 -1
package/dist/index.d.ts.map +0 -1
package/dist/index.js.map +0 -1
package/dist/interfaces.d.ts.map +0 -1
package/dist/interfaces.js.map +0 -1
package/dist/mcp.d.ts.map +0 -1
package/dist/mcp.js.map +0 -1
package/dist/providers/anthropic.d.ts.map +0 -1
package/dist/providers/anthropic.js.map +0 -1
package/dist/providers/google.d.ts.map +0 -1
package/dist/providers/google.js.map +0 -1
package/dist/providers/index.d.ts.map +0 -1
package/dist/providers/index.js.map +0 -1
package/dist/providers/ollama.d.ts.map +0 -1
package/dist/providers/ollama.js.map +0 -1
package/dist/providers/openai.d.ts.map +0 -1
package/dist/providers/openai.js.map +0 -1
package/dist/router.d.ts.map +0 -1
package/dist/router.js.map +0 -1
package/dist/stream-decoder.d.ts.map +0 -1
package/dist/stream-decoder.js.map +0 -1
package/dist/structured-output.d.ts.map +0 -1
package/dist/structured-output.js.map +0 -1
package/dist/tools.d.ts.map +0 -1
package/dist/tools.js.map +0 -1
package/dist/zod-adapter.d.ts.map +0 -1
package/dist/zod-adapter.js.map +0 -1

package/dist/providers/openai.js CHANGED Viewed

@@ -5,25 +5,268 @@
  * Works with: OpenAI, OpenRouter, LM Studio, LlamaCpp, vLLM, Groq, Together.
  */
 import { BaseLLMClient } from '../client.js';
+import { resolveThinking, isOpenAIReasoningModel } from '../thinking.js';
 import { httpRequest, httpStream, parseSSE, buildHeaders } from '../http.js';
 import { StandardChatDecoder } from '../stream-decoder.js';
 import { normalizeJsonSchema, getJsonSchemaFromConfig, } from '../structured-output.js';
+import { isGemmaDiffusionModel, parseGemmaDiffusionOutput } from '../gemma-diffusion.js';
+const VLLM_AUTO_TOOL_CHOICE_HINT = 'vLLM rejected automatic tool choice. Retrying with text-level tool calling. To use native tool_calls, start vLLM with --enable-auto-tool-choice and --tool-call-parser <parser>.';
/**
 * Normalize chat messages before sending them to an OpenAI-compatible server.
 *
 * - Every returned message has a non-nullish `content` (nullish becomes `''`).
 * - System messages that appear before any non-system message are kept as
 *   system messages.
 * - A system message that appears after a non-system message is rewritten as
 *   a `user` message whose text is prefixed with `[SYSTEM MESSAGE]`.
 *   NOTE(review): presumably because some compatible servers only accept
 *   system messages at the start of the conversation — confirm.
 *
 * @param {Array<object>} messages - Chat messages with `role` and optional `content`.
 * @returns {Array<object>} New array of shallow copies; the input is not mutated.
 */
function normalizeMessagesForOpenAICompat(messages) {
    let sawNonSystem = false;
    return messages.map(message => {
        const isSystem = message.role === 'system';
        if (!isSystem) {
            sawNonSystem = true;
        }
        if (!isSystem || !sawNonSystem) {
            return { ...message, content: message.content ?? '' };
        }
        // Late system message: nullish content must not reach the stringifier,
        // which expects a string or an array of content parts.
        const text = message.content == null
            ? ''
            : stringifyMessageContent(message.content);
        return {
            ...message,
            role: 'user',
            content: `[SYSTEM MESSAGE]\n${text}`,
        };
    });
}
/**
 * Flatten message content into a single plain-text string.
 *
 * Strings are returned unchanged. Arrays of content parts are rendered one
 * part per line: `text` parts contribute their text, `image_url` parts become
 * `[Image: <url>]`, `audio` parts become `[Audio: <mimeType>]`; any other part
 * (and any part that renders to an empty string) is dropped.
 *
 * Nullish content yields `''` and other non-array values are coerced with
 * `String()`, so callers holding an assistant/tool message with `content: null`
 * do not crash here.
 *
 * @param {string|Array<object>|null|undefined} content - Message content.
 * @returns {string} Plain-text rendering of the content.
 */
function stringifyMessageContent(content) {
    if (typeof content === 'string')
        return content;
    if (content == null)
        return '';
    if (!Array.isArray(content))
        return String(content);
    return content
        .map(part => {
            // Tolerate holes / null entries in the parts array.
            if (!part || typeof part !== 'object')
                return '';
            switch (part.type) {
                case 'text':
                    return part.text;
                case 'image_url':
                    return `[Image: ${part.image_url.url}]`;
                case 'audio':
                    return `[Audio: ${part.audio.mimeType}]`;
                default:
                    return '';
            }
        })
        .filter(Boolean)
        .join('\n');
}
/**
 * Report whether a request body carries a non-empty `tools` array.
 *
 * @param {object} body - Request body to inspect.
 * @returns {boolean} `true` only when `body.tools` is an array with at least one entry.
 */
function hasToolDefinitions(body) {
    const { tools: declared } = body;
    if (!Array.isArray(declared)) {
        return false;
    }
    return declared.length > 0;
}
/**
 * Detect the vLLM error raised when automatic tool choice is requested but
 * the server was not started with the matching flags.
 *
 * The value is reduced to text (an `Error`'s message, a string as-is, anything
 * else via `JSON.stringify`) and matched case-insensitively; all three markers
 * must be present: `auto`, `tool choice requires --enable-auto-tool-choice`,
 * and `--tool-call-parser`.
 *
 * This predicate is evaluated inside `catch` handlers, so it must never throw:
 * values that `JSON.stringify` cannot serialise (functions and symbols yield
 * `undefined`; circular structures and BigInt throw) are treated as
 * "no match" instead of masking the error being inspected.
 *
 * @param {unknown} value - Caught error, error message, or error payload.
 * @returns {boolean} `true` when the text matches the vLLM auto-tool-choice rejection.
 */
function isVllmAutoToolChoiceError(value) {
    let text;
    if (value instanceof Error) {
        text = value.message;
    }
    else if (typeof value === 'string') {
        text = value;
    }
    else {
        try {
            text = JSON.stringify(value ?? '') ?? '';
        }
        catch {
            // Unserialisable payload: nothing to match against.
            text = '';
        }
    }
    const normalized = String(text ?? '').toLowerCase();
    return (normalized.includes('auto')
        && normalized.includes('tool choice requires --enable-auto-tool-choice')
        && normalized.includes('--tool-call-parser'));
}
/**
 * POST a request and, if the server rejects automatic tool choice, retry once
 * with native tool fields stripped and a text tool protocol injected instead.
 *
 * The retry happens only when `tools` is non-empty, the request body itself
 * carries tool definitions, and the failure matches the vLLM auto-tool-choice
 * error. `onFallback` is invoked just before the retry. Any other failure is
 * rethrown unchanged.
 *
 * @param {string} url - Endpoint to POST to.
 * @param {{headers: object, body: object, timeout: number}} request - Request pieces.
 * @param {Array<object>|undefined} tools - Tool definitions offered for this call.
 * @param {() => void} onFallback - Callback fired when the text-level fallback is used.
 * @returns {Promise<object>} The response from `httpRequest`.
 */
async function requestWithVllmToolFallback(url, request, tools, onFallback) {
    const post = (payload) => httpRequest(url, {
        method: 'POST',
        headers: request.headers,
        body: payload,
        timeout: request.timeout,
    });
    try {
        return await post(request.body);
    }
    catch (error) {
        const canFallBack = Boolean(tools?.length)
            && hasToolDefinitions(request.body)
            && isVllmAutoToolChoiceError(error);
        if (!canFallBack) {
            throw error;
        }
        onFallback();
        return post(withoutNativeTools(request.body, tools));
    }
}
/**
 * Parse text as JSON and return it only when the result is a plain JSON
 * object (not an array, not a primitive, not `null`).
 *
 * @param {string} text - Candidate JSON text.
 * @returns {object|null} The parsed object, or `null` when the text is not
 *   valid JSON or does not decode to an object.
 */
function parseJsonObject(text) {
    let decoded;
    try {
        decoded = JSON.parse(text);
    }
    catch {
        /* not JSON */
        return null;
    }
    const isObject = Boolean(decoded) && typeof decoded === 'object' && !Array.isArray(decoded);
    return isObject ? decoded : null;
}
/**
 * Parse the inside of a text-level tool-call block into `{ name, arguments }`
 * records, where `arguments` is always a JSON string.
 *
 * Three notations are tried in order; the first that yields a call wins:
 *   1. JSON — a single object or an array of objects with a non-empty string
 *      `name` and arguments under `arguments`, `parameters` or `args`
 *      (string arguments are re-parsed as a JSON object, falling back to `{}`).
 *   2. Call syntax — `tool_name({...})` or `tool_name()`.
 *   3. Tagged syntax — `<function=name><parameter=key>value</parameter></function>`,
 *      where every parameter value is kept as a trimmed string.
 *
 * @param {string} content - Raw text between the tool-call delimiters.
 * @returns {Array<{name: string, arguments: string}>} Parsed calls; empty when nothing matched.
 */
function parseTextToolCallBody(content) {
    const body = content.trim();
    if (body.length === 0)
        return [];
    // 1. JSON notation.
    try {
        const decoded = JSON.parse(body);
        const candidates = Array.isArray(decoded) ? decoded : [decoded];
        const jsonCalls = candidates.flatMap(candidate => {
            if (!candidate || typeof candidate !== 'object')
                return [];
            const name = candidate['name'];
            if (typeof name !== 'string' || !name)
                return [];
            const rawArgs = candidate['arguments'] ?? candidate['parameters'] ?? candidate['args'] ?? {};
            const argsValue = typeof rawArgs === 'string'
                ? (parseJsonObject(rawArgs) ?? {})
                : rawArgs;
            return [{ name, arguments: JSON.stringify(argsValue) }];
        });
        if (jsonCalls.length > 0)
            return jsonCalls;
    }
    catch {
        /* not structured JSON */
    }
    // 2. Call notation: name(<json object or nothing>).
    const callSyntax = /^([@A-Za-z_][@A-Za-z0-9_.:-]*)\s*\(([\s\S]*)\)\s*$/u.exec(body);
    if (callSyntax) {
        const [, callName, argText] = callSyntax;
        const trimmedArgs = argText.trim();
        const parsedArgs = trimmedArgs ? parseJsonObject(trimmedArgs) : {};
        if (parsedArgs) {
            return [{ name: callName, arguments: JSON.stringify(parsedArgs) }];
        }
    }
    // 3. Tagged notation; yields an empty list when no <function=...> block exists.
    const functionBlocks = body.matchAll(/<function=([@A-Za-z_][@A-Za-z0-9_.:-]*)>([\s\S]*?)<\/function>/g);
    return Array.from(functionBlocks, ([, fnName, fnBody]) => {
        const params = {};
        const paramTags = (fnBody ?? '').matchAll(/<parameter=([A-Za-z_][A-Za-z0-9_-]*)>([\s\S]*?)<\/parameter>/g);
        for (const [, key, value] of paramTags) {
            params[key] = value.trim();
        }
        return { name: fnName, arguments: JSON.stringify(params) };
    });
}
/**
 * Recover tool calls that a model wrote as `<tool_call>…</tool_call>` text
 * instead of returning them as native tool calls.
 *
 * Each block is parsed; only calls whose name is in `knownToolNames` are kept.
 * A block is removed from the returned content only when at least one of its
 * calls was kept, so unrecognised blocks stay visible in the text.
 *
 * @param {string} content - Assistant text to scan (ignored when shorter than 10 characters).
 * @param {Set<string>} knownToolNames - Names of the tools offered for this request.
 * @param {() => string} generateId - Produces an id for each recovered call.
 * @returns {{calls: Array<object>, cleanContent: string}|null} Recovered calls plus
 *   the trimmed remaining text, or `null` when nothing was recovered.
 */
function recoverToolCallsFromText(content, knownToolNames, generateId) {
    if (!content || content.length < 10)
        return null;
    const recovered = [];
    let remaining = content;
    for (const [block, inner] of content.matchAll(/<tool_call>([\s\S]*?)<\/tool_call>/g)) {
        const accepted = parseTextToolCallBody(inner)
            .filter(candidate => knownToolNames.has(candidate.name));
        if (accepted.length === 0)
            continue;
        for (const { name, arguments: argumentsJson } of accepted) {
            recovered.push({
                id: generateId(),
                type: 'function',
                function: { name, arguments: argumentsJson },
            });
        }
        remaining = remaining.replace(block, '');
    }
    if (recovered.length === 0)
        return null;
    return { calls: recovered, cleanContent: remaining.trim() };
}
/**
 * Build the system message that teaches the model the text-level tool
 * protocol used when the server cannot parse native tool calls.
 *
 * Each tool is listed as `- <name>: <description>` followed by its parameter
 * JSON schema. `description` and `parameters` are optional on a tool
 * definition: a missing description is simply omitted and missing parameters
 * are rendered as an empty object schema, so the prompt never contains the
 * literal text "undefined".
 *
 * @param {Array<{function: {name: string, description?: string, parameters?: object}}>} tools
 *   Tool definitions to advertise.
 * @returns {{role: 'system', content: string}} The instruction message.
 */
function toolFallbackInstruction(tools) {
    const toolLines = tools.map(({ function: fn }) => {
        const description = fn.description ? `: ${fn.description}` : '';
        const schema = JSON.stringify(fn.parameters ?? { type: 'object', properties: {} });
        return `- ${fn.name}${description}\n  parameters JSON schema: ${schema}`;
    });
    return {
        role: 'system',
        content: 'The server does not support native OpenAI tool parsing for this request. '
            + 'Use this text tool protocol instead.\n\n'
            + 'When you need a tool, respond with exactly one or more tool calls and no prose:\n'
            + '<tool_call>tool_name({"argument":"value"})</tool_call>\n\n'
            + 'After tool results are provided, answer the user normally. Available tools:\n'
            + toolLines.join('\n'),
    };
}
/**
 * Return a new message list with the text tool-protocol instruction placed
 * first, followed by the given messages in their original order.
 *
 * @param {Iterable<object>} messages - Conversation messages.
 * @param {Array<object>} tools - Tool definitions to describe in the instruction.
 * @returns {Array<object>} New array; `messages` is not modified.
 */
function withTextToolFallbackMessages(messages, tools) {
    const combined = [toolFallbackInstruction(tools)];
    for (const message of messages) {
        combined.push(message);
    }
    return combined;
}
/**
 * Derive a request body for the text-level tool fallback: a shallow copy of
 * `body` without `tools` and `tool_choice`, whose `messages` are prefixed with
 * the tool-protocol instruction. A missing `messages` field is treated as an
 * empty list. The original body is left untouched.
 *
 * @param {object} body - Original request body.
 * @param {Array<object>} tools - Tool definitions to describe in the instruction.
 * @returns {object} The fallback request body.
 */
function withoutNativeTools(body, tools) {
    // Rest-destructuring drops the two native tool fields from the copy.
    const { tools: _nativeTools, tool_choice: _nativeToolChoice, ...fallbackBody } = body;
    fallbackBody['messages'] = withTextToolFallbackMessages(body['messages'] ?? [], tools);
    return fallbackBody;
}
 export class OpenAICompatibleClient extends BaseLLMClient {
+    warnedVllmToolFallback = false;
+    /**
+     * DiffusionGemma on trimmed vLLM builds has no server-side reasoning or
+     * tool-call parser — the native channel protocol is handled client-side
+     * (see gemma-diffusion.ts). Auto-detected from the model name; override
+     * with `gemmaNativeProtocol` in LLMClientOptions.
+     */
+    get gemmaNative() {
+        return this.options.gemmaNativeProtocol ?? isGemmaDiffusionModel(this.options.model);
+    }
+    /**
+     * Build a full endpoint URL, respecting apiBasePath (already baked into this.options.url)
+     * and any queryParams provided at the provider config level.
+     */
+    buildUrl(suffix) {
+        const raw = this.options.url.replace(/\/+$/, '');
+        // Split off any query string already on the configured base URL so the
+        // path is inserted before it (avoids `host/v1?k=v/chat/completions`).
+        const qIdx = raw.indexOf('?');
+        const basePath = (qIdx === -1 ? raw : raw.slice(0, qIdx)).replace(/\/+$/, '');
+        const existingQuery = qIdx === -1 ? '' : raw.slice(qIdx + 1);
+        const path = suffix.startsWith('/') ? suffix : '/' + suffix;
+        const search = new URLSearchParams(existingQuery);
+        const qp = this.options.queryParams;
+        if (qp) {
+            for (const [k, v] of Object.entries(qp)) {
+                if (v != null)
+                    search.set(k, String(v));
+            }
+        }
+        const qs = search.toString();
+        return basePath + path + (qs ? `?${qs}` : '');
+    }
     constructor(options, auditor) {
-        // Ensure URL ends with /v1 for standard endpoints
-        let url = (options.url || 'https://api.openai.com').replace(/\/+$/, '');
-        if (!url.endsWith('/v1')) {
-            url += '/v1';
+        let base = (options.url || 'https://api.openai.com').replace(/\/+$/, '');
+        // Respect apiBasePath (from ProviderConfig.apiBasePath). Default "/v1" for broad compatibility.
+        // Set apiBasePath: '' (or '/') when you are supplying a *complete* path already
+        // (e.g. full Azure ".../deployments/my-model" URL) or for non-/v1 OpenAI-compatible servers.
+        const desired = options.apiBasePath;
+        const shouldAppend = desired !== '' && desired !== '/';
+        if (shouldAppend) {
+            // Normalize to exactly one leading slash and no trailing slash
+            // (so 'v1', '/v1', '//v1' and '/v1/' all become '/v1').
+            const basePath = ('/' + (desired || '/v1').replace(/^\/+/, '')).replace(/\/+$/, '');
+            if (!base.endsWith(basePath)) {
+                base += basePath;
+            }
         }
-        super({ ...options, url }, auditor);
+        super({ ...options, url: base }, auditor);
+    }
+    warnVllmToolFallback() {
+        if (this.warnedVllmToolFallback)
+            return;
+        this.warnedVllmToolFallback = true;
+        console.warn(`[OpenAI] ${VLLM_AUTO_TOOL_CHOICE_HINT}`);
     }
     // ========================================================================
     // Chat
     // ========================================================================
     async chat(messages, options) {
         // Structured output and tools can now be used together.\n        // The provider sends both response_format and tools in the request.\n        // The Router handles skipping validation when the response contains tool calls.
-        const url = `${this.options.url}/chat/completions`;
-        const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
+        const url = this.buildUrl('/chat/completions');
+        const tools = options?.tools;
         const body = {
             model: this.options.model,
             messages: this.convertMessages(messages),
@@ -43,6 +286,13 @@ export class OpenAICompatibleClient extends BaseLLMClient {
                 body['tool_choice'] = options.toolChoice;
             }
         }
+        if (this.gemmaNative) {
+            // Markers must survive decoding for client-side parsing,
+            // and request-level tool parsing is unavailable server-side.
+            body['skip_special_tokens'] = false;
+            if (tools?.length)
+                body['tool_choice'] = 'none';
+        }
         const start = Date.now();
         this.auditor.record({
             timestamp: start,
@@ -50,38 +300,72 @@ export class OpenAICompatibleClient extends BaseLLMClient {
             provider: 'openai',
             model: this.options.model,
         });
-        const response = await httpRequest(url, {
-            method: 'POST',
+        const response = await requestWithVllmToolFallback(url, {
             headers: buildHeaders(this.options),
             body,
             timeout: this.options.timeout ?? 30000,
-        });
+        }, tools, () => this.warnVllmToolFallback());
         const data = response.data;
         const choice = data.choices[0];
         if (!choice) {
             throw new Error('No choices returned from OpenAI API');
         }
+        // vLLM / OpenAI-compatible `usage` carries no timing, so derive decode
+        // throughput from the client-measured wall-clock duration.
+        const durationMs = Date.now() - start;
         const usage = data.usage
             ? {
                 inputTokens: data.usage.prompt_tokens,
                 outputTokens: data.usage.completion_tokens,
                 totalTokens: data.usage.total_tokens,
                 cachedTokens: data.usage.prompt_tokens_details?.cached_tokens,
+                durationMs,
+                tokensPerSecond: durationMs > 0
+                    ? data.usage.completion_tokens / (durationMs / 1000)
+                    : undefined,
             }
             : undefined;
-        // Normalize tool calls (ensure IDs exist)
-        const toolCalls = choice.message.tool_calls?.map(tc => ({
-            ...tc,
-            id: tc.id || this.generateToolCallId(),
-        }));
+        // Normalize tool calls (ensure IDs and JSON-parseable empty args exist).
+        let toolCalls = choice.message.tool_calls?.map(tc => this.normalizeToolCall(tc));
         // Get content, handling null case
-        const content = choice.message.content || '';
+        let content = choice.message.content || '';
+        let reasoning;
+        // Reasoning models served over the OpenAI-compatible API (vLLM
+        // `--reasoning-parser`, DeepSeek-R1, etc.) return the chain-of-thought
+        // in a dedicated field instead of inline <think> tags. vLLM uses
+        // `reasoning_content`; some gateways use `reasoning`.
+        const serverReasoning = choice.message.reasoning ?? choice.message.reasoning_content;
+        if (typeof serverReasoning === 'string' && serverReasoning.length > 0) {
+            reasoning = serverReasoning;
+        }
+        if (this.gemmaNative && content) {
+            const parsed = parseGemmaDiffusionOutput(content);
+            content = parsed.content;
+            if (parsed.reasoning)
+                reasoning = parsed.reasoning;
+            if (!toolCalls?.length && parsed.toolCalls.length) {
+                toolCalls = parsed.toolCalls.map(tc => ({
+                    id: this.generateToolCallId(),
+                    type: 'function',
+                    function: { name: tc.name, arguments: tc.argumentsJson },
+                }));
+            }
+        }
+        if (!toolCalls?.length && tools?.length && content) {
+            const knownToolNames = new Set(tools.map(tool => tool.function.name));
+            const recovered = recoverToolCallsFromText(content, knownToolNames, () => this.generateToolCallId());
+            if (recovered) {
+                toolCalls = recovered.calls;
+                content = recovered.cleanContent;
+            }
+        }
         const result = {
             message: {
                 role: 'assistant',
                 content,
                 tool_calls: toolCalls,
             },
+            ...(reasoning !== undefined && { reasoning }),
             usage,
             provider: 'openai',
         };
@@ -99,8 +383,8 @@ export class OpenAICompatibleClient extends BaseLLMClient {
     // Streaming
     // ========================================================================
     async *chatStream(messages, options) {
-        const url = `${this.options.url}/chat/completions`;
-        const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
+        const url = this.buildUrl('/chat/completions');
+        const tools = options?.tools;
         const body = {
             model: this.options.model,
             messages: this.convertMessages(messages),
@@ -113,6 +397,11 @@ export class OpenAICompatibleClient extends BaseLLMClient {
                 body['tool_choice'] = options.toolChoice;
             }
         }
+        if (this.gemmaNative) {
+            body['skip_special_tokens'] = false;
+            if (tools?.length)
+                body['tool_choice'] = 'none';
+        }
         const start = Date.now();
         this.auditor.record({
             timestamp: start,
@@ -120,71 +409,124 @@ export class OpenAICompatibleClient extends BaseLLMClient {
             provider: 'openai',
             model: this.options.model,
         });
-        const decoder = new StandardChatDecoder(() => { });
+        // In gemma-native mode the decoder classifies thought-channel content,
+        // so we yield ITS events (thinking vs text) instead of the raw deltas.
+        const decoderEvents = [];
+        const decoder = new StandardChatDecoder(this.gemmaNative ? e => decoderEvents.push(e) : () => { });
         // Track accumulated tool calls across chunks
         const toolCallAccum = new Map();
-        const stream = httpStream(url, {
-            method: 'POST',
-            headers: buildHeaders(this.options),
-            body,
-            timeout: this.options.timeout ?? 120000,
-        });
+        let activeBody = body;
+        let retriedWithTextTools = false;
         let usage;
-        for await (const { data } of parseSSE(stream)) {
+        // Accumulates reasoning deltas from servers that stream a dedicated
+        // `reasoning` / `reasoning_content` field (vLLM, DeepSeek-R1, etc.).
+        let reasoningBuffer = '';
+        while (true) {
+            const stream = httpStream(url, {
+                method: 'POST',
+                headers: buildHeaders(this.options),
+                body: activeBody,
+                timeout: this.options.timeout ?? 120000,
+            });
             try {
-                const parsed = JSON.parse(data);
-                if (parsed.usage) {
-                    usage = {
-                        inputTokens: parsed.usage.prompt_tokens,
-                        outputTokens: parsed.usage.completion_tokens,
-                        totalTokens: parsed.usage.total_tokens,
-                        cachedTokens: parsed.usage.prompt_tokens_details?.cached_tokens,
-                    };
-                }
-                const delta = parsed.choices?.[0]?.delta;
-                if (!delta)
-                    continue;
-                if (delta.content) {
-                    decoder.push(delta.content);
-                    yield { type: 'text', content: delta.content };
-                }
-                // Accumulate streamed tool calls
-                if (delta.tool_calls) {
-                    for (const tc of delta.tool_calls) {
-                        const existing = toolCallAccum.get(tc.index);
-                        if (!existing) {
-                            toolCallAccum.set(tc.index, {
-                                id: tc.id || this.generateToolCallId(),
-                                type: 'function',
-                                function: {
-                                    name: tc.function?.name || '',
-                                    arguments: tc.function?.arguments || '',
-                                },
-                            });
+                for await (const { data } of parseSSE(stream)) {
+                    try {
+                        const parsed = JSON.parse(data);
+                        if (parsed.usage) {
+                            usage = {
+                                inputTokens: parsed.usage.prompt_tokens,
+                                outputTokens: parsed.usage.completion_tokens,
+                                totalTokens: parsed.usage.total_tokens,
+                                cachedTokens: parsed.usage.prompt_tokens_details?.cached_tokens,
+                            };
+                        }
+                        const delta = parsed.choices?.[0]?.delta;
+                        if (!delta)
+                            continue;
+                        // Surface server-side reasoning deltas as thinking events.
+                        const reasoningDelta = delta.reasoning ?? delta.reasoning_content;
+                        if (reasoningDelta) {
+                            reasoningBuffer += reasoningDelta;
+                            yield { type: 'thinking', content: reasoningDelta };
                         }
-                        else {
-                            if (tc.function?.arguments) {
-                                existing.function.arguments += tc.function.arguments;
+                        if (delta.content) {
+                            decoder.push(delta.content);
+                            if (this.gemmaNative) {
+                                while (decoderEvents.length)
+                                    yield decoderEvents.shift();
                             }
-                            if (tc.function?.name) {
-                                existing.function.name += tc.function.name;
+                            else {
+                                yield { type: 'text', content: delta.content };
+                            }
+                        }
+                        // Accumulate streamed tool calls
+                        if (delta.tool_calls) {
+                            for (const tc of delta.tool_calls) {
+                                const existing = toolCallAccum.get(tc.index);
+                                if (!existing) {
+                                    toolCallAccum.set(tc.index, {
+                                        id: tc.id || this.generateToolCallId(),
+                                        type: 'function',
+                                        function: {
+                                            name: tc.function?.name || '',
+                                            arguments: tc.function?.arguments || '',
+                                        },
+                                    });
+                                }
+                                else {
+                                    if (tc.function?.arguments) {
+                                        existing.function.arguments += tc.function.arguments;
+                                    }
+                                    if (tc.function?.name) {
+                                        existing.function.name += tc.function.name;
+                                    }
+                                }
+                            }
+                        }
+                        // Emit tool calls when stream finishes
+                        if (parsed.choices?.[0]?.finish_reason === 'tool_calls' || parsed.choices?.[0]?.finish_reason === 'stop') {
+                            if (toolCallAccum.size > 0) {
+                                const calls = Array.from(toolCallAccum.values())
+                                    .map(tc => this.normalizeToolCall(tc));
+                                yield { type: 'tool_call', calls };
                             }
                         }
                     }
-                }
-                // Emit tool calls when stream finishes
-                if (parsed.choices?.[0]?.finish_reason === 'tool_calls' || parsed.choices?.[0]?.finish_reason === 'stop') {
-                    if (toolCallAccum.size > 0) {
-                        const calls = Array.from(toolCallAccum.values());
-                        yield { type: 'tool_call', calls };
+                    catch {
+                        // Skip unparseable SSE data
                     }
                 }
+                break;
             }
-            catch {
-                // Skip unparseable SSE data
+            catch (error) {
+                if (!retriedWithTextTools
+                    && tools?.length
+                    && hasToolDefinitions(activeBody)
+                    && isVllmAutoToolChoiceError(error)) {
+                    this.warnVllmToolFallback();
+                    activeBody = withoutNativeTools(activeBody, tools);
+                    retriedWithTextTools = true;
+                    continue;
+                }
+                throw error;
             }
         }
         decoder.flush();
+        if (this.gemmaNative) {
+            while (decoderEvents.length)
+                yield decoderEvents.shift();
+        }
+        // Augment usage with client-measured timing (vLLM streams no timing).
+        if (usage) {
+            const durationMs = Date.now() - start;
+            usage = {
+                ...usage,
+                durationMs,
+                tokensPerSecond: durationMs > 0
+                    ? usage.outputTokens / (durationMs / 1000)
+                    : undefined,
+            };
+        }
         this.auditor.record({
             timestamp: Date.now(),
             type: 'stream_end',
@@ -193,25 +535,75 @@ export class OpenAICompatibleClient extends BaseLLMClient {
             duration: Date.now() - start,
             usage,
         });
-        const finalToolCalls = toolCallAccum.size > 0
-            ? Array.from(toolCallAccum.values())
+        let finalToolCalls = toolCallAccum.size > 0
+            ? Array.from(toolCallAccum.values()).map(tc => this.normalizeToolCall(tc))
             : undefined;
+        let cleanContent = decoder.getCleanContent();
+        // Prefer the server's dedicated reasoning field; fall back to <think>
+        // tags parsed from the content stream by the decoder.
+        let reasoning = reasoningBuffer || decoder.getReasoning();
+        if (this.gemmaNative) {
+            // Native tool-call blocks live in the text channel; extract them.
+            const parsed = parseGemmaDiffusionOutput(cleanContent);
+            cleanContent = parsed.content;
+            if (parsed.reasoning) {
+                reasoning = reasoning ? `${reasoning}\n\n${parsed.reasoning}` : parsed.reasoning;
+            }
+            if (!finalToolCalls?.length && parsed.toolCalls.length) {
+                finalToolCalls = parsed.toolCalls.map(tc => ({
+                    id: this.generateToolCallId(),
+                    type: 'function',
+                    function: { name: tc.name, arguments: tc.argumentsJson },
+                }));
+                yield { type: 'tool_call', calls: finalToolCalls };
+            }
+        }
+        if (!finalToolCalls?.length && tools?.length && cleanContent) {
+            const knownToolNames = new Set(tools.map(tool => tool.function.name));
+            const recovered = recoverToolCallsFromText(cleanContent, knownToolNames, () => this.generateToolCallId());
+            if (recovered) {
+                finalToolCalls = recovered.calls;
+                cleanContent = recovered.cleanContent;
+                yield { type: 'tool_call', calls: finalToolCalls };
+            }
+        }
         return {
             message: {
                 role: 'assistant',
-                content: decoder.getCleanContent(),
+                content: cleanContent,
                 tool_calls: finalToolCalls,
             },
-            reasoning: decoder.getReasoning(),
+            reasoning,
             usage,
             provider: 'openai',
         };
     }
+    normalizeToolCall(toolCall) {
+        return {
+            ...toolCall,
+            id: toolCall.id || this.generateToolCallId(),
+            type: 'function',
+            function: {
+                ...toolCall.function,
+                name: toolCall.function?.name || '',
+                arguments: this.normalizeToolArguments(toolCall.function?.arguments),
+            },
+        };
+    }
+    normalizeToolArguments(args) {
+        if (typeof args === 'string') {
+            return args.trim().length > 0 ? args : '{}';
+        }
+        if (args == null) {
+            return '{}';
+        }
+        return JSON.stringify(args) ?? '{}';
+    }
     // ========================================================================
     // Embeddings
     // ========================================================================
     async embed(text) {
-        const url = `${this.options.url}/embeddings`;
+        const url = this.buildUrl('/embeddings');
         const response = await httpRequest(url, {
             method: 'POST',
             headers: buildHeaders(this.options),
@@ -227,7 +619,7 @@ export class OpenAICompatibleClient extends BaseLLMClient {
     // Model Discovery
     // ========================================================================
     async getModels() {
-        const url = `${this.options.url}/models`;
+        const url = this.buildUrl('/models');
         try {
             const response = await httpRequest(url, {
                 headers: buildHeaders(this.options),
@@ -243,12 +635,7 @@ export class OpenAICompatibleClient extends BaseLLMClient {
     // Internals
     // ========================================================================
     convertMessages(messages) {
-        // OpenAI format is our canonical format, minimal conversion needed
-        return messages.map(msg => ({
-            ...msg,
-            // Ensure content is never null/undefined
-            content: msg.content ?? '',
-        }));
+        return normalizeMessagesForOpenAICompat(messages);
     }
     buildRequestParams(options) {
         const params = {
@@ -259,6 +646,27 @@ export class OpenAICompatibleClient extends BaseLLMClient {
             params['temperature'] = options.temperature;
         if (options?.maxTokens !== undefined)
             params['max_tokens'] = options.maxTokens;
+        // Unified thinking flag. Per-call overrides model config; only emitted
+        // when explicitly set, so servers that reject unknown fields are
+        // unaffected by default. OpenAI reasoning models (o-series / GPT-5) use
+        // `reasoning_effort`; vLLM / Qwen use `chat_template_kwargs.enable_thinking`.
+        // A user-supplied value (via parameters) always wins.
+        const thinking = resolveThinking(options?.thinking, this.options.thinking);
+        if (thinking) {
+            const isOfficialOpenAI = (this.options.url ?? '').includes('api.openai.com');
+            if (isOpenAIReasoningModel(this.options.model)) {
+                if (params['reasoning_effort'] === undefined) {
+                    params['reasoning_effort'] = thinking.enabled ? (thinking.level ?? 'medium') : 'minimal';
+                }
+            }
+            else if (!isOfficialOpenAI) {
+                // `chat_template_kwargs` is a vLLM/Qwen extension. Official OpenAI
+                // rejects unknown body fields (and gpt-4o has no thinking toggle),
+                // so only send it to self-hosted / compatible gateways.
+                const existing = params['chat_template_kwargs'] ?? {};
+                params['chat_template_kwargs'] = { enable_thinking: thinking.enabled, ...existing };
+            }
+        }
         return params;
     }
     // ========================================================================
@@ -300,4 +708,3 @@ export class OpenAICompatibleClient extends BaseLLMClient {
         };
     }
 }
-//# sourceMappingURL=openai.js.map