npm - universal-llm-client - Versions diffs - 4.3.0 → 4.5.1 - Mend

universal-llm-client 4.3.0 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77)

package/CHANGELOG.md +34 -19
package/README.md +62 -11
package/dist/ai-model.d.ts +12 -2
package/dist/ai-model.js +36 -2
package/dist/auditor.d.ts +0 -1
package/dist/auditor.js +0 -1
package/dist/client.d.ts +0 -1
package/dist/client.js +0 -1
package/dist/gemma-channel.d.ts +13 -0
package/dist/gemma-channel.js +37 -0
package/dist/gemma-diffusion.d.ts +48 -0
package/dist/gemma-diffusion.js +146 -0
package/dist/http.d.ts +4 -1
package/dist/http.js +14 -2
package/dist/index.d.ts +2 -2
package/dist/index.js +4 -1
package/dist/interfaces.d.ts +163 -8
package/dist/interfaces.js +0 -1
package/dist/mcp.d.ts +0 -1
package/dist/mcp.js +0 -1
package/dist/providers/anthropic.d.ts +0 -1
package/dist/providers/anthropic.js +28 -4
package/dist/providers/google.d.ts +22 -2
package/dist/providers/google.js +223 -14
package/dist/providers/index.d.ts +0 -1
package/dist/providers/index.js +0 -1
package/dist/providers/ollama.d.ts +2 -1
package/dist/providers/ollama.js +59 -31
package/dist/providers/openai.d.ts +16 -1
package/dist/providers/openai.js +488 -81
package/dist/router.d.ts +2 -1
package/dist/router.js +4 -1
package/dist/stream-decoder.d.ts +12 -1
package/dist/stream-decoder.js +182 -6
package/dist/structured-output.d.ts +0 -1
package/dist/structured-output.js +0 -1
package/dist/thinking.d.ts +35 -0
package/dist/thinking.js +51 -0
package/dist/tools.d.ts +0 -1
package/dist/tools.js +0 -1
package/dist/zod-adapter.d.ts +0 -1
package/dist/zod-adapter.js +0 -1
package/package.json +3 -1
package/dist/ai-model.d.ts.map +0 -1
package/dist/ai-model.js.map +0 -1
package/dist/auditor.d.ts.map +0 -1
package/dist/auditor.js.map +0 -1
package/dist/client.d.ts.map +0 -1
package/dist/client.js.map +0 -1
package/dist/http.d.ts.map +0 -1
package/dist/http.js.map +0 -1
package/dist/index.d.ts.map +0 -1
package/dist/index.js.map +0 -1
package/dist/interfaces.d.ts.map +0 -1
package/dist/interfaces.js.map +0 -1
package/dist/mcp.d.ts.map +0 -1
package/dist/mcp.js.map +0 -1
package/dist/providers/anthropic.d.ts.map +0 -1
package/dist/providers/anthropic.js.map +0 -1
package/dist/providers/google.d.ts.map +0 -1
package/dist/providers/google.js.map +0 -1
package/dist/providers/index.d.ts.map +0 -1
package/dist/providers/index.js.map +0 -1
package/dist/providers/ollama.d.ts.map +0 -1
package/dist/providers/ollama.js.map +0 -1
package/dist/providers/openai.d.ts.map +0 -1
package/dist/providers/openai.js.map +0 -1
package/dist/router.d.ts.map +0 -1
package/dist/router.js.map +0 -1
package/dist/stream-decoder.d.ts.map +0 -1
package/dist/stream-decoder.js.map +0 -1
package/dist/structured-output.d.ts.map +0 -1
package/dist/structured-output.js.map +0 -1
package/dist/tools.d.ts.map +0 -1
package/dist/tools.js.map +0 -1
package/dist/zod-adapter.d.ts.map +0 -1
package/dist/zod-adapter.js.map +0 -1

package/CHANGELOG.md CHANGED Viewed

@@ -5,41 +5,56 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## [4.3.0] - 2026-05-14
+## [4.5.1] - 2026-06-14
 ### Added
-- **`TokenUsageInfo.reasoningTokens`** — new optional field reporting server-side thinking/reasoning tokens that are billed but not emitted as visible text. Currently populated by the Google provider from `usageMetadata.thoughtsTokenCount` for Gemini thinking models (2.5 Pro / 3.x Pro). Other providers either roll thinking into `outputTokens` (Ollama) or surface it via `DecodedEvent { type: 'thinking' }`.
-- **`GoogleResponse.usageMetadata.thoughtsTokenCount`** — typed on the raw provider response shape.
+- **Automatic vLLM tool-calling fallback (OpenAI-compatible provider)** — when a vLLM / OpenAI-compatible server rejects native tool calling (started without `--enable-auto-tool-choice` / `--tool-call-parser`, surfaced as a `"auto" tool choice requires …` 400), the client now transparently retries with a text-level tool protocol: it drops `tools` / `tool_choice`, injects a `<tool_call>name({…})</tool_call>` instruction, and recovers the emitted calls back into `message.tool_calls` (parsing `<tool_call>`, `<function=…>`, and `name(args)` forms). Works for both `chat()` and `chatStream()`, and emits a one-time warning pointing to the flags needed for native parsing. Tool calling now works against vLLM servers not started with tool-parser flags.
 ### Changed
-- **Slimmer published tarball** — `src/` is no longer shipped to npm. The published package drops from ~271 kB / 144 files to ~146 kB / 72 files. Source maps now embed original sources (`inlineSources: true`), so debuggers stepping into the library continue to show TypeScript source without needing the raw `.ts` files in the tarball.
+- **Non-leading system messages normalized for OpenAI-compatible servers** — leading system messages are preserved, but a system message appearing *after* the conversation has started is rewritten to a `user` turn prefixed with `[SYSTEM MESSAGE]` (many OpenAI-compatible servers and chat templates reject mid-conversation system roles). Multimodal content is flattened to text for the prefix.
+- **`chat()` / `chatStream()` no longer auto-attach registered tools** — tools are sent only when explicitly passed via `options.tools` (the `chatWithTools` path already does this). Previously every tool added with `registerTool` was attached to plain `chat()` calls. Pass `{ tools }` explicitly (or use `chatWithTools`) if you relied on the old behavior.
+- **Minimal published package** — the npm tarball now ships only `dist/` (compiled JS + type declarations) plus `README` / `CHANGELOG` / `LICENSE`. `src/` (tests, demos, test-scripts) is no longer published, and the build no longer emits `.js.map` / `.d.ts.map` (they referenced sources that aren't shipped and triggered "missing source" warnings in consumer bundlers). Package size dropped ~78% (357 kB → 80 kB, 175 → 44 files). Because `src/` is no longer whitelisted, a local demo model cache under `src/` can never bloat the published package again.
-## [4.2.0] - Previously released
+## [4.5.0] - 2026-06-14
 ### Added
-- **Ollama native thinking enabled by default** — `think` parameter is now sent on every request to prevent model-default thinking from leaking into tool selection / reasoning paths.
-- **Ollama stream idle timeout** — `OllamaClient` enforces a minimum response wait time to avoid hung streams.
-- **Ollama live context length** — provider now fetches the running context length from `/api/ps` and reports model capabilities.
-- **Google Gemini service tiers** — configurable service tier with per-tier timeouts and retry logic.
-- **`LLMAudioContent` type** — audio content support across providers.
-- **Structured output tolerates markdown fences** — JSON output wrapped in ```` ```json ```` fences is now parsed correctly.
+- **Server-side reasoning field support (OpenAI-compatible provider)** — Reasoning models served over the OpenAI-compatible API (vLLM `--reasoning-parser`, DeepSeek-R1, etc.) return their chain-of-thought in a dedicated `reasoning_content` (vLLM) / `reasoning` (some gateways) field instead of inline `<think>` tags. The OpenAI provider now surfaces it:
+  - `chat()` populates `LLMChatResponse.reasoning` from `message.reasoning_content` / `message.reasoning`, keeping `message.content` clean
+  - `chatStream()` emits `delta.reasoning_content` / `delta.reasoning` chunks as `thinking` stream events and accumulates them into the final `reasoning`
+  - Inline `<think>` parsing (via `StandardChatDecoder`) is preserved as a fallback for servers run without a reasoning parser
+  - `OpenAIResponse` message type extended with optional `reasoning` / `reasoning_content`
+  - Verified end-to-end against vLLM serving `nvidia/Qwen3.6-35B-A3B-NVFP4` (NVFP4) on Blackwell — reasoning, streaming `thinking` events, tool calling (`qwen3_xml`), and structured output all pass
+- **Unified `thinking` flag with levels across all providers** — `thinking` (model config) and per-call `ChatOptions.thinking` accept `true`/`false` **or a level `'minimal' | 'low' | 'medium' | 'high'`** (new `ThinkingLevel` type), mapped to each backend's native control so apps switch providers without reasoning-specific code. A shared `resolveThinking` helper (`src/thinking.ts`) normalizes the value; each provider maps it:
+  - **OpenAI-compatible** → OpenAI reasoning models (o-series / GPT-5, by name) get `reasoning_effort:<level>`; vLLM / Qwen get `chat_template_kwargs.enable_thinking`. Emitted only when explicitly set.
+  - **Google / Gemini** → Gemini 3.x `thinkingConfig.thinkingLevel`; Gemini 2.5/2.0 `thinkingBudget` (level→budget map, `0` off, `-1` dynamic). `includeThoughts` enabled when thinking is on.
+  - **Anthropic** → extended thinking `budget_tokens` from the level (kept `< max_tokens`; temperature omitted, per API).
+  - **Ollama** → `think` on/off (no native levels).
+  - A per-call value overrides the model config everywhere. Verified live against vLLM (Qwen3.6-NVFP4) and **Gemini 3.5 Flash** (levels produce distinct reasoning-token counts); unit-tested per provider + `resolveThinking`.
+- **Gemini reasoning text surfaced** — with thinking on, the Google provider sets `includeThoughts:true` and routes `thought:true` parts into `response.reasoning` (non-streaming) and live `thinking` stream events, matching how Qwen/Anthropic expose chain-of-thought (previously only `reasoningTokens` was reported). Verified live on `gemini-3.5-flash` (654–976 chars of reasoning across levels).
+- **Gemini Deep Research API** — new Google-only `AIModel.deepResearch(input, opts)` (creates a `/v1beta/interactions` background interaction and polls to completion → `{ id, status, report, steps }`) and `AIModel.deepResearchStream(input, opts)` (live `thought`/`text`/`status` events). New `DeepResearchOptions`/`DeepResearchResult`/`DeepResearchStep`/`DeepResearchEvent` types. Throws a clear error if no Google provider is configured. Create + poll plumbing verified live.
+- **Generation stats — `usage.durationMs` and `usage.tokensPerSecond`** — decode throughput is now reported on `LLMChatResponse.usage`: server-precise for Ollama (from `eval_count` / `eval_duration`, which were previously discarded), and client-measured wall-clock for OpenAI-compatible / vLLM (which return no timing in `usage`). `OllamaResponse` gained `total_duration` / `load_duration` typings.
+- **OpenAI-compatible transport flexibility** — `ProviderConfig` gains `headers`, `queryParams`, `authHeader`, `authPrefix`, and `apiBasePath` for Azure OpenAI, custom gateways, and non-`/v1` servers (no custom code needed). Honored by the OpenAI-compatible provider (`headers`/`authHeader`/`authPrefix` also by Ollama, via `buildHeaders`); a `buildUrl` helper applies `queryParams`/`apiBasePath` across all endpoints and preserves any query string already on the base URL. Includes a 2026 provider-API-landscape research doc (`docs/research/`).
-## [4.1.0] - Previously released
+### Fixed
+- **README** — the tool-execution trace field is `response.toolExecutions` (array of `{ tool_call_id, output, error?, duration? }`), not `toolTrace`; corrected the `chatWithTools` example.
+## [4.4.0] - 2026-06-11
 ### Added
-- **Prompt caching support** — first-class cache configuration plumbed through providers.
-- **Anthropic provider** — `AnthropicClient` implementation of the universal protocol.
-- **Request cancellation** — `signal: AbortSignal` accepted on all chat methods.
-- **Decoupled structured output from Zod** — new generic `SchemaConfig` interface; Zod becomes an adapter (`./zod` sub-path) rather than a hard dependency on the schema path.
-- **Wider JSONSchema tuple types** — schema utilities accept more JSON Schema tuple shapes.
+- **Diffusion LM support (DiffusionGemma family)** — First-class client-side protocol for diffusion language models served by OpenAI-compatible endpoints that ship without server-side reasoning or tool-call parsers (e.g. current vLLM diffusion builds, which reject request-level `tools` with auto tool choice):
+  - `gemma-diffusion.ts` — model detection (`isGemmaDiffusionModel`), native channel parsing (`<|channel>thought … <channel|>` reasoning, `<|tool_call>call:name{…}<tool_call|>` tool calls), and pseudo-JSON argument conversion (`gemmaArgsToJson`: `<|"|>` quote tokens, bare keys, nested objects/arrays)
+  - OpenAI provider native mode (auto-detected from the model name, `gemmaNativeProtocol` option to override): sends `skip_special_tokens: false` and `tools` + `tool_choice: "none"` (declarations still render into the chat template), parses reasoning and tool calls client-side, and yields decoder-classified `thinking`/`text` streaming events
+  - Full agentic `chatWithTools` loop works end-to-end against DiffusionGemma; history tool turns use standard structured `tool_calls` + `role: "tool"` messages
+- **"Signal from Noise" demo** (`src/demos/diffusion-gemma/`) — vLLM test harness plus a diffusion chat canvas that animates block-parallel denoising paced by real block arrivals, with replay/scrubbing, reasoning-channel separation, a rendered-markdown reading view, and an engine-reload entropy control
-### Changed
+### Fixed
-- Tests migrated to the new `fromZod` adapter API.
+- Stray unbalanced `<channel|>` / `<turn|>` markers emitted by diffusion models are stripped from parsed content
 ## [4.0.0] - 2026-03-13

package/README.md CHANGED Viewed

@@ -1,12 +1,14 @@
 # universal-llm-client
-A universal LLM client for JavaScript/TypeScript with **transparent provider failover**, streaming tool execution, pluggable reasoning strategies, and native observability.
+[![npm version](https://img.shields.io/npm/v/universal-llm-client.svg)](https://www.npmjs.com/package/universal-llm-client) [![CI](https://github.com/igorls/universal-llm-client/actions/workflows/ci.yml/badge.svg)](https://github.com/igorls/universal-llm-client/actions/workflows/ci.yml) [![npm downloads](https://img.shields.io/npm/dm/universal-llm-client.svg)](https://www.npmjs.com/package/universal-llm-client) [![License: MIT](https://img.shields.io/npm/l/universal-llm-client.svg)](https://github.com/igorls/universal-llm-client/blob/main/LICENSE)
+A universal LLM client for JavaScript/TypeScript with **transparent provider failover** and a **provider-agnostic reasoning API** — one set of code across OpenAI, Anthropic, Google Gemini, Ollama, vLLM, and any OpenAI-compatible endpoint. Streaming tool execution, structured output, generation stats, and native observability included.
 ```typescript
 import { AIModel } from 'universal-llm-client';
 const model = new AIModel({
-    model: 'gemini-2.5-flash',
+    model: 'gemini-3.5-flash',
     providers: [
         { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
         { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
@@ -26,10 +28,13 @@ const response = await model.chat([
 ## Features
 - 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
+- 🧠 **Unified Reasoning** — One `thinking` flag (`true`/`false` or a level: `'minimal' | 'low' | 'medium' | 'high'`) mapped to each backend's native control; chain-of-thought surfaced as `response.reasoning` + streaming `thinking` events (with `<think>`-tag parsing as a fallback)
 - 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
 - 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
 - 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
-- 🧠 **Reasoning** — Native `<think>` tag parsing, interleaved reasoning, and model thinking support
+- 🔬 **Deep Research** — Drive Google Gemini's agentic Deep Research (background interactions with polling + streaming)
+- 📈 **Generation Stats** — `usage.tokensPerSecond` and `durationMs` reported across providers
+- 🔌 **Flexible Transport** — Custom headers, query params, auth header/prefix, and base path for Azure OpenAI and gateways
 - 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
 - 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
 - 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
@@ -39,11 +44,14 @@ const response = await model.chat([
 | Provider | Type | Notes |
 |---|---|---|
-| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
-| **OpenAI** | `openai` | GPT-4o, o3, etc. Also works with OpenRouter, Groq, LM Studio, vLLM |
-| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
-| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints and Bearer tokens |
-| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances |
+| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal, native thinking |
+| **OpenAI + Compat** | `openai` | GPT series, o-series + **any OpenAI-compatible endpoint**: xAI/Grok, Mistral, DeepSeek, Cohere Compatibility, Groq, Together, Fireworks, OpenRouter, Perplexity Sonar, vLLM, LM Studio, TGI, most self-hosted servers |
+| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal, native thinking + grounding |
+| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints, Bearer tokens, service tiers (flex/priority) |
+| **Anthropic (Claude)** | `anthropic` | Claude 3.5/4 models via native Messages API. Excellent tool use, extended thinking with signatures, strong prompt caching |
+| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances (OpenAI-compatible under the hood) |
+**Most of the world** is reachable via `type: 'openai'` + a `url` override. We only maintain dedicated clients for fundamentally different protocols (Anthropic Messages, Google Gemini) that offer unique high-value capabilities, plus Ollama for local developer experience. See `docs/guide/providers.md` and the research survey in `docs/research/provider-api-landscape-2026.md`.
 ---
@@ -98,6 +106,44 @@ for await (const event of model.chatStream([
 }
 ```
+### Thinking & Reasoning
+Set one `thinking` value — `true`/`false` or a level (`'minimal' | 'low' | 'medium' | 'high'`) —
+and it maps to each provider's native control (Gemini `thinkingLevel`/`thinkingBudget`, OpenAI
+`reasoning_effort`, vLLM `enable_thinking`, Anthropic `budget_tokens`, Ollama `think`):
+```typescript
+const model = new AIModel({
+    model: 'gemini-3.5-flash',
+    thinking: 'high', // true | false | 'minimal' | 'low' | 'medium' | 'high'
+    providers: [{ type: 'google', apiKey: process.env.GOOGLE_API_KEY }],
+});
+const res = await model.chat([{ role: 'user', content: 'Solve this step by step: ...' }]);
+console.log(res.message.content); // final answer (clean)
+console.log(res.reasoning);       // chain-of-thought, when the model exposes it
+// Per-call override (e.g. turn thinking off for structured output)
+await model.chat(messages, { thinking: false });
+```
+### Deep Research (Gemini)
+Run Google's agentic Deep Research — creates a background interaction and polls to completion:
+```typescript
+const result = await model.deepResearch('Research the history of Google TPUs.', {
+    tools: ['google_search', 'url_context'],
+});
+console.log(result.status, result.report);
+// Or stream intermediate thoughts and steps as they arrive:
+for await (const ev of model.deepResearchStream('Compare RISC-V vs ARM in 2026.')) {
+    if (ev.type === 'thought') console.log('[thinking]', ev.content);
+    else if (ev.type === 'text') process.stdout.write(ev.content);
+}
+```
 ### Tool Calling
 ```typescript
@@ -124,8 +170,8 @@ const response = await model.chatWithTools([
 console.log(response.message.content);
 // "The weather in Tokyo is 22°C and sunny."
-console.log(response.toolTrace);
-// [{ name: 'get_weather', args: { city: 'Tokyo' }, result: {...}, duration: 5 }]
+console.log(response.toolExecutions);
+// [{ tool_call_id: 'call_abc', output: { temperature: 22, condition: 'sunny', city: 'Tokyo' }, duration: 5 }]
 ```
 ### Provider Failover
@@ -421,13 +467,18 @@ new AIModel(config: AIModelConfig)
 | Property | Type | Description |
 |---|---|---|
-| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
+| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'`, `'anthropic'` |
 | `url` | `string` | Provider URL (has sensible defaults) |
 | `apiKey` | `string` | API key or Bearer token |
 | `priority` | `number` | Lower = tried first (defaults to array index) |
 | `model` | `string` | Override model name for this provider |
 | `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
 | `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
+| `headers` | `Record<string,string>` | Extra headers merged into requests — OpenAI-compatible & Ollama (Azure `api-key`, gateways) |
+| `queryParams` | `Record<string,string>` | Query params appended to URLs — OpenAI-compatible only (e.g. Azure `api-version`) |
+| `authHeader` | `string` | Header name for the key — OpenAI-compatible & Ollama (e.g. `'api-key'`) |
+| `authPrefix` | `string` | Prefix before the key value — OpenAI-compatible & Ollama (e.g. `''` for api-key style) |
+| `apiBasePath` | `string` | OpenAI-compatible only: override or disable the `/v1` suffix (use `''` for full Azure deployment URLs) |
 **Methods:**

package/dist/ai-model.d.ts CHANGED Viewed

@@ -6,7 +6,7 @@
  *
  * Provider classes are internal — the user never imports them.
  */
-import { type AIModelConfig, type LLMChatMessage, type LLMChatResponse, type ChatOptions, type ModelMetadata, type LLMFunction, type ToolHandler } from './interfaces.js';
+import { type AIModelConfig, type LLMChatMessage, type LLMChatResponse, type ChatOptions, type ModelMetadata, type LLMFunction, type ToolHandler, type DeepResearchOptions, type DeepResearchResult, type DeepResearchEvent } from './interfaces.js';
 import type { DecodedEvent } from './stream-decoder.js';
 import { type ProviderStatus } from './router.js';
 import { type StructuredOutputResult, type SchemaConfig } from './structured-output.js';
@@ -113,6 +113,17 @@ export declare class AIModel {
     embed(text: string): Promise<number[]>;
     /** Generate embeddings for multiple texts */
     embedArray(texts: string[]): Promise<number[][]>;
+    private getGoogleClient;
+    /**
+     * Run an agentic Deep Research interaction (Gemini only): creates it and
+     * polls until completion. Throws if no AI Studio Google provider (type: "google") is configured.
+     */
+    deepResearch(input: string, options?: DeepResearchOptions): Promise<DeepResearchResult>;
+    /**
+     * Stream a Deep Research interaction's intermediate thought/text/step events
+     * (Gemini only), returning the final result. Throws if no AI Studio Google provider (type: "google") is configured.
+     */
+    deepResearchStream(input: string, options?: DeepResearchOptions): AsyncGenerator<DeepResearchEvent, DeepResearchResult, unknown>;
     /** Register a tool callable by the LLM (broadcast to all providers) */
     registerTool(name: string, description: string, parameters: LLMFunction['parameters'], handler: ToolHandler): void;
     /** Register multiple tools at once */
@@ -137,4 +148,3 @@ export declare class AIModel {
     private createClient;
     private normalizeType;
 }
-//# sourceMappingURL=ai-model.d.ts.map

package/dist/ai-model.js CHANGED Viewed

@@ -172,6 +172,34 @@ export class AIModel {
         return this.router.embedArray(texts);
     }
     // ========================================================================
+    // Deep Research (Gemini-only)
+    // ========================================================================
+    getGoogleClient(method) {
+        const googleClients = this.router.getClients().filter((c) => c instanceof GoogleClient);
+        // Prefer an AI Studio client — Vertex AI doesn't support Deep Research.
+        const aiStudio = googleClients.find(c => c.supportsDeepResearch());
+        if (aiStudio)
+            return aiStudio;
+        if (googleClients.length > 0) {
+            throw new Error(`${method} requires an AI Studio Google provider (type: "google"); Vertex AI is not supported for Deep Research.`);
+        }
+        throw new Error(`${method} requires a Google provider (type: "google"). None is configured.`);
+    }
+    /**
+     * Run an agentic Deep Research interaction (Gemini only): creates it and
+     * polls until completion. Throws if no AI Studio Google provider (type: "google") is configured.
+     */
+    async deepResearch(input, options) {
+        return this.getGoogleClient('deepResearch').deepResearch(input, options);
+    }
+    /**
+     * Stream a Deep Research interaction's intermediate thought/text/step events
+     * (Gemini only), returning the final result. Throws if no AI Studio Google provider (type: "google") is configured.
+     */
+    async *deepResearchStream(input, options) {
+        return yield* this.getGoogleClient('deepResearchStream').deepResearchStream(input, options);
+    }
+    // ========================================================================
     // Tool Registration
     // ========================================================================
     /** Register a tool callable by the LLM (broadcast to all providers) */
@@ -232,9 +260,16 @@ export class AIModel {
             retries: this.config.retries ?? 2,
             debug: this.config.debug ?? false,
             defaultParameters: this.config.defaultParameters,
-            thinking: this.config.thinking ?? false,
+            // Preserve `undefined` (not set) vs explicit false so providers can
+            // decide whether to send a thinking toggle at all.
+            thinking: this.config.thinking,
             region: providerConfig.region,
             apiVersion: providerConfig.apiVersion,
+            extraHeaders: providerConfig.headers,
+            queryParams: providerConfig.queryParams,
+            authHeader: providerConfig.authHeader,
+            authPrefix: providerConfig.authPrefix,
+            apiBasePath: providerConfig.apiBasePath,
         };
         switch (type) {
             case 'ollama':
@@ -255,4 +290,3 @@ export class AIModel {
         return type.toLowerCase();
     }
 }
-//# sourceMappingURL=ai-model.js.map

package/dist/auditor.d.ts CHANGED Viewed

@@ -79,4 +79,3 @@ export declare class BufferedAuditor implements Auditor {
     /** Clear all buffered events without flushing */
     clear(): void;
 }
-//# sourceMappingURL=auditor.d.ts.map

package/dist/auditor.js CHANGED Viewed

@@ -110,4 +110,3 @@ export class BufferedAuditor {
         this.events.length = 0;
     }
 }
-//# sourceMappingURL=auditor.js.map

package/dist/client.d.ts CHANGED Viewed

@@ -86,4 +86,3 @@ export declare abstract class BaseLLMClient {
      */
     protected validateStructuredResponse(content: string, config: SchemaConfig<unknown>): void;
 }
-//# sourceMappingURL=client.d.ts.map

package/dist/client.js CHANGED Viewed

@@ -297,4 +297,3 @@ export class BaseLLMClient {
         }
     }
 }
-//# sourceMappingURL=client.js.map

package/dist/gemma-channel.d.ts ADDED Viewed

@@ -0,0 +1,13 @@
+/**
+ * Gemma 4 can emit its thought channel as text control tokens instead of the
+ * generic Ollama `message.thinking` field. Keep that provider quirk isolated so
+ * callers receive final-answer text and reasoning separately.
+ */
+export interface GemmaThoughtExtraction {
+    readonly content: string;
+    readonly reasoning: string;
+    readonly found: boolean;
+}
+export declare const GEMMA_THOUGHT_OPENERS: readonly ["<|channel>thought", "<|thought"];
+export declare function extractGemmaThoughtChannels(input: string): GemmaThoughtExtraction;
+export declare function normalizeGemmaThought(thought: string): string;

package/dist/gemma-channel.js ADDED Viewed

@@ -0,0 +1,37 @@
+/**
+ * Gemma 4 can emit its thought channel as text control tokens instead of the
+ * generic Ollama `message.thinking` field. Keep that provider quirk isolated so
+ * callers receive final-answer text and reasoning separately.
+ */
+const GEMMA_THOUGHT_BLOCK = /<\|channel>\s*thought\s*\r?\n?([\s\S]*?)<channel\|>/gi;
+const GEMMA_COMPACT_THOUGHT_BLOCK = /<\|thought\s*\r?\n?([\s\S]*?)\|>/gi;
+export const GEMMA_THOUGHT_OPENERS = ['<|channel>thought', '<|thought'];
+export function extractGemmaThoughtChannels(input) {
+    if (!input)
+        return { content: input, reasoning: '', found: false };
+    const reasoningParts = [];
+    let found = false;
+    const content = input
+        .replace(GEMMA_THOUGHT_BLOCK, (_match, thought) => {
+        found = true;
+        const normalized = normalizeGemmaThought(thought);
+        if (normalized)
+            reasoningParts.push(normalized);
+        return '';
+    })
+        .replace(GEMMA_COMPACT_THOUGHT_BLOCK, (_match, thought) => {
+        found = true;
+        const normalized = normalizeGemmaThought(thought);
+        if (normalized)
+            reasoningParts.push(normalized);
+        return '';
+    });
+    return {
+        content,
+        reasoning: reasoningParts.join('\n\n'),
+        found,
+    };
+}
+export function normalizeGemmaThought(thought) {
+    return thought.replace(/^\s+/, '').replace(/\s+$/, '');
+}

package/dist/gemma-diffusion.d.ts ADDED Viewed

@@ -0,0 +1,48 @@
+/**
+ * DiffusionGemma (vLLM) native-protocol adapter.
+ *
+ * Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
+ * and NO tool-call parser module, and they reject OpenAI-style `tools` unless
+ * `--tool-call-parser` is configured. Everything therefore has to be handled
+ * client-side, against the model's native channel format (visible only when
+ * the request sets `skip_special_tokens: false`):
+ *
+ *   <|channel>thought ...reasoning... <channel|>          reasoning channel
+ *   <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|>  tool call
+ *
+ * Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
+ * <|"|> quote token, numbers/booleans are bare (see the model's
+ * chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
+ * that into a standard JSON string.
+ *
+ * Request-side protocol (implemented in the OpenAI provider):
+ *   - always send `skip_special_tokens: false`
+ *   - send `tools` with `tool_choice: 'none'` — vLLM still renders the
+ *     declarations into the chat template, it just skips its (absent) parser
+ *   - send history tool turns structurally (assistant `tool_calls` +
+ *     `role: 'tool'` messages) — the chat template renders them natively
+ */
+export interface GemmaParsedToolCall {
+    readonly name: string;
+    /** JSON-encoded arguments object, ready for LLMToolCall.function.arguments */
+    readonly argumentsJson: string;
+}
+export interface GemmaDiffusionParsed {
+    /** Final answer with reasoning, tool-call blocks and special tokens removed */
+    readonly content: string;
+    readonly reasoning: string;
+    readonly toolCalls: readonly GemmaParsedToolCall[];
+}
+/** Models that speak this native protocol when served by vLLM. */
+export declare function isGemmaDiffusionModel(model: string): boolean;
+/**
+ * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
+ * Lenient by design: bare words that aren't numbers/booleans become strings,
+ * since the model occasionally omits the quote token.
+ */
+export declare function gemmaArgsToJson(body: string): string;
+/**
+ * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
+ * clean answer text.
+ */
+export declare function parseGemmaDiffusionOutput(raw: string): GemmaDiffusionParsed;

package/dist/gemma-diffusion.js ADDED Viewed

@@ -0,0 +1,146 @@
+/**
+ * DiffusionGemma (vLLM) native-protocol adapter.
+ *
+ * Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
+ * and NO tool-call parser module, and they reject OpenAI-style `tools` unless
+ * `--tool-call-parser` is configured. Everything therefore has to be handled
+ * client-side, against the model's native channel format (visible only when
+ * the request sets `skip_special_tokens: false`):
+ *
+ *   <|channel>thought ...reasoning... <channel|>          reasoning channel
+ *   <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|>  tool call
+ *
+ * Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
+ * <|"|> quote token, numbers/booleans are bare (see the model's
+ * chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
+ * that into a standard JSON string.
+ *
+ * Request-side protocol (implemented in the OpenAI provider):
+ *   - always send `skip_special_tokens: false`
+ *   - send `tools` with `tool_choice: 'none'` — vLLM still renders the
+ *     declarations into the chat template, it just skips its (absent) parser
+ *   - send history tool turns structurally (assistant `tool_calls` +
+ *     `role: 'tool'` messages) — the chat template renders them natively
+ */
+import { extractGemmaThoughtChannels } from './gemma-channel.js';
+/** Models that speak this native protocol when served by vLLM. */
+export function isGemmaDiffusionModel(model) {
+    return /diffusion[-_]?gemma/i.test(model);
+}
+const TOOL_CALL_BLOCK = /<\|tool_call>\s*call:([a-zA-Z0-9_.-]+)\s*\{([\s\S]*?)\}\s*<tool_call\|>/g; // group 1: tool name; group 2: argument text between the outer braces
+/**
+ * Residual control tokens that may leak into text output — including stray
+ * unbalanced channel markers (the model occasionally emits an extra
+ * <channel|> closer mid-answer).
+ *
+ * NOTE(review): the `|` after `<` is optional in the first alternative, so
+ * ordinary tags such as `<video>`, `<audio ...>` or `<image>` appearing in
+ * answer text also match and are stripped — confirm this is intended.
+ */
+const RESIDUAL_SPECIAL = /<\|?(?:turn|think|image|audio|video|tool_response|tool_call|tool|channel)\b[^>]*?\|?>|<(?:turn|channel|tool_response|tool_call|tool)\|>/g;
+const QUOTE_TOKEN = '<|"|>'; // delimiter that opens and closes string values in Gemma's pseudo-JSON arguments
+/**
+ * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
+ * Lenient by design: bare words that aren't numbers/booleans become strings,
+ * since the model occasionally omits the quote token.
+ */
+export function gemmaArgsToJson(body) {
+    // Argument bodies arrive without their outer braces (the regex strips them)
+    const src = `{${body}}`;
+    let i = 0;
+    const n = src.length;
+    function skipWs() {
+        while (i < n && /\s/.test(src[i]))
+            i++;
+    }
+    function parseQuoted() {
+        // positioned at the start of QUOTE_TOKEN
+        i += QUOTE_TOKEN.length;
+        const end = src.indexOf(QUOTE_TOKEN, i);
+        const raw = end === -1 ? src.slice(i) : src.slice(i, end);
+        i = end === -1 ? n : end + QUOTE_TOKEN.length;
+        return raw;
+    }
+    function parseBare(stops) {
+        const start = i;
+        while (i < n && !stops.includes(src[i]) && !src.startsWith(QUOTE_TOKEN, i))
+            i++;
+        return src.slice(start, i).trim();
+    }
+    function parseValue() {
+        skipWs();
+        if (src.startsWith(QUOTE_TOKEN, i))
+            return JSON.stringify(parseQuoted());
+        const c = src[i];
+        if (c === '{')
+            return parseObject();
+        if (c === '[')
+            return parseArray();
+        const bare = parseBare(',}]');
+        if (/^-?\d+(\.\d+)?([eE][+-]?\d+)?$/.test(bare))
+            return bare;
+        if (bare === 'true' || bare === 'false' || bare === 'null')
+            return bare;
+        return JSON.stringify(bare);
+    }
+    function parseObject() {
+        i++; // consume {
+        const parts = [];
+        skipWs();
+        while (i < n && src[i] !== '}') {
+            skipWs();
+            const key = src.startsWith(QUOTE_TOKEN, i) ? parseQuoted() : parseBare(':');
+            skipWs();
+            if (src[i] === ':')
+                i++;
+            const value = parseValue();
+            parts.push(`${JSON.stringify(key.trim())}:${value}`);
+            skipWs();
+            if (src[i] === ',')
+                i++;
+            skipWs();
+        }
+        i++; // consume }
+        return `{${parts.join(',')}}`;
+    }
+    function parseArray() {
+        i++; // consume [
+        const parts = [];
+        skipWs();
+        while (i < n && src[i] !== ']') {
+            parts.push(parseValue());
+            skipWs();
+            if (src[i] === ',')
+                i++;
+            skipWs();
+        }
+        i++; // consume ]
+        return `[${parts.join(',')}]`;
+    }
+    skipWs();
+    return parseObject();
+}
+/**
+ * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
+ * clean answer text.
+ */
+export function parseGemmaDiffusionOutput(raw) {
+    if (!raw)
+        return { content: raw, reasoning: '', toolCalls: [] };
+    const toolCalls = [];
+    let text = raw.replace(TOOL_CALL_BLOCK, (_m, name, args) => {
+        toolCalls.push({ name, argumentsJson: gemmaArgsToJson(args) });
+        return '';
+    });
+    const channels = extractGemmaThoughtChannels(text);
+    text = channels.content;
+    // Unterminated thought channel (model hit max_tokens mid-reasoning)
+    let reasoning = channels.reasoning;
+    const danglingThought = text.match(/<\|channel>\s*thought\s*\r?\n?([\s\S]*)$/i);
+    if (danglingThought) {
+        reasoning = reasoning ? `${reasoning}\n\n${danglingThought[1].trim()}` : danglingThought[1].trim();
+        text = text.slice(0, danglingThought.index);
+    }
+    text = text.replace(RESIDUAL_SPECIAL, '');
+    return {
+        content: text.trim(),
+        reasoning,
+        toolCalls,
+    };
+}

package/dist/http.d.ts CHANGED Viewed

@@ -44,6 +44,9 @@ export declare function parseSSE(stream: AsyncGenerator<string>): AsyncGenerator
 }, void, unknown>;
 /**
  * Build standard headers for LLM API requests.
+ * Merges any provider-specific extraHeaders (from ProviderConfig) on top.
+ * Provider clients can still fully override (e.g. Anthropic uses x-api-key).
+ *
+ * Respects authHeader / authPrefix from config for Azure-style or gateway auth.
  */
 export declare function buildHeaders(options: LLMClientOptions): Record<string, string>;
-//# sourceMappingURL=http.d.ts.map

package/dist/http.js CHANGED Viewed

@@ -174,14 +174,26 @@ export async function* parseSSE(stream) {
 // ============================================================================
 /**
  * Build standard headers for LLM API requests.
+ * Merges any provider-specific extraHeaders (from ProviderConfig) on top.
+ * Provider clients can still fully override (e.g. Anthropic uses x-api-key).
+ *
+ * Respects authHeader / authPrefix from config for Azure-style or gateway auth.
  */
 export function buildHeaders(options) {
     const headers = {
         'Content-Type': 'application/json',
     };
     if (options.apiKey) {
-        headers['Authorization'] = `Bearer ${options.apiKey}`;
+        const headerName = options.authHeader || 'Authorization';
+        // Sensible default prefix: Bearer for Authorization, nothing for api-key / x-api-key etc.
+        const defaultPrefix = headerName.toLowerCase() === 'authorization' ? 'Bearer ' : '';
+        const prefix = options.authPrefix !== undefined ? options.authPrefix : defaultPrefix;
+        headers[headerName] = `${prefix}${options.apiKey}`.trim();
+    }
+    // Merge provider-specific extras (e.g. Azure 'api-key', custom gateway headers).
+    // Later entries win on conflicts, allowing complete override of auth.
+    if (options.extraHeaders) {
+        Object.assign(headers, options.extraHeaders);
     }
     return headers;
 }
-//# sourceMappingURL=http.js.map