universal-llm-client 4.3.0 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/CHANGELOG.md +34 -19
  2. package/README.md +62 -11
  3. package/dist/ai-model.d.ts +12 -2
  4. package/dist/ai-model.js +36 -2
  5. package/dist/auditor.d.ts +0 -1
  6. package/dist/auditor.js +0 -1
  7. package/dist/client.d.ts +0 -1
  8. package/dist/client.js +0 -1
  9. package/dist/gemma-channel.d.ts +13 -0
  10. package/dist/gemma-channel.js +37 -0
  11. package/dist/gemma-diffusion.d.ts +48 -0
  12. package/dist/gemma-diffusion.js +146 -0
  13. package/dist/http.d.ts +4 -1
  14. package/dist/http.js +14 -2
  15. package/dist/index.d.ts +2 -2
  16. package/dist/index.js +4 -1
  17. package/dist/interfaces.d.ts +163 -8
  18. package/dist/interfaces.js +0 -1
  19. package/dist/mcp.d.ts +0 -1
  20. package/dist/mcp.js +0 -1
  21. package/dist/providers/anthropic.d.ts +0 -1
  22. package/dist/providers/anthropic.js +28 -4
  23. package/dist/providers/google.d.ts +22 -2
  24. package/dist/providers/google.js +223 -14
  25. package/dist/providers/index.d.ts +0 -1
  26. package/dist/providers/index.js +0 -1
  27. package/dist/providers/ollama.d.ts +2 -1
  28. package/dist/providers/ollama.js +59 -31
  29. package/dist/providers/openai.d.ts +16 -1
  30. package/dist/providers/openai.js +488 -81
  31. package/dist/router.d.ts +2 -1
  32. package/dist/router.js +4 -1
  33. package/dist/stream-decoder.d.ts +12 -1
  34. package/dist/stream-decoder.js +182 -6
  35. package/dist/structured-output.d.ts +0 -1
  36. package/dist/structured-output.js +0 -1
  37. package/dist/thinking.d.ts +35 -0
  38. package/dist/thinking.js +51 -0
  39. package/dist/tools.d.ts +0 -1
  40. package/dist/tools.js +0 -1
  41. package/dist/zod-adapter.d.ts +0 -1
  42. package/dist/zod-adapter.js +0 -1
  43. package/package.json +3 -1
  44. package/dist/ai-model.d.ts.map +0 -1
  45. package/dist/ai-model.js.map +0 -1
  46. package/dist/auditor.d.ts.map +0 -1
  47. package/dist/auditor.js.map +0 -1
  48. package/dist/client.d.ts.map +0 -1
  49. package/dist/client.js.map +0 -1
  50. package/dist/http.d.ts.map +0 -1
  51. package/dist/http.js.map +0 -1
  52. package/dist/index.d.ts.map +0 -1
  53. package/dist/index.js.map +0 -1
  54. package/dist/interfaces.d.ts.map +0 -1
  55. package/dist/interfaces.js.map +0 -1
  56. package/dist/mcp.d.ts.map +0 -1
  57. package/dist/mcp.js.map +0 -1
  58. package/dist/providers/anthropic.d.ts.map +0 -1
  59. package/dist/providers/anthropic.js.map +0 -1
  60. package/dist/providers/google.d.ts.map +0 -1
  61. package/dist/providers/google.js.map +0 -1
  62. package/dist/providers/index.d.ts.map +0 -1
  63. package/dist/providers/index.js.map +0 -1
  64. package/dist/providers/ollama.d.ts.map +0 -1
  65. package/dist/providers/ollama.js.map +0 -1
  66. package/dist/providers/openai.d.ts.map +0 -1
  67. package/dist/providers/openai.js.map +0 -1
  68. package/dist/router.d.ts.map +0 -1
  69. package/dist/router.js.map +0 -1
  70. package/dist/stream-decoder.d.ts.map +0 -1
  71. package/dist/stream-decoder.js.map +0 -1
  72. package/dist/structured-output.d.ts.map +0 -1
  73. package/dist/structured-output.js.map +0 -1
  74. package/dist/tools.d.ts.map +0 -1
  75. package/dist/tools.js.map +0 -1
  76. package/dist/zod-adapter.d.ts.map +0 -1
  77. package/dist/zod-adapter.js.map +0 -1
package/CHANGELOG.md CHANGED
@@ -5,41 +5,56 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [4.3.0] - 2026-05-14
8
+ ## [4.5.1] - 2026-06-14
9
9
 
10
10
  ### Added
11
11
 
12
- - **`TokenUsageInfo.reasoningTokens`** — new optional field reporting server-side thinking/reasoning tokens that are billed but not emitted as visible text. Currently populated by the Google provider from `usageMetadata.thoughtsTokenCount` for Gemini thinking models (2.5 Pro / 3.x Pro). Other providers either roll thinking into `outputTokens` (Ollama) or surface it via `DecodedEvent { type: 'thinking' }`.
13
- - **`GoogleResponse.usageMetadata.thoughtsTokenCount`** — typed on the raw provider response shape.
12
+ - **Automatic vLLM tool-calling fallback (OpenAI-compatible provider)** — when a vLLM / OpenAI-compatible server rejects native tool calling (started without `--enable-auto-tool-choice` / `--tool-call-parser`, surfaced as a `"auto" tool choice requires …` 400), the client now transparently retries with a text-level tool protocol: it drops `tools` / `tool_choice`, injects a `<tool_call>name({…})</tool_call>` instruction, and recovers the emitted calls back into `message.tool_calls` (parsing `<tool_call>`, `<function=…>`, and `name(args)` forms). Works for both `chat()` and `chatStream()`, and emits a one-time warning pointing to the flags needed for native parsing. Tool calling now works against vLLM servers not started with tool-parser flags.
14
13
 
15
14
  ### Changed
16
15
 
17
- - **Slimmer published tarball** — `src/` is no longer shipped to npm. The published package drops from ~271 kB / 144 files to ~146 kB / 72 files. Source maps now embed original sources (`inlineSources: true`), so debuggers stepping into the library continue to show TypeScript source without needing the raw `.ts` files in the tarball.
16
+ - **Non-leading system messages normalized for OpenAI-compatible servers** — leading system messages are preserved, but a system message appearing *after* the conversation has started is rewritten to a `user` turn prefixed with `[SYSTEM MESSAGE]` (many OpenAI-compatible servers and chat templates reject mid-conversation system roles). Multimodal content is flattened to text for the prefix.
17
+ - **`chat()` / `chatStream()` no longer auto-attach registered tools** — tools are sent only when explicitly passed via `options.tools` (the `chatWithTools` path already does this). Previously every tool added with `registerTool` was attached to plain `chat()` calls. Pass `{ tools }` explicitly (or use `chatWithTools`) if you relied on the old behavior.
18
+ - **Minimal published package** — the npm tarball now ships only `dist/` (compiled JS + type declarations) plus `README` / `CHANGELOG` / `LICENSE`. `src/` (tests, demos, test-scripts) is no longer published, and the build no longer emits `.js.map` / `.d.ts.map` (they referenced sources that aren't shipped and triggered "missing source" warnings in consumer bundlers). Package size dropped ~78% (357 kB → 80 kB, 175 → 44 files). Because `src/` is no longer whitelisted, a local demo model cache under `src/` can never bloat the published package again.
18
19
 
19
- ## [4.2.0] - Previously released
20
+ ## [4.5.0] - 2026-06-14
20
21
 
21
22
  ### Added
22
23
 
23
- - **Ollama native thinking enabled by default** — `think` parameter is now sent on every request to prevent model-default thinking from leaking into tool selection / reasoning paths.
24
- - **Ollama stream idle timeout** — `OllamaClient` enforces a minimum response wait time to avoid hung streams.
25
- - **Ollama live context length** — provider now fetches the running context length from `/api/ps` and reports model capabilities.
26
- - **Google Gemini service tiers** — configurable service tier with per-tier timeouts and retry logic.
27
- - **`LLMAudioContent` type** — audio content support across providers.
28
- - **Structured output tolerates markdown fences** — JSON output wrapped in ```` ```json ```` fences is now parsed correctly.
24
+ - **Server-side reasoning field support (OpenAI-compatible provider)** — Reasoning models served over the OpenAI-compatible API (vLLM `--reasoning-parser`, DeepSeek-R1, etc.) return their chain-of-thought in a dedicated `reasoning_content` (vLLM) / `reasoning` (some gateways) field instead of inline `<think>` tags. The OpenAI provider now surfaces it:
25
+ - `chat()` populates `LLMChatResponse.reasoning` from `message.reasoning_content` / `message.reasoning`, keeping `message.content` clean
26
+ - `chatStream()` emits `delta.reasoning_content` / `delta.reasoning` chunks as `thinking` stream events and accumulates them into the final `reasoning`
27
+ - Inline `<think>` parsing (via `StandardChatDecoder`) is preserved as a fallback for servers run without a reasoning parser
28
+ - `OpenAIResponse` message type extended with optional `reasoning` / `reasoning_content`
29
+ - Verified end-to-end against vLLM serving `nvidia/Qwen3.6-35B-A3B-NVFP4` (NVFP4) on Blackwell — reasoning, streaming `thinking` events, tool calling (`qwen3_xml`), and structured output all pass
30
+ - **Unified `thinking` flag with levels across all providers** — `thinking` (model config) and per-call `ChatOptions.thinking` accept `true`/`false` **or a level `'minimal' | 'low' | 'medium' | 'high'`** (new `ThinkingLevel` type), mapped to each backend's native control so apps switch providers without reasoning-specific code. A shared `resolveThinking` helper (`src/thinking.ts`) normalizes the value; each provider maps it:
31
+ - **OpenAI-compatible** → OpenAI reasoning models (o-series / GPT-5, by name) get `reasoning_effort:<level>`; vLLM / Qwen get `chat_template_kwargs.enable_thinking`. Emitted only when explicitly set.
32
+ - **Google / Gemini** → Gemini 3.x `thinkingConfig.thinkingLevel`; Gemini 2.5/2.0 `thinkingBudget` (level→budget map, `0` off, `-1` dynamic). `includeThoughts` enabled when thinking is on.
33
+ - **Anthropic** → extended thinking `budget_tokens` from the level (kept `< max_tokens`; temperature omitted, per API).
34
+ - **Ollama** → `think` on/off (no native levels).
35
+ - Per-call overrides model config everywhere. Verified live against vLLM (Qwen3.6-NVFP4) and **Gemini 3.5 Flash** (levels produce distinct reasoning-token counts); unit-tested per provider + `resolveThinking`.
36
+ - **Gemini reasoning text surfaced** — with thinking on, the Google provider sets `includeThoughts:true` and routes `thought:true` parts into `response.reasoning` (non-streaming) and live `thinking` stream events, matching how Qwen/Anthropic expose chain-of-thought (previously only `reasoningTokens` was reported). Verified live on `gemini-3.5-flash` (654–976 chars of reasoning across levels).
37
+ - **Gemini Deep Research API** — new Google-only `AIModel.deepResearch(input, opts)` (creates a `/v1beta/interactions` background interaction and polls to completion → `{ id, status, report, steps }`) and `AIModel.deepResearchStream(input, opts)` (live `thought`/`text`/`status` events). New `DeepResearchOptions`/`DeepResearchResult`/`DeepResearchStep`/`DeepResearchEvent` types. Throws a clear error if no Google provider is configured. Create + poll plumbing verified live.
38
+ - **Generation stats — `usage.durationMs` and `usage.tokensPerSecond`** — decode throughput is now reported on `LLMChatResponse.usage`: server-precise for Ollama (from `eval_count` / `eval_duration`, which were previously discarded), and client-measured wall-clock for OpenAI-compatible / vLLM (which return no timing in `usage`). `OllamaResponse` gained `total_duration` / `load_duration` typings.
39
+ - **OpenAI-compatible transport flexibility** — `ProviderConfig` gains `headers`, `queryParams`, `authHeader`, `authPrefix`, and `apiBasePath` for Azure OpenAI, custom gateways, and non-`/v1` servers (no custom code needed). Honored by the OpenAI-compatible provider (`headers`/`authHeader`/`authPrefix` also by Ollama, via `buildHeaders`); a `buildUrl` helper applies `queryParams`/`apiBasePath` across all endpoints and preserves any query string already on the base URL. Includes a 2026 provider-API-landscape research doc (`docs/research/`).
29
40
 
30
- ## [4.1.0] - Previously released
41
+ ### Fixed
42
+
43
+ - **README** — the tool-execution trace field is `response.toolExecutions` (array of `{ tool_call_id, output, error?, duration? }`), not `toolTrace`; corrected the `chatWithTools` example.
44
+
45
+ ## [4.4.0] - 2026-06-11
31
46
 
32
47
  ### Added
33
48
 
34
- - **Prompt caching support** — first-class cache configuration plumbed through providers.
35
- - **Anthropic provider** — `AnthropicClient` implementation of the universal protocol.
36
- - **Request cancellation** — `signal: AbortSignal` accepted on all chat methods.
37
- - **Decoupled structured output from Zod** — new generic `SchemaConfig` interface; Zod becomes an adapter (`./zod` sub-path) rather than a hard dependency on the schema path.
38
- - **Wider JSONSchema tuple types** — schema utilities accept more JSON Schema tuple shapes.
49
+ - **Diffusion LM support (DiffusionGemma family)** — First-class client-side protocol for diffusion language models served by OpenAI-compatible endpoints that ship without server-side reasoning or tool-call parsers (e.g. current vLLM diffusion builds, which reject request-level `tools` with auto tool choice):
50
+ - `gemma-diffusion.ts` — model detection (`isGemmaDiffusionModel`), native channel parsing (`<|channel>thought … <channel|>` reasoning, `<|tool_call>call:name{…}<tool_call|>` tool calls), and pseudo-JSON argument conversion (`gemmaArgsToJson`: `<|"|>` quote tokens, bare keys, nested objects/arrays)
51
+ - OpenAI provider native mode (auto-detected from the model name, `gemmaNativeProtocol` option to override): sends `skip_special_tokens: false` and `tools` + `tool_choice: "none"` (declarations still render into the chat template), parses reasoning and tool calls client-side, and yields decoder-classified `thinking`/`text` streaming events
52
+ - Full agentic `chatWithTools` loop works end-to-end against DiffusionGemma; history tool turns use standard structured `tool_calls` + `role: "tool"` messages
53
+ - **"Signal from Noise" demo** (`src/demos/diffusion-gemma/`) — vLLM test harness plus a diffusion chat canvas that animates block-parallel denoising paced by real block arrivals, with replay/scrubbing, reasoning-channel separation, a rendered-markdown reading view, and an engine-reload entropy control
39
54
 
40
- ### Changed
55
+ ### Fixed
41
56
 
42
- - Tests migrated to the new `fromZod` adapter API.
57
+ - Stray unbalanced `<channel|>` / `<turn|>` markers emitted by diffusion models are stripped from parsed content
43
58
 
44
59
  ## [4.0.0] - 2026-03-13
45
60
 
package/README.md CHANGED
@@ -1,12 +1,14 @@
1
1
  # universal-llm-client
2
2
 
3
- A universal LLM client for JavaScript/TypeScript with **transparent provider failover**, streaming tool execution, pluggable reasoning strategies, and native observability.
3
+ [![npm version](https://img.shields.io/npm/v/universal-llm-client.svg)](https://www.npmjs.com/package/universal-llm-client) [![CI](https://github.com/igorls/universal-llm-client/actions/workflows/ci.yml/badge.svg)](https://github.com/igorls/universal-llm-client/actions/workflows/ci.yml) [![npm downloads](https://img.shields.io/npm/dm/universal-llm-client.svg)](https://www.npmjs.com/package/universal-llm-client) [![License: MIT](https://img.shields.io/npm/l/universal-llm-client.svg)](https://github.com/igorls/universal-llm-client/blob/main/LICENSE)
4
+
5
+ A universal LLM client for JavaScript/TypeScript with **transparent provider failover** and a **provider-agnostic reasoning API** — one set of code across OpenAI, Anthropic, Google Gemini, Ollama, vLLM, and any OpenAI-compatible endpoint. Streaming tool execution, structured output, generation stats, and native observability included.
4
6
 
5
7
  ```typescript
6
8
  import { AIModel } from 'universal-llm-client';
7
9
 
8
10
  const model = new AIModel({
9
- model: 'gemini-2.5-flash',
11
+ model: 'gemini-3.5-flash',
10
12
  providers: [
11
13
  { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
12
14
  { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
@@ -26,10 +28,13 @@ const response = await model.chat([
26
28
  ## Features
27
29
 
28
30
  - 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
31
+ - 🧠 **Unified Reasoning** — One `thinking` flag (`true`/`false` or a level: `'minimal' | 'low' | 'medium' | 'high'`) mapped to each backend's native control; chain-of-thought surfaced as `response.reasoning` + streaming `thinking` events (with `<think>`-tag parsing as a fallback)
29
32
  - 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
30
33
  - 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
31
34
  - 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
32
- - 🧠 **Reasoning** — Native `<think>` tag parsing, interleaved reasoning, and model thinking support
35
+ - 🔬 **Deep Research** — Drive Google Gemini's agentic Deep Research (background interactions with polling + streaming)
36
+ - 📈 **Generation Stats** — `usage.tokensPerSecond` and `durationMs` reported across providers
37
+ - 🔌 **Flexible Transport** — Custom headers, query params, auth header/prefix, and base path for Azure OpenAI and gateways
33
38
  - 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
34
39
  - 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
35
40
  - 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
@@ -39,11 +44,14 @@ const response = await model.chat([
39
44
 
40
45
  | Provider | Type | Notes |
41
46
  |---|---|---|
42
- | **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
43
- | **OpenAI** | `openai` | GPT-4o, o3, etc. Also works with OpenRouter, Groq, LM Studio, vLLM |
44
- | **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
45
- | **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints and Bearer tokens |
46
- | **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances |
47
+ | **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal, native thinking |
48
+ | **OpenAI + Compat** | `openai` | GPT series, o-series + **any OpenAI-compatible endpoint**: xAI/Grok, Mistral, DeepSeek, Cohere Compatibility, Groq, Together, Fireworks, OpenRouter, Perplexity Sonar, vLLM, LM Studio, TGI, most self-hosted servers |
49
+ | **Google AI Studio** | `google` | Gemini models, system instructions, multimodal, native thinking + grounding |
50
+ | **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints, Bearer tokens, service tiers (flex/priority) |
51
+ | **Anthropic (Claude)** | `anthropic` | Claude 3.5/4 models via native Messages API. Excellent tool use, extended thinking with signatures, strong prompt caching |
52
+ | **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances (OpenAI-compatible under the hood) |
53
+
54
+ **Most of the world** is reachable via `type: 'openai'` + a `url` override. We only maintain dedicated clients for fundamentally different protocols (Anthropic Messages, Google Gemini) that offer unique high-value capabilities, plus Ollama for local developer experience. See `docs/guide/providers.md` and the research survey in `docs/research/provider-api-landscape-2026.md`.
47
55
 
48
56
  ---
49
57
 
@@ -98,6 +106,44 @@ for await (const event of model.chatStream([
98
106
  }
99
107
  ```
100
108
 
109
+ ### Thinking & Reasoning
110
+
111
+ Set one `thinking` value — `true`/`false` or a level (`'minimal' | 'low' | 'medium' | 'high'`) —
112
+ and it maps to each provider's native control (Gemini `thinkingLevel`/`thinkingBudget`, OpenAI
113
+ `reasoning_effort`, vLLM `enable_thinking`, Anthropic `budget_tokens`, Ollama `think`):
114
+
115
+ ```typescript
116
+ const model = new AIModel({
117
+ model: 'gemini-3.5-flash',
118
+ thinking: 'high', // true | false | 'minimal' | 'low' | 'medium' | 'high'
119
+ providers: [{ type: 'google', apiKey: process.env.GOOGLE_API_KEY }],
120
+ });
121
+
122
+ const res = await model.chat([{ role: 'user', content: 'Solve this step by step: ...' }]);
123
+ console.log(res.message.content); // final answer (clean)
124
+ console.log(res.reasoning); // chain-of-thought, when the model exposes it
125
+
126
+ // Per-call override (e.g. turn thinking off for structured output)
127
+ await model.chat(messages, { thinking: false });
128
+ ```
129
+
130
+ ### Deep Research (Gemini)
131
+
132
+ Run Google's agentic Deep Research — creates a background interaction and polls to completion:
133
+
134
+ ```typescript
135
+ const result = await model.deepResearch('Research the history of Google TPUs.', {
136
+ tools: ['google_search', 'url_context'],
137
+ });
138
+ console.log(result.status, result.report);
139
+
140
+ // Or stream intermediate thoughts and steps as they arrive:
141
+ for await (const ev of model.deepResearchStream('Compare RISC-V vs ARM in 2026.')) {
142
+ if (ev.type === 'thought') console.log('[thinking]', ev.content);
143
+ else if (ev.type === 'text') process.stdout.write(ev.content);
144
+ }
145
+ ```
146
+
101
147
  ### Tool Calling
102
148
 
103
149
  ```typescript
@@ -124,8 +170,8 @@ const response = await model.chatWithTools([
124
170
 
125
171
  console.log(response.message.content);
126
172
  // "The weather in Tokyo is 22°C and sunny."
127
- console.log(response.toolTrace);
128
- // [{ name: 'get_weather', args: { city: 'Tokyo' }, result: {...}, duration: 5 }]
173
+ console.log(response.toolExecutions);
174
+ // [{ tool_call_id: 'call_abc', output: { temperature: 22, condition: 'sunny', city: 'Tokyo' }, duration: 5 }]
129
175
  ```
130
176
 
131
177
  ### Provider Failover
@@ -421,13 +467,18 @@ new AIModel(config: AIModelConfig)
421
467
 
422
468
  | Property | Type | Description |
423
469
  |---|---|---|
424
- | `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
470
+ | `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'`, `'anthropic'` |
425
471
  | `url` | `string` | Provider URL (has sensible defaults) |
426
472
  | `apiKey` | `string` | API key or Bearer token |
427
473
  | `priority` | `number` | Lower = tried first (defaults to array index) |
428
474
  | `model` | `string` | Override model name for this provider |
429
475
  | `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
430
476
  | `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
477
+ | `headers` | `Record<string,string>` | Extra headers merged into requests — OpenAI-compatible & Ollama (Azure `api-key`, gateways) |
478
+ | `queryParams` | `Record<string,string>` | Query params appended to URLs — OpenAI-compatible only (e.g. Azure `api-version`) |
479
+ | `authHeader` | `string` | Header name for the key — OpenAI-compatible & Ollama (e.g. `'api-key'`) |
480
+ | `authPrefix` | `string` | Prefix before the key value — OpenAI-compatible & Ollama (e.g. `''` for api-key style) |
481
+ | `apiBasePath` | `string` | OpenAI-compatible only: override or disable the `/v1` suffix (use `''` for full Azure deployment URLs) |
431
482
 
432
483
  **Methods:**
433
484
 
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * Provider classes are internal — the user never imports them.
8
8
  */
9
- import { type AIModelConfig, type LLMChatMessage, type LLMChatResponse, type ChatOptions, type ModelMetadata, type LLMFunction, type ToolHandler } from './interfaces.js';
9
+ import { type AIModelConfig, type LLMChatMessage, type LLMChatResponse, type ChatOptions, type ModelMetadata, type LLMFunction, type ToolHandler, type DeepResearchOptions, type DeepResearchResult, type DeepResearchEvent } from './interfaces.js';
10
10
  import type { DecodedEvent } from './stream-decoder.js';
11
11
  import { type ProviderStatus } from './router.js';
12
12
  import { type StructuredOutputResult, type SchemaConfig } from './structured-output.js';
@@ -113,6 +113,17 @@ export declare class AIModel {
113
113
  embed(text: string): Promise<number[]>;
114
114
  /** Generate embeddings for multiple texts */
115
115
  embedArray(texts: string[]): Promise<number[][]>;
116
+ private getGoogleClient;
117
+ /**
118
+ * Run an agentic Deep Research interaction (Gemini only): creates it and
119
+ * polls until completion. Throws if no Google provider is configured.
120
+ */
121
+ deepResearch(input: string, options?: DeepResearchOptions): Promise<DeepResearchResult>;
122
+ /**
123
+ * Stream a Deep Research interaction's intermediate thought/text/step events
124
+ * (Gemini only), returning the final result. Throws if no Google provider.
125
+ */
126
+ deepResearchStream(input: string, options?: DeepResearchOptions): AsyncGenerator<DeepResearchEvent, DeepResearchResult, unknown>;
116
127
  /** Register a tool callable by the LLM (broadcast to all providers) */
117
128
  registerTool(name: string, description: string, parameters: LLMFunction['parameters'], handler: ToolHandler): void;
118
129
  /** Register multiple tools at once */
@@ -137,4 +148,3 @@ export declare class AIModel {
137
148
  private createClient;
138
149
  private normalizeType;
139
150
  }
140
- //# sourceMappingURL=ai-model.d.ts.map
package/dist/ai-model.js CHANGED
@@ -172,6 +172,34 @@ export class AIModel {
172
172
  return this.router.embedArray(texts);
173
173
  }
174
174
  // ========================================================================
175
+ // Deep Research (Gemini-only)
176
+ // ========================================================================
177
+ getGoogleClient(method) {
178
+ const googleClients = this.router.getClients().filter((c) => c instanceof GoogleClient);
179
+ // Prefer an AI Studio client — Vertex AI doesn't support Deep Research.
180
+ const aiStudio = googleClients.find(c => c.supportsDeepResearch());
181
+ if (aiStudio)
182
+ return aiStudio;
183
+ if (googleClients.length > 0) {
184
+ throw new Error(`${method} requires an AI Studio Google provider (type: "google"); Vertex AI is not supported for Deep Research.`);
185
+ }
186
+ throw new Error(`${method} requires a Google provider (type: "google"). None is configured.`);
187
+ }
188
+ /**
189
+ * Run an agentic Deep Research interaction (Gemini only): creates it and
190
+ * polls until completion. Throws if no Google provider is configured.
191
+ */
192
+ async deepResearch(input, options) {
193
+ return this.getGoogleClient('deepResearch').deepResearch(input, options);
194
+ }
195
+ /**
196
+ * Stream a Deep Research interaction's intermediate thought/text/step events
197
+ * (Gemini only), returning the final result. Throws if no Google provider.
198
+ */
199
+ async *deepResearchStream(input, options) {
200
+ return yield* this.getGoogleClient('deepResearchStream').deepResearchStream(input, options);
201
+ }
202
+ // ========================================================================
175
203
  // Tool Registration
176
204
  // ========================================================================
177
205
  /** Register a tool callable by the LLM (broadcast to all providers) */
@@ -232,9 +260,16 @@ export class AIModel {
232
260
  retries: this.config.retries ?? 2,
233
261
  debug: this.config.debug ?? false,
234
262
  defaultParameters: this.config.defaultParameters,
235
- thinking: this.config.thinking ?? false,
263
+ // Preserve `undefined` (not set) vs explicit false so providers can
264
+ // decide whether to send a thinking toggle at all.
265
+ thinking: this.config.thinking,
236
266
  region: providerConfig.region,
237
267
  apiVersion: providerConfig.apiVersion,
268
+ extraHeaders: providerConfig.headers,
269
+ queryParams: providerConfig.queryParams,
270
+ authHeader: providerConfig.authHeader,
271
+ authPrefix: providerConfig.authPrefix,
272
+ apiBasePath: providerConfig.apiBasePath,
238
273
  };
239
274
  switch (type) {
240
275
  case 'ollama':
@@ -255,4 +290,3 @@ export class AIModel {
255
290
  return type.toLowerCase();
256
291
  }
257
292
  }
258
- //# sourceMappingURL=ai-model.js.map
package/dist/auditor.d.ts CHANGED
@@ -79,4 +79,3 @@ export declare class BufferedAuditor implements Auditor {
79
79
  /** Clear all buffered events without flushing */
80
80
  clear(): void;
81
81
  }
82
- //# sourceMappingURL=auditor.d.ts.map
package/dist/auditor.js CHANGED
@@ -110,4 +110,3 @@ export class BufferedAuditor {
110
110
  this.events.length = 0;
111
111
  }
112
112
  }
113
- //# sourceMappingURL=auditor.js.map
package/dist/client.d.ts CHANGED
@@ -86,4 +86,3 @@ export declare abstract class BaseLLMClient {
86
86
  */
87
87
  protected validateStructuredResponse(content: string, config: SchemaConfig<unknown>): void;
88
88
  }
89
- //# sourceMappingURL=client.d.ts.map
package/dist/client.js CHANGED
@@ -297,4 +297,3 @@ export class BaseLLMClient {
297
297
  }
298
298
  }
299
299
  }
300
- //# sourceMappingURL=client.js.map
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Gemma 4 can emit its thought channel as text control tokens instead of the
3
+ * generic Ollama `message.thinking` field. Keep that provider quirk isolated so
4
+ * callers receive final-answer text and reasoning separately.
5
+ */
6
+ export interface GemmaThoughtExtraction {
7
+ readonly content: string;
8
+ readonly reasoning: string;
9
+ readonly found: boolean;
10
+ }
11
+ export declare const GEMMA_THOUGHT_OPENERS: readonly ["<|channel>thought", "<|thought"];
12
+ export declare function extractGemmaThoughtChannels(input: string): GemmaThoughtExtraction;
13
+ export declare function normalizeGemmaThought(thought: string): string;
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Gemma 4 can emit its thought channel as text control tokens instead of the
3
+ * generic Ollama `message.thinking` field. Keep that provider quirk isolated so
4
+ * callers receive final-answer text and reasoning separately.
5
+ */
6
+ const GEMMA_THOUGHT_BLOCK = /<\|channel>\s*thought\s*\r?\n?([\s\S]*?)<channel\|>/gi;
7
+ const GEMMA_COMPACT_THOUGHT_BLOCK = /<\|thought\s*\r?\n?([\s\S]*?)\|>/gi;
8
+ export const GEMMA_THOUGHT_OPENERS = ['<|channel>thought', '<|thought'];
9
+ export function extractGemmaThoughtChannels(input) {
10
+ if (!input)
11
+ return { content: input, reasoning: '', found: false };
12
+ const reasoningParts = [];
13
+ let found = false;
14
+ const content = input
15
+ .replace(GEMMA_THOUGHT_BLOCK, (_match, thought) => {
16
+ found = true;
17
+ const normalized = normalizeGemmaThought(thought);
18
+ if (normalized)
19
+ reasoningParts.push(normalized);
20
+ return '';
21
+ })
22
+ .replace(GEMMA_COMPACT_THOUGHT_BLOCK, (_match, thought) => {
23
+ found = true;
24
+ const normalized = normalizeGemmaThought(thought);
25
+ if (normalized)
26
+ reasoningParts.push(normalized);
27
+ return '';
28
+ });
29
+ return {
30
+ content,
31
+ reasoning: reasoningParts.join('\n\n'),
32
+ found,
33
+ };
34
+ }
35
+ export function normalizeGemmaThought(thought) {
36
+ return thought.replace(/^\s+/, '').replace(/\s+$/, '');
37
+ }
@@ -0,0 +1,48 @@
1
+ /**
2
+ * DiffusionGemma (vLLM) native-protocol adapter.
3
+ *
4
+ * Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
5
+ * and NO tool-call parser module, and they reject OpenAI-style `tools` unless
6
+ * `--tool-call-parser` is configured. Everything therefore has to be handled
7
+ * client-side, against the model's native channel format (visible only when
8
+ * the request sets `skip_special_tokens: false`):
9
+ *
10
+ * <|channel>thought ...reasoning... <channel|> reasoning channel
11
+ * <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|> tool call
12
+ *
13
+ * Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
14
+ * <|"|> quote token, numbers/booleans are bare (see the model's
15
+ * chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
16
+ * that into a standard JSON string.
17
+ *
18
+ * Request-side protocol (implemented in the OpenAI provider):
19
+ * - always send `skip_special_tokens: false`
20
+ * - send `tools` with `tool_choice: 'none'` — vLLM still renders the
21
+ * declarations into the chat template, it just skips its (absent) parser
22
+ * - send history tool turns structurally (assistant `tool_calls` +
23
+ * `role: 'tool'` messages) — the chat template renders them natively
24
+ */
25
+ export interface GemmaParsedToolCall {
26
+ readonly name: string;
27
+ /** JSON-encoded arguments object, ready for LLMToolCall.function.arguments */
28
+ readonly argumentsJson: string;
29
+ }
30
+ export interface GemmaDiffusionParsed {
31
+ /** Final answer with reasoning, tool-call blocks and special tokens removed */
32
+ readonly content: string;
33
+ readonly reasoning: string;
34
+ readonly toolCalls: readonly GemmaParsedToolCall[];
35
+ }
36
+ /** Models that speak this native protocol when served by vLLM. */
37
+ export declare function isGemmaDiffusionModel(model: string): boolean;
38
+ /**
39
+ * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
40
+ * Lenient by design: bare words that aren't numbers/booleans become strings,
41
+ * since the model occasionally omits the quote token.
42
+ */
43
+ export declare function gemmaArgsToJson(body: string): string;
44
+ /**
45
+ * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
46
+ * clean answer text.
47
+ */
48
+ export declare function parseGemmaDiffusionOutput(raw: string): GemmaDiffusionParsed;
@@ -0,0 +1,146 @@
1
+ /**
2
+ * DiffusionGemma (vLLM) native-protocol adapter.
3
+ *
4
+ * Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
5
+ * and NO tool-call parser module, and they reject OpenAI-style `tools` unless
6
+ * `--tool-call-parser` is configured. Everything therefore has to be handled
7
+ * client-side, against the model's native channel format (visible only when
8
+ * the request sets `skip_special_tokens: false`):
9
+ *
10
+ * <|channel>thought ...reasoning... <channel|> reasoning channel
11
+ * <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|> tool call
12
+ *
13
+ * Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
14
+ * <|"|> quote token, numbers/booleans are bare (see the model's
15
+ * chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
16
+ * that into a standard JSON string.
17
+ *
18
+ * Request-side protocol (implemented in the OpenAI provider):
19
+ * - always send `skip_special_tokens: false`
20
+ * - send `tools` with `tool_choice: 'none'` — vLLM still renders the
21
+ * declarations into the chat template, it just skips its (absent) parser
22
+ * - send history tool turns structurally (assistant `tool_calls` +
23
+ * `role: 'tool'` messages) — the chat template renders them natively
24
+ */
25
+ import { extractGemmaThoughtChannels } from './gemma-channel.js';
26
/**
 * Report whether a model name looks like a DiffusionGemma model, i.e. one
 * that speaks the native channel protocol when served by vLLM.
 *
 * The check is case-insensitive and accepts `diffusiongemma`,
 * `diffusion-gemma` and `diffusion_gemma` anywhere in the name.
 *
 * @param model Model identifier as passed to the provider.
 * @returns `true` when the name contains a DiffusionGemma marker.
 */
export function isGemmaDiffusionModel(model) {
    const diffusionGemmaName = /diffusion[-_]?gemma/i;
    return diffusionGemmaName.test(model);
}
30
+ const TOOL_CALL_BLOCK = /<\|tool_call>\s*call:([a-zA-Z0-9_.-]+)\s*\{([\s\S]*?)\}\s*<tool_call\|>/g;
31
+ /**
32
+ * Residual control tokens that may leak into text output — including stray
33
+ * unbalanced channel markers (the model occasionally emits an extra
34
+ * <channel|> closer mid-answer).
35
+ */
36
+ const RESIDUAL_SPECIAL = /<\|?(?:turn|think|image|audio|video|tool_response|tool_call|tool|channel)\b[^>]*?\|?>|<(?:turn|channel|tool_response|tool_call|tool)\|>/g;
37
/** Quote token the Gemma chat template wraps string arguments in. */
const QUOTE_TOKEN = '<|"|>';
/**
 * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
 *
 * Input grammar: keys are bare words (or quote-token strings), string values
 * are wrapped in the `<|"|>` quote token, numbers / booleans / null are bare,
 * and objects / arrays nest with `{}` / `[]`.
 *
 * Lenient by design: bare words that aren't numbers/booleans become strings,
 * since the model occasionally omits the quote token. Malformed input never
 * throws and never hangs; an array left unclosed is closed implicitly at the
 * enclosing `}` or at end of input.
 *
 * @param body Argument body WITHOUT its outer braces.
 * @returns JSON text of an object, parseable by `JSON.parse`.
 */
export function gemmaArgsToJson(body) {
    // Argument bodies arrive without their outer braces (the regex strips them)
    const src = `{${body}}`;
    const n = src.length;
    let i = 0;
    /** Advance the cursor past any whitespace. */
    function skipWs() {
        while (i < n && /\s/.test(src[i]))
            i++;
    }
    /** Read a quote-token string; an unterminated one runs to end of input. */
    function parseQuoted() {
        // positioned at the start of QUOTE_TOKEN
        i += QUOTE_TOKEN.length;
        const end = src.indexOf(QUOTE_TOKEN, i);
        const raw = end === -1 ? src.slice(i) : src.slice(i, end);
        i = end === -1 ? n : end + QUOTE_TOKEN.length;
        return raw;
    }
    /** Read a bare word up to any stop character or the next quote token. */
    function parseBare(stops) {
        const start = i;
        while (i < n && !stops.includes(src[i]) && !src.startsWith(QUOTE_TOKEN, i))
            i++;
        return src.slice(start, i).trim();
    }
    /** Parse one value and return its JSON text. */
    function parseValue() {
        skipWs();
        if (src.startsWith(QUOTE_TOKEN, i))
            return JSON.stringify(parseQuoted());
        const c = src[i];
        if (c === '{')
            return parseObject();
        if (c === '[')
            return parseArray();
        const bare = parseBare(',}]');
        // Emit a bare number only when it is a valid JSON number. Forms such
        // as "007" are not legal JSON and would make the whole result
        // unparseable, so they fall through and are emitted as strings.
        if (/^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/.test(bare))
            return bare;
        if (bare === 'true' || bare === 'false' || bare === 'null')
            return bare;
        return JSON.stringify(bare);
    }
    /** Parse `{key:value,...}` starting at `{` and return its JSON text. */
    function parseObject() {
        i++; // consume {
        const parts = [];
        skipWs();
        while (i < n && src[i] !== '}') {
            skipWs();
            const key = src.startsWith(QUOTE_TOKEN, i) ? parseQuoted() : parseBare(':');
            skipWs();
            if (src[i] === ':')
                i++;
            const value = parseValue();
            parts.push(`${JSON.stringify(key.trim())}:${value}`);
            skipWs();
            if (src[i] === ',')
                i++;
            skipWs();
        }
        i++; // consume }
        return `{${parts.join(',')}}`;
    }
    /** Parse `[value,...]` starting at `[` and return its JSON text. */
    function parseArray() {
        i++; // consume [
        const parts = [];
        skipWs();
        // A `}` in element position means the array was never closed. Stop
        // there: parseValue cannot advance past `}`, so continuing would loop
        // forever. The brace is left for the enclosing object to consume.
        while (i < n && src[i] !== ']' && src[i] !== '}') {
            parts.push(parseValue());
            skipWs();
            if (src[i] === ',')
                i++;
            skipWs();
        }
        if (src[i] === ']')
            i++; // consume ]
        return `[${parts.join(',')}]`;
    }
    skipWs();
    return parseObject();
}
119
/**
 * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
 * clean answer text.
 *
 * Processing order:
 *  1. `<|tool_call>call:name{...}<tool_call|>` blocks are removed and their
 *     arguments converted with `gemmaArgsToJson`.
 *  2. Closed thought channels are removed via `extractGemmaThoughtChannels`.
 *  3. An unterminated thought opener (`<|channel>thought` or the compact
 *     `<|thought`) is treated as reasoning running to the end of the text.
 *  4. Residual control tokens are stripped and the answer is trimmed.
 *
 * @param raw Raw completion text; a falsy value is returned unchanged as `content`.
 * @returns `content` (clean answer), `reasoning` (thoughts joined by blank
 *          lines) and `toolCalls` in order of appearance.
 */
export function parseGemmaDiffusionOutput(raw) {
    if (!raw)
        return { content: raw, reasoning: '', toolCalls: [] };
    const toolCalls = [];
    let text = raw.replace(TOOL_CALL_BLOCK, (_m, name, args) => {
        toolCalls.push({ name, argumentsJson: gemmaArgsToJson(args) });
        return '';
    });
    const channels = extractGemmaThoughtChannels(text);
    text = channels.content;
    // Unterminated thought channel (model hit max_tokens mid-reasoning).
    // Both opener spellings are handled, so a cut-off compact block does not
    // leak into the answer.
    let reasoning = channels.reasoning;
    const danglingThought = text.match(/<\|(?:channel>\s*)?thought\s*\r?\n?([\s\S]*)$/i);
    if (danglingThought) {
        const tail = danglingThought[1].trim();
        // Skip an empty tail so the reasoning does not end in a bare separator.
        if (tail)
            reasoning = reasoning ? `${reasoning}\n\n${tail}` : tail;
        text = text.slice(0, danglingThought.index);
    }
    text = text.replace(RESIDUAL_SPECIAL, '');
    return {
        content: text.trim(),
        reasoning,
        toolCalls,
    };
}
package/dist/http.d.ts CHANGED
@@ -44,6 +44,9 @@ export declare function parseSSE(stream: AsyncGenerator<string>): AsyncGenerator
44
44
  }, void, unknown>;
45
45
  /**
46
46
  * Build standard headers for LLM API requests.
47
+ * Merges any provider-specific extraHeaders (from ProviderConfig) on top.
48
+ * Provider clients can still fully override (e.g. Anthropic uses x-api-key).
49
+ *
50
+ * Respects authHeader / authPrefix from config for Azure-style or gateway auth.
47
51
  */
48
52
  export declare function buildHeaders(options: LLMClientOptions): Record<string, string>;
49
- //# sourceMappingURL=http.d.ts.map
package/dist/http.js CHANGED
@@ -174,14 +174,26 @@ export async function* parseSSE(stream) {
174
174
  // ============================================================================
175
175
/**
 * Build standard headers for LLM API requests.
 *
 * Always sets `Content-Type: application/json`. When `options.apiKey` is
 * set, an auth header is added:
 *  - header name: `options.authHeader`, defaulting to `Authorization`
 *  - value prefix: `options.authPrefix` when defined; otherwise `Bearer `
 *    for `Authorization` and no prefix for any other header name
 *    (Azure-style `api-key`, gateway headers, ...)
 *
 * `options.extraHeaders` is merged last and wins on conflicts. HTTP header
 * names are case-insensitive, so an extra header replaces a generated one
 * even when only the casing differs (e.g. `authorization` overrides
 * `Authorization`) instead of both being sent.
 *
 * Provider clients can still fully override the result (e.g. Anthropic uses
 * x-api-key).
 *
 * @param options Client options; reads `apiKey`, `authHeader`, `authPrefix`
 *                and `extraHeaders`.
 * @returns A fresh plain object of header name → value.
 */
export function buildHeaders(options) {
    const headers = {
        'Content-Type': 'application/json',
    };
    if (options.apiKey) {
        const headerName = options.authHeader || 'Authorization';
        // Sensible default prefix: Bearer for Authorization, nothing for api-key / x-api-key etc.
        const defaultPrefix = headerName.toLowerCase() === 'authorization' ? 'Bearer ' : '';
        const prefix = options.authPrefix !== undefined ? options.authPrefix : defaultPrefix;
        headers[headerName] = `${prefix}${options.apiKey}`.trim();
    }
    // Merge provider-specific extras (e.g. Azure 'api-key', custom gateway headers).
    // Later entries win on conflicts, allowing complete override of auth.
    if (options.extraHeaders) {
        const extras = options.extraHeaders;
        const overriding = new Set(Object.keys(extras).map((name) => name.toLowerCase()));
        // Drop generated headers that an extra header replaces under a
        // different casing; same-cased keys are simply overwritten below.
        for (const existing of Object.keys(headers)) {
            const sameKey = Object.prototype.hasOwnProperty.call(extras, existing);
            if (!sameKey && overriding.has(existing.toLowerCase())) {
                delete headers[existing];
            }
        }
        Object.assign(headers, extras);
    }
    return headers;
}
187
- //# sourceMappingURL=http.js.map