universal-llm-client 4.3.0 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -19
- package/README.md +62 -11
- package/dist/ai-model.d.ts +12 -2
- package/dist/ai-model.js +36 -2
- package/dist/auditor.d.ts +0 -1
- package/dist/auditor.js +0 -1
- package/dist/client.d.ts +0 -1
- package/dist/client.js +0 -1
- package/dist/gemma-channel.d.ts +13 -0
- package/dist/gemma-channel.js +37 -0
- package/dist/gemma-diffusion.d.ts +48 -0
- package/dist/gemma-diffusion.js +146 -0
- package/dist/http.d.ts +4 -1
- package/dist/http.js +14 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +4 -1
- package/dist/interfaces.d.ts +163 -8
- package/dist/interfaces.js +0 -1
- package/dist/mcp.d.ts +0 -1
- package/dist/mcp.js +0 -1
- package/dist/providers/anthropic.d.ts +0 -1
- package/dist/providers/anthropic.js +28 -4
- package/dist/providers/google.d.ts +22 -2
- package/dist/providers/google.js +223 -14
- package/dist/providers/index.d.ts +0 -1
- package/dist/providers/index.js +0 -1
- package/dist/providers/ollama.d.ts +2 -1
- package/dist/providers/ollama.js +59 -31
- package/dist/providers/openai.d.ts +16 -1
- package/dist/providers/openai.js +488 -81
- package/dist/router.d.ts +2 -1
- package/dist/router.js +4 -1
- package/dist/stream-decoder.d.ts +12 -1
- package/dist/stream-decoder.js +182 -6
- package/dist/structured-output.d.ts +0 -1
- package/dist/structured-output.js +0 -1
- package/dist/thinking.d.ts +35 -0
- package/dist/thinking.js +51 -0
- package/dist/tools.d.ts +0 -1
- package/dist/tools.js +0 -1
- package/dist/zod-adapter.d.ts +0 -1
- package/dist/zod-adapter.js +0 -1
- package/package.json +3 -1
- package/dist/ai-model.d.ts.map +0 -1
- package/dist/ai-model.js.map +0 -1
- package/dist/auditor.d.ts.map +0 -1
- package/dist/auditor.js.map +0 -1
- package/dist/client.d.ts.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/http.d.ts.map +0 -1
- package/dist/http.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/interfaces.d.ts.map +0 -1
- package/dist/interfaces.js.map +0 -1
- package/dist/mcp.d.ts.map +0 -1
- package/dist/mcp.js.map +0 -1
- package/dist/providers/anthropic.d.ts.map +0 -1
- package/dist/providers/anthropic.js.map +0 -1
- package/dist/providers/google.d.ts.map +0 -1
- package/dist/providers/google.js.map +0 -1
- package/dist/providers/index.d.ts.map +0 -1
- package/dist/providers/index.js.map +0 -1
- package/dist/providers/ollama.d.ts.map +0 -1
- package/dist/providers/ollama.js.map +0 -1
- package/dist/providers/openai.d.ts.map +0 -1
- package/dist/providers/openai.js.map +0 -1
- package/dist/router.d.ts.map +0 -1
- package/dist/router.js.map +0 -1
- package/dist/stream-decoder.d.ts.map +0 -1
- package/dist/stream-decoder.js.map +0 -1
- package/dist/structured-output.d.ts.map +0 -1
- package/dist/structured-output.js.map +0 -1
- package/dist/tools.d.ts.map +0 -1
- package/dist/tools.js.map +0 -1
- package/dist/zod-adapter.d.ts.map +0 -1
- package/dist/zod-adapter.js.map +0 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,41 +5,56 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
-
## [4.
|
|
8
|
+
## [4.5.1] - 2026-06-14
|
|
9
9
|
|
|
10
10
|
### Added
|
|
11
11
|
|
|
12
|
-
-
|
|
13
|
-
- **`GoogleResponse.usageMetadata.thoughtsTokenCount`** — typed on the raw provider response shape.
|
|
12
|
+
- **Automatic vLLM tool-calling fallback (OpenAI-compatible provider)** — when a vLLM / OpenAI-compatible server rejects native tool calling (started without `--enable-auto-tool-choice` / `--tool-call-parser`, surfaced as a `"auto" tool choice requires …` 400), the client now transparently retries with a text-level tool protocol: it drops `tools` / `tool_choice`, injects a `<tool_call>name({…})</tool_call>` instruction, and recovers the emitted calls back into `message.tool_calls` (parsing `<tool_call>`, `<function=…>`, and `name(args)` forms). Works for both `chat()` and `chatStream()`, and emits a one-time warning pointing to the flags needed for native parsing. Tool calling now works against vLLM servers not started with tool-parser flags.
|
|
14
13
|
|
|
15
14
|
### Changed
|
|
16
15
|
|
|
17
|
-
- **
|
|
16
|
+
- **Non-leading system messages normalized for OpenAI-compatible servers** — leading system messages are preserved, but a system message appearing *after* the conversation has started is rewritten to a `user` turn prefixed with `[SYSTEM MESSAGE]` (many OpenAI-compatible servers and chat templates reject mid-conversation system roles). Multimodal content is flattened to text for the prefix.
|
|
17
|
+
- **`chat()` / `chatStream()` no longer auto-attach registered tools** — tools are sent only when explicitly passed via `options.tools` (the `chatWithTools` path already does this). Previously every tool added with `registerTool` was attached to plain `chat()` calls. Pass `{ tools }` explicitly (or use `chatWithTools`) if you relied on the old behavior.
|
|
18
|
+
- **Minimal published package** — the npm tarball now ships only `dist/` (compiled JS + type declarations) plus `README` / `CHANGELOG` / `LICENSE`. `src/` (tests, demos, test-scripts) is no longer published, and the build no longer emits `.js.map` / `.d.ts.map` (they referenced sources that aren't shipped and triggered "missing source" warnings in consumer bundlers). Package size dropped ~78% (357 kB → 80 kB, 175 → 44 files). Because `src/` is no longer whitelisted, a local demo model cache under `src/` can never bloat the published package again.
|
|
18
19
|
|
|
19
|
-
## [4.
|
|
20
|
+
## [4.5.0] - 2026-06-14
|
|
20
21
|
|
|
21
22
|
### Added
|
|
22
23
|
|
|
23
|
-
- **
|
|
24
|
-
-
|
|
25
|
-
-
|
|
26
|
-
-
|
|
27
|
-
-
|
|
28
|
-
-
|
|
24
|
+
- **Server-side reasoning field support (OpenAI-compatible provider)** — Reasoning models served over the OpenAI-compatible API (vLLM `--reasoning-parser`, DeepSeek-R1, etc.) return their chain-of-thought in a dedicated `reasoning_content` (vLLM) / `reasoning` (some gateways) field instead of inline `<think>` tags. The OpenAI provider now surfaces it:
|
|
25
|
+
- `chat()` populates `LLMChatResponse.reasoning` from `message.reasoning_content` / `message.reasoning`, keeping `message.content` clean
|
|
26
|
+
- `chatStream()` emits `delta.reasoning_content` / `delta.reasoning` chunks as `thinking` stream events and accumulates them into the final `reasoning`
|
|
27
|
+
- Inline `<think>` parsing (via `StandardChatDecoder`) is preserved as a fallback for servers run without a reasoning parser
|
|
28
|
+
- `OpenAIResponse` message type extended with optional `reasoning` / `reasoning_content`
|
|
29
|
+
- Verified end-to-end against vLLM serving `nvidia/Qwen3.6-35B-A3B-NVFP4` (NVFP4) on Blackwell — reasoning, streaming `thinking` events, tool calling (`qwen3_xml`), and structured output all pass
|
|
30
|
+
- **Unified `thinking` flag with levels across all providers** — `thinking` (model config) and per-call `ChatOptions.thinking` accept `true`/`false` **or a level `'minimal' | 'low' | 'medium' | 'high'`** (new `ThinkingLevel` type), mapped to each backend's native control so apps switch providers without reasoning-specific code. A shared `resolveThinking` helper (`src/thinking.ts`) normalizes the value; each provider maps it:
|
|
31
|
+
- **OpenAI-compatible** → OpenAI reasoning models (o-series / GPT-5, by name) get `reasoning_effort:<level>`; vLLM / Qwen get `chat_template_kwargs.enable_thinking`. Emitted only when explicitly set.
|
|
32
|
+
- **Google / Gemini** → Gemini 3.x `thinkingConfig.thinkingLevel`; Gemini 2.5/2.0 `thinkingBudget` (level→budget map, `0` off, `-1` dynamic). `includeThoughts` enabled when thinking is on.
|
|
33
|
+
- **Anthropic** → extended thinking `budget_tokens` from the level (kept `< max_tokens`; temperature omitted, per API).
|
|
34
|
+
- **Ollama** → `think` on/off (no native levels).
|
|
35
|
+
- Per-call overrides model config everywhere. Verified live against vLLM (Qwen3.6-NVFP4) and **Gemini 3.5 Flash** (levels produce distinct reasoning-token counts); unit-tested per provider + `resolveThinking`.
|
|
36
|
+
- **Gemini reasoning text surfaced** — with thinking on, the Google provider sets `includeThoughts:true` and routes `thought:true` parts into `response.reasoning` (non-streaming) and live `thinking` stream events, matching how Qwen/Anthropic expose chain-of-thought (previously only `reasoningTokens` was reported). Verified live on `gemini-3.5-flash` (654–976 chars of reasoning across levels).
|
|
37
|
+
- **Gemini Deep Research API** — new Google-only `AIModel.deepResearch(input, opts)` (creates a `/v1beta/interactions` background interaction and polls to completion → `{ id, status, report, steps }`) and `AIModel.deepResearchStream(input, opts)` (live `thought`/`text`/`status` events). New `DeepResearchOptions`/`DeepResearchResult`/`DeepResearchStep`/`DeepResearchEvent` types. Throws a clear error if no Google provider is configured. Create + poll plumbing verified live.
|
|
38
|
+
- **Generation stats — `usage.durationMs` and `usage.tokensPerSecond`** — decode throughput is now reported on `LLMChatResponse.usage`: server-precise for Ollama (from `eval_count` / `eval_duration`, which were previously discarded), and client-measured wall-clock for OpenAI-compatible / vLLM (which return no timing in `usage`). `OllamaResponse` gained `total_duration` / `load_duration` typings.
|
|
39
|
+
- **OpenAI-compatible transport flexibility** — `ProviderConfig` gains `headers`, `queryParams`, `authHeader`, `authPrefix`, and `apiBasePath` for Azure OpenAI, custom gateways, and non-`/v1` servers (no custom code needed). Honored by the OpenAI-compatible provider (`headers`/`authHeader`/`authPrefix` also by Ollama, via `buildHeaders`); a `buildUrl` helper applies `queryParams`/`apiBasePath` across all endpoints and preserves any query string already on the base URL. Includes a 2026 provider-API-landscape research doc (`docs/research/`).
|
|
29
40
|
|
|
30
|
-
|
|
41
|
+
### Fixed
|
|
42
|
+
|
|
43
|
+
- **README** — the tool-execution trace field is `response.toolExecutions` (array of `{ tool_call_id, output, error?, duration? }`), not `toolTrace`; corrected the `chatWithTools` example.
|
|
44
|
+
|
|
45
|
+
## [4.4.0] - 2026-06-11
|
|
31
46
|
|
|
32
47
|
### Added
|
|
33
48
|
|
|
34
|
-
- **
|
|
35
|
-
-
|
|
36
|
-
-
|
|
37
|
-
-
|
|
38
|
-
- **
|
|
49
|
+
- **Diffusion LM support (DiffusionGemma family)** — First-class client-side protocol for diffusion language models served by OpenAI-compatible endpoints that ship without server-side reasoning or tool-call parsers (e.g. current vLLM diffusion builds, which reject request-level `tools` with auto tool choice):
|
|
50
|
+
- `gemma-diffusion.ts` — model detection (`isGemmaDiffusionModel`), native channel parsing (`<|channel>thought … <channel|>` reasoning, `<|tool_call>call:name{…}<tool_call|>` tool calls), and pseudo-JSON argument conversion (`gemmaArgsToJson`: `<|"|>` quote tokens, bare keys, nested objects/arrays)
|
|
51
|
+
- OpenAI provider native mode (auto-detected from the model name, `gemmaNativeProtocol` option to override): sends `skip_special_tokens: false` and `tools` + `tool_choice: "none"` (declarations still render into the chat template), parses reasoning and tool calls client-side, and yields decoder-classified `thinking`/`text` streaming events
|
|
52
|
+
- Full agentic `chatWithTools` loop works end-to-end against DiffusionGemma; history tool turns use standard structured `tool_calls` + `role: "tool"` messages
|
|
53
|
+
- **"Signal from Noise" demo** (`src/demos/diffusion-gemma/`) — vLLM test harness plus a diffusion chat canvas that animates block-parallel denoising paced by real block arrivals, with replay/scrubbing, reasoning-channel separation, a rendered-markdown reading view, and an engine-reload entropy control
|
|
39
54
|
|
|
40
|
-
###
|
|
55
|
+
### Fixed
|
|
41
56
|
|
|
42
|
-
-
|
|
57
|
+
- Stray unbalanced `<channel|>` / `<turn|>` markers emitted by diffusion models are stripped from parsed content
|
|
43
58
|
|
|
44
59
|
## [4.0.0] - 2026-03-13
|
|
45
60
|
|
package/README.md
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
# universal-llm-client
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[npm package](https://www.npmjs.com/package/universal-llm-client) [CI status](https://github.com/igorls/universal-llm-client/actions/workflows/ci.yml) [npm downloads](https://www.npmjs.com/package/universal-llm-client) [License](https://github.com/igorls/universal-llm-client/blob/main/LICENSE)
|
|
4
|
+
|
|
5
|
+
A universal LLM client for JavaScript/TypeScript with **transparent provider failover** and a **provider-agnostic reasoning API** — one set of code across OpenAI, Anthropic, Google Gemini, Ollama, vLLM, and any OpenAI-compatible endpoint. Streaming tool execution, structured output, generation stats, and native observability included.
|
|
4
6
|
|
|
5
7
|
```typescript
|
|
6
8
|
import { AIModel } from 'universal-llm-client';
|
|
7
9
|
|
|
8
10
|
const model = new AIModel({
|
|
9
|
-
model: 'gemini-
|
|
11
|
+
model: 'gemini-3.5-flash',
|
|
10
12
|
providers: [
|
|
11
13
|
{ type: 'google', apiKey: process.env.GOOGLE_API_KEY },
|
|
12
14
|
{ type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
|
|
@@ -26,10 +28,13 @@ const response = await model.chat([
|
|
|
26
28
|
## Features
|
|
27
29
|
|
|
28
30
|
- 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
|
|
31
|
+
- 🧠 **Unified Reasoning** — One `thinking` flag (`true`/`false` or a level: `'minimal' | 'low' | 'medium' | 'high'`) mapped to each backend's native control; chain-of-thought surfaced as `response.reasoning` + streaming `thinking` events (with `<think>`-tag parsing as a fallback)
|
|
29
32
|
- 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
|
|
30
33
|
- 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
|
|
31
34
|
- 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
|
|
32
|
-
-
|
|
35
|
+
- 🔬 **Deep Research** — Drive Google Gemini's agentic Deep Research (background interactions with polling + streaming)
|
|
36
|
+
- 📈 **Generation Stats** — `usage.tokensPerSecond` and `durationMs` reported across providers
|
|
37
|
+
- 🔌 **Flexible Transport** — Custom headers, query params, auth header/prefix, and base path for Azure OpenAI and gateways
|
|
33
38
|
- 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
|
|
34
39
|
- 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
|
|
35
40
|
- 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
|
|
@@ -39,11 +44,14 @@ const response = await model.chat([
|
|
|
39
44
|
|
|
40
45
|
| Provider | Type | Notes |
|
|
41
46
|
|---|---|---|
|
|
42
|
-
| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
|
|
43
|
-
| **OpenAI** | `openai` | GPT-
|
|
44
|
-
| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
|
|
45
|
-
| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints
|
|
46
|
-
| **
|
|
47
|
+
| **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal, native thinking |
|
|
48
|
+
| **OpenAI + Compat** | `openai` | GPT series, o-series + **any OpenAI-compatible endpoint**: xAI/Grok, Mistral, DeepSeek, Cohere Compatibility, Groq, Together, Fireworks, OpenRouter, Perplexity Sonar, vLLM, LM Studio, TGI, most self-hosted servers |
|
|
49
|
+
| **Google AI Studio** | `google` | Gemini models, system instructions, multimodal, native thinking + grounding |
|
|
50
|
+
| **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints, Bearer tokens, service tiers (flex/priority) |
|
|
51
|
+
| **Anthropic (Claude)** | `anthropic` | Claude 3.5/4 models via native Messages API. Excellent tool use, extended thinking with signatures, strong prompt caching |
|
|
52
|
+
| **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances (OpenAI-compatible under the hood) |
|
|
53
|
+
|
|
54
|
+
**Most of the world** is reachable via `type: 'openai'` + a `url` override. We only maintain dedicated clients for fundamentally different protocols (Anthropic Messages, Google Gemini) that offer unique high-value capabilities, plus Ollama for local developer experience. See `docs/guide/providers.md` and the research survey in `docs/research/provider-api-landscape-2026.md`.
|
|
47
55
|
|
|
48
56
|
---
|
|
49
57
|
|
|
@@ -98,6 +106,44 @@ for await (const event of model.chatStream([
|
|
|
98
106
|
}
|
|
99
107
|
```
|
|
100
108
|
|
|
109
|
+
### Thinking & Reasoning
|
|
110
|
+
|
|
111
|
+
Set one `thinking` value — `true`/`false` or a level (`'minimal' | 'low' | 'medium' | 'high'`) —
|
|
112
|
+
and it maps to each provider's native control (Gemini `thinkingLevel`/`thinkingBudget`, OpenAI
|
|
113
|
+
`reasoning_effort`, vLLM `enable_thinking`, Anthropic `budget_tokens`, Ollama `think`):
|
|
114
|
+
|
|
115
|
+
```typescript
|
|
116
|
+
const model = new AIModel({
|
|
117
|
+
model: 'gemini-3.5-flash',
|
|
118
|
+
thinking: 'high', // true | false | 'minimal' | 'low' | 'medium' | 'high'
|
|
119
|
+
providers: [{ type: 'google', apiKey: process.env.GOOGLE_API_KEY }],
|
|
120
|
+
});
|
|
121
|
+
|
|
122
|
+
const res = await model.chat([{ role: 'user', content: 'Solve this step by step: ...' }]);
|
|
123
|
+
console.log(res.message.content); // final answer (clean)
|
|
124
|
+
console.log(res.reasoning); // chain-of-thought, when the model exposes it
|
|
125
|
+
|
|
126
|
+
// Per-call override (e.g. turn thinking off for structured output)
|
|
127
|
+
await model.chat(messages, { thinking: false });
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Deep Research (Gemini)
|
|
131
|
+
|
|
132
|
+
Run Google's agentic Deep Research — creates a background interaction and polls to completion:
|
|
133
|
+
|
|
134
|
+
```typescript
|
|
135
|
+
const result = await model.deepResearch('Research the history of Google TPUs.', {
|
|
136
|
+
tools: ['google_search', 'url_context'],
|
|
137
|
+
});
|
|
138
|
+
console.log(result.status, result.report);
|
|
139
|
+
|
|
140
|
+
// Or stream intermediate thoughts and steps as they arrive:
|
|
141
|
+
for await (const ev of model.deepResearchStream('Compare RISC-V vs ARM in 2026.')) {
|
|
142
|
+
if (ev.type === 'thought') console.log('[thinking]', ev.content);
|
|
143
|
+
else if (ev.type === 'text') process.stdout.write(ev.content);
|
|
144
|
+
}
|
|
145
|
+
```
|
|
146
|
+
|
|
101
147
|
### Tool Calling
|
|
102
148
|
|
|
103
149
|
```typescript
|
|
@@ -124,8 +170,8 @@ const response = await model.chatWithTools([
|
|
|
124
170
|
|
|
125
171
|
console.log(response.message.content);
|
|
126
172
|
// "The weather in Tokyo is 22°C and sunny."
|
|
127
|
-
console.log(response.
|
|
128
|
-
// [{
|
|
173
|
+
console.log(response.toolExecutions);
|
|
174
|
+
// [{ tool_call_id: 'call_abc', output: { temperature: 22, condition: 'sunny', city: 'Tokyo' }, duration: 5 }]
|
|
129
175
|
```
|
|
130
176
|
|
|
131
177
|
### Provider Failover
|
|
@@ -421,13 +467,18 @@ new AIModel(config: AIModelConfig)
|
|
|
421
467
|
|
|
422
468
|
| Property | Type | Description |
|
|
423
469
|
|---|---|---|
|
|
424
|
-
| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
|
|
470
|
+
| `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'`, `'anthropic'` |
|
|
425
471
|
| `url` | `string` | Provider URL (has sensible defaults) |
|
|
426
472
|
| `apiKey` | `string` | API key or Bearer token |
|
|
427
473
|
| `priority` | `number` | Lower = tried first (defaults to array index) |
|
|
428
474
|
| `model` | `string` | Override model name for this provider |
|
|
429
475
|
| `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
|
|
430
476
|
| `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
|
|
477
|
+
| `headers` | `Record<string,string>` | Extra headers merged into requests — OpenAI-compatible & Ollama (Azure `api-key`, gateways) |
|
|
478
|
+
| `queryParams` | `Record<string,string>` | Query params appended to URLs — OpenAI-compatible only (e.g. Azure `api-version`) |
|
|
479
|
+
| `authHeader` | `string` | Header name for the key — OpenAI-compatible & Ollama (e.g. `'api-key'`) |
|
|
480
|
+
| `authPrefix` | `string` | Prefix before the key value — OpenAI-compatible & Ollama (e.g. `''` for api-key style) |
|
|
481
|
+
| `apiBasePath` | `string` | OpenAI-compatible only: override or disable the `/v1` suffix (use `''` for full Azure deployment URLs) |
|
|
431
482
|
|
|
432
483
|
**Methods:**
|
|
433
484
|
|
package/dist/ai-model.d.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Provider classes are internal — the user never imports them.
|
|
8
8
|
*/
|
|
9
|
-
import { type AIModelConfig, type LLMChatMessage, type LLMChatResponse, type ChatOptions, type ModelMetadata, type LLMFunction, type ToolHandler } from './interfaces.js';
|
|
9
|
+
import { type AIModelConfig, type LLMChatMessage, type LLMChatResponse, type ChatOptions, type ModelMetadata, type LLMFunction, type ToolHandler, type DeepResearchOptions, type DeepResearchResult, type DeepResearchEvent } from './interfaces.js';
|
|
10
10
|
import type { DecodedEvent } from './stream-decoder.js';
|
|
11
11
|
import { type ProviderStatus } from './router.js';
|
|
12
12
|
import { type StructuredOutputResult, type SchemaConfig } from './structured-output.js';
|
|
@@ -113,6 +113,17 @@ export declare class AIModel {
|
|
|
113
113
|
embed(text: string): Promise<number[]>;
|
|
114
114
|
/** Generate embeddings for multiple texts */
|
|
115
115
|
embedArray(texts: string[]): Promise<number[][]>;
|
|
116
|
+
private getGoogleClient;
|
|
117
|
+
/**
|
|
118
|
+
* Run an agentic Deep Research interaction (Gemini only): creates it and
|
|
119
|
+
* polls until completion. Throws if no AI Studio Google provider (type: "google") is configured; Vertex AI is not supported.
|
|
120
|
+
*/
|
|
121
|
+
deepResearch(input: string, options?: DeepResearchOptions): Promise<DeepResearchResult>;
|
|
122
|
+
/**
|
|
123
|
+
* Stream a Deep Research interaction's intermediate thought/text/step events
|
|
124
|
+
* (Gemini only), returning the final result. Throws if no AI Studio Google provider (type: "google") is configured; Vertex AI is not supported.
|
|
125
|
+
*/
|
|
126
|
+
deepResearchStream(input: string, options?: DeepResearchOptions): AsyncGenerator<DeepResearchEvent, DeepResearchResult, unknown>;
|
|
116
127
|
/** Register a tool callable by the LLM (broadcast to all providers) */
|
|
117
128
|
registerTool(name: string, description: string, parameters: LLMFunction['parameters'], handler: ToolHandler): void;
|
|
118
129
|
/** Register multiple tools at once */
|
|
@@ -137,4 +148,3 @@ export declare class AIModel {
|
|
|
137
148
|
private createClient;
|
|
138
149
|
private normalizeType;
|
|
139
150
|
}
|
|
140
|
-
//# sourceMappingURL=ai-model.d.ts.map
|
package/dist/ai-model.js
CHANGED
|
@@ -172,6 +172,34 @@ export class AIModel {
|
|
|
172
172
|
return this.router.embedArray(texts);
|
|
173
173
|
}
|
|
174
174
|
// ========================================================================
|
|
175
|
+
// Deep Research (Gemini-only)
|
|
176
|
+
// ========================================================================
|
|
177
|
+
getGoogleClient(method) {
|
|
178
|
+
const googleClients = this.router.getClients().filter((c) => c instanceof GoogleClient);
|
|
179
|
+
// Prefer an AI Studio client — Vertex AI doesn't support Deep Research.
|
|
180
|
+
const aiStudio = googleClients.find(c => c.supportsDeepResearch());
|
|
181
|
+
if (aiStudio)
|
|
182
|
+
return aiStudio;
|
|
183
|
+
if (googleClients.length > 0) {
|
|
184
|
+
throw new Error(`${method} requires an AI Studio Google provider (type: "google"); Vertex AI is not supported for Deep Research.`);
|
|
185
|
+
}
|
|
186
|
+
throw new Error(`${method} requires a Google provider (type: "google"). None is configured.`);
|
|
187
|
+
}
|
|
188
|
+
/**
|
|
189
|
+
* Run an agentic Deep Research interaction (Gemini only): creates it and
|
|
190
|
+
* polls until completion. Throws if no AI Studio Google provider (type: "google") is configured; Vertex AI is not supported.
|
|
191
|
+
*/
|
|
192
|
+
async deepResearch(input, options) {
|
|
193
|
+
return this.getGoogleClient('deepResearch').deepResearch(input, options);
|
|
194
|
+
}
|
|
195
|
+
/**
|
|
196
|
+
* Stream a Deep Research interaction's intermediate thought/text/step events
|
|
197
|
+
* (Gemini only), returning the final result. Throws if no AI Studio Google provider (type: "google") is configured; Vertex AI is not supported.
|
|
198
|
+
*/
|
|
199
|
+
async *deepResearchStream(input, options) {
|
|
200
|
+
return yield* this.getGoogleClient('deepResearchStream').deepResearchStream(input, options);
|
|
201
|
+
}
|
|
202
|
+
// ========================================================================
|
|
175
203
|
// Tool Registration
|
|
176
204
|
// ========================================================================
|
|
177
205
|
/** Register a tool callable by the LLM (broadcast to all providers) */
|
|
@@ -232,9 +260,16 @@ export class AIModel {
|
|
|
232
260
|
retries: this.config.retries ?? 2,
|
|
233
261
|
debug: this.config.debug ?? false,
|
|
234
262
|
defaultParameters: this.config.defaultParameters,
|
|
235
|
-
|
|
263
|
+
// Preserve `undefined` (not set) vs explicit false so providers can
|
|
264
|
+
// decide whether to send a thinking toggle at all.
|
|
265
|
+
thinking: this.config.thinking,
|
|
236
266
|
region: providerConfig.region,
|
|
237
267
|
apiVersion: providerConfig.apiVersion,
|
|
268
|
+
extraHeaders: providerConfig.headers,
|
|
269
|
+
queryParams: providerConfig.queryParams,
|
|
270
|
+
authHeader: providerConfig.authHeader,
|
|
271
|
+
authPrefix: providerConfig.authPrefix,
|
|
272
|
+
apiBasePath: providerConfig.apiBasePath,
|
|
238
273
|
};
|
|
239
274
|
switch (type) {
|
|
240
275
|
case 'ollama':
|
|
@@ -255,4 +290,3 @@ export class AIModel {
|
|
|
255
290
|
return type.toLowerCase();
|
|
256
291
|
}
|
|
257
292
|
}
|
|
258
|
-
//# sourceMappingURL=ai-model.js.map
|
package/dist/auditor.d.ts
CHANGED
package/dist/auditor.js
CHANGED
package/dist/client.d.ts
CHANGED
package/dist/client.js
CHANGED
package/dist/gemma-channel.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gemma 4 can emit its thought channel as text control tokens instead of the
|
|
3
|
+
* generic Ollama `message.thinking` field. Keep that provider quirk isolated so
|
|
4
|
+
* callers receive final-answer text and reasoning separately.
|
|
5
|
+
*/
|
|
6
|
+
export interface GemmaThoughtExtraction {
|
|
7
|
+
readonly content: string;
|
|
8
|
+
readonly reasoning: string;
|
|
9
|
+
readonly found: boolean;
|
|
10
|
+
}
|
|
11
|
+
export declare const GEMMA_THOUGHT_OPENERS: readonly ["<|channel>thought", "<|thought"];
|
|
12
|
+
export declare function extractGemmaThoughtChannels(input: string): GemmaThoughtExtraction;
|
|
13
|
+
export declare function normalizeGemmaThought(thought: string): string;
|
package/dist/gemma-channel.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gemma 4 can emit its thought channel as text control tokens instead of the
|
|
3
|
+
* generic Ollama `message.thinking` field. Keep that provider quirk isolated so
|
|
4
|
+
* callers receive final-answer text and reasoning separately.
|
|
5
|
+
*/
|
|
6
|
+
const GEMMA_THOUGHT_BLOCK = /<\|channel>\s*thought\s*\r?\n?([\s\S]*?)<channel\|>/gi;
|
|
7
|
+
const GEMMA_COMPACT_THOUGHT_BLOCK = /<\|thought\s*\r?\n?([\s\S]*?)\|>/gi;
|
|
8
|
+
export const GEMMA_THOUGHT_OPENERS = ['<|channel>thought', '<|thought'];
|
|
9
|
+
export function extractGemmaThoughtChannels(input) {
|
|
10
|
+
if (!input)
|
|
11
|
+
return { content: input, reasoning: '', found: false };
|
|
12
|
+
const reasoningParts = [];
|
|
13
|
+
let found = false;
|
|
14
|
+
const content = input
|
|
15
|
+
.replace(GEMMA_THOUGHT_BLOCK, (_match, thought) => {
|
|
16
|
+
found = true;
|
|
17
|
+
const normalized = normalizeGemmaThought(thought);
|
|
18
|
+
if (normalized)
|
|
19
|
+
reasoningParts.push(normalized);
|
|
20
|
+
return '';
|
|
21
|
+
})
|
|
22
|
+
.replace(GEMMA_COMPACT_THOUGHT_BLOCK, (_match, thought) => {
|
|
23
|
+
found = true;
|
|
24
|
+
const normalized = normalizeGemmaThought(thought);
|
|
25
|
+
if (normalized)
|
|
26
|
+
reasoningParts.push(normalized);
|
|
27
|
+
return '';
|
|
28
|
+
});
|
|
29
|
+
return {
|
|
30
|
+
content,
|
|
31
|
+
reasoning: reasoningParts.join('\n\n'),
|
|
32
|
+
found,
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
export function normalizeGemmaThought(thought) {
|
|
36
|
+
return thought.replace(/^\s+/, '').replace(/\s+$/, '');
|
|
37
|
+
}
|
package/dist/gemma-diffusion.d.ts
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DiffusionGemma (vLLM) native-protocol adapter.
|
|
3
|
+
*
|
|
4
|
+
* Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
|
|
5
|
+
* and NO tool-call parser module, and they reject OpenAI-style `tools` unless
|
|
6
|
+
* `--tool-call-parser` is configured. Everything therefore has to be handled
|
|
7
|
+
* client-side, against the model's native channel format (visible only when
|
|
8
|
+
* the request sets `skip_special_tokens: false`):
|
|
9
|
+
*
|
|
10
|
+
* <|channel>thought ...reasoning... <channel|> reasoning channel
|
|
11
|
+
* <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|> tool call
|
|
12
|
+
*
|
|
13
|
+
* Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
|
|
14
|
+
* <|"|> quote token, numbers/booleans are bare (see the model's
|
|
15
|
+
* chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
|
|
16
|
+
* that into a standard JSON string.
|
|
17
|
+
*
|
|
18
|
+
* Request-side protocol (implemented in the OpenAI provider):
|
|
19
|
+
* - always send `skip_special_tokens: false`
|
|
20
|
+
* - send `tools` with `tool_choice: 'none'` — vLLM still renders the
|
|
21
|
+
* declarations into the chat template, it just skips its (absent) parser
|
|
22
|
+
* - send history tool turns structurally (assistant `tool_calls` +
|
|
23
|
+
* `role: 'tool'` messages) — the chat template renders them natively
|
|
24
|
+
*/
|
|
25
|
+
export interface GemmaParsedToolCall {
|
|
26
|
+
readonly name: string;
|
|
27
|
+
/** JSON-encoded arguments object, ready for LLMToolCall.function.arguments */
|
|
28
|
+
readonly argumentsJson: string;
|
|
29
|
+
}
|
|
30
|
+
export interface GemmaDiffusionParsed {
|
|
31
|
+
/** Final answer with reasoning, tool-call blocks and special tokens removed */
|
|
32
|
+
readonly content: string;
|
|
33
|
+
readonly reasoning: string;
|
|
34
|
+
readonly toolCalls: readonly GemmaParsedToolCall[];
|
|
35
|
+
}
|
|
36
|
+
/** Models that speak this native protocol when served by vLLM. */
|
|
37
|
+
export declare function isGemmaDiffusionModel(model: string): boolean;
|
|
38
|
+
/**
|
|
39
|
+
* Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
|
|
40
|
+
* Lenient by design: bare words that aren't numbers/booleans become strings,
|
|
41
|
+
* since the model occasionally omits the quote token.
|
|
42
|
+
*/
|
|
43
|
+
export declare function gemmaArgsToJson(body: string): string;
|
|
44
|
+
/**
|
|
45
|
+
* Parse a complete raw DiffusionGemma output into reasoning, tool calls and
|
|
46
|
+
* clean answer text.
|
|
47
|
+
*/
|
|
48
|
+
export declare function parseGemmaDiffusionOutput(raw: string): GemmaDiffusionParsed;
|
package/dist/gemma-diffusion.js
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DiffusionGemma (vLLM) native-protocol adapter.
|
|
3
|
+
*
|
|
4
|
+
* Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
|
|
5
|
+
* and NO tool-call parser module, and they reject OpenAI-style `tools` unless
|
|
6
|
+
* `--tool-call-parser` is configured. Everything therefore has to be handled
|
|
7
|
+
* client-side, against the model's native channel format (visible only when
|
|
8
|
+
* the request sets `skip_special_tokens: false`):
|
|
9
|
+
*
|
|
10
|
+
* <|channel>thought ...reasoning... <channel|> reasoning channel
|
|
11
|
+
* <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|> tool call
|
|
12
|
+
*
|
|
13
|
+
* Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
|
|
14
|
+
* <|"|> quote token, numbers/booleans are bare (see the model's
|
|
15
|
+
* chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
|
|
16
|
+
* that into a standard JSON string.
|
|
17
|
+
*
|
|
18
|
+
* Request-side protocol (implemented in the OpenAI provider):
|
|
19
|
+
* - always send `skip_special_tokens: false`
|
|
20
|
+
* - send `tools` with `tool_choice: 'none'` — vLLM still renders the
|
|
21
|
+
* declarations into the chat template, it just skips its (absent) parser
|
|
22
|
+
* - send history tool turns structurally (assistant `tool_calls` +
|
|
23
|
+
* `role: 'tool'` messages) — the chat template renders them natively
|
|
24
|
+
*/
|
|
25
|
+
import { extractGemmaThoughtChannels } from './gemma-channel.js';
|
|
26
|
+
/**
 * Tell whether a model identifier belongs to the DiffusionGemma family, i.e.
 * the models that speak this native protocol when served by vLLM.
 *
 * The check is a case-insensitive search for `diffusion`, an optional `-` or
 * `_` separator, and `gemma` anywhere in the identifier.
 */
export function isGemmaDiffusionModel(model) {
    const diffusionGemmaPattern = /diffusion[-_]?gemma/i;
    const matches = diffusionGemmaPattern.test(model);
    return matches;
}
|
|
30
|
+
/**
 * Matches one complete `<|tool_call>call:NAME{ARGS}<tool_call|>` block.
 * Group 1 is the tool name (letters, digits, `_`, `.`, `-`); group 2 is the
 * argument body without its outer braces. The body is matched lazily, so it
 * ends at the first `}` that is followed (after optional whitespace) by the
 * `<tool_call|>` closer. Global flag: used to find every block in the output.
 */
const TOOL_CALL_BLOCK = /<\|tool_call>\s*call:([a-zA-Z0-9_.-]+)\s*\{([\s\S]*?)\}\s*<tool_call\|>/g;
/**
 * Residual control tokens that may leak into text output — including stray
 * unbalanced channel markers (the model occasionally emits an extra
 * <channel|> closer mid-answer).
 *
 * First alternative: an opener-style token (`<name ...>` or `<|name ...>`,
 * optionally ending in `|>`) for any of the listed token names.
 * Second alternative: a bare closer of the form `<name|>`.
 */
const RESIDUAL_SPECIAL = /<\|?(?:turn|think|image|audio|video|tool_response|tool_call|tool|channel)\b[^>]*?\|?>|<(?:turn|channel|tool_response|tool_call|tool)\|>/g;
|
|
37
|
+
/** Special token the Gemma chat template uses in place of `"` around strings. */
const QUOTE_TOKEN = '<|"|>';
/**
 * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
 * Lenient by design: bare words that aren't numbers/booleans become strings,
 * since the model occasionally omits the quote token.
 *
 * Guarantees on malformed input:
 *  - the parser always terminates (a stray `}` inside an array, or an array
 *    that is never closed, ends the array instead of stalling the loop);
 *  - bare values are only emitted unquoted when they are valid JSON numbers
 *    (so `02134` becomes the string "02134" rather than invalid JSON).
 *
 * @param body Argument body WITHOUT its outer braces, e.g. `k:<|"|>v<|"|>,n:3`.
 * @returns JSON object text, e.g. `{"k":"v","n":3}`.
 */
export function gemmaArgsToJson(body) {
    // Argument bodies arrive without their outer braces (the regex strips them)
    const src = `{${body}}`;
    const n = src.length;
    // JSON number grammar (RFC 8259): no leading zeros, optional fraction/exponent.
    const jsonNumber = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/;
    let i = 0;
    /** Advance the cursor past any whitespace. */
    function skipWs() {
        while (i < n && /\s/.test(src[i]))
            i++;
    }
    /** Read a QUOTE_TOKEN-delimited string; an unterminated one runs to end of input. */
    function parseQuoted() {
        // positioned at the start of QUOTE_TOKEN
        i += QUOTE_TOKEN.length;
        const end = src.indexOf(QUOTE_TOKEN, i);
        const raw = end === -1 ? src.slice(i) : src.slice(i, end);
        i = end === -1 ? n : end + QUOTE_TOKEN.length;
        return raw;
    }
    /** Read raw characters up to (not including) a stop character or a quote token. */
    function parseBare(stops) {
        const start = i;
        while (i < n && !stops.includes(src[i]) && !src.startsWith(QUOTE_TOKEN, i))
            i++;
        return src.slice(start, i).trim();
    }
    /** Parse one value and return its JSON text. */
    function parseValue() {
        skipWs();
        if (src.startsWith(QUOTE_TOKEN, i))
            return JSON.stringify(parseQuoted());
        const c = src[i];
        if (c === '{')
            return parseObject();
        if (c === '[')
            return parseArray();
        const bare = parseBare(',}]');
        if (jsonNumber.test(bare))
            return bare;
        if (bare === 'true' || bare === 'false' || bare === 'null')
            return bare;
        // Anything else (including number-like text JSON would reject) is a string.
        return JSON.stringify(bare);
    }
    /** Parse `{key:value,...}`; keys may be bare or quote-token wrapped. */
    function parseObject() {
        i++; // consume {
        const parts = [];
        skipWs();
        while (i < n && src[i] !== '}') {
            skipWs();
            const key = src.startsWith(QUOTE_TOKEN, i) ? parseQuoted() : parseBare(':');
            skipWs();
            if (src[i] === ':')
                i++;
            const value = parseValue();
            parts.push(`${JSON.stringify(key.trim())}:${value}`);
            skipWs();
            if (src[i] === ',')
                i++;
            skipWs();
        }
        i++; // consume }
        return `{${parts.join(',')}}`;
    }
    /** Parse `[value,...]`. */
    function parseArray() {
        i++; // consume [
        const parts = [];
        skipWs();
        while (i < n && src[i] !== ']') {
            const start = i;
            const value = parseValue();
            skipWs();
            if (src[i] === ',') {
                i++;
            }
            else if (i === start) {
                // No progress: the cursor sits on a stray '}' (malformed or
                // truncated array). Stop here and let the enclosing object
                // consume it, instead of looping forever.
                break;
            }
            parts.push(value);
            skipWs();
        }
        // Only swallow a real closer; a stray '}' belongs to the parent object.
        if (src[i] === ']')
            i++;
        return `[${parts.join(',')}]`;
    }
    skipWs();
    return parseObject();
}
|
|
119
|
+
/**
 * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
 * clean answer text.
 *
 * Steps, in order:
 *  1. every complete tool-call block is removed from the text and converted
 *     to `{ name, argumentsJson }`;
 *  2. the remaining text is passed to `extractGemmaThoughtChannels`, whose
 *     `content` / `reasoning` fields are used from here on;
 *  3. a trailing thought channel that was opened but never closed is moved
 *     into `reasoning`;
 *  4. leftover control tokens are stripped and the answer is trimmed.
 *
 * @param raw Full completion text; may be empty, `null` or `undefined`.
 * @returns `{ content, reasoning, toolCalls }`. `content` is always a string,
 *   including for empty / missing input.
 */
export function parseGemmaDiffusionOutput(raw) {
    // Nothing to parse. Return an empty string rather than echoing `raw`, so
    // a null/undefined completion never leaks into the string-typed `content`.
    if (!raw)
        return { content: '', reasoning: '', toolCalls: [] };
    const toolCalls = [];
    let text = raw.replace(TOOL_CALL_BLOCK, (_m, name, args) => {
        toolCalls.push({ name, argumentsJson: gemmaArgsToJson(args) });
        return '';
    });
    const channels = extractGemmaThoughtChannels(text);
    text = channels.content;
    // Unterminated thought channel (model hit max_tokens mid-reasoning)
    let reasoning = channels.reasoning;
    const danglingThought = text.match(/<\|channel>\s*thought\s*\r?\n?([\s\S]*)$/i);
    if (danglingThought) {
        // Append to any reasoning already found, separated by a blank line.
        reasoning = reasoning ? `${reasoning}\n\n${danglingThought[1].trim()}` : danglingThought[1].trim();
        // Everything from the dangling opener onward is reasoning, not answer.
        text = text.slice(0, danglingThought.index);
    }
    text = text.replace(RESIDUAL_SPECIAL, '');
    return {
        content: text.trim(),
        reasoning,
        toolCalls,
    };
}
|
package/dist/http.d.ts
CHANGED
|
@@ -44,6 +44,9 @@ export declare function parseSSE(stream: AsyncGenerator<string>): AsyncGenerator
|
|
|
44
44
|
}, void, unknown>;
|
|
45
45
|
/**
 * Build standard headers for LLM API requests.
 * Merges any provider-specific extraHeaders (from ProviderConfig) on top.
 * Provider clients can still fully override (e.g. Anthropic uses x-api-key).
 *
 * Respects authHeader / authPrefix from config for Azure-style or gateway auth.
 *
 * @param options - Client options; `apiKey`, `authHeader`, `authPrefix` and
 *   `extraHeaders` are the fields consulted.
 * @returns A header map that always contains `Content-Type: application/json`,
 *   plus the auth header when `apiKey` is set, plus any `extraHeaders`.
 */
export declare function buildHeaders(options: LLMClientOptions): Record<string, string>;
|
|
49
|
-
//# sourceMappingURL=http.d.ts.map
|
package/dist/http.js
CHANGED
|
@@ -174,14 +174,26 @@ export async function* parseSSE(stream) {
|
|
|
174
174
|
// ============================================================================
|
|
175
175
|
/**
 * Build standard headers for LLM API requests.
 * Merges any provider-specific extraHeaders (from ProviderConfig) on top.
 * Provider clients can still fully override (e.g. Anthropic uses x-api-key).
 *
 * Respects authHeader / authPrefix from config for Azure-style or gateway auth.
 *
 * @param options Client options. Fields read here:
 *   - `apiKey`: when truthy, an auth header is emitted;
 *   - `authHeader`: auth header name (default `Authorization`);
 *   - `authPrefix`: text placed before the key; when undefined it defaults to
 *     `Bearer ` for `Authorization` and to nothing for any other header;
 *   - `extraHeaders`: extra headers merged last.
 * @returns Plain object of header name -> value.
 */
export function buildHeaders(options) {
    const headers = {
        'Content-Type': 'application/json',
    };
    if (options.apiKey) {
        const headerName = options.authHeader || 'Authorization';
        // Sensible default prefix: Bearer for Authorization, nothing for api-key / x-api-key etc.
        const defaultPrefix = headerName.toLowerCase() === 'authorization' ? 'Bearer ' : '';
        const prefix = options.authPrefix !== undefined ? options.authPrefix : defaultPrefix;
        headers[headerName] = `${prefix}${options.apiKey}`.trim();
    }
    // Merge provider-specific extras (e.g. Azure 'api-key', custom gateway headers).
    // Later entries win on conflicts, allowing complete override of auth.
    if (options.extraHeaders) {
        for (const [name, value] of Object.entries(options.extraHeaders)) {
            // HTTP header names are case-insensitive, so `authorization` must
            // replace `Authorization` rather than sit next to it (duplicates
            // get joined into one comma-separated value once the request is sent).
            const lowered = name.toLowerCase();
            for (const existing of Object.keys(headers)) {
                if (existing !== name && existing.toLowerCase() === lowered)
                    delete headers[existing];
            }
            headers[name] = value;
        }
    }
    return headers;
}
|
|
187
|
-
//# sourceMappingURL=http.js.map
|