universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/CHANGELOG.md +142 -103
  2. package/LICENSE +21 -21
  3. package/README.md +640 -591
  4. package/dist/ai-model.d.ts +12 -1
  5. package/dist/ai-model.d.ts.map +1 -1
  6. package/dist/ai-model.js +36 -1
  7. package/dist/ai-model.js.map +1 -1
  8. package/dist/gemma-channel.d.ts +14 -0
  9. package/dist/gemma-channel.d.ts.map +1 -0
  10. package/dist/gemma-channel.js +38 -0
  11. package/dist/gemma-channel.js.map +1 -0
  12. package/dist/gemma-diffusion.d.ts +49 -0
  13. package/dist/gemma-diffusion.d.ts.map +1 -0
  14. package/dist/gemma-diffusion.js +147 -0
  15. package/dist/gemma-diffusion.js.map +1 -0
  16. package/dist/http.d.ts +4 -0
  17. package/dist/http.d.ts.map +1 -1
  18. package/dist/http.js +14 -1
  19. package/dist/http.js.map +1 -1
  20. package/dist/index.d.ts +2 -1
  21. package/dist/index.d.ts.map +1 -1
  22. package/dist/index.js +4 -0
  23. package/dist/index.js.map +1 -1
  24. package/dist/interfaces.d.ts +183 -7
  25. package/dist/interfaces.d.ts.map +1 -1
  26. package/dist/interfaces.js.map +1 -1
  27. package/dist/providers/anthropic.d.ts.map +1 -1
  28. package/dist/providers/anthropic.js +28 -3
  29. package/dist/providers/anthropic.js.map +1 -1
  30. package/dist/providers/google.d.ts +22 -1
  31. package/dist/providers/google.d.ts.map +1 -1
  32. package/dist/providers/google.js +225 -13
  33. package/dist/providers/google.js.map +1 -1
  34. package/dist/providers/ollama.d.ts +2 -0
  35. package/dist/providers/ollama.d.ts.map +1 -1
  36. package/dist/providers/ollama.js +59 -30
  37. package/dist/providers/ollama.js.map +1 -1
  38. package/dist/providers/openai.d.ts +14 -0
  39. package/dist/providers/openai.d.ts.map +1 -1
  40. package/dist/providers/openai.js +200 -22
  41. package/dist/providers/openai.js.map +1 -1
  42. package/dist/router.d.ts +2 -0
  43. package/dist/router.d.ts.map +1 -1
  44. package/dist/router.js +4 -0
  45. package/dist/router.js.map +1 -1
  46. package/dist/stream-decoder.d.ts +12 -0
  47. package/dist/stream-decoder.d.ts.map +1 -1
  48. package/dist/stream-decoder.js +182 -5
  49. package/dist/stream-decoder.js.map +1 -1
  50. package/dist/thinking.d.ts +36 -0
  51. package/dist/thinking.d.ts.map +1 -0
  52. package/dist/thinking.js +52 -0
  53. package/dist/thinking.js.map +1 -0
  54. package/package.json +118 -116
  55. package/src/ai-model.ts +400 -350
  56. package/src/auditor.ts +213 -213
  57. package/src/client.ts +402 -402
  58. package/src/debug/debug-google-streaming.ts +1 -1
  59. package/src/demos/basic/universal-llm-examples.ts +3 -3
  60. package/src/demos/diffusion-gemma/.env +29 -0
  61. package/src/demos/diffusion-gemma/.env.example +27 -0
  62. package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
  63. package/src/demos/diffusion-gemma/README.md +59 -0
  64. package/src/demos/diffusion-gemma/canvas.ts +1606 -0
  65. package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
  66. package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
  67. package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
  68. package/src/demos/diffusion-gemma/server.ts +1205 -0
  69. package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
  70. package/src/gemma-channel.ts +47 -0
  71. package/src/gemma-diffusion.ts +167 -0
  72. package/src/http.ts +261 -247
  73. package/src/index.ts +180 -161
  74. package/src/interfaces.ts +843 -657
  75. package/src/mcp.ts +345 -345
  76. package/src/providers/anthropic.ts +796 -762
  77. package/src/providers/google.ts +840 -620
  78. package/src/providers/index.ts +8 -8
  79. package/src/providers/ollama.ts +503 -469
  80. package/src/providers/openai.ts +587 -392
  81. package/src/router.ts +785 -780
  82. package/src/stream-decoder.ts +535 -361
  83. package/src/structured-output.ts +759 -759
  84. package/src/test-scripts/test-google-deep-research.ts +33 -0
  85. package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
  86. package/src/test-scripts/test-google-streaming.ts +1 -1
  87. package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
  88. package/src/test-scripts/test-google-thinking.ts +46 -0
  89. package/src/test-scripts/test-system-message-positions.ts +163 -163
  90. package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
  91. package/src/test-scripts/test-vllm-qwen36.ts +256 -0
  92. package/src/tests/ai-model.test.ts +1614 -1614
  93. package/src/tests/auditor.test.ts +224 -224
  94. package/src/tests/gemma-diffusion.test.ts +115 -0
  95. package/src/tests/http.test.ts +200 -200
  96. package/src/tests/interfaces.test.ts +117 -117
  97. package/src/tests/providers/anthropic.test.ts +118 -0
  98. package/src/tests/providers/google.test.ts +841 -660
  99. package/src/tests/providers/ollama.test.ts +1034 -954
  100. package/src/tests/providers/openai.test.ts +1511 -1122
  101. package/src/tests/router.test.ts +254 -254
  102. package/src/tests/stream-decoder.test.ts +263 -179
  103. package/src/tests/structured-output.test.ts +1450 -1450
  104. package/src/tests/thinking.test.ts +65 -0
  105. package/src/tests/tools.test.ts +175 -175
  106. package/src/thinking.ts +73 -0
  107. package/src/tools.ts +246 -246
  108. package/src/zod-adapter.ts +72 -72
package/README.md CHANGED
@@ -1,591 +1,640 @@
1
- # universal-llm-client
2
-
3
- A universal LLM client for JavaScript/TypeScript with **transparent provider failover**, streaming tool execution, pluggable reasoning strategies, and native observability.
4
-
5
- ```typescript
6
- import { AIModel } from 'universal-llm-client';
7
-
8
- const model = new AIModel({
9
- model: 'gemini-2.5-flash',
10
- providers: [
11
- { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
12
- { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
13
- { type: 'ollama' },
14
- ],
15
- });
16
-
17
- const response = await model.chat([
18
- { role: 'user', content: 'Hello!' },
19
- ]);
20
- ```
21
-
22
- > **One model, multiple backends.** If Google fails, it transparently fails over to OpenRouter, then to local Ollama. Your code never knows the difference.
23
-
24
- ---
25
-
26
- ## Features
27
-
28
- - 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
29
- - 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
30
- - 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
31
- - 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
32
- - 🧠 **Reasoning** — Native `<think>` tag parsing, interleaved reasoning, and model thinking support
33
- - 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
34
- - 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
35
- - 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
36
- - 📊 **Embeddings** — Single and batch embedding generation
37
-
38
- ## Supported Providers
39
-
40
- | Provider | Type | Notes |
41
- |---|---|---|
42
- | **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal |
43
- | **OpenAI** | `openai` | GPT-4o, o3, etc. Also works with OpenRouter, Groq, LM Studio, vLLM |
44
- | **Google AI Studio** | `google` | Gemini models, system instructions, multimodal |
45
- | **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints and Bearer tokens |
46
- | **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances |
47
-
48
- ---
49
-
50
- ## Installation
51
-
52
- ```bash
53
- bun add universal-llm-client
54
- # or
55
- npm install universal-llm-client
56
- ```
57
-
58
- **Optional**: For MCP integration:
59
- ```bash
60
- bun add @modelcontextprotocol/sdk
61
- ```
62
-
63
- ---
64
-
65
- ## Quick Start
66
-
67
- ### Basic Chat
68
-
69
- ```typescript
70
- import { AIModel } from 'universal-llm-client';
71
-
72
- const model = new AIModel({
73
- model: 'qwen3:4b',
74
- providers: [{ type: 'ollama' }],
75
- });
76
-
77
- const response = await model.chat([
78
- { role: 'system', content: 'You are a helpful assistant.' },
79
- { role: 'user', content: 'What is the capital of France?' },
80
- ]);
81
-
82
- console.log(response.message.content);
83
- // "The capital of France is Paris."
84
- ```
85
-
86
- ### Streaming
87
-
88
- ```typescript
89
- for await (const event of model.chatStream([
90
- { role: 'user', content: 'Write a haiku about code.' },
91
- ])) {
92
- if (event.type === 'text') {
93
- process.stdout.write(event.content);
94
- } else if (event.type === 'thinking') {
95
- // Model reasoning (when supported)
96
- console.log('[thinking]', event.content);
97
- }
98
- }
99
- ```
100
-
101
- ### Tool Calling
102
-
103
- ```typescript
104
- model.registerTool(
105
- 'get_weather',
106
- 'Get current weather for a location',
107
- {
108
- type: 'object',
109
- properties: {
110
- city: { type: 'string', description: 'City name' },
111
- },
112
- required: ['city'],
113
- },
114
- async (args) => {
115
- const { city } = args as { city: string };
116
- return { temperature: 22, condition: 'sunny', city };
117
- },
118
- );
119
-
120
- // Autonomous tool execution — the model calls tools and loops until done
121
- const response = await model.chatWithTools([
122
- { role: 'user', content: "What's the weather in Tokyo?" },
123
- ]);
124
-
125
- console.log(response.message.content);
126
- // "The weather in Tokyo is 22°C and sunny."
127
- console.log(response.toolTrace);
128
- // [{ name: 'get_weather', args: { city: 'Tokyo' }, result: {...}, duration: 5 }]
129
- ```
130
-
131
- ### Provider Failover
132
-
133
- ```typescript
134
- const model = new AIModel({
135
- model: 'gemini-2.5-flash',
136
- retries: 2, // retries per provider before failover
137
- timeout: 30000, // request timeout in ms
138
- providers: [
139
- { type: 'google', apiKey: process.env.GOOGLE_KEY, priority: 0 },
140
- { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY, priority: 1 },
141
- { type: 'ollama', url: 'http://localhost:11434', priority: 2 },
142
- ],
143
- });
144
-
145
- // If Google returns 500, retries twice, then seamlessly tries OpenRouter.
146
- // If OpenRouter also fails, falls back to local Ollama.
147
- // Your code sees a single response.
148
- const response = await model.chat([{ role: 'user', content: 'Hello' }]);
149
-
150
- // Check provider health at any time
151
- console.log(model.getProviderStatus());
152
- // [{ id: 'google-0', healthy: true }, { id: 'openai-1', healthy: true }, ...]
153
- ```
154
-
155
- ### Multimodal (Vision)
156
-
157
- ```typescript
158
- import { AIModel, multimodalMessage } from 'universal-llm-client';
159
-
160
- const model = new AIModel({
161
- model: 'gemini-2.5-flash',
162
- providers: [{ type: 'google', apiKey: process.env.GOOGLE_KEY }],
163
- });
164
-
165
- const response = await model.chat([
166
- multimodalMessage('What do you see in this image?', [
167
- 'https://example.com/photo.jpg',
168
- ]),
169
- ]);
170
- ```
171
-
172
- ### Embeddings
173
-
174
- ```typescript
175
- const embedModel = new AIModel({
176
- model: 'nomic-embed-text-v2-moe:latest',
177
- providers: [{ type: 'ollama' }],
178
- });
179
-
180
- const vector = await embedModel.embed('Hello world');
181
- // [0.006, 0.026, -0.009, ...]
182
-
183
- const vectors = await embedModel.embedArray(['Hello', 'World']);
184
- // [[0.006, ...], [0.012, ...]]
185
- ```
186
-
187
- ### Structured Output
188
-
189
- Get typed, validated JSON responses from any LLM using Zod schemas:
190
-
191
- ```typescript
192
- import { AIModel } from 'universal-llm-client';
193
- import { z } from 'zod';
194
-
195
- const model = new AIModel({
196
- model: 'gemini-2.5-flash',
197
- providers: [
198
- { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
199
- { type: 'ollama' },
200
- ],
201
- });
202
-
203
- // Define your schema
204
- const UserSchema = z.object({
205
- name: z.string(),
206
- age: z.number(),
207
- email: z.string().email(),
208
- interests: z.array(z.string()),
209
- });
210
-
211
- // Method 1: generateStructured (throws on validation failure)
212
- const user = await model.generateStructured(UserSchema, [
213
- { role: 'user', content: 'Generate a user profile for a software developer' },
214
- ]);
215
-
216
- console.log(user.name); // TypeScript knows this is string
217
- console.log(user.age); // TypeScript knows this is number
218
- console.log(user.email); // TypeScript knows this is string
219
- console.log(user.interests); // TypeScript knows this is string[]
220
- ```
221
-
222
- **Non-throwing variant:**
223
-
224
- ```typescript
225
- // Method 2: tryParseStructured (returns result object, never throws)
226
- const result = await model.tryParseStructured(UserSchema, messages);
227
-
228
- if (result.ok) {
229
- console.log('User:', result.value.name);
230
- } else {
231
- console.log('Error:', result.error.message);
232
- console.log('Raw LLM output:', result.rawOutput);
233
- }
234
- ```
235
-
236
- **Via chat options:**
237
-
238
- ```typescript
239
- // Method 3: chat with output parameter
240
- const response = await model.chat(messages, {
241
- output: { schema: UserSchema },
242
- });
243
-
244
- // response.structured is typed as { name: string, age: number, ... }
245
- if (response.structured) {
246
- console.log(response.structured.name);
247
- }
248
- ```
249
-
250
- **Streaming structured output:**
251
-
252
- ```typescript
253
- // Stream partial validated objects as JSON generates
254
- for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
255
- console.log('Partial:', partial);
256
- // Partial: { name: 'Alice' }
257
- // Partial: { name: 'Alice', age: 30 }
258
- // Partial: { name: 'Alice', age: 30, email: 'alice@example.com' }
259
- }
260
- ```
261
-
262
- **Raw JSON Schema (without Zod):**
263
-
264
- ```typescript
265
- const response = await model.chat(messages, {
266
- jsonSchema: {
267
- type: 'object',
268
- properties: {
269
- name: { type: 'string' },
270
- age: { type: 'number' },
271
- },
272
- required: ['name', 'age'],
273
- },
274
- name: 'Person', // Optional, used for LLM guidance
275
- });
276
- ```
277
-
278
- **Separate module import (tree-shaking):**
279
-
280
- ```typescript
281
- // Import only structured output types if you don't need the full client
282
- import {
283
- StructuredOutputError,
284
- type StructuredOutputResult,
285
- type StructuredOutputOptions,
286
- parseStructured,
287
- tryParseStructured,
288
- zodToJsonSchema,
289
- } from 'universal-llm-client/structured-output';
290
- ```
291
-
292
- **Vision with structured output:**
293
-
294
- ```typescript
295
- const ImageAnalysisSchema = z.object({
296
- objects: z.array(z.string()),
297
- scene: z.string(),
298
- mood: z.string(),
299
- });
300
-
301
- const response = await model.generateStructured(ImageAnalysisSchema, [
302
- multimodalMessage('Analyze this image', ['https://example.com/photo.jpg']),
303
- ]);
304
- ```
305
-
306
- **Provider compatibility:**
307
-
308
- | Provider | Method | Notes |
309
- |----------|--------|-------|
310
- | OpenAI | `response_format.json_schema` | Strict mode enabled |
311
- | Ollama | `format: { schema }` | Model must support grammar |
312
- | Google | `responseMimeType + responseSchema` | Some features stripped |
313
-
314
- ### Observability
315
-
316
- ```typescript
317
- import { AIModel, ConsoleAuditor, BufferedAuditor } from 'universal-llm-client';
318
-
319
- // Simple console logging
320
- const model = new AIModel({
321
- model: 'qwen3:4b',
322
- providers: [{ type: 'ollama' }],
323
- auditor: new ConsoleAuditor('[LLM]'),
324
- });
325
- // [LLM] REQUEST [ollama] (qwen3:4b)
326
- // [LLM] RESPONSE [ollama] (qwen3:4b) 1200ms 68 tokens
327
-
328
- // Buffered for custom sinks (OpenTelemetry, DB, etc.)
329
- const auditor = new BufferedAuditor({
330
- maxBufferSize: 100,
331
- onFlush: async (events) => {
332
- await sendToOpenTelemetry(events);
333
- },
334
- });
335
- ```
336
-
337
- ### MCP Integration
338
-
339
- ```typescript
340
- import { AIModel, MCPToolBridge } from 'universal-llm-client';
341
-
342
- const model = new AIModel({
343
- model: 'qwen3:4b',
344
- providers: [{ type: 'ollama' }],
345
- });
346
-
347
- const mcp = new MCPToolBridge({
348
- servers: {
349
- filesystem: {
350
- command: 'npx',
351
- args: ['-y', '@modelcontextprotocol/server-filesystem', './'],
352
- },
353
- weather: {
354
- url: 'https://mcp.example.com/weather',
355
- },
356
- },
357
- });
358
-
359
- await mcp.connect();
360
- await mcp.registerTools(model);
361
-
362
- // MCP tools are now callable via chatWithTools
363
- const response = await model.chatWithTools([
364
- { role: 'user', content: 'List files in the current directory' },
365
- ]);
366
-
367
- await mcp.disconnect();
368
- ```
369
-
370
- ### Stream Decoders
371
-
372
- ```typescript
373
- import { AIModel, createDecoder } from 'universal-llm-client';
374
-
375
- // Passthrough — raw text, no parsing
376
- // Standard Chat — text + native reasoning + tool calls
377
- // Interleaved Reasoning — parses <think> and <progress> tags from text streams
378
-
379
- const decoder = createDecoder('interleaved-reasoning', (event) => {
380
- switch (event.type) {
381
- case 'text': console.log(event.content); break;
382
- case 'thinking': console.log('[think]', event.content); break;
383
- case 'progress': console.log('[progress]', event.content); break;
384
- case 'tool_call': console.log('[tool]', event.calls); break;
385
- }
386
- });
387
-
388
- decoder.push('<think>Let me analyze this</think>The answer is 42');
389
- decoder.flush();
390
-
391
- console.log(decoder.getCleanContent()); // "The answer is 42"
392
- console.log(decoder.getReasoning()); // "Let me analyze this"
393
- ```
394
-
395
- ---
396
-
397
- ## API Reference
398
-
399
- ### `AIModel`
400
-
401
- The universal client. One class, multiple backends.
402
-
403
- ```typescript
404
- new AIModel(config: AIModelConfig)
405
- ```
406
-
407
- **Config:**
408
-
409
- | Property | Type | Default | Description |
410
- |---|---|---|---|
411
- | `model` | `string` | — | Model name (e.g., `'gemini-2.5-flash'`) |
412
- | `providers` | `ProviderConfig[]` | — | Ordered list of provider backends |
413
- | `retries` | `number` | `2` | Retries per provider before failover |
414
- | `timeout` | `number` | `30000` | Request timeout in ms |
415
- | `auditor` | `Auditor` | `NoopAuditor` | Observability sink |
416
- | `thinking` | `boolean` | `false` | Enable model thinking/reasoning |
417
- | `debug` | `boolean` | `false` | Debug logging |
418
- | `defaultParameters` | `object` | — | Default parameters for all requests |
419
-
420
- **Provider Config:**
421
-
422
- | Property | Type | Description |
423
- |---|---|---|
424
- | `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'` |
425
- | `url` | `string` | Provider URL (has sensible defaults) |
426
- | `apiKey` | `string` | API key or Bearer token |
427
- | `priority` | `number` | Lower = tried first (defaults to array index) |
428
- | `model` | `string` | Override model name for this provider |
429
- | `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
430
- | `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
431
-
432
- **Methods:**
433
-
434
- | Method | Returns | Description |
435
- |---|---|---|
436
- | `chat(messages, options?)` | `Promise<LLMChatResponse>` | Send chat request |
437
- | `chatWithTools(messages, options?)` | `Promise<LLMChatResponse>` | Chat with autonomous tool execution |
438
- | `chatStream(messages, options?)` | `AsyncGenerator<DecodedEvent>` | Stream chat response |
439
- | `generateStructured(schema, messages, options?)` | `Promise<T>` | Generate typed JSON validated against Zod schema |
440
- | `tryParseStructured(schema, messages, options?)` | `Promise<StructuredOutputResult<T>>` | Non-throwing variant returning result object |
441
- | `generateStructuredStream(schema, messages, options?)` | `AsyncGenerator<T, T>` | Stream partial validated objects as JSON generates |
442
- | `embed(text)` | `Promise<number[]>` | Generate single embedding |
443
- | `embedArray(texts)` | `Promise<number[][]>` | Generate batch embeddings |
444
- | `registerTool(name, desc, params, handler)` | `void` | Register a callable tool |
445
- | `registerTools(tools)` | `void` | Register multiple tools |
446
- | `getModels()` | `Promise<string[]>` | List available models |
447
- | `getModelInfo()` | `Promise<ModelMetadata>` | Get model metadata |
448
- | `getProviderStatus()` | `ProviderStatus[]` | Check provider health |
449
- | `setModel(name)` | `void` | Switch model at runtime |
450
- | `dispose()` | `Promise<void>` | Clean shutdown |
451
-
452
- ### Structured Output
453
-
454
- ```typescript
455
- import { z } from 'zod';
456
-
457
- // Define your schema
458
- const UserSchema = z.object({
459
- name: z.string(),
460
- age: z.number(),
461
- email: z.string().email(),
462
- });
463
-
464
- // Generate typed JSON
465
- const user = await model.generateStructured(UserSchema, messages);
466
- // TypeScript infers: { name: string; age: number; email: string }
467
-
468
- // Non-throwing variant
469
- const result = await model.tryParseStructured(UserSchema, messages);
470
- if (result.ok) {
471
- console.log(result.value.name); // Fully typed
472
- } else {
473
- console.log(result.error.message);
474
- }
475
-
476
- // Stream partial objects
477
- for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
478
- console.log(partial); // Partial validated objects
479
- }
480
- ```
481
-
482
- **Separate module import (tree-shaking):**
483
-
484
- ```typescript
485
- import {
486
- StructuredOutputError,
487
- type StructuredOutputResult,
488
- parseStructured,
489
- tryParseStructured,
490
- zodToJsonSchema,
491
- } from 'universal-llm-client/structured-output';
492
-
493
- // Use without importing the full client
494
- const schema = z.object({ name: z.string() });
495
- const jsonSchema = zodToJsonSchema(schema);
496
- ```
497
-
498
- ### `ToolBuilder` / `ToolExecutor`
499
-
500
- ```typescript
501
- import { ToolBuilder, ToolExecutor } from 'universal-llm-client';
502
-
503
- // Fluent builder
504
- const tool = new ToolBuilder('search')
505
- .description('Search the web')
506
- .addParameter('query', 'string', 'Search query', true)
507
- .addParameter('limit', 'number', 'Max results', false)
508
- .build();
509
-
510
- // Execution wrappers
511
- const safeHandler = ToolExecutor.compose(
512
- myHandler,
513
- h => ToolExecutor.withTimeout(h, 5000),
514
- h => ToolExecutor.safe(h),
515
- h => ToolExecutor.withValidation(h, ['query']),
516
- );
517
- ```
518
-
519
- ### Auditor Interface
520
-
521
- Implement custom observability by providing an `Auditor`:
522
-
523
- ```typescript
524
- interface Auditor {
525
- record(event: AuditEvent): void;
526
- flush?(): Promise<void>;
527
- }
528
- ```
529
-
530
- **Built-in implementations:**
531
- - `NoopAuditor` — Zero overhead (default)
532
- - `ConsoleAuditor` — Structured console logging
533
- - `BufferedAuditor` — Collects events for custom sinks
534
-
535
- ---
536
-
537
- ## Architecture
538
-
539
- ```
540
- universal-llm-client
541
- ├── AIModel ← Public API (the only class you import)
542
- ├── Router ← Internal failover engine
543
- ├── BaseLLMClient ← Abstract client with tool execution
544
- ├── Providers
545
- │ ├── OllamaClient
546
- │ ├── OpenAICompatibleClient (OpenAI, OpenRouter, Groq, LM Studio, vLLM, LlamaCpp)
547
- │ └── GoogleClient (AI Studio + Vertex AI)
548
- ├── StreamDecoder ← Pluggable reasoning strategies
549
- ├── Auditor ← Observability interface
550
- ├── MCPToolBridge ← MCP server integration
551
- └── HTTP Utilities ← Universal fetch-based transport
552
- ```
553
-
554
- ### Design Principles
555
-
556
- 1. **Single import** — `AIModel` is the only class users need
557
- 2. **Provider agnostic** — Same code works with any backend
558
- 3. **Transparent failover** — Health tracking and cooldowns happen behind the scenes
559
- 4. **Zero dependencies** — Core library depends only on native `fetch`
560
- 5. **Agent-ready** — Stateless, composable instances designed as foundation for agent frameworks
561
- 6. **Observable** — Every request, response, tool call, retry, and failover is auditable
562
-
563
- ---
564
-
565
- ## Runtime Support
566
-
567
- | Runtime | Version | Status |
568
- |---|---|---|
569
- | **Node.js** | 22+ | ✅ Full support |
570
- | **Bun** | 1.0+ | ✅ Full support |
571
- | **Deno** | 2.0+ | ✅ Full support |
572
- | **Browsers** | Modern | ✅ No stdio MCP, HTTP transport only |
573
-
574
- ---
575
-
576
- ## For Agent Framework Authors
577
-
578
- `AIModel` is designed as the transport layer for agentic systems:
579
-
580
- - **Stateless** — No conversation history stored. Your framework manages memory
581
- - **Composable** — Create separate instances for chat, embeddings, vision
582
- - **Tool tracing** — `chatWithTools()` returns full execution trace
583
- - **Context budget** — `getModelInfo()` exposes `contextLength`
584
- - **Auditor as system bus** — Inject custom sinks for cost tracking, behavioral scoring
585
- - **StreamDecoder as UI bridge** — Select decoder strategy per-call
586
-
587
- ---
588
-
589
- ## License
590
-
591
- MIT
1
+ # universal-llm-client
2
+
3
+ A universal LLM client for JavaScript/TypeScript with **transparent provider failover** and a **provider-agnostic reasoning API** — one set of code across OpenAI, Anthropic, Google Gemini, Ollama, vLLM, and any OpenAI-compatible endpoint. Streaming tool execution, structured output, generation stats, and native observability included.
4
+
5
+ ```typescript
6
+ import { AIModel } from 'universal-llm-client';
7
+
8
+ const model = new AIModel({
9
+ model: 'gemini-3.5-flash',
10
+ providers: [
11
+ { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
12
+ { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY },
13
+ { type: 'ollama' },
14
+ ],
15
+ });
16
+
17
+ const response = await model.chat([
18
+ { role: 'user', content: 'Hello!' },
19
+ ]);
20
+ ```
21
+
22
+ > **One model, multiple backends.** If Google fails, it transparently fails over to OpenRouter, then to local Ollama. Your code never knows the difference.
23
+
24
+ ---
25
+
26
+ ## Features
27
+
28
+ - 🔄 **Transparent Failover** — Priority-ordered provider chain with retries, health tracking, and cooldowns
29
+ - 🧠 **Unified Reasoning** — One `thinking` flag (`true`/`false` or a level: `'minimal' | 'low' | 'medium' | 'high'`) mapped to each backend's native control; chain-of-thought surfaced as `response.reasoning` + streaming `thinking` events (with `<think>`-tag parsing as a fallback)
30
+ - 🛠️ **Tool Calling** — Register tools once, works across all providers. Autonomous multi-turn execution loop
31
+ - 📋 **Structured Output** — Zod schema validation, JSON Schema support, streaming, and type-safe responses
32
+ - 🌊 **Streaming** — First-class async generator streaming with pluggable decoder strategies
33
+ - 🔬 **Deep Research** — Drive Google Gemini's agentic Deep Research (background interactions with polling + streaming)
34
+ - 📈 **Generation Stats** — `usage.tokensPerSecond` and `durationMs` reported across providers
35
+ - 🔌 **Flexible Transport** — Custom headers, query params, auth header/prefix, and base path for Azure OpenAI and gateways
36
+ - 🔍 **Observability** — Built-in auditor interface for logging, cost tracking, and behavioral analysis
37
+ - 🌐 **Universal Runtime** — Node.js 22+, Bun, Deno, and modern browsers
38
+ - 🤖 **MCP Native** — Bridge MCP servers to LLM tools with zero glue code
39
+ - 📊 **Embeddings** — Single and batch embedding generation
40
+
41
+ ## Supported Providers
42
+
43
+ | Provider | Type | Notes |
44
+ |---|---|---|
45
+ | **Ollama** | `ollama` | Local or cloud models, NDJSON streaming, model pulling, vision/multimodal, native thinking |
46
+ | **OpenAI + Compat** | `openai` | GPT series, o-series + **any OpenAI-compatible endpoint**: xAI/Grok, Mistral, DeepSeek, Cohere Compatibility, Groq, Together, Fireworks, OpenRouter, Perplexity Sonar, vLLM, LM Studio, TGI, most self-hosted servers |
47
+ | **Google AI Studio** | `google` | Gemini models, system instructions, multimodal, native thinking + grounding |
48
+ | **Vertex AI** | `vertex` | Same as Google AI but with regional endpoints, Bearer tokens, service tiers (flex/priority) |
49
+ | **Anthropic (Claude)** | `anthropic` | Claude 3.5/4 models via native Messages API. Excellent tool use, extended thinking with signatures, strong prompt caching |
50
+ | **LlamaCpp** | `llamacpp` | Local llama.cpp / llama-server instances (OpenAI-compatible under the hood) |
51
+
52
+ **Most of the world** is reachable via `type: 'openai'` + a `url` override. We only maintain dedicated clients for fundamentally different protocols (Anthropic Messages, Google Gemini) that offer unique high-value capabilities, plus Ollama for local developer experience. See `docs/guide/providers.md` and the research survey in `docs/research/provider-api-landscape-2026.md`.
53
+
54
+ ---
55
+
56
+ ## Installation
57
+
58
+ ```bash
59
+ bun add universal-llm-client
60
+ # or
61
+ npm install universal-llm-client
62
+ ```
63
+
64
+ **Optional**: For MCP integration:
65
+ ```bash
66
+ bun add @modelcontextprotocol/sdk
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Quick Start
72
+
73
+ ### Basic Chat
74
+
75
+ ```typescript
76
+ import { AIModel } from 'universal-llm-client';
77
+
78
+ const model = new AIModel({
79
+ model: 'qwen3:4b',
80
+ providers: [{ type: 'ollama' }],
81
+ });
82
+
83
+ const response = await model.chat([
84
+ { role: 'system', content: 'You are a helpful assistant.' },
85
+ { role: 'user', content: 'What is the capital of France?' },
86
+ ]);
87
+
88
+ console.log(response.message.content);
89
+ // "The capital of France is Paris."
90
+ ```
91
+
92
+ ### Streaming
93
+
94
+ ```typescript
95
+ for await (const event of model.chatStream([
96
+ { role: 'user', content: 'Write a haiku about code.' },
97
+ ])) {
98
+ if (event.type === 'text') {
99
+ process.stdout.write(event.content);
100
+ } else if (event.type === 'thinking') {
101
+ // Model reasoning (when supported)
102
+ console.log('[thinking]', event.content);
103
+ }
104
+ }
105
+ ```
106
+
107
+ ### Thinking & Reasoning
108
+
109
+ Set one `thinking` value — `true`/`false` or a level (`'minimal' | 'low' | 'medium' | 'high'`) —
110
+ and it maps to each provider's native control (Gemini `thinkingLevel`/`thinkingBudget`, OpenAI
111
+ `reasoning_effort`, vLLM `enable_thinking`, Anthropic `budget_tokens`, Ollama `think`):
112
+
113
+ ```typescript
114
+ const model = new AIModel({
115
+ model: 'gemini-3.5-flash',
116
+ thinking: 'high', // true | false | 'minimal' | 'low' | 'medium' | 'high'
117
+ providers: [{ type: 'google', apiKey: process.env.GOOGLE_API_KEY }],
118
+ });
119
+
120
+ const res = await model.chat([{ role: 'user', content: 'Solve this step by step: ...' }]);
121
+ console.log(res.message.content); // final answer (clean)
122
+ console.log(res.reasoning); // chain-of-thought, when the model exposes it
123
+
124
+ // Per-call override (e.g. turn thinking off for structured output)
125
+ await model.chat(messages, { thinking: false });
126
+ ```
127
+
128
+ ### Deep Research (Gemini)
129
+
130
+ Run Google's agentic Deep Research — creates a background interaction and polls to completion:
131
+
132
+ ```typescript
133
+ const result = await model.deepResearch('Research the history of Google TPUs.', {
134
+ tools: ['google_search', 'url_context'],
135
+ });
136
+ console.log(result.status, result.report);
137
+
138
+ // Or stream intermediate thoughts and steps as they arrive:
139
+ for await (const ev of model.deepResearchStream('Compare RISC-V vs ARM in 2026.')) {
140
+ if (ev.type === 'thought') console.log('[thinking]', ev.content);
141
+ else if (ev.type === 'text') process.stdout.write(ev.content);
142
+ }
143
+ ```
144
+
145
+ ### Tool Calling
146
+
147
+ ```typescript
148
+ model.registerTool(
149
+ 'get_weather',
150
+ 'Get current weather for a location',
151
+ {
152
+ type: 'object',
153
+ properties: {
154
+ city: { type: 'string', description: 'City name' },
155
+ },
156
+ required: ['city'],
157
+ },
158
+ async (args) => {
159
+ const { city } = args as { city: string };
160
+ return { temperature: 22, condition: 'sunny', city };
161
+ },
162
+ );
163
+
164
+ // Autonomous tool execution — the model calls tools and loops until done
165
+ const response = await model.chatWithTools([
166
+ { role: 'user', content: "What's the weather in Tokyo?" },
167
+ ]);
168
+
169
+ console.log(response.message.content);
170
+ // "The weather in Tokyo is 22°C and sunny."
171
+ console.log(response.toolExecutions);
172
+ // [{ tool_call_id: 'call_abc', output: { temperature: 22, condition: 'sunny', city: 'Tokyo' }, duration: 5 }]
173
+ ```
174
+
175
+ ### Provider Failover
176
+
177
+ ```typescript
178
+ const model = new AIModel({
179
+ model: 'gemini-2.5-flash',
180
+ retries: 2, // retries per provider before failover
181
+ timeout: 30000, // request timeout in ms
182
+ providers: [
183
+ { type: 'google', apiKey: process.env.GOOGLE_KEY, priority: 0 },
184
+ { type: 'openai', url: 'https://openrouter.ai/api', apiKey: process.env.OPENROUTER_KEY, priority: 1 },
185
+ { type: 'ollama', url: 'http://localhost:11434', priority: 2 },
186
+ ],
187
+ });
188
+
189
+ // If Google returns 500, retries twice, then seamlessly tries OpenRouter.
190
+ // If OpenRouter also fails, falls back to local Ollama.
191
+ // Your code sees a single response.
192
+ const response = await model.chat([{ role: 'user', content: 'Hello' }]);
193
+
194
+ // Check provider health at any time
195
+ console.log(model.getProviderStatus());
196
+ // [{ id: 'google-0', healthy: true }, { id: 'openai-1', healthy: true }, ...]
197
+ ```
198
+
199
+ ### Multimodal (Vision)
200
+
201
+ ```typescript
202
+ import { AIModel, multimodalMessage } from 'universal-llm-client';
203
+
204
+ const model = new AIModel({
205
+ model: 'gemini-2.5-flash',
206
+ providers: [{ type: 'google', apiKey: process.env.GOOGLE_KEY }],
207
+ });
208
+
209
+ const response = await model.chat([
210
+ multimodalMessage('What do you see in this image?', [
211
+ 'https://example.com/photo.jpg',
212
+ ]),
213
+ ]);
214
+ ```
215
+
216
+ ### Embeddings
217
+
218
+ ```typescript
219
+ const embedModel = new AIModel({
220
+ model: 'nomic-embed-text-v2-moe:latest',
221
+ providers: [{ type: 'ollama' }],
222
+ });
223
+
224
+ const vector = await embedModel.embed('Hello world');
225
+ // [0.006, 0.026, -0.009, ...]
226
+
227
+ const vectors = await embedModel.embedArray(['Hello', 'World']);
228
+ // [[0.006, ...], [0.012, ...]]
229
+ ```
230
+
231
+ ### Structured Output
232
+
233
+ Get typed, validated JSON responses from any LLM using Zod schemas:
234
+
235
+ ```typescript
236
+ import { AIModel } from 'universal-llm-client';
237
+ import { z } from 'zod';
238
+
239
+ const model = new AIModel({
240
+ model: 'gemini-2.5-flash',
241
+ providers: [
242
+ { type: 'google', apiKey: process.env.GOOGLE_API_KEY },
243
+ { type: 'ollama' },
244
+ ],
245
+ });
246
+
247
+ // Define your schema
248
+ const UserSchema = z.object({
249
+ name: z.string(),
250
+ age: z.number(),
251
+ email: z.string().email(),
252
+ interests: z.array(z.string()),
253
+ });
254
+
255
+ // Method 1: generateStructured (throws on validation failure)
256
+ const user = await model.generateStructured(UserSchema, [
257
+ { role: 'user', content: 'Generate a user profile for a software developer' },
258
+ ]);
259
+
260
+ console.log(user.name); // TypeScript knows this is string
261
+ console.log(user.age); // TypeScript knows this is number
262
+ console.log(user.email); // TypeScript knows this is string
263
+ console.log(user.interests); // TypeScript knows this is string[]
264
+ ```
265
+
266
+ **Non-throwing variant:**
267
+
268
+ ```typescript
269
+ // Method 2: tryParseStructured (returns result object, never throws)
270
+ const result = await model.tryParseStructured(UserSchema, messages);
271
+
272
+ if (result.ok) {
273
+ console.log('User:', result.value.name);
274
+ } else {
275
+ console.log('Error:', result.error.message);
276
+ console.log('Raw LLM output:', result.rawOutput);
277
+ }
278
+ ```
279
+
280
+ **Via chat options:**
281
+
282
+ ```typescript
283
+ // Method 3: chat with output parameter
284
+ const response = await model.chat(messages, {
285
+ output: { schema: UserSchema },
286
+ });
287
+
288
+ // response.structured is typed as { name: string, age: number, ... }
289
+ if (response.structured) {
290
+ console.log(response.structured.name);
291
+ }
292
+ ```
293
+
294
+ **Streaming structured output:**
295
+
296
+ ```typescript
297
+ // Stream partial validated objects as JSON generates
298
+ for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
299
+ console.log('Partial:', partial);
300
+ // Partial: { name: 'Alice' }
301
+ // Partial: { name: 'Alice', age: 30 }
302
+ // Partial: { name: 'Alice', age: 30, email: 'alice@example.com' }
303
+ }
304
+ ```
305
+
306
+ **Raw JSON Schema (without Zod):**
307
+
308
+ ```typescript
309
+ const response = await model.chat(messages, {
310
+ jsonSchema: {
311
+ type: 'object',
312
+ properties: {
313
+ name: { type: 'string' },
314
+ age: { type: 'number' },
315
+ },
316
+ required: ['name', 'age'],
317
+ },
318
+ name: 'Person', // Optional, used for LLM guidance
319
+ });
320
+ ```
321
+
322
+ **Separate module import (tree-shaking):**
323
+
324
+ ```typescript
325
+ // Import only structured output types if you don't need the full client
326
+ import {
327
+ StructuredOutputError,
328
+ type StructuredOutputResult,
329
+ type StructuredOutputOptions,
330
+ parseStructured,
331
+ tryParseStructured,
332
+ zodToJsonSchema,
333
+ } from 'universal-llm-client/structured-output';
334
+ ```
335
+
336
+ **Vision with structured output:**
337
+
338
+ ```typescript
339
+ const ImageAnalysisSchema = z.object({
340
+ objects: z.array(z.string()),
341
+ scene: z.string(),
342
+ mood: z.string(),
343
+ });
344
+
345
+ const response = await model.generateStructured(ImageAnalysisSchema, [
346
+ multimodalMessage('Analyze this image', ['https://example.com/photo.jpg']),
347
+ ]);
348
+ ```
349
+
350
+ **Provider compatibility:**
351
+
352
+ | Provider | Method | Notes |
353
+ |----------|--------|-------|
354
+ | OpenAI | `response_format.json_schema` | Strict mode enabled |
355
+ | Ollama | `format: { schema }` | Model must support grammar |
356
+ | Google | `responseMimeType + responseSchema` | Some features stripped |
357
+
358
+ ### Observability
359
+
360
+ ```typescript
361
+ import { AIModel, ConsoleAuditor, BufferedAuditor } from 'universal-llm-client';
362
+
363
+ // Simple console logging
364
+ const model = new AIModel({
365
+ model: 'qwen3:4b',
366
+ providers: [{ type: 'ollama' }],
367
+ auditor: new ConsoleAuditor('[LLM]'),
368
+ });
369
+ // [LLM] REQUEST [ollama] (qwen3:4b) →
370
+ // [LLM] RESPONSE [ollama] (qwen3:4b) 1200ms 68 tokens
371
+
372
+ // Buffered for custom sinks (OpenTelemetry, DB, etc.)
373
+ const auditor = new BufferedAuditor({
374
+ maxBufferSize: 100,
375
+ onFlush: async (events) => {
376
+ await sendToOpenTelemetry(events);
377
+ },
378
+ });
379
+ ```
380
+
381
+ ### MCP Integration
382
+
383
+ ```typescript
384
+ import { AIModel, MCPToolBridge } from 'universal-llm-client';
385
+
386
+ const model = new AIModel({
387
+ model: 'qwen3:4b',
388
+ providers: [{ type: 'ollama' }],
389
+ });
390
+
391
+ const mcp = new MCPToolBridge({
392
+ servers: {
393
+ filesystem: {
394
+ command: 'npx',
395
+ args: ['-y', '@modelcontextprotocol/server-filesystem', './'],
396
+ },
397
+ weather: {
398
+ url: 'https://mcp.example.com/weather',
399
+ },
400
+ },
401
+ });
402
+
403
+ await mcp.connect();
404
+ await mcp.registerTools(model);
405
+
406
+ // MCP tools are now callable via chatWithTools
407
+ const response = await model.chatWithTools([
408
+ { role: 'user', content: 'List files in the current directory' },
409
+ ]);
410
+
411
+ await mcp.disconnect();
412
+ ```
413
+
414
+ ### Stream Decoders
415
+
416
+ ```typescript
417
+ import { AIModel, createDecoder } from 'universal-llm-client';
418
+
419
+ // Passthrough — raw text, no parsing
420
+ // Standard Chat — text + native reasoning + tool calls
421
+ // Interleaved Reasoning — parses <think> and <progress> tags from text streams
422
+
423
+ const decoder = createDecoder('interleaved-reasoning', (event) => {
424
+ switch (event.type) {
425
+ case 'text': console.log(event.content); break;
426
+ case 'thinking': console.log('[think]', event.content); break;
427
+ case 'progress': console.log('[progress]', event.content); break;
428
+ case 'tool_call': console.log('[tool]', event.calls); break;
429
+ }
430
+ });
431
+
432
+ decoder.push('<think>Let me analyze this</think>The answer is 42');
433
+ decoder.flush();
434
+
435
+ console.log(decoder.getCleanContent()); // "The answer is 42"
436
+ console.log(decoder.getReasoning()); // "Let me analyze this"
437
+ ```
438
+
439
+ ---
440
+
441
+ ## API Reference
442
+
443
+ ### `AIModel`
444
+
445
+ The universal client. One class, multiple backends.
446
+
447
+ ```typescript
448
+ new AIModel(config: AIModelConfig)
449
+ ```
450
+
451
+ **Config:**
452
+
453
+ | Property | Type | Default | Description |
454
+ |---|---|---|---|
455
+ | `model` | `string` | — | Model name (e.g., `'gemini-2.5-flash'`) |
456
+ | `providers` | `ProviderConfig[]` | — | Ordered list of provider backends |
457
+ | `retries` | `number` | `2` | Retries per provider before failover |
458
+ | `timeout` | `number` | `30000` | Request timeout in ms |
459
+ | `auditor` | `Auditor` | `NoopAuditor` | Observability sink |
460
+ | `thinking` | `boolean \| 'minimal' \| 'low' \| 'medium' \| 'high'` | `false` | Enable model thinking/reasoning, or set a reasoning level |
461
+ | `debug` | `boolean` | `false` | Debug logging |
462
+ | `defaultParameters` | `object` | — | Default parameters for all requests |
463
+
464
+ **Provider Config:**
465
+
466
+ | Property | Type | Description |
467
+ |---|---|---|
468
+ | `type` | `string` | `'ollama'`, `'openai'`, `'google'`, `'vertex'`, `'llamacpp'`, `'anthropic'` |
469
+ | `url` | `string` | Provider URL (has sensible defaults) |
470
+ | `apiKey` | `string` | API key or Bearer token |
471
+ | `priority` | `number` | Lower = tried first (defaults to array index) |
472
+ | `model` | `string` | Override model name for this provider |
473
+ | `region` | `string` | Vertex AI region (e.g., `'us-central1'`) |
474
+ | `apiVersion` | `string` | API version (e.g., `'v1beta'`) |
475
+ | `headers` | `Record<string,string>` | Extra headers merged into requests — OpenAI-compatible & Ollama (Azure `api-key`, gateways) |
476
+ | `queryParams` | `Record<string,string>` | Query params appended to URLs — OpenAI-compatible only (e.g. Azure `api-version`) |
477
+ | `authHeader` | `string` | Header name for the key — OpenAI-compatible & Ollama (e.g. `'api-key'`) |
478
+ | `authPrefix` | `string` | Prefix before the key value — OpenAI-compatible & Ollama (e.g. `''` for api-key style) |
479
+ | `apiBasePath` | `string` | OpenAI-compatible only: override or disable the `/v1` suffix (use `''` for full Azure deployment URLs) |
480
+
481
+ **Methods:**
482
+
483
+ | Method | Returns | Description |
484
+ |---|---|---|
485
+ | `chat(messages, options?)` | `Promise<LLMChatResponse>` | Send chat request |
486
+ | `chatWithTools(messages, options?)` | `Promise<LLMChatResponse>` | Chat with autonomous tool execution |
487
+ | `chatStream(messages, options?)` | `AsyncGenerator<DecodedEvent>` | Stream chat response |
488
+ | `generateStructured(schema, messages, options?)` | `Promise<T>` | Generate typed JSON validated against Zod schema |
489
+ | `tryParseStructured(schema, messages, options?)` | `Promise<StructuredOutputResult<T>>` | Non-throwing variant returning result object |
490
+ | `generateStructuredStream(schema, messages, options?)` | `AsyncGenerator<T, T>` | Stream partial validated objects as JSON generates |
491
+ | `embed(text)` | `Promise<number[]>` | Generate single embedding |
492
+ | `embedArray(texts)` | `Promise<number[][]>` | Generate batch embeddings |
493
+ | `registerTool(name, desc, params, handler)` | `void` | Register a callable tool |
494
+ | `registerTools(tools)` | `void` | Register multiple tools |
495
+ | `getModels()` | `Promise<string[]>` | List available models |
496
+ | `getModelInfo()` | `Promise<ModelMetadata>` | Get model metadata |
497
+ | `getProviderStatus()` | `ProviderStatus[]` | Check provider health |
498
+ | `setModel(name)` | `void` | Switch model at runtime |
499
+ | `dispose()` | `Promise<void>` | Clean shutdown |
500
+
501
+ ### Structured Output
502
+
503
+ ```typescript
504
+ import { z } from 'zod';
505
+
506
+ // Define your schema
507
+ const UserSchema = z.object({
508
+ name: z.string(),
509
+ age: z.number(),
510
+ email: z.string().email(),
511
+ });
512
+
513
+ // Generate typed JSON
514
+ const user = await model.generateStructured(UserSchema, messages);
515
+ // TypeScript infers: { name: string; age: number; email: string }
516
+
517
+ // Non-throwing variant
518
+ const result = await model.tryParseStructured(UserSchema, messages);
519
+ if (result.ok) {
520
+ console.log(result.value.name); // Fully typed
521
+ } else {
522
+ console.log(result.error.message);
523
+ }
524
+
525
+ // Stream partial objects
526
+ for await (const partial of model.generateStructuredStream(UserSchema, messages)) {
527
+ console.log(partial); // Partial validated objects
528
+ }
529
+ ```
530
+
531
+ **Separate module import (tree-shaking):**
532
+
533
+ ```typescript
534
+ import {
535
+ StructuredOutputError,
536
+ type StructuredOutputResult,
537
+ parseStructured,
538
+ tryParseStructured,
539
+ zodToJsonSchema,
540
+ } from 'universal-llm-client/structured-output';
541
+
542
+ // Use without importing the full client
543
+ const schema = z.object({ name: z.string() });
544
+ const jsonSchema = zodToJsonSchema(schema);
545
+ ```
546
+
547
+ ### `ToolBuilder` / `ToolExecutor`
548
+
549
+ ```typescript
550
+ import { ToolBuilder, ToolExecutor } from 'universal-llm-client';
551
+
552
+ // Fluent builder
553
+ const tool = new ToolBuilder('search')
554
+ .description('Search the web')
555
+ .addParameter('query', 'string', 'Search query', true)
556
+ .addParameter('limit', 'number', 'Max results', false)
557
+ .build();
558
+
559
+ // Execution wrappers
560
+ const safeHandler = ToolExecutor.compose(
561
+ myHandler,
562
+ h => ToolExecutor.withTimeout(h, 5000),
563
+ h => ToolExecutor.safe(h),
564
+ h => ToolExecutor.withValidation(h, ['query']),
565
+ );
566
+ ```
567
+
568
+ ### Auditor Interface
569
+
570
+ Implement custom observability by providing an `Auditor`:
571
+
572
+ ```typescript
573
+ interface Auditor {
574
+ record(event: AuditEvent): void;
575
+ flush?(): Promise<void>;
576
+ }
577
+ ```
578
+
579
+ **Built-in implementations:**
580
+ - `NoopAuditor` — Zero overhead (default)
581
+ - `ConsoleAuditor` — Structured console logging
582
+ - `BufferedAuditor` — Collects events for custom sinks
583
+
584
+ ---
585
+
586
+ ## Architecture
587
+
588
+ ```
589
+ universal-llm-client
590
+ ├── AIModel ← Public API (the only class you import)
591
+ ├── Router ← Internal failover engine
592
+ ├── BaseLLMClient ← Abstract client with tool execution
593
+ ├── Providers
594
+ │ ├── OllamaClient
595
+ │ ├── OpenAICompatibleClient (OpenAI, OpenRouter, Groq, LM Studio, vLLM, LlamaCpp)
596
+ │ └── GoogleClient (AI Studio + Vertex AI)
597
+ ├── StreamDecoder ← Pluggable reasoning strategies
598
+ ├── Auditor ← Observability interface
599
+ ├── MCPToolBridge ← MCP server integration
600
+ └── HTTP Utilities ← Universal fetch-based transport
601
+ ```
602
+
603
+ ### Design Principles
604
+
605
+ 1. **Single import** — `AIModel` is the only class users need
606
+ 2. **Provider agnostic** — Same code works with any backend
607
+ 3. **Transparent failover** — Health tracking and cooldowns happen behind the scenes
608
+ 4. **Zero dependencies** — Core library depends only on native `fetch`
609
+ 5. **Agent-ready** — Stateless, composable instances designed as foundation for agent frameworks
610
+ 6. **Observable** — Every request, response, tool call, retry, and failover is auditable
611
+
612
+ ---
613
+
614
+ ## Runtime Support
615
+
616
+ | Runtime | Version | Status |
617
+ |---|---|---|
618
+ | **Node.js** | 22+ | ✅ Full support |
619
+ | **Bun** | 1.0+ | ✅ Full support |
620
+ | **Deno** | 2.0+ | ✅ Full support |
621
+ | **Browsers** | Modern | ✅ Supported (HTTP transport only; no stdio MCP) |
622
+
623
+ ---
624
+
625
+ ## For Agent Framework Authors
626
+
627
+ `AIModel` is designed as the transport layer for agentic systems:
628
+
629
+ - **Stateless** — No conversation history stored. Your framework manages memory
630
+ - **Composable** — Create separate instances for chat, embeddings, vision
631
+ - **Tool tracing** — `chatWithTools()` returns full execution trace
632
+ - **Context budget** — `getModelInfo()` exposes `contextLength`
633
+ - **Auditor as system bus** — Inject custom sinks for cost tracking, behavioral scoring
634
+ - **StreamDecoder as UI bridge** — Select decoder strategy per-call
635
+
636
+ ---
637
+
638
+ ## License
639
+
640
+ MIT