universal-llm-client 4.3.0 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/CHANGELOG.md +34 -19
  2. package/README.md +62 -11
  3. package/dist/ai-model.d.ts +12 -2
  4. package/dist/ai-model.js +36 -2
  5. package/dist/auditor.d.ts +0 -1
  6. package/dist/auditor.js +0 -1
  7. package/dist/client.d.ts +0 -1
  8. package/dist/client.js +0 -1
  9. package/dist/gemma-channel.d.ts +13 -0
  10. package/dist/gemma-channel.js +37 -0
  11. package/dist/gemma-diffusion.d.ts +48 -0
  12. package/dist/gemma-diffusion.js +146 -0
  13. package/dist/http.d.ts +4 -1
  14. package/dist/http.js +14 -2
  15. package/dist/index.d.ts +2 -2
  16. package/dist/index.js +4 -1
  17. package/dist/interfaces.d.ts +163 -8
  18. package/dist/interfaces.js +0 -1
  19. package/dist/mcp.d.ts +0 -1
  20. package/dist/mcp.js +0 -1
  21. package/dist/providers/anthropic.d.ts +0 -1
  22. package/dist/providers/anthropic.js +28 -4
  23. package/dist/providers/google.d.ts +22 -2
  24. package/dist/providers/google.js +223 -14
  25. package/dist/providers/index.d.ts +0 -1
  26. package/dist/providers/index.js +0 -1
  27. package/dist/providers/ollama.d.ts +2 -1
  28. package/dist/providers/ollama.js +59 -31
  29. package/dist/providers/openai.d.ts +16 -1
  30. package/dist/providers/openai.js +488 -81
  31. package/dist/router.d.ts +2 -1
  32. package/dist/router.js +4 -1
  33. package/dist/stream-decoder.d.ts +12 -1
  34. package/dist/stream-decoder.js +182 -6
  35. package/dist/structured-output.d.ts +0 -1
  36. package/dist/structured-output.js +0 -1
  37. package/dist/thinking.d.ts +35 -0
  38. package/dist/thinking.js +51 -0
  39. package/dist/tools.d.ts +0 -1
  40. package/dist/tools.js +0 -1
  41. package/dist/zod-adapter.d.ts +0 -1
  42. package/dist/zod-adapter.js +0 -1
  43. package/package.json +3 -1
  44. package/dist/ai-model.d.ts.map +0 -1
  45. package/dist/ai-model.js.map +0 -1
  46. package/dist/auditor.d.ts.map +0 -1
  47. package/dist/auditor.js.map +0 -1
  48. package/dist/client.d.ts.map +0 -1
  49. package/dist/client.js.map +0 -1
  50. package/dist/http.d.ts.map +0 -1
  51. package/dist/http.js.map +0 -1
  52. package/dist/index.d.ts.map +0 -1
  53. package/dist/index.js.map +0 -1
  54. package/dist/interfaces.d.ts.map +0 -1
  55. package/dist/interfaces.js.map +0 -1
  56. package/dist/mcp.d.ts.map +0 -1
  57. package/dist/mcp.js.map +0 -1
  58. package/dist/providers/anthropic.d.ts.map +0 -1
  59. package/dist/providers/anthropic.js.map +0 -1
  60. package/dist/providers/google.d.ts.map +0 -1
  61. package/dist/providers/google.js.map +0 -1
  62. package/dist/providers/index.d.ts.map +0 -1
  63. package/dist/providers/index.js.map +0 -1
  64. package/dist/providers/ollama.d.ts.map +0 -1
  65. package/dist/providers/ollama.js.map +0 -1
  66. package/dist/providers/openai.d.ts.map +0 -1
  67. package/dist/providers/openai.js.map +0 -1
  68. package/dist/router.d.ts.map +0 -1
  69. package/dist/router.js.map +0 -1
  70. package/dist/stream-decoder.d.ts.map +0 -1
  71. package/dist/stream-decoder.js.map +0 -1
  72. package/dist/structured-output.d.ts.map +0 -1
  73. package/dist/structured-output.js.map +0 -1
  74. package/dist/tools.d.ts.map +0 -1
  75. package/dist/tools.js.map +0 -1
  76. package/dist/zod-adapter.d.ts.map +0 -1
  77. package/dist/zod-adapter.js.map +0 -1
@@ -6,7 +6,8 @@
6
6
  * streaming, embeddings, and system prompt handling.
7
7
  */
8
8
  import { BaseLLMClient } from '../client.js';
9
- import { httpRequest, httpStream } from '../http.js';
9
+ import { resolveThinking, geminiThinkingBudget } from '../thinking.js';
10
+ import { httpRequest, httpStream, parseSSE } from '../http.js';
10
11
  import { StandardChatDecoder } from '../stream-decoder.js';
11
12
  import { normalizeJsonSchema, stripUnsupportedFeatures, getJsonSchemaFromConfig, } from '../structured-output.js';
12
13
  export class GoogleClient extends BaseLLMClient {
@@ -132,6 +133,7 @@ export class GoogleClient extends BaseLLMClient {
132
133
  });
133
134
  // Google streams SSE with JSON payloads
134
135
  let buffer = '';
136
+ let reasoningBuffer = '';
135
137
  for await (const chunk of stream) {
136
138
  buffer += chunk;
137
139
  // Google SSE uses "data: " prefix
@@ -159,8 +161,14 @@ export class GoogleClient extends BaseLLMClient {
159
161
  continue;
160
162
  for (const part of candidate.content.parts) {
161
163
  if (part.text) {
162
- decoder.push(part.text);
163
- yield { type: 'text', content: part.text };
164
+ if (part.thought) {
165
+ reasoningBuffer += part.text;
166
+ yield { type: 'thinking', content: part.text };
167
+ }
168
+ else {
169
+ decoder.push(part.text);
170
+ yield { type: 'text', content: part.text };
171
+ }
164
172
  }
165
173
  if (part.functionCall) {
166
174
  const toolCall = this.convertFunctionCallToToolCall(part.functionCall, part.thoughtSignature);
@@ -189,12 +197,170 @@ export class GoogleClient extends BaseLLMClient {
189
197
  content: decoder.getCleanContent(),
190
198
  tool_calls: allToolCalls.length > 0 ? allToolCalls : undefined,
191
199
  },
192
- reasoning: decoder.getReasoning(),
200
+ reasoning: reasoningBuffer || decoder.getReasoning(),
193
201
  usage,
194
202
  provider: this.isVertex ? 'vertex' : 'google',
195
203
  };
196
204
  }
197
205
  // ========================================================================
206
+ // Deep Research (Gemini interactions API)
207
+ // ========================================================================
208
+ /** Deep Research is available via Google AI Studio only (not Vertex AI). */
209
+ supportsDeepResearch() {
210
+ return !this.isVertex;
211
+ }
212
+ interactionsBase() {
213
+ if (this.isVertex) {
214
+ throw new Error('Deep Research is only available via Google AI Studio, not Vertex AI.');
215
+ }
216
+ return `https://generativelanguage.googleapis.com/${this.apiVersion}/interactions`;
217
+ }
218
+ deepResearchHeaders() {
219
+ return {
220
+ 'Content-Type': 'application/json',
221
+ 'x-goog-api-key': this.options.apiKey ?? '',
222
+ 'Api-Revision': '2026-05-20',
223
+ };
224
+ }
225
+ buildInteractionBody(input, opts, background) {
226
+ return {
227
+ input,
228
+ agent: opts.agent ?? 'deep-research-preview-04-2026',
229
+ background,
230
+ agent_config: {
231
+ type: 'deep-research',
232
+ thinking_summaries: opts.thinkingSummaries ?? 'auto',
233
+ },
234
+ ...(opts.tools?.length ? { tools: opts.tools.map(t => ({ type: t })) } : {}),
235
+ ...(opts.previousInteractionId ? { previous_interaction_id: opts.previousInteractionId } : {}),
236
+ };
237
+ }
238
+ toDeepResearchResult(i) {
239
+ const obj = i ?? {};
240
+ const steps = obj['steps'];
241
+ let report = (obj['output_text'] ?? obj['outputText'] ?? obj['output']);
242
+ // Some responses carry the final report only inside the steps' content
243
+ // blocks (the last step is typically the answer) — concatenate text there.
244
+ if (!report && Array.isArray(steps)) {
245
+ const text = steps
246
+ .flatMap(s => (Array.isArray(s.content) ? s.content : []))
247
+ .map(c => (c && typeof c === 'object' && typeof c.text === 'string'
248
+ ? c.text
249
+ : ''))
250
+ .filter(Boolean)
251
+ .join('\n\n');
252
+ report = text || undefined;
253
+ }
254
+ return {
255
+ id: obj['id'] ?? '',
256
+ status: obj['status'] ?? 'in_progress',
257
+ report,
258
+ steps,
259
+ error: obj['error'],
260
+ raw: obj,
261
+ };
262
+ }
263
+ /** httpRequest with small backoff retries — the preview interactions API is flaky (503s). */
264
+ async drRequest(url, init, retries = 3) {
265
+ let lastErr;
266
+ for (let attempt = 0; attempt <= retries; attempt++) {
267
+ try {
268
+ const res = await httpRequest(url, init);
269
+ return res.data;
270
+ }
271
+ catch (e) {
272
+ lastErr = e;
273
+ if (attempt < retries)
274
+ await this.delay(1500 * (attempt + 1), init.signal);
275
+ }
276
+ }
277
+ throw lastErr;
278
+ }
279
+ /**
280
+ * Run an agentic Deep Research interaction: create it, then poll until it
281
+ * completes/fails or the timeout elapses. Returns the final report + steps.
282
+ */
283
+ async deepResearch(input, opts = {}) {
284
+ const base = this.interactionsBase();
285
+ const headers = this.deepResearchHeaders();
286
+ const pollInterval = opts.pollIntervalMs ?? 5000;
287
+ const deadline = Date.now() + (opts.timeoutMs ?? 600_000);
288
+ let interaction = await this.drRequest(base, {
289
+ method: 'POST',
290
+ headers,
291
+ body: this.buildInteractionBody(input, opts, true),
292
+ timeout: this.options.timeout ?? 60_000,
293
+ signal: opts.signal,
294
+ });
295
+ const id = interaction?.['id'];
296
+ if (!id)
297
+ return this.toDeepResearchResult(interaction);
298
+ while ((interaction?.['status'] ?? 'in_progress') === 'in_progress') {
299
+ if (Date.now() > deadline)
300
+ break;
301
+ await this.delay(pollInterval, opts.signal);
302
+ try {
303
+ interaction = await this.drRequest(`${base}/${id}`, { method: 'GET', headers, timeout: this.options.timeout ?? 60_000, signal: opts.signal }, 2);
304
+ }
305
+ catch {
306
+ // Tolerate transient errors during a long poll; keep trying until the deadline.
307
+ }
308
+ }
309
+ return this.toDeepResearchResult(interaction);
310
+ }
311
+ /**
312
+ * Stream a Deep Research interaction's intermediate updates (`step.delta`
313
+ * thought/text/image events) and return the final result. Best-effort:
314
+ * falls back to the created interaction object if the stream ends early.
315
+ */
316
+ async *deepResearchStream(input, opts = {}) {
317
+ const base = this.interactionsBase();
318
+ const headers = this.deepResearchHeaders();
319
+ // Streaming long-running research requires background:true AND stream:true
320
+ // in the create body (per the Deep Research Interactions API docs).
321
+ const stream = httpStream(base, {
322
+ method: 'POST',
323
+ headers,
324
+ body: { ...this.buildInteractionBody(input, opts, true), stream: true },
325
+ timeout: opts.timeoutMs ?? 600_000,
326
+ signal: opts.signal,
327
+ });
328
+ let last;
329
+ for await (const { data } of parseSSE(stream)) {
330
+ if (!data || data === '[DONE]')
331
+ continue;
332
+ let parsed;
333
+ try {
334
+ parsed = JSON.parse(data);
335
+ }
336
+ catch {
337
+ continue;
338
+ }
339
+ last = parsed;
340
+ const delta = (parsed['delta'] ?? parsed['step']?.['delta']);
341
+ if (delta) {
342
+ const dtype = delta['type'];
343
+ if (dtype === 'thought')
344
+ yield { type: 'thought', content: String(delta['text'] ?? delta['content'] ?? '') };
345
+ else if (dtype === 'text')
346
+ yield { type: 'text', content: String(delta['text'] ?? delta['content'] ?? '') };
347
+ else if (dtype === 'image')
348
+ yield { type: 'image', content: delta['image'] ?? delta['content'] };
349
+ }
350
+ if (typeof parsed['status'] === 'string')
351
+ yield { type: 'status', status: parsed['status'] };
352
+ }
353
+ return this.toDeepResearchResult(last);
354
+ }
355
+ delay(ms, signal) {
356
+ return new Promise((resolve, reject) => {
357
+ if (signal?.aborted)
358
+ return reject(new Error('aborted'));
359
+ const t = setTimeout(resolve, ms);
360
+ signal?.addEventListener('abort', () => { clearTimeout(t); reject(new Error('aborted')); }, { once: true });
361
+ });
362
+ }
363
+ // ========================================================================
198
364
  // Embeddings
199
365
  // ========================================================================
200
366
  async embed(text) {
@@ -268,8 +434,29 @@ export class GoogleClient extends BaseLLMClient {
268
434
  config['temperature'] = options.temperature;
269
435
  if (options?.maxTokens !== undefined)
270
436
  config['maxOutputTokens'] = options.maxTokens;
271
- if (this.options.thinking) {
272
- config['thinkingConfig'] = { thinkingBudget: 8192 };
437
+ // Unified thinking flag → Gemini thinkingConfig. Per-call overrides model
438
+ // config. Gemini 3.x uses `thinkingLevel`; 2.5/2.0 use `thinkingBudget`
439
+ // (0 = off, -1 = dynamic). `includeThoughts` surfaces the reasoning text.
440
+ // A user-supplied thinkingConfig (via parameters) is left untouched.
441
+ const thinking = resolveThinking(options?.thinking, this.options.thinking);
442
+ if (thinking && config['thinkingConfig'] === undefined) {
443
+ if (/gemini-3/i.test(this.options.model)) {
444
+ const tc = {};
445
+ if (!thinking.enabled) {
446
+ tc['thinkingLevel'] = 'MINIMAL';
447
+ }
448
+ else {
449
+ if (thinking.level)
450
+ tc['thinkingLevel'] = thinking.level.toUpperCase();
451
+ tc['includeThoughts'] = true;
452
+ }
453
+ config['thinkingConfig'] = tc;
454
+ }
455
+ else {
456
+ config['thinkingConfig'] = thinking.enabled
457
+ ? { thinkingBudget: geminiThinkingBudget(thinking.level), includeThoughts: true }
458
+ : { thinkingBudget: 0 };
459
+ }
273
460
  }
274
461
  // Structured output: add responseMimeType and responseSchema
275
462
  const schemaOptions = this.extractSchemaOptions(options);
@@ -346,9 +533,7 @@ export class GoogleClient extends BaseLLMClient {
346
533
  const part = {
347
534
  functionCall: {
348
535
  name: tc.function.name,
349
- args: typeof tc.function.arguments === 'string'
350
- ? JSON.parse(tc.function.arguments)
351
- : tc.function.arguments,
536
+ args: this.parseToolArguments(tc.function.arguments),
352
537
  },
353
538
  };
354
539
  // Echo thought signature back (required by Gemini 3.x)
@@ -430,8 +615,8 @@ export class GoogleClient extends BaseLLMClient {
430
615
  id: this.generateToolCallId(),
431
616
  type: 'function',
432
617
  function: {
433
- name: fc.name,
434
- arguments: JSON.stringify(fc.args),
618
+ name: fc.name || '',
619
+ arguments: JSON.stringify(fc.args ?? {}),
435
620
  },
436
621
  };
437
622
  if (thoughtSignature) {
@@ -439,6 +624,23 @@ export class GoogleClient extends BaseLLMClient {
439
624
  }
440
625
  return toolCall;
441
626
  }
627
+ parseToolArguments(args) {
628
+ if (typeof args !== 'string') {
629
+ return args ?? {};
630
+ }
631
+ if (args.length === 0) {
632
+ return {};
633
+ }
634
+ try {
635
+ const parsed = JSON.parse(args);
636
+ return parsed && typeof parsed === 'object' && !Array.isArray(parsed)
637
+ ? parsed
638
+ : {};
639
+ }
640
+ catch {
641
+ return {};
642
+ }
643
+ }
442
644
  // ========================================================================
443
645
  // Response Parsing
444
646
  // ========================================================================
@@ -451,10 +653,17 @@ export class GoogleClient extends BaseLLMClient {
451
653
  };
452
654
  }
453
655
  let textContent = '';
656
+ let reasoningText = '';
454
657
  const toolCalls = [];
455
658
  for (const part of candidate.content.parts) {
456
- if (part.text)
457
- textContent += part.text;
659
+ if (part.text) {
660
+ // Thought summaries (includeThoughts) carry the reasoning trace;
661
+ // keep them out of `content` and surface them as `reasoning`.
662
+ if (part.thought)
663
+ reasoningText += part.text;
664
+ else
665
+ textContent += part.text;
666
+ }
458
667
  if (part.functionCall) {
459
668
  toolCalls.push(this.convertFunctionCallToToolCall(part.functionCall, part.thoughtSignature));
460
669
  }
@@ -474,6 +683,7 @@ export class GoogleClient extends BaseLLMClient {
474
683
  content: textContent,
475
684
  tool_calls: toolCalls.length > 0 ? toolCalls : undefined,
476
685
  },
686
+ reasoning: reasoningText || undefined,
477
687
  usage,
478
688
  provider: this.isVertex ? 'vertex' : 'google',
479
689
  };
@@ -502,4 +712,3 @@ export class GoogleClient extends BaseLLMClient {
502
712
  throw new Error('Unreachable');
503
713
  }
504
714
  }
505
- //# sourceMappingURL=google.js.map
@@ -5,4 +5,3 @@ export { OllamaClient } from './ollama.js';
5
5
  export { OpenAICompatibleClient } from './openai.js';
6
6
  export { GoogleClient } from './google.js';
7
7
  export { AnthropicClient } from './anthropic.js';
8
- //# sourceMappingURL=index.d.ts.map
@@ -5,4 +5,3 @@ export { OllamaClient } from './ollama.js';
5
5
  export { OpenAICompatibleClient } from './openai.js';
6
6
  export { GoogleClient } from './google.js';
7
7
  export { AnthropicClient } from './anthropic.js';
8
- //# sourceMappingURL=index.js.map
@@ -18,6 +18,8 @@ export declare class OllamaClient extends BaseLLMClient {
18
18
  constructor(options: LLMClientOptions, auditor?: Auditor);
19
19
  chat(messages: LLMChatMessage[], options?: ChatOptions): Promise<LLMChatResponse>;
20
20
  chatStream(messages: LLMChatMessage[], options?: ChatOptions): AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown>;
21
+ private normalizeToolCall;
22
+ private normalizeToolArguments;
21
23
  embed(text: string): Promise<number[]>;
22
24
  embedArray(texts: string[]): Promise<number[][]>;
23
25
  getModels(): Promise<string[]>;
@@ -35,4 +37,3 @@ export declare class OllamaClient extends BaseLLMClient {
35
37
  */
36
38
  private buildFormatParameter;
37
39
  }
38
- //# sourceMappingURL=ollama.d.ts.map
@@ -11,9 +11,11 @@
11
11
  * - VAL-PROVIDER-OLLAMA-004: format "json" vs schema modes
12
12
  */
13
13
  import { BaseLLMClient } from '../client.js';
14
+ import { resolveThinking } from '../thinking.js';
14
15
  import { httpRequest, httpStream, parseNDJSON, buildHeaders } from '../http.js';
15
16
  import { StandardChatDecoder } from '../stream-decoder.js';
16
17
  import { normalizeJsonSchema, getJsonSchemaFromConfig, } from '../structured-output.js';
18
+ import { extractGemmaThoughtChannels } from '../gemma-channel.js';
17
19
  export class OllamaClient extends BaseLLMClient {
18
20
  constructor(options, auditor) {
19
21
  super({
@@ -39,7 +41,8 @@ export class OllamaClient extends BaseLLMClient {
39
41
  }
40
42
  // Enable native thinking by default — thinking models produce better
41
43
  // tool selections and reasoning when allowed to think before acting.
42
- body['think'] = this.options.thinking ?? true;
44
+ // Ollama `think` is on/off (no levels); default on for thinking models.
45
+ body['think'] = resolveThinking(options?.thinking, this.options.thinking)?.enabled ?? true;
43
46
  // Handle structured output via format parameter
44
47
  const schemaOptions = this.extractSchemaOptions(options);
45
48
  if (schemaOptions) {
@@ -68,28 +71,25 @@ export class OllamaClient extends BaseLLMClient {
68
71
  inputTokens: data.prompt_eval_count ?? 0,
69
72
  outputTokens: data.eval_count ?? 0,
70
73
  totalTokens: (data.prompt_eval_count ?? 0) + (data.eval_count ?? 0),
74
+ // Ollama reports server-precise timing in nanoseconds.
75
+ durationMs: data.total_duration ? data.total_duration / 1e6 : undefined,
76
+ tokensPerSecond: data.eval_duration && data.eval_count
77
+ ? data.eval_count / (data.eval_duration / 1e9)
78
+ : undefined,
71
79
  }
72
80
  : undefined;
73
- // Normalize tool call IDs (Ollama sometimes omits them)
74
- const toolCalls = data.message.tool_calls?.map(tc => ({
75
- ...tc,
76
- id: tc.id || this.generateToolCallId(),
77
- function: {
78
- ...tc.function,
79
- arguments: typeof tc.function.arguments === 'string'
80
- ? tc.function.arguments
81
- : JSON.stringify(tc.function.arguments),
82
- },
83
- }));
84
- // Get content, handling potential null
85
- const content = data.message.content || data.message.thinking || '';
81
+ // Normalize tool calls (Ollama sometimes omits IDs and empty args).
82
+ const toolCalls = data.message.tool_calls?.map(tc => this.normalizeToolCall(tc));
83
+ const gemmaContent = extractGemmaThoughtChannels(data.message.content || '');
84
+ const reasoning = [data.message.thinking, gemmaContent.reasoning].filter(Boolean).join('\n\n') || undefined;
86
85
  const result = {
87
86
  message: {
88
87
  role: 'assistant',
89
- content,
88
+ content: gemmaContent.content,
90
89
  tool_calls: toolCalls,
91
90
  },
92
- reasoning: data.message.content ? data.message.thinking : undefined,
91
+ finishReason: data.done_reason,
92
+ reasoning,
93
93
  usage,
94
94
  provider: 'ollama',
95
95
  };
@@ -118,7 +118,8 @@ export class OllamaClient extends BaseLLMClient {
118
118
  if (tools?.length) {
119
119
  body['tools'] = this.convertToolsToOllama(tools);
120
120
  }
121
- body['think'] = this.options.thinking ?? true;
121
+ // Ollama `think` is on/off (no levels); default on for thinking models.
122
+ body['think'] = resolveThinking(options?.thinking, this.options.thinking)?.enabled ?? true;
122
123
  const start = Date.now();
123
124
  this.auditor.record({
124
125
  timestamp: start,
@@ -126,7 +127,8 @@ export class OllamaClient extends BaseLLMClient {
126
127
  provider: 'ollama',
127
128
  model: this.options.model,
128
129
  });
129
- const decoder = new StandardChatDecoder(() => { });
130
+ const decoderEvents = [];
131
+ const decoder = new StandardChatDecoder(event => decoderEvents.push(event));
130
132
  let lastResponse;
131
133
  const streamedToolCalls = [];
132
134
  // Stream idle timeout: thinking models can pause for minutes between chunks.
@@ -142,33 +144,38 @@ export class OllamaClient extends BaseLLMClient {
142
144
  lastResponse = chunk;
143
145
  if (chunk.message?.thinking) {
144
146
  decoder.pushReasoning(chunk.message.thinking);
145
- yield { type: 'thinking', content: chunk.message.thinking };
147
+ const pending = decoderEvents.splice(0);
148
+ for (const event of pending) {
149
+ yield event;
150
+ }
146
151
  }
147
152
  if (chunk.message?.content) {
148
153
  decoder.push(chunk.message.content);
149
- yield { type: 'text', content: chunk.message.content };
154
+ const pending = decoderEvents.splice(0);
155
+ for (const event of pending) {
156
+ yield event;
157
+ }
150
158
  }
151
159
  if (chunk.message?.tool_calls?.length) {
152
- const normalized = chunk.message.tool_calls.map(tc => ({
153
- ...tc,
154
- id: tc.id || this.generateToolCallId(),
155
- function: {
156
- ...tc.function,
157
- arguments: typeof tc.function.arguments === 'string'
158
- ? tc.function.arguments
159
- : JSON.stringify(tc.function.arguments),
160
- },
161
- }));
160
+ const normalized = chunk.message.tool_calls.map(tc => this.normalizeToolCall(tc));
162
161
  streamedToolCalls.push(...normalized);
163
162
  yield { type: 'tool_call', calls: normalized };
164
163
  }
165
164
  }
166
165
  decoder.flush();
166
+ const pending = decoderEvents.splice(0);
167
+ for (const event of pending) {
168
+ yield event;
169
+ }
167
170
  const usage = lastResponse?.prompt_eval_count
168
171
  ? {
169
172
  inputTokens: lastResponse.prompt_eval_count ?? 0,
170
173
  outputTokens: lastResponse.eval_count ?? 0,
171
174
  totalTokens: (lastResponse.prompt_eval_count ?? 0) + (lastResponse.eval_count ?? 0),
175
+ durationMs: lastResponse.total_duration ? lastResponse.total_duration / 1e6 : undefined,
176
+ tokensPerSecond: lastResponse.eval_duration && lastResponse.eval_count
177
+ ? lastResponse.eval_count / (lastResponse.eval_duration / 1e9)
178
+ : undefined,
172
179
  }
173
180
  : undefined;
174
181
  this.auditor.record({
@@ -185,11 +192,33 @@ export class OllamaClient extends BaseLLMClient {
185
192
  content: decoder.getCleanContent(),
186
193
  tool_calls: streamedToolCalls.length > 0 ? streamedToolCalls : undefined,
187
194
  },
195
+ finishReason: lastResponse?.done_reason,
188
196
  reasoning: decoder.getReasoning(),
189
197
  usage,
190
198
  provider: 'ollama',
191
199
  };
192
200
  }
201
+ normalizeToolCall(toolCall) {
202
+ return {
203
+ ...toolCall,
204
+ id: toolCall.id || this.generateToolCallId(),
205
+ type: 'function',
206
+ function: {
207
+ ...toolCall.function,
208
+ name: toolCall.function?.name || '',
209
+ arguments: this.normalizeToolArguments(toolCall.function?.arguments),
210
+ },
211
+ };
212
+ }
213
+ normalizeToolArguments(args) {
214
+ if (typeof args === 'string') {
215
+ return args.trim().length > 0 ? args : '{}';
216
+ }
217
+ if (args == null) {
218
+ return '{}';
219
+ }
220
+ return JSON.stringify(args) ?? '{}';
221
+ }
193
222
  // ========================================================================
194
223
  // Embeddings
195
224
  // ========================================================================
@@ -397,4 +426,3 @@ export class OllamaClient extends BaseLLMClient {
397
426
  return 'json';
398
427
  }
399
428
  }
400
- //# sourceMappingURL=ollama.js.map
@@ -9,9 +9,25 @@ import type { LLMClientOptions, LLMChatMessage, LLMChatResponse, ChatOptions } f
9
9
  import type { DecodedEvent } from '../stream-decoder.js';
10
10
  import type { Auditor } from '../auditor.js';
11
11
  export declare class OpenAICompatibleClient extends BaseLLMClient {
12
+ private warnedVllmToolFallback;
13
+ /**
14
+ * DiffusionGemma on trimmed vLLM builds has no server-side reasoning or
15
+ * tool-call parser — the native channel protocol is handled client-side
16
+ * (see gemma-diffusion.ts). Auto-detected from the model name; override
17
+ * with `gemmaNativeProtocol` in LLMClientOptions.
18
+ */
19
+ private get gemmaNative();
20
+ /**
21
+ * Build a full endpoint URL, respecting apiBasePath (already baked into this.options.url)
22
+ * and any queryParams provided at the provider config level.
23
+ */
24
+ private buildUrl;
12
25
  constructor(options: LLMClientOptions, auditor?: Auditor);
26
+ private warnVllmToolFallback;
13
27
  chat(messages: LLMChatMessage[], options?: ChatOptions): Promise<LLMChatResponse>;
14
28
  chatStream(messages: LLMChatMessage[], options?: ChatOptions): AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown>;
29
+ private normalizeToolCall;
30
+ private normalizeToolArguments;
15
31
  embed(text: string): Promise<number[]>;
16
32
  getModels(): Promise<string[]>;
17
33
  private convertMessages;
@@ -21,4 +37,3 @@ export declare class OpenAICompatibleClient extends BaseLLMClient {
21
37
  */
22
38
  private buildResponseFormat;
23
39
  }
24
- //# sourceMappingURL=openai.d.ts.map