universal-llm-client 4.3.0 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CHANGELOG.md +34 -19
  2. package/README.md +62 -11
  3. package/dist/ai-model.d.ts +12 -2
  4. package/dist/ai-model.js +36 -2
  5. package/dist/auditor.d.ts +0 -1
  6. package/dist/auditor.js +0 -1
  7. package/dist/client.d.ts +0 -1
  8. package/dist/client.js +0 -1
  9. package/dist/gemma-channel.d.ts +13 -0
  10. package/dist/gemma-channel.js +37 -0
  11. package/dist/gemma-diffusion.d.ts +48 -0
  12. package/dist/gemma-diffusion.js +146 -0
  13. package/dist/http.d.ts +4 -1
  14. package/dist/http.js +14 -2
  15. package/dist/index.d.ts +2 -2
  16. package/dist/index.js +4 -1
  17. package/dist/interfaces.d.ts +163 -8
  18. package/dist/interfaces.js +0 -1
  19. package/dist/mcp.d.ts +0 -1
  20. package/dist/mcp.js +0 -1
  21. package/dist/providers/anthropic.d.ts +0 -1
  22. package/dist/providers/anthropic.js +28 -4
  23. package/dist/providers/google.d.ts +22 -2
  24. package/dist/providers/google.js +223 -14
  25. package/dist/providers/index.d.ts +0 -1
  26. package/dist/providers/index.js +0 -1
  27. package/dist/providers/ollama.d.ts +2 -1
  28. package/dist/providers/ollama.js +59 -31
  29. package/dist/providers/openai.d.ts +16 -1
  30. package/dist/providers/openai.js +488 -81
  31. package/dist/router.d.ts +2 -1
  32. package/dist/router.js +4 -1
  33. package/dist/stream-decoder.d.ts +12 -1
  34. package/dist/stream-decoder.js +182 -6
  35. package/dist/structured-output.d.ts +0 -1
  36. package/dist/structured-output.js +0 -1
  37. package/dist/thinking.d.ts +35 -0
  38. package/dist/thinking.js +51 -0
  39. package/dist/tools.d.ts +0 -1
  40. package/dist/tools.js +0 -1
  41. package/dist/zod-adapter.d.ts +0 -1
  42. package/dist/zod-adapter.js +0 -1
  43. package/package.json +3 -1
  44. package/dist/ai-model.d.ts.map +0 -1
  45. package/dist/ai-model.js.map +0 -1
  46. package/dist/auditor.d.ts.map +0 -1
  47. package/dist/auditor.js.map +0 -1
  48. package/dist/client.d.ts.map +0 -1
  49. package/dist/client.js.map +0 -1
  50. package/dist/http.d.ts.map +0 -1
  51. package/dist/http.js.map +0 -1
  52. package/dist/index.d.ts.map +0 -1
  53. package/dist/index.js.map +0 -1
  54. package/dist/interfaces.d.ts.map +0 -1
  55. package/dist/interfaces.js.map +0 -1
  56. package/dist/mcp.d.ts.map +0 -1
  57. package/dist/mcp.js.map +0 -1
  58. package/dist/providers/anthropic.d.ts.map +0 -1
  59. package/dist/providers/anthropic.js.map +0 -1
  60. package/dist/providers/google.d.ts.map +0 -1
  61. package/dist/providers/google.js.map +0 -1
  62. package/dist/providers/index.d.ts.map +0 -1
  63. package/dist/providers/index.js.map +0 -1
  64. package/dist/providers/ollama.d.ts.map +0 -1
  65. package/dist/providers/ollama.js.map +0 -1
  66. package/dist/providers/openai.d.ts.map +0 -1
  67. package/dist/providers/openai.js.map +0 -1
  68. package/dist/router.d.ts.map +0 -1
  69. package/dist/router.js.map +0 -1
  70. package/dist/stream-decoder.d.ts.map +0 -1
  71. package/dist/stream-decoder.js.map +0 -1
  72. package/dist/structured-output.d.ts.map +0 -1
  73. package/dist/structured-output.js.map +0 -1
  74. package/dist/tools.d.ts.map +0 -1
  75. package/dist/tools.js.map +0 -1
  76. package/dist/zod-adapter.d.ts.map +0 -1
  77. package/dist/zod-adapter.js.map +0 -1
package/dist/providers/openai.js
@@ -5,25 +5,268 @@
5
5
  * Works with: OpenAI, OpenRouter, LM Studio, LlamaCpp, vLLM, Groq, Together.
6
6
  */
7
7
  import { BaseLLMClient } from '../client.js';
8
+ import { resolveThinking, isOpenAIReasoningModel } from '../thinking.js';
8
9
  import { httpRequest, httpStream, parseSSE, buildHeaders } from '../http.js';
9
10
  import { StandardChatDecoder } from '../stream-decoder.js';
10
11
  import { normalizeJsonSchema, getJsonSchemaFromConfig, } from '../structured-output.js';
12
+ import { isGemmaDiffusionModel, parseGemmaDiffusionOutput } from '../gemma-diffusion.js';
13
+ const VLLM_AUTO_TOOL_CHOICE_HINT = 'vLLM rejected automatic tool choice. Retrying with text-level tool calling. To use native tool_calls, start vLLM with --enable-auto-tool-choice and --tool-call-parser <parser>.';
14
+ function normalizeMessagesForOpenAICompat(messages) {
15
+ let sawNonSystem = false;
16
+ return messages.map(message => {
17
+ if (message.role !== 'system') {
18
+ sawNonSystem = true;
19
+ return {
20
+ ...message,
21
+ content: message.content ?? '',
22
+ };
23
+ }
24
+ if (!sawNonSystem) {
25
+ return {
26
+ ...message,
27
+ content: message.content ?? '',
28
+ };
29
+ }
30
+ return {
31
+ ...message,
32
+ role: 'user',
33
+ content: `[SYSTEM MESSAGE]\n${stringifyMessageContent(message.content)}`,
34
+ };
35
+ });
36
+ }
37
+ function stringifyMessageContent(content) {
38
+ if (typeof content === 'string')
39
+ return content;
40
+ return content
41
+ .map(part => {
42
+ if (part.type === 'text')
43
+ return part.text;
44
+ if (part.type === 'image_url')
45
+ return `[Image: ${part.image_url.url}]`;
46
+ if (part.type === 'audio')
47
+ return `[Audio: ${part.audio.mimeType}]`;
48
+ return '';
49
+ })
50
+ .filter(Boolean)
51
+ .join('\n');
52
+ }
53
+ function hasToolDefinitions(body) {
54
+ const tools = body['tools'];
55
+ return Array.isArray(tools) && tools.length > 0;
56
+ }
57
+ function isVllmAutoToolChoiceError(value) {
58
+ const text = value instanceof Error
59
+ ? value.message
60
+ : typeof value === 'string'
61
+ ? value
62
+ : JSON.stringify(value ?? '');
63
+ const normalized = text.toLowerCase();
64
+ return (normalized.includes('auto')
65
+ && normalized.includes('tool choice requires --enable-auto-tool-choice')
66
+ && normalized.includes('--tool-call-parser'));
67
+ }
68
+ async function requestWithVllmToolFallback(url, request, tools, onFallback) {
69
+ try {
70
+ return await httpRequest(url, {
71
+ method: 'POST',
72
+ headers: request.headers,
73
+ body: request.body,
74
+ timeout: request.timeout,
75
+ });
76
+ }
77
+ catch (error) {
78
+ if (tools?.length
79
+ && hasToolDefinitions(request.body)
80
+ && isVllmAutoToolChoiceError(error)) {
81
+ onFallback();
82
+ return httpRequest(url, {
83
+ method: 'POST',
84
+ headers: request.headers,
85
+ body: withoutNativeTools(request.body, tools),
86
+ timeout: request.timeout,
87
+ });
88
+ }
89
+ throw error;
90
+ }
91
+ }
92
+ function parseJsonObject(text) {
93
+ try {
94
+ const parsed = JSON.parse(text);
95
+ if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
96
+ return parsed;
97
+ }
98
+ }
99
+ catch {
100
+ /* not JSON */
101
+ }
102
+ return null;
103
+ }
104
+ function parseTextToolCallBody(content) {
105
+ const body = content.trim();
106
+ if (!body)
107
+ return [];
108
+ try {
109
+ const parsed = JSON.parse(body);
110
+ const rawCalls = Array.isArray(parsed) ? parsed : [parsed];
111
+ const calls = [];
112
+ for (const rawCall of rawCalls) {
113
+ if (!rawCall || typeof rawCall !== 'object')
114
+ continue;
115
+ const record = rawCall;
116
+ const name = record['name'];
117
+ if (typeof name !== 'string' || !name)
118
+ continue;
119
+ const args = record['arguments'] ?? record['parameters'] ?? record['args'] ?? {};
120
+ calls.push({
121
+ name,
122
+ arguments: typeof args === 'string' ? JSON.stringify(parseJsonObject(args) ?? {}) : JSON.stringify(args ?? {}),
123
+ });
124
+ }
125
+ if (calls.length > 0)
126
+ return calls;
127
+ }
128
+ catch {
129
+ /* not structured JSON */
130
+ }
131
+ const functionCallMatch = /^([@A-Za-z_][@A-Za-z0-9_.:-]*)\s*\(([\s\S]*)\)\s*$/u.exec(body);
132
+ if (functionCallMatch) {
133
+ const rawArgs = functionCallMatch[2].trim();
134
+ const args = rawArgs ? parseJsonObject(rawArgs) : {};
135
+ if (args) {
136
+ return [{ name: functionCallMatch[1], arguments: JSON.stringify(args) }];
137
+ }
138
+ }
139
+ const calls = [];
140
+ const funcPattern = /<function=([@A-Za-z_][@A-Za-z0-9_.:-]*)>([\s\S]*?)<\/function>/g;
141
+ let fMatch;
142
+ while ((fMatch = funcPattern.exec(body)) !== null) {
143
+ const args = {};
144
+ const paramPattern = /<parameter=([A-Za-z_][A-Za-z0-9_-]*)>([\s\S]*?)<\/parameter>/g;
145
+ let pMatch;
146
+ while ((pMatch = paramPattern.exec(fMatch[2] ?? '')) !== null) {
147
+ args[pMatch[1]] = pMatch[2].trim();
148
+ }
149
+ calls.push({ name: fMatch[1], arguments: JSON.stringify(args) });
150
+ }
151
+ return calls;
152
+ }
153
+ function recoverToolCallsFromText(content, knownToolNames, generateId) {
154
+ if (!content || content.length < 10)
155
+ return null;
156
+ const calls = [];
157
+ let cleanContent = content;
158
+ const isKnownTool = (name) => knownToolNames.has(name);
159
+ const toolCallPattern = /<tool_call>([\s\S]*?)<\/tool_call>/g;
160
+ let tcMatch;
161
+ while ((tcMatch = toolCallPattern.exec(content)) !== null) {
162
+ const parsedCalls = parseTextToolCallBody(tcMatch[1]);
163
+ let matched = false;
164
+ for (const parsed of parsedCalls) {
165
+ if (!isKnownTool(parsed.name))
166
+ continue;
167
+ matched = true;
168
+ calls.push({
169
+ id: generateId(),
170
+ type: 'function',
171
+ function: { name: parsed.name, arguments: parsed.arguments },
172
+ });
173
+ }
174
+ if (matched)
175
+ cleanContent = cleanContent.replace(tcMatch[0], '');
176
+ }
177
+ if (calls.length === 0)
178
+ return null;
179
+ return { calls, cleanContent: cleanContent.trim() };
180
+ }
181
+ function toolFallbackInstruction(tools) {
182
+ const toolLines = tools.map(tool => {
183
+ const fn = tool.function;
184
+ return `- ${fn.name}: ${fn.description}\n parameters JSON schema: ${JSON.stringify(fn.parameters)}`;
185
+ });
186
+ return {
187
+ role: 'system',
188
+ content: 'The server does not support native OpenAI tool parsing for this request. '
189
+ + 'Use this text tool protocol instead.\n\n'
190
+ + 'When you need a tool, respond with exactly one or more tool calls and no prose:\n'
191
+ + '<tool_call>tool_name({"argument":"value"})</tool_call>\n\n'
192
+ + 'After tool results are provided, answer the user normally. Available tools:\n'
193
+ + toolLines.join('\n'),
194
+ };
195
+ }
196
+ function withTextToolFallbackMessages(messages, tools) {
197
+ return [toolFallbackInstruction(tools), ...messages];
198
+ }
199
+ function withoutNativeTools(body, tools) {
200
+ const fallbackBody = { ...body };
201
+ delete fallbackBody['tools'];
202
+ delete fallbackBody['tool_choice'];
203
+ fallbackBody['messages'] = withTextToolFallbackMessages(body['messages'] ?? [], tools);
204
+ return fallbackBody;
205
+ }
11
206
  export class OpenAICompatibleClient extends BaseLLMClient {
207
+ warnedVllmToolFallback = false;
208
+ /**
209
+ * DiffusionGemma on trimmed vLLM builds has no server-side reasoning or
210
+ * tool-call parser — the native channel protocol is handled client-side
211
+ * (see gemma-diffusion.ts). Auto-detected from the model name; override
212
+ * with `gemmaNativeProtocol` in LLMClientOptions.
213
+ */
214
+ get gemmaNative() {
215
+ return this.options.gemmaNativeProtocol ?? isGemmaDiffusionModel(this.options.model);
216
+ }
217
+ /**
218
+ * Build a full endpoint URL, respecting apiBasePath (already baked into this.options.url)
219
+ * and any queryParams provided at the provider config level.
220
+ */
221
+ buildUrl(suffix) {
222
+ const raw = this.options.url.replace(/\/+$/, '');
223
+ // Split off any query string already on the configured base URL so the
224
+ // path is inserted before it (avoids `host/v1?k=v/chat/completions`).
225
+ const qIdx = raw.indexOf('?');
226
+ const basePath = (qIdx === -1 ? raw : raw.slice(0, qIdx)).replace(/\/+$/, '');
227
+ const existingQuery = qIdx === -1 ? '' : raw.slice(qIdx + 1);
228
+ const path = suffix.startsWith('/') ? suffix : '/' + suffix;
229
+ const search = new URLSearchParams(existingQuery);
230
+ const qp = this.options.queryParams;
231
+ if (qp) {
232
+ for (const [k, v] of Object.entries(qp)) {
233
+ if (v != null)
234
+ search.set(k, String(v));
235
+ }
236
+ }
237
+ const qs = search.toString();
238
+ return basePath + path + (qs ? `?${qs}` : '');
239
+ }
12
240
  constructor(options, auditor) {
13
- // Ensure URL ends with /v1 for standard endpoints
14
- let url = (options.url || 'https://api.openai.com').replace(/\/+$/, '');
15
- if (!url.endsWith('/v1')) {
16
- url += '/v1';
241
+ let base = (options.url || 'https://api.openai.com').replace(/\/+$/, '');
242
+ // Respect apiBasePath (from ProviderConfig.apiBasePath). Default "/v1" for broad compatibility.
243
+ // Set apiBasePath: '' (or '/') when you are supplying a *complete* path already
244
+ // (e.g. full Azure ".../deployments/my-model" URL) or for non-/v1 OpenAI-compatible servers.
245
+ const desired = options.apiBasePath;
246
+ const shouldAppend = desired !== '' && desired !== '/';
247
+ if (shouldAppend) {
248
+ // Normalize to exactly one leading slash and no trailing slash
249
+ // (so 'v1', '/v1', '//v1' and '/v1/' all become '/v1').
250
+ const basePath = ('/' + (desired || '/v1').replace(/^\/+/, '')).replace(/\/+$/, '');
251
+ if (!base.endsWith(basePath)) {
252
+ base += basePath;
253
+ }
17
254
  }
18
- super({ ...options, url }, auditor);
255
+ super({ ...options, url: base }, auditor);
256
+ }
257
+ warnVllmToolFallback() {
258
+ if (this.warnedVllmToolFallback)
259
+ return;
260
+ this.warnedVllmToolFallback = true;
261
+ console.warn(`[OpenAI] ${VLLM_AUTO_TOOL_CHOICE_HINT}`);
19
262
  }
20
263
  // ========================================================================
21
264
  // Chat
22
265
  // ========================================================================
23
266
  async chat(messages, options) {
24
267
  // Structured output and tools can now be used together.\n // The provider sends both response_format and tools in the request.\n // The Router handles skipping validation when the response contains tool calls.
25
- const url = `${this.options.url}/chat/completions`;
26
- const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
268
+ const url = this.buildUrl('/chat/completions');
269
+ const tools = options?.tools;
27
270
  const body = {
28
271
  model: this.options.model,
29
272
  messages: this.convertMessages(messages),
@@ -43,6 +286,13 @@ export class OpenAICompatibleClient extends BaseLLMClient {
43
286
  body['tool_choice'] = options.toolChoice;
44
287
  }
45
288
  }
289
+ if (this.gemmaNative) {
290
+ // Markers must survive decoding for client-side parsing,
291
+ // and request-level tool parsing is unavailable server-side.
292
+ body['skip_special_tokens'] = false;
293
+ if (tools?.length)
294
+ body['tool_choice'] = 'none';
295
+ }
46
296
  const start = Date.now();
47
297
  this.auditor.record({
48
298
  timestamp: start,
@@ -50,38 +300,72 @@ export class OpenAICompatibleClient extends BaseLLMClient {
50
300
  provider: 'openai',
51
301
  model: this.options.model,
52
302
  });
53
- const response = await httpRequest(url, {
54
- method: 'POST',
303
+ const response = await requestWithVllmToolFallback(url, {
55
304
  headers: buildHeaders(this.options),
56
305
  body,
57
306
  timeout: this.options.timeout ?? 30000,
58
- });
307
+ }, tools, () => this.warnVllmToolFallback());
59
308
  const data = response.data;
60
309
  const choice = data.choices[0];
61
310
  if (!choice) {
62
311
  throw new Error('No choices returned from OpenAI API');
63
312
  }
313
+ // vLLM / OpenAI-compatible `usage` carries no timing, so derive decode
314
+ // throughput from the client-measured wall-clock duration.
315
+ const durationMs = Date.now() - start;
64
316
  const usage = data.usage
65
317
  ? {
66
318
  inputTokens: data.usage.prompt_tokens,
67
319
  outputTokens: data.usage.completion_tokens,
68
320
  totalTokens: data.usage.total_tokens,
69
321
  cachedTokens: data.usage.prompt_tokens_details?.cached_tokens,
322
+ durationMs,
323
+ tokensPerSecond: durationMs > 0
324
+ ? data.usage.completion_tokens / (durationMs / 1000)
325
+ : undefined,
70
326
  }
71
327
  : undefined;
72
- // Normalize tool calls (ensure IDs exist)
73
- const toolCalls = choice.message.tool_calls?.map(tc => ({
74
- ...tc,
75
- id: tc.id || this.generateToolCallId(),
76
- }));
328
+ // Normalize tool calls (ensure IDs and JSON-parseable empty args exist).
329
+ let toolCalls = choice.message.tool_calls?.map(tc => this.normalizeToolCall(tc));
77
330
  // Get content, handling null case
78
- const content = choice.message.content || '';
331
+ let content = choice.message.content || '';
332
+ let reasoning;
333
+ // Reasoning models served over the OpenAI-compatible API (vLLM
334
+ // `--reasoning-parser`, DeepSeek-R1, etc.) return the chain-of-thought
335
+ // in a dedicated field instead of inline <think> tags. vLLM uses
336
+ // `reasoning_content`; some gateways use `reasoning`.
337
+ const serverReasoning = choice.message.reasoning ?? choice.message.reasoning_content;
338
+ if (typeof serverReasoning === 'string' && serverReasoning.length > 0) {
339
+ reasoning = serverReasoning;
340
+ }
341
+ if (this.gemmaNative && content) {
342
+ const parsed = parseGemmaDiffusionOutput(content);
343
+ content = parsed.content;
344
+ if (parsed.reasoning)
345
+ reasoning = parsed.reasoning;
346
+ if (!toolCalls?.length && parsed.toolCalls.length) {
347
+ toolCalls = parsed.toolCalls.map(tc => ({
348
+ id: this.generateToolCallId(),
349
+ type: 'function',
350
+ function: { name: tc.name, arguments: tc.argumentsJson },
351
+ }));
352
+ }
353
+ }
354
+ if (!toolCalls?.length && tools?.length && content) {
355
+ const knownToolNames = new Set(tools.map(tool => tool.function.name));
356
+ const recovered = recoverToolCallsFromText(content, knownToolNames, () => this.generateToolCallId());
357
+ if (recovered) {
358
+ toolCalls = recovered.calls;
359
+ content = recovered.cleanContent;
360
+ }
361
+ }
79
362
  const result = {
80
363
  message: {
81
364
  role: 'assistant',
82
365
  content,
83
366
  tool_calls: toolCalls,
84
367
  },
368
+ ...(reasoning !== undefined && { reasoning }),
85
369
  usage,
86
370
  provider: 'openai',
87
371
  };
@@ -99,8 +383,8 @@ export class OpenAICompatibleClient extends BaseLLMClient {
99
383
  // Streaming
100
384
  // ========================================================================
101
385
  async *chatStream(messages, options) {
102
- const url = `${this.options.url}/chat/completions`;
103
- const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
386
+ const url = this.buildUrl('/chat/completions');
387
+ const tools = options?.tools;
104
388
  const body = {
105
389
  model: this.options.model,
106
390
  messages: this.convertMessages(messages),
@@ -113,6 +397,11 @@ export class OpenAICompatibleClient extends BaseLLMClient {
113
397
  body['tool_choice'] = options.toolChoice;
114
398
  }
115
399
  }
400
+ if (this.gemmaNative) {
401
+ body['skip_special_tokens'] = false;
402
+ if (tools?.length)
403
+ body['tool_choice'] = 'none';
404
+ }
116
405
  const start = Date.now();
117
406
  this.auditor.record({
118
407
  timestamp: start,
@@ -120,71 +409,124 @@ export class OpenAICompatibleClient extends BaseLLMClient {
120
409
  provider: 'openai',
121
410
  model: this.options.model,
122
411
  });
123
- const decoder = new StandardChatDecoder(() => { });
412
+ // In gemma-native mode the decoder classifies thought-channel content,
413
+ // so we yield ITS events (thinking vs text) instead of the raw deltas.
414
+ const decoderEvents = [];
415
+ const decoder = new StandardChatDecoder(this.gemmaNative ? e => decoderEvents.push(e) : () => { });
124
416
  // Track accumulated tool calls across chunks
125
417
  const toolCallAccum = new Map();
126
- const stream = httpStream(url, {
127
- method: 'POST',
128
- headers: buildHeaders(this.options),
129
- body,
130
- timeout: this.options.timeout ?? 120000,
131
- });
418
+ let activeBody = body;
419
+ let retriedWithTextTools = false;
132
420
  let usage;
133
- for await (const { data } of parseSSE(stream)) {
421
+ // Accumulates reasoning deltas from servers that stream a dedicated
422
+ // `reasoning` / `reasoning_content` field (vLLM, DeepSeek-R1, etc.).
423
+ let reasoningBuffer = '';
424
+ while (true) {
425
+ const stream = httpStream(url, {
426
+ method: 'POST',
427
+ headers: buildHeaders(this.options),
428
+ body: activeBody,
429
+ timeout: this.options.timeout ?? 120000,
430
+ });
134
431
  try {
135
- const parsed = JSON.parse(data);
136
- if (parsed.usage) {
137
- usage = {
138
- inputTokens: parsed.usage.prompt_tokens,
139
- outputTokens: parsed.usage.completion_tokens,
140
- totalTokens: parsed.usage.total_tokens,
141
- cachedTokens: parsed.usage.prompt_tokens_details?.cached_tokens,
142
- };
143
- }
144
- const delta = parsed.choices?.[0]?.delta;
145
- if (!delta)
146
- continue;
147
- if (delta.content) {
148
- decoder.push(delta.content);
149
- yield { type: 'text', content: delta.content };
150
- }
151
- // Accumulate streamed tool calls
152
- if (delta.tool_calls) {
153
- for (const tc of delta.tool_calls) {
154
- const existing = toolCallAccum.get(tc.index);
155
- if (!existing) {
156
- toolCallAccum.set(tc.index, {
157
- id: tc.id || this.generateToolCallId(),
158
- type: 'function',
159
- function: {
160
- name: tc.function?.name || '',
161
- arguments: tc.function?.arguments || '',
162
- },
163
- });
432
+ for await (const { data } of parseSSE(stream)) {
433
+ try {
434
+ const parsed = JSON.parse(data);
435
+ if (parsed.usage) {
436
+ usage = {
437
+ inputTokens: parsed.usage.prompt_tokens,
438
+ outputTokens: parsed.usage.completion_tokens,
439
+ totalTokens: parsed.usage.total_tokens,
440
+ cachedTokens: parsed.usage.prompt_tokens_details?.cached_tokens,
441
+ };
442
+ }
443
+ const delta = parsed.choices?.[0]?.delta;
444
+ if (!delta)
445
+ continue;
446
+ // Surface server-side reasoning deltas as thinking events.
447
+ const reasoningDelta = delta.reasoning ?? delta.reasoning_content;
448
+ if (reasoningDelta) {
449
+ reasoningBuffer += reasoningDelta;
450
+ yield { type: 'thinking', content: reasoningDelta };
164
451
  }
165
- else {
166
- if (tc.function?.arguments) {
167
- existing.function.arguments += tc.function.arguments;
452
+ if (delta.content) {
453
+ decoder.push(delta.content);
454
+ if (this.gemmaNative) {
455
+ while (decoderEvents.length)
456
+ yield decoderEvents.shift();
168
457
  }
169
- if (tc.function?.name) {
170
- existing.function.name += tc.function.name;
458
+ else {
459
+ yield { type: 'text', content: delta.content };
460
+ }
461
+ }
462
+ // Accumulate streamed tool calls
463
+ if (delta.tool_calls) {
464
+ for (const tc of delta.tool_calls) {
465
+ const existing = toolCallAccum.get(tc.index);
466
+ if (!existing) {
467
+ toolCallAccum.set(tc.index, {
468
+ id: tc.id || this.generateToolCallId(),
469
+ type: 'function',
470
+ function: {
471
+ name: tc.function?.name || '',
472
+ arguments: tc.function?.arguments || '',
473
+ },
474
+ });
475
+ }
476
+ else {
477
+ if (tc.function?.arguments) {
478
+ existing.function.arguments += tc.function.arguments;
479
+ }
480
+ if (tc.function?.name) {
481
+ existing.function.name += tc.function.name;
482
+ }
483
+ }
484
+ }
485
+ }
486
+ // Emit tool calls when stream finishes
487
+ if (parsed.choices?.[0]?.finish_reason === 'tool_calls' || parsed.choices?.[0]?.finish_reason === 'stop') {
488
+ if (toolCallAccum.size > 0) {
489
+ const calls = Array.from(toolCallAccum.values())
490
+ .map(tc => this.normalizeToolCall(tc));
491
+ yield { type: 'tool_call', calls };
171
492
  }
172
493
  }
173
494
  }
174
- }
175
- // Emit tool calls when stream finishes
176
- if (parsed.choices?.[0]?.finish_reason === 'tool_calls' || parsed.choices?.[0]?.finish_reason === 'stop') {
177
- if (toolCallAccum.size > 0) {
178
- const calls = Array.from(toolCallAccum.values());
179
- yield { type: 'tool_call', calls };
495
+ catch {
496
+ // Skip unparseable SSE data
180
497
  }
181
498
  }
499
+ break;
182
500
  }
183
- catch {
184
- // Skip unparseable SSE data
501
+ catch (error) {
502
+ if (!retriedWithTextTools
503
+ && tools?.length
504
+ && hasToolDefinitions(activeBody)
505
+ && isVllmAutoToolChoiceError(error)) {
506
+ this.warnVllmToolFallback();
507
+ activeBody = withoutNativeTools(activeBody, tools);
508
+ retriedWithTextTools = true;
509
+ continue;
510
+ }
511
+ throw error;
185
512
  }
186
513
  }
187
514
  decoder.flush();
515
+ if (this.gemmaNative) {
516
+ while (decoderEvents.length)
517
+ yield decoderEvents.shift();
518
+ }
519
+ // Augment usage with client-measured timing (vLLM streams no timing).
520
+ if (usage) {
521
+ const durationMs = Date.now() - start;
522
+ usage = {
523
+ ...usage,
524
+ durationMs,
525
+ tokensPerSecond: durationMs > 0
526
+ ? usage.outputTokens / (durationMs / 1000)
527
+ : undefined,
528
+ };
529
+ }
188
530
  this.auditor.record({
189
531
  timestamp: Date.now(),
190
532
  type: 'stream_end',
@@ -193,25 +535,75 @@ export class OpenAICompatibleClient extends BaseLLMClient {
193
535
  duration: Date.now() - start,
194
536
  usage,
195
537
  });
196
- const finalToolCalls = toolCallAccum.size > 0
197
- ? Array.from(toolCallAccum.values())
538
+ let finalToolCalls = toolCallAccum.size > 0
539
+ ? Array.from(toolCallAccum.values()).map(tc => this.normalizeToolCall(tc))
198
540
  : undefined;
541
+ let cleanContent = decoder.getCleanContent();
542
+ // Prefer the server's dedicated reasoning field; fall back to <think>
543
+ // tags parsed from the content stream by the decoder.
544
+ let reasoning = reasoningBuffer || decoder.getReasoning();
545
+ if (this.gemmaNative) {
546
+ // Native tool-call blocks live in the text channel; extract them.
547
+ const parsed = parseGemmaDiffusionOutput(cleanContent);
548
+ cleanContent = parsed.content;
549
+ if (parsed.reasoning) {
550
+ reasoning = reasoning ? `${reasoning}\n\n${parsed.reasoning}` : parsed.reasoning;
551
+ }
552
+ if (!finalToolCalls?.length && parsed.toolCalls.length) {
553
+ finalToolCalls = parsed.toolCalls.map(tc => ({
554
+ id: this.generateToolCallId(),
555
+ type: 'function',
556
+ function: { name: tc.name, arguments: tc.argumentsJson },
557
+ }));
558
+ yield { type: 'tool_call', calls: finalToolCalls };
559
+ }
560
+ }
561
+ if (!finalToolCalls?.length && tools?.length && cleanContent) {
562
+ const knownToolNames = new Set(tools.map(tool => tool.function.name));
563
+ const recovered = recoverToolCallsFromText(cleanContent, knownToolNames, () => this.generateToolCallId());
564
+ if (recovered) {
565
+ finalToolCalls = recovered.calls;
566
+ cleanContent = recovered.cleanContent;
567
+ yield { type: 'tool_call', calls: finalToolCalls };
568
+ }
569
+ }
199
570
  return {
200
571
  message: {
201
572
  role: 'assistant',
202
- content: decoder.getCleanContent(),
573
+ content: cleanContent,
203
574
  tool_calls: finalToolCalls,
204
575
  },
205
- reasoning: decoder.getReasoning(),
576
+ reasoning,
206
577
  usage,
207
578
  provider: 'openai',
208
579
  };
209
580
  }
581
+ normalizeToolCall(toolCall) {
582
+ return {
583
+ ...toolCall,
584
+ id: toolCall.id || this.generateToolCallId(),
585
+ type: 'function',
586
+ function: {
587
+ ...toolCall.function,
588
+ name: toolCall.function?.name || '',
589
+ arguments: this.normalizeToolArguments(toolCall.function?.arguments),
590
+ },
591
+ };
592
+ }
593
+ normalizeToolArguments(args) {
594
+ if (typeof args === 'string') {
595
+ return args.trim().length > 0 ? args : '{}';
596
+ }
597
+ if (args == null) {
598
+ return '{}';
599
+ }
600
+ return JSON.stringify(args) ?? '{}';
601
+ }
210
602
  // ========================================================================
211
603
  // Embeddings
212
604
  // ========================================================================
213
605
  async embed(text) {
214
- const url = `${this.options.url}/embeddings`;
606
+ const url = this.buildUrl('/embeddings');
215
607
  const response = await httpRequest(url, {
216
608
  method: 'POST',
217
609
  headers: buildHeaders(this.options),
@@ -227,7 +619,7 @@ export class OpenAICompatibleClient extends BaseLLMClient {
227
619
  // Model Discovery
228
620
  // ========================================================================
229
621
  async getModels() {
230
- const url = `${this.options.url}/models`;
622
+ const url = this.buildUrl('/models');
231
623
  try {
232
624
  const response = await httpRequest(url, {
233
625
  headers: buildHeaders(this.options),
@@ -243,12 +635,7 @@ export class OpenAICompatibleClient extends BaseLLMClient {
243
635
  // Internals
244
636
  // ========================================================================
245
637
  convertMessages(messages) {
246
- // OpenAI format is our canonical format, minimal conversion needed
247
- return messages.map(msg => ({
248
- ...msg,
249
- // Ensure content is never null/undefined
250
- content: msg.content ?? '',
251
- }));
638
+ return normalizeMessagesForOpenAICompat(messages);
252
639
  }
253
640
  buildRequestParams(options) {
254
641
  const params = {
@@ -259,6 +646,27 @@ export class OpenAICompatibleClient extends BaseLLMClient {
259
646
  params['temperature'] = options.temperature;
260
647
  if (options?.maxTokens !== undefined)
261
648
  params['max_tokens'] = options.maxTokens;
649
+ // Unified thinking flag. Per-call overrides model config; only emitted
650
+ // when explicitly set, so servers that reject unknown fields are
651
+ // unaffected by default. OpenAI reasoning models (o-series / GPT-5) use
652
+ // `reasoning_effort`; vLLM / Qwen use `chat_template_kwargs.enable_thinking`.
653
+ // A user-supplied value (via parameters) always wins.
654
+ const thinking = resolveThinking(options?.thinking, this.options.thinking);
655
+ if (thinking) {
656
+ const isOfficialOpenAI = (this.options.url ?? '').includes('api.openai.com');
657
+ if (isOpenAIReasoningModel(this.options.model)) {
658
+ if (params['reasoning_effort'] === undefined) {
659
+ params['reasoning_effort'] = thinking.enabled ? (thinking.level ?? 'medium') : 'minimal';
660
+ }
661
+ }
662
+ else if (!isOfficialOpenAI) {
663
+ // `chat_template_kwargs` is a vLLM/Qwen extension. Official OpenAI
664
+ // rejects unknown body fields (and gpt-4o has no thinking toggle),
665
+ // so only send it to self-hosted / compatible gateways.
666
+ const existing = params['chat_template_kwargs'] ?? {};
667
+ params['chat_template_kwargs'] = { enable_thinking: thinking.enabled, ...existing };
668
+ }
669
+ }
262
670
  return params;
263
671
  }
264
672
  // ========================================================================
@@ -300,4 +708,3 @@ export class OpenAICompatibleClient extends BaseLLMClient {
300
708
  };
301
709
  }
302
710
  }
303
- //# sourceMappingURL=openai.js.map