universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/CHANGELOG.md +142 -103
  2. package/LICENSE +21 -21
  3. package/README.md +640 -591
  4. package/dist/ai-model.d.ts +12 -1
  5. package/dist/ai-model.d.ts.map +1 -1
  6. package/dist/ai-model.js +36 -1
  7. package/dist/ai-model.js.map +1 -1
  8. package/dist/gemma-channel.d.ts +14 -0
  9. package/dist/gemma-channel.d.ts.map +1 -0
  10. package/dist/gemma-channel.js +38 -0
  11. package/dist/gemma-channel.js.map +1 -0
  12. package/dist/gemma-diffusion.d.ts +49 -0
  13. package/dist/gemma-diffusion.d.ts.map +1 -0
  14. package/dist/gemma-diffusion.js +147 -0
  15. package/dist/gemma-diffusion.js.map +1 -0
  16. package/dist/http.d.ts +4 -0
  17. package/dist/http.d.ts.map +1 -1
  18. package/dist/http.js +14 -1
  19. package/dist/http.js.map +1 -1
  20. package/dist/index.d.ts +2 -1
  21. package/dist/index.d.ts.map +1 -1
  22. package/dist/index.js +4 -0
  23. package/dist/index.js.map +1 -1
  24. package/dist/interfaces.d.ts +183 -7
  25. package/dist/interfaces.d.ts.map +1 -1
  26. package/dist/interfaces.js.map +1 -1
  27. package/dist/providers/anthropic.d.ts.map +1 -1
  28. package/dist/providers/anthropic.js +28 -3
  29. package/dist/providers/anthropic.js.map +1 -1
  30. package/dist/providers/google.d.ts +22 -1
  31. package/dist/providers/google.d.ts.map +1 -1
  32. package/dist/providers/google.js +225 -13
  33. package/dist/providers/google.js.map +1 -1
  34. package/dist/providers/ollama.d.ts +2 -0
  35. package/dist/providers/ollama.d.ts.map +1 -1
  36. package/dist/providers/ollama.js +59 -30
  37. package/dist/providers/ollama.js.map +1 -1
  38. package/dist/providers/openai.d.ts +14 -0
  39. package/dist/providers/openai.d.ts.map +1 -1
  40. package/dist/providers/openai.js +200 -22
  41. package/dist/providers/openai.js.map +1 -1
  42. package/dist/router.d.ts +2 -0
  43. package/dist/router.d.ts.map +1 -1
  44. package/dist/router.js +4 -0
  45. package/dist/router.js.map +1 -1
  46. package/dist/stream-decoder.d.ts +12 -0
  47. package/dist/stream-decoder.d.ts.map +1 -1
  48. package/dist/stream-decoder.js +182 -5
  49. package/dist/stream-decoder.js.map +1 -1
  50. package/dist/thinking.d.ts +36 -0
  51. package/dist/thinking.d.ts.map +1 -0
  52. package/dist/thinking.js +52 -0
  53. package/dist/thinking.js.map +1 -0
  54. package/package.json +118 -116
  55. package/src/ai-model.ts +400 -350
  56. package/src/auditor.ts +213 -213
  57. package/src/client.ts +402 -402
  58. package/src/debug/debug-google-streaming.ts +1 -1
  59. package/src/demos/basic/universal-llm-examples.ts +3 -3
  60. package/src/demos/diffusion-gemma/.env +29 -0
  61. package/src/demos/diffusion-gemma/.env.example +27 -0
  62. package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
  63. package/src/demos/diffusion-gemma/README.md +59 -0
  64. package/src/demos/diffusion-gemma/canvas.ts +1606 -0
  65. package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
  66. package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
  67. package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
  68. package/src/demos/diffusion-gemma/server.ts +1205 -0
  69. package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
  70. package/src/gemma-channel.ts +47 -0
  71. package/src/gemma-diffusion.ts +167 -0
  72. package/src/http.ts +261 -247
  73. package/src/index.ts +180 -161
  74. package/src/interfaces.ts +843 -657
  75. package/src/mcp.ts +345 -345
  76. package/src/providers/anthropic.ts +796 -762
  77. package/src/providers/google.ts +840 -620
  78. package/src/providers/index.ts +8 -8
  79. package/src/providers/ollama.ts +503 -469
  80. package/src/providers/openai.ts +587 -392
  81. package/src/router.ts +785 -780
  82. package/src/stream-decoder.ts +535 -361
  83. package/src/structured-output.ts +759 -759
  84. package/src/test-scripts/test-google-deep-research.ts +33 -0
  85. package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
  86. package/src/test-scripts/test-google-streaming.ts +1 -1
  87. package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
  88. package/src/test-scripts/test-google-thinking.ts +46 -0
  89. package/src/test-scripts/test-system-message-positions.ts +163 -163
  90. package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
  91. package/src/test-scripts/test-vllm-qwen36.ts +256 -0
  92. package/src/tests/ai-model.test.ts +1614 -1614
  93. package/src/tests/auditor.test.ts +224 -224
  94. package/src/tests/gemma-diffusion.test.ts +115 -0
  95. package/src/tests/http.test.ts +200 -200
  96. package/src/tests/interfaces.test.ts +117 -117
  97. package/src/tests/providers/anthropic.test.ts +118 -0
  98. package/src/tests/providers/google.test.ts +841 -660
  99. package/src/tests/providers/ollama.test.ts +1034 -954
  100. package/src/tests/providers/openai.test.ts +1511 -1122
  101. package/src/tests/router.test.ts +254 -254
  102. package/src/tests/stream-decoder.test.ts +263 -179
  103. package/src/tests/structured-output.test.ts +1450 -1450
  104. package/src/tests/thinking.test.ts +65 -0
  105. package/src/tests/tools.test.ts +175 -175
  106. package/src/thinking.ts +73 -0
  107. package/src/tools.ts +246 -246
  108. package/src/zod-adapter.ts +72 -72
@@ -1,392 +1,587 @@
1
- /**
2
- * Universal LLM Client v3 — OpenAI-Compatible Provider
3
- *
4
- * Implements BaseLLMClient for OpenAI-compatible APIs.
5
- * Works with: OpenAI, OpenRouter, LM Studio, LlamaCpp, vLLM, Groq, Together.
6
- */
7
-
8
- import { BaseLLMClient } from '../client.js';
9
- import { httpRequest, httpStream, parseSSE, buildHeaders } from '../http.js';
10
- import { StandardChatDecoder } from '../stream-decoder.js';
11
- import {
12
- normalizeJsonSchema,
13
- getJsonSchemaFromConfig,
14
- type JSONSchema,
15
- type StructuredOutputOptions,
16
- } from '../structured-output.js';
17
- import type {
18
- LLMClientOptions,
19
- LLMChatMessage,
20
- LLMChatResponse,
21
- ChatOptions,
22
- OpenAIResponse,
23
- OpenAIModelInfo,
24
- TokenUsageInfo,
25
- } from '../interfaces.js';
26
- import type { DecodedEvent } from '../stream-decoder.js';
27
- import type { Auditor } from '../auditor.js';
28
-
29
- export class OpenAICompatibleClient extends BaseLLMClient {
30
- constructor(options: LLMClientOptions, auditor?: Auditor) {
31
- // Ensure URL ends with /v1 for standard endpoints
32
- let url = (options.url || 'https://api.openai.com').replace(/\/+$/, '');
33
- if (!url.endsWith('/v1')) {
34
- url += '/v1';
35
- }
36
- super({ ...options, url }, auditor);
37
- }
38
-
39
- // ========================================================================
40
- // Chat
41
- // ========================================================================
42
-
43
- async chat(
44
- messages: LLMChatMessage[],
45
- options?: ChatOptions,
46
- ): Promise<LLMChatResponse> {
47
- // Structured output and tools can now be used together.\n // The provider sends both response_format and tools in the request.\n // The Router handles skipping validation when the response contains tool calls.
48
-
49
- const url = `${this.options.url}/chat/completions`;
50
- const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
51
-
52
- const body: Record<string, unknown> = {
53
- model: this.options.model,
54
- messages: this.convertMessages(messages),
55
- ...this.buildRequestParams(options),
56
- };
57
-
58
- // Handle structured output
59
- const schemaOptions = this.extractSchemaOptions(options);
60
- if (schemaOptions) {
61
- body['response_format'] = this.buildResponseFormat(schemaOptions);
62
- } else if (options?.responseFormat) {
63
- body['response_format'] = options.responseFormat;
64
- }
65
-
66
- if (tools?.length) {
67
- body['tools'] = tools;
68
- if (options?.toolChoice) {
69
- body['tool_choice'] = options.toolChoice;
70
- }
71
- }
72
-
73
- const start = Date.now();
74
- this.auditor.record({
75
- timestamp: start,
76
- type: 'request',
77
- provider: 'openai',
78
- model: this.options.model,
79
- });
80
-
81
- const response = await httpRequest<OpenAIResponse>(url, {
82
- method: 'POST',
83
- headers: buildHeaders(this.options),
84
- body,
85
- timeout: this.options.timeout ?? 30000,
86
- });
87
-
88
- const data = response.data;
89
- const choice = data.choices[0];
90
-
91
- if (!choice) {
92
- throw new Error('No choices returned from OpenAI API');
93
- }
94
-
95
- const usage: TokenUsageInfo | undefined = data.usage
96
- ? {
97
- inputTokens: data.usage.prompt_tokens,
98
- outputTokens: data.usage.completion_tokens,
99
- totalTokens: data.usage.total_tokens,
100
- cachedTokens: data.usage.prompt_tokens_details?.cached_tokens,
101
- }
102
- : undefined;
103
-
104
- // Normalize tool calls (ensure IDs exist)
105
- const toolCalls = choice.message.tool_calls?.map(tc => ({
106
- ...tc,
107
- id: tc.id || this.generateToolCallId(),
108
- }));
109
-
110
- // Get content, handling null case
111
- const content = choice.message.content || '';
112
-
113
- const result: LLMChatResponse = {
114
- message: {
115
- role: 'assistant',
116
- content,
117
- tool_calls: toolCalls,
118
- },
119
- usage,
120
- provider: 'openai',
121
- };
122
-
123
- this.auditor.record({
124
- timestamp: Date.now(),
125
- type: 'response',
126
- provider: 'openai',
127
- model: this.options.model,
128
- duration: Date.now() - start,
129
- usage,
130
- });
131
-
132
- return result;
133
- }
134
-
135
- // ========================================================================
136
- // Streaming
137
- // ========================================================================
138
-
139
- async *chatStream(
140
- messages: LLMChatMessage[],
141
- options?: ChatOptions,
142
- ): AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown> {
143
- const url = `${this.options.url}/chat/completions`;
144
- const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);
145
-
146
- const body: Record<string, unknown> = {
147
- model: this.options.model,
148
- messages: this.convertMessages(messages),
149
- stream: true,
150
- ...this.buildRequestParams(options),
151
- };
152
-
153
- if (tools?.length) {
154
- body['tools'] = tools;
155
- if (options?.toolChoice) {
156
- body['tool_choice'] = options.toolChoice;
157
- }
158
- }
159
-
160
- const start = Date.now();
161
- this.auditor.record({
162
- timestamp: start,
163
- type: 'stream_start',
164
- provider: 'openai',
165
- model: this.options.model,
166
- });
167
-
168
- const decoder = new StandardChatDecoder(() => {});
169
-
170
- // Track accumulated tool calls across chunks
171
- const toolCallAccum: Map<number, {
172
- id: string;
173
- type: 'function';
174
- function: { name: string; arguments: string };
175
- }> = new Map();
176
-
177
- const stream = httpStream(url, {
178
- method: 'POST',
179
- headers: buildHeaders(this.options),
180
- body,
181
- timeout: this.options.timeout ?? 120000,
182
- });
183
-
184
- let usage: TokenUsageInfo | undefined;
185
-
186
- for await (const { data } of parseSSE(stream)) {
187
- try {
188
- const parsed = JSON.parse(data) as {
189
- choices?: Array<{
190
- delta?: {
191
- content?: string;
192
- tool_calls?: Array<{
193
- index: number;
194
- id?: string;
195
- type?: string;
196
- function?: { name?: string; arguments?: string };
197
- }>;
198
- };
199
- finish_reason?: string;
200
- }>;
201
- usage?: {
202
- prompt_tokens: number;
203
- completion_tokens: number;
204
- total_tokens: number;
205
- prompt_tokens_details?: {
206
- cached_tokens?: number;
207
- };
208
- };
209
- };
210
-
211
- if (parsed.usage) {
212
- usage = {
213
- inputTokens: parsed.usage.prompt_tokens,
214
- outputTokens: parsed.usage.completion_tokens,
215
- totalTokens: parsed.usage.total_tokens,
216
- cachedTokens: parsed.usage.prompt_tokens_details?.cached_tokens,
217
- };
218
- }
219
-
220
- const delta = parsed.choices?.[0]?.delta;
221
- if (!delta) continue;
222
-
223
- if (delta.content) {
224
- decoder.push(delta.content);
225
- yield { type: 'text', content: delta.content };
226
- }
227
-
228
- // Accumulate streamed tool calls
229
- if (delta.tool_calls) {
230
- for (const tc of delta.tool_calls) {
231
- const existing = toolCallAccum.get(tc.index);
232
- if (!existing) {
233
- toolCallAccum.set(tc.index, {
234
- id: tc.id || this.generateToolCallId(),
235
- type: 'function',
236
- function: {
237
- name: tc.function?.name || '',
238
- arguments: tc.function?.arguments || '',
239
- },
240
- });
241
- } else {
242
- if (tc.function?.arguments) {
243
- existing.function.arguments += tc.function.arguments;
244
- }
245
- if (tc.function?.name) {
246
- existing.function.name += tc.function.name;
247
- }
248
- }
249
- }
250
- }
251
-
252
- // Emit tool calls when stream finishes
253
- if (parsed.choices?.[0]?.finish_reason === 'tool_calls' || parsed.choices?.[0]?.finish_reason === 'stop') {
254
- if (toolCallAccum.size > 0) {
255
- const calls = Array.from(toolCallAccum.values());
256
- yield { type: 'tool_call', calls };
257
- }
258
- }
259
- } catch {
260
- // Skip unparseable SSE data
261
- }
262
- }
263
-
264
- decoder.flush();
265
-
266
- this.auditor.record({
267
- timestamp: Date.now(),
268
- type: 'stream_end',
269
- provider: 'openai',
270
- model: this.options.model,
271
- duration: Date.now() - start,
272
- usage,
273
- });
274
-
275
- const finalToolCalls = toolCallAccum.size > 0
276
- ? Array.from(toolCallAccum.values())
277
- : undefined;
278
-
279
- return {
280
- message: {
281
- role: 'assistant',
282
- content: decoder.getCleanContent(),
283
- tool_calls: finalToolCalls,
284
- },
285
- reasoning: decoder.getReasoning(),
286
- usage,
287
- provider: 'openai',
288
- };
289
- }
290
-
291
- // ========================================================================
292
- // Embeddings
293
- // ========================================================================
294
-
295
- async embed(text: string): Promise<number[]> {
296
- const url = `${this.options.url}/embeddings`;
297
- const response = await httpRequest<{
298
- data: Array<{ embedding: number[] }>;
299
- }>(url, {
300
- method: 'POST',
301
- headers: buildHeaders(this.options),
302
- body: {
303
- model: this.options.model,
304
- input: text,
305
- },
306
- timeout: this.options.timeout ?? 30000,
307
- });
308
- return response.data.data[0]?.embedding ?? [];
309
- }
310
-
311
- // ========================================================================
312
- // Model Discovery
313
- // ========================================================================
314
-
315
- async getModels(): Promise<string[]> {
316
- const url = `${this.options.url}/models`;
317
- try {
318
- const response = await httpRequest<{
319
- data: OpenAIModelInfo[];
320
- }>(url, {
321
- headers: buildHeaders(this.options),
322
- timeout: 5000,
323
- });
324
- return response.data.data.map(m => m.id);
325
- } catch {
326
- return [];
327
- }
328
- }
329
-
330
- // ========================================================================
331
- // Internals
332
- // ========================================================================
333
-
334
- private convertMessages(messages: LLMChatMessage[]): LLMChatMessage[] {
335
- // OpenAI format is our canonical format, minimal conversion needed
336
- return messages.map(msg => ({
337
- ...msg,
338
- // Ensure content is never null/undefined
339
- content: msg.content ?? '',
340
- }));
341
- }
342
-
343
- private buildRequestParams(options?: ChatOptions): Record<string, unknown> {
344
- const params: Record<string, unknown> = {
345
- ...this.options.defaultParameters,
346
- ...options?.parameters,
347
- };
348
- if (options?.temperature !== undefined) params['temperature'] = options.temperature;
349
- if (options?.maxTokens !== undefined) params['max_tokens'] = options.maxTokens;
350
- return params;
351
- }
352
-
353
- // ========================================================================
354
- // Structured Output Helpers
355
- // ========================================================================
356
-
357
- /**
358
- * Build OpenAI response_format for structured output.
359
- */
360
- private buildResponseFormat(options: StructuredOutputOptions<unknown> & { strict?: boolean }): Record<string, unknown> {
361
- let jsonSchema: JSONSchema;
362
- let name: string;
363
- let description: string | undefined;
364
-
365
- // Prefer jsonSchema if provided (handles raw JSON Schema case)
366
- if (options.jsonSchema) {
367
- // Use raw JSON Schema
368
- jsonSchema = normalizeJsonSchema(options.jsonSchema);
369
- name = options.name || 'response';
370
- description = options.description;
371
- } else if (options.schemaConfig) {
372
- // Use SchemaConfig's embedded JSON Schema
373
- jsonSchema = getJsonSchemaFromConfig(options.schemaConfig);
374
- name = options.name || options.schemaConfig.name || 'response';
375
- description = options.description || options.schemaConfig.description;
376
- } else {
377
- // Should not happen - we check this in extractSchemaOptions
378
- throw new Error('Either schemaConfig or jsonSchema must be provided');
379
- }
380
-
381
- // OpenAI strict mode — configurable, defaults to true for reliable structured output
382
- return {
383
- type: 'json_schema',
384
- json_schema: {
385
- name,
386
- ...(description && { description }),
387
- schema: jsonSchema,
388
- strict: options.strict ?? true,
389
- },
390
- };
391
- }
392
- }
1
+ /**
2
+ * Universal LLM Client v3 — OpenAI-Compatible Provider
3
+ *
4
+ * Implements BaseLLMClient for OpenAI-compatible APIs.
5
+ * Works with: OpenAI, OpenRouter, LM Studio, LlamaCpp, vLLM, Groq, Together.
6
+ */
7
+
8
+ import { BaseLLMClient } from '../client.js';
9
+ import { resolveThinking, isOpenAIReasoningModel } from '../thinking.js';
10
+ import { httpRequest, httpStream, parseSSE, buildHeaders } from '../http.js';
11
+ import { StandardChatDecoder } from '../stream-decoder.js';
12
+ import {
13
+ normalizeJsonSchema,
14
+ getJsonSchemaFromConfig,
15
+ type JSONSchema,
16
+ type StructuredOutputOptions,
17
+ } from '../structured-output.js';
18
+ import type {
19
+ LLMClientOptions,
20
+ LLMChatMessage,
21
+ LLMChatResponse,
22
+ ChatOptions,
23
+ OpenAIResponse,
24
+ OpenAIModelInfo,
25
+ LLMToolCall,
26
+ TokenUsageInfo,
27
+ } from '../interfaces.js';
28
+ import type { DecodedEvent } from '../stream-decoder.js';
29
+ import type { Auditor } from '../auditor.js';
30
+ import { isGemmaDiffusionModel, parseGemmaDiffusionOutput } from '../gemma-diffusion.js';
31
+
32
+ export class OpenAICompatibleClient extends BaseLLMClient {
33
/**
 * Whether the Gemma native channel protocol is handled on the client side.
 *
 * DiffusionGemma on trimmed vLLM builds ships without a server-side
 * reasoning or tool-call parser, so the channel markers are parsed here
 * instead (see gemma-diffusion.ts). An explicit `gemmaNativeProtocol`
 * value in LLMClientOptions always wins; when it is unset the answer is
 * derived from the configured model name.
 */
private get gemmaNative(): boolean {
    const explicit = this.options.gemmaNativeProtocol;
    if (explicit != null) {
        return explicit;
    }
    return isGemmaDiffusionModel(this.options.model);
}
42
+
43
/**
 * Compose the full URL for an endpoint under the configured base URL.
 *
 * The base URL (this.options.url) already includes apiBasePath. Any query
 * string present on it is carried over and placed after the endpoint path,
 * and provider-level `queryParams` are merged on top (overwriting keys that
 * are already present; null/undefined values are skipped).
 *
 * @param suffix - Endpoint path, with or without a leading slash.
 * @returns The base path, endpoint path and merged query string.
 */
private buildUrl(suffix: string): string {
    const configured = this.options.url.replace(/\/+$/, '');

    // Detach the query string first so the endpoint path is inserted before
    // it (otherwise we would produce `host/v1?k=v/chat/completions`).
    const queryAt = configured.indexOf('?');
    let origin = configured;
    let carriedQuery = '';
    if (queryAt !== -1) {
        origin = configured.slice(0, queryAt);
        carriedQuery = configured.slice(queryAt + 1);
    }
    origin = origin.replace(/\/+$/, '');

    const endpoint = suffix.startsWith('/') ? suffix : `/${suffix}`;

    const params = new URLSearchParams(carriedQuery);
    const extra = this.options.queryParams;
    if (extra) {
        Object.entries(extra).forEach(([key, value]) => {
            if (value != null) params.set(key, String(value));
        });
    }

    const serialized = params.toString();
    if (!serialized) {
        return origin + endpoint;
    }
    return `${origin}${endpoint}?${serialized}`;
}
66
+
67
/**
 * Create a client for an OpenAI-compatible endpoint.
 *
 * The configured URL (default `https://api.openai.com`) has trailing slashes
 * removed and, unless disabled, the API base path appended when the URL
 * path does not already end with it.
 *
 * `apiBasePath` semantics:
 *  - unset: `/v1` is used (broad compatibility default);
 *  - `''` or `'/'`: nothing is appended — use this when the URL is already
 *    a complete path (e.g. a full Azure ".../deployments/my-model" URL) or
 *    for non-/v1 OpenAI-compatible servers;
 *  - anything else: normalized to exactly one leading slash and no trailing
 *    slash (`'v1'`, `'/v1'`, `'//v1'` and `'/v1/'` all become `'/v1'`).
 *
 * @param options - Client options; `url` and `apiBasePath` are read here.
 * @param auditor - Optional auditor forwarded to the base client.
 */
constructor(options: LLMClientOptions, auditor?: Auditor) {
    const configured = options.url || 'https://api.openai.com';

    // Work on the path portion only. A query string on the configured URL is
    // set aside and re-attached afterwards; appending the base path to the raw
    // string would otherwise yield `host/v1?k=v/v1` and corrupt the query.
    const queryAt = configured.indexOf('?');
    const query = queryAt === -1 ? '' : configured.slice(queryAt);
    let base = (queryAt === -1 ? configured : configured.slice(0, queryAt)).replace(/\/+$/, '');

    const desired = options.apiBasePath;
    const shouldAppend = desired !== '' && desired !== '/';

    if (shouldAppend) {
        // Normalize to exactly one leading slash and no trailing slash.
        const basePath = ('/' + (desired || '/v1').replace(/^\/+/, '')).replace(/\/+$/, '');
        if (!base.endsWith(basePath)) {
            base += basePath;
        }
    }

    super({ ...options, url: base + query }, auditor);
}
87
+
88
+ // ========================================================================
89
+ // Chat
90
+ // ========================================================================
91
+
92
/**
 * Send a non-streaming chat completion request and normalize the reply.
 *
 * The request body carries the model, the converted messages and the merged
 * request parameters, plus `response_format` (structured output) and
 * `tools` / `tool_choice` when applicable. In Gemma-native mode special
 * tokens are kept in the output and `tool_choice` is forced to `'none'`, and
 * reasoning / tool calls are then parsed out of the content on the client.
 *
 * @param messages - Conversation to send.
 * @param options - Per-call options (tools, tool choice, schema, parameters).
 * @returns The assistant message, optional reasoning text and token usage
 *          (including client-measured duration and tokens per second).
 * @throws Error when the response contains no choices.
 */
async chat(
    messages: LLMChatMessage[],
    options?: ChatOptions,
): Promise<LLMChatResponse> {
    // Structured output and tools can be used together: the provider sends
    // both response_format and tools in the request, and the Router skips
    // validation when the response contains tool calls.

    const url = this.buildUrl('/chat/completions');
    const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);

    const body: Record<string, unknown> = {
        model: this.options.model,
        messages: this.convertMessages(messages),
        ...this.buildRequestParams(options),
    };

    // Handle structured output
    const schemaOptions = this.extractSchemaOptions(options);
    if (schemaOptions) {
        body['response_format'] = this.buildResponseFormat(schemaOptions);
    } else if (options?.responseFormat) {
        body['response_format'] = options.responseFormat;
    }

    if (tools?.length) {
        body['tools'] = tools;
        if (options?.toolChoice) {
            body['tool_choice'] = options.toolChoice;
        }
    }

    if (this.gemmaNative) {
        // Markers must survive decoding for client-side parsing,
        // and request-level tool parsing is unavailable server-side.
        body['skip_special_tokens'] = false;
        if (tools?.length) body['tool_choice'] = 'none';
    }

    const start = Date.now();
    this.auditor.record({
        timestamp: start,
        type: 'request',
        provider: 'openai',
        model: this.options.model,
    });

    const response = await httpRequest<OpenAIResponse>(url, {
        method: 'POST',
        headers: buildHeaders(this.options),
        body,
        timeout: this.options.timeout ?? 30000,
    });

    const data = response.data;
    // Optional chaining: a body without `choices` (e.g. an error payload
    // delivered with a success status) must surface as the explicit error
    // below rather than as a TypeError on `undefined[0]`.
    const choice = data?.choices?.[0];

    if (!choice) {
        throw new Error('No choices returned from OpenAI API');
    }

    // vLLM / OpenAI-compatible `usage` carries no timing, so derive decode
    // throughput from the client-measured wall-clock duration.
    const durationMs = Date.now() - start;
    const usage: TokenUsageInfo | undefined = data.usage
        ? {
            inputTokens: data.usage.prompt_tokens,
            outputTokens: data.usage.completion_tokens,
            totalTokens: data.usage.total_tokens,
            cachedTokens: data.usage.prompt_tokens_details?.cached_tokens,
            durationMs,
            tokensPerSecond: durationMs > 0
                ? data.usage.completion_tokens / (durationMs / 1000)
                : undefined,
        }
        : undefined;

    // Normalize tool calls (ensure IDs and JSON-parseable empty args exist).
    let toolCalls = choice.message.tool_calls?.map(tc => this.normalizeToolCall(tc));

    // Get content, handling null case
    let content = choice.message.content || '';
    let reasoning: string | undefined;

    // Reasoning models served over the OpenAI-compatible API (vLLM
    // `--reasoning-parser`, DeepSeek-R1, etc.) return the chain-of-thought
    // in a dedicated field instead of inline <think> tags. vLLM uses
    // `reasoning_content`; some gateways use `reasoning`.
    // `||` (not `??`) so an empty `reasoning` string does not mask a
    // populated `reasoning_content`.
    const serverReasoning = choice.message.reasoning || choice.message.reasoning_content;
    if (typeof serverReasoning === 'string' && serverReasoning.length > 0) {
        reasoning = serverReasoning;
    }

    if (this.gemmaNative && content) {
        // Split the raw output into visible content, reasoning and tool calls.
        const parsed = parseGemmaDiffusionOutput(content);
        content = parsed.content;
        if (parsed.reasoning) reasoning = parsed.reasoning;
        // Server-provided tool calls take precedence over client-parsed ones.
        if (!toolCalls?.length && parsed.toolCalls.length) {
            toolCalls = parsed.toolCalls.map(tc => ({
                id: this.generateToolCallId(),
                type: 'function' as const,
                function: { name: tc.name, arguments: tc.argumentsJson },
            }));
        }
    }

    const result: LLMChatResponse = {
        message: {
            role: 'assistant',
            content,
            tool_calls: toolCalls,
        },
        ...(reasoning !== undefined && { reasoning }),
        usage,
        provider: 'openai',
    };

    this.auditor.record({
        timestamp: Date.now(),
        type: 'response',
        provider: 'openai',
        model: this.options.model,
        duration: Date.now() - start,
        usage,
    });

    return result;
}
218
+
219
+ // ========================================================================
220
+ // Streaming
221
+ // ========================================================================
222
+
223
/**
 * Stream a chat completion as decoded events.
 *
 * Yields `thinking` events for server-side reasoning deltas, `text` events
 * for content deltas (in Gemma-native mode the decoder's own classified
 * events are yielded instead of the raw deltas), and a `tool_call` event
 * once accumulated tool calls are complete. SSE payloads that fail to parse
 * are skipped.
 *
 * @param messages - Conversation to send.
 * @param options - Per-call options (tools, tool choice, parameters).
 * @returns The final assistant message with cleaned content, reasoning,
 *          tool calls and usage (augmented with client-measured timing).
 */
async *chatStream(
    messages: LLMChatMessage[],
    options?: ChatOptions,
): AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown> {
    const url = this.buildUrl('/chat/completions');
    const tools = options?.tools ?? (Object.keys(this.toolRegistry).length > 0 ? this.getToolDefinitions() : undefined);

    const body: Record<string, unknown> = {
        model: this.options.model,
        messages: this.convertMessages(messages),
        stream: true,
        ...this.buildRequestParams(options),
    };

    if (tools?.length) {
        body['tools'] = tools;
        if (options?.toolChoice) {
            body['tool_choice'] = options.toolChoice;
        }
    }

    if (this.gemmaNative) {
        // Keep channel markers in the output for client-side parsing and
        // disable request-level tool parsing.
        body['skip_special_tokens'] = false;
        if (tools?.length) body['tool_choice'] = 'none';
    }

    const start = Date.now();
    this.auditor.record({
        timestamp: start,
        type: 'stream_start',
        provider: 'openai',
        model: this.options.model,
    });

    // In gemma-native mode the decoder classifies thought-channel content,
    // so we yield ITS events (thinking vs text) instead of the raw deltas.
    const decoderEvents: DecodedEvent[] = [];
    const decoder = new StandardChatDecoder(
        this.gemmaNative ? e => decoderEvents.push(e) : () => {},
    );

    // Track accumulated tool calls across chunks
    const toolCallAccum: Map<number, {
        id: string;
        type: 'function';
        function: { name: string; arguments: string };
    }> = new Map();

    const stream = httpStream(url, {
        method: 'POST',
        headers: buildHeaders(this.options),
        body,
        timeout: this.options.timeout ?? 120000,
    });

    let usage: TokenUsageInfo | undefined;
    // Accumulates reasoning deltas from servers that stream a dedicated
    // `reasoning` / `reasoning_content` field (vLLM, DeepSeek-R1, etc.).
    let reasoningBuffer = '';

    for await (const { data } of parseSSE(stream)) {
        try {
            const parsed = JSON.parse(data) as {
                choices?: Array<{
                    delta?: {
                        content?: string;
                        // Reasoning-model chain-of-thought deltas (vLLM
                        // `--reasoning-parser`, DeepSeek-R1, etc.).
                        reasoning?: string;
                        reasoning_content?: string;
                        tool_calls?: Array<{
                            index: number;
                            id?: string;
                            type?: string;
                            function?: { name?: string; arguments?: string };
                        }>;
                    };
                    finish_reason?: string;
                }>;
                usage?: {
                    prompt_tokens: number;
                    completion_tokens: number;
                    total_tokens: number;
                    prompt_tokens_details?: {
                        cached_tokens?: number;
                    };
                };
            };

            if (parsed.usage) {
                usage = {
                    inputTokens: parsed.usage.prompt_tokens,
                    outputTokens: parsed.usage.completion_tokens,
                    totalTokens: parsed.usage.total_tokens,
                    cachedTokens: parsed.usage.prompt_tokens_details?.cached_tokens,
                };
            }

            const delta = parsed.choices?.[0]?.delta;
            if (!delta) continue;

            // Surface server-side reasoning deltas as thinking events.
            // `||` (not `??`) so an empty `reasoning` string does not mask a
            // populated `reasoning_content` in the same delta.
            const reasoningDelta = delta.reasoning || delta.reasoning_content;
            if (reasoningDelta) {
                reasoningBuffer += reasoningDelta;
                yield { type: 'thinking', content: reasoningDelta };
            }

            if (delta.content) {
                decoder.push(delta.content);
                if (this.gemmaNative) {
                    // Drain whatever the decoder emitted for this delta.
                    yield* decoderEvents.splice(0);
                } else {
                    yield { type: 'text', content: delta.content };
                }
            }

            // Accumulate streamed tool calls
            if (delta.tool_calls) {
                for (const tc of delta.tool_calls) {
                    const existing = toolCallAccum.get(tc.index);
                    if (!existing) {
                        toolCallAccum.set(tc.index, {
                            id: tc.id || this.generateToolCallId(),
                            type: 'function',
                            function: {
                                name: tc.function?.name || '',
                                arguments: tc.function?.arguments || '',
                            },
                        });
                    } else {
                        // Later chunks for the same index carry fragments to append.
                        if (tc.function?.arguments) {
                            existing.function.arguments += tc.function.arguments;
                        }
                        if (tc.function?.name) {
                            existing.function.name += tc.function.name;
                        }
                    }
                }
            }

            // Emit tool calls when stream finishes
            const finishReason = parsed.choices?.[0]?.finish_reason;
            if (finishReason === 'tool_calls' || finishReason === 'stop') {
                if (toolCallAccum.size > 0) {
                    const calls = Array.from(toolCallAccum.values())
                        .map(tc => this.normalizeToolCall(tc));
                    yield { type: 'tool_call', calls };
                }
            }
        } catch {
            // Skip unparseable SSE data
        }
    }

    decoder.flush();
    if (this.gemmaNative) {
        // Flushing may release buffered events; forward them too.
        yield* decoderEvents.splice(0);
    }

    // Augment usage with client-measured timing (vLLM streams no timing).
    if (usage) {
        const durationMs = Date.now() - start;
        usage = {
            ...usage,
            durationMs,
            tokensPerSecond: durationMs > 0
                ? usage.outputTokens / (durationMs / 1000)
                : undefined,
        };
    }

    this.auditor.record({
        timestamp: Date.now(),
        type: 'stream_end',
        provider: 'openai',
        model: this.options.model,
        duration: Date.now() - start,
        usage,
    });

    let finalToolCalls = toolCallAccum.size > 0
        ? Array.from(toolCallAccum.values()).map(tc => this.normalizeToolCall(tc))
        : undefined;
    let cleanContent = decoder.getCleanContent();
    // Prefer the server's dedicated reasoning field; fall back to <think>
    // tags parsed from the content stream by the decoder.
    let reasoning = reasoningBuffer || decoder.getReasoning();

    if (this.gemmaNative) {
        // Native tool-call blocks live in the text channel; extract them.
        const parsed = parseGemmaDiffusionOutput(cleanContent);
        cleanContent = parsed.content;
        if (parsed.reasoning) {
            reasoning = reasoning ? `${reasoning}\n\n${parsed.reasoning}` : parsed.reasoning;
        }
        // Only used when the server streamed no structured tool calls.
        if (!finalToolCalls?.length && parsed.toolCalls.length) {
            finalToolCalls = parsed.toolCalls.map(tc => ({
                id: this.generateToolCallId(),
                type: 'function' as const,
                function: { name: tc.name, arguments: tc.argumentsJson },
            }));
            yield { type: 'tool_call', calls: finalToolCalls };
        }
    }

    return {
        message: {
            role: 'assistant',
            content: cleanContent,
            tool_calls: finalToolCalls,
        },
        reasoning,
        usage,
        provider: 'openai',
    };
}
439
+
440
/**
 * Turn a possibly partial tool call into a complete `LLMToolCall`.
 *
 * Properties already present on the input are carried over. A missing or
 * empty `id` is replaced with a freshly generated one, `type` is always
 * set to `'function'`, a missing function name becomes `''`, and the
 * arguments are passed through `normalizeToolArguments`.
 *
 * @param toolCall - Tool call whose fields may be partially populated.
 * @returns The tool call with `id`, `type` and `function` filled in.
 */
private normalizeToolCall(
    toolCall: Partial<LLMToolCall> & { function?: Partial<LLMToolCall['function']> },
): LLMToolCall {
    const fn = toolCall.function;
    const id = toolCall.id || this.generateToolCallId();
    const normalizedFunction = {
        ...fn,
        name: fn?.name || '',
        arguments: this.normalizeToolArguments(fn?.arguments),
    };

    // Explicit fields come after the spread so they override the input's own values.
    return {
        ...toolCall,
        id,
        type: 'function',
        function: normalizedFunction,
    };
}
454
+
455
/**
 * Coerce tool-call arguments of any shape into a string.
 *
 * - `null` / `undefined` become `'{}'`.
 * - A string is returned unchanged unless it is empty or whitespace-only,
 *   in which case `'{}'` is returned.
 * - Any other value is serialised with `JSON.stringify`; if that yields
 *   `undefined`, `'{}'` is returned.
 *
 * @param args - Raw arguments value.
 * @returns The arguments as a string, `'{}'` when nothing usable is present.
 */
private normalizeToolArguments(args: unknown): string {
    const emptyObject = '{}';

    if (args == null) {
        return emptyObject;
    }
    if (typeof args !== 'string') {
        return JSON.stringify(args) ?? emptyObject;
    }
    // Only blank strings are replaced; non-blank strings are returned untrimmed.
    return args.trim().length === 0 ? emptyObject : args;
}
464
+
465
+ // ========================================================================
466
+ // Embeddings
467
+ // ========================================================================
468
+
469
/**
 * Request an embedding for `text` from the `/embeddings` endpoint.
 *
 * Sends a POST with the configured model and the text as `input`, using
 * the configured timeout (30000 when none is set).
 *
 * @param text - Text to embed.
 * @returns The first embedding in the response, or `[]` when the response
 *          contains no entries.
 */
async embed(text: string): Promise<number[]> {
    type EmbeddingsPayload = {
        data: Array<{ embedding: number[] }>;
    };

    const url = this.buildUrl('/embeddings');
    const requestBody = {
        model: this.options.model,
        input: text,
    };
    const { data: payload } = await httpRequest<EmbeddingsPayload>(url, {
        method: 'POST',
        headers: buildHeaders(this.options),
        body: requestBody,
        timeout: this.options.timeout ?? 30000,
    });

    const firstEntry = payload.data[0];
    return firstEntry?.embedding ?? [];
}
484
+
485
+ // ========================================================================
486
+ // Model Discovery
487
+ // ========================================================================
488
+
489
/**
 * List the model ids reported by the `/models` endpoint.
 *
 * Best-effort: any failure while requesting or reading the response is
 * swallowed and reported as an empty list. The request uses a fixed
 * 5000 timeout.
 *
 * @returns The `id` of every model in the response, or `[]` on failure.
 */
async getModels(): Promise<string[]> {
    const url = this.buildUrl('/models');
    try {
        const { data: listing } = await httpRequest<{ data: OpenAIModelInfo[] }>(url, {
            headers: buildHeaders(this.options),
            timeout: 5000,
        });
        return listing.data.map(({ id }) => id);
    } catch {
        // Discovery is optional; callers get an empty list instead of an error.
        return [];
    }
}
503
+
504
+ // ========================================================================
505
+ // Internals
506
+ // ========================================================================
507
+
508
/**
 * Prepare chat messages for the request body.
 *
 * Each message is shallow-copied unchanged except that a `null` or
 * `undefined` `content` is replaced with an empty string.
 *
 * @param messages - Messages to convert; the input objects are not mutated.
 * @returns A new array of shallow-copied messages.
 */
private convertMessages(messages: LLMChatMessage[]): LLMChatMessage[] {
    const withDefinedContent = (message: LLMChatMessage): LLMChatMessage => ({
        ...message,
        content: message.content ?? '',
    });
    return messages.map(withDefinedContent);
}
516
+
517
/**
 * Assemble the extra request-body parameters for a chat call.
 *
 * Precedence, lowest to highest: `defaultParameters` from the provider
 * options, per-call `parameters`, then the explicit `temperature` /
 * `maxTokens` options (written as `temperature` / `max_tokens`).
 *
 * When a thinking configuration resolves (per-call or provider-level), it
 * is translated as follows:
 * - model matched by `isOpenAIReasoningModel`: `reasoning_effort` is set,
 *   unless the caller already supplied one;
 * - otherwise, when the configured URL does not contain `api.openai.com`:
 *   `chat_template_kwargs.enable_thinking` is set, with caller-supplied
 *   `chat_template_kwargs` entries taking priority;
 * - otherwise nothing is added.
 *
 * @param options - Per-call chat options, if any.
 * @returns The parameter object to merge into the request body.
 */
private buildRequestParams(options?: ChatOptions): Record<string, unknown> {
    const params: Record<string, unknown> = {
        ...this.options.defaultParameters,
        ...options?.parameters,
    };

    const temperature = options?.temperature;
    if (temperature !== undefined) {
        params['temperature'] = temperature;
    }
    const maxTokens = options?.maxTokens;
    if (maxTokens !== undefined) {
        params['max_tokens'] = maxTokens;
    }

    // Nothing thinking-related is emitted unless a thinking config resolves,
    // so request bodies are unchanged by default.
    const thinking = resolveThinking(options?.thinking, this.options.thinking);
    if (!thinking) {
        return params;
    }

    const targetsOfficialApi = (this.options.url ?? '').includes('api.openai.com');

    if (isOpenAIReasoningModel(this.options.model)) {
        // A caller-supplied reasoning_effort always wins.
        if (params['reasoning_effort'] === undefined) {
            // NOTE(review): 'minimal' may not be accepted by every model this
            // branch matches — confirm against the provider's documentation.
            params['reasoning_effort'] = !thinking.enabled
                ? 'minimal'
                : (thinking.level ?? 'medium');
        }
        return params;
    }

    if (!targetsOfficialApi) {
        // `chat_template_kwargs` is only sent when the configured URL is not
        // the official OpenAI endpoint.
        const callerKwargs =
            (params['chat_template_kwargs'] as Record<string, unknown> | undefined) ?? {};
        // Caller entries are spread last so they override enable_thinking.
        params['chat_template_kwargs'] = {
            enable_thinking: thinking.enabled,
            ...callerKwargs,
        };
    }

    return params;
}
547
+
548
+ // ========================================================================
549
+ // Structured Output Helpers
550
+ // ========================================================================
551
+
552
/**
 * Build the `response_format` payload for a structured-output request.
 *
 * The schema comes from `options.jsonSchema` when present (passed through
 * `normalizeJsonSchema`), otherwise from `options.schemaConfig` (via
 * `getJsonSchemaFromConfig`). The name falls back from `options.name` to
 * the schema config's name (config case only) to `'response'`; the
 * description falls back from `options.description` to the schema
 * config's description and is omitted when empty. `strict` defaults to
 * `true`.
 *
 * @param options - Structured-output options, optionally with `strict`.
 * @returns An object of the form `{ type: 'json_schema', json_schema: {...} }`.
 * @throws Error when neither `jsonSchema` nor `schemaConfig` is provided.
 */
private buildResponseFormat(options: StructuredOutputOptions<unknown> & { strict?: boolean }): Record<string, unknown> {
    const { jsonSchema: rawSchema, schemaConfig } = options;

    let schema: JSONSchema;
    let name = options.name;
    let description = options.description;

    if (rawSchema) {
        // A raw JSON Schema takes priority over a schema config.
        schema = normalizeJsonSchema(rawSchema);
    } else if (schemaConfig) {
        schema = getJsonSchemaFromConfig(schemaConfig);
        name = name || schemaConfig.name;
        description = description || schemaConfig.description;
    } else {
        throw new Error('Either schemaConfig or jsonSchema must be provided');
    }

    // Keys are added in a fixed order: name, description (optional), schema, strict.
    const jsonSchemaSpec: Record<string, unknown> = { name: name || 'response' };
    if (description) {
        jsonSchemaSpec['description'] = description;
    }
    jsonSchemaSpec['schema'] = schema;
    jsonSchemaSpec['strict'] = options.strict ?? true;

    return {
        type: 'json_schema',
        json_schema: jsonSchemaSpec,
    };
}
587
+ }