universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/CHANGELOG.md +142 -103
  2. package/LICENSE +21 -21
  3. package/README.md +640 -591
  4. package/dist/ai-model.d.ts +12 -1
  5. package/dist/ai-model.d.ts.map +1 -1
  6. package/dist/ai-model.js +36 -1
  7. package/dist/ai-model.js.map +1 -1
  8. package/dist/gemma-channel.d.ts +14 -0
  9. package/dist/gemma-channel.d.ts.map +1 -0
  10. package/dist/gemma-channel.js +38 -0
  11. package/dist/gemma-channel.js.map +1 -0
  12. package/dist/gemma-diffusion.d.ts +49 -0
  13. package/dist/gemma-diffusion.d.ts.map +1 -0
  14. package/dist/gemma-diffusion.js +147 -0
  15. package/dist/gemma-diffusion.js.map +1 -0
  16. package/dist/http.d.ts +4 -0
  17. package/dist/http.d.ts.map +1 -1
  18. package/dist/http.js +14 -1
  19. package/dist/http.js.map +1 -1
  20. package/dist/index.d.ts +2 -1
  21. package/dist/index.d.ts.map +1 -1
  22. package/dist/index.js +4 -0
  23. package/dist/index.js.map +1 -1
  24. package/dist/interfaces.d.ts +183 -7
  25. package/dist/interfaces.d.ts.map +1 -1
  26. package/dist/interfaces.js.map +1 -1
  27. package/dist/providers/anthropic.d.ts.map +1 -1
  28. package/dist/providers/anthropic.js +28 -3
  29. package/dist/providers/anthropic.js.map +1 -1
  30. package/dist/providers/google.d.ts +22 -1
  31. package/dist/providers/google.d.ts.map +1 -1
  32. package/dist/providers/google.js +225 -13
  33. package/dist/providers/google.js.map +1 -1
  34. package/dist/providers/ollama.d.ts +2 -0
  35. package/dist/providers/ollama.d.ts.map +1 -1
  36. package/dist/providers/ollama.js +59 -30
  37. package/dist/providers/ollama.js.map +1 -1
  38. package/dist/providers/openai.d.ts +14 -0
  39. package/dist/providers/openai.d.ts.map +1 -1
  40. package/dist/providers/openai.js +200 -22
  41. package/dist/providers/openai.js.map +1 -1
  42. package/dist/router.d.ts +2 -0
  43. package/dist/router.d.ts.map +1 -1
  44. package/dist/router.js +4 -0
  45. package/dist/router.js.map +1 -1
  46. package/dist/stream-decoder.d.ts +12 -0
  47. package/dist/stream-decoder.d.ts.map +1 -1
  48. package/dist/stream-decoder.js +182 -5
  49. package/dist/stream-decoder.js.map +1 -1
  50. package/dist/thinking.d.ts +36 -0
  51. package/dist/thinking.d.ts.map +1 -0
  52. package/dist/thinking.js +52 -0
  53. package/dist/thinking.js.map +1 -0
  54. package/package.json +118 -116
  55. package/src/ai-model.ts +400 -350
  56. package/src/auditor.ts +213 -213
  57. package/src/client.ts +402 -402
  58. package/src/debug/debug-google-streaming.ts +1 -1
  59. package/src/demos/basic/universal-llm-examples.ts +3 -3
  60. package/src/demos/diffusion-gemma/.env +29 -0
  61. package/src/demos/diffusion-gemma/.env.example +27 -0
  62. package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
  63. package/src/demos/diffusion-gemma/README.md +59 -0
  64. package/src/demos/diffusion-gemma/canvas.ts +1606 -0
  65. package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
  66. package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
  67. package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
  68. package/src/demos/diffusion-gemma/server.ts +1205 -0
  69. package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
  70. package/src/gemma-channel.ts +47 -0
  71. package/src/gemma-diffusion.ts +167 -0
  72. package/src/http.ts +261 -247
  73. package/src/index.ts +180 -161
  74. package/src/interfaces.ts +843 -657
  75. package/src/mcp.ts +345 -345
  76. package/src/providers/anthropic.ts +796 -762
  77. package/src/providers/google.ts +840 -620
  78. package/src/providers/index.ts +8 -8
  79. package/src/providers/ollama.ts +503 -469
  80. package/src/providers/openai.ts +587 -392
  81. package/src/router.ts +785 -780
  82. package/src/stream-decoder.ts +535 -361
  83. package/src/structured-output.ts +759 -759
  84. package/src/test-scripts/test-google-deep-research.ts +33 -0
  85. package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
  86. package/src/test-scripts/test-google-streaming.ts +1 -1
  87. package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
  88. package/src/test-scripts/test-google-thinking.ts +46 -0
  89. package/src/test-scripts/test-system-message-positions.ts +163 -163
  90. package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
  91. package/src/test-scripts/test-vllm-qwen36.ts +256 -0
  92. package/src/tests/ai-model.test.ts +1614 -1614
  93. package/src/tests/auditor.test.ts +224 -224
  94. package/src/tests/gemma-diffusion.test.ts +115 -0
  95. package/src/tests/http.test.ts +200 -200
  96. package/src/tests/interfaces.test.ts +117 -117
  97. package/src/tests/providers/anthropic.test.ts +118 -0
  98. package/src/tests/providers/google.test.ts +841 -660
  99. package/src/tests/providers/ollama.test.ts +1034 -954
  100. package/src/tests/providers/openai.test.ts +1511 -1122
  101. package/src/tests/router.test.ts +254 -254
  102. package/src/tests/stream-decoder.test.ts +263 -179
  103. package/src/tests/structured-output.test.ts +1450 -1450
  104. package/src/tests/thinking.test.ts +65 -0
  105. package/src/tests/tools.test.ts +175 -175
  106. package/src/thinking.ts +73 -0
  107. package/src/tools.ts +246 -246
  108. package/src/zod-adapter.ts +72 -72
@@ -0,0 +1,256 @@
1
+ /**
2
+ * vLLM + Qwen3.6 (NVFP4) compatibility test for universal-llm-client.
3
+ *
4
+ * Exercises the OpenAI-compatible provider against a local vLLM server running
5
+ * nvidia/Qwen3.6-35B-A3B-NVFP4, with special attention to REASONING handling.
6
+ *
7
+ * Run (server must be up on :8000):
8
+ * bun run src/test-scripts/test-vllm-qwen36.ts
9
+ *
10
+ * Env overrides:
11
+ * VLLM_URL (default http://localhost:8000)
12
+ * VLLM_MODEL (default qwen3.6-nvfp4)
13
+ */
14
+
15
+ import { AIModel } from '../index.js';
16
+ import type { DecodedEvent } from '../stream-decoder.js';
17
+ import type { LLMChatResponse } from '../interfaces.js';
18
+
/**
 * Reads an environment variable, falling back when it is unset or blank.
 *
 * A variable exported as an empty string (`VLLM_URL=`) is treated as unset so
 * that the default applies instead of producing an unusable empty value.
 *
 * @param name Environment variable name.
 * @param fallback Value used when the variable is missing or whitespace-only.
 * @returns The trimmed variable value, or `fallback`.
 */
function envOr(name: string, fallback: string): string {
    const value = process.env[name]?.trim();
    return value ? value : fallback;
}

/** Base URL of the vLLM server; trailing slashes are removed so `${URL}/v1/...` never contains `//`. */
const URL = envOr('VLLM_URL', 'http://localhost:8000').replace(/\/+$/, '');
/** Name of the served model to request. */
const MODEL = envOr('VLLM_MODEL', 'qwen3.6-nvfp4');
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // tiny test harness
24
+ // ---------------------------------------------------------------------------
/** Outcome of a single check. */
type Status = 'PASS' | 'FAIL' | 'PARTIAL';

/** Every outcome recorded so far, in the order the checks ran. */
const results: { name: string; status: Status; note: string }[] = [];

/**
 * Stores one check outcome in `results` and prints it immediately.
 *
 * @param name Human-readable check name.
 * @param status Outcome of the check.
 * @param note Optional detail printed on its own line under the headline.
 */
function record(name: string, status: Status, note = '') {
    results.push({ name, status, note });

    let icon: string;
    switch (status) {
        case 'PASS':
            icon = '✅';
            break;
        case 'PARTIAL':
            icon = '🟡';
            break;
        default:
            icon = '❌';
    }

    const detail = note ? `\n ${note}` : '';
    console.log(`\n${icon} ${name} — ${status}${detail}`);
}
/**
 * Prints a section banner: a blank line, then the title framed above and
 * below by a 70-character horizontal rule.
 *
 * @param title Text shown between the two rules.
 */
function section(title: string) {
    const rule = '━'.repeat(70);
    console.log(['', rule, title, rule].join('\n'));
}
35
+
/**
 * Runs a chatStream generator to completion.
 *
 * @param gen Generator to exhaust.
 * @returns Every yielded event in order, plus the value the generator
 *          returned when it finished (which a `for await` loop would discard).
 */
async function drainStream(
    gen: AsyncGenerator<DecodedEvent, LLMChatResponse | void, unknown>,
): Promise<{ events: DecodedEvent[]; result: LLMChatResponse | void }> {
    const events: DecodedEvent[] = [];

    let step = await gen.next();
    while (!step.done) {
        events.push(step.value);
        step = await gen.next();
    }

    // Once `done` is set, `value` carries the generator's return value.
    return { events, result: step.value as LLMChatResponse | void };
}
49
+
/**
 * Raw OpenAI-style call straight to vLLM — ground truth for what the server emits.
 *
 * @param body Request fields merged over `{ model: MODEL }`; a `model` key in
 *             `body` overrides the default.
 * @returns The parsed JSON response body.
 * @throws Error when the server replies with a non-2xx status. Without this
 *         check an error payload would be returned as if it were a completion
 *         and silently read as "no reasoning, no content" by the caller.
 */
async function rawChat(body: Record<string, unknown>): Promise<any> {
    const res = await fetch(`${URL}/v1/chat/completions`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ model: MODEL, ...body }),
    });

    if (!res.ok) {
        // Best effort: the body usually explains the failure, but it may be unreadable.
        const detail = await res.text().catch(() => '');
        const suffix = detail ? `: ${detail.slice(0, 200)}` : '';
        throw new Error(`vLLM returned HTTP ${res.status} ${res.statusText}${suffix}`);
    }

    return res.json();
}
59
+
/** Multi-step word problem sent to the model by the reasoning check. */
const REASON_PROMPT = [
    'A farmer has 17 sheep.',
    'All but 9 run away.',
    'Then he buys 5 more.',
    'How many sheep does he have?',
    'Think it through, then give the number.',
].join(' ');
62
+
63
+ // ---------------------------------------------------------------------------
64
+
/**
 * Runs the compatibility checks in order against the configured vLLM server:
 * model discovery, basic chat, streaming, reasoning exposure, tool calling and
 * structured output. Each check records its own outcome via `record`; a failure
 * in one check does not stop the following ones, except that an unreachable
 * server aborts the run after check 1.
 *
 * The model is disposed and the summary printed on every exit path, including
 * the early abort.
 */
async function main() {
    console.log(`vLLM compatibility test\n url = ${URL}\n model = ${MODEL}`);

    // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
    const reason = (e: unknown): string => (e instanceof Error ? e.message : String(e));

    const model = new AIModel({
        model: MODEL,
        // Model-level thinking intent.
        // NOTE(review): check 6 relies on the per-call `thinking` flag being forwarded by
        // the openai provider; confirm whether this model-level flag is forwarded as well.
        thinking: true,
        timeout: 120_000,
        providers: [{ type: 'openai', url: URL, apiKey: 'EMPTY' }],
    });

    try {
        // ----- 1. Connectivity / model discovery --------------------------------
        section('1. Connectivity & model discovery');
        try {
            const models = await model.getModels();
            console.log(' /v1/models ->', models);
            if (models.includes(MODEL)) record('Model discovery', 'PASS', `served model "${MODEL}" is listed`);
            else record('Model discovery', 'PARTIAL', `server reachable but "${MODEL}" not in ${JSON.stringify(models)}`);
        } catch (e) {
            record('Model discovery', 'FAIL', `cannot reach server: ${reason(e)}`);
            console.log('\nAborting — server unreachable.');
            // The `finally` below still disposes the model and prints the summary.
            return;
        }

        // ----- 2. Basic chat (non-streaming) ------------------------------------
        section('2. Basic chat (non-streaming)');
        try {
            const r = await model.chat(
                [{ role: 'user', content: 'In one short sentence, what is the capital of Japan?' }],
                { temperature: 0, maxTokens: 256 },
            );
            const content = r.message.content?.trim() ?? '';
            console.log(' content :', JSON.stringify(content));
            console.log(' usage :', JSON.stringify(r.usage));
            if (r.usage?.tokensPerSecond) {
                console.log(` stats : ${r.usage.tokensPerSecond.toFixed(1)} tok/s over ${r.usage.durationMs}ms wall-clock`);
            }
            if (content.toLowerCase().includes('tokyo')) record('Basic chat', 'PASS', 'correct, clean answer (reasoning stripped server-side)');
            else if (content.length > 0) record('Basic chat', 'PARTIAL', 'got content but expected "Tokyo"');
            else record('Basic chat', 'FAIL', 'empty content (model spent budget reasoning — see reasoning section)');
        } catch (e) {
            record('Basic chat', 'FAIL', reason(e));
        }

        // ----- 3. Streaming -----------------------------------------------------
        section('3. Streaming (chatStream)');
        try {
            const { events, result } = await drainStream(
                model.chatStream(
                    [{ role: 'user', content: 'List three colors, comma separated.' }],
                    // Generous budget: Qwen3.6 thinks first, so a small cap is spent
                    // entirely on reasoning before any answer tokens are produced.
                    { temperature: 0, maxTokens: 1024 },
                ),
            );
            const textEvents = events.filter(e => e.type === 'text').length;
            const thinkingEvents = events.filter(e => e.type === 'thinking').length;
            const finalContent = (result && 'message' in result ? result.message.content : '')?.trim() ?? '';
            console.log(` events: ${events.length} (text=${textEvents}, thinking=${thinkingEvents})`);
            console.log(' final content:', JSON.stringify(finalContent));
            if (textEvents > 0 && finalContent.length > 0) record('Streaming', 'PASS', `${textEvents} text deltas streamed, final content assembled`);
            else record('Streaming', 'PARTIAL', 'stream completed but content was empty');
        } catch (e) {
            record('Streaming', 'FAIL', reason(e));
        }

        // ----- 4. REASONING (the focus) -----------------------------------------
        section('4. Reasoning exposure (Qwen3.6 thinking)');
        try {
            // 4a. Ground truth: what does vLLM actually send?
            const raw = await rawChat({
                messages: [{ role: 'user', content: REASON_PROMPT }],
                max_tokens: 800,
                temperature: 0,
            });
            const rawMsg = raw?.choices?.[0]?.message ?? {};
            const serverReasoning: string = rawMsg.reasoning ?? rawMsg.reasoning_content ?? '';
            const serverContent: string = rawMsg.content ?? '';
            console.log(` [server raw] reasoning field: ${serverReasoning.length} chars; content field: ${serverContent.length} chars`);
            if (serverReasoning) console.log(` [server raw] reasoning preview: ${JSON.stringify(serverReasoning.slice(0, 120))}…`);
            console.log(` [server raw] content : ${JSON.stringify(serverContent.slice(0, 120))}`);

            // 4b. What the client surfaces (non-streaming)
            const r = await model.chat([{ role: 'user', content: REASON_PROMPT }], { temperature: 0, maxTokens: 800 });
            const clientReasoning = r.reasoning ?? '';
            const clientContent = r.message.content ?? '';
            console.log(` [client chat] .reasoning: ${clientReasoning.length} chars; .content: ${clientContent.length} chars`);

            // 4c. What the client surfaces (streaming — uses StandardChatDecoder <think> parser)
            const { result } = await drainStream(model.chatStream([{ role: 'user', content: REASON_PROMPT }], { temperature: 0, maxTokens: 800 }));
            const streamReasoning = (result && 'reasoning' in result ? result.reasoning : '') ?? '';
            console.log(` [client stream] .reasoning: ${streamReasoning.length} chars`);

            // More than 50 chars counts as a real trace rather than a stray token or two.
            const serverHasReasoning = serverReasoning.length > 50;
            const clientExposes = clientReasoning.length > 0 || streamReasoning.length > 0;
            const contentClean = !clientContent.includes('<think>');

            if (serverHasReasoning && clientExposes) {
                record('Reasoning exposure', 'PASS', 'client surfaces the reasoning trace via .reasoning');
            } else if (serverHasReasoning && !clientExposes && contentClean) {
                record('Reasoning exposure', 'PARTIAL',
                    'Server emits reasoning in a separate `reasoning` field; client returns CLEAN answers but does NOT expose the trace ' +
                    '(the openai provider reads `content`/`delta.content` only, never `reasoning`/`reasoning_content`). ' +
                    'Fix: read `message.reasoning`/`delta.reasoning` in providers/openai.ts.');
            } else if (!contentClean) {
                record('Reasoning exposure', 'PARTIAL', 'Reasoning leaks into content as <think> tags (run server WITHOUT --reasoning-parser, then streaming separates it).');
            } else {
                record('Reasoning exposure', 'FAIL', 'No reasoning surfaced anywhere.');
            }
        } catch (e) {
            record('Reasoning exposure', 'FAIL', reason(e));
        }

        // ----- 5. Tool calling --------------------------------------------------
        section('5. Tool calling (chatWithTools)');
        try {
            // Flipped by the handler so we can tell whether the model actually invoked the tool.
            let toolHit = false;
            model.registerTool(
                'multiply',
                'Multiply two integers and return the product',
                { type: 'object', properties: { a: { type: 'number' }, b: { type: 'number' } }, required: ['a', 'b'] },
                async (args: any) => { toolHit = true; return { product: args.a * args.b }; },
            );
            const r = await model.chatWithTools(
                [{ role: 'user', content: 'Use the multiply tool to compute 17 times 23, then state the result.' }],
                { temperature: 0, maxTokens: 1024, maxIterations: 3 },
            );
            const trace = r.toolExecutions ?? [];
            const content = r.message.content ?? '';
            console.log(' toolExecutions:', JSON.stringify(trace));
            console.log(' content :', JSON.stringify(content.slice(0, 160)));
            if (toolHit && content.includes('391')) record('Tool calling', 'PASS', `tool executed (${trace.length} trace entr${trace.length === 1 ? 'y' : 'ies'}), answer 391 returned`);
            else if (toolHit) record('Tool calling', 'PARTIAL', 'tool fired but final answer missing 391');
            else record('Tool calling', 'PARTIAL', 'tool NOT invoked — vLLM likely needs `--enable-auto-tool-choice --tool-call-parser hermes`');
        } catch (e) {
            record('Tool calling', 'FAIL', reason(e));
        }

        // ----- 6. Structured output (JSON schema / guided decoding) --------------
        section('6. Structured output (response_format json_schema)');
        try {
            const r = await model.chat(
                [{ role: 'user', content: 'Give the capital and population (millions, integer) of France.' }],
                {
                    temperature: 0,
                    maxTokens: 1024,
                    // Unified thinking flag — now wired through the openai provider to
                    // vLLM's chat_template_kwargs.enable_thinking. Disable thinking so
                    // guided decoding emits the object directly.
                    thinking: false,
                    jsonSchema: {
                        type: 'object',
                        properties: { capital: { type: 'string' }, population_millions: { type: 'number' } },
                        required: ['capital', 'population_millions'],
                        additionalProperties: false,
                    },
                    name: 'CountryFact',
                } as any,
            );
            const content = r.message.content ?? '';
            const structured = (r as any).structured;
            console.log(' content :', JSON.stringify(content.slice(0, 200)));
            console.log(' structured:', JSON.stringify(structured));
            // Prefer the client's parsed object; otherwise try parsing the raw text ourselves.
            let parsed: any = structured;
            if (!parsed) { try { parsed = JSON.parse(content); } catch { /* not JSON: reported as FAIL below */ } }
            if (parsed && typeof parsed.capital === 'string' && typeof parsed.population_millions === 'number') {
                record('Structured output', 'PASS', `valid JSON: capital=${parsed.capital}`);
            } else if (parsed) {
                record('Structured output', 'PARTIAL', 'JSON parsed but schema fields missing/mistyped');
            } else {
                record('Structured output', 'FAIL', 'response was not valid JSON');
            }
        } catch (e) {
            record('Structured output', 'FAIL', reason(e));
        }
    } finally {
        // Runs for the early abort too, so the model is never left undisposed.
        await model.dispose();
        printSummary();
    }
}
244
+
/**
 * Prints the SUMMARY banner, one aligned line per recorded check, and a final
 * "passed/total" count.
 */
function printSummary() {
    section('SUMMARY');

    // Width of the longest check name, used to align the status column.
    const width = Math.max(...results.map(({ name }) => name.length));

    let passed = 0;
    for (const { name, status } of results) {
        let icon = '❌';
        if (status === 'PASS') {
            icon = '✅';
            passed += 1;
        } else if (status === 'PARTIAL') {
            icon = '🟡';
        }
        console.log(`${icon} ${name.padEnd(width)} ${status}`);
    }

    console.log(`\n${passed}/${results.length} PASS`);
}
255
+
// Entry point: any error that escapes main() is logged and turned into exit code 1.
main().catch((err) => {
    console.error('FATAL', err);
    process.exit(1);
});