universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/CHANGELOG.md +142 -103
  2. package/LICENSE +21 -21
  3. package/README.md +640 -591
  4. package/dist/ai-model.d.ts +12 -1
  5. package/dist/ai-model.d.ts.map +1 -1
  6. package/dist/ai-model.js +36 -1
  7. package/dist/ai-model.js.map +1 -1
  8. package/dist/gemma-channel.d.ts +14 -0
  9. package/dist/gemma-channel.d.ts.map +1 -0
  10. package/dist/gemma-channel.js +38 -0
  11. package/dist/gemma-channel.js.map +1 -0
  12. package/dist/gemma-diffusion.d.ts +49 -0
  13. package/dist/gemma-diffusion.d.ts.map +1 -0
  14. package/dist/gemma-diffusion.js +147 -0
  15. package/dist/gemma-diffusion.js.map +1 -0
  16. package/dist/http.d.ts +4 -0
  17. package/dist/http.d.ts.map +1 -1
  18. package/dist/http.js +14 -1
  19. package/dist/http.js.map +1 -1
  20. package/dist/index.d.ts +2 -1
  21. package/dist/index.d.ts.map +1 -1
  22. package/dist/index.js +4 -0
  23. package/dist/index.js.map +1 -1
  24. package/dist/interfaces.d.ts +183 -7
  25. package/dist/interfaces.d.ts.map +1 -1
  26. package/dist/interfaces.js.map +1 -1
  27. package/dist/providers/anthropic.d.ts.map +1 -1
  28. package/dist/providers/anthropic.js +28 -3
  29. package/dist/providers/anthropic.js.map +1 -1
  30. package/dist/providers/google.d.ts +22 -1
  31. package/dist/providers/google.d.ts.map +1 -1
  32. package/dist/providers/google.js +225 -13
  33. package/dist/providers/google.js.map +1 -1
  34. package/dist/providers/ollama.d.ts +2 -0
  35. package/dist/providers/ollama.d.ts.map +1 -1
  36. package/dist/providers/ollama.js +59 -30
  37. package/dist/providers/ollama.js.map +1 -1
  38. package/dist/providers/openai.d.ts +14 -0
  39. package/dist/providers/openai.d.ts.map +1 -1
  40. package/dist/providers/openai.js +200 -22
  41. package/dist/providers/openai.js.map +1 -1
  42. package/dist/router.d.ts +2 -0
  43. package/dist/router.d.ts.map +1 -1
  44. package/dist/router.js +4 -0
  45. package/dist/router.js.map +1 -1
  46. package/dist/stream-decoder.d.ts +12 -0
  47. package/dist/stream-decoder.d.ts.map +1 -1
  48. package/dist/stream-decoder.js +182 -5
  49. package/dist/stream-decoder.js.map +1 -1
  50. package/dist/thinking.d.ts +36 -0
  51. package/dist/thinking.d.ts.map +1 -0
  52. package/dist/thinking.js +52 -0
  53. package/dist/thinking.js.map +1 -0
  54. package/package.json +118 -116
  55. package/src/ai-model.ts +400 -350
  56. package/src/auditor.ts +213 -213
  57. package/src/client.ts +402 -402
  58. package/src/debug/debug-google-streaming.ts +1 -1
  59. package/src/demos/basic/universal-llm-examples.ts +3 -3
  60. package/src/demos/diffusion-gemma/.env +29 -0
  61. package/src/demos/diffusion-gemma/.env.example +27 -0
  62. package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
  63. package/src/demos/diffusion-gemma/README.md +59 -0
  64. package/src/demos/diffusion-gemma/canvas.ts +1606 -0
  65. package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
  66. package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
  67. package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
  68. package/src/demos/diffusion-gemma/server.ts +1205 -0
  69. package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
  70. package/src/gemma-channel.ts +47 -0
  71. package/src/gemma-diffusion.ts +167 -0
  72. package/src/http.ts +261 -247
  73. package/src/index.ts +180 -161
  74. package/src/interfaces.ts +843 -657
  75. package/src/mcp.ts +345 -345
  76. package/src/providers/anthropic.ts +796 -762
  77. package/src/providers/google.ts +840 -620
  78. package/src/providers/index.ts +8 -8
  79. package/src/providers/ollama.ts +503 -469
  80. package/src/providers/openai.ts +587 -392
  81. package/src/router.ts +785 -780
  82. package/src/stream-decoder.ts +535 -361
  83. package/src/structured-output.ts +759 -759
  84. package/src/test-scripts/test-google-deep-research.ts +33 -0
  85. package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
  86. package/src/test-scripts/test-google-streaming.ts +1 -1
  87. package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
  88. package/src/test-scripts/test-google-thinking.ts +46 -0
  89. package/src/test-scripts/test-system-message-positions.ts +163 -163
  90. package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
  91. package/src/test-scripts/test-vllm-qwen36.ts +256 -0
  92. package/src/tests/ai-model.test.ts +1614 -1614
  93. package/src/tests/auditor.test.ts +224 -224
  94. package/src/tests/gemma-diffusion.test.ts +115 -0
  95. package/src/tests/http.test.ts +200 -200
  96. package/src/tests/interfaces.test.ts +117 -117
  97. package/src/tests/providers/anthropic.test.ts +118 -0
  98. package/src/tests/providers/google.test.ts +841 -660
  99. package/src/tests/providers/ollama.test.ts +1034 -954
  100. package/src/tests/providers/openai.test.ts +1511 -1122
  101. package/src/tests/router.test.ts +254 -254
  102. package/src/tests/stream-decoder.test.ts +263 -179
  103. package/src/tests/structured-output.test.ts +1450 -1450
  104. package/src/tests/thinking.test.ts +65 -0
  105. package/src/tests/tools.test.ts +175 -175
  106. package/src/thinking.ts +73 -0
  107. package/src/tools.ts +246 -246
  108. package/src/zod-adapter.ts +72 -72
@@ -0,0 +1,98 @@
1
#!/usr/bin/env bash
#
# Start script for the DiffusionGemma demo's vLLM server.
#
# Steps, in order:
#   1. Upgrade `transformers`.
#   2. Install a Python module (plus a .pth hook that imports it) which swaps
#      vLLM's UVA buffers for explicit CPU->GPU copies.
#   3. Source optional engine overrides written by the demo server.
#   4. Launch `vllm serve` with the resolved configuration.
#
# Tunables (environment variables, all optional): MODEL_NAME, GPU_MEM_UTIL,
# MAX_MODEL_LEN, MAX_NUM_SEQS, DIFFUSION_ENTROPY, ENFORCE_EAGER (set to "1" to
# pass --enforce-eager), VLLM_NO_USAGE_STATS.
set -euo pipefail

# Directory the patch module and its .pth hook are written to.
SITE_PACKAGES="/usr/local/lib/python3.12/dist-packages"

echo "=== Upgrading transformers ==="
pip install --upgrade transformers

echo "=== Installing WSL2 UVA compatibility patch ==="
# Quoted heredoc delimiter: the Python source is written verbatim, with no
# shell expansion.
cat > "${SITE_PACKAGES}/wsl2_uva_patch.py" <<'PYEOF'
"""
WSL2 UVA compatibility patch for vLLM.

UVA lets the GPU directly access pinned CPU memory. WSL2 does not support this
path reliably, so this patch uses explicit CPU/GPU copies instead.
"""
import warnings

import numpy as np
import torch

warnings.warn("WSL2 UVA patch active: using explicit CPU/GPU copies instead of UVA")

import vllm.v1.worker.gpu.buffer_utils as bu


class PatchedUvaBuffer:
    def __init__(self, size, dtype):
        self.cpu = torch.zeros(size, dtype=dtype, device="cpu", pin_memory=False)
        self.np = self.cpu.numpy()
        self._gpu = torch.zeros(size, dtype=dtype, device="cuda")
        self.uva = self._gpu

    def sync_to_gpu(self):
        self._gpu.copy_(self.cpu, non_blocking=True)


class PatchedUvaBufferPool:
    def __init__(self, size, dtype, max_concurrency=None):
        if max_concurrency is None:
            max_concurrency = bu._DEFAULT_MAX_CONCURRENCY
        self.size = size
        self.dtype = dtype
        self.max_concurrency = max_concurrency
        self._uva_bufs = [PatchedUvaBuffer(size, dtype) for _ in range(max_concurrency)]
        self._curr = 0

    def copy_to_uva(self, x):
        # Rotate through the pool so consecutive calls use different buffers.
        self._curr = (self._curr + 1) % self.max_concurrency
        buf = self._uva_bufs[self._curr]
        dst = buf.cpu if isinstance(x, torch.Tensor) else buf.np
        n = len(x)
        dst[:n] = x
        buf.sync_to_gpu()
        return buf.uva[:n]


import vllm.utils.platform_utils as pu
pu.is_uva_available = lambda: True

import vllm.utils.torch_utils as tu
tu.get_accelerator_view_from_cpu_tensor = lambda cpu_tensor: cpu_tensor.cuda()

bu.UvaBuffer = PatchedUvaBuffer
bu.UvaBufferPool = PatchedUvaBufferPool

print("[WSL2 UVA Patch] Applied successfully - using explicit CPU/GPU copies")
PYEOF

# A .pth file containing an import line makes Python import the patch module
# at interpreter startup.
echo "import wsl2_uva_patch" > "${SITE_PACKAGES}/wsl2_uva_patch.pth"

if [ -f /root/.cache/huggingface/diffusion-env.sh ]; then
  # This file is written by the demo server's /api/engine-config endpoint.
  . /root/.cache/huggingface/diffusion-env.sh
fi

MODEL_NAME="${MODEL_NAME:-RedHatAI/diffusiongemma-26B-A4B-it-NVFP4}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.28}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-1}"
DIFFUSION_ENTROPY="${DIFFUSION_ENTROPY:-0.1}"
ENFORCE_EAGER="${ENFORCE_EAGER:-0}"
export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}"

# DIFFUSION_ENTROPY is spliced unquoted into the --hf-overrides JSON below, so
# fail early with a clear message unless it is a valid JSON number.
json_number_re='^-?[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'
if ! [[ "${DIFFUSION_ENTROPY}" =~ ${json_number_re} ]]; then
  echo "DIFFUSION_ENTROPY must be a JSON number, got: '${DIFFUSION_ENTROPY}'" >&2
  exit 1
fi

echo "=== Engine config: MODEL_NAME=${MODEL_NAME} DIFFUSION_ENTROPY=${DIFFUSION_ENTROPY} GPU_MEM_UTIL=${GPU_MEM_UTIL} MAX_MODEL_LEN=${MAX_MODEL_LEN} MAX_NUM_SEQS=${MAX_NUM_SEQS} ENFORCE_EAGER=${ENFORCE_EAGER} VLLM_NO_USAGE_STATS=${VLLM_NO_USAGE_STATS} ==="

# Build the argument list as an array so the optional flag needs no unquoted
# expansion and every value stays a single word.
serve_args=(
  --trust-remote-code
  --attention-backend TRITON_ATTN
  --max-num-seqs "${MAX_NUM_SEQS}"
)
if [ "${ENFORCE_EAGER}" = "1" ]; then
  serve_args+=(--enforce-eager)
fi
serve_args+=(
  --gpu-memory-utilization "${GPU_MEM_UTIL}"
  --max-model-len "${MAX_MODEL_LEN}"
  --hf-overrides "{\"diffusion_sampler\": \"entropy_bound\", \"diffusion_entropy_bound\": ${DIFFUSION_ENTROPY}}"
  --default-chat-template-kwargs '{"enable_thinking": true}'
)

VLLM_USE_V2_MODEL_RUNNER=1 vllm serve "${MODEL_NAME}" "${serve_args[@]}"
@@ -0,0 +1,47 @@
1
+ /**
2
+ * Gemma 4 can emit its thought channel as text control tokens instead of the
3
+ * generic Ollama `message.thinking` field. Keep that provider quirk isolated so
4
+ * callers receive final-answer text and reasoning separately.
5
+ */
6
+
7
/** Result of separating Gemma thought-channel blocks from answer text. */
export interface GemmaThoughtExtraction {
    /** The input with every complete thought block removed. */
    readonly content: string;
    /** Trimmed thought texts joined by a blank line; '' when none were kept. */
    readonly reasoning: string;
    /** True when at least one complete thought block matched, even an empty one. */
    readonly found: boolean;
}

/** `<|channel>thought ... <channel|>`; capture group 1 is the thought text. */
const GEMMA_THOUGHT_BLOCK = /<\|channel>\s*thought\s*\r?\n?([\s\S]*?)<channel\|>/gi;
/** Compact form `<|thought ... |>`; capture group 1 is the thought text. */
const GEMMA_COMPACT_THOUGHT_BLOCK = /<\|thought\s*\r?\n?([\s\S]*?)\|>/gi;

/** Literal prefixes that open a thought block in either supported form. */
export const GEMMA_THOUGHT_OPENERS = ['<|channel>thought', '<|thought'] as const;

/**
 * Strip complete Gemma thought blocks out of `input` and return them
 * separately from the remaining text.
 *
 * Two passes run in a fixed order: full `<|channel>thought … <channel|>`
 * blocks first, then compact `<|thought … |>` blocks on what is left. As a
 * consequence `reasoning` lists all full-form thoughts before any compact
 * ones. Unterminated blocks are left untouched in `content`.
 *
 * @param input Raw model text; a falsy value is returned unchanged as `content`.
 * @returns The cleaned text, the collected reasoning, and a match flag.
 */
export function extractGemmaThoughtChannels(input: string): GemmaThoughtExtraction {
    if (!input) return { content: input, reasoning: '', found: false };

    const reasoningParts: string[] = [];
    let found = false;

    // Shared replacer for both patterns: record the thought, drop the block.
    const collect = (_match: string, thought: string): string => {
        found = true;
        const normalized = normalizeGemmaThought(thought);
        if (normalized) reasoningParts.push(normalized);
        return '';
    };

    const content = input
        .replace(GEMMA_THOUGHT_BLOCK, collect)
        .replace(GEMMA_COMPACT_THOUGHT_BLOCK, collect);

    return {
        content,
        reasoning: reasoningParts.join('\n\n'),
        found,
    };
}

/**
 * Remove leading and trailing whitespace from a captured thought.
 *
 * Uses `String.prototype.trim()`, which strips the same character set as the
 * regex class `\s` but runs in linear time; an end-anchored `\s+$` pattern can
 * backtrack quadratically on long whitespace runs inside the text.
 */
export function normalizeGemmaThought(thought: string): string {
    return thought.trim();
}
@@ -0,0 +1,167 @@
1
+ /**
2
+ * DiffusionGemma (vLLM) native-protocol adapter.
3
+ *
4
+ * Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
5
+ * and NO tool-call parser module, and they reject OpenAI-style `tools` unless
6
+ * `--tool-call-parser` is configured. Everything therefore has to be handled
7
+ * client-side, against the model's native channel format (visible only when
8
+ * the request sets `skip_special_tokens: false`):
9
+ *
10
+ * <|channel>thought ...reasoning... <channel|> reasoning channel
11
+ * <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|> tool call
12
+ *
13
+ * Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
14
+ * <|"|> quote token, numbers/booleans are bare (see the model's
15
+ * chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
16
+ * that into a standard JSON string.
17
+ *
18
+ * Request-side protocol (implemented in the OpenAI provider):
19
+ * - always send `skip_special_tokens: false`
20
+ * - send `tools` with `tool_choice: 'none'` — vLLM still renders the
21
+ * declarations into the chat template, it just skips its (absent) parser
22
+ * - send history tool turns structurally (assistant `tool_calls` +
23
+ * `role: 'tool'` messages) — the chat template renders them natively
24
+ */
25
+
26
+ import { extractGemmaThoughtChannels } from './gemma-channel.js';
27
+
28
/** One tool invocation recovered from a `<|tool_call>…<tool_call|>` block. */
export interface GemmaParsedToolCall {
    /** Tool name as written after `call:` in the block. */
    readonly name: string;
    /** JSON-encoded arguments object, ready for LLMToolCall.function.arguments */
    readonly argumentsJson: string;
}
33
+
34
/** Structured result of parsing one complete raw DiffusionGemma output. */
export interface GemmaDiffusionParsed {
    /** Final answer with reasoning, tool-call blocks and special tokens removed */
    readonly content: string;
    /** Text recovered from thought channels; empty string when there is none. */
    readonly reasoning: string;
    /** Parsed tool calls, in the order their blocks appear in the raw output. */
    readonly toolCalls: readonly GemmaParsedToolCall[];
}
40
+
41
/**
 * Models that speak this native protocol when served by vLLM.
 *
 * @param model Model identifier to inspect.
 * @returns True when the identifier contains "diffusion" followed by "gemma",
 *          optionally separated by a single `-` or `_`, in any letter case.
 */
export function isGemmaDiffusionModel(model: string): boolean {
    const diffusionGemmaName = /diffusion[-_]?gemma/i;
    return diffusionGemmaName.test(model);
}
45
+
46
/**
 * One complete native tool-call block: `<|tool_call>call:NAME{ARGS}<tool_call|>`.
 * Capture group 1 is the tool name (letters, digits, `_`, `.`, `-`); group 2
 * is the argument body without its outer braces. The body match is lazy, so
 * it ends at the first `}` that is followed (after optional whitespace) by
 * the closing token.
 */
const TOOL_CALL_BLOCK = /<\|tool_call>\s*call:([a-zA-Z0-9_.-]+)\s*\{([\s\S]*?)\}\s*<tool_call\|>/g;

/**
 * Residual control tokens that may leak into text output — including stray
 * unbalanced channel markers (the model occasionally emits an extra
 * <channel|> closer mid-answer).
 *
 * NOTE(review): both pipes in the first alternative are optional, so it also
 * matches pipe-less tags that start with one of the listed words, e.g.
 * `<video controls>` or `<image src=x>` — confirm that stripping such text
 * from answers is intended.
 */
const RESIDUAL_SPECIAL = /<\|?(?:turn|think|image|audio|video|tool_response|tool_call|tool|channel)\b[^>]*?\|?>|<(?:turn|channel|tool_response|tool_call|tool)\|>/g;
54
+
55
/** Special token the Gemma chat template wraps string values in. */
const QUOTE_TOKEN = '<|"|>';

/**
 * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
 * Lenient by design: bare words that aren't numbers/booleans become strings,
 * since the model occasionally omits the quote token.
 *
 * The result is always parseable JSON:
 *  - bare values are emitted as numbers only when they follow the JSON number
 *    grammar; anything else (e.g. `02134`, which has a leading zero) is
 *    emitted as a string;
 *  - an array that is never closed ends at the `}` of its enclosing object
 *    instead of being scanned forever.
 *
 * @param body Argument text between the outer braces of a tool-call block.
 * @returns A JSON-encoded object; `'{}'` for an empty body.
 */
export function gemmaArgsToJson(body: string): string {
    // Argument bodies arrive without their outer braces (the regex strips them)
    const src = `{${body}}`;
    let i = 0;
    const n = src.length;

    // JSON number grammar: no leading zeros, no bare "." and no leading "+".
    const jsonNumber = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/;

    /** Advance past any whitespace at the cursor. */
    function skipWs(): void {
        while (i < n && /\s/.test(src[i]!)) i++;
    }

    /** Read a QUOTE_TOKEN-delimited string; an unterminated one runs to the end. */
    function parseQuoted(): string {
        // positioned at the start of QUOTE_TOKEN
        i += QUOTE_TOKEN.length;
        const end = src.indexOf(QUOTE_TOKEN, i);
        const raw = end === -1 ? src.slice(i) : src.slice(i, end);
        i = end === -1 ? n : end + QUOTE_TOKEN.length;
        return raw;
    }

    /** Read up to (not including) any char in `stops` or the next QUOTE_TOKEN. */
    function parseBare(stops: string): string {
        const start = i;
        while (i < n && !stops.includes(src[i]!) && !src.startsWith(QUOTE_TOKEN, i)) i++;
        return src.slice(start, i).trim();
    }

    /** Parse one value and return its JSON encoding. */
    function parseValue(): string {
        skipWs();
        if (src.startsWith(QUOTE_TOKEN, i)) return JSON.stringify(parseQuoted());
        const c = src[i];
        if (c === '{') return parseObject();
        if (c === '[') return parseArray();
        const bare = parseBare(',}]');
        if (jsonNumber.test(bare)) return bare;
        if (bare === 'true' || bare === 'false' || bare === 'null') return bare;
        return JSON.stringify(bare);
    }

    /** Parse `{key:value,...}` starting at the opening brace. */
    function parseObject(): string {
        i++; // consume {
        const parts: string[] = [];
        skipWs();
        while (i < n && src[i] !== '}') {
            skipWs();
            const key = src.startsWith(QUOTE_TOKEN, i) ? parseQuoted() : parseBare(':');
            skipWs();
            if (src[i] === ':') i++;
            const value = parseValue();
            parts.push(`${JSON.stringify(key.trim())}:${value}`);
            skipWs();
            if (src[i] === ',') i++;
            skipWs();
        }
        if (src[i] === '}') i++; // consume } when present
        return `{${parts.join(',')}}`;
    }

    /** Parse `[value,...]` starting at the opening bracket. */
    function parseArray(): string {
        i++; // consume [
        const parts: string[] = [];
        skipWs();
        // A '}' cannot begin an element (objects start with '{'); seeing one
        // means the array was never closed and the brace belongs to an
        // enclosing object. parseValue makes no progress on '}', so the loop
        // must stop here rather than spin.
        while (i < n && src[i] !== ']' && src[i] !== '}') {
            parts.push(parseValue());
            skipWs();
            if (src[i] === ',') i++;
            skipWs();
        }
        if (src[i] === ']') i++; // consume ] only when actually present
        return `[${parts.join(',')}]`;
    }

    skipWs();
    return parseObject();
}
135
+
136
/**
 * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
 * clean answer text.
 *
 * Processing order:
 *  1. complete tool-call blocks are removed and converted to JSON arguments;
 *  2. complete thought channels are removed and collected as reasoning;
 *  3. an unterminated `<|channel>thought` opener turns everything after it
 *     into reasoning and cuts it from the answer;
 *  4. remaining control tokens are stripped and the answer is trimmed.
 *
 * @param raw Full raw model output; a falsy value is returned as `content`
 *            with empty reasoning and no tool calls.
 * @returns The cleaned answer, the collected reasoning and the tool calls.
 */
export function parseGemmaDiffusionOutput(raw: string): GemmaDiffusionParsed {
    if (!raw) return { content: raw, reasoning: '', toolCalls: [] };

    const toolCalls: GemmaParsedToolCall[] = [];
    let text = raw.replace(TOOL_CALL_BLOCK, (_m, name: string, args: string) => {
        toolCalls.push({ name, argumentsJson: gemmaArgsToJson(args) });
        return '';
    });

    const channels = extractGemmaThoughtChannels(text);
    text = channels.content;

    // Unterminated thought channel (model hit max_tokens mid-reasoning)
    let reasoning = channels.reasoning;
    const danglingThought = text.match(/<\|channel>\s*thought\s*\r?\n?([\s\S]*)$/i);
    if (danglingThought) {
        const tail = danglingThought[1]!.trim();
        // Only append when something follows the opener; otherwise existing
        // reasoning would gain a trailing blank-line separator.
        if (tail) reasoning = reasoning ? `${reasoning}\n\n${tail}` : tail;
        text = text.slice(0, danglingThought.index);
    }

    text = text.replace(RESIDUAL_SPECIAL, '');

    return {
        content: text.trim(),
        reasoning,
        toolCalls,
    };
}