npm - universal-llm-client - Versions diffs - 4.2.0 → 4.5.0 - Mend

universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

package/CHANGELOG.md +142 -103
package/LICENSE +21 -21
package/README.md +640 -591
package/dist/ai-model.d.ts +12 -1
package/dist/ai-model.d.ts.map +1 -1
package/dist/ai-model.js +36 -1
package/dist/ai-model.js.map +1 -1
package/dist/gemma-channel.d.ts +14 -0
package/dist/gemma-channel.d.ts.map +1 -0
package/dist/gemma-channel.js +38 -0
package/dist/gemma-channel.js.map +1 -0
package/dist/gemma-diffusion.d.ts +49 -0
package/dist/gemma-diffusion.d.ts.map +1 -0
package/dist/gemma-diffusion.js +147 -0
package/dist/gemma-diffusion.js.map +1 -0
package/dist/http.d.ts +4 -0
package/dist/http.d.ts.map +1 -1
package/dist/http.js +14 -1
package/dist/http.js.map +1 -1
package/dist/index.d.ts +2 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +4 -0
package/dist/index.js.map +1 -1
package/dist/interfaces.d.ts +183 -7
package/dist/interfaces.d.ts.map +1 -1
package/dist/interfaces.js.map +1 -1
package/dist/providers/anthropic.d.ts.map +1 -1
package/dist/providers/anthropic.js +28 -3
package/dist/providers/anthropic.js.map +1 -1
package/dist/providers/google.d.ts +22 -1
package/dist/providers/google.d.ts.map +1 -1
package/dist/providers/google.js +225 -13
package/dist/providers/google.js.map +1 -1
package/dist/providers/ollama.d.ts +2 -0
package/dist/providers/ollama.d.ts.map +1 -1
package/dist/providers/ollama.js +59 -30
package/dist/providers/ollama.js.map +1 -1
package/dist/providers/openai.d.ts +14 -0
package/dist/providers/openai.d.ts.map +1 -1
package/dist/providers/openai.js +200 -22
package/dist/providers/openai.js.map +1 -1
package/dist/router.d.ts +2 -0
package/dist/router.d.ts.map +1 -1
package/dist/router.js +4 -0
package/dist/router.js.map +1 -1
package/dist/stream-decoder.d.ts +12 -0
package/dist/stream-decoder.d.ts.map +1 -1
package/dist/stream-decoder.js +182 -5
package/dist/stream-decoder.js.map +1 -1
package/dist/thinking.d.ts +36 -0
package/dist/thinking.d.ts.map +1 -0
package/dist/thinking.js +52 -0
package/dist/thinking.js.map +1 -0
package/package.json +118 -116
package/src/ai-model.ts +400 -350
package/src/auditor.ts +213 -213
package/src/client.ts +402 -402
package/src/debug/debug-google-streaming.ts +1 -1
package/src/demos/basic/universal-llm-examples.ts +3 -3
package/src/demos/diffusion-gemma/.env +29 -0
package/src/demos/diffusion-gemma/.env.example +27 -0
package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
package/src/demos/diffusion-gemma/README.md +59 -0
package/src/demos/diffusion-gemma/canvas.ts +1606 -0
package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
package/src/demos/diffusion-gemma/server.ts +1205 -0
package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
package/src/gemma-channel.ts +47 -0
package/src/gemma-diffusion.ts +167 -0
package/src/http.ts +261 -247
package/src/index.ts +180 -161
package/src/interfaces.ts +843 -657
package/src/mcp.ts +345 -345
package/src/providers/anthropic.ts +796 -762
package/src/providers/google.ts +840 -620
package/src/providers/index.ts +8 -8
package/src/providers/ollama.ts +503 -469
package/src/providers/openai.ts +587 -392
package/src/router.ts +785 -780
package/src/stream-decoder.ts +535 -361
package/src/structured-output.ts +759 -759
package/src/test-scripts/test-google-deep-research.ts +33 -0
package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
package/src/test-scripts/test-google-streaming.ts +1 -1
package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
package/src/test-scripts/test-google-thinking.ts +46 -0
package/src/test-scripts/test-system-message-positions.ts +163 -163
package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
package/src/test-scripts/test-vllm-qwen36.ts +256 -0
package/src/tests/ai-model.test.ts +1614 -1614
package/src/tests/auditor.test.ts +224 -224
package/src/tests/gemma-diffusion.test.ts +115 -0
package/src/tests/http.test.ts +200 -200
package/src/tests/interfaces.test.ts +117 -117
package/src/tests/providers/anthropic.test.ts +118 -0
package/src/tests/providers/google.test.ts +841 -660
package/src/tests/providers/ollama.test.ts +1034 -954
package/src/tests/providers/openai.test.ts +1511 -1122
package/src/tests/router.test.ts +254 -254
package/src/tests/stream-decoder.test.ts +263 -179
package/src/tests/structured-output.test.ts +1450 -1450
package/src/tests/thinking.test.ts +65 -0
package/src/tests/tools.test.ts +175 -175
package/src/thinking.ts +73 -0
package/src/tools.ts +246 -246
package/src/zod-adapter.ts +72 -72

package/src/demos/diffusion-gemma/start-vllm.sh ADDED Viewed

@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Startup preparation for the vLLM server: upgrade transformers, then drop a
+# Python module into dist-packages that replaces vLLM's UVA buffers with explicit
+# CPU/GPU copies, plus a .pth file whose single line imports that module.
+set -euo pipefail
+site_packages_dir=/usr/local/lib/python3.12/dist-packages
+printf '%s\n' "=== Upgrading transformers ==="
+pip install --upgrade transformers
+printf '%s\n' "=== Installing WSL2 UVA compatibility patch ==="
+# Quoted heredoc delimiter: the Python source is written verbatim, no shell expansion.
+cat > "${site_packages_dir}/wsl2_uva_patch.py" <<'PYEOF'
+"""
+WSL2 UVA compatibility patch for vLLM.
+UVA lets the GPU directly access pinned CPU memory. WSL2 does not support this
+path reliably, so this patch uses explicit CPU/GPU copies instead.
+"""
+import warnings
+import numpy as np
+import torch
+warnings.warn("WSL2 UVA patch active: using explicit CPU/GPU copies instead of UVA")
+import vllm.v1.worker.gpu.buffer_utils as bu
+class PatchedUvaBuffer:
+    def __init__(self, size, dtype):
+        self.cpu = torch.zeros(size, dtype=dtype, device="cpu", pin_memory=False)
+        self.np = self.cpu.numpy()
+        self._gpu = torch.zeros(size, dtype=dtype, device="cuda")
+        self.uva = self._gpu
+    def sync_to_gpu(self):
+        self._gpu.copy_(self.cpu, non_blocking=True)
+class PatchedUvaBufferPool:
+    def __init__(self, size, dtype, max_concurrency=None):
+        if max_concurrency is None:
+            max_concurrency = bu._DEFAULT_MAX_CONCURRENCY
+        self.size = size
+        self.dtype = dtype
+        self.max_concurrency = max_concurrency
+        self._uva_bufs = [PatchedUvaBuffer(size, dtype) for _ in range(max_concurrency)]
+        self._curr = 0
+    def copy_to_uva(self, x):
+        self._curr = (self._curr + 1) % self.max_concurrency
+        buf = self._uva_bufs[self._curr]
+        dst = buf.cpu if isinstance(x, torch.Tensor) else buf.np
+        n = len(x)
+        dst[:n] = x
+        buf.sync_to_gpu()
+        return buf.uva[:n]
+import vllm.utils.platform_utils as pu
+pu.is_uva_available = lambda: True
+import vllm.utils.torch_utils as tu
+tu.get_accelerator_view_from_cpu_tensor = lambda cpu_tensor: cpu_tensor.cuda()
+bu.UvaBuffer = PatchedUvaBuffer
+bu.UvaBufferPool = PatchedUvaBufferPool
+print("[WSL2 UVA Patch] Applied successfully - using explicit CPU/GPU copies")
+PYEOF
+# Python's site module executes .pth lines that start with "import" at interpreter
+# startup, so this one-line file is what actually loads the patch.
+printf '%s\n' "import wsl2_uva_patch" > "${site_packages_dir}/wsl2_uva_patch.pth"
+if [ -f /root/.cache/huggingface/diffusion-env.sh ]; then
+  # This file is written by the demo server's /api/engine-config endpoint.
+  . /root/.cache/huggingface/diffusion-env.sh
+fi
+MODEL_NAME="${MODEL_NAME:-RedHatAI/diffusiongemma-26B-A4B-it-NVFP4}"
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.28}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-1}"
+DIFFUSION_ENTROPY="${DIFFUSION_ENTROPY:-0.1}"
+ENFORCE_EAGER="${ENFORCE_EAGER:-0}"
+export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}"
+echo "=== Engine config: MODEL_NAME=${MODEL_NAME} DIFFUSION_ENTROPY=${DIFFUSION_ENTROPY} GPU_MEM_UTIL=${GPU_MEM_UTIL} MAX_MODEL_LEN=${MAX_MODEL_LEN} MAX_NUM_SEQS=${MAX_NUM_SEQS} ENFORCE_EAGER=${ENFORCE_EAGER} VLLM_NO_USAGE_STATS=${VLLM_NO_USAGE_STATS} ==="
+EAGER_FLAG=""
+if [ "${ENFORCE_EAGER}" = "1" ]; then
+  EAGER_FLAG="--enforce-eager"
+fi
+VLLM_USE_V2_MODEL_RUNNER=1 vllm serve "${MODEL_NAME}" \
+  --trust-remote-code \
+  --attention-backend TRITON_ATTN \
+  --max-num-seqs "${MAX_NUM_SEQS}" \
+  ${EAGER_FLAG} \
+  --gpu-memory-utilization "${GPU_MEM_UTIL}" \
+  --max-model-len "${MAX_MODEL_LEN}" \
+  --hf-overrides "{\"diffusion_sampler\": \"entropy_bound\", \"diffusion_entropy_bound\": ${DIFFUSION_ENTROPY}}" \
+  --default-chat-template-kwargs '{"enable_thinking": true}'

package/src/gemma-channel.ts ADDED Viewed

@@ -0,0 +1,47 @@
+/**
+ * Gemma 4 can emit its thought channel as text control tokens instead of the
+ * generic Ollama `message.thinking` field. Keep that provider quirk isolated so
+ * callers receive final-answer text and reasoning separately.
+ */
+/** Result of splitting Gemma thought-channel blocks out of a piece of model text. */
+export interface GemmaThoughtExtraction {
+    /** The input text with every matched thought block removed (as filled in by extractGemmaThoughtChannels). */
+    readonly content: string;
+    /** Trimmed text of the matched thought blocks, joined with a blank line; '' when none carried any text. */
+    readonly reasoning: string;
+    /** True when at least one thought block matched, even if that block was empty. */
+    readonly found: boolean;
+}
+// Full form: `<|channel>thought … <channel|>`. Group 1 is the thought body, matched lazily
+// so that several blocks in one string are captured separately. Case-insensitive.
+const GEMMA_THOUGHT_BLOCK = /<\|channel>\s*thought\s*\r?\n?([\s\S]*?)<channel\|>/gi;
+// Compact form: `<|thought … |>`. Group 1 is the thought body, ending at the first `|>`.
+const GEMMA_COMPACT_THOUGHT_BLOCK = /<\|thought\s*\r?\n?([\s\S]*?)\|>/gi;
+/** Literal opening markers of the two thought-block forms matched above. */
+export const GEMMA_THOUGHT_OPENERS = ['<|channel>thought', '<|thought'] as const;
+/**
+ * Split Gemma thought blocks out of `input`.
+ *
+ * Full-form blocks (`<|channel>thought … <channel|>`) are removed first, then
+ * compact ones (`<|thought … |>`) are removed from what remains. Each non-empty
+ * trimmed thought body is collected in that same order.
+ *
+ * @param input Raw model text; an empty string is returned untouched.
+ * @returns The text without thought blocks, the collected reasoning joined by
+ *          blank lines, and whether any block was matched at all.
+ */
+export function extractGemmaThoughtChannels(input: string): GemmaThoughtExtraction {
+    if (!input) {
+        return { content: input, reasoning: '', found: false };
+    }
+    const thoughts: string[] = [];
+    let matched = false;
+    // One replacer serves both patterns: record the thought body, delete the block.
+    const collect = (_block: string, body: string): string => {
+        matched = true;
+        const cleaned = normalizeGemmaThought(body);
+        if (cleaned) {
+            thoughts.push(cleaned);
+        }
+        return '';
+    };
+    const withoutChannelBlocks = input.replace(GEMMA_THOUGHT_BLOCK, collect);
+    const stripped = withoutChannelBlocks.replace(GEMMA_COMPACT_THOUGHT_BLOCK, collect);
+    return { content: stripped, reasoning: thoughts.join('\n\n'), found: matched };
+}
+/**
+ * Strip leading and trailing whitespace from a captured thought body.
+ *
+ * @param thought Raw text captured between thought-channel markers.
+ * @returns The text without surrounding whitespace; interior whitespace is kept.
+ */
+export function normalizeGemmaThought(thought: string): string {
+    // String.prototype.trim removes the same character set that `\s` matches.
+    return thought.trim();
+}

package/src/gemma-diffusion.ts ADDED Viewed

@@ -0,0 +1,167 @@
+/**
+ * DiffusionGemma (vLLM) native-protocol adapter.
+ *
+ * Trimmed vLLM builds that serve DiffusionGemma ship with NO reasoning parser
+ * and NO tool-call parser module, and they reject OpenAI-style `tools` unless
+ * `--tool-call-parser` is configured. Everything therefore has to be handled
+ * client-side, against the model's native channel format (visible only when
+ * the request sets `skip_special_tokens: false`):
+ *
+ *   <|channel>thought ...reasoning... <channel|>          reasoning channel
+ *   <|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|>  tool call
+ *
+ * Tool-call arguments are NOT JSON: keys are bare, strings are wrapped in the
+ * <|"|> quote token, numbers/booleans are bare (see the model's
+ * chat_template.jinja `format_argument` macro). `gemmaArgsToJson` converts
+ * that into a standard JSON string.
+ *
+ * Request-side protocol (implemented in the OpenAI provider):
+ *   - always send `skip_special_tokens: false`
+ *   - send `tools` with `tool_choice: 'none'` — vLLM still renders the
+ *     declarations into the chat template, it just skips its (absent) parser
+ *   - send history tool turns structurally (assistant `tool_calls` +
+ *     `role: 'tool'` messages) — the chat template renders them natively
+ */
+import { extractGemmaThoughtChannels } from './gemma-channel.js';
+/** One tool call recovered from a `<|tool_call>call:name{…}<tool_call|>` block. */
+export interface GemmaParsedToolCall {
+    /** Tool name, i.e. the identifier that follows `call:` in the block. */
+    readonly name: string;
+    /** JSON-encoded arguments object, ready for LLMToolCall.function.arguments */
+    readonly argumentsJson: string;
+}
+/** Result of parsing one complete raw DiffusionGemma output (see parseGemmaDiffusionOutput). */
+export interface GemmaDiffusionParsed {
+    /** Final answer with reasoning, tool-call blocks and special tokens removed */
+    readonly content: string;
+    /** Text taken from the thought channel(s); '' when the output carried none. */
+    readonly reasoning: string;
+    /** Tool calls in the order their blocks appear in the raw output; empty when there are none. */
+    readonly toolCalls: readonly GemmaParsedToolCall[];
+}
+/**
+ * Tells whether a model id names a DiffusionGemma build, i.e. a model that
+ * speaks this native protocol when served by vLLM.
+ *
+ * @param model Model identifier, e.g. a Hugging Face repo id.
+ * @returns True when the id contains "diffusion" followed by "gemma", optionally
+ *          separated by one `-` or `_`, in any letter case.
+ */
+export function isGemmaDiffusionModel(model: string): boolean {
+    const diffusionGemmaName = /diffusion[-_]?gemma/i;
+    return model.search(diffusionGemmaName) !== -1;
+}
+// One complete tool-call block. Group 1 is the tool name (letters, digits, `_`, `.`, `-`);
+// group 2 is the argument body between the outermost braces, matched lazily up to the
+// first `}` that is followed (after optional whitespace) by the `<tool_call|>` closer.
+const TOOL_CALL_BLOCK = /<\|tool_call>\s*call:([a-zA-Z0-9_.-]+)\s*\{([\s\S]*?)\}\s*<tool_call\|>/g;
+/**
+ * Residual control tokens that may leak into text output — including stray
+ * unbalanced channel markers (the model occasionally emits an extra
+ * <channel|> closer mid-answer).
+ *
+ * NOTE(review): the leading and trailing pipes are optional in the first
+ * alternative, so plain tags such as `<image …>`, `<video>` or `<audio>` in an
+ * answer are stripped as well — confirm that this is intended.
+ */
+const RESIDUAL_SPECIAL = /<\|?(?:turn|think|image|audio|video|tool_response|tool_call|tool|channel)\b[^>]*?\|?>|<(?:turn|channel|tool_response|tool_call|tool)\|>/g;
+/** Token the model wraps around string values (and, occasionally, keys). */
+const QUOTE_TOKEN = '<|"|>';
+/**
+ * Convert the Gemma template's pseudo-JSON argument syntax to a JSON string.
+ * Lenient by design: bare words that aren't numbers/booleans become strings,
+ * since the model occasionally omits the quote token.
+ *
+ * The parser never throws and always terminates, even on truncated or
+ * unbalanced input: a missing closer simply ends the current object/array.
+ *
+ * @param body Argument body WITHOUT its outer braces, e.g. `k:<|"|>v<|"|>,n:3`.
+ * @returns A JSON object literal as a string, e.g. `{"k":"v","n":3}`.
+ */
+export function gemmaArgsToJson(body: string): string {
+    // Argument bodies arrive without their outer braces (the regex strips them)
+    const src = `{${body}}`;
+    const n = src.length;
+    let i = 0;
+    // Strict JSON number grammar. Leading zeros ("02134") are not valid JSON
+    // numbers, so such tokens fall through and are emitted as strings instead.
+    const jsonNumber = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/;
+    function skipWs(): void {
+        while (i < n && /\s/.test(src.charAt(i))) i++;
+    }
+    /** Read a quote-token delimited string; an unterminated one runs to the end of input. */
+    function parseQuoted(): string {
+        // positioned at the start of QUOTE_TOKEN
+        i += QUOTE_TOKEN.length;
+        const end = src.indexOf(QUOTE_TOKEN, i);
+        const raw = end === -1 ? src.slice(i) : src.slice(i, end);
+        i = end === -1 ? n : end + QUOTE_TOKEN.length;
+        return raw;
+    }
+    /** Read up to (not including) any character in `stops` or the next quote token. */
+    function parseBare(stops: string): string {
+        const start = i;
+        while (i < n && !stops.includes(src.charAt(i)) && !src.startsWith(QUOTE_TOKEN, i)) i++;
+        return src.slice(start, i).trim();
+    }
+    /** Parse one value and return it already encoded as JSON text. */
+    function parseValue(): string {
+        skipWs();
+        if (src.startsWith(QUOTE_TOKEN, i)) return JSON.stringify(parseQuoted());
+        const c = src.charAt(i);
+        if (c === '{') return parseObject();
+        if (c === '[') return parseArray();
+        const bare = parseBare(',}]');
+        if (jsonNumber.test(bare)) return bare;
+        if (bare === 'true' || bare === 'false' || bare === 'null') return bare;
+        return JSON.stringify(bare);
+    }
+    function parseObject(): string {
+        i++; // consume {
+        const parts: string[] = [];
+        skipWs();
+        while (i < n && src.charAt(i) !== '}') {
+            skipWs();
+            const key = src.startsWith(QUOTE_TOKEN, i) ? parseQuoted() : parseBare(':');
+            skipWs();
+            if (src.charAt(i) === ':') i++;
+            const value = parseValue();
+            parts.push(`${JSON.stringify(key.trim())}:${value}`);
+            skipWs();
+            if (src.charAt(i) === ',') i++;
+            skipWs();
+        }
+        i++; // consume }
+        return `{${parts.join(',')}}`;
+    }
+    function parseArray(): string {
+        i++; // consume [
+        const parts: string[] = [];
+        skipWs();
+        // A '}' can never start a value (parseBare stops on it without advancing),
+        // so meeting one here means the array was left unclosed. Stop instead of
+        // spinning forever on the same character.
+        while (i < n && src.charAt(i) !== ']' && src.charAt(i) !== '}') {
+            parts.push(parseValue());
+            skipWs();
+            if (src.charAt(i) === ',') i++;
+            skipWs();
+        }
+        // Only consume a real closer; a '}' belongs to the enclosing object.
+        if (src.charAt(i) === ']') i++;
+        return `[${parts.join(',')}]`;
+    }
+    skipWs();
+    return parseObject();
+}
+/**
+ * Parse a complete raw DiffusionGemma output into reasoning, tool calls and
+ * clean answer text.
+ *
+ * Steps, in order: tool-call blocks are cut out and converted, complete thought
+ * blocks are extracted, an unterminated trailing thought block (if any) is moved
+ * into the reasoning, and finally leftover control tokens are stripped.
+ *
+ * @param raw Raw model text, including special tokens. An empty string yields
+ *            empty content, empty reasoning and no tool calls.
+ * @returns The trimmed answer text, the collected reasoning and the tool calls
+ *          in the order they appeared.
+ */
+export function parseGemmaDiffusionOutput(raw: string): GemmaDiffusionParsed {
+    if (!raw) return { content: raw, reasoning: '', toolCalls: [] };
+    const toolCalls: GemmaParsedToolCall[] = [];
+    let text = raw.replace(TOOL_CALL_BLOCK, (_m, name: string, args: string) => {
+        toolCalls.push({ name, argumentsJson: gemmaArgsToJson(args) });
+        return '';
+    });
+    const channels = extractGemmaThoughtChannels(text);
+    text = channels.content;
+    // Unterminated thought channel (model hit max_tokens mid-reasoning)
+    let reasoning = channels.reasoning;
+    const danglingThought = /<\|channel>\s*thought\s*\r?\n?([\s\S]*)$/i.exec(text);
+    if (danglingThought) {
+        const partial = (danglingThought[1] ?? '').trim();
+        // Append only real text: an opener with nothing after it must not leave
+        // a trailing "\n\n" separator on the reasoning collected so far.
+        if (partial) {
+            reasoning = reasoning ? `${reasoning}\n\n${partial}` : partial;
+        }
+        text = text.slice(0, danglingThought.index);
+    }
+    text = text.replace(RESIDUAL_SPECIAL, '');
+    return {
+        content: text.trim(),
+        reasoning,
+        toolCalls,
+    };
+}