universal-llm-client 4.2.0 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/CHANGELOG.md +142 -103
  2. package/LICENSE +21 -21
  3. package/README.md +640 -591
  4. package/dist/ai-model.d.ts +12 -1
  5. package/dist/ai-model.d.ts.map +1 -1
  6. package/dist/ai-model.js +36 -1
  7. package/dist/ai-model.js.map +1 -1
  8. package/dist/gemma-channel.d.ts +14 -0
  9. package/dist/gemma-channel.d.ts.map +1 -0
  10. package/dist/gemma-channel.js +38 -0
  11. package/dist/gemma-channel.js.map +1 -0
  12. package/dist/gemma-diffusion.d.ts +49 -0
  13. package/dist/gemma-diffusion.d.ts.map +1 -0
  14. package/dist/gemma-diffusion.js +147 -0
  15. package/dist/gemma-diffusion.js.map +1 -0
  16. package/dist/http.d.ts +4 -0
  17. package/dist/http.d.ts.map +1 -1
  18. package/dist/http.js +14 -1
  19. package/dist/http.js.map +1 -1
  20. package/dist/index.d.ts +2 -1
  21. package/dist/index.d.ts.map +1 -1
  22. package/dist/index.js +4 -0
  23. package/dist/index.js.map +1 -1
  24. package/dist/interfaces.d.ts +183 -7
  25. package/dist/interfaces.d.ts.map +1 -1
  26. package/dist/interfaces.js.map +1 -1
  27. package/dist/providers/anthropic.d.ts.map +1 -1
  28. package/dist/providers/anthropic.js +28 -3
  29. package/dist/providers/anthropic.js.map +1 -1
  30. package/dist/providers/google.d.ts +22 -1
  31. package/dist/providers/google.d.ts.map +1 -1
  32. package/dist/providers/google.js +225 -13
  33. package/dist/providers/google.js.map +1 -1
  34. package/dist/providers/ollama.d.ts +2 -0
  35. package/dist/providers/ollama.d.ts.map +1 -1
  36. package/dist/providers/ollama.js +59 -30
  37. package/dist/providers/ollama.js.map +1 -1
  38. package/dist/providers/openai.d.ts +14 -0
  39. package/dist/providers/openai.d.ts.map +1 -1
  40. package/dist/providers/openai.js +200 -22
  41. package/dist/providers/openai.js.map +1 -1
  42. package/dist/router.d.ts +2 -0
  43. package/dist/router.d.ts.map +1 -1
  44. package/dist/router.js +4 -0
  45. package/dist/router.js.map +1 -1
  46. package/dist/stream-decoder.d.ts +12 -0
  47. package/dist/stream-decoder.d.ts.map +1 -1
  48. package/dist/stream-decoder.js +182 -5
  49. package/dist/stream-decoder.js.map +1 -1
  50. package/dist/thinking.d.ts +36 -0
  51. package/dist/thinking.d.ts.map +1 -0
  52. package/dist/thinking.js +52 -0
  53. package/dist/thinking.js.map +1 -0
  54. package/package.json +118 -116
  55. package/src/ai-model.ts +400 -350
  56. package/src/auditor.ts +213 -213
  57. package/src/client.ts +402 -402
  58. package/src/debug/debug-google-streaming.ts +1 -1
  59. package/src/demos/basic/universal-llm-examples.ts +3 -3
  60. package/src/demos/diffusion-gemma/.env +29 -0
  61. package/src/demos/diffusion-gemma/.env.example +27 -0
  62. package/src/demos/diffusion-gemma/CLAUDE.md +95 -0
  63. package/src/demos/diffusion-gemma/README.md +59 -0
  64. package/src/demos/diffusion-gemma/canvas.ts +1606 -0
  65. package/src/demos/diffusion-gemma/docker-compose.yml +29 -0
  66. package/src/demos/diffusion-gemma/probe-stream.ts +51 -0
  67. package/src/demos/diffusion-gemma/probe-tools.ts +55 -0
  68. package/src/demos/diffusion-gemma/server.ts +1205 -0
  69. package/src/demos/diffusion-gemma/start-vllm.sh +98 -0
  70. package/src/gemma-channel.ts +47 -0
  71. package/src/gemma-diffusion.ts +167 -0
  72. package/src/http.ts +261 -247
  73. package/src/index.ts +180 -161
  74. package/src/interfaces.ts +843 -657
  75. package/src/mcp.ts +345 -345
  76. package/src/providers/anthropic.ts +796 -762
  77. package/src/providers/google.ts +840 -620
  78. package/src/providers/index.ts +8 -8
  79. package/src/providers/ollama.ts +503 -469
  80. package/src/providers/openai.ts +587 -392
  81. package/src/router.ts +785 -780
  82. package/src/stream-decoder.ts +535 -361
  83. package/src/structured-output.ts +759 -759
  84. package/src/test-scripts/test-google-deep-research.ts +33 -0
  85. package/src/test-scripts/test-google-streaming-enhanced.ts +147 -147
  86. package/src/test-scripts/test-google-streaming.ts +1 -1
  87. package/src/test-scripts/test-google-system-prompt-comprehensive.ts +189 -189
  88. package/src/test-scripts/test-google-thinking.ts +46 -0
  89. package/src/test-scripts/test-system-message-positions.ts +163 -163
  90. package/src/test-scripts/test-system-prompt-improvement-demo.ts +83 -83
  91. package/src/test-scripts/test-vllm-qwen36.ts +256 -0
  92. package/src/tests/ai-model.test.ts +1614 -1614
  93. package/src/tests/auditor.test.ts +224 -224
  94. package/src/tests/gemma-diffusion.test.ts +115 -0
  95. package/src/tests/http.test.ts +200 -200
  96. package/src/tests/interfaces.test.ts +117 -117
  97. package/src/tests/providers/anthropic.test.ts +118 -0
  98. package/src/tests/providers/google.test.ts +841 -660
  99. package/src/tests/providers/ollama.test.ts +1034 -954
  100. package/src/tests/providers/openai.test.ts +1511 -1122
  101. package/src/tests/router.test.ts +254 -254
  102. package/src/tests/stream-decoder.test.ts +263 -179
  103. package/src/tests/structured-output.test.ts +1450 -1450
  104. package/src/tests/thinking.test.ts +65 -0
  105. package/src/tests/tools.test.ts +175 -175
  106. package/src/thinking.ts +73 -0
  107. package/src/tools.ts +246 -246
  108. package/src/zod-adapter.ts +72 -72
@@ -6,7 +6,7 @@ import {request} from 'undici';
6
6
  async function debugGoogleStreaming() {
7
7
  console.log('🔍 Debugging Google Generative AI Streaming...\n');
8
8
 
9
- const apiKey = 'AIzaSyBDbo7iVNEuCcRNTgDIgRrkGpFKisXXnm0';
9
+ const apiKey = (process.env.GOOGLE_API_KEY ?? '');
10
10
  const model = 'gemma-3-4b-it';
11
11
  const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${model}:streamGenerateContent`;
12
12
 
@@ -25,7 +25,7 @@ export async function createAIApplicationExample() {
25
25
  },
26
26
  google: {
27
27
  chatModel: 'gemma-3-4b-it',
28
- apiKey: 'AIzaSyBDbo7iVNEuCcRNTgDIgRrkGpFKisXXnm0'
28
+ apiKey: (process.env.GOOGLE_API_KEY ?? '')
29
29
  }
30
30
  });
31
31
 
@@ -36,7 +36,7 @@ export async function createAIApplicationExample() {
36
36
  // Method 3: Google-specific setup
37
37
  const googleChatModel = AIModelFactory.createGoogleChatModel(
38
38
  'gemma-3-4b-it',
39
- 'AIzaSyBDbo7iVNEuCcRNTgDIgRrkGpFKisXXnm0'
39
+ (process.env.GOOGLE_API_KEY ?? '')
40
40
  );
41
41
 
42
42
  // Example usage patterns:
@@ -113,7 +113,7 @@ export async function testGoogleAPI() {
113
113
 
114
114
  const googleModel = AIModelFactory.createGoogleChatModel(
115
115
  'gemma-3-4b-it',
116
- 'AIzaSyBDbo7iVNEuCcRNTgDIgRrkGpFKisXXnm0'
116
+ (process.env.GOOGLE_API_KEY ?? '')
117
117
  );
118
118
 
119
119
  try {
@@ -0,0 +1,29 @@
1
+ # Optional docker compose overrides for the DiffusionGemma vLLM backend.
2
+ #
3
+ # Start from packages/universal-llm-client:
4
+ # docker compose --env-file src/demos/diffusion-gemma/.env -f src/demos/diffusion-gemma/docker-compose.yml up -d
5
+
6
+ # Public vLLM image to run. If a future nightly regresses DiffusionGemma support,
7
+ # set this to a known-good local or registry tag.
8
+ VLLM_IMAGE=vllm/vllm-openai:gemma
9
+
10
+ # Host port for the OpenAI-compatible vLLM API.
11
+ VLLM_PORT=18000
12
+
13
+ VLLM_URL=http://localhost:18000
14
+
15
+ # DiffusionGemma model served by vLLM.
16
+ MODEL_NAME=RedHatAI/diffusiongemma-26B-A4B-it-NVFP4
17
+
18
+ # Single-user local serving defaults. Tune for your GPU.
19
+ GPU_MEM_UTIL=0.28
20
+ MAX_MODEL_LEN=32768
21
+ MAX_NUM_SEQS=1
22
+ DIFFUSION_ENTROPY=0.1
23
+
24
+ # Set to 1 only for CUDA graph / torch.compile debugging.
25
+ ENFORCE_EAGER=0
26
+
27
+ # Disable vLLM telemetry. In WSL this avoids a py-cpuinfo JSONDecodeError in
28
+ # vLLM's background usage-reporting thread during engine startup/reload.
29
+ VLLM_NO_USAGE_STATS=1
@@ -0,0 +1,27 @@
1
+ # Optional docker compose overrides for the DiffusionGemma vLLM backend.
2
+ #
3
+ # Start from packages/universal-llm-client:
4
+ # docker compose --env-file src/demos/diffusion-gemma/.env -f src/demos/diffusion-gemma/docker-compose.yml up -d
5
+
6
+ # Public vLLM image to run. If a future nightly regresses DiffusionGemma support,
7
+ # set this to a known-good local or registry tag.
8
+ VLLM_IMAGE=vllm/vllm-openai:gemma
9
+
10
+ # Host port for the OpenAI-compatible vLLM API.
11
+ VLLM_PORT=8000
12
+
13
+ # DiffusionGemma model served by vLLM.
14
+ MODEL_NAME=RedHatAI/diffusiongemma-26B-A4B-it-NVFP4
15
+
16
+ # Single-user local serving defaults. Tune for your GPU.
17
+ GPU_MEM_UTIL=0.28
18
+ MAX_MODEL_LEN=32768
19
+ MAX_NUM_SEQS=1
20
+ DIFFUSION_ENTROPY=0.1
21
+
22
+ # Set to 1 only for CUDA graph / torch.compile debugging.
23
+ ENFORCE_EAGER=0
24
+
25
+ # Disable vLLM telemetry. In WSL this avoids a py-cpuinfo JSONDecodeError in
26
+ # vLLM's background usage-reporting thread during engine startup/reload.
27
+ VLLM_NO_USAGE_STATS=1
@@ -0,0 +1,95 @@
1
+ # DiffusionGemma demo — test harness + "Signal from Noise" canvas
2
+
3
+ Standalone Bun server exercising `universal-llm-client` against DiffusionGemma
4
+ (a discrete diffusion LM served by vLLM).
5
+
6
+ ## Run
7
+
8
+ ```bash
9
+ bun run demo:diffusion-gemma:engine # starts vLLM via demo-local docker compose
10
+ bun run demo:diffusion-gemma # starts the Bun demo server
11
+ ```
12
+
13
+ - Demo server: **http://localhost:3333** (`/` test harness, `/canvas` diffusion chat UI)
14
+ - vLLM upstream: `VLLM_URL` env, default `http://localhost:8000`
15
+ - Model: `MODEL_NAME` env, default `RedHatAI/diffusiongemma-26B-A4B-it-NVFP4`
16
+ - vLLM is started via `src/demos/diffusion-gemma/docker-compose.yml` and
17
+ `src/demos/diffusion-gemma/start-vllm.sh` — includes a WSL2 UVA patch and
18
+ the `entropy_bound` diffusion sampler. Runs as docker container
19
+ `diffusiongemma` (script is bind-mounted as `/start-vllm.sh`, so edits apply
20
+ on `docker restart diffusiongemma`). The script also sources
21
+ `src/demos/diffusion-gemma/.cache/huggingface/diffusion-env.sh`
22
+ (host-writable through the HF-cache bind mount) — that's how
23
+ `/api/engine-config` changes settings without recreating the container.
24
+ - **Tuned for single-user local serving** (env-overridable in the start script):
25
+ `GPU_MEM_UTIL` (default 0.28 ≈ 27 GiB — without caps vLLM grabbed ~88 GiB:
26
+ 69 GiB KV cache for the native 262k context, measured <0.5% used),
27
+ `MAX_MODEL_LEN` (32768), `MAX_NUM_SEQS` (1), `DIFFUSION_ENTROPY` (0.1),
28
+ `ENFORCE_EAGER` (0). Weights are 17.4 GiB.
29
+ - **Never re-add `--enforce-eager` casually:** it disables CUDA graphs AND
30
+ torch.compile and cost 2.2× throughput (387 tok/s with it vs. 841 tok/s avg without; peak 1002,
31
+ steady-state ~644 on long runs). Set `ENFORCE_EAGER=1` only to debug
32
+ WSL2/Blackwell graph-capture issues. Entropy 0.1→0.2 measured ≈ no speed
33
+ change (745–845 tok/s) — the dial trades quality, not meaningful speed,
34
+ at these settings.
35
+
36
+ ## Routes
37
+
38
+ | Route | What |
39
+ | ----- | ---- |
40
+ | `/` | Test harness UI (chat + compatibility tests via universal-llm-client) |
41
+ | `/canvas` | "Signal from Noise" — cinematic chat UI replaying the diffusion process |
42
+ | `/api/chat` | Chat via universal-llm-client (`messages`, `stream`, `maxTokens`, `temperature`) |
43
+ | `/api/stream-raw` | Direct vLLM SSE proxy preserving chunk timing (`messages` or `prompt`, `maxTokens`, `thinking:false` to disable the thought channel). Always sets `skip_special_tokens:false` so channel markers survive. |
44
+ | `/api/engine-config` | GET current entropy; POST `{entropy}` writes the env file + `docker restart`s the engine (~2–4 min; UI polls `/api/health`) |
45
+ | `/api/health` | Pings vLLM `/v1/models` |
46
+
47
+ ## Native protocol (no server-side parsers!)
48
+
49
+ This vLLM build has **no reasoning parser and no tool-call parser module** —
50
+ request-level `tools` with auto tool choice is rejected with HTTP 400. Everything is client-side, against
51
+ the chat template's native markers (visible only with `skip_special_tokens:false`):
52
+
53
+ - Reasoning: `<|channel>thought\n …<channel|>answer`. The canvas splits this
54
+ with a streaming state machine (partial markers carried across chunks) and
55
+ renders reasoning as a collapsible amber channel above the answer surface.
56
+ - **Canvas reading view:** the mono token surface is the animation; when a
57
+ reply settles it fades into a rendered-markdown view (zero-dep renderer in
58
+ the inner script — headings/lists/code/bold/links, all input HTML-escaped
59
+ first; backticks via `String.fromCharCode(96)` because literal backticks
60
+ would terminate the outer template literal). Replay/scrub swaps back to the
61
+ token surface. Root font scales with viewport (`clamp` on `html`) for
62
+ screen-recording legibility. Max-tokens select goes to 16k (default 4k);
63
+ `finish_reason:'length'` shows an amber "⚠ capped" warning in phase+footer.
64
+ - Tool calls: `<|tool_call>call:name{k:<|"|>v<|"|>,n:3}<tool_call|>` — pseudo-JSON
65
+ args (bare keys, `<|"|>` quote token). Send `tools` + `tool_choice:'none'`
66
+ (declarations still get rendered into the template); history tool turns go as
67
+ standard structured `tool_calls` + `role:'tool'` messages (template renders
68
+ them natively).
69
+ - All of this is implemented for the library in `src/gemma-diffusion.ts` and
70
+ wired into the OpenAI provider (auto-detected by model name; override with
71
+ `LLMClientOptions.gemmaNativeProtocol`). `chatWithTools` works end-to-end.
72
+ Tests: `src/tests/gemma-diffusion.test.ts`. Probes: `probe-stream.ts`
73
+ (chunk timing), `probe-tools.ts` (tool-loop wire format).
74
+
75
+ ## Things that bite
76
+
77
+ - **`canvas.ts` is one giant TS template literal.** Backslash escapes inside the
78
+ inner `<script>` are eaten by the outer literal (`/\S+/` silently becomes
79
+ `/S+/`). The inner script is written with ZERO backslashes — newlines via
80
+ `String.fromCharCode(10)`, tokenizing via charCode scans. Keep it that way.
81
+ - **No hot reload.** `CANVAS_HTML` is bundled at startup — restart the server
82
+ after editing `canvas.ts` (kill the bun process on :3333, start again).
83
+ - **Don't name a top-level browser var `history`** — `window.history` is
84
+ unshadowable; the conversation array is called `convo`.
85
+ - **Stream shape (measured):** the vLLM OpenAI stream emits ~1KB bursts, one per
86
+ finished 256-token diffusion block, every ~0.8–1.2s. There is no per-denoise-step
87
+ state in the stream; `/canvas` animates each block's reveal during the real
88
+ compute window of the next block. `probe-stream.ts` logs chunk timing.
89
+ - **The model emits stray unbalanced `<channel|>` closers** occasionally —
90
+ the parser strips them (`RESIDUAL_SPECIAL` in gemma-diffusion.ts). The model also
91
+ sometimes puts the whole final answer inside the thought channel on
92
+ post-tool turns.
93
+ - **Entropy is engine-level** (`hf_overrides` read once at model init in
94
+ vLLM's `diffusion_gemma.py`); per-request `vllm_xargs` is accepted but
95
+ ignored. Hence the reload-based `/api/engine-config`.
@@ -0,0 +1,59 @@
1
+ # DiffusionGemma demo
2
+
3
+ Standalone Bun demo for testing `universal-llm-client` against DiffusionGemma
4
+ served by vLLM's OpenAI-compatible API.
5
+
6
+ ## Run the backend
7
+
8
+ From `packages/universal-llm-client`:
9
+
10
+ ```bash
11
+ docker compose -f src/demos/diffusion-gemma/docker-compose.yml up -d
12
+ ```
13
+
14
+ The compose file runs a `diffusiongemma` container on `localhost:8000`, mounts a
15
+ demo-local Hugging Face cache at `src/demos/diffusion-gemma/.cache/huggingface`,
16
+ and bind-mounts `start-vllm.sh` as the container entrypoint.
17
+
18
+ If you already have an older hand-created `diffusiongemma` container, remove it
19
+ before switching to the demo compose file:
20
+
21
+ ```bash
22
+ docker rm -f diffusiongemma
23
+ ```
24
+
25
+ Optional overrides:
26
+
27
+ ```bash
28
+ cp src/demos/diffusion-gemma/.env.example src/demos/diffusion-gemma/.env
29
+ docker compose --env-file src/demos/diffusion-gemma/.env -f src/demos/diffusion-gemma/docker-compose.yml up -d
30
+ ```
31
+
32
+ Useful knobs are `VLLM_IMAGE`, `GPU_MEM_UTIL`, `MAX_MODEL_LEN`,
33
+ `DIFFUSION_ENTROPY`, `ENFORCE_EAGER`, and `VLLM_NO_USAGE_STATS`.
34
+
35
+ ## Run the demo UI
36
+
37
+ ```bash
38
+ bun run src/demos/diffusion-gemma/server.ts
39
+ ```
40
+
41
+ - Harness: <http://localhost:3333/>
42
+ - Canvas: <http://localhost:3333/canvas>
43
+ - vLLM API: <http://localhost:8000/v1/models>
44
+
45
+ ## Notes
46
+
47
+ - The prior BentoKit setup did not use a `docker-compose.yml`; it was a direct
48
+ Docker container using a repo-root `scripts/diffusiongemma-start.sh` bind
49
+ mount. This demo now carries its own compose file and startup script.
50
+ - The default image is `vllm/vllm-openai:gemma`, the vLLM image line that
51
+ includes DiffusionGemma support. Set `VLLM_IMAGE` if you need to test another
52
+ local or registry image.
53
+ - The first startup can take several minutes while vLLM loads and compiles the
54
+ model. Poll `docker logs -f diffusiongemma` or `/api/health` from the demo UI.
55
+ - The `/api/engine-config` endpoint writes `diffusion-env.sh` into the mounted
56
+ Hugging Face cache and restarts the `diffusiongemma` container.
57
+ - `VLLM_NO_USAGE_STATS=1` is enabled by default because this vLLM image can hit
58
+ a non-fatal `py-cpuinfo` `JSONDecodeError` in its background usage-reporting
59
+ thread under WSL during startup/reload.