npm - @agorapete/wllama - Versions diffs - 3.5.1-q2.0 - Mend

@agorapete/wllama 3.5.1-q2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

package/.gitmodules +3 -0
package/.prettierignore +38 -0
package/AGENTS.md +1 -0
package/CMakeLists.txt +131 -0
package/LICENCE +21 -0
package/README-dev.md +178 -0
package/README.md +225 -0
package/README_banner.png +0 -0
package/assets/screenshot_0.png +0 -0
package/cpp/generate_glue_prototype.js +115 -0
package/cpp/glue.hpp +664 -0
package/cpp/test_glue.cpp +80 -0
package/cpp/wllama-context.h +1172 -0
package/cpp/wllama-fs.h +148 -0
package/cpp/wllama.cpp +187 -0
package/cpp/wllama.h +6 -0
package/esm/cache-manager.d.ts +130 -0
package/esm/debug.d.ts +28 -0
package/esm/glue/glue.d.ts +22 -0
package/esm/glue/messages.d.ts +146 -0
package/esm/huggingface.d.ts +31 -0
package/esm/index.cjs +3406 -0
package/esm/index.d.ts +8 -0
package/esm/index.js +3387 -0
package/esm/index.min.js +1 -0
package/esm/index.min.js.map +1 -0
package/esm/model-manager.d.ts +136 -0
package/esm/storage/cos.d.ts +36 -0
package/esm/storage/index.d.ts +33 -0
package/esm/storage/opfs.d.ts +12 -0
package/esm/types/oai-compat.d.ts +278 -0
package/esm/types/types.d.ts +112 -0
package/esm/utils.d.ts +119 -0
package/esm/wasm/source-map.d.ts +1 -0
package/esm/wasm/wllama.wasm +0 -0
package/esm/wasm-from-cdn.d.ts +8 -0
package/esm/wllama.d.ts +397 -0
package/esm/worker.d.ts +92 -0
package/esm/workers-code/generated.d.ts +4 -0
package/guides/intro-v2.md +132 -0
package/guides/intro-v3.1.md +40 -0
package/guides/intro-v3.md +230 -0
package/index.ts +1 -0
package/package.json +71 -0
package/scripts/bisect_test.sh +33 -0
package/scripts/build_hf_space.sh +26 -0
package/scripts/build_source_map.js +269 -0
package/scripts/build_wasm.sh +19 -0
package/scripts/build_worker.sh +38 -0
package/scripts/check_debug_build.js +30 -0
package/scripts/check_package_size.js +25 -0
package/scripts/docker-compose.yml +76 -0
package/scripts/generate_wasm_from_cdn.js +24 -0
package/scripts/http_server.js +44 -0
package/scripts/post_build.sh +32 -0
package/src/cache-manager.ts +358 -0
package/src/debug.ts +111 -0
package/src/glue/glue.ts +291 -0
package/src/glue/messages.ts +773 -0
package/src/huggingface.ts +151 -0
package/src/index.ts +8 -0
package/src/mjs.test.ts +44 -0
package/src/model-manager.test.ts +200 -0
package/src/model-manager.ts +359 -0
package/src/storage/cos.test.ts +83 -0
package/src/storage/cos.ts +171 -0
package/src/storage/index.ts +40 -0
package/src/storage/opfs.ts +119 -0
package/src/types/oai-compat.ts +342 -0
package/src/types/types.ts +133 -0
package/src/utils.test.ts +231 -0
package/src/utils.ts +403 -0
package/src/wasm/source-map.ts +7 -0
package/src/wasm/wllama.js +1 -0
package/src/wasm/wllama.wasm +0 -0
package/src/wasm-from-cdn.ts +13 -0
package/src/wllama.test.ts +392 -0
package/src/wllama.ts +1138 -0
package/src/wllama.wgpu.test.ts +62 -0
package/src/worker.ts +443 -0
package/src/workers-code/generated.ts +11 -0
package/src/workers-code/llama-cpp.js +511 -0
package/src/workers-code/opfs-utils.js +150 -0
package/tsconfig.build.json +34 -0
package/tsup.config.ts +23 -0
package/vitest.config.ts +61 -0

package/guides/intro-v2.md ADDED Viewed

@@ -0,0 +1,132 @@
+# Introducing Wllama V2.0
+## What's new
+V2.0 introduces significant improvements in model management and caching. Key features include:
+- Completely rewritten model downloader with service worker
+- New `ModelManager` class providing comprehensive model handling and caching capabilities
+- Enhanced testing system built on the `vitest` framework
+## Added `ModelManager`
+The new `ModelManager` class provides a robust interface for handling model files:
+```typescript
+// Example usage
+const modelManager = new ModelManager();
+// List all models in cache
+const cachedModels = await modelManager.getModels();
+// Add a new model
+const model = await modelManager.downloadModel('https://example.com/model.gguf');
+// Check if model is valid (i.e. it is not corrupted)
+// If status === ModelValidationStatus.VALID, you can use the model
+// Otherwise, call model.refresh() to re-download it
+const status = await model.validate();
+// Re-download if needed (useful when remote model file has changed)
+await model.refresh();
+// Remove model from cache
+await model.remove();
+// Load the selected model into llama.cpp
+const wllama = new Wllama(CONFIG_PATHS);
+await wllama.loadModel(model);
+// Alternatively, you can also pass directly model URL like in v1.x
+// This will automatically download the model to cache
+await wllama.loadModelFromUrl('https://example.com/model.gguf');
+```
+Key features of `ModelManager`:
+- Automatic handling of split GGUF models
+- Built-in model validation
+- Parallel downloads of model shards
+- Cache management with refresh and removal options
+## Added `loadModelFromHF`
+A new helper function to load models directly from Hugging Face Hub. This is a convenient wrapper over `loadModelFromUrl` that handles HF repository URLs.
+```js
+await wllama.loadModelFromHF(
+  'ggml-org/models',
+  'tinyllamas/stories260K.gguf'
+);
+```
+## Migration to v2.0
+### Simplified `new Wllama()` constructor
+In v2.0, the configuration paths have been simplified. You now only need to specify the `*.wasm` files, as the `*.js` files are no longer required.
+Previously in v1.x:
+```js
+const CONFIG_PATHS = {
+  'single-thread/wllama.js'       : '../../esm/single-thread/wllama.js',
+  'single-thread/wllama.wasm'     : '../../esm/single-thread/wllama.wasm',
+  'multi-thread/wllama.js'        : '../../esm/multi-thread/wllama.js',
+  'multi-thread/wllama.wasm'      : '../../esm/multi-thread/wllama.wasm',
+  'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
+};
+const wllama = new Wllama(CONFIG_PATHS);
+```
+From v2.0:
+```js
+// You only need to specify 2 files
+const CONFIG_PATHS = {
+  'single-thread/wllama.wasm': '../../esm/single-thread/wllama.wasm',
+  'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
+};
+const wllama = new Wllama(CONFIG_PATHS);
+```
+Alternatively, you can use the `*.wasm` files from CDN:
+```js
+import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';
+const wllama = new Wllama(WasmFromCDN);
+// NOTE: this is not recommended
+// only use this when you can't embed wasm files in your project
+```
+The `Wllama` constructor now accepts an optional second parameter of type `WllamaConfig` for configuration options:
+> [!IMPORTANT]
+> Most configuration options previously available in `DownloadModelConfig` used with `loadModelFromUrl()` have been moved to this constructor config.
+```js
+const wllama = new Wllama(CONFIG_PATHS, {
+  parallelDownloads: 5, // maximum concurrent downloads
+  allowOffline: false, // whether to allow offline model loading
+});
+```
+### `Wllama.loadModelFromUrl`
+As mentioned earlier, some options have been moved to the `Wllama` constructor, including:
+- `parallelDownloads`
+- `allowOffline`
+### Other changes
+- `Wllama.downloadModel` is removed. Please use `ModelManager.downloadModel` instead
+- `loadModelFromUrl` no longer checks whether the cached model is up to date. You may need to call `Model.refresh()` manually to re-download the model.
+- Changes in `CacheManager`:
+  - Added `CacheManager.download` function
+  - `CacheManager.open(nameOrURL)` now accepts either a file name or the original URL. It now returns a `Blob` instead of a `ReadableStream`
+### Internal Changes
+Notable internal improvements made to the codebase:
+- Comprehensive test coverage using `vitest`, with browser testing for Chrome and Firefox (Safari support planned for the future)
+- Enhanced CI pipeline including validation for example builds, ESM compilation and lint checks

package/guides/intro-v3.1.md ADDED Viewed

@@ -0,0 +1,40 @@
+# Release note Wllama V3.1
+## What's new
+Building on the [V3.0 release](./intro-v3.md), V3.1 brings more features to wllama. This release introduces 2 major changes:
+1. WebGPU support
+2. Single WASM build (no more single/multi-threaded build)
+### WebGPU support
+WebGPU support is introduced via [PR #215](https://github.com/ngxson/wllama/pull/215). It is currently only supported in Chrome (in Firefox, a flag must be enabled manually).
+Upon updating to V3.1, WebGPU will be enabled automatically. By default, all layers will be offloaded to GPU. If the model is too big to fit into VRAM, you can manually adjust the number of layers via the `n_gpu_layers` parameter of `LoadModelParams`. Example:
+```js
+await wllama.loadModel(files, {
+  n_gpu_layers: 4, // meaning 4 layers are offloaded to GPU; set to 0 to disable GPU inference
+});
+```
+### Single WASM build
+From [PR #214](https://github.com/ngxson/wllama/pull/214), the separation between single-threaded build and multi-threaded build has been removed. Wllama now uses a single build that supports single-threaded, multi-threaded, and WebGPU execution; each feature can be toggled at runtime.
+This reduces the space needed to host the pre-built binary and speeds up the build process.
+To migrate from an older version:
+```js
+// Old config
+const CONFIG_PATHS = {
+  'single-thread/wllama.wasm': './path_to_source/single-thread/wllama.wasm',
+  'multi-thread/wllama.wasm' : './path_to_source/multi-thread/wllama.wasm',
+};
+// New config
+const CONFIG_PATHS = {
+  default: './path_to_source/wasm/wllama.wasm',
+};
+```

package/guides/intro-v3.md ADDED Viewed

@@ -0,0 +1,230 @@
+# Introducing Wllama V3.0
+## What's new
+V3.0 is a major architectural overhaul that replaces the custom wllama core with `server-context`, the inference component from `llama-server`. Key highlights:
+- Full OAI-compatible API: `createChatCompletion`, `createCompletion`, `createEmbedding`
+- Multimodal support (vision/audio inputs)
+- Native tool calling support
+- Jinja-based chat template parsing (same as llama-server)
+New demos:
+- Multimodal (vision) completion: https://github.ngxson.com/wllama/examples/multimodal/ ([source code](../examples/multimodal/index.html))
+- Tool calling: https://github.ngxson.com/wllama/examples/tools/ ([source code](../examples/tools/index.html))
+## New architecture
+Previously, wllama implemented its own low-level bindings to llama.cpp. V3.0 instead reuses `server-context.cpp` from `llama-server`, which brings two major benefits:
+- Better compatibility: new llama.cpp features (tool calling, reasoning, multimodal) work automatically
+- Less maintenance: wllama no longer needs to re-implement chat template parsing, sampling logic, etc.
+The worker architecture is unchanged — the wasm thread runs the server-context main loop, the browser thread handles inference requests, and they communicate via the existing `glue` message protocol.
+## OAI-compatible API
+All completion methods now follow the OpenAI API shape closely. This makes it easy to swap wllama in wherever you already use the OpenAI SDK.
+### `createChatCompletion`
+```typescript
+// Non-streaming
+const response = await wllama.createChatCompletion({
+  messages: [{ role: 'user', content: 'Hello!' }],
+  max_tokens: 256,
+  temperature: 0.7,
+});
+console.log(response.choices[0].message.content);
+// Streaming
+const stream = await wllama.createChatCompletion({
+  messages: [{ role: 'user', content: 'Hello!' }],
+  max_tokens: 256,
+  stream: true,
+});
+for await (const chunk of stream) {
+  process.stdout.write(chunk.choices[0].delta.content ?? '');
+}
+```
+### `createCompletion`
+```typescript
+// Raw (non-chat) completion
+const response = await wllama.createCompletion({
+  prompt: 'The capital of France is',
+  max_tokens: 32,
+});
+console.log(response.choices[0].text);
+```
+### `createEmbedding`
+```typescript
+// Requires model loaded with { embeddings: true }
+const response = await wllama.createEmbedding({
+  input: 'The quick brown fox',
+});
+console.log(response.data[0].embedding); // float[]
+```
+## Tool calling
+Tool calling works out of the box for any model that supports it (e.g. Qwen, Llama with tool-call template).
+```typescript
+const tools = [
+  {
+    type: 'function',
+    function: {
+      name: 'get_weather',
+      description: 'Get the current weather for a given city.',
+      parameters: {
+        type: 'object',
+        properties: {
+          city: { type: 'string', description: 'City name' },
+        },
+        required: ['city'],
+      },
+    },
+  },
+];
+const messages = [{ role: 'user', content: 'What is the weather in Tokyo?' }];
+// First turn: model decides to call a tool
+const response = await wllama.createChatCompletion({
+  messages,
+  tools,
+  tool_choice: 'auto',
+  max_tokens: 256,
+});
+const choice = response.choices[0];
+if (choice.finish_reason === 'tool_calls') {
+  const toolCall = choice.message.tool_calls[0];
+  const args = JSON.parse(toolCall.function.arguments);
+  const result = { condition: 'rain', temperature_celsius: 21 };
+  // Second turn: feed tool result back
+  messages.push(choice.message);
+  messages.push({
+    role: 'tool',
+    tool_call_id: toolCall.id,
+    content: JSON.stringify(result),
+  });
+  const final = await wllama.createChatCompletion({ messages, max_tokens: 256 });
+  console.log(final.choices[0].message.content);
+}
+```
+## Multimodal support
+Models with a vision projector (mmproj) can now process image and audio inputs.
+```typescript
+// Load the model + mmproj from Hugging Face
+await wllama.loadModelFromHF({
+  repo: 'user/model-GGUF',
+  quant: 'Q4_K_M',
+  mmprojQuant: 'Q8_0',
+});
+// Or load from explicit URLs
+await wllama.loadModelFromUrl({
+  url: 'https://example.com/model.gguf',
+  mmprojUrl: 'https://example.com/mmproj.gguf',
+});
+// Pass an image as ArrayBuffer alongside text
+const imageData = await fetch('./photo.jpg').then(r => r.arrayBuffer());
+const response = await wllama.createChatCompletion({
+  messages: [
+    {
+      role: 'user',
+      content: [
+        { type: 'image', data: imageData },
+        { type: 'text', text: 'Describe this image.' },
+      ],
+    },
+  ],
+  max_tokens: 512,
+});
+```
+## Migration from v2.0
+### Removed low-level APIs
+The following APIs are **no longer available** in v3.0. They were tied to the old custom core and cannot easily be re-implemented on top of llama-server.
+| Removed | Reason |
+|---|---|
+| `tokenize` / `detokenize` | Low-level tokenizer API removed |
+| `decode` / `encode` | Replaced by OAI completion API |
+| `samplingInit` / `samplingAccept` / `samplingSample` | Sampling is now handled internally per-request |
+| Sequence shift/remove operations | Not exposed by llama-server context |
+> [!IMPORTANT]
+> If you rely on tokenizer APIs, please leave a comment on [PR #213](https://github.com/ngxson/wllama/pull/213) — they can be added back in the future.
+### Sampling params moved to per-request
+In v2.0, some sampling params were passed at model load time. From v3.0, all sampling params must be provided per request via `createChatCompletion` / `createCompletion`.
+Previously in v2.x:
+```js
+await wllama.loadModelFromUrl('https://example.com/model.gguf', {
+  temperature: 0.8,
+  top_k: 40,
+});
+```
+From v3.0:
+```js
+await wllama.loadModelFromUrl('https://example.com/model.gguf');
+const response = await wllama.createChatCompletion({
+  messages: [{ role: 'user', content: 'Hello!' }],
+  temperature: 0.8,
+  top_k: 40,
+});
+```
+### Auto context length removed
+The `n_ctx_auto` option is no longer supported. Set `n_ctx` explicitly at load time.
+```js
+await wllama.loadModelFromUrl('https://example.com/model.gguf', {
+  n_ctx: 4096,
+});
+```
+### Multimodal loading: `mmprojUrl` replaces separate file selection
+Previously you had to pass mmproj as a local file alongside the main model. From v3.0, pass it directly via `mmprojUrl` in `loadModelFromUrl`:
+```js
+await wllama.loadModelFromUrl({
+  url: 'https://example.com/model.gguf',
+  mmprojUrl: 'https://example.com/mmproj.gguf',
+});
+```
+Local file loading still works — just pass both GGUF blobs to `loadModel`:
+```js
+await wllama.loadModel([modelBlob, mmprojBlob]);
+```
+### Internal changes
+- The `server-context` main loop now runs on the wasm worker thread
+- Chat templates are parsed with Jinja (same as llama-server) — set `jinja: true` at load time to enable, or override with `chat_template`
+- `WllamaError` gains a new `'kv_cache_full'` error type for when the context runs out of space

package/index.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ export * from './src';

package/package.json ADDED Viewed

@@ -0,0 +1,71 @@
+{
+  "name": "@agorapete/wllama",
+  "version": "3.5.1-q2.0",
+  "description": "WebAssembly binding for llama.cpp (with Q2_0 ternary support) - Enabling on-browser LLM inference",
+  "main": "index.js",
+  "type": "module",
+  "directories": {
+    "example": "examples"
+  },
+  "scripts": {
+    "serve": "node ./scripts/http_server.js",
+    "serve:mt": "MULTITHREAD=1 node ./scripts/http_server.js",
+    "clean": "rm -rf ./esm && rm -rf ./docs && rm -rf ./wasm",
+    "build:worker": "./scripts/build_worker.sh",
+    "build:glue": "node ./cpp/generate_glue_prototype.js",
+    "build:wasm": "./scripts/build_wasm.sh && npm run build:glue",
+    "build:test": "WLLAMA_TEST_BACKEND=1 npm run build:wasm",
+    "build:tsup": "tsup src/index.ts --format cjs,esm --clean",
+    "build:minified": "terser esm/index.js -o esm/index.min.js --compress --mangle --source-map",
+    "build:typedef": "tsc --emitDeclarationOnly --declaration -p tsconfig.build.json",
+    "build": "npm run clean && npm run build:worker && npm run build:tsup && npm run build:minified && npm run build:typedef && ./scripts/post_build.sh && npm run docs",
+    "docs": "typedoc --tsconfig tsconfig.build.json src/index.ts",
+    "upload": "npm run format && npm run build && node scripts/check_package_size.js && npm publish --access public && (cd compat && npm publish --access public)",
+    "format": "prettier --write .",
+    "test": "vitest",
+    "test:auto": "AUTO=1 vitest",
+    "test:firefox": "BROWSER=firefox vitest",
+    "test:safari": "BROWSER=safari vitest",
+    "test:wgpu": "WEBGPU=1 vitest"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/AgoraPete/wllama.git"
+  },
+  "keywords": [
+    "wasm",
+    "webassembly",
+    "llama",
+    "llm",
+    "ai",
+    "rag",
+    "embeddings",
+    "generation"
+  ],
+  "author": "Xuan Son NGUYEN <contact@ngxson.com>",
+  "license": "MIT",
+  "bugs": {
+    "url": "https://github.com/ngxson/wllama/issues"
+  },
+  "homepage": "https://github.com/ngxson/wllama#readme",
+  "devDependencies": {
+    "@playwright/test": "^1.60.0",
+    "@vitest/browser": "^2.1.6",
+    "express": "^4.18.3",
+    "mime-types": "^2.1.35",
+    "playwright": "^1.59.1",
+    "prettier": "^3.3.3",
+    "terser": "^5.39.0",
+    "tsup": "^8.4.0",
+    "typedoc": "^0.27.2",
+    "typescript": "^5.4.2",
+    "webdriverio": "^9.4.1"
+  },
+  "prettier": {
+    "trailingComma": "es5",
+    "tabWidth": 2,
+    "semi": true,
+    "singleQuote": true,
+    "bracketSameLine": false
+  }
+}

package/scripts/bisect_test.sh ADDED Viewed

@@ -0,0 +1,33 @@
+#!/bin/bash
+# Git bisect script for finding the llama.cpp commit that introduced the locale crash.
+# Run from wllama root: git bisect run ./scripts/bisect_test.sh
+#
+# Start (good): aa46bda89b9a8378ae76bb15fc2ce2f571f0983c  (wllama master's llama.cpp)
+# End   (bad):  dd4623a74                                  (current HEAD of submodule)
+set -e
+WLLAMA_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." &> /dev/null && pwd )"
+cd "$WLLAMA_ROOT"
+CONTEXT_H="$WLLAMA_ROOT/cpp/wllama-context.h"
+FIT_H="$WLLAMA_ROOT/llama.cpp/common/fit.h"
+# Adjust #if 0 / #if 1 around common_get_device_memory_data stub based on
+# which signature fit.h declares at the current bisect commit.
+if grep -q "common_device_memory_data_vec" "$FIT_H" 2>/dev/null; then
+    # New signature (after d8a24ccee): enable the common_device_memory_data_vec block
+    sed -i.bak 's/^#if [01]$/\#if 1/' "$CONTEXT_H"
+else
+    # Old signature (before d8a24ccee): enable the std::vector<llama_device_memory_data> block
+    sed -i.bak 's/^#if [01]$/\#if 0/' "$CONTEXT_H"
+fi
+rm -f "${CONTEXT_H}.bak"
+# Build and run tests. Exit 125 = skip (build infra broken at this commit).
+rm -rf build
+SKIP_COMPAT=1 npm run build:wasm 2>&1 || exit 125
+npm run build 2>&1 || exit 125
+AUTO=1 npm run test 2>&1

package/scripts/build_hf_space.sh ADDED Viewed

@@ -0,0 +1,26 @@
+#!/bin/bash
+set -e
+echo ">>> clone"
+rm -rf _tmp_hf_space
+git clone https://ngxson:${HF_TOKEN}@huggingface.co/spaces/ngxson/wllama --depth 1 _tmp_hf_space
+echo ">>> build"
+cd _tmp_hf_space
+./build.sh
+echo ">>> push"
+if [ -z "$(git status --porcelain)" ]; then
+  echo "nothing changed, skipping..."
+  exit 0
+fi
+git add -A
+git commit -m "update"
+git push
+echo ">>> clean up"
+cd ..
+rm -rf _tmp_hf_space
+echo ">>> done"