@agorapete/wllama 3.5.1-q2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.gitmodules +3 -0
  2. package/.prettierignore +38 -0
  3. package/AGENTS.md +1 -0
  4. package/CMakeLists.txt +131 -0
  5. package/LICENCE +21 -0
  6. package/README-dev.md +178 -0
  7. package/README.md +225 -0
  8. package/README_banner.png +0 -0
  9. package/assets/screenshot_0.png +0 -0
  10. package/cpp/generate_glue_prototype.js +115 -0
  11. package/cpp/glue.hpp +664 -0
  12. package/cpp/test_glue.cpp +80 -0
  13. package/cpp/wllama-context.h +1172 -0
  14. package/cpp/wllama-fs.h +148 -0
  15. package/cpp/wllama.cpp +187 -0
  16. package/cpp/wllama.h +6 -0
  17. package/esm/cache-manager.d.ts +130 -0
  18. package/esm/debug.d.ts +28 -0
  19. package/esm/glue/glue.d.ts +22 -0
  20. package/esm/glue/messages.d.ts +146 -0
  21. package/esm/huggingface.d.ts +31 -0
  22. package/esm/index.cjs +3406 -0
  23. package/esm/index.d.ts +8 -0
  24. package/esm/index.js +3387 -0
  25. package/esm/index.min.js +1 -0
  26. package/esm/index.min.js.map +1 -0
  27. package/esm/model-manager.d.ts +136 -0
  28. package/esm/storage/cos.d.ts +36 -0
  29. package/esm/storage/index.d.ts +33 -0
  30. package/esm/storage/opfs.d.ts +12 -0
  31. package/esm/types/oai-compat.d.ts +278 -0
  32. package/esm/types/types.d.ts +112 -0
  33. package/esm/utils.d.ts +119 -0
  34. package/esm/wasm/source-map.d.ts +1 -0
  35. package/esm/wasm/wllama.wasm +0 -0
  36. package/esm/wasm-from-cdn.d.ts +8 -0
  37. package/esm/wllama.d.ts +397 -0
  38. package/esm/worker.d.ts +92 -0
  39. package/esm/workers-code/generated.d.ts +4 -0
  40. package/guides/intro-v2.md +132 -0
  41. package/guides/intro-v3.1.md +40 -0
  42. package/guides/intro-v3.md +230 -0
  43. package/index.ts +1 -0
  44. package/package.json +71 -0
  45. package/scripts/bisect_test.sh +33 -0
  46. package/scripts/build_hf_space.sh +26 -0
  47. package/scripts/build_source_map.js +269 -0
  48. package/scripts/build_wasm.sh +19 -0
  49. package/scripts/build_worker.sh +38 -0
  50. package/scripts/check_debug_build.js +30 -0
  51. package/scripts/check_package_size.js +25 -0
  52. package/scripts/docker-compose.yml +76 -0
  53. package/scripts/generate_wasm_from_cdn.js +24 -0
  54. package/scripts/http_server.js +44 -0
  55. package/scripts/post_build.sh +32 -0
  56. package/src/cache-manager.ts +358 -0
  57. package/src/debug.ts +111 -0
  58. package/src/glue/glue.ts +291 -0
  59. package/src/glue/messages.ts +773 -0
  60. package/src/huggingface.ts +151 -0
  61. package/src/index.ts +8 -0
  62. package/src/mjs.test.ts +44 -0
  63. package/src/model-manager.test.ts +200 -0
  64. package/src/model-manager.ts +359 -0
  65. package/src/storage/cos.test.ts +83 -0
  66. package/src/storage/cos.ts +171 -0
  67. package/src/storage/index.ts +40 -0
  68. package/src/storage/opfs.ts +119 -0
  69. package/src/types/oai-compat.ts +342 -0
  70. package/src/types/types.ts +133 -0
  71. package/src/utils.test.ts +231 -0
  72. package/src/utils.ts +403 -0
  73. package/src/wasm/source-map.ts +7 -0
  74. package/src/wasm/wllama.js +1 -0
  75. package/src/wasm/wllama.wasm +0 -0
  76. package/src/wasm-from-cdn.ts +13 -0
  77. package/src/wllama.test.ts +392 -0
  78. package/src/wllama.ts +1138 -0
  79. package/src/wllama.wgpu.test.ts +62 -0
  80. package/src/worker.ts +443 -0
  81. package/src/workers-code/generated.ts +11 -0
  82. package/src/workers-code/llama-cpp.js +511 -0
  83. package/src/workers-code/opfs-utils.js +150 -0
  84. package/tsconfig.build.json +34 -0
  85. package/tsup.config.ts +23 -0
  86. package/vitest.config.ts +61 -0
package/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "llama.cpp"]
2
+ path = llama.cpp
3
+ url = https://github.com/AgoraPete/llama.cpp.git
@@ -0,0 +1,38 @@
1
+ **/.vscode
2
+ **/.github
3
+ **/.git
4
+ **/.svn
5
+ **/.hg
6
+ **/node_modules
7
+ **/dist
8
+ **/docs
9
+
10
+ /llama.cpp
11
+
12
+ /examples/advanced
13
+ /examples/basic
14
+ /examples/embeddings
15
+
16
+ /scripts
17
+ /esm
18
+ /models
19
+ /build
20
+
21
+ /src/multi-thread
22
+ /src/single-thread
23
+ /src/wasm
24
+ /src/workers-code/generated.ts
25
+ /src/wasm-from-cdn.ts
26
+ /src/glue/messages.ts
27
+
28
+ /compat/wasm
29
+
30
+ *.md
31
+ *.mdx
32
+ *.json
33
+ *.lock
34
+ *.yml
35
+ *.cpp
36
+ *.hpp
37
+
38
+ *.config.js
package/AGENTS.md ADDED
@@ -0,0 +1 @@
1
+ Refer to `README-dev.md` for development documentation.
package/CMakeLists.txt ADDED
@@ -0,0 +1,131 @@
1
+ cmake_minimum_required(VERSION 3.14)
2
+ project("wllama")  # builds the single `wllama` executable from llama.cpp plus the C++ glue in cpp/ (see add_executable at the end of this file)
3
+
4
+ option(WLLAMA_TEST_BACKEND "Build wllama with test-backend-ops included" OFF)  # opt-in; handled by the if(WLLAMA_TEST_BACKEND) section near the end of this file
5
+
6
+ set(MTMD_VIDEO OFF CACHE BOOL "" FORCE)  # FORCE overwrites any cached/user-supplied value; presumably read by llama.cpp/tools/mtmd, which is added further down — confirm
7
+
8
+ set(CMAKE_THREAD_LIBS_INIT "-lpthread")  # this and the four variables below are the result variables of CMake's FindThreads module, hard-coded here instead of detected
9
+ set(CMAKE_HAVE_THREADS_LIBRARY 1)
10
+ set(CMAKE_USE_WIN32_THREADS_INIT 0)
11
+ set(CMAKE_USE_PTHREADS_INIT 1)
12
+ set(THREADS_PREFER_PTHREAD_FLAG ON)  # NOTE(review): presumably preset so that find_package(Threads) inside llama.cpp resolves to pthreads under emscripten — confirm
13
+
14
+ set(WLLAMA_COMPILE_OPTIONS  # base compiler flags; extended by the WLLAMA_COMPAT branch below, then applied with add_compile_options()
15
+ -O3 -msimd128 -DNDEBUG  # optimized build with wasm SIMD enabled; NDEBUG compiles out assert()
16
+ # -flto=full # note: this breaks things, better to disable
17
+ -frtti
18
+ -pthread
19
+ -gsource-map
20
+ )
21
+ set(WLLAMA_LINK_OPTIONS  # base emscripten linker flags; extended by the WLLAMA_COMPAT branch below, then applied with add_link_options()
22
+ # -flto=full # note: this breaks things, better to disable
23
+ --no-entry
24
+ -sEXPORT_ES6=0
25
+ -sMODULARIZE=0
26
+ -sINITIAL_MEMORY=128MB  # heap starts at 128MB ...
27
+ -sMAXIMUM_MEMORY=4096MB  # ... and may grow up to 4096MB (growth is enabled by ALLOW_MEMORY_GROWTH below)
28
+ -sSTACK_SIZE=5MB
29
+ -sALLOW_MEMORY_GROWTH=1
30
+ -sFORCE_FILESYSTEM=1
31
+ -sEXPORTED_FUNCTIONS=_main,_wllama_malloc,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug  # C symbols made callable from JS
32
+ -sEXPORTED_RUNTIME_METHODS=ccall,cwrap,HEAPU8,MEMFS,FS,mmapAlloc,ENV,wasmMemory  # emscripten runtime members exposed on the Module object
33
+ -sNO_EXIT_RUNTIME=1
34
+ -sIMPORTED_MEMORY=1
35
+ -sPTHREAD_POOL_SIZE=Module[\"pthreadPoolSize\"]  # pool size is a JS expression evaluated at runtime, not a build-time constant (README-dev.md, "Threading model")
36
+ -sUSE_PTHREADS=1
37
+ -pthread
38
+ -gsource-map
39
+ --emit-symbol-map  # produces the .js.symbols file consumed by scripts/build_source_map.js (README-dev.md, "Compressed source map")
40
+ -Wl,--wrap,fopen  # the --wrap entries redirect calls to these symbols to __wrap_<symbol>; README-dev.md ("Async file read") describes hooking fopen/fseek/fread
41
+ -Wl,--wrap,fclose
42
+ -Wl,--wrap,fread
43
+ -Wl,--wrap,fseek
44
+ -Wl,--wrap,ftell
45
+ -Wl,--wrap,abort
46
+ -Wl,--wrap,ggml_graph_plan # for test-backend-ops
47
+ )
48
+
49
+ if (WLLAMA_COMPAT)  # NOTE(review): WLLAMA_COMPAT is not declared with option() in this file; presumably passed as -DWLLAMA_COMPAT=ON by the build scripts — confirm
50
+ # no wasm exception (not compatible with asyncify - asyncify is needed for firefox and safari)
51
+ # no mem64 (not compatible with safari)
52
+ list(APPEND WLLAMA_COMPILE_OPTIONS
53
+ -fexceptions  # used here in place of the -fwasm-exceptions flag of the default branch
54
+ -pthread  # NOTE(review): already present in the base WLLAMA_COMPILE_OPTIONS list; looks redundant
55
+ )
56
+ list(APPEND WLLAMA_LINK_OPTIONS
57
+ -fexceptions
58
+ -sASYNCIFY=1
59
+ -sASYNCIFY_ADD=['wllama_start','wllama_action']  # same two entry points that the default branch lists in JSPI_EXPORTS
60
+ )
61
+ else()  # default build: 64-bit memory, wasm exceptions and JSPI
62
+ list(APPEND WLLAMA_COMPILE_OPTIONS
63
+ -sMEMORY64=1
64
+ -fwasm-exceptions
65
+ )
66
+ list(APPEND WLLAMA_LINK_OPTIONS
67
+ -sMEMORY64=1
68
+ -fwasm-exceptions
69
+ -sJSPI
70
+ -sJSPI_EXPORTS=['wllama_start','wllama_action']
71
+ )
72
+ endif()
73
+
74
+ add_compile_options(${WLLAMA_COMPILE_OPTIONS})  # directory-scoped: also applies to the llama.cpp subdirectories added after this point
75
+ add_link_options(${WLLAMA_LINK_OPTIONS})  # directory-scoped as well; must stay before add_subdirectory()/add_executable() to take effect on them
76
+
77
+ add_subdirectory(llama.cpp)  # llama.cpp is a git submodule (see .gitmodules)
78
+
79
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})  # NOTE(review): LLAMA_BUILD_NUMBER is not set in this file; presumably defined by llama.cpp — confirm it is visible in this scope
80
+ add_subdirectory(llama.cpp/tools/mtmd)  # added explicitly here; the `mtmd` target is linked into wllama at the end of this file
81
+
82
+ file(GLOB_RECURSE LLAMA_COMMON_SRC  # collect the llama.cpp "common" sources and headers to compile directly into wllama
83
+ CONFIGURE_DEPENDS  # re-check the glob at build time so added/removed files trigger a re-configure
84
+ llama.cpp/common/*.cpp
85
+ llama.cpp/common/*.h
86
+ llama.cpp/common/*.hpp
87
+ llama.cpp/common/jinja/*.cpp  # NOTE(review): GLOB_RECURSE already descends into common/jinja, so the two jinja patterns look redundant
88
+ llama.cpp/common/jinja/*.h
89
+ )
90
+
91
+ list(REMOVE_ITEM LLAMA_COMMON_SRC  # exclude these common sources from the build; absolute paths are used because file(GLOB) returns absolute paths by default
92
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/arg.cpp
93
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/console.cpp
94
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/debug.cpp
95
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/fit.cpp
96
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/hf-cache.cpp
97
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/log.cpp
98
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/preset.cpp
99
+ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/download.cpp
100
+ )
101
+
102
+ set(LLAMA_SERVER_SRC llama.cpp/tools/server/server-context.cpp  # explicit subset of llama.cpp server sources compiled into wllama
103
+ llama.cpp/tools/server/server-task.cpp
104
+ llama.cpp/tools/server/server-chat.cpp
105
+ llama.cpp/tools/server/server-common.cpp
106
+ )
107
+
108
+ set(WLLAMA_SRC ${LLAMA_COMMON_SRC} ${LLAMA_SERVER_SRC}  # full source list for the wllama executable; test-backend-ops may be appended below
109
+ cpp/wllama.cpp
110
+ cpp/glue.hpp
111
+ llama.cpp/include/llama.h)
112
+
113
+ if(WLLAMA_TEST_BACKEND)  # optional: compile llama.cpp's test-backend-ops into the wllama binary
114
+ message(WARNING "Building wllama with test-backend-ops included")
115
+ set(TEST_BACKEND_OPS_SRC ${CMAKE_CURRENT_BINARY_DIR}/test-backend-ops.cpp)  # patched copy lives in the build tree; the submodule source is left untouched
116
+ file(READ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/tests/test-backend-ops.cpp TEST_BACKEND_OPS_CONTENT)
117
+ string(REPLACE "int main(" "int main_test_backend_ops(" TEST_BACKEND_OPS_CONTENT "${TEST_BACKEND_OPS_CONTENT}")  # rename its entry point; presumably to avoid clashing with wllama's own main() — confirm
118
+ file(WRITE ${TEST_BACKEND_OPS_SRC} "${TEST_BACKEND_OPS_CONTENT}")  # runs at configure time, so the copy is rewritten on every configure
119
+ list(APPEND WLLAMA_SRC ${TEST_BACKEND_OPS_SRC})
120
+ add_compile_definitions(WLLAMA_TEST_BACKEND)  # exposes the WLLAMA_TEST_BACKEND macro to the C++ sources
121
+ endif()
122
+
123
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cpp)  # directory-scoped include paths for the wllama sources listed in WLLAMA_SRC
124
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/include)
125
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common)
126
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/vendor)
127
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/tools/server)
128
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/tools/mtmd)
129
+
130
+ add_executable(wllama ${WLLAMA_SRC})  # the only target defined by this file
131
+ target_link_libraries(wllama PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})  # ggml/llama/mtmd targets come from the add_subdirectory() calls above; CMAKE_THREAD_LIBS_INIT is the "-lpthread" set at the top of this file
package/LICENCE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Xuan Son NGUYEN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README-dev.md ADDED
@@ -0,0 +1,178 @@
1
+ # wllama
2
+
3
+ Wllama is a webassembly binding of llama.cpp. It contains the main source code of llama.cpp compiled to wasm (with emscripten), plus a wrapper to provide various convenient APIs, including: downloading and caching models, compatibility, etc.
4
+
5
+ ## Project structure
6
+
7
+ The project has these directories:
8
+ - `src`: the main typescript source code
9
+ - `cpp`: C++ interface
10
+ - `scripts`: various scripts for development
11
+ - `examples`: various examples
12
+
13
+ The project has these main components:
14
+ - `wllama.ts`: the main public API
15
+ - `model-manager.ts`: relies on cache manager to manage models. For example, a model can be composed of multiple files
16
+ - `cache-manager.ts`: interface for managing cache files. It uses OPFS under the hood
17
+ - `huggingface.ts`: utility for managing models downloading from hugging face hub
18
+ - `worker.ts`: the worker manager that is responsible for starting the emscripten worker and maintaining communication with it
19
+ - `glue.ts`: GLUE implementation
20
+ - `wllama.cpp`: the main C++ interface
21
+
22
+ ### GLUE
23
+
24
+ GLUE is a home-grown binary protocol inspired by Protobuf. It is used internally to communicate between the wasm context and the JavaScript context of wllama.
25
+
26
+ The main goal of GLUE is to allow a type-safe interface with low overhead. It works by serializing messages into `ArrayBuffer` and transferring them using [Transferable objects](https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Transferable_objects), which avoids copying.
27
+
28
+ **Wire format:**
29
+ - 4 bytes - magic number (`GLUE`)
30
+ - 4 bytes - version number (`GLUE_VERSION`)
31
+ - 8 bytes - message prototype ID
32
+ - 4 bytes - message length (unsigned)
33
+ - message fields, each encoded as:
34
+ - 4 bytes data type (e.g. `int`, `float`, `str`, `raw`, and array variants)
35
+ - 4 bytes size (only for arrays and strings)
36
+ - data bytes
37
+
38
+ **Supported field types:** `str`, `int`, `float`, `bool`, `raw` (arbitrary bytes), and array variants of each.
39
+
40
+ Upon build, `generate_glue_prototype.js` reads `glue.hpp` and generates `glue/messages.ts`, which provides the TypeScript-side message types used throughout the codebase.
41
+
42
+ ### Threading model
43
+
44
+ Wllama ships a **single wasm build** that supports both single-threaded and multi-threaded execution. The number of threads is determined at runtime rather than at compile time.
45
+
46
+ At startup, wllama checks whether the browser supports `SharedArrayBuffer` (required for wasm threads). This check validates both the existence of `SharedArrayBuffer` and whether the wasm atomics feature is available (COOP/COEP headers must be set by the server for `SharedArrayBuffer` to be accessible).
47
+
48
+ The thread pool size is passed to emscripten via `-sPTHREAD_POOL_SIZE=Module["pthreadPoolSize"]`:
49
+ - If the browser **supports** shared memory: `pthreadPoolSize` is set to the desired thread count (defaults to `hardwareConcurrency / 2`)
50
+ - If the browser **does not support** shared memory: `pthreadPoolSize` is set to `0`, which disables pthreads entirely and falls back to single-threaded execution
51
+
52
+ This logic lives in `wllama.ts` (`isSupportMultiThread()` from `utils.ts` performs the feature detection).
53
+
54
+ ## Startup process
55
+
56
+ Upon startup, these steps are performed:
57
+ - `ProxyToWorker` is created in the main wllama JS context
58
+ - A web worker is spawned, the code is taken from `workers-code/generated.ts`
59
+ - The worker loads emscripten code, sets up the environment then eventually calls the `main()` inside `wllama.cpp`. These preparation steps are injected (see `llama-cpp.js`):
60
+ - Hooking `printf` functions
61
+ - Setting up HeapFS
62
+ - Setting up communication callbacks
63
+
64
+ ## File access
65
+
66
+ Wllama employs some tricks to avoid making copies while reading GGUF files. The runtime uses one of these 2 mechanisms. See `workers-code/llama-cpp.js` for the implementation.
67
+
68
+ Please note that wllama only accepts `Blob` as input data.
69
+
70
+ ### Async file read
71
+
72
+ This implementation hooks into `fopen`, `fseek` and `fread`, and forwards these calls to the main thread (via message port), where we eventually call `Blob.slice()` to read the data. Because of the asynchronous execution via `onmessage` and `postMessage`, JSPI / Asyncify is required.
73
+
74
+ Upon running, action `fs.alloc` is fired to indicate that the file can be read through JSPI / Asyncify call. The actual buffer won't be allocated for the file, but only the metadata is.
75
+
76
+ When wasm calls `fread()`:
77
+ - `fread()` calls `await fileRead()` in the JS context
78
+ - `fileRead()` posts a message of type `fs.read_req` to the main thread
79
+ - Main thread uses `Blob.slice()` to read the data, then sends it back via a `fs.read_res` message
80
+ - Worker's `onmessage` receives the message and resumes the awaiting coroutine
81
+
82
+ Note:
83
+ - While awaiting the read data, the worker should not have any other activities (a global variable is used as a guard and will raise an exception on any incoming messages)
84
+ - The minimum read size is 1MB. If less than this amount is requested, the full 1MB block is cached for subsequent reads. This is because reading GGUF metadata frequently involves reads of less than 1KB at a time, which can become a bottleneck without caching.
85
+ - Env var `USE_ASYNC_FILE` is used to signal from JS to wasm that we are using async file read (upon starting the module). If `USE_ASYNC_FILE` is not set, we fall back to the HeapFS/mmap case (see the next section)
86
+
87
+ ### HeapFS
88
+
89
+ HeapFS is a lightweight wrapper around emscripten's default FS driver. The main goal is to allow `mmap()` to map to existing data instead of copying it (the default emscripten behavior).
90
+
91
+ These steps are performed:
92
+
93
+ - Action `fs.alloc` is fired to create the file handle and file buffer in the wasm context
94
+ - The main thread then creates and holds a `ReadableStream` for the `Blob`
95
+ - The main thread reads the file chunk by chunk, streaming it to the worker via `fs.write` messages
96
+ - Once streaming is finished, the `ReadableStream` is closed
97
+ - The model load is then triggered with `mmap = true`, and `mmap()` is wrapped to return a pointer to the correct data in the buffer allocated in step 1
98
+
99
+ The main downside of this approach is that on WebGPU, even though some tensors can be offloaded to the GPU, we still need to allocate the full model in main memory. For example, a 4GB model will still occupy 4GB of main memory, even if half of the layers (~2GB) are offloaded to the GPU.
100
+
101
+ ## Compressed source map
102
+
103
+ Emscripten's `--emit-symbol-map` flag produces a `.js.symbols` file mapping each wasm function index to its demangled C++ name. `scripts/build_source_map.js` reads this file alongside the `.wasm` binary and produces a single TypeScript file (`src/wasm/source-map.ts`) containing a compact deduplicated name table per build, gzip-compressed and base64-encoded.
104
+
105
+ The script runs automatically as part of the docker build (see `scripts/docker-compose.yml`). It can also be run manually:
106
+
107
+ ```sh
108
+ # uses build/ and build-compat/ by default
109
+ node scripts/build_source_map.js
110
+
111
+ # or with explicit paths
112
+ node scripts/build_source_map.js \
113
+ --input default:build \
114
+ --input compat:build-compat \
115
+ --output src/wasm/source-map.ts
116
+ ```
117
+
118
+ ### Name cleaning rules
119
+
120
+ Raw demangled names can be hundreds of characters. The following rules are applied in order:
121
+
122
+ 1. **std:: collapse** - any name starting with `std::` is replaced with the single hint `std::...`
123
+ 2. **Lambda/closure extraction** - names containing `::$_N` or `::'lambda'` are replaced with the nearest enclosing context (the segment inside the last `<…>` before the marker)
124
+ 3. **Parameter stripping** - parameter lists are dropped; empty `()` is kept, non-empty is removed entirely
125
+ 4. **libc++ internals** - `::__1::`, `::__2::`, etc. are collapsed to `::`
126
+ 5. **ABI tags** - `[abi:…]` annotations are removed
127
+ 6. **Template truncation** - template argument content longer than 10 characters is truncated to `<first10chars...>`
128
+ 7. **Final cleanup** - double `::::` collapsed, whitespace normalised
129
+
130
+ ### Binary format (before gzip)
131
+
132
+ All integers are little-endian.
133
+
134
+ ```
135
+ ┌──────────────────────────────────────────────────────────┐
136
+ │ HEADER (12 bytes) │
137
+ │ u32 first_func_id - wasm function index of entry 0 │
138
+ │ u32 num_funcs - number of functions │
139
+ │ u32 num_names - number of unique names │
140
+ ├──────────────────────────────────────────────────────────┤
141
+ │ NAME TABLE (num_names entries) │
142
+ │ for each name: │
143
+ │ u8 length - byte length of name (max 254) │
144
+ │ u8[] name - UTF-8 string (no null term) │
145
+ ├──────────────────────────────────────────────────────────┤
146
+ │ INDEX ARRAY (num_funcs × u16) │
147
+ │ u16 name_idx - index into name table │
148
+ │ 0xFFFF = no name / unknown │
149
+ └──────────────────────────────────────────────────────────┘
150
+ ```
151
+
152
+ To decode at runtime: base64-decode -> `DecompressionStream('gzip')` -> parse binary. Given a wasm function index `id`, look up `index_array[id - first_func_id]` to get the name table slot.
153
+
154
+ ## Debugging backend ops
155
+
156
+ > [!IMPORTANT]
157
+ >
158
+ > By default, the build does NOT include `test-backend-ops` to save space. If you need to run it, please clone the repo and build it yourself, instructions below
159
+
160
+ Requirements:
161
+ - You have Docker installed and running on your machine
162
+ - On Windows, please use WSL
163
+
164
+ 1. Clone this repo locally: `git clone --recurse-submodules https://github.com/ngxson/wllama.git`
165
+ 2. `npm run build:test && npm run build`
166
+ 3. `npm run serve` and open http://localhost:8080/examples/test-backend-ops/
167
+
168
+ Note: A debugging build cannot be merged to `master` or published to npm
169
+
170
+ ## Build process
171
+
172
+ The build process uses emscripten in docker to compile the project.
173
+
174
+ After compilation, `generate_glue_prototype.js` is called to generate the GLUE message types to be used in TypeScript.
175
+
176
+ Built wasm file will then be copied to the `src` directory.
177
+
178
+ Finally, `build_worker.sh` is called to generate the web worker code.
package/README.md ADDED
@@ -0,0 +1,225 @@
1
+ # wllama - Wasm binding for llama.cpp
2
+
3
+ ![](./README_banner.png)
4
+
5
+ WebAssembly binding for [llama.cpp](https://github.com/ggerganov/llama.cpp)
6
+
7
+ 👉 [Try the demo app](https://huggingface.co/spaces/ngxson/wllama)
8
+
9
+ 👉 See the [blog post](https://reeselevine.github.io/llamas-on-the-web/) introducing WebGPU support in llama.cpp and wllama
10
+
11
+ 📄 [Documentation](https://github.ngxson.com/wllama/docs/)
12
+
13
+ For changelog, please visit [releases page](https://github.com/ngxson/wllama/releases)
14
+
15
+ > [!IMPORTANT]
16
+ >
17
+ > **🔥🔥 V3 is out, with WebGPU, multimodal and tool calling support. Read the [V3 release guide](./guides/intro-v3.md)**
18
+ >
19
+ > For compatibility issues, please refer to [@wllama/wllama-compat](./compat/README.md)
20
+
21
+ ![](./assets/screenshot_0.png)
22
+
23
+ ## Features
24
+
25
+ - 🔌 OpenAI-compatible API (fully-typed built-in)
26
+ - 🚀 WebGPU support
27
+ - 🔥 Multimodal support (image and audio file input)
28
+ - 🔥 Tool calling support
29
+ - Can run inference directly on browser (using [WebAssembly SIMD](https://emscripten.org/docs/porting/simd.html)), no backend or GPU is needed!
30
+ - No runtime dependency (see [package.json](./package.json))
31
+ - Ability to split the model into smaller files and load them in parallel (same as `split` and `cat`)
32
+ - Auto switch between single-thread and multi-thread build based on browser support
33
+ - Inference is done inside a worker, does not block UI render
34
+ - Pre-built npm package [@wllama/wllama](https://www.npmjs.com/package/@wllama/wllama)
35
+
36
+ Limitations:
37
+ - To enable multi-thread, you must add `Cross-Origin-Embedder-Policy` and `Cross-Origin-Opener-Policy` headers. See [this discussion](https://github.com/ffmpegwasm/ffmpeg.wasm/issues/106#issuecomment-913450724) for more details.
38
+ - Max file size is 2GB, due to [size restriction of ArrayBuffer](https://stackoverflow.com/questions/17823225/do-arraybuffers-have-a-maximum-length). If your model is bigger than 2GB, please follow the **Split model** section below.
39
+
40
+ ## Code demo and documentation
41
+
42
+ Demo:
43
+ - Basic usages with completions and embeddings: https://github.ngxson.com/wllama/examples/basic/ ([source code](./examples/basic/index.html))
44
+ - Embedding and cosine distance: https://github.ngxson.com/wllama/examples/embeddings/ ([source code](./examples/embeddings/index.html))
45
+ - Multimodal (vision) completion: https://github.ngxson.com/wllama/examples/multimodal/ ([source code](./examples/multimodal/index.html))
46
+ - Tool calling: https://github.ngxson.com/wllama/examples/tools/ ([source code](./examples/tools/index.html))
47
+
48
+ ## How to use
49
+
50
+ ### Use Wllama inside React Typescript project
51
+
52
+ Install it:
53
+
54
+ ```bash
55
+ npm i @wllama/wllama
56
+ ```
57
+
58
+ Then, import the module:
59
+
60
+ ```ts
61
+ import { Wllama } from '@wllama/wllama';
62
+ let wllamaInstance = new Wllama(WLLAMA_CONFIG_PATHS, ...);
63
+ // (the rest is the same with earlier example)
64
+ ```
65
+
66
+ For complete code example, see [examples/main/src/utils/wllama.context.tsx](./examples/main/src/utils/wllama.context.tsx)
67
+
68
+ NOTE: this example only covers completions usage. For embeddings, please see [examples/embeddings/index.html](./examples/embeddings/index.html)
69
+
70
+ ### WebGPU support
71
+
72
+ WebGPU support is introduced via [PR #215](https://github.com/ngxson/wllama/pull/215).
73
+
74
+ Upon updating to V3.1, WebGPU will be enabled automatically. By default, all layers will be offloaded to GPU. If the model is too big to fit into VRAM, you can manually adjust the number of layers via the `n_gpu_layers` parameter of `LoadModelParams`. Example:
75
+
76
+ ```js
77
+ // (optionally) will allow running WebGPU on Firefox via compat mode; performance will be significantly degraded
78
+ wllama.setCompat('default', 'firefox_safari');
79
+
80
+ await wllama.loadModel(files, {
81
+ n_gpu_layers: 4, // meaning 4 layers are offloaded to GPU; set to 0 to disable GPU inference
82
+ });
83
+ ```
84
+
85
+ ### Prepare your model
86
+
87
+ - It is recommended to split the model into **chunks of maximum 512MB**. This will result in slightly faster download speed (because multiple splits can be downloaded in parallel), and also prevent some out-of-memory issues. **See the "Split model" section below for more details.**
88
+ - It is recommended to use quantized Q4, Q5 or Q6 for a balance among performance, file size and quality. Using IQ (with imatrix) is **not** recommended, as it may result in slow inference and low quality.
89
+
90
+ ### Simple usage with ES6 module
91
+
92
+ For complete code, see [examples/basic/index.html](./examples/basic/index.html)
93
+
94
+ ```javascript
95
+ import { Wllama } from './esm/index.js';
96
+
97
+ (async () => {
98
+ const CONFIG_PATHS = {
99
+ default: './esm/wasm/wllama.wasm',
100
+ };
101
+ // Automatically switch between single-thread and multi-thread version based on browser support
102
+ // If you want to enforce single-thread, add { "n_threads": 1 } to LoadModelConfig
103
+ const wllama = new Wllama(CONFIG_PATHS);
104
+ // Define a function for tracking the model download progress
105
+ const progressCallback = ({ loaded, total }) => {
106
+ // Calculate the progress as a percentage
107
+ const progressPercentage = Math.round((loaded / total) * 100);
108
+ // Log the progress in a user-friendly format
109
+ console.log(`Downloading... ${progressPercentage}%`);
110
+ };
111
+ // Load GGUF from Hugging Face hub
112
+ // (alternatively, you can use loadModelFromUrl if the model is not from HF hub)
113
+ await wllama.loadModelFromHF(
114
+ { repo: 'ggml-org/models', file: 'tinyllamas/stories260K.gguf' },
115
+ { progressCallback }
116
+ );
117
+ const response = await wllama.createChatCompletion({
118
+ messages: [{ role: 'user', content: elemInput.value }],
119
+ max_tokens: 50,
120
+ temperature: 0.5,
121
+ top_k: 40,
122
+ top_p: 0.9,
123
+ });
124
+ console.log(response.choices[0].message.content);
125
+ })();
126
+ ```
127
+
128
+ Alternatively, you can use the `*.wasm` files from CDN:
129
+
130
+ ```js
131
+ import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';
132
+ const wllama = new Wllama(WasmFromCDN);
133
+ // NOTE: this is not recommended, only use when you can't embed wasm files in your project
134
+ ```
135
+
136
+ ### Split model
137
+
138
+ Cases where we want to split the model:
139
+ - Due to [size restriction of ArrayBuffer](https://stackoverflow.com/questions/17823225/do-arraybuffers-have-a-maximum-length), the size limitation of a file is 2GB. If your model is bigger than 2GB, you can split the model into small files.
140
+ - Even with a small model, splitting into chunks allows the browser to download multiple chunks in parallel, thus making the download process a bit faster.
141
+
142
+ We use `llama-gguf-split` to split a big gguf file into smaller files. You can download the pre-built binary via [llama.cpp release page](https://github.com/ggerganov/llama.cpp/releases):
143
+
144
+ ```bash
145
+ # Split the model into chunks of 512 Megabytes
146
+ ./llama-gguf-split --split-max-size 512M ./my_model.gguf ./my_model
147
+ ```
148
+
149
+ This will output files ending with `-00001-of-00003.gguf`, `-00002-of-00003.gguf`, and so on.
150
+
151
+ You can then pass to `loadModelFromUrl` or `loadModelFromHF` the URL of the first file and it will automatically load all the chunks:
152
+
153
+ ```js
154
+ const wllama = new Wllama(CONFIG_PATHS, {
155
+ parallelDownloads: 5, // optional: maximum files to download in parallel (default: 3)
156
+ });
157
+ await wllama.loadModelFromHF({
158
+ repo: 'ngxson/tinyllama_split_test',
159
+ file: 'stories15M-q8_0-00001-of-00003.gguf',
160
+ });
161
+ ```
162
+
163
+ ### Custom logger (suppress debug messages)
164
+
165
+ When initializing Wllama, you can pass a custom logger to Wllama.
166
+
167
+ Example 1: Suppress debug message
168
+
169
+ ```js
170
+ import { Wllama, LoggerWithoutDebug } from '@wllama/wllama';
171
+
172
+ const wllama = new Wllama(pathConfig, {
173
+ // LoggerWithoutDebug is predefined inside wllama
174
+ logger: LoggerWithoutDebug,
175
+ });
176
+ ```
177
+
178
+ Example 2: Add emoji prefix to log messages
179
+
180
+ ```js
181
+ const wllama = new Wllama(pathConfig, {
182
+ logger: {
183
+ debug: (...args) => console.debug('🔧', ...args),
184
+ log: (...args) => console.log('ℹ️', ...args),
185
+ warn: (...args) => console.warn('⚠️', ...args),
186
+ error: (...args) => console.error('☠️', ...args),
187
+ },
188
+ });
189
+ ```
190
+
191
+ ## How to compile the binary yourself
192
+
193
+ This repository already comes with a pre-built binary compiled from the llama.cpp source code. However, in some cases you may want to compile it yourself:
194
+ - You don't trust the pre-built one.
195
+ - You want to try out the latest bleeding-edge changes from the upstream llama.cpp source code.
196
+
197
+ You can use the commands below to compile it yourself:
198
+
199
+ ```shell
200
+ # /!\ IMPORTANT: Require having docker compose installed
201
+
202
+ # Clone the repository with submodule
203
+ git clone --recurse-submodules https://github.com/ngxson/wllama.git
204
+ cd wllama
205
+
206
+ # Optionally, you can run this command to update llama.cpp to the latest upstream version (bleeding-edge, use at your own risk!)
207
+ # git submodule update --remote --merge
208
+
209
+ # Install the required modules
210
+ npm i
211
+
212
+ # Firstly, build llama.cpp into wasm
213
+ npm run build:wasm
214
+ # Then, build ES module
215
+ npm run build
216
+ ```
217
+
218
+ ## TODO
219
+
220
+ - Add support for LoRA adapter
221
+ - Support multi-sequences: knowing the resource limitation when using WASM, I don't think having multi-sequences is a good idea
222
+
223
+ ## Acknowledgments
224
+
225
+ Wllama was created and is maintained by [Xuan-Son Nguyen](https://ngxson.com/). The WebGPU backend for llama.cpp is maintained by [Reese Levine](https://reeselevine.github.io/). We thank all other contributors to both wllama and llama.cpp, whose work made this project possible.
Binary file
Binary file