@agorapete/wllama 3.5.1-q2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +3 -0
- package/.prettierignore +38 -0
- package/AGENTS.md +1 -0
- package/CMakeLists.txt +131 -0
- package/LICENCE +21 -0
- package/README-dev.md +178 -0
- package/README.md +225 -0
- package/README_banner.png +0 -0
- package/assets/screenshot_0.png +0 -0
- package/cpp/generate_glue_prototype.js +115 -0
- package/cpp/glue.hpp +664 -0
- package/cpp/test_glue.cpp +80 -0
- package/cpp/wllama-context.h +1172 -0
- package/cpp/wllama-fs.h +148 -0
- package/cpp/wllama.cpp +187 -0
- package/cpp/wllama.h +6 -0
- package/esm/cache-manager.d.ts +130 -0
- package/esm/debug.d.ts +28 -0
- package/esm/glue/glue.d.ts +22 -0
- package/esm/glue/messages.d.ts +146 -0
- package/esm/huggingface.d.ts +31 -0
- package/esm/index.cjs +3406 -0
- package/esm/index.d.ts +8 -0
- package/esm/index.js +3387 -0
- package/esm/index.min.js +1 -0
- package/esm/index.min.js.map +1 -0
- package/esm/model-manager.d.ts +136 -0
- package/esm/storage/cos.d.ts +36 -0
- package/esm/storage/index.d.ts +33 -0
- package/esm/storage/opfs.d.ts +12 -0
- package/esm/types/oai-compat.d.ts +278 -0
- package/esm/types/types.d.ts +112 -0
- package/esm/utils.d.ts +119 -0
- package/esm/wasm/source-map.d.ts +1 -0
- package/esm/wasm/wllama.wasm +0 -0
- package/esm/wasm-from-cdn.d.ts +8 -0
- package/esm/wllama.d.ts +397 -0
- package/esm/worker.d.ts +92 -0
- package/esm/workers-code/generated.d.ts +4 -0
- package/guides/intro-v2.md +132 -0
- package/guides/intro-v3.1.md +40 -0
- package/guides/intro-v3.md +230 -0
- package/index.ts +1 -0
- package/package.json +71 -0
- package/scripts/bisect_test.sh +33 -0
- package/scripts/build_hf_space.sh +26 -0
- package/scripts/build_source_map.js +269 -0
- package/scripts/build_wasm.sh +19 -0
- package/scripts/build_worker.sh +38 -0
- package/scripts/check_debug_build.js +30 -0
- package/scripts/check_package_size.js +25 -0
- package/scripts/docker-compose.yml +76 -0
- package/scripts/generate_wasm_from_cdn.js +24 -0
- package/scripts/http_server.js +44 -0
- package/scripts/post_build.sh +32 -0
- package/src/cache-manager.ts +358 -0
- package/src/debug.ts +111 -0
- package/src/glue/glue.ts +291 -0
- package/src/glue/messages.ts +773 -0
- package/src/huggingface.ts +151 -0
- package/src/index.ts +8 -0
- package/src/mjs.test.ts +44 -0
- package/src/model-manager.test.ts +200 -0
- package/src/model-manager.ts +359 -0
- package/src/storage/cos.test.ts +83 -0
- package/src/storage/cos.ts +171 -0
- package/src/storage/index.ts +40 -0
- package/src/storage/opfs.ts +119 -0
- package/src/types/oai-compat.ts +342 -0
- package/src/types/types.ts +133 -0
- package/src/utils.test.ts +231 -0
- package/src/utils.ts +403 -0
- package/src/wasm/source-map.ts +7 -0
- package/src/wasm/wllama.js +1 -0
- package/src/wasm/wllama.wasm +0 -0
- package/src/wasm-from-cdn.ts +13 -0
- package/src/wllama.test.ts +392 -0
- package/src/wllama.ts +1138 -0
- package/src/wllama.wgpu.test.ts +62 -0
- package/src/worker.ts +443 -0
- package/src/workers-code/generated.ts +11 -0
- package/src/workers-code/llama-cpp.js +511 -0
- package/src/workers-code/opfs-utils.js +150 -0
- package/tsconfig.build.json +34 -0
- package/tsup.config.ts +23 -0
- package/vitest.config.ts +61 -0
package/.gitmodules
ADDED
package/.prettierignore
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
**/.vscode
|
|
2
|
+
**/.github
|
|
3
|
+
**/.git
|
|
4
|
+
**/.svn
|
|
5
|
+
**/.hg
|
|
6
|
+
**/node_modules
|
|
7
|
+
**/dist
|
|
8
|
+
**/docs
|
|
9
|
+
|
|
10
|
+
/llama.cpp
|
|
11
|
+
|
|
12
|
+
/examples/advanced
|
|
13
|
+
/examples/basic
|
|
14
|
+
/examples/embeddings
|
|
15
|
+
|
|
16
|
+
/scripts
|
|
17
|
+
/esm
|
|
18
|
+
/models
|
|
19
|
+
/build
|
|
20
|
+
|
|
21
|
+
/src/multi-thread
|
|
22
|
+
/src/single-thread
|
|
23
|
+
/src/wasm
|
|
24
|
+
/src/workers-code/generated.ts
|
|
25
|
+
/src/wasm-from-cdn.ts
|
|
26
|
+
/src/glue/messages.ts
|
|
27
|
+
|
|
28
|
+
/compat/wasm
|
|
29
|
+
|
|
30
|
+
*.md
|
|
31
|
+
*.mdx
|
|
32
|
+
*.json
|
|
33
|
+
*.lock
|
|
34
|
+
*.yml
|
|
35
|
+
*.cpp
|
|
36
|
+
*.hpp
|
|
37
|
+
|
|
38
|
+
*.config.js
|
package/AGENTS.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Refer to `README-dev.md` for development documentation.
|
package/CMakeLists.txt
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.14)
|
|
2
|
+
project("wllama")
|
|
3
|
+
|
|
4
|
+
option(WLLAMA_TEST_BACKEND "Build wllama with test-backend-ops included" OFF)
|
|
5
|
+
|
|
6
|
+
set(MTMD_VIDEO OFF CACHE BOOL "" FORCE)
|
|
7
|
+
|
|
8
|
+
set(CMAKE_THREAD_LIBS_INIT "-lpthread")
|
|
9
|
+
set(CMAKE_HAVE_THREADS_LIBRARY 1)
|
|
10
|
+
set(CMAKE_USE_WIN32_THREADS_INIT 0)
|
|
11
|
+
set(CMAKE_USE_PTHREADS_INIT 1)
|
|
12
|
+
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
|
13
|
+
|
|
14
|
+
set(WLLAMA_COMPILE_OPTIONS
|
|
15
|
+
-O3 -msimd128 -DNDEBUG
|
|
16
|
+
# -flto=full # note: this breaks things, better to disable
|
|
17
|
+
-frtti
|
|
18
|
+
-pthread
|
|
19
|
+
-gsource-map
|
|
20
|
+
)
|
|
21
|
+
set(WLLAMA_LINK_OPTIONS
|
|
22
|
+
# -flto=full # note: this breaks things, better to disable
|
|
23
|
+
--no-entry
|
|
24
|
+
-sEXPORT_ES6=0
|
|
25
|
+
-sMODULARIZE=0
|
|
26
|
+
-sINITIAL_MEMORY=128MB
|
|
27
|
+
-sMAXIMUM_MEMORY=4096MB
|
|
28
|
+
-sSTACK_SIZE=5MB
|
|
29
|
+
-sALLOW_MEMORY_GROWTH=1
|
|
30
|
+
-sFORCE_FILESYSTEM=1
|
|
31
|
+
-sEXPORTED_FUNCTIONS=_main,_wllama_malloc,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug
|
|
32
|
+
-sEXPORTED_RUNTIME_METHODS=ccall,cwrap,HEAPU8,MEMFS,FS,mmapAlloc,ENV,wasmMemory
|
|
33
|
+
-sNO_EXIT_RUNTIME=1
|
|
34
|
+
-sIMPORTED_MEMORY=1
|
|
35
|
+
-sPTHREAD_POOL_SIZE=Module[\"pthreadPoolSize\"]
|
|
36
|
+
-sUSE_PTHREADS=1
|
|
37
|
+
-pthread
|
|
38
|
+
-gsource-map
|
|
39
|
+
--emit-symbol-map
|
|
40
|
+
-Wl,--wrap,fopen
|
|
41
|
+
-Wl,--wrap,fclose
|
|
42
|
+
-Wl,--wrap,fread
|
|
43
|
+
-Wl,--wrap,fseek
|
|
44
|
+
-Wl,--wrap,ftell
|
|
45
|
+
-Wl,--wrap,abort
|
|
46
|
+
-Wl,--wrap,ggml_graph_plan # for test-backend-ops
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
if (WLLAMA_COMPAT)
|
|
50
|
+
# no wasm exception (not compatible with asyncify - asyncify is needed for firefox and safari)
|
|
51
|
+
# no mem64 (not compatible with safari)
|
|
52
|
+
list(APPEND WLLAMA_COMPILE_OPTIONS
|
|
53
|
+
-fexceptions
|
|
54
|
+
-pthread
|
|
55
|
+
)
|
|
56
|
+
list(APPEND WLLAMA_LINK_OPTIONS
|
|
57
|
+
-fexceptions
|
|
58
|
+
-sASYNCIFY=1
|
|
59
|
+
-sASYNCIFY_ADD=['wllama_start','wllama_action']
|
|
60
|
+
)
|
|
61
|
+
else()
|
|
62
|
+
list(APPEND WLLAMA_COMPILE_OPTIONS
|
|
63
|
+
-sMEMORY64=1
|
|
64
|
+
-fwasm-exceptions
|
|
65
|
+
)
|
|
66
|
+
list(APPEND WLLAMA_LINK_OPTIONS
|
|
67
|
+
-sMEMORY64=1
|
|
68
|
+
-fwasm-exceptions
|
|
69
|
+
-sJSPI
|
|
70
|
+
-sJSPI_EXPORTS=['wllama_start','wllama_action']
|
|
71
|
+
)
|
|
72
|
+
endif()
|
|
73
|
+
|
|
74
|
+
add_compile_options(${WLLAMA_COMPILE_OPTIONS})
|
|
75
|
+
add_link_options(${WLLAMA_LINK_OPTIONS})
|
|
76
|
+
|
|
77
|
+
add_subdirectory(llama.cpp)
|
|
78
|
+
|
|
79
|
+
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
|
|
80
|
+
add_subdirectory(llama.cpp/tools/mtmd)
|
|
81
|
+
|
|
82
|
+
file(GLOB_RECURSE LLAMA_COMMON_SRC
|
|
83
|
+
CONFIGURE_DEPENDS
|
|
84
|
+
llama.cpp/common/*.cpp
|
|
85
|
+
llama.cpp/common/*.h
|
|
86
|
+
llama.cpp/common/*.hpp
|
|
87
|
+
llama.cpp/common/jinja/*.cpp
|
|
88
|
+
llama.cpp/common/jinja/*.h
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
list(REMOVE_ITEM LLAMA_COMMON_SRC
|
|
92
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/arg.cpp
|
|
93
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/console.cpp
|
|
94
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/debug.cpp
|
|
95
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/fit.cpp
|
|
96
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/hf-cache.cpp
|
|
97
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/log.cpp
|
|
98
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/preset.cpp
|
|
99
|
+
${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common/download.cpp
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
set(LLAMA_SERVER_SRC llama.cpp/tools/server/server-context.cpp
|
|
103
|
+
llama.cpp/tools/server/server-task.cpp
|
|
104
|
+
llama.cpp/tools/server/server-chat.cpp
|
|
105
|
+
llama.cpp/tools/server/server-common.cpp
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
set(WLLAMA_SRC ${LLAMA_COMMON_SRC} ${LLAMA_SERVER_SRC}
|
|
109
|
+
cpp/wllama.cpp
|
|
110
|
+
cpp/glue.hpp
|
|
111
|
+
llama.cpp/include/llama.h)
|
|
112
|
+
|
|
113
|
+
if(WLLAMA_TEST_BACKEND)
|
|
114
|
+
message(WARNING "Building wllama with test-backend-ops included")
|
|
115
|
+
set(TEST_BACKEND_OPS_SRC ${CMAKE_CURRENT_BINARY_DIR}/test-backend-ops.cpp)
|
|
116
|
+
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/tests/test-backend-ops.cpp TEST_BACKEND_OPS_CONTENT)
|
|
117
|
+
string(REPLACE "int main(" "int main_test_backend_ops(" TEST_BACKEND_OPS_CONTENT "${TEST_BACKEND_OPS_CONTENT}")
|
|
118
|
+
file(WRITE ${TEST_BACKEND_OPS_SRC} "${TEST_BACKEND_OPS_CONTENT}")
|
|
119
|
+
list(APPEND WLLAMA_SRC ${TEST_BACKEND_OPS_SRC})
|
|
120
|
+
add_compile_definitions(WLLAMA_TEST_BACKEND)
|
|
121
|
+
endif()
|
|
122
|
+
|
|
123
|
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cpp)
|
|
124
|
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/include)
|
|
125
|
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common)
|
|
126
|
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/vendor)
|
|
127
|
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/tools/server)
|
|
128
|
+
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/tools/mtmd)
|
|
129
|
+
|
|
130
|
+
add_executable(wllama ${WLLAMA_SRC})
|
|
131
|
+
target_link_libraries(wllama PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
|
package/LICENCE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Xuan Son NGUYEN
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README-dev.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# wllama
|
|
2
|
+
|
|
3
|
+
Wllama is a webassembly binding of llama.cpp. It contains the main source code of llama.cpp compiled to wasm (with emscripten), plus a wrapper to provide various convenient APIs, including: downloading and caching models, compatibility, etc.
|
|
4
|
+
|
|
5
|
+
## Project structure
|
|
6
|
+
|
|
7
|
+
The project has these directories:
|
|
8
|
+
- `src`: the main typescript source code
|
|
9
|
+
- `cpp`: C++ interface
|
|
10
|
+
- `scripts`: various scripts for development
|
|
11
|
+
- `examples`: various examples
|
|
12
|
+
|
|
13
|
+
The project has these main components:
|
|
14
|
+
- `wllama.ts`: the main public API
|
|
15
|
+
- `model-manager.ts`: relies on cache manager to manage models. For example, a model can be composed of multiple files
|
|
16
|
+
- `cache-manager.ts`: interface for managing cache files. It uses OPFS under the hood
|
|
17
|
+
- `huggingface.ts`: utility for managing model downloads from the Hugging Face Hub
|
|
18
|
+
- `worker.ts`: the worker manager, responsible for starting the emscripten worker and maintaining communication with it
|
|
19
|
+
- `glue.ts`: GLUE implementation
|
|
20
|
+
- `wllama.cpp`: the main C++ interface
|
|
21
|
+
|
|
22
|
+
### GLUE
|
|
23
|
+
|
|
24
|
+
GLUE is a home-grown binary protocol inspired by Protobuf. It is used internally to communicate between the wasm context and the JavaScript context of wllama.
|
|
25
|
+
|
|
26
|
+
The main goal of GLUE is to allow a type-safe interface with low overhead. It works by serializing messages into `ArrayBuffer` and transferring them using [Transferable objects](https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Transferable_objects), which avoids copying.
|
|
27
|
+
|
|
28
|
+
**Wire format:**
|
|
29
|
+
- 4 bytes - magic number (`GLUE`)
|
|
30
|
+
- 4 bytes - version number (`GLUE_VERSION`)
|
|
31
|
+
- 8 bytes - message prototype ID
|
|
32
|
+
- 4 bytes - message length (unsigned)
|
|
33
|
+
- message fields, each encoded as:
|
|
34
|
+
- 4 bytes data type (e.g. `int`, `float`, `str`, `raw`, and array variants)
|
|
35
|
+
- 4 bytes size (only for arrays and strings)
|
|
36
|
+
- data bytes
|
|
37
|
+
|
|
38
|
+
**Supported field types:** `str`, `int`, `float`, `bool`, `raw` (arbitrary bytes), and array variants of each.
|
|
39
|
+
|
|
40
|
+
Upon build, `generate_glue_prototype.js` reads `glue.hpp` and generates `glue/messages.ts`, which provides the TypeScript-side message types used throughout the codebase.
|
|
41
|
+
|
|
42
|
+
### Threading model
|
|
43
|
+
|
|
44
|
+
Wllama ships a **single wasm build** that supports both single-threaded and multi-threaded execution. The number of threads is determined at runtime rather than at compile time.
|
|
45
|
+
|
|
46
|
+
At startup, wllama checks whether the browser supports `SharedArrayBuffer` (required for wasm threads). This check validates both the existence of `SharedArrayBuffer` and whether the wasm atomics feature is available (COOP/COEP headers must be set by the server for `SharedArrayBuffer` to be accessible).
|
|
47
|
+
|
|
48
|
+
The thread pool size is passed to emscripten via `-sPTHREAD_POOL_SIZE=Module["pthreadPoolSize"]`:
|
|
49
|
+
- If the browser **supports** shared memory: `pthreadPoolSize` is set to the desired thread count (defaults to `hardwareConcurrency / 2`)
|
|
50
|
+
- If the browser **does not support** shared memory: `pthreadPoolSize` is set to `0`, which disables pthreads entirely and falls back to single-threaded execution
|
|
51
|
+
|
|
52
|
+
This logic lives in `wllama.ts` (`isSupportMultiThread()` from `utils.ts` performs the feature detection).
|
|
53
|
+
|
|
54
|
+
## Startup process
|
|
55
|
+
|
|
56
|
+
Upon startup, these steps are performed:
|
|
57
|
+
- `ProxyToWorker` is created in the main wllama JS context
|
|
58
|
+
- A web worker is spawned, the code is taken from `workers-code/generated.ts`
|
|
59
|
+
- The worker loads emscripten code, sets up the environment then eventually calls the `main()` inside `wllama.cpp`. These preparation steps are injected (see `llama-cpp.js`):
|
|
60
|
+
- Hooking `printf` functions
|
|
61
|
+
- Setting up HeapFS
|
|
62
|
+
- Setting up communication callbacks
|
|
63
|
+
|
|
64
|
+
## File access
|
|
65
|
+
|
|
66
|
+
Wllama employs some tricks to avoid making copies while reading GGUF files. The runtime uses one of these 2 mechanisms. See `workers-code/llama-cpp.js` for the implementation.
|
|
67
|
+
|
|
68
|
+
Please note that wllama only accepts `Blob` as input data.
|
|
69
|
+
|
|
70
|
+
### Async file read
|
|
71
|
+
|
|
72
|
+
This implementation hooks into `fopen`, `fseek` and `fread`, and forwards these calls to the main thread (via message port), where we eventually call `Blob.slice()` to read the data. Because of the asynchronous execution via `onmessage` and `postMessage`, JSPI / Asyncify is required.
|
|
73
|
+
|
|
74
|
+
Upon running, action `fs.alloc` is fired to indicate that the file can be read through a JSPI / Asyncify call. No actual buffer is allocated for the file; only its metadata is.
|
|
75
|
+
|
|
76
|
+
When wasm calls `fread()`:
|
|
77
|
+
- `fread()` calls `await fileRead()` in the JS context
|
|
78
|
+
- `fileRead()` posts a message of type `fs.read_req` to the main thread
|
|
79
|
+
- Main thread uses `Blob.slice()` to read the data, then sends it back via a `fs.read_res` message
|
|
80
|
+
- Worker's `onmessage` receives the message and resumes the awaiting coroutine
|
|
81
|
+
|
|
82
|
+
Note:
|
|
83
|
+
- While awaiting the read data, the worker should not have any other activities (a global variable is used as a guard and will raise an exception on any incoming messages)
|
|
84
|
+
- The minimum read size is 1MB. If less than this amount is requested, the full 1MB block is cached for subsequent reads. This is because reading GGUF metadata frequently involves reads of less than 1KB at a time, which can become a bottleneck without caching.
|
|
85
|
+
- Env var `USE_ASYNC_FILE` is used to signal from JS to wasm that we are using async file read (upon starting the module). If `USE_ASYNC_FILE` is not set, we fall back to the HeapFS/mmap case (see the next section)
|
|
86
|
+
|
|
87
|
+
### HeapFS
|
|
88
|
+
|
|
89
|
+
HeapFS is a lightweight wrapper around emscripten's default FS driver. The main goal is to allow `mmap()` to map to existing data instead of copying it (the default emscripten behavior).
|
|
90
|
+
|
|
91
|
+
These steps are performed:
|
|
92
|
+
|
|
93
|
+
- Action `fs.alloc` is fired to create the file handle and file buffer in the wasm context
|
|
94
|
+
- The main thread then creates and holds a `ReadableStream` for the `Blob`
|
|
95
|
+
- The main thread reads the file chunk by chunk, streaming it to the worker via `fs.write` messages
|
|
96
|
+
- Once streaming is finished, the `ReadableStream` is closed
|
|
97
|
+
- The model load is then triggered with `mmap = true`, and `mmap()` is wrapped to return a pointer to the correct data in the buffer allocated in step 1
|
|
98
|
+
|
|
99
|
+
The main downside of this approach is that on WebGPU, even though some tensors can be offloaded to the GPU, we still need to allocate the full model in main memory. For example, a 4GB model will still occupy 4GB of main memory, even if half of the layers (~2GB) are offloaded to the GPU.
|
|
100
|
+
|
|
101
|
+
## Compressed source map
|
|
102
|
+
|
|
103
|
+
Emscripten's `--emit-symbol-map` flag produces a `.js.symbols` file mapping each wasm function index to its demangled C++ name. `scripts/build_source_map.js` reads this file alongside the `.wasm` binary and produces a single TypeScript file (`src/wasm/source-map.ts`) containing a compact deduplicated name table per build, gzip-compressed and base64-encoded.
|
|
104
|
+
|
|
105
|
+
The script runs automatically as part of the docker build (see `scripts/docker-compose.yml`). It can also be run manually:
|
|
106
|
+
|
|
107
|
+
```sh
|
|
108
|
+
# uses build/ and build-compat/ by default
|
|
109
|
+
node scripts/build_source_map.js
|
|
110
|
+
|
|
111
|
+
# or with explicit paths
|
|
112
|
+
node scripts/build_source_map.js \
|
|
113
|
+
--input default:build \
|
|
114
|
+
--input compat:build-compat \
|
|
115
|
+
--output src/wasm/source-map.ts
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Name cleaning rules
|
|
119
|
+
|
|
120
|
+
Raw demangled names can be hundreds of characters. The following rules are applied in order:
|
|
121
|
+
|
|
122
|
+
1. **std:: collapse** - any name starting with `std::` is replaced with the single hint `std::...`
|
|
123
|
+
2. **Lambda/closure extraction** - names containing `::$_N` or `::'lambda'` are replaced with the nearest enclosing context (the segment inside the last `<…>` before the marker)
|
|
124
|
+
3. **Parameter stripping** - parameter lists are dropped; empty `()` is kept, non-empty is removed entirely
|
|
125
|
+
4. **libc++ internals** - `::__1::`, `::__2::`, etc. are collapsed to `::`
|
|
126
|
+
5. **ABI tags** - `[abi:…]` annotations are removed
|
|
127
|
+
6. **Template truncation** - template argument content longer than 10 characters is truncated to `<first10chars...>`
|
|
128
|
+
7. **Final cleanup** - double `::::` collapsed, whitespace normalised
|
|
129
|
+
|
|
130
|
+
### Binary format (before gzip)
|
|
131
|
+
|
|
132
|
+
All integers are little-endian.
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
┌──────────────────────────────────────────────────────────┐
|
|
136
|
+
│ HEADER (12 bytes) │
|
|
137
|
+
│ u32 first_func_id - wasm function index of entry 0 │
|
|
138
|
+
│ u32 num_funcs - number of functions │
|
|
139
|
+
│ u32 num_names - number of unique names │
|
|
140
|
+
├──────────────────────────────────────────────────────────┤
|
|
141
|
+
│ NAME TABLE (num_names entries) │
|
|
142
|
+
│ for each name: │
|
|
143
|
+
│ u8 length - byte length of name (max 254) │
|
|
144
|
+
│ u8[] name - UTF-8 string (no null term) │
|
|
145
|
+
├──────────────────────────────────────────────────────────┤
|
|
146
|
+
│ INDEX ARRAY (num_funcs × u16) │
|
|
147
|
+
│ u16 name_idx - index into name table │
|
|
148
|
+
│ 0xFFFF = no name / unknown │
|
|
149
|
+
└──────────────────────────────────────────────────────────┘
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
To decode at runtime: base64-decode -> `DecompressionStream('gzip')` -> parse binary. Given a wasm function index `id`, look up `index_array[id - first_func_id]` to get the name table slot.
|
|
153
|
+
%
|
|
154
|
+
## Debugging backend ops
|
|
155
|
+
|
|
156
|
+
> [!IMPORTANT]
|
|
157
|
+
>
|
|
158
|
+
> By default, the build does NOT include `test-backend-ops` to save space. If you need to run it, please clone the repo and build it yourself by following the instructions below.
|
|
159
|
+
|
|
160
|
+
Requirements:
|
|
161
|
+
- You have Docker installed and running on your machine
|
|
162
|
+
- On Windows, please use WSL
|
|
163
|
+
|
|
164
|
+
1. Clone this repo locally: `git clone --recurse-submodules https://github.com/ngxson/wllama.git`
|
|
165
|
+
2. `npm run build:test && npm run build`
|
|
166
|
+
3. `npm run serve` and open http://localhost:8080/examples/test-backend-ops/
|
|
167
|
+
|
|
168
|
+
Note: A debugging build cannot be merged to `master` or published to npm
|
|
169
|
+
|
|
170
|
+
## Build process
|
|
171
|
+
|
|
172
|
+
The build process uses emscripten in docker to compile the project.
|
|
173
|
+
|
|
174
|
+
After compilation, `generate_glue_prototype.js` is called to generate the GLUE message types to be used in TypeScript.
|
|
175
|
+
|
|
176
|
+
The built wasm file is then copied to the `src` directory.
|
|
177
|
+
|
|
178
|
+
Finally, `build_worker.sh` is called to generate the web worker code.
|
package/README.md
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# wllama - Wasm binding for llama.cpp
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
WebAssembly binding for [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
|
6
|
+
|
|
7
|
+
👉 [Try the demo app](https://huggingface.co/spaces/ngxson/wllama)
|
|
8
|
+
|
|
9
|
+
👉 See the [blog post](https://reeselevine.github.io/llamas-on-the-web/) introducing WebGPU support in llama.cpp and wllama
|
|
10
|
+
|
|
11
|
+
📄 [Documentation](https://github.ngxson.com/wllama/docs/)
|
|
12
|
+
|
|
13
|
+
For changelog, please visit [releases page](https://github.com/ngxson/wllama/releases)
|
|
14
|
+
|
|
15
|
+
> [!IMPORTANT]
|
|
16
|
+
>
|
|
17
|
+
> **🔥🔥 V3 is out, with WebGPU, multimodal and tool calling support. Read the [V3 release guide](./guides/intro-v3.md)**
|
|
18
|
+
>
|
|
19
|
+
> For compatibility issues, please refer to [@wllama/wllama-compat](./compat/README.md)
|
|
20
|
+
|
|
21
|
+

|
|
22
|
+
|
|
23
|
+
## Features
|
|
24
|
+
|
|
25
|
+
- 🔌 OpenAI-compatible API (fully-typed built-in)
|
|
26
|
+
- 🚀 WebGPU support
|
|
27
|
+
- 🔥 Multimodal support (image and audio file input)
|
|
28
|
+
- 🔥 Tool calling support
|
|
29
|
+
- Can run inference directly in the browser (using [WebAssembly SIMD](https://emscripten.org/docs/porting/simd.html)), no backend or GPU is needed!
|
|
30
|
+
- No runtime dependency (see [package.json](./package.json))
|
|
31
|
+
- Ability to split the model into smaller files and load them in parallel (same as `split` and `cat`)
|
|
32
|
+
- Auto switch between single-thread and multi-thread build based on browser support
|
|
33
|
+
- Inference is done inside a worker, does not block UI render
|
|
34
|
+
- Pre-built npm package [@wllama/wllama](https://www.npmjs.com/package/@wllama/wllama)
|
|
35
|
+
|
|
36
|
+
Limitations:
|
|
37
|
+
- To enable multi-thread, you must add `Cross-Origin-Embedder-Policy` and `Cross-Origin-Opener-Policy` headers. See [this discussion](https://github.com/ffmpegwasm/ffmpeg.wasm/issues/106#issuecomment-913450724) for more details.
|
|
38
|
+
- Max file size is 2GB, due to [size restriction of ArrayBuffer](https://stackoverflow.com/questions/17823225/do-arraybuffers-have-a-maximum-length). If your model is bigger than 2GB, please follow the **Split model** section below.
|
|
39
|
+
|
|
40
|
+
## Code demo and documentation
|
|
41
|
+
|
|
42
|
+
Demo:
|
|
43
|
+
- Basic usages with completions and embeddings: https://github.ngxson.com/wllama/examples/basic/ ([source code](./examples/basic/index.html))
|
|
44
|
+
- Embedding and cosine distance: https://github.ngxson.com/wllama/examples/embeddings/ ([source code](./examples/embeddings/index.html))
|
|
45
|
+
- Multimodal (vision) completion: https://github.ngxson.com/wllama/examples/multimodal/ ([source code](./examples/multimodal/index.html))
|
|
46
|
+
- Tool calling: https://github.ngxson.com/wllama/examples/tools/ ([source code](./examples/tools/index.html))
|
|
47
|
+
|
|
48
|
+
## How to use
|
|
49
|
+
|
|
50
|
+
### Use Wllama inside React Typescript project
|
|
51
|
+
|
|
52
|
+
Install it:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
npm i @wllama/wllama
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Then, import the module:
|
|
59
|
+
|
|
60
|
+
```ts
|
|
61
|
+
import { Wllama } from '@wllama/wllama';
|
|
62
|
+
let wllamaInstance = new Wllama(WLLAMA_CONFIG_PATHS, ...);
|
|
63
|
+
// (the rest is the same with earlier example)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
For a complete code example, see [examples/main/src/utils/wllama.context.tsx](./examples/main/src/utils/wllama.context.tsx)
|
|
67
|
+
|
|
68
|
+
NOTE: this example only covers completions usage. For embeddings, please see [examples/embeddings/index.html](./examples/embeddings/index.html)
|
|
69
|
+
|
|
70
|
+
### WebGPU support
|
|
71
|
+
|
|
72
|
+
WebGPU support is introduced via [PR #215](https://github.com/ngxson/wllama/pull/215).
|
|
73
|
+
|
|
74
|
+
Upon updating to V3.1, WebGPU will be enabled automatically. By default, all layers will be offloaded to GPU. If the model is too big to fit into VRAM, you can manually adjust the number of layers via the `n_gpu_layers` parameter of `LoadModelParams`. Example:
|
|
75
|
+
|
|
76
|
+
```js
|
|
77
|
+
// (optionally) will allow running WebGPU on Firefox via compat mode; performance will be significantly degraded
|
|
78
|
+
wllama.setCompat('default', 'firefox_safari');
|
|
79
|
+
|
|
80
|
+
await wllama.loadModel(files, {
|
|
81
|
+
n_gpu_layers: 4, // meaning 4 layers are offloaded to GPU; set to 0 to disable GPU inference
|
|
82
|
+
});
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Prepare your model
|
|
86
|
+
|
|
87
|
+
- It is recommended to split the model into **chunks of maximum 512MB**. This will result in slightly faster download speed (because multiple splits can be downloaded in parallel), and also prevent some out-of-memory issues. **See the "Split model" section below for more details.**
|
|
88
|
+
- It is recommended to use Q4, Q5 or Q6 quantization for a good balance among performance, file size and quality. Using IQ quantization (with imatrix) is **not** recommended, as it may result in slow inference and low quality.
|
|
89
|
+
|
|
90
|
+
### Simple usage with ES6 module
|
|
91
|
+
|
|
92
|
+
For complete code, see [examples/basic/index.html](./examples/basic/index.html)
|
|
93
|
+
|
|
94
|
+
```javascript
|
|
95
|
+
import { Wllama } from './esm/index.js';
|
|
96
|
+
|
|
97
|
+
(async () => {
|
|
98
|
+
const CONFIG_PATHS = {
|
|
99
|
+
default: './esm/wasm/wllama.wasm',
|
|
100
|
+
};
|
|
101
|
+
// Automatically switch between single-thread and multi-thread version based on browser support
|
|
102
|
+
// If you want to enforce single-thread, add { "n_threads": 1 } to LoadModelConfig
|
|
103
|
+
const wllama = new Wllama(CONFIG_PATHS);
|
|
104
|
+
// Define a function for tracking the model download progress
|
|
105
|
+
const progressCallback = ({ loaded, total }) => {
|
|
106
|
+
// Calculate the progress as a percentage
|
|
107
|
+
const progressPercentage = Math.round((loaded / total) * 100);
|
|
108
|
+
// Log the progress in a user-friendly format
|
|
109
|
+
console.log(`Downloading... ${progressPercentage}%`);
|
|
110
|
+
};
|
|
111
|
+
// Load GGUF from Hugging Face hub
|
|
112
|
+
// (alternatively, you can use loadModelFromUrl if the model is not from HF hub)
|
|
113
|
+
await wllama.loadModelFromHF(
|
|
114
|
+
{ repo: 'ggml-org/models', file: 'tinyllamas/stories260K.gguf' },
|
|
115
|
+
{ progressCallback }
|
|
116
|
+
);
|
|
117
|
+
const response = await wllama.createChatCompletion({
|
|
118
|
+
messages: [{ role: 'user', content: elemInput.value }],
|
|
119
|
+
max_tokens: 50,
|
|
120
|
+
temperature: 0.5,
|
|
121
|
+
top_k: 40,
|
|
122
|
+
top_p: 0.9,
|
|
123
|
+
});
|
|
124
|
+
console.log(response.choices[0].message.content);
|
|
125
|
+
})();
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Alternatively, you can use the `*.wasm` files from CDN:
|
|
129
|
+
|
|
130
|
+
```js
|
|
131
|
+
import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';
|
|
132
|
+
const wllama = new Wllama(WasmFromCDN);
|
|
133
|
+
// NOTE: this is not recommended, only use when you can't embed wasm files in your project
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Split model
|
|
137
|
+
|
|
138
|
+
Cases where we want to split the model:
|
|
139
|
+
- Due to [size restriction of ArrayBuffer](https://stackoverflow.com/questions/17823225/do-arraybuffers-have-a-maximum-length), the size limitation of a file is 2GB. If your model is bigger than 2GB, you can split the model into small files.
|
|
140
|
+
- Even with a small model, splitting into chunks allows the browser to download multiple chunks in parallel, thus making the download process a bit faster.
|
|
141
|
+
|
|
142
|
+
We use `llama-gguf-split` to split a big gguf file into smaller files. You can download the pre-built binary via [llama.cpp release page](https://github.com/ggerganov/llama.cpp/releases):
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
# Split the model into chunks of 512 Megabytes
|
|
146
|
+
./llama-gguf-split --split-max-size 512M ./my_model.gguf ./my_model
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
This will output files ending with `-00001-of-00003.gguf`, `-00002-of-00003.gguf`, and so on.
|
|
150
|
+
|
|
151
|
+
You can then pass to `loadModelFromUrl` or `loadModelFromHF` the URL of the first file and it will automatically load all the chunks:
|
|
152
|
+
|
|
153
|
+
```js
|
|
154
|
+
const wllama = new Wllama(CONFIG_PATHS, {
|
|
155
|
+
parallelDownloads: 5, // optional: maximum files to download in parallel (default: 3)
|
|
156
|
+
});
|
|
157
|
+
await wllama.loadModelFromHF({
|
|
158
|
+
repo: 'ngxson/tinyllama_split_test',
|
|
159
|
+
file: 'stories15M-q8_0-00001-of-00003.gguf',
|
|
160
|
+
});
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Custom logger (suppress debug messages)
|
|
164
|
+
|
|
165
|
+
When initializing Wllama, you can pass a custom logger to Wllama.
|
|
166
|
+
|
|
167
|
+
Example 1: Suppress debug message
|
|
168
|
+
|
|
169
|
+
```js
|
|
170
|
+
import { Wllama, LoggerWithoutDebug } from '@wllama/wllama';
|
|
171
|
+
|
|
172
|
+
const wllama = new Wllama(pathConfig, {
|
|
173
|
+
// LoggerWithoutDebug is predefined inside wllama
|
|
174
|
+
logger: LoggerWithoutDebug,
|
|
175
|
+
});
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Example 2: Add emoji prefix to log messages
|
|
179
|
+
|
|
180
|
+
```js
|
|
181
|
+
const wllama = new Wllama(pathConfig, {
|
|
182
|
+
logger: {
|
|
183
|
+
debug: (...args) => console.debug('🔧', ...args),
|
|
184
|
+
log: (...args) => console.log('ℹ️', ...args),
|
|
185
|
+
warn: (...args) => console.warn('⚠️', ...args),
|
|
186
|
+
error: (...args) => console.error('☠️', ...args),
|
|
187
|
+
},
|
|
188
|
+
});
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## How to compile the binary yourself
|
|
192
|
+
|
|
193
|
+
This repository already comes with a pre-built binary compiled from the llama.cpp source code. However, in some cases you may want to compile it yourself:
|
|
194
|
+
- You don't trust the pre-built one.
|
|
195
|
+
- You want to try out the latest, bleeding-edge changes from the upstream llama.cpp source code.
|
|
196
|
+
|
|
197
|
+
You can use the commands below to compile it yourself:
|
|
198
|
+
|
|
199
|
+
```shell
|
|
200
|
+
# /!\ IMPORTANT: Requires docker compose to be installed
|
|
201
|
+
|
|
202
|
+
# Clone the repository with submodule
|
|
203
|
+
git clone --recurse-submodules https://github.com/ngxson/wllama.git
|
|
204
|
+
cd wllama
|
|
205
|
+
|
|
206
|
+
# Optionally, you can run this command to update llama.cpp to the latest upstream version (bleeding-edge, use at your own risk!)
|
|
207
|
+
# git submodule update --remote --merge
|
|
208
|
+
|
|
209
|
+
# Install the required modules
|
|
210
|
+
npm i
|
|
211
|
+
|
|
212
|
+
# Firstly, build llama.cpp into wasm
|
|
213
|
+
npm run build:wasm
|
|
214
|
+
# Then, build ES module
|
|
215
|
+
npm run build
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## TODO
|
|
219
|
+
|
|
220
|
+
- Add support for LoRA adapter
|
|
221
|
+
- Support multi-sequences: knowing the resource limitation when using WASM, I don't think having multi-sequences is a good idea
|
|
222
|
+
|
|
223
|
+
## Acknowledgments
|
|
224
|
+
|
|
225
|
+
Wllama was created and is maintained by [Xuan-Son Nguyen](https://ngxson.com/). The WebGPU backend for llama.cpp is maintained by [Reese Levine](https://reeselevine.github.io/). We thank all other contributors to both wllama and llama.cpp, whose work made this project possible.
|
|
Binary file
|
|
Binary file
|