@agorapete/wllama 3.5.1-q2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.gitmodules +3 -0
  2. package/.prettierignore +38 -0
  3. package/AGENTS.md +1 -0
  4. package/CMakeLists.txt +131 -0
  5. package/LICENCE +21 -0
  6. package/README-dev.md +178 -0
  7. package/README.md +225 -0
  8. package/README_banner.png +0 -0
  9. package/assets/screenshot_0.png +0 -0
  10. package/cpp/generate_glue_prototype.js +115 -0
  11. package/cpp/glue.hpp +664 -0
  12. package/cpp/test_glue.cpp +80 -0
  13. package/cpp/wllama-context.h +1172 -0
  14. package/cpp/wllama-fs.h +148 -0
  15. package/cpp/wllama.cpp +187 -0
  16. package/cpp/wllama.h +6 -0
  17. package/esm/cache-manager.d.ts +130 -0
  18. package/esm/debug.d.ts +28 -0
  19. package/esm/glue/glue.d.ts +22 -0
  20. package/esm/glue/messages.d.ts +146 -0
  21. package/esm/huggingface.d.ts +31 -0
  22. package/esm/index.cjs +3406 -0
  23. package/esm/index.d.ts +8 -0
  24. package/esm/index.js +3387 -0
  25. package/esm/index.min.js +1 -0
  26. package/esm/index.min.js.map +1 -0
  27. package/esm/model-manager.d.ts +136 -0
  28. package/esm/storage/cos.d.ts +36 -0
  29. package/esm/storage/index.d.ts +33 -0
  30. package/esm/storage/opfs.d.ts +12 -0
  31. package/esm/types/oai-compat.d.ts +278 -0
  32. package/esm/types/types.d.ts +112 -0
  33. package/esm/utils.d.ts +119 -0
  34. package/esm/wasm/source-map.d.ts +1 -0
  35. package/esm/wasm/wllama.wasm +0 -0
  36. package/esm/wasm-from-cdn.d.ts +8 -0
  37. package/esm/wllama.d.ts +397 -0
  38. package/esm/worker.d.ts +92 -0
  39. package/esm/workers-code/generated.d.ts +4 -0
  40. package/guides/intro-v2.md +132 -0
  41. package/guides/intro-v3.1.md +40 -0
  42. package/guides/intro-v3.md +230 -0
  43. package/index.ts +1 -0
  44. package/package.json +71 -0
  45. package/scripts/bisect_test.sh +33 -0
  46. package/scripts/build_hf_space.sh +26 -0
  47. package/scripts/build_source_map.js +269 -0
  48. package/scripts/build_wasm.sh +19 -0
  49. package/scripts/build_worker.sh +38 -0
  50. package/scripts/check_debug_build.js +30 -0
  51. package/scripts/check_package_size.js +25 -0
  52. package/scripts/docker-compose.yml +76 -0
  53. package/scripts/generate_wasm_from_cdn.js +24 -0
  54. package/scripts/http_server.js +44 -0
  55. package/scripts/post_build.sh +32 -0
  56. package/src/cache-manager.ts +358 -0
  57. package/src/debug.ts +111 -0
  58. package/src/glue/glue.ts +291 -0
  59. package/src/glue/messages.ts +773 -0
  60. package/src/huggingface.ts +151 -0
  61. package/src/index.ts +8 -0
  62. package/src/mjs.test.ts +44 -0
  63. package/src/model-manager.test.ts +200 -0
  64. package/src/model-manager.ts +359 -0
  65. package/src/storage/cos.test.ts +83 -0
  66. package/src/storage/cos.ts +171 -0
  67. package/src/storage/index.ts +40 -0
  68. package/src/storage/opfs.ts +119 -0
  69. package/src/types/oai-compat.ts +342 -0
  70. package/src/types/types.ts +133 -0
  71. package/src/utils.test.ts +231 -0
  72. package/src/utils.ts +403 -0
  73. package/src/wasm/source-map.ts +7 -0
  74. package/src/wasm/wllama.js +1 -0
  75. package/src/wasm/wllama.wasm +0 -0
  76. package/src/wasm-from-cdn.ts +13 -0
  77. package/src/wllama.test.ts +392 -0
  78. package/src/wllama.ts +1138 -0
  79. package/src/wllama.wgpu.test.ts +62 -0
  80. package/src/worker.ts +443 -0
  81. package/src/workers-code/generated.ts +11 -0
  82. package/src/workers-code/llama-cpp.js +511 -0
  83. package/src/workers-code/opfs-utils.js +150 -0
  84. package/tsconfig.build.json +34 -0
  85. package/tsup.config.ts +23 -0
  86. package/vitest.config.ts +61 -0
@@ -0,0 +1,132 @@
1
+ # Introducing Wllama V2.0
2
+
3
+ ## What's new
4
+
5
+ V2.0 introduces significant improvements in model management and caching. Key features include:
6
+
7
+ - Completely rewritten model downloader with service worker
8
+ - New `ModelManager` class providing comprehensive model handling and caching capabilities
9
+ - Enhanced testing system built on the `vitest` framework
10
+
11
+ ## Added `ModelManager`
12
+
13
+ The new `ModelManager` class provides a robust interface for handling model files:
14
+
15
+ ```typescript
16
+ // Example usage
17
+ const modelManager = new ModelManager();
18
+
19
+ // List all models in cache
20
+ const cachedModels = await modelManager.getModels();
21
+
22
+ // Add a new model
23
+ const model = await modelManager.downloadModel('https://example.com/model.gguf');
24
+
25
+ // Check if model is valid (i.e. it is not corrupted)
26
+ // If status === ModelValidationStatus.VALID, you can use the model
27
+ // Otherwise, call model.refresh() to re-download it
28
+ const status = await model.validate();
29
+
30
+ // Re-download if needed (useful when remote model file has changed)
31
+ await model.refresh();
32
+
33
+ // Remove model from cache
34
+ await model.remove();
35
+
36
+ // Load the selected model into llama.cpp
37
+ const wllama = new Wllama(CONFIG_PATHS);
38
+ await wllama.loadModel(model);
39
+
40
+ // Alternatively, you can pass the model URL directly, as in v1.x
41
+ // This will automatically download the model to cache
42
+ await wllama.loadModelFromUrl('https://example.com/model.gguf');
43
+ ```
44
+
45
+ Key features of `ModelManager`:
46
+ - Automatic handling of split GGUF models
47
+ - Built-in model validation
48
+ - Parallel downloads of model shards
49
+ - Cache management with refresh and removal options
50
+
51
+ ## Added `loadModelFromHF`
52
+
53
+ A new helper function to load models directly from Hugging Face Hub. This is a convenient wrapper over `loadModelFromUrl` that handles HF repository URLs.
54
+
55
+ ```js
56
+ await wllama.loadModelFromHF(
57
+ 'ggml-org/models',
58
+ 'tinyllamas/stories260K.gguf'
59
+ );
60
+ ```
61
+
62
+ ## Migration to v2.0
63
+
64
+ ### Simplified `new Wllama()` constructor
65
+
66
+ In v2.0, the configuration paths have been simplified. You now only need to specify the `*.wasm` files, as the `*.js` files are no longer required.
67
+
68
+ Previously in v1.x:
69
+
70
+ ```js
71
+ const CONFIG_PATHS = {
72
+ 'single-thread/wllama.js' : '../../esm/single-thread/wllama.js',
73
+ 'single-thread/wllama.wasm' : '../../esm/single-thread/wllama.wasm',
74
+ 'multi-thread/wllama.js' : '../../esm/multi-thread/wllama.js',
75
+ 'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
76
+ 'multi-thread/wllama.worker.mjs': '../../esm/multi-thread/wllama.worker.mjs',
77
+ };
78
+ const wllama = new Wllama(CONFIG_PATHS);
79
+ ```
80
+
81
+ From v2.0:
82
+
83
+ ```js
84
+ // You only need to specify 2 files
85
+ const CONFIG_PATHS = {
86
+ 'single-thread/wllama.wasm': '../../esm/single-thread/wllama.wasm',
87
+ 'multi-thread/wllama.wasm' : '../../esm/multi-thread/wllama.wasm',
88
+ };
89
+ const wllama = new Wllama(CONFIG_PATHS);
90
+ ```
91
+
92
+ Alternatively, you can use the `*.wasm` files from CDN:
93
+
94
+ ```js
95
+ import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';
96
+ const wllama = new Wllama(WasmFromCDN);
97
+ // NOTE: this is not recommended
98
+ // only use this when you can't embed wasm files in your project
99
+ ```
100
+
101
+ The `Wllama` constructor now accepts an optional second parameter of type `WllamaConfig` for configuration options:
102
+
103
+ > [!IMPORTANT]
104
+ > Most configuration options previously available in `DownloadModelConfig` used with `loadModelFromUrl()` have been moved to this constructor config.
105
+
106
+ ```js
107
+ const wllama = new Wllama(CONFIG_PATHS, {
108
+ parallelDownloads: 5, // maximum concurrent downloads
109
+ allowOffline: false, // whether to allow offline model loading
110
+ });
111
+ ```
112
+
113
+ ### `Wllama.loadModelFromUrl`
114
+
115
+ As mentioned earlier, some options have been moved to the `Wllama` constructor, including:
116
+ - `parallelDownloads`
117
+ - `allowOffline`
118
+
119
+ ### Other changes
120
+
121
+ - `Wllama.downloadModel` is removed. Please use `ModelManager.downloadModel` instead
122
+ - `loadModelFromUrl` does not check whether the cached model is up to date. You may need to call `Model.refresh()` manually to re-download the model.
123
+ - Changes in `CacheManager`:
124
+ - Added `CacheManager.download` function
125
+ - `CacheManager.open(nameOrURL)` now accepts both file name and original URL. It now returns a `Blob` instead of a `ReadableStream`
126
+
127
+ ### Internal Changes
128
+
129
+ Notable internal improvements made to the codebase:
130
+
131
+ - Comprehensive test coverage using `vitest`, with browser testing for Chrome and Firefox (Safari support planned for the future)
132
+ - Enhanced CI pipeline including validation for example builds, ESM compilation and lint checks
@@ -0,0 +1,40 @@
1
+ # Release note Wllama V3.1
2
+
3
+ ## What's new
4
+
5
+ Building on the [V3.0 release](./intro-v3.md), V3.1 brings more features to wllama. This release introduces 2 major changes:
6
+ 1. WebGPU support
7
+ 2. Single WASM build (no more single/multi-threaded build)
8
+
9
+ ### WebGPU support
10
+
11
+ WebGPU support was introduced in [PR #215](https://github.com/ngxson/wllama/pull/215). It currently works out of the box only in Chrome (in Firefox, a flag must be enabled manually).
12
+
13
+ Upon updating to V3.1, WebGPU will be enabled automatically. By default, all layers will be offloaded to GPU. If the model is too big to fit into VRAM, you can manually adjust the number of layers via the `n_gpu_layers` parameter of `LoadModelParams`. Example:
14
+
15
+ ```js
16
+ await wllama.loadModel(files, {
17
+ n_gpu_layers: 4, // meaning 4 layers are offloaded to GPU; set to 0 to disable GPU inference
18
+ });
19
+ ```
20
+
21
+ ### Single WASM build
22
+
23
+ As of [PR #214](https://github.com/ngxson/wllama/pull/214), the separation between the single-threaded and multi-threaded builds has been removed. Wllama now uses a single build that supports single-threaded, multi-threaded and WebGPU execution; each feature can be toggled at runtime.
24
+
25
+ This reduces the space needed to host the pre-built binary and also speeds up the build process.
26
+
27
+ To migrate from an older version:
28
+
29
+ ```js
30
+ // Old config
31
+ const CONFIG_PATHS = {
32
+ 'single-thread/wllama.wasm': './path_to_source/single-thread/wllama.wasm',
33
+ 'multi-thread/wllama.wasm' : './path_to_source/multi-thread/wllama.wasm',
34
+ };
35
+
36
+ // New config
37
+ const CONFIG_PATHS = {
38
+ default: './path_to_source/wasm/wllama.wasm',
39
+ };
40
+ ```
@@ -0,0 +1,230 @@
1
+ # Introducing Wllama V3.0
2
+
3
+ ## What's new
4
+
5
+ V3.0 is a major architectural overhaul that replaces the custom wllama core with `server-context`, the inference component from `llama-server`. Key highlights:
6
+
7
+ - Full OAI-compatible API: `createChatCompletion`, `createCompletion`, `createEmbedding`
8
+ - Multimodal support (vision/audio inputs)
9
+ - Native tool calling support
10
+ - Jinja-based chat template parsing (same as llama-server)
11
+
12
+ New demos:
13
+ - Multimodal (vision) completion: https://github.ngxson.com/wllama/examples/multimodal/ ([source code](../examples/multimodal/index.html))
14
+ - Tool calling: https://github.ngxson.com/wllama/examples/tools/ ([source code](../examples/tools/index.html))
15
+
16
+ ## New architecture
17
+
18
+ Previously, wllama implemented its own low-level bindings to llama.cpp. V3.0 instead reuses `server-context.cpp` from `llama-server`, which brings two major benefits:
19
+
20
+ - Better compatibility: new llama.cpp features (tool calling, reasoning, multimodal) work automatically
21
+ - Less maintenance: wllama no longer needs to re-implement chat template parsing, sampling logic, etc.
22
+
23
+ The worker architecture is unchanged — the wasm thread runs the server-context main loop, the browser thread handles inference requests, and they communicate via the existing `glue` message protocol.
24
+
25
+ ## OAI-compatible API
26
+
27
+ All completion methods now follow the OpenAI API shape closely. This makes it easy to swap wllama in wherever you already use the OpenAI SDK.
28
+
29
+ ### `createChatCompletion`
30
+
31
+ ```typescript
32
+ // Non-streaming
33
+ const response = await wllama.createChatCompletion({
34
+ messages: [{ role: 'user', content: 'Hello!' }],
35
+ max_tokens: 256,
36
+ temperature: 0.7,
37
+ });
38
+ console.log(response.choices[0].message.content);
39
+
40
+ // Streaming
41
+ const stream = await wllama.createChatCompletion({
42
+ messages: [{ role: 'user', content: 'Hello!' }],
43
+ max_tokens: 256,
44
+ stream: true,
45
+ });
46
+ for await (const chunk of stream) {
47
+ process.stdout.write(chunk.choices[0].delta.content ?? '');
48
+ }
49
+ ```
50
+
51
+ ### `createCompletion`
52
+
53
+ ```typescript
54
+ // Raw (non-chat) completion
55
+ const response = await wllama.createCompletion({
56
+ prompt: 'The capital of France is',
57
+ max_tokens: 32,
58
+ });
59
+ console.log(response.choices[0].text);
60
+ ```
61
+
62
+ ### `createEmbedding`
63
+
64
+ ```typescript
65
+ // Requires model loaded with { embeddings: true }
66
+ const response = await wllama.createEmbedding({
67
+ input: 'The quick brown fox',
68
+ });
69
+ console.log(response.data[0].embedding); // float[]
70
+ ```
71
+
72
+ ## Tool calling
73
+
74
+ Tool calling works out of the box for any model that supports it (e.g. Qwen, Llama with tool-call template).
75
+
76
+ ```typescript
77
+ const tools = [
78
+ {
79
+ type: 'function',
80
+ function: {
81
+ name: 'get_weather',
82
+ description: 'Get the current weather for a given city.',
83
+ parameters: {
84
+ type: 'object',
85
+ properties: {
86
+ city: { type: 'string', description: 'City name' },
87
+ },
88
+ required: ['city'],
89
+ },
90
+ },
91
+ },
92
+ ];
93
+
94
+ const messages = [{ role: 'user', content: 'What is the weather in Tokyo?' }];
95
+
96
+ // First turn: model decides to call a tool
97
+ const response = await wllama.createChatCompletion({
98
+ messages,
99
+ tools,
100
+ tool_choice: 'auto',
101
+ max_tokens: 256,
102
+ });
103
+
104
+ const choice = response.choices[0];
105
+ if (choice.finish_reason === 'tool_calls') {
106
+ const toolCall = choice.message.tool_calls[0];
107
+ const args = JSON.parse(toolCall.function.arguments);
108
+ const result = { condition: 'rain', temperature_celsius: 21 };
109
+
110
+ // Second turn: feed tool result back
111
+ messages.push(choice.message);
112
+ messages.push({
113
+ role: 'tool',
114
+ tool_call_id: toolCall.id,
115
+ content: JSON.stringify(result),
116
+ });
117
+
118
+ const final = await wllama.createChatCompletion({ messages, max_tokens: 256 });
119
+ console.log(final.choices[0].message.content);
120
+ }
121
+ ```
122
+
123
+ ## Multimodal support
124
+
125
+ Models with a vision projector (mmproj) can now process image and audio inputs.
126
+
127
+ ```typescript
128
+ // Load the model + mmproj from Hugging Face
129
+ await wllama.loadModelFromHF({
130
+ repo: 'user/model-GGUF',
131
+ quant: 'Q4_K_M',
132
+ mmprojQuant: 'Q8_0',
133
+ });
134
+
135
+ // Or load from explicit URLs
136
+ await wllama.loadModelFromUrl({
137
+ url: 'https://example.com/model.gguf',
138
+ mmprojUrl: 'https://example.com/mmproj.gguf',
139
+ });
140
+
141
+ // Pass an image as ArrayBuffer alongside text
142
+ const imageData = await fetch('./photo.jpg').then(r => r.arrayBuffer());
143
+
144
+ const response = await wllama.createChatCompletion({
145
+ messages: [
146
+ {
147
+ role: 'user',
148
+ content: [
149
+ { type: 'image', data: imageData },
150
+ { type: 'text', text: 'Describe this image.' },
151
+ ],
152
+ },
153
+ ],
154
+ max_tokens: 512,
155
+ });
156
+ ```
157
+
158
+ ## Migration from v2.0
159
+
160
+ ### Removed low-level APIs
161
+
162
+ The following APIs are **no longer available** in v3.0. They were tied to the old custom core and cannot easily be re-implemented on top of llama-server.
163
+
164
+ | Removed | Reason |
165
+ |---|---|
166
+ | `tokenize` / `detokenize` | Low-level tokenizer API removed |
167
+ | `decode` / `encode` | Replaced by OAI completion API |
168
+ | `samplingInit` / `samplingAccept` / `samplingSample` | Sampling is now handled internally per-request |
169
+ | Sequence shift/remove operations | Not exposed by llama-server context |
170
+
171
+ > [!IMPORTANT]
172
+ > If you rely on tokenizer APIs, please leave a comment on [PR #213](https://github.com/ngxson/wllama/pull/213) — it can be added back in the future.
173
+
174
+ ### Sampling params moved to per-request
175
+
176
+ In v2.0, some sampling params were passed at model load time. From v3.0, all sampling params must be provided per request via `createChatCompletion` / `createCompletion`.
177
+
178
+ Previously in v2.x:
179
+
180
+ ```js
181
+ await wllama.loadModelFromUrl('https://example.com/model.gguf', {
182
+ temperature: 0.8,
183
+ top_k: 40,
184
+ });
185
+ ```
186
+
187
+ From v3.0:
188
+
189
+ ```js
190
+ await wllama.loadModelFromUrl('https://example.com/model.gguf');
191
+
192
+ const response = await wllama.createChatCompletion({
193
+ messages: [{ role: 'user', content: 'Hello!' }],
194
+ temperature: 0.8,
195
+ top_k: 40,
196
+ });
197
+ ```
198
+
199
+ ### Auto context length removed
200
+
201
+ The `n_ctx_auto` option is no longer supported. Set `n_ctx` explicitly at load time.
202
+
203
+ ```js
204
+ await wllama.loadModelFromUrl('https://example.com/model.gguf', {
205
+ n_ctx: 4096,
206
+ });
207
+ ```
208
+
209
+ ### Multimodal loading: `mmprojUrl` replaces separate file selection
210
+
211
+ Previously you had to pass mmproj as a local file alongside the main model. From v3.0, pass it directly via `mmprojUrl` in `loadModelFromUrl`:
212
+
213
+ ```js
214
+ await wllama.loadModelFromUrl({
215
+ url: 'https://example.com/model.gguf',
216
+ mmprojUrl: 'https://example.com/mmproj.gguf',
217
+ });
218
+ ```
219
+
220
+ Local file loading still works — just pass both GGUF blobs to `loadModel`:
221
+
222
+ ```js
223
+ await wllama.loadModel([modelBlob, mmprojBlob]);
224
+ ```
225
+
226
+ ### Internal changes
227
+
228
+ - The `server-context` main loop now runs on the wasm worker thread
229
+ - Chat templates are parsed with Jinja (same as llama-server) — set `jinja: true` at load time to enable, or override with `chat_template`
230
+ - `WllamaError` gains a new `'kv_cache_full'` error type for when the context runs out of space
package/index.ts ADDED
@@ -0,0 +1 @@
1
+ export * from './src';
package/package.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "name": "@agorapete/wllama",
3
+ "version": "3.5.1-q2.0",
4
+ "description": "WebAssembly binding for llama.cpp (with Q2_0 ternary support) - Enabling on-browser LLM inference",
5
+ "main": "index.js",
6
+ "type": "module",
7
+ "directories": {
8
+ "example": "examples"
9
+ },
10
+ "scripts": {
11
+ "serve": "node ./scripts/http_server.js",
12
+ "serve:mt": "MULTITHREAD=1 node ./scripts/http_server.js",
13
+ "clean": "rm -rf ./esm && rm -rf ./docs && rm -rf ./wasm",
14
+ "build:worker": "./scripts/build_worker.sh",
15
+ "build:glue": "node ./cpp/generate_glue_prototype.js",
16
+ "build:wasm": "./scripts/build_wasm.sh && npm run build:glue",
17
+ "build:test": "WLLAMA_TEST_BACKEND=1 npm run build:wasm",
18
+ "build:tsup": "tsup src/index.ts --format cjs,esm --clean",
19
+ "build:minified": "terser esm/index.js -o esm/index.min.js --compress --mangle --source-map",
20
+ "build:typedef": "tsc --emitDeclarationOnly --declaration -p tsconfig.build.json",
21
+ "build": "npm run clean && npm run build:worker && npm run build:tsup && npm run build:minified && npm run build:typedef && ./scripts/post_build.sh && npm run docs",
22
+ "docs": "typedoc --tsconfig tsconfig.build.json src/index.ts",
23
+ "upload": "npm run format && npm run build && node scripts/check_package_size.js && npm publish --access public && (cd compat && npm publish --access public)",
24
+ "format": "prettier --write .",
25
+ "test": "vitest",
26
+ "test:auto": "AUTO=1 vitest",
27
+ "test:firefox": "BROWSER=firefox vitest",
28
+ "test:safari": "BROWSER=safari vitest",
29
+ "test:wgpu": "WEBGPU=1 vitest"
30
+ },
31
+ "repository": {
32
+ "type": "git",
33
+ "url": "git+https://github.com/AgoraPete/wllama.git"
34
+ },
35
+ "keywords": [
36
+ "wasm",
37
+ "webassembly",
38
+ "llama",
39
+ "llm",
40
+ "ai",
41
+ "rag",
42
+ "embeddings",
43
+ "generation"
44
+ ],
45
+ "author": "Xuan Son NGUYEN <contact@ngxson.com>",
46
+ "license": "MIT",
47
+ "bugs": {
48
+ "url": "https://github.com/ngxson/wllama/issues"
49
+ },
50
+ "homepage": "https://github.com/ngxson/wllama#readme",
51
+ "devDependencies": {
52
+ "@playwright/test": "^1.60.0",
53
+ "@vitest/browser": "^2.1.6",
54
+ "express": "^4.18.3",
55
+ "mime-types": "^2.1.35",
56
+ "playwright": "^1.59.1",
57
+ "prettier": "^3.3.3",
58
+ "terser": "^5.39.0",
59
+ "tsup": "^8.4.0",
60
+ "typedoc": "^0.27.2",
61
+ "typescript": "^5.4.2",
62
+ "webdriverio": "^9.4.1"
63
+ },
64
+ "prettier": {
65
+ "trailingComma": "es5",
66
+ "tabWidth": 2,
67
+ "semi": true,
68
+ "singleQuote": true,
69
+ "bracketSameLine": false
70
+ }
71
+ }
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+
3
+ # Git bisect helper for finding the llama.cpp commit that introduced the locale crash.
4
+ # Run from the wllama root: git bisect run ./scripts/bisect_test.sh
5
+ #
6
+ # Start (good): aa46bda89b9a8378ae76bb15fc2ce2f571f0983c (wllama master's llama.cpp)
7
+ # End (bad): dd4623a74 (current HEAD of submodule)
8
+
9
+ set -e
10
+
11
+ WLLAMA_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." &> /dev/null && pwd )"
12
+ cd "$WLLAMA_ROOT"  # work from the repository root (parent of this script's directory)
13
+
14
+ CONTEXT_H="$WLLAMA_ROOT/cpp/wllama-context.h"
15
+ FIT_H="$WLLAMA_ROOT/llama.cpp/common/fit.h"
16
+
17
+ # Flip the #if 0 / #if 1 guard around the common_get_device_memory_data stub based on
18
+ # which signature fit.h declares at the current bisect commit. NOTE: the sed expressions below rewrite every line of the header that is exactly "#if 0" or "#if 1", not only the guard around the stub.
19
+ if grep -q "common_device_memory_data_vec" "$FIT_H" 2>/dev/null; then
20
+ # New signature (after d8a24ccee): enable the common_device_memory_data_vec block
21
+ sed -i.bak 's/^#if [01]$/\#if 1/' "$CONTEXT_H"
22
+ else
23
+ # Old signature (before d8a24ccee), also taken when fit.h cannot be read (grep errors are silenced): enable the std::vector<llama_device_memory_data> block
24
+ sed -i.bak 's/^#if [01]$/\#if 0/' "$CONTEXT_H"
25
+ fi
26
+ rm -f "${CONTEXT_H}.bak"  # drop the backup file written by sed -i.bak
27
+
28
+ # Build and run tests. Exit 125 tells `git bisect run` to skip this commit (build infra broken at this commit); the exit status of the final test run is the script's verdict.
29
+ rm -rf build  # force a clean build for every bisect step
30
+
31
+ SKIP_COMPAT=1 npm run build:wasm 2>&1 || exit 125
32
+ npm run build 2>&1 || exit 125
33
+ AUTO=1 npm run test 2>&1
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+
3
+ set -e  # stop at the first failing command
4
+
5
+ echo ">>> clone"
6
+ rm -rf _tmp_hf_space  # start from a fresh checkout directory
7
+ git clone https://ngxson:${HF_TOKEN}@huggingface.co/spaces/ngxson/wllama --depth 1 _tmp_hf_space  # HF_TOKEN is read from the environment and embedded in the clone URL; NOTE(review): the expansion is unquoted
8
+
9
+ echo ">>> build"
10
+ cd _tmp_hf_space
11
+ ./build.sh  # build script that lives inside the cloned space repository
12
+
13
+ echo ">>> push"
14
+ if [ -z "$(git status --porcelain)" ]; then  # empty porcelain output: the build changed nothing
15
+ echo "nothing changed, skipping..."
16
+ exit 0  # NOTE(review): exits while still inside _tmp_hf_space, so the clean-up step below never runs on this path
17
+ fi
18
+ git add -A
19
+ git commit -m "update"
20
+ git push
21
+
22
+ echo ">>> clean up"
23
+ cd ..  # leave the checkout before deleting it
24
+ rm -rf _tmp_hf_space
25
+
26
+ echo ">>> done"