@agorapete/wllama 3.5.1-q2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.gitmodules +3 -0
  2. package/.prettierignore +38 -0
  3. package/AGENTS.md +1 -0
  4. package/CMakeLists.txt +131 -0
  5. package/LICENCE +21 -0
  6. package/README-dev.md +178 -0
  7. package/README.md +225 -0
  8. package/README_banner.png +0 -0
  9. package/assets/screenshot_0.png +0 -0
  10. package/cpp/generate_glue_prototype.js +115 -0
  11. package/cpp/glue.hpp +664 -0
  12. package/cpp/test_glue.cpp +80 -0
  13. package/cpp/wllama-context.h +1172 -0
  14. package/cpp/wllama-fs.h +148 -0
  15. package/cpp/wllama.cpp +187 -0
  16. package/cpp/wllama.h +6 -0
  17. package/esm/cache-manager.d.ts +130 -0
  18. package/esm/debug.d.ts +28 -0
  19. package/esm/glue/glue.d.ts +22 -0
  20. package/esm/glue/messages.d.ts +146 -0
  21. package/esm/huggingface.d.ts +31 -0
  22. package/esm/index.cjs +3406 -0
  23. package/esm/index.d.ts +8 -0
  24. package/esm/index.js +3387 -0
  25. package/esm/index.min.js +1 -0
  26. package/esm/index.min.js.map +1 -0
  27. package/esm/model-manager.d.ts +136 -0
  28. package/esm/storage/cos.d.ts +36 -0
  29. package/esm/storage/index.d.ts +33 -0
  30. package/esm/storage/opfs.d.ts +12 -0
  31. package/esm/types/oai-compat.d.ts +278 -0
  32. package/esm/types/types.d.ts +112 -0
  33. package/esm/utils.d.ts +119 -0
  34. package/esm/wasm/source-map.d.ts +1 -0
  35. package/esm/wasm/wllama.wasm +0 -0
  36. package/esm/wasm-from-cdn.d.ts +8 -0
  37. package/esm/wllama.d.ts +397 -0
  38. package/esm/worker.d.ts +92 -0
  39. package/esm/workers-code/generated.d.ts +4 -0
  40. package/guides/intro-v2.md +132 -0
  41. package/guides/intro-v3.1.md +40 -0
  42. package/guides/intro-v3.md +230 -0
  43. package/index.ts +1 -0
  44. package/package.json +71 -0
  45. package/scripts/bisect_test.sh +33 -0
  46. package/scripts/build_hf_space.sh +26 -0
  47. package/scripts/build_source_map.js +269 -0
  48. package/scripts/build_wasm.sh +19 -0
  49. package/scripts/build_worker.sh +38 -0
  50. package/scripts/check_debug_build.js +30 -0
  51. package/scripts/check_package_size.js +25 -0
  52. package/scripts/docker-compose.yml +76 -0
  53. package/scripts/generate_wasm_from_cdn.js +24 -0
  54. package/scripts/http_server.js +44 -0
  55. package/scripts/post_build.sh +32 -0
  56. package/src/cache-manager.ts +358 -0
  57. package/src/debug.ts +111 -0
  58. package/src/glue/glue.ts +291 -0
  59. package/src/glue/messages.ts +773 -0
  60. package/src/huggingface.ts +151 -0
  61. package/src/index.ts +8 -0
  62. package/src/mjs.test.ts +44 -0
  63. package/src/model-manager.test.ts +200 -0
  64. package/src/model-manager.ts +359 -0
  65. package/src/storage/cos.test.ts +83 -0
  66. package/src/storage/cos.ts +171 -0
  67. package/src/storage/index.ts +40 -0
  68. package/src/storage/opfs.ts +119 -0
  69. package/src/types/oai-compat.ts +342 -0
  70. package/src/types/types.ts +133 -0
  71. package/src/utils.test.ts +231 -0
  72. package/src/utils.ts +403 -0
  73. package/src/wasm/source-map.ts +7 -0
  74. package/src/wasm/wllama.js +1 -0
  75. package/src/wasm/wllama.wasm +0 -0
  76. package/src/wasm-from-cdn.ts +13 -0
  77. package/src/wllama.test.ts +392 -0
  78. package/src/wllama.ts +1138 -0
  79. package/src/wllama.wgpu.test.ts +62 -0
  80. package/src/worker.ts +443 -0
  81. package/src/workers-code/generated.ts +11 -0
  82. package/src/workers-code/llama-cpp.js +511 -0
  83. package/src/workers-code/opfs-utils.js +150 -0
  84. package/tsconfig.build.json +34 -0
  85. package/tsup.config.ts +23 -0
  86. package/vitest.config.ts +61 -0
@@ -0,0 +1,392 @@
1
+ import { test, expect, beforeEach } from 'vitest';
2
+
3
+ declare const __GITHUB_CI__: boolean;
4
+
5
+ // Add a small delay before each test on GitHub CI to avoid HuggingFace rate limits.
6
+ // typeof guard handles the case where vitest define is not configured.
7
+ if (typeof __GITHUB_CI__ !== 'undefined' && __GITHUB_CI__) {
8
+ beforeEach(async () => {
9
+ await new Promise((resolve) => setTimeout(resolve, 100));
10
+ });
11
+ }
12
+ import { Wllama, type WllamaConfig } from './wllama';
13
+
14
+ const CONFIG_PATHS = {
15
+ default: '/src/wasm/wllama.wasm',
16
+ };
17
+
18
+ // TODO: enable compat mode in tests once test infrastructure supports Safari/asyncify
19
+ const createWllama = (config = CONFIG_PATHS, options: WllamaConfig = {}) => {
20
+ const w = new Wllama(config, options);
21
+ w.setCompat(null);
22
+ return w;
23
+ };
24
+
25
+ const TINY_MODEL =
26
+ 'https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf';
27
+
28
+ const SPLIT_MODEL =
29
+ 'https://huggingface.co/ngxson/tinyllama_split_test/resolve/main/stories15M-q8_0-00001-of-00003.gguf';
30
+
31
+ const EMBD_MODEL = TINY_MODEL; // for better speed
32
+
33
+ const RERANK_MODEL =
34
+ 'https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf';
35
+
36
+ test.sequential('loads single model file', async () => {
37
+ const wllama = createWllama();
38
+
39
+ await wllama.loadModelFromUrl(TINY_MODEL, {
40
+ n_ctx: 1024,
41
+ n_threads: 2,
42
+ });
43
+
44
+ expect(wllama.isModelLoaded()).toBe(true);
45
+ expect(wllama.getModelMetadata()).toBeDefined();
46
+ expect(wllama.getModelMetadata().hparams).toBeDefined();
47
+ expect(wllama.isMultithread()).toBe(true);
48
+
49
+ const metadata = wllama.getModelMetadata();
50
+ expect(metadata.hparams).toBeDefined();
51
+ expect(metadata.meta).toBeDefined();
52
+ await wllama.exit();
53
+ });
54
+
55
+ test.sequential('loads single model file from HF', async () => {
56
+ const wllama = createWllama();
57
+
58
+ await wllama.loadModelFromHF(
59
+ { repo: 'ggml-org/models', file: 'tinyllamas/stories15M-q4_0.gguf' },
60
+ {
61
+ n_ctx: 1024,
62
+ n_threads: 2,
63
+ }
64
+ );
65
+
66
+ expect(wllama.isModelLoaded()).toBe(true);
67
+ await wllama.exit();
68
+ });
69
+
70
+ test.sequential('loads single thread model', async () => {
71
+ const wllama = createWllama();
72
+
73
+ await wllama.loadModelFromUrl(TINY_MODEL, {
74
+ n_ctx: 1024,
75
+ n_threads: 1,
76
+ });
77
+
78
+ expect(wllama.isModelLoaded()).toBe(true);
79
+ expect(wllama.isMultithread()).toBe(false);
80
+
81
+ const res = await wllama.createCompletion({
82
+ prompt: 'Hello',
83
+ max_tokens: 10,
84
+ });
85
+ expect(res).toBeDefined();
86
+ expect(res.choices[0].text.length).toBeGreaterThan(0);
87
+ await wllama.exit();
88
+ });
89
+
90
+ test.sequential('loads model with progress callback', async () => {
91
+ const wllama = createWllama();
92
+
93
+ let progressCalled = false;
94
+ let lastLoaded = 0;
95
+ await wllama.loadModelFromUrl(TINY_MODEL, {
96
+ n_ctx: 1024,
97
+ progressCallback: ({ loaded, total }) => {
98
+ expect(loaded).toBeGreaterThan(0);
99
+ expect(total).toBeGreaterThan(0);
100
+ expect(loaded).toBeLessThanOrEqual(total);
101
+ expect(loaded).toBeGreaterThanOrEqual(lastLoaded);
102
+ progressCalled = true;
103
+ lastLoaded = loaded;
104
+ },
105
+ });
106
+
107
+ expect(progressCalled).toBe(true);
108
+ expect(wllama.isModelLoaded()).toBe(true);
109
+ await wllama.exit();
110
+ });
111
+
112
+ test.sequential('loads split model files', async () => {
113
+ const wllama = createWllama(CONFIG_PATHS, {
114
+ parallelDownloads: 5,
115
+ });
116
+
117
+ await wllama.loadModelFromUrl(SPLIT_MODEL, {
118
+ n_ctx: 1024,
119
+ });
120
+
121
+ expect(wllama.isModelLoaded()).toBe(true);
122
+ await wllama.exit();
123
+ });
124
+
125
+ test.sequential('generates completion', async () => {
126
+ const wllama = createWllama();
127
+
128
+ await wllama.loadModelFromUrl(TINY_MODEL, {
129
+ n_ctx: 1024,
130
+ });
131
+
132
+ const res = await wllama.createCompletion({
133
+ prompt: 'Once upon a time',
134
+ max_tokens: 10,
135
+ temperature: 0.0,
136
+ top_p: 0.95,
137
+ top_k: 40,
138
+ seed: 42,
139
+ });
140
+
141
+ expect(res).toBeDefined();
142
+ expect(res.choices[0].text).toMatch(/(there|little|girl|Lily)+/);
143
+ expect(res.choices[0].text.length).toBeGreaterThan(10);
144
+
145
+ await wllama.exit();
146
+ });
147
+
148
+ test.sequential('abort signal', async () => {
149
+ const wllama = createWllama();
150
+
151
+ await wllama.loadModelFromUrl(TINY_MODEL, {
152
+ n_ctx: 1024,
153
+ });
154
+
155
+ const abortController = new AbortController();
156
+ const stream = await wllama.createCompletion({
157
+ prompt: 'Once upon a time',
158
+ max_tokens: 10,
159
+ temperature: 0.0,
160
+ top_p: 0.95,
161
+ top_k: 40,
162
+ seed: 42,
163
+ stream: true,
164
+ abortSignal: abortController.signal,
165
+ });
166
+
167
+ let i = 0;
168
+ try {
169
+ for await (const _ of stream) {
170
+ if (i === 2) {
171
+ abortController.abort();
172
+ }
173
+ i++;
174
+ }
175
+ } catch (e) {
176
+ expect((e as Error).name).toBe('AbortError');
177
+ }
178
+
179
+ expect(i).toBe(4);
180
+
181
+ await wllama.exit();
182
+ });
183
+
184
+ test.sequential('generates embeddings', async () => {
185
+ const wllama = createWllama();
186
+
187
+ await wllama.loadModelFromUrl(EMBD_MODEL, {
188
+ n_ctx: 1024,
189
+ embeddings: true,
190
+ });
191
+
192
+ expect(wllama.isModelLoaded()).toBe(true);
193
+
194
+ const text = 'This is a test sentence';
195
+ const res = await wllama.createEmbedding({ input: text });
196
+
197
+ expect(res).toBeDefined();
198
+ const embedding = res.data[0].embedding as number[];
199
+ expect(Array.isArray(embedding)).toBe(true);
200
+ expect(embedding.length).toBeGreaterThan(0);
201
+ for (const e of embedding) {
202
+ expect(typeof e).toBe('number');
203
+ }
204
+
205
+ // slightly different text should have high cosine similarity
206
+ const res2 = await wllama.createEmbedding({ input: text + ' ' });
207
+ const embedding2 = res2.data[0].embedding as number[];
208
+ const dot = embedding.reduce((acc, v, i) => acc + v * embedding2[i], 0);
209
+ const norm1 = Math.sqrt(embedding.reduce((acc, v) => acc + v * v, 0));
210
+ const norm2 = Math.sqrt(embedding2.reduce((acc, v) => acc + v * v, 0));
211
+ const cosineSim = dot / (norm1 * norm2);
212
+ expect(cosineSim).toBeGreaterThan(1 - 0.05);
213
+ expect(cosineSim).toBeLessThan(1);
214
+
215
+ await wllama.exit();
216
+ });
217
+
218
+ test.sequential('reranks documents', async () => {
219
+ const wllama = createWllama();
220
+
221
+ await wllama.loadModelFromUrl(RERANK_MODEL, {
222
+ embeddings: true,
223
+ pooling_type: 'rank',
224
+ });
225
+
226
+ expect(wllama.isModelLoaded()).toBe(true);
227
+
228
+ const query = 'What is machine learning?';
229
+ const documents = [
230
+ 'Machine learning is a branch of artificial intelligence.',
231
+ 'The weather today is sunny and warm.',
232
+ 'Neural networks are used in deep learning.',
233
+ ];
234
+
235
+ const res = await wllama.createRerank({ query, documents });
236
+
237
+ expect(res).toBeDefined();
238
+ expect(res.results).toHaveLength(documents.length);
239
+ for (const r of res.results) {
240
+ expect(typeof r.index).toBe('number');
241
+ expect(typeof r.relevance_score).toBe('number');
242
+ }
243
+
244
+ // results should be sorted highest score first
245
+ for (let i = 0; i < res.results.length - 1; i++) {
246
+ expect(res.results[i].relevance_score).toBeGreaterThanOrEqual(
247
+ res.results[i + 1].relevance_score
248
+ );
249
+ }
250
+
251
+ // the unrelated weather document (index 1) must be present in the results and must not be ranked first
252
+ const weatherIdx = res.results.findIndex((r) => r.index === 1);
253
+ expect(weatherIdx).toBeGreaterThan(0);
254
+
255
+ await wllama.exit();
256
+ });
257
+
258
+ test.sequential('allowOffline', async () => {
259
+ const wllama = createWllama(CONFIG_PATHS, {
260
+ allowOffline: true,
261
+ });
262
+
263
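+ // Mock fetch so every network request fails, simulating being offline (presumably the model is then served from the cache filled by earlier tests — TODO confirm)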
264
+ const origFetch = window.fetch;
265
+ window.fetch = () => Promise.reject(new Error('offline'));
266
+
267
+ try {
268
+ await wllama.loadModelFromUrl(TINY_MODEL);
269
+ expect(wllama.isModelLoaded()).toBe(true);
270
+ await wllama.exit();
271
+ } catch (e) {
272
+ window.fetch = origFetch;
273
+ throw e;
274
+ } finally {
275
+ window.fetch = origFetch;
276
+ }
277
+ });
278
+
279
+ test.sequential('generates chat completion', async () => {
280
+ const wllama = createWllama();
281
+
282
+ await wllama.loadModelFromUrl(TINY_MODEL, {
283
+ n_ctx: 1024,
284
+ });
285
+
286
+ const res = await wllama.createChatCompletion({
287
+ messages: [
288
+ { role: 'system', content: 'You are helpful.' },
289
+ { role: 'user', content: 'Hi!' },
290
+ { role: 'assistant', content: 'Hello!' },
291
+ { role: 'user', content: 'How are you?' },
292
+ ],
293
+ max_tokens: 10,
294
+ temperature: 0.0,
295
+ top_p: 0.95,
296
+ top_k: 40,
297
+ seed: 42,
298
+ });
299
+
300
+ const text = res.choices[0].message.content as string;
301
+ expect(text).toBeDefined();
302
+ expect(text).toMatch(/(Sudden|big|scary)+/);
303
+ expect(text.length).toBeGreaterThan(10);
304
+
305
+ await wllama.exit();
306
+ });
307
+
308
+ test.sequential('generates chat completion using async iterator', async () => {
309
+ const wllama = createWllama();
310
+
311
+ await wllama.loadModelFromUrl(TINY_MODEL, {
312
+ n_ctx: 1024,
313
+ seed: 42,
314
+ });
315
+
316
+ const stream = await wllama.createChatCompletion({
317
+ messages: [
318
+ { role: 'system', content: 'You are helpful.' },
319
+ { role: 'user', content: 'Hi!' },
320
+ { role: 'assistant', content: 'Hello!' },
321
+ { role: 'user', content: 'How are you?' },
322
+ ],
323
+ max_tokens: 10,
324
+ temperature: 0.0,
325
+ stream: true,
326
+ });
327
+
328
+ let finalText = '';
329
+ for await (const chunk of stream) {
330
+ expect(chunk).toBeDefined();
331
+ expect(chunk.object).toBe('chat.completion.chunk');
332
+ const delta = chunk.choices[0].delta;
333
+ if (delta.content) {
334
+ finalText += delta.content;
335
+ }
336
+ }
337
+
338
+ expect(finalText.length).toBeGreaterThan(10);
339
+ expect(finalText).toMatch(/(Sudden|big|scary)+/);
340
+
341
+ await wllama.exit();
342
+ });
343
+
344
+ test.sequential('stack trace (abort)', async () => {
345
+ const wllama = createWllama();
346
+ await wllama.loadModelFromUrl(TINY_MODEL, {
347
+ pooling_type: 'test_stack_trace_abort' as any,
348
+ });
349
+ expect(wllama.isModelLoaded()).toBe(true);
350
+
351
+ const err1: unknown = await wllama
352
+ .createCompletion({ prompt: 'test', max_tokens: 1 })
353
+ .catch((e: unknown) => e);
354
+ expect(err1).toBeInstanceOf(Error);
355
+ expect((err1 as Error).name).toBe('RuntimeError');
356
+ expect((err1 as Error).stack).toMatch(/__wrap_abort/);
357
+ expect((err1 as Error).stack).toMatch(/server_response::send/);
358
+
359
+ await wllama.exit();
360
+ });
361
+
362
+ // TODO @ngxson: this test gets stuck on GitHub CI but not in local runs; investigate why and re-enable
363
+ test.skip('stack trace (OOB memory access)', async () => {
364
+ const wllama = createWllama();
365
+ await wllama.loadModelFromUrl(TINY_MODEL, {
366
+ pooling_type: 'test_stack_trace_oob' as any,
367
+ n_threads: 1, // multithreaded mode gets stuck on GitHub CI but not in local runs; cause unknown
368
+ });
369
+ expect(wllama.isModelLoaded()).toBe(true);
370
+
371
+ const err2: unknown = await wllama
372
+ .createCompletion({ prompt: 'test', max_tokens: 1 })
373
+ .catch((e: unknown) => e);
374
+ expect(err2).toBeInstanceOf(Error);
375
+ expect((err2 as Error).name).toBe('RuntimeError');
376
+ expect((err2 as Error).stack).toMatch(/server_response::send/);
377
+
378
+ await wllama.exit();
379
+ });
380
+
381
+ test.sequential('cleans up resources', async () => {
382
+ const wllama = createWllama();
383
+ await wllama.loadModelFromUrl(TINY_MODEL);
384
+ expect(wllama.isModelLoaded()).toBe(true);
385
+ await wllama.exit();
386
+ await expect(
387
+ wllama.createCompletion({ prompt: 'test', max_tokens: 1 })
388
+ ).rejects.toThrow();
389
+
390
+ // Double check that the model is really unloaded
391
+ expect(wllama.isModelLoaded()).toBe(false);
392
+ });