@agorapete/wllama 3.5.1-q2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/.gitmodules +3 -0
  2. package/.prettierignore +38 -0
  3. package/AGENTS.md +1 -0
  4. package/CMakeLists.txt +131 -0
  5. package/LICENCE +21 -0
  6. package/README-dev.md +178 -0
  7. package/README.md +225 -0
  8. package/README_banner.png +0 -0
  9. package/assets/screenshot_0.png +0 -0
  10. package/cpp/generate_glue_prototype.js +115 -0
  11. package/cpp/glue.hpp +664 -0
  12. package/cpp/test_glue.cpp +80 -0
  13. package/cpp/wllama-context.h +1172 -0
  14. package/cpp/wllama-fs.h +148 -0
  15. package/cpp/wllama.cpp +187 -0
  16. package/cpp/wllama.h +6 -0
  17. package/esm/cache-manager.d.ts +130 -0
  18. package/esm/debug.d.ts +28 -0
  19. package/esm/glue/glue.d.ts +22 -0
  20. package/esm/glue/messages.d.ts +146 -0
  21. package/esm/huggingface.d.ts +31 -0
  22. package/esm/index.cjs +3406 -0
  23. package/esm/index.d.ts +8 -0
  24. package/esm/index.js +3387 -0
  25. package/esm/index.min.js +1 -0
  26. package/esm/index.min.js.map +1 -0
  27. package/esm/model-manager.d.ts +136 -0
  28. package/esm/storage/cos.d.ts +36 -0
  29. package/esm/storage/index.d.ts +33 -0
  30. package/esm/storage/opfs.d.ts +12 -0
  31. package/esm/types/oai-compat.d.ts +278 -0
  32. package/esm/types/types.d.ts +112 -0
  33. package/esm/utils.d.ts +119 -0
  34. package/esm/wasm/source-map.d.ts +1 -0
  35. package/esm/wasm/wllama.wasm +0 -0
  36. package/esm/wasm-from-cdn.d.ts +8 -0
  37. package/esm/wllama.d.ts +397 -0
  38. package/esm/worker.d.ts +92 -0
  39. package/esm/workers-code/generated.d.ts +4 -0
  40. package/guides/intro-v2.md +132 -0
  41. package/guides/intro-v3.1.md +40 -0
  42. package/guides/intro-v3.md +230 -0
  43. package/index.ts +1 -0
  44. package/package.json +71 -0
  45. package/scripts/bisect_test.sh +33 -0
  46. package/scripts/build_hf_space.sh +26 -0
  47. package/scripts/build_source_map.js +269 -0
  48. package/scripts/build_wasm.sh +19 -0
  49. package/scripts/build_worker.sh +38 -0
  50. package/scripts/check_debug_build.js +30 -0
  51. package/scripts/check_package_size.js +25 -0
  52. package/scripts/docker-compose.yml +76 -0
  53. package/scripts/generate_wasm_from_cdn.js +24 -0
  54. package/scripts/http_server.js +44 -0
  55. package/scripts/post_build.sh +32 -0
  56. package/src/cache-manager.ts +358 -0
  57. package/src/debug.ts +111 -0
  58. package/src/glue/glue.ts +291 -0
  59. package/src/glue/messages.ts +773 -0
  60. package/src/huggingface.ts +151 -0
  61. package/src/index.ts +8 -0
  62. package/src/mjs.test.ts +44 -0
  63. package/src/model-manager.test.ts +200 -0
  64. package/src/model-manager.ts +359 -0
  65. package/src/storage/cos.test.ts +83 -0
  66. package/src/storage/cos.ts +171 -0
  67. package/src/storage/index.ts +40 -0
  68. package/src/storage/opfs.ts +119 -0
  69. package/src/types/oai-compat.ts +342 -0
  70. package/src/types/types.ts +133 -0
  71. package/src/utils.test.ts +231 -0
  72. package/src/utils.ts +403 -0
  73. package/src/wasm/source-map.ts +7 -0
  74. package/src/wasm/wllama.js +1 -0
  75. package/src/wasm/wllama.wasm +0 -0
  76. package/src/wasm-from-cdn.ts +13 -0
  77. package/src/wllama.test.ts +392 -0
  78. package/src/wllama.ts +1138 -0
  79. package/src/wllama.wgpu.test.ts +62 -0
  80. package/src/worker.ts +443 -0
  81. package/src/workers-code/generated.ts +11 -0
  82. package/src/workers-code/llama-cpp.js +511 -0
  83. package/src/workers-code/opfs-utils.js +150 -0
  84. package/tsconfig.build.json +34 -0
  85. package/tsup.config.ts +23 -0
  86. package/vitest.config.ts +61 -0
@@ -0,0 +1,62 @@
1
+ import { test, expect } from 'vitest';
2
+ import { Wllama } from './wllama';
3
+
4
/** Wasm binary locations handed to the Wllama constructor. */
const CONFIG_PATHS = {
  default: '/src/wasm/wllama.wasm',
};

// TODO: enable compat mode in tests once test infrastructure supports Safari/asyncify
/**
 * Build a fresh Wllama instance for a test.
 *
 * `setCompat(null)` is called on every instance; per the TODO above, compat
 * mode is not exercised by these tests yet.
 */
function createWllama(): Wllama {
  const instance = new Wllama(CONFIG_PATHS);
  instance.setCompat(null);
  return instance;
}

/** URL of the small GGUF model downloaded by the WebGPU tests. */
const TINY_MODEL =
  'https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories15M-q4_0.gguf';
17
+
18
// Sanity check: the browser running the suite must report WebGPU support.
test('WebGPU is supported in this browser', () => {
  const supported = createWllama().isSupportWebGPU();
  expect(supported).toBe(true);
});
22
+
23
// Loads the tiny model with every layer requested on the GPU and checks that
// the instance reports a loaded model with metadata.
test.sequential('loads model with WebGPU', async () => {
  const wllama = createWllama();

  expect(wllama.isSupportWebGPU()).toBe(true);

  try {
    await wllama.loadModelFromUrl(TINY_MODEL, {
      n_ctx: 1024,
      n_gpu_layers: 99999,
    });

    expect(wllama.isModelLoaded()).toBe(true);
    expect(wllama.getModelMetadata()).toBeDefined();
  } finally {
    // Always shut the instance down, even when loading or an assertion fails,
    // so a failed run does not leak its worker into the next sequential test.
    await wllama.exit();
  }
});
38
+
39
// Loads the tiny model on the GPU and checks that a short completion
// produces non-empty text.
test.sequential('generates completion with WebGPU', async () => {
  const wllama = createWllama();

  expect(wllama.isSupportWebGPU()).toBe(true);

  try {
    await wllama.loadModelFromUrl(TINY_MODEL, {
      n_ctx: 1024,
      n_gpu_layers: 99999,
    });

    const res = await wllama.createCompletion({
      prompt: 'Once upon a time',
      max_tokens: 10,
      temperature: 0.0,
      top_p: 0.95,
      top_k: 40,
      seed: 42,
    });

    expect(res).toBeDefined();
    expect(res.choices[0].text.length).toBeGreaterThan(0);
  } finally {
    // Always shut the instance down, even when loading, generation or an
    // assertion fails, so a failed run does not leak its worker.
    await wllama.exit();
  }
});
package/src/worker.ts ADDED
@@ -0,0 +1,443 @@
1
+ /**
2
+ * Module code will be copied into worker.
3
+ *
4
+ * Messages between main <==> worker:
5
+ *
6
+ * From main thread to worker:
7
+ * - Send direction: { verb, args, callbackId }
8
+ * - Result direction: { callbackId, result } or { callbackId, err }
9
+ *
10
+ * Signal from worker to main:
11
+ * - Unidirectional: { verb, args }
12
+ */
13
+
14
+ import { glueDeserialize, glueSerialize } from './glue/glue';
15
+ import type { GlueMsg } from './glue/messages';
16
+ import { Debug } from './debug';
17
+ import {
18
+ canUseAsyncFileRead,
19
+ createWorker,
20
+ isSafariMobile,
21
+ isString,
22
+ } from './utils';
23
+ import {
24
+ LLAMA_CPP_WORKER_CODE,
25
+ WLLAMA_EMSCRIPTEN_CODE,
26
+ } from './workers-code/generated';
27
+ import { WllamaRuntimeError } from './wllama';
28
+
29
/**
 * Console-shaped sink used by the proxy for its own diagnostics and for
 * `console.*` messages forwarded from the worker.
 */
interface Logger {
  debug: Console['debug'];
  log: Console['log'];
  warn: Console['warn'];
  error: Console['error'];
}
35
+
36
/** Verb of the worker -> main message that asks for a slice of a file. */
const FILE_READ_REQ_EVENT = 'fs.read_req';

/** Verbs that the main thread may send to the worker. */
type TaskVerb =
  | 'module.init'
  | 'fs.alloc'
  | 'fs.write'
  | 'fs.read_res'
  | 'wllama.start'
  | 'wllama.action'
  | 'wllama.exit'
  | 'wllama.debug';

/** Message posted to the worker for one task. */
interface TaskParam {
  verb: TaskVerb;
  args: any[];
  /** Identifier echoed back by the worker so the reply can be matched. */
  callbackId: number;
}

/** A queued request together with the callbacks that settle its promise. */
interface Task {
  /** Settles the task's promise with the worker's result (shape depends on the verb). */
  resolve: (value: any) => void;
  /** Rejects the task's promise. */
  reject: (reason?: unknown) => void;
  param: TaskParam;
  /** Buffers handed to `postMessage` as the transfer list. */
  buffers?: ArrayBuffer[] | undefined;
}
58
+
59
/**
 * JavaScript source prepended to the module code before it is shipped to the
 * worker. When `WebAssembly.Suspending` is missing it installs two stand-ins:
 * `WebAssembly.Suspending` returns the function it is given, and
 * `WebAssembly.promising` wraps a function so that its result (or thrown
 * error) is always delivered through a Promise.
 */
const JSPI_STUB = `
if (!WebAssembly.Suspending) {
// JSPI not available - stubs that keep the import/export tables valid.
// Suspending wraps imports: identity is fine since async imports won't be called.
WebAssembly.Suspending = function (fn) {
// console.log(fn.toString());
return fn;
};
// promising wraps exports: must return a Promise so ccall's ret.then() works.
WebAssembly.promising = function (fn) {
return function (...args) {
try {
return Promise.resolve(fn(...args));
} catch (e) {
return Promise.reject(e);
}
};
};
}
`;
79
+
80
/** Locations of the artifacts needed to boot the worker. */
export interface WllamaWorkerResources {
  /** Path of the wasm binary, exposed to the worker as 'wllama.wasm'. */
  wasmPath: string;
  /**
   * Worker JS, either as a URL to fetch or as inline `{ code }`.
   * If jsPath is not provided, WLLAMA_EMSCRIPTEN_CODE is used.
   */
  jsPath?: string | { code: string } | undefined;
  /** In compat mode, mem64 must be disabled. */
  compat: boolean;
}
87
+
88
/**
 * Main-thread side of the worker protocol.
 *
 * Builds the worker script, posts tasks to the worker, and routes the
 * worker's replies back to the promises returned to callers. Every task
 * carries a `callbackId`; the worker answers with the same id.
 */
export class ProxyToWorker {
  resources: WllamaWorkerResources;
  logger: Logger;
  suppressNativeLog: boolean;
  /** Tasks accepted by pushTask() but not yet posted to the worker. */
  taskQueue: Task[] = [];
  /** Next callbackId to hand out; starts at 1 so a reply id is always truthy. */
  taskId: number = 1;
  /** Tasks already posted to the worker and still waiting for a reply. */
  resultQueue: Task[] = [];
  busy = false; // is the work loop running?
  worker?: Worker | undefined;
  multiThread: boolean;
  nbThread: number;
  useAsyncFile: boolean;
  fileBlobs: Map<string, Blob> = new Map(); // filename -> Blob for async reads

  /**
   * @param resources Locations of the wasm binary and (optionally) the worker JS.
   * @param nbThread Thread count forwarded to the worker in RUN_OPTIONS.
   * @param suppressNativeLog When true, `console.*` messages from the worker are dropped.
   * @param logger Sink for forwarded worker logs and for the proxy's own errors.
   */
  constructor(
    resources: WllamaWorkerResources,
    nbThread: number,
    suppressNativeLog: boolean,
    logger: Logger
  ) {
    this.resources = resources;
    this.nbThread = nbThread;
    this.multiThread = nbThread > 0;
    this.logger = logger;
    this.suppressNativeLog = suppressNativeLog;
    this.useAsyncFile = canUseAsyncFileRead(resources.compat);
  }

  /**
   * Resolve the JS module source that will run inside the worker.
   *
   * @returns WLLAMA_EMSCRIPTEN_CODE when no jsPath is set, the inline
   *   `jsPath.code` when provided, or the text fetched from a string jsPath.
   * @throws Error when compat mode is on without a jsPath, when the fetch
   *   response is not ok, or when jsPath holds no usable code.
   */
  async getModuleCode(): Promise<string> {
    const { jsPath, compat } = this.resources;
    if (!jsPath) {
      if (compat) {
        throw new Error(
          'compat mode is enabled but no jsPath was provided. Pass a worker JS via setCompat() or install @wllama/wllama-compat.'
        );
      }
      return WLLAMA_EMSCRIPTEN_CODE;
    }
    if (typeof jsPath !== 'string' && jsPath.code) {
      return jsPath.code;
    }
    if (isString(jsPath)) {
      const url = jsPath as string;
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`Failed to fetch worker code from ${url}`);
      }
      return await response.text();
    }
    throw new Error('No JS code provided for worker');
  }

  /**
   * Create the worker, initialise the module inside it and register the
   * model files.
   *
   * @param ggufFiles Files to expose to the module, each with its name and Blob.
   * @returns Whatever the worker replied to 'module.init'.
   */
  async moduleInit(ggufFiles: { name: string; blob: Blob }[]): Promise<void> {
    const moduleCode = JSPI_STUB + (await this.getModuleCode());
    // Only the first 'var Module' is renamed in the copy embedded into wModuleInit().
    const mainModuleCode = moduleCode.replace('var Module', 'var ___Module');
    const runOptions = {
      pathConfig: {
        'wllama.wasm': this.resources.wasmPath,
      },
      nbThread: this.nbThread,
      compat: this.resources.compat,
    };
    const completeCode: string = [
      `const RUN_OPTIONS = ${JSON.stringify(runOptions)};`,
      `function wModuleInit() { ${mainModuleCode}; return Module; }`,
      LLAMA_CPP_WORKER_CODE,
    ].join(';\n\n');

    const worker = createWorker(completeCode);
    this.worker = worker;
    worker.onmessage = this.onRecvMsg.bind(this);
    // Call through the logger so a logger whose methods rely on `this` keeps working.
    worker.onerror = (ev) => {
      this.logger.error(ev);
    };

    const res = await this.pushTask({
      verb: 'module.init',
      args: [
        // the unmodified module code is also sent along as a Blob
        new Blob([moduleCode], { type: 'text/javascript' }),
        this.useAsyncFile,
      ],
      callbackId: this.taskId++,
    });

    // allocate all files
    const nativeFiles: ({ id: number } & (typeof ggufFiles)[number])[] = [];
    for (const file of ggufFiles) {
      const needAllocBuffer = !this.useAsyncFile; // only alloc if mmap is used
      const id = await this.fileAlloc(
        file.name,
        file.blob.size,
        needAllocBuffer
      );
      nativeFiles.push({ id, ...file });
      if (this.useAsyncFile) {
        // kept so fileReadResponse() can serve slices on demand
        this.fileBlobs.set(file.name, file.blob);
      }
    }

    // stream files (only used in non async - mmap mode)
    if (!this.useAsyncFile) {
      await Promise.all(
        nativeFiles.map((file) => this.fileWrite(file.id, file.blob))
      );
    }

    return res;
  }

  /**
   * Send 'wllama.start' and return the parsed JSON reply.
   * @throws WllamaRuntimeError when the reply carries an `error` field.
   */
  async wllamaStart(): Promise<number> {
    const result = await this.pushTask({
      verb: 'wllama.start',
      args: [],
      callbackId: this.taskId++,
    });
    return this.parseResult(result);
  }

  /**
   * Serialize `body` with the glue protocol, send it as a 'wllama.action'
   * task and deserialize the reply.
   *
   * @param name Action name forwarded to the worker.
   * @param body Glue message to send.
   */
  async wllamaAction<T extends GlueMsg>(
    name: string,
    body: GlueMsg
  ): Promise<T> {
    const encodedMsg = glueSerialize(body);
    const result = await this.pushTask({
      verb: 'wllama.action',
      args: [name, encodedMsg],
      callbackId: this.taskId++,
    });
    return glueDeserialize(result) as T;
  }

  /**
   * Terminate the worker, if any.
   *
   * No 'wllama.exit' task is sent: terminating the worker is faster and its
   * resources are cleaned up by the browser. The worker reference and the
   * retained file Blobs are released, and tasks that can no longer receive a
   * reply are rejected instead of staying pending forever.
   */
  async wllamaExit(): Promise<void> {
    const worker = this.worker;
    if (!worker) {
      return;
    }
    worker.terminate();
    this.worker = undefined;
    this.fileBlobs.clear();
    if (this.resultQueue.length > 0 || this.taskQueue.length > 0) {
      this.abort('Worker was terminated before the task completed', '');
    }
  }

  /** Send 'wllama.debug' and return the JSON-parsed reply. */
  async wllamaDebug(): Promise<any> {
    const result = await this.pushTask({
      verb: 'wllama.debug',
      args: [],
      callbackId: this.taskId++,
    });
    return JSON.parse(result);
  }

  ///////////////////////////////////////

  /**
   * Allocate a new file in heapfs
   * @returns fileId, to be used by fileWrite()
   */
  private async fileAlloc(
    fileName: string,
    size: number,
    allocBuffer: boolean
  ): Promise<number> {
    const result = await this.pushTask({
      verb: 'fs.alloc',
      args: [fileName, size, allocBuffer],
      callbackId: this.taskId++,
    });
    return result.fileId;
  }

  /**
   * Write a Blob to heapfs, one stream chunk per 'fs.write' task.
   * Each chunk is awaited before the next one is read.
   */
  private async fileWrite(fileId: number, blob: Blob): Promise<void> {
    const reader = blob.stream().getReader();
    let offset = 0;
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      const size = value.byteLength;
      await this.pushTask(
        {
          verb: 'fs.write',
          args: [fileId, value, offset],
          callbackId: this.taskId++,
        },
        // @ts-ignore Type 'ArrayBufferLike' is not assignable to type 'ArrayBuffer'
        [value.buffer]
      );
      offset += size;
    }
  }

  /**
   * Answer a file read request from the worker with the requested slice of
   * the registered Blob. On any failure the worker is terminated and all
   * pending tasks are rejected.
   */
  private async fileReadResponse(
    name: string,
    offset: number,
    size: number
  ): Promise<void> {
    try {
      const blob = this.fileBlobs.get(name);
      if (!blob) {
        throw new Error(`blob not found for name="${name}"`);
      }
      const chunk = blob.slice(offset, offset + size);
      const buffer = await chunk.arrayBuffer();
      const worker = this.worker;
      if (!worker) {
        throw new Error('worker is not running');
      }
      worker.postMessage(
        { verb: 'fs.read_res', args: [buffer] },
        { transfer: [buffer] }
      );
    } catch (err) {
      this.logger.error('fileReadResponse failed, terminating worker:', err);
      this.worker?.terminate();
      this.worker = undefined;
      const stack =
        (err as { stack?: string } | null | undefined)?.stack || '';
      this.abort(`File read failed: ${err}`, stack);
    }
  }

  /**
   * Parse JSON result returned by cpp code.
   * Throws WllamaRuntimeError if a truthy "error" field is present in the response.
   *
   * TODO: get rid of this function once everything is migrated to Glue
   */
  private parseResult(result: any): any {
    const parsedResult = JSON.parse(result);
    if (parsedResult && parsedResult['error']) {
      throw new WllamaRuntimeError('Unknown error, please see console.log', '');
    }
    return parsedResult;
  }

  /**
   * Push a new task to taskQueue
   * @returns Promise settled with the worker's reply for this task.
   */
  private pushTask(param: TaskParam, buffers?: ArrayBuffer[]) {
    return new Promise<any>((resolve, reject) => {
      this.taskQueue.push({ resolve, reject, param, buffers });
      void this.runTaskLoop();
    });
  }

  /**
   * Main loop for processing tasks: moves every queued task to resultQueue
   * and posts it to the worker.
   *
   * A task that cannot be posted (no worker, or postMessage throws) is
   * rejected on the spot, and `busy` is always reset, so one failure cannot
   * block the tasks that follow.
   */
  private async runTaskLoop(): Promise<void> {
    if (this.busy) {
      return; // another loop is already running
    }
    this.busy = true;
    try {
      let task: Task | undefined;
      while ((task = this.taskQueue.shift())) {
        this.resultQueue.push(task);
        try {
          const worker = this.worker;
          if (!worker) {
            throw new WllamaRuntimeError('Worker is not running', '');
          }
          // TODO @ngxson : Safari mobile doesn't support transferable ArrayBuffer
          worker.postMessage(
            task.param,
            isSafariMobile()
              ? undefined
              : {
                  transfer: task.buffers ?? [],
                }
          );
        } catch (err) {
          // no reply will ever arrive for this task
          const idx = this.resultQueue.lastIndexOf(task);
          if (idx !== -1) {
            this.resultQueue.splice(idx, 1);
          }
          task.reject(err);
        }
      }
    } finally {
      this.busy = false;
    }
  }

  /**
   * Handle messages from worker: forwarded console output, abort signals,
   * file read requests and task results.
   */
  private onRecvMsg(e: MessageEvent<any>) {
    if (!e.data) return; // ignore
    const { verb, args } = e.data;
    const isCompatBuild = this.resources.compat;

    if (typeof verb === 'string' && verb.startsWith('console.')) {
      if (this.suppressNativeLog) {
        return;
      }
      const logArgs: any[] = args ?? [];
      if (verb.endsWith('debug')) this.logger.debug(...logArgs);
      if (verb.endsWith('log')) this.logger.log(...logArgs);
      if (verb.endsWith('warn')) this.logger.warn(...logArgs);
      if (verb.endsWith('error')) this.logger.error(...logArgs);
      return;
    }

    if (verb === 'signal.abort') {
      const [signalType, message, rawStack, originalErr] = (args ?? []) as [
        string,
        string,
        string,
        any,
      ];
      if (originalErr) {
        this.logger.error(originalErr);
      }
      void this.handleAbortSignal(
        signalType,
        message,
        rawStack,
        isCompatBuild
      );
      return;
    }

    // handle fs.read_req signal from wasm (JSPI-suspended worker)
    if (verb === FILE_READ_REQ_EVENT) {
      const [name, offset, size] = args as [string, number, number];
      void this.fileReadResponse(name, offset, size).catch(() => {}); // errors handled inside
      return;
    }

    // handle task result
    const { callbackId, result, err } = e.data;
    if (callbackId) {
      const idx = this.resultQueue.findIndex(
        (t) => t.param.callbackId === callbackId
      );
      if (idx !== -1) {
        const waitingTask = this.resultQueue.splice(idx, 1)[0];
        if (err) waitingTask.reject(err);
        else waitingTask.resolve(result);
      } else {
        this.logger.error(
          `Cannot find waiting task with callbackId = ${callbackId}`
        );
      }
    }
  }

  /**
   * Turn a 'signal.abort' message into a rejection of every pending task.
   *
   * The stack is decoded through Debug.decodeStackTrace(); if decoding
   * fails, the undecoded stack is used so that pending tasks are still
   * rejected.
   */
  private async handleAbortSignal(
    signalType: string,
    message: string,
    rawStack: string,
    isCompatBuild: boolean
  ): Promise<void> {
    const text = typeof message === 'string' ? message : String(message ?? '');
    const raw = typeof rawStack === 'string' ? rawStack : '';
    let newMsg = text.replace('Build with -sASSERTIONS for more info.', '');
    let stack = '';
    if (signalType === 'abort') {
      newMsg = `(ABORT) ${newMsg}`;
      stack = raw.replace(/\|/g, '\n');
    } else if (signalType === 'exception') {
      stack = raw;
    }
    let decoded = stack;
    try {
      decoded = await Debug.decodeStackTrace(stack, isCompatBuild);
    } catch (decodeErr) {
      this.logger.warn('Failed to decode stack trace:', decodeErr);
    }
    this.logger.error(`Stack trace (${signalType}):\n` + decoded);
    this.abort(newMsg, decoded);
  }

  /**
   * Reject every posted and every queued task with a WllamaRuntimeError.
   * @param text Error message; replaced by '(unknown error)' when empty.
   * @param stack Stack trace attached to the error.
   */
  private abort(text: string, stack: string) {
    const error = new WllamaRuntimeError(
      text.length == 0 ? '(unknown error)' : text,
      stack
    );
    for (const queue of [this.resultQueue, this.taskQueue]) {
      let task: Task | undefined;
      while ((task = queue.pop())) {
        task.reject(error);
      }
    }
  }
}