web-llm-runner 0.1.14 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/index.js CHANGED
@@ -36688,6 +36688,7 @@ class ONNXEngine {
  modelId = null;
  appConfig;
  initProgressCallback;
+ repoId = null;
  // APIs
  chat;
  completions;
@@ -36719,6 +36720,7 @@ class ONNXEngine {
  const { findModelRecord } = await Promise.resolve().then(function () { return support; });
  const record = findModelRecord(id, this.appConfig);
  repoId = record.onnx_id || id;
+ this.repoId = repoId;
  }
  catch (e) {
  log.warn(`Model record not found for ${id}, using raw ID for ONNX.`);
@@ -36739,8 +36741,9 @@ class ONNXEngine {
  }
  try {
  // For T5 models, text2text-generation is the standard task in transformers.js
- const task = repoId.toLowerCase().includes("t5") ? "text2text-generation" : "text-generation";
- this.generator = await pipeline(task, repoId, {
+ const currentRepoId = this.repoId || id;
+ const task = currentRepoId.toLowerCase().includes("t5") ? "text2text-generation" : "text-generation";
+ this.generator = await pipeline(task, currentRepoId, {
  progress_callback: (p) => {
  if (this.initProgressCallback && (p.status === 'progress' || p.status === 'downloading')) {
  const pctValue = (typeof p.progress === 'number') ? p.progress : 0;
@@ -36812,28 +36815,144 @@ class ONNXEngine {
  };
  }
  async *asyncGenerateStreaming(prompt, request) {
- // Current simple implementation yields only a single chunk.
- // In future iterations, we can integrate the Transformers.js TextStreamer
- const result = await this.generateNonStreaming(prompt, request);
- const content = result.choices[0].message.content;
+ if (!this.generator)
+ throw new Error("ONNX model not loaded.");
+ const model = this.modelId;
+ const created = Math.floor(Date.now() / 1000);
+ const id = crypto.randomUUID();
+ const queue = [];
+ let isDone = false;
+ let fullTextSoFar = "";
+ // Run generation in the background
+ (this.repoId || "").toLowerCase().includes("t5") ? "text2text-generation" : "text-generation";
+ this.generator(prompt, {
+ max_new_tokens: request.max_tokens || 256,
+ temperature: request.temperature || 0.7,
+ top_p: request.top_p || 1.0,
+ do_sample: (request.temperature ?? 1.0) > 0,
+ repetition_penalty: request.repetition_penalty || 1.1,
+ callback_function: (beams) => {
+ const decoded = this.generator.tokenizer.decode(beams[0].output_token_ids, { skip_special_tokens: true });
+ const delta = decoded.slice(fullTextSoFar.length);
+ if (delta) {
+ queue.push(delta);
+ fullTextSoFar = decoded;
+ }
+ },
+ }).finally(() => {
+ isDone = true;
+ });
+ while (!isDone || queue.length > 0) {
+ if (queue.length > 0) {
+ const content = queue.shift();
+ yield {
+ id,
+ choices: [{
+ delta: { content },
+ finish_reason: null,
+ index: 0,
+ }],
+ model,
+ object: 'chat.completion.chunk',
+ created,
+ };
+ }
+ else {
+ await new Promise(r => setTimeout(r, 10));
+ }
+ }
  yield {
- id: result.id,
+ id,
  choices: [{
- delta: { role: 'assistant', content: content },
+ delta: {},
  finish_reason: 'stop',
  index: 0,
- logprobs: null
  }],
- model: result.model,
+ model,
  object: 'chat.completion.chunk',
- created: result.created
+ created,
  };
  }
- async completion(_request) {
- throw new Error("Generic completion not yet implemented in ONNXEngine fallback.");
+ async completion(request) {
+ if (!this.generator)
+ throw new Error("ONNX model not loaded.");
+ const prompt = typeof request.prompt === 'string' ? request.prompt : (Array.isArray(request.prompt) ? request.prompt[0] : "");
+ if (request.stream) {
+ return this.asyncGenerateStreamingCompletion(prompt, request);
+ }
+ else {
+ const result = await this.generator(prompt, {
+ max_new_tokens: request.max_tokens || 256,
+ temperature: request.temperature || 0.7,
+ top_p: request.top_p || 1.0,
+ do_sample: (request.temperature ?? 1.0) > 0,
+ repetition_penalty: request.repetition_penalty || 1.1,
+ });
+ const fullText = result[0].generated_text;
+ const text = fullText.startsWith(prompt) ? fullText.slice(prompt.length) : fullText;
+ return {
+ id: crypto.randomUUID(),
+ choices: [{
+ text,
+ finish_reason: 'stop',
+ index: 0,
+ logprobs: null
+ }],
+ model: this.modelId,
+ object: 'text_completion',
+ created: Math.floor(Date.now() / 1000),
+ usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }
+ };
+ }
  }
- async embedding(_request) {
- throw new Error("Embeddings not yet implemented in ONNXEngine fallback.");
+ async *asyncGenerateStreamingCompletion(prompt, request) {
+ const id = crypto.randomUUID();
+ const created = Math.floor(Date.now() / 1000);
+ const model = this.modelId;
+ const queue = [];
+ let isDone = false;
+ let fullTextSoFar = "";
+ this.generator(prompt, {
+ max_new_tokens: request.max_tokens || 256,
+ temperature: request.temperature || 0.7,
+ callback_function: (beams) => {
+ const decoded = this.generator.tokenizer.decode(beams[0].output_token_ids, { skip_special_tokens: true });
+ const delta = decoded.slice(fullTextSoFar.length);
+ if (delta) {
+ queue.push(delta);
+ fullTextSoFar = decoded;
+ }
+ },
+ }).finally(() => { isDone = true; });
+ while (!isDone || queue.length > 0) {
+ if (queue.length > 0) {
+ yield {
+ id,
+ choices: [{ text: queue.shift(), finish_reason: null, index: 0 }],
+ model,
+ object: 'text_completion',
+ created,
+ };
+ }
+ else {
+ await new Promise(r => setTimeout(r, 10));
+ }
+ }
+ }
+ async embedding(request) {
+ const input = Array.isArray(request.input) ? request.input : [request.input];
+ const extractor = await pipeline('feature-extraction', this.modelId);
+ const results = await Promise.all(input.map(text => extractor(text, { pooling: 'mean', normalize: true })));
+ return {
+ object: 'list',
+ data: results.map((res, i) => ({
+ object: 'embedding',
+ index: i,
+ embedding: Array.from(res.data)
+ })),
+ model: this.modelId,
+ usage: { prompt_tokens: 0, total_tokens: 0, extra: {} }
+ };
  }
  async runtimeStatsText() {
  return "Backend: ONNX Runtime (WASM/CPU Falback)";
@@ -38031,10 +38150,35 @@ class WebLLM {
  ];
  list = list.filter(m => approvedIds.includes(m.model_id));
  }
+ else {
+ // On Desktop, filter out those that are exclusively ONNX-id based (not for WebGPU)
+ list = list.filter(m => !m.onnx_id);
+ }
  return list.map((m) => m.model_id);
  }
  async local_model_available(model_id) {
- return await hasModelInCache(model_id);
+ const isMLCCached = await hasModelInCache(model_id);
+ if (isMLCCached)
+ return true;
+ // Check ONNX cache fallback
+ const record = prebuiltAppConfig.model_list.find(m => m.model_id === model_id);
+ if (record && record.onnx_id) {
+ return await this.hasONNXInCache(record.onnx_id);
+ }
+ return false;
+ }
+ async hasONNXInCache(onnx_id) {
+ if (typeof caches === 'undefined')
+ return false;
+ try {
+ const cache = await caches.open('transformers-cache');
+ const url = `https://huggingface.co/${onnx_id}/resolve/main/config.json`;
+ const match = await cache.match(url);
+ return !!match;
+ }
+ catch (e) {
+ return false;
+ }
  }
  async download_model(model_id, progressCallback) {
  // Initial feedback
@@ -38055,6 +38199,14 @@ class WebLLM {
  return this.downloadProgress[model_id] || "No progress available.";
  }
  async delete_model(model_id) {
+ const record = prebuiltAppConfig.model_list.find(m => m.model_id === model_id);
+ if (record && record.onnx_id) {
+ // For ONNX, we currently clear the whole transformers-cache for simplicity
+ // as individual file deletion is complex without a full manifest.
+ if (typeof caches !== 'undefined') {
+ await caches.delete('transformers-cache');
+ }
+ }
  await deleteModelAllInfoInCache(model_id);
  }
  // chat endpoints (Stateful)
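
The new ONNXEngine fallback surfaces OpenAI-style objects: `asyncGenerateStreaming` yields `chat.completion.chunk` deltas, `completion` returns (or streams) `text_completion` objects, and `embedding` returns a `list` of vectors. A minimal consumption sketch follows, assuming `engine` is an already-loaded ONNXEngine instance; how that instance is obtained is not shown in this diff, so the setup is hypothetical.

```js
// Sketch only: `engine` is assumed to be an initialized ONNXEngine with a model loaded.
async function demo(engine) {
  // Streamed chat-style generation: each chunk mirrors OpenAI's chat.completion.chunk shape;
  // the final chunk carries an empty delta and finish_reason 'stop'.
  let text = "";
  for await (const chunk of engine.asyncGenerateStreaming("Hello!", { max_tokens: 64, temperature: 0.7 })) {
    const delta = chunk.choices[0].delta.content;
    if (delta) text += delta;
  }
  console.log(text);

  // Non-streaming completion (object: 'text_completion'); the prompt prefix is stripped from the output.
  const completion = await engine.completion({ prompt: "Once upon a time", max_tokens: 32 });
  console.log(completion.choices[0].text);

  // Embeddings: one mean-pooled, normalized vector per input string.
  const emb = await engine.embedding({ input: ["hello world"] });
  console.log(emb.data[0].embedding.length);
}
```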