inferis-ml 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +664 -0
- package/dist/adapters/onnx.cjs +2 -0
- package/dist/adapters/onnx.cjs.map +1 -0
- package/dist/adapters/onnx.d.cts +28 -0
- package/dist/adapters/onnx.d.ts +28 -0
- package/dist/adapters/onnx.js +2 -0
- package/dist/adapters/onnx.js.map +1 -0
- package/dist/adapters/transformers.cjs +2 -0
- package/dist/adapters/transformers.cjs.map +1 -0
- package/dist/adapters/transformers.d.cts +29 -0
- package/dist/adapters/transformers.d.ts +29 -0
- package/dist/adapters/transformers.js +2 -0
- package/dist/adapters/transformers.js.map +1 -0
- package/dist/adapters/web-llm.cjs +2 -0
- package/dist/adapters/web-llm.cjs.map +1 -0
- package/dist/adapters/web-llm.d.cts +31 -0
- package/dist/adapters/web-llm.d.ts +31 -0
- package/dist/adapters/web-llm.js +2 -0
- package/dist/adapters/web-llm.js.map +1 -0
- package/dist/index.cjs +2 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +543 -0
- package/dist/index.d.ts +543 -0
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -0
- package/dist/types-Y6Ytjh7U.d.cts +271 -0
- package/dist/types-Y6Ytjh7U.d.ts +271 -0
- package/dist/worker/dedicated.worker.cjs +2 -0
- package/dist/worker/dedicated.worker.cjs.map +1 -0
- package/dist/worker/dedicated.worker.d.cts +17 -0
- package/dist/worker/dedicated.worker.d.ts +17 -0
- package/dist/worker/dedicated.worker.js +502 -0
- package/dist/worker/dedicated.worker.js.map +1 -0
- package/dist/worker/shared.worker.js +460 -0
- package/package.json +103 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 inferis contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,664 @@
|
|
|
1
|
+
# inferis-ml
|
|
2
|
+
|
|
3
|
+
Worker pool for running AI models in the browser — WebGPU/WASM auto-detection, model lifecycle management, token streaming, and cross-tab deduplication.
|
|
4
|
+
|
|
5
|
+
[](https://npmjs.com/package/inferis-ml)
|
|
6
|
+
[](https://bundlephobia.com/package/inferis-ml)
|
|
7
|
+
[](https://github.com/pashunechka/inferis-ml)
|
|
8
|
+
|
|
9
|
+
> **[Live Examples](https://pashunechka.github.io/inferis-ml/)** — run AI models directly in your browser, no server needed.
|
|
10
|
+
|
|
11
|
+
## What is this
|
|
12
|
+
|
|
13
|
+
You want to add smart search, speech recognition, or a chatbot to your website. Normally this requires a server — you send a request to the cloud, wait for a response, pay per call.
|
|
14
|
+
|
|
15
|
+
**inferis-ml** lets you run AI models directly in the user's browser. The model downloads once, then runs on the user's GPU/CPU. No server, no per-request cost, no data leaving the device.
|
|
16
|
+
|
|
17
|
+
The catch: running a neural network in the browser is technically painful. Run it on the main thread and the page freezes. Move it to a Web Worker and you're writing `postMessage` boilerplate. inferis-ml takes that pain away.
|
|
18
|
+
|
|
19
|
+
### Three problems it solves
|
|
20
|
+
|
|
21
|
+
**1. Page freezes during inference**
|
|
22
|
+
|
|
23
|
+
Without inferis-ml, running a model on the main thread locks the UI. With inferis-ml, work runs in a background worker — the page stays responsive.
|
|
24
|
+
|
|
25
|
+
**2. 5 open tabs = 5 model copies in RAM**
|
|
26
|
+
|
|
27
|
+
Without inferis-ml: 5 tabs × 2 GB LLM = 10 GB RAM. Browser crashes.
|
|
28
|
+
With `crossTab: true`: all tabs share one worker, one model copy in memory.
|
|
29
|
+
|
|
30
|
+
**3. WebGPU not available everywhere**
|
|
31
|
+
|
|
32
|
+
Without inferis-ml: you manually detect WebGPU and swap backends.
|
|
33
|
+
With `defaultDevice: 'auto'`: inferis-ml tries WebGPU, silently falls back to WASM if unavailable.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Why
|
|
38
|
+
|
|
39
|
+
Existing browser AI runtimes (transformers.js, web-llm, onnxruntime-web) give you inference but leave worker management entirely to you:
|
|
40
|
+
|
|
41
|
+
- Create the Web Worker manually
|
|
42
|
+
- Wire up `postMessage` and response correlation
|
|
43
|
+
- Implement model lifecycle (load → infer → dispose)
|
|
44
|
+
- Avoid loading the same model twice across browser tabs
|
|
45
|
+
- Handle WebGPU → WASM fallback
|
|
46
|
+
- Evict models when memory budget is exceeded
|
|
47
|
+
- Forward streaming tokens to the UI
|
|
48
|
+
|
|
49
|
+
**inferis-ml** handles all of this. You get a clean async API and focus on building the product.
|
|
50
|
+
|
|
51
|
+
## Features
|
|
52
|
+
|
|
53
|
+
- **Runtime-agnostic** — adapters for `@huggingface/transformers`, `@mlc-ai/web-llm`, `onnxruntime-web`, or your own
|
|
54
|
+
- **Zero framework dependencies** — works with React, Vue, Svelte, or vanilla JS
|
|
55
|
+
- **WebGPU → WASM fallback** — auto-detected, or configured explicitly
|
|
56
|
+
- **Streaming** — `ReadableStream` + `for await` for token-by-token LLM output
|
|
57
|
+
- **Memory budget** — LRU eviction when models exceed the configured cap
|
|
58
|
+
- **Cross-tab dedup** — SharedWorker (tier 1), leader election (tier 2), or per-tab fallback (tier 3)
|
|
59
|
+
- **AbortController** — cancel any in-flight inference
|
|
60
|
+
- **TypeScript** — full type safety, generic output types
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
npm install inferis-ml
|
|
66
|
+
|
|
67
|
+
# Install the adapter you need (optional peer deps):
|
|
68
|
+
npm install @huggingface/transformers # for transformersAdapter
|
|
69
|
+
npm install @mlc-ai/web-llm # for webLlmAdapter
|
|
70
|
+
npm install onnxruntime-web # for onnxAdapter
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Use Cases
|
|
74
|
+
|
|
75
|
+
### Semantic search over articles
|
|
76
|
+
|
|
77
|
+
User types a query — find articles by meaning, not just keywords.
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
import { createPool } from 'inferis-ml';
|
|
81
|
+
import { transformersAdapter } from 'inferis-ml/adapters/transformers';
|
|
82
|
+
|
|
83
|
+
const pool = await createPool({ adapter: transformersAdapter() });
|
|
84
|
+
|
|
85
|
+
const embedder = await pool.load<number[][]>('feature-extraction', {
|
|
86
|
+
model: 'mixedbread-ai/mxbai-embed-xsmall-v1',
|
|
87
|
+
onProgress: ({ phase, loaded, total }) => {
|
|
88
|
+
const pct = total > 0 ? Math.round(loaded / total * 100) : 0;
|
|
89
|
+
updateProgressBar(pct, phase);
|
|
90
|
+
},
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
const articles = ['How to choose a laptop', 'Borscht recipe', 'History of Rome'];
|
|
94
|
+
const embeddings = await embedder.run(articles);
|
|
95
|
+
// embeddings: number[][] — one vector per article
|
|
96
|
+
|
|
97
|
+
const query = await embedder.run(['buy a computer']);
|
|
98
|
+
// compare query[0] against embeddings with cosine similarity
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Chatbot with streaming output
|
|
102
|
+
|
|
103
|
+
Answer appears word by word, like ChatGPT.
|
|
104
|
+
|
|
105
|
+
```typescript
|
|
106
|
+
import { createPool } from 'inferis-ml';
|
|
107
|
+
import { webLlmAdapter } from 'inferis-ml/adapters/web-llm';
|
|
108
|
+
|
|
109
|
+
const pool = await createPool({
|
|
110
|
+
adapter: webLlmAdapter(),
|
|
111
|
+
maxWorkers: 1, // LLMs use one GPU context
|
|
112
|
+
defaultDevice: 'webgpu',
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
const llm = await pool.load<string>('text-generation', {
|
|
116
|
+
model: 'Llama-3.2-3B-Instruct-q4f32_1-MLC',
|
|
117
|
+
onProgress: ({ phase }) => setStatus(phase),
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
const outputDiv = document.getElementById('answer');
|
|
121
|
+
const stream = llm.stream({
|
|
122
|
+
messages: [
|
|
123
|
+
{ role: 'system', content: 'You are a helpful assistant.' },
|
|
124
|
+
{ role: 'user', content: userQuestion },
|
|
125
|
+
],
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
for await (const token of stream) {
|
|
129
|
+
outputDiv.textContent += token;
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Speech transcription
|
|
134
|
+
|
|
135
|
+
```typescript
|
|
136
|
+
const transcriber = await pool.load<{ text: string }>('automatic-speech-recognition', {
|
|
137
|
+
model: 'openai/whisper-base',
|
|
138
|
+
estimatedMemoryMB: 80,
|
|
139
|
+
});
|
|
140
|
+
|
|
141
|
+
const audioData = await getMicrophoneAudio(); // Float32Array
|
|
142
|
+
const result = await transcriber.run(audioData);
|
|
143
|
+
console.log(result.text); // "Hello, how are you?"
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Cancel a running request
|
|
147
|
+
|
|
148
|
+
```typescript
|
|
149
|
+
const controller = new AbortController();
|
|
150
|
+
stopButton.onclick = () => controller.abort();
|
|
151
|
+
|
|
152
|
+
try {
|
|
153
|
+
const stream = llm.stream(input, { signal: controller.signal });
|
|
154
|
+
for await (const token of stream) {
|
|
155
|
+
outputDiv.textContent += token;
|
|
156
|
+
}
|
|
157
|
+
} catch (e) {
|
|
158
|
+
if (e.name === 'AbortError') outputDiv.textContent += ' [stopped]';
|
|
159
|
+
}
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Model state changes
|
|
163
|
+
|
|
164
|
+
```typescript
|
|
165
|
+
model.onStateChange((state) => {
|
|
166
|
+
if (state === 'loading') showSpinner();
|
|
167
|
+
if (state === 'ready') hideSpinner();
|
|
168
|
+
if (state === 'error') showError('Failed to load model');
|
|
169
|
+
if (state === 'disposed') disableUI();
|
|
170
|
+
});
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Quick Start
|
|
176
|
+
|
|
177
|
+
### Embeddings
|
|
178
|
+
|
|
179
|
+
```typescript
|
|
180
|
+
import { createPool } from 'inferis-ml';
|
|
181
|
+
import { transformersAdapter } from 'inferis-ml/adapters/transformers';
|
|
182
|
+
|
|
183
|
+
const pool = await createPool({
|
|
184
|
+
adapter: transformersAdapter(),
|
|
185
|
+
});
|
|
186
|
+
|
|
187
|
+
const model = await pool.load<number[][]>('feature-extraction', {
|
|
188
|
+
model: 'mixedbread-ai/mxbai-embed-xsmall-v1',
|
|
189
|
+
onProgress: (p) => console.log(`${p.phase}: ${(p.loaded / p.total * 100) | 0}%`),
|
|
190
|
+
});
|
|
191
|
+
|
|
192
|
+
const embeddings = await model.run(['Hello world', 'Another sentence']);
|
|
193
|
+
// embeddings: number[][]
|
|
194
|
+
|
|
195
|
+
await model.dispose();
|
|
196
|
+
await pool.terminate();
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### LLM Streaming
|
|
200
|
+
|
|
201
|
+
```typescript
|
|
202
|
+
import { createPool } from 'inferis-ml';
|
|
203
|
+
import { webLlmAdapter } from 'inferis-ml/adapters/web-llm';
|
|
204
|
+
|
|
205
|
+
const pool = await createPool({
|
|
206
|
+
adapter: webLlmAdapter(),
|
|
207
|
+
defaultDevice: 'webgpu',
|
|
208
|
+
maxWorkers: 1,
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
const llm = await pool.load<string>('text-generation', {
|
|
212
|
+
model: 'Llama-3.1-8B-Instruct-q4f32_1-MLC',
|
|
213
|
+
onProgress: ({ phase }) => console.log(phase),
|
|
214
|
+
});
|
|
215
|
+
|
|
216
|
+
const stream = llm.stream({
|
|
217
|
+
messages: [
|
|
218
|
+
{ role: 'system', content: 'You are a helpful assistant.' },
|
|
219
|
+
{ role: 'user', content: 'Explain WebGPU in 3 sentences.' },
|
|
220
|
+
],
|
|
221
|
+
});
|
|
222
|
+
|
|
223
|
+
const output = document.getElementById('output');
|
|
224
|
+
for await (const token of stream) {
|
|
225
|
+
output.textContent += token;
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Abort
|
|
230
|
+
|
|
231
|
+
```typescript
|
|
232
|
+
const ctrl = new AbortController();
|
|
233
|
+
|
|
234
|
+
const stream = llm.stream(input, { signal: ctrl.signal });
|
|
235
|
+
|
|
236
|
+
// Cancel after 5 seconds
|
|
237
|
+
setTimeout(() => ctrl.abort(), 5000);
|
|
238
|
+
|
|
239
|
+
try {
|
|
240
|
+
for await (const token of stream) {
|
|
241
|
+
updateUI(token);
|
|
242
|
+
}
|
|
243
|
+
} catch (e) {
|
|
244
|
+
if (e.name === 'AbortError') console.log('Cancelled');
|
|
245
|
+
}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### Cross-Tab Deduplication
|
|
249
|
+
|
|
250
|
+
```typescript
|
|
251
|
+
// Enable cross-tab model sharing.
|
|
252
|
+
// If you open 5 tabs, the model is loaded only once.
|
|
253
|
+
const pool = await createPool({
|
|
254
|
+
adapter: transformersAdapter(),
|
|
255
|
+
crossTab: true, // auto-selects SharedWorker > leader election > per-tab
|
|
256
|
+
});
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Capability Detection
|
|
260
|
+
|
|
261
|
+
```typescript
|
|
262
|
+
import { detectCapabilities } from 'inferis-ml';
|
|
263
|
+
|
|
264
|
+
const caps = await detectCapabilities();
|
|
265
|
+
|
|
266
|
+
if (caps.webgpu.supported) {
|
|
267
|
+
console.log('GPU vendor:', caps.webgpu.adapter?.vendor);
|
|
268
|
+
console.log('Max buffer:', caps.webgpu.limits?.maxBufferSize);
|
|
269
|
+
} else {
|
|
270
|
+
console.log('Falling back to WASM');
|
|
271
|
+
console.log('SIMD support:', caps.wasm.simd);
|
|
272
|
+
}
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
## Custom Adapter
|
|
276
|
+
|
|
277
|
+
```typescript
|
|
278
|
+
import type { ModelAdapter, ModelAdapterFactory } from 'inferis-ml';
|
|
279
|
+
|
|
280
|
+
export function myCustomAdapter(): ModelAdapterFactory {
|
|
281
|
+
return {
|
|
282
|
+
name: 'my-adapter',
|
|
283
|
+
|
|
284
|
+
async create(): Promise<ModelAdapter> {
|
|
285
|
+
// This runs INSIDE the worker — safe to import heavy libs here
|
|
286
|
+
const { MyRuntime } = await import('my-runtime');
|
|
287
|
+
|
|
288
|
+
return {
|
|
289
|
+
name: 'my-adapter',
|
|
290
|
+
|
|
291
|
+
estimateMemoryMB(_task, config) {
|
|
292
|
+
return (config.estimatedMemoryMB as number) ?? 50;
|
|
293
|
+
},
|
|
294
|
+
|
|
295
|
+
async load(task, config, device, onProgress) {
|
|
296
|
+
onProgress({ phase: 'loading', loaded: 0, total: 1 });
|
|
297
|
+
const instance = await MyRuntime.load(config.model as string, { device });
|
|
298
|
+
onProgress({ phase: 'done', loaded: 1, total: 1 });
|
|
299
|
+
return { instance, memoryMB: 50 };
|
|
300
|
+
},
|
|
301
|
+
|
|
302
|
+
async run(model, input) {
|
|
303
|
+
return (model.instance as MyRuntime).infer(input);
|
|
304
|
+
},
|
|
305
|
+
|
|
306
|
+
async stream(model, input, onChunk) {
|
|
307
|
+
for await (const chunk of (model.instance as MyRuntime).stream(input)) {
|
|
308
|
+
onChunk(chunk);
|
|
309
|
+
}
|
|
310
|
+
},
|
|
311
|
+
|
|
312
|
+
async unload(model) {
|
|
313
|
+
await (model.instance as MyRuntime).dispose();
|
|
314
|
+
},
|
|
315
|
+
};
|
|
316
|
+
},
|
|
317
|
+
};
|
|
318
|
+
}
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
## API Reference
|
|
322
|
+
|
|
323
|
+
### `createPool(config)`
|
|
324
|
+
|
|
325
|
+
```typescript
|
|
326
|
+
const pool = await createPool({
|
|
327
|
+
adapter: transformersAdapter(), // required
|
|
328
|
+
|
|
329
|
+
workerUrl: new URL('inferis-ml/worker', import.meta.url), // worker bundle URL
|
|
330
|
+
maxWorkers: navigator.hardwareConcurrency - 1, // default: cores - 1
|
|
331
|
+
maxMemoryMB: 2048, // default: 2048
|
|
332
|
+
defaultDevice: 'auto', // 'webgpu' | 'wasm' | 'auto'
|
|
333
|
+
crossTab: false, // cross-tab dedup
|
|
334
|
+
taskTimeout: 120_000, // per-task timeout in ms
|
|
335
|
+
});
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### `pool.load<TOutput>(task, config)`
|
|
339
|
+
|
|
340
|
+
Loads a model and returns a `ModelHandle`. If the model is already loaded, returns the existing handle.
|
|
341
|
+
|
|
342
|
+
```typescript
|
|
343
|
+
const model = await pool.load<number[][]>('feature-extraction', {
|
|
344
|
+
model: 'mixedbread-ai/mxbai-embed-xsmall-v1',
|
|
345
|
+
estimatedMemoryMB: 30, // hint for memory budget (optional)
|
|
346
|
+
onProgress: (p) => { ... }, // download/load progress
|
|
347
|
+
});
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
### `ModelHandle<TOutput>`
|
|
351
|
+
|
|
352
|
+
| Method | Description |
|
|
353
|
+
|--------|-------------|
|
|
354
|
+
| `run(input, options?)` | Non-streaming inference. Returns `Promise<TOutput>`. |
|
|
355
|
+
| `stream(input, options?)` | Streaming inference. Returns `ReadableStream<TOutput>`. |
|
|
356
|
+
| `dispose()` | Unload model and free memory. |
|
|
357
|
+
| `onStateChange(cb)` | Subscribe to state changes. Returns unsubscribe function. |
|
|
358
|
+
| `id` | Unique model ID (`task:model`). |
|
|
359
|
+
| `state` | Current state: `idle \| loading \| ready \| inferring \| unloading \| error \| disposed`. |
|
|
360
|
+
| `memoryMB` | Approximate memory usage. |
|
|
361
|
+
| `device` | Resolved device: `webgpu` or `wasm`. |
|
|
362
|
+
|
|
363
|
+
### `InferenceOptions`
|
|
364
|
+
|
|
365
|
+
```typescript
|
|
366
|
+
interface InferenceOptions {
|
|
367
|
+
signal?: AbortSignal; // cancel via AbortController
|
|
368
|
+
priority?: 'high' | 'normal' | 'low'; // scheduling priority
|
|
369
|
+
}
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
## Bundler Setup
|
|
373
|
+
|
|
374
|
+
### Vite
|
|
375
|
+
|
|
376
|
+
```typescript
|
|
377
|
+
// vite.config.ts
|
|
378
|
+
export default {
|
|
379
|
+
worker: { format: 'es' },
|
|
380
|
+
};
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
```typescript
|
|
384
|
+
// Usage
|
|
385
|
+
const pool = await createPool({
|
|
386
|
+
adapter: transformersAdapter(),
|
|
387
|
+
workerUrl: new URL('inferis-ml/worker', import.meta.url),
|
|
388
|
+
});
|
|
389
|
+
```
|
|
390
|
+
|
|
391
|
+
### webpack 5
|
|
392
|
+
|
|
393
|
+
```typescript
|
|
394
|
+
// webpack.config.js
|
|
395
|
+
module.exports = {
|
|
396
|
+
experiments: { asyncWebAssembly: true },
|
|
397
|
+
};
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
### Inline Worker (no bundler config needed)
|
|
401
|
+
|
|
402
|
+
```typescript
|
|
403
|
+
import { createPool } from 'inferis-ml';
|
|
404
|
+
import { inlineWorkerUrl } from 'inferis-ml/worker-inline';
|
|
405
|
+
|
|
406
|
+
const pool = await createPool({
|
|
407
|
+
adapter: transformersAdapter(),
|
|
408
|
+
workerUrl: inlineWorkerUrl(), // creates a Blob URL
|
|
409
|
+
});
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
## Browser Support
|
|
413
|
+
|
|
414
|
+
| Feature | Chrome | Firefox | Safari | Edge | iOS Safari | Android Chrome |
|
|
415
|
+
|---------|--------|---------|--------|------|------------|----------------|
|
|
416
|
+
| Core (Worker + WASM) | 57+ | 52+ | 11+ | 16+ | 11+ | 57+ |
|
|
417
|
+
| WebGPU | 113+ | 141+ | 26+ | 113+ | 26+ | 121+ |
|
|
418
|
+
| WASM SIMD | 91+ | 89+ | 16.4+ | 91+ | 16.4+ | 91+ |
|
|
419
|
+
| SharedWorker (cross-tab tier 1) | 4+ | 29+ | 16+ | 79+ | — | — |
|
|
420
|
+
| Leader Election (cross-tab tier 2) | 69+ | 96+ | 15.4+ | 79+ | 15.4+ | 69+ |
|
|
421
|
+
| AbortController | 66+ | 57+ | 12.1+ | 16+ | 12.2+ | 66+ |
|
|
422
|
+
|
|
423
|
+
**Minimum requirement:** Web Workers + WebAssembly (97%+ of browsers worldwide).
|
|
424
|
+
All advanced features (WebGPU, SharedWorker, leader election) are progressive enhancements.
|
|
425
|
+
|
|
426
|
+
## Performance Tips
|
|
427
|
+
|
|
428
|
+
- **Set `maxWorkers: 1`** for GPU-bound workloads (LLMs) — GPU has one execution context.
|
|
429
|
+
- **Set `defaultDevice: 'webgpu'`** explicitly if you know your users have modern hardware.
|
|
430
|
+
- **Use `estimatedMemoryMB`** to help the memory budget make accurate eviction decisions.
|
|
431
|
+
- **Reuse `ModelHandle`** — loading a model already in state `ready` is a no-op.
|
|
432
|
+
- **Enable `crossTab: true`** for apps users open in multiple tabs (chat, document editors).
|
|
433
|
+
|
|
434
|
+
## Popular Models
|
|
435
|
+
|
|
436
|
+
Models are downloaded automatically from [Hugging Face Hub](https://huggingface.co/models) on first use and cached in the browser's Cache API. Subsequent page loads use the cache — no re-download, works offline.
|
|
437
|
+
|
|
438
|
+
### Embeddings / Semantic Search
|
|
439
|
+
|
|
440
|
+
| Model ID | Size | Notes |
|
|
441
|
+
|----------|------|-------|
|
|
442
|
+
| `mixedbread-ai/mxbai-embed-xsmall-v1` | 23 MB | Best quality/size ratio for English |
|
|
443
|
+
| `Xenova/all-MiniLM-L6-v2` | 23 MB | Popular general-purpose English embedding model |
|
|
444
|
+
| `Xenova/all-mpnet-base-v2` | 86 MB | Higher quality, larger |
|
|
445
|
+
| `Xenova/multilingual-e5-small` | 118 MB | 100+ languages |
|
|
446
|
+
|
|
447
|
+
```typescript
|
|
448
|
+
const model = await pool.load<number[][]>('feature-extraction', {
|
|
449
|
+
model: 'mixedbread-ai/mxbai-embed-xsmall-v1',
|
|
450
|
+
});
|
|
451
|
+
const vectors = await model.run(['Hello world', 'Another sentence']);
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
### Text Generation (LLM)
|
|
455
|
+
|
|
456
|
+
> Requires `@mlc-ai/web-llm` and `defaultDevice: 'webgpu'`. Models are large — download once, cached permanently.
|
|
457
|
+
|
|
458
|
+
| Model ID | Size | Notes |
|
|
459
|
+
|----------|------|-------|
|
|
460
|
+
| `Llama-3.2-1B-Instruct-q4f32_1-MLC` | 0.8 GB | Fastest, decent quality |
|
|
461
|
+
| `Llama-3.2-3B-Instruct-q4f32_1-MLC` | 2 GB | Good balance |
|
|
462
|
+
| `Phi-3.5-mini-instruct-q4f16_1-MLC` | 2.2 GB | Microsoft, strong reasoning |
|
|
463
|
+
| `Llama-3.1-8B-Instruct-q4f32_1-MLC` | 5 GB | Best quality, needs 8+ GB RAM |
|
|
464
|
+
| `gemma-2-2b-it-q4f16_1-MLC` | 1.5 GB | Google, fast on mobile GPU |
|
|
465
|
+
|
|
466
|
+
```typescript
|
|
467
|
+
const llm = await pool.load<string>('text-generation', {
|
|
468
|
+
model: 'Llama-3.2-3B-Instruct-q4f32_1-MLC',
|
|
469
|
+
});
|
|
470
|
+
const stream = llm.stream({ messages: [{ role: 'user', content: 'Hello!' }] });
|
|
471
|
+
for await (const token of stream) { outputDiv.textContent += token; }
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
### Speech Recognition
|
|
475
|
+
|
|
476
|
+
| Model ID | Size | Notes |
|
|
477
|
+
|----------|------|-------|
|
|
478
|
+
| `openai/whisper-tiny` | 39 MB | Fastest, lower accuracy |
|
|
479
|
+
| `openai/whisper-base` | 74 MB | Good balance |
|
|
480
|
+
| `openai/whisper-small` | 244 MB | Better accuracy |
|
|
481
|
+
| `openai/whisper-medium` | 769 MB | Near server-level accuracy |
|
|
482
|
+
|
|
483
|
+
```typescript
|
|
484
|
+
const model = await pool.load<{ text: string }>('automatic-speech-recognition', {
|
|
485
|
+
model: 'openai/whisper-base',
|
|
486
|
+
});
|
|
487
|
+
const result = await model.run(float32AudioArray);
|
|
488
|
+
console.log(result.text);
|
|
489
|
+
```
|
|
490
|
+
|
|
491
|
+
### Text Classification / Sentiment
|
|
492
|
+
|
|
493
|
+
| Model ID | Size | Notes |
|
|
494
|
+
|----------|------|-------|
|
|
495
|
+
| `Xenova/distilbert-base-uncased-finetuned-sst-2-english` | 67 MB | Positive/negative sentiment |
|
|
496
|
+
| `Xenova/bert-base-multilingual-uncased-sentiment` | 168 MB | Multilingual, 1–5 stars |
|
|
497
|
+
| `Xenova/toxic-bert` | 438 MB | Toxicity detection |
|
|
498
|
+
|
|
499
|
+
```typescript
|
|
500
|
+
const model = await pool.load<{ label: string; score: number }[]>('text-classification', {
|
|
501
|
+
model: 'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
|
|
502
|
+
});
|
|
503
|
+
const result = await model.run('I love this product!');
|
|
504
|
+
// [{ label: 'POSITIVE', score: 0.999 }]
|
|
505
|
+
```
|
|
506
|
+
|
|
507
|
+
### Translation
|
|
508
|
+
|
|
509
|
+
| Model ID | Size | Notes |
|
|
510
|
+
|----------|------|-------|
|
|
511
|
+
| `Xenova/opus-mt-en-ru` | 74 MB | English → Russian |
|
|
512
|
+
| `Xenova/opus-mt-ru-en` | 74 MB | Russian → English |
|
|
513
|
+
| `Xenova/m2m100_418M` | 418 MB | Direct translation between any pair of 100 languages |
|
|
514
|
+
| `Xenova/nllb-200-distilled-600M` | 600 MB | Meta, 200 languages |
|
|
515
|
+
|
|
516
|
+
```typescript
|
|
517
|
+
const model = await pool.load<{ translation_text: string }[]>('translation', {
|
|
518
|
+
model: 'Xenova/opus-mt-en-ru',
|
|
519
|
+
});
|
|
520
|
+
const result = await model.run('Hello, world!');
|
|
521
|
+
// [{ translation_text: 'Привет, мир!' }]
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
### Image Classification
|
|
525
|
+
|
|
526
|
+
| Model ID | Size | Notes |
|
|
527
|
+
|----------|------|-------|
|
|
528
|
+
| `Xenova/vit-base-patch16-224` | 343 MB | General image classification |
|
|
529
|
+
| `Xenova/mobilevit-small` | 22 MB | Lightweight, mobile-friendly |
|
|
530
|
+
| `Xenova/efficientnet-lite4` | 13 MB | Fastest, 1000 ImageNet classes |
|
|
531
|
+
|
|
532
|
+
```typescript
|
|
533
|
+
const model = await pool.load<{ label: string; score: number }[]>('image-classification', {
|
|
534
|
+
model: 'Xenova/efficientnet-lite4',
|
|
535
|
+
});
|
|
536
|
+
const result = await model.run('https://example.com/cat.jpg');
|
|
537
|
+
// [{ label: 'tabby cat', score: 0.92 }]
|
|
538
|
+
```
|
|
539
|
+
|
|
540
|
+
### How downloads work
|
|
541
|
+
|
|
542
|
+
```
|
|
543
|
+
First visit: download from source → save to Cache API → run
|
|
544
|
+
(5–60s depending on model size and connection)
|
|
545
|
+
|
|
546
|
+
Next visits: load from Cache API → run
|
|
547
|
+
(1–3s initialization only, no network needed)
|
|
548
|
+
|
|
549
|
+
Offline: load from Cache API → run
|
|
550
|
+
(works without internet after first load)
|
|
551
|
+
```
|
|
552
|
+
|
|
553
|
+
### Where models come from
|
|
554
|
+
|
|
555
|
+
Models are **not** locked to Hugging Face. Each adapter has its own sources:
|
|
556
|
+
|
|
557
|
+
**transformers.js** — HF Hub ID or any direct URL:
|
|
558
|
+
|
|
559
|
+
```typescript
|
|
560
|
+
// From Hugging Face Hub (default)
|
|
561
|
+
await pool.load('feature-extraction', {
|
|
562
|
+
model: 'mixedbread-ai/mxbai-embed-xsmall-v1',
|
|
563
|
+
});
|
|
564
|
+
|
|
565
|
+
// From your own CDN or server
|
|
566
|
+
await pool.load('feature-extraction', {
|
|
567
|
+
model: 'https://your-cdn.com/models/mxbai-embed-xsmall-v1',
|
|
568
|
+
});
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
The model folder must contain the same file structure as HF Hub: `onnx/model.onnx`, `tokenizer.json`, `config.json`. You can download a model from HF Hub once and re-host it anywhere.
|
|
572
|
+
|
|
573
|
+
**web-llm** — from the MLC model registry by default. To use your own hosted model, add it to the registry before creating the pool:
|
|
574
|
+
|
|
575
|
+
```typescript
|
|
576
|
+
import { CreateMLCEngine, prebuiltAppConfig } from '@mlc-ai/web-llm';
|
|
577
|
+
|
|
578
|
+
// Register a custom model
|
|
579
|
+
const customConfig = {
|
|
580
|
+
...prebuiltAppConfig,
|
|
581
|
+
model_list: [
|
|
582
|
+
...prebuiltAppConfig.model_list,
|
|
583
|
+
{
|
|
584
|
+
model: 'https://your-cdn.com/my-llm/', // folder with model shards
|
|
585
|
+
model_id: 'my-custom-llm',
|
|
586
|
+
model_lib: 'https://your-cdn.com/my-llm/model.wasm',
|
|
587
|
+
},
|
|
588
|
+
],
|
|
589
|
+
};
|
|
590
|
+
|
|
591
|
+
// Pass config through the adapter
|
|
592
|
+
const pool = await createPool({ adapter: webLlmAdapter({ appConfig: customConfig }) });
|
|
593
|
+
await pool.load('text-generation', { model: 'my-custom-llm' });
|
|
594
|
+
```
|
|
595
|
+
|
|
596
|
+
**onnxruntime-web** — direct URL to a `.onnx` file, no registry:
|
|
597
|
+
|
|
598
|
+
```typescript
|
|
599
|
+
await pool.load('custom', {
|
|
600
|
+
model: 'https://your-cdn.com/model.onnx',
|
|
601
|
+
});
|
|
602
|
+
```
|
|
603
|
+
|
|
604
|
+
**Custom adapter** — full control, load from anywhere (fetch, IndexedDB, bundled asset):
|
|
605
|
+
|
|
606
|
+
```typescript
|
|
607
|
+
async load(task, config, device, onProgress) {
|
|
608
|
+
// fetch from any source
|
|
609
|
+
const response = await fetch(config.model as string);
|
|
610
|
+
const total = Number(response.headers.get('content-length') ?? 0);
|
|
611
|
+
let loaded = 0;
|
|
612
|
+
|
|
613
|
+
const reader = response.body!.getReader();
|
|
614
|
+
const chunks: Uint8Array[] = [];
|
|
615
|
+
while (true) {
|
|
616
|
+
const { done, value } = await reader.read();
|
|
617
|
+
if (done) break;
|
|
618
|
+
chunks.push(value);
|
|
619
|
+
loaded += value.byteLength;
|
|
620
|
+
onProgress({ loaded, phase: 'downloading', total });
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
// build model from raw bytes
|
|
624
|
+
const buffer = mergeChunks(chunks);
|
|
625
|
+
const instance = await MyRuntime.loadFromBuffer(buffer);
|
|
626
|
+
return { instance, memoryMB: 50 };
|
|
627
|
+
}
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
---
|
|
631
|
+
|
|
632
|
+
## When to use
|
|
633
|
+
|
|
634
|
+
| Scenario | Suitable? |
|
|
635
|
+
|----------|-----------|
|
|
636
|
+
| Semantic search over content | ✓ |
|
|
637
|
+
| Chatbot / text generation | ✓ |
|
|
638
|
+
| Speech transcription | ✓ |
|
|
639
|
+
| Image classification | ✓ |
|
|
640
|
+
| Sentiment analysis | ✓ |
|
|
641
|
+
| Translation | ✓ |
|
|
642
|
+
| Private data processing (data never leaves the device) | ✓ |
|
|
643
|
+
| Offline mode (works after first load, no internet) | ✓ |
|
|
644
|
+
| High-volume batch processing on a server | ✗ use server-side inference |
|
|
645
|
+
| Real-time video/audio streaming analysis | ✗ latency too high for WASM |
|
|
646
|
+
|
|
647
|
+
### inferis-ml is a good fit when
|
|
648
|
+
|
|
649
|
+
- You want to avoid per-request API costs
|
|
650
|
+
- Your users' data is sensitive and must not leave the device
|
|
651
|
+
- You need the app to work offline after first load
|
|
652
|
+
- Your users have modern hardware (GPU acceleration is a bonus, not a requirement)
|
|
653
|
+
- You are building a single-page app where the model stays loaded across user interactions
|
|
654
|
+
|
|
655
|
+
### inferis-ml is not a good fit when
|
|
656
|
+
|
|
657
|
+
- You need to process large datasets server-side
|
|
658
|
+
- Your model is too large to download in a browser (>4 GB)
|
|
659
|
+
- You need to support very old browsers (IE, Safari < 11)
|
|
660
|
+
- Inference latency must be under 100ms on low-end mobile devices
|
|
661
|
+
|
|
662
|
+
## License
|
|
663
|
+
|
|
664
|
+
MIT
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
// NOTE(review): minified CJS build artifact of src/adapters/onnx.ts — do not edit by hand; regenerate from source.
// Exports onnxAdapter() (minified as `c`): a ModelAdapterFactory whose create() lazily `await import`s
// 'onnxruntime-web' (so the heavy runtime only loads inside the worker), then returns an adapter that:
//  - estimateMemoryMB: config.estimatedMemoryMB, defaulting to 50
//  - load: builds an InferenceSession from config.model with executionProviders ["webgpu","wasm"]
//    when device === "webgpu", else ["wasm"], graphOptimizationLevel "all"; reports a coarse
//    downloading→done progress pair and echoes the 50 MB default memory estimate
//  - run: destructures { feeds, outputNames } from the input and forwards to session.run
//  - stream: ORT has no token streaming, so it runs once and emits the whole result as a single chunk
//  - unload: calls instance.release?.() (optional-chained — tolerates sessions without release)
'use strict';function c(){return {name:"onnx",async create(){let d=await import('onnxruntime-web');return {name:"onnx",estimateMemoryMB(n,e){return e.estimatedMemoryMB??50},async load(n,e,a,s){let o=e.model;s({loaded:0,phase:"downloading",total:0});let t=a==="webgpu"?["webgpu","wasm"]:["wasm"],r=await d.InferenceSession.create(o,{executionProviders:t,graphOptimizationLevel:"all"});return s({loaded:1,phase:"done",total:1}),{instance:r,memoryMB:e.estimatedMemoryMB??50}},async run(n,e,a){let s=n.instance,{feeds:o,outputNames:t}=e;return await s.run(o,t)},async stream(n,e,a,s){let o=n.instance,{feeds:t,outputNames:r}=e,i=await o.run(t,r,s);a(i);},async unload(n){await n.instance.release?.();}}}}}exports.onnxAdapter=c;//# sourceMappingURL=onnx.cjs.map
|
|
2
|
+
//# sourceMappingURL=onnx.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/adapters/onnx.ts"],"names":["onnxAdapter","ort","_task","config","device","onProgress","modelUrl","executionProviders","session","model","input","_options","feeds","outputNames","onChunk","options","result"],"mappings":"aAyCO,SAASA,GAAmC,CACjD,OAAO,CACL,IAAA,CAAM,OAEN,MAAM,MAAA,EAAgC,CAEpC,IAAMC,EAAM,MAAM,OAAO,iBAAiB,CAAA,CAE1C,OAAO,CACL,IAAA,CAAM,MAAA,CAEN,gBAAA,CAAiBC,EAAeC,CAAAA,CAAyC,CACvE,OAAQA,CAAAA,CAAO,mBAA4C,EAC7D,CAAA,CAEA,MAAM,IAAA,CACJD,EACAC,CAAAA,CACAC,CAAAA,CACAC,EACsB,CACtB,IAAMC,EAAWH,CAAAA,CAAO,KAAA,CAExBE,CAAAA,CAAW,CAAE,OAAQ,CAAA,CAAG,KAAA,CAAO,aAAA,CAAe,KAAA,CAAO,CAAE,CAAC,CAAA,CAExD,IAAME,CAAAA,CAAqBH,IAAW,QAAA,CAClC,CAAC,QAAA,CAAU,MAAM,EACjB,CAAC,MAAM,CAAA,CAELI,CAAAA,CAA4B,MAAMP,CAAAA,CAAI,gBAAA,CAAiB,MAAA,CAAOK,CAAAA,CAAU,CAC5E,kBAAA,CAAAC,CAAAA,CACA,sBAAA,CAAwB,KAC1B,CAAC,CAAA,CAED,OAAAF,EAAW,CAAE,MAAA,CAAQ,EAAG,KAAA,CAAO,MAAA,CAAQ,KAAA,CAAO,CAAE,CAAC,CAAA,CAE1C,CACL,QAAA,CAAUG,CAAAA,CACV,SAAWL,CAAAA,CAAO,iBAAA,EAA4C,EAChE,CACF,EAEA,MAAM,GAAA,CAAIM,CAAAA,CAAoBC,CAAAA,CAAgBC,EAAsC,CAClF,IAAMH,CAAAA,CAAUC,CAAAA,CAAM,SAChB,CAAE,KAAA,CAAAG,CAAAA,CAAO,WAAA,CAAAC,CAAY,CAAA,CAAIH,CAAAA,CAG/B,OADgB,MAAMF,EAAQ,GAAA,CAAII,CAAAA,CAAOC,CAAW,CAEtD,CAAA,CAEA,MAAM,MAAA,CACJJ,CAAAA,CACAC,CAAAA,CACAI,CAAAA,CACAC,EACe,CAGf,IAAMP,CAAAA,CAAUC,CAAAA,CAAM,SAChB,CAAE,KAAA,CAAAG,CAAAA,CAAO,WAAA,CAAAC,CAAY,CAAA,CAAIH,CAAAA,CACzBM,CAAAA,CAAS,MAAMR,EAAQ,GAAA,CAAII,CAAAA,CAAOC,CAAAA,CAAaE,CAAO,EAC5DD,CAAAA,CAAQE,CAAM,EAChB,CAAA,CAEA,MAAM,MAAA,CAAOP,CAAAA,CAAmC,CAE9C,MADgBA,EAAM,QAAA,CACR,OAAA,KAChB,CACF,CACF,CACF,CACF","file":"onnx.cjs","sourcesContent":["import type {\n Device,\n LoadedModel,\n LoadProgressEvent,\n ModelAdapter,\n ModelAdapterFactory,\n} from '../core/types.js';\n\n// eslint-disable-next-line ts/no-explicit-any\ntype InferenceSession = any;\n// eslint-disable-next-line ts/no-explicit-any\ntype Tensor = any;\n\ninterface OnnxInput {\n feeds: Record<string, Tensor>;\n outputNames?: string[];\n}\n\n/**\n * Adapter for 
onnxruntime-web.\n *\n * @remarks\n * Provides low-level access to ONNX model inference.\n * Use for custom models not supported by transformers.js or web-llm.\n * Input must be pre-processed `OrtTensor` instances.\n *\n * @example\n * ```ts\n * import { createPool } from 'inferis-ml';\n * import { onnxAdapter } from 'inferis-ml/adapters/onnx';\n * import * as ort from 'onnxruntime-web';\n *\n * const pool = await createPool({ adapter: onnxAdapter() });\n * const model = await pool.load('custom', {\n * model: 'https://example.com/model.onnx',\n * });\n *\n * const input = new ort.Tensor('float32', data, [1, 3, 224, 224]);\n * const output = await model.run({ feeds: { input } });\n * ```\n */\nexport function onnxAdapter(): ModelAdapterFactory {\n return {\n name: 'onnx',\n\n async create(): Promise<ModelAdapter> {\n // @ts-expect-error - optional peer dependency, resolved at runtime inside worker\n const ort = await import('onnxruntime-web');\n\n return {\n name: 'onnx',\n\n estimateMemoryMB(_task: string, config: Record<string, unknown>): number {\n return (config.estimatedMemoryMB as number | undefined) ?? 50;\n },\n\n async load(\n _task: string,\n config: Record<string, unknown>,\n device: Device,\n onProgress: (event: LoadProgressEvent) => void,\n ): Promise<LoadedModel> {\n const modelUrl = config.model as string;\n\n onProgress({ loaded: 0, phase: 'downloading', total: 0 });\n\n const executionProviders = device === 'webgpu'\n ? ['webgpu', 'wasm'] as const\n : ['wasm'] as const;\n\n const session: InferenceSession = await ort.InferenceSession.create(modelUrl, {\n executionProviders,\n graphOptimizationLevel: 'all',\n });\n\n onProgress({ loaded: 1, phase: 'done', total: 1 });\n\n return {\n instance: session,\n memoryMB: (config.estimatedMemoryMB as number | undefined) ?? 
50,\n };\n },\n\n async run(model: LoadedModel, input: unknown, _options?: unknown): Promise<unknown> {\n const session = model.instance as InferenceSession;\n const { feeds, outputNames } = input as OnnxInput;\n\n const results = await session.run(feeds, outputNames);\n return results;\n },\n\n async stream(\n model: LoadedModel,\n input: unknown,\n onChunk: (chunk: unknown) => void,\n options?: unknown,\n ): Promise<void> {\n // ONNX Runtime Web doesn't natively support streaming inference.\n // Run full inference and emit the result as a single chunk.\n const session = model.instance as InferenceSession;\n const { feeds, outputNames } = input as OnnxInput;\n const result = await session.run(feeds, outputNames, options);\n onChunk(result);\n },\n\n async unload(model: LoadedModel): Promise<void> {\n const session = model.instance as InferenceSession;\n await session.release?.();\n },\n };\n },\n };\n}\n"]}
|