@tryhamster/gerbil 1.0.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -0
- package/README.md +253 -0
- package/bin/cli.js +2 -0
- package/dist/auto-update-BbNHbSU1.mjs +3 -0
- package/dist/browser/index.d.mts +262 -0
- package/dist/browser/index.d.mts.map +1 -0
- package/dist/browser/index.mjs +755 -0
- package/dist/browser/index.mjs.map +1 -0
- package/dist/chrome-backend-C5Un08O4.mjs +771 -0
- package/dist/chrome-backend-C5Un08O4.mjs.map +1 -0
- package/dist/chrome-backend-CtwPENIW.mjs +3 -0
- package/dist/chunk-Ct1HF2bE.mjs +7 -0
- package/dist/cli.d.mts +1 -0
- package/dist/cli.mjs +7078 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/frameworks/express.d.mts +22 -0
- package/dist/frameworks/express.d.mts.map +1 -0
- package/dist/frameworks/express.mjs +123 -0
- package/dist/frameworks/express.mjs.map +1 -0
- package/dist/frameworks/fastify.d.mts +11 -0
- package/dist/frameworks/fastify.d.mts.map +1 -0
- package/dist/frameworks/fastify.mjs +73 -0
- package/dist/frameworks/fastify.mjs.map +1 -0
- package/dist/frameworks/hono.d.mts +14 -0
- package/dist/frameworks/hono.d.mts.map +1 -0
- package/dist/frameworks/hono.mjs +82 -0
- package/dist/frameworks/hono.mjs.map +1 -0
- package/dist/frameworks/next.d.mts +31 -0
- package/dist/frameworks/next.d.mts.map +1 -0
- package/dist/frameworks/next.mjs +116 -0
- package/dist/frameworks/next.mjs.map +1 -0
- package/dist/frameworks/react.d.mts +56 -0
- package/dist/frameworks/react.d.mts.map +1 -0
- package/dist/frameworks/react.mjs +172 -0
- package/dist/frameworks/react.mjs.map +1 -0
- package/dist/frameworks/trpc.d.mts +12 -0
- package/dist/frameworks/trpc.d.mts.map +1 -0
- package/dist/frameworks/trpc.mjs +80 -0
- package/dist/frameworks/trpc.mjs.map +1 -0
- package/dist/gerbil-BfnsFWRE.mjs +644 -0
- package/dist/gerbil-BfnsFWRE.mjs.map +1 -0
- package/dist/gerbil-BjW-z7Fq.mjs +5 -0
- package/dist/gerbil-DZ1k3ChC.d.mts +138 -0
- package/dist/gerbil-DZ1k3ChC.d.mts.map +1 -0
- package/dist/index.d.mts +223 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +13 -0
- package/dist/index.mjs.map +1 -0
- package/dist/integrations/ai-sdk.d.mts +78 -0
- package/dist/integrations/ai-sdk.d.mts.map +1 -0
- package/dist/integrations/ai-sdk.mjs +199 -0
- package/dist/integrations/ai-sdk.mjs.map +1 -0
- package/dist/integrations/langchain.d.mts +41 -0
- package/dist/integrations/langchain.d.mts.map +1 -0
- package/dist/integrations/langchain.mjs +93 -0
- package/dist/integrations/langchain.mjs.map +1 -0
- package/dist/integrations/llamaindex.d.mts +45 -0
- package/dist/integrations/llamaindex.d.mts.map +1 -0
- package/dist/integrations/llamaindex.mjs +86 -0
- package/dist/integrations/llamaindex.mjs.map +1 -0
- package/dist/integrations/mcp-client.d.mts +206 -0
- package/dist/integrations/mcp-client.d.mts.map +1 -0
- package/dist/integrations/mcp-client.mjs +507 -0
- package/dist/integrations/mcp-client.mjs.map +1 -0
- package/dist/integrations/mcp.d.mts +177 -0
- package/dist/integrations/mcp.d.mts.map +1 -0
- package/dist/integrations/mcp.mjs +8 -0
- package/dist/mcp-R8kRLIKb.mjs +348 -0
- package/dist/mcp-R8kRLIKb.mjs.map +1 -0
- package/dist/models-DKULvhOr.mjs +136 -0
- package/dist/models-DKULvhOr.mjs.map +1 -0
- package/dist/models-De2-_GmQ.d.mts +22 -0
- package/dist/models-De2-_GmQ.d.mts.map +1 -0
- package/dist/one-liner-BUQR0nqq.mjs +98 -0
- package/dist/one-liner-BUQR0nqq.mjs.map +1 -0
- package/dist/skills/index.d.mts +390 -0
- package/dist/skills/index.d.mts.map +1 -0
- package/dist/skills/index.mjs +7 -0
- package/dist/skills-D3CEpgDc.mjs +630 -0
- package/dist/skills-D3CEpgDc.mjs.map +1 -0
- package/dist/tools-BsiEE6f2.mjs +567 -0
- package/dist/tools-BsiEE6f2.mjs.map +1 -0
- package/dist/types-BS1N92Jt.d.mts +183 -0
- package/dist/types-BS1N92Jt.d.mts.map +1 -0
- package/dist/utils-7vXqtq2Q.mjs +63 -0
- package/dist/utils-7vXqtq2Q.mjs.map +1 -0
- package/docs/ai-sdk.md +80 -0
- package/docs/architecture/README.md +84 -0
- package/docs/architecture/caching.md +227 -0
- package/docs/architecture/inference.md +176 -0
- package/docs/architecture/overview.md +179 -0
- package/docs/architecture/streaming.md +261 -0
- package/docs/architecture/webgpu.md +213 -0
- package/docs/browser.md +328 -0
- package/docs/cli.md +155 -0
- package/docs/frameworks.md +90 -0
- package/docs/mcp-client.md +224 -0
- package/docs/mcp.md +109 -0
- package/docs/memory.md +229 -0
- package/docs/repl.md +473 -0
- package/docs/skills.md +261 -0
- package/docs/tools.md +304 -0
- package/package.json +207 -0
package/docs/architecture/streaming.md
@@ -0,0 +1,261 @@
# Streaming Architecture

How Gerbil streams tokens in real time without blocking the UI.

## The Problem

LLM inference is computationally intensive:
- Model loading: 1-30 seconds
- Token generation: 10-100ms per token
- Total generation: 1-30 seconds

Running this on the main thread would freeze the UI completely.

## Solution: Web Workers

Gerbil runs inference in a Web Worker, keeping the main thread free for UI updates.

```
┌─────────────────────────────────────────────────────────────┐
│                         Main Thread                         │
│                                                             │
│  ┌─────────────┐       ┌─────────────────────────────────┐  │
│  │  Your UI    │◄─────►│  GerbilWorker API               │  │
│  │  (React,    │ tokens│  - generate()                   │  │
│  │  Vue, etc)  │       │  - interrupt()                  │  │
│  └─────────────┘       │  - reset()                      │  │
│                        └──────────────┬──────────────────┘  │
│                                       │                     │
│                                       │ postMessage         │
│                                       ▼                     │
├─────────────────────────────────────────────────────────────┤
│                         Web Worker                          │
│                                                             │
│  ┌─────────────────────────────────────────────────────┐    │
│  │                    ModelPipeline                    │    │
│  │                                                     │    │
│  │  ┌───────────┐  ┌───────────┐  ┌─────────────────┐  │    │
│  │  │ Tokenizer │  │   Model   │  │  TextStreamer   │  │    │
│  │  └───────────┘  └───────────┘  └────────┬────────┘  │    │
│  │                                         │           │    │
│  │                                  tokens │           │    │
│  │                                         ▼           │    │
│  │                              postMessage(token)     │    │
│  └─────────────────────────────────────────────────────┘    │
│                                                             │
└─────────────────────────────────────────────────────────────┘
```

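From the main thread, the worker is driven through the small async API listed in the diagram. A usage sketch, assuming the `createGerbilWorker` entry point shown in the browser docs; the exact `generate()` signature and the UI helper are assumptions for illustration:

```typescript
import { createGerbilWorker } from "@tryhamster/gerbil/browser";

// UI thread: tokens arrive via the callback while the event loop stays free.
const gerbil = await createGerbilWorker({
  modelId: "qwen3-0.6b",
  onToken: (token) => appendToChat(token.text), // appendToChat is a hypothetical UI helper
});

// Signature assumed here; resolves with the full text once generation finishes.
const reply = await gerbil.generate("Explain Web Workers in one sentence.");

// Stop mid-generation at any time without blocking the page.
gerbil.interrupt();
```
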
## Browser Implementation

### Inline Worker

Gerbil creates an inline Web Worker from a blob URL:

```typescript
const workerCode = `
import { AutoModelForCausalLM, TextStreamer } from "@huggingface/transformers";

// ... worker code ...

self.addEventListener("message", async (e) => {
  const { type, ...data } = e.data;
  switch (type) {
    case "load": await load(data); break;
    case "generate": await generate(data); break;
    case "interrupt": stoppingCriteria.interrupt(); break;
  }
});
`;

const blob = new Blob([workerCode], { type: "application/javascript" });
const workerUrl = URL.createObjectURL(blob);
const worker = new Worker(workerUrl, { type: "module" });
```

### Message Protocol

```typescript
// Main → Worker
{ type: "load", modelId: "qwen3-0.6b" }
{ type: "generate", messages: [...], options: {...} }
{ type: "interrupt" }
{ type: "reset" }

// Worker → Main
{ status: "loading", message: "Loading model..." }
{ status: "progress", file: "model.onnx", progress: 50 }
{ status: "ready" }
{ status: "token", text: "Hello", state: "answering", tps: 75 }
{ status: "complete", text: "Hello world!", numTokens: 3, tps: 75 }
{ status: "error", error: "Out of memory" }
```

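On the main thread, these messages are dispatched and routed with the standard `Worker` messaging API. A minimal sketch, assuming only the message shapes above; the UI helpers (`updateProgressBar`, `appendToken`, `finish`, `showError`) are illustrative:

```typescript
// Main-thread side of the protocol (sketch).
worker.postMessage({ type: "load", modelId: "qwen3-0.6b" });

worker.addEventListener("message", (e) => {
  const msg = e.data;
  switch (msg.status) {
    case "progress": updateProgressBar(msg.file, msg.progress); break;
    case "ready":    worker.postMessage({ type: "generate", messages, options: {} }); break;
    case "token":    appendToken(msg.text, msg.state); break;
    case "complete": finish(msg.text, msg.tps); break;
    case "error":    showError(msg.error); break;
  }
});
```
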
### Token Streaming

The `TextStreamer` class from transformers.js calls back for each token:

```typescript
const streamer = new TextStreamer(tokenizer, {
  skip_prompt: true,
  skip_special_tokens: true,
  callback_function: (text) => {
    self.postMessage({ status: "token", text, state, tps });
  },
  token_callback_function: (tokens) => {
    // Track thinking state from special tokens
    const tokenId = Number(tokens[0]);
    if (tokenId === START_THINKING_TOKEN_ID) state = "thinking";
    if (tokenId === END_THINKING_TOKEN_ID) state = "answering";
  },
});
```

## Node.js Implementation

### CPU Mode

For CPU inference, Gerbil uses "fake streaming": it generates the full response first, then yields it word by word:

```typescript
async *stream(prompt, options) {
  const result = await this.generateRaw(prompt, options);

  // Yield word by word for a streaming effect
  const words = result.rawText.split(/(\s+)/);
  for (const word of words) {
    yield word;
    options.onToken?.(word);
  }

  return result.result;
}
```

### WebGPU Mode (ChromeGPUBackend)

For WebGPU, tokens stream in real time via the Chrome DevTools Protocol:

```typescript
async *stream(prompt, options) {
  const tokenQueue: string[] = [];
  let resolveNext: ((value: string | null) => void) | null = null;
  let done = false;

  // Start generation with a streaming callback
  const generatePromise = this.chromeBackend.generate(prompt, {
    onToken: (token) => {
      if (resolveNext) {
        resolveNext(token.text);
        resolveNext = null;
      } else {
        tokenQueue.push(token.text);
      }
    },
  });

  // Mark the stream finished (and unblock a waiting consumer) once generation ends
  generatePromise
    .finally(() => {
      done = true;
      resolveNext?.(null);
      resolveNext = null;
    })
    .catch(() => {}); // the error itself is re-thrown by `await generatePromise` below

  // Yield tokens as they arrive
  while (!done || tokenQueue.length > 0) {
    if (tokenQueue.length > 0) {
      yield tokenQueue.shift()!;
    } else if (!done) {
      const token = await new Promise<string | null>((resolve) => {
        resolveNext = resolve;
      });
      if (token) yield token;
    }
  }

  await generatePromise;
}
```

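Both modes expose the same async-generator surface, so callers drain tokens with `for await`. A minimal consumption sketch (`backend` stands in for whichever backend instance provides `stream()`):

```typescript
// Print tokens as they arrive instead of waiting for the full response.
for await (const token of backend.stream("Write a haiku about gerbils", {})) {
  process.stdout.write(token);
}
```
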
## Interruption

Users can interrupt generation mid-stream:

```typescript
// Browser
gerbil.interrupt();

// Internally uses InterruptableStoppingCriteria
const stoppingCriteria = new InterruptableStoppingCriteria();

// On interrupt message:
stoppingCriteria.interrupt();
```

The model stops generating after the current token.

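Inside the worker, the same criteria instance is passed to `model.generate()`, which is what makes the interrupt take effect on the next token check. A sketch of that wiring, assuming the transformers.js generation options used elsewhere in these docs (`max_new_tokens` here is illustrative):

```typescript
const stoppingCriteria = new InterruptableStoppingCriteria();

await model.generate({
  ...inputs,
  max_new_tokens: 1024,
  streamer,                            // TextStreamer from the Token Streaming section
  stopping_criteria: stoppingCriteria, // checked after every generated token
});

// On { type: "interrupt" } from the main thread:
stoppingCriteria.interrupt();

// Reset before the next request so generation can run again:
stoppingCriteria.reset();
```
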
## Thinking State Tracking

For Qwen3 thinking mode, Gerbil tracks whether the model is "thinking" or "answering":

```typescript
const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] =
  tokenizer.encode("<think></think>", { add_special_tokens: false });

let state = "answering";

const tokenCallback = (tokens) => {
  const tokenId = Number(tokens[0]);
  if (tokenId === START_THINKING_TOKEN_ID) state = "thinking";
  if (tokenId === END_THINKING_TOKEN_ID) state = "answering";
};
```

This allows the UI to display thinking content differently:

```typescript
onToken: (token) => {
  if (token.state === "thinking") {
    setThinking(t => t + token.text);
  } else {
    setResponse(r => r + token.text);
  }
}
```

## Performance Considerations

### Batching

Tokens arrive very fast (~100/sec). Consider batching UI updates:

```typescript
let buffer = "";
let updateScheduled = false;

onToken: (token) => {
  buffer += token.text;
  if (!updateScheduled) {
    updateScheduled = true;
    requestAnimationFrame(() => {
      const chunk = buffer; // capture before clearing: the state updater runs later
      buffer = "";
      updateScheduled = false;
      setResponse(r => r + chunk);
    });
  }
}
```

### Memory

The Web Worker has its own memory space. Large models may need:
- Closing other tabs
- Using smaller models
- Falling back to server-side inference (see the sketch below)

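A minimal sketch of that kind of fallback, assuming only the `isWebGPUSupported()` export shown in the WebGPU docs; the device-memory heuristic and the `/api/generate` server route are illustrative:

```typescript
import { createGerbilWorker, isWebGPUSupported } from "@tryhamster/gerbil/browser";

if (!isWebGPUSupported()) {
  // Hypothetical server route: run inference on your backend instead of in the tab.
  const res = await fetch("/api/generate", {
    method: "POST",
    body: JSON.stringify({ prompt }),
  });
  return res.json();
}

// On constrained devices, prefer a smaller model.
const lowMemory = (navigator as any).deviceMemory && (navigator as any).deviceMemory < 8;
const gerbil = await createGerbilWorker({
  modelId: lowMemory ? "smollm2-135m" : "qwen3-0.6b",
});
```
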
### Cleanup

Always terminate workers when done:

```typescript
useEffect(() => {
  let worker;

  (async () => {
    worker = await createGerbilWorker({ ... });
    workerRef.current = worker;
  })();

  return () => worker?.terminate();
}, []);
```

package/docs/architecture/webgpu.md
@@ -0,0 +1,213 @@
# WebGPU Acceleration

How Gerbil uses WebGPU for GPU-accelerated inference.

## What is WebGPU?

WebGPU is a modern graphics and compute API that provides access to GPU hardware from web browsers and (experimentally) Node.js.

Key features:
- **Cross-platform**: Works on Windows, macOS, Linux, ChromeOS
- **Modern**: Designed for ML workloads, not just graphics
- **Standardized**: W3C specification, with implementations landing in all major browser engines

## Browser WebGPU

In browsers, WebGPU is available via `navigator.gpu`:

```typescript
// Check support
if (!navigator.gpu) {
  console.log("WebGPU not supported");
  return;
}

// Get adapter
const adapter = await navigator.gpu.requestAdapter();
const device = await adapter.requestDevice();
```

### Browser Support

| Browser | Version | Status |
|---------|---------|--------|
| Chrome  | 113+    | ✅ Full support |
| Edge    | 113+    | ✅ Full support |
| Safari  | 18+     | ⚠️ Partial (some bugs) |
| Firefox | Nightly | 🔧 Behind flag |

### Gerbil Browser API

```typescript
import { createGerbilWorker, isWebGPUSupported } from "@tryhamster/gerbil/browser";

if (!isWebGPUSupported()) {
  // Fall back to server-side inference
  return;
}

const gerbil = await createGerbilWorker({
  modelId: "qwen3-0.6b",
  onToken: (token) => console.log(token.text),
});
```

## Node.js WebGPU (ChromeGPUBackend)

Node.js doesn't have native WebGPU support, so Gerbil uses a creative solution: **headless Chrome as a WebGPU accelerator**.

### Architecture

```
┌───────────────────────────────────────────────────────────────┐
│                        Node.js Process                        │
│                                                               │
│  ┌─────────────┐         ┌─────────────────────────────────┐  │
│  │   Gerbil    │◄───────►│  HTTP Server                    │  │
│  │   (your     │         │  (port 43724)                   │  │
│  │   code)     │         │                                 │  │
│  └──────┬──────┘         └─────────────────────────────────┘  │
│         │                                 ▲                   │
│         │ CDP                             │ Serves worker.html│
│         ▼                                 │                   │
│  ┌─────────────────────────────────────────┴────────────────┐ │
│  │                     Headless Chrome                      │ │
│  │                                                          │ │
│  │  ┌────────────────────────────────────────────────────┐  │ │
│  │  │                    Worker Page                     │  │ │
│  │  │                                                    │  │ │
│  │  │  ┌─────────────┐  ┌─────────────┐  ┌────────────┐  │  │ │
│  │  │  │transformers │  │ ONNX Runtime│  │   WebGPU   │  │  │ │
│  │  │  │     .js     │  │   (WASM)    │  │  Shaders   │  │  │ │
│  │  │  └─────────────┘  └─────────────┘  └────────────┘  │  │ │
│  │  └────────────────────────────────────────────────────┘  │ │
│  │                            │                             │ │
│  │                            ▼                             │ │
│  │                     ┌─────────────┐                      │ │
│  │                     │     GPU     │                      │ │
│  │                     └─────────────┘                      │ │
│  └──────────────────────────────────────────────────────────┘ │
└───────────────────────────────────────────────────────────────┘
```

### How It Works

1. **HTTP Server**: Gerbil starts a local HTTP server on port 43724
2. **Chrome Launch**: Puppeteer launches headless Chrome with WebGPU flags
3. **Page Load**: Chrome navigates to the HTTP server, loading the worker page
4. **Model Load**: transformers.js loads the model with the WebGPU backend
5. **CDP Communication**: Gerbil sends prompts via the Chrome DevTools Protocol
6. **Streaming**: Tokens stream back via console.log → CDP events

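A sketch of that round trip using Puppeteer's public API. The page URL, the token message shape, and the in-page `generate()` function are assumptions for illustration, not Gerbil's actual internals:

```typescript
import puppeteer from "puppeteer";

// Launch Chrome with the flags shown under "Chrome Launch Flags" below.
const browser = await puppeteer.launch({ args });
const page = await browser.newPage();
await page.goto("http://localhost:43724/worker.html"); // served by the local HTTP server

// The worker page emits tokens with console.log(JSON.stringify({ type: "token", text })).
page.on("console", (msg) => {
  try {
    const event = JSON.parse(msg.text());
    if (event.type === "token") onToken(event.text); // hypothetical callback
  } catch {
    // ignore unrelated console output
  }
});

// Kick off generation inside the page; window.generate is illustrative.
await page.evaluate((p) => (window as any).generate(p), prompt);
```
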
### Why Port 43724?

The port is fixed (43724 = "GERBI" on a phone keypad) for a critical reason: **IndexedDB cache consistency**.

IndexedDB caches are origin-specific. A fixed port means:
- Same origin every time → same cache
- Model downloads are cached and reused
- ~1.5s startup when cached vs ~20s first run

### Chrome Launch Flags

```typescript
const args = [
  "--enable-unsafe-webgpu",   // Enable WebGPU
  "--enable-gpu",             // Enable GPU acceleration
  "--no-sandbox",             // Required for some environments
  "--disable-setuid-sandbox",
  "--headless=new",           // New headless mode
  // Platform-specific...
];

// Linux: Use Vulkan backend
if (process.platform === "linux") {
  args.push("--enable-features=Vulkan");
  args.push("--use-vulkan");
}
```

### Singleton Pattern

The Chrome browser and HTTP server are singletons:

```typescript
let globalBrowser: Browser | null = null;
let globalServer: Server | null = null;

// Multiple Gerbil instances share the same browser
const backend1 = await ChromeGPUBackend.create({ modelId: "qwen3-0.6b" });
const backend2 = await ChromeGPUBackend.create({ modelId: "smollm2-360m" });
// Both use the same Chrome process
```

This avoids:
- Multiple Chrome processes consuming GPU memory
- Port conflicts from multiple HTTP servers
- Redundant model downloads

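A minimal sketch of how such a shared launcher can be structured; the helper name is illustrative, and caching the launch promise (rather than the browser object) keeps concurrent callers from starting two Chrome processes:

```typescript
import puppeteer, { type Browser } from "puppeteer";

// Cache the launch promise so every backend reuses one Chrome process.
let browserPromise: Promise<Browser> | null = null;

function getSharedBrowser(args: string[]): Promise<Browser> {
  browserPromise ??= puppeteer.launch({ args });
  return browserPromise;
}
```
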
## Performance Comparison

Tested on an Apple M4 Max with Qwen3-0.6B:

| Backend | Tokens/sec | First Token | Notes |
|---------|------------|-------------|-------|
| WebGPU (Browser) | ~80 tok/s | ~50ms | Native WebGPU |
| WebGPU (Node.js) | ~75 tok/s | ~60ms | Via Chrome |
| CPU (Node.js) | ~100+ tok/s | ~500ms | Apple Neural Engine |
| WASM (Browser) | ~10 tok/s | ~200ms | Fallback only |

Note: On Apple Silicon, CPU can be faster than WebGPU for small models due to the unified memory architecture and optimized ONNX Runtime.

## Shader Compilation

WebGPU compiles shaders (GPU programs) at runtime. This causes a one-time delay:

```
First run:
  Model download:     ~15-30s  (cached after)
  Shader compilation: ~5-10s   (cached in browser)
  First token:        ~50ms

Subsequent runs:
  Model load:   ~1-2s   (from IndexedDB)
  Shader load:  ~0.5s   (from GPU cache)
  First token:  ~50ms
```

### Warm-up

Gerbil performs a warm-up generation after model load:

```typescript
// Compile shaders with a minimal generation
const warmupInputs = tokenizer("a");
await model.generate({ ...warmupInputs, max_new_tokens: 1 });
```

This ensures shaders are compiled before the user's first prompt.

## Troubleshooting

### "WebGPU not supported"

**Browser:**
- Update to Chrome/Edge 113+
- Check `chrome://gpu` for WebGPU status
- Try enabling `chrome://flags/#enable-unsafe-webgpu`

**Node.js:**
- Ensure Chrome is installed
- Set the `CHROME_PATH` environment variable if Chrome is in a non-standard location (see the sketch below)
- Check that GPU drivers are up to date

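The `CHROME_PATH` override typically ends up as the executable path handed to Puppeteer. A sketch of that wiring (the fallback behaviour when the variable is unset is an assumption):

```typescript
// Respect CHROME_PATH when launching the headless Chrome backend.
const browser = await puppeteer.launch({
  executablePath: process.env.CHROME_PATH || undefined, // undefined lets Puppeteer resolve Chrome
  args,
});
```
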
### Slow shader compilation

This is normal on first run. The shaders are cached after compilation.

### Out of GPU memory

- Close other GPU-intensive applications
- Use a smaller model (`smollm2-135m`)
- Fall back to CPU mode