@tryhamster/gerbil 1.0.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -0
- package/README.md +253 -0
- package/bin/cli.js +2 -0
- package/dist/auto-update-BbNHbSU1.mjs +3 -0
- package/dist/browser/index.d.mts +262 -0
- package/dist/browser/index.d.mts.map +1 -0
- package/dist/browser/index.mjs +755 -0
- package/dist/browser/index.mjs.map +1 -0
- package/dist/chrome-backend-C5Un08O4.mjs +771 -0
- package/dist/chrome-backend-C5Un08O4.mjs.map +1 -0
- package/dist/chrome-backend-CtwPENIW.mjs +3 -0
- package/dist/chunk-Ct1HF2bE.mjs +7 -0
- package/dist/cli.d.mts +1 -0
- package/dist/cli.mjs +7078 -0
- package/dist/cli.mjs.map +1 -0
- package/dist/frameworks/express.d.mts +22 -0
- package/dist/frameworks/express.d.mts.map +1 -0
- package/dist/frameworks/express.mjs +123 -0
- package/dist/frameworks/express.mjs.map +1 -0
- package/dist/frameworks/fastify.d.mts +11 -0
- package/dist/frameworks/fastify.d.mts.map +1 -0
- package/dist/frameworks/fastify.mjs +73 -0
- package/dist/frameworks/fastify.mjs.map +1 -0
- package/dist/frameworks/hono.d.mts +14 -0
- package/dist/frameworks/hono.d.mts.map +1 -0
- package/dist/frameworks/hono.mjs +82 -0
- package/dist/frameworks/hono.mjs.map +1 -0
- package/dist/frameworks/next.d.mts +31 -0
- package/dist/frameworks/next.d.mts.map +1 -0
- package/dist/frameworks/next.mjs +116 -0
- package/dist/frameworks/next.mjs.map +1 -0
- package/dist/frameworks/react.d.mts +56 -0
- package/dist/frameworks/react.d.mts.map +1 -0
- package/dist/frameworks/react.mjs +172 -0
- package/dist/frameworks/react.mjs.map +1 -0
- package/dist/frameworks/trpc.d.mts +12 -0
- package/dist/frameworks/trpc.d.mts.map +1 -0
- package/dist/frameworks/trpc.mjs +80 -0
- package/dist/frameworks/trpc.mjs.map +1 -0
- package/dist/gerbil-BfnsFWRE.mjs +644 -0
- package/dist/gerbil-BfnsFWRE.mjs.map +1 -0
- package/dist/gerbil-BjW-z7Fq.mjs +5 -0
- package/dist/gerbil-DZ1k3ChC.d.mts +138 -0
- package/dist/gerbil-DZ1k3ChC.d.mts.map +1 -0
- package/dist/index.d.mts +223 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +13 -0
- package/dist/index.mjs.map +1 -0
- package/dist/integrations/ai-sdk.d.mts +78 -0
- package/dist/integrations/ai-sdk.d.mts.map +1 -0
- package/dist/integrations/ai-sdk.mjs +199 -0
- package/dist/integrations/ai-sdk.mjs.map +1 -0
- package/dist/integrations/langchain.d.mts +41 -0
- package/dist/integrations/langchain.d.mts.map +1 -0
- package/dist/integrations/langchain.mjs +93 -0
- package/dist/integrations/langchain.mjs.map +1 -0
- package/dist/integrations/llamaindex.d.mts +45 -0
- package/dist/integrations/llamaindex.d.mts.map +1 -0
- package/dist/integrations/llamaindex.mjs +86 -0
- package/dist/integrations/llamaindex.mjs.map +1 -0
- package/dist/integrations/mcp-client.d.mts +206 -0
- package/dist/integrations/mcp-client.d.mts.map +1 -0
- package/dist/integrations/mcp-client.mjs +507 -0
- package/dist/integrations/mcp-client.mjs.map +1 -0
- package/dist/integrations/mcp.d.mts +177 -0
- package/dist/integrations/mcp.d.mts.map +1 -0
- package/dist/integrations/mcp.mjs +8 -0
- package/dist/mcp-R8kRLIKb.mjs +348 -0
- package/dist/mcp-R8kRLIKb.mjs.map +1 -0
- package/dist/models-DKULvhOr.mjs +136 -0
- package/dist/models-DKULvhOr.mjs.map +1 -0
- package/dist/models-De2-_GmQ.d.mts +22 -0
- package/dist/models-De2-_GmQ.d.mts.map +1 -0
- package/dist/one-liner-BUQR0nqq.mjs +98 -0
- package/dist/one-liner-BUQR0nqq.mjs.map +1 -0
- package/dist/skills/index.d.mts +390 -0
- package/dist/skills/index.d.mts.map +1 -0
- package/dist/skills/index.mjs +7 -0
- package/dist/skills-D3CEpgDc.mjs +630 -0
- package/dist/skills-D3CEpgDc.mjs.map +1 -0
- package/dist/tools-BsiEE6f2.mjs +567 -0
- package/dist/tools-BsiEE6f2.mjs.map +1 -0
- package/dist/types-BS1N92Jt.d.mts +183 -0
- package/dist/types-BS1N92Jt.d.mts.map +1 -0
- package/dist/utils-7vXqtq2Q.mjs +63 -0
- package/dist/utils-7vXqtq2Q.mjs.map +1 -0
- package/docs/ai-sdk.md +80 -0
- package/docs/architecture/README.md +84 -0
- package/docs/architecture/caching.md +227 -0
- package/docs/architecture/inference.md +176 -0
- package/docs/architecture/overview.md +179 -0
- package/docs/architecture/streaming.md +261 -0
- package/docs/architecture/webgpu.md +213 -0
- package/docs/browser.md +328 -0
- package/docs/cli.md +155 -0
- package/docs/frameworks.md +90 -0
- package/docs/mcp-client.md +224 -0
- package/docs/mcp.md +109 -0
- package/docs/memory.md +229 -0
- package/docs/repl.md +473 -0
- package/docs/skills.md +261 -0
- package/docs/tools.md +304 -0
- package/package.json +207 -0

package/docs/architecture/caching.md
@@ -0,0 +1,227 @@

# Model Caching

How Gerbil caches models for fast subsequent loads.

## Overview

Model files are large (100MB-500MB). Gerbil caches them locally to avoid re-downloading:

| Environment | Cache Location | Mechanism |
|-------------|----------------|-----------|
| Browser | IndexedDB | transformers.js built-in |
| Node.js (CPU) | `~/.cache/huggingface/hub` | transformers.js built-in |
| Node.js (WebGPU) | Chrome's IndexedDB | Via ChromeGPUBackend |

## Browser Caching

### IndexedDB

transformers.js automatically caches model files in IndexedDB:

```
IndexedDB
└── transformers-cache
    └── onnx-community/Qwen3-0.6B-ONNX
        ├── tokenizer.json
        ├── config.json
        ├── model_q4f16.onnx
        └── ...
```

### Cache Behavior

1. **First load**: Downloads from Hugging Face Hub (~15-30s)
2. **Subsequent loads**: Reads from IndexedDB (~1-2s)

### Checking Cache

```typescript
import { getWebGPUInfo } from "@tryhamster/gerbil/browser";

// Models are cached per-origin
// Same origin = same cache
```
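
The comments above note that the cache is keyed by origin. To see how much of that origin's storage quota the cached model files occupy, the standard Storage API works; this is a plain browser API, not a Gerbil export:

```typescript
// Rough view of origin storage usage, which includes the IndexedDB model cache.
const { usage = 0, quota = 0 } = await navigator.storage.estimate();
console.log(`Using ~${Math.round(usage / 1e6)} MB of ~${Math.round(quota / 1e6)} MB`);
```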

### Clearing Cache

```javascript
// In browser DevTools:
indexedDB.deleteDatabase("transformers-cache");
```

## Node.js CPU Caching

### Hugging Face Hub Cache

transformers.js uses the standard HF cache directory:

```
~/.cache/huggingface/hub/
└── models--onnx-community--Qwen3-0.6B-ONNX/
    ├── blobs/
    │   └── [sha256 hashes]
    ├── refs/
    │   └── main
    └── snapshots/
        └── [commit hash]/
            ├── tokenizer.json
            ├── config.json
            └── model_q4.onnx
```

### Environment Variables

```bash
# Custom cache directory
export HF_HOME=/path/to/cache
export TRANSFORMERS_CACHE=/path/to/cache

# Offline mode (use cache only)
export TRANSFORMERS_OFFLINE=1
```
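
The same variables can be set from code in a script. This sketch assumes, as the shell example above implies, that they are read when the model is first loaded; if the library reads them only at import time, set them before importing Gerbil instead. The `./.model-cache` path is just an illustration:

```typescript
// Use a project-local cache directory for model downloads.
process.env.HF_HOME = "./.model-cache";
// process.env.TRANSFORMERS_OFFLINE = "1"; // optionally force cache-only loads

const g = new Gerbil();
await g.loadModel("qwen3-0.6b");
```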

### CLI Cache Management

```bash
# View cache
npx @tryhamster/gerbil cache

# Clear cache
npx @tryhamster/gerbil cache --clean

# Clear old models
npx @tryhamster/gerbil cache --older-than 30
```

## Node.js WebGPU Caching

### ChromeGPUBackend Cache

When using WebGPU in Node.js, models are cached in Chrome's IndexedDB:

```
~/.gerbil/chrome-cache/
└── Default/
    └── IndexedDB/
        └── http_127.0.0.1_43724.indexeddb.leveldb/
            └── [model cache]
```

### Why a Fixed Port?

The ChromeGPUBackend uses port 43724 ("GERBI") for a critical reason:

IndexedDB caches are **origin-specific**. The origin includes:
- Protocol: `http://`
- Host: `127.0.0.1`
- Port: `43724`

A fixed port ensures the same origin every time → same cache.

```typescript
const GERBIL_LOCAL_PORT = 43724; // "GERBI" on a phone keypad

// Always the same origin:
// http://127.0.0.1:43724
```

### Cache Persistence

The Chrome user data directory persists between runs:

```typescript
this.userDataDir = join(homedir(), ".gerbil", "chrome-cache");
```

This means:
- Model downloads are cached
- Shader compilations are cached
- ~1.5s startup when cached vs ~20s on the first run
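
Those numbers are easy to verify locally; a quick timing sketch around `loadModel()` shows whether the persistent Chrome cache was hit:

```typescript
// Cold run: expect roughly ~20s (model download + shader compile). Warm run: ~1.5s.
const g = new Gerbil();
const start = Date.now();
await g.loadModel("qwen3-0.6b", { device: "webgpu" });
console.log(`Model ready in ${((Date.now() - start) / 1000).toFixed(1)}s`);
```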

## Cache Sizes

| Model | Download Size | Cache Size |
|-------|---------------|------------|
| qwen3-0.6b | ~400MB | ~400MB |
| smollm2-360m | ~250MB | ~250MB |
| smollm2-135m | ~100MB | ~100MB |

## Preloading Models

### Browser

```typescript
// Preload during idle time
const gerbil = await createGerbilWorker({
  modelId: "qwen3-0.6b",
  onProgress: (p) => {
    if (p.status === "ready") {
      console.log("Model cached and ready");
    }
  },
});

// Model is now in IndexedDB for instant loads
```

### Node.js

```typescript
// Preload in background
const g = new Gerbil();
await g.loadModel("qwen3-0.6b");
// Model is now in HF cache
```

### CLI

```bash
# Download without running
npx @tryhamster/gerbil info -m qwen3-0.6b
# Model is now cached
```

## Offline Usage

Once cached, models work offline:

```typescript
// Browser: works if the model is in IndexedDB
const gerbil = await createGerbilWorker({ modelId: "qwen3-0.6b" });

// Node.js: set offline mode
process.env.TRANSFORMERS_OFFLINE = "1";
const g = new Gerbil();
await g.loadModel("qwen3-0.6b"); // Uses cache only
```
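
If offline mode is enabled but the model was never cached, the load will fail. A small sketch of a graceful fallback (the retry logic is illustrative, not a built-in Gerbil feature):

```typescript
// Prefer the local cache; fall back to an online download if nothing is cached yet.
process.env.TRANSFORMERS_OFFLINE = "1";
const g = new Gerbil();
try {
  await g.loadModel("qwen3-0.6b");
} catch {
  delete process.env.TRANSFORMERS_OFFLINE;
  await g.loadModel("qwen3-0.6b"); // re-downloads, then cached for next time
}
```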

## Troubleshooting

### "Model not found" after cache clear

Re-download by loading the model:

```typescript
await g.loadModel("qwen3-0.6b"); // Will re-download
```

### Cache taking too much space

```bash
# View cache size
npx @tryhamster/gerbil cache

# Clear old models
npx @tryhamster/gerbil cache --older-than 7

# Clear everything
npx @tryhamster/gerbil cache --clean
```

### Browser cache not persisting

Check browser settings:
- Cookies/site data must be allowed
- IndexedDB must not be blocked
- Storage quota must not be exceeded

package/docs/architecture/inference.md
@@ -0,0 +1,176 @@

# Inference Pipeline

How Gerbil runs LLM inference using ONNX Runtime and transformers.js.

## The Stack

```
┌─────────────────────────────────────┐
│          transformers.js            │ ← Tokenization, model loading, generation
├─────────────────────────────────────┤
│           ONNX Runtime              │ ← Neural network execution
├─────────────────────────────────────┤
│   WebGPU   │   CPU   │    WASM      │ ← Execution backends
└─────────────────────────────────────┘
```

## transformers.js

[transformers.js](https://huggingface.co/docs/transformers.js) is a JavaScript port of Hugging Face Transformers that runs ONNX models in the browser and Node.js.

### Model Loading

```typescript
import { AutoModelForCausalLM, AutoTokenizer } from "@huggingface/transformers";

const tokenizer = await AutoTokenizer.from_pretrained(modelId);
const model = await AutoModelForCausalLM.from_pretrained(modelId, {
  dtype: "q4f16",   // Quantization
  device: "webgpu", // Backend
});
```

### Generation

```typescript
const inputs = tokenizer.apply_chat_template(messages, {
  add_generation_prompt: true,
  return_dict: true,
});

const output = await model.generate({
  ...inputs,
  max_new_tokens: 256,
  temperature: 0.7,
  do_sample: true,
});

const text = tokenizer.decode(output[0], { skip_special_tokens: true });
```

### Streaming with TextStreamer

```typescript
import { TextStreamer } from "@huggingface/transformers";

const streamer = new TextStreamer(tokenizer, {
  skip_prompt: true,
  skip_special_tokens: true,
  callback_function: (text) => {
    console.log(text); // Called with each new chunk of generated text
  },
});

await model.generate({ ...inputs, streamer });
```
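
The callback style composes naturally with a web `ReadableStream`, which is convenient when tokens need to be piped into an HTTP response or consumed with `for await`. A minimal sketch, reusing `model`, `tokenizer`, and `inputs` from the snippets above:

```typescript
import { TextStreamer } from "@huggingface/transformers";

// Wrap generation in a ReadableStream: every decoded chunk is enqueued as it arrives.
function streamGeneration(model: any, tokenizer: any, inputs: any): ReadableStream<string> {
  return new ReadableStream<string>({
    async start(controller) {
      const streamer = new TextStreamer(tokenizer, {
        skip_prompt: true,
        skip_special_tokens: true,
        callback_function: (text: string) => controller.enqueue(text),
      });
      await model.generate({ ...inputs, max_new_tokens: 256, streamer });
      controller.close();
    },
  });
}

// Node 18+ ReadableStreams are async-iterable:
// for await (const chunk of streamGeneration(model, tokenizer, inputs)) process.stdout.write(chunk);
```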

## ONNX Runtime

ONNX Runtime is the execution engine that runs the neural network operations.

### Backends

| Backend | Environment | Speed | Notes |
|---------|-------------|-------|-------|
| **WebGPU** | Browser, Chrome | ~70-100 tok/s | Fastest, requires GPU |
| **CPU** | Node.js | ~10-30 tok/s | Uses SIMD, good on Apple Silicon |
| **WASM** | Browser fallback | ~5-10 tok/s | Works everywhere |

### Execution Providers

ONNX Runtime selects execution providers based on availability:

```
WebGPU EP → WASM EP → CPU EP
```

Gerbil explicitly requests the desired backend:

```typescript
// For WebGPU
await AutoModelForCausalLM.from_pretrained(modelId, { device: "webgpu" });

// For CPU
await pipeline("text-generation", modelId, { device: "cpu" });
```

## Quantization

Quantization reduces model size and improves inference speed by using lower-precision numbers.

### Quantization Types

| Type | Weights | Compute | Size Reduction | Use Case |
|------|---------|---------|----------------|----------|
| **fp32** | 32-bit float | 32-bit | 1x (baseline) | Training |
| **fp16** | 16-bit float | 16-bit | 2x | GPU inference |
| **q4f16** | 4-bit int | 16-bit | ~4x | WebGPU inference |
| **q4** | 4-bit int | 32-bit | ~4x | CPU inference |

### Why q4f16 for WebGPU?

WebGPU shaders work best with fp16 compute. The `q4f16` format:
- Stores weights as 4-bit integers (small download)
- Dequantizes to fp16 during inference (fast on GPU)
- Maintains good quality for small models

### Model Sizes

| Model | Original | q4f16 | Download |
|-------|----------|-------|----------|
| Qwen3-0.6B | ~2.4GB | ~400MB | ~400MB |
| SmolLM2-360M | ~1.4GB | ~250MB | ~250MB |
| SmolLM2-135M | ~540MB | ~100MB | ~100MB |
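
A back-of-the-envelope check on those numbers (an approximation; real ONNX exports also carry fp16 embeddings, norms, and tokenizer files, which is why the downloads run somewhat larger than the raw weight estimate):

```typescript
// parameters × bits-per-weight / 8 ≈ weight bytes
function approxWeightSizeMB(params: number, bitsPerWeight: number): number {
  return (params * bitsPerWeight) / 8 / 1e6;
}

approxWeightSizeMB(0.6e9, 32); // ≈ 2400 MB (the fp32 baseline, ~2.4GB)
approxWeightSizeMB(0.6e9, 4);  // ≈ 300 MB  (ballpark of the ~400MB q4f16 download)
approxWeightSizeMB(0.36e9, 4); // ≈ 180 MB  (ballpark of ~250MB for SmolLM2-360M)
```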

## Tokenization

### Chat Templates

Gerbil uses model-specific chat templates to format conversations:

```typescript
const messages = [
  { role: "system", content: "You are helpful." },
  { role: "user", content: "Hello!" },
];

const inputs = tokenizer.apply_chat_template(messages, {
  add_generation_prompt: true, // Add assistant turn start
  return_dict: true,           // Return input_ids + attention_mask
  enable_thinking: true,       // Qwen3 thinking mode
});
```

### Thinking Mode (Qwen3)

Qwen3 models support a "thinking" mode where the model shows reasoning:

```
<think>
Let me work through this step by step...
127 × 43 = 127 × 40 + 127 × 3 = 5080 + 381 = 5461
</think>
The answer is 5461.
```

Enabled via `enable_thinking: true` in the chat template.
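
When thinking mode is on, the decoded output contains the reasoning and the answer in one string. A minimal sketch for separating them (illustrative parsing, not a Gerbil helper):

```typescript
// Split a Qwen3-style response into the <think> block and the final answer.
function splitThinking(output: string): { thinking: string | null; answer: string } {
  const match = output.match(/<think>([\s\S]*?)<\/think>/);
  return {
    thinking: match ? match[1].trim() : null,
    answer: output.replace(/<think>[\s\S]*?<\/think>/, "").trim(),
  };
}

const { thinking, answer } = splitThinking(text); // `text` from the decode step above
```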

## KV Cache

The Key-Value cache stores intermediate attention states to speed up autoregressive generation:

```typescript
const { past_key_values, sequences } = await model.generate({
  ...inputs,
  past_key_values: previousCache, // Reuse from last turn
  return_dict_in_generate: true,
});

// Save for next turn
cache = past_key_values;
```

Benefits:
- Faster multi-turn conversations
- Reduced compute for long contexts

Gerbil manages the KV cache automatically for multi-turn chat.

package/docs/architecture/overview.md
@@ -0,0 +1,179 @@

# Architecture Overview

Gerbil is a local LLM inference library that runs entirely on-device, with no API calls or cloud dependencies.

## Core Components

### 1. Gerbil Class (`src/core/gerbil.ts`)

The main entry point for Node.js applications:

```typescript
const g = new Gerbil();
await g.loadModel("qwen3-0.6b");
const result = await g.generate("Hello");
```

Responsibilities:
- Model loading and lifecycle management
- Device selection (WebGPU vs CPU)
- Generation orchestration
- Streaming coordination
- Session statistics

### 2. Model Registry (`src/core/models.ts`)

Maps friendly model IDs to Hugging Face paths:

```typescript
const BUILTIN_MODELS = {
  "qwen3-0.6b": {
    id: "qwen3-0.6b",
    path: "onnx-community/Qwen3-0.6B-ONNX",
    family: "qwen",
    size: "0.6B",
    contextLength: 32768,
    supportsThinking: true,
  },
  // ...
};
```
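
The registry is what lets user-facing code pass a short ID and have it resolved to the ONNX repo that transformers.js downloads. A sketch of that lookup (the fallback of treating unknown IDs as raw Hugging Face paths is an assumption, not documented behavior):

```typescript
// Resolve a friendly ID like "qwen3-0.6b" to its Hugging Face repo path.
function resolveModelPath(id: string): string {
  const entry = BUILTIN_MODELS[id as keyof typeof BUILTIN_MODELS];
  return entry ? entry.path : id; // assumed fallback: pass unknown IDs through as-is
}

resolveModelPath("qwen3-0.6b"); // "onnx-community/Qwen3-0.6B-ONNX"
```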

### 3. Chrome GPU Backend (`src/core/chrome-backend.ts`)

Enables WebGPU in Node.js by using headless Chrome as a GPU accelerator:

```
┌─────────────┐    HTTP     ┌──────────────────┐
│   Node.js   │◄───────────►│  Headless Chrome │
│  (Gerbil)   │   :43724    │  (WebGPU worker) │
└─────────────┘             └──────────────────┘
       │                             │
       │ CDP (DevTools)              │ WebGPU
       └─────────────────────────────┘
                                     │
                               ┌─────▼─────┐
                               │    GPU    │
                               └───────────┘
```

### 4. Browser Worker (`src/browser/index.ts`)

Provides `createGerbilWorker()` for browser applications:

```typescript
const gerbil = await createGerbilWorker({
  modelId: "qwen3-0.6b",
  onToken: (token) => console.log(token.text),
});
```

Uses an inline Web Worker to:
- Load models without blocking the UI
- Stream tokens in real-time
- Manage GPU memory separately from the main thread
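
Shipping the worker as an inline blob means no separate worker file has to be hosted. The general pattern looks like this (a generic sketch of the technique, not Gerbil's actual worker source):

```typescript
// Bundle the worker script as a string, wrap it in a Blob, and spawn a module Worker.
const workerSource = `
  self.onmessage = async (e) => {
    // load the model and run generation off the main thread...
    self.postMessage({ type: "token", text: "..." });
  };
`;

const worker = new Worker(
  URL.createObjectURL(new Blob([workerSource], { type: "text/javascript" })),
  { type: "module" }
);

worker.onmessage = (e) => {
  if (e.data.type === "token") console.log(e.data.text);
};
```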

## Execution Paths

### Browser Path

```
User Code
    │
    ▼
createGerbilWorker()
    │
    ▼
Web Worker (inline blob)
    │
    ▼
transformers.js
    │
    ▼
ONNX Runtime (WebGPU)
    │
    ▼
GPU
```

### Node.js CPU Path

```
User Code
    │
    ▼
Gerbil.generate()
    │
    ▼
transformers.js pipeline
    │
    ▼
ONNX Runtime (CPU)
    │
    ▼
CPU (with SIMD)
```

### Node.js WebGPU Path

```
User Code
    │
    ▼
Gerbil.generate()
    │
    ▼
ChromeGPUBackend
    │
    ▼
Headless Chrome (via CDP)
    │
    ▼
transformers.js (in Chrome)
    │
    ▼
ONNX Runtime (WebGPU)
    │
    ▼
GPU
```

## Device Selection

Gerbil automatically selects the best available backend:

```typescript
// Explicit selection
await g.loadModel("qwen3-0.6b", { device: "webgpu" }); // or "cpu"

// Check current device
g.getDeviceMode(); // "webgpu" | "cpu" | "wasm"
```

Priority:
1. **Browser**: WebGPU → WASM fallback
2. **Node.js with --gpu**: ChromeGPUBackend (headless Chrome)
3. **Node.js default**: CPU via ONNX Runtime
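
In the browser, the WebGPU-or-WASM decision from item 1 usually reduces to a feature check like the following (a generic sketch using standard web APIs, not Gerbil's exact detection code):

```typescript
// Probe for a usable WebGPU adapter before choosing a device.
async function detectDevice(): Promise<"webgpu" | "wasm"> {
  if (typeof navigator !== "undefined" && "gpu" in navigator) {
    const adapter = await (navigator as any).gpu.requestAdapter();
    if (adapter) return "webgpu";
  }
  return "wasm";
}
```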

## File Structure

```
src/
├── core/
│   ├── gerbil.ts          # Main Gerbil class
│   ├── models.ts          # Model registry
│   ├── types.ts           # TypeScript types
│   ├── tools.ts           # Tool calling system
│   └── chrome-backend.ts  # Node.js WebGPU via Chrome
├── browser/
│   └── index.ts           # createGerbilWorker + utilities
├── skills/
│   └── ...                # Built-in skills (commit, summarize, etc.)
├── integrations/
│   └── ...                # AI SDK, LangChain, MCP
├── frameworks/
│   └── ...                # Next.js, Express, React, etc.
└── cli/
    └── repl/              # Interactive terminal UI
```