@tryhamster/gerbil 1.0.0-rc.9 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +318 -104
- package/dist/architectures-C1I5V3Dt.mjs +6070 -0
- package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
- package/dist/browser/index.d.ts +276 -590
- package/dist/browser/index.d.ts.map +1 -1
- package/dist/browser/index.js +592 -2334
- package/dist/browser/index.js.map +1 -1
- package/dist/cli.mjs +625 -1098
- package/dist/cli.mjs.map +1 -1
- package/dist/defaults-9komdrbY.mjs +24 -0
- package/dist/defaults-9komdrbY.mjs.map +1 -0
- package/dist/frameworks/express.d.mts +1 -3
- package/dist/frameworks/express.d.mts.map +1 -1
- package/dist/frameworks/express.mjs +7 -7
- package/dist/frameworks/express.mjs.map +1 -1
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.d.mts.map +1 -1
- package/dist/frameworks/fastify.mjs +3 -3
- package/dist/frameworks/fastify.mjs.map +1 -1
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.d.mts.map +1 -1
- package/dist/frameworks/hono.mjs +4 -4
- package/dist/frameworks/hono.mjs.map +1 -1
- package/dist/frameworks/next.d.mts +3 -2
- package/dist/frameworks/next.d.mts.map +1 -1
- package/dist/frameworks/next.mjs +4 -4
- package/dist/frameworks/next.mjs.map +1 -1
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts.map +1 -1
- package/dist/frameworks/trpc.mjs +4 -4
- package/dist/frameworks/trpc.mjs.map +1 -1
- package/dist/gerbil-BetB5xb0.d.mts +488 -0
- package/dist/gerbil-BetB5xb0.d.mts.map +1 -0
- package/dist/gerbil-CTZUa8EZ.mjs +4 -0
- package/dist/gerbil-DNniplr4.mjs +1656 -0
- package/dist/gerbil-DNniplr4.mjs.map +1 -0
- package/dist/gpu/hooks.d.mts +640 -0
- package/dist/gpu/hooks.d.mts.map +1 -0
- package/dist/gpu/hooks.mjs +1369 -0
- package/dist/gpu/hooks.mjs.map +1 -0
- package/dist/gpu/index.d.mts +2 -0
- package/dist/gpu/index.mjs +6 -0
- package/dist/gpu-DFuglcEx.mjs +3790 -0
- package/dist/gpu-DFuglcEx.mjs.map +1 -0
- package/dist/index-Dgmb2kE3.d.mts +245 -0
- package/dist/index-Dgmb2kE3.d.mts.map +1 -0
- package/dist/index-DukkJRMj.d.mts +2114 -0
- package/dist/index-DukkJRMj.d.mts.map +1 -0
- package/dist/index.d.mts +22 -487
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +13 -8
- package/dist/index.mjs.map +1 -1
- package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
- package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
- package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
- package/dist/integrations/ai-sdk.d.mts +75 -6
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +131 -15
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +1 -1
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +5 -5
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.d.mts.map +1 -1
- package/dist/integrations/llamaindex.mjs +5 -5
- package/dist/integrations/llamaindex.mjs.map +1 -1
- package/dist/integrations/mcp-client.mjs +3 -3
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +3 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +5 -5
- package/dist/{mcp-BvbriaBy.mjs → mcp-D2vvH1Xc.mjs} +4 -4
- package/dist/mcp-D2vvH1Xc.mjs.map +1 -0
- package/dist/memory/index.d.mts +3 -0
- package/dist/memory/index.mjs +6 -0
- package/dist/memory-D1P7Tmda.mjs +4 -0
- package/dist/memory-DVN0MnIG.mjs +132 -0
- package/dist/memory-DVN0MnIG.mjs.map +1 -0
- package/dist/memory-Dj0J1v88.mjs +294 -0
- package/dist/memory-Dj0J1v88.mjs.map +1 -0
- package/dist/moonshine-stt-17dpP1kr.mjs +4 -0
- package/dist/moonshine-stt-4ojLtMq7.mjs +11962 -0
- package/dist/moonshine-stt-4ojLtMq7.mjs.map +1 -0
- package/dist/{one-liner-s-lD8rCC.mjs → one-liner-JhdIPxzF.mjs} +14 -16
- package/dist/one-liner-JhdIPxzF.mjs.map +1 -0
- package/dist/repl-BDRkwPGX.mjs +9 -0
- package/dist/skills/index.d.mts +270 -320
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +5 -5
- package/dist/{skills-CD3Orlex.mjs → skills-CU694Dc8.mjs} +187 -32
- package/dist/skills-CU694Dc8.mjs.map +1 -0
- package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
- package/dist/tools-DQ1mPUw5.mjs.map +1 -0
- package/dist/types-DQBe2lFo.d.mts +165 -0
- package/dist/types-DQBe2lFo.d.mts.map +1 -0
- package/dist/{types-CiTc7ez3.d.mts → types-LlyYILII.d.mts} +112 -14
- package/dist/types-LlyYILII.d.mts.map +1 -0
- package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
- package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
- package/dist/vector-B0panuy6.mjs +95 -0
- package/dist/vector-B0panuy6.mjs.map +1 -0
- package/docs/PROJECT-STATE.md +321 -0
- package/docs/adding-a-model-family.md +280 -0
- package/docs/ai-sdk.md +70 -61
- package/docs/architecture/overview.md +17 -7
- package/docs/browser.md +203 -8
- package/docs/embeddings.md +156 -0
- package/docs/gerbil-site-native-migration.md +217 -0
- package/docs/gpu-engine/architectures.md +398 -0
- package/docs/gpu-engine/ir.md +372 -0
- package/docs/gpu-engine/kernels.md +718 -0
- package/docs/gpu-engine/paper.html +1759 -0
- package/docs/gpu-engine/paper.md +2109 -0
- package/docs/gpu-engine/safetensors.md +312 -0
- package/docs/gpu-engine/tokenizer.md +302 -0
- package/docs/memory-rag.md +91 -0
- package/docs/metal-safari-intel.md +190 -0
- package/docs/mobile-failure-diagnosis.md +124 -0
- package/docs/mobile.md +99 -0
- package/docs/observability.md +230 -0
- package/docs/onnx-removal-plan.md +339 -0
- package/docs/research/autoresearch-portable.md +904 -0
- package/docs/research/dispatch-reduction-hivemind.md +84 -0
- package/docs/research/ios-safari-model-caching.md +117 -0
- package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
- package/docs/research/native-stt-model-selection.md +49 -0
- package/docs/research/native-tts-model-selection.md +90 -0
- package/docs/research/native-vs-chromium-decision.md +152 -0
- package/docs/research/nemotron-mamba2-inference.md +910 -0
- package/docs/research/qwen35-multimodal.md +293 -0
- package/docs/research/qwen36-gemma4-targets.md +337 -0
- package/docs/research/sota-embedding-models.md +179 -0
- package/docs/research/sota-mobile-models-2026.md +263 -0
- package/docs/research/sota-modality-models.md +202 -0
- package/docs/research/tps-baselines.md +71 -0
- package/docs/research/webgpu-m4-reference.md +104 -0
- package/docs/site-update-plan.md +155 -0
- package/docs/structured-output.md +123 -0
- package/docs/stt.md +63 -446
- package/docs/tts.md +77 -499
- package/docs/vision.md +100 -338
- package/package.json +22 -7
- package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
- package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
- package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
- package/dist/gerbil-CJ3ifloF.mjs +0 -4
- package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
- package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
- package/dist/gerbil-qOTe1nl2.d.mts +0 -431
- package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
- package/dist/kokoro-BNTb6egA.mjs +0 -20210
- package/dist/kokoro-BNTb6egA.mjs.map +0 -1
- package/dist/kokoro-CMOGDSgT.js +0 -20212
- package/dist/kokoro-CMOGDSgT.js.map +0 -1
- package/dist/mcp-BvbriaBy.mjs.map +0 -1
- package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
- package/dist/repl-DveXw36T.mjs +0 -9
- package/dist/skills-CD3Orlex.mjs.map +0 -1
- package/dist/stt-Bu-E23Sc.js +0 -433
- package/dist/stt-Bu-E23Sc.js.map +0 -1
- package/dist/stt-CpLYbGFd.mjs +0 -433
- package/dist/stt-CpLYbGFd.mjs.map +0 -1
- package/dist/stt-DRPLEEHB.mjs +0 -3
- package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
- package/dist/transformers.web-DiD1gTwk.js +0 -44695
- package/dist/transformers.web-DiD1gTwk.js.map +0 -1
- package/dist/transformers.web-u34VxRFM.js +0 -3
- package/dist/tts-CqroPaSK.js +0 -724
- package/dist/tts-CqroPaSK.js.map +0 -1
- package/dist/tts-DXgsKGCe.mjs +0 -3
- package/dist/tts-DeGANMNV.mjs +0 -730
- package/dist/tts-DeGANMNV.mjs.map +0 -1
- package/dist/types-CiTc7ez3.d.mts.map +0 -1
- /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
- /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
- /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
package/docs/vision.md
CHANGED
|
@@ -1,396 +1,158 @@
|
|
|
1
|
-
# Vision
|
|
1
|
+
# Vision (Image → Text)
|
|
2
2
|
|
|
3
|
-
Gerbil
|
|
3
|
+
Gerbil's native WebGPU engine runs **vision language models** image-in → text-out. Two
|
|
4
|
+
vision towers are supported natively, both with the same `describeImage` API:
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
import { Gerbil } from "@tryhamster/gerbil";
|
|
9
|
-
|
|
10
|
-
const g = new Gerbil();
|
|
11
|
-
await g.loadModel("ministral-3b"); // Vision-capable model
|
|
12
|
-
|
|
13
|
-
const result = await g.generate("What's in this image?", {
|
|
14
|
-
images: [{ source: "https://example.com/photo.jpg" }]
|
|
15
|
-
});
|
|
16
|
-
|
|
17
|
-
console.log(result.text);
|
|
18
|
-
```
|
|
19
|
-
|
|
20
|
-
## Supported Models
|
|
21
|
-
|
|
22
|
-
| Model ID | Vision | Reasoning | Context | Size |
|
|
23
|
-
|----------|--------|-----------|---------|------|
|
|
24
|
-
| `ministral-3b` | ✅ | ✅ | 256K | ~2.5GB |
|
|
25
|
-
|
|
26
|
-
More vision models coming soon as they become available in ONNX format.
|
|
27
|
-
|
|
28
|
-
## Image Input Types
|
|
29
|
-
|
|
30
|
-
Gerbil accepts images in several formats:
|
|
31
|
-
|
|
32
|
-
```typescript
|
|
33
|
-
// URL (recommended for web images)
|
|
34
|
-
images: [{ source: "https://example.com/image.jpg" }]
|
|
35
|
-
|
|
36
|
-
// Data URI (base64 encoded)
|
|
37
|
-
images: [{ source: "data:image/png;base64,iVBORw0KGgo..." }]
|
|
38
|
-
|
|
39
|
-
// Local file path (Node.js only, auto-converted to data URI)
|
|
40
|
-
images: [{ source: "/path/to/image.png" }]
|
|
41
|
-
|
|
42
|
-
// With alt text (optional, provides context)
|
|
43
|
-
images: [{ source: "...", alt: "A photo of a sunset" }]
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
## Multiple Images
|
|
47
|
-
|
|
48
|
-
You can pass multiple images for comparison or multi-image understanding:
|
|
49
|
-
|
|
50
|
-
```typescript
|
|
51
|
-
const result = await g.generate("What's the difference between these two images?", {
|
|
52
|
-
images: [
|
|
53
|
-
{ source: "https://example.com/before.jpg" },
|
|
54
|
-
{ source: "https://example.com/after.jpg" }
|
|
55
|
-
]
|
|
56
|
-
});
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
## Model Capability Detection
|
|
60
|
-
|
|
61
|
-
Check if the loaded model supports vision:
|
|
6
|
+
- **Qwen3.5 ViT** — bundled with `Qwen/Qwen3.5-0.8B` (the 0.8B text model's own built-in
|
|
7
|
+
vision tower). Bit-exact vs HuggingFace transformers.
|
|
8
|
+
- **Gemma 4 ViT** — bundled with `mlx-community/gemma-4-e2b-it-4bit`.
|
|
62
9
|
|
|
63
|
-
|
|
64
|
-
|
|
10
|
+
There is no separate "vision model" — vision is the text model's own ViT, loaded on
|
|
11
|
+
demand with `enableVision: true`.
|
|
65
12
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
// Text-only mode
|
|
70
|
-
}
|
|
71
|
-
```
|
|
13
|
+
> **Pre-1.0.** This is the native engine surface. The legacy `Gerbil` class (ONNX /
|
|
14
|
+
> transformers.js) once exposed an `images:` array on `generate()`; that path is retired and
|
|
15
|
+
> not documented here.
|
|
72
16
|
|
|
73
|
-
##
|
|
17
|
+
## Quick Start
|
|
74
18
|
|
|
75
|
-
|
|
76
|
-
1. Log a warning to console
|
|
77
|
-
2. Ignore the images
|
|
78
|
-
3. Process the text prompt normally
|
|
19
|
+
### Node
|
|
79
20
|
|
|
80
|
-
|
|
21
|
+
In Node you decode the image to RGB pixels yourself (HWC layout, 0..255), then pass
|
|
22
|
+
`{ pixels, width, height }`. The engine handles smart-resize, normalization, and patchify
|
|
23
|
+
internally to match the HF image processor.
|
|
81
24
|
|
|
82
25
|
```typescript
|
|
83
|
-
|
|
84
|
-
const result = await g.generate("Describe this", {
|
|
85
|
-
images: [{ source: imageUrl }]
|
|
86
|
-
});
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
---
|
|
26
|
+
import { WebGPUEngine } from "@tryhamster/gerbil/gpu";
|
|
90
27
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
```typescript
|
|
96
|
-
import { generateText } from "ai";
|
|
97
|
-
import { gerbil } from "@tryhamster/gerbil/ai";
|
|
98
|
-
|
|
99
|
-
const { text } = await generateText({
|
|
100
|
-
model: gerbil("ministral-3b"),
|
|
101
|
-
messages: [
|
|
102
|
-
{
|
|
103
|
-
role: "user",
|
|
104
|
-
content: [
|
|
105
|
-
{ type: "image", image: new URL("https://example.com/photo.jpg") },
|
|
106
|
-
{ type: "text", text: "Describe this image in detail" },
|
|
107
|
-
],
|
|
108
|
-
},
|
|
109
|
-
],
|
|
28
|
+
const engine = await WebGPUEngine.create({
|
|
29
|
+
repo: "Qwen/Qwen3.5-0.8B", // BF16 checkpoint ships the ViT weights
|
|
30
|
+
enableVision: true,
|
|
110
31
|
});
|
|
111
|
-
```
|
|
112
32
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
{ type: "image", image: new URL("https://...") }
|
|
33
|
+
// pixels: Uint8ClampedArray | Uint8Array | Float32Array, RGB, length = width*height*3
|
|
34
|
+
const { text, tokensPerSecond } = await engine.describeImage(
|
|
35
|
+
{ pixels, width, height },
|
|
36
|
+
"What's in this image?",
|
|
37
|
+
{ maxTokens: 150 },
|
|
38
|
+
);
|
|
120
39
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
// Base64 string
|
|
125
|
-
{ type: "image", image: "data:image/png;base64,..." }
|
|
126
|
-
|
|
127
|
-
// Uint8Array with mime type
|
|
128
|
-
{ type: "image", image: imageBytes, mimeType: "image/png" }
|
|
40
|
+
console.log(text, `(${tokensPerSecond.toFixed(1)} tok/s)`);
|
|
41
|
+
engine.destroy();
|
|
129
42
|
```
|
|
130
43
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
## Express & Next.js Integration
|
|
134
|
-
|
|
135
|
-
### Express
|
|
136
|
-
|
|
137
|
-
```typescript
|
|
138
|
-
import express from "express";
|
|
139
|
-
import { gerbil } from "@tryhamster/gerbil/express";
|
|
140
|
-
|
|
141
|
-
const app = express();
|
|
142
|
-
app.use("/ai", gerbil({ model: "ministral-3b" })());
|
|
143
|
-
|
|
144
|
-
// POST /ai/generate
|
|
145
|
-
// Body: { prompt: "Describe this", images: [{ source: "https://..." }] }
|
|
146
|
-
```
|
|
147
|
-
|
|
148
|
-
### Next.js App Router
|
|
149
|
-
|
|
150
|
-
```typescript
|
|
151
|
-
// app/api/chat/route.ts
|
|
152
|
-
import { gerbil } from "@tryhamster/gerbil/next";
|
|
153
|
-
|
|
154
|
-
export const POST = gerbil.handler({ model: "ministral-3b" });
|
|
155
|
-
|
|
156
|
-
// Fetch from client:
|
|
157
|
-
// fetch("/api/chat", {
|
|
158
|
-
// method: "POST",
|
|
159
|
-
// body: JSON.stringify({
|
|
160
|
-
// prompt: "What's in this image?",
|
|
161
|
-
// images: [{ source: dataUri }]
|
|
162
|
-
// })
|
|
163
|
-
// })
|
|
164
|
-
```
|
|
44
|
+
### React (Browser)
|
|
165
45
|
|
|
166
|
-
|
|
46
|
+
The React hook decodes URLs and data-URLs for you, so you can pass an image source string
|
|
47
|
+
directly. Load with `enableVision: true`.
|
|
167
48
|
|
|
168
|
-
|
|
49
|
+
```tsx
|
|
50
|
+
import { useEngine } from "@tryhamster/gerbil/gpu/hooks";
|
|
169
51
|
|
|
170
|
-
|
|
52
|
+
function ImageDescriber() {
|
|
53
|
+
const { describeImage, completion, isLoading, isGenerating } = useEngine({
|
|
54
|
+
model: "Qwen/Qwen3.5-0.8B",
|
|
55
|
+
enableVision: true,
|
|
56
|
+
autoLoad: true,
|
|
57
|
+
});
|
|
171
58
|
|
|
172
|
-
|
|
173
|
-
import { useChat } from "@tryhamster/gerbil/browser";
|
|
174
|
-
|
|
175
|
-
function VisionChat() {
|
|
176
|
-
const {
|
|
177
|
-
messages,
|
|
178
|
-
input,
|
|
179
|
-
setInput,
|
|
180
|
-
handleSubmit,
|
|
181
|
-
attachImage,
|
|
182
|
-
attachedImages,
|
|
183
|
-
clearImages,
|
|
184
|
-
sendWithImages,
|
|
185
|
-
} = useChat({ model: "ministral-3b" });
|
|
186
|
-
|
|
187
|
-
const handleFileSelect = (e: React.ChangeEvent<HTMLInputElement>) => {
|
|
188
|
-
const file = e.target.files?.[0];
|
|
189
|
-
if (file) {
|
|
190
|
-
const reader = new FileReader();
|
|
191
|
-
reader.onload = () => attachImage(reader.result as string);
|
|
192
|
-
reader.readAsDataURL(file);
|
|
193
|
-
}
|
|
194
|
-
};
|
|
59
|
+
if (isLoading) return <div>Loading vision model…</div>;
|
|
195
60
|
|
|
196
61
|
return (
|
|
197
62
|
<div>
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
{/* Image attachment */}
|
|
209
|
-
<input type="file" accept="image/*" onChange={handleFileSelect} />
|
|
210
|
-
|
|
211
|
-
{attachedImages.length > 0 && (
|
|
212
|
-
<div>
|
|
213
|
-
📎 {attachedImages.length} image(s) attached
|
|
214
|
-
<button onClick={clearImages}>Clear</button>
|
|
215
|
-
</div>
|
|
216
|
-
)}
|
|
217
|
-
|
|
218
|
-
{/* Input */}
|
|
219
|
-
<form onSubmit={handleSubmit}>
|
|
220
|
-
<input
|
|
221
|
-
value={input}
|
|
222
|
-
onChange={e => setInput(e.target.value)}
|
|
223
|
-
placeholder="Describe the image..."
|
|
224
|
-
/>
|
|
225
|
-
<button type="submit">Send</button>
|
|
226
|
-
</form>
|
|
63
|
+
<button
|
|
64
|
+
disabled={isGenerating}
|
|
65
|
+
onClick={() =>
|
|
66
|
+
describeImage("https://example.com/photo.jpg", "Describe this image in detail")
|
|
67
|
+
}
|
|
68
|
+
>
|
|
69
|
+
Describe
|
|
70
|
+
</button>
|
|
71
|
+
<p>{completion}</p>
|
|
227
72
|
</div>
|
|
228
73
|
);
|
|
229
74
|
}
|
|
230
75
|
```
|
|
231
76
|
|
|
232
|
-
|
|
77
|
+
`describeImage` accepts either an image source string (data URL or http(s) URL) or
|
|
78
|
+
pre-decoded `{ pixels, width, height }`. The hook caps the longest side at ~448px before
|
|
79
|
+
decoding — ViT attention memory scales with patch-count², so this keeps mobile coherent.
|
|
233
80
|
|
|
234
|
-
|
|
235
|
-
// Send a message with specific images
|
|
236
|
-
sendWithImages("Compare these two photos", [image1DataUri, image2DataUri]);
|
|
237
|
-
```
|
|
81
|
+
## Supported Towers
|
|
238
82
|
|
|
239
|
-
|
|
83
|
+
| Tower | Load from | dtype | Notes |
|
|
84
|
+
|-------|-----------|-------|-------|
|
|
85
|
+
| Qwen3.5 ViT | `Qwen/Qwen3.5-0.8B` (`enableVision: true`) | BF16 (ships ViT) | Bit-exact vs HF (per-token cosine 1.000000) |
|
|
86
|
+
| Gemma 4 ViT | `mlx-community/gemma-4-e2b-it-4bit` (`enableVision: true`) | MLX 4-bit | Native projector + multimodal-embedder norms |
|
|
240
87
|
|
|
241
|
-
|
|
88
|
+
> The MLX 4-bit Qwen3.5 repo (`mlx-community/Qwen3.5-0.8B-4bit`) is text-only — load the
|
|
89
|
+
> BF16 `Qwen/Qwen3.5-0.8B` repo to get the ViT weights for vision.
|
|
242
90
|
|
|
243
|
-
|
|
91
|
+
## How it works
|
|
244
92
|
|
|
245
|
-
|
|
93
|
+
`describeImage` runs the full image-in → text-out pipeline natively:
|
|
246
94
|
|
|
247
|
-
|
|
248
|
-
|
|
95
|
+
1. **Preprocess** — pixels are smart-resized, normalized, and patchified to match the HF
|
|
96
|
+
image processor (`preprocessImage` / `preprocessImageGemma4`).
|
|
97
|
+
2. **Encode** — the ViT turns patches into merged image-embedding tokens (`encodeImage`).
|
|
98
|
+
3. **Splice** — image tokens are scattered into the `image_token_id` rows of the text
|
|
99
|
+
sequence (`EmbedSplice` in the multimodal graph).
|
|
100
|
+
4. **Decode** — Qwen3.5 uses multimodal M-RoPE positions; Gemma 4 uses standard sequential
|
|
101
|
+
1D RoPE. The LM then generates the description.
|
|
249
102
|
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
format: "bullets", // "paragraph" | "bullets" | "structured"
|
|
254
|
-
});
|
|
255
|
-
```
|
|
103
|
+
The low-level pieces are exposed if you need them: `engine.encodeImage(patches, gridTHW)`
|
|
104
|
+
returns the merged image tokens, and `engine.hasVision` reports whether the engine was
|
|
105
|
+
built with a ViT.
|
|
256
106
|
|
|
257
|
-
|
|
107
|
+
## Image input
|
|
258
108
|
|
|
259
109
|
```typescript
|
|
260
|
-
|
|
110
|
+
// Pre-decoded RGB pixels (Node, or browser if you've already decoded)
|
|
111
|
+
await engine.describeImage({ pixels, width, height }, prompt);
|
|
261
112
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
type: "accessibility", // "ui-review" | "accessibility" | "suggestions" | "qa"
|
|
265
|
-
});
|
|
113
|
+
// Already-built patch tensor + grid (skips host preprocessing — for reference parity)
|
|
114
|
+
await engine.describeImage({ patches, gridTHW: [1, gridH, gridW] }, prompt);
|
|
266
115
|
```
|
|
267
116
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
```typescript
|
|
271
|
-
import { extractFromImage } from "@tryhamster/gerbil/skills";
|
|
117
|
+
In React, the hook also accepts a source string:
|
|
272
118
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
outputFormat: "markdown", // "raw" | "json" | "markdown"
|
|
277
|
-
});
|
|
119
|
+
```tsx
|
|
120
|
+
describeImage("data:image/png;base64,iVBOR…", "What is this?");
|
|
121
|
+
describeImage("https://example.com/photo.jpg", "Describe it"); // must be CORS-accessible
|
|
278
122
|
```
|
|
279
123
|
|
|
280
|
-
|
|
124
|
+
## Options
|
|
281
125
|
|
|
282
|
-
|
|
283
|
-
import { compareImages } from "@tryhamster/gerbil/skills";
|
|
284
|
-
|
|
285
|
-
const comparison = await compareImages({
|
|
286
|
-
image1: beforeScreenshot,
|
|
287
|
-
image2: afterScreenshot,
|
|
288
|
-
focus: "differences", // "differences" | "similarities" | "detailed"
|
|
289
|
-
});
|
|
290
|
-
```
|
|
291
|
-
|
|
292
|
-
### Caption Image
|
|
126
|
+
`describeImage(image, prompt?, options?)` takes the standard generation options:
|
|
293
127
|
|
|
294
128
|
```typescript
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
129
|
+
await engine.describeImage(image, "Describe this image.", {
|
|
130
|
+
maxTokens: 150,
|
|
131
|
+
sampling: { temperature: 0.7 },
|
|
132
|
+
stopSequences: ["\n\n"],
|
|
133
|
+
onToken: (t) => process.stdout.write(t),
|
|
300
134
|
});
|
|
301
135
|
```
|
|
302
136
|
|
|
303
|
-
|
|
137
|
+
Returns a `GenerateResult`: `{ text, tokensGenerated, tokensPerSecond, totalTime, finishReason }`.
|
|
304
138
|
|
|
305
139
|
## Performance Tips
|
|
306
140
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
await g.loadModel("ministral-3b"); // Auto-detects WebGPU
|
|
314
|
-
|
|
315
|
-
// Browser: Native WebGPU
|
|
316
|
-
await g.loadModel("ministral-3b", { device: "webgpu" });
|
|
317
|
-
```
|
|
318
|
-
|
|
319
|
-
### Image Size
|
|
320
|
-
|
|
321
|
-
- Larger images take longer to process
|
|
322
|
-
- Consider resizing before sending to the model
|
|
323
|
-
- 512x512 to 1024x1024 is generally optimal
|
|
324
|
-
|
|
325
|
-
### Caching
|
|
326
|
-
|
|
327
|
-
The model caches in the browser's IndexedDB (via Chrome backend in Node.js), so subsequent loads are fast.
|
|
328
|
-
|
|
329
|
-
---
|
|
141
|
+
- **Resize before sending.** ViT attention cost grows with the square of the patch count.
|
|
142
|
+
The React hook already caps the longest side at ~448px; in Node, downscale large photos
|
|
143
|
+
(≈448–512px longest side) before decoding to pixels.
|
|
144
|
+
- **Mobile.** On iOS/iPadOS the engine reserves fewer vision patches by default to stay
|
|
145
|
+
under the WebKit GPU watchdog and memory budget; very large images may still need a
|
|
146
|
+
smaller `maxVisionPatches` (a `WebGPUEngine.create` option).
|
|
330
147
|
|
|
331
148
|
## Troubleshooting
|
|
332
149
|
|
|
333
|
-
### "
|
|
334
|
-
|
|
335
|
-
Make sure you're using a vision-capable model like `ministral-3b`.
|
|
336
|
-
|
|
337
|
-
### Slow image processing
|
|
338
|
-
|
|
339
|
-
- Ensure WebGPU is being used (check `g.getDeviceMode()`)
|
|
340
|
-
- Resize large images before sending
|
|
341
|
-
- In Node.js, the Chrome backend provides GPU acceleration
|
|
342
|
-
|
|
343
|
-
### Image not loading
|
|
344
|
-
|
|
345
|
-
- Check the URL is accessible
|
|
346
|
-
- For local files, ensure the path is absolute
|
|
347
|
-
- Base64 data URIs must include the mime type prefix
|
|
348
|
-
|
|
349
|
-
---
|
|
350
|
-
|
|
351
|
-
## API Reference
|
|
352
|
-
|
|
353
|
-
### ImageInput
|
|
354
|
-
|
|
355
|
-
```typescript
|
|
356
|
-
interface ImageInput {
|
|
357
|
-
/** Image source: URL, base64 data URI, or local file path */
|
|
358
|
-
source: string;
|
|
359
|
-
/** Optional alt text for context */
|
|
360
|
-
alt?: string;
|
|
361
|
-
}
|
|
362
|
-
```
|
|
363
|
-
|
|
364
|
-
### GenerateOptions (with images)
|
|
150
|
+
### "describeImage() requires a vision encoder"
|
|
365
151
|
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
// ... standard options ...
|
|
369
|
-
|
|
370
|
-
/** Images to include (only used if model supports vision) */
|
|
371
|
-
images?: ImageInput[];
|
|
372
|
-
}
|
|
373
|
-
```
|
|
152
|
+
Load with `{ enableVision: true }` on a vision-capable checkpoint (Qwen3.5 BF16 or Gemma 4).
|
|
153
|
+
The text-only MLX-4bit Qwen3.5 repo does not ship the ViT weights.
|
|
374
154
|
|
|
375
|
-
###
|
|
376
|
-
|
|
377
|
-
```typescript
|
|
378
|
-
g.supportsVision(): boolean
|
|
379
|
-
```
|
|
380
|
-
|
|
381
|
-
Returns `true` if the loaded model supports vision input.
|
|
382
|
-
|
|
383
|
-
### ModelConfig
|
|
384
|
-
|
|
385
|
-
```typescript
|
|
386
|
-
interface ModelConfig {
|
|
387
|
-
// ... standard properties ...
|
|
388
|
-
|
|
389
|
-
/** Whether model supports vision/image input */
|
|
390
|
-
supportsVision?: boolean;
|
|
391
|
-
|
|
392
|
-
/** Size of vision encoder (if applicable) */
|
|
393
|
-
visionEncoderSize?: string;
|
|
394
|
-
}
|
|
395
|
-
```
|
|
155
|
+
### Out of memory on mobile
|
|
396
156
|
|
|
157
|
+
Use a smaller image and/or lower `maxVisionPatches` at create time. WebKit kills the page
|
|
158
|
+
content process around 1.5–2 GB.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tryhamster/gerbil",
|
|
3
|
-
"version": "1.0.0-rc.9",
|
|
3
|
+
"version": "1.0.1",
|
|
4
4
|
"description": "Local LLM inference for Node.js. GPU-accelerated. Zero config. Works standalone or with Vercel AI SDK.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.mjs",
|
|
@@ -17,6 +17,10 @@
|
|
|
17
17
|
"import": "./dist/skills/index.mjs",
|
|
18
18
|
"types": "./dist/skills/index.d.mts"
|
|
19
19
|
},
|
|
20
|
+
"./memory": {
|
|
21
|
+
"import": "./dist/memory/index.mjs",
|
|
22
|
+
"types": "./dist/memory/index.d.mts"
|
|
23
|
+
},
|
|
20
24
|
"./ai": {
|
|
21
25
|
"import": "./dist/integrations/ai-sdk.mjs",
|
|
22
26
|
"types": "./dist/integrations/ai-sdk.d.mts"
|
|
@@ -64,6 +68,18 @@
|
|
|
64
68
|
"./browser": {
|
|
65
69
|
"import": "./dist/browser/index.js",
|
|
66
70
|
"types": "./dist/browser/index.d.ts"
|
|
71
|
+
},
|
|
72
|
+
"./gpu": {
|
|
73
|
+
"import": "./dist/gpu/index.mjs",
|
|
74
|
+
"types": "./dist/gpu/index.d.mts"
|
|
75
|
+
},
|
|
76
|
+
"./hooks": {
|
|
77
|
+
"import": "./dist/gpu/hooks.mjs",
|
|
78
|
+
"types": "./dist/gpu/hooks.d.mts"
|
|
79
|
+
},
|
|
80
|
+
"./gpu/hooks": {
|
|
81
|
+
"import": "./dist/gpu/hooks.mjs",
|
|
82
|
+
"types": "./dist/gpu/hooks.d.mts"
|
|
67
83
|
}
|
|
68
84
|
},
|
|
69
85
|
"dependencies": {
|
|
@@ -72,10 +88,10 @@
|
|
|
72
88
|
"cli-progress": "^3.12.0",
|
|
73
89
|
"commander": "^12.1.0",
|
|
74
90
|
"ora": "^8.0.1",
|
|
75
|
-
"
|
|
91
|
+
"p-queue": "^9.0.1",
|
|
76
92
|
"react": "^19.0.0",
|
|
77
|
-
"webgpu": "^0.
|
|
78
|
-
"zod": "^3.
|
|
93
|
+
"webgpu": "^0.4.0",
|
|
94
|
+
"zod": "^4.3.6"
|
|
79
95
|
},
|
|
80
96
|
"peerDependencies": {
|
|
81
97
|
"@ai-sdk/provider": ">=2.0.0",
|
|
@@ -127,9 +143,6 @@
|
|
|
127
143
|
"devDependencies": {
|
|
128
144
|
"@ai-sdk/provider": "^2.0.0",
|
|
129
145
|
"@biomejs/biome": "^2.3.8",
|
|
130
|
-
"@huggingface/transformers": "^3.8.0",
|
|
131
|
-
"kokoro-js": "^1.2.1",
|
|
132
|
-
"onnxruntime-web": "^1.21.0-dev.20250114-228dd16893",
|
|
133
146
|
"@changesets/changelog-github": "^0.5.1",
|
|
134
147
|
"@changesets/cli": "^2.28.1",
|
|
135
148
|
"@types/cli-progress": "^3.11.6",
|
|
@@ -137,7 +150,9 @@
|
|
|
137
150
|
"@types/ink-big-text": "^1.2.4",
|
|
138
151
|
"@types/node": "^20.14.0",
|
|
139
152
|
"@types/react": "^19.0.0",
|
|
153
|
+
"@webgpu/types": "^0.1.69",
|
|
140
154
|
"ai": "^5.0.0",
|
|
155
|
+
"esbuild": "^0.27.3",
|
|
141
156
|
"express": "^4.19.0",
|
|
142
157
|
"ink": "^6.5.1",
|
|
143
158
|
"ink-big-text": "^2.0.0",
|