@tryhamster/gerbil 1.0.0-rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +247 -84
  3. package/dist/architectures-C1I5V3Dt.mjs +6070 -0
  4. package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
  5. package/dist/browser/index.d.ts +264 -588
  6. package/dist/browser/index.d.ts.map +1 -1
  7. package/dist/browser/index.js +585 -2334
  8. package/dist/browser/index.js.map +1 -1
  9. package/dist/cli.mjs +625 -1098
  10. package/dist/cli.mjs.map +1 -1
  11. package/dist/defaults-9komdrbY.mjs +24 -0
  12. package/dist/defaults-9komdrbY.mjs.map +1 -0
  13. package/dist/frameworks/express.d.mts +1 -3
  14. package/dist/frameworks/express.d.mts.map +1 -1
  15. package/dist/frameworks/express.mjs +7 -7
  16. package/dist/frameworks/express.mjs.map +1 -1
  17. package/dist/frameworks/fastify.d.mts +1 -1
  18. package/dist/frameworks/fastify.d.mts.map +1 -1
  19. package/dist/frameworks/fastify.mjs +3 -3
  20. package/dist/frameworks/fastify.mjs.map +1 -1
  21. package/dist/frameworks/hono.d.mts +1 -1
  22. package/dist/frameworks/hono.d.mts.map +1 -1
  23. package/dist/frameworks/hono.mjs +4 -4
  24. package/dist/frameworks/hono.mjs.map +1 -1
  25. package/dist/frameworks/next.d.mts +3 -2
  26. package/dist/frameworks/next.d.mts.map +1 -1
  27. package/dist/frameworks/next.mjs +4 -4
  28. package/dist/frameworks/next.mjs.map +1 -1
  29. package/dist/frameworks/react.d.mts +1 -1
  30. package/dist/frameworks/trpc.d.mts +1 -1
  31. package/dist/frameworks/trpc.d.mts.map +1 -1
  32. package/dist/frameworks/trpc.mjs +4 -4
  33. package/dist/frameworks/trpc.mjs.map +1 -1
  34. package/dist/gerbil-BHrJJIa4.mjs +1656 -0
  35. package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
  36. package/dist/gerbil-BT9fCydo.d.mts +488 -0
  37. package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
  38. package/dist/gerbil-DomNfIr1.mjs +4 -0
  39. package/dist/gpu/hooks.d.mts +520 -0
  40. package/dist/gpu/hooks.d.mts.map +1 -0
  41. package/dist/gpu/hooks.mjs +1188 -0
  42. package/dist/gpu/hooks.mjs.map +1 -0
  43. package/dist/gpu/index.d.mts +2 -0
  44. package/dist/gpu/index.mjs +6 -0
  45. package/dist/gpu-33qCAtHW.mjs +3615 -0
  46. package/dist/gpu-33qCAtHW.mjs.map +1 -0
  47. package/dist/index-Dgmb2kE3.d.mts +245 -0
  48. package/dist/index-Dgmb2kE3.d.mts.map +1 -0
  49. package/dist/index-jEAL2s-A.d.mts +2022 -0
  50. package/dist/index-jEAL2s-A.d.mts.map +1 -0
  51. package/dist/index.d.mts +22 -487
  52. package/dist/index.d.mts.map +1 -1
  53. package/dist/index.mjs +13 -8
  54. package/dist/index.mjs.map +1 -1
  55. package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
  56. package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
  57. package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
  58. package/dist/integrations/ai-sdk.d.mts +75 -6
  59. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  60. package/dist/integrations/ai-sdk.mjs +131 -15
  61. package/dist/integrations/ai-sdk.mjs.map +1 -1
  62. package/dist/integrations/langchain.d.mts +1 -1
  63. package/dist/integrations/langchain.d.mts.map +1 -1
  64. package/dist/integrations/langchain.mjs +5 -5
  65. package/dist/integrations/langchain.mjs.map +1 -1
  66. package/dist/integrations/llamaindex.d.mts +1 -1
  67. package/dist/integrations/llamaindex.d.mts.map +1 -1
  68. package/dist/integrations/llamaindex.mjs +5 -5
  69. package/dist/integrations/llamaindex.mjs.map +1 -1
  70. package/dist/integrations/mcp-client.mjs +3 -3
  71. package/dist/integrations/mcp-client.mjs.map +1 -1
  72. package/dist/integrations/mcp.d.mts +3 -2
  73. package/dist/integrations/mcp.d.mts.map +1 -1
  74. package/dist/integrations/mcp.mjs +5 -5
  75. package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
  76. package/dist/mcp-1DaMsaBc.mjs.map +1 -0
  77. package/dist/memory/index.d.mts +3 -0
  78. package/dist/memory/index.mjs +6 -0
  79. package/dist/memory-D1P7Tmda.mjs +4 -0
  80. package/dist/memory-DVN0MnIG.mjs +132 -0
  81. package/dist/memory-DVN0MnIG.mjs.map +1 -0
  82. package/dist/memory-Dj0J1v88.mjs +294 -0
  83. package/dist/memory-Dj0J1v88.mjs.map +1 -0
  84. package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
  85. package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
  86. package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
  87. package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
  88. package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
  89. package/dist/repl-jV5gcJFA.mjs +9 -0
  90. package/dist/skills/index.d.mts +270 -320
  91. package/dist/skills/index.d.mts.map +1 -1
  92. package/dist/skills/index.mjs +5 -5
  93. package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
  94. package/dist/skills-DX8D59UH.mjs.map +1 -0
  95. package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
  96. package/dist/tools-DQ1mPUw5.mjs.map +1 -0
  97. package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
  98. package/dist/types-D6FiR_oh.d.mts.map +1 -0
  99. package/dist/types-DQBe2lFo.d.mts +165 -0
  100. package/dist/types-DQBe2lFo.d.mts.map +1 -0
  101. package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
  102. package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
  103. package/dist/vector-B0panuy6.mjs +95 -0
  104. package/dist/vector-B0panuy6.mjs.map +1 -0
  105. package/docs/PROJECT-STATE.md +321 -0
  106. package/docs/adding-a-model-family.md +280 -0
  107. package/docs/ai-sdk.md +70 -61
  108. package/docs/architecture/overview.md +17 -7
  109. package/docs/browser.md +203 -8
  110. package/docs/embeddings.md +156 -0
  111. package/docs/gerbil-site-native-migration.md +217 -0
  112. package/docs/gpu-engine/architectures.md +398 -0
  113. package/docs/gpu-engine/ir.md +372 -0
  114. package/docs/gpu-engine/kernels.md +718 -0
  115. package/docs/gpu-engine/paper.html +1759 -0
  116. package/docs/gpu-engine/paper.md +2109 -0
  117. package/docs/gpu-engine/safetensors.md +312 -0
  118. package/docs/gpu-engine/tokenizer.md +302 -0
  119. package/docs/memory-rag.md +91 -0
  120. package/docs/metal-safari-intel.md +190 -0
  121. package/docs/mobile-failure-diagnosis.md +124 -0
  122. package/docs/mobile.md +99 -0
  123. package/docs/observability.md +230 -0
  124. package/docs/onnx-removal-plan.md +339 -0
  125. package/docs/research/autoresearch-portable.md +904 -0
  126. package/docs/research/dispatch-reduction-hivemind.md +84 -0
  127. package/docs/research/ios-safari-model-caching.md +117 -0
  128. package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
  129. package/docs/research/native-stt-model-selection.md +49 -0
  130. package/docs/research/native-tts-model-selection.md +90 -0
  131. package/docs/research/native-vs-chromium-decision.md +152 -0
  132. package/docs/research/nemotron-mamba2-inference.md +910 -0
  133. package/docs/research/qwen35-multimodal.md +293 -0
  134. package/docs/research/qwen36-gemma4-targets.md +337 -0
  135. package/docs/research/sota-embedding-models.md +179 -0
  136. package/docs/research/sota-mobile-models-2026.md +263 -0
  137. package/docs/research/sota-modality-models.md +202 -0
  138. package/docs/research/tps-baselines.md +71 -0
  139. package/docs/research/webgpu-m4-reference.md +104 -0
  140. package/docs/site-update-plan.md +155 -0
  141. package/docs/structured-output.md +123 -0
  142. package/docs/stt.md +63 -446
  143. package/docs/tts.md +77 -499
  144. package/docs/vision.md +100 -338
  145. package/package.json +22 -7
  146. package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
  147. package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
  148. package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
  149. package/dist/gerbil-CJ3ifloF.mjs +0 -4
  150. package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
  151. package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
  152. package/dist/gerbil-qOTe1nl2.d.mts +0 -431
  153. package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
  154. package/dist/kokoro-BNTb6egA.mjs +0 -20210
  155. package/dist/kokoro-BNTb6egA.mjs.map +0 -1
  156. package/dist/kokoro-CMOGDSgT.js +0 -20212
  157. package/dist/kokoro-CMOGDSgT.js.map +0 -1
  158. package/dist/mcp-BvbriaBy.mjs.map +0 -1
  159. package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
  160. package/dist/repl-DveXw36T.mjs +0 -9
  161. package/dist/skills-CD3Orlex.mjs.map +0 -1
  162. package/dist/stt-Bu-E23Sc.js +0 -433
  163. package/dist/stt-Bu-E23Sc.js.map +0 -1
  164. package/dist/stt-CpLYbGFd.mjs +0 -433
  165. package/dist/stt-CpLYbGFd.mjs.map +0 -1
  166. package/dist/stt-DRPLEEHB.mjs +0 -3
  167. package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
  168. package/dist/transformers.web-DiD1gTwk.js +0 -44695
  169. package/dist/transformers.web-DiD1gTwk.js.map +0 -1
  170. package/dist/transformers.web-u34VxRFM.js +0 -3
  171. package/dist/tts-CqroPaSK.js +0 -724
  172. package/dist/tts-CqroPaSK.js.map +0 -1
  173. package/dist/tts-DXgsKGCe.mjs +0 -3
  174. package/dist/tts-DeGANMNV.mjs +0 -730
  175. package/dist/tts-DeGANMNV.mjs.map +0 -1
  176. package/dist/types-CiTc7ez3.d.mts.map +0 -1
  177. /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
  178. /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
  179. /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
package/docs/vision.md CHANGED
@@ -1,396 +1,158 @@
1
- # Vision Models in Gerbil
1
+ # Vision (Image → Text)
2
2
 
3
- Gerbil supports **Vision Language Models (VLMs)** like Ministral 3B that can understand and describe images. This guide covers how to use vision capabilities across all Gerbil interfaces.
3
+ Gerbil's native WebGPU engine runs **vision language models** image-in → text-out. Two
4
+ vision towers are supported natively, both with the same `describeImage` API:
4
5
 
5
- ## Quick Start
6
-
7
- ```typescript
8
- import { Gerbil } from "@tryhamster/gerbil";
9
-
10
- const g = new Gerbil();
11
- await g.loadModel("ministral-3b"); // Vision-capable model
12
-
13
- const result = await g.generate("What's in this image?", {
14
- images: [{ source: "https://example.com/photo.jpg" }]
15
- });
16
-
17
- console.log(result.text);
18
- ```
19
-
20
- ## Supported Models
21
-
22
- | Model ID | Vision | Reasoning | Context | Size |
23
- |----------|--------|-----------|---------|------|
24
- | `ministral-3b` | ✅ | ✅ | 256K | ~2.5GB |
25
-
26
- More vision models coming soon as they become available in ONNX format.
27
-
28
- ## Image Input Types
29
-
30
- Gerbil accepts images in several formats:
31
-
32
- ```typescript
33
- // URL (recommended for web images)
34
- images: [{ source: "https://example.com/image.jpg" }]
35
-
36
- // Data URI (base64 encoded)
37
- images: [{ source: "data:image/png;base64,iVBORw0KGgo..." }]
38
-
39
- // Local file path (Node.js only, auto-converted to data URI)
40
- images: [{ source: "/path/to/image.png" }]
41
-
42
- // With alt text (optional, provides context)
43
- images: [{ source: "...", alt: "A photo of a sunset" }]
44
- ```
45
-
46
- ## Multiple Images
47
-
48
- You can pass multiple images for comparison or multi-image understanding:
49
-
50
- ```typescript
51
- const result = await g.generate("What's the difference between these two images?", {
52
- images: [
53
- { source: "https://example.com/before.jpg" },
54
- { source: "https://example.com/after.jpg" }
55
- ]
56
- });
57
- ```
58
-
59
- ## Model Capability Detection
60
-
61
- Check if the loaded model supports vision:
6
+ - **Qwen3.5 ViT** — bundled with `Qwen/Qwen3.5-0.8B` (the 0.8B text model's own built-in
7
+ vision tower). Bit-exact vs HuggingFace transformers.
8
+ - **Gemma 4 ViT** — bundled with `mlx-community/gemma-4-e2b-it-4bit`.
62
9
 
63
- ```typescript
64
- await g.loadModel("ministral-3b");
10
+ There is no separate "vision model" — vision is the text model's own ViT, loaded on
11
+ demand with `enableVision: true`.
65
12
 
66
- if (g.supportsVision()) {
67
- // Use vision features
68
- } else {
69
- // Text-only mode
70
- }
71
- ```
13
+ > **Pre-1.0.** This is the native engine surface. The legacy `Gerbil` class (ONNX /
14
+ > transformers.js) once exposed an `images:` array on `generate()`; that path is retired and
15
+ > not documented here.
72
16
 
73
- ## Graceful Fallback
17
+ ## Quick Start
74
18
 
75
- If you pass images to a non-vision model, Gerbil will:
76
- 1. Log a warning to console
77
- 2. Ignore the images
78
- 3. Process the text prompt normally
19
+ ### Node
79
20
 
80
- This allows you to write code that works with any model:
21
+ In Node you decode the image to RGB pixels yourself (HWC layout, 0..255), then pass
22
+ `{ pixels, width, height }`. The engine handles smart-resize, normalization, and patchify
23
+ internally to match the HF image processor.
81
24
 
82
25
  ```typescript
83
- // This works with any model - images are used if supported
84
- const result = await g.generate("Describe this", {
85
- images: [{ source: imageUrl }]
86
- });
87
- ```
88
-
89
- ---
26
+ import { WebGPUEngine } from "@tryhamster/gerbil/gpu";
90
27
 
91
- ## AI SDK Integration
92
-
93
- Use vision models with Vercel AI SDK v5+:
94
-
95
- ```typescript
96
- import { generateText } from "ai";
97
- import { gerbil } from "@tryhamster/gerbil/ai";
98
-
99
- const { text } = await generateText({
100
- model: gerbil("ministral-3b"),
101
- messages: [
102
- {
103
- role: "user",
104
- content: [
105
- { type: "image", image: new URL("https://example.com/photo.jpg") },
106
- { type: "text", text: "Describe this image in detail" },
107
- ],
108
- },
109
- ],
28
+ const engine = await WebGPUEngine.create({
29
+ repo: "Qwen/Qwen3.5-0.8B", // BF16 checkpoint ships the ViT weights
30
+ enableVision: true,
110
31
  });
111
- ```
112
32
 
113
- ### Image Part Formats
114
-
115
- The AI SDK integration accepts images in these formats:
116
-
117
- ```typescript
118
- // URL object
119
- { type: "image", image: new URL("https://...") }
33
+ // pixels: Uint8ClampedArray | Uint8Array | Float32Array, RGB, length = width*height*3
34
+ const { text, tokensPerSecond } = await engine.describeImage(
35
+ { pixels, width, height },
36
+ "What's in this image?",
37
+ { maxTokens: 150 },
38
+ );
120
39
 
121
- // URL string
122
- { type: "image", image: "https://..." }
123
-
124
- // Base64 string
125
- { type: "image", image: "data:image/png;base64,..." }
126
-
127
- // Uint8Array with mime type
128
- { type: "image", image: imageBytes, mimeType: "image/png" }
40
+ console.log(text, `(${tokensPerSecond.toFixed(1)} tok/s)`);
41
+ engine.destroy();
129
42
  ```
130
43
 
131
- ---
132
-
133
- ## Express & Next.js Integration
134
-
135
- ### Express
136
-
137
- ```typescript
138
- import express from "express";
139
- import { gerbil } from "@tryhamster/gerbil/express";
140
-
141
- const app = express();
142
- app.use("/ai", gerbil({ model: "ministral-3b" })());
143
-
144
- // POST /ai/generate
145
- // Body: { prompt: "Describe this", images: [{ source: "https://..." }] }
146
- ```
147
-
148
- ### Next.js App Router
149
-
150
- ```typescript
151
- // app/api/chat/route.ts
152
- import { gerbil } from "@tryhamster/gerbil/next";
153
-
154
- export const POST = gerbil.handler({ model: "ministral-3b" });
155
-
156
- // Fetch from client:
157
- // fetch("/api/chat", {
158
- // method: "POST",
159
- // body: JSON.stringify({
160
- // prompt: "What's in this image?",
161
- // images: [{ source: dataUri }]
162
- // })
163
- // })
164
- ```
44
+ ### React (Browser)
165
45
 
166
- ---
46
+ The React hook decodes URLs and data-URLs for you, so you can pass an image source string
47
+ directly. Load with `enableVision: true`.
167
48
 
168
- ## React Hooks (Browser)
49
+ ```tsx
50
+ import { useEngine } from "@tryhamster/gerbil/gpu/hooks";
169
51
 
170
- ### useChat with Images
52
+ function ImageDescriber() {
53
+ const { describeImage, completion, isLoading, isGenerating } = useEngine({
54
+ model: "Qwen/Qwen3.5-0.8B",
55
+ enableVision: true,
56
+ autoLoad: true,
57
+ });
171
58
 
172
- ```tsx
173
- import { useChat } from "@tryhamster/gerbil/browser";
174
-
175
- function VisionChat() {
176
- const {
177
- messages,
178
- input,
179
- setInput,
180
- handleSubmit,
181
- attachImage,
182
- attachedImages,
183
- clearImages,
184
- sendWithImages,
185
- } = useChat({ model: "ministral-3b" });
186
-
187
- const handleFileSelect = (e: React.ChangeEvent<HTMLInputElement>) => {
188
- const file = e.target.files?.[0];
189
- if (file) {
190
- const reader = new FileReader();
191
- reader.onload = () => attachImage(reader.result as string);
192
- reader.readAsDataURL(file);
193
- }
194
- };
59
+ if (isLoading) return <div>Loading vision model…</div>;
195
60
 
196
61
  return (
197
62
  <div>
198
- {/* Messages */}
199
- {messages.map(m => (
200
- <div key={m.id}>
201
- {m.images?.map((img, i) => (
202
- <img key={i} src={img} alt="" className="max-w-xs" />
203
- ))}
204
- <p>{m.content}</p>
205
- </div>
206
- ))}
207
-
208
- {/* Image attachment */}
209
- <input type="file" accept="image/*" onChange={handleFileSelect} />
210
-
211
- {attachedImages.length > 0 && (
212
- <div>
213
- 📎 {attachedImages.length} image(s) attached
214
- <button onClick={clearImages}>Clear</button>
215
- </div>
216
- )}
217
-
218
- {/* Input */}
219
- <form onSubmit={handleSubmit}>
220
- <input
221
- value={input}
222
- onChange={e => setInput(e.target.value)}
223
- placeholder="Describe the image..."
224
- />
225
- <button type="submit">Send</button>
226
- </form>
63
+ <button
64
+ disabled={isGenerating}
65
+ onClick={() =>
66
+ describeImage("https://example.com/photo.jpg", "Describe this image in detail")
67
+ }
68
+ >
69
+ Describe
70
+ </button>
71
+ <p>{completion}</p>
227
72
  </div>
228
73
  );
229
74
  }
230
75
  ```
231
76
 
232
- ### Direct Image Send
77
+ `describeImage` accepts either an image source string (data URL or http(s) URL) or
78
+ pre-decoded `{ pixels, width, height }`. The hook caps the longest side at ~448px before
79
+ decoding — ViT attention memory scales with patch-count², so this keeps memory use manageable on mobile.
233
80
 
234
- ```tsx
235
- // Send a message with specific images
236
- sendWithImages("Compare these two photos", [image1DataUri, image2DataUri]);
237
- ```
81
+ ## Supported Towers
238
82
 
239
- ---
83
+ | Tower | Load from | dtype | Notes |
84
+ |-------|-----------|-------|-------|
85
+ | Qwen3.5 ViT | `Qwen/Qwen3.5-0.8B` (`enableVision: true`) | BF16 (ships ViT) | Bit-exact vs HF (per-token cosine 1.000000) |
86
+ | Gemma 4 ViT | `mlx-community/gemma-4-e2b-it-4bit` (`enableVision: true`) | MLX 4-bit | Native projector + multimodal-embedder norms |
240
87
 
241
- ## Built-in Vision Skills
88
+ > The MLX 4-bit Qwen3.5 repo (`mlx-community/Qwen3.5-0.8B-4bit`) is text-only — load the
89
+ > BF16 `Qwen/Qwen3.5-0.8B` repo to get the ViT weights for vision.
242
90
 
243
- Gerbil includes pre-built skills for common vision tasks:
91
+ ## How it works
244
92
 
245
- ### Describe Image
93
+ `describeImage` runs the full image-in → text-out pipeline natively:
246
94
 
247
- ```typescript
248
- import { describeImage } from "@tryhamster/gerbil/skills";
95
+ 1. **Preprocess** — pixels are smart-resized, normalized, and patchified to match the HF
96
+ image processor (`preprocessImage` / `preprocessImageGemma4`).
97
+ 2. **Encode** — the ViT turns patches into merged image-embedding tokens (`encodeImage`).
98
+ 3. **Splice** — image tokens are scattered into the `image_token_id` rows of the text
99
+ sequence (`EmbedSplice` in the multimodal graph).
100
+ 4. **Decode** — Qwen3.5 uses multimodal M-RoPE positions; Gemma 4 uses standard sequential
101
+ 1D RoPE. The LM then generates the description.
249
102
 
250
- const description = await describeImage({
251
- image: "https://example.com/photo.jpg",
252
- focus: "details", // "general" | "details" | "text" | "objects" | "scene"
253
- format: "bullets", // "paragraph" | "bullets" | "structured"
254
- });
255
- ```
103
+ The low-level pieces are exposed if you need them: `engine.encodeImage(patches, gridTHW)`
104
+ returns the merged image tokens, and `engine.hasVision` reports whether the engine was
105
+ built with a ViT.
256
106
 
257
- ### Analyze Screenshot
107
+ ## Image input
258
108
 
259
109
  ```typescript
260
- import { analyzeScreenshot } from "@tryhamster/gerbil/skills";
110
+ // Pre-decoded RGB pixels (Node, or browser if you've already decoded)
111
+ await engine.describeImage({ pixels, width, height }, prompt);
261
112
 
262
- const analysis = await analyzeScreenshot({
263
- image: screenshotDataUri,
264
- type: "accessibility", // "ui-review" | "accessibility" | "suggestions" | "qa"
265
- });
113
+ // Already-built patch tensor + grid (skips host preprocessing — for reference parity)
114
+ await engine.describeImage({ patches, gridTHW: [1, gridH, gridW] }, prompt);
266
115
  ```
267
116
 
268
- ### Extract from Image
269
-
270
- ```typescript
271
- import { extractFromImage } from "@tryhamster/gerbil/skills";
117
+ In React, the hook also accepts a source string:
272
118
 
273
- const extracted = await extractFromImage({
274
- image: documentPhoto,
275
- extract: "text", // "text" | "data" | "code" | "table" | "diagram"
276
- outputFormat: "markdown", // "raw" | "json" | "markdown"
277
- });
119
+ ```tsx
120
+ describeImage("data:image/png;base64,iVBOR…", "What is this?");
121
+ describeImage("https://example.com/photo.jpg", "Describe it"); // must be CORS-accessible
278
122
  ```
279
123
 
280
- ### Compare Images
124
+ ## Options
281
125
 
282
- ```typescript
283
- import { compareImages } from "@tryhamster/gerbil/skills";
284
-
285
- const comparison = await compareImages({
286
- image1: beforeScreenshot,
287
- image2: afterScreenshot,
288
- focus: "differences", // "differences" | "similarities" | "detailed"
289
- });
290
- ```
291
-
292
- ### Caption Image
126
+ `describeImage(image, prompt?, options?)` takes the standard generation options:
293
127
 
294
128
  ```typescript
295
- import { captionImage } from "@tryhamster/gerbil/skills";
296
-
297
- const caption = await captionImage({
298
- image: photo,
299
- style: "descriptive", // "concise" | "descriptive" | "creative" | "funny"
129
+ await engine.describeImage(image, "Describe this image.", {
130
+ maxTokens: 150,
131
+ sampling: { temperature: 0.7 },
132
+ stopSequences: ["\n\n"],
133
+ onToken: (t) => process.stdout.write(t),
300
134
  });
301
135
  ```
302
136
 
303
- ---
137
+ Returns a `GenerateResult`: `{ text, tokensGenerated, tokensPerSecond, totalTime, finishReason }`.
304
138
 
305
139
  ## Performance Tips
306
140
 
307
- ### WebGPU Acceleration
308
-
309
- Vision models benefit significantly from GPU acceleration:
310
-
311
- ```typescript
312
- // Node.js: Uses Chrome backend for WebGPU
313
- await g.loadModel("ministral-3b"); // Auto-detects WebGPU
314
-
315
- // Browser: Native WebGPU
316
- await g.loadModel("ministral-3b", { device: "webgpu" });
317
- ```
318
-
319
- ### Image Size
320
-
321
- - Larger images take longer to process
322
- - Consider resizing before sending to the model
323
- - 512x512 to 1024x1024 is generally optimal
324
-
325
- ### Caching
326
-
327
- The model caches in the browser's IndexedDB (via Chrome backend in Node.js), so subsequent loads are fast.
328
-
329
- ---
141
+ - **Resize before sending.** ViT attention cost grows with the square of the patch count.
142
+ The React hook already caps the longest side at 448px; in Node, downscale large photos
143
+ (≈448–512px longest side) before decoding to pixels.
144
+ - **Mobile.** On iOS/iPadOS the engine reserves fewer vision patches by default to stay
145
+ under the WebKit GPU watchdog and memory budget; very large images may still need a
146
+ smaller `maxVisionPatches` (a `WebGPUEngine.create` option).
330
147
 
331
148
  ## Troubleshooting
332
149
 
333
- ### "Model doesn't support vision"
334
-
335
- Make sure you're using a vision-capable model like `ministral-3b`.
336
-
337
- ### Slow image processing
338
-
339
- - Ensure WebGPU is being used (check `g.getDeviceMode()`)
340
- - Resize large images before sending
341
- - In Node.js, the Chrome backend provides GPU acceleration
342
-
343
- ### Image not loading
344
-
345
- - Check the URL is accessible
346
- - For local files, ensure the path is absolute
347
- - Base64 data URIs must include the mime type prefix
348
-
349
- ---
350
-
351
- ## API Reference
352
-
353
- ### ImageInput
354
-
355
- ```typescript
356
- interface ImageInput {
357
- /** Image source: URL, base64 data URI, or local file path */
358
- source: string;
359
- /** Optional alt text for context */
360
- alt?: string;
361
- }
362
- ```
363
-
364
- ### GenerateOptions (with images)
150
+ ### "describeImage() requires a vision encoder"
365
151
 
366
- ```typescript
367
- interface GenerateOptions {
368
- // ... standard options ...
369
-
370
- /** Images to include (only used if model supports vision) */
371
- images?: ImageInput[];
372
- }
373
- ```
152
+ Load with `{ enableVision: true }` on a vision-capable checkpoint (Qwen3.5 BF16 or Gemma 4).
153
+ The text-only MLX-4bit Qwen3.5 repo does not ship the ViT weights.
374
154
 
375
- ### supportsVision()
376
-
377
- ```typescript
378
- g.supportsVision(): boolean
379
- ```
380
-
381
- Returns `true` if the loaded model supports vision input.
382
-
383
- ### ModelConfig
384
-
385
- ```typescript
386
- interface ModelConfig {
387
- // ... standard properties ...
388
-
389
- /** Whether model supports vision/image input */
390
- supportsVision?: boolean;
391
-
392
- /** Size of vision encoder (if applicable) */
393
- visionEncoderSize?: string;
394
- }
395
- ```
155
+ ### Out of memory on mobile
396
156
 
157
+ Use a smaller image and/or lower `maxVisionPatches` at create time. WebKit kills the page
158
+ content process around 1.5–2 GB.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tryhamster/gerbil",
3
- "version": "1.0.0-rc.9",
3
+ "version": "1.0.0",
4
4
  "description": "Local LLM inference for Node.js. GPU-accelerated. Zero config. Works standalone or with Vercel AI SDK.",
5
5
  "type": "module",
6
6
  "main": "dist/index.mjs",
@@ -17,6 +17,10 @@
17
17
  "import": "./dist/skills/index.mjs",
18
18
  "types": "./dist/skills/index.d.mts"
19
19
  },
20
+ "./memory": {
21
+ "import": "./dist/memory/index.mjs",
22
+ "types": "./dist/memory/index.d.mts"
23
+ },
20
24
  "./ai": {
21
25
  "import": "./dist/integrations/ai-sdk.mjs",
22
26
  "types": "./dist/integrations/ai-sdk.d.mts"
@@ -64,6 +68,18 @@
64
68
  "./browser": {
65
69
  "import": "./dist/browser/index.js",
66
70
  "types": "./dist/browser/index.d.ts"
71
+ },
72
+ "./gpu": {
73
+ "import": "./dist/gpu/index.mjs",
74
+ "types": "./dist/gpu/index.d.mts"
75
+ },
76
+ "./hooks": {
77
+ "import": "./dist/gpu/hooks.mjs",
78
+ "types": "./dist/gpu/hooks.d.mts"
79
+ },
80
+ "./gpu/hooks": {
81
+ "import": "./dist/gpu/hooks.mjs",
82
+ "types": "./dist/gpu/hooks.d.mts"
67
83
  }
68
84
  },
69
85
  "dependencies": {
@@ -72,10 +88,10 @@
72
88
  "cli-progress": "^3.12.0",
73
89
  "commander": "^12.1.0",
74
90
  "ora": "^8.0.1",
75
- "puppeteer-core": "^24.31.0",
91
+ "p-queue": "^9.0.1",
76
92
  "react": "^19.0.0",
77
- "webgpu": "^0.3.8",
78
- "zod": "^3.23.0"
93
+ "webgpu": "^0.4.0",
94
+ "zod": "^4.3.6"
79
95
  },
80
96
  "peerDependencies": {
81
97
  "@ai-sdk/provider": ">=2.0.0",
@@ -127,9 +143,6 @@
127
143
  "devDependencies": {
128
144
  "@ai-sdk/provider": "^2.0.0",
129
145
  "@biomejs/biome": "^2.3.8",
130
- "@huggingface/transformers": "^3.8.0",
131
- "kokoro-js": "^1.2.1",
132
- "onnxruntime-web": "^1.21.0-dev.20250114-228dd16893",
133
146
  "@changesets/changelog-github": "^0.5.1",
134
147
  "@changesets/cli": "^2.28.1",
135
148
  "@types/cli-progress": "^3.11.6",
@@ -137,7 +150,9 @@
137
150
  "@types/ink-big-text": "^1.2.4",
138
151
  "@types/node": "^20.14.0",
139
152
  "@types/react": "^19.0.0",
153
+ "@webgpu/types": "^0.1.69",
140
154
  "ai": "^5.0.0",
155
+ "esbuild": "^0.27.3",
141
156
  "express": "^4.19.0",
142
157
  "ink": "^6.5.1",
143
158
  "ink-big-text": "^2.0.0",