@tryhamster/gerbil 1.0.0-rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +247 -84
  3. package/dist/architectures-C1I5V3Dt.mjs +6070 -0
  4. package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
  5. package/dist/browser/index.d.ts +264 -588
  6. package/dist/browser/index.d.ts.map +1 -1
  7. package/dist/browser/index.js +585 -2334
  8. package/dist/browser/index.js.map +1 -1
  9. package/dist/cli.mjs +625 -1098
  10. package/dist/cli.mjs.map +1 -1
  11. package/dist/defaults-9komdrbY.mjs +24 -0
  12. package/dist/defaults-9komdrbY.mjs.map +1 -0
  13. package/dist/frameworks/express.d.mts +1 -3
  14. package/dist/frameworks/express.d.mts.map +1 -1
  15. package/dist/frameworks/express.mjs +7 -7
  16. package/dist/frameworks/express.mjs.map +1 -1
  17. package/dist/frameworks/fastify.d.mts +1 -1
  18. package/dist/frameworks/fastify.d.mts.map +1 -1
  19. package/dist/frameworks/fastify.mjs +3 -3
  20. package/dist/frameworks/fastify.mjs.map +1 -1
  21. package/dist/frameworks/hono.d.mts +1 -1
  22. package/dist/frameworks/hono.d.mts.map +1 -1
  23. package/dist/frameworks/hono.mjs +4 -4
  24. package/dist/frameworks/hono.mjs.map +1 -1
  25. package/dist/frameworks/next.d.mts +3 -2
  26. package/dist/frameworks/next.d.mts.map +1 -1
  27. package/dist/frameworks/next.mjs +4 -4
  28. package/dist/frameworks/next.mjs.map +1 -1
  29. package/dist/frameworks/react.d.mts +1 -1
  30. package/dist/frameworks/trpc.d.mts +1 -1
  31. package/dist/frameworks/trpc.d.mts.map +1 -1
  32. package/dist/frameworks/trpc.mjs +4 -4
  33. package/dist/frameworks/trpc.mjs.map +1 -1
  34. package/dist/gerbil-BHrJJIa4.mjs +1656 -0
  35. package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
  36. package/dist/gerbil-BT9fCydo.d.mts +488 -0
  37. package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
  38. package/dist/gerbil-DomNfIr1.mjs +4 -0
  39. package/dist/gpu/hooks.d.mts +520 -0
  40. package/dist/gpu/hooks.d.mts.map +1 -0
  41. package/dist/gpu/hooks.mjs +1188 -0
  42. package/dist/gpu/hooks.mjs.map +1 -0
  43. package/dist/gpu/index.d.mts +2 -0
  44. package/dist/gpu/index.mjs +6 -0
  45. package/dist/gpu-33qCAtHW.mjs +3615 -0
  46. package/dist/gpu-33qCAtHW.mjs.map +1 -0
  47. package/dist/index-Dgmb2kE3.d.mts +245 -0
  48. package/dist/index-Dgmb2kE3.d.mts.map +1 -0
  49. package/dist/index-jEAL2s-A.d.mts +2022 -0
  50. package/dist/index-jEAL2s-A.d.mts.map +1 -0
  51. package/dist/index.d.mts +22 -487
  52. package/dist/index.d.mts.map +1 -1
  53. package/dist/index.mjs +13 -8
  54. package/dist/index.mjs.map +1 -1
  55. package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
  56. package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
  57. package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
  58. package/dist/integrations/ai-sdk.d.mts +75 -6
  59. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  60. package/dist/integrations/ai-sdk.mjs +131 -15
  61. package/dist/integrations/ai-sdk.mjs.map +1 -1
  62. package/dist/integrations/langchain.d.mts +1 -1
  63. package/dist/integrations/langchain.d.mts.map +1 -1
  64. package/dist/integrations/langchain.mjs +5 -5
  65. package/dist/integrations/langchain.mjs.map +1 -1
  66. package/dist/integrations/llamaindex.d.mts +1 -1
  67. package/dist/integrations/llamaindex.d.mts.map +1 -1
  68. package/dist/integrations/llamaindex.mjs +5 -5
  69. package/dist/integrations/llamaindex.mjs.map +1 -1
  70. package/dist/integrations/mcp-client.mjs +3 -3
  71. package/dist/integrations/mcp-client.mjs.map +1 -1
  72. package/dist/integrations/mcp.d.mts +3 -2
  73. package/dist/integrations/mcp.d.mts.map +1 -1
  74. package/dist/integrations/mcp.mjs +5 -5
  75. package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
  76. package/dist/mcp-1DaMsaBc.mjs.map +1 -0
  77. package/dist/memory/index.d.mts +3 -0
  78. package/dist/memory/index.mjs +6 -0
  79. package/dist/memory-D1P7Tmda.mjs +4 -0
  80. package/dist/memory-DVN0MnIG.mjs +132 -0
  81. package/dist/memory-DVN0MnIG.mjs.map +1 -0
  82. package/dist/memory-Dj0J1v88.mjs +294 -0
  83. package/dist/memory-Dj0J1v88.mjs.map +1 -0
  84. package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
  85. package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
  86. package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
  87. package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
  88. package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
  89. package/dist/repl-jV5gcJFA.mjs +9 -0
  90. package/dist/skills/index.d.mts +270 -320
  91. package/dist/skills/index.d.mts.map +1 -1
  92. package/dist/skills/index.mjs +5 -5
  93. package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
  94. package/dist/skills-DX8D59UH.mjs.map +1 -0
  95. package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
  96. package/dist/tools-DQ1mPUw5.mjs.map +1 -0
  97. package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
  98. package/dist/types-D6FiR_oh.d.mts.map +1 -0
  99. package/dist/types-DQBe2lFo.d.mts +165 -0
  100. package/dist/types-DQBe2lFo.d.mts.map +1 -0
  101. package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
  102. package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
  103. package/dist/vector-B0panuy6.mjs +95 -0
  104. package/dist/vector-B0panuy6.mjs.map +1 -0
  105. package/docs/PROJECT-STATE.md +321 -0
  106. package/docs/adding-a-model-family.md +280 -0
  107. package/docs/ai-sdk.md +70 -61
  108. package/docs/architecture/overview.md +17 -7
  109. package/docs/browser.md +203 -8
  110. package/docs/embeddings.md +156 -0
  111. package/docs/gerbil-site-native-migration.md +217 -0
  112. package/docs/gpu-engine/architectures.md +398 -0
  113. package/docs/gpu-engine/ir.md +372 -0
  114. package/docs/gpu-engine/kernels.md +718 -0
  115. package/docs/gpu-engine/paper.html +1759 -0
  116. package/docs/gpu-engine/paper.md +2109 -0
  117. package/docs/gpu-engine/safetensors.md +312 -0
  118. package/docs/gpu-engine/tokenizer.md +302 -0
  119. package/docs/memory-rag.md +91 -0
  120. package/docs/metal-safari-intel.md +190 -0
  121. package/docs/mobile-failure-diagnosis.md +124 -0
  122. package/docs/mobile.md +99 -0
  123. package/docs/observability.md +230 -0
  124. package/docs/onnx-removal-plan.md +339 -0
  125. package/docs/research/autoresearch-portable.md +904 -0
  126. package/docs/research/dispatch-reduction-hivemind.md +84 -0
  127. package/docs/research/ios-safari-model-caching.md +117 -0
  128. package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
  129. package/docs/research/native-stt-model-selection.md +49 -0
  130. package/docs/research/native-tts-model-selection.md +90 -0
  131. package/docs/research/native-vs-chromium-decision.md +152 -0
  132. package/docs/research/nemotron-mamba2-inference.md +910 -0
  133. package/docs/research/qwen35-multimodal.md +293 -0
  134. package/docs/research/qwen36-gemma4-targets.md +337 -0
  135. package/docs/research/sota-embedding-models.md +179 -0
  136. package/docs/research/sota-mobile-models-2026.md +263 -0
  137. package/docs/research/sota-modality-models.md +202 -0
  138. package/docs/research/tps-baselines.md +71 -0
  139. package/docs/research/webgpu-m4-reference.md +104 -0
  140. package/docs/site-update-plan.md +155 -0
  141. package/docs/structured-output.md +123 -0
  142. package/docs/stt.md +63 -446
  143. package/docs/tts.md +77 -499
  144. package/docs/vision.md +100 -338
  145. package/package.json +22 -7
  146. package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
  147. package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
  148. package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
  149. package/dist/gerbil-CJ3ifloF.mjs +0 -4
  150. package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
  151. package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
  152. package/dist/gerbil-qOTe1nl2.d.mts +0 -431
  153. package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
  154. package/dist/kokoro-BNTb6egA.mjs +0 -20210
  155. package/dist/kokoro-BNTb6egA.mjs.map +0 -1
  156. package/dist/kokoro-CMOGDSgT.js +0 -20212
  157. package/dist/kokoro-CMOGDSgT.js.map +0 -1
  158. package/dist/mcp-BvbriaBy.mjs.map +0 -1
  159. package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
  160. package/dist/repl-DveXw36T.mjs +0 -9
  161. package/dist/skills-CD3Orlex.mjs.map +0 -1
  162. package/dist/stt-Bu-E23Sc.js +0 -433
  163. package/dist/stt-Bu-E23Sc.js.map +0 -1
  164. package/dist/stt-CpLYbGFd.mjs +0 -433
  165. package/dist/stt-CpLYbGFd.mjs.map +0 -1
  166. package/dist/stt-DRPLEEHB.mjs +0 -3
  167. package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
  168. package/dist/transformers.web-DiD1gTwk.js +0 -44695
  169. package/dist/transformers.web-DiD1gTwk.js.map +0 -1
  170. package/dist/transformers.web-u34VxRFM.js +0 -3
  171. package/dist/tts-CqroPaSK.js +0 -724
  172. package/dist/tts-CqroPaSK.js.map +0 -1
  173. package/dist/tts-DXgsKGCe.mjs +0 -3
  174. package/dist/tts-DeGANMNV.mjs +0 -730
  175. package/dist/tts-DeGANMNV.mjs.map +0 -1
  176. package/dist/types-CiTc7ez3.d.mts.map +0 -1
  177. /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
  178. /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
  179. /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
@@ -0,0 +1,1656 @@
1
+ import { n as zodToJsonSchema, t as extractJson } from "./utils-DKO55ZmZ.mjs";
2
+ import { existsSync } from "node:fs";
3
+ import os from "node:os";
4
+ import path from "node:path";
5
+ import zlib from "node:zlib";
6
+ import PQueue from "p-queue";
7
+
8
+ //#region src/core/cache.ts
9
/**
 * Generate a deterministic cache key from prompt and options.
 * Key includes all parameters that affect the output; any option that is
 * missing falls back to the same default the key has always used, so
 * `{}` and an explicit set of defaults map to the same key.
 *
 * @param prompt  Prompt text.
 * @param modelId Model identifier the prompt is run against.
 * @param options Generation options (maxTokens, temperature, topP, topK,
 *                system, thinking). May be omitted or null.
 * @returns A key of the form `gerbil:<hex>` (the hex part may be negative).
 */
function generateCacheKey(prompt, modelId, options = {}) {
	// Callers that only have a prompt may pass nothing (or null) for options.
	const opts = options ?? {};
	const keyParts = [
		prompt,
		modelId,
		opts.maxTokens ?? 256,
		opts.temperature ?? .7,
		opts.topP ?? .9,
		opts.topK ?? 50,
		opts.system ?? "",
		opts.thinking ?? false
	];
	const str = JSON.stringify(keyParts);
	// 31-multiplier string hash, truncated to a signed 32-bit integer each step.
	let hash = 0;
	for (let i = 0; i < str.length; i++) {
		hash = (hash << 5) - hash + str.charCodeAt(i);
		hash |= 0;
	}
	return `gerbil:${hash.toString(16)}`;
}
33
/**
 * LRU cache with TTL expiration for inference responses.
 *
 * Recency is tracked through Map insertion order: the first key in the map is
 * the least recently used entry and is the one evicted when the cache is full.
 */
var ResponseCache = class {
	cache = /* @__PURE__ */ new Map();
	maxSize;
	defaultTtl;
	hits = 0;
	misses = 0;
	/**
	 * Create a new response cache.
	 * @param maxSize Maximum number of entries (default: 100). A value <= 0
	 *                disables storing entirely.
	 * @param defaultTtl Default TTL in ms (default: 5 minutes)
	 */
	constructor(maxSize = 100, defaultTtl = 300 * 1e3) {
		this.maxSize = maxSize;
		this.defaultTtl = defaultTtl;
	}
	/**
	 * Get a cached response if it exists and hasn't expired.
	 * A hit refreshes the entry's recency and returns a copy of the stored
	 * result with `cached: true`; a miss (or expired entry) returns null.
	 */
	get(key) {
		const entry = this.cache.get(key);
		if (!entry) {
			this.misses++;
			return null;
		}
		if (Date.now() - entry.createdAt > entry.ttl) {
			this.cache.delete(key);
			this.misses++;
			return null;
		}
		// Re-insert so the entry moves to the most-recently-used position.
		this.cache.delete(key);
		this.cache.set(key, entry);
		this.hits++;
		return {
			...entry.result,
			cached: true
		};
	}
	/**
	 * Store a response in the cache, evicting least recently used entries
	 * until there is room.
	 * @param key Cache key.
	 * @param result Response object to store.
	 * @param ttl Optional per-entry TTL in ms; defaults to `defaultTtl`.
	 */
	set(key, result, ttl) {
		// With no capacity there is nothing to evict and nowhere to store;
		// looping on `size >= maxSize` here would never terminate.
		if (this.maxSize <= 0) return;
		// Overwriting must not count the old entry against capacity (which would
		// evict an unrelated key) and should refresh the entry's recency.
		this.cache.delete(key);
		while (this.cache.size >= this.maxSize) {
			const oldest = this.cache.keys().next();
			// Check `done` rather than truthiness: "" and 0 are valid keys.
			if (oldest.done) break;
			this.cache.delete(oldest.value);
		}
		this.cache.set(key, {
			result,
			createdAt: Date.now(),
			ttl: ttl ?? this.defaultTtl
		});
	}
	/**
	 * Check if a key exists and is not expired. Expired entries are removed.
	 * Does not affect hit/miss counters or recency.
	 */
	has(key) {
		const entry = this.cache.get(key);
		if (!entry) return false;
		if (Date.now() - entry.createdAt > entry.ttl) {
			this.cache.delete(key);
			return false;
		}
		return true;
	}
	/**
	 * Remove a specific key from the cache.
	 * @returns true if an entry was removed.
	 */
	delete(key) {
		return this.cache.delete(key);
	}
	/**
	 * Clear all entries from the cache and reset the hit/miss counters.
	 */
	clear() {
		this.cache.clear();
		this.hits = 0;
		this.misses = 0;
	}
	/**
	 * Remove all expired entries.
	 * @returns Number of entries removed.
	 */
	prune() {
		const now = Date.now();
		let pruned = 0;
		for (const [key, entry] of this.cache) if (now - entry.createdAt > entry.ttl) {
			this.cache.delete(key);
			pruned++;
		}
		return pruned;
	}
	/**
	 * Get cache statistics.
	 */
	getStats() {
		return {
			hits: this.hits,
			misses: this.misses,
			size: this.cache.size,
			maxSize: this.maxSize
		};
	}
	/**
	 * Get hit rate as a percentage (0 when nothing has been looked up yet).
	 */
	getHitRate() {
		const total = this.hits + this.misses;
		if (total === 0) return 0;
		return this.hits / total * 100;
	}
};
145
// Process-wide cache instance; stays null until first requested or configured.
let globalCache = null;
/**
 * Return the shared response cache, lazily creating it with default
 * settings on first use.
 */
function getGlobalCache() {
	globalCache ??= new ResponseCache();
	return globalCache;
}
/**
 * Replace the shared cache with a fresh one built from the given settings.
 * Entries held by the previous instance are not carried over.
 * @returns The newly created cache.
 */
function configureGlobalCache(maxSize, defaultTtl) {
	const replacement = new ResponseCache(maxSize, defaultTtl);
	globalCache = replacement;
	return replacement;
}
/**
 * Clear the shared cache if one has been created; otherwise do nothing.
 */
function clearGlobalCache() {
	globalCache?.clear();
}
167
+
168
+ //#endregion
169
+ //#region src/core/models.ts
170
+ /**
171
+ * The default model used everywhere a model id is not explicitly provided
172
+ * (CLI flags, REPL, framework adapters, integrations, one-liner). This is the
173
+ * e2e-validated model; reference this constant instead of hard-coding the id.
174
+ */
175
+ const DEFAULT_MODEL = "qwen3.5-0.8b";
176
+ const BUILTIN_MODELS = {
177
+ "qwen3.5-0.8b": {
178
+ id: "qwen3.5-0.8b",
179
+ repo: "Qwen/Qwen3.5-0.8B",
180
+ description: "Qwen3.5 0.8B - Fast, multimodal (vision), 262k context, supports thinking (default)",
181
+ size: "~1.6GB",
182
+ contextLength: 262144,
183
+ supportsThinking: true,
184
+ supportsJson: true,
185
+ supportsVision: true,
186
+ family: "qwen"
187
+ },
188
+ "qwen3.5-2b": {
189
+ id: "qwen3.5-2b",
190
+ repo: "Qwen/Qwen3.5-2B",
191
+ description: "Qwen3.5 2B - Higher quality, multimodal (vision), 262k context, supports thinking",
192
+ size: "~4GB",
193
+ contextLength: 262144,
194
+ supportsThinking: true,
195
+ supportsJson: true,
196
+ supportsVision: true,
197
+ family: "qwen"
198
+ },
199
+ "lfm2.5-1.2b-thinking": {
200
+ id: "lfm2.5-1.2b-thinking",
201
+ repo: "LiquidAI/LFM2.5-1.2B-Thinking",
202
+ description: "LFM2.5 1.2B Thinking - Efficient reasoning model, 128k context",
203
+ size: "~2.4GB",
204
+ contextLength: 128e3,
205
+ supportsThinking: true,
206
+ supportsJson: false,
207
+ family: "other"
208
+ }
209
+ };
210
/**
 * Parse model identifier and resolve to source
 *
 * Supported formats:
 * - "qwen3.5-0.8b" (built-in)
 * - "hf:org/model" (HuggingFace shorthand)
 * - "https://huggingface.co/org/model" (full URL)
 * - "file:./path/to/model" (local path)
 *
 * Anything else (including a bare "org/model") is treated as a HuggingFace
 * repo path.
 *
 * @param modelId Model identifier string.
 * @returns `{ type, path }` where type is "builtin", "huggingface" or "local".
 */
function resolveModel(modelId) {
	// Own-property check: a plain `BUILTIN_MODELS[modelId]` lookup would also
	// match inherited keys such as "constructor" or "toString".
	if (Object.hasOwn(BUILTIN_MODELS, modelId)) return {
		type: "builtin",
		path: BUILTIN_MODELS[modelId].repo
	};
	if (modelId.startsWith("hf:")) return {
		type: "huggingface",
		path: modelId.slice(3)
	};
	const hubPrefix = "https://huggingface.co/";
	if (modelId.startsWith(hubPrefix)) return {
		type: "huggingface",
		path: modelId.slice(hubPrefix.length)
	};
	if (modelId.startsWith("file:")) return {
		type: "local",
		path: modelId.slice(5)
	};
	return {
		type: "huggingface",
		path: modelId
	};
}
245
/**
 * Get model config (built-in only)
 *
 * @param modelId Model identifier.
 * @returns The built-in config entry, or null when `modelId` is not a
 *          built-in model.
 */
function getModelConfig(modelId) {
	// Own-property check so ids like "toString" do not resolve to
	// Object.prototype members and get mistaken for a config.
	return Object.hasOwn(BUILTIN_MODELS, modelId) ? BUILTIN_MODELS[modelId] : null;
}
251
// Context length assumed per family when the caller supplies none.
const FAMILY_CONTEXT_DEFAULTS = {
	qwen: 32768,
	other: 32768
};
/**
 * Build a model config for an external (non-built-in) HuggingFace repo.
 *
 * The family is "qwen" when the repo name contains "qwen" and "other"
 * otherwise. Capability flags are derived from the repo name only:
 * thinking is advertised for Qwen and Liquid/LFM repos, JSON only for Qwen.
 *
 * @param modelId Identifier to record as the config id.
 * @param repo HuggingFace repo path.
 * @param contextLength Optional context length; when falsy the family
 *                      default (or 32768) is used.
 */
function createExternalModelConfig(modelId, repo, contextLength) {
	const needle = repo.toLowerCase();
	const mentions = (fragment) => needle.includes(fragment);
	const family = mentions("qwen") ? "qwen" : "other";
	const qwenFamily = family === "qwen";
	const liquidFamily = mentions("lfm") || mentions("liquid");
	const resolvedContext = contextLength || FAMILY_CONTEXT_DEFAULTS[family] || 32768;
	return {
		id: modelId,
		repo,
		description: `External model: ${repo}`,
		size: "Unknown",
		contextLength: resolvedContext,
		supportsThinking: qwenFamily || liquidFamily,
		supportsJson: qwenFamily,
		family
	};
}
280
/**
 * Fetch context length from HuggingFace model config
 *
 * Reads `config.json` from the repo's `main` branch and returns the first
 * usable context-length field. Top-level fields are checked first, then the
 * nested `text_config` object.
 *
 * @param repo HuggingFace repo path ("org/model").
 * @returns A positive number, or null when the request fails, the response
 *          is not OK, or no usable field is present. Never throws.
 */
async function fetchModelContextLength(repo) {
	const candidateKeys = [
		"max_position_embeddings",
		"n_positions",
		"max_seq_len",
		"sliding_window",
		"context_length"
	];
	try {
		const res = await fetch(`https://huggingface.co/${repo}/raw/main/config.json`);
		if (!res.ok) return null;
		const config = await res.json();
		// NOTE(review): multimodal checkpoints are assumed to nest the language
		// model settings under `text_config` — confirm against the target repos.
		for (const section of [config, config?.text_config]) {
			if (!section || typeof section !== "object") continue;
			for (const key of candidateKeys) {
				const value = section[key];
				// Only accept real positive numbers; skip null/strings/objects.
				if (Number.isFinite(value) && value > 0) return value;
			}
		}
		return null;
	} catch {
		// Best-effort probe: callers fall back to a family default on null.
		return null;
	}
}
293
/**
 * Return the config objects of every built-in model, in registry order.
 */
function listBuiltinModels() {
	return Object.keys(BUILTIN_MODELS).map((id) => BUILTIN_MODELS[id]);
}
299
+
300
+ //#endregion
301
+ //#region src/core/gerbil.ts
302
+ /**
303
+ * Gerbil - Local GPU-accelerated LLM inference
304
+ */
305
/**
 * Minimal PNG decoder: 8-bit, non-interlaced, color type 2 (RGB) or 6 (RGBA).
 * Returns packed RGB pixels for the native vision encoder (alpha is dropped).
 * Replaces the transformers.js RawImage decoder for the common PNG case.
 *
 * Chunk CRCs are not verified.
 *
 * @param buf PNG file contents (Buffer / Uint8Array).
 * @returns `{ pixels, width, height }` where `pixels` is a Uint8Array of
 *          width * height * 3 bytes in row-major RGB order.
 * @throws Error when the data is not a PNG, uses an unsupported format
 *         (bit depth, color type, interlacing), has no IDAT chunk, has too
 *         little scanline data, or uses an unknown filter type.
 */
function decodePng(buf) {
	const SIGNATURE = [137, 80, 78, 71, 13, 10, 26, 10];
	// 8-byte signature + 25-byte IHDR chunk (4 len + 4 type + 13 data + 4 crc).
	const HEADER_END = 33;
	if (buf.length < HEADER_END || SIGNATURE.some((byte, i) => buf[i] !== byte)) throw new Error("Not a PNG: missing signature or truncated IHDR");
	const view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength);
	const w = view.getUint32(16);
	const h = view.getUint32(20);
	const bitDepth = buf[24];
	const colorType = buf[25];
	if (bitDepth !== 8 || colorType !== 2 && colorType !== 6) throw new Error(`Unsupported PNG (bitDepth=${bitDepth} colorType=${colorType}); need 8-bit RGB/RGBA`);
	// IHDR byte 12 (file offset 28) is the interlace method; Adam7 data laid
	// out as plain scanlines would decode to garbage, so reject it up front.
	if (buf[28] !== 0) throw new Error("Unsupported PNG (interlaced); need non-interlaced 8-bit RGB/RGBA");
	const channels = colorType === 6 ? 4 : 3;
	// Walk the chunk list, collecting every IDAT payload until IEND.
	const idat = [];
	let off = 8;
	while (off + 8 <= buf.length) {
		const len = view.getUint32(off);
		const type = String.fromCharCode(buf[off + 4], buf[off + 5], buf[off + 6], buf[off + 7]);
		if (type === "IDAT") idat.push(buf.subarray(off + 8, off + 8 + len));
		if (type === "IEND") break;
		off += 12 + len;
	}
	if (idat.length === 0) throw new Error("Invalid PNG: no IDAT chunk found");
	const raw = zlib.inflateSync(Buffer.concat(idat));
	const stride = w * channels;
	// Each scanline is one filter-type byte followed by `stride` data bytes.
	const expectedBytes = h * (stride + 1);
	if (raw.length < expectedBytes) throw new Error(`Invalid PNG: expected ${expectedBytes} bytes of scanline data, got ${raw.length}`);
	const out = new Uint8Array(w * h * 3);
	const line = new Uint8Array(stride);
	const prev = new Uint8Array(stride); // previous reconstructed row (zeros for row 0)
	let p = 0;
	for (let y = 0; y < h; y += 1) {
		const filter = raw[p];
		p += 1;
		for (let i = 0; i < stride; i += 1) {
			const x = raw[p];
			p += 1;
			const a = i >= channels ? line[i - channels] : 0; // left
			const b = prev[i]; // above
			const c = i >= channels ? prev[i - channels] : 0; // upper-left
			let v;
			switch (filter) {
				case 0:
					v = x;
					break;
				case 1:
					v = x + a;
					break;
				case 2:
					v = x + b;
					break;
				case 3:
					v = x + (a + b >> 1);
					break;
				case 4: {
					// Paeth predictor: pick whichever neighbour is closest to a + b - c.
					const pp = a + b - c;
					const pa = Math.abs(pp - a);
					const pb = Math.abs(pp - b);
					const pc = Math.abs(pp - c);
					let pred = c;
					if (pa <= pb && pa <= pc) pred = a;
					else if (pb <= pc) pred = b;
					v = x + pred;
					break;
				}
				default: throw new Error(`bad PNG filter ${filter}`);
			}
			line[i] = v & 255;
		}
		// Copy the RGB channels of the reconstructed row, skipping alpha if present.
		for (let x = 0; x < w; x += 1) {
			const dst = (y * w + x) * 3;
			const src = x * channels;
			out[dst] = line[src];
			out[dst + 1] = line[src + 1];
			out[dst + 2] = line[src + 2];
		}
		prev.set(line);
	}
	return {
		pixels: out,
		width: w,
		height: h
	};
}
384
+ const KOKORO_VOICES_DEFAULT = [
385
+ {
386
+ id: "af_bella",
387
+ name: "Bella",
388
+ gender: "female",
389
+ language: "en-us",
390
+ description: "American female, warm and friendly"
391
+ },
392
+ {
393
+ id: "af_sarah",
394
+ name: "Sarah",
395
+ gender: "female",
396
+ language: "en-us",
397
+ description: "American female, clear and professional"
398
+ },
399
+ {
400
+ id: "af_nicole",
401
+ name: "Nicole",
402
+ gender: "female",
403
+ language: "en-us",
404
+ description: "American female, soft and gentle"
405
+ },
406
+ {
407
+ id: "af_sky",
408
+ name: "Sky",
409
+ gender: "female",
410
+ language: "en-us",
411
+ description: "American female, young and energetic"
412
+ },
413
+ {
414
+ id: "am_adam",
415
+ name: "Adam",
416
+ gender: "male",
417
+ language: "en-us",
418
+ description: "American male, deep and confident"
419
+ },
420
+ {
421
+ id: "am_michael",
422
+ name: "Michael",
423
+ gender: "male",
424
+ language: "en-us",
425
+ description: "American male, warm and friendly"
426
+ },
427
+ {
428
+ id: "bf_emma",
429
+ name: "Emma",
430
+ gender: "female",
431
+ language: "en-gb",
432
+ description: "British female, elegant and clear"
433
+ },
434
+ {
435
+ id: "bf_isabella",
436
+ name: "Isabella",
437
+ gender: "female",
438
+ language: "en-gb",
439
+ description: "British female, sophisticated"
440
+ },
441
+ {
442
+ id: "bm_george",
443
+ name: "George",
444
+ gender: "male",
445
+ language: "en-gb",
446
+ description: "British male, distinguished"
447
+ },
448
+ {
449
+ id: "bm_lewis",
450
+ name: "Lewis",
451
+ gender: "male",
452
+ language: "en-gb",
453
+ description: "British male, friendly and warm"
454
+ }
455
+ ];
456
+ var Gerbil = class {
457
+ currentModel = null;
458
+ modelConfig = null;
459
+ config;
460
+ stats;
461
+ _deviceMode = "cpu";
462
+ webgpuEngine = null;
463
+ nativeEmbedEngine = null;
464
+ nativeEmbedRepo = null;
465
+ nativeSTT = null;
466
+ nativeTTSEngine = null;
467
+ isVisionModel = false;
468
+ queue;
469
+ telemetry;
470
+ constructor(config = {}) {
471
+ this.config = config;
472
+ this.stats = {
473
+ prompts: 0,
474
+ tokensIn: 0,
475
+ tokensOut: 0,
476
+ avgSpeed: 0,
477
+ totalTime: 0,
478
+ cacheHits: 0,
479
+ cacheMisses: 0
480
+ };
481
+ const concurrency = config.concurrency || {};
482
+ this.queue = new PQueue({
483
+ concurrency: concurrency.maxConcurrent ?? 1,
484
+ timeout: concurrency.timeout ?? 3e5
485
+ });
486
+ this.telemetry = config.telemetry || {};
487
+ }
488
+ reportError(error, context) {
489
+ try {
490
+ this.telemetry.onError?.(error, context);
491
+ } catch {}
492
+ }
493
+ /**
494
+ * Whether the native (src/gpu) WebGPU engine should be used for a capability
495
+ * (embed / transcribe / speak / vision). The native WebGPU engine is the only
496
+ * inference backend, so this is always true; kept as a seam for callers.
497
+ */
498
+ preferNative() {
499
+ return true;
500
+ }
501
+ static listModels() {
502
+ return Object.values(BUILTIN_MODELS);
503
+ }
504
+ static getModel(modelId) {
505
+ return BUILTIN_MODELS[modelId];
506
+ }
507
+ /**
508
+ * Load a model
509
+ *
510
+ * @example
511
+ * ```ts
512
+ * // Built-in model
513
+ * await g.loadModel("qwen3.5-0.8b");
514
+ *
515
+ * // HuggingFace model
516
+ * await g.loadModel("hf:microsoft/Phi-3-mini");
517
+ *
518
+ * // Local model
519
+ * await g.loadModel("file:./models/my-model");
520
+ *
521
+ * // Vision model
522
+ * await g.loadModel("ministral-3b");
523
+ * ```
524
+ */
525
+ async loadModel(modelId = DEFAULT_MODEL, options = {}) {
526
+ const loadStartTime = performance.now();
527
+ if (this.isLoaded()) await this.dispose();
528
+ const source = resolveModel(modelId);
529
+ const { onProgress, device = "auto", dtype: userDtype } = options;
530
+ let config = getModelConfig(modelId);
531
+ if (!config) {
532
+ const contextLength = await fetchModelContextLength(source.path).catch(() => null);
533
+ config = createExternalModelConfig(modelId, source.path, contextLength || void 0);
534
+ }
535
+ if (config.supportsVision) return this.loadVisionModel(modelId, source.path, config, options);
536
+ if (device === "cpu" || device === "gpu") throw new Error("Gerbil requires WebGPU. CPU/WASM and the legacy ONNX backend have been removed; use device \"webgpu\" or \"auto\".");
537
+ onProgress?.({ status: `Loading ${modelId}...` });
538
+ try {
539
+ onProgress?.({ status: "Initializing WebGPU engine..." });
540
+ const { WebGPUEngine } = await import("./gpu/index.mjs");
541
+ let hfRepo = source.path;
542
+ if (hfRepo.includes("onnx-community/") || hfRepo.includes("-ONNX")) hfRepo = {
543
+ "onnx-community/Qwen3.5-0.8B-ONNX": "Qwen/Qwen3.5-0.8B",
544
+ "onnx-community/Qwen3-0.6B-ONNX": "Qwen/Qwen3-0.6B",
545
+ "onnx-community/Qwen3-1.7B-ONNX": "Qwen/Qwen3-1.7B",
546
+ "onnx-community/Qwen3.5-2B-ONNX": "Qwen/Qwen3.5-2B"
547
+ }[hfRepo] || hfRepo;
548
+ const gpuDtype = userDtype === "q4" ? "q4" : void 0;
549
+ this.webgpuEngine = await WebGPUEngine.create({
550
+ repo: hfRepo,
551
+ maxSeqLen: options.contextLength ?? config.contextLength ?? 4096,
552
+ dtype: gpuDtype,
553
+ onProgress: (loaded, total, message) => {
554
+ onProgress?.({
555
+ status: message,
556
+ progress: total > 0 ? Math.round(loaded / total * 100) : void 0
557
+ });
558
+ }
559
+ });
560
+ this._deviceMode = "webgpu";
561
+ this.isVisionModel = false;
562
+ this.currentModel = modelId;
563
+ this.modelConfig = config;
564
+ onProgress?.({ status: "Ready (WebGPU Native)!" });
565
+ if (this.telemetry.onModelLoad) try {
566
+ this.telemetry.onModelLoad({
567
+ modelId,
568
+ loadTimeMs: performance.now() - loadStartTime,
569
+ fromCache: false,
570
+ device: this._deviceMode,
571
+ success: true
572
+ });
573
+ } catch {}
574
+ } catch (err) {
575
+ this.reportError(err instanceof Error ? err : new Error(String(err)), {
576
+ operation: "load",
577
+ modelId
578
+ });
579
+ if (this.telemetry.onModelLoad) try {
580
+ this.telemetry.onModelLoad({
581
+ modelId,
582
+ loadTimeMs: performance.now() - loadStartTime,
583
+ fromCache: false,
584
+ device: this._deviceMode,
585
+ success: false,
586
+ error: err instanceof Error ? err.message : String(err)
587
+ });
588
+ } catch {}
589
+ throw err;
590
+ }
591
+ }
592
+ /**
593
+ * Load a vision model (VLM) on the native WebGPU engine.
594
+ * The native engine loads the vision-capable safetensors checkpoint directly
595
+ * and builds its ViT tower on demand (enableVision: true). describeImage() then
596
+ * runs encode → splice → decode entirely in WebGPU compute.
597
+ */
598
+ async loadVisionModel(modelId, repoPath, config, options = {}) {
599
+ const { onProgress, device = "auto" } = options;
600
+ onProgress?.({ status: `Loading ${modelId} (vision model)...` });
601
+ if (device === "cpu" || device === "gpu") throw new Error("Gerbil vision models require WebGPU. CPU/WASM and the legacy ONNX backend have been removed; use device \"webgpu\" or \"auto\".");
602
+ onProgress?.({ status: "Initializing WebGPU vision engine..." });
603
+ const { WebGPUEngine } = await import("./gpu/index.mjs");
604
+ let visRepo = repoPath;
605
+ if (visRepo.includes("onnx-community/") || visRepo.includes("-ONNX")) visRepo = { "onnx-community/Qwen3.5-0.8B-ONNX": "Qwen/Qwen3.5-0.8B" }[visRepo] || visRepo;
606
+ this.webgpuEngine = await WebGPUEngine.create({
607
+ repo: visRepo,
608
+ enableVision: true,
609
+ maxSeqLen: options.contextLength ?? config.contextLength ?? 4096,
610
+ onProgress: (loaded, total, message) => onProgress?.({
611
+ status: message,
612
+ progress: total > 0 ? Math.round(loaded / total * 100) : void 0
613
+ })
614
+ });
615
+ this._deviceMode = "webgpu";
616
+ this.isVisionModel = true;
617
+ this.currentModel = modelId;
618
+ this.modelConfig = config;
619
+ onProgress?.({ status: "Ready (Vision, WebGPU Native)!" });
620
+ }
621
+ /**
622
+ * Check if a model is loaded
623
+ */
624
+ isLoaded() {
625
+ return this.webgpuEngine !== null;
626
+ }
627
+ /**
628
+ * Check if current model supports vision
629
+ */
630
+ supportsVision() {
631
+ return this.isVisionModel && this.modelConfig?.supportsVision === true;
632
+ }
633
+ /**
634
+ * Get current model info
635
+ */
636
+ getModelInfo() {
637
+ return this.modelConfig;
638
+ }
639
+ /**
640
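+ * Get current device mode ("webgpu" once a model has loaded; "cpu" is only the initial placeholder before any load, since the CPU/WASM backends were removed)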
641
+ */
642
+ getDeviceMode() {
643
+ return this._deviceMode;
644
+ }
645
+ /**
646
+ * Get the in-memory weight quantization the native engine uses for the loaded
647
+ * model. The WebGPU engine quantizes weights to INT4 ("q4") on load; the KV
648
+ * cache precision (f16/f32) is separate and device-detected.
649
+ */
650
+ getDtype() {
651
+ return "q4";
652
+ }
653
+ /**
654
+ * Get response cache statistics
655
+ */
656
+ getResponseCacheStats() {
657
+ const cache = getGlobalCache();
658
+ const stats = cache.getStats();
659
+ return {
660
+ hits: stats.hits,
661
+ misses: stats.misses,
662
+ size: stats.size,
663
+ hitRate: cache.getHitRate()
664
+ };
665
+ }
666
+ /**
667
+ * Clear the response cache (for cached generate() results)
668
+ */
669
+ clearResponseCache() {
670
+ getGlobalCache().clear();
671
+ }
672
+ /**
673
+ * Check if a model is cached (downloaded) without loading it
674
+ *
675
+ * @example
676
+ * ```ts
677
+ * if (await g.isModelCached("qwen3.5-0.8b")) {
678
+ * console.log("Model ready, will load instantly");
679
+ * } else {
680
+ * console.log("Model needs to download (~1.6GB)");
681
+ * }
682
+ * ```
683
+ */
684
+ async isModelCached(modelId) {
685
+ const source = resolveModel(modelId);
686
+ return this.isNativeRepoCached(source.path);
687
+ }
688
+ /**
689
+ * Check whether the native WebGPU engine has a repo cached on disk.
690
+ * The native loader stores files under ~/.cache/gerbil/<repo>/<revision>/.
691
+ */
692
+ isNativeRepoCached(repo, revision = "main") {
693
+ try {
694
+ const home = process.env.HOME || process.env.USERPROFILE || os.homedir();
695
+ if (!home) return false;
696
+ const modelDir = path.join(home, ".cache", "gerbil", repo.replace(/\//g, "_"), revision);
697
+ return existsSync(path.join(modelDir, "config.json".replace(/\//g, "_")));
698
+ } catch {
699
+ return false;
700
+ }
701
+ }
702
+ /**
703
+ * Preload a model (download without initializing for inference)
704
+ *
705
+ * Use this to download models ahead of time, e.g., during app startup,
706
+ * so users don't wait when they first use AI.
707
+ *
708
+ * @example
709
+ * ```ts
710
+ * // Preload for later (download only, free memory)
711
+ * await g.preloadModel("qwen3.5-0.8b", {
712
+ * onProgress: (p) => console.log(p.status, p.progress),
713
+ * });
714
+ *
715
+ * // Preload and keep in memory for instant use
716
+ * await g.preloadModel("qwen3.5-0.8b", { keepLoaded: true });
717
+ * await g.generate("Hello"); // Instant, no loading needed
718
+ * ```
719
+ */
720
+ async preloadModel(modelId, options = {}) {
721
+ resolveModel(modelId);
722
+ const { onProgress, keepLoaded = false } = options;
723
+ if (keepLoaded && this.isLoaded() && this.currentModel === modelId) {
724
+ onProgress?.({ status: "Model already loaded" });
725
+ return;
726
+ }
727
+ if (!keepLoaded && await this.isModelCached(modelId)) {
728
+ onProgress?.({ status: "Model already cached" });
729
+ return;
730
+ }
731
+ if (keepLoaded) {
732
+ await this.loadModel(modelId, { onProgress });
733
+ return;
734
+ }
735
+ onProgress?.({ status: `Preloading ${modelId}...` });
736
+ await this.loadModel(modelId, { onProgress });
737
+ await this.dispose();
738
+ onProgress?.({ status: "Preload complete" });
739
+ }
740
+ /**
741
+ * Check if the native TTS model is cached. The native engine always uses the
742
+ * Kani-TTS-2 checkpoint, so `modelId` is accepted for API compatibility only.
743
+ */
744
+ async isTTSCached(_modelId) {
745
+ const { DEFAULT_MODELS } = await import("./gpu/index.mjs");
746
+ return this.isNativeRepoCached(DEFAULT_MODELS.tts);
747
+ }
748
+ /**
749
+ * Preload the native TTS model (downloads Kani-TTS-2 weights to disk cache).
750
+ *
751
+ * @param modelId - Accepted for API compatibility; native TTS uses Kani-TTS-2.
752
+ * @param options.keepLoaded - Keep the engine in memory for instant use.
753
+ */
754
+ async preloadTTS(modelId, options = {}) {
755
+ const { onProgress, keepLoaded = false } = options;
756
+ if (keepLoaded && this.isTTSLoaded()) {
757
+ onProgress?.({ status: "TTS model already loaded" });
758
+ return;
759
+ }
760
+ if (!keepLoaded && await this.isTTSCached(modelId)) {
761
+ onProgress?.({ status: "TTS model already cached" });
762
+ return;
763
+ }
764
+ onProgress?.({ status: "Preloading TTS model..." });
765
+ await this.ensureNativeTTSEngine();
766
+ if (!keepLoaded && this.nativeTTSEngine) {
767
+ try {
768
+ this.nativeTTSEngine.destroy();
769
+ } catch {}
770
+ this.nativeTTSEngine = null;
771
+ }
772
+ onProgress?.({ status: "Preload complete" });
773
+ }
774
+ /**
775
+ * Check if the native STT model is cached. The native engine always uses the
776
+ * Moonshine checkpoint, so `modelId` is accepted for API compatibility only.
777
+ */
778
+ async isSTTCached(_modelId) {
779
+ const { DEFAULT_MODELS } = await import("./gpu/index.mjs");
780
+ return this.isNativeRepoCached(DEFAULT_MODELS.stt);
781
+ }
782
+ /**
783
+ * Preload the native STT model (downloads Moonshine weights to disk cache).
784
+ *
785
+ * @param modelId - Accepted for API compatibility; native STT uses Moonshine.
786
+ * @param options.keepLoaded - Keep the engine in memory for instant use.
787
+ */
788
+ async preloadSTT(modelId, options = {}) {
789
+ const { onProgress, keepLoaded = false } = options;
790
+ if (keepLoaded && this.isSTTLoaded()) {
791
+ onProgress?.({ status: "STT model already loaded" });
792
+ return;
793
+ }
794
+ if (!keepLoaded && await this.isSTTCached(modelId)) {
795
+ onProgress?.({ status: "STT model already cached" });
796
+ return;
797
+ }
798
+ onProgress?.({ status: "Preloading STT model..." });
799
+ await this.ensureNativeSTT();
800
+ if (!keepLoaded && this.nativeSTT) {
801
+ try {
802
+ this.nativeSTT.destroy?.();
803
+ } catch {}
804
+ this.nativeSTT = null;
805
+ }
806
+ onProgress?.({ status: "Preload complete" });
807
+ }
808
+ /**
809
+ * Check if a native embedding model is cached. Defaults to the native
810
+ * EmbeddingGemma checkpoint when no repo is provided.
811
+ */
812
+ async isEmbeddingCached(modelId) {
813
+ const { DEFAULT_MODELS } = await import("./gpu/index.mjs");
814
+ return this.isNativeRepoCached(modelId || DEFAULT_MODELS.embedding);
815
+ }
816
+ /**
817
+ * Preload a native embedding model (downloads weights to disk cache).
818
+ *
819
+ * @param modelId - Embedding repo (default: native EmbeddingGemma).
820
+ * @param options.keepLoaded - Keep the engine in memory for instant use.
821
+ */
822
+ async preloadEmbedding(modelId, options = {}) {
823
+ const { onProgress, keepLoaded = false } = options;
824
+ if (keepLoaded && this.nativeEmbedEngine) {
825
+ onProgress?.({ status: "Embedding model already loaded" });
826
+ return;
827
+ }
828
+ if (!keepLoaded && await this.isEmbeddingCached(modelId)) {
829
+ onProgress?.({ status: "Embedding model already cached" });
830
+ return;
831
+ }
832
+ onProgress?.({ status: "Preloading embedding model..." });
833
+ await this.ensureNativeEmbedEngine(modelId);
834
+ if (!keepLoaded && this.nativeEmbedEngine) {
835
+ try {
836
+ this.nativeEmbedEngine.destroy();
837
+ } catch {}
838
+ this.nativeEmbedEngine = null;
839
+ this.nativeEmbedRepo = null;
840
+ }
841
+ onProgress?.({ status: "Preload complete" });
842
+ }
843
+ /**
844
+ * Clear KV cache to free memory.
845
+ * The native engine manages its own KV cache; this is a no-op kept for API
846
+ * compatibility.
847
+ */
848
+ async clearCache() {}
849
+ /**
850
+ * Generate text (automatically routes to vision generation if images provided)
851
+ *
852
+ * @example
853
+ * ```ts
854
+ * // Text generation
855
+ * const result = await g.generate("Hello!");
856
+ *
857
+ * // Vision generation (with vision model)
858
+ * const result = await g.generate("What's in this image?", {
859
+ * images: [{ source: "https://example.com/cat.jpg" }]
860
+ * });
861
+ * ```
862
+ */
863
+ async generate(prompt, options = {}) {
864
+ const queueStartTime = performance.now();
865
+ try {
866
+ return await this.queue.add(async () => {
867
+ const queueWaitTime = performance.now() - queueStartTime;
868
+ if (queueWaitTime > 100 && this.telemetry.onQueueWait) try {
869
+ this.telemetry.onQueueWait(queueWaitTime);
870
+ } catch {}
871
+ const generatedResult = await this.generateInternal(prompt, options);
872
+ if (this.telemetry.onGenerate) try {
873
+ this.telemetry.onGenerate({
874
+ modelId: this.currentModel || "unknown",
875
+ result: generatedResult,
876
+ cached: generatedResult.cached ?? false,
877
+ queueTimeMs: queueWaitTime > 100 ? queueWaitTime : void 0
878
+ });
879
+ } catch {}
880
+ return generatedResult;
881
+ });
882
+ } catch (error) {
883
+ if (this.telemetry.onError) try {
884
+ this.telemetry.onError(error instanceof Error ? error : new Error(String(error)), {
885
+ method: "generate",
886
+ modelId: this.currentModel || "unknown",
887
+ prompt: prompt.slice(0, 100),
888
+ queueWaitTime: performance.now() - queueStartTime
889
+ });
890
+ } catch {}
891
+ throw error;
892
+ }
893
+ }
894
+ /**
895
+ * Internal generate implementation (called within queue)
896
+ */
897
+ async generateInternal(prompt, options = {}) {
898
+ if (!this.isLoaded()) await this.loadModel(this.config.model || DEFAULT_MODEL);
899
+ const { images } = options;
900
+ if (images?.length && this.isVisionModel) return this.generateWithVision(prompt, options);
901
+ const { maxTokens = 256, temperature = .7, topP = .9, topK = 50, thinking = false, system, cache = false, cacheTtl } = options;
902
+ if (cache && !options.onToken && !images?.length) {
903
+ const cacheKey = generateCacheKey(prompt, this.currentModel || "", {
904
+ maxTokens,
905
+ temperature,
906
+ topP,
907
+ topK,
908
+ system,
909
+ thinking
910
+ });
911
+ const cached = getGlobalCache().get(cacheKey);
912
+ if (cached) return cached;
913
+ }
914
+ const startTime = performance.now();
915
+ try {
916
+ let rawText = "";
917
+ let engineTokensGenerated = 0;
918
+ let engineTokensPerSecond = 0;
919
+ if (this.webgpuEngine) {
920
+ const result$1 = await this.webgpuEngine.generate(prompt, {
921
+ maxTokens,
922
+ sampling: {
923
+ temperature,
924
+ topP,
925
+ topK
926
+ },
927
+ systemPrompt: system,
928
+ onToken: options.onToken ? (t) => options.onToken?.(t) : void 0
929
+ });
930
+ rawText = result$1.text;
931
+ engineTokensGenerated = result$1.tokensGenerated;
932
+ engineTokensPerSecond = result$1.tokensPerSecond;
933
+ } else throw new Error("No model loaded");
934
+ const totalTime = performance.now() - startTime;
935
+ rawText = this.cleanOutput(rawText);
936
+ const { thinking: thinkingText, response } = this.parseThinking(rawText);
937
+ const finalThinking = thinking ? thinkingText : void 0;
938
+ const tokensGenerated = engineTokensGenerated;
939
+ this.stats.prompts += 1;
940
+ this.stats.tokensOut += tokensGenerated;
941
+ this.stats.totalTime += totalTime;
942
+ this.stats.avgSpeed = this.stats.tokensOut / this.stats.totalTime * 1e3;
943
+ const result = {
944
+ text: response,
945
+ thinking: finalThinking,
946
+ tokensGenerated,
947
+ tokensPerSecond: engineTokensPerSecond,
948
+ totalTime,
949
+ finishReason: "stop",
950
+ provider: "local",
951
+ cached: false
952
+ };
953
+ if (cache && !options.onToken && !images?.length) {
954
+ const cacheKey = generateCacheKey(prompt, this.currentModel || "", {
955
+ maxTokens,
956
+ temperature,
957
+ topP,
958
+ topK,
959
+ system,
960
+ thinking
961
+ });
962
+ getGlobalCache().set(cacheKey, result, cacheTtl);
963
+ }
964
+ return result;
965
+ } catch (error) {
966
+ this.reportError(error instanceof Error ? error : new Error(String(error)), {
967
+ operation: "generate",
968
+ modelId: this.currentModel || void 0
969
+ });
970
+ return {
971
+ text: "",
972
+ tokensGenerated: 0,
973
+ tokensPerSecond: 0,
974
+ totalTime: performance.now() - startTime,
975
+ finishReason: "error",
976
+ provider: "local",
977
+ cached: false
978
+ };
979
+ }
980
+ }
981
+ /**
982
+ * Stream text generation (simulated token-by-token)
983
+ *
984
+ * Note: Yields the raw output including <think> tags if thinking mode is enabled.
985
+ * The final result has parsed thinking separated out.
986
+ */
987
+ async *stream(prompt, options = {}) {
988
+ if (!this.isLoaded()) await this.loadModel(this.config.model || DEFAULT_MODEL);
989
+ const startTime = performance.now();
990
+ if (this.webgpuEngine) {
991
+ let fullText = "";
992
+ const tokenQueue = [];
993
+ let resolveNext = null;
994
+ let done = false;
995
+ let engineTokensGenerated = 0;
996
+ let engineTokensPerSecond = 0;
997
+ const generatePromise = this.webgpuEngine.generate(prompt, {
998
+ ...options,
999
+ sampling: {
1000
+ temperature: options.temperature,
1001
+ topP: options.topP,
1002
+ topK: options.topK
1003
+ },
1004
+ systemPrompt: options.system,
1005
+ onToken: (token) => {
1006
+ fullText += token;
1007
+ if (resolveNext) {
1008
+ resolveNext(token);
1009
+ resolveNext = null;
1010
+ } else tokenQueue.push(token);
1011
+ }
1012
+ }).then((result) => {
1013
+ engineTokensGenerated = result.tokensGenerated;
1014
+ engineTokensPerSecond = result.tokensPerSecond;
1015
+ done = true;
1016
+ if (resolveNext) resolveNext(null);
1017
+ }).catch((err) => {
1018
+ done = true;
1019
+ if (resolveNext) resolveNext(null);
1020
+ throw err;
1021
+ });
1022
+ while (!done || tokenQueue.length > 0) if (tokenQueue.length > 0) {
1023
+ const token = tokenQueue.shift();
1024
+ yield token;
1025
+ options.onToken?.(token);
1026
+ } else if (!done) {
1027
+ const token = await new Promise((resolve) => {
1028
+ resolveNext = resolve;
1029
+ });
1030
+ if (token) {
1031
+ yield token;
1032
+ options.onToken?.(token);
1033
+ }
1034
+ }
1035
+ await generatePromise;
1036
+ const { thinking: thinkingText, response } = this.parseThinking(fullText);
1037
+ const totalTime = performance.now() - startTime;
1038
+ return {
1039
+ text: response,
1040
+ thinking: options.thinking ? thinkingText : void 0,
1041
+ tokensGenerated: engineTokensGenerated,
1042
+ totalTime,
1043
+ tokensPerSecond: engineTokensPerSecond,
1044
+ finishReason: "stop"
1045
+ };
1046
+ }
1047
+ throw new Error("No model loaded");
1048
+ }
1049
+ /**
1050
+ * Generate text from images using a vision model
1051
+ * Called automatically by generate() when images are provided
1052
+ */
1053
+ async generateWithVision(prompt, options) {
1054
+ if (!(this.webgpuEngine && typeof this.webgpuEngine.describeImage === "function")) throw new Error("Vision model not loaded. Load a vision-capable model with device 'webgpu' first.");
1055
+ const imgs = options.images ?? [];
1056
+ if (imgs.length !== 1) throw new Error(`Native WebGPU vision supports exactly one image per request (got ${imgs.length}).`);
1057
+ const startTime = performance.now();
1058
+ const { pixels, width, height } = await this.decodeImageToPixels(imgs[0].source);
1059
+ const result = await this.webgpuEngine.describeImage({
1060
+ pixels,
1061
+ width,
1062
+ height
1063
+ }, prompt, {
1064
+ maxTokens: options.maxTokens ?? 512,
1065
+ sampling: {
1066
+ temperature: options.temperature ?? .7,
1067
+ topP: options.topP ?? .9,
1068
+ topK: options.topK ?? 20
1069
+ },
1070
+ onToken: options.onToken ? (t) => options.onToken?.(t) : void 0
1071
+ });
1072
+ const totalTime = performance.now() - startTime;
1073
+ this.stats.prompts += 1;
1074
+ this.stats.tokensOut += result.tokensGenerated;
1075
+ this.stats.totalTime += totalTime;
1076
+ this.stats.avgSpeed = this.stats.tokensOut / this.stats.totalTime * 1e3;
1077
+ return {
1078
+ text: this.cleanOutput(result.text),
1079
+ tokensGenerated: result.tokensGenerated,
1080
+ tokensPerSecond: result.tokensPerSecond,
1081
+ totalTime,
1082
+ finishReason: "stop",
1083
+ provider: "local",
1084
+ cached: false
1085
+ };
1086
+ }
1087
+ /**
1088
+ * Decode an image source (http(s) URL, file path, or data URI) to raw RGB
1089
+ * pixels for the native vision encoder. Supports 8-bit non-interlaced PNG
1090
+ * (color types 2/RGB and 6/RGBA). Other formats throw a clear error — callers
1091
+ * can pre-decode and use the lower-level WebGPUEngine.describeImage() with
1092
+ * pixels directly.
1093
+ */
1094
+ async decodeImageToPixels(source) {
1095
+ const bytes = await this.fetchImageBytes(source);
1096
+ if (!(bytes.length > 8 && bytes[0] === 137 && bytes[1] === 80 && bytes[2] === 78 && bytes[3] === 71)) throw new Error("Native vision currently decodes PNG images only. For other formats, pre-decode to RGB pixels and call the GPU engine's describeImage() directly.");
1097
+ return decodePng(bytes);
1098
+ }
1099
+ /** Fetch an image source to raw bytes (URL, data URI, or local file path). */
1100
+ async fetchImageBytes(source) {
1101
+ if (source.startsWith("data:")) {
1102
+ const comma = source.indexOf(",");
1103
+ const meta = source.slice(5, comma);
1104
+ const data = source.slice(comma + 1);
1105
+ if (meta.includes("base64")) return Uint8Array.from(Buffer.from(data, "base64"));
1106
+ return Uint8Array.from(Buffer.from(decodeURIComponent(data), "binary"));
1107
+ }
1108
+ if (source.startsWith("http://") || source.startsWith("https://")) {
1109
+ const res = await fetch(source);
1110
+ if (!res.ok) throw new Error(`Failed to fetch image (${res.status}): ${source}`);
1111
+ return new Uint8Array(await res.arrayBuffer());
1112
+ }
1113
+ const { readFile } = await import("node:fs/promises");
1114
+ return new Uint8Array(await readFile(source));
1115
+ }
1116
+ /**
1117
+ * Generate structured JSON output
1118
+ */
1119
+ async json(prompt, options) {
1120
+ const { schema, retries = 3, temperature = .3 } = options;
1121
+ const systemPrompt = `You are a JSON generator. You MUST respond with valid JSON only.
1122
+ No explanations, no markdown, no code blocks. Just pure JSON.
1123
+ The JSON must conform to this schema: ${JSON.stringify(zodToJsonSchema(schema))}`;
1124
+ for (let attempt = 0; attempt < retries; attempt += 1) {
1125
+ const result = await this.generate(prompt, {
1126
+ system: options.system || systemPrompt,
1127
+ temperature,
1128
+ maxTokens: 1e3
1129
+ });
1130
+ try {
1131
+ const jsonStr = extractJson(result.text);
1132
+ const parsed = JSON.parse(jsonStr);
1133
+ return schema.parse(parsed);
1134
+ } catch (error) {
1135
+ if (attempt === retries - 1) throw new Error(`Failed to generate valid JSON after ${retries} attempts: ${error}`);
1136
+ }
1137
+ }
1138
+ throw new Error("Failed to generate valid JSON");
1139
+ }
1140
+ /**
1141
+ * Generate a structured object via the native engine's retrying
1142
+ * `generateObject` (extract JSON → validate → retry with a nudge).
1143
+ *
1144
+ * Unlike {@link json} (which is Zod-driven), this passes through to the engine
1145
+ * and accepts either a predicate validator `(o) => boolean` or a minimal
1146
+ * `{ required: [...] }` schema; omit `schema` to accept any valid JSON.
1147
+ *
1148
+ * @example
1149
+ * ```ts
1150
+ * const { object } = await g.generateObject<{ name: string; age: number }>(
1151
+ * 'Extract {name, age} from: "I am Sarah, 28"',
1152
+ * { schema: { required: ["name", "age"] } },
1153
+ * );
1154
+ * ```
1155
+ */
1156
+ async generateObject(prompt, options = {}) {
1157
+ if (!this.isLoaded()) await this.loadModel(this.config.model || DEFAULT_MODEL);
1158
+ if (!this.webgpuEngine) throw new Error("No model loaded");
1159
+ return this.webgpuEngine.generateObject(prompt, options);
1160
+ }
1161
+ /**
1162
+ * Generate embeddings
1163
+ */
1164
+ async embed(text, options = {}) {
1165
+ if (!this.preferNative()) throw new Error("Embeddings require WebGPU. CPU/WASM and the legacy ONNX backend have been removed.");
1166
+ const native = await this.ensureNativeEmbedEngine(options.model);
1167
+ const startTime = performance.now();
1168
+ const vec = await native.embed(text);
1169
+ return {
1170
+ vector: Array.from(vec),
1171
+ text,
1172
+ totalTime: performance.now() - startTime
1173
+ };
1174
+ }
1175
+ /**
1176
+ * Lazily build (or reuse) the native embedding engine. Re-creates it when the
1177
+ * requested repo differs from the cached one. The default native embedding
1178
+ * model is resolved by the engine itself (EmbeddingGemma) when no repo given.
1179
+ */
1180
+ async ensureNativeEmbedEngine(repo) {
1181
+ if (this.nativeEmbedEngine && (!repo || repo === this.nativeEmbedRepo)) return this.nativeEmbedEngine;
1182
+ if (this.nativeEmbedEngine) {
1183
+ try {
1184
+ this.nativeEmbedEngine.destroy();
1185
+ } catch {}
1186
+ this.nativeEmbedEngine = null;
1187
+ this.nativeEmbedRepo = null;
1188
+ }
1189
+ const { WebGPUEngine } = await import("./gpu/index.mjs");
1190
+ this.nativeEmbedEngine = await WebGPUEngine.create({
1191
+ repo,
1192
+ embedding: true
1193
+ });
1194
+ this.nativeEmbedRepo = repo ?? null;
1195
+ return this.nativeEmbedEngine;
1196
+ }
1197
+ /**
1198
+ * Generate embeddings for multiple texts
1199
+ */
1200
+ async embedBatch(texts, options = {}) {
1201
+ const results = [];
1202
+ for (const text of texts) results.push(await this.embed(text, options));
1203
+ return results;
1204
+ }
1205
+ /**
1206
+ * Compute cosine similarity between two vectors
1207
+ *
1208
+ * @example
1209
+ * ```ts
1210
+ * const sim = g.cosineSimilarity([1, 0, 0], [1, 0, 0]); // 1.0
1211
+ * const sim2 = g.cosineSimilarity([1, 0, 0], [0, 1, 0]); // 0.0
1212
+ * ```
1213
+ */
1214
+ cosineSimilarity(a, b) {
1215
+ if (a.length !== b.length) throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
1216
+ let dotProduct = 0;
1217
+ let normA = 0;
1218
+ let normB = 0;
1219
+ for (let i = 0; i < a.length; i++) {
1220
+ dotProduct += a[i] * b[i];
1221
+ normA += a[i] * a[i];
1222
+ normB += b[i] * b[i];
1223
+ }
1224
+ const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
1225
+ if (magnitude === 0) return 0;
1226
+ return dotProduct / magnitude;
1227
+ }
1228
+ /**
1229
+ * Compare similarity between two texts
1230
+ *
1231
+ * @example
1232
+ * ```ts
1233
+ * const result = await g.similarity("Hello world", "Hi there");
1234
+ * console.log(result.score); // 0.85
1235
+ * ```
1236
+ */
1237
+ async similarity(textA, textB, options = {}) {
1238
+ const startTime = performance.now();
1239
+ const [embedA, embedB] = await Promise.all([this.embed(textA, options), this.embed(textB, options)]);
1240
+ return {
1241
+ score: this.cosineSimilarity(embedA.vector, embedB.vector),
1242
+ textA,
1243
+ textB,
1244
+ totalTime: performance.now() - startTime
1245
+ };
1246
+ }
1247
+ /**
1248
+ * Semantic search - find most similar texts from a corpus
1249
+ *
1250
+ * @example
1251
+ * ```ts
1252
+ * const results = await g.search("capital of France", [
1253
+ * "Paris is beautiful",
1254
+ * "London is in England",
1255
+ * "Dogs are pets"
1256
+ * ]);
1257
+ * // [{ text: "Paris is beautiful", score: 0.89, index: 0 }, ...]
1258
+ * ```
1259
+ */
1260
+ async search(query, corpus, options = {}) {
1261
+ const { topK = corpus.length, ...embedOptions } = options;
1262
+ const queryEmbedding = await this.embed(query, embedOptions);
1263
+ return (await this.embedBatch(corpus, embedOptions)).map((doc, index) => ({
1264
+ text: doc.text,
1265
+ score: this.cosineSimilarity(queryEmbedding.vector, doc.vector),
1266
+ index
1267
+ })).sort((a, b) => b.score - a.score).slice(0, topK);
1268
+ }
1269
+ /**
1270
+ * Find the nearest text to an embedding vector
1271
+ *
1272
+ * @example
1273
+ * ```ts
1274
+ * const embedding = (await g.embed("dog")).vector;
1275
+ * const match = await g.findNearest(embedding, ["cat", "car", "tree"]);
1276
+ * // { text: "cat", score: 0.85, index: 0 }
1277
+ * ```
1278
+ */
1279
+ async findNearest(embedding, candidates, options = {}) {
1280
+ const { topK = candidates.length, ...embedOptions } = options;
1281
+ return (await this.embedBatch(candidates, embedOptions)).map((doc, index) => ({
1282
+ text: doc.text,
1283
+ score: this.cosineSimilarity(embedding, doc.vector),
1284
+ index
1285
+ })).sort((a, b) => b.score - a.score).slice(0, topK);
1286
+ }
1287
+ /**
1288
+ * Get session stats
1289
+ */
1290
+ getStats() {
1291
+ return { ...this.stats };
1292
+ }
1293
+ /**
1294
+ * Get system info
1295
+ */
1296
+ getInfo() {
1297
+ return {
1298
+ version: "1.0.0",
1299
+ model: this.modelConfig,
1300
+ device: {
1301
+ backend: "webgpu-native",
1302
+ gpu: null,
1303
+ vram: null,
1304
+ status: this.isLoaded() ? "ready" : "loading"
1305
+ },
1306
+ context: {
1307
+ max: this.modelConfig?.contextLength || 0,
1308
+ used: 0,
1309
+ available: this.modelConfig?.contextLength || 0
1310
+ },
1311
+ cache: {
1312
+ location: "~/.cache/gerbil",
1313
+ size: "0 MB",
1314
+ modelCount: 0
1315
+ }
1316
+ };
1317
+ }
1318
+ /**
1319
+ * Reset stats
1320
+ */
1321
+ resetStats() {
1322
+ this.stats = {
1323
+ prompts: 0,
1324
+ tokensIn: 0,
1325
+ tokensOut: 0,
1326
+ avgSpeed: 0,
1327
+ totalTime: 0,
1328
+ cacheHits: 0,
1329
+ cacheMisses: 0
1330
+ };
1331
+ }
1332
// Identifier reported for the native TTS model by getTTSModelInfo() and listTTSModels().
ttsModelId = "kani-tts-2";
1333
+ /**
1334
+ * Load the native TTS model (Kani-TTS-2) for text-to-speech synthesis.
1335
+ *
1336
+ * @example
1337
+ * ```ts
1338
+ * await g.loadTTS({ onProgress: (p) => console.log(p.status) });
1339
+ * const result = await g.speak("Hello world");
1340
+ * // result.audio = Float32Array PCM, result.sampleRate = 22050
1341
+ * ```
1342
+ */
1343
+ async loadTTS(_options = {}) {
1344
+ await this.ensureNativeTTSEngine();
1345
+ }
1346
+ /**
1347
+ * Ensure TTS model is loaded (lazy loading)
1348
+ */
1349
+ async ensureTTSLoaded(_options) {
1350
+ await this.ensureNativeTTSEngine();
1351
+ }
1352
+ /**
1353
+ * Generate speech from text using the native Kani-TTS-2 WebGPU engine.
1354
+ *
1355
+ * @example
1356
+ * ```ts
1357
+ * const result = await g.speak("Hello world");
1358
+ * // result.audio = Float32Array PCM, result.sampleRate = 22050
1359
+ * ```
1360
+ */
1361
+ async speak(text, options = {}) {
1362
+ if (!this.preferNative()) throw new Error("Speech synthesis requires WebGPU. CPU/WASM and the legacy ONNX backend have been removed.");
1363
+ const native = await this.ensureNativeTTSEngine();
1364
+ const startTime = performance.now();
1365
+ const out = await native.speak(text, {});
1366
+ return {
1367
+ audio: out.pcm,
1368
+ sampleRate: out.sampleRate,
1369
+ duration: out.audioSeconds,
1370
+ voice: options.voice ?? "default",
1371
+ totalTime: performance.now() - startTime
1372
+ };
1373
+ }
1374
+ /** Lazily build (or reuse) the native Kani-TTS WebGPUEngine (default repo). */
1375
+ async ensureNativeTTSEngine() {
1376
+ if (!this.nativeTTSEngine) {
1377
+ const { WebGPUEngine, DEFAULT_MODELS } = await import("./gpu/index.mjs");
1378
+ this.nativeTTSEngine = await WebGPUEngine.create({ repo: DEFAULT_MODELS.tts });
1379
+ }
1380
+ return this.nativeTTSEngine;
1381
+ }
1382
+ /**
1383
+ * Stream speech generation. The native engine synthesizes the full clip, so a
1384
+ * single final audio chunk is yielded.
1385
+ */
1386
+ async *speakStream(text, options = {}) {
1387
+ const result = await this.speak(text, options);
1388
+ yield {
1389
+ samples: result.audio,
1390
+ sampleRate: result.sampleRate,
1391
+ index: 0,
1392
+ isFinal: true
1393
+ };
1394
+ return result;
1395
+ }
1396
+ /**
1397
+ * Get list of available TTS voices (native Kani-TTS-2 default voice).
1398
+ */
1399
+ listVoices() {
1400
+ return KOKORO_VOICES_DEFAULT;
1401
+ }
1402
+ /**
1403
+ * Check if TTS model is loaded
1404
+ */
1405
+ isTTSLoaded() {
1406
+ return this.nativeTTSEngine !== null;
1407
+ }
1408
+ /**
1409
+ * Get current TTS model info
1410
+ */
1411
+ getTTSModelInfo() {
1412
+ if (!this.nativeTTSEngine) return null;
1413
+ return {
1414
+ id: this.ttsModelId,
1415
+ loaded: true,
1416
+ device: "webgpu"
1417
+ };
1418
+ }
1419
+ /**
1420
+ * List available TTS models (native Kani-TTS-2).
1421
+ */
1422
+ async listTTSModels() {
1423
+ return [{
1424
+ id: this.ttsModelId,
1425
+ description: "Kani-TTS-2 native WebGPU TTS",
1426
+ sampleRate: 22050,
1427
+ voiceCount: 1
1428
+ }];
1429
+ }
1430
+ /**
1431
+ * Load the native STT model (Moonshine) for speech-to-text transcription.
1432
+ *
1433
+ * @example
1434
+ * ```ts
1435
+ * await g.loadSTT();
1436
+ * const result = await g.transcribe(audioData);
1437
+ * console.log(result.text);
1438
+ * ```
1439
+ */
1440
+ async loadSTT(_modelId, _options = {}) {
1441
+ await this.ensureNativeSTT();
1442
+ }
1443
+ /**
1444
+ * Ensure STT model is loaded (lazy loading)
1445
+ */
1446
+ async ensureSTTLoaded(_modelId, _options) {
1447
+ await this.ensureNativeSTT();
1448
+ }
1449
+ /**
1450
+ * Transcribe audio to text
1451
+ *
1452
+ * @param audio - Audio data as Float32Array (16kHz mono) or Uint8Array (WAV file)
1453
+ * @param options - Transcription options
1454
+ *
1455
+ * @example
1456
+ * ```ts
1457
+ * // From Float32Array (16kHz mono)
1458
+ * const result = await g.transcribe(audioData);
1459
+ * console.log(result.text);
1460
+ *
1461
+ * // With timestamps
1462
+ * const result = await g.transcribe(audioData, { timestamps: true });
1463
+ * for (const seg of result.segments) {
1464
+ * console.log(`[${seg.start}s] ${seg.text}`);
1465
+ * }
1466
+ *
1467
+ * // From WAV file
1468
+ * const wavData = fs.readFileSync("audio.wav");
1469
+ * const result = await g.transcribe(new Uint8Array(wavData));
1470
+ * ```
1471
+ */
1472
+ async transcribe(audio, options = {}) {
1473
+ if (!this.preferNative()) throw new Error("Transcription requires WebGPU. CPU/WASM and the legacy ONNX backend have been removed.");
1474
+ if (!(audio instanceof Float32Array)) throw new Error("Native transcription requires 16 kHz mono Float32Array PCM. Decode WAV bytes to PCM first.");
1475
+ if (options.timestamps) throw new Error("Native transcription does not produce timestamps.");
1476
+ const native = await this.ensureNativeSTT();
1477
+ const startTime = performance.now();
1478
+ const out = await native.transcribe(audio);
1479
+ return {
1480
+ text: out.text,
1481
+ language: options.language ?? "en",
1482
+ duration: out.audioSeconds,
1483
+ totalTime: performance.now() - startTime
1484
+ };
1485
+ }
1486
+ /** Lazily build (or reuse) the native MoonshineSTT engine (default repo). */
1487
+ async ensureNativeSTT() {
1488
+ if (!this.nativeSTT) {
1489
+ const { MoonshineSTT } = await import("./moonshine-stt-BLyVoRpB.mjs");
1490
+ this.nativeSTT = await MoonshineSTT.create();
1491
+ }
1492
+ return this.nativeSTT;
1493
+ }
1494
+ /**
1495
+ * Create a streaming transcription session
1496
+ *
1497
+ * Transcribes audio in real-time by processing chunks at regular intervals.
1498
+ * Perfect for live captioning, call transcription, or real-time subtitles.
1499
+ *
1500
+ * @param options - Streaming options
1501
+ * @returns Streaming session controller
1502
+ *
1503
+ * @example
1504
+ * ```ts
1505
+ * const session = await g.createStreamingTranscription({
1506
+ * chunkDuration: 3000, // Transcribe every 3 seconds
1507
+ * onChunk: (text, idx) => console.log(`Chunk ${idx}: ${text}`),
1508
+ * onTranscript: (fullText) => console.log("Full:", fullText),
1509
+ * });
1510
+ *
1511
+ * // Feed audio data as it comes in
1512
+ * session.feedAudio(audioChunk);
1513
+ *
1514
+ * // Start automatic interval-based transcription
1515
+ * session.start();
1516
+ *
1517
+ * // Later, stop and get final transcript
1518
+ * const finalText = await session.stop();
1519
+ * ```
1520
+ */
1521
+ async createStreamingTranscription(_options = {}) {
1522
+ throw new Error("Streaming transcription is not supported by the native WebGPU STT engine. Use transcribe() on buffered 16 kHz Float32Array PCM instead.");
1523
+ }
1524
+ /**
1525
+ * Get list of available STT models (native Moonshine).
1526
+ */
1527
+ async listSTTModels() {
1528
+ return [{
1529
+ id: "moonshine-base",
1530
+ repo: "UsefulSensors/moonshine-base",
1531
+ description: "Moonshine native WebGPU STT",
1532
+ size: "61M",
1533
+ multilingual: false,
1534
+ languages: ["en"],
1535
+ sampleRate: 16e3
1536
+ }];
1537
+ }
1538
+ /**
1539
+ * Check if STT model is loaded
1540
+ */
1541
+ isSTTLoaded() {
1542
+ return this.nativeSTT !== null;
1543
+ }
1544
+ /**
1545
+ * Get current STT model info
1546
+ */
1547
+ getSTTModelInfo() {
1548
+ if (!this.nativeSTT) return null;
1549
+ return {
1550
+ id: "moonshine-base",
1551
+ loaded: true,
1552
+ device: "webgpu"
1553
+ };
1554
+ }
1555
+ /**
1556
+ * Record audio from microphone and transcribe
1557
+ *
1558
+ * @example
1559
+ * ```ts
1560
+ * // Record for 5 seconds and transcribe
1561
+ * const result = await g.listen(5000);
1562
+ * console.log(result.text);
1563
+ *
1564
+ * // Use with voice chat
1565
+ * const userInput = await g.listen(10000);
1566
+ * const response = await g.generate(userInput.text);
1567
+ * await g.speak(response.text);
1568
+ * ```
1569
+ */
1570
+ async listen(durationMs = 5e3, options = {}) {
1571
+ const { Microphone, isSoxAvailable } = await import("./microphone-Bqmoz9_K.mjs");
1572
+ if (!isSoxAvailable()) throw new Error("Microphone recording requires SoX. Install with:\n macOS: brew install sox\n Ubuntu: sudo apt install sox\n Windows: https://sox.sourceforge.net/");
1573
+ options.onProgress?.("Starting microphone...");
1574
+ const mic = new Microphone({ sampleRate: 16e3 });
1575
+ await mic.start();
1576
+ options.onProgress?.(`Recording for ${(durationMs / 1e3).toFixed(1)}s...`);
1577
+ await new Promise((r) => setTimeout(r, durationMs));
1578
+ options.onProgress?.("Processing audio...");
1579
+ const { audio } = await mic.stop();
1580
+ options.onProgress?.("Transcribing...");
1581
+ return this.transcribe(audio, { onProgress: (p) => options.onProgress?.(p.status || "Transcribing...") });
1582
+ }
1583
+ /**
1584
+ * Check if microphone recording is available
1585
+ */
1586
+ async isMicrophoneAvailable() {
1587
+ try {
1588
+ const { isSoxAvailable } = await import("./microphone-Bqmoz9_K.mjs");
1589
+ return isSoxAvailable();
1590
+ } catch {
1591
+ return false;
1592
+ }
1593
+ }
1594
+ /**
1595
+ * Dispose of resources (releases all native WebGPU engines and their devices).
1596
+ * @param _disconnect Accepted for API compatibility; no longer used.
1597
+ */
1598
+ async dispose(_disconnect = false) {
1599
+ if (this.webgpuEngine) {
1600
+ try {
1601
+ this.webgpuEngine.destroy();
1602
+ } catch {}
1603
+ this.webgpuEngine = null;
1604
+ }
1605
+ if (this.nativeEmbedEngine) {
1606
+ try {
1607
+ this.nativeEmbedEngine.destroy();
1608
+ } catch {}
1609
+ this.nativeEmbedEngine = null;
1610
+ this.nativeEmbedRepo = null;
1611
+ }
1612
+ if (this.nativeSTT) {
1613
+ try {
1614
+ this.nativeSTT.destroy?.();
1615
+ } catch {}
1616
+ this.nativeSTT = null;
1617
+ }
1618
+ if (this.nativeTTSEngine) {
1619
+ try {
1620
+ this.nativeTTSEngine.destroy();
1621
+ } catch {}
1622
+ this.nativeTTSEngine = null;
1623
+ }
1624
+ this.currentModel = null;
1625
+ this.modelConfig = null;
1626
+ this.isVisionModel = false;
1627
+ }
1628
+ /**
1629
+ * @deprecated The shared Chrome backend was removed; this is now a no-op.
1630
+ */
1631
+ static async shutdown() {}
1632
+ parseThinking(text) {
1633
+ const match = text.match(/<think>([\s\S]*?)<\/think>/);
1634
+ if (match) return {
1635
+ thinking: match[1].trim(),
1636
+ response: text.replace(/<think>[\s\S]*?<\/think>/, "").trim()
1637
+ };
1638
+ const unclosedMatch = text.match(/<think>([\s\S]*)$/);
1639
+ if (unclosedMatch) {
1640
+ const thinking = unclosedMatch[1].trim();
1641
+ const response = text.replace(/<think>[\s\S]*$/, "").trim();
1642
+ return {
1643
+ thinking: thinking || void 0,
1644
+ response
1645
+ };
1646
+ }
1647
+ return { response: text.replace(/<\/?think>/g, "").trim() };
1648
+ }
1649
+ cleanOutput(text) {
1650
+ return text.replace(/<\|im_end\|>/g, "").replace(/<\|im_start\|>/g, "").replace(/<\|endoftext\|>/g, "").replace(/<\/s>/g, "").replace(/^\/no_think\s*/i, "").replace(/^assistant\s*/i, "").replace(/^\s*\/no_think\s*/gim, "").replace(/^\s*assistant\s*/gim, "").replace(/^(system|user|assistant):\s*/gim, "").trim();
1651
+ }
1652
+ };
1653
+
1654
+ //#endregion
1655
+ export { resolveModel as a, configureGlobalCache as c, listBuiltinModels as i, getGlobalCache as l, BUILTIN_MODELS as n, ResponseCache as o, DEFAULT_MODEL as r, clearGlobalCache as s, Gerbil as t };
1656
+ //# sourceMappingURL=gerbil-BHrJJIa4.mjs.map