@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/README.md +79 -14
  2. package/dist/auto-update-S9s5-g0C.mjs +3 -0
  3. package/dist/browser/index.d.ts +1009 -0
  4. package/dist/browser/index.d.ts.map +1 -0
  5. package/dist/browser/index.js +2492 -0
  6. package/dist/browser/index.js.map +1 -0
  7. package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-CORwaIyC.mjs} +514 -73
  8. package/dist/chrome-backend-CORwaIyC.mjs.map +1 -0
  9. package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-DIKYoWj-.mjs} +1 -1
  10. package/dist/cli.mjs +3359 -647
  11. package/dist/cli.mjs.map +1 -1
  12. package/dist/frameworks/express.d.mts +1 -1
  13. package/dist/frameworks/express.mjs +3 -4
  14. package/dist/frameworks/express.mjs.map +1 -1
  15. package/dist/frameworks/fastify.d.mts +1 -1
  16. package/dist/frameworks/fastify.mjs +2 -3
  17. package/dist/frameworks/fastify.mjs.map +1 -1
  18. package/dist/frameworks/hono.d.mts +1 -1
  19. package/dist/frameworks/hono.mjs +2 -3
  20. package/dist/frameworks/hono.mjs.map +1 -1
  21. package/dist/frameworks/next.d.mts +2 -2
  22. package/dist/frameworks/next.mjs +2 -3
  23. package/dist/frameworks/next.mjs.map +1 -1
  24. package/dist/frameworks/react.d.mts +1 -1
  25. package/dist/frameworks/trpc.d.mts +1 -1
  26. package/dist/frameworks/trpc.mjs +2 -3
  27. package/dist/frameworks/trpc.mjs.map +1 -1
  28. package/dist/gerbil-DJGqq7BX.mjs +4 -0
  29. package/dist/gerbil-DoDGHe6Z.mjs +1631 -0
  30. package/dist/gerbil-DoDGHe6Z.mjs.map +1 -0
  31. package/dist/gerbil-qOTe1nl2.d.mts +431 -0
  32. package/dist/gerbil-qOTe1nl2.d.mts.map +1 -0
  33. package/dist/index.d.mts +411 -9
  34. package/dist/index.d.mts.map +1 -1
  35. package/dist/index.mjs +7 -6
  36. package/dist/index.mjs.map +1 -1
  37. package/dist/integrations/ai-sdk.d.mts +122 -4
  38. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  39. package/dist/integrations/ai-sdk.mjs +238 -11
  40. package/dist/integrations/ai-sdk.mjs.map +1 -1
  41. package/dist/integrations/langchain.d.mts +132 -2
  42. package/dist/integrations/langchain.d.mts.map +1 -1
  43. package/dist/integrations/langchain.mjs +175 -8
  44. package/dist/integrations/langchain.mjs.map +1 -1
  45. package/dist/integrations/llamaindex.d.mts +1 -1
  46. package/dist/integrations/llamaindex.mjs +2 -3
  47. package/dist/integrations/llamaindex.mjs.map +1 -1
  48. package/dist/integrations/mcp-client.mjs +4 -4
  49. package/dist/integrations/mcp-client.mjs.map +1 -1
  50. package/dist/integrations/mcp.d.mts +2 -2
  51. package/dist/integrations/mcp.d.mts.map +1 -1
  52. package/dist/integrations/mcp.mjs +5 -6
  53. package/dist/kokoro-BNTb6egA.mjs +20210 -0
  54. package/dist/kokoro-BNTb6egA.mjs.map +1 -0
  55. package/dist/kokoro-CMOGDSgT.js +20212 -0
  56. package/dist/kokoro-CMOGDSgT.js.map +1 -0
  57. package/dist/{mcp-R8kRLIKb.mjs → mcp-kzDDWIoS.mjs} +10 -37
  58. package/dist/mcp-kzDDWIoS.mjs.map +1 -0
  59. package/dist/microphone-DaMZFRuR.mjs +3 -0
  60. package/dist/{one-liner-BUQR0nqq.mjs → one-liner-DxnNs_JK.mjs} +2 -2
  61. package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-DxnNs_JK.mjs.map} +1 -1
  62. package/dist/repl-DGUw4fCc.mjs +9 -0
  63. package/dist/skills/index.d.mts +305 -14
  64. package/dist/skills/index.d.mts.map +1 -1
  65. package/dist/skills/index.mjs +5 -6
  66. package/dist/skills-DulrOPeP.mjs +1435 -0
  67. package/dist/skills-DulrOPeP.mjs.map +1 -0
  68. package/dist/stt-1WIefHwc.mjs +3 -0
  69. package/dist/stt-CG_7KB_0.mjs +434 -0
  70. package/dist/stt-CG_7KB_0.mjs.map +1 -0
  71. package/dist/stt-Dne6SENv.js +434 -0
  72. package/dist/stt-Dne6SENv.js.map +1 -0
  73. package/dist/{tools-BsiEE6f2.mjs → tools-Bi1P7Xoy.mjs} +6 -7
  74. package/dist/{tools-BsiEE6f2.mjs.map → tools-Bi1P7Xoy.mjs.map} +1 -1
  75. package/dist/transformers.web-DiD1gTwk.js +44695 -0
  76. package/dist/transformers.web-DiD1gTwk.js.map +1 -0
  77. package/dist/transformers.web-u34VxRFM.js +3 -0
  78. package/dist/tts-B1pZMlDv.mjs +3 -0
  79. package/dist/tts-C2FzKuSx.js +725 -0
  80. package/dist/tts-C2FzKuSx.js.map +1 -0
  81. package/dist/tts-CyHhcLtN.mjs +731 -0
  82. package/dist/tts-CyHhcLtN.mjs.map +1 -0
  83. package/dist/types-CiTc7ez3.d.mts +353 -0
  84. package/dist/types-CiTc7ez3.d.mts.map +1 -0
  85. package/dist/{utils-7vXqtq2Q.mjs → utils-CZBZ8dgR.mjs} +1 -1
  86. package/dist/{utils-7vXqtq2Q.mjs.map → utils-CZBZ8dgR.mjs.map} +1 -1
  87. package/docs/ai-sdk.md +137 -21
  88. package/docs/browser.md +241 -2
  89. package/docs/memory.md +72 -0
  90. package/docs/stt.md +494 -0
  91. package/docs/tts.md +569 -0
  92. package/docs/vision.md +396 -0
  93. package/package.json +21 -22
  94. package/dist/auto-update-BbNHbSU1.mjs +0 -3
  95. package/dist/browser/index.d.mts +0 -262
  96. package/dist/browser/index.d.mts.map +0 -1
  97. package/dist/browser/index.mjs +0 -755
  98. package/dist/browser/index.mjs.map +0 -1
  99. package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
  100. package/dist/gerbil-BfnsFWRE.mjs +0 -644
  101. package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
  102. package/dist/gerbil-BjW-z7Fq.mjs +0 -5
  103. package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
  104. package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
  105. package/dist/mcp-R8kRLIKb.mjs.map +0 -1
  106. package/dist/models-DKULvhOr.mjs +0 -136
  107. package/dist/models-DKULvhOr.mjs.map +0 -1
  108. package/dist/models-De2-_GmQ.d.mts +0 -22
  109. package/dist/models-De2-_GmQ.d.mts.map +0 -1
  110. package/dist/skills-D3CEpgDc.mjs +0 -630
  111. package/dist/skills-D3CEpgDc.mjs.map +0 -1
  112. package/dist/types-BS1N92Jt.d.mts +0 -183
  113. package/dist/types-BS1N92Jt.d.mts.map +0 -1
  114. package/dist/{chunk-Ct1HF2bE.mjs → chunk-CkXuGtQK.mjs} +0 -0
@@ -0,0 +1,2492 @@
1
+ //#region src/core/models.ts
2
+ const BUILTIN_MODELS = {
3
+ "qwen3-0.6b": {
4
+ id: "qwen3-0.6b",
5
+ repo: "onnx-community/Qwen3-0.6B-ONNX",
6
+ description: "Qwen3 0.6B - Best balance of speed and quality, supports thinking",
7
+ size: "~400MB",
8
+ contextLength: 32768,
9
+ supportsThinking: true,
10
+ supportsJson: true,
11
+ family: "qwen"
12
+ },
13
+ "qwen2.5-0.5b": {
14
+ id: "qwen2.5-0.5b",
15
+ repo: "onnx-community/Qwen2.5-0.5B-Instruct",
16
+ description: "Qwen2.5 0.5B - Fast and capable",
17
+ size: "~350MB",
18
+ contextLength: 32768,
19
+ supportsThinking: false,
20
+ supportsJson: true,
21
+ family: "qwen"
22
+ },
23
+ "qwen2.5-coder-0.5b": {
24
+ id: "qwen2.5-coder-0.5b",
25
+ repo: "onnx-community/Qwen2.5-Coder-0.5B-Instruct",
26
+ description: "Qwen2.5 Coder 0.5B - Optimized for code",
27
+ size: "~400MB",
28
+ contextLength: 32768,
29
+ supportsThinking: false,
30
+ supportsJson: true,
31
+ family: "qwen"
32
+ },
33
+ "smollm2-360m": {
34
+ id: "smollm2-360m",
35
+ repo: "HuggingFaceTB/SmolLM2-360M-Instruct",
36
+ description: "SmolLM2 360M - Fast, good for simple tasks",
37
+ size: "~250MB",
38
+ contextLength: 8192,
39
+ supportsThinking: false,
40
+ supportsJson: false,
41
+ family: "smollm"
42
+ },
43
+ "smollm2-135m": {
44
+ id: "smollm2-135m",
45
+ repo: "HuggingFaceTB/SmolLM2-135M-Instruct",
46
+ description: "SmolLM2 135M - Fastest, basic generation",
47
+ size: "~100MB",
48
+ contextLength: 8192,
49
+ supportsThinking: false,
50
+ supportsJson: false,
51
+ family: "smollm"
52
+ },
53
+ "phi-3-mini": {
54
+ id: "phi-3-mini",
55
+ repo: "microsoft/Phi-3-mini-4k-instruct-onnx",
56
+ description: "Phi-3 Mini - High quality, larger model",
57
+ size: "~2.1GB",
58
+ contextLength: 4096,
59
+ supportsThinking: false,
60
+ supportsJson: true,
61
+ family: "phi"
62
+ },
63
+ "ministral-3b": {
64
+ id: "ministral-3b",
65
+ repo: "mistralai/Ministral-3-3B-Instruct-2512-ONNX",
66
+ description: "Ministral 3 3B - Vision + Reasoning, 256k context",
67
+ size: "~2.5GB",
68
+ contextLength: 262144,
69
+ supportsThinking: true,
70
+ supportsJson: true,
71
+ supportsVision: true,
72
+ visionEncoderSize: "0.4B",
73
+ family: "mistral"
74
+ }
75
+ };
76
+ /**
77
+ * Parse model identifier and resolve to source
78
+ *
79
+ * Supported formats:
80
+ * - "qwen3-0.6b" (built-in)
81
+ * - "hf:org/model" (HuggingFace shorthand)
82
+ * - "https://huggingface.co/org/model" (full URL)
83
+ * - "file:./path/to/model" (local path)
84
+ */
85
+ function resolveModel(modelId) {
86
+ if (BUILTIN_MODELS[modelId]) return {
87
+ type: "builtin",
88
+ path: BUILTIN_MODELS[modelId].repo
89
+ };
90
+ if (modelId.startsWith("hf:")) return {
91
+ type: "huggingface",
92
+ path: modelId.slice(3)
93
+ };
94
+ if (modelId.startsWith("https://huggingface.co/")) return {
95
+ type: "huggingface",
96
+ path: modelId.replace("https://huggingface.co/", "")
97
+ };
98
+ if (modelId.startsWith("file:")) return {
99
+ type: "local",
100
+ path: modelId.slice(5)
101
+ };
102
+ if (modelId.includes("/")) return {
103
+ type: "huggingface",
104
+ path: modelId
105
+ };
106
+ return {
107
+ type: "huggingface",
108
+ path: modelId
109
+ };
110
+ }
111
+
112
+ //#endregion
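A minimal sketch of how the identifier formats documented above resolve, traced from the branches in `resolveModel`. The function is internal to this bundle, and the local path is a placeholder; the calls are illustrative only.

```ts
// Illustrative expected results, based on the branches in resolveModel above.
resolveModel("qwen3-0.6b");
// -> { type: "builtin", path: "onnx-community/Qwen3-0.6B-ONNX" }

resolveModel("hf:onnx-community/Qwen2.5-0.5B-Instruct");
// -> { type: "huggingface", path: "onnx-community/Qwen2.5-0.5B-Instruct" }

resolveModel("https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct");
// -> { type: "huggingface", path: "HuggingFaceTB/SmolLM2-360M-Instruct" }

resolveModel("file:./models/my-custom-model"); // placeholder path
// -> { type: "local", path: "./models/my-custom-model" }

resolveModel("some-unknown-name");
// -> { type: "huggingface", path: "some-unknown-name" } (fallback branch)
```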
113
+ //#region src/browser/index.ts
114
+ /**
115
+ * Gerbil Browser Support
116
+ *
117
+ * Run LLMs directly in the browser with WebGPU acceleration.
118
+ *
119
+ * @example useChat (React)
120
+ * ```tsx
121
+ * import { useChat } from "@tryhamster/gerbil/browser";
122
+ *
123
+ * function Chat() {
124
+ * const { messages, input, setInput, handleSubmit, isLoading } = useChat();
125
+ *
126
+ * if (isLoading) return <div>Loading model...</div>;
127
+ *
128
+ * return (
129
+ * <form onSubmit={handleSubmit}>
130
+ * {messages.map(m => <div key={m.id}>{m.role}: {m.content}</div>)}
131
+ * <input value={input} onChange={e => setInput(e.target.value)} />
132
+ * </form>
133
+ * );
134
+ * }
135
+ * ```
136
+ *
137
+ * @example useCompletion (React)
138
+ * ```tsx
139
+ * import { useCompletion } from "@tryhamster/gerbil/browser";
140
+ *
141
+ * function App() {
142
+ * const { complete, completion, isLoading } = useCompletion();
143
+ * if (isLoading) return <div>Loading...</div>;
144
+ * return <button onClick={() => complete("Write a haiku")}>{completion}</button>;
145
+ * }
146
+ * ```
147
+ *
148
+ * @example Low-level API
149
+ * ```ts
150
+ * import { createGerbilWorker } from "@tryhamster/gerbil/browser";
151
+ *
152
+ * const gerbil = await createGerbilWorker({
153
+ * modelId: "qwen3-0.6b",
154
+ * onToken: (token) => console.log(token.text),
155
+ * });
156
+ * await gerbil.generate("Hello!");
157
+ * gerbil.terminate();
158
+ * ```
159
+ */
160
+ /**
161
+ * Create a Gerbil worker for streaming WebGPU inference
162
+ *
163
+ * Uses a Web Worker to keep the UI responsive during model loading
164
+ * and text generation, with real-time token streaming.
165
+ */
166
+ async function createGerbilWorker(options = {}) {
167
+ const { modelId = "qwen3-0.6b", onProgress, onToken, onComplete, onError } = options;
168
+ const source = resolveModel(modelId);
169
+ return new Promise((resolve, reject) => {
170
+ const blob = new Blob([`
171
+ import {
172
+ AutoTokenizer,
173
+ AutoModelForCausalLM,
174
+ AutoProcessor,
175
+ AutoModelForImageTextToText,
176
+ RawImage,
177
+ TextStreamer,
178
+ InterruptableStoppingCriteria,
179
+ env,
180
+ } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
181
+
182
+ // Enable IndexedDB caching for browser (prevents re-downloading models)
183
+ env.useBrowserCache = true;
184
+ env.allowLocalModels = false;
185
+
186
+ class ModelPipeline {
187
+ static tokenizer = null;
188
+ static model = null;
189
+ static processor = null;
190
+ static visionModel = null;
191
+ static modelId = "";
192
+ static isVision = false;
193
+
194
+ static async getInstance(modelId, options = {}, progressCallback) {
195
+ if (this.modelId !== modelId) {
196
+ this.tokenizer = null;
197
+ this.model = null;
198
+ this.processor = null;
199
+ this.visionModel = null;
200
+ }
201
+ this.modelId = modelId;
202
+
203
+ // Detect vision models
204
+ this.isVision = options.vision ||
205
+ modelId.toLowerCase().includes("ministral") ||
206
+ modelId.toLowerCase().includes("vision") ||
207
+ modelId.toLowerCase().includes("vlm");
208
+
209
+ const dtype = options.dtype || "q4f16";
210
+ const device = options.device || "webgpu";
211
+
212
+ if (this.isVision) {
213
+ // Load vision model components
214
+ // Note: Don't specify dtype for vision models - let transformers.js pick defaults
215
+ if (!this.processor) {
216
+ this.processor = await AutoProcessor.from_pretrained(modelId, {
217
+ progress_callback: progressCallback,
218
+ });
219
+ }
220
+ if (!this.visionModel) {
221
+ this.visionModel = await AutoModelForImageTextToText.from_pretrained(modelId, {
222
+ device,
223
+ progress_callback: progressCallback,
224
+ });
225
+ }
226
+ return {
227
+ processor: this.processor,
228
+ model: this.visionModel,
229
+ tokenizer: this.processor.tokenizer,
230
+ isVision: true
231
+ };
232
+ } else {
233
+ // Load text-only model components
234
+ if (!this.tokenizer) {
235
+ this.tokenizer = await AutoTokenizer.from_pretrained(modelId, {
236
+ progress_callback: progressCallback,
237
+ });
238
+ }
239
+ if (!this.model) {
240
+ this.model = await AutoModelForCausalLM.from_pretrained(modelId, {
241
+ dtype,
242
+ device,
243
+ progress_callback: progressCallback,
244
+ });
245
+ }
246
+ return {
247
+ tokenizer: this.tokenizer,
248
+ model: this.model,
249
+ isVision: false
250
+ };
251
+ }
252
+ }
253
+ }
254
+
255
+ const stoppingCriteria = new InterruptableStoppingCriteria();
256
+ let pastKeyValuesCache = null;
257
+
258
+ async function load(data) {
259
+ const { modelId, options = {} } = data;
260
+ self.postMessage({ status: "loading", message: "Loading model..." });
261
+
262
+ const downloadState = {
263
+ downloading: new Set(),
264
+ completed: new Set(),
265
+ isDownloading: false,
266
+ };
267
+
268
+ try {
269
+ const result = await ModelPipeline.getInstance(
270
+ modelId,
271
+ options,
272
+ (progress) => {
273
+ if (progress.status === "progress" && progress.file) {
274
+ const pct = Math.round(progress.progress || 0);
275
+ if (pct < 100) {
276
+ downloadState.downloading.add(progress.file);
277
+ downloadState.isDownloading = true;
278
+ } else if (pct === 100) {
279
+ downloadState.downloading.delete(progress.file);
280
+ downloadState.completed.add(progress.file);
281
+ }
282
+ if (downloadState.isDownloading) {
283
+ self.postMessage({
284
+ status: "downloading",
285
+ file: progress.file,
286
+ progress: pct,
287
+ downloadCount: downloadState.downloading.size,
288
+ totalFiles: downloadState.completed.size + downloadState.downloading.size,
289
+ });
290
+ }
291
+ }
292
+ }
293
+ );
294
+
295
+ self.postMessage({ status: "loading", message: "Compiling shaders..." });
296
+
297
+ // Warmup differs for vision vs text models
298
+ if (result.isVision) {
299
+ // Vision models need both text and vision warmup
300
+ // Text warmup first
301
+ const textWarmupInputs = result.tokenizer("hello");
302
+ await result.model.generate({ ...textWarmupInputs, max_new_tokens: 1 });
303
+
304
+ // Vision warmup with synthetic image
305
+ self.postMessage({ status: "loading", message: "Warming up vision encoder..." });
306
+ try {
307
+ // Create a tiny 8x8 test image using OffscreenCanvas
308
+ const canvas = new OffscreenCanvas(8, 8);
309
+ const ctx = canvas.getContext("2d");
310
+ ctx.fillStyle = "red";
311
+ ctx.fillRect(0, 0, 8, 8);
312
+ const blob = await canvas.convertToBlob({ type: "image/png" });
313
+ const warmupImage = await RawImage.fromBlob(blob);
314
+
315
+ // Process with vision pipeline
316
+ const warmupContent = [{ type: "image" }, { type: "text", text: "hi" }];
317
+ const warmupMessages = [{ role: "user", content: warmupContent }];
318
+ const warmupPrompt = result.processor.apply_chat_template(warmupMessages, { add_generation_prompt: true });
319
+ const warmupInputs = await result.processor(warmupImage, warmupPrompt, { add_special_tokens: false });
320
+
321
+ // Run vision warmup generation
322
+ await result.model.generate({
323
+ ...warmupInputs,
324
+ max_new_tokens: 1,
325
+ });
326
+ } catch (warmupErr) {
327
+ console.warn("Vision warmup failed (non-fatal):", warmupErr);
328
+ }
329
+ } else {
330
+ const warmupInputs = result.tokenizer("a");
331
+ await result.model.generate({ ...warmupInputs, max_new_tokens: 1 });
332
+ }
333
+
334
+ self.postMessage({ status: "ready", isVision: result.isVision });
335
+ } catch (error) {
336
+ self.postMessage({ status: "error", error: error.message || String(error) });
337
+ }
338
+ }
339
+
340
+ async function generate(data) {
341
+ const { messages, images = [], options = {} } = data;
342
+ const { maxTokens = 256, temperature = 0.7, topP = 0.9, topK = 20, thinking = false } = options;
343
+
344
+ try {
345
+ const result = await ModelPipeline.getInstance(ModelPipeline.modelId, {});
346
+
347
+ // Route to vision or text generation
348
+ if (result.isVision && images.length > 0) {
349
+ await generateVision(result, messages, images, options);
350
+ } else {
351
+ await generateText(result, messages, options);
352
+ }
353
+ } catch (error) {
354
+ self.postMessage({ status: "error", error: error.message || String(error) });
355
+ }
356
+ }
357
+
358
+ async function generateText(result, messages, options) {
359
+ const { maxTokens = 256, temperature = 0.7, topP = 0.9, topK = 20, thinking = false } = options;
360
+ const { tokenizer, model } = result;
361
+
362
+ const inputs = tokenizer.apply_chat_template(messages, {
363
+ add_generation_prompt: true,
364
+ return_dict: true,
365
+ enable_thinking: thinking,
366
+ });
367
+
368
+ let state = "answering";
369
+ const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] = tokenizer.encode(
370
+ "<think></think>",
371
+ { add_special_tokens: false }
372
+ );
373
+
374
+ let startTime = null;
375
+ let numTokens = 0;
376
+
377
+ const tokenCallback = (tokens) => {
378
+ startTime ??= performance.now();
379
+ numTokens += 1;
380
+ const tokenId = Number(tokens[0]);
381
+ if (tokenId === START_THINKING_TOKEN_ID) state = "thinking";
382
+ else if (tokenId === END_THINKING_TOKEN_ID) state = "answering";
383
+ };
384
+
385
+ const streamCallback = (text) => {
386
+ const tps = startTime ? (numTokens / (performance.now() - startTime)) * 1000 : 0;
387
+ self.postMessage({ status: "token", text, state, numTokens, tps });
388
+ };
389
+
390
+ const streamer = new TextStreamer(tokenizer, {
391
+ skip_prompt: true,
392
+ skip_special_tokens: true,
393
+ callback_function: streamCallback,
394
+ token_callback_function: tokenCallback,
395
+ });
396
+
397
+ self.postMessage({ status: "start" });
398
+
399
+ const { past_key_values, sequences } = await model.generate({
400
+ ...inputs,
401
+ past_key_values: pastKeyValuesCache,
402
+ do_sample: temperature > 0,
403
+ temperature: temperature > 0 ? temperature : undefined,
404
+ top_p: topP,
405
+ top_k: topK,
406
+ max_new_tokens: maxTokens,
407
+ streamer,
408
+ stopping_criteria: stoppingCriteria,
409
+ return_dict_in_generate: true,
410
+ });
411
+
412
+ pastKeyValuesCache = past_key_values;
413
+
414
+ const endTime = performance.now();
415
+ const totalTime = startTime ? endTime - startTime : 0;
416
+ const decoded = tokenizer.batch_decode(sequences, { skip_special_tokens: true });
417
+
418
+ self.postMessage({
419
+ status: "complete",
420
+ text: decoded[0] || "",
421
+ numTokens,
422
+ totalTime,
423
+ tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
424
+ });
425
+ }
426
+
427
+ async function generateVision(result, messages, images, options) {
428
+ const { maxTokens = 2048, temperature = 0.7, topP = 0.9, topK = 20 } = options;
429
+ const { processor, model, tokenizer } = result;
430
+
431
+ self.postMessage({ status: "progress", message: "Preparing vision request..." });
432
+
433
+ // Build message content with image placeholders and text
434
+ const lastMessage = messages[messages.length - 1];
435
+ const content = [];
436
+ for (const _ of images) {
437
+ content.push({ type: "image" });
438
+ }
439
+ content.push({ type: "text", text: lastMessage.content });
440
+
441
+ // For vision models, include a brief system instruction for concise responses
442
+ // Note: Vision processors handle system differently than text models
443
+ const visionMessages = [
444
+ { role: "system", content: "You are a helpful assistant. Be concise and direct in your responses." },
445
+ { role: "user", content }
446
+ ];
447
+
448
+ // Apply chat template with generation prompt
449
+ const chatPrompt = processor.apply_chat_template(visionMessages, {
450
+ add_generation_prompt: true
451
+ });
452
+
453
+ // Load images (handle both string URLs and { source: string } objects)
454
+ self.postMessage({ status: "progress", message: "Loading images..." });
455
+ const loadedImages = await Promise.all(
456
+ images.map(img => {
457
+ const url = typeof img === "string" ? img : img.source;
458
+ return RawImage.fromURL(url);
459
+ })
460
+ );
461
+ self.postMessage({ status: "progress", message: "Processing inputs..." });
462
+
463
+ // Process inputs
464
+ const inputs = await processor(
465
+ loadedImages.length === 1 ? loadedImages[0] : loadedImages,
466
+ chatPrompt,
467
+ { add_special_tokens: false }
468
+ );
469
+ self.postMessage({ status: "progress", message: "Generating response..." });
470
+
471
+ let startTime = null;
472
+ let numTokens = 0;
473
+
474
+ const streamCallback = (text) => {
475
+ startTime ??= performance.now();
476
+ numTokens += 1;
477
+ const tps = (numTokens / (performance.now() - startTime)) * 1000;
478
+ self.postMessage({ status: "token", text, state: "answering", numTokens, tps });
479
+ };
480
+
481
+ const streamer = new TextStreamer(tokenizer, {
482
+ skip_prompt: true,
483
+ skip_special_tokens: true,
484
+ callback_function: streamCallback,
485
+ });
486
+
487
+ self.postMessage({ status: "start" });
488
+
489
+ const outputs = await model.generate({
490
+ ...inputs,
491
+ max_new_tokens: maxTokens,
492
+ do_sample: temperature > 0,
493
+ temperature: temperature > 0 ? temperature : undefined,
494
+ top_p: topP,
495
+ top_k: topK,
496
+ streamer,
497
+ stopping_criteria: stoppingCriteria,
498
+ });
499
+
500
+ // Decode output (skip prompt)
501
+ const inputLength = inputs.input_ids.dims?.at(-1) || 0;
502
+ const decoded = processor.batch_decode(
503
+ outputs.slice(null, [inputLength, null]),
504
+ { skip_special_tokens: true }
505
+ );
506
+
507
+ const endTime = performance.now();
508
+ const totalTime = startTime ? endTime - startTime : 0;
509
+
510
+ self.postMessage({
511
+ status: "complete",
512
+ text: decoded[0] || "",
513
+ numTokens,
514
+ totalTime,
515
+ tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
516
+ });
517
+ }
518
+
519
+ self.addEventListener("message", async (e) => {
520
+ const { type, ...data } = e.data;
521
+ switch (type) {
522
+ case "load": await load(data); break;
523
+ case "generate": stoppingCriteria.reset(); await generate(data); break;
524
+ case "interrupt": stoppingCriteria.interrupt(); break;
525
+ case "reset": pastKeyValuesCache = null; stoppingCriteria.reset(); break;
526
+ }
527
+ });
528
+
529
+ self.postMessage({ status: "init" });
530
+ `], { type: "application/javascript" });
531
+ const workerUrl = URL.createObjectURL(blob);
532
+ const worker = new Worker(workerUrl, { type: "module" });
533
+ let isReady = false;
534
+ let currentResolve = null;
535
+ let currentReject = null;
536
+ let _generatedText = "";
537
+ worker.onmessage = (e) => {
538
+ const msg = e.data;
539
+ switch (msg.status) {
540
+ case "init":
541
+ worker.postMessage({
542
+ type: "load",
543
+ modelId: source.path
544
+ });
545
+ break;
546
+ case "loading":
547
+ case "downloading":
548
+ onProgress?.(msg);
549
+ break;
550
+ case "ready":
551
+ isReady = true;
552
+ onProgress?.(msg);
553
+ resolve(gerbilWorker);
554
+ break;
555
+ case "start":
556
+ _generatedText = "";
557
+ break;
558
+ case "token":
559
+ _generatedText += msg.text;
560
+ onToken?.(msg);
561
+ break;
562
+ case "complete":
563
+ onComplete?.(msg);
564
+ currentResolve?.(msg.text);
565
+ currentResolve = null;
566
+ currentReject = null;
567
+ break;
568
+ case "error":
569
+ onError?.(msg.error);
570
+ onProgress?.({
571
+ status: "error",
572
+ error: msg.error
573
+ });
574
+ if (currentReject) {
575
+ currentReject(new Error(msg.error));
576
+ currentResolve = null;
577
+ currentReject = null;
578
+ } else reject(new Error(msg.error));
579
+ break;
580
+ }
581
+ };
582
+ worker.onerror = (e) => {
583
+ const error = e.message || "Worker error";
584
+ onError?.(error);
585
+ reject(new Error(error));
586
+ };
587
+ const gerbilWorker = {
588
+ generate: (prompt, options$1 = {}) => new Promise((res, rej) => {
589
+ currentResolve = res;
590
+ currentReject = rej;
591
+ const system = options$1.system || "You are a helpful assistant.";
592
+ const messages = options$1.history ? [{
593
+ role: "system",
594
+ content: system
595
+ }, ...options$1.history] : [{
596
+ role: "system",
597
+ content: system
598
+ }, {
599
+ role: "user",
600
+ content: prompt
601
+ }];
602
+ if (options$1.history) worker.postMessage({ type: "reset" });
603
+ worker.postMessage({
604
+ type: "generate",
605
+ messages,
606
+ images: options$1.images || [],
607
+ options: {
608
+ maxTokens: options$1.maxTokens ?? (options$1.images?.length ? 2048 : 256),
609
+ temperature: options$1.temperature ?? .7,
610
+ topP: options$1.topP ?? .9,
611
+ topK: options$1.topK ?? 20,
612
+ thinking: options$1.thinking ?? false
613
+ }
614
+ });
615
+ }),
616
+ interrupt: () => {
617
+ worker.postMessage({ type: "interrupt" });
618
+ },
619
+ reset: () => {
620
+ worker.postMessage({ type: "reset" });
621
+ },
622
+ terminate: () => {
623
+ worker.terminate();
624
+ URL.revokeObjectURL(workerUrl);
625
+ },
626
+ isReady: () => isReady
627
+ };
628
+ });
629
+ }
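A minimal sketch of consuming the callbacks wired above. The message shapes follow the `postMessage` calls inside the embedded worker (`downloading`, `ready`, `token`, `complete`); the `#output` element is an assumed placeholder.

```ts
import { createGerbilWorker } from "@tryhamster/gerbil/browser";

// Assumed placeholder element for streamed output.
const output = document.querySelector("#output") as HTMLElement;

const gerbil = await createGerbilWorker({
  modelId: "qwen3-0.6b",
  onProgress: (p) => {
    if (p.status === "downloading") console.log(`${p.file}: ${p.progress}%`);
    if (p.status === "ready") console.log("Model ready");
  },
  onToken: (t) => {
    // t.state is "thinking" or "answering"; t.tps is a rolling tokens/sec estimate
    if (t.state === "answering") output.textContent += t.text;
  },
  onComplete: (r) => console.log(`${r.numTokens} tokens in ${Math.round(r.totalTime)}ms`),
  onError: (err) => console.error(err),
});

const text = await gerbil.generate("Summarize WebGPU in one sentence.", {
  maxTokens: 128,
  thinking: false,
});
console.log(text);
gerbil.terminate();
```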
630
+ /**
631
+ * React hook for chat with local LLM
632
+ *
633
+ * @example
634
+ * ```tsx
635
+ * import { useChat } from "@tryhamster/gerbil/browser";
636
+ *
637
+ * function Chat() {
638
+ * const { messages, input, setInput, handleSubmit, isLoading, isGenerating } = useChat();
639
+ *
640
+ * if (isLoading) return <div>Loading model...</div>;
641
+ *
642
+ * return (
643
+ * <div>
644
+ * {messages.map(m => (
645
+ * <div key={m.id}>{m.role}: {m.content}</div>
646
+ * ))}
647
+ * <form onSubmit={handleSubmit}>
648
+ * <input value={input} onChange={e => setInput(e.target.value)} />
649
+ * <button disabled={isGenerating}>Send</button>
650
+ * </form>
651
+ * </div>
652
+ * );
653
+ * }
654
+ * ```
655
+ */
656
+ function useChat(options = {}) {
657
+ const React = globalThis.React;
658
+ if (!React) throw new Error("useChat requires React. Import React before using this hook.");
659
+ const { useState, useEffect, useRef, useCallback } = React;
660
+ const { model = "qwen3-0.6b", system = "You are a helpful assistant.", thinking: enableThinking = false, maxTokens = 512, temperature = .7, initialMessages = [], autoLoad = false, onReady, onError } = options;
661
+ const [messages, setMessages] = useState(initialMessages);
662
+ const [input, setInput] = useState("");
663
+ const [isLoading, setIsLoading] = useState(autoLoad);
664
+ const [loadingProgress, setLoadingProgress] = useState(null);
665
+ const [isGenerating, setIsGenerating] = useState(false);
666
+ const [thinking, setThinking] = useState("");
667
+ const [currentResponse, setCurrentResponse] = useState("");
668
+ const [tps, setTps] = useState(0);
669
+ const [error, setError] = useState(null);
670
+ const [isReady, setIsReady] = useState(false);
671
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
672
+ const [attachedImages, setAttachedImages] = useState([]);
673
+ const workerRef = useRef(null);
674
+ const messageIdRef = useRef(0);
675
+ const mountedRef = useRef(true);
676
+ const load = useCallback(() => {
677
+ if (workerRef.current || isLoading) return;
678
+ setIsLoading(true);
679
+ setShouldLoad(true);
680
+ }, [isLoading]);
681
+ useEffect(() => {
682
+ if (!shouldLoad) return;
683
+ if (!isWebGPUSupported()) {
684
+ setError("WebGPU not supported. Use Chrome/Edge 113+.");
685
+ setIsLoading(false);
686
+ onError?.("WebGPU not supported");
687
+ return;
688
+ }
689
+ mountedRef.current = true;
690
+ createGerbilWorker({
691
+ modelId: model,
692
+ onProgress: (p) => {
693
+ if (!mountedRef.current) return;
694
+ setLoadingProgress(p);
695
+ if (p.status === "ready") {
696
+ setIsLoading(false);
697
+ setIsReady(true);
698
+ onReady?.();
699
+ }
700
+ },
701
+ onToken: (token) => {
702
+ if (!mountedRef.current) return;
703
+ setTps(token.tps);
704
+ if (token.state === "thinking") setThinking((t) => t + token.text);
705
+ else setCurrentResponse((r) => r + token.text);
706
+ },
707
+ onComplete: () => {
708
+ if (!mountedRef.current) return;
709
+ setIsGenerating(false);
710
+ },
711
+ onError: (err) => {
712
+ if (!mountedRef.current) return;
713
+ setError(err);
714
+ setIsGenerating(false);
715
+ onError?.(err);
716
+ }
717
+ }).then((worker) => {
718
+ if (mountedRef.current) workerRef.current = worker;
719
+ else worker.terminate();
720
+ }).catch((err) => {
721
+ if (mountedRef.current) {
722
+ setError(err.message);
723
+ setIsLoading(false);
724
+ onError?.(err.message);
725
+ }
726
+ });
727
+ return () => {
728
+ mountedRef.current = false;
729
+ workerRef.current?.terminate();
730
+ };
731
+ }, [model, shouldLoad]);
732
+ useEffect(() => {
733
+ if (!isGenerating && currentResponse) {
734
+ setMessages((msgs) => {
735
+ if (msgs.at(-1)?.role === "assistant") return msgs.map((m, i) => i === msgs.length - 1 ? {
736
+ ...m,
737
+ content: currentResponse,
738
+ thinking: thinking || void 0
739
+ } : m);
740
+ return msgs;
741
+ });
742
+ setCurrentResponse("");
743
+ setThinking("");
744
+ }
745
+ }, [
746
+ isGenerating,
747
+ currentResponse,
748
+ thinking
749
+ ]);
750
+ const pendingMessageRef = useRef(null);
751
+ const pendingImagesRef = useRef([]);
752
+ const attachImage = useCallback((imageUrl) => {
753
+ setAttachedImages((imgs) => [...imgs, imageUrl]);
754
+ }, []);
755
+ const removeImage = useCallback((index) => {
756
+ setAttachedImages((imgs) => imgs.filter((_, i) => i !== index));
757
+ }, []);
758
+ const clearImages = useCallback(() => {
759
+ setAttachedImages([]);
760
+ }, []);
761
+ const sendMessageWithImages = useCallback((text, images) => {
762
+ if (!text.trim() || isGenerating) return;
763
+ messageIdRef.current += 1;
764
+ const userMessage = {
765
+ id: `msg-${messageIdRef.current}`,
766
+ role: "user",
767
+ content: text.trim(),
768
+ images: images.length > 0 ? images : void 0
769
+ };
770
+ messageIdRef.current += 1;
771
+ const assistantMessage = {
772
+ id: `msg-${messageIdRef.current}`,
773
+ role: "assistant",
774
+ content: ""
775
+ };
776
+ setMessages((msgs) => [
777
+ ...msgs,
778
+ userMessage,
779
+ assistantMessage
780
+ ]);
781
+ setCurrentResponse("");
782
+ setThinking("");
783
+ if (!workerRef.current) {
784
+ pendingMessageRef.current = text.trim();
785
+ pendingImagesRef.current = images;
786
+ load();
787
+ return;
788
+ }
789
+ setIsGenerating(true);
790
+ workerRef.current.generate(text.trim(), {
791
+ system,
792
+ thinking: enableThinking,
793
+ maxTokens: images.length > 0 ? Math.max(maxTokens, 2048) : maxTokens,
794
+ temperature,
795
+ images: images.length > 0 ? images : void 0
796
+ });
797
+ }, [
798
+ isGenerating,
799
+ system,
800
+ enableThinking,
801
+ maxTokens,
802
+ temperature,
803
+ load
804
+ ]);
805
+ const handleSubmit = useCallback((e) => {
806
+ e?.preventDefault?.();
807
+ if (!input.trim() || isGenerating) return;
808
+ sendMessageWithImages(input, attachedImages);
809
+ setInput("");
810
+ setAttachedImages([]);
811
+ }, [
812
+ input,
813
+ isGenerating,
814
+ attachedImages,
815
+ sendMessageWithImages
816
+ ]);
817
+ const sendWithImages = useCallback((text, images) => {
818
+ sendMessageWithImages(text, images);
819
+ }, [sendMessageWithImages]);
820
+ useEffect(() => {
821
+ if (isReady && pendingMessageRef.current && workerRef.current) {
822
+ const pendingContent = pendingMessageRef.current;
823
+ const pendingImages = pendingImagesRef.current;
824
+ pendingMessageRef.current = null;
825
+ pendingImagesRef.current = [];
826
+ setIsGenerating(true);
827
+ workerRef.current.generate(pendingContent, {
828
+ system,
829
+ thinking: enableThinking,
830
+ maxTokens: pendingImages.length > 0 ? Math.max(maxTokens, 2048) : maxTokens,
831
+ temperature,
832
+ images: pendingImages.length > 0 ? pendingImages : void 0
833
+ });
834
+ }
835
+ }, [
836
+ isReady,
837
+ system,
838
+ enableThinking,
839
+ maxTokens,
840
+ temperature
841
+ ]);
842
+ const stop = useCallback(() => {
843
+ workerRef.current?.interrupt();
844
+ setIsGenerating(false);
845
+ }, []);
846
+ const clear = useCallback(() => {
847
+ workerRef.current?.reset();
848
+ setMessages([]);
849
+ setCurrentResponse("");
850
+ setThinking("");
851
+ setAttachedImages([]);
852
+ }, []);
853
+ return {
854
+ messages: messages.map((m, i) => {
855
+ if (i === messages.length - 1 && m.role === "assistant" && isGenerating) return {
856
+ ...m,
857
+ content: currentResponse,
858
+ thinking: thinking || void 0
859
+ };
860
+ return m;
861
+ }),
862
+ input,
863
+ setInput,
864
+ handleSubmit,
865
+ isLoading,
866
+ loadingProgress,
867
+ isGenerating,
868
+ thinking,
869
+ stop,
870
+ clear,
871
+ tps,
872
+ isReady,
873
+ error,
874
+ load,
875
+ attachedImages,
876
+ attachImage,
877
+ removeImage,
878
+ clearImages,
879
+ sendWithImages
880
+ };
881
+ }
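A sketch of the image-attachment path the hook returns (`attachImage` / `sendWithImages`), which the JSDoc example above does not cover. It assumes a vision-capable built-in such as "ministral-3b"; the image URL is a placeholder.

```tsx
import { useChat } from "@tryhamster/gerbil/browser";

function VisionChat() {
  const {
    messages,
    attachedImages,
    attachImage,
    clearImages,
    sendWithImages,
    isGenerating,
  } = useChat({ model: "ministral-3b" });

  return (
    <div>
      {messages.map((m) => (
        <div key={m.id}>
          {m.role}: {m.content}
        </div>
      ))}
      {/* Placeholder URL; any fetchable image URL works (loaded via RawImage.fromURL in the worker) */}
      <button onClick={() => attachImage("https://example.com/photo.png")}>
        Attach image ({attachedImages.length})
      </button>
      <button
        disabled={isGenerating || attachedImages.length === 0}
        onClick={() => {
          sendWithImages("Describe this image.", attachedImages);
          clearImages();
        }}
      >
        Ask
      </button>
    </div>
  );
}
```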
882
+ /**
883
+ * React hook for text completion with local LLM
884
+ *
885
+ * @example
886
+ * ```tsx
887
+ * import { useCompletion } from "@tryhamster/gerbil/browser";
888
+ *
889
+ * function App() {
890
+ * const { complete, completion, isLoading, isGenerating } = useCompletion();
891
+ *
892
+ * if (isLoading) return <div>Loading...</div>;
893
+ *
894
+ * return (
895
+ * <div>
896
+ * <button onClick={() => complete("Write a haiku")}>Generate</button>
897
+ * <p>{completion}</p>
898
+ * </div>
899
+ * );
900
+ * }
901
+ * ```
902
+ */
903
+ function useCompletion(options = {}) {
904
+ const React = globalThis.React;
905
+ if (!React) throw new Error("useCompletion requires React. Import React before using this hook.");
906
+ const { useState, useEffect, useRef, useCallback } = React;
907
+ const { model = "qwen3-0.6b", system = "You are a helpful assistant.", thinking: enableThinking = false, maxTokens = 512, temperature = .7, autoLoad = false, onReady, onError } = options;
908
+ const [completion, setCompletion] = useState("");
909
+ const [thinking, setThinking] = useState("");
910
+ const [isLoading, setIsLoading] = useState(autoLoad);
911
+ const [loadingProgress, setLoadingProgress] = useState(null);
912
+ const [isGenerating, setIsGenerating] = useState(false);
913
+ const [tps, setTps] = useState(0);
914
+ const [error, setError] = useState(null);
915
+ const [isReady, setIsReady] = useState(false);
916
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
917
+ const workerRef = useRef(null);
918
+ const resolveRef = useRef(null);
919
+ const rejectRef = useRef(null);
920
+ const pendingPromptRef = useRef(null);
921
+ const pendingImagesRef = useRef(void 0);
922
+ const mountedRef = useRef(true);
923
+ const load = useCallback(() => {
924
+ if (workerRef.current || isLoading) return;
925
+ setIsLoading(true);
926
+ setShouldLoad(true);
927
+ }, [isLoading]);
928
+ useEffect(() => {
929
+ if (!shouldLoad) return;
930
+ if (!isWebGPUSupported()) {
931
+ setError("WebGPU not supported. Use Chrome/Edge 113+.");
932
+ setIsLoading(false);
933
+ onError?.("WebGPU not supported");
934
+ return;
935
+ }
936
+ mountedRef.current = true;
937
+ createGerbilWorker({
938
+ modelId: model,
939
+ onProgress: (p) => {
940
+ if (!mountedRef.current) return;
941
+ setLoadingProgress(p);
942
+ if (p.status === "ready") {
943
+ setIsLoading(false);
944
+ setIsReady(true);
945
+ onReady?.();
946
+ }
947
+ },
948
+ onToken: (token) => {
949
+ if (!mountedRef.current) return;
950
+ setTps(token.tps);
951
+ if (token.state === "thinking") setThinking((t) => t + token.text);
952
+ else setCompletion((c) => c + token.text);
953
+ },
954
+ onComplete: (result) => {
955
+ if (!mountedRef.current) return;
956
+ setIsGenerating(false);
957
+ resolveRef.current?.(result.text);
958
+ resolveRef.current = null;
959
+ },
960
+ onError: (err) => {
961
+ if (!mountedRef.current) return;
962
+ setError(err);
963
+ setIsGenerating(false);
964
+ onError?.(err);
965
+ }
966
+ }).then((worker) => {
967
+ if (mountedRef.current) workerRef.current = worker;
968
+ else worker.terminate();
969
+ }).catch((err) => {
970
+ if (mountedRef.current) {
971
+ setError(err.message);
972
+ setIsLoading(false);
973
+ onError?.(err.message);
974
+ }
975
+ });
976
+ return () => {
977
+ mountedRef.current = false;
978
+ workerRef.current?.terminate();
979
+ };
980
+ }, [model, shouldLoad]);
981
+ const complete = useCallback((prompt, completeOptions) => {
982
+ return new Promise((resolve, reject) => {
983
+ setCompletion("");
984
+ setThinking("");
985
+ resolveRef.current = resolve;
986
+ rejectRef.current = reject;
987
+ if (!workerRef.current) {
988
+ pendingPromptRef.current = prompt;
989
+ pendingImagesRef.current = completeOptions?.images;
990
+ load();
991
+ return;
992
+ }
993
+ setIsGenerating(true);
994
+ workerRef.current.generate(prompt, {
995
+ system,
996
+ thinking: enableThinking,
997
+ maxTokens,
998
+ temperature,
999
+ images: completeOptions?.images
1000
+ });
1001
+ });
1002
+ }, [
1003
+ system,
1004
+ enableThinking,
1005
+ maxTokens,
1006
+ temperature,
1007
+ load
1008
+ ]);
1009
+ useEffect(() => {
1010
+ if (isReady && pendingPromptRef.current && workerRef.current) {
1011
+ const pendingPrompt = pendingPromptRef.current;
1012
+ const pendingImages = pendingImagesRef.current;
1013
+ pendingPromptRef.current = null;
1014
+ pendingImagesRef.current = void 0;
1015
+ setIsGenerating(true);
1016
+ workerRef.current.generate(pendingPrompt, {
1017
+ system,
1018
+ thinking: enableThinking,
1019
+ maxTokens,
1020
+ temperature,
1021
+ images: pendingImages
1022
+ });
1023
+ }
1024
+ }, [
1025
+ isReady,
1026
+ system,
1027
+ enableThinking,
1028
+ maxTokens,
1029
+ temperature
1030
+ ]);
1031
+ return {
1032
+ completion,
1033
+ thinking,
1034
+ complete,
1035
+ isLoading,
1036
+ loadingProgress,
1037
+ isGenerating,
1038
+ stop: useCallback(() => {
1039
+ workerRef.current?.interrupt();
1040
+ setIsGenerating(false);
1041
+ }, []),
1042
+ tps,
1043
+ isReady,
1044
+ error,
1045
+ load
1046
+ };
1047
+ }
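A sketch of thinking-mode completion: `complete()` resolves with the final text while `thinking` and `completion` stream in, so it can also be awaited imperatively. Assumes the default "qwen3-0.6b", which supports thinking.

```tsx
import { useCompletion } from "@tryhamster/gerbil/browser";

function Reasoner() {
  const { complete, completion, thinking, isGenerating } = useCompletion({
    model: "qwen3-0.6b",
    thinking: true,
    maxTokens: 512,
  });

  return (
    <div>
      <button
        disabled={isGenerating}
        onClick={async () => {
          // Resolves once the worker posts "complete"
          const answer = await complete("Is 97 prime? Answer briefly.");
          console.log("final text:", answer);
        }}
      >
        Solve
      </button>
      {thinking && <pre>{thinking}</pre>}
      <p>{completion}</p>
    </div>
  );
}
```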
1048
+ /** Kokoro voice definitions (24kHz, high quality) */
1049
+ const KOKORO_BROWSER_VOICES = [
1050
+ {
1051
+ id: "af_heart",
1052
+ name: "Heart",
1053
+ gender: "female",
1054
+ language: "en-us",
1055
+ description: "American female, highest quality (Grade A)"
1056
+ },
1057
+ {
1058
+ id: "af_bella",
1059
+ name: "Bella",
1060
+ gender: "female",
1061
+ language: "en-us",
1062
+ description: "American female, warm and friendly (Grade A-)"
1063
+ },
1064
+ {
1065
+ id: "af_nicole",
1066
+ name: "Nicole",
1067
+ gender: "female",
1068
+ language: "en-us",
1069
+ description: "American female, soft and gentle"
1070
+ },
1071
+ {
1072
+ id: "af_sarah",
1073
+ name: "Sarah",
1074
+ gender: "female",
1075
+ language: "en-us",
1076
+ description: "American female, clear and professional"
1077
+ },
1078
+ {
1079
+ id: "af_sky",
1080
+ name: "Sky",
1081
+ gender: "female",
1082
+ language: "en-us",
1083
+ description: "American female, young and energetic"
1084
+ },
1085
+ {
1086
+ id: "af_alloy",
1087
+ name: "Alloy",
1088
+ gender: "female",
1089
+ language: "en-us",
1090
+ description: "American female"
1091
+ },
1092
+ {
1093
+ id: "af_aoede",
1094
+ name: "Aoede",
1095
+ gender: "female",
1096
+ language: "en-us",
1097
+ description: "American female, mythical"
1098
+ },
1099
+ {
1100
+ id: "af_jessica",
1101
+ name: "Jessica",
1102
+ gender: "female",
1103
+ language: "en-us",
1104
+ description: "American female"
1105
+ },
1106
+ {
1107
+ id: "af_kore",
1108
+ name: "Kore",
1109
+ gender: "female",
1110
+ language: "en-us",
1111
+ description: "American female"
1112
+ },
1113
+ {
1114
+ id: "af_nova",
1115
+ name: "Nova",
1116
+ gender: "female",
1117
+ language: "en-us",
1118
+ description: "American female"
1119
+ },
1120
+ {
1121
+ id: "af_river",
1122
+ name: "River",
1123
+ gender: "female",
1124
+ language: "en-us",
1125
+ description: "American female"
1126
+ },
1127
+ {
1128
+ id: "am_fenrir",
1129
+ name: "Fenrir",
1130
+ gender: "male",
1131
+ language: "en-us",
1132
+ description: "American male, best quality"
1133
+ },
1134
+ {
1135
+ id: "am_michael",
1136
+ name: "Michael",
1137
+ gender: "male",
1138
+ language: "en-us",
1139
+ description: "American male, warm and friendly"
1140
+ },
1141
+ {
1142
+ id: "am_adam",
1143
+ name: "Adam",
1144
+ gender: "male",
1145
+ language: "en-us",
1146
+ description: "American male"
1147
+ },
1148
+ {
1149
+ id: "am_echo",
1150
+ name: "Echo",
1151
+ gender: "male",
1152
+ language: "en-us",
1153
+ description: "American male"
1154
+ },
1155
+ {
1156
+ id: "am_eric",
1157
+ name: "Eric",
1158
+ gender: "male",
1159
+ language: "en-us",
1160
+ description: "American male"
1161
+ },
1162
+ {
1163
+ id: "am_liam",
1164
+ name: "Liam",
1165
+ gender: "male",
1166
+ language: "en-us",
1167
+ description: "American male"
1168
+ },
1169
+ {
1170
+ id: "am_onyx",
1171
+ name: "Onyx",
1172
+ gender: "male",
1173
+ language: "en-us",
1174
+ description: "American male"
1175
+ },
1176
+ {
1177
+ id: "am_puck",
1178
+ name: "Puck",
1179
+ gender: "male",
1180
+ language: "en-us",
1181
+ description: "American male"
1182
+ },
1183
+ {
1184
+ id: "am_santa",
1185
+ name: "Santa",
1186
+ gender: "male",
1187
+ language: "en-us",
1188
+ description: "American male, festive"
1189
+ },
1190
+ {
1191
+ id: "bf_emma",
1192
+ name: "Emma",
1193
+ gender: "female",
1194
+ language: "en-gb",
1195
+ description: "British female, elegant and clear"
1196
+ },
1197
+ {
1198
+ id: "bf_isabella",
1199
+ name: "Isabella",
1200
+ gender: "female",
1201
+ language: "en-gb",
1202
+ description: "British female, sophisticated"
1203
+ },
1204
+ {
1205
+ id: "bf_alice",
1206
+ name: "Alice",
1207
+ gender: "female",
1208
+ language: "en-gb",
1209
+ description: "British female"
1210
+ },
1211
+ {
1212
+ id: "bf_lily",
1213
+ name: "Lily",
1214
+ gender: "female",
1215
+ language: "en-gb",
1216
+ description: "British female"
1217
+ },
1218
+ {
1219
+ id: "bm_george",
1220
+ name: "George",
1221
+ gender: "male",
1222
+ language: "en-gb",
1223
+ description: "British male, distinguished"
1224
+ },
1225
+ {
1226
+ id: "bm_lewis",
1227
+ name: "Lewis",
1228
+ gender: "male",
1229
+ language: "en-gb",
1230
+ description: "British male, friendly"
1231
+ },
1232
+ {
1233
+ id: "bm_daniel",
1234
+ name: "Daniel",
1235
+ gender: "male",
1236
+ language: "en-gb",
1237
+ description: "British male"
1238
+ },
1239
+ {
1240
+ id: "bm_fable",
1241
+ name: "Fable",
1242
+ gender: "male",
1243
+ language: "en-gb",
1244
+ description: "British male"
1245
+ }
1246
+ ];
1247
+ /** Supertonic voice definitions (44.1kHz, faster) */
1248
+ const SUPERTONIC_BROWSER_VOICES = [
1249
+ {
1250
+ id: "F1",
1251
+ name: "Female 1",
1252
+ gender: "female",
1253
+ language: "en",
1254
+ description: "Female voice 1 - Clear and natural"
1255
+ },
1256
+ {
1257
+ id: "F2",
1258
+ name: "Female 2",
1259
+ gender: "female",
1260
+ language: "en",
1261
+ description: "Female voice 2 - Warm and expressive"
1262
+ },
1263
+ {
1264
+ id: "M1",
1265
+ name: "Male 1",
1266
+ gender: "male",
1267
+ language: "en",
1268
+ description: "Male voice 1 - Deep and confident"
1269
+ },
1270
+ {
1271
+ id: "M2",
1272
+ name: "Male 2",
1273
+ gender: "male",
1274
+ language: "en",
1275
+ description: "Male voice 2 - Friendly and casual"
1276
+ }
1277
+ ];
1278
+ /** TTS model configuration */
1279
+ const TTS_MODELS = {
1280
+ "kokoro-82m": {
1281
+ repo: "onnx-community/Kokoro-82M-v1.0-ONNX",
1282
+ defaultVoice: "af_heart",
1283
+ sampleRate: 24e3,
1284
+ voices: KOKORO_BROWSER_VOICES
1285
+ },
1286
+ "supertonic-66m": {
1287
+ repo: "onnx-community/Supertonic-TTS-ONNX",
1288
+ defaultVoice: "F1",
1289
+ sampleRate: 44100,
1290
+ voices: SUPERTONIC_BROWSER_VOICES
1291
+ }
1292
+ };
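The voice tables above are plain data, and `listVoices()` on the `useSpeech` hook returns the same entries, so a UI can filter them before handing an id to `setVoice()`. A small sketch (whether the constant itself is re-exported is not shown in this diff):

```ts
// Sketch: pick the British male Kokoro voices from the table above.
const britishMale = KOKORO_BROWSER_VOICES
  .filter((v) => v.language === "en-gb" && v.gender === "male")
  .map((v) => v.id);
// -> ["bm_george", "bm_lewis", "bm_daniel", "bm_fable"]
```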
1293
+ /**
1294
+ * React hook for text-to-speech with Web Audio API playback
1295
+ *
1296
+ * Supports both Kokoro (24kHz, high quality) and Supertonic (44.1kHz, faster).
1297
+ *
1298
+ * @example
1299
+ * ```tsx
1300
+ * import { useSpeech } from "@tryhamster/gerbil/browser";
1301
+ *
1302
+ * function App() {
1303
+ * // Default: Kokoro TTS
1304
+ * const { speak, stop, isLoading, isSpeaking, listVoices, setVoice } = useSpeech();
1305
+ *
1306
+ * // Or use Supertonic (44.1kHz, faster)
1307
+ * // const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
1308
+ *
1309
+ * if (isLoading) return <div>Loading TTS...</div>;
1310
+ *
1311
+ * return (
1312
+ * <div>
1313
+ * <select onChange={e => setVoice(e.target.value)}>
1314
+ * {listVoices().map(v => (
1315
+ * <option key={v.id} value={v.id}>{v.name}</option>
1316
+ * ))}
1317
+ * </select>
1318
+ * <button onClick={() => speak("Hello world!")}>
1319
+ * {isSpeaking ? "Speaking..." : "Speak"}
1320
+ * </button>
1321
+ * {isSpeaking && <button onClick={stop}>Stop</button>}
1322
+ * </div>
1323
+ * );
1324
+ * }
1325
+ * ```
1326
+ */
1327
+ function useSpeech(options = {}) {
1328
+ const React = globalThis.React;
1329
+ if (!React) throw new Error("useSpeech requires React. Import React before using this hook.");
1330
+ const { useState, useEffect, useRef, useCallback } = React;
1331
+ const { model: modelId = "kokoro-82m", speed: defaultSpeed = 1, autoLoad = false, onReady, onError, onStart, onEnd } = options;
1332
+ const modelConfig = TTS_MODELS[modelId];
1333
+ const defaultVoice = options.voice || modelConfig.defaultVoice;
1334
+ const [isLoading, setIsLoading] = useState(autoLoad);
1335
+ const [loadingProgress, setLoadingProgress] = useState(null);
1336
+ const [isSpeaking, setIsSpeaking] = useState(false);
1337
+ const [isReady, setIsReady] = useState(false);
1338
+ const [error, setError] = useState(null);
1339
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
1340
+ const [currentVoice, setCurrentVoice] = useState(defaultVoice);
1341
+ const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
1342
+ const ttsRef = useRef(null);
1343
+ const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
1344
+ const audioContextRef = useRef(null);
1345
+ const sourceNodeRef = useRef(null);
1346
+ const mountedRef = useRef(true);
1347
+ const modelIdRef = useRef(modelId);
1348
+ const listVoices = useCallback(() => {
1349
+ return modelConfig.voices;
1350
+ }, [modelConfig.voices]);
1351
+ const load = useCallback(() => {
1352
+ if (ttsRef.current || isLoading) return;
1353
+ setIsLoading(true);
1354
+ setShouldLoad(true);
1355
+ }, [isLoading]);
1356
+ useEffect(() => {
1357
+ if (!shouldLoad) return;
1358
+ mountedRef.current = true;
1359
+ modelIdRef.current = modelId;
1360
+ const initTTS = async () => {
1361
+ try {
1362
+ const isSupertonic = modelId === "supertonic-66m";
1363
+ const config = TTS_MODELS[modelId];
1364
+ setLoadingProgress({
1365
+ status: "loading",
1366
+ message: `Loading ${isSupertonic ? "Supertonic" : "Kokoro"} TTS...`
1367
+ });
1368
+ if (isSupertonic) {
1369
+ const { pipeline, env } = await import("../transformers.web-u34VxRFM.js");
1370
+ if (env.backends?.onnx?.wasm) env.backends.onnx.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/";
1371
+ const tts = await pipeline("text-to-speech", config.repo, {
1372
+ device: "webgpu",
1373
+ progress_callback: (progress) => {
1374
+ if (!mountedRef.current) return;
1375
+ if (progress.status === "progress" && progress.file) setLoadingProgress({
1376
+ status: "downloading",
1377
+ file: progress.file,
1378
+ progress: Math.round(progress.progress || 0)
1379
+ });
1380
+ }
1381
+ });
1382
+ if (!mountedRef.current) return;
1383
+ const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
1384
+ const embeddingsMap = /* @__PURE__ */ new Map();
1385
+ await Promise.all(config.voices.map(async (voice) => {
1386
+ try {
1387
+ const response = await fetch(`${voicesUrl}${voice.id}.bin`);
1388
+ if (response.ok) {
1389
+ const buffer = await response.arrayBuffer();
1390
+ embeddingsMap.set(voice.id, new Float32Array(buffer));
1391
+ }
1392
+ } catch (e) {
1393
+ console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
1394
+ }
1395
+ }));
1396
+ if (!mountedRef.current) return;
1397
+ try {
1398
+ await tts("Hello", {
1399
+ speaker_embeddings: new Float32Array(12928),
1400
+ num_inference_steps: 1,
1401
+ speed: 1
1402
+ });
1403
+ } catch (e) {
1404
+ console.warn("Supertonic warmup failed:", e);
1405
+ }
1406
+ voiceEmbeddingsRef.current = embeddingsMap;
1407
+ ttsRef.current = {
1408
+ type: "supertonic",
1409
+ pipeline: tts,
1410
+ config
1411
+ };
1412
+ } else {
1413
+ const { KokoroTTS } = await import("../kokoro-CMOGDSgT.js");
1414
+ const tts = await KokoroTTS.from_pretrained(config.repo, {
1415
+ dtype: "fp32",
1416
+ progress_callback: (progress) => {
1417
+ if (!mountedRef.current) return;
1418
+ if (progress.status === "progress" && progress.file) setLoadingProgress({
1419
+ status: "downloading",
1420
+ file: progress.file,
1421
+ progress: Math.round(progress.progress || 0)
1422
+ });
1423
+ }
1424
+ });
1425
+ if (!mountedRef.current) return;
1426
+ ttsRef.current = {
1427
+ type: "kokoro",
1428
+ instance: tts,
1429
+ config
1430
+ };
1431
+ }
1432
+ setIsLoading(false);
1433
+ setIsReady(true);
1434
+ setLoadingProgress({ status: "ready" });
1435
+ onReady?.();
1436
+ } catch (err) {
1437
+ if (!mountedRef.current) return;
1438
+ const errorMsg = err instanceof Error ? err.message : String(err);
1439
+ setError(errorMsg);
1440
+ setIsLoading(false);
1441
+ setLoadingProgress({
1442
+ status: "error",
1443
+ error: errorMsg
1444
+ });
1445
+ onError?.(errorMsg);
1446
+ }
1447
+ };
1448
+ initTTS();
1449
+ return () => {
1450
+ mountedRef.current = false;
1451
+ };
1452
+ }, [
1453
+ shouldLoad,
1454
+ modelId,
1455
+ onReady,
1456
+ onError
1457
+ ]);
1458
+ useEffect(() => {
1459
+ return () => {
1460
+ try {
1461
+ sourceNodeRef.current?.stop();
1462
+ } catch {}
1463
+ try {
1464
+ if (audioContextRef.current && audioContextRef.current.state !== "closed") audioContextRef.current.close();
1465
+ } catch {}
1466
+ };
1467
+ }, []);
1468
+ return {
1469
+ speak: useCallback(async (text, opts) => {
1470
+ const voice = opts?.voice || currentVoice;
1471
+ const speed = opts?.speed || currentSpeed;
1472
+ if (!ttsRef.current) {
1473
+ load();
1474
+ return;
1475
+ }
1476
+ try {
1477
+ setIsSpeaking(true);
1478
+ onStart?.();
1479
+ let audioData;
1480
+ let sampleRate;
1481
+ const ttsBackend = ttsRef.current;
1482
+ if (ttsBackend.type === "supertonic") {
1483
+ const config = ttsBackend.config;
1484
+ if (!config.voices.find((v) => v.id === voice)) {
1485
+ const validVoices = config.voices.map((v) => v.id).join(", ");
1486
+ throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
1487
+ }
1488
+ let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
1489
+ if (!speakerEmbedding) try {
1490
+ const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
1491
+ const response = await fetch(voiceUrl);
1492
+ if (response.ok) {
1493
+ const buffer = await response.arrayBuffer();
1494
+ speakerEmbedding = new Float32Array(buffer);
1495
+ voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
1496
+ } else throw new Error(`Failed to load voice: ${response.status}`);
1497
+ } catch {
1498
+ speakerEmbedding = new Float32Array(12928).fill(.1);
1499
+ voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
1500
+ }
1501
+ const result = await ttsBackend.pipeline(text, {
1502
+ speaker_embeddings: speakerEmbedding,
1503
+ speed
1504
+ });
1505
+ audioData = result.audio;
1506
+ sampleRate = result.sampling_rate;
1507
+ } else {
1508
+ const config = ttsBackend.config;
1509
+ if (!config.voices.find((v) => v.id === voice)) {
1510
+ const validVoices = config.voices.map((v) => v.id).join(", ");
1511
+ throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
1512
+ }
1513
+ const result = await ttsBackend.instance.generate(text, {
1514
+ voice,
1515
+ speed
1516
+ });
1517
+ audioData = result.audio;
1518
+ sampleRate = result.sampling_rate;
1519
+ }
1520
+ if (!mountedRef.current) return;
1521
+ if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
1522
+ const audioContext = audioContextRef.current;
1523
+ if (audioContext.state === "suspended") await audioContext.resume();
1524
+ const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
1525
+ const channelData = new Float32Array(audioData);
1526
+ audioBuffer.copyToChannel(channelData, 0);
1527
+ if (sourceNodeRef.current) {
1528
+ sourceNodeRef.current.stop();
1529
+ sourceNodeRef.current.disconnect();
1530
+ }
1531
+ const sourceNode = audioContext.createBufferSource();
1532
+ sourceNode.buffer = audioBuffer;
1533
+ sourceNode.connect(audioContext.destination);
1534
+ sourceNode.onended = () => {
1535
+ if (mountedRef.current) {
1536
+ setIsSpeaking(false);
1537
+ onEnd?.();
1538
+ }
1539
+ };
1540
+ sourceNodeRef.current = sourceNode;
1541
+ sourceNode.start();
1542
+ } catch (err) {
1543
+ if (!mountedRef.current) return;
1544
+ const errorMsg = err instanceof Error ? err.message : String(err);
1545
+ setError(errorMsg);
1546
+ setIsSpeaking(false);
1547
+ onError?.(errorMsg);
1548
+ }
1549
+ }, [
1550
+ currentVoice,
1551
+ currentSpeed,
1552
+ load,
1553
+ onStart,
1554
+ onEnd,
1555
+ onError
1556
+ ]),
1557
+ stop: useCallback(() => {
1558
+ if (sourceNodeRef.current) {
1559
+ sourceNodeRef.current.stop();
1560
+ sourceNodeRef.current.disconnect();
1561
+ sourceNodeRef.current = null;
1562
+ }
1563
+ setIsSpeaking(false);
1564
+ }, []),
1565
+ isLoading,
1566
+ loadingProgress,
1567
+ isSpeaking,
1568
+ isReady,
1569
+ load,
1570
+ error,
1571
+ listVoices,
1572
+ currentVoice,
1573
+ setVoice: useCallback((voiceId) => {
1574
+ if (modelConfig.voices.find((v) => v.id === voiceId)) setCurrentVoice(voiceId);
1575
+ else console.warn(`Voice "${voiceId}" not valid for ${modelId}. Available: ${modelConfig.voices.map((v) => v.id).join(", ")}`);
1576
+ }, [modelConfig.voices, modelId]),
1577
+ currentSpeed,
1578
+ setSpeed: useCallback((speed) => {
1579
+ setCurrentSpeed(Math.max(.5, Math.min(2, speed)));
1580
+ }, []),
1581
+ currentModel: modelId,
1582
+ sampleRate: modelConfig.sampleRate
1583
+ };
1584
+ }
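A sketch of per-call overrides: `speak()` accepts `{ voice, speed }` without changing the hook-level defaults. Note that with `autoLoad: false` (the default), the first `speak()` call only starts loading, as the implementation above shows, so this sketch preloads the model.

```tsx
import { useSpeech } from "@tryhamster/gerbil/browser";

function Narrator() {
  const { speak, isSpeaking, isReady } = useSpeech({ model: "kokoro-82m", autoLoad: true });

  return (
    <button
      disabled={!isReady || isSpeaking}
      onClick={() => speak("Build finished.", { voice: "bm_george", speed: 1.2 })}
    >
      {isSpeaking ? "Speaking..." : "Announce"}
    </button>
  );
}
```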
1585
+ /**
1586
+ * Play audio from Float32Array using Web Audio API
1587
+ *
1588
+ * @example
1589
+ * ```ts
1590
+ * import { playAudio } from "@tryhamster/gerbil/browser";
1591
+ *
1592
+ * const audio = new Float32Array([...]); // TTS output
1593
+ * const controller = await playAudio(audio, 24000);
1594
+ *
1595
+ * // Stop playback
1596
+ * controller.stop();
1597
+ * ```
1598
+ */
1599
+ async function playAudio(audio, sampleRate = 24e3) {
1600
+ const audioContext = new AudioContext();
1601
+ if (audioContext.state === "suspended") await audioContext.resume();
1602
+ const audioBuffer = audioContext.createBuffer(1, audio.length, sampleRate);
1603
+ const channelData = new Float32Array(audio);
1604
+ audioBuffer.copyToChannel(channelData, 0);
1605
+ const sourceNode = audioContext.createBufferSource();
1606
+ sourceNode.buffer = audioBuffer;
1607
+ sourceNode.connect(audioContext.destination);
1608
+ const onEnded = new Promise((resolve) => {
1609
+ sourceNode.onended = () => {
1610
+ audioContext.close();
1611
+ resolve();
1612
+ };
1613
+ });
1614
+ sourceNode.start();
1615
+ return {
1616
+ stop: () => {
1617
+ sourceNode.stop();
1618
+ audioContext.close();
1619
+ },
1620
+ onEnded
1621
+ };
1622
+ }
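A sketch of sequential playback: `playAudio` resolves to a controller whose `onEnded` promise settles when the clip finishes, so clips can be played back-to-back. The two clips are placeholder `Float32Array`s (e.g. successive TTS outputs).

```ts
import { playAudio } from "@tryhamster/gerbil/browser";

// Placeholders: two clips of mono Float32Array audio at 24kHz.
declare const introClip: Float32Array;
declare const bodyClip: Float32Array;

const intro = await playAudio(introClip, 24000);
await intro.onEnded; // wait for the first clip to finish
await playAudio(bodyClip, 24000);
```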
1623
+ /**
1624
+ * Create a reusable audio player for streaming TTS
1625
+ *
1626
+ * @example
1627
+ * ```ts
1628
+ * import { createAudioPlayer } from "@tryhamster/gerbil/browser";
1629
+ *
1630
+ * const player = createAudioPlayer(24000);
1631
+ *
1632
+ * // Queue audio chunks as they arrive
1633
+ * player.queue(chunk1);
1634
+ * player.queue(chunk2);
1635
+ *
1636
+ * // Stop and clear
1637
+ * player.stop();
1638
+ * ```
1639
+ */
1640
+ function createAudioPlayer(sampleRate = 24e3) {
1641
+ let audioContext = null;
1642
+ let nextStartTime = 0;
1643
+ let isActive = false;
1644
+ const ensureContext = async () => {
1645
+ if (!audioContext) audioContext = new AudioContext();
1646
+ if (audioContext.state === "suspended") await audioContext.resume();
1647
+ return audioContext;
1648
+ };
1649
+ return {
1650
+ queue: async (audio) => {
1651
+ const ctx = await ensureContext();
1652
+ isActive = true;
1653
+ const buffer = ctx.createBuffer(1, audio.length, sampleRate);
1654
+ const channelData = new Float32Array(audio);
1655
+ buffer.copyToChannel(channelData, 0);
1656
+ const source = ctx.createBufferSource();
1657
+ source.buffer = buffer;
1658
+ source.connect(ctx.destination);
1659
+ const startTime = Math.max(ctx.currentTime, nextStartTime);
1660
+ source.start(startTime);
1661
+ nextStartTime = startTime + buffer.duration;
1662
+ source.onended = () => {
1663
+ if (ctx.currentTime >= nextStartTime - .1) isActive = false;
1664
+ };
1665
+ },
1666
+ stop: () => {
1667
+ isActive = false;
1668
+ nextStartTime = 0;
1669
+ if (audioContext) {
1670
+ audioContext.close();
1671
+ audioContext = null;
1672
+ }
1673
+ },
1674
+ isPlaying: () => isActive
1675
+ };
1676
+ }
1677
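A sketch of feeding the player above incrementally. The chunk source here (an async iterable of Float32Array chunks) is a hypothetical stand-in for whatever streaming TTS or network source produces audio; queue() schedules each chunk at max(currentTime, nextStartTime), so consecutive chunks play back-to-back without gaps.

```ts
import { createAudioPlayer } from "@tryhamster/gerbil/browser";

// `chunks` is a placeholder for any source of Float32Array audio
// (e.g. a streaming TTS loop or a WebSocket) — not a real API here.
async function playStream(chunks: AsyncIterable<Float32Array>) {
  const player = createAudioPlayer(24000);

  for await (const chunk of chunks) {
    // Each chunk is scheduled to start where the previous one ends.
    await player.queue(chunk);
  }

  // Later: stop playback and release the AudioContext.
  // player.stop();
}
```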
+ /**
1678
+ * React hook for voice input with browser microphone
1679
+ *
1680
+ * Uses MediaRecorder to capture audio and Whisper for transcription.
1681
+ * Supports both one-shot and streaming transcription modes.
1682
+ *
1683
+ * @example Basic usage (one-shot)
1684
+ * ```tsx
1685
+ * function VoiceInput() {
1686
+ * const { startRecording, stopRecording, isRecording, transcript } = useVoiceInput({
1687
+ * onTranscript: (text) => console.log("User said:", text),
1688
+ * });
1689
+ *
1690
+ * return (
1691
+ * <button onClick={isRecording ? stopRecording : startRecording}>
1692
+ * {isRecording ? "Stop" : "Record"}
1693
+ * </button>
1694
+ * );
1695
+ * }
1696
+ * ```
1697
+ *
1698
+ * @example Streaming transcription (real-time)
1699
+ * ```tsx
1700
+ * function LiveTranscription() {
1701
+ * const { startRecording, stopRecording, isRecording, transcript, streamingChunk } = useVoiceInput({
1702
+ * streaming: true, // Enable streaming mode
1703
+ * chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
1704
+ * onChunk: (text, idx) => console.log(`Chunk ${idx}: ${text}`),
1705
+ * });
1706
+ *
1707
+ * return (
1708
+ * <div>
1709
+ * <button onClick={isRecording ? stopRecording : startRecording}>
1710
+ * {isRecording ? "Stop" : "Start Live Transcription"}
1711
+ * </button>
1712
+ * <p>Current chunk: {streamingChunk}</p>
1713
+ * <p>Full transcript: {transcript}</p>
1714
+ * </div>
1715
+ * );
1716
+ * }
1717
+ * ```
1718
+ */
1719
+ function useVoiceInput(options = {}) {
1720
+ const React = globalThis.React;
1721
+ if (!React) throw new Error("useVoiceInput requires React. Import React before using this hook.");
1722
+ const { useState, useEffect, useRef, useCallback } = React;
1723
+ const { model = "whisper-tiny.en", autoLoad = false, onReady, onTranscript, onError, onProgress, streaming = false, chunkDuration = 1500, onChunk } = options;
1724
+ const [isLoading, setIsLoading] = useState(autoLoad);
1725
+ const [loadingProgress, setLoadingProgress] = useState(null);
1726
+ const [isReady, setIsReady] = useState(false);
1727
+ const [isRecording, setIsRecording] = useState(false);
1728
+ const [isTranscribing, setIsTranscribing] = useState(false);
1729
+ const [transcript, setTranscript] = useState("");
1730
+ const [streamingChunk, setStreamingChunk] = useState("");
1731
+ const [chunkCount, setChunkCount] = useState(0);
1732
+ const [error, setError] = useState(null);
1733
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
1734
+ const sttRef = useRef(null);
1735
+ const mediaRecorderRef = useRef(null);
1736
+ const audioChunksRef = useRef([]);
1737
+ const streamRef = useRef(null);
1738
+ const mountedRef = useRef(true);
1739
+ const streamingIntervalRef = useRef(null);
1740
+ const pendingChunksRef = useRef([]);
1741
+ const fullTranscriptRef = useRef("");
1742
+ useEffect(() => {
1743
+ if (!shouldLoad || isReady) return;
1744
+ let cancelled = false;
1745
+ const loadModel = async () => {
1746
+ try {
1747
+ setIsLoading(true);
1748
+ setLoadingProgress({
1749
+ status: "loading",
1750
+ message: "Loading STT model..."
1751
+ });
1752
+ onProgress?.({
1753
+ status: "loading",
1754
+ message: "Loading STT model..."
1755
+ });
1756
+ const { WhisperSTT } = await import("../stt-Dne6SENv.js");
1757
+ if (cancelled || !mountedRef.current) return;
1758
+ const stt = new WhisperSTT(model);
1759
+ await stt.load({ onProgress: (p) => {
1760
+ if (!mountedRef.current) return;
1761
+ const progress = {
1762
+ status: p.progress !== void 0 ? "downloading" : "loading",
1763
+ message: p.status,
1764
+ progress: p.progress,
1765
+ file: p.file
1766
+ };
1767
+ setLoadingProgress(progress);
1768
+ onProgress?.(progress);
1769
+ } });
1770
+ if (cancelled || !mountedRef.current) {
1771
+ stt.dispose();
1772
+ return;
1773
+ }
1774
+ sttRef.current = stt;
1775
+ setIsReady(true);
1776
+ setIsLoading(false);
1777
+ setLoadingProgress({ status: "ready" });
1778
+ onProgress?.({ status: "ready" });
1779
+ onReady?.();
1780
+ } catch (e) {
1781
+ if (!mountedRef.current) return;
1782
+ const errMsg = e.message || "Failed to load STT model";
1783
+ setError(errMsg);
1784
+ setIsLoading(false);
1785
+ setLoadingProgress({
1786
+ status: "error",
1787
+ message: errMsg
1788
+ });
1789
+ onProgress?.({
1790
+ status: "error",
1791
+ message: errMsg
1792
+ });
1793
+ onError?.(errMsg);
1794
+ }
1795
+ };
1796
+ loadModel();
1797
+ return () => {
1798
+ cancelled = true;
1799
+ };
1800
+ }, [
1801
+ shouldLoad,
1802
+ isReady,
1803
+ model,
1804
+ onReady,
1805
+ onError,
1806
+ onProgress
1807
+ ]);
1808
+ useEffect(() => {
1809
+ mountedRef.current = true;
1810
+ return () => {
1811
+ mountedRef.current = false;
1812
+ if (sttRef.current) sttRef.current.dispose();
1813
+ if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
1814
+ };
1815
+ }, []);
1816
+ const load = useCallback(() => {
1817
+ if (!shouldLoad && !isReady && !isLoading) setShouldLoad(true);
1818
+ }, [
1819
+ shouldLoad,
1820
+ isReady,
1821
+ isLoading
1822
+ ]);
1823
+ const blobToFloat32 = useCallback(async (blob) => {
1824
+ const audioContext = new AudioContext({ sampleRate: 16e3 });
1825
+ const arrayBuffer = await blob.arrayBuffer();
1826
+ const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
1827
+ const channelData = audioBuffer.getChannelData(0);
1828
+ if (audioBuffer.sampleRate !== 16e3) {
1829
+ const ratio = 16e3 / audioBuffer.sampleRate;
1830
+ const newLength = Math.round(channelData.length * ratio);
1831
+ const resampled = new Float32Array(newLength);
1832
+ for (let i = 0; i < newLength; i++) {
1833
+ const srcIndex = i / ratio;
1834
+ const floor = Math.floor(srcIndex);
1835
+ const ceil = Math.min(floor + 1, channelData.length - 1);
1836
+ const t = srcIndex - floor;
1837
+ resampled[i] = channelData[floor] * (1 - t) + channelData[ceil] * t;
1838
+ }
1839
+ audioContext.close();
1840
+ return resampled;
1841
+ }
1842
+ audioContext.close();
1843
+ return new Float32Array(channelData);
1844
+ }, []);
1845
+ const transcribe = useCallback(async (audio) => {
1846
+ if (!sttRef.current) {
1847
+ if (!shouldLoad) {
1848
+ setShouldLoad(true);
1849
+ throw new Error("STT model not loaded. Loading now, please try again.");
1850
+ }
1851
+ throw new Error("STT model not loaded");
1852
+ }
1853
+ setIsTranscribing(true);
1854
+ try {
1855
+ let text = (await sttRef.current.transcribe(audio)).text.trim();
1856
+ if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
1857
+ setTranscript(text);
1858
+ onTranscript?.(text);
1859
+ return text;
1860
+ } finally {
1861
+ if (mountedRef.current) setIsTranscribing(false);
1862
+ }
1863
+ }, [shouldLoad, onTranscript]);
1864
+ const processedSamplesRef = useRef(0);
1865
+ const transcribeChunk = useCallback(async (chunkIdx) => {
1866
+ if (!sttRef.current || audioChunksRef.current.length === 0) return "";
1867
+ try {
1868
+ const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
1869
+ const newSamplesStart = processedSamplesRef.current;
1870
+ const totalSamples = audioData.length;
1871
+ if (totalSamples - newSamplesStart < 8e3) return "";
1872
+ const newAudio = audioData.slice(newSamplesStart);
1873
+ processedSamplesRef.current = totalSamples;
1874
+ let text = (await sttRef.current.transcribe(newAudio)).text.trim();
1875
+ if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
1876
+ if (text && mountedRef.current) {
1877
+ setStreamingChunk(text);
1878
+ onChunk?.(text, chunkIdx);
1879
+ }
1880
+ return text;
1881
+ } catch {
1882
+ return "";
1883
+ }
1884
+ }, [blobToFloat32, onChunk]);
1885
+ return {
1886
+ startRecording: useCallback(async () => {
1887
+ if (isRecording) return;
1888
+ try {
1889
+ if (streaming && !sttRef.current) {
1890
+ if (!shouldLoad) setShouldLoad(true);
1891
+ setIsLoading(true);
1892
+ const { WhisperSTT } = await import("../stt-Dne6SENv.js");
1893
+ const stt = new WhisperSTT(model);
1894
+ await stt.load({ onProgress: (p) => {
1895
+ if (mountedRef.current) {
1896
+ const progress = {
1897
+ status: p.status === "downloading" ? "downloading" : p.status === "ready" ? "ready" : "loading",
1898
+ message: p.status,
1899
+ progress: p.progress,
1900
+ file: p.file
1901
+ };
1902
+ setLoadingProgress(progress);
1903
+ onProgress?.(progress);
1904
+ }
1905
+ } });
1906
+ if (!mountedRef.current) {
1907
+ stt.dispose();
1908
+ return;
1909
+ }
1910
+ sttRef.current = stt;
1911
+ setIsReady(true);
1912
+ setIsLoading(false);
1913
+ setLoadingProgress({ status: "ready" });
1914
+ onProgress?.({ status: "ready" });
1915
+ onReady?.();
1916
+ }
1917
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: {
1918
+ sampleRate: 16e3,
1919
+ channelCount: 1,
1920
+ echoCancellation: true,
1921
+ noiseSuppression: true
1922
+ } });
1923
+ streamRef.current = stream;
1924
+ audioChunksRef.current = [];
1925
+ pendingChunksRef.current = [];
1926
+ fullTranscriptRef.current = "";
1927
+ processedSamplesRef.current = 0;
1928
+ setTranscript("");
1929
+ setStreamingChunk("");
1930
+ setChunkCount(0);
1931
+ const mediaRecorder = new MediaRecorder(stream);
1932
+ mediaRecorderRef.current = mediaRecorder;
1933
+ mediaRecorder.ondataavailable = (event) => {
1934
+ if (event.data.size > 0) {
1935
+ audioChunksRef.current.push(event.data);
1936
+ if (streaming) pendingChunksRef.current.push(event.data);
1937
+ }
1938
+ };
1939
+ mediaRecorder.start(100);
1940
+ setIsRecording(true);
1941
+ setError(null);
1942
+ if (streaming && sttRef.current) {
1943
+ let chunkIdx = 0;
1944
+ let shouldContinue = true;
1945
+ const processNextChunk = async () => {
1946
+ if (!shouldContinue || !mountedRef.current) return;
1947
+ if (pendingChunksRef.current.length > 0) {
1948
+ pendingChunksRef.current = [];
1949
+ try {
1950
+ setIsTranscribing(true);
1951
+ const chunkText = await transcribeChunk(chunkIdx);
1952
+ if (chunkText && mountedRef.current) {
1953
+ chunkIdx++;
1954
+ setChunkCount(chunkIdx);
1955
+ setTranscript((prev) => {
1956
+ const newTranscript = prev + (prev ? " " : "") + chunkText;
1957
+ fullTranscriptRef.current = newTranscript;
1958
+ onTranscript?.(newTranscript);
1959
+ return newTranscript;
1960
+ });
1961
+ }
1962
+ } catch (e) {
1963
+ console.error("[useVoiceInput] Chunk transcription error:", e);
1964
+ } finally {
1965
+ if (mountedRef.current) setIsTranscribing(false);
1966
+ }
1967
+ }
1968
+ if (shouldContinue && mountedRef.current) streamingIntervalRef.current = setTimeout(processNextChunk, chunkDuration);
1969
+ };
1970
+ streamingIntervalRef.current = setTimeout(processNextChunk, chunkDuration);
1971
+ streamingIntervalRef._stop = () => {
1972
+ shouldContinue = false;
1973
+ };
1974
+ }
1975
+ } catch (e) {
1976
+ const errMsg = e.message || "Failed to start recording";
1977
+ setError(errMsg);
1978
+ onError?.(errMsg);
1979
+ }
1980
+ }, [
1981
+ isRecording,
1982
+ streaming,
1983
+ shouldLoad,
1984
+ model,
1985
+ chunkDuration,
1986
+ transcribeChunk,
1987
+ onTranscript,
1988
+ onError,
1989
+ onProgress,
1990
+ onReady
1991
+ ]),
1992
+ stopRecording: useCallback(async () => {
1993
+ if (streamingIntervalRef._stop) streamingIntervalRef._stop();
1994
+ if (streamingIntervalRef.current) {
1995
+ clearTimeout(streamingIntervalRef.current);
1996
+ streamingIntervalRef.current = null;
1997
+ }
1998
+ return new Promise((resolve, reject) => {
1999
+ if (!mediaRecorderRef.current || !isRecording) {
2000
+ reject(/* @__PURE__ */ new Error("Not recording"));
2001
+ return;
2002
+ }
2003
+ const mediaRecorder = mediaRecorderRef.current;
2004
+ mediaRecorder.onstop = async () => {
2005
+ if (streamRef.current) {
2006
+ for (const track of streamRef.current.getTracks()) track.stop();
2007
+ streamRef.current = null;
2008
+ }
2009
+ setIsRecording(false);
2010
+ if (streaming) {
2011
+ if (audioChunksRef.current.length > 0 && processedSamplesRef.current > 0) {
2012
+ setIsTranscribing(true);
2013
+ pendingChunksRef.current = [];
2014
+ try {
2015
+ const finalChunkText = await transcribeChunk(chunkCount);
2016
+ if (finalChunkText && mountedRef.current) setTranscript((prev) => {
2017
+ const newTranscript = prev + (prev ? " " : "") + finalChunkText;
2018
+ fullTranscriptRef.current = newTranscript;
2019
+ return newTranscript;
2020
+ });
2021
+ } finally {
2022
+ if (mountedRef.current) setIsTranscribing(false);
2023
+ }
2024
+ }
2025
+ const finalText = fullTranscriptRef.current;
2026
+ onTranscript?.(finalText);
2027
+ resolve(finalText);
2028
+ return;
2029
+ }
2030
+ const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
2031
+ try {
2032
+ if (!sttRef.current) {
2033
+ if (!shouldLoad) setShouldLoad(true);
2034
+ await new Promise((res, rej) => {
2035
+ const checkReady = setInterval(() => {
2036
+ if (sttRef.current) {
2037
+ clearInterval(checkReady);
2038
+ res();
2039
+ }
2040
+ }, 100);
2041
+ setTimeout(() => {
2042
+ clearInterval(checkReady);
2043
+ rej(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
2044
+ }, 3e4);
2045
+ });
2046
+ }
2047
+ resolve(await transcribe(await blobToFloat32(audioBlob)));
2048
+ } catch (e) {
2049
+ const errMsg = e.message || "Transcription failed";
2050
+ setError(errMsg);
2051
+ onError?.(errMsg);
2052
+ reject(e);
2053
+ }
2054
+ };
2055
+ mediaRecorder.stop();
2056
+ });
2057
+ }, [
2058
+ isRecording,
2059
+ streaming,
2060
+ chunkCount,
2061
+ shouldLoad,
2062
+ blobToFloat32,
2063
+ transcribe,
2064
+ transcribeChunk,
2065
+ onTranscript,
2066
+ onError
2067
+ ]),
2068
+ cancelRecording: useCallback(() => {
2069
+ if (streamingIntervalRef._stop) streamingIntervalRef._stop();
2070
+ if (streamingIntervalRef.current) {
2071
+ clearTimeout(streamingIntervalRef.current);
2072
+ streamingIntervalRef.current = null;
2073
+ }
2074
+ if (mediaRecorderRef.current && isRecording) mediaRecorderRef.current.stop();
2075
+ if (streamRef.current) {
2076
+ for (const track of streamRef.current.getTracks()) track.stop();
2077
+ streamRef.current = null;
2078
+ }
2079
+ audioChunksRef.current = [];
2080
+ pendingChunksRef.current = [];
2081
+ processedSamplesRef.current = 0;
2082
+ setIsRecording(false);
2083
+ }, [isRecording]),
2084
+ transcribe,
2085
+ isRecording,
2086
+ isTranscribing,
2087
+ isLoading,
2088
+ isReady,
2089
+ transcript,
2090
+ streamingChunk,
2091
+ chunkCount,
2092
+ loadingProgress,
2093
+ error,
2094
+ load
2095
+ };
2096
+ }
2097
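Beyond the examples in the JSDoc above, the hook returns load() and loadingProgress, so the Whisper model can be fetched before the first recording. A minimal sketch; the option and field names below match the destructuring visible in the code above:

```tsx
import { useVoiceInput } from "@tryhamster/gerbil/browser";

function Dictation() {
  const {
    load,
    isReady,
    isLoading,
    loadingProgress,
    startRecording,
    stopRecording,
    isRecording,
    isTranscribing,
    transcript,
    error,
  } = useVoiceInput({
    model: "whisper-tiny.en",
    onTranscript: (text) => console.log("Transcript:", text),
    onError: (msg) => console.error(msg),
  });

  if (!isReady) {
    // Explicit load instead of autoLoad; show the progress message while downloading.
    return (
      <button onClick={load} disabled={isLoading}>
        {isLoading ? loadingProgress?.message ?? "Loading Whisper..." : "Load speech recognition"}
      </button>
    );
  }

  return (
    <div>
      <button onClick={isRecording ? stopRecording : startRecording}>
        {isRecording ? "Stop" : "Record"}
      </button>
      {isTranscribing && <p>Transcribing…</p>}
      <p>{transcript}</p>
      {error && <p>Error: {error}</p>}
    </div>
  );
}
```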
+ /**
2098
+ * React hook for voice conversation with STT + LLM + TTS
2099
+ *
2100
+ * Complete voice-to-voice conversation loop:
2101
+ * 1. User presses button to speak
2102
+ * 2. Speech is transcribed (Whisper)
2103
+ * 3. LLM generates response
2104
+ * 4. Response is spoken aloud (Kokoro or Supertonic TTS)
2105
+ *
2106
+ * @example
2107
+ * ```tsx
2108
+ * function VoiceChat() {
2109
+ * const {
2110
+ * messages,
2111
+ * startListening,
2112
+ * stopListening,
2113
+ * isListening,
2114
+ * isSpeaking,
2115
+ * stage,
2116
+ * } = useVoiceChat({
2117
+ * system: "You are a helpful voice assistant.",
2118
+ * voice: "af_bella",
2119
+ * // Or use Supertonic for faster synthesis:
2120
+ * // ttsModel: "supertonic-66m",
2121
+ * // voice: "F1",
2122
+ * });
2123
+ *
2124
+ * return (
2125
+ * <div>
2126
+ * {messages.map(m => (
2127
+ * <div key={m.id}>{m.role}: {m.content}</div>
2128
+ * ))}
2129
+ * <button
2130
+ * onMouseDown={startListening}
2131
+ * onMouseUp={stopListening}
2132
+ * >
2133
+ * {stage === "idle" ? "🎤 Hold to Speak" : stage}
2134
+ * </button>
2135
+ * </div>
2136
+ * );
2137
+ * }
2138
+ * ```
2139
+ */
2140
+ function useVoiceChat(options = {}) {
2141
+ const React = globalThis.React;
2142
+ if (!React) throw new Error("useVoiceChat requires React. Import React before using this hook.");
2143
+ const { useState, useEffect, useRef, useCallback } = React;
2144
+ const ttsModelId = options.ttsModel || "kokoro-82m";
2145
+ const ttsConfig = TTS_MODELS[ttsModelId];
2146
+ const { llmModel = "qwen3-0.6b", sttModel = "whisper-tiny.en", system = "You are a helpful voice assistant. Keep responses brief and conversational.", thinking = false, voice = ttsConfig.defaultVoice, speed = 1, autoLoad = false, onUserSpeak, onAssistantSpeak, onError } = options;
2147
+ const [messages, setMessages] = useState([]);
2148
+ const [stage, setStage] = useState("idle");
2149
+ const [isLoading, setIsLoading] = useState(autoLoad);
2150
+ const [loadingMessage, setLoadingMessage] = useState("");
2151
+ const [isReady, setIsReady] = useState(false);
2152
+ const [error, setError] = useState(null);
2153
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
2154
+ const llmWorkerRef = useRef(null);
2155
+ const sttRef = useRef(null);
2156
+ const ttsRef = useRef(null);
2157
+ const mediaRecorderRef = useRef(null);
2158
+ const audioChunksRef = useRef([]);
2159
+ const streamRef = useRef(null);
2160
+ const audioContextRef = useRef(null);
2161
+ const sourceNodeRef = useRef(null);
2162
+ const mountedRef = useRef(true);
2163
+ const cancelledRef = useRef(false);
2164
+ const isListening = stage === "listening";
2165
+ const isProcessing = stage === "transcribing" || stage === "thinking";
2166
+ const isSpeaking = stage === "speaking";
2167
+ useEffect(() => {
2168
+ if (!shouldLoad || isReady) return;
2169
+ let cancelled = false;
2170
+ const loadModels = async () => {
2171
+ try {
2172
+ setIsLoading(true);
2173
+ setError(null);
2174
+ setLoadingMessage("Loading speech recognition (Whisper)...");
2175
+ const { WhisperSTT } = await import("../stt-Dne6SENv.js");
2176
+ if (cancelled || !mountedRef.current) return;
2177
+ const stt = new WhisperSTT(sttModel);
2178
+ await stt.load({ onProgress: (p) => {
2179
+ if (!mountedRef.current) return;
2180
+ setLoadingMessage(p.status || "Loading STT...");
2181
+ } });
2182
+ if (cancelled || !mountedRef.current) {
2183
+ stt.dispose();
2184
+ return;
2185
+ }
2186
+ sttRef.current = stt;
2187
+ setLoadingMessage("Loading language model...");
2188
+ const worker = await createGerbilWorker({
2189
+ modelId: llmModel,
2190
+ onProgress: (p) => {
2191
+ if (!mountedRef.current) return;
2192
+ setLoadingMessage(p.message || "Loading LLM...");
2193
+ }
2194
+ });
2195
+ if (cancelled || !mountedRef.current) {
2196
+ worker.terminate();
2197
+ return;
2198
+ }
2199
+ llmWorkerRef.current = worker;
2200
+ setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
2201
+ const { createTTS } = await import("../tts-C2FzKuSx.js");
2202
+ if (cancelled || !mountedRef.current) return;
2203
+ const tts = createTTS(ttsModelId);
2204
+ await tts.load({ onProgress: (p) => {
2205
+ if (!mountedRef.current) return;
2206
+ setLoadingMessage(p.status || "Loading TTS...");
2207
+ } });
2208
+ if (cancelled || !mountedRef.current) {
2209
+ await tts.dispose();
2210
+ return;
2211
+ }
2212
+ ttsRef.current = tts;
2213
+ setIsReady(true);
2214
+ setIsLoading(false);
2215
+ setLoadingMessage("Ready!");
2216
+ } catch (e) {
2217
+ if (!mountedRef.current) return;
2218
+ const errMsg = e.message || "Failed to load models";
2219
+ setError(errMsg);
2220
+ setIsLoading(false);
2221
+ onError?.(errMsg);
2222
+ }
2223
+ };
2224
+ loadModels();
2225
+ return () => {
2226
+ cancelled = true;
2227
+ };
2228
+ }, [
2229
+ shouldLoad,
2230
+ isReady,
2231
+ llmModel,
2232
+ sttModel,
2233
+ ttsModelId,
2234
+ onError
2235
+ ]);
2236
+ useEffect(() => {
2237
+ mountedRef.current = true;
2238
+ return () => {
2239
+ mountedRef.current = false;
2240
+ llmWorkerRef.current?.terminate();
2241
+ sttRef.current?.dispose();
2242
+ ttsRef.current?.dispose();
2243
+ if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
2244
+ audioContextRef.current?.close();
2245
+ };
2246
+ }, []);
2247
+ const load = useCallback(() => {
2248
+ if (!shouldLoad && !isReady && !isLoading) setShouldLoad(true);
2249
+ }, [
2250
+ shouldLoad,
2251
+ isReady,
2252
+ isLoading
2253
+ ]);
2254
+ const blobToFloat32 = useCallback(async (blob) => {
2255
+ const audioContext = new AudioContext({ sampleRate: 16e3 });
2256
+ const arrayBuffer = await blob.arrayBuffer();
2257
+ const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
2258
+ const channelData = audioBuffer.getChannelData(0);
2259
+ if (audioBuffer.sampleRate !== 16e3) {
2260
+ const ratio = 16e3 / audioBuffer.sampleRate;
2261
+ const newLength = Math.round(channelData.length * ratio);
2262
+ const resampled = new Float32Array(newLength);
2263
+ for (let i = 0; i < newLength; i++) {
2264
+ const srcIndex = i / ratio;
2265
+ const floor = Math.floor(srcIndex);
2266
+ const ceil = Math.min(floor + 1, channelData.length - 1);
2267
+ const t = srcIndex - floor;
2268
+ resampled[i] = channelData[floor] * (1 - t) + channelData[ceil] * t;
2269
+ }
2270
+ audioContext.close();
2271
+ return resampled;
2272
+ }
2273
+ audioContext.close();
2274
+ return new Float32Array(channelData);
2275
+ }, []);
2276
+ const playAudioBuffer = useCallback(async (audio, sampleRate) => {
2277
+ return new Promise((resolve) => {
2278
+ if (!audioContextRef.current) audioContextRef.current = new AudioContext();
2279
+ const ctx = audioContextRef.current;
2280
+ const buffer = ctx.createBuffer(1, audio.length, sampleRate);
2281
+ const channelData = new Float32Array(audio);
2282
+ buffer.copyToChannel(channelData, 0);
2283
+ const source = ctx.createBufferSource();
2284
+ source.buffer = buffer;
2285
+ source.connect(ctx.destination);
2286
+ source.onended = () => {
2287
+ if (mountedRef.current) resolve();
2288
+ };
2289
+ source.start();
2290
+ sourceNodeRef.current = source;
2291
+ });
2292
+ }, []);
2293
+ return {
2294
+ messages,
2295
+ startListening: useCallback(async () => {
2296
+ if (stage !== "idle") return;
2297
+ if (!isReady && !isLoading) {
2298
+ setShouldLoad(true);
2299
+ return;
2300
+ }
2301
+ cancelledRef.current = false;
2302
+ try {
2303
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: {
2304
+ sampleRate: 16e3,
2305
+ channelCount: 1,
2306
+ echoCancellation: true
2307
+ } });
2308
+ streamRef.current = stream;
2309
+ audioChunksRef.current = [];
2310
+ const mediaRecorder = new MediaRecorder(stream);
2311
+ mediaRecorderRef.current = mediaRecorder;
2312
+ mediaRecorder.ondataavailable = (event) => {
2313
+ if (event.data.size > 0) audioChunksRef.current.push(event.data);
2314
+ };
2315
+ mediaRecorder.start(100);
2316
+ setStage("listening");
2317
+ setError(null);
2318
+ } catch (e) {
2319
+ const errMsg = e.message || "Failed to access microphone";
2320
+ setError(errMsg);
2321
+ onError?.(errMsg);
2322
+ }
2323
+ }, [
2324
+ stage,
2325
+ isReady,
2326
+ isLoading,
2327
+ onError
2328
+ ]),
2329
+ stopListening: useCallback(async () => {
2330
+ if (stage !== "listening") return;
2331
+ const mediaRecorder = mediaRecorderRef.current;
2332
+ if (!mediaRecorder) return;
2333
+ return new Promise((resolve) => {
2334
+ mediaRecorder.onstop = async () => {
2335
+ if (streamRef.current) {
2336
+ for (const track of streamRef.current.getTracks()) track.stop();
2337
+ streamRef.current = null;
2338
+ }
2339
+ if (cancelledRef.current) {
2340
+ setStage("idle");
2341
+ resolve();
2342
+ return;
2343
+ }
2344
+ const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
2345
+ try {
2346
+ setStage("transcribing");
2347
+ const audioData = await blobToFloat32(audioBlob);
2348
+ let userText = (await sttRef.current.transcribe(audioData)).text.trim();
2349
+ if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
2350
+ if (cancelledRef.current || !userText) {
2351
+ setStage("idle");
2352
+ resolve();
2353
+ return;
2354
+ }
2355
+ const userMsgId = `user-${Date.now()}`;
2356
+ setMessages((m) => [...m, {
2357
+ id: userMsgId,
2358
+ role: "user",
2359
+ content: userText
2360
+ }]);
2361
+ onUserSpeak?.(userText);
2362
+ setStage("thinking");
2363
+ const history = messages.map((m) => ({
2364
+ role: m.role,
2365
+ content: m.content
2366
+ }));
2367
+ history.push({
2368
+ role: "user",
2369
+ content: userText
2370
+ });
2371
+ let responseText = "";
2372
+ let thinkingText = "";
2373
+ await llmWorkerRef.current.generate(userText, {
2374
+ system,
2375
+ thinking,
2376
+ history,
2377
+ onToken: (token) => {
2378
+ if (cancelledRef.current) return;
2379
+ if (token.state === "thinking") thinkingText += token.text;
2380
+ else responseText += token.text;
2381
+ }
2382
+ });
2383
+ if (cancelledRef.current) {
2384
+ setStage("idle");
2385
+ resolve();
2386
+ return;
2387
+ }
2388
+ const assistantMsgId = `assistant-${Date.now()}`;
2389
+ setMessages((m) => [...m, {
2390
+ id: assistantMsgId,
2391
+ role: "assistant",
2392
+ content: responseText,
2393
+ thinking: thinkingText || void 0
2394
+ }]);
2395
+ onAssistantSpeak?.(responseText);
2396
+ if (responseText.trim()) {
2397
+ setStage("speaking");
2398
+ const ttsResult = await ttsRef.current.speak(responseText, {
2399
+ voice,
2400
+ speed
2401
+ });
2402
+ if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
2403
+ }
2404
+ setStage("idle");
2405
+ resolve();
2406
+ } catch (e) {
2407
+ if (!mountedRef.current) return;
2408
+ const errMsg = e.message || "Processing failed";
2409
+ setError(errMsg);
2410
+ setStage("idle");
2411
+ onError?.(errMsg);
2412
+ resolve();
2413
+ }
2414
+ };
2415
+ mediaRecorder.stop();
2416
+ });
2417
+ }, [
2418
+ stage,
2419
+ messages,
2420
+ system,
2421
+ thinking,
2422
+ voice,
2423
+ speed,
2424
+ blobToFloat32,
2425
+ playAudioBuffer,
2426
+ onUserSpeak,
2427
+ onAssistantSpeak,
2428
+ onError
2429
+ ]),
2430
+ cancel: useCallback(() => {
2431
+ cancelledRef.current = true;
2432
+ if (mediaRecorderRef.current && stage === "listening") mediaRecorderRef.current.stop();
2433
+ if (streamRef.current) {
2434
+ for (const track of streamRef.current.getTracks()) track.stop();
2435
+ streamRef.current = null;
2436
+ }
2437
+ if (sourceNodeRef.current) try {
2438
+ sourceNodeRef.current.stop();
2439
+ } catch {}
2440
+ audioChunksRef.current = [];
2441
+ setStage("idle");
2442
+ }, [stage]),
2443
+ clear: useCallback(() => {
2444
+ setMessages([]);
2445
+ }, []),
2446
+ isListening,
2447
+ isProcessing,
2448
+ isSpeaking,
2449
+ stage,
2450
+ isReady,
2451
+ isLoading,
2452
+ loadingMessage,
2453
+ error,
2454
+ load
2455
+ };
2456
+ }
2457
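The hook above is lazy by default (autoLoad is false), so load(), loadingMessage, cancel(), and clear() cover the rest of the lifecycle. A sketch complementing the JSDoc example, using only fields visible in the return object above:

```tsx
import { useVoiceChat } from "@tryhamster/gerbil/browser";

function Assistant() {
  const {
    messages,
    startListening,
    stopListening,
    cancel,
    clear,
    stage,
    isReady,
    isLoading,
    loadingMessage,
    load,
    error,
  } = useVoiceChat({
    system: "You are a helpful voice assistant.",
    onError: (msg) => console.error(msg),
  });

  if (!isReady) {
    // Preload STT + LLM + TTS up front instead of on the first press.
    return (
      <button onClick={load} disabled={isLoading}>
        {isLoading ? loadingMessage : "Load voice chat models"}
      </button>
    );
  }

  return (
    <div>
      {messages.map((m) => (
        <div key={m.id}>
          {m.role}: {m.content}
        </div>
      ))}
      <button onMouseDown={startListening} onMouseUp={stopListening}>
        {stage === "idle" ? "🎤 Hold to Speak" : stage}
      </button>
      <button onClick={cancel}>Cancel</button>
      <button onClick={clear}>Clear history</button>
      {error && <p>Error: {error}</p>}
    </div>
  );
}
```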
+ /**
2458
+ * Check if WebGPU is supported
2459
+ */
2460
+ function isWebGPUSupported() {
2461
+ if (typeof navigator === "undefined") return false;
2462
+ return "gpu" in navigator;
2463
+ }
2464
+ /**
2465
+ * Get WebGPU adapter info
2466
+ */
2467
+ async function getWebGPUInfo() {
2468
+ if (!isWebGPUSupported()) return { supported: false };
2469
+ try {
2470
+ const adapter = await navigator.gpu.requestAdapter();
2471
+ if (!adapter) return { supported: false };
2472
+ const info = await adapter.requestAdapterInfo();
2473
+ return {
2474
+ supported: true,
2475
+ adapter: info.vendor,
2476
+ device: info.device
2477
+ };
2478
+ } catch {
2479
+ return { supported: false };
2480
+ }
2481
+ }
2482
+ var browser_default = {
2483
+ isWebGPUSupported,
2484
+ getWebGPUInfo,
2485
+ createGerbilWorker,
2486
+ playAudio,
2487
+ createAudioPlayer
2488
+ };
2489
+
2490
+ //#endregion
2491
+ export { BUILTIN_MODELS, createAudioPlayer, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, playAudio, useChat, useCompletion, useSpeech, useVoiceChat, useVoiceInput };
2492
+ //# sourceMappingURL=index.js.map