@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.1

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/README.md +79 -14
  2. package/dist/auto-update-DsWBBnEk.mjs +3 -0
  3. package/dist/browser/index.d.mts +401 -5
  4. package/dist/browser/index.d.mts.map +1 -1
  5. package/dist/browser/index.mjs +1772 -146
  6. package/dist/browser/index.mjs.map +1 -1
  7. package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-JEPeM2YE.mjs} +1 -1
  8. package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-Y9F7W5VQ.mjs} +514 -73
  9. package/dist/chrome-backend-Y9F7W5VQ.mjs.map +1 -0
  10. package/dist/cli.mjs +3359 -646
  11. package/dist/cli.mjs.map +1 -1
  12. package/dist/frameworks/express.d.mts +1 -1
  13. package/dist/frameworks/express.mjs +3 -3
  14. package/dist/frameworks/fastify.d.mts +1 -1
  15. package/dist/frameworks/fastify.mjs +3 -3
  16. package/dist/frameworks/hono.d.mts +1 -1
  17. package/dist/frameworks/hono.mjs +3 -3
  18. package/dist/frameworks/next.d.mts +2 -2
  19. package/dist/frameworks/next.mjs +3 -3
  20. package/dist/frameworks/react.d.mts +1 -1
  21. package/dist/frameworks/trpc.d.mts +1 -1
  22. package/dist/frameworks/trpc.mjs +3 -3
  23. package/dist/gerbil-DeQlX_Mt.mjs +5 -0
  24. package/dist/gerbil-POAz8peb.d.mts +431 -0
  25. package/dist/gerbil-POAz8peb.d.mts.map +1 -0
  26. package/dist/gerbil-yoSpRHgv.mjs +1463 -0
  27. package/dist/gerbil-yoSpRHgv.mjs.map +1 -0
  28. package/dist/index.d.mts +395 -9
  29. package/dist/index.d.mts.map +1 -1
  30. package/dist/index.mjs +8 -6
  31. package/dist/index.mjs.map +1 -1
  32. package/dist/integrations/ai-sdk.d.mts +122 -4
  33. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  34. package/dist/integrations/ai-sdk.mjs +239 -11
  35. package/dist/integrations/ai-sdk.mjs.map +1 -1
  36. package/dist/integrations/langchain.d.mts +132 -2
  37. package/dist/integrations/langchain.d.mts.map +1 -1
  38. package/dist/integrations/langchain.mjs +176 -8
  39. package/dist/integrations/langchain.mjs.map +1 -1
  40. package/dist/integrations/llamaindex.d.mts +1 -1
  41. package/dist/integrations/llamaindex.mjs +3 -3
  42. package/dist/integrations/mcp-client.mjs +4 -4
  43. package/dist/integrations/mcp-client.mjs.map +1 -1
  44. package/dist/integrations/mcp.d.mts +2 -2
  45. package/dist/integrations/mcp.d.mts.map +1 -1
  46. package/dist/integrations/mcp.mjs +6 -6
  47. package/dist/{mcp-R8kRLIKb.mjs → mcp-Bitg4sjX.mjs} +10 -37
  48. package/dist/mcp-Bitg4sjX.mjs.map +1 -0
  49. package/dist/microphone-D-6y9aiE.mjs +3 -0
  50. package/dist/{models-DKULvhOr.mjs → models-BAtL8qsA.mjs} +42 -7
  51. package/dist/models-BAtL8qsA.mjs.map +1 -0
  52. package/dist/{models-De2-_GmQ.d.mts → models-CE0fBq0U.d.mts} +2 -2
  53. package/dist/models-CE0fBq0U.d.mts.map +1 -0
  54. package/dist/{one-liner-BUQR0nqq.mjs → one-liner-B1rmFto6.mjs} +2 -2
  55. package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-B1rmFto6.mjs.map} +1 -1
  56. package/dist/repl-D20JO260.mjs +10 -0
  57. package/dist/skills/index.d.mts +303 -12
  58. package/dist/skills/index.d.mts.map +1 -1
  59. package/dist/skills/index.mjs +6 -6
  60. package/dist/skills-5DxAV-rn.mjs +1435 -0
  61. package/dist/skills-5DxAV-rn.mjs.map +1 -0
  62. package/dist/stt-Bv_dum-R.mjs +433 -0
  63. package/dist/stt-Bv_dum-R.mjs.map +1 -0
  64. package/dist/stt-KzSoNvwI.mjs +3 -0
  65. package/dist/{tools-BsiEE6f2.mjs → tools-IYPrqoek.mjs} +6 -7
  66. package/dist/{tools-BsiEE6f2.mjs.map → tools-IYPrqoek.mjs.map} +1 -1
  67. package/dist/tts-5yWeP_I0.mjs +3 -0
  68. package/dist/tts-DG6denWG.mjs +729 -0
  69. package/dist/tts-DG6denWG.mjs.map +1 -0
  70. package/dist/types-s6Py2_DL.d.mts +353 -0
  71. package/dist/types-s6Py2_DL.d.mts.map +1 -0
  72. package/dist/{utils-7vXqtq2Q.mjs → utils-CkB4Roi6.mjs} +1 -1
  73. package/dist/{utils-7vXqtq2Q.mjs.map → utils-CkB4Roi6.mjs.map} +1 -1
  74. package/docs/ai-sdk.md +137 -21
  75. package/docs/browser.md +241 -2
  76. package/docs/memory.md +72 -0
  77. package/docs/stt.md +494 -0
  78. package/docs/tts.md +569 -0
  79. package/docs/vision.md +396 -0
  80. package/package.json +17 -18
  81. package/dist/auto-update-BbNHbSU1.mjs +0 -3
  82. package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
  83. package/dist/gerbil-BfnsFWRE.mjs +0 -644
  84. package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
  85. package/dist/gerbil-BjW-z7Fq.mjs +0 -5
  86. package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
  87. package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
  88. package/dist/mcp-R8kRLIKb.mjs.map +0 -1
  89. package/dist/models-DKULvhOr.mjs.map +0 -1
  90. package/dist/models-De2-_GmQ.d.mts.map +0 -1
  91. package/dist/skills-D3CEpgDc.mjs +0 -630
  92. package/dist/skills-D3CEpgDc.mjs.map +0 -1
  93. package/dist/types-BS1N92Jt.d.mts +0 -183
  94. package/dist/types-BS1N92Jt.d.mts.map +0 -1
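
Among the changes below, the browser bundle gains vision support: useChat() now returns attachedImages, attachImage, removeImage, clearImages, and sendWithImages, and the worker accepts an images array that routes generation through a new vision path. A minimal sketch of that new surface follows, assuming useChat is exported from "@tryhamster/gerbil/browser" (the subpath shown in the useSpeech docs further down); the component name and image URL are illustrative, not taken from the diff.

// Hypothetical consumer of the image-attachment fields added to useChat() in rc.1
import { useChat } from "@tryhamster/gerbil/browser";

function VisionChat() {
  const {
    messages,
    attachedImages,
    attachImage,
    clearImages,
    sendWithImages,
  } = useChat({ autoLoad: true });

  const ask = () => {
    // With images attached, the worker takes the new generateVision path
    // and raises the default maxTokens to 2048 (see the diff below).
    sendWithImages("What is in this picture?", attachedImages);
    clearImages();
  };

  return (
    <div>
      <button onClick={() => attachImage("https://example.com/photo.png")}>Attach</button>
      <button onClick={ask}>Ask</button>
      {messages.map((m) => <p key={m.id}>{m.role}: {m.content}</p>)}
    </div>
  );
}
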
@@ -1,4 +1,4 @@
1
- import { a as resolveModel, t as BUILTIN_MODELS } from "../models-DKULvhOr.mjs";
1
+ import { o as resolveModel, t as BUILTIN_MODELS } from "../models-BAtL8qsA.mjs";
2
2
 
3
3
  //#region src/browser/index.ts
4
4
  /**
@@ -61,40 +61,84 @@ async function createGerbilWorker(options = {}) {
61
61
  import {
62
62
  AutoTokenizer,
63
63
  AutoModelForCausalLM,
64
+ AutoProcessor,
65
+ AutoModelForImageTextToText,
66
+ RawImage,
64
67
  TextStreamer,
65
68
  InterruptableStoppingCriteria,
66
- } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.0";
69
+ env,
70
+ } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
71
+
72
+ // Enable IndexedDB caching for browser (prevents re-downloading models)
73
+ env.useBrowserCache = true;
74
+ env.allowLocalModels = false;
67
75
 
68
76
  class ModelPipeline {
69
77
  static tokenizer = null;
70
78
  static model = null;
79
+ static processor = null;
80
+ static visionModel = null;
71
81
  static modelId = "";
82
+ static isVision = false;
72
83
 
73
84
  static async getInstance(modelId, options = {}, progressCallback) {
74
85
  if (this.modelId !== modelId) {
75
86
  this.tokenizer = null;
76
87
  this.model = null;
88
+ this.processor = null;
89
+ this.visionModel = null;
77
90
  }
78
91
  this.modelId = modelId;
92
+
93
+ // Detect vision models
94
+ this.isVision = options.vision ||
95
+ modelId.toLowerCase().includes("ministral") ||
96
+ modelId.toLowerCase().includes("vision") ||
97
+ modelId.toLowerCase().includes("vlm");
79
98
 
80
99
  const dtype = options.dtype || "q4f16";
81
100
  const device = options.device || "webgpu";
82
101
 
83
- if (!this.tokenizer) {
84
- this.tokenizer = await AutoTokenizer.from_pretrained(modelId, {
85
- progress_callback: progressCallback,
86
- });
87
- }
88
-
89
- if (!this.model) {
90
- this.model = await AutoModelForCausalLM.from_pretrained(modelId, {
91
- dtype,
92
- device,
93
- progress_callback: progressCallback,
94
- });
102
+ if (this.isVision) {
103
+ // Load vision model components
104
+ // Note: Don't specify dtype for vision models - let transformers.js pick defaults
105
+ if (!this.processor) {
106
+ this.processor = await AutoProcessor.from_pretrained(modelId, {
107
+ progress_callback: progressCallback,
108
+ });
109
+ }
110
+ if (!this.visionModel) {
111
+ this.visionModel = await AutoModelForImageTextToText.from_pretrained(modelId, {
112
+ device,
113
+ progress_callback: progressCallback,
114
+ });
115
+ }
116
+ return {
117
+ processor: this.processor,
118
+ model: this.visionModel,
119
+ tokenizer: this.processor.tokenizer,
120
+ isVision: true
121
+ };
122
+ } else {
123
+ // Load text-only model components
124
+ if (!this.tokenizer) {
125
+ this.tokenizer = await AutoTokenizer.from_pretrained(modelId, {
126
+ progress_callback: progressCallback,
127
+ });
128
+ }
129
+ if (!this.model) {
130
+ this.model = await AutoModelForCausalLM.from_pretrained(modelId, {
131
+ dtype,
132
+ device,
133
+ progress_callback: progressCallback,
134
+ });
135
+ }
136
+ return {
137
+ tokenizer: this.tokenizer,
138
+ model: this.model,
139
+ isVision: false
140
+ };
95
141
  }
96
-
97
- return { tokenizer: this.tokenizer, model: this.model };
98
142
  }
99
143
  }
100
144
 
@@ -105,22 +149,19 @@ async function createGerbilWorker(options = {}) {
105
149
  const { modelId, options = {} } = data;
106
150
  self.postMessage({ status: "loading", message: "Loading model..." });
107
151
 
108
- // Track download state - if we see progress < 100, we're downloading
109
152
  const downloadState = {
110
- downloading: new Set(), // Files currently downloading
111
- completed: new Set(), // Files completed
112
- isDownloading: false, // True if any file needed download
153
+ downloading: new Set(),
154
+ completed: new Set(),
155
+ isDownloading: false,
113
156
  };
114
157
 
115
158
  try {
116
- const { tokenizer, model } = await ModelPipeline.getInstance(
159
+ const result = await ModelPipeline.getInstance(
117
160
  modelId,
118
161
  options,
119
162
  (progress) => {
120
163
  if (progress.status === "progress" && progress.file) {
121
164
  const pct = Math.round(progress.progress || 0);
122
-
123
- // If we see progress < 100, this file is being downloaded (not from cache)
124
165
  if (pct < 100) {
125
166
  downloadState.downloading.add(progress.file);
126
167
  downloadState.isDownloading = true;
@@ -128,8 +169,6 @@ async function createGerbilWorker(options = {}) {
128
169
  downloadState.downloading.delete(progress.file);
129
170
  downloadState.completed.add(progress.file);
130
171
  }
131
-
132
- // Only emit downloading status if actually downloading
133
172
  if (downloadState.isDownloading) {
134
173
  self.postMessage({
135
174
  status: "downloading",
@@ -144,96 +183,229 @@ async function createGerbilWorker(options = {}) {
144
183
  );
145
184
 
146
185
  self.postMessage({ status: "loading", message: "Compiling shaders..." });
147
- const warmupInputs = tokenizer("a");
148
- await model.generate({ ...warmupInputs, max_new_tokens: 1 });
186
+
187
+ // Warmup differs for vision vs text models
188
+ if (result.isVision) {
189
+ // Vision models need both text and vision warmup
190
+ // Text warmup first
191
+ const textWarmupInputs = result.tokenizer("hello");
192
+ await result.model.generate({ ...textWarmupInputs, max_new_tokens: 1 });
193
+
194
+ // Vision warmup with synthetic image
195
+ self.postMessage({ status: "loading", message: "Warming up vision encoder..." });
196
+ try {
197
+ // Create a tiny 8x8 test image using OffscreenCanvas
198
+ const canvas = new OffscreenCanvas(8, 8);
199
+ const ctx = canvas.getContext("2d");
200
+ ctx.fillStyle = "red";
201
+ ctx.fillRect(0, 0, 8, 8);
202
+ const blob = await canvas.convertToBlob({ type: "image/png" });
203
+ const warmupImage = await RawImage.fromBlob(blob);
204
+
205
+ // Process with vision pipeline
206
+ const warmupContent = [{ type: "image" }, { type: "text", text: "hi" }];
207
+ const warmupMessages = [{ role: "user", content: warmupContent }];
208
+ const warmupPrompt = result.processor.apply_chat_template(warmupMessages, { add_generation_prompt: true });
209
+ const warmupInputs = await result.processor(warmupImage, warmupPrompt, { add_special_tokens: false });
210
+
211
+ // Run vision warmup generation
212
+ await result.model.generate({
213
+ ...warmupInputs,
214
+ max_new_tokens: 1,
215
+ });
216
+ } catch (warmupErr) {
217
+ console.warn("Vision warmup failed (non-fatal):", warmupErr);
218
+ }
219
+ } else {
220
+ const warmupInputs = result.tokenizer("a");
221
+ await result.model.generate({ ...warmupInputs, max_new_tokens: 1 });
222
+ }
149
223
 
150
- self.postMessage({ status: "ready" });
224
+ self.postMessage({ status: "ready", isVision: result.isVision });
151
225
  } catch (error) {
152
226
  self.postMessage({ status: "error", error: error.message || String(error) });
153
227
  }
154
228
  }
155
229
 
156
230
  async function generate(data) {
157
- const { messages, options = {} } = data;
231
+ const { messages, images = [], options = {} } = data;
158
232
  const { maxTokens = 256, temperature = 0.7, topP = 0.9, topK = 20, thinking = false } = options;
159
233
 
160
234
  try {
161
- const { tokenizer, model } = await ModelPipeline.getInstance(ModelPipeline.modelId, {});
162
-
163
- const inputs = tokenizer.apply_chat_template(messages, {
164
- add_generation_prompt: true,
165
- return_dict: true,
166
- enable_thinking: thinking,
167
- });
168
-
169
- let state = "answering";
170
- const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] = tokenizer.encode(
171
- "<think></think>",
172
- { add_special_tokens: false }
173
- );
174
-
175
- let startTime = null;
176
- let numTokens = 0;
177
-
178
- // Token callback for state tracking (receives raw token IDs)
179
- const tokenCallback = (tokens) => {
180
- startTime ??= performance.now();
181
- numTokens++;
182
-
183
- const tokenId = Number(tokens[0]);
184
- if (tokenId === START_THINKING_TOKEN_ID) {
185
- state = "thinking";
186
- } else if (tokenId === END_THINKING_TOKEN_ID) {
187
- state = "answering";
188
- }
189
- };
190
-
191
- // Text callback for streaming (receives decoded text)
192
- const streamCallback = (text) => {
193
- const tps = startTime ? (numTokens / (performance.now() - startTime)) * 1000 : 0;
194
- self.postMessage({ status: "token", text, state, numTokens, tps });
195
- };
196
-
197
- const streamer = new TextStreamer(tokenizer, {
198
- skip_prompt: true,
199
- skip_special_tokens: true,
200
- callback_function: streamCallback,
201
- token_callback_function: tokenCallback,
202
- });
203
-
204
- self.postMessage({ status: "start" });
205
-
206
- const { past_key_values, sequences } = await model.generate({
207
- ...inputs,
208
- past_key_values: pastKeyValuesCache,
209
- do_sample: temperature > 0,
210
- temperature: temperature > 0 ? temperature : undefined,
211
- top_p: topP,
212
- top_k: topK,
213
- max_new_tokens: maxTokens,
214
- streamer,
215
- stopping_criteria: stoppingCriteria,
216
- return_dict_in_generate: true,
217
- });
218
-
219
- pastKeyValuesCache = past_key_values;
220
-
221
- const endTime = performance.now();
222
- const totalTime = startTime ? endTime - startTime : 0;
223
- const decoded = tokenizer.batch_decode(sequences, { skip_special_tokens: true });
224
-
225
- self.postMessage({
226
- status: "complete",
227
- text: decoded[0] || "",
228
- numTokens,
229
- totalTime,
230
- tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
231
- });
235
+ const result = await ModelPipeline.getInstance(ModelPipeline.modelId, {});
236
+
237
+ // Route to vision or text generation
238
+ if (result.isVision && images.length > 0) {
239
+ await generateVision(result, messages, images, options);
240
+ } else {
241
+ await generateText(result, messages, options);
242
+ }
232
243
  } catch (error) {
233
244
  self.postMessage({ status: "error", error: error.message || String(error) });
234
245
  }
235
246
  }
236
247
 
248
+ async function generateText(result, messages, options) {
249
+ const { maxTokens = 256, temperature = 0.7, topP = 0.9, topK = 20, thinking = false } = options;
250
+ const { tokenizer, model } = result;
251
+
252
+ const inputs = tokenizer.apply_chat_template(messages, {
253
+ add_generation_prompt: true,
254
+ return_dict: true,
255
+ enable_thinking: thinking,
256
+ });
257
+
258
+ let state = "answering";
259
+ const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] = tokenizer.encode(
260
+ "<think></think>",
261
+ { add_special_tokens: false }
262
+ );
263
+
264
+ let startTime = null;
265
+ let numTokens = 0;
266
+
267
+ const tokenCallback = (tokens) => {
268
+ startTime ??= performance.now();
269
+ numTokens += 1;
270
+ const tokenId = Number(tokens[0]);
271
+ if (tokenId === START_THINKING_TOKEN_ID) state = "thinking";
272
+ else if (tokenId === END_THINKING_TOKEN_ID) state = "answering";
273
+ };
274
+
275
+ const streamCallback = (text) => {
276
+ const tps = startTime ? (numTokens / (performance.now() - startTime)) * 1000 : 0;
277
+ self.postMessage({ status: "token", text, state, numTokens, tps });
278
+ };
279
+
280
+ const streamer = new TextStreamer(tokenizer, {
281
+ skip_prompt: true,
282
+ skip_special_tokens: true,
283
+ callback_function: streamCallback,
284
+ token_callback_function: tokenCallback,
285
+ });
286
+
287
+ self.postMessage({ status: "start" });
288
+
289
+ const { past_key_values, sequences } = await model.generate({
290
+ ...inputs,
291
+ past_key_values: pastKeyValuesCache,
292
+ do_sample: temperature > 0,
293
+ temperature: temperature > 0 ? temperature : undefined,
294
+ top_p: topP,
295
+ top_k: topK,
296
+ max_new_tokens: maxTokens,
297
+ streamer,
298
+ stopping_criteria: stoppingCriteria,
299
+ return_dict_in_generate: true,
300
+ });
301
+
302
+ pastKeyValuesCache = past_key_values;
303
+
304
+ const endTime = performance.now();
305
+ const totalTime = startTime ? endTime - startTime : 0;
306
+ const decoded = tokenizer.batch_decode(sequences, { skip_special_tokens: true });
307
+
308
+ self.postMessage({
309
+ status: "complete",
310
+ text: decoded[0] || "",
311
+ numTokens,
312
+ totalTime,
313
+ tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
314
+ });
315
+ }
316
+
317
+ async function generateVision(result, messages, images, options) {
318
+ const { maxTokens = 2048, temperature = 0.7, topP = 0.9, topK = 20 } = options;
319
+ const { processor, model, tokenizer } = result;
320
+
321
+ self.postMessage({ status: "progress", message: "Preparing vision request..." });
322
+
323
+ // Build message content with image placeholders and text
324
+ const lastMessage = messages[messages.length - 1];
325
+ const content = [];
326
+ for (const _ of images) {
327
+ content.push({ type: "image" });
328
+ }
329
+ content.push({ type: "text", text: lastMessage.content });
330
+
331
+ // For vision models, include a brief system instruction for concise responses
332
+ // Note: Vision processors handle system differently than text models
333
+ const visionMessages = [
334
+ { role: "system", content: "You are a helpful assistant. Be concise and direct in your responses." },
335
+ { role: "user", content }
336
+ ];
337
+
338
+ // Apply chat template with generation prompt
339
+ const chatPrompt = processor.apply_chat_template(visionMessages, {
340
+ add_generation_prompt: true
341
+ });
342
+
343
+ // Load images (handle both string URLs and { source: string } objects)
344
+ self.postMessage({ status: "progress", message: "Loading images..." });
345
+ const loadedImages = await Promise.all(
346
+ images.map(img => {
347
+ const url = typeof img === "string" ? img : img.source;
348
+ return RawImage.fromURL(url);
349
+ })
350
+ );
351
+ self.postMessage({ status: "progress", message: "Processing inputs..." });
352
+
353
+ // Process inputs
354
+ const inputs = await processor(
355
+ loadedImages.length === 1 ? loadedImages[0] : loadedImages,
356
+ chatPrompt,
357
+ { add_special_tokens: false }
358
+ );
359
+ self.postMessage({ status: "progress", message: "Generating response..." });
360
+
361
+ let startTime = null;
362
+ let numTokens = 0;
363
+
364
+ const streamCallback = (text) => {
365
+ startTime ??= performance.now();
366
+ numTokens += 1;
367
+ const tps = (numTokens / (performance.now() - startTime)) * 1000;
368
+ self.postMessage({ status: "token", text, state: "answering", numTokens, tps });
369
+ };
370
+
371
+ const streamer = new TextStreamer(tokenizer, {
372
+ skip_prompt: true,
373
+ skip_special_tokens: true,
374
+ callback_function: streamCallback,
375
+ });
376
+
377
+ self.postMessage({ status: "start" });
378
+
379
+ const outputs = await model.generate({
380
+ ...inputs,
381
+ max_new_tokens: maxTokens,
382
+ do_sample: temperature > 0,
383
+ temperature: temperature > 0 ? temperature : undefined,
384
+ top_p: topP,
385
+ top_k: topK,
386
+ streamer,
387
+ stopping_criteria: stoppingCriteria,
388
+ });
389
+
390
+ // Decode output (skip prompt)
391
+ const inputLength = inputs.input_ids.dims?.at(-1) || 0;
392
+ const decoded = processor.batch_decode(
393
+ outputs.slice(null, [inputLength, null]),
394
+ { skip_special_tokens: true }
395
+ );
396
+
397
+ const endTime = performance.now();
398
+ const totalTime = startTime ? endTime - startTime : 0;
399
+
400
+ self.postMessage({
401
+ status: "complete",
402
+ text: decoded[0] || "",
403
+ numTokens,
404
+ totalTime,
405
+ tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
406
+ });
407
+ }
408
+
237
409
  self.addEventListener("message", async (e) => {
238
410
  const { type, ...data } = e.data;
239
411
  switch (type) {
@@ -303,30 +475,34 @@ async function createGerbilWorker(options = {}) {
303
475
  reject(new Error(error));
304
476
  };
305
477
  const gerbilWorker = {
306
- generate: (prompt, options$1 = {}) => {
307
- return new Promise((res, rej) => {
308
- currentResolve = res;
309
- currentReject = rej;
310
- const messages = [{
311
- role: "system",
312
- content: options$1.system || "You are a helpful assistant."
313
- }, {
314
- role: "user",
315
- content: prompt
316
- }];
317
- worker.postMessage({
318
- type: "generate",
319
- messages,
320
- options: {
321
- maxTokens: options$1.maxTokens ?? 256,
322
- temperature: options$1.temperature ?? .7,
323
- topP: options$1.topP ?? .9,
324
- topK: options$1.topK ?? 20,
325
- thinking: options$1.thinking ?? false
326
- }
327
- });
478
+ generate: (prompt, options$1 = {}) => new Promise((res, rej) => {
479
+ currentResolve = res;
480
+ currentReject = rej;
481
+ const system = options$1.system || "You are a helpful assistant.";
482
+ const messages = options$1.history ? [{
483
+ role: "system",
484
+ content: system
485
+ }, ...options$1.history] : [{
486
+ role: "system",
487
+ content: system
488
+ }, {
489
+ role: "user",
490
+ content: prompt
491
+ }];
492
+ if (options$1.history) worker.postMessage({ type: "reset" });
493
+ worker.postMessage({
494
+ type: "generate",
495
+ messages,
496
+ images: options$1.images || [],
497
+ options: {
498
+ maxTokens: options$1.maxTokens ?? (options$1.images?.length ? 2048 : 256),
499
+ temperature: options$1.temperature ?? .7,
500
+ topP: options$1.topP ?? .9,
501
+ topK: options$1.topK ?? 20,
502
+ thinking: options$1.thinking ?? false
503
+ }
328
504
  });
329
- },
505
+ }),
330
506
  interrupt: () => {
331
507
  worker.postMessage({ type: "interrupt" });
332
508
  },
@@ -383,6 +559,7 @@ function useChat(options = {}) {
383
559
  const [error, setError] = useState(null);
384
560
  const [isReady, setIsReady] = useState(false);
385
561
  const [shouldLoad, setShouldLoad] = useState(autoLoad);
562
+ const [attachedImages, setAttachedImages] = useState([]);
386
563
  const workerRef = useRef(null);
387
564
  const messageIdRef = useRef(0);
388
565
  const mountedRef = useRef(true);
@@ -455,23 +632,34 @@ function useChat(options = {}) {
455
632
  setCurrentResponse("");
456
633
  setThinking("");
457
634
  }
458
- return () => {};
459
635
  }, [
460
636
  isGenerating,
461
637
  currentResponse,
462
638
  thinking
463
639
  ]);
464
640
  const pendingMessageRef = useRef(null);
465
- const handleSubmit = useCallback((e) => {
466
- e?.preventDefault?.();
467
- if (!input.trim() || isGenerating) return;
641
+ const pendingImagesRef = useRef([]);
642
+ const attachImage = useCallback((imageUrl) => {
643
+ setAttachedImages((imgs) => [...imgs, imageUrl]);
644
+ }, []);
645
+ const removeImage = useCallback((index) => {
646
+ setAttachedImages((imgs) => imgs.filter((_, i) => i !== index));
647
+ }, []);
648
+ const clearImages = useCallback(() => {
649
+ setAttachedImages([]);
650
+ }, []);
651
+ const sendMessageWithImages = useCallback((text, images) => {
652
+ if (!text.trim() || isGenerating) return;
653
+ messageIdRef.current += 1;
468
654
  const userMessage = {
469
- id: `msg-${++messageIdRef.current}`,
655
+ id: `msg-${messageIdRef.current}`,
470
656
  role: "user",
471
- content: input.trim()
657
+ content: text.trim(),
658
+ images: images.length > 0 ? images : void 0
472
659
  };
660
+ messageIdRef.current += 1;
473
661
  const assistantMessage = {
474
- id: `msg-${++messageIdRef.current}`,
662
+ id: `msg-${messageIdRef.current}`,
475
663
  role: "assistant",
476
664
  content: ""
477
665
  };
@@ -480,23 +668,23 @@ function useChat(options = {}) {
480
668
  userMessage,
481
669
  assistantMessage
482
670
  ]);
483
- setInput("");
484
671
  setCurrentResponse("");
485
672
  setThinking("");
486
673
  if (!workerRef.current) {
487
- pendingMessageRef.current = userMessage.content;
674
+ pendingMessageRef.current = text.trim();
675
+ pendingImagesRef.current = images;
488
676
  load();
489
677
  return;
490
678
  }
491
679
  setIsGenerating(true);
492
- workerRef.current.generate(userMessage.content, {
680
+ workerRef.current.generate(text.trim(), {
493
681
  system,
494
682
  thinking: enableThinking,
495
- maxTokens,
496
- temperature
683
+ maxTokens: images.length > 0 ? Math.max(maxTokens, 2048) : maxTokens,
684
+ temperature,
685
+ images: images.length > 0 ? images : void 0
497
686
  });
498
687
  }, [
499
- input,
500
688
  isGenerating,
501
689
  system,
502
690
  enableThinking,
@@ -504,19 +692,36 @@ function useChat(options = {}) {
504
692
  temperature,
505
693
  load
506
694
  ]);
695
+ const handleSubmit = useCallback((e) => {
696
+ e?.preventDefault?.();
697
+ if (!input.trim() || isGenerating) return;
698
+ sendMessageWithImages(input, attachedImages);
699
+ setInput("");
700
+ setAttachedImages([]);
701
+ }, [
702
+ input,
703
+ isGenerating,
704
+ attachedImages,
705
+ sendMessageWithImages
706
+ ]);
707
+ const sendWithImages = useCallback((text, images) => {
708
+ sendMessageWithImages(text, images);
709
+ }, [sendMessageWithImages]);
507
710
  useEffect(() => {
508
711
  if (isReady && pendingMessageRef.current && workerRef.current) {
509
712
  const pendingContent = pendingMessageRef.current;
713
+ const pendingImages = pendingImagesRef.current;
510
714
  pendingMessageRef.current = null;
715
+ pendingImagesRef.current = [];
511
716
  setIsGenerating(true);
512
717
  workerRef.current.generate(pendingContent, {
513
718
  system,
514
719
  thinking: enableThinking,
515
- maxTokens,
516
- temperature
720
+ maxTokens: pendingImages.length > 0 ? Math.max(maxTokens, 2048) : maxTokens,
721
+ temperature,
722
+ images: pendingImages.length > 0 ? pendingImages : void 0
517
723
  });
518
724
  }
519
- return () => {};
520
725
  }, [
521
726
  isReady,
522
727
  system,
@@ -533,6 +738,7 @@ function useChat(options = {}) {
533
738
  setMessages([]);
534
739
  setCurrentResponse("");
535
740
  setThinking("");
741
+ setAttachedImages([]);
536
742
  }, []);
537
743
  return {
538
744
  messages: messages.map((m, i) => {
@@ -555,7 +761,12 @@ function useChat(options = {}) {
555
761
  tps,
556
762
  isReady,
557
763
  error,
558
- load
764
+ load,
765
+ attachedImages,
766
+ attachImage,
767
+ removeImage,
768
+ clearImages,
769
+ sendWithImages
559
770
  };
560
771
  }
561
772
  /**
@@ -597,6 +808,7 @@ function useCompletion(options = {}) {
597
808
  const resolveRef = useRef(null);
598
809
  const rejectRef = useRef(null);
599
810
  const pendingPromptRef = useRef(null);
811
+ const pendingImagesRef = useRef(void 0);
600
812
  const mountedRef = useRef(true);
601
813
  const load = useCallback(() => {
602
814
  if (workerRef.current || isLoading) return;
@@ -656,7 +868,7 @@ function useCompletion(options = {}) {
656
868
  workerRef.current?.terminate();
657
869
  };
658
870
  }, [model, shouldLoad]);
659
- const complete = useCallback((prompt) => {
871
+ const complete = useCallback((prompt, completeOptions) => {
660
872
  return new Promise((resolve, reject) => {
661
873
  setCompletion("");
662
874
  setThinking("");
@@ -664,6 +876,7 @@ function useCompletion(options = {}) {
664
876
  rejectRef.current = reject;
665
877
  if (!workerRef.current) {
666
878
  pendingPromptRef.current = prompt;
879
+ pendingImagesRef.current = completeOptions?.images;
667
880
  load();
668
881
  return;
669
882
  }
@@ -672,7 +885,8 @@ function useCompletion(options = {}) {
672
885
  system,
673
886
  thinking: enableThinking,
674
887
  maxTokens,
675
- temperature
888
+ temperature,
889
+ images: completeOptions?.images
676
890
  });
677
891
  });
678
892
  }, [
@@ -685,16 +899,18 @@ function useCompletion(options = {}) {
685
899
  useEffect(() => {
686
900
  if (isReady && pendingPromptRef.current && workerRef.current) {
687
901
  const pendingPrompt = pendingPromptRef.current;
902
+ const pendingImages = pendingImagesRef.current;
688
903
  pendingPromptRef.current = null;
904
+ pendingImagesRef.current = void 0;
689
905
  setIsGenerating(true);
690
906
  workerRef.current.generate(pendingPrompt, {
691
907
  system,
692
908
  thinking: enableThinking,
693
909
  maxTokens,
694
- temperature
910
+ temperature,
911
+ images: pendingImages
695
912
  });
696
913
  }
697
- return () => {};
698
914
  }, [
699
915
  isReady,
700
916
  system,
@@ -719,6 +935,1414 @@ function useCompletion(options = {}) {
719
935
  load
720
936
  };
721
937
  }
938
+ /** Kokoro voice definitions (24kHz, high quality) */
939
+ const KOKORO_BROWSER_VOICES = [
940
+ {
941
+ id: "af_heart",
942
+ name: "Heart",
943
+ gender: "female",
944
+ language: "en-us",
945
+ description: "American female, highest quality (Grade A)"
946
+ },
947
+ {
948
+ id: "af_bella",
949
+ name: "Bella",
950
+ gender: "female",
951
+ language: "en-us",
952
+ description: "American female, warm and friendly (Grade A-)"
953
+ },
954
+ {
955
+ id: "af_nicole",
956
+ name: "Nicole",
957
+ gender: "female",
958
+ language: "en-us",
959
+ description: "American female, soft and gentle"
960
+ },
961
+ {
962
+ id: "af_sarah",
963
+ name: "Sarah",
964
+ gender: "female",
965
+ language: "en-us",
966
+ description: "American female, clear and professional"
967
+ },
968
+ {
969
+ id: "af_sky",
970
+ name: "Sky",
971
+ gender: "female",
972
+ language: "en-us",
973
+ description: "American female, young and energetic"
974
+ },
975
+ {
976
+ id: "af_alloy",
977
+ name: "Alloy",
978
+ gender: "female",
979
+ language: "en-us",
980
+ description: "American female"
981
+ },
982
+ {
983
+ id: "af_aoede",
984
+ name: "Aoede",
985
+ gender: "female",
986
+ language: "en-us",
987
+ description: "American female, mythical"
988
+ },
989
+ {
990
+ id: "af_jessica",
991
+ name: "Jessica",
992
+ gender: "female",
993
+ language: "en-us",
994
+ description: "American female"
995
+ },
996
+ {
997
+ id: "af_kore",
998
+ name: "Kore",
999
+ gender: "female",
1000
+ language: "en-us",
1001
+ description: "American female"
1002
+ },
1003
+ {
1004
+ id: "af_nova",
1005
+ name: "Nova",
1006
+ gender: "female",
1007
+ language: "en-us",
1008
+ description: "American female"
1009
+ },
1010
+ {
1011
+ id: "af_river",
1012
+ name: "River",
1013
+ gender: "female",
1014
+ language: "en-us",
1015
+ description: "American female"
1016
+ },
1017
+ {
1018
+ id: "am_fenrir",
1019
+ name: "Fenrir",
1020
+ gender: "male",
1021
+ language: "en-us",
1022
+ description: "American male, best quality"
1023
+ },
1024
+ {
1025
+ id: "am_michael",
1026
+ name: "Michael",
1027
+ gender: "male",
1028
+ language: "en-us",
1029
+ description: "American male, warm and friendly"
1030
+ },
1031
+ {
1032
+ id: "am_adam",
1033
+ name: "Adam",
1034
+ gender: "male",
1035
+ language: "en-us",
1036
+ description: "American male"
1037
+ },
1038
+ {
1039
+ id: "am_echo",
1040
+ name: "Echo",
1041
+ gender: "male",
1042
+ language: "en-us",
1043
+ description: "American male"
1044
+ },
1045
+ {
1046
+ id: "am_eric",
1047
+ name: "Eric",
1048
+ gender: "male",
1049
+ language: "en-us",
1050
+ description: "American male"
1051
+ },
1052
+ {
1053
+ id: "am_liam",
1054
+ name: "Liam",
1055
+ gender: "male",
1056
+ language: "en-us",
1057
+ description: "American male"
1058
+ },
1059
+ {
1060
+ id: "am_onyx",
1061
+ name: "Onyx",
1062
+ gender: "male",
1063
+ language: "en-us",
1064
+ description: "American male"
1065
+ },
1066
+ {
1067
+ id: "am_puck",
1068
+ name: "Puck",
1069
+ gender: "male",
1070
+ language: "en-us",
1071
+ description: "American male"
1072
+ },
1073
+ {
1074
+ id: "am_santa",
1075
+ name: "Santa",
1076
+ gender: "male",
1077
+ language: "en-us",
1078
+ description: "American male, festive"
1079
+ },
1080
+ {
1081
+ id: "bf_emma",
1082
+ name: "Emma",
1083
+ gender: "female",
1084
+ language: "en-gb",
1085
+ description: "British female, elegant and clear"
1086
+ },
1087
+ {
1088
+ id: "bf_isabella",
1089
+ name: "Isabella",
1090
+ gender: "female",
1091
+ language: "en-gb",
1092
+ description: "British female, sophisticated"
1093
+ },
1094
+ {
1095
+ id: "bf_alice",
1096
+ name: "Alice",
1097
+ gender: "female",
1098
+ language: "en-gb",
1099
+ description: "British female"
1100
+ },
1101
+ {
1102
+ id: "bf_lily",
1103
+ name: "Lily",
1104
+ gender: "female",
1105
+ language: "en-gb",
1106
+ description: "British female"
1107
+ },
1108
+ {
1109
+ id: "bm_george",
1110
+ name: "George",
1111
+ gender: "male",
1112
+ language: "en-gb",
1113
+ description: "British male, distinguished"
1114
+ },
1115
+ {
1116
+ id: "bm_lewis",
1117
+ name: "Lewis",
1118
+ gender: "male",
1119
+ language: "en-gb",
1120
+ description: "British male, friendly"
1121
+ },
1122
+ {
1123
+ id: "bm_daniel",
1124
+ name: "Daniel",
1125
+ gender: "male",
1126
+ language: "en-gb",
1127
+ description: "British male"
1128
+ },
1129
+ {
1130
+ id: "bm_fable",
1131
+ name: "Fable",
1132
+ gender: "male",
1133
+ language: "en-gb",
1134
+ description: "British male"
1135
+ }
1136
+ ];
1137
+ /** Supertonic voice definitions (44.1kHz, faster) */
1138
+ const SUPERTONIC_BROWSER_VOICES = [
1139
+ {
1140
+ id: "F1",
1141
+ name: "Female 1",
1142
+ gender: "female",
1143
+ language: "en",
1144
+ description: "Female voice 1 - Clear and natural"
1145
+ },
1146
+ {
1147
+ id: "F2",
1148
+ name: "Female 2",
1149
+ gender: "female",
1150
+ language: "en",
1151
+ description: "Female voice 2 - Warm and expressive"
1152
+ },
1153
+ {
1154
+ id: "M1",
1155
+ name: "Male 1",
1156
+ gender: "male",
1157
+ language: "en",
1158
+ description: "Male voice 1 - Deep and confident"
1159
+ },
1160
+ {
1161
+ id: "M2",
1162
+ name: "Male 2",
1163
+ gender: "male",
1164
+ language: "en",
1165
+ description: "Male voice 2 - Friendly and casual"
1166
+ }
1167
+ ];
1168
+ /** TTS model configuration */
1169
+ const TTS_MODELS = {
1170
+ "kokoro-82m": {
1171
+ repo: "onnx-community/Kokoro-82M-v1.0-ONNX",
1172
+ defaultVoice: "af_heart",
1173
+ sampleRate: 24e3,
1174
+ voices: KOKORO_BROWSER_VOICES
1175
+ },
1176
+ "supertonic-66m": {
1177
+ repo: "onnx-community/Supertonic-TTS-ONNX",
1178
+ defaultVoice: "F1",
1179
+ sampleRate: 44100,
1180
+ voices: SUPERTONIC_BROWSER_VOICES
1181
+ }
1182
+ };
1183
+ /**
1184
+ * React hook for text-to-speech with Web Audio API playback
1185
+ *
1186
+ * Supports both Kokoro (24kHz, high quality) and Supertonic (44.1kHz, faster).
1187
+ *
1188
+ * @example
1189
+ * ```tsx
1190
+ * import { useSpeech } from "@tryhamster/gerbil/browser";
1191
+ *
1192
+ * function App() {
1193
+ * // Default: Kokoro TTS
1194
+ * const { speak, stop, isLoading, isSpeaking, listVoices, setVoice } = useSpeech();
1195
+ *
1196
+ * // Or use Supertonic (44.1kHz, faster)
1197
+ * // const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
1198
+ *
1199
+ * if (isLoading) return <div>Loading TTS...</div>;
1200
+ *
1201
+ * return (
1202
+ * <div>
1203
+ * <select onChange={e => setVoice(e.target.value)}>
1204
+ * {listVoices().map(v => (
1205
+ * <option key={v.id} value={v.id}>{v.name}</option>
1206
+ * ))}
1207
+ * </select>
1208
+ * <button onClick={() => speak("Hello world!")}>
1209
+ * {isSpeaking ? "Speaking..." : "Speak"}
1210
+ * </button>
1211
+ * {isSpeaking && <button onClick={stop}>Stop</button>}
1212
+ * </div>
1213
+ * );
1214
+ * }
1215
+ * ```
1216
+ */
1217
+ function useSpeech(options = {}) {
1218
+ const React = globalThis.React;
1219
+ if (!React) throw new Error("useSpeech requires React. Import React before using this hook.");
1220
+ const { useState, useEffect, useRef, useCallback } = React;
1221
+ const { model: modelId = "kokoro-82m", speed: defaultSpeed = 1, autoLoad = false, onReady, onError, onStart, onEnd } = options;
1222
+ const modelConfig = TTS_MODELS[modelId];
1223
+ const defaultVoice = options.voice || modelConfig.defaultVoice;
1224
+ const [isLoading, setIsLoading] = useState(autoLoad);
1225
+ const [loadingProgress, setLoadingProgress] = useState(null);
1226
+ const [isSpeaking, setIsSpeaking] = useState(false);
1227
+ const [isReady, setIsReady] = useState(false);
1228
+ const [error, setError] = useState(null);
1229
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
1230
+ const [currentVoice, setCurrentVoice] = useState(defaultVoice);
1231
+ const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
1232
+ const ttsRef = useRef(null);
1233
+ const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
1234
+ const audioContextRef = useRef(null);
1235
+ const sourceNodeRef = useRef(null);
1236
+ const mountedRef = useRef(true);
1237
+ const modelIdRef = useRef(modelId);
1238
+ const listVoices = useCallback(() => {
1239
+ return modelConfig.voices;
1240
+ }, [modelConfig.voices]);
1241
+ const load = useCallback(() => {
1242
+ if (ttsRef.current || isLoading) return;
1243
+ setIsLoading(true);
1244
+ setShouldLoad(true);
1245
+ }, [isLoading]);
1246
+ useEffect(() => {
1247
+ if (!shouldLoad) return;
1248
+ mountedRef.current = true;
1249
+ modelIdRef.current = modelId;
1250
+ const initTTS = async () => {
1251
+ try {
1252
+ const isSupertonic = modelId === "supertonic-66m";
1253
+ const config = TTS_MODELS[modelId];
1254
+ setLoadingProgress({
1255
+ status: "loading",
1256
+ message: `Loading ${isSupertonic ? "Supertonic" : "Kokoro"} TTS...`
1257
+ });
1258
+ if (isSupertonic) {
1259
+ const { pipeline } = await import("@huggingface/transformers");
1260
+ const tts = await pipeline("text-to-speech", config.repo, {
1261
+ device: "webgpu",
1262
+ progress_callback: (progress) => {
1263
+ if (!mountedRef.current) return;
1264
+ if (progress.status === "progress" && progress.file) setLoadingProgress({
1265
+ status: "downloading",
1266
+ file: progress.file,
1267
+ progress: Math.round(progress.progress || 0)
1268
+ });
1269
+ }
1270
+ });
1271
+ if (!mountedRef.current) return;
1272
+ const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
1273
+ const embeddingsMap = /* @__PURE__ */ new Map();
1274
+ await Promise.all(config.voices.map(async (voice) => {
1275
+ try {
1276
+ const response = await fetch(`${voicesUrl}${voice.id}.bin`);
1277
+ if (response.ok) {
1278
+ const buffer = await response.arrayBuffer();
1279
+ embeddingsMap.set(voice.id, new Float32Array(buffer));
1280
+ }
1281
+ } catch (e) {
1282
+ console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
1283
+ }
1284
+ }));
1285
+ if (!mountedRef.current) return;
1286
+ try {
1287
+ await tts("Hello", {
1288
+ speaker_embeddings: new Float32Array(12928),
1289
+ num_inference_steps: 1,
1290
+ speed: 1
1291
+ });
1292
+ } catch (e) {
1293
+ console.warn("Supertonic warmup failed:", e);
1294
+ }
1295
+ voiceEmbeddingsRef.current = embeddingsMap;
1296
+ ttsRef.current = {
1297
+ type: "supertonic",
1298
+ pipeline: tts,
1299
+ config
1300
+ };
1301
+ } else {
1302
+ const { KokoroTTS } = await import("kokoro-js");
1303
+ const tts = await KokoroTTS.from_pretrained(config.repo, {
1304
+ dtype: "fp32",
1305
+ progress_callback: (progress) => {
1306
+ if (!mountedRef.current) return;
1307
+ if (progress.status === "progress" && progress.file) setLoadingProgress({
1308
+ status: "downloading",
1309
+ file: progress.file,
1310
+ progress: Math.round(progress.progress || 0)
1311
+ });
1312
+ }
1313
+ });
1314
+ if (!mountedRef.current) return;
1315
+ ttsRef.current = {
1316
+ type: "kokoro",
1317
+ instance: tts,
1318
+ config
1319
+ };
1320
+ }
1321
+ setIsLoading(false);
1322
+ setIsReady(true);
1323
+ setLoadingProgress({ status: "ready" });
1324
+ onReady?.();
1325
+ } catch (err) {
1326
+ if (!mountedRef.current) return;
1327
+ const errorMsg = err instanceof Error ? err.message : String(err);
1328
+ setError(errorMsg);
1329
+ setIsLoading(false);
1330
+ setLoadingProgress({
1331
+ status: "error",
1332
+ error: errorMsg
1333
+ });
1334
+ onError?.(errorMsg);
1335
+ }
1336
+ };
1337
+ initTTS();
1338
+ return () => {
1339
+ mountedRef.current = false;
1340
+ };
1341
+ }, [
1342
+ shouldLoad,
1343
+ modelId,
1344
+ onReady,
1345
+ onError
1346
+ ]);
1347
+ useEffect(() => {
1348
+ return () => {
1349
+ try {
1350
+ sourceNodeRef.current?.stop();
1351
+ } catch {}
1352
+ try {
1353
+ if (audioContextRef.current && audioContextRef.current.state !== "closed") audioContextRef.current.close();
1354
+ } catch {}
1355
+ };
1356
+ }, []);
1357
+ return {
1358
+ speak: useCallback(async (text, opts) => {
1359
+ const voice = opts?.voice || currentVoice;
1360
+ const speed = opts?.speed || currentSpeed;
1361
+ if (!ttsRef.current) {
1362
+ load();
1363
+ return;
1364
+ }
1365
+ try {
1366
+ setIsSpeaking(true);
1367
+ onStart?.();
1368
+ let audioData;
1369
+ let sampleRate;
1370
+ const ttsBackend = ttsRef.current;
1371
+ if (ttsBackend.type === "supertonic") {
1372
+ const config = ttsBackend.config;
1373
+ if (!config.voices.find((v) => v.id === voice)) {
1374
+ const validVoices = config.voices.map((v) => v.id).join(", ");
1375
+ throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
1376
+ }
1377
+ let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
1378
+ if (!speakerEmbedding) try {
1379
+ const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
1380
+ const response = await fetch(voiceUrl);
1381
+ if (response.ok) {
1382
+ const buffer = await response.arrayBuffer();
1383
+ speakerEmbedding = new Float32Array(buffer);
1384
+ voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
1385
+ } else throw new Error(`Failed to load voice: ${response.status}`);
1386
+ } catch {
1387
+ speakerEmbedding = new Float32Array(12928).fill(.1);
1388
+ voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
1389
+ }
1390
+ const result = await ttsBackend.pipeline(text, {
1391
+ speaker_embeddings: speakerEmbedding,
1392
+ speed
1393
+ });
1394
+ audioData = result.audio;
1395
+ sampleRate = result.sampling_rate;
1396
+ } else {
1397
+ const config = ttsBackend.config;
1398
+ if (!config.voices.find((v) => v.id === voice)) {
1399
+ const validVoices = config.voices.map((v) => v.id).join(", ");
1400
+ throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
1401
+ }
1402
+ const result = await ttsBackend.instance.generate(text, {
1403
+ voice,
1404
+ speed
1405
+ });
1406
+ audioData = result.audio;
1407
+ sampleRate = result.sampling_rate;
1408
+ }
1409
+ if (!mountedRef.current) return;
1410
+ if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
1411
+ const audioContext = audioContextRef.current;
1412
+ if (audioContext.state === "suspended") await audioContext.resume();
1413
+ const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
1414
+ const channelData = new Float32Array(audioData);
1415
+ audioBuffer.copyToChannel(channelData, 0);
1416
+ if (sourceNodeRef.current) {
1417
+ sourceNodeRef.current.stop();
1418
+ sourceNodeRef.current.disconnect();
1419
+ }
1420
+ const sourceNode = audioContext.createBufferSource();
1421
+ sourceNode.buffer = audioBuffer;
1422
+ sourceNode.connect(audioContext.destination);
1423
+ sourceNode.onended = () => {
1424
+ if (mountedRef.current) {
1425
+ setIsSpeaking(false);
1426
+ onEnd?.();
1427
+ }
1428
+ };
1429
+ sourceNodeRef.current = sourceNode;
1430
+ sourceNode.start();
1431
+ } catch (err) {
1432
+ if (!mountedRef.current) return;
1433
+ const errorMsg = err instanceof Error ? err.message : String(err);
1434
+ setError(errorMsg);
1435
+ setIsSpeaking(false);
1436
+ onError?.(errorMsg);
1437
+ }
1438
+ }, [
1439
+ currentVoice,
1440
+ currentSpeed,
1441
+ load,
1442
+ onStart,
1443
+ onEnd,
1444
+ onError
1445
+ ]),
1446
+ stop: useCallback(() => {
1447
+ if (sourceNodeRef.current) {
1448
+ sourceNodeRef.current.stop();
1449
+ sourceNodeRef.current.disconnect();
1450
+ sourceNodeRef.current = null;
1451
+ }
1452
+ setIsSpeaking(false);
1453
+ }, []),
1454
+ isLoading,
1455
+ loadingProgress,
1456
+ isSpeaking,
1457
+ isReady,
1458
+ load,
1459
+ error,
1460
+ listVoices,
1461
+ currentVoice,
1462
+ setVoice: useCallback((voiceId) => {
1463
+ if (modelConfig.voices.find((v) => v.id === voiceId)) setCurrentVoice(voiceId);
1464
+ else console.warn(`Voice "${voiceId}" not valid for ${modelId}. Available: ${modelConfig.voices.map((v) => v.id).join(", ")}`);
1465
+ }, [modelConfig.voices, modelId]),
1466
+ currentSpeed,
1467
+ setSpeed: useCallback((speed) => {
1468
+ setCurrentSpeed(Math.max(.5, Math.min(2, speed)));
1469
+ }, []),
1470
+ currentModel: modelId,
1471
+ sampleRate: modelConfig.sampleRate
1472
+ };
1473
+ }
1474
+ /**
1475
+ * Play audio from Float32Array using Web Audio API
1476
+ *
1477
+ * @example
1478
+ * ```ts
1479
+ * import { playAudio } from "@tryhamster/gerbil/browser";
1480
+ *
1481
+ * const audio = new Float32Array([...]); // TTS output
1482
+ * const controller = await playAudio(audio, 24000);
1483
+ *
1484
+ * // Stop playback
1485
+ * controller.stop();
1486
+ * ```
1487
+ */
1488
+ async function playAudio(audio, sampleRate = 24e3) {
1489
+ const audioContext = new AudioContext();
1490
+ if (audioContext.state === "suspended") await audioContext.resume();
1491
+ const audioBuffer = audioContext.createBuffer(1, audio.length, sampleRate);
1492
+ const channelData = new Float32Array(audio);
1493
+ audioBuffer.copyToChannel(channelData, 0);
1494
+ const sourceNode = audioContext.createBufferSource();
1495
+ sourceNode.buffer = audioBuffer;
1496
+ sourceNode.connect(audioContext.destination);
1497
+ const onEnded = new Promise((resolve) => {
1498
+ sourceNode.onended = () => {
1499
+ audioContext.close();
1500
+ resolve();
1501
+ };
1502
+ });
1503
+ sourceNode.start();
1504
+ return {
1505
+ stop: () => {
1506
+ sourceNode.stop();
1507
+ audioContext.close();
1508
+ },
1509
+ onEnded
1510
+ };
1511
+ }
1512
+ /**
1513
+ * Create a reusable audio player for streaming TTS
1514
+ *
1515
+ * @example
1516
+ * ```ts
1517
+ * import { createAudioPlayer } from "@tryhamster/gerbil/browser";
1518
+ *
1519
+ * const player = createAudioPlayer(24000);
1520
+ *
1521
+ * // Queue audio chunks as they arrive
1522
+ * player.queue(chunk1);
1523
+ * player.queue(chunk2);
1524
+ *
1525
+ * // Stop and clear
1526
+ * player.stop();
1527
+ * ```
1528
+ */
1529
+ function createAudioPlayer(sampleRate = 24e3) {
1530
+ let audioContext = null;
1531
+ let nextStartTime = 0;
1532
+ let isActive = false;
1533
+ const ensureContext = async () => {
1534
+ if (!audioContext) audioContext = new AudioContext();
1535
+ if (audioContext.state === "suspended") await audioContext.resume();
1536
+ return audioContext;
1537
+ };
1538
+ return {
1539
+ queue: async (audio) => {
1540
+ const ctx = await ensureContext();
1541
+ isActive = true;
1542
+ const buffer = ctx.createBuffer(1, audio.length, sampleRate);
1543
+ const channelData = new Float32Array(audio);
1544
+ buffer.copyToChannel(channelData, 0);
1545
+ const source = ctx.createBufferSource();
1546
+ source.buffer = buffer;
1547
+ source.connect(ctx.destination);
1548
+ const startTime = Math.max(ctx.currentTime, nextStartTime);
1549
+ source.start(startTime);
1550
+ nextStartTime = startTime + buffer.duration;
1551
+ source.onended = () => {
1552
+ if (ctx.currentTime >= nextStartTime - .1) isActive = false;
1553
+ };
1554
+ },
1555
+ stop: () => {
1556
+ isActive = false;
1557
+ nextStartTime = 0;
1558
+ if (audioContext) {
1559
+ audioContext.close();
1560
+ audioContext = null;
1561
+ }
1562
+ },
1563
+ isPlaying: () => isActive
1564
+ };
1565
+ }
1566
+ /**
1567
+ * React hook for voice input with browser microphone
1568
+ *
1569
+ * Uses MediaRecorder to capture audio and Whisper for transcription.
1570
+ * Supports both one-shot and streaming transcription modes.
1571
+ *
1572
+ * @example Basic usage (one-shot)
1573
+ * ```tsx
1574
+ * function VoiceInput() {
1575
+ * const { startRecording, stopRecording, isRecording, transcript } = useVoiceInput({
1576
+ * onTranscript: (text) => console.log("User said:", text),
1577
+ * });
1578
+ *
1579
+ * return (
1580
+ * <button onClick={isRecording ? stopRecording : startRecording}>
1581
+ * {isRecording ? "Stop" : "Record"}
1582
+ * </button>
1583
+ * );
1584
+ * }
1585
+ * ```
1586
+ *
1587
+ * @example Streaming transcription (real-time)
1588
+ * ```tsx
1589
+ * function LiveTranscription() {
1590
+ * const { startRecording, stopRecording, isRecording, transcript, streamingChunk } = useVoiceInput({
1591
+ * streaming: true, // Enable streaming mode
1592
+ * chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
1593
+ * onChunk: (text, idx) => console.log(`Chunk ${idx}: ${text}`),
1594
+ * });
1595
+ *
1596
+ * return (
1597
+ * <div>
1598
+ * <button onClick={isRecording ? stopRecording : startRecording}>
1599
+ * {isRecording ? "Stop" : "Start Live Transcription"}
1600
+ * </button>
1601
+ * <p>Current chunk: {streamingChunk}</p>
1602
+ * <p>Full transcript: {transcript}</p>
1603
+ * </div>
1604
+ * );
1605
+ * }
1606
+ * ```
1607
+ */
1608
+ function useVoiceInput(options = {}) {
1609
+ const React = globalThis.React;
1610
+ if (!React) throw new Error("useVoiceInput requires React. Import React before using this hook.");
1611
+ const { useState, useEffect, useRef, useCallback } = React;
1612
+ const { model = "whisper-tiny.en", autoLoad = false, onReady, onTranscript, onError, onProgress, streaming = false, chunkDuration = 1500, onChunk } = options;
1613
+ const [isLoading, setIsLoading] = useState(autoLoad);
1614
+ const [loadingProgress, setLoadingProgress] = useState(null);
1615
+ const [isReady, setIsReady] = useState(false);
1616
+ const [isRecording, setIsRecording] = useState(false);
1617
+ const [isTranscribing, setIsTranscribing] = useState(false);
1618
+ const [transcript, setTranscript] = useState("");
1619
+ const [streamingChunk, setStreamingChunk] = useState("");
1620
+ const [chunkCount, setChunkCount] = useState(0);
1621
+ const [error, setError] = useState(null);
1622
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
1623
+ const sttRef = useRef(null);
1624
+ const mediaRecorderRef = useRef(null);
1625
+ const audioChunksRef = useRef([]);
1626
+ const streamRef = useRef(null);
1627
+ const mountedRef = useRef(true);
1628
+ const streamingIntervalRef = useRef(null);
1629
+ const pendingChunksRef = useRef([]);
1630
+ const fullTranscriptRef = useRef("");
1631
+ useEffect(() => {
1632
+ if (!shouldLoad || isReady) return;
1633
+ let cancelled = false;
1634
+ const loadModel = async () => {
1635
+ try {
1636
+ setIsLoading(true);
1637
+ setLoadingProgress({
1638
+ status: "loading",
1639
+ message: "Loading STT model..."
1640
+ });
1641
+ onProgress?.({
1642
+ status: "loading",
1643
+ message: "Loading STT model..."
1644
+ });
1645
+ const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
1646
+ if (cancelled || !mountedRef.current) return;
1647
+ const stt = new WhisperSTT(model);
1648
+ await stt.load({ onProgress: (p) => {
1649
+ if (!mountedRef.current) return;
1650
+ const progress = {
1651
+ status: p.progress !== void 0 ? "downloading" : "loading",
1652
+ message: p.status,
1653
+ progress: p.progress,
1654
+ file: p.file
1655
+ };
1656
+ setLoadingProgress(progress);
1657
+ onProgress?.(progress);
1658
+ } });
1659
+ if (cancelled || !mountedRef.current) {
1660
+ stt.dispose();
1661
+ return;
1662
+ }
1663
+ sttRef.current = stt;
1664
+ setIsReady(true);
1665
+ setIsLoading(false);
1666
+ setLoadingProgress({ status: "ready" });
1667
+ onProgress?.({ status: "ready" });
1668
+ onReady?.();
1669
+ } catch (e) {
1670
+ if (!mountedRef.current) return;
1671
+ const errMsg = e.message || "Failed to load STT model";
1672
+ setError(errMsg);
1673
+ setIsLoading(false);
1674
+ setLoadingProgress({
1675
+ status: "error",
1676
+ message: errMsg
1677
+ });
1678
+ onProgress?.({
1679
+ status: "error",
1680
+ message: errMsg
1681
+ });
1682
+ onError?.(errMsg);
1683
+ }
1684
+ };
1685
+ loadModel();
1686
+ return () => {
1687
+ cancelled = true;
1688
+ };
1689
+ }, [
1690
+ shouldLoad,
1691
+ isReady,
1692
+ model,
1693
+ onReady,
1694
+ onError,
1695
+ onProgress
1696
+ ]);
1697
+ useEffect(() => {
1698
+ mountedRef.current = true;
1699
+ return () => {
1700
+ mountedRef.current = false;
1701
+ if (sttRef.current) sttRef.current.dispose();
1702
+ if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
1703
+ };
1704
+ }, []);
1705
+ const load = useCallback(() => {
1706
+ if (!shouldLoad && !isReady && !isLoading) setShouldLoad(true);
1707
+ }, [
1708
+ shouldLoad,
1709
+ isReady,
1710
+ isLoading
1711
+ ]);
1712
+ const blobToFloat32 = useCallback(async (blob) => {
1713
+ const audioContext = new AudioContext({ sampleRate: 16e3 });
1714
+ const arrayBuffer = await blob.arrayBuffer();
1715
+ const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
1716
+ const channelData = audioBuffer.getChannelData(0);
1717
+ if (audioBuffer.sampleRate !== 16e3) {
1718
+ const ratio = 16e3 / audioBuffer.sampleRate;
1719
+ const newLength = Math.round(channelData.length * ratio);
1720
+ const resampled = new Float32Array(newLength);
1721
+ for (let i = 0; i < newLength; i++) {
1722
+ const srcIndex = i / ratio;
1723
+ const floor = Math.floor(srcIndex);
1724
+ const ceil = Math.min(floor + 1, channelData.length - 1);
1725
+ const t = srcIndex - floor;
1726
+ resampled[i] = channelData[floor] * (1 - t) + channelData[ceil] * t;
1727
+ }
1728
+ audioContext.close();
1729
+ return resampled;
1730
+ }
1731
+ audioContext.close();
1732
+ return new Float32Array(channelData);
1733
+ }, []);
1734
+ const transcribe = useCallback(async (audio) => {
1735
+ if (!sttRef.current) {
1736
+ if (!shouldLoad) {
1737
+ setShouldLoad(true);
1738
+ throw new Error("STT model not loaded. Loading now, please try again.");
1739
+ }
1740
+ throw new Error("STT model not loaded");
1741
+ }
1742
+ setIsTranscribing(true);
1743
+ try {
1744
+ let text = (await sttRef.current.transcribe(audio)).text.trim();
1745
+ if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
1746
+ setTranscript(text);
1747
+ onTranscript?.(text);
1748
+ return text;
1749
+ } finally {
1750
+ if (mountedRef.current) setIsTranscribing(false);
1751
+ }
1752
+ }, [shouldLoad, onTranscript]);
1753
+ const processedSamplesRef = useRef(0);
1754
+ const transcribeChunk = useCallback(async (chunkIdx) => {
1755
+ if (!sttRef.current || audioChunksRef.current.length === 0) return "";
1756
+ try {
1757
+ const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
1758
+ const newSamplesStart = processedSamplesRef.current;
1759
+ const totalSamples = audioData.length;
1760
+ if (totalSamples - newSamplesStart < 8e3) return "";
1761
+ const newAudio = audioData.slice(newSamplesStart);
1762
+ processedSamplesRef.current = totalSamples;
1763
+ let text = (await sttRef.current.transcribe(newAudio)).text.trim();
1764
+ if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
1765
+ if (text && mountedRef.current) {
1766
+ setStreamingChunk(text);
1767
+ onChunk?.(text, chunkIdx);
1768
+ }
1769
+ return text;
1770
+ } catch {
1771
+ return "";
1772
+ }
1773
+ }, [blobToFloat32, onChunk]);
1774
+ return {
1775
+ startRecording: useCallback(async () => {
1776
+ if (isRecording) return;
1777
+ try {
1778
+ if (streaming && !sttRef.current) {
1779
+ if (!shouldLoad) setShouldLoad(true);
1780
+ setIsLoading(true);
1781
+ const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
1782
+ const stt = new WhisperSTT(model);
1783
+ await stt.load({ onProgress: (p) => {
1784
+ if (mountedRef.current) {
1785
+ const progress = {
1786
+ status: p.status === "downloading" ? "downloading" : p.status === "ready" ? "ready" : "loading",
1787
+ message: p.status,
1788
+ progress: p.progress,
1789
+ file: p.file
1790
+ };
1791
+ setLoadingProgress(progress);
1792
+ onProgress?.(progress);
1793
+ }
1794
+ } });
1795
+ if (!mountedRef.current) {
1796
+ stt.dispose();
1797
+ return;
1798
+ }
1799
+ sttRef.current = stt;
1800
+ setIsReady(true);
1801
+ setIsLoading(false);
1802
+ setLoadingProgress({ status: "ready" });
1803
+ onProgress?.({ status: "ready" });
1804
+ onReady?.();
1805
+ }
1806
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: {
1807
+ sampleRate: 16e3,
1808
+ channelCount: 1,
1809
+ echoCancellation: true,
1810
+ noiseSuppression: true
1811
+ } });
1812
+ streamRef.current = stream;
1813
+ audioChunksRef.current = [];
1814
+ pendingChunksRef.current = [];
1815
+ fullTranscriptRef.current = "";
1816
+ processedSamplesRef.current = 0;
1817
+ setTranscript("");
1818
+ setStreamingChunk("");
1819
+ setChunkCount(0);
1820
+ const mediaRecorder = new MediaRecorder(stream);
1821
+ mediaRecorderRef.current = mediaRecorder;
1822
+ mediaRecorder.ondataavailable = (event) => {
1823
+ if (event.data.size > 0) {
1824
+ audioChunksRef.current.push(event.data);
1825
+ if (streaming) pendingChunksRef.current.push(event.data);
1826
+ }
1827
+ };
1828
+ mediaRecorder.start(100);
1829
+ setIsRecording(true);
1830
+ setError(null);
1831
+ if (streaming && sttRef.current) {
1832
+ let chunkIdx = 0;
1833
+ let shouldContinue = true;
1834
+ const processNextChunk = async () => {
1835
+ if (!shouldContinue || !mountedRef.current) return;
1836
+ if (pendingChunksRef.current.length > 0) {
1837
+ pendingChunksRef.current = [];
1838
+ try {
1839
+ setIsTranscribing(true);
1840
+ const chunkText = await transcribeChunk(chunkIdx);
1841
+ if (chunkText && mountedRef.current) {
1842
+ chunkIdx++;
1843
+ setChunkCount(chunkIdx);
1844
+ setTranscript((prev) => {
1845
+ const newTranscript = prev + (prev ? " " : "") + chunkText;
1846
+ fullTranscriptRef.current = newTranscript;
1847
+ onTranscript?.(newTranscript);
1848
+ return newTranscript;
1849
+ });
1850
+ }
1851
+ } catch (e) {
1852
+ console.error("[useVoiceInput] Chunk transcription error:", e);
1853
+ } finally {
1854
+ if (mountedRef.current) setIsTranscribing(false);
1855
+ }
1856
+ }
1857
+ if (shouldContinue && mountedRef.current) streamingIntervalRef.current = setTimeout(processNextChunk, chunkDuration);
1858
+ };
1859
+ streamingIntervalRef.current = setTimeout(processNextChunk, chunkDuration);
1860
+ streamingIntervalRef._stop = () => {
1861
+ shouldContinue = false;
1862
+ };
1863
+ }
1864
+ } catch (e) {
1865
+ const errMsg = e.message || "Failed to start recording";
1866
+ setError(errMsg);
1867
+ onError?.(errMsg);
1868
+ }
1869
+ }, [
1870
+ isRecording,
1871
+ streaming,
1872
+ shouldLoad,
1873
+ model,
1874
+ chunkDuration,
1875
+ transcribeChunk,
1876
+ onTranscript,
1877
+ onError,
1878
+ onProgress,
1879
+ onReady
1880
+ ]),
1881
+ stopRecording: useCallback(async () => {
1882
+ if (streamingIntervalRef._stop) streamingIntervalRef._stop();
1883
+ if (streamingIntervalRef.current) {
1884
+ clearTimeout(streamingIntervalRef.current);
1885
+ streamingIntervalRef.current = null;
1886
+ }
1887
+ return new Promise((resolve, reject) => {
1888
+ if (!mediaRecorderRef.current || !isRecording) {
1889
+ reject(/* @__PURE__ */ new Error("Not recording"));
1890
+ return;
1891
+ }
1892
+ const mediaRecorder = mediaRecorderRef.current;
1893
+ mediaRecorder.onstop = async () => {
1894
+ if (streamRef.current) {
1895
+ for (const track of streamRef.current.getTracks()) track.stop();
1896
+ streamRef.current = null;
1897
+ }
1898
+ setIsRecording(false);
1899
+ if (streaming) {
1900
+ if (audioChunksRef.current.length > 0 && processedSamplesRef.current > 0) {
1901
+ setIsTranscribing(true);
1902
+ pendingChunksRef.current = [];
1903
+ try {
1904
+ const finalChunkText = await transcribeChunk(chunkCount);
1905
+ if (finalChunkText && mountedRef.current) setTranscript((prev) => {
1906
+ const newTranscript = prev + (prev ? " " : "") + finalChunkText;
1907
+ fullTranscriptRef.current = newTranscript;
1908
+ return newTranscript;
1909
+ });
1910
+ } finally {
1911
+ if (mountedRef.current) setIsTranscribing(false);
1912
+ }
1913
+ }
1914
+ const finalText = fullTranscriptRef.current;
1915
+ onTranscript?.(finalText);
1916
+ resolve(finalText);
1917
+ return;
1918
+ }
1919
+ const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
1920
+ try {
1921
+ if (!sttRef.current) {
1922
+ if (!shouldLoad) setShouldLoad(true);
1923
+ await new Promise((res, rej) => {
1924
+ const checkReady = setInterval(() => {
1925
+ if (sttRef.current) {
1926
+ clearInterval(checkReady);
1927
+ res();
1928
+ }
1929
+ }, 100);
1930
+ setTimeout(() => {
1931
+ clearInterval(checkReady);
1932
+ rej(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
1933
+ }, 3e4);
1934
+ });
1935
+ }
1936
+ resolve(await transcribe(await blobToFloat32(audioBlob)));
1937
+ } catch (e) {
1938
+ const errMsg = e.message || "Transcription failed";
1939
+ setError(errMsg);
1940
+ onError?.(errMsg);
1941
+ reject(e);
1942
+ }
1943
+ };
1944
+ mediaRecorder.stop();
1945
+ });
1946
+ }, [
1947
+ isRecording,
1948
+ streaming,
1949
+ chunkCount,
1950
+ shouldLoad,
1951
+ blobToFloat32,
1952
+ transcribe,
1953
+ transcribeChunk,
1954
+ onTranscript,
1955
+ onError
1956
+ ]),
1957
+ cancelRecording: useCallback(() => {
1958
+ if (streamingIntervalRef._stop) streamingIntervalRef._stop();
1959
+ if (streamingIntervalRef.current) {
1960
+ clearTimeout(streamingIntervalRef.current);
1961
+ streamingIntervalRef.current = null;
1962
+ }
1963
+ if (mediaRecorderRef.current && isRecording) mediaRecorderRef.current.stop();
1964
+ if (streamRef.current) {
1965
+ for (const track of streamRef.current.getTracks()) track.stop();
1966
+ streamRef.current = null;
1967
+ }
1968
+ audioChunksRef.current = [];
1969
+ pendingChunksRef.current = [];
1970
+ processedSamplesRef.current = 0;
1971
+ setIsRecording(false);
1972
+ }, [isRecording]),
1973
+ transcribe,
1974
+ isRecording,
1975
+ isTranscribing,
1976
+ isLoading,
1977
+ isReady,
1978
+ transcript,
1979
+ streamingChunk,
1980
+ chunkCount,
1981
+ loadingProgress,
1982
+ error,
1983
+ load
1984
+ };
1985
+ }
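The `useVoiceInput` hook added above carries no usage example in its bundle comments, so here is a minimal push-to-talk sketch. It is illustrative only: the import specifier `@tryhamster/gerbil/browser` is assumed from the package layout, the option names (`streaming`, `chunkDuration`, `onChunk`) are taken from what the hook destructures internally, and `chunkDuration` is treated as milliseconds because it is passed straight to `setTimeout` in the chunk loop above. React must be available at runtime, as the hook resolves it from the global scope.

```tsx
// Hedged sketch: import path and default option values are assumptions,
// field names match the object returned by useVoiceInput above.
import { useVoiceInput } from "@tryhamster/gerbil/browser";

function Dictation() {
  const {
    startRecording,
    stopRecording,
    isRecording,
    isTranscribing,
    transcript,       // accumulated text
    streamingChunk,   // most recent ~chunkDuration worth of speech
    error,
  } = useVoiceInput({
    streaming: true,       // transcribe chunks while still recording
    chunkDuration: 2000,   // milliseconds between chunk transcriptions (assumed value)
    onChunk: (text, idx) => console.log(`chunk ${idx}:`, text),
  });

  return (
    <div>
      <button onMouseDown={startRecording} onMouseUp={() => stopRecording()}>
        {isRecording ? "Release to stop" : "Hold to dictate"}
      </button>
      {isTranscribing && <em>{streamingChunk || "…"}</em>}
      <p>{transcript}</p>
      {error && <p role="alert">{error}</p>}
    </div>
  );
}
```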
1986
+ /**
1987
+ * React hook for voice conversation with STT + LLM + TTS
1988
+ *
1989
+ * Complete voice-to-voice conversation loop:
1990
+ * 1. User presses a button to speak
1991
+ * 2. Speech is transcribed (Whisper)
1992
+ * 3. LLM generates response
1993
+ * 4. Response is spoken aloud (Kokoro or Supertonic TTS)
1994
+ *
1995
+ * @example
1996
+ * ```tsx
1997
+ * function VoiceChat() {
1998
+ * const {
1999
+ * messages,
2000
+ * startListening,
2001
+ * stopListening,
2002
+ * isListening,
2003
+ * isSpeaking,
2004
+ * stage,
2005
+ * } = useVoiceChat({
2006
+ * system: "You are a helpful voice assistant.",
2007
+ * voice: "af_bella",
2008
+ * // Or use Supertonic for faster synthesis:
2009
+ * // ttsModel: "supertonic-66m",
2010
+ * // voice: "F1",
2011
+ * });
2012
+ *
2013
+ * return (
2014
+ * <div>
2015
+ * {messages.map(m => (
2016
+ * <div key={m.id}>{m.role}: {m.content}</div>
2017
+ * ))}
2018
+ * <button
2019
+ * onMouseDown={startListening}
2020
+ * onMouseUp={stopListening}
2021
+ * >
2022
+ * {stage === "idle" ? "🎤 Hold to Speak" : stage}
2023
+ * </button>
2024
+ * </div>
2025
+ * );
2026
+ * }
2027
+ * ```
2028
+ */
2029
+ function useVoiceChat(options = {}) {
2030
+ const React = globalThis.React;
2031
+ if (!React) throw new Error("useVoiceChat requires React. Import React before using this hook.");
2032
+ const { useState, useEffect, useRef, useCallback } = React;
2033
+ const ttsModelId = options.ttsModel || "kokoro-82m";
2034
+ const ttsConfig = TTS_MODELS[ttsModelId];
2035
+ const { llmModel = "qwen3-0.6b", sttModel = "whisper-tiny.en", system = "You are a helpful voice assistant. Keep responses brief and conversational.", thinking = false, voice = ttsConfig.defaultVoice, speed = 1, autoLoad = false, onUserSpeak, onAssistantSpeak, onError } = options;
2036
+ const [messages, setMessages] = useState([]);
2037
+ const [stage, setStage] = useState("idle");
2038
+ const [isLoading, setIsLoading] = useState(autoLoad);
2039
+ const [loadingMessage, setLoadingMessage] = useState("");
2040
+ const [isReady, setIsReady] = useState(false);
2041
+ const [error, setError] = useState(null);
2042
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
2043
+ const llmWorkerRef = useRef(null);
2044
+ const sttRef = useRef(null);
2045
+ const ttsRef = useRef(null);
2046
+ const mediaRecorderRef = useRef(null);
2047
+ const audioChunksRef = useRef([]);
2048
+ const streamRef = useRef(null);
2049
+ const audioContextRef = useRef(null);
2050
+ const sourceNodeRef = useRef(null);
2051
+ const mountedRef = useRef(true);
2052
+ const cancelledRef = useRef(false);
2053
+ const isListening = stage === "listening";
2054
+ const isProcessing = stage === "transcribing" || stage === "thinking";
2055
+ const isSpeaking = stage === "speaking";
2056
+ useEffect(() => {
2057
+ if (!shouldLoad || isReady) return;
2058
+ let cancelled = false;
2059
+ const loadModels = async () => {
2060
+ try {
2061
+ setIsLoading(true);
2062
+ setError(null);
2063
+ setLoadingMessage("Loading speech recognition (Whisper)...");
2064
+ const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
2065
+ if (cancelled || !mountedRef.current) return;
2066
+ const stt = new WhisperSTT(sttModel);
2067
+ await stt.load({ onProgress: (p) => {
2068
+ if (!mountedRef.current) return;
2069
+ setLoadingMessage(p.status || "Loading STT...");
2070
+ } });
2071
+ if (cancelled || !mountedRef.current) {
2072
+ stt.dispose();
2073
+ return;
2074
+ }
2075
+ sttRef.current = stt;
2076
+ setLoadingMessage("Loading language model...");
2077
+ const worker = await createGerbilWorker({
2078
+ modelId: llmModel,
2079
+ onProgress: (p) => {
2080
+ if (!mountedRef.current) return;
2081
+ setLoadingMessage(p.message || "Loading LLM...");
2082
+ }
2083
+ });
2084
+ if (cancelled || !mountedRef.current) {
2085
+ worker.terminate();
2086
+ return;
2087
+ }
2088
+ llmWorkerRef.current = worker;
2089
+ setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
2090
+ const { createTTS } = await import("../tts-5yWeP_I0.mjs");
2091
+ if (cancelled || !mountedRef.current) return;
2092
+ const tts = createTTS(ttsModelId);
2093
+ await tts.load({ onProgress: (p) => {
2094
+ if (!mountedRef.current) return;
2095
+ setLoadingMessage(p.status || "Loading TTS...");
2096
+ } });
2097
+ if (cancelled || !mountedRef.current) {
2098
+ await tts.dispose();
2099
+ return;
2100
+ }
2101
+ ttsRef.current = tts;
2102
+ setIsReady(true);
2103
+ setIsLoading(false);
2104
+ setLoadingMessage("Ready!");
2105
+ } catch (e) {
2106
+ if (!mountedRef.current) return;
2107
+ const errMsg = e.message || "Failed to load models";
2108
+ setError(errMsg);
2109
+ setIsLoading(false);
2110
+ onError?.(errMsg);
2111
+ }
2112
+ };
2113
+ loadModels();
2114
+ return () => {
2115
+ cancelled = true;
2116
+ };
2117
+ }, [
2118
+ shouldLoad,
2119
+ isReady,
2120
+ llmModel,
2121
+ sttModel,
2122
+ ttsModelId,
2123
+ onError
2124
+ ]);
2125
+ useEffect(() => {
2126
+ mountedRef.current = true;
2127
+ return () => {
2128
+ mountedRef.current = false;
2129
+ llmWorkerRef.current?.terminate();
2130
+ sttRef.current?.dispose();
2131
+ ttsRef.current?.dispose();
2132
+ if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
2133
+ audioContextRef.current?.close();
2134
+ };
2135
+ }, []);
2136
+ const load = useCallback(() => {
2137
+ if (!shouldLoad && !isReady && !isLoading) setShouldLoad(true);
2138
+ }, [
2139
+ shouldLoad,
2140
+ isReady,
2141
+ isLoading
2142
+ ]);
2143
+ const blobToFloat32 = useCallback(async (blob) => {
2144
+ const audioContext = new AudioContext({ sampleRate: 16e3 });
2145
+ const arrayBuffer = await blob.arrayBuffer();
2146
+ const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
2147
+ const channelData = audioBuffer.getChannelData(0);
2148
+ if (audioBuffer.sampleRate !== 16e3) {
2149
+ const ratio = 16e3 / audioBuffer.sampleRate;
2150
+ const newLength = Math.round(channelData.length * ratio);
2151
+ const resampled = new Float32Array(newLength);
2152
+ for (let i = 0; i < newLength; i++) {
2153
+ const srcIndex = i / ratio;
2154
+ const floor = Math.floor(srcIndex);
2155
+ const ceil = Math.min(floor + 1, channelData.length - 1);
2156
+ const t = srcIndex - floor;
2157
+ resampled[i] = channelData[floor] * (1 - t) + channelData[ceil] * t;
2158
+ }
2159
+ audioContext.close();
2160
+ return resampled;
2161
+ }
2162
+ audioContext.close();
2163
+ return new Float32Array(channelData);
2164
+ }, []);
2165
+ const playAudioBuffer = useCallback(async (audio, sampleRate) => {
2166
+ return new Promise((resolve) => {
2167
+ if (!audioContextRef.current) audioContextRef.current = new AudioContext();
2168
+ const ctx = audioContextRef.current;
2169
+ const buffer = ctx.createBuffer(1, audio.length, sampleRate);
2170
+ const channelData = new Float32Array(audio);
2171
+ buffer.copyToChannel(channelData, 0);
2172
+ const source = ctx.createBufferSource();
2173
+ source.buffer = buffer;
2174
+ source.connect(ctx.destination);
2175
+ source.onended = () => {
2176
+ if (mountedRef.current) resolve();
2177
+ };
2178
+ source.start();
2179
+ sourceNodeRef.current = source;
2180
+ });
2181
+ }, []);
2182
+ return {
2183
+ messages,
2184
+ startListening: useCallback(async () => {
2185
+ if (stage !== "idle") return;
2186
+ if (!isReady && !isLoading) {
2187
+ setShouldLoad(true);
2188
+ return;
2189
+ }
2190
+ cancelledRef.current = false;
2191
+ try {
2192
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: {
2193
+ sampleRate: 16e3,
2194
+ channelCount: 1,
2195
+ echoCancellation: true
2196
+ } });
2197
+ streamRef.current = stream;
2198
+ audioChunksRef.current = [];
2199
+ const mediaRecorder = new MediaRecorder(stream);
2200
+ mediaRecorderRef.current = mediaRecorder;
2201
+ mediaRecorder.ondataavailable = (event) => {
2202
+ if (event.data.size > 0) audioChunksRef.current.push(event.data);
2203
+ };
2204
+ mediaRecorder.start(100);
2205
+ setStage("listening");
2206
+ setError(null);
2207
+ } catch (e) {
2208
+ const errMsg = e.message || "Failed to access microphone";
2209
+ setError(errMsg);
2210
+ onError?.(errMsg);
2211
+ }
2212
+ }, [
2213
+ stage,
2214
+ isReady,
2215
+ isLoading,
2216
+ onError
2217
+ ]),
2218
+ stopListening: useCallback(async () => {
2219
+ if (stage !== "listening") return;
2220
+ const mediaRecorder = mediaRecorderRef.current;
2221
+ if (!mediaRecorder) return;
2222
+ return new Promise((resolve) => {
2223
+ mediaRecorder.onstop = async () => {
2224
+ if (streamRef.current) {
2225
+ for (const track of streamRef.current.getTracks()) track.stop();
2226
+ streamRef.current = null;
2227
+ }
2228
+ if (cancelledRef.current) {
2229
+ setStage("idle");
2230
+ resolve();
2231
+ return;
2232
+ }
2233
+ const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
2234
+ try {
2235
+ setStage("transcribing");
2236
+ const audioData = await blobToFloat32(audioBlob);
2237
+ let userText = (await sttRef.current.transcribe(audioData)).text.trim();
2238
+ if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
2239
+ if (cancelledRef.current || !userText) {
2240
+ setStage("idle");
2241
+ resolve();
2242
+ return;
2243
+ }
2244
+ const userMsgId = `user-${Date.now()}`;
2245
+ setMessages((m) => [...m, {
2246
+ id: userMsgId,
2247
+ role: "user",
2248
+ content: userText
2249
+ }]);
2250
+ onUserSpeak?.(userText);
2251
+ setStage("thinking");
2252
+ const history = messages.map((m) => ({
2253
+ role: m.role,
2254
+ content: m.content
2255
+ }));
2256
+ history.push({
2257
+ role: "user",
2258
+ content: userText
2259
+ });
2260
+ let responseText = "";
2261
+ let thinkingText = "";
2262
+ await llmWorkerRef.current.generate(userText, {
2263
+ system,
2264
+ thinking,
2265
+ history,
2266
+ onToken: (token) => {
2267
+ if (cancelledRef.current) return;
2268
+ if (token.state === "thinking") thinkingText += token.text;
2269
+ else responseText += token.text;
2270
+ }
2271
+ });
2272
+ if (cancelledRef.current) {
2273
+ setStage("idle");
2274
+ resolve();
2275
+ return;
2276
+ }
2277
+ const assistantMsgId = `assistant-${Date.now()}`;
2278
+ setMessages((m) => [...m, {
2279
+ id: assistantMsgId,
2280
+ role: "assistant",
2281
+ content: responseText,
2282
+ thinking: thinkingText || void 0
2283
+ }]);
2284
+ onAssistantSpeak?.(responseText);
2285
+ if (responseText.trim()) {
2286
+ setStage("speaking");
2287
+ const ttsResult = await ttsRef.current.speak(responseText, {
2288
+ voice,
2289
+ speed
2290
+ });
2291
+ if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
2292
+ }
2293
+ setStage("idle");
2294
+ resolve();
2295
+ } catch (e) {
2296
+ if (!mountedRef.current) return;
2297
+ const errMsg = e.message || "Processing failed";
2298
+ setError(errMsg);
2299
+ setStage("idle");
2300
+ onError?.(errMsg);
2301
+ resolve();
2302
+ }
2303
+ };
2304
+ mediaRecorder.stop();
2305
+ });
2306
+ }, [
2307
+ stage,
2308
+ messages,
2309
+ system,
2310
+ thinking,
2311
+ voice,
2312
+ speed,
2313
+ blobToFloat32,
2314
+ playAudioBuffer,
2315
+ onUserSpeak,
2316
+ onAssistantSpeak,
2317
+ onError
2318
+ ]),
2319
+ cancel: useCallback(() => {
2320
+ cancelledRef.current = true;
2321
+ if (mediaRecorderRef.current && stage === "listening") mediaRecorderRef.current.stop();
2322
+ if (streamRef.current) {
2323
+ for (const track of streamRef.current.getTracks()) track.stop();
2324
+ streamRef.current = null;
2325
+ }
2326
+ if (sourceNodeRef.current) try {
2327
+ sourceNodeRef.current.stop();
2328
+ } catch {}
2329
+ audioChunksRef.current = [];
2330
+ setStage("idle");
2331
+ }, [stage]),
2332
+ clear: useCallback(() => {
2333
+ setMessages([]);
2334
+ }, []),
2335
+ isListening,
2336
+ isProcessing,
2337
+ isSpeaking,
2338
+ stage,
2339
+ isReady,
2340
+ isLoading,
2341
+ loadingMessage,
2342
+ error,
2343
+ load
2344
+ };
2345
+ }
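The JSDoc above already demonstrates basic `useVoiceChat` usage; the sketch below instead covers the lazy-load path, where the Whisper, LLM, and TTS models are only fetched after the user opts in via `load()`. The import specifier is again an assumption; the field names, stage values, and the `af_bella` voice all come from the hook body and doc comment above.

```tsx
// Hedged sketch of deferred model loading with useVoiceChat.
import { useVoiceChat } from "@tryhamster/gerbil/browser";

function VoiceAssistant() {
  const {
    load,            // flips the internal shouldLoad flag; the effect above then loads STT, LLM, TTS
    isLoading,
    loadingMessage,  // e.g. "Loading language model..."
    isReady,
    stage,           // "idle" | "listening" | "transcribing" | "thinking" | "speaking"
    startListening,
    stopListening,
    messages,
    error,
  } = useVoiceChat({ voice: "af_bella" });

  if (!isReady) {
    return (
      <button onClick={load} disabled={isLoading}>
        {isLoading ? loadingMessage : "Enable voice chat"}
      </button>
    );
  }

  return (
    <div>
      {messages.map((m) => (
        <p key={m.id}>{m.role}: {m.content}</p>
      ))}
      <button onMouseDown={startListening} onMouseUp={() => stopListening()}>
        {stage === "idle" ? "Hold to talk" : stage}
      </button>
      {error && <p role="alert">{error}</p>}
    </div>
  );
}
```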
722
2346
  /**
723
2347
  * Check if WebGPU is supported
724
2348
  */
@@ -747,9 +2371,11 @@ async function getWebGPUInfo() {
747
2371
  var browser_default = {
748
2372
  isWebGPUSupported,
749
2373
  getWebGPUInfo,
750
- createGerbilWorker
2374
+ createGerbilWorker,
2375
+ playAudio,
2376
+ createAudioPlayer
751
2377
  };
752
2378
 
753
2379
  //#endregion
754
- export { BUILTIN_MODELS, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, useChat, useCompletion };
2380
+ export { BUILTIN_MODELS, createAudioPlayer, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, playAudio, useChat, useCompletion, useSpeech, useVoiceChat, useVoiceInput };
755
2381
  //# sourceMappingURL=index.mjs.map
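For reference, the widened export list above means the browser entry now re-exports the voice hooks and the two audio helpers as named exports, and the default export additionally gains `playAudio` and `createAudioPlayer`. A hedged import sketch, with the entry-point specifier assumed and the helpers' signatures not shown in this diff:

```tsx
// New named exports in rc.1, per the export line above.
import browser, {
  playAudio,
  createAudioPlayer,
  useSpeech,
  useVoiceChat,
  useVoiceInput,
} from "@tryhamster/gerbil/browser";

// The default export carries the two audio helpers alongside the existing
// isWebGPUSupported / getWebGPUInfo / createGerbilWorker utilities.
console.log(browser.playAudio === playAudio);               // same binding
console.log(browser.createAudioPlayer === createAudioPlayer);
console.log(typeof useSpeech, typeof useVoiceChat, typeof useVoiceInput);
```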