@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -14
- package/dist/auto-update-DsWBBnEk.mjs +3 -0
- package/dist/browser/index.d.mts +401 -5
- package/dist/browser/index.d.mts.map +1 -1
- package/dist/browser/index.mjs +1772 -146
- package/dist/browser/index.mjs.map +1 -1
- package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-JEPeM2YE.mjs} +1 -1
- package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-Y9F7W5VQ.mjs} +514 -73
- package/dist/chrome-backend-Y9F7W5VQ.mjs.map +1 -0
- package/dist/cli.mjs +3359 -646
- package/dist/cli.mjs.map +1 -1
- package/dist/frameworks/express.d.mts +1 -1
- package/dist/frameworks/express.mjs +3 -3
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.mjs +3 -3
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.mjs +3 -3
- package/dist/frameworks/next.d.mts +2 -2
- package/dist/frameworks/next.mjs +3 -3
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.mjs +3 -3
- package/dist/gerbil-DeQlX_Mt.mjs +5 -0
- package/dist/gerbil-POAz8peb.d.mts +431 -0
- package/dist/gerbil-POAz8peb.d.mts.map +1 -0
- package/dist/gerbil-yoSpRHgv.mjs +1463 -0
- package/dist/gerbil-yoSpRHgv.mjs.map +1 -0
- package/dist/index.d.mts +395 -9
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +8 -6
- package/dist/index.mjs.map +1 -1
- package/dist/integrations/ai-sdk.d.mts +122 -4
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +239 -11
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +132 -2
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +176 -8
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.mjs +3 -3
- package/dist/integrations/mcp-client.mjs +4 -4
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +2 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +6 -6
- package/dist/{mcp-R8kRLIKb.mjs → mcp-Bitg4sjX.mjs} +10 -37
- package/dist/mcp-Bitg4sjX.mjs.map +1 -0
- package/dist/microphone-D-6y9aiE.mjs +3 -0
- package/dist/{models-DKULvhOr.mjs → models-BAtL8qsA.mjs} +42 -7
- package/dist/models-BAtL8qsA.mjs.map +1 -0
- package/dist/{models-De2-_GmQ.d.mts → models-CE0fBq0U.d.mts} +2 -2
- package/dist/models-CE0fBq0U.d.mts.map +1 -0
- package/dist/{one-liner-BUQR0nqq.mjs → one-liner-B1rmFto6.mjs} +2 -2
- package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-B1rmFto6.mjs.map} +1 -1
- package/dist/repl-D20JO260.mjs +10 -0
- package/dist/skills/index.d.mts +303 -12
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +6 -6
- package/dist/skills-5DxAV-rn.mjs +1435 -0
- package/dist/skills-5DxAV-rn.mjs.map +1 -0
- package/dist/stt-Bv_dum-R.mjs +433 -0
- package/dist/stt-Bv_dum-R.mjs.map +1 -0
- package/dist/stt-KzSoNvwI.mjs +3 -0
- package/dist/{tools-BsiEE6f2.mjs → tools-IYPrqoek.mjs} +6 -7
- package/dist/{tools-BsiEE6f2.mjs.map → tools-IYPrqoek.mjs.map} +1 -1
- package/dist/tts-5yWeP_I0.mjs +3 -0
- package/dist/tts-DG6denWG.mjs +729 -0
- package/dist/tts-DG6denWG.mjs.map +1 -0
- package/dist/types-s6Py2_DL.d.mts +353 -0
- package/dist/types-s6Py2_DL.d.mts.map +1 -0
- package/dist/{utils-7vXqtq2Q.mjs → utils-CkB4Roi6.mjs} +1 -1
- package/dist/{utils-7vXqtq2Q.mjs.map → utils-CkB4Roi6.mjs.map} +1 -1
- package/docs/ai-sdk.md +137 -21
- package/docs/browser.md +241 -2
- package/docs/memory.md +72 -0
- package/docs/stt.md +494 -0
- package/docs/tts.md +569 -0
- package/docs/vision.md +396 -0
- package/package.json +17 -18
- package/dist/auto-update-BbNHbSU1.mjs +0 -3
- package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
- package/dist/gerbil-BfnsFWRE.mjs +0 -644
- package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
- package/dist/gerbil-BjW-z7Fq.mjs +0 -5
- package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
- package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
- package/dist/mcp-R8kRLIKb.mjs.map +0 -1
- package/dist/models-DKULvhOr.mjs.map +0 -1
- package/dist/models-De2-_GmQ.d.mts.map +0 -1
- package/dist/skills-D3CEpgDc.mjs +0 -630
- package/dist/skills-D3CEpgDc.mjs.map +0 -1
- package/dist/types-BS1N92Jt.d.mts +0 -183
- package/dist/types-BS1N92Jt.d.mts.map +0 -1
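
The headline change in this release is the browser entry point: the worker protocol now carries an `images` array (vision models route through a new `generateVision` path), `useChat` gains image helpers (`attachImage`, `attachedImages`, `sendWithImages`, ...), and new `useSpeech` / `useVoiceInput` hooks add in-browser TTS and STT. A minimal sketch of how that surface fits together, using only names visible in the diff below (the image URL is a placeholder; exact defaults should be checked against the package docs):

```tsx
import { useChat, useSpeech } from "@tryhamster/gerbil/browser";

function Assistant() {
  // Image-aware chat: sendWithImages() routes to the vision path when images are attached
  const { messages, isReady, sendWithImages } = useChat();
  // In-browser TTS (Kokoro by default; pass { model: "supertonic-66m" } for the 44.1kHz backend)
  const { speak, isSpeaking } = useSpeech();

  const ask = () =>
    sendWithImages("What is in this picture?", ["https://example.com/cat.png"]);

  return (
    <div>
      <button onClick={ask} disabled={!isReady}>Ask about image</button>
      <button onClick={() => speak(messages.at(-1)?.content ?? "")} disabled={isSpeaking}>
        Read last reply
      </button>
    </div>
  );
}
```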
package/dist/browser/index.mjs
CHANGED

@@ -1,4 +1,4 @@
-import {
+import { o as resolveModel, t as BUILTIN_MODELS } from "../models-BAtL8qsA.mjs";

 //#region src/browser/index.ts
 /**
@@ -61,40 +61,84 @@ async function createGerbilWorker(options = {}) {
 import {
 AutoTokenizer,
 AutoModelForCausalLM,
+AutoProcessor,
+AutoModelForImageTextToText,
+RawImage,
 TextStreamer,
 InterruptableStoppingCriteria,
-
+env,
+} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Enable IndexedDB caching for browser (prevents re-downloading models)
+env.useBrowserCache = true;
+env.allowLocalModels = false;

 class ModelPipeline {
 static tokenizer = null;
 static model = null;
+static processor = null;
+static visionModel = null;
 static modelId = "";
+static isVision = false;

 static async getInstance(modelId, options = {}, progressCallback) {
 if (this.modelId !== modelId) {
 this.tokenizer = null;
 this.model = null;
+this.processor = null;
+this.visionModel = null;
 }
 this.modelId = modelId;
+
+// Detect vision models
+this.isVision = options.vision ||
+modelId.toLowerCase().includes("ministral") ||
+modelId.toLowerCase().includes("vision") ||
+modelId.toLowerCase().includes("vlm");

 const dtype = options.dtype || "q4f16";
 const device = options.device || "webgpu";

-if (
-
-
-
-
-
-
-
-
-
-
-
+if (this.isVision) {
+// Load vision model components
+// Note: Don't specify dtype for vision models - let transformers.js pick defaults
+if (!this.processor) {
+this.processor = await AutoProcessor.from_pretrained(modelId, {
+progress_callback: progressCallback,
+});
+}
+if (!this.visionModel) {
+this.visionModel = await AutoModelForImageTextToText.from_pretrained(modelId, {
+device,
+progress_callback: progressCallback,
+});
+}
+return {
+processor: this.processor,
+model: this.visionModel,
+tokenizer: this.processor.tokenizer,
+isVision: true
+};
+} else {
+// Load text-only model components
+if (!this.tokenizer) {
+this.tokenizer = await AutoTokenizer.from_pretrained(modelId, {
+progress_callback: progressCallback,
+});
+}
+if (!this.model) {
+this.model = await AutoModelForCausalLM.from_pretrained(modelId, {
+dtype,
+device,
+progress_callback: progressCallback,
+});
+}
+return {
+tokenizer: this.tokenizer,
+model: this.model,
+isVision: false
+};
 }
-
-return { tokenizer: this.tokenizer, model: this.model };
 }
 }

@@ -105,22 +149,19 @@ async function createGerbilWorker(options = {}) {
 const { modelId, options = {} } = data;
 self.postMessage({ status: "loading", message: "Loading model..." });

-// Track download state - if we see progress < 100, we're downloading
 const downloadState = {
-downloading: new Set(),
-completed: new Set(),
-isDownloading: false,
+downloading: new Set(),
+completed: new Set(),
+isDownloading: false,
 };

 try {
-const
+const result = await ModelPipeline.getInstance(
 modelId,
 options,
 (progress) => {
 if (progress.status === "progress" && progress.file) {
 const pct = Math.round(progress.progress || 0);
-
-// If we see progress < 100, this file is being downloaded (not from cache)
 if (pct < 100) {
 downloadState.downloading.add(progress.file);
 downloadState.isDownloading = true;
@@ -128,8 +169,6 @@ async function createGerbilWorker(options = {}) {
 downloadState.downloading.delete(progress.file);
 downloadState.completed.add(progress.file);
 }
-
-// Only emit downloading status if actually downloading
 if (downloadState.isDownloading) {
 self.postMessage({
 status: "downloading",
@@ -144,96 +183,229 @@ async function createGerbilWorker(options = {}) {
 );

 self.postMessage({ status: "loading", message: "Compiling shaders..." });
-
-
+
+// Warmup differs for vision vs text models
+if (result.isVision) {
+// Vision models need both text and vision warmup
+// Text warmup first
+const textWarmupInputs = result.tokenizer("hello");
+await result.model.generate({ ...textWarmupInputs, max_new_tokens: 1 });
+
+// Vision warmup with synthetic image
+self.postMessage({ status: "loading", message: "Warming up vision encoder..." });
+try {
+// Create a tiny 8x8 test image using OffscreenCanvas
+const canvas = new OffscreenCanvas(8, 8);
+const ctx = canvas.getContext("2d");
+ctx.fillStyle = "red";
+ctx.fillRect(0, 0, 8, 8);
+const blob = await canvas.convertToBlob({ type: "image/png" });
+const warmupImage = await RawImage.fromBlob(blob);
+
+// Process with vision pipeline
+const warmupContent = [{ type: "image" }, { type: "text", text: "hi" }];
+const warmupMessages = [{ role: "user", content: warmupContent }];
+const warmupPrompt = result.processor.apply_chat_template(warmupMessages, { add_generation_prompt: true });
+const warmupInputs = await result.processor(warmupImage, warmupPrompt, { add_special_tokens: false });
+
+// Run vision warmup generation
+await result.model.generate({
+...warmupInputs,
+max_new_tokens: 1,
+});
+} catch (warmupErr) {
+console.warn("Vision warmup failed (non-fatal):", warmupErr);
+}
+} else {
+const warmupInputs = result.tokenizer("a");
+await result.model.generate({ ...warmupInputs, max_new_tokens: 1 });
+}

-self.postMessage({ status: "ready" });
+self.postMessage({ status: "ready", isVision: result.isVision });
 } catch (error) {
 self.postMessage({ status: "error", error: error.message || String(error) });
 }
 }

 async function generate(data) {
-const { messages, options = {} } = data;
+const { messages, images = [], options = {} } = data;
 const { maxTokens = 256, temperature = 0.7, topP = 0.9, topK = 20, thinking = false } = options;

 try {
-const
-
-
-
-
-
-
-
-let state = "answering";
-const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] = tokenizer.encode(
-"<think></think>",
-{ add_special_tokens: false }
-);
-
-let startTime = null;
-let numTokens = 0;
-
-// Token callback for state tracking (receives raw token IDs)
-const tokenCallback = (tokens) => {
-startTime ??= performance.now();
-numTokens++;
-
-const tokenId = Number(tokens[0]);
-if (tokenId === START_THINKING_TOKEN_ID) {
-state = "thinking";
-} else if (tokenId === END_THINKING_TOKEN_ID) {
-state = "answering";
-}
-};
-
-// Text callback for streaming (receives decoded text)
-const streamCallback = (text) => {
-const tps = startTime ? (numTokens / (performance.now() - startTime)) * 1000 : 0;
-self.postMessage({ status: "token", text, state, numTokens, tps });
-};
-
-const streamer = new TextStreamer(tokenizer, {
-skip_prompt: true,
-skip_special_tokens: true,
-callback_function: streamCallback,
-token_callback_function: tokenCallback,
-});
-
-self.postMessage({ status: "start" });
-
-const { past_key_values, sequences } = await model.generate({
-...inputs,
-past_key_values: pastKeyValuesCache,
-do_sample: temperature > 0,
-temperature: temperature > 0 ? temperature : undefined,
-top_p: topP,
-top_k: topK,
-max_new_tokens: maxTokens,
-streamer,
-stopping_criteria: stoppingCriteria,
-return_dict_in_generate: true,
-});
-
-pastKeyValuesCache = past_key_values;
-
-const endTime = performance.now();
-const totalTime = startTime ? endTime - startTime : 0;
-const decoded = tokenizer.batch_decode(sequences, { skip_special_tokens: true });
-
-self.postMessage({
-status: "complete",
-text: decoded[0] || "",
-numTokens,
-totalTime,
-tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
-});
+const result = await ModelPipeline.getInstance(ModelPipeline.modelId, {});
+
+// Route to vision or text generation
+if (result.isVision && images.length > 0) {
+await generateVision(result, messages, images, options);
+} else {
+await generateText(result, messages, options);
+}
 } catch (error) {
 self.postMessage({ status: "error", error: error.message || String(error) });
 }
 }

+async function generateText(result, messages, options) {
+const { maxTokens = 256, temperature = 0.7, topP = 0.9, topK = 20, thinking = false } = options;
+const { tokenizer, model } = result;
+
+const inputs = tokenizer.apply_chat_template(messages, {
+add_generation_prompt: true,
+return_dict: true,
+enable_thinking: thinking,
+});
+
+let state = "answering";
+const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] = tokenizer.encode(
+"<think></think>",
+{ add_special_tokens: false }
+);
+
+let startTime = null;
+let numTokens = 0;
+
+const tokenCallback = (tokens) => {
+startTime ??= performance.now();
+numTokens += 1;
+const tokenId = Number(tokens[0]);
+if (tokenId === START_THINKING_TOKEN_ID) state = "thinking";
+else if (tokenId === END_THINKING_TOKEN_ID) state = "answering";
+};
+
+const streamCallback = (text) => {
+const tps = startTime ? (numTokens / (performance.now() - startTime)) * 1000 : 0;
+self.postMessage({ status: "token", text, state, numTokens, tps });
+};
+
+const streamer = new TextStreamer(tokenizer, {
+skip_prompt: true,
+skip_special_tokens: true,
+callback_function: streamCallback,
+token_callback_function: tokenCallback,
+});
+
+self.postMessage({ status: "start" });
+
+const { past_key_values, sequences } = await model.generate({
+...inputs,
+past_key_values: pastKeyValuesCache,
+do_sample: temperature > 0,
+temperature: temperature > 0 ? temperature : undefined,
+top_p: topP,
+top_k: topK,
+max_new_tokens: maxTokens,
+streamer,
+stopping_criteria: stoppingCriteria,
+return_dict_in_generate: true,
+});
+
+pastKeyValuesCache = past_key_values;
+
+const endTime = performance.now();
+const totalTime = startTime ? endTime - startTime : 0;
+const decoded = tokenizer.batch_decode(sequences, { skip_special_tokens: true });
+
+self.postMessage({
+status: "complete",
+text: decoded[0] || "",
+numTokens,
+totalTime,
+tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
+});
+}
+
+async function generateVision(result, messages, images, options) {
+const { maxTokens = 2048, temperature = 0.7, topP = 0.9, topK = 20 } = options;
+const { processor, model, tokenizer } = result;
+
+self.postMessage({ status: "progress", message: "Preparing vision request..." });
+
+// Build message content with image placeholders and text
+const lastMessage = messages[messages.length - 1];
+const content = [];
+for (const _ of images) {
+content.push({ type: "image" });
+}
+content.push({ type: "text", text: lastMessage.content });
+
+// For vision models, include a brief system instruction for concise responses
+// Note: Vision processors handle system differently than text models
+const visionMessages = [
+{ role: "system", content: "You are a helpful assistant. Be concise and direct in your responses." },
+{ role: "user", content }
+];
+
+// Apply chat template with generation prompt
+const chatPrompt = processor.apply_chat_template(visionMessages, {
+add_generation_prompt: true
+});
+
+// Load images (handle both string URLs and { source: string } objects)
+self.postMessage({ status: "progress", message: "Loading images..." });
+const loadedImages = await Promise.all(
+images.map(img => {
+const url = typeof img === "string" ? img : img.source;
+return RawImage.fromURL(url);
+})
+);
+self.postMessage({ status: "progress", message: "Processing inputs..." });
+
+// Process inputs
+const inputs = await processor(
+loadedImages.length === 1 ? loadedImages[0] : loadedImages,
+chatPrompt,
+{ add_special_tokens: false }
+);
+self.postMessage({ status: "progress", message: "Generating response..." });
+
+let startTime = null;
+let numTokens = 0;
+
+const streamCallback = (text) => {
+startTime ??= performance.now();
+numTokens += 1;
+const tps = (numTokens / (performance.now() - startTime)) * 1000;
+self.postMessage({ status: "token", text, state: "answering", numTokens, tps });
+};
+
+const streamer = new TextStreamer(tokenizer, {
+skip_prompt: true,
+skip_special_tokens: true,
+callback_function: streamCallback,
+});
+
+self.postMessage({ status: "start" });
+
+const outputs = await model.generate({
+...inputs,
+max_new_tokens: maxTokens,
+do_sample: temperature > 0,
+temperature: temperature > 0 ? temperature : undefined,
+top_p: topP,
+top_k: topK,
+streamer,
+stopping_criteria: stoppingCriteria,
+});
+
+// Decode output (skip prompt)
+const inputLength = inputs.input_ids.dims?.at(-1) || 0;
+const decoded = processor.batch_decode(
+outputs.slice(null, [inputLength, null]),
+{ skip_special_tokens: true }
+);
+
+const endTime = performance.now();
+const totalTime = startTime ? endTime - startTime : 0;
+
+self.postMessage({
+status: "complete",
+text: decoded[0] || "",
+numTokens,
+totalTime,
+tps: totalTime > 0 ? (numTokens / totalTime) * 1000 : 0,
+});
+}
+
 self.addEventListener("message", async (e) => {
 const { type, ...data } = e.data;
 switch (type) {
@@ -303,30 +475,34 @@ async function createGerbilWorker(options = {}) {
 reject(new Error(error));
 };
 const gerbilWorker = {
-generate: (prompt, options$1 = {}) => {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+generate: (prompt, options$1 = {}) => new Promise((res, rej) => {
+currentResolve = res;
+currentReject = rej;
+const system = options$1.system || "You are a helpful assistant.";
+const messages = options$1.history ? [{
+role: "system",
+content: system
+}, ...options$1.history] : [{
+role: "system",
+content: system
+}, {
+role: "user",
+content: prompt
+}];
+if (options$1.history) worker.postMessage({ type: "reset" });
+worker.postMessage({
+type: "generate",
+messages,
+images: options$1.images || [],
+options: {
+maxTokens: options$1.maxTokens ?? (options$1.images?.length ? 2048 : 256),
+temperature: options$1.temperature ?? .7,
+topP: options$1.topP ?? .9,
+topK: options$1.topK ?? 20,
+thinking: options$1.thinking ?? false
+}
 });
-},
+}),
 interrupt: () => {
 worker.postMessage({ type: "interrupt" });
 },
@@ -383,6 +559,7 @@ function useChat(options = {}) {
 const [error, setError] = useState(null);
 const [isReady, setIsReady] = useState(false);
 const [shouldLoad, setShouldLoad] = useState(autoLoad);
+const [attachedImages, setAttachedImages] = useState([]);
 const workerRef = useRef(null);
 const messageIdRef = useRef(0);
 const mountedRef = useRef(true);
@@ -455,23 +632,34 @@ function useChat(options = {}) {
 setCurrentResponse("");
 setThinking("");
 }
-return () => {};
 }, [
 isGenerating,
 currentResponse,
 thinking
 ]);
 const pendingMessageRef = useRef(null);
-const
-
-
+const pendingImagesRef = useRef([]);
+const attachImage = useCallback((imageUrl) => {
+setAttachedImages((imgs) => [...imgs, imageUrl]);
+}, []);
+const removeImage = useCallback((index) => {
+setAttachedImages((imgs) => imgs.filter((_, i) => i !== index));
+}, []);
+const clearImages = useCallback(() => {
+setAttachedImages([]);
+}, []);
+const sendMessageWithImages = useCallback((text, images) => {
+if (!text.trim() || isGenerating) return;
+messageIdRef.current += 1;
 const userMessage = {
-id: `msg-${
+id: `msg-${messageIdRef.current}`,
 role: "user",
-content:
+content: text.trim(),
+images: images.length > 0 ? images : void 0
 };
+messageIdRef.current += 1;
 const assistantMessage = {
-id: `msg-${
+id: `msg-${messageIdRef.current}`,
 role: "assistant",
 content: ""
 };
@@ -480,23 +668,23 @@ function useChat(options = {}) {
 userMessage,
 assistantMessage
 ]);
-setInput("");
 setCurrentResponse("");
 setThinking("");
 if (!workerRef.current) {
-pendingMessageRef.current =
+pendingMessageRef.current = text.trim();
+pendingImagesRef.current = images;
 load();
 return;
 }
 setIsGenerating(true);
-workerRef.current.generate(
+workerRef.current.generate(text.trim(), {
 system,
 thinking: enableThinking,
-maxTokens,
-temperature
+maxTokens: images.length > 0 ? Math.max(maxTokens, 2048) : maxTokens,
+temperature,
+images: images.length > 0 ? images : void 0
 });
 }, [
-input,
 isGenerating,
 system,
 enableThinking,
@@ -504,19 +692,36 @@ function useChat(options = {}) {
 temperature,
 load
 ]);
+const handleSubmit = useCallback((e) => {
+e?.preventDefault?.();
+if (!input.trim() || isGenerating) return;
+sendMessageWithImages(input, attachedImages);
+setInput("");
+setAttachedImages([]);
+}, [
+input,
+isGenerating,
+attachedImages,
+sendMessageWithImages
+]);
+const sendWithImages = useCallback((text, images) => {
+sendMessageWithImages(text, images);
+}, [sendMessageWithImages]);
 useEffect(() => {
 if (isReady && pendingMessageRef.current && workerRef.current) {
 const pendingContent = pendingMessageRef.current;
+const pendingImages = pendingImagesRef.current;
 pendingMessageRef.current = null;
+pendingImagesRef.current = [];
 setIsGenerating(true);
 workerRef.current.generate(pendingContent, {
 system,
 thinking: enableThinking,
-maxTokens,
-temperature
+maxTokens: pendingImages.length > 0 ? Math.max(maxTokens, 2048) : maxTokens,
+temperature,
+images: pendingImages.length > 0 ? pendingImages : void 0
 });
 }
-return () => {};
 }, [
 isReady,
 system,
@@ -533,6 +738,7 @@ function useChat(options = {}) {
 setMessages([]);
 setCurrentResponse("");
 setThinking("");
+setAttachedImages([]);
 }, []);
 return {
 messages: messages.map((m, i) => {
@@ -555,7 +761,12 @@ function useChat(options = {}) {
 tps,
 isReady,
 error,
-load
+load,
+attachedImages,
+attachImage,
+removeImage,
+clearImages,
+sendWithImages
 };
 }
 /**
@@ -597,6 +808,7 @@ function useCompletion(options = {}) {
 const resolveRef = useRef(null);
 const rejectRef = useRef(null);
 const pendingPromptRef = useRef(null);
+const pendingImagesRef = useRef(void 0);
 const mountedRef = useRef(true);
 const load = useCallback(() => {
 if (workerRef.current || isLoading) return;
@@ -656,7 +868,7 @@ function useCompletion(options = {}) {
 workerRef.current?.terminate();
 };
 }, [model, shouldLoad]);
-const complete = useCallback((prompt) => {
+const complete = useCallback((prompt, completeOptions) => {
 return new Promise((resolve, reject) => {
 setCompletion("");
 setThinking("");
@@ -664,6 +876,7 @@ function useCompletion(options = {}) {
 rejectRef.current = reject;
 if (!workerRef.current) {
 pendingPromptRef.current = prompt;
+pendingImagesRef.current = completeOptions?.images;
 load();
 return;
 }
@@ -672,7 +885,8 @@ function useCompletion(options = {}) {
 system,
 thinking: enableThinking,
 maxTokens,
-temperature
+temperature,
+images: completeOptions?.images
 });
 });
 }, [
@@ -685,16 +899,18 @@ function useCompletion(options = {}) {
 useEffect(() => {
 if (isReady && pendingPromptRef.current && workerRef.current) {
 const pendingPrompt = pendingPromptRef.current;
+const pendingImages = pendingImagesRef.current;
 pendingPromptRef.current = null;
+pendingImagesRef.current = void 0;
 setIsGenerating(true);
 workerRef.current.generate(pendingPrompt, {
 system,
 thinking: enableThinking,
 maxTokens,
-temperature
+temperature,
+images: pendingImages
 });
 }
-return () => {};
 }, [
 isReady,
 system,
@@ -719,6 +935,1414 @@ function useCompletion(options = {}) {
|
|
|
719
935
|
load
|
|
720
936
|
};
|
|
721
937
|
}
|
|
938
|
+
/** Kokoro voice definitions (24kHz, high quality) */
|
|
939
|
+
const KOKORO_BROWSER_VOICES = [
|
|
940
|
+
{
|
|
941
|
+
id: "af_heart",
|
|
942
|
+
name: "Heart",
|
|
943
|
+
gender: "female",
|
|
944
|
+
language: "en-us",
|
|
945
|
+
description: "American female, highest quality (Grade A)"
|
|
946
|
+
},
|
|
947
|
+
{
|
|
948
|
+
id: "af_bella",
|
|
949
|
+
name: "Bella",
|
|
950
|
+
gender: "female",
|
|
951
|
+
language: "en-us",
|
|
952
|
+
description: "American female, warm and friendly (Grade A-)"
|
|
953
|
+
},
|
|
954
|
+
{
|
|
955
|
+
id: "af_nicole",
|
|
956
|
+
name: "Nicole",
|
|
957
|
+
gender: "female",
|
|
958
|
+
language: "en-us",
|
|
959
|
+
description: "American female, soft and gentle"
|
|
960
|
+
},
|
|
961
|
+
{
|
|
962
|
+
id: "af_sarah",
|
|
963
|
+
name: "Sarah",
|
|
964
|
+
gender: "female",
|
|
965
|
+
language: "en-us",
|
|
966
|
+
description: "American female, clear and professional"
|
|
967
|
+
},
|
|
968
|
+
{
|
|
969
|
+
id: "af_sky",
|
|
970
|
+
name: "Sky",
|
|
971
|
+
gender: "female",
|
|
972
|
+
language: "en-us",
|
|
973
|
+
description: "American female, young and energetic"
|
|
974
|
+
},
|
|
975
|
+
{
|
|
976
|
+
id: "af_alloy",
|
|
977
|
+
name: "Alloy",
|
|
978
|
+
gender: "female",
|
|
979
|
+
language: "en-us",
|
|
980
|
+
description: "American female"
|
|
981
|
+
},
|
|
982
|
+
{
|
|
983
|
+
id: "af_aoede",
|
|
984
|
+
name: "Aoede",
|
|
985
|
+
gender: "female",
|
|
986
|
+
language: "en-us",
|
|
987
|
+
description: "American female, mythical"
|
|
988
|
+
},
|
|
989
|
+
{
|
|
990
|
+
id: "af_jessica",
|
|
991
|
+
name: "Jessica",
|
|
992
|
+
gender: "female",
|
|
993
|
+
language: "en-us",
|
|
994
|
+
description: "American female"
|
|
995
|
+
},
|
|
996
|
+
{
|
|
997
|
+
id: "af_kore",
|
|
998
|
+
name: "Kore",
|
|
999
|
+
gender: "female",
|
|
1000
|
+
language: "en-us",
|
|
1001
|
+
description: "American female"
|
|
1002
|
+
},
|
|
1003
|
+
{
|
|
1004
|
+
id: "af_nova",
|
|
1005
|
+
name: "Nova",
|
|
1006
|
+
gender: "female",
|
|
1007
|
+
language: "en-us",
|
|
1008
|
+
description: "American female"
|
|
1009
|
+
},
|
|
1010
|
+
{
|
|
1011
|
+
id: "af_river",
|
|
1012
|
+
name: "River",
|
|
1013
|
+
gender: "female",
|
|
1014
|
+
language: "en-us",
|
|
1015
|
+
description: "American female"
|
|
1016
|
+
},
|
|
1017
|
+
{
|
|
1018
|
+
id: "am_fenrir",
|
|
1019
|
+
name: "Fenrir",
|
|
1020
|
+
gender: "male",
|
|
1021
|
+
language: "en-us",
|
|
1022
|
+
description: "American male, best quality"
|
|
1023
|
+
},
|
|
1024
|
+
{
|
|
1025
|
+
id: "am_michael",
|
|
1026
|
+
name: "Michael",
|
|
1027
|
+
gender: "male",
|
|
1028
|
+
language: "en-us",
|
|
1029
|
+
description: "American male, warm and friendly"
|
|
1030
|
+
},
|
|
1031
|
+
{
|
|
1032
|
+
id: "am_adam",
|
|
1033
|
+
name: "Adam",
|
|
1034
|
+
gender: "male",
|
|
1035
|
+
language: "en-us",
|
|
1036
|
+
description: "American male"
|
|
1037
|
+
},
|
|
1038
|
+
{
|
|
1039
|
+
id: "am_echo",
|
|
1040
|
+
name: "Echo",
|
|
1041
|
+
gender: "male",
|
|
1042
|
+
language: "en-us",
|
|
1043
|
+
description: "American male"
|
|
1044
|
+
},
|
|
1045
|
+
{
|
|
1046
|
+
id: "am_eric",
|
|
1047
|
+
name: "Eric",
|
|
1048
|
+
gender: "male",
|
|
1049
|
+
language: "en-us",
|
|
1050
|
+
description: "American male"
|
|
1051
|
+
},
|
|
1052
|
+
{
|
|
1053
|
+
id: "am_liam",
|
|
1054
|
+
name: "Liam",
|
|
1055
|
+
gender: "male",
|
|
1056
|
+
language: "en-us",
|
|
1057
|
+
description: "American male"
|
|
1058
|
+
},
|
|
1059
|
+
{
|
|
1060
|
+
id: "am_onyx",
|
|
1061
|
+
name: "Onyx",
|
|
1062
|
+
gender: "male",
|
|
1063
|
+
language: "en-us",
|
|
1064
|
+
description: "American male"
|
|
1065
|
+
},
|
|
1066
|
+
{
|
|
1067
|
+
id: "am_puck",
|
|
1068
|
+
name: "Puck",
|
|
1069
|
+
gender: "male",
|
|
1070
|
+
language: "en-us",
|
|
1071
|
+
description: "American male"
|
|
1072
|
+
},
|
|
1073
|
+
{
|
|
1074
|
+
id: "am_santa",
|
|
1075
|
+
name: "Santa",
|
|
1076
|
+
gender: "male",
|
|
1077
|
+
language: "en-us",
|
|
1078
|
+
description: "American male, festive"
|
|
1079
|
+
},
|
|
1080
|
+
{
|
|
1081
|
+
id: "bf_emma",
|
|
1082
|
+
name: "Emma",
|
|
1083
|
+
gender: "female",
|
|
1084
|
+
language: "en-gb",
|
|
1085
|
+
description: "British female, elegant and clear"
|
|
1086
|
+
},
|
|
1087
|
+
{
|
|
1088
|
+
id: "bf_isabella",
|
|
1089
|
+
name: "Isabella",
|
|
1090
|
+
gender: "female",
|
|
1091
|
+
language: "en-gb",
|
|
1092
|
+
description: "British female, sophisticated"
|
|
1093
|
+
},
|
|
1094
|
+
{
|
|
1095
|
+
id: "bf_alice",
|
|
1096
|
+
name: "Alice",
|
|
1097
|
+
gender: "female",
|
|
1098
|
+
language: "en-gb",
|
|
1099
|
+
description: "British female"
|
|
1100
|
+
},
|
|
1101
|
+
{
|
|
1102
|
+
id: "bf_lily",
|
|
1103
|
+
name: "Lily",
|
|
1104
|
+
gender: "female",
|
|
1105
|
+
language: "en-gb",
|
|
1106
|
+
description: "British female"
|
|
1107
|
+
},
|
|
1108
|
+
{
|
|
1109
|
+
id: "bm_george",
|
|
1110
|
+
name: "George",
|
|
1111
|
+
gender: "male",
|
|
1112
|
+
language: "en-gb",
|
|
1113
|
+
description: "British male, distinguished"
|
|
1114
|
+
},
|
|
1115
|
+
{
|
|
1116
|
+
id: "bm_lewis",
|
|
1117
|
+
name: "Lewis",
|
|
1118
|
+
gender: "male",
|
|
1119
|
+
language: "en-gb",
|
|
1120
|
+
description: "British male, friendly"
|
|
1121
|
+
},
|
|
1122
|
+
{
|
|
1123
|
+
id: "bm_daniel",
|
|
1124
|
+
name: "Daniel",
|
|
1125
|
+
gender: "male",
|
|
1126
|
+
language: "en-gb",
|
|
1127
|
+
description: "British male"
|
|
1128
|
+
},
|
|
1129
|
+
{
|
|
1130
|
+
id: "bm_fable",
|
|
1131
|
+
name: "Fable",
|
|
1132
|
+
gender: "male",
|
|
1133
|
+
language: "en-gb",
|
|
1134
|
+
description: "British male"
|
|
1135
|
+
}
|
|
1136
|
+
];
|
|
1137
|
+
/** Supertonic voice definitions (44.1kHz, faster) */
|
|
1138
|
+
const SUPERTONIC_BROWSER_VOICES = [
|
|
1139
|
+
{
|
|
1140
|
+
id: "F1",
|
|
1141
|
+
name: "Female 1",
|
|
1142
|
+
gender: "female",
|
|
1143
|
+
language: "en",
|
|
1144
|
+
description: "Female voice 1 - Clear and natural"
|
|
1145
|
+
},
|
|
1146
|
+
{
|
|
1147
|
+
id: "F2",
|
|
1148
|
+
name: "Female 2",
|
|
1149
|
+
gender: "female",
|
|
1150
|
+
language: "en",
|
|
1151
|
+
description: "Female voice 2 - Warm and expressive"
|
|
1152
|
+
},
|
|
1153
|
+
{
|
|
1154
|
+
id: "M1",
|
|
1155
|
+
name: "Male 1",
|
|
1156
|
+
gender: "male",
|
|
1157
|
+
language: "en",
|
|
1158
|
+
description: "Male voice 1 - Deep and confident"
|
|
1159
|
+
},
|
|
1160
|
+
{
|
|
1161
|
+
id: "M2",
|
|
1162
|
+
name: "Male 2",
|
|
1163
|
+
gender: "male",
|
|
1164
|
+
language: "en",
|
|
1165
|
+
description: "Male voice 2 - Friendly and casual"
|
|
1166
|
+
}
|
|
1167
|
+
];
|
|
1168
|
+
/** TTS model configuration */
|
|
1169
|
+
const TTS_MODELS = {
|
|
1170
|
+
"kokoro-82m": {
|
|
1171
|
+
repo: "onnx-community/Kokoro-82M-v1.0-ONNX",
|
|
1172
|
+
defaultVoice: "af_heart",
|
|
1173
|
+
sampleRate: 24e3,
|
|
1174
|
+
voices: KOKORO_BROWSER_VOICES
|
|
1175
|
+
},
|
|
1176
|
+
"supertonic-66m": {
|
|
1177
|
+
repo: "onnx-community/Supertonic-TTS-ONNX",
|
|
1178
|
+
defaultVoice: "F1",
|
|
1179
|
+
sampleRate: 44100,
|
|
1180
|
+
voices: SUPERTONIC_BROWSER_VOICES
|
|
1181
|
+
}
|
|
1182
|
+
};
|
|
1183
|
+
/**
|
|
1184
|
+
* React hook for text-to-speech with Web Audio API playback
|
|
1185
|
+
*
|
|
1186
|
+
* Supports both Kokoro (24kHz, high quality) and Supertonic (44.1kHz, faster).
|
|
1187
|
+
*
|
|
1188
|
+
* @example
|
|
1189
|
+
* ```tsx
|
|
1190
|
+
* import { useSpeech } from "@tryhamster/gerbil/browser";
|
|
1191
|
+
*
|
|
1192
|
+
* function App() {
|
|
1193
|
+
* // Default: Kokoro TTS
|
|
1194
|
+
* const { speak, stop, isLoading, isSpeaking, listVoices, setVoice } = useSpeech();
|
|
1195
|
+
*
|
|
1196
|
+
* // Or use Supertonic (44.1kHz, faster)
|
|
1197
|
+
* // const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
|
|
1198
|
+
*
|
|
1199
|
+
* if (isLoading) return <div>Loading TTS...</div>;
|
|
1200
|
+
*
|
|
1201
|
+
* return (
|
|
1202
|
+
* <div>
|
|
1203
|
+
* <select onChange={e => setVoice(e.target.value)}>
|
|
1204
|
+
* {listVoices().map(v => (
|
|
1205
|
+
* <option key={v.id} value={v.id}>{v.name}</option>
|
|
1206
|
+
* ))}
|
|
1207
|
+
* </select>
|
|
1208
|
+
* <button onClick={() => speak("Hello world!")}>
|
|
1209
|
+
* {isSpeaking ? "Speaking..." : "Speak"}
|
|
1210
|
+
* </button>
|
|
1211
|
+
* {isSpeaking && <button onClick={stop}>Stop</button>}
|
|
1212
|
+
* </div>
|
|
1213
|
+
* );
|
|
1214
|
+
* }
|
|
1215
|
+
* ```
|
|
1216
|
+
*/
|
|
1217
|
+
function useSpeech(options = {}) {
|
|
1218
|
+
const React = globalThis.React;
|
|
1219
|
+
if (!React) throw new Error("useSpeech requires React. Import React before using this hook.");
|
|
1220
|
+
const { useState, useEffect, useRef, useCallback } = React;
|
|
1221
|
+
const { model: modelId = "kokoro-82m", speed: defaultSpeed = 1, autoLoad = false, onReady, onError, onStart, onEnd } = options;
|
|
1222
|
+
const modelConfig = TTS_MODELS[modelId];
|
|
1223
|
+
const defaultVoice = options.voice || modelConfig.defaultVoice;
|
|
1224
|
+
const [isLoading, setIsLoading] = useState(autoLoad);
|
|
1225
|
+
const [loadingProgress, setLoadingProgress] = useState(null);
|
|
1226
|
+
const [isSpeaking, setIsSpeaking] = useState(false);
|
|
1227
|
+
const [isReady, setIsReady] = useState(false);
|
|
1228
|
+
const [error, setError] = useState(null);
|
|
1229
|
+
const [shouldLoad, setShouldLoad] = useState(autoLoad);
|
|
1230
|
+
const [currentVoice, setCurrentVoice] = useState(defaultVoice);
|
|
1231
|
+
const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
|
|
1232
|
+
const ttsRef = useRef(null);
|
|
1233
|
+
const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
|
|
1234
|
+
const audioContextRef = useRef(null);
|
|
1235
|
+
const sourceNodeRef = useRef(null);
|
|
1236
|
+
const mountedRef = useRef(true);
|
|
1237
|
+
const modelIdRef = useRef(modelId);
|
|
1238
|
+
const listVoices = useCallback(() => {
|
|
1239
|
+
return modelConfig.voices;
|
|
1240
|
+
}, [modelConfig.voices]);
|
|
1241
|
+
const load = useCallback(() => {
|
|
1242
|
+
if (ttsRef.current || isLoading) return;
|
|
1243
|
+
setIsLoading(true);
|
|
1244
|
+
setShouldLoad(true);
|
|
1245
|
+
}, [isLoading]);
|
|
1246
|
+
useEffect(() => {
|
|
1247
|
+
if (!shouldLoad) return;
|
|
1248
|
+
mountedRef.current = true;
|
|
1249
|
+
modelIdRef.current = modelId;
|
|
1250
|
+
const initTTS = async () => {
|
|
1251
|
+
try {
|
|
1252
|
+
const isSupertonic = modelId === "supertonic-66m";
|
|
1253
|
+
const config = TTS_MODELS[modelId];
|
|
1254
|
+
setLoadingProgress({
|
|
1255
|
+
status: "loading",
|
|
1256
|
+
message: `Loading ${isSupertonic ? "Supertonic" : "Kokoro"} TTS...`
|
|
1257
|
+
});
|
|
1258
|
+
if (isSupertonic) {
|
|
1259
|
+
const { pipeline } = await import("https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1");
|
|
1260
|
+
const tts = await pipeline("text-to-speech", config.repo, {
|
|
1261
|
+
device: "webgpu",
|
|
1262
|
+
progress_callback: (progress) => {
|
|
1263
|
+
if (!mountedRef.current) return;
|
|
1264
|
+
if (progress.status === "progress" && progress.file) setLoadingProgress({
|
|
1265
|
+
status: "downloading",
|
|
1266
|
+
file: progress.file,
|
|
1267
|
+
progress: Math.round(progress.progress || 0)
|
|
1268
|
+
});
|
|
1269
|
+
}
|
|
1270
|
+
});
|
|
1271
|
+
if (!mountedRef.current) return;
|
|
1272
|
+
const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
|
|
1273
|
+
const embeddingsMap = /* @__PURE__ */ new Map();
|
|
1274
|
+
await Promise.all(config.voices.map(async (voice) => {
|
|
1275
|
+
try {
|
|
1276
|
+
const response = await fetch(`${voicesUrl}${voice.id}.bin`);
|
|
1277
|
+
if (response.ok) {
|
|
1278
|
+
const buffer = await response.arrayBuffer();
|
|
1279
|
+
embeddingsMap.set(voice.id, new Float32Array(buffer));
|
|
1280
|
+
}
|
|
1281
|
+
} catch (e) {
|
|
1282
|
+
console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
|
|
1283
|
+
}
|
|
1284
|
+
}));
|
|
1285
|
+
if (!mountedRef.current) return;
|
|
1286
|
+
try {
|
|
1287
|
+
await tts("Hello", {
|
|
1288
|
+
speaker_embeddings: new Float32Array(12928),
|
|
1289
|
+
num_inference_steps: 1,
|
|
1290
|
+
speed: 1
|
|
1291
|
+
});
|
|
1292
|
+
} catch (e) {
|
|
1293
|
+
console.warn("Supertonic warmup failed:", e);
|
|
1294
|
+
}
|
|
1295
|
+
voiceEmbeddingsRef.current = embeddingsMap;
|
|
1296
|
+
ttsRef.current = {
|
|
1297
|
+
type: "supertonic",
|
|
1298
|
+
pipeline: tts,
|
|
1299
|
+
config
|
|
1300
|
+
};
|
|
1301
|
+
} else {
|
|
1302
|
+
const { KokoroTTS } = await import("kokoro-js");
|
|
1303
|
+
const tts = await KokoroTTS.from_pretrained(config.repo, {
|
|
1304
|
+
dtype: "fp32",
|
|
1305
|
+
progress_callback: (progress) => {
|
|
1306
|
+
if (!mountedRef.current) return;
|
|
1307
|
+
if (progress.status === "progress" && progress.file) setLoadingProgress({
|
|
1308
|
+
status: "downloading",
|
|
1309
|
+
file: progress.file,
|
|
1310
|
+
progress: Math.round(progress.progress || 0)
|
|
1311
|
+
});
|
|
1312
|
+
}
|
|
1313
|
+
});
|
|
1314
|
+
if (!mountedRef.current) return;
|
|
1315
|
+
ttsRef.current = {
|
|
1316
|
+
type: "kokoro",
|
|
1317
|
+
instance: tts,
|
|
1318
|
+
config
|
|
1319
|
+
};
|
|
1320
|
+
}
|
|
1321
|
+
setIsLoading(false);
|
|
1322
|
+
setIsReady(true);
|
|
1323
|
+
setLoadingProgress({ status: "ready" });
|
|
1324
|
+
onReady?.();
|
|
1325
|
+
} catch (err) {
|
|
1326
|
+
if (!mountedRef.current) return;
|
|
1327
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
1328
|
+
setError(errorMsg);
|
|
1329
|
+
setIsLoading(false);
|
|
1330
|
+
setLoadingProgress({
|
|
1331
|
+
status: "error",
|
|
1332
|
+
error: errorMsg
|
|
1333
|
+
});
|
|
1334
|
+
onError?.(errorMsg);
|
|
1335
|
+
}
|
|
1336
|
+
};
|
|
1337
|
+
initTTS();
|
|
1338
|
+
return () => {
|
|
1339
|
+
mountedRef.current = false;
|
|
1340
|
+
};
|
|
1341
|
+
}, [
|
|
1342
|
+
shouldLoad,
|
|
1343
|
+
modelId,
|
|
1344
|
+
onReady,
|
|
1345
|
+
onError
|
|
1346
|
+
]);
|
|
1347
|
+
useEffect(() => {
|
|
1348
|
+
return () => {
|
|
1349
|
+
try {
|
|
1350
|
+
sourceNodeRef.current?.stop();
|
|
1351
|
+
} catch {}
|
|
1352
|
+
try {
|
|
1353
|
+
if (audioContextRef.current && audioContextRef.current.state !== "closed") audioContextRef.current.close();
|
|
1354
|
+
} catch {}
|
|
1355
|
+
};
|
|
1356
|
+
}, []);
|
|
1357
|
+
return {
|
|
1358
|
+
speak: useCallback(async (text, opts) => {
|
|
1359
|
+
const voice = opts?.voice || currentVoice;
|
|
1360
|
+
const speed = opts?.speed || currentSpeed;
|
|
1361
|
+
if (!ttsRef.current) {
|
|
1362
|
+
load();
|
|
1363
|
+
return;
|
|
1364
|
+
}
|
|
1365
|
+
try {
|
|
1366
|
+
setIsSpeaking(true);
|
|
1367
|
+
onStart?.();
|
|
1368
|
+
let audioData;
|
|
1369
|
+
let sampleRate;
|
|
1370
|
+
const ttsBackend = ttsRef.current;
|
|
1371
|
+
if (ttsBackend.type === "supertonic") {
|
|
1372
|
+
const config = ttsBackend.config;
|
|
1373
|
+
if (!config.voices.find((v) => v.id === voice)) {
|
|
1374
|
+
const validVoices = config.voices.map((v) => v.id).join(", ");
|
|
1375
|
+
throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
|
|
1376
|
+
}
|
|
1377
|
+
let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
|
|
1378
|
+
if (!speakerEmbedding) try {
|
|
1379
|
+
const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
|
|
1380
|
+
const response = await fetch(voiceUrl);
|
|
1381
|
+
if (response.ok) {
|
|
1382
|
+
const buffer = await response.arrayBuffer();
|
|
1383
|
+
speakerEmbedding = new Float32Array(buffer);
|
|
1384
|
+
voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
|
|
1385
|
+
} else throw new Error(`Failed to load voice: ${response.status}`);
|
|
1386
|
+
} catch {
|
|
1387
|
+
speakerEmbedding = new Float32Array(12928).fill(.1);
|
|
1388
|
+
voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
|
|
1389
|
+
}
|
|
1390
|
+
const result = await ttsBackend.pipeline(text, {
|
|
1391
|
+
speaker_embeddings: speakerEmbedding,
|
|
1392
|
+
speed
|
|
1393
|
+
});
|
|
1394
|
+
audioData = result.audio;
|
|
1395
|
+
sampleRate = result.sampling_rate;
|
|
1396
|
+
} else {
|
|
1397
|
+
const config = ttsBackend.config;
|
|
1398
|
+
if (!config.voices.find((v) => v.id === voice)) {
|
|
1399
|
+
const validVoices = config.voices.map((v) => v.id).join(", ");
|
|
1400
|
+
throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
|
|
1401
|
+
}
|
|
1402
|
+
const result = await ttsBackend.instance.generate(text, {
|
|
1403
|
+
voice,
|
|
1404
|
+
speed
|
|
1405
|
+
});
|
|
1406
|
+
audioData = result.audio;
|
|
1407
|
+
sampleRate = result.sampling_rate;
|
|
1408
|
+
}
|
|
1409
|
+
if (!mountedRef.current) return;
|
|
1410
|
+
if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
|
|
1411
|
+
const audioContext = audioContextRef.current;
|
|
1412
|
+
if (audioContext.state === "suspended") await audioContext.resume();
|
|
1413
|
+
const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
|
|
1414
|
+
const channelData = new Float32Array(audioData);
|
|
1415
|
+
audioBuffer.copyToChannel(channelData, 0);
|
|
1416
|
+
if (sourceNodeRef.current) {
|
|
1417
|
+
sourceNodeRef.current.stop();
|
|
1418
|
+
sourceNodeRef.current.disconnect();
|
|
1419
|
+
}
|
|
1420
|
+
const sourceNode = audioContext.createBufferSource();
|
|
1421
|
+
sourceNode.buffer = audioBuffer;
|
|
1422
|
+
sourceNode.connect(audioContext.destination);
|
|
1423
|
+
sourceNode.onended = () => {
|
|
1424
|
+
if (mountedRef.current) {
|
|
1425
|
+
setIsSpeaking(false);
|
|
1426
|
+
onEnd?.();
|
|
1427
|
+
}
|
|
1428
|
+
};
|
|
1429
|
+
sourceNodeRef.current = sourceNode;
|
|
1430
|
+
sourceNode.start();
|
|
1431
|
+
} catch (err) {
|
|
1432
|
+
if (!mountedRef.current) return;
|
|
1433
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
1434
|
+
setError(errorMsg);
|
|
1435
|
+
setIsSpeaking(false);
|
|
1436
|
+
onError?.(errorMsg);
|
|
1437
|
+
}
|
|
1438
|
+
}, [
|
|
1439
|
+
currentVoice,
|
|
1440
|
+
currentSpeed,
|
|
1441
|
+
load,
|
|
1442
|
+
onStart,
|
|
1443
|
+
onEnd,
|
|
1444
|
+
onError
|
|
1445
|
+
]),
|
|
1446
|
+
stop: useCallback(() => {
|
|
1447
|
+
if (sourceNodeRef.current) {
|
|
1448
|
+
sourceNodeRef.current.stop();
|
|
1449
|
+
sourceNodeRef.current.disconnect();
|
|
1450
|
+
sourceNodeRef.current = null;
|
|
1451
|
+
}
|
|
1452
|
+
setIsSpeaking(false);
|
|
1453
|
+
}, []),
|
|
1454
|
+
isLoading,
|
|
1455
|
+
loadingProgress,
|
|
1456
|
+
isSpeaking,
|
|
1457
|
+
isReady,
|
|
1458
|
+
load,
|
|
1459
|
+
error,
|
|
1460
|
+
listVoices,
|
|
1461
|
+
currentVoice,
|
|
1462
|
+
setVoice: useCallback((voiceId) => {
|
|
1463
|
+
if (modelConfig.voices.find((v) => v.id === voiceId)) setCurrentVoice(voiceId);
|
|
1464
|
+
else console.warn(`Voice "${voiceId}" not valid for ${modelId}. Available: ${modelConfig.voices.map((v) => v.id).join(", ")}`);
|
|
1465
|
+
}, [modelConfig.voices, modelId]),
|
|
1466
|
+
currentSpeed,
|
|
1467
|
+
setSpeed: useCallback((speed) => {
|
|
1468
|
+
setCurrentSpeed(Math.max(.5, Math.min(2, speed)));
|
|
1469
|
+
}, []),
|
|
1470
|
+
currentModel: modelId,
|
|
1471
|
+
sampleRate: modelConfig.sampleRate
|
|
1472
|
+
};
|
|
1473
|
+
}
|
|
1474
|
+
/**
|
|
1475
|
+
* Play audio from Float32Array using Web Audio API
|
|
1476
|
+
*
|
|
1477
|
+
* @example
|
|
1478
|
+
* ```ts
|
|
1479
|
+
* import { playAudio } from "@tryhamster/gerbil/browser";
|
|
1480
|
+
*
|
|
1481
|
+
* const audio = new Float32Array([...]); // TTS output
|
|
1482
|
+
* const controller = await playAudio(audio, 24000);
|
|
1483
|
+
*
|
|
1484
|
+
* // Stop playback
|
|
1485
|
+
* controller.stop();
|
|
1486
|
+
* ```
|
|
1487
|
+
*/
|
|
1488
|
+
async function playAudio(audio, sampleRate = 24e3) {
|
|
1489
|
+
const audioContext = new AudioContext();
|
|
1490
|
+
if (audioContext.state === "suspended") await audioContext.resume();
|
|
1491
|
+
const audioBuffer = audioContext.createBuffer(1, audio.length, sampleRate);
|
|
1492
|
+
const channelData = new Float32Array(audio);
|
|
1493
|
+
audioBuffer.copyToChannel(channelData, 0);
|
|
1494
|
+
const sourceNode = audioContext.createBufferSource();
|
|
1495
|
+
sourceNode.buffer = audioBuffer;
|
|
1496
|
+
sourceNode.connect(audioContext.destination);
|
|
1497
|
+
const onEnded = new Promise((resolve) => {
|
|
1498
|
+
sourceNode.onended = () => {
|
|
1499
|
+
audioContext.close();
|
|
1500
|
+
resolve();
|
|
1501
|
+
};
|
|
1502
|
+
});
|
|
1503
|
+
sourceNode.start();
|
|
1504
|
+
return {
|
|
1505
|
+
stop: () => {
|
|
1506
|
+
sourceNode.stop();
|
|
1507
|
+
audioContext.close();
|
|
1508
|
+
},
|
|
1509
|
+
onEnded
|
|
1510
|
+
};
|
|
1511
|
+
}
|
|
1512
|
+
/**
|
|
1513
|
+
* Create a reusable audio player for streaming TTS
|
|
1514
|
+
*
|
|
1515
|
+
* @example
|
|
1516
|
+
* ```ts
|
|
1517
|
+
* import { createAudioPlayer } from "@tryhamster/gerbil/browser";
|
|
1518
|
+
*
|
|
1519
|
+
* const player = createAudioPlayer(24000);
|
|
1520
|
+
*
|
|
1521
|
+
* // Queue audio chunks as they arrive
|
|
1522
|
+
* player.queue(chunk1);
|
|
1523
|
+
* player.queue(chunk2);
|
|
1524
|
+
*
|
|
1525
|
+
* // Stop and clear
|
|
1526
|
+
* player.stop();
|
|
1527
|
+
* ```
|
|
1528
|
+
*/
|
|
1529
|
+
function createAudioPlayer(sampleRate = 24e3) {
|
|
1530
|
+
let audioContext = null;
|
|
1531
|
+
let nextStartTime = 0;
|
|
1532
|
+
let isActive = false;
|
|
1533
|
+
const ensureContext = async () => {
|
|
1534
|
+
if (!audioContext) audioContext = new AudioContext();
|
|
1535
|
+
if (audioContext.state === "suspended") await audioContext.resume();
|
|
1536
|
+
return audioContext;
|
|
1537
|
+
};
|
|
1538
|
+
return {
|
|
1539
|
+
queue: async (audio) => {
|
|
1540
|
+
const ctx = await ensureContext();
|
|
1541
|
+
isActive = true;
|
|
1542
|
+
const buffer = ctx.createBuffer(1, audio.length, sampleRate);
|
|
1543
|
+
const channelData = new Float32Array(audio);
|
|
1544
|
+
buffer.copyToChannel(channelData, 0);
|
|
1545
|
+
const source = ctx.createBufferSource();
|
|
1546
|
+
source.buffer = buffer;
|
|
1547
|
+
source.connect(ctx.destination);
|
|
1548
|
+
const startTime = Math.max(ctx.currentTime, nextStartTime);
|
|
1549
|
+
source.start(startTime);
|
|
1550
|
+
nextStartTime = startTime + buffer.duration;
|
|
1551
|
+
source.onended = () => {
|
|
1552
|
+
if (ctx.currentTime >= nextStartTime - .1) isActive = false;
|
|
1553
|
+
};
|
|
1554
|
+
},
|
|
1555
|
+
stop: () => {
|
|
1556
|
+
isActive = false;
|
|
1557
|
+
nextStartTime = 0;
|
|
1558
|
+
if (audioContext) {
|
|
1559
|
+
audioContext.close();
|
|
1560
|
+
audioContext = null;
|
|
1561
|
+
}
|
|
1562
|
+
},
|
|
1563
|
+
isPlaying: () => isActive
|
|
1564
|
+
};
|
|
1565
|
+
}
|
|
1566
|
+
/**
|
|
1567
|
+
* React hook for voice input with browser microphone
|
|
1568
|
+
*
|
|
1569
|
+
* Uses MediaRecorder to capture audio and Whisper for transcription.
|
|
1570
|
+
* Supports both one-shot and streaming transcription modes.
|
|
1571
|
+
*
|
|
1572
|
+
* @example Basic usage (one-shot)
|
|
1573
|
+
* ```tsx
|
|
1574
|
+
* function VoiceInput() {
|
|
1575
|
+
* const { startRecording, stopRecording, isRecording, transcript } = useVoiceInput({
|
|
1576
|
+
* onTranscript: (text) => console.log("User said:", text),
|
|
1577
|
+
* });
|
|
1578
|
+
*
|
|
1579
|
+
* return (
|
|
1580
|
+
* <button onClick={isRecording ? stopRecording : startRecording}>
|
|
1581
|
+
* {isRecording ? "Stop" : "Record"}
|
|
1582
|
+
* </button>
|
|
1583
|
+
* );
|
|
1584
|
+
* }
|
|
1585
|
+
* ```
|
|
1586
|
+
*
|
|
1587
|
+
* @example Streaming transcription (real-time)
|
|
1588
|
+
* ```tsx
|
|
1589
|
+
* function LiveTranscription() {
|
|
1590
|
+
* const { startRecording, stopRecording, isRecording, transcript, streamingChunk } = useVoiceInput({
|
|
1591
|
+
* streaming: true, // Enable streaming mode
|
|
1592
|
+
* chunkDuration: 1500, // Transcribe every 1.5 seconds (default)
|
|
1593
|
+
* onChunk: (text, idx) => console.log(`Chunk ${idx}: ${text}`),
|
|
1594
|
+
* });
|
|
1595
|
+
*
|
|
1596
|
+
* return (
|
|
1597
|
+
* <div>
|
|
1598
|
+
* <button onClick={isRecording ? stopRecording : startRecording}>
|
|
1599
|
+
* {isRecording ? "Stop" : "Start Live Transcription"}
|
|
1600
|
+
* </button>
|
|
1601
|
+
* <p>Current chunk: {streamingChunk}</p>
|
|
1602
|
+
* <p>Full transcript: {transcript}</p>
|
|
1603
|
+
* </div>
|
|
1604
|
+
* );
|
|
1605
|
+
* }
|
|
1606
|
+
* ```
|
|
1607
|
+
*/
|
|
1608
|
+
+ function useVoiceInput(options = {}) {
+ const React = globalThis.React;
+ if (!React) throw new Error("useVoiceInput requires React. Import React before using this hook.");
+ const { useState, useEffect, useRef, useCallback } = React;
+ const { model = "whisper-tiny.en", autoLoad = false, onReady, onTranscript, onError, onProgress, streaming = false, chunkDuration = 1500, onChunk } = options;
+ const [isLoading, setIsLoading] = useState(autoLoad);
+ const [loadingProgress, setLoadingProgress] = useState(null);
+ const [isReady, setIsReady] = useState(false);
+ const [isRecording, setIsRecording] = useState(false);
+ const [isTranscribing, setIsTranscribing] = useState(false);
+ const [transcript, setTranscript] = useState("");
+ const [streamingChunk, setStreamingChunk] = useState("");
+ const [chunkCount, setChunkCount] = useState(0);
+ const [error, setError] = useState(null);
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
+ const sttRef = useRef(null);
+ const mediaRecorderRef = useRef(null);
+ const audioChunksRef = useRef([]);
+ const streamRef = useRef(null);
+ const mountedRef = useRef(true);
+ const streamingIntervalRef = useRef(null);
+ const pendingChunksRef = useRef([]);
+ const fullTranscriptRef = useRef("");
+ useEffect(() => {
+ if (!shouldLoad || isReady) return;
+ let cancelled = false;
+ const loadModel = async () => {
+ try {
+ setIsLoading(true);
+ setLoadingProgress({
+ status: "loading",
+ message: "Loading STT model..."
+ });
+ onProgress?.({
+ status: "loading",
+ message: "Loading STT model..."
+ });
+ const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
+ if (cancelled || !mountedRef.current) return;
+ const stt = new WhisperSTT(model);
+ await stt.load({ onProgress: (p) => {
+ if (!mountedRef.current) return;
+ const progress = {
+ status: p.progress !== void 0 ? "downloading" : "loading",
+ message: p.status,
+ progress: p.progress,
+ file: p.file
+ };
+ setLoadingProgress(progress);
+ onProgress?.(progress);
+ } });
+ if (cancelled || !mountedRef.current) {
+ stt.dispose();
+ return;
+ }
+ sttRef.current = stt;
+ setIsReady(true);
+ setIsLoading(false);
+ setLoadingProgress({ status: "ready" });
+ onProgress?.({ status: "ready" });
+ onReady?.();
+ } catch (e) {
+ if (!mountedRef.current) return;
+ const errMsg = e.message || "Failed to load STT model";
+ setError(errMsg);
+ setIsLoading(false);
+ setLoadingProgress({
+ status: "error",
+ message: errMsg
+ });
+ onProgress?.({
+ status: "error",
+ message: errMsg
+ });
+ onError?.(errMsg);
+ }
+ };
+ loadModel();
+ return () => {
+ cancelled = true;
+ };
+ }, [
+ shouldLoad,
+ isReady,
+ model,
+ onReady,
+ onError,
+ onProgress
+ ]);
+ useEffect(() => {
+ mountedRef.current = true;
+ return () => {
+ mountedRef.current = false;
+ if (sttRef.current) sttRef.current.dispose();
+ if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
+ };
+ }, []);
+ const load = useCallback(() => {
+ if (!shouldLoad && !isReady && !isLoading) setShouldLoad(true);
+ }, [
+ shouldLoad,
+ isReady,
+ isLoading
+ ]);
+ const blobToFloat32 = useCallback(async (blob) => {
+ const audioContext = new AudioContext({ sampleRate: 16e3 });
+ const arrayBuffer = await blob.arrayBuffer();
+ const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+ const channelData = audioBuffer.getChannelData(0);
+ if (audioBuffer.sampleRate !== 16e3) {
+ const ratio = 16e3 / audioBuffer.sampleRate;
+ const newLength = Math.round(channelData.length * ratio);
+ const resampled = new Float32Array(newLength);
+ for (let i = 0; i < newLength; i++) {
+ const srcIndex = i / ratio;
+ const floor = Math.floor(srcIndex);
+ const ceil = Math.min(floor + 1, channelData.length - 1);
+ const t = srcIndex - floor;
+ resampled[i] = channelData[floor] * (1 - t) + channelData[ceil] * t;
+ }
+ audioContext.close();
+ return resampled;
+ }
+ audioContext.close();
+ return new Float32Array(channelData);
+ }, []);
+ const transcribe = useCallback(async (audio) => {
+ if (!sttRef.current) {
+ if (!shouldLoad) {
+ setShouldLoad(true);
+ throw new Error("STT model not loaded. Loading now, please try again.");
+ }
+ throw new Error("STT model not loaded");
+ }
+ setIsTranscribing(true);
+ try {
+ let text = (await sttRef.current.transcribe(audio)).text.trim();
+ if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
+ setTranscript(text);
+ onTranscript?.(text);
+ return text;
+ } finally {
+ if (mountedRef.current) setIsTranscribing(false);
+ }
+ }, [shouldLoad, onTranscript]);
+ const processedSamplesRef = useRef(0);
+ const transcribeChunk = useCallback(async (chunkIdx) => {
+ if (!sttRef.current || audioChunksRef.current.length === 0) return "";
+ try {
+ const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
+ const newSamplesStart = processedSamplesRef.current;
+ const totalSamples = audioData.length;
+ if (totalSamples - newSamplesStart < 8e3) return "";
+ const newAudio = audioData.slice(newSamplesStart);
+ processedSamplesRef.current = totalSamples;
+ let text = (await sttRef.current.transcribe(newAudio)).text.trim();
+ if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
+ if (text && mountedRef.current) {
+ setStreamingChunk(text);
+ onChunk?.(text, chunkIdx);
+ }
+ return text;
+ } catch {
+ return "";
+ }
+ }, [blobToFloat32, onChunk]);
+ return {
+ startRecording: useCallback(async () => {
+ if (isRecording) return;
+ try {
+ if (streaming && !sttRef.current) {
+ if (!shouldLoad) setShouldLoad(true);
+ setIsLoading(true);
+ const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
+ const stt = new WhisperSTT(model);
+ await stt.load({ onProgress: (p) => {
+ if (mountedRef.current) {
+ const progress = {
+ status: p.status === "downloading" ? "downloading" : p.status === "ready" ? "ready" : "loading",
+ message: p.status,
+ progress: p.progress,
+ file: p.file
+ };
+ setLoadingProgress(progress);
+ onProgress?.(progress);
+ }
+ } });
+ if (!mountedRef.current) {
+ stt.dispose();
+ return;
+ }
+ sttRef.current = stt;
+ setIsReady(true);
+ setIsLoading(false);
+ setLoadingProgress({ status: "ready" });
+ onProgress?.({ status: "ready" });
+ onReady?.();
+ }
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: {
+ sampleRate: 16e3,
+ channelCount: 1,
+ echoCancellation: true,
+ noiseSuppression: true
+ } });
+ streamRef.current = stream;
+ audioChunksRef.current = [];
+ pendingChunksRef.current = [];
+ fullTranscriptRef.current = "";
+ processedSamplesRef.current = 0;
+ setTranscript("");
+ setStreamingChunk("");
+ setChunkCount(0);
+ const mediaRecorder = new MediaRecorder(stream);
+ mediaRecorderRef.current = mediaRecorder;
+ mediaRecorder.ondataavailable = (event) => {
+ if (event.data.size > 0) {
+ audioChunksRef.current.push(event.data);
+ if (streaming) pendingChunksRef.current.push(event.data);
+ }
+ };
+ mediaRecorder.start(100);
+ setIsRecording(true);
+ setError(null);
+ if (streaming && sttRef.current) {
+ let chunkIdx = 0;
+ let shouldContinue = true;
+ const processNextChunk = async () => {
+ if (!shouldContinue || !mountedRef.current) return;
+ if (pendingChunksRef.current.length > 0) {
+ pendingChunksRef.current = [];
+ try {
+ setIsTranscribing(true);
+ const chunkText = await transcribeChunk(chunkIdx);
+ if (chunkText && mountedRef.current) {
+ chunkIdx++;
+ setChunkCount(chunkIdx);
+ setTranscript((prev) => {
+ const newTranscript = prev + (prev ? " " : "") + chunkText;
+ fullTranscriptRef.current = newTranscript;
+ onTranscript?.(newTranscript);
+ return newTranscript;
+ });
+ }
+ } catch (e) {
+ console.error("[useVoiceInput] Chunk transcription error:", e);
+ } finally {
+ if (mountedRef.current) setIsTranscribing(false);
+ }
+ }
+ if (shouldContinue && mountedRef.current) streamingIntervalRef.current = setTimeout(processNextChunk, chunkDuration);
+ };
+ streamingIntervalRef.current = setTimeout(processNextChunk, chunkDuration);
+ streamingIntervalRef._stop = () => {
+ shouldContinue = false;
+ };
+ }
+ } catch (e) {
+ const errMsg = e.message || "Failed to start recording";
+ setError(errMsg);
+ onError?.(errMsg);
+ }
+ }, [
+ isRecording,
+ streaming,
+ shouldLoad,
+ model,
+ chunkDuration,
+ transcribeChunk,
+ onTranscript,
+ onError,
+ onProgress,
+ onReady
+ ]),
+ stopRecording: useCallback(async () => {
+ if (streamingIntervalRef._stop) streamingIntervalRef._stop();
+ if (streamingIntervalRef.current) {
+ clearTimeout(streamingIntervalRef.current);
+ streamingIntervalRef.current = null;
+ }
+ return new Promise((resolve, reject) => {
+ if (!mediaRecorderRef.current || !isRecording) {
+ reject(/* @__PURE__ */ new Error("Not recording"));
+ return;
+ }
+ const mediaRecorder = mediaRecorderRef.current;
+ mediaRecorder.onstop = async () => {
+ if (streamRef.current) {
+ for (const track of streamRef.current.getTracks()) track.stop();
+ streamRef.current = null;
+ }
+ setIsRecording(false);
+ if (streaming) {
+ if (audioChunksRef.current.length > 0 && processedSamplesRef.current > 0) {
+ setIsTranscribing(true);
+ pendingChunksRef.current = [];
+ try {
+ const finalChunkText = await transcribeChunk(chunkCount);
+ if (finalChunkText && mountedRef.current) setTranscript((prev) => {
+ const newTranscript = prev + (prev ? " " : "") + finalChunkText;
+ fullTranscriptRef.current = newTranscript;
+ return newTranscript;
+ });
+ } finally {
+ if (mountedRef.current) setIsTranscribing(false);
+ }
+ }
+ const finalText = fullTranscriptRef.current;
+ onTranscript?.(finalText);
+ resolve(finalText);
+ return;
+ }
+ const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
+ try {
+ if (!sttRef.current) {
+ if (!shouldLoad) setShouldLoad(true);
+ await new Promise((res, rej) => {
+ const checkReady = setInterval(() => {
+ if (sttRef.current) {
+ clearInterval(checkReady);
+ res();
+ }
+ }, 100);
+ setTimeout(() => {
+ clearInterval(checkReady);
+ rej(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
+ }, 3e4);
+ });
+ }
+ resolve(await transcribe(await blobToFloat32(audioBlob)));
+ } catch (e) {
+ const errMsg = e.message || "Transcription failed";
+ setError(errMsg);
+ onError?.(errMsg);
+ reject(e);
+ }
+ };
+ mediaRecorder.stop();
+ });
+ }, [
+ isRecording,
+ streaming,
+ chunkCount,
+ shouldLoad,
+ blobToFloat32,
+ transcribe,
+ transcribeChunk,
+ onTranscript,
+ onError
+ ]),
+ cancelRecording: useCallback(() => {
+ if (streamingIntervalRef._stop) streamingIntervalRef._stop();
+ if (streamingIntervalRef.current) {
+ clearTimeout(streamingIntervalRef.current);
+ streamingIntervalRef.current = null;
+ }
+ if (mediaRecorderRef.current && isRecording) mediaRecorderRef.current.stop();
+ if (streamRef.current) {
+ for (const track of streamRef.current.getTracks()) track.stop();
+ streamRef.current = null;
+ }
+ audioChunksRef.current = [];
+ pendingChunksRef.current = [];
+ processedSamplesRef.current = 0;
+ setIsRecording(false);
+ }, [isRecording]),
+ transcribe,
+ isRecording,
+ isTranscribing,
+ isLoading,
+ isReady,
+ transcript,
+ streamingChunk,
+ chunkCount,
+ loadingProgress,
+ error,
+ load
+ };
+ }
+ /**
+ * React hook for voice conversation with STT + LLM + TTS
+ *
+ * Complete voice-to-voice conversation loop:
+ * 1. User presses button to speak
+ * 2. Speech is transcribed (Whisper)
+ * 3. LLM generates response
+ * 4. Response is spoken aloud (Kokoro or Supertonic TTS)
+ *
+ * @example
+ * ```tsx
+ * function VoiceChat() {
+ * const {
+ * messages,
+ * startListening,
+ * stopListening,
+ * isListening,
+ * isSpeaking,
+ * stage,
+ * } = useVoiceChat({
+ * system: "You are a helpful voice assistant.",
+ * voice: "af_bella",
+ * // Or use Supertonic for faster synthesis:
+ * // ttsModel: "supertonic-66m",
+ * // voice: "F1",
+ * });
+ *
+ * return (
+ * <div>
+ * {messages.map(m => (
+ * <div key={m.id}>{m.role}: {m.content}</div>
+ * ))}
+ * <button
+ * onMouseDown={startListening}
+ * onMouseUp={stopListening}
+ * >
+ * {stage === "idle" ? "🎤 Hold to Speak" : stage}
+ * </button>
+ * </div>
+ * );
+ * }
+ * ```
+ */
+ function useVoiceChat(options = {}) {
+ const React = globalThis.React;
+ if (!React) throw new Error("useVoiceChat requires React. Import React before using this hook.");
+ const { useState, useEffect, useRef, useCallback } = React;
+ const ttsModelId = options.ttsModel || "kokoro-82m";
+ const ttsConfig = TTS_MODELS[ttsModelId];
+ const { llmModel = "qwen3-0.6b", sttModel = "whisper-tiny.en", system = "You are a helpful voice assistant. Keep responses brief and conversational.", thinking = false, voice = ttsConfig.defaultVoice, speed = 1, autoLoad = false, onUserSpeak, onAssistantSpeak, onError } = options;
+ const [messages, setMessages] = useState([]);
+ const [stage, setStage] = useState("idle");
+ const [isLoading, setIsLoading] = useState(autoLoad);
+ const [loadingMessage, setLoadingMessage] = useState("");
+ const [isReady, setIsReady] = useState(false);
+ const [error, setError] = useState(null);
+ const [shouldLoad, setShouldLoad] = useState(autoLoad);
+ const llmWorkerRef = useRef(null);
+ const sttRef = useRef(null);
+ const ttsRef = useRef(null);
+ const mediaRecorderRef = useRef(null);
+ const audioChunksRef = useRef([]);
+ const streamRef = useRef(null);
+ const audioContextRef = useRef(null);
+ const sourceNodeRef = useRef(null);
+ const mountedRef = useRef(true);
+ const cancelledRef = useRef(false);
+ const isListening = stage === "listening";
+ const isProcessing = stage === "transcribing" || stage === "thinking";
+ const isSpeaking = stage === "speaking";
+ useEffect(() => {
+ if (!shouldLoad || isReady) return;
+ let cancelled = false;
+ const loadModels = async () => {
+ try {
+ setIsLoading(true);
+ setError(null);
+ setLoadingMessage("Loading speech recognition (Whisper)...");
+ const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
+ if (cancelled || !mountedRef.current) return;
+ const stt = new WhisperSTT(sttModel);
+ await stt.load({ onProgress: (p) => {
+ if (!mountedRef.current) return;
+ setLoadingMessage(p.status || "Loading STT...");
+ } });
+ if (cancelled || !mountedRef.current) {
+ stt.dispose();
+ return;
+ }
+ sttRef.current = stt;
+ setLoadingMessage("Loading language model...");
+ const worker = await createGerbilWorker({
+ modelId: llmModel,
+ onProgress: (p) => {
+ if (!mountedRef.current) return;
+ setLoadingMessage(p.message || "Loading LLM...");
+ }
+ });
+ if (cancelled || !mountedRef.current) {
+ worker.terminate();
+ return;
+ }
+ llmWorkerRef.current = worker;
+ setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
+ const { createTTS } = await import("../tts-5yWeP_I0.mjs");
+ if (cancelled || !mountedRef.current) return;
+ const tts = createTTS(ttsModelId);
+ await tts.load({ onProgress: (p) => {
+ if (!mountedRef.current) return;
+ setLoadingMessage(p.status || "Loading TTS...");
+ } });
+ if (cancelled || !mountedRef.current) {
+ await tts.dispose();
+ return;
+ }
+ ttsRef.current = tts;
+ setIsReady(true);
+ setIsLoading(false);
+ setLoadingMessage("Ready!");
+ } catch (e) {
+ if (!mountedRef.current) return;
+ const errMsg = e.message || "Failed to load models";
+ setError(errMsg);
+ setIsLoading(false);
+ onError?.(errMsg);
+ }
+ };
+ loadModels();
+ return () => {
+ cancelled = true;
+ };
+ }, [
+ shouldLoad,
+ isReady,
+ llmModel,
+ sttModel,
+ ttsModelId,
+ onError
+ ]);
+ useEffect(() => {
+ mountedRef.current = true;
+ return () => {
+ mountedRef.current = false;
+ llmWorkerRef.current?.terminate();
+ sttRef.current?.dispose();
+ ttsRef.current?.dispose();
+ if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
+ audioContextRef.current?.close();
+ };
+ }, []);
+ const load = useCallback(() => {
+ if (!shouldLoad && !isReady && !isLoading) setShouldLoad(true);
+ }, [
+ shouldLoad,
+ isReady,
+ isLoading
+ ]);
+ const blobToFloat32 = useCallback(async (blob) => {
+ const audioContext = new AudioContext({ sampleRate: 16e3 });
+ const arrayBuffer = await blob.arrayBuffer();
+ const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+ const channelData = audioBuffer.getChannelData(0);
+ if (audioBuffer.sampleRate !== 16e3) {
+ const ratio = 16e3 / audioBuffer.sampleRate;
+ const newLength = Math.round(channelData.length * ratio);
+ const resampled = new Float32Array(newLength);
+ for (let i = 0; i < newLength; i++) {
+ const srcIndex = i / ratio;
+ const floor = Math.floor(srcIndex);
+ const ceil = Math.min(floor + 1, channelData.length - 1);
+ const t = srcIndex - floor;
+ resampled[i] = channelData[floor] * (1 - t) + channelData[ceil] * t;
+ }
+ audioContext.close();
+ return resampled;
+ }
+ audioContext.close();
+ return new Float32Array(channelData);
+ }, []);
+ const playAudioBuffer = useCallback(async (audio, sampleRate) => {
+ return new Promise((resolve) => {
+ if (!audioContextRef.current) audioContextRef.current = new AudioContext();
+ const ctx = audioContextRef.current;
+ const buffer = ctx.createBuffer(1, audio.length, sampleRate);
+ const channelData = new Float32Array(audio);
+ buffer.copyToChannel(channelData, 0);
+ const source = ctx.createBufferSource();
+ source.buffer = buffer;
+ source.connect(ctx.destination);
+ source.onended = () => {
+ if (mountedRef.current) resolve();
+ };
+ source.start();
+ sourceNodeRef.current = source;
+ });
+ }, []);
+ return {
+ messages,
+ startListening: useCallback(async () => {
+ if (stage !== "idle") return;
+ if (!isReady && !isLoading) {
+ setShouldLoad(true);
+ return;
+ }
+ cancelledRef.current = false;
+ try {
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: {
+ sampleRate: 16e3,
+ channelCount: 1,
+ echoCancellation: true
+ } });
+ streamRef.current = stream;
+ audioChunksRef.current = [];
+ const mediaRecorder = new MediaRecorder(stream);
+ mediaRecorderRef.current = mediaRecorder;
+ mediaRecorder.ondataavailable = (event) => {
+ if (event.data.size > 0) audioChunksRef.current.push(event.data);
+ };
+ mediaRecorder.start(100);
+ setStage("listening");
+ setError(null);
+ } catch (e) {
+ const errMsg = e.message || "Failed to access microphone";
+ setError(errMsg);
+ onError?.(errMsg);
+ }
+ }, [
+ stage,
+ isReady,
+ isLoading,
+ onError
+ ]),
+ stopListening: useCallback(async () => {
+ if (stage !== "listening") return;
+ const mediaRecorder = mediaRecorderRef.current;
+ if (!mediaRecorder) return;
+ return new Promise((resolve) => {
+ mediaRecorder.onstop = async () => {
+ if (streamRef.current) {
+ for (const track of streamRef.current.getTracks()) track.stop();
+ streamRef.current = null;
+ }
+ if (cancelledRef.current) {
+ setStage("idle");
+ resolve();
+ return;
+ }
+ const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
+ try {
+ setStage("transcribing");
+ const audioData = await blobToFloat32(audioBlob);
+ let userText = (await sttRef.current.transcribe(audioData)).text.trim();
+ if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
+ if (cancelledRef.current || !userText) {
+ setStage("idle");
+ resolve();
+ return;
+ }
+ const userMsgId = `user-${Date.now()}`;
+ setMessages((m) => [...m, {
+ id: userMsgId,
+ role: "user",
+ content: userText
+ }]);
+ onUserSpeak?.(userText);
+ setStage("thinking");
+ const history = messages.map((m) => ({
+ role: m.role,
+ content: m.content
+ }));
+ history.push({
+ role: "user",
+ content: userText
+ });
+ let responseText = "";
+ let thinkingText = "";
+ await llmWorkerRef.current.generate(userText, {
+ system,
+ thinking,
+ history,
+ onToken: (token) => {
+ if (cancelledRef.current) return;
+ if (token.state === "thinking") thinkingText += token.text;
+ else responseText += token.text;
+ }
+ });
+ if (cancelledRef.current) {
+ setStage("idle");
+ resolve();
+ return;
+ }
+ const assistantMsgId = `assistant-${Date.now()}`;
+ setMessages((m) => [...m, {
+ id: assistantMsgId,
+ role: "assistant",
+ content: responseText,
+ thinking: thinkingText || void 0
+ }]);
+ onAssistantSpeak?.(responseText);
+ if (responseText.trim()) {
+ setStage("speaking");
+ const ttsResult = await ttsRef.current.speak(responseText, {
+ voice,
+ speed
+ });
+ if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
+ }
+ setStage("idle");
+ resolve();
+ } catch (e) {
+ if (!mountedRef.current) return;
+ const errMsg = e.message || "Processing failed";
+ setError(errMsg);
+ setStage("idle");
+ onError?.(errMsg);
+ resolve();
+ }
+ };
+ mediaRecorder.stop();
+ });
+ }, [
+ stage,
+ messages,
+ system,
+ thinking,
+ voice,
+ speed,
+ blobToFloat32,
+ playAudioBuffer,
+ onUserSpeak,
+ onAssistantSpeak,
+ onError
+ ]),
+ cancel: useCallback(() => {
+ cancelledRef.current = true;
+ if (mediaRecorderRef.current && stage === "listening") mediaRecorderRef.current.stop();
+ if (streamRef.current) {
+ for (const track of streamRef.current.getTracks()) track.stop();
+ streamRef.current = null;
+ }
+ if (sourceNodeRef.current) try {
+ sourceNodeRef.current.stop();
+ } catch {}
+ audioChunksRef.current = [];
+ setStage("idle");
+ }, [stage]),
+ clear: useCallback(() => {
+ setMessages([]);
+ }, []),
+ isListening,
+ isProcessing,
+ isSpeaking,
+ stage,
+ isReady,
+ isLoading,
+ loadingMessage,
+ error,
+ load
+ };
+ }
  /**
  * Check if WebGPU is supported
  */
@@ -747,9 +2371,11 @@ async function getWebGPUInfo() {
  var browser_default = {
  isWebGPUSupported,
  getWebGPUInfo,
- createGerbilWorker
+ createGerbilWorker,
+ playAudio,
+ createAudioPlayer
  };

  //#endregion
- export { BUILTIN_MODELS, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, useChat, useCompletion };
+ export { BUILTIN_MODELS, createAudioPlayer, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, playAudio, useChat, useCompletion, useSpeech, useVoiceChat, useVoiceInput };
  //# sourceMappingURL=index.mjs.map
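The `createAudioPlayer` helper whose tail appears at the top of this hunk schedules each buffer at `nextStartTime`, so consecutive chunks play back-to-back without gaps, and `stop()` tears down the underlying `AudioContext`. A minimal sketch of how the newly exported player might be driven from streamed TTS output follows; the `/browser` import subpath and the `play(chunk, sampleRate)` signature are assumptions for illustration, not an API confirmed by this diff (only `stop()` and `isPlaying()` are visible above).

```ts
// Sketch only: the "/browser" subpath and play(chunk, sampleRate) signature
// are assumed; stop() and isPlaying() match the diff above.
import { createAudioPlayer } from "@tryhamster/gerbil/browser";

async function playStreamedSpeech(
  chunks: AsyncIterable<Float32Array>, // e.g. audio pieces from a streaming TTS engine
  sampleRate = 24000,
) {
  const player = createAudioPlayer();

  for await (const chunk of chunks) {
    // Each call is queued right after the previous one (the nextStartTime scheduling shown above).
    player.play(chunk, sampleRate);
  }

  // Wait for the last scheduled buffer to finish, then release the AudioContext.
  while (player.isPlaying()) {
    await new Promise((r) => setTimeout(r, 100));
  }
  player.stop();
}
```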