agentgui 1.0.175 → 1.0.177
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +125 -51
- package/package.json +1 -1
- package/server.js +54 -0
- package/static/js/voice.js +47 -2
package/lib/speech.js
CHANGED
|
@@ -8,7 +8,7 @@ const require = createRequire(import.meta.url);
|
|
|
8
8
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
9
9
|
const ROOT = path.dirname(__dirname);
|
|
10
10
|
const DATA_DIR = path.join(ROOT, 'data');
|
|
11
|
-
const AUDIO_EXTENSIONS = ['.
|
|
11
|
+
const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
|
|
12
12
|
|
|
13
13
|
function getVoiceDirs() {
|
|
14
14
|
const dirs = [];
|
|
@@ -106,6 +106,7 @@ const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_e
|
|
|
106
106
|
const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
|
|
107
107
|
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
|
|
108
108
|
const SAMPLES_TO_AVERAGE = 30;
|
|
109
|
+
const DEFAULT_EMBEDDING_B64 = 'xhibvao34LylqXQ8cNg7Pd1cCTw0keG8awRRvRqje7070G48AtOgPMFbnr1oeKC9I4ZuPZzqGT1DjWs8y3iMPB/SZLzdl7E6b9QaPKSpHTwYuh49FrMlO9YnebwmTzu9/3CPvQuvCbxsSWC9Sb2bO+tvXj0Cjpo8mTMxu/FDrjzQ4x09gyxCvUn6STxjAo+9vtXdPJtsYT3iMna9dQ+EvfQ72zuvxk69GAonPU8KdjsNPAU96e/8veN7lrwgyzk8HA5vvYE1Rz3gpZ484MsLPUKkxTzM54U81ECwvcbFHzv8gT08T6/7POCqBT2fv5E8fvsXPfZiJrzEhme8dg8kPR+mKTutQOU822maPMlMDb1x/IS93+6KvdyThzwhry880JBqvRVOhjzZods8SD08PLpObTn/0wk9BnAwvWiiz72EWgS9RpcjvV4VR73ZqJW9PoUFvfZYYb1h26S98levPHZbTjxH6qU9RPfoPHmJu70mSNo8ztJmvWgMBj0IX8i7TE3lPINY2DzoEma9wMObvTwKCT3pObe8t9KEvaWixjzc5fI8hj6MvaKv4Txl4h09d2a+PHCvTDxorJ69ekRrPeoPjz1JPfI7rUH7PIaJgz0O1YW9JLumvCxDnr1bmMm8GbIFPBX1oL3bRN08oYcXPEaFfL13Vxo9EKfbvTFcOTxdogA9XS3kPEWJoLvChc887BEgPMOvUT2Ba3s8tUDBvYPMZ72dNRG80AuTvQt7d72foTU9qO20O4INEb1u1iE9ibqJvZYaOj2nbYc8lsodvS5HPD1lCqK9EkBYPR0I/rySMIK9plcpPdpJEz2E/DY88d2DPIRTf71ZQZS9b1v5PPseFT2YiJu8OiOwPC8Wnr2QW4Q8n+o7PPQ8PD0QqAg9Vk7APDT6+jzreP88KH6GvTvAKD0AYiO9qOavvORySjvQ6y+9epb5PFvZijxYzlK9BwjUPK0HXL3acWc7dmwmPc/kXb2VBg68MGYRPR5q9zzmFiS9al2IvdVTfDwJOa88SzVkvVlrPD0WvJQ8Vm76PMUAQDzNgyK8QQZVPdMoibxrCBc9BgKTPDLoV70Iu6g7k+kBPZ3lhTy6sOU8OGkVvFaLRD14oqa9a4UVO4z4Gr1eYlO9u5BgPWS1ZL3kFPE8JGEwPQFTl71tHso8g+ElPd9Rgr2XCtc8axudvWC2IL09wSg9E7ZzPT6uBz2XmK09A1HcPJK8rTxK8Zu8GuMTPTuINTyRAhS9OSqDPDralLza3q48EgtePPf797rIWKo9NtkrvbO34zxKZ6m97l0GPQYVlL2igDA9UyfEPJhZyjx4/2Q8ggBpPYcAkzzIVu08ykYNPESdZr3uqmq8fS/zPKUYvzv67x49cUkqvXDlJj1us/88gASuvcs6G7sUshY9SgWiOqu4OD1WQ7k7/sLoPKuLJjwZYFm9an+zPOnfNry9Jh49/XX3vN1sc731fBM9TnBDPHzOAD26/dS9mg57vY+TA7wVJCw9pPb1PE30l7019la9UyRTPXFqljyRDnw9eZ6nvU03kTtS9907L+wavIBtab3k6cs8KVr6vPZ5zTxy+Zs8VuopPQTTUj0tNxg96qZyPY69lTzQEp48BXGJvVopBDvskUg9G2dOPaJMXDylJZU8FxcMvBQkNzzjPKs8FYUpvepYYj1AQsK9upQsvS4037xDcO48GhmIvWb1iT1gJhy9TG7iPHKAG70cuCQ8F1ZwPYqtj7300T89rTujPbXy2r3/cK69FtBNvY3iMT0DoqI4KK0QPYKEqr2Z6RU9ni0UPUNDLb3BsCi8+GttvZYp9zwUaHe9TqrFPOnlH7yCXJC9U8vDu8u2MjxA8xs9SAGxvPpphr29y2e9y2AYvTv+Eb1Elus9DdpGPSfmNL39Ggu85RVXPZbLh70Jvna7XkLGvR230DtGjpu7Ih8HPJKnIz1o35i8x5NVvXwFNDzs/ZM8+
kw8PfFJSTwdlJA9ZJ+tvaoVZ7zTvVi8p6wluwh/IT0Kmg088o1rPRhiwjxpWIe9a+LuvYuYtjwAxE09WkPJPBuFh73UotY820JjvXpnQD3fJ/w8TM3JPOz0pTnbTim9tpe6PBHzJT1HEb66SkAKPasLgr1l/Mm8IOGgvM2pZbzwd4a9znOIO4d4Bb1DW5I8EZXzOxvBKDqKpHG9UwCHvd/Epb2cDRi9V1ztPNPBNTrLXHa8FdGHPPo+hb3DnJ08G+SvvVPQBL6zzrC8Omksvc+eIjyvGfU8eG9nvaVkdL1HBvs8eaeGPfcbVD1/Pfw8+TUFvU6aTL2JN5W8HXDNvGKFEj1i+T09UiCIOySbDD2x2/y7VTmnvTe3gb0ZhJw8WrKIuU5RGT09mKU7eFGtPFpr6DzaoyI9hsItPKU+YzuQlXK8f9IePSmUxTwXdoo9W6FJPV2kLzwkU1o8fGnfPInxg70rEVe9H7sNPWJDbbxSqLY8cQAOPUdpAD2YknK9ykFXPeVALz1mq3W96kO/PLERzjyXIRC7jxsXPRnLzjyUEoU7gTKvu+stlb1D1g45IH+2u5sOIj0wXPA8yTqDvT6mV72NsFq8ExeuPJlGyDxvjgk9lJeJvWSF8DwFvaW7oZ9GvHq1Rr1FJsk83zxVvfyGqTz7thG9fslpPF5RPb1Q6BQ9iXGovTeDeb2cmic8oBsRPYeni72TPcI8EKcPvfCJUbyQJqW9fCAYPRk8qT2q6rk8mEw2PfDeXL0=';
|
|
109
110
|
|
|
110
111
|
let transformersModule = null;
|
|
111
112
|
let sttPipeline = null;
|
|
@@ -116,13 +117,17 @@ let sttLoading = false;
|
|
|
116
117
|
let ttsLoading = false;
|
|
117
118
|
let speakerEmbeddingLoading = false;
|
|
118
119
|
let ttsLoadError = null;
|
|
120
|
+
let ttsLoadErrorTime = 0;
|
|
119
121
|
let sttLoadError = null;
|
|
120
122
|
const voiceEmbeddingsCache = new Map();
|
|
121
123
|
const SAMPLE_RATE_STT = 16000;
|
|
122
124
|
const SAMPLE_RATE_TTS = 16000;
|
|
125
|
+
const TTS_ERROR_RETRY_MS = 30000;
|
|
123
126
|
|
|
124
|
-
const
|
|
127
|
+
const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
|
|
128
|
+
let ttsCacheBytes = 0;
|
|
125
129
|
const ttsCache = new Map();
|
|
130
|
+
const ttsInflight = new Map();
|
|
126
131
|
|
|
127
132
|
async function loadTransformers() {
|
|
128
133
|
if (transformersModule) return transformersModule;
|
|
@@ -139,16 +144,36 @@ function whisperModelPath() {
|
|
|
139
144
|
return 'onnx-community/whisper-base';
|
|
140
145
|
}
|
|
141
146
|
|
|
147
|
+
/**
 * Decode the bundled fallback speaker embedding.
 *
 * The base64 payload in DEFAULT_EMBEDDING_B64 is decoded and its bytes are
 * copied into a fresh ArrayBuffer before being viewed as float32 values, so
 * the returned array owns its own zero-offset backing store.
 *
 * @returns {Float32Array} The decoded embedding.
 */
function defaultEmbedding() {
  const decoded = Buffer.from(DEFAULT_EMBEDDING_B64, 'base64');
  const ownedBytes = new Uint8Array(decoded);
  return new Float32Array(ownedBytes.buffer);
}
|
|
151
|
+
|
|
142
152
|
/**
 * Resolve the default speaker embedding, memoised in `speakerEmbeddings`.
 *
 * Sources are tried in order:
 *   1. the on-disk cache at SPEAKER_EMBEDDINGS_PATH (must be exactly 2048 bytes),
 *   2. a download from SPEAKER_EMBEDDINGS_URL,
 *   3. the bundled default from defaultEmbedding().
 *
 * Whatever is used from (2) or (3) is written back to the cache path on a
 * best-effort basis: a failed write is logged and does not fail the call.
 *
 * @returns {Promise<Float32Array>} The embedding to use when no voice-specific
 *   embedding is available.
 */
async function ensureSpeakerEmbeddings() {
  if (speakerEmbeddings) return speakerEmbeddings;

  // 512 float32 values x 4 bytes; the size the cache-read check requires.
  const EMBEDDING_BYTES = 2048;

  // Copy into a fresh ArrayBuffer before viewing the bytes as float32.
  const toFloat32 = (bytes) => new Float32Array(new Uint8Array(bytes).buffer);

  // Caching is an optimisation only: never let a write error discard an
  // embedding that is already usable in memory.
  const persist = (bytes) => {
    try {
      fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, bytes);
    } catch (err) {
      console.error('[TTS] Could not cache speaker embedding:', err.message);
    }
  };

  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
  if (fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
    const cachedBytes = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
    if (cachedBytes.length === EMBEDDING_BYTES) {
      speakerEmbeddings = toFloat32(cachedBytes);
      return speakerEmbeddings;
    }
  }

  try {
    const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
    if (!resp.ok) throw new Error('HTTP ' + resp.status);
    const downloaded = Buffer.from(await resp.arrayBuffer());
    if (downloaded.length < EMBEDDING_BYTES) {
      throw new Error('unexpected payload size ' + downloaded.length);
    }
    // Keep only the first EMBEDDING_BYTES so the cached file passes the exact
    // length check above on the next start, and so the byte length is always
    // a multiple of 4 (the Float32Array constructor throws otherwise).
    const embeddingBytes = downloaded.subarray(0, EMBEDDING_BYTES);
    speakerEmbeddings = toFloat32(embeddingBytes);
    persist(embeddingBytes);
    return speakerEmbeddings;
  } catch (err) {
    // Fall through to the bundled default, but say why.
    console.error('[TTS] Speaker embedding download failed:', err.message);
  }

  console.log('[TTS] Using bundled default speaker embedding');
  speakerEmbeddings = defaultEmbedding();
  persist(Buffer.from(speakerEmbeddings.buffer));
  return speakerEmbeddings;
}
|
|
154
179
|
|
|
@@ -167,25 +192,30 @@ async function loadVoiceEmbedding(voiceId) {
|
|
|
167
192
|
}
|
|
168
193
|
const offset = SPEAKER_OFFSETS[voiceId];
|
|
169
194
|
if (offset === undefined) return ensureSpeakerEmbeddings();
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
const
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
195
|
+
try {
|
|
196
|
+
const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
|
|
197
|
+
const resp = await fetch(url);
|
|
198
|
+
if (!resp.ok) throw new Error('HTTP ' + resp.status);
|
|
199
|
+
const data = await resp.json();
|
|
200
|
+
const avg = new Float32Array(512);
|
|
201
|
+
let count = 0;
|
|
202
|
+
for (const item of data.rows) {
|
|
203
|
+
const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
|
|
204
|
+
if (match && match[1] === voiceId) {
|
|
205
|
+
for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
|
|
206
|
+
count++;
|
|
207
|
+
}
|
|
181
208
|
}
|
|
209
|
+
if (count === 0) return ensureSpeakerEmbeddings();
|
|
210
|
+
for (let i = 0; i < 512; i++) avg[i] /= count;
|
|
211
|
+
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
212
|
+
fs.writeFileSync(binPath, Buffer.from(avg.buffer));
|
|
213
|
+
voiceEmbeddingsCache.set(voiceId, avg);
|
|
214
|
+
return avg;
|
|
215
|
+
} catch (err) {
|
|
216
|
+
console.error('[TTS] Failed to fetch voice embedding for ' + voiceId + ':', err.message);
|
|
217
|
+
return ensureSpeakerEmbeddings();
|
|
182
218
|
}
|
|
183
|
-
if (count === 0) return ensureSpeakerEmbeddings();
|
|
184
|
-
for (let i = 0; i < 512; i++) avg[i] /= count;
|
|
185
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
186
|
-
fs.writeFileSync(binPath, Buffer.from(avg.buffer));
|
|
187
|
-
voiceEmbeddingsCache.set(voiceId, avg);
|
|
188
|
-
return avg;
|
|
189
219
|
}
|
|
190
220
|
|
|
191
221
|
async function getSpeakerEmbeddingPipeline() {
|
|
@@ -230,6 +260,12 @@ async function decodeAudioFile(filePath) {
|
|
|
230
260
|
const decoded = decodeWavToFloat32(buf);
|
|
231
261
|
return resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
232
262
|
}
|
|
263
|
+
const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
|
|
264
|
+
if (fs.existsSync(wavPath)) {
|
|
265
|
+
const wavBuf = fs.readFileSync(wavPath);
|
|
266
|
+
const decoded = decodeWavToFloat32(wavBuf);
|
|
267
|
+
return resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
268
|
+
}
|
|
233
269
|
const decode = (await import('audio-decode')).default;
|
|
234
270
|
const audioBuffer = await decode(buf);
|
|
235
271
|
const mono = audioBuffer.getChannelData(0);
|
|
@@ -242,23 +278,28 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
|
242
278
|
console.error('[VOICES] Custom voice file not found for:', voiceId);
|
|
243
279
|
return ensureSpeakerEmbeddings();
|
|
244
280
|
}
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
281
|
+
try {
|
|
282
|
+
console.log('[VOICES] Generating embedding from:', audioFile);
|
|
283
|
+
const audio = await decodeAudioFile(audioFile);
|
|
284
|
+
if (audio.length < SAMPLE_RATE_STT * 0.5) {
|
|
285
|
+
throw new Error('Audio too short for embedding extraction');
|
|
286
|
+
}
|
|
287
|
+
const pipe = await getSpeakerEmbeddingPipeline();
|
|
288
|
+
const output = await pipe(audio, { pooling: 'mean', normalize: true });
|
|
289
|
+
const embedding = new Float32Array(512);
|
|
290
|
+
for (let i = 0; i < Math.min(512, output.data.length); i++) {
|
|
291
|
+
embedding[i] = output.data[i];
|
|
292
|
+
}
|
|
293
|
+
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
294
|
+
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
295
|
+
fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
|
|
296
|
+
voiceEmbeddingsCache.set(voiceId, embedding);
|
|
297
|
+
console.log('[VOICES] Generated embedding for custom voice:', voiceId);
|
|
298
|
+
return embedding;
|
|
299
|
+
} catch (err) {
|
|
300
|
+
console.error('[VOICES] Failed to generate embedding for', voiceId + ':', err.message);
|
|
301
|
+
return ensureSpeakerEmbeddings();
|
|
255
302
|
}
|
|
256
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
257
|
-
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
258
|
-
fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
|
|
259
|
-
voiceEmbeddingsCache.set(voiceId, embedding);
|
|
260
|
-
console.log('[VOICES] Generated embedding for custom voice:', voiceId);
|
|
261
|
-
return embedding;
|
|
262
303
|
}
|
|
263
304
|
|
|
264
305
|
async function getSTT() {
|
|
@@ -295,7 +336,11 @@ async function getSTT() {
|
|
|
295
336
|
|
|
296
337
|
async function getTTS() {
|
|
297
338
|
if (ttsPipeline) return ttsPipeline;
|
|
298
|
-
if (ttsLoadError)
|
|
339
|
+
if (ttsLoadError) {
|
|
340
|
+
if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
|
|
341
|
+
ttsLoadError = null;
|
|
342
|
+
ttsLoadErrorTime = 0;
|
|
343
|
+
}
|
|
299
344
|
if (ttsLoading) {
|
|
300
345
|
while (ttsLoading) await new Promise(r => setTimeout(r, 100));
|
|
301
346
|
if (ttsLoadError) throw ttsLoadError;
|
|
@@ -312,10 +357,12 @@ async function getTTS() {
|
|
|
312
357
|
});
|
|
313
358
|
await ensureSpeakerEmbeddings();
|
|
314
359
|
ttsLoadError = null;
|
|
360
|
+
ttsLoadErrorTime = 0;
|
|
315
361
|
return ttsPipeline;
|
|
316
362
|
} catch (err) {
|
|
317
363
|
ttsPipeline = null;
|
|
318
364
|
ttsLoadError = new Error('TTS model load failed: ' + err.message);
|
|
365
|
+
ttsLoadErrorTime = Date.now();
|
|
319
366
|
throw ttsLoadError;
|
|
320
367
|
} finally {
|
|
321
368
|
ttsLoading = false;
|
|
@@ -443,11 +490,17 @@ function splitSentences(text) {
|
|
|
443
490
|
}
|
|
444
491
|
|
|
445
492
|
/**
 * Store synthesized audio in the byte-budgeted TTS cache.
 *
 * `ttsCache` is a Map used in insertion order, so the first key is the oldest
 * entry. `ttsCacheBytes` tracks the summed `length` of everything stored and
 * is kept at or below TTS_CACHE_MAX_BYTES by evicting from the oldest end.
 *
 * @param {string} key - Cache key for the audio.
 * @param {{length: number}} buf - Encoded audio; only `length` is read here.
 */
function cachePut(key, buf) {
  // Replacing an entry: release its bytes and drop it so the new value is
  // re-inserted as the newest entry.
  if (ttsCache.has(key)) {
    ttsCacheBytes -= ttsCache.get(key).length;
    ttsCache.delete(key);
  }

  // An entry larger than the whole budget can never fit. Skip it instead of
  // evicting every other entry and then exceeding the budget anyway.
  if (buf.length > TTS_CACHE_MAX_BYTES) return;

  while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
    const oldest = ttsCache.keys().next().value;
    ttsCacheBytes -= ttsCache.get(oldest).length;
    ttsCache.delete(oldest);
  }

  ttsCache.set(key, buf);
  ttsCacheBytes += buf.length;
}
|
|
452
505
|
|
|
453
506
|
async function synthesize(text, voiceId) {
|
|
@@ -458,12 +511,18 @@ async function synthesize(text, voiceId) {
|
|
|
458
511
|
ttsCache.set(cacheKey, cached);
|
|
459
512
|
return cached;
|
|
460
513
|
}
|
|
461
|
-
const
|
|
462
|
-
|
|
463
|
-
const
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
514
|
+
const inflight = ttsInflight.get(cacheKey);
|
|
515
|
+
if (inflight) return inflight;
|
|
516
|
+
const promise = (async () => {
|
|
517
|
+
const tts = await getTTS();
|
|
518
|
+
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
519
|
+
const result = await tts(text, { speaker_embeddings: embeddings });
|
|
520
|
+
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
521
|
+
cachePut(cacheKey, wav);
|
|
522
|
+
return wav;
|
|
523
|
+
})();
|
|
524
|
+
ttsInflight.set(cacheKey, promise);
|
|
525
|
+
try { return await promise; } finally { ttsInflight.delete(cacheKey); }
|
|
467
526
|
}
|
|
468
527
|
|
|
469
528
|
async function* synthesizeStream(text, voiceId) {
|
|
@@ -487,14 +546,29 @@ async function* synthesizeStream(text, voiceId) {
|
|
|
487
546
|
}
|
|
488
547
|
|
|
489
548
|
/**
 * Snapshot of speech pipeline readiness.
 *
 * A TTS load error is reported only until TTS_ERROR_RETRY_MS has elapsed
 * since `ttsLoadErrorTime`; after that `ttsError` is null.
 *
 * @returns {{sttReady: boolean, ttsReady: boolean, sttLoading: boolean,
 *   ttsLoading: boolean, sttError: ?string, ttsError: ?string}}
 */
function getStatus() {
  let ttsError = null;
  if (ttsLoadError) {
    const retryWindowOver = Date.now() - ttsLoadErrorTime >= TTS_ERROR_RETRY_MS;
    if (!retryWindowOver) ttsError = ttsLoadError.message;
  }

  let sttError = null;
  if (sttLoadError) sttError = sttLoadError.message;

  return {
    sttReady: Boolean(sttPipeline),
    ttsReady: Boolean(ttsPipeline),
    sttLoading,
    ttsLoading,
    sttError,
    ttsError,
  };
}
|
|
499
559
|
|
|
500
|
-
|
|
560
|
+
/**
 * Start loading the TTS pipeline without waiting for it.
 *
 * Fire-and-forget: returns immediately; a failed load is logged rather than
 * propagated to the caller.
 */
function preloadTTS() {
  const reportFailure = (err) => {
    console.error('[TTS] Preload failed:', err.message);
  };
  getTTS().catch(reportFailure);
}
|
|
563
|
+
|
|
564
|
+
/**
 * Build the TTS cache key for a piece of text spoken in a given voice.
 *
 * @param {string} text - Text to be synthesized.
 * @param {string} [voiceId] - Voice identifier; any falsy value maps to 'default'.
 * @returns {string} Key of the form `<voice>:<text>`.
 */
function ttsCacheKey(text, voiceId) {
  const voice = voiceId || 'default';
  return voice + ':' + text;
}
|
|
567
|
+
|
|
568
|
+
/**
 * Look up cached audio and mark it as most recently used.
 *
 * @param {string} key - Cache key (see ttsCacheKey).
 * @returns {*} The cached value, or null when there is no (truthy) entry.
 */
function ttsCacheGet(key) {
  const hit = ttsCache.get(key);
  if (!hit) return null;

  // Delete and re-insert so the entry moves to the newest end of the Map's
  // iteration order.
  ttsCache.delete(key);
  ttsCache.set(key, hit);
  return hit;
}
|
|
573
|
+
|
|
574
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -15,6 +15,49 @@ async function getSpeech() {
|
|
|
15
15
|
return speechModule;
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
+
/**
 * Pre-synthesize speech for `text` and push the audio to subscribed clients.
 *
 * Fire-and-forget: returns immediately and never throws to the caller.
 * Nothing happens unless the TTS pipeline is already loaded and error-free.
 * Audio is produced per sentence, once for each distinct voice used by a
 * client subscribed to `sessionId` or to `conv-<conversationId>`. Sentences
 * already in the TTS cache are pushed straight away; the rest are pushed as
 * their synthesis completes.
 *
 * @param {string} text - Text to speak.
 * @param {string} conversationId - Conversation the text belongs to.
 * @param {string} sessionId - Session the text belongs to.
 */
function eagerTTS(text, conversationId, sessionId) {
  // Eager synthesis is best-effort, but a failure should still leave a trace.
  const reportFailure = (err) => {
    console.error('[TTS] Eager synthesis failed:', err && err.message ? err.message : err);
  };

  getSpeech().then(speech => {
    const status = speech.getStatus();
    if (!status.ttsReady || status.ttsError) return;

    const convKey = `conv-${conversationId}`;
    const voices = new Set();
    for (const ws of syncClients) {
      const subs = ws.subscriptions;
      if (subs && (subs.has(sessionId) || subs.has(convKey))) {
        voices.add(ws.ttsVoiceId || 'default');
      }
    }
    if (voices.size === 0) return;

    const sentences = speech.splitSentences(text);
    for (const vid of voices) {
      for (const sentence of sentences) {
        const cacheKey = speech.ttsCacheKey(sentence, vid);
        const cached = speech.ttsCacheGet(cacheKey);
        if (cached) {
          pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
          continue;
        }
        speech.synthesize(sentence, vid)
          .then(wav => {
            pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
          })
          .catch(reportFailure);
      }
    }
  }).catch(reportFailure);
}
|
|
47
|
+
|
|
48
|
+
/**
 * Broadcast one synthesized clip as a `tts_audio` sync message.
 *
 * @param {string} cacheKey - Key the clip is cached under.
 * @param {Buffer} wav - Encoded audio; sent base64-encoded.
 * @param {string} conversationId - Conversation the clip belongs to.
 * @param {string} sessionId - Session the clip belongs to.
 * @param {string} voiceId - Voice the clip was synthesized with.
 */
function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
  const message = {
    type: 'tts_audio',
    cacheKey,
    audio: wav.toString('base64'),
    voiceId,
    conversationId,
    sessionId,
    timestamp: Date.now()
  };
  broadcastSync(message);
}
|
|
60
|
+
|
|
18
61
|
const require = createRequire(import.meta.url);
|
|
19
62
|
const express = require('express');
|
|
20
63
|
const Busboy = require('busboy');
|
|
@@ -852,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
|
|
|
852
895
|
blockIndex: allBlocks.length - 1,
|
|
853
896
|
timestamp: Date.now()
|
|
854
897
|
});
|
|
898
|
+
|
|
899
|
+
if (block.type === 'text' && block.text) {
|
|
900
|
+
eagerTTS(block.text, conversationId, sessionId);
|
|
901
|
+
}
|
|
855
902
|
}
|
|
856
903
|
} else if (parsed.type === 'user' && parsed.message?.content) {
|
|
857
904
|
for (const block of parsed.message.content) {
|
|
@@ -900,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
|
|
|
900
947
|
timestamp: Date.now()
|
|
901
948
|
});
|
|
902
949
|
|
|
950
|
+
if (parsed.result) {
|
|
951
|
+
const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
|
|
952
|
+
if (resultText) eagerTTS(resultText, conversationId, sessionId);
|
|
953
|
+
}
|
|
954
|
+
|
|
903
955
|
if (parsed.result && allBlocks.length === 0) {
|
|
904
956
|
allBlocks.push({ type: 'text', text: String(parsed.result) });
|
|
905
957
|
}
|
|
@@ -1128,6 +1180,8 @@ wss.on('connection', (ws, req) => {
|
|
|
1128
1180
|
subscriptions: Array.from(ws.subscriptions),
|
|
1129
1181
|
timestamp: Date.now()
|
|
1130
1182
|
}));
|
|
1183
|
+
} else if (data.type === 'set_voice') {
|
|
1184
|
+
ws.ttsVoiceId = data.voiceId || 'default';
|
|
1131
1185
|
} else if (data.type === 'ping') {
|
|
1132
1186
|
ws.send(JSON.stringify({
|
|
1133
1187
|
type: 'pong',
|
package/static/js/voice.js
CHANGED
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
var spokenChunks = new Set();
|
|
16
16
|
var isLoadingHistory = false;
|
|
17
17
|
var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
|
|
18
|
+
var ttsAudioCache = new Map();
|
|
19
|
+
var TTS_CLIENT_CACHE_MAX = 50;
|
|
18
20
|
|
|
19
21
|
function init() {
|
|
20
22
|
setupTTSToggle();
|
|
@@ -69,6 +71,7 @@
|
|
|
69
71
|
selector.addEventListener('change', function() {
|
|
70
72
|
selectedVoiceId = selector.value;
|
|
71
73
|
localStorage.setItem('voice-selected-id', selectedVoiceId);
|
|
74
|
+
sendVoiceToServer();
|
|
72
75
|
});
|
|
73
76
|
}
|
|
74
77
|
|
|
@@ -295,12 +298,29 @@
|
|
|
295
298
|
processQueue();
|
|
296
299
|
}
|
|
297
300
|
|
|
301
|
+
/**
 * Store a server-pushed audio clip in the client-side cache as a WAV Blob.
 *
 * The cache holds at most TTS_CLIENT_CACHE_MAX entries; when a new key
 * arrives at capacity, the oldest entry (first in Map order) is evicted.
 * Re-caching an existing key replaces it in place of any eviction and makes
 * it the newest entry.
 *
 * @param {string} cacheKey - Key the clip is stored under.
 * @param {string} b64 - Base64-encoded audio bytes.
 */
function cacheTTSAudio(cacheKey, b64) {
  // Decode first: atob throws on malformed input, and the cache should be
  // left untouched when that happens.
  var binary = atob(b64);
  var bytes = new Uint8Array(binary.length);
  for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);

  if (ttsAudioCache.has(cacheKey)) {
    // Updating an entry must not push out an unrelated one; deleting it here
    // lets the set() below re-insert it at the newest position.
    ttsAudioCache.delete(cacheKey);
  } else if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
    var oldest = ttsAudioCache.keys().next().value;
    ttsAudioCache.delete(oldest);
  }
  ttsAudioCache.set(cacheKey, new Blob([bytes], { type: 'audio/wav' }));
}
|
|
311
|
+
|
|
312
|
+
/**
 * Look up a cached audio Blob for `text` in the currently selected voice.
 *
 * The lookup key is `<selectedVoiceId>:<text>`.
 * NOTE(review): the server appears to key pushed audio per sentence, so
 * multi-sentence text may never match here - confirm against the server.
 *
 * @param {string} text - Text about to be spoken.
 * @returns {?Blob} The cached Blob, or null when none is cached.
 */
function getCachedTTSBlob(text) {
  var blob = ttsAudioCache.get(selectedVoiceId + ':' + text);
  return blob ? blob : null;
}
|
|
316
|
+
|
|
298
317
|
var audioChunkQueue = [];
|
|
299
318
|
var isPlayingChunk = false;
|
|
300
319
|
var streamDone = false;
|
|
301
320
|
var ttsConsecutiveFailures = 0;
|
|
302
321
|
var TTS_MAX_FAILURES = 3;
|
|
303
322
|
var ttsDisabledUntilReset = false;
|
|
323
|
+
var streamingSupported = true;
|
|
304
324
|
|
|
305
325
|
function playNextChunk() {
|
|
306
326
|
if (audioChunkQueue.length === 0) {
|
|
@@ -344,6 +364,15 @@
|
|
|
344
364
|
audioChunkQueue = [];
|
|
345
365
|
isPlayingChunk = false;
|
|
346
366
|
|
|
367
|
+
var cachedBlob = getCachedTTSBlob(text);
|
|
368
|
+
if (cachedBlob) {
|
|
369
|
+
ttsConsecutiveFailures = 0;
|
|
370
|
+
audioChunkQueue.push(cachedBlob);
|
|
371
|
+
streamDone = true;
|
|
372
|
+
if (!isPlayingChunk) playNextChunk();
|
|
373
|
+
return;
|
|
374
|
+
}
|
|
375
|
+
|
|
347
376
|
function onTtsSuccess() {
|
|
348
377
|
ttsConsecutiveFailures = 0;
|
|
349
378
|
}
|
|
@@ -363,12 +392,16 @@
|
|
|
363
392
|
}
|
|
364
393
|
|
|
365
394
|
function tryStreaming() {
|
|
395
|
+
if (!streamingSupported) { tryNonStreaming(text); return; }
|
|
366
396
|
fetch(BASE + '/api/tts-stream', {
|
|
367
397
|
method: 'POST',
|
|
368
398
|
headers: { 'Content-Type': 'application/json' },
|
|
369
399
|
body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
|
|
370
400
|
}).then(function(resp) {
|
|
371
|
-
if (!resp.ok)
|
|
401
|
+
if (!resp.ok) {
|
|
402
|
+
streamingSupported = false;
|
|
403
|
+
throw new Error('TTS stream failed: ' + resp.status);
|
|
404
|
+
}
|
|
372
405
|
var reader = resp.body.getReader();
|
|
373
406
|
var buffer = new Uint8Array(0);
|
|
374
407
|
|
|
@@ -532,11 +565,23 @@
|
|
|
532
565
|
}
|
|
533
566
|
}
|
|
534
567
|
|
|
568
|
+
/**
 * Tell the server which voice this client has selected.
 *
 * Sends a `set_voice` message over the app's WebSocket manager. Does nothing
 * when the global client, its manager, or a live connection is missing.
 */
function sendVoiceToServer() {
  if (typeof agentGUIClient === 'undefined' || !agentGUIClient) return;

  var manager = agentGUIClient.wsManager;
  if (!manager || !manager.isConnected) return;

  manager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
}
|
|
573
|
+
|
|
535
574
|
function setupStreamingListener() {
|
|
536
575
|
window.addEventListener('ws-message', function(e) {
|
|
537
|
-
if (!voiceActive) return;
|
|
538
576
|
var data = e.detail;
|
|
539
577
|
if (!data) return;
|
|
578
|
+
if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
|
|
579
|
+
cacheTTSAudio(data.cacheKey, data.audio);
|
|
580
|
+
}
|
|
581
|
+
if (data.type === 'sync_connected') {
|
|
582
|
+
sendVoiceToServer();
|
|
583
|
+
}
|
|
584
|
+
if (!voiceActive) return;
|
|
540
585
|
if (data.type === 'streaming_progress' && data.block) {
|
|
541
586
|
handleVoiceBlock(data.block, true);
|
|
542
587
|
}
|