agentgui 1.0.174 → 1.0.176
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +59 -11
- package/package.json +1 -1
- package/server.js +74 -6
- package/static/js/voice.js +76 -9
package/lib/speech.js
CHANGED
|
@@ -115,12 +115,18 @@ let speakerEmbeddingPipeline = null;
|
|
|
115
115
|
let sttLoading = false;
|
|
116
116
|
let ttsLoading = false;
|
|
117
117
|
let speakerEmbeddingLoading = false;
|
|
118
|
+
let ttsLoadError = null;
|
|
119
|
+
let ttsLoadErrorTime = 0;
|
|
120
|
+
let sttLoadError = null;
|
|
118
121
|
const voiceEmbeddingsCache = new Map();
|
|
119
122
|
const SAMPLE_RATE_STT = 16000;
|
|
120
123
|
const SAMPLE_RATE_TTS = 16000;
|
|
124
|
+
const TTS_ERROR_RETRY_MS = 30000;
|
|
121
125
|
|
|
122
|
-
const
|
|
126
|
+
const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
|
|
127
|
+
let ttsCacheBytes = 0;
|
|
123
128
|
const ttsCache = new Map();
|
|
129
|
+
const ttsInflight = new Map();
|
|
124
130
|
|
|
125
131
|
async function loadTransformers() {
|
|
126
132
|
if (transformersModule) return transformersModule;
|
|
@@ -261,8 +267,10 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
|
261
267
|
|
|
262
268
|
async function getSTT() {
|
|
263
269
|
if (sttPipeline) return sttPipeline;
|
|
270
|
+
if (sttLoadError) throw sttLoadError;
|
|
264
271
|
if (sttLoading) {
|
|
265
272
|
while (sttLoading) await new Promise(r => setTimeout(r, 100));
|
|
273
|
+
if (sttLoadError) throw sttLoadError;
|
|
266
274
|
if (!sttPipeline) throw new Error('STT pipeline failed to load');
|
|
267
275
|
return sttPipeline;
|
|
268
276
|
}
|
|
@@ -278,10 +286,12 @@ async function getSTT() {
|
|
|
278
286
|
device: 'cpu',
|
|
279
287
|
local_files_only: isLocal,
|
|
280
288
|
});
|
|
289
|
+
sttLoadError = null;
|
|
281
290
|
return sttPipeline;
|
|
282
291
|
} catch (err) {
|
|
283
292
|
sttPipeline = null;
|
|
284
|
-
|
|
293
|
+
sttLoadError = new Error('STT model load failed: ' + err.message);
|
|
294
|
+
throw sttLoadError;
|
|
285
295
|
} finally {
|
|
286
296
|
sttLoading = false;
|
|
287
297
|
}
|
|
@@ -289,8 +299,14 @@ async function getSTT() {
|
|
|
289
299
|
|
|
290
300
|
async function getTTS() {
|
|
291
301
|
if (ttsPipeline) return ttsPipeline;
|
|
302
|
+
if (ttsLoadError) {
|
|
303
|
+
if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
|
|
304
|
+
ttsLoadError = null;
|
|
305
|
+
ttsLoadErrorTime = 0;
|
|
306
|
+
}
|
|
292
307
|
if (ttsLoading) {
|
|
293
308
|
while (ttsLoading) await new Promise(r => setTimeout(r, 100));
|
|
309
|
+
if (ttsLoadError) throw ttsLoadError;
|
|
294
310
|
if (!ttsPipeline) throw new Error('TTS pipeline failed to load');
|
|
295
311
|
return ttsPipeline;
|
|
296
312
|
}
|
|
@@ -303,10 +319,14 @@ async function getTTS() {
|
|
|
303
319
|
dtype: 'fp32',
|
|
304
320
|
});
|
|
305
321
|
await ensureSpeakerEmbeddings();
|
|
322
|
+
ttsLoadError = null;
|
|
323
|
+
ttsLoadErrorTime = 0;
|
|
306
324
|
return ttsPipeline;
|
|
307
325
|
} catch (err) {
|
|
308
326
|
ttsPipeline = null;
|
|
309
|
-
|
|
327
|
+
ttsLoadError = new Error('TTS model load failed: ' + err.message);
|
|
328
|
+
ttsLoadErrorTime = Date.now();
|
|
329
|
+
throw ttsLoadError;
|
|
310
330
|
} finally {
|
|
311
331
|
ttsLoading = false;
|
|
312
332
|
}
|
|
@@ -433,11 +453,17 @@ function splitSentences(text) {
|
|
|
433
453
|
}
|
|
434
454
|
|
|
435
455
|
/**
 * Stores an encoded audio buffer in the TTS cache while keeping the total
 * cached size within TTS_CACHE_MAX_BYTES.
 *
 * Any previous value under `key` is removed first. Buffers larger than the
 * whole budget are not cached at all.
 *
 * @param {string} key - Cache key for the entry.
 * @param {{length: number}} buf - Encoded audio; `length` is charged against the byte budget.
 * @returns {void}
 */
function cachePut(key, buf) {
  // Release the bytes of the value being replaced so they are not counted twice.
  if (ttsCache.has(key)) {
    ttsCacheBytes -= ttsCache.get(key).length;
    ttsCache.delete(key);
  }

  // An entry bigger than the entire budget can never fit: caching it would
  // first evict every other entry and then still exceed the cap.
  if (buf.length > TTS_CACHE_MAX_BYTES) return;

  // A Map iterates in insertion order, so the first key is the oldest entry.
  while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
    const oldest = ttsCache.keys().next().value;
    ttsCacheBytes -= ttsCache.get(oldest).length;
    ttsCache.delete(oldest);
  }

  ttsCache.set(key, buf);
  ttsCacheBytes += buf.length;
}
|
|
442
468
|
|
|
443
469
|
async function synthesize(text, voiceId) {
|
|
@@ -448,12 +474,18 @@ async function synthesize(text, voiceId) {
|
|
|
448
474
|
ttsCache.set(cacheKey, cached);
|
|
449
475
|
return cached;
|
|
450
476
|
}
|
|
451
|
-
const
|
|
452
|
-
|
|
453
|
-
const
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
477
|
+
const inflight = ttsInflight.get(cacheKey);
|
|
478
|
+
if (inflight) return inflight;
|
|
479
|
+
const promise = (async () => {
|
|
480
|
+
const tts = await getTTS();
|
|
481
|
+
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
482
|
+
const result = await tts(text, { speaker_embeddings: embeddings });
|
|
483
|
+
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
484
|
+
cachePut(cacheKey, wav);
|
|
485
|
+
return wav;
|
|
486
|
+
})();
|
|
487
|
+
ttsInflight.set(cacheKey, promise);
|
|
488
|
+
try { return await promise; } finally { ttsInflight.delete(cacheKey); }
|
|
457
489
|
}
|
|
458
490
|
|
|
459
491
|
async function* synthesizeStream(text, voiceId) {
|
|
@@ -482,7 +514,23 @@ function getStatus() {
|
|
|
482
514
|
ttsReady: !!ttsPipeline,
|
|
483
515
|
sttLoading,
|
|
484
516
|
ttsLoading,
|
|
517
|
+
sttError: sttLoadError ? sttLoadError.message : null,
|
|
518
|
+
ttsError: ttsLoadError ? ttsLoadError.message : null,
|
|
485
519
|
};
|
|
486
520
|
}
|
|
487
521
|
|
|
488
|
-
|
|
522
|
+
/**
 * Starts loading the TTS pipeline in the background.
 *
 * The load is not awaited; a failure is reported on the console and never
 * propagates to the caller.
 *
 * @returns {void}
 */
function preloadTTS() {
  const reportFailure = (err) => {
    console.error('[TTS] Preload failed:', err.message);
  };
  getTTS().catch(reportFailure);
}
|
|
525
|
+
|
|
526
|
+
/**
 * Builds the TTS cache key for a piece of text spoken with a given voice.
 *
 * @param {string} text - Text to be synthesized.
 * @param {string} [voiceId] - Voice identifier; a falsy value selects 'default'.
 * @returns {string} Key of the form `<voice>:<text>`.
 */
function ttsCacheKey(text, voiceId) {
  const voicePrefix = (voiceId ? voiceId : 'default') + ':';
  return voicePrefix + text;
}
|
|
529
|
+
|
|
530
|
+
/**
 * Looks up a cached TTS buffer.
 *
 * On a hit the entry is removed and re-inserted, which moves it to the end of
 * the Map's insertion order.
 *
 * @param {string} key - Cache key to look up.
 * @returns {*} The cached value, or null when there is no (truthy) entry.
 */
function ttsCacheGet(key) {
  const hit = ttsCache.get(key);
  if (!hit) return null;
  ttsCache.delete(key);
  ttsCache.set(key, hit);
  return hit;
}
|
|
535
|
+
|
|
536
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -15,6 +15,49 @@ async function getSpeech() {
|
|
|
15
15
|
return speechModule;
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
+
/**
 * Synthesizes speech for `text` ahead of any client request and pushes the
 * audio to connected clients via pushTTSAudio.
 *
 * Nothing happens unless the TTS pipeline is already loaded without error and
 * at least one sync client is subscribed to the session or the conversation.
 * The text is split into sentences and each sentence is synthesized (or taken
 * from the cache) once per distinct voice among those subscribers.
 *
 * This is best-effort: failures are logged and never thrown to the caller.
 *
 * @param {string} text - Text to speak.
 * @param {string} conversationId - Conversation the text belongs to.
 * @param {string} sessionId - Session the text belongs to.
 * @returns {Promise<void>} Settles once every push has been attempted; never rejects.
 */
function eagerTTS(text, conversationId, sessionId) {
  const convKey = `conv-${conversationId}`;
  const logFailure = (err) => {
    console.error('[TTS] Eager synthesis failed:', err && err.message ? err.message : err);
  };

  return getSpeech().then((speech) => {
    const status = speech.getStatus();
    if (!status.ttsReady || status.ttsError) return undefined;

    // Collect the distinct voices of clients that will actually see this text.
    const voices = new Set();
    for (const ws of syncClients) {
      const subs = ws.subscriptions;
      if (subs && (subs.has(sessionId) || subs.has(convKey))) {
        voices.add(ws.ttsVoiceId || 'default');
      }
    }
    if (voices.size === 0) return undefined;

    const sentences = speech.splitSentences(text);
    const pending = [];
    for (const vid of voices) {
      for (const sentence of sentences) {
        const cacheKey = speech.ttsCacheKey(sentence, vid);
        const cached = speech.ttsCacheGet(cacheKey);
        if (cached) {
          pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
          continue;
        }
        // Each sentence fails independently so one bad sentence does not
        // prevent the others from being pushed.
        pending.push(
          speech.synthesize(sentence, vid).then((wav) => {
            pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
          }).catch(logFailure)
        );
      }
    }
    return Promise.all(pending).then(() => undefined);
  }).catch(logFailure);
}
|
|
47
|
+
|
|
48
|
+
/**
 * Broadcasts one synthesized audio clip as a 'tts_audio' sync message.
 *
 * @param {string} cacheKey - Key identifying the clip.
 * @param {Buffer} wav - Audio data; sent base64-encoded.
 * @param {string} conversationId - Conversation the audio belongs to.
 * @param {string} sessionId - Session the audio belongs to.
 * @param {string} voiceId - Voice the audio was synthesized with.
 * @returns {void}
 */
function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
  const message = {
    type: 'tts_audio',
    cacheKey,
    audio: wav.toString('base64'),
    voiceId,
    conversationId,
    sessionId,
    timestamp: Date.now()
  };
  broadcastSync(message);
}
|
|
60
|
+
|
|
18
61
|
const require = createRequire(import.meta.url);
|
|
19
62
|
const express = require('express');
|
|
20
63
|
const Busboy = require('busboy');
|
|
@@ -554,13 +597,20 @@ const server = http.createServer(async (req, res) => {
|
|
|
554
597
|
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
555
598
|
return;
|
|
556
599
|
}
|
|
557
|
-
const
|
|
558
|
-
const
|
|
600
|
+
const speech = await getSpeech();
|
|
601
|
+
const status = speech.getStatus();
|
|
602
|
+
if (status.ttsError) {
|
|
603
|
+
sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
|
|
604
|
+
return;
|
|
605
|
+
}
|
|
606
|
+
const wavBuffer = await speech.synthesize(text, voiceId);
|
|
559
607
|
res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
|
|
560
608
|
res.end(wavBuffer);
|
|
561
609
|
} catch (err) {
|
|
562
610
|
debugLog('[TTS] Error: ' + err.message);
|
|
563
|
-
|
|
611
|
+
const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
|
|
612
|
+
const statusCode = isModelError ? 503 : 500;
|
|
613
|
+
if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS failed', retryable: !isModelError });
|
|
564
614
|
}
|
|
565
615
|
return;
|
|
566
616
|
}
|
|
@@ -574,14 +624,19 @@ const server = http.createServer(async (req, res) => {
|
|
|
574
624
|
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
575
625
|
return;
|
|
576
626
|
}
|
|
577
|
-
const
|
|
627
|
+
const speech = await getSpeech();
|
|
628
|
+
const status = speech.getStatus();
|
|
629
|
+
if (status.ttsError) {
|
|
630
|
+
sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
|
|
631
|
+
return;
|
|
632
|
+
}
|
|
578
633
|
res.writeHead(200, {
|
|
579
634
|
'Content-Type': 'application/octet-stream',
|
|
580
635
|
'Transfer-Encoding': 'chunked',
|
|
581
636
|
'X-Content-Type': 'audio/wav-stream',
|
|
582
637
|
'Cache-Control': 'no-cache'
|
|
583
638
|
});
|
|
584
|
-
for await (const wavChunk of synthesizeStream(text, voiceId)) {
|
|
639
|
+
for await (const wavChunk of speech.synthesizeStream(text, voiceId)) {
|
|
585
640
|
const lenBuf = Buffer.alloc(4);
|
|
586
641
|
lenBuf.writeUInt32BE(wavChunk.length, 0);
|
|
587
642
|
res.write(lenBuf);
|
|
@@ -590,7 +645,9 @@ const server = http.createServer(async (req, res) => {
|
|
|
590
645
|
res.end();
|
|
591
646
|
} catch (err) {
|
|
592
647
|
debugLog('[TTS-STREAM] Error: ' + err.message);
|
|
593
|
-
|
|
648
|
+
const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
|
|
649
|
+
const statusCode = isModelError ? 503 : 500;
|
|
650
|
+
if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS stream failed', retryable: !isModelError });
|
|
594
651
|
else res.end();
|
|
595
652
|
}
|
|
596
653
|
return;
|
|
@@ -838,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
|
|
|
838
895
|
blockIndex: allBlocks.length - 1,
|
|
839
896
|
timestamp: Date.now()
|
|
840
897
|
});
|
|
898
|
+
|
|
899
|
+
if (block.type === 'text' && block.text) {
|
|
900
|
+
eagerTTS(block.text, conversationId, sessionId);
|
|
901
|
+
}
|
|
841
902
|
}
|
|
842
903
|
} else if (parsed.type === 'user' && parsed.message?.content) {
|
|
843
904
|
for (const block of parsed.message.content) {
|
|
@@ -886,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
|
|
|
886
947
|
timestamp: Date.now()
|
|
887
948
|
});
|
|
888
949
|
|
|
950
|
+
if (parsed.result) {
|
|
951
|
+
const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
|
|
952
|
+
if (resultText) eagerTTS(resultText, conversationId, sessionId);
|
|
953
|
+
}
|
|
954
|
+
|
|
889
955
|
if (parsed.result && allBlocks.length === 0) {
|
|
890
956
|
allBlocks.push({ type: 'text', text: String(parsed.result) });
|
|
891
957
|
}
|
|
@@ -1114,6 +1180,8 @@ wss.on('connection', (ws, req) => {
|
|
|
1114
1180
|
subscriptions: Array.from(ws.subscriptions),
|
|
1115
1181
|
timestamp: Date.now()
|
|
1116
1182
|
}));
|
|
1183
|
+
} else if (data.type === 'set_voice') {
|
|
1184
|
+
ws.ttsVoiceId = data.voiceId || 'default';
|
|
1117
1185
|
} else if (data.type === 'ping') {
|
|
1118
1186
|
ws.send(JSON.stringify({
|
|
1119
1187
|
type: 'pong',
|
package/static/js/voice.js
CHANGED
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
var spokenChunks = new Set();
|
|
16
16
|
var isLoadingHistory = false;
|
|
17
17
|
var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
|
|
18
|
+
var ttsAudioCache = new Map();
|
|
19
|
+
var TTS_CLIENT_CACHE_MAX = 50;
|
|
18
20
|
|
|
19
21
|
function init() {
|
|
20
22
|
setupTTSToggle();
|
|
@@ -69,6 +71,7 @@
|
|
|
69
71
|
selector.addEventListener('change', function() {
|
|
70
72
|
selectedVoiceId = selector.value;
|
|
71
73
|
localStorage.setItem('voice-selected-id', selectedVoiceId);
|
|
74
|
+
sendVoiceToServer();
|
|
72
75
|
});
|
|
73
76
|
}
|
|
74
77
|
|
|
@@ -295,9 +298,28 @@
|
|
|
295
298
|
processQueue();
|
|
296
299
|
}
|
|
297
300
|
|
|
301
|
+
/**
 * Decodes base64 WAV data and stores it as a Blob in the client-side TTS
 * cache, keeping at most TTS_CLIENT_CACHE_MAX entries.
 *
 * @param {string} cacheKey - Key under which the audio is stored.
 * @param {string} b64 - Base64-encoded WAV bytes.
 * @throws If `b64` is not valid base64 (from atob); the cache is left untouched.
 */
function cacheTTSAudio(cacheKey, b64) {
  // Decode before touching the cache so a malformed payload cannot evict
  // a good entry.
  var binary = atob(b64);
  var bytes = new Uint8Array(binary.length);
  for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
  var blob = new Blob([bytes], { type: 'audio/wav' });

  // Replacing an existing key must not evict an unrelated entry; deleting it
  // first also moves the refreshed entry to the newest position.
  ttsAudioCache.delete(cacheKey);

  // A Map iterates in insertion order, so the first key is the oldest entry.
  while (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX && ttsAudioCache.size > 0) {
    var oldest = ttsAudioCache.keys().next().value;
    ttsAudioCache.delete(oldest);
  }
  ttsAudioCache.set(cacheKey, blob);
}
|
|
311
|
+
|
|
312
|
+
/**
 * Returns the cached audio for `text` in the currently selected voice.
 *
 * @param {string} text - Text whose audio is wanted.
 * @returns {*} The cached entry, or null when nothing is cached for it.
 */
function getCachedTTSBlob(text) {
  var hit = ttsAudioCache.get(selectedVoiceId + ':' + text);
  return hit ? hit : null;
}
|
|
316
|
+
|
|
298
317
|
var audioChunkQueue = [];
|
|
299
318
|
var isPlayingChunk = false;
|
|
300
319
|
var streamDone = false;
|
|
320
|
+
var ttsConsecutiveFailures = 0;
|
|
321
|
+
var TTS_MAX_FAILURES = 3;
|
|
322
|
+
var ttsDisabledUntilReset = false;
|
|
301
323
|
|
|
302
324
|
function playNextChunk() {
|
|
303
325
|
if (audioChunkQueue.length === 0) {
|
|
@@ -331,19 +353,50 @@
|
|
|
331
353
|
|
|
332
354
|
function processQueue() {
|
|
333
355
|
if (isSpeaking || speechQueue.length === 0) return;
|
|
356
|
+
if (ttsDisabledUntilReset) {
|
|
357
|
+
speechQueue = [];
|
|
358
|
+
return;
|
|
359
|
+
}
|
|
334
360
|
isSpeaking = true;
|
|
335
361
|
streamDone = false;
|
|
336
362
|
var text = speechQueue.shift();
|
|
337
363
|
audioChunkQueue = [];
|
|
338
364
|
isPlayingChunk = false;
|
|
339
|
-
|
|
365
|
+
|
|
366
|
+
var cachedBlob = getCachedTTSBlob(text);
|
|
367
|
+
if (cachedBlob) {
|
|
368
|
+
ttsConsecutiveFailures = 0;
|
|
369
|
+
audioChunkQueue.push(cachedBlob);
|
|
370
|
+
streamDone = true;
|
|
371
|
+
if (!isPlayingChunk) playNextChunk();
|
|
372
|
+
return;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
function onTtsSuccess() {
|
|
376
|
+
ttsConsecutiveFailures = 0;
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
function onTtsFailed() {
|
|
380
|
+
ttsConsecutiveFailures++;
|
|
381
|
+
if (ttsConsecutiveFailures >= TTS_MAX_FAILURES) {
|
|
382
|
+
console.warn('[Voice] TTS failed ' + ttsConsecutiveFailures + ' times consecutively, disabling until reset');
|
|
383
|
+
ttsDisabledUntilReset = true;
|
|
384
|
+
speechQueue = [];
|
|
385
|
+
}
|
|
386
|
+
streamDone = true;
|
|
387
|
+
isSpeaking = false;
|
|
388
|
+
if (!ttsDisabledUntilReset) {
|
|
389
|
+
processQueue();
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
|
|
340
393
|
function tryStreaming() {
|
|
341
394
|
fetch(BASE + '/api/tts-stream', {
|
|
342
395
|
method: 'POST',
|
|
343
396
|
headers: { 'Content-Type': 'application/json' },
|
|
344
397
|
body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
|
|
345
398
|
}).then(function(resp) {
|
|
346
|
-
if (!resp.ok) throw new Error('TTS stream failed');
|
|
399
|
+
if (!resp.ok) throw new Error('TTS stream failed: ' + resp.status);
|
|
347
400
|
var reader = resp.body.getReader();
|
|
348
401
|
var buffer = new Uint8Array(0);
|
|
349
402
|
|
|
@@ -357,6 +410,7 @@
|
|
|
357
410
|
function pump() {
|
|
358
411
|
return reader.read().then(function(result) {
|
|
359
412
|
if (result.done) {
|
|
413
|
+
onTtsSuccess();
|
|
360
414
|
streamDone = true;
|
|
361
415
|
if (!isPlayingChunk && audioChunkQueue.length === 0) {
|
|
362
416
|
isSpeaking = false;
|
|
@@ -384,16 +438,17 @@
|
|
|
384
438
|
tryNonStreaming(text);
|
|
385
439
|
});
|
|
386
440
|
}
|
|
387
|
-
|
|
441
|
+
|
|
388
442
|
function tryNonStreaming(txt) {
|
|
389
443
|
fetch(BASE + '/api/tts', {
|
|
390
444
|
method: 'POST',
|
|
391
445
|
headers: { 'Content-Type': 'application/json' },
|
|
392
446
|
body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
|
|
393
447
|
}).then(function(resp) {
|
|
394
|
-
if (!resp.ok) throw new Error('TTS failed');
|
|
448
|
+
if (!resp.ok) throw new Error('TTS failed: ' + resp.status);
|
|
395
449
|
return resp.arrayBuffer();
|
|
396
450
|
}).then(function(buf) {
|
|
451
|
+
onTtsSuccess();
|
|
397
452
|
var blob = new Blob([buf], { type: 'audio/wav' });
|
|
398
453
|
audioChunkQueue.push(blob);
|
|
399
454
|
if (!isPlayingChunk) playNextChunk();
|
|
@@ -401,12 +456,10 @@
|
|
|
401
456
|
isSpeaking = false;
|
|
402
457
|
processQueue();
|
|
403
458
|
}).catch(function() {
|
|
404
|
-
|
|
405
|
-
isSpeaking = false;
|
|
406
|
-
processQueue();
|
|
459
|
+
onTtsFailed();
|
|
407
460
|
});
|
|
408
461
|
}
|
|
409
|
-
|
|
462
|
+
|
|
410
463
|
tryStreaming();
|
|
411
464
|
}
|
|
412
465
|
|
|
@@ -415,6 +468,8 @@
|
|
|
415
468
|
audioChunkQueue = [];
|
|
416
469
|
isPlayingChunk = false;
|
|
417
470
|
isSpeaking = false;
|
|
471
|
+
ttsConsecutiveFailures = 0;
|
|
472
|
+
ttsDisabledUntilReset = false;
|
|
418
473
|
if (currentAudio) {
|
|
419
474
|
currentAudio.pause();
|
|
420
475
|
currentAudio = null;
|
|
@@ -505,11 +560,23 @@
|
|
|
505
560
|
}
|
|
506
561
|
}
|
|
507
562
|
|
|
563
|
+
/**
 * Tells the server which voice this client has selected by sending a
 * 'set_voice' message over the WebSocket.
 *
 * Does nothing when the global client, its wsManager, or a live connection
 * is not available.
 */
function sendVoiceToServer() {
  if (typeof agentGUIClient === 'undefined' || !agentGUIClient) return;
  var manager = agentGUIClient.wsManager;
  if (!manager || !manager.isConnected) return;
  manager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
}
|
|
568
|
+
|
|
508
569
|
function setupStreamingListener() {
|
|
509
570
|
window.addEventListener('ws-message', function(e) {
|
|
510
|
-
if (!voiceActive) return;
|
|
511
571
|
var data = e.detail;
|
|
512
572
|
if (!data) return;
|
|
573
|
+
if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
|
|
574
|
+
cacheTTSAudio(data.cacheKey, data.audio);
|
|
575
|
+
}
|
|
576
|
+
if (data.type === 'sync_connected') {
|
|
577
|
+
sendVoiceToServer();
|
|
578
|
+
}
|
|
579
|
+
if (!voiceActive) return;
|
|
513
580
|
if (data.type === 'streaming_progress' && data.block) {
|
|
514
581
|
handleVoiceBlock(data.block, true);
|
|
515
582
|
}
|