agentgui 1.0.175 → 1.0.176
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +46 -10
- package/package.json +1 -1
- package/server.js +54 -0
- package/static/js/voice.js +41 -1
package/lib/speech.js
CHANGED
|
@@ -116,13 +116,17 @@ let sttLoading = false;
|
|
|
116
116
|
let ttsLoading = false;
|
|
117
117
|
let speakerEmbeddingLoading = false;
|
|
118
118
|
let ttsLoadError = null;
|
|
119
|
+
let ttsLoadErrorTime = 0;
|
|
119
120
|
let sttLoadError = null;
|
|
120
121
|
const voiceEmbeddingsCache = new Map();
|
|
121
122
|
const SAMPLE_RATE_STT = 16000;
|
|
122
123
|
const SAMPLE_RATE_TTS = 16000;
|
|
124
|
+
const TTS_ERROR_RETRY_MS = 30000;
|
|
123
125
|
|
|
124
|
-
const
|
|
126
|
+
const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
|
|
127
|
+
let ttsCacheBytes = 0;
|
|
125
128
|
const ttsCache = new Map();
|
|
129
|
+
const ttsInflight = new Map();
|
|
126
130
|
|
|
127
131
|
async function loadTransformers() {
|
|
128
132
|
if (transformersModule) return transformersModule;
|
|
@@ -295,7 +299,11 @@ async function getSTT() {
|
|
|
295
299
|
|
|
296
300
|
async function getTTS() {
|
|
297
301
|
if (ttsPipeline) return ttsPipeline;
|
|
298
|
-
if (ttsLoadError)
|
|
302
|
+
if (ttsLoadError) {
|
|
303
|
+
if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
|
|
304
|
+
ttsLoadError = null;
|
|
305
|
+
ttsLoadErrorTime = 0;
|
|
306
|
+
}
|
|
299
307
|
if (ttsLoading) {
|
|
300
308
|
while (ttsLoading) await new Promise(r => setTimeout(r, 100));
|
|
301
309
|
if (ttsLoadError) throw ttsLoadError;
|
|
@@ -312,10 +320,12 @@ async function getTTS() {
|
|
|
312
320
|
});
|
|
313
321
|
await ensureSpeakerEmbeddings();
|
|
314
322
|
ttsLoadError = null;
|
|
323
|
+
ttsLoadErrorTime = 0;
|
|
315
324
|
return ttsPipeline;
|
|
316
325
|
} catch (err) {
|
|
317
326
|
ttsPipeline = null;
|
|
318
327
|
ttsLoadError = new Error('TTS model load failed: ' + err.message);
|
|
328
|
+
ttsLoadErrorTime = Date.now();
|
|
319
329
|
throw ttsLoadError;
|
|
320
330
|
} finally {
|
|
321
331
|
ttsLoading = false;
|
|
@@ -443,11 +453,17 @@ function splitSentences(text) {
|
|
|
443
453
|
}
|
|
444
454
|
|
|
445
455
|
/**
 * Store an encoded audio buffer in the TTS cache while keeping the total
 * cached size within TTS_CACHE_MAX_BYTES.
 *
 * Entries are evicted from the front of the Map (oldest insertion first)
 * until the new buffer fits. A buffer that is larger than the whole budget
 * is not cached at all: storing it would flush every other entry and still
 * leave the cache over its limit.
 *
 * @param {string} key - Cache key identifying the audio.
 * @param {{length: number}} buf - Encoded audio; only `length` is read here.
 */
function cachePut(key, buf) {
  // Replacing an existing entry: release its bytes first so the running
  // total stays accurate and the key moves to the newest position.
  if (ttsCache.has(key)) {
    ttsCacheBytes -= ttsCache.get(key).length;
    ttsCache.delete(key);
  }

  // Can never fit; bail out instead of emptying the cache for nothing.
  if (buf.length > TTS_CACHE_MAX_BYTES) return;

  while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
    const oldest = ttsCache.keys().next().value;
    ttsCacheBytes -= ttsCache.get(oldest).length;
    ttsCache.delete(oldest);
  }

  ttsCache.set(key, buf);
  ttsCacheBytes += buf.length;
}
|
|
452
468
|
|
|
453
469
|
async function synthesize(text, voiceId) {
|
|
@@ -458,12 +474,18 @@ async function synthesize(text, voiceId) {
|
|
|
458
474
|
ttsCache.set(cacheKey, cached);
|
|
459
475
|
return cached;
|
|
460
476
|
}
|
|
461
|
-
const
|
|
462
|
-
|
|
463
|
-
const
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
477
|
+
const inflight = ttsInflight.get(cacheKey);
|
|
478
|
+
if (inflight) return inflight;
|
|
479
|
+
const promise = (async () => {
|
|
480
|
+
const tts = await getTTS();
|
|
481
|
+
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
482
|
+
const result = await tts(text, { speaker_embeddings: embeddings });
|
|
483
|
+
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
484
|
+
cachePut(cacheKey, wav);
|
|
485
|
+
return wav;
|
|
486
|
+
})();
|
|
487
|
+
ttsInflight.set(cacheKey, promise);
|
|
488
|
+
try { return await promise; } finally { ttsInflight.delete(cacheKey); }
|
|
467
489
|
}
|
|
468
490
|
|
|
469
491
|
async function* synthesizeStream(text, voiceId) {
|
|
@@ -497,4 +519,18 @@ function getStatus() {
|
|
|
497
519
|
};
|
|
498
520
|
}
|
|
499
521
|
|
|
500
|
-
|
|
522
|
+
/**
 * Kick off loading of the TTS pipeline without waiting for it.
 * A load failure is reported on the console and otherwise ignored, so the
 * caller never sees a rejection.
 */
function preloadTTS() {
  const reportFailure = (error) => {
    console.error('[TTS] Preload failed:', error.message);
  };
  getTTS().catch(reportFailure);
}
|
|
525
|
+
|
|
526
|
+
/**
 * Build the cache key for a piece of text spoken with a given voice.
 *
 * @param {string} text - Text to be synthesized.
 * @param {string} [voiceId] - Voice identifier; any falsy value maps to 'default'.
 * @returns {string} Key of the form `<voice>:<text>`.
 */
function ttsCacheKey(text, voiceId) {
  const voice = voiceId ? voiceId : 'default';
  return voice + ':' + text;
}
|
|
529
|
+
|
|
530
|
+
/**
 * Look up cached audio by key.
 *
 * On a hit the entry is removed and re-inserted so that it becomes the most
 * recently inserted item in the Map.
 *
 * @param {string} key - Cache key to look up.
 * @returns {*} The cached value, or null when there is no (truthy) entry.
 */
function ttsCacheGet(key) {
  const hit = ttsCache.get(key);
  if (!hit) return null;
  ttsCache.delete(key);
  ttsCache.set(key, hit);
  return hit;
}
|
|
535
|
+
|
|
536
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -15,6 +15,49 @@ async function getSpeech() {
|
|
|
15
15
|
return speechModule;
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
+
/**
 * Synthesize speech for a block of text ahead of time and push the audio to
 * interested clients.
 *
 * Does nothing unless the speech module reports TTS as ready and error-free.
 * The set of voices is collected from every sync client subscribed to either
 * the session id or the `conv-<conversationId>` key; each sentence is then
 * served from the speech cache when possible, otherwise synthesized.
 *
 * This is best-effort: failures are logged and never propagate.
 *
 * @param {string} text - Text to split into sentences and synthesize.
 * @param {string} conversationId - Conversation the text belongs to.
 * @param {string} sessionId - Session the text belongs to.
 * @returns {Promise<void>} Settles once all pushes were attempted; never
 *   rejects. Fire-and-forget callers may ignore it.
 */
function eagerTTS(text, conversationId, sessionId) {
  const describe = (err) => err?.message ?? err;

  return getSpeech().then(speech => {
    const status = speech.getStatus();
    if (!status.ttsReady || status.ttsError) return undefined;

    // Same for every client, so build it once rather than per iteration.
    const convKey = `conv-${conversationId}`;
    const voices = new Set();
    for (const ws of syncClients) {
      if (!ws.subscriptions) continue;
      if (ws.subscriptions.has(sessionId) || ws.subscriptions.has(convKey)) {
        voices.add(ws.ttsVoiceId || 'default');
      }
    }
    if (voices.size === 0) return undefined;

    const sentences = speech.splitSentences(text);
    const pending = [];
    for (const vid of voices) {
      for (const sentence of sentences) {
        const cacheKey = speech.ttsCacheKey(sentence, vid);
        const cached = speech.ttsCacheGet(cacheKey);
        if (cached) {
          pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
          continue;
        }
        pending.push(
          speech.synthesize(sentence, vid)
            .then(wav => {
              pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
            })
            // One failed sentence must not affect the others; log and move on.
            .catch(err => console.error('[TTS] Eager synthesis failed:', describe(err)))
        );
      }
    }
    return Promise.all(pending).then(() => undefined);
  }).catch(err => {
    console.error('[TTS] Eager TTS failed:', describe(err));
  });
}
|
|
47
|
+
|
|
48
|
+
/**
 * Send synthesized audio to sync clients as a `tts_audio` message.
 *
 * @param {string} cacheKey - Key under which the audio is cached.
 * @param {Buffer} wav - Audio data; presumably a Node Buffer, since it is
 *   serialized with `toString('base64')` (confirm against callers).
 * @param {string} conversationId - Conversation the audio belongs to.
 * @param {string} sessionId - Session the audio belongs to.
 * @param {string} voiceId - Voice the audio was synthesized with.
 */
function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
  const audio = wav.toString('base64');
  const message = {
    type: 'tts_audio',
    cacheKey,
    audio,
    voiceId,
    conversationId,
    sessionId,
    timestamp: Date.now()
  };
  broadcastSync(message);
}
|
|
60
|
+
|
|
18
61
|
const require = createRequire(import.meta.url);
|
|
19
62
|
const express = require('express');
|
|
20
63
|
const Busboy = require('busboy');
|
|
@@ -852,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
|
|
|
852
895
|
blockIndex: allBlocks.length - 1,
|
|
853
896
|
timestamp: Date.now()
|
|
854
897
|
});
|
|
898
|
+
|
|
899
|
+
if (block.type === 'text' && block.text) {
|
|
900
|
+
eagerTTS(block.text, conversationId, sessionId);
|
|
901
|
+
}
|
|
855
902
|
}
|
|
856
903
|
} else if (parsed.type === 'user' && parsed.message?.content) {
|
|
857
904
|
for (const block of parsed.message.content) {
|
|
@@ -900,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
|
|
|
900
947
|
timestamp: Date.now()
|
|
901
948
|
});
|
|
902
949
|
|
|
950
|
+
if (parsed.result) {
|
|
951
|
+
const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
|
|
952
|
+
if (resultText) eagerTTS(resultText, conversationId, sessionId);
|
|
953
|
+
}
|
|
954
|
+
|
|
903
955
|
if (parsed.result && allBlocks.length === 0) {
|
|
904
956
|
allBlocks.push({ type: 'text', text: String(parsed.result) });
|
|
905
957
|
}
|
|
@@ -1128,6 +1180,8 @@ wss.on('connection', (ws, req) => {
|
|
|
1128
1180
|
subscriptions: Array.from(ws.subscriptions),
|
|
1129
1181
|
timestamp: Date.now()
|
|
1130
1182
|
}));
|
|
1183
|
+
} else if (data.type === 'set_voice') {
|
|
1184
|
+
ws.ttsVoiceId = data.voiceId || 'default';
|
|
1131
1185
|
} else if (data.type === 'ping') {
|
|
1132
1186
|
ws.send(JSON.stringify({
|
|
1133
1187
|
type: 'pong',
|
package/static/js/voice.js
CHANGED
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
var spokenChunks = new Set();
|
|
16
16
|
var isLoadingHistory = false;
|
|
17
17
|
var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
|
|
18
|
+
var ttsAudioCache = new Map();
|
|
19
|
+
var TTS_CLIENT_CACHE_MAX = 50;
|
|
18
20
|
|
|
19
21
|
function init() {
|
|
20
22
|
setupTTSToggle();
|
|
@@ -69,6 +71,7 @@
|
|
|
69
71
|
selector.addEventListener('change', function() {
|
|
70
72
|
selectedVoiceId = selector.value;
|
|
71
73
|
localStorage.setItem('voice-selected-id', selectedVoiceId);
|
|
74
|
+
sendVoiceToServer();
|
|
72
75
|
});
|
|
73
76
|
}
|
|
74
77
|
|
|
@@ -295,6 +298,22 @@
|
|
|
295
298
|
processQueue();
|
|
296
299
|
}
|
|
297
300
|
|
|
301
|
+
/**
 * Decode base64 WAV data and keep it in the client-side audio cache as a Blob.
 *
 * The cache holds at most TTS_CLIENT_CACHE_MAX entries; when full, entries
 * are dropped from the front of the Map (oldest insertion first).
 *
 * @param {string} cacheKey - Key under which to store the audio.
 * @param {string} b64 - Base64-encoded audio bytes.
 * @throws Whatever `atob` throws for malformed base64; the cache is left
 *   untouched in that case.
 */
function cacheTTSAudio(cacheKey, b64) {
  // Decode before touching the cache so a malformed payload cannot cost us
  // an already cached entry.
  var binary = atob(b64);
  var bytes = new Uint8Array(binary.length);
  for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
  var blob = new Blob([bytes], { type: 'audio/wav' });

  // Re-caching a known key must not evict an unrelated entry: remove the old
  // copy first (which also moves the key to the newest position).
  ttsAudioCache.delete(cacheKey);
  while (ttsAudioCache.size > 0 && ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
    var oldest = ttsAudioCache.keys().next().value;
    ttsAudioCache.delete(oldest);
  }
  ttsAudioCache.set(cacheKey, blob);
}
|
|
311
|
+
|
|
312
|
+
/**
 * Fetch pre-delivered audio for a piece of text in the currently selected voice.
 *
 * @param {string} text - Text whose audio is wanted.
 * @returns {*} The cached entry for `<selectedVoiceId>:<text>`, or null if absent.
 */
function getCachedTTSBlob(text) {
  var blob = ttsAudioCache.get(selectedVoiceId + ':' + text);
  if (!blob) return null;
  return blob;
}
|
|
316
|
+
|
|
298
317
|
var audioChunkQueue = [];
|
|
299
318
|
var isPlayingChunk = false;
|
|
300
319
|
var streamDone = false;
|
|
@@ -344,6 +363,15 @@
|
|
|
344
363
|
audioChunkQueue = [];
|
|
345
364
|
isPlayingChunk = false;
|
|
346
365
|
|
|
366
|
+
var cachedBlob = getCachedTTSBlob(text);
|
|
367
|
+
if (cachedBlob) {
|
|
368
|
+
ttsConsecutiveFailures = 0;
|
|
369
|
+
audioChunkQueue.push(cachedBlob);
|
|
370
|
+
streamDone = true;
|
|
371
|
+
if (!isPlayingChunk) playNextChunk();
|
|
372
|
+
return;
|
|
373
|
+
}
|
|
374
|
+
|
|
347
375
|
function onTtsSuccess() {
|
|
348
376
|
ttsConsecutiveFailures = 0;
|
|
349
377
|
}
|
|
@@ -532,11 +560,23 @@
|
|
|
532
560
|
}
|
|
533
561
|
}
|
|
534
562
|
|
|
563
|
+
/**
 * Tell the server which voice this client has selected by sending a
 * `set_voice` message over the websocket manager.
 *
 * Silently does nothing when the global client, its websocket manager, or an
 * open connection is not available.
 */
function sendVoiceToServer() {
  // typeof guard: the global may not be declared at all on this page.
  if (typeof agentGUIClient === 'undefined' || !agentGUIClient) return;
  var manager = agentGUIClient.wsManager;
  if (!manager || !manager.isConnected) return;
  manager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
}
|
|
568
|
+
|
|
535
569
|
function setupStreamingListener() {
|
|
536
570
|
window.addEventListener('ws-message', function(e) {
|
|
537
|
-
if (!voiceActive) return;
|
|
538
571
|
var data = e.detail;
|
|
539
572
|
if (!data) return;
|
|
573
|
+
if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
|
|
574
|
+
cacheTTSAudio(data.cacheKey, data.audio);
|
|
575
|
+
}
|
|
576
|
+
if (data.type === 'sync_connected') {
|
|
577
|
+
sendVoiceToServer();
|
|
578
|
+
}
|
|
579
|
+
if (!voiceActive) return;
|
|
540
580
|
if (data.type === 'streaming_progress' && data.block) {
|
|
541
581
|
handleVoiceBlock(data.block, true);
|
|
542
582
|
}
|