agentgui 1.0.175 → 1.0.176

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -116,13 +116,17 @@ let sttLoading = false;
116
116
  let ttsLoading = false;
117
117
  let speakerEmbeddingLoading = false;
118
118
  let ttsLoadError = null;
119
+ let ttsLoadErrorTime = 0;
119
120
  let sttLoadError = null;
120
121
  const voiceEmbeddingsCache = new Map();
121
122
  const SAMPLE_RATE_STT = 16000;
122
123
  const SAMPLE_RATE_TTS = 16000;
124
+ const TTS_ERROR_RETRY_MS = 30000;
123
125
 
124
- const TTS_CACHE_MAX = 100;
126
+ const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
127
+ let ttsCacheBytes = 0;
125
128
  const ttsCache = new Map();
129
+ const ttsInflight = new Map();
126
130
 
127
131
  async function loadTransformers() {
128
132
  if (transformersModule) return transformersModule;
@@ -295,7 +299,11 @@ async function getSTT() {
295
299
 
296
300
  async function getTTS() {
297
301
  if (ttsPipeline) return ttsPipeline;
298
- if (ttsLoadError) throw ttsLoadError;
302
+ if (ttsLoadError) {
303
+ if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
304
+ ttsLoadError = null;
305
+ ttsLoadErrorTime = 0;
306
+ }
299
307
  if (ttsLoading) {
300
308
  while (ttsLoading) await new Promise(r => setTimeout(r, 100));
301
309
  if (ttsLoadError) throw ttsLoadError;
@@ -312,10 +320,12 @@ async function getTTS() {
312
320
  });
313
321
  await ensureSpeakerEmbeddings();
314
322
  ttsLoadError = null;
323
+ ttsLoadErrorTime = 0;
315
324
  return ttsPipeline;
316
325
  } catch (err) {
317
326
  ttsPipeline = null;
318
327
  ttsLoadError = new Error('TTS model load failed: ' + err.message);
328
+ ttsLoadErrorTime = Date.now();
319
329
  throw ttsLoadError;
320
330
  } finally {
321
331
  ttsLoading = false;
@@ -443,11 +453,17 @@ function splitSentences(text) {
443
453
  }
444
454
 
445
455
  function cachePut(key, buf) {
446
- if (ttsCache.size >= TTS_CACHE_MAX) {
456
+ if (ttsCache.has(key)) {
457
+ ttsCacheBytes -= ttsCache.get(key).length;
458
+ ttsCache.delete(key);
459
+ }
460
+ while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
447
461
  const oldest = ttsCache.keys().next().value;
462
+ ttsCacheBytes -= ttsCache.get(oldest).length;
448
463
  ttsCache.delete(oldest);
449
464
  }
450
465
  ttsCache.set(key, buf);
466
+ ttsCacheBytes += buf.length;
451
467
  }
452
468
 
453
469
  async function synthesize(text, voiceId) {
@@ -458,12 +474,18 @@ async function synthesize(text, voiceId) {
458
474
  ttsCache.set(cacheKey, cached);
459
475
  return cached;
460
476
  }
461
- const tts = await getTTS();
462
- const embeddings = await loadVoiceEmbedding(voiceId);
463
- const result = await tts(text, { speaker_embeddings: embeddings });
464
- const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
465
- cachePut(cacheKey, wav);
466
- return wav;
477
+ const inflight = ttsInflight.get(cacheKey);
478
+ if (inflight) return inflight;
479
+ const promise = (async () => {
480
+ const tts = await getTTS();
481
+ const embeddings = await loadVoiceEmbedding(voiceId);
482
+ const result = await tts(text, { speaker_embeddings: embeddings });
483
+ const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
484
+ cachePut(cacheKey, wav);
485
+ return wav;
486
+ })();
487
+ ttsInflight.set(cacheKey, promise);
488
+ try { return await promise; } finally { ttsInflight.delete(cacheKey); }
467
489
  }
468
490
 
469
491
  async function* synthesizeStream(text, voiceId) {
@@ -497,4 +519,18 @@ function getStatus() {
497
519
  };
498
520
  }
499
521
 
500
- export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
522
/**
 * Start loading the TTS pipeline in the background without awaiting it.
 *
 * A load failure is reported through console.error and never propagates to the
 * caller.
 *
 * @returns {void}
 */
function preloadTTS() {
  const reportFailure = (err) => {
    console.error('[TTS] Preload failed:', err.message);
  };
  getTTS().catch(reportFailure);
}
525
+
526
/**
 * Build the cache key for a piece of text spoken with a given voice.
 *
 * @param {string} text - Text to be synthesized.
 * @param {string} [voiceId] - Voice identifier; any falsy value maps to 'default'.
 * @returns {string} Key of the form `<voice>:<text>`.
 */
function ttsCacheKey(text, voiceId) {
  const voice = voiceId || 'default';
  return `${voice}:${text}`;
}
529
+
530
/**
 * Look up cached audio and mark it as most recently used.
 *
 * On a hit the entry is deleted and re-inserted, which moves its key to the end
 * of the Map's iteration order.
 *
 * @param {string} key - Cache key to look up.
 * @returns {*} The cached value, or null when nothing truthy is stored under `key`.
 */
function ttsCacheGet(key) {
  const hit = ttsCache.get(key);
  if (!hit) return null;

  ttsCache.delete(key);
  ttsCache.set(key, hit);
  return hit;
}
535
+
536
+ export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.175",
3
+ "version": "1.0.176",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -15,6 +15,49 @@ async function getSpeech() {
15
15
  return speechModule;
16
16
  }
17
17
 
18
/**
 * Best-effort pre-synthesis of `text` for every voice used by a client that is
 * subscribed to the given session or conversation.
 *
 * The text is split into sentences. Each sentence is taken from the speech
 * module's cache when available, otherwise synthesized, and the resulting audio
 * is handed to pushTTSAudio. Nothing happens when TTS is not ready, has a
 * recorded error, or no subscribed client exists.
 *
 * The function is fire-and-forget: it returns immediately and never throws.
 * Failures are logged rather than silently discarded so a broken TTS setup is
 * visible in the server output.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} conversationId - Conversation the text belongs to.
 * @param {string} sessionId - Session the text belongs to.
 * @returns {void}
 */
function eagerTTS(text, conversationId, sessionId) {
  // Subscription key for the conversation; it is the same for every client.
  const convKey = `conv-${conversationId}`;

  getSpeech().then((speech) => {
    const status = speech.getStatus();
    if (!status.ttsReady || status.ttsError) return;

    // Collect the distinct voices of the clients that will want this audio.
    const voices = new Set();
    for (const ws of syncClients) {
      const subs = ws.subscriptions;
      if (subs && (subs.has(sessionId) || subs.has(convKey))) {
        voices.add(ws.ttsVoiceId || 'default');
      }
    }
    if (voices.size === 0) return;

    const sentences = speech.splitSentences(text);
    for (const vid of voices) {
      for (const sentence of sentences) {
        const cacheKey = speech.ttsCacheKey(sentence, vid);
        const cached = speech.ttsCacheGet(cacheKey);
        if (cached) {
          pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
          continue;
        }
        speech.synthesize(sentence, vid)
          .then((wav) => {
            pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
          })
          .catch((err) => {
            console.error('[TTS] Eager synthesis failed:', err?.message ?? err);
          });
      }
    }
  }).catch((err) => {
    console.error('[TTS] Eager TTS skipped:', err?.message ?? err);
  });
}
47
+
48
/**
 * Broadcast a synthesized audio clip as a `tts_audio` sync message.
 *
 * @param {string} cacheKey - Key the audio is cached under.
 * @param {{toString: function(string): string}} wav - Audio data; it is
 *   serialized with `wav.toString('base64')` (presumably a Node Buffer).
 * @param {string} conversationId - Conversation the audio belongs to.
 * @param {string} sessionId - Session the audio belongs to.
 * @param {string} voiceId - Voice the audio was synthesized with.
 * @returns {void}
 */
function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
  const audio = wav.toString('base64');
  const message = {
    type: 'tts_audio',
    cacheKey,
    audio,
    voiceId,
    conversationId,
    sessionId,
    timestamp: Date.now(),
  };
  broadcastSync(message);
}
60
+
18
61
  const require = createRequire(import.meta.url);
19
62
  const express = require('express');
20
63
  const Busboy = require('busboy');
@@ -852,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
852
895
  blockIndex: allBlocks.length - 1,
853
896
  timestamp: Date.now()
854
897
  });
898
+
899
+ if (block.type === 'text' && block.text) {
900
+ eagerTTS(block.text, conversationId, sessionId);
901
+ }
855
902
  }
856
903
  } else if (parsed.type === 'user' && parsed.message?.content) {
857
904
  for (const block of parsed.message.content) {
@@ -900,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
900
947
  timestamp: Date.now()
901
948
  });
902
949
 
950
+ if (parsed.result) {
951
+ const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
952
+ if (resultText) eagerTTS(resultText, conversationId, sessionId);
953
+ }
954
+
903
955
  if (parsed.result && allBlocks.length === 0) {
904
956
  allBlocks.push({ type: 'text', text: String(parsed.result) });
905
957
  }
@@ -1128,6 +1180,8 @@ wss.on('connection', (ws, req) => {
1128
1180
  subscriptions: Array.from(ws.subscriptions),
1129
1181
  timestamp: Date.now()
1130
1182
  }));
1183
+ } else if (data.type === 'set_voice') {
1184
+ ws.ttsVoiceId = data.voiceId || 'default';
1131
1185
  } else if (data.type === 'ping') {
1132
1186
  ws.send(JSON.stringify({
1133
1187
  type: 'pong',
@@ -15,6 +15,8 @@
15
15
  var spokenChunks = new Set();
16
16
  var isLoadingHistory = false;
17
17
  var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
18
+ var ttsAudioCache = new Map();
19
+ var TTS_CLIENT_CACHE_MAX = 50;
18
20
 
19
21
  function init() {
20
22
  setupTTSToggle();
@@ -69,6 +71,7 @@
69
71
  selector.addEventListener('change', function() {
70
72
  selectedVoiceId = selector.value;
71
73
  localStorage.setItem('voice-selected-id', selectedVoiceId);
74
+ sendVoiceToServer();
72
75
  });
73
76
  }
74
77
 
@@ -295,6 +298,22 @@
295
298
  processQueue();
296
299
  }
297
300
 
301
/**
 * Decode a base64 WAV payload and store it as a Blob in the client-side audio
 * cache, which holds at most TTS_CLIENT_CACHE_MAX entries.
 *
 * The payload is decoded before the cache is touched, so a malformed payload
 * (for which atob throws) leaves the cache unchanged. Storing a key that is
 * already cached replaces that entry without evicting an unrelated one.
 *
 * @param {string} cacheKey - Key to store the audio under.
 * @param {string} b64 - Base64-encoded audio bytes.
 * @returns {void}
 */
function cacheTTSAudio(cacheKey, b64) {
  var binary = atob(b64);
  var bytes = new Uint8Array(binary.length);
  for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
  var blob = new Blob([bytes], { type: 'audio/wav' });

  // Drop any previous entry for this key first: it frees a slot (so no other
  // entry has to be evicted) and the re-insert below makes it the newest entry.
  ttsAudioCache.delete(cacheKey);

  // Evict oldest-inserted entries until there is room for the new one.
  while (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX && ttsAudioCache.size > 0) {
    ttsAudioCache.delete(ttsAudioCache.keys().next().value);
  }
  ttsAudioCache.set(cacheKey, blob);
}
311
+
312
/**
 * Look up pre-delivered audio for `text` in the client-side cache, using the
 * currently selected voice.
 *
 * The lookup key is `selectedVoiceId + ':' + text`.
 * NOTE(review): this only hits when `text` matches the exact string the sender
 * used to build the cache key — confirm whether that is whole messages or
 * single sentences.
 *
 * @param {string} text - Text whose audio is wanted.
 * @returns {*} The cached value (a Blob when stored by cacheTTSAudio), or null
 *   on a miss.
 */
function getCachedTTSBlob(text) {
  var blob = ttsAudioCache.get(selectedVoiceId + ':' + text);
  return blob ? blob : null;
}
316
+
298
317
  var audioChunkQueue = [];
299
318
  var isPlayingChunk = false;
300
319
  var streamDone = false;
@@ -344,6 +363,15 @@
344
363
  audioChunkQueue = [];
345
364
  isPlayingChunk = false;
346
365
 
366
+ var cachedBlob = getCachedTTSBlob(text);
367
+ if (cachedBlob) {
368
+ ttsConsecutiveFailures = 0;
369
+ audioChunkQueue.push(cachedBlob);
370
+ streamDone = true;
371
+ if (!isPlayingChunk) playNextChunk();
372
+ return;
373
+ }
374
+
347
375
  function onTtsSuccess() {
348
376
  ttsConsecutiveFailures = 0;
349
377
  }
@@ -532,11 +560,23 @@
532
560
  }
533
561
  }
534
562
 
563
/**
 * Tell the server which voice this client has selected by sending a
 * `set_voice` message over the websocket.
 *
 * Does nothing when the global `agentGUIClient` is missing, has no `wsManager`,
 * or the manager is not connected.
 *
 * @returns {void}
 */
function sendVoiceToServer() {
  if (typeof agentGUIClient === 'undefined' || !agentGUIClient) return;

  var manager = agentGUIClient.wsManager;
  if (!manager || !manager.isConnected) return;

  manager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
}
568
+
535
569
  function setupStreamingListener() {
536
570
  window.addEventListener('ws-message', function(e) {
537
- if (!voiceActive) return;
538
571
  var data = e.detail;
539
572
  if (!data) return;
573
+ if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
574
+ cacheTTSAudio(data.cacheKey, data.audio);
575
+ }
576
+ if (data.type === 'sync_connected') {
577
+ sendVoiceToServer();
578
+ }
579
+ if (!voiceActive) return;
540
580
  if (data.type === 'streaming_progress' && data.block) {
541
581
  handleVoiceBlock(data.block, true);
542
582
  }