npm - agentgui - Versions diffs - 1.0.174 → 1.0.176 - Mend

agentgui 1.0.174 → 1.0.176

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/lib/speech.js CHANGED Viewed

@@ -115,12 +115,18 @@ let speakerEmbeddingPipeline = null;
 let sttLoading = false;
 let ttsLoading = false;
 let speakerEmbeddingLoading = false;
+let ttsLoadError = null;
+let ttsLoadErrorTime = 0;
+let sttLoadError = null;
 const voiceEmbeddingsCache = new Map();
 const SAMPLE_RATE_STT = 16000;
 const SAMPLE_RATE_TTS = 16000;
+const TTS_ERROR_RETRY_MS = 30000;
-const TTS_CACHE_MAX = 100;
+const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
+let ttsCacheBytes = 0;
 const ttsCache = new Map();
+const ttsInflight = new Map();
 async function loadTransformers() {
   if (transformersModule) return transformersModule;
@@ -261,8 +267,10 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
 async function getSTT() {
   if (sttPipeline) return sttPipeline;
+  if (sttLoadError) throw sttLoadError;
   if (sttLoading) {
     while (sttLoading) await new Promise(r => setTimeout(r, 100));
+    if (sttLoadError) throw sttLoadError;
     if (!sttPipeline) throw new Error('STT pipeline failed to load');
     return sttPipeline;
   }
@@ -278,10 +286,12 @@ async function getSTT() {
       device: 'cpu',
       local_files_only: isLocal,
     });
+    sttLoadError = null;
     return sttPipeline;
   } catch (err) {
     sttPipeline = null;
-    throw new Error('STT model load failed: ' + err.message);
+    sttLoadError = new Error('STT model load failed: ' + err.message);
+    throw sttLoadError;
   } finally {
     sttLoading = false;
   }
@@ -289,8 +299,14 @@ async function getSTT() {
 async function getTTS() {
   if (ttsPipeline) return ttsPipeline;
+  if (ttsLoadError) {
+    if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
+    ttsLoadError = null;
+    ttsLoadErrorTime = 0;
+  }
   if (ttsLoading) {
     while (ttsLoading) await new Promise(r => setTimeout(r, 100));
+    if (ttsLoadError) throw ttsLoadError;
     if (!ttsPipeline) throw new Error('TTS pipeline failed to load');
     return ttsPipeline;
   }
@@ -303,10 +319,14 @@ async function getTTS() {
       dtype: 'fp32',
     });
     await ensureSpeakerEmbeddings();
+    ttsLoadError = null;
+    ttsLoadErrorTime = 0;
     return ttsPipeline;
   } catch (err) {
     ttsPipeline = null;
-    throw new Error('TTS model load failed: ' + err.message);
+    ttsLoadError = new Error('TTS model load failed: ' + err.message);
+    ttsLoadErrorTime = Date.now();
+    throw ttsLoadError;
   } finally {
     ttsLoading = false;
   }
@@ -433,11 +453,17 @@ function splitSentences(text) {
 }
 function cachePut(key, buf) {
-  if (ttsCache.size >= TTS_CACHE_MAX) {
+  if (ttsCache.has(key)) {
+    ttsCacheBytes -= ttsCache.get(key).length;
+    ttsCache.delete(key);
+  }
+  while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
     const oldest = ttsCache.keys().next().value;
+    ttsCacheBytes -= ttsCache.get(oldest).length;
     ttsCache.delete(oldest);
   }
   ttsCache.set(key, buf);
+  ttsCacheBytes += buf.length;
 }
 async function synthesize(text, voiceId) {
@@ -448,12 +474,18 @@ async function synthesize(text, voiceId) {
     ttsCache.set(cacheKey, cached);
     return cached;
   }
-  const tts = await getTTS();
-  const embeddings = await loadVoiceEmbedding(voiceId);
-  const result = await tts(text, { speaker_embeddings: embeddings });
-  const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
-  cachePut(cacheKey, wav);
-  return wav;
+  const inflight = ttsInflight.get(cacheKey);
+  if (inflight) return inflight;
+  const promise = (async () => {
+    const tts = await getTTS();
+    const embeddings = await loadVoiceEmbedding(voiceId);
+    const result = await tts(text, { speaker_embeddings: embeddings });
+    const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
+    cachePut(cacheKey, wav);
+    return wav;
+  })();
+  ttsInflight.set(cacheKey, promise);
+  try { return await promise; } finally { ttsInflight.delete(cacheKey); }
 }
 async function* synthesizeStream(text, voiceId) {
@@ -482,7 +514,23 @@ function getStatus() {
     ttsReady: !!ttsPipeline,
     sttLoading,
     ttsLoading,
+    sttError: sttLoadError ? sttLoadError.message : null,
+    ttsError: ttsLoadError ? ttsLoadError.message : null,
   };
 }
-export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
+function preloadTTS() {
+  getTTS().catch(err => console.error('[TTS] Preload failed:', err.message));
+}
+function ttsCacheKey(text, voiceId) {
+  return (voiceId || 'default') + ':' + text;
+}
+function ttsCacheGet(key) {
+  const cached = ttsCache.get(key);
+  if (cached) { ttsCache.delete(key); ttsCache.set(key, cached); }
+  return cached || null;
+}
+export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agentgui",
-  "version": "1.0.174",
+  "version": "1.0.176",
   "description": "Multi-agent ACP client with real-time communication",
   "type": "module",
   "main": "server.js",

package/server.js CHANGED Viewed

@@ -15,6 +15,49 @@ async function getSpeech() {
   return speechModule;
 }
+function eagerTTS(text, conversationId, sessionId) {
+  getSpeech().then(speech => {
+    const status = speech.getStatus();
+    if (!status.ttsReady || status.ttsError) return;
+    const voices = new Set();
+    for (const ws of syncClients) {
+      const vid = ws.ttsVoiceId || 'default';
+      const convKey = `conv-${conversationId}`;
+      if (ws.subscriptions && (ws.subscriptions.has(sessionId) || ws.subscriptions.has(convKey))) {
+        voices.add(vid);
+      }
+    }
+    if (voices.size === 0) return;
+    const sentences = speech.splitSentences(text);
+    for (const vid of voices) {
+      for (const sentence of sentences) {
+        const cacheKey = speech.ttsCacheKey(sentence, vid);
+        const cached = speech.ttsCacheGet(cacheKey);
+        if (cached) {
+          pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
+          continue;
+        }
+        speech.synthesize(sentence, vid).then(wav => {
+          pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
+        }).catch(() => {});
+      }
+    }
+  }).catch(() => {});
+}
+function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
+  const b64 = wav.toString('base64');
+  broadcastSync({
+    type: 'tts_audio',
+    cacheKey,
+    audio: b64,
+    voiceId,
+    conversationId,
+    sessionId,
+    timestamp: Date.now()
+  });
+}
 const require = createRequire(import.meta.url);
 const express = require('express');
 const Busboy = require('busboy');
@@ -554,13 +597,20 @@ const server = http.createServer(async (req, res) => {
           sendJSON(req, res, 400, { error: 'No text provided' });
           return;
         }
-        const { synthesize } = await getSpeech();
-        const wavBuffer = await synthesize(text, voiceId);
+        const speech = await getSpeech();
+        const status = speech.getStatus();
+        if (status.ttsError) {
+          sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
+          return;
+        }
+        const wavBuffer = await speech.synthesize(text, voiceId);
         res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
         res.end(wavBuffer);
       } catch (err) {
         debugLog('[TTS] Error: ' + err.message);
-        if (!res.headersSent) sendJSON(req, res, 500, { error: err.message || 'TTS failed' });
+        const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
+        const statusCode = isModelError ? 503 : 500;
+        if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS failed', retryable: !isModelError });
       }
       return;
     }
@@ -574,14 +624,19 @@ const server = http.createServer(async (req, res) => {
           sendJSON(req, res, 400, { error: 'No text provided' });
           return;
         }
-        const { synthesizeStream } = await getSpeech();
+        const speech = await getSpeech();
+        const status = speech.getStatus();
+        if (status.ttsError) {
+          sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
+          return;
+        }
         res.writeHead(200, {
           'Content-Type': 'application/octet-stream',
           'Transfer-Encoding': 'chunked',
           'X-Content-Type': 'audio/wav-stream',
           'Cache-Control': 'no-cache'
         });
-        for await (const wavChunk of synthesizeStream(text, voiceId)) {
+        for await (const wavChunk of speech.synthesizeStream(text, voiceId)) {
           const lenBuf = Buffer.alloc(4);
           lenBuf.writeUInt32BE(wavChunk.length, 0);
           res.write(lenBuf);
@@ -590,7 +645,9 @@ const server = http.createServer(async (req, res) => {
         res.end();
       } catch (err) {
         debugLog('[TTS-STREAM] Error: ' + err.message);
-        if (!res.headersSent) sendJSON(req, res, 500, { error: err.message || 'TTS stream failed' });
+        const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
+        const statusCode = isModelError ? 503 : 500;
+        if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS stream failed', retryable: !isModelError });
         else res.end();
       }
       return;
@@ -838,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
             blockIndex: allBlocks.length - 1,
             timestamp: Date.now()
           });
+          if (block.type === 'text' && block.text) {
+            eagerTTS(block.text, conversationId, sessionId);
+          }
         }
       } else if (parsed.type === 'user' && parsed.message?.content) {
         for (const block of parsed.message.content) {
@@ -886,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
           timestamp: Date.now()
         });
+        if (parsed.result) {
+          const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
+          if (resultText) eagerTTS(resultText, conversationId, sessionId);
+        }
         if (parsed.result && allBlocks.length === 0) {
           allBlocks.push({ type: 'text', text: String(parsed.result) });
         }
@@ -1114,6 +1180,8 @@ wss.on('connection', (ws, req) => {
             subscriptions: Array.from(ws.subscriptions),
             timestamp: Date.now()
           }));
+        } else if (data.type === 'set_voice') {
+          ws.ttsVoiceId = data.voiceId || 'default';
         } else if (data.type === 'ping') {
           ws.send(JSON.stringify({
             type: 'pong',

package/static/js/voice.js CHANGED Viewed

@@ -15,6 +15,8 @@
   var spokenChunks = new Set();
   var isLoadingHistory = false;
   var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
+  var ttsAudioCache = new Map();
+  var TTS_CLIENT_CACHE_MAX = 50;
   function init() {
     setupTTSToggle();
@@ -69,6 +71,7 @@
     selector.addEventListener('change', function() {
       selectedVoiceId = selector.value;
       localStorage.setItem('voice-selected-id', selectedVoiceId);
+      sendVoiceToServer();
     });
   }
@@ -295,9 +298,28 @@
     processQueue();
   }
+  function cacheTTSAudio(cacheKey, b64) {
+    if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
+      var oldest = ttsAudioCache.keys().next().value;
+      ttsAudioCache.delete(oldest);
+    }
+    var binary = atob(b64);
+    var bytes = new Uint8Array(binary.length);
+    for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
+    ttsAudioCache.set(cacheKey, new Blob([bytes], { type: 'audio/wav' }));
+  }
+  function getCachedTTSBlob(text) {
+    var key = selectedVoiceId + ':' + text;
+    return ttsAudioCache.get(key) || null;
+  }
   var audioChunkQueue = [];
   var isPlayingChunk = false;
   var streamDone = false;
+  var ttsConsecutiveFailures = 0;
+  var TTS_MAX_FAILURES = 3;
+  var ttsDisabledUntilReset = false;
   function playNextChunk() {
     if (audioChunkQueue.length === 0) {
@@ -331,19 +353,50 @@
   function processQueue() {
     if (isSpeaking || speechQueue.length === 0) return;
+    if (ttsDisabledUntilReset) {
+      speechQueue = [];
+      return;
+    }
     isSpeaking = true;
     streamDone = false;
     var text = speechQueue.shift();
     audioChunkQueue = [];
     isPlayingChunk = false;
+    var cachedBlob = getCachedTTSBlob(text);
+    if (cachedBlob) {
+      ttsConsecutiveFailures = 0;
+      audioChunkQueue.push(cachedBlob);
+      streamDone = true;
+      if (!isPlayingChunk) playNextChunk();
+      return;
+    }
+    function onTtsSuccess() {
+      ttsConsecutiveFailures = 0;
+    }
+    function onTtsFailed() {
+      ttsConsecutiveFailures++;
+      if (ttsConsecutiveFailures >= TTS_MAX_FAILURES) {
+        console.warn('[Voice] TTS failed ' + ttsConsecutiveFailures + ' times consecutively, disabling until reset');
+        ttsDisabledUntilReset = true;
+        speechQueue = [];
+      }
+      streamDone = true;
+      isSpeaking = false;
+      if (!ttsDisabledUntilReset) {
+        processQueue();
+      }
+    }
     function tryStreaming() {
       fetch(BASE + '/api/tts-stream', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
         body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
       }).then(function(resp) {
-        if (!resp.ok) throw new Error('TTS stream failed');
+        if (!resp.ok) throw new Error('TTS stream failed: ' + resp.status);
         var reader = resp.body.getReader();
         var buffer = new Uint8Array(0);
@@ -357,6 +410,7 @@
         function pump() {
           return reader.read().then(function(result) {
             if (result.done) {
+              onTtsSuccess();
               streamDone = true;
               if (!isPlayingChunk && audioChunkQueue.length === 0) {
                 isSpeaking = false;
@@ -384,16 +438,17 @@
         tryNonStreaming(text);
       });
     }
     function tryNonStreaming(txt) {
       fetch(BASE + '/api/tts', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
         body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
       }).then(function(resp) {
-        if (!resp.ok) throw new Error('TTS failed');
+        if (!resp.ok) throw new Error('TTS failed: ' + resp.status);
         return resp.arrayBuffer();
       }).then(function(buf) {
+        onTtsSuccess();
         var blob = new Blob([buf], { type: 'audio/wav' });
         audioChunkQueue.push(blob);
         if (!isPlayingChunk) playNextChunk();
@@ -401,12 +456,10 @@
         isSpeaking = false;
         processQueue();
       }).catch(function() {
-        streamDone = true;
-        isSpeaking = false;
-        processQueue();
+        onTtsFailed();
       });
     }
     tryStreaming();
   }
@@ -415,6 +468,8 @@
     audioChunkQueue = [];
     isPlayingChunk = false;
     isSpeaking = false;
+    ttsConsecutiveFailures = 0;
+    ttsDisabledUntilReset = false;
     if (currentAudio) {
       currentAudio.pause();
       currentAudio = null;
@@ -505,11 +560,23 @@
     }
   }
+  function sendVoiceToServer() {
+    if (typeof agentGUIClient !== 'undefined' && agentGUIClient && agentGUIClient.wsManager && agentGUIClient.wsManager.isConnected) {
+      agentGUIClient.wsManager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
+    }
+  }
   function setupStreamingListener() {
     window.addEventListener('ws-message', function(e) {
-      if (!voiceActive) return;
       var data = e.detail;
       if (!data) return;
+      if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
+        cacheTTSAudio(data.cacheKey, data.audio);
+      }
+      if (data.type === 'sync_connected') {
+        sendVoiceToServer();
+      }
+      if (!voiceActive) return;
       if (data.type === 'streaming_progress' && data.block) {
         handleVoiceBlock(data.block, true);
       }