npm - agentgui - Versions diffs - 1.0.168 → 1.0.169 - Mend

agentgui 1.0.168 → 1.0.169

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/lib/speech.js CHANGED

@@ -13,6 +13,21 @@ const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
 const SAMPLE_RATE_TTS = 16000;
 const SAMPLE_RATE_STT = 16000;
 const MIN_WAV_SIZE = 44;
+const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Matthijs%2Fcmu-arctic-xvectors&config=default&split=validation';
+const SAMPLES_TO_AVERAGE = 10;
+const VOICE_CATALOG = [
+  { id: 'default', name: 'Default', gender: 'male', accent: 'US' },
+  { id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
+  { id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
+  { id: 'clb', name: 'CLB', gender: 'female', accent: 'US' },
+  { id: 'rms', name: 'RMS', gender: 'male', accent: 'US' },
+  { id: 'awb', name: 'AWB', gender: 'male', accent: 'Scottish' },
+  { id: 'jmk', name: 'JMK', gender: 'male', accent: 'Canadian' },
+  { id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
+];
+const SPEAKER_OFFSETS = { awb: 0, bdl: 1200, clb: 2300, jmk: 3500, ksp: 4700, rms: 5900, slt: 7100 };
 let transformersModule = null;
 let sttPipeline = null;
@@ -20,6 +35,7 @@ let ttsPipeline = null;
 let speakerEmbeddings = null;
 let sttLoading = false;
 let ttsLoading = false;
+const voiceEmbeddingsCache = new Map();
 const TTS_CACHE_MAX = 100;
 const ttsCache = new Map();
@@ -52,6 +68,43 @@ async function ensureSpeakerEmbeddings() {
   return speakerEmbeddings;
 }
+async function loadVoiceEmbedding(voiceId) {
+  if (!voiceId || voiceId === 'default') return ensureSpeakerEmbeddings();
+  if (voiceEmbeddingsCache.has(voiceId)) return voiceEmbeddingsCache.get(voiceId);
+  const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
+  if (fs.existsSync(binPath)) {
+    const buf = fs.readFileSync(binPath);
+    const emb = new Float32Array(new Uint8Array(buf).buffer);
+    voiceEmbeddingsCache.set(voiceId, emb);
+    return emb;
+  }
+  const offset = SPEAKER_OFFSETS[voiceId];
+  if (offset === undefined) return ensureSpeakerEmbeddings();
+  const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
+  const resp = await fetch(url);
+  if (!resp.ok) throw new Error('Failed to fetch voice embeddings for ' + voiceId);
+  const data = await resp.json();
+  const avg = new Float32Array(512);
+  let count = 0;
+  for (const item of data.rows) {
+    const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
+    if (match && match[1] === voiceId) {
+      for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
+      count++;
+    }
+  }
+  if (count === 0) return ensureSpeakerEmbeddings();
+  for (let i = 0; i < 512; i++) avg[i] /= count;
+  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
+  fs.writeFileSync(binPath, Buffer.from(avg.buffer));
+  voiceEmbeddingsCache.set(voiceId, avg);
+  return avg;
+}
+function getVoices() {
+  return VOICE_CATALOG;
+}
 async function getSTT() {
   if (sttPipeline) return sttPipeline;
   if (sttLoading) {
@@ -233,36 +286,38 @@ function cachePut(key, buf) {
   ttsCache.set(key, buf);
 }
-async function synthesize(text) {
-  const cached = ttsCache.get(text);
+async function synthesize(text, voiceId) {
+  const cacheKey = (voiceId || 'default') + ':' + text;
+  const cached = ttsCache.get(cacheKey);
   if (cached) {
-    ttsCache.delete(text);
-    ttsCache.set(text, cached);
+    ttsCache.delete(cacheKey);
+    ttsCache.set(cacheKey, cached);
     return cached;
   }
   const tts = await getTTS();
-  const embeddings = await ensureSpeakerEmbeddings();
+  const embeddings = await loadVoiceEmbedding(voiceId);
   const result = await tts(text, { speaker_embeddings: embeddings });
   const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
-  cachePut(text, wav);
+  cachePut(cacheKey, wav);
   return wav;
 }
-async function* synthesizeStream(text) {
+async function* synthesizeStream(text, voiceId) {
   const sentences = splitSentences(text);
   const tts = await getTTS();
-  const embeddings = await ensureSpeakerEmbeddings();
+  const embeddings = await loadVoiceEmbedding(voiceId);
   for (const sentence of sentences) {
-    const cached = ttsCache.get(sentence);
+    const cacheKey = (voiceId || 'default') + ':' + sentence;
+    const cached = ttsCache.get(cacheKey);
     if (cached) {
-      ttsCache.delete(sentence);
-      ttsCache.set(sentence, cached);
+      ttsCache.delete(cacheKey);
+      ttsCache.set(cacheKey, cached);
       yield cached;
       continue;
     }
     const result = await tts(sentence, { speaker_embeddings: embeddings });
     const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
-    cachePut(sentence, wav);
+    cachePut(cacheKey, wav);
     yield wav;
   }
 }
@@ -276,4 +331,4 @@ function getStatus() {
   };
 }
-export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus };
+export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "agentgui",
-  "version": "1.0.168",
+  "version": "1.0.169",
   "description": "Multi-agent ACP client with real-time communication",
   "type": "module",
   "main": "server.js",

package/server.js CHANGED

@@ -535,16 +535,27 @@ const server = http.createServer(async (req, res) => {
       return;
     }
+    if (routePath === '/api/voices' && req.method === 'GET') {
+      try {
+        const { getVoices } = await getSpeech();
+        sendJSON(req, res, 200, { ok: true, voices: getVoices() });
+      } catch (err) {
+        sendJSON(req, res, 200, { ok: true, voices: [] });
+      }
+      return;
+    }
     if (routePath === '/api/tts' && req.method === 'POST') {
       try {
         const body = await parseBody(req);
         const text = body.text || '';
+        const voiceId = body.voiceId || null;
         if (!text) {
-                    sendJSON(req, res, 400, { error: 'No text provided' });
+          sendJSON(req, res, 400, { error: 'No text provided' });
           return;
         }
         const { synthesize } = await getSpeech();
-        const wavBuffer = await synthesize(text);
+        const wavBuffer = await synthesize(text, voiceId);
         res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
         res.end(wavBuffer);
       } catch (err) {
@@ -558,6 +569,7 @@ const server = http.createServer(async (req, res) => {
       try {
         const body = await parseBody(req);
         const text = body.text || '';
+        const voiceId = body.voiceId || null;
         if (!text) {
           sendJSON(req, res, 400, { error: 'No text provided' });
           return;
@@ -569,7 +581,7 @@ const server = http.createServer(async (req, res) => {
           'X-Content-Type': 'audio/wav-stream',
           'Cache-Control': 'no-cache'
         });
-        for await (const wavChunk of synthesizeStream(text)) {
+        for await (const wavChunk of synthesizeStream(text, voiceId)) {
           const lenBuf = Buffer.alloc(4);
           lenBuf.writeUInt32BE(wavChunk.length, 0);
           res.write(lenBuf);
@@ -589,7 +601,7 @@ const server = http.createServer(async (req, res) => {
         const { getStatus } = await getSpeech();
         sendJSON(req, res, 200, getStatus());
       } catch (err) {
-                sendJSON(req, res, 200, { sttReady: false, ttsReady: false, sttLoading: false, ttsLoading: false });
+        sendJSON(req, res, 200, { sttReady: false, ttsReady: false, sttLoading: false, ttsLoading: false });
       }
       return;
     }

package/static/index.html CHANGED

@@ -1056,6 +1056,28 @@
       border-color: var(--color-error);
     }
+    .voice-selector-wrapper {
+      display: flex;
+      align-items: center;
+      gap: 0.25rem;
+    }
+    .voice-selector {
+      padding: 0.2rem 0.5rem;
+      border: 1px solid var(--color-border);
+      border-radius: 0.375rem;
+      background: var(--color-bg-secondary);
+      color: var(--color-text-primary);
+      font-size: 0.75rem;
+      cursor: pointer;
+      max-width: 160px;
+    }
+    .voice-selector:focus {
+      outline: none;
+      border-color: var(--color-primary);
+    }
     .voice-empty {
       text-align: center;
       color: var(--color-text-secondary);
@@ -2146,6 +2168,11 @@
               <input type="checkbox" id="voiceTTSToggle" checked>
               <span>Auto-speak responses</span>
             </label>
+            <div class="voice-selector-wrapper">
+              <select class="voice-selector" id="voiceSelector" title="Select voice">
+                <option value="default">Default</option>
+              </select>
+            </div>
             <button class="voice-stop-btn" id="voiceStopSpeaking" title="Stop speaking">Stop</button>
           </div>
         </div>

package/static/js/voice.js CHANGED

@@ -14,12 +14,48 @@
   var TARGET_SAMPLE_RATE = 16000;
   var spokenChunks = new Set();
   var isLoadingHistory = false;
+  var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
   function init() {
     setupTTSToggle();
     setupUI();
     setupStreamingListener();
     setupAgentSelector();
+    setupVoiceSelector();
+  }
+  function setupVoiceSelector() {
+    var selector = document.getElementById('voiceSelector');
+    if (!selector) return;
+    var saved = localStorage.getItem('voice-selected-id');
+    if (saved) selectedVoiceId = saved;
+    fetch(BASE + '/api/voices')
+      .then(function(res) { return res.json(); })
+      .then(function(data) {
+        if (!data.ok || !Array.isArray(data.voices)) return;
+        selector.innerHTML = '';
+        data.voices.forEach(function(voice) {
+          var opt = document.createElement('option');
+          opt.value = voice.id;
+          var label = voice.name;
+          if (voice.gender || voice.accent) {
+            var parts = [];
+            if (voice.gender) parts.push(voice.gender);
+            if (voice.accent) parts.push(voice.accent);
+            label += ' (' + parts.join(', ') + ')';
+          }
+          opt.textContent = label;
+          selector.appendChild(opt);
+        });
+        if (saved && selector.querySelector('option[value="' + saved + '"]')) {
+          selector.value = saved;
+        }
+      })
+      .catch(function() {});
+    selector.addEventListener('change', function() {
+      selectedVoiceId = selector.value;
+      localStorage.setItem('voice-selected-id', selectedVoiceId);
+    });
   }
   function syncVoiceSelector() {
@@ -289,7 +325,7 @@
     fetch(BASE + '/api/tts-stream', {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ text: text })
+      body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
     }).then(function(resp) {
       if (!resp.ok) throw new Error('TTS failed');
       var reader = resp.body.getReader();