agentgui 1.0.168 → 1.0.169

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -13,6 +13,21 @@ const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
13
13
  const SAMPLE_RATE_TTS = 16000;
14
14
  const SAMPLE_RATE_STT = 16000;
15
15
  const MIN_WAV_SIZE = 44;
16
// Rows endpoint of the Hugging Face datasets server for the
// `Matthijs/cmu-arctic-xvectors` dataset (default config, validation split).
// `offset` and `length` query parameters are appended per request.
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Matthijs%2Fcmu-arctic-xvectors&config=default&split=validation';

// Number of dataset rows requested per voice; matching rows are averaged
// into a single speaker embedding.
const SAMPLES_TO_AVERAGE = 10;

// Voices advertised to clients. Every id other than 'default' has a matching
// entry in SPEAKER_OFFSETS.
const VOICE_CATALOG = [
  { id: 'default', name: 'Default', gender: 'male', accent: 'US' },
  { id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
  { id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
  { id: 'clb', name: 'CLB', gender: 'female', accent: 'US' },
  { id: 'rms', name: 'RMS', gender: 'male', accent: 'US' },
  { id: 'awb', name: 'AWB', gender: 'male', accent: 'Scottish' },
  { id: 'jmk', name: 'JMK', gender: 'male', accent: 'Canadian' },
  { id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
];

// Dataset row offset used when requesting samples for each speaker id.
// Built on a null prototype (and frozen) so that a lookup keyed by a
// caller-supplied id such as "constructor" or "toString" yields undefined
// instead of an inherited Object.prototype member.
// NOTE(review): the offsets are assumed to land inside each speaker's run of
// rows in the split — confirm against the dataset.
const SPEAKER_OFFSETS = Object.freeze(Object.assign(Object.create(null), {
  awb: 0,
  bdl: 1200,
  clb: 2300,
  jmk: 3500,
  ksp: 4700,
  rms: 5900,
  slt: 7100,
}));
16
31
 
17
32
  let transformersModule = null;
18
33
  let sttPipeline = null;
@@ -20,6 +35,7 @@ let ttsPipeline = null;
20
35
  let speakerEmbeddings = null;
21
36
  let sttLoading = false;
22
37
  let ttsLoading = false;
38
+ const voiceEmbeddingsCache = new Map();
23
39
 
24
40
  const TTS_CACHE_MAX = 100;
25
41
  const ttsCache = new Map();
@@ -52,6 +68,43 @@ async function ensureSpeakerEmbeddings() {
52
68
  return speakerEmbeddings;
53
69
  }
54
70
 
71
/**
 * Resolve the speaker embedding to use for a voice id.
 *
 * Lookup order: in-memory cache, then `speaker_<voiceId>.bin` in DATA_DIR,
 * then the remote dataset (matching rows are averaged, written to DATA_DIR
 * and cached). Falls back to the default embeddings when the id is empty,
 * 'default', malformed, unknown, or when no returned row matches the voice.
 *
 * @param {string|null|undefined} voiceId - Voice identifier; may originate
 *   from an HTTP request body, so it is treated as untrusted.
 * @returns {Promise<Float32Array>} The embedding for the voice, or whatever
 *   ensureSpeakerEmbeddings() resolves to for the fallback cases.
 * @throws {Error} When the dataset request returns a non-OK response.
 */
async function loadVoiceEmbedding(voiceId) {
  if (!voiceId || voiceId === 'default') return ensureSpeakerEmbeddings();

  // The id becomes part of a file name below, so only accept plain
  // identifier-like strings; anything else (path separators, dots,
  // non-string JSON values) gets the default voice.
  if (typeof voiceId !== 'string' || !/^[A-Za-z0-9_-]+$/.test(voiceId)) {
    return ensureSpeakerEmbeddings();
  }

  if (voiceEmbeddingsCache.has(voiceId)) return voiceEmbeddingsCache.get(voiceId);

  const EMBEDDING_DIM = 512;
  const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
  if (fs.existsSync(binPath)) {
    const buf = fs.readFileSync(binPath);
    // A truncated or foreign file must not be reinterpreted as floats;
    // ignore it and rebuild from the dataset when possible.
    if (buf.byteLength === EMBEDDING_DIM * Float32Array.BYTES_PER_ELEMENT) {
      // Copy into a fresh ArrayBuffer so the view starts at offset 0.
      const emb = new Float32Array(new Uint8Array(buf).buffer);
      voiceEmbeddingsCache.set(voiceId, emb);
      return emb;
    }
  }

  // Own-property check: a bare `SPEAKER_OFFSETS[voiceId]` lookup would
  // resolve inherited keys such as "constructor" to non-undefined values.
  if (!Object.hasOwn(SPEAKER_OFFSETS, voiceId)) return ensureSpeakerEmbeddings();
  const offset = SPEAKER_OFFSETS[voiceId];

  const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error('Failed to fetch voice embeddings for ' + voiceId);
  const data = await resp.json();
  const rows = Array.isArray(data?.rows) ? data.rows : [];

  const avg = new Float32Array(EMBEDDING_DIM);
  let count = 0;
  for (const item of rows) {
    const filename = item?.row?.filename;
    const xvector = item?.row?.xvector;
    // Skip malformed rows instead of throwing or folding NaN into the mean.
    if (typeof filename !== 'string' || !xvector || xvector.length < EMBEDDING_DIM) continue;
    // The requested window may include rows of a neighbouring speaker;
    // only rows whose filename names this voice contribute.
    const match = filename.match(/cmu_us_(\w+)_arctic/);
    if (match && match[1] === voiceId) {
      for (let i = 0; i < EMBEDDING_DIM; i++) avg[i] += xvector[i];
      count++;
    }
  }
  if (count === 0) return ensureSpeakerEmbeddings();
  for (let i = 0; i < EMBEDDING_DIM; i++) avg[i] /= count;

  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
  fs.writeFileSync(binPath, Buffer.from(avg.buffer));
  voiceEmbeddingsCache.set(voiceId, avg);
  return avg;
}
103
+
104
/**
 * List the voices that can be requested for synthesis.
 *
 * Returns copies of the catalog entries so callers cannot mutate the
 * module-level VOICE_CATALOG through the returned value.
 *
 * @returns {Array<{id: string, name: string, gender: string, accent: string}>}
 */
function getVoices() {
  return VOICE_CATALOG.map((voice) => ({ ...voice }));
}
107
+
55
108
  async function getSTT() {
56
109
  if (sttPipeline) return sttPipeline;
57
110
  if (sttLoading) {
@@ -233,36 +286,38 @@ function cachePut(key, buf) {
233
286
  ttsCache.set(key, buf);
234
287
  }
235
288
 
236
/**
 * Synthesize `text` to a WAV buffer using the given voice.
 *
 * Results are cached in ttsCache under "<voiceId or 'default'>:<text>".
 *
 * @param {string} text - Text to speak.
 * @param {string} [voiceId] - Voice id passed to loadVoiceEmbedding.
 * @returns {Promise<*>} The value produced by encodeWav (cached or fresh).
 */
async function synthesize(text, voiceId) {
  const voiceLabel = voiceId || 'default';
  const key = voiceLabel + ':' + text;
  const hit = ttsCache.get(key);

  if (!hit) {
    const pipeline = await getTTS();
    const speaker = await loadVoiceEmbedding(voiceId);
    const { audio, sampling_rate: rate } = await pipeline(text, { speaker_embeddings: speaker });
    const encoded = encodeWav(audio, rate || SAMPLE_RATE_TTS);
    cachePut(key, encoded);
    return encoded;
  }

  // Delete and re-insert so the entry moves to the end of the Map's
  // insertion order.
  ttsCache.delete(key);
  ttsCache.set(key, hit);
  return hit;
}
250
304
 
251
/**
 * Synthesize `text` sentence by sentence, yielding one WAV buffer per
 * sentence in order.
 *
 * Each sentence is cached in ttsCache under
 * "<voiceId or 'default'>:<sentence>".
 *
 * @param {string} text - Text to speak; split with splitSentences.
 * @param {string} [voiceId] - Voice id passed to loadVoiceEmbedding.
 * @yields {*} The value produced by encodeWav for each sentence.
 */
async function* synthesizeStream(text, voiceId) {
  const parts = splitSentences(text);
  const pipeline = await getTTS();
  const speaker = await loadVoiceEmbedding(voiceId);
  const voiceLabel = voiceId || 'default';

  for (const part of parts) {
    const key = voiceLabel + ':' + part;
    let chunk = ttsCache.get(key);

    if (chunk) {
      // Delete and re-insert so the entry moves to the end of the Map's
      // insertion order.
      ttsCache.delete(key);
      ttsCache.set(key, chunk);
    } else {
      const output = await pipeline(part, { speaker_embeddings: speaker });
      chunk = encodeWav(output.audio, output.sampling_rate || SAMPLE_RATE_TTS);
      cachePut(key, chunk);
    }

    yield chunk;
  }
}
@@ -276,4 +331,4 @@ function getStatus() {
276
331
  };
277
332
  }
278
333
 
279
- export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus };
334
+ export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.168",
3
+ "version": "1.0.169",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -535,16 +535,27 @@ const server = http.createServer(async (req, res) => {
535
535
  return;
536
536
  }
537
537
 
538
+ if (routePath === '/api/voices' && req.method === 'GET') {
539
+ try {
540
+ const { getVoices } = await getSpeech();
541
+ sendJSON(req, res, 200, { ok: true, voices: getVoices() });
542
+ } catch (err) {
543
+ sendJSON(req, res, 200, { ok: true, voices: [] });
544
+ }
545
+ return;
546
+ }
547
+
538
548
  if (routePath === '/api/tts' && req.method === 'POST') {
539
549
  try {
540
550
  const body = await parseBody(req);
541
551
  const text = body.text || '';
552
+ const voiceId = body.voiceId || null;
542
553
  if (!text) {
543
- sendJSON(req, res, 400, { error: 'No text provided' });
554
+ sendJSON(req, res, 400, { error: 'No text provided' });
544
555
  return;
545
556
  }
546
557
  const { synthesize } = await getSpeech();
547
- const wavBuffer = await synthesize(text);
558
+ const wavBuffer = await synthesize(text, voiceId);
548
559
  res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
549
560
  res.end(wavBuffer);
550
561
  } catch (err) {
@@ -558,6 +569,7 @@ const server = http.createServer(async (req, res) => {
558
569
  try {
559
570
  const body = await parseBody(req);
560
571
  const text = body.text || '';
572
+ const voiceId = body.voiceId || null;
561
573
  if (!text) {
562
574
  sendJSON(req, res, 400, { error: 'No text provided' });
563
575
  return;
@@ -569,7 +581,7 @@ const server = http.createServer(async (req, res) => {
569
581
  'X-Content-Type': 'audio/wav-stream',
570
582
  'Cache-Control': 'no-cache'
571
583
  });
572
- for await (const wavChunk of synthesizeStream(text)) {
584
+ for await (const wavChunk of synthesizeStream(text, voiceId)) {
573
585
  const lenBuf = Buffer.alloc(4);
574
586
  lenBuf.writeUInt32BE(wavChunk.length, 0);
575
587
  res.write(lenBuf);
@@ -589,7 +601,7 @@ const server = http.createServer(async (req, res) => {
589
601
  const { getStatus } = await getSpeech();
590
602
  sendJSON(req, res, 200, getStatus());
591
603
  } catch (err) {
592
- sendJSON(req, res, 200, { sttReady: false, ttsReady: false, sttLoading: false, ttsLoading: false });
604
+ sendJSON(req, res, 200, { sttReady: false, ttsReady: false, sttLoading: false, ttsLoading: false });
593
605
  }
594
606
  return;
595
607
  }
package/static/index.html CHANGED
@@ -1056,6 +1056,28 @@
1056
1056
  border-color: var(--color-error);
1057
1057
  }
1058
1058
 
1059
+ .voice-selector-wrapper {
1060
+ display: flex;
1061
+ align-items: center;
1062
+ gap: 0.25rem;
1063
+ }
1064
+
1065
+ .voice-selector {
1066
+ padding: 0.2rem 0.5rem;
1067
+ border: 1px solid var(--color-border);
1068
+ border-radius: 0.375rem;
1069
+ background: var(--color-bg-secondary);
1070
+ color: var(--color-text-primary);
1071
+ font-size: 0.75rem;
1072
+ cursor: pointer;
1073
+ max-width: 160px;
1074
+ }
1075
+
1076
+ .voice-selector:focus {
1077
+ outline: none;
1078
+ border-color: var(--color-primary);
1079
+ }
1080
+
1059
1081
  .voice-empty {
1060
1082
  text-align: center;
1061
1083
  color: var(--color-text-secondary);
@@ -2146,6 +2168,11 @@
2146
2168
  <input type="checkbox" id="voiceTTSToggle" checked>
2147
2169
  <span>Auto-speak responses</span>
2148
2170
  </label>
2171
+ <div class="voice-selector-wrapper">
2172
+ <select class="voice-selector" id="voiceSelector" title="Select voice">
2173
+ <option value="default">Default</option>
2174
+ </select>
2175
+ </div>
2149
2176
  <button class="voice-stop-btn" id="voiceStopSpeaking" title="Stop speaking">Stop</button>
2150
2177
  </div>
2151
2178
  </div>
@@ -14,12 +14,48 @@
14
14
  var TARGET_SAMPLE_RATE = 16000;
15
15
  var spokenChunks = new Set();
16
16
  var isLoadingHistory = false;
17
+ var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
17
18
 
18
19
  function init() {
19
20
  setupTTSToggle();
20
21
  setupUI();
21
22
  setupStreamingListener();
22
23
  setupAgentSelector();
24
+ setupVoiceSelector();
25
+ }
26
+
27
/**
 * Populate the #voiceSelector dropdown from GET /api/voices and keep
 * selectedVoiceId (and the 'voice-selected-id' localStorage entry) in sync
 * with the user's choice.
 *
 * The markup's existing options are kept when the request fails or the
 * server returns no voices.
 */
function setupVoiceSelector() {
  var selector = document.getElementById('voiceSelector');
  if (!selector) return;
  var saved = localStorage.getItem('voice-selected-id');
  if (saved) selectedVoiceId = saved;
  fetch(BASE + '/api/voices')
    .then(function(res) {
      if (!res.ok) throw new Error('Voice list request failed: ' + res.status);
      return res.json();
    })
    .then(function(data) {
      // An empty list is what the server sends when speech support failed
      // to load; clearing the dropdown then would leave it with no options.
      if (!data || !data.ok || !Array.isArray(data.voices) || data.voices.length === 0) return;
      selector.innerHTML = '';
      data.voices.forEach(function(voice) {
        var opt = document.createElement('option');
        opt.value = voice.id;
        var parts = [];
        if (voice.gender) parts.push(voice.gender);
        if (voice.accent) parts.push(voice.accent);
        opt.textContent = parts.length ? voice.name + ' (' + parts.join(', ') + ')' : voice.name;
        selector.appendChild(opt);
      });
      // Use the current selection rather than the value read before the
      // request: the user may have picked a voice while it was in flight.
      // Matching against the list (instead of building a CSS selector from
      // the stored string) also tolerates ids containing quotes.
      var wanted = selectedVoiceId;
      var isOffered = data.voices.some(function(voice) {
        return String(voice.id) === wanted;
      });
      if (isOffered) {
        selector.value = wanted;
      } else {
        // The remembered voice is no longer offered: follow what the
        // dropdown actually shows so requests match the UI.
        selectedVoiceId = selector.value;
      }
    })
    .catch(function(err) {
      // Non-fatal: the dropdown keeps its existing options.
      console.warn('Failed to load voice list', err);
    });
  selector.addEventListener('change', function() {
    selectedVoiceId = selector.value;
    localStorage.setItem('voice-selected-id', selectedVoiceId);
  });
}
24
60
 
25
61
  function syncVoiceSelector() {
@@ -289,7 +325,7 @@
289
325
  fetch(BASE + '/api/tts-stream', {
290
326
  method: 'POST',
291
327
  headers: { 'Content-Type': 'application/json' },
292
- body: JSON.stringify({ text: text })
328
+ body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
293
329
  }).then(function(resp) {
294
330
  if (!resp.ok) throw new Error('TTS failed');
295
331
  var reader = resp.body.getReader();