agentgui 1.0.174 → 1.0.176

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -115,12 +115,18 @@ let speakerEmbeddingPipeline = null;
115
115
  let sttLoading = false;
116
116
  let ttsLoading = false;
117
117
  let speakerEmbeddingLoading = false;
118
+ let ttsLoadError = null;
119
+ let ttsLoadErrorTime = 0;
120
+ let sttLoadError = null;
118
121
  const voiceEmbeddingsCache = new Map();
119
122
  const SAMPLE_RATE_STT = 16000;
120
123
  const SAMPLE_RATE_TTS = 16000;
124
+ const TTS_ERROR_RETRY_MS = 30000;
121
125
 
122
- const TTS_CACHE_MAX = 100;
126
+ const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
127
+ let ttsCacheBytes = 0;
123
128
  const ttsCache = new Map();
129
+ const ttsInflight = new Map();
124
130
 
125
131
  async function loadTransformers() {
126
132
  if (transformersModule) return transformersModule;
@@ -261,8 +267,10 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
261
267
 
262
268
  async function getSTT() {
263
269
  if (sttPipeline) return sttPipeline;
270
+ if (sttLoadError) throw sttLoadError;
264
271
  if (sttLoading) {
265
272
  while (sttLoading) await new Promise(r => setTimeout(r, 100));
273
+ if (sttLoadError) throw sttLoadError;
266
274
  if (!sttPipeline) throw new Error('STT pipeline failed to load');
267
275
  return sttPipeline;
268
276
  }
@@ -278,10 +286,12 @@ async function getSTT() {
278
286
  device: 'cpu',
279
287
  local_files_only: isLocal,
280
288
  });
289
+ sttLoadError = null;
281
290
  return sttPipeline;
282
291
  } catch (err) {
283
292
  sttPipeline = null;
284
- throw new Error('STT model load failed: ' + err.message);
293
+ sttLoadError = new Error('STT model load failed: ' + err.message);
294
+ throw sttLoadError;
285
295
  } finally {
286
296
  sttLoading = false;
287
297
  }
@@ -289,8 +299,14 @@ async function getSTT() {
289
299
 
290
300
  async function getTTS() {
291
301
  if (ttsPipeline) return ttsPipeline;
302
+ if (ttsLoadError) {
303
+ if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
304
+ ttsLoadError = null;
305
+ ttsLoadErrorTime = 0;
306
+ }
292
307
  if (ttsLoading) {
293
308
  while (ttsLoading) await new Promise(r => setTimeout(r, 100));
309
+ if (ttsLoadError) throw ttsLoadError;
294
310
  if (!ttsPipeline) throw new Error('TTS pipeline failed to load');
295
311
  return ttsPipeline;
296
312
  }
@@ -303,10 +319,14 @@ async function getTTS() {
303
319
  dtype: 'fp32',
304
320
  });
305
321
  await ensureSpeakerEmbeddings();
322
+ ttsLoadError = null;
323
+ ttsLoadErrorTime = 0;
306
324
  return ttsPipeline;
307
325
  } catch (err) {
308
326
  ttsPipeline = null;
309
- throw new Error('TTS model load failed: ' + err.message);
327
+ ttsLoadError = new Error('TTS model load failed: ' + err.message);
328
+ ttsLoadErrorTime = Date.now();
329
+ throw ttsLoadError;
310
330
  } finally {
311
331
  ttsLoading = false;
312
332
  }
@@ -433,11 +453,17 @@ function splitSentences(text) {
433
453
  }
434
454
 
435
455
  function cachePut(key, buf) {
436
- if (ttsCache.size >= TTS_CACHE_MAX) {
456
+ if (ttsCache.has(key)) {
457
+ ttsCacheBytes -= ttsCache.get(key).length;
458
+ ttsCache.delete(key);
459
+ }
460
+ while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
437
461
  const oldest = ttsCache.keys().next().value;
462
+ ttsCacheBytes -= ttsCache.get(oldest).length;
438
463
  ttsCache.delete(oldest);
439
464
  }
440
465
  ttsCache.set(key, buf);
466
+ ttsCacheBytes += buf.length;
441
467
  }
442
468
 
443
469
  async function synthesize(text, voiceId) {
@@ -448,12 +474,18 @@ async function synthesize(text, voiceId) {
448
474
  ttsCache.set(cacheKey, cached);
449
475
  return cached;
450
476
  }
451
- const tts = await getTTS();
452
- const embeddings = await loadVoiceEmbedding(voiceId);
453
- const result = await tts(text, { speaker_embeddings: embeddings });
454
- const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
455
- cachePut(cacheKey, wav);
456
- return wav;
477
+ const inflight = ttsInflight.get(cacheKey);
478
+ if (inflight) return inflight;
479
+ const promise = (async () => {
480
+ const tts = await getTTS();
481
+ const embeddings = await loadVoiceEmbedding(voiceId);
482
+ const result = await tts(text, { speaker_embeddings: embeddings });
483
+ const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
484
+ cachePut(cacheKey, wav);
485
+ return wav;
486
+ })();
487
+ ttsInflight.set(cacheKey, promise);
488
+ try { return await promise; } finally { ttsInflight.delete(cacheKey); }
457
489
  }
458
490
 
459
491
  async function* synthesizeStream(text, voiceId) {
@@ -482,7 +514,23 @@ function getStatus() {
482
514
  ttsReady: !!ttsPipeline,
483
515
  sttLoading,
484
516
  ttsLoading,
517
+ sttError: sttLoadError ? sttLoadError.message : null,
518
+ ttsError: ttsLoadError ? ttsLoadError.message : null,
485
519
  };
486
520
  }
487
521
 
488
- export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
522
+ function preloadTTS() {
523
+ getTTS().catch(err => console.error('[TTS] Preload failed:', err.message));
524
+ }
525
+
526
+ function ttsCacheKey(text, voiceId) {
527
+ return (voiceId || 'default') + ':' + text;
528
+ }
529
+
530
+ function ttsCacheGet(key) {
531
+ const cached = ttsCache.get(key);
532
+ if (cached) { ttsCache.delete(key); ttsCache.set(key, cached); }
533
+ return cached || null;
534
+ }
535
+
536
+ export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.174",
3
+ "version": "1.0.176",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -15,6 +15,49 @@ async function getSpeech() {
15
15
  return speechModule;
16
16
  }
17
17
 
18
+ function eagerTTS(text, conversationId, sessionId) {
19
+ getSpeech().then(speech => {
20
+ const status = speech.getStatus();
21
+ if (!status.ttsReady || status.ttsError) return;
22
+ const voices = new Set();
23
+ for (const ws of syncClients) {
24
+ const vid = ws.ttsVoiceId || 'default';
25
+ const convKey = `conv-${conversationId}`;
26
+ if (ws.subscriptions && (ws.subscriptions.has(sessionId) || ws.subscriptions.has(convKey))) {
27
+ voices.add(vid);
28
+ }
29
+ }
30
+ if (voices.size === 0) return;
31
+ const sentences = speech.splitSentences(text);
32
+ for (const vid of voices) {
33
+ for (const sentence of sentences) {
34
+ const cacheKey = speech.ttsCacheKey(sentence, vid);
35
+ const cached = speech.ttsCacheGet(cacheKey);
36
+ if (cached) {
37
+ pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
38
+ continue;
39
+ }
40
+ speech.synthesize(sentence, vid).then(wav => {
41
+ pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
42
+ }).catch(() => {});
43
+ }
44
+ }
45
+ }).catch(() => {});
46
+ }
47
+
48
+ function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
49
+ const b64 = wav.toString('base64');
50
+ broadcastSync({
51
+ type: 'tts_audio',
52
+ cacheKey,
53
+ audio: b64,
54
+ voiceId,
55
+ conversationId,
56
+ sessionId,
57
+ timestamp: Date.now()
58
+ });
59
+ }
60
+
18
61
  const require = createRequire(import.meta.url);
19
62
  const express = require('express');
20
63
  const Busboy = require('busboy');
@@ -554,13 +597,20 @@ const server = http.createServer(async (req, res) => {
554
597
  sendJSON(req, res, 400, { error: 'No text provided' });
555
598
  return;
556
599
  }
557
- const { synthesize } = await getSpeech();
558
- const wavBuffer = await synthesize(text, voiceId);
600
+ const speech = await getSpeech();
601
+ const status = speech.getStatus();
602
+ if (status.ttsError) {
603
+ sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
604
+ return;
605
+ }
606
+ const wavBuffer = await speech.synthesize(text, voiceId);
559
607
  res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
560
608
  res.end(wavBuffer);
561
609
  } catch (err) {
562
610
  debugLog('[TTS] Error: ' + err.message);
563
- if (!res.headersSent) sendJSON(req, res, 500, { error: err.message || 'TTS failed' });
611
+ const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
612
+ const statusCode = isModelError ? 503 : 500;
613
+ if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS failed', retryable: !isModelError });
564
614
  }
565
615
  return;
566
616
  }
@@ -574,14 +624,19 @@ const server = http.createServer(async (req, res) => {
574
624
  sendJSON(req, res, 400, { error: 'No text provided' });
575
625
  return;
576
626
  }
577
- const { synthesizeStream } = await getSpeech();
627
+ const speech = await getSpeech();
628
+ const status = speech.getStatus();
629
+ if (status.ttsError) {
630
+ sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
631
+ return;
632
+ }
578
633
  res.writeHead(200, {
579
634
  'Content-Type': 'application/octet-stream',
580
635
  'Transfer-Encoding': 'chunked',
581
636
  'X-Content-Type': 'audio/wav-stream',
582
637
  'Cache-Control': 'no-cache'
583
638
  });
584
- for await (const wavChunk of synthesizeStream(text, voiceId)) {
639
+ for await (const wavChunk of speech.synthesizeStream(text, voiceId)) {
585
640
  const lenBuf = Buffer.alloc(4);
586
641
  lenBuf.writeUInt32BE(wavChunk.length, 0);
587
642
  res.write(lenBuf);
@@ -590,7 +645,9 @@ const server = http.createServer(async (req, res) => {
590
645
  res.end();
591
646
  } catch (err) {
592
647
  debugLog('[TTS-STREAM] Error: ' + err.message);
593
- if (!res.headersSent) sendJSON(req, res, 500, { error: err.message || 'TTS stream failed' });
648
+ const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
649
+ const statusCode = isModelError ? 503 : 500;
650
+ if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS stream failed', retryable: !isModelError });
594
651
  else res.end();
595
652
  }
596
653
  return;
@@ -838,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
838
895
  blockIndex: allBlocks.length - 1,
839
896
  timestamp: Date.now()
840
897
  });
898
+
899
+ if (block.type === 'text' && block.text) {
900
+ eagerTTS(block.text, conversationId, sessionId);
901
+ }
841
902
  }
842
903
  } else if (parsed.type === 'user' && parsed.message?.content) {
843
904
  for (const block of parsed.message.content) {
@@ -886,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
886
947
  timestamp: Date.now()
887
948
  });
888
949
 
950
+ if (parsed.result) {
951
+ const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
952
+ if (resultText) eagerTTS(resultText, conversationId, sessionId);
953
+ }
954
+
889
955
  if (parsed.result && allBlocks.length === 0) {
890
956
  allBlocks.push({ type: 'text', text: String(parsed.result) });
891
957
  }
@@ -1114,6 +1180,8 @@ wss.on('connection', (ws, req) => {
1114
1180
  subscriptions: Array.from(ws.subscriptions),
1115
1181
  timestamp: Date.now()
1116
1182
  }));
1183
+ } else if (data.type === 'set_voice') {
1184
+ ws.ttsVoiceId = data.voiceId || 'default';
1117
1185
  } else if (data.type === 'ping') {
1118
1186
  ws.send(JSON.stringify({
1119
1187
  type: 'pong',
@@ -15,6 +15,8 @@
15
15
  var spokenChunks = new Set();
16
16
  var isLoadingHistory = false;
17
17
  var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
18
+ var ttsAudioCache = new Map();
19
+ var TTS_CLIENT_CACHE_MAX = 50;
18
20
 
19
21
  function init() {
20
22
  setupTTSToggle();
@@ -69,6 +71,7 @@
69
71
  selector.addEventListener('change', function() {
70
72
  selectedVoiceId = selector.value;
71
73
  localStorage.setItem('voice-selected-id', selectedVoiceId);
74
+ sendVoiceToServer();
72
75
  });
73
76
  }
74
77
 
@@ -295,9 +298,28 @@
295
298
  processQueue();
296
299
  }
297
300
 
301
+ function cacheTTSAudio(cacheKey, b64) {
302
+ if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
303
+ var oldest = ttsAudioCache.keys().next().value;
304
+ ttsAudioCache.delete(oldest);
305
+ }
306
+ var binary = atob(b64);
307
+ var bytes = new Uint8Array(binary.length);
308
+ for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
309
+ ttsAudioCache.set(cacheKey, new Blob([bytes], { type: 'audio/wav' }));
310
+ }
311
+
312
+ function getCachedTTSBlob(text) {
313
+ var key = selectedVoiceId + ':' + text;
314
+ return ttsAudioCache.get(key) || null;
315
+ }
316
+
298
317
  var audioChunkQueue = [];
299
318
  var isPlayingChunk = false;
300
319
  var streamDone = false;
320
+ var ttsConsecutiveFailures = 0;
321
+ var TTS_MAX_FAILURES = 3;
322
+ var ttsDisabledUntilReset = false;
301
323
 
302
324
  function playNextChunk() {
303
325
  if (audioChunkQueue.length === 0) {
@@ -331,19 +353,50 @@
331
353
 
332
354
  function processQueue() {
333
355
  if (isSpeaking || speechQueue.length === 0) return;
356
+ if (ttsDisabledUntilReset) {
357
+ speechQueue = [];
358
+ return;
359
+ }
334
360
  isSpeaking = true;
335
361
  streamDone = false;
336
362
  var text = speechQueue.shift();
337
363
  audioChunkQueue = [];
338
364
  isPlayingChunk = false;
339
-
365
+
366
+ var cachedBlob = getCachedTTSBlob(text);
367
+ if (cachedBlob) {
368
+ ttsConsecutiveFailures = 0;
369
+ audioChunkQueue.push(cachedBlob);
370
+ streamDone = true;
371
+ if (!isPlayingChunk) playNextChunk();
372
+ return;
373
+ }
374
+
375
+ function onTtsSuccess() {
376
+ ttsConsecutiveFailures = 0;
377
+ }
378
+
379
+ function onTtsFailed() {
380
+ ttsConsecutiveFailures++;
381
+ if (ttsConsecutiveFailures >= TTS_MAX_FAILURES) {
382
+ console.warn('[Voice] TTS failed ' + ttsConsecutiveFailures + ' times consecutively, disabling until reset');
383
+ ttsDisabledUntilReset = true;
384
+ speechQueue = [];
385
+ }
386
+ streamDone = true;
387
+ isSpeaking = false;
388
+ if (!ttsDisabledUntilReset) {
389
+ processQueue();
390
+ }
391
+ }
392
+
340
393
  function tryStreaming() {
341
394
  fetch(BASE + '/api/tts-stream', {
342
395
  method: 'POST',
343
396
  headers: { 'Content-Type': 'application/json' },
344
397
  body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
345
398
  }).then(function(resp) {
346
- if (!resp.ok) throw new Error('TTS stream failed');
399
+ if (!resp.ok) throw new Error('TTS stream failed: ' + resp.status);
347
400
  var reader = resp.body.getReader();
348
401
  var buffer = new Uint8Array(0);
349
402
 
@@ -357,6 +410,7 @@
357
410
  function pump() {
358
411
  return reader.read().then(function(result) {
359
412
  if (result.done) {
413
+ onTtsSuccess();
360
414
  streamDone = true;
361
415
  if (!isPlayingChunk && audioChunkQueue.length === 0) {
362
416
  isSpeaking = false;
@@ -384,16 +438,17 @@
384
438
  tryNonStreaming(text);
385
439
  });
386
440
  }
387
-
441
+
388
442
  function tryNonStreaming(txt) {
389
443
  fetch(BASE + '/api/tts', {
390
444
  method: 'POST',
391
445
  headers: { 'Content-Type': 'application/json' },
392
446
  body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
393
447
  }).then(function(resp) {
394
- if (!resp.ok) throw new Error('TTS failed');
448
+ if (!resp.ok) throw new Error('TTS failed: ' + resp.status);
395
449
  return resp.arrayBuffer();
396
450
  }).then(function(buf) {
451
+ onTtsSuccess();
397
452
  var blob = new Blob([buf], { type: 'audio/wav' });
398
453
  audioChunkQueue.push(blob);
399
454
  if (!isPlayingChunk) playNextChunk();
@@ -401,12 +456,10 @@
401
456
  isSpeaking = false;
402
457
  processQueue();
403
458
  }).catch(function() {
404
- streamDone = true;
405
- isSpeaking = false;
406
- processQueue();
459
+ onTtsFailed();
407
460
  });
408
461
  }
409
-
462
+
410
463
  tryStreaming();
411
464
  }
412
465
 
@@ -415,6 +468,8 @@
415
468
  audioChunkQueue = [];
416
469
  isPlayingChunk = false;
417
470
  isSpeaking = false;
471
+ ttsConsecutiveFailures = 0;
472
+ ttsDisabledUntilReset = false;
418
473
  if (currentAudio) {
419
474
  currentAudio.pause();
420
475
  currentAudio = null;
@@ -505,11 +560,23 @@
505
560
  }
506
561
  }
507
562
 
563
+ function sendVoiceToServer() {
564
+ if (typeof agentGUIClient !== 'undefined' && agentGUIClient && agentGUIClient.wsManager && agentGUIClient.wsManager.isConnected) {
565
+ agentGUIClient.wsManager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
566
+ }
567
+ }
568
+
508
569
  function setupStreamingListener() {
509
570
  window.addEventListener('ws-message', function(e) {
510
- if (!voiceActive) return;
511
571
  var data = e.detail;
512
572
  if (!data) return;
573
+ if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
574
+ cacheTTSAudio(data.cacheKey, data.audio);
575
+ }
576
+ if (data.type === 'sync_connected') {
577
+ sendVoiceToServer();
578
+ }
579
+ if (!voiceActive) return;
513
580
  if (data.type === 'streaming_progress' && data.block) {
514
581
  handleVoiceBlock(data.block, true);
515
582
  }