agentgui 1.0.175 → 1.0.177

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -8,7 +8,7 @@ const require = createRequire(import.meta.url);
8
8
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
9
9
  const ROOT = path.dirname(__dirname);
10
10
  const DATA_DIR = path.join(ROOT, 'data');
11
- const AUDIO_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.flac', '.m4a'];
11
+ const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
12
12
 
13
13
  function getVoiceDirs() {
14
14
  const dirs = [];
@@ -106,6 +106,7 @@ const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_e
106
106
  const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
107
107
  const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
108
108
  const SAMPLES_TO_AVERAGE = 30;
109
+ const DEFAULT_EMBEDDING_B64 = 'xhibvao34LylqXQ8cNg7Pd1cCTw0keG8awRRvRqje7070G48AtOgPMFbnr1oeKC9I4ZuPZzqGT1DjWs8y3iMPB/SZLzdl7E6b9QaPKSpHTwYuh49FrMlO9YnebwmTzu9/3CPvQuvCbxsSWC9Sb2bO+tvXj0Cjpo8mTMxu/FDrjzQ4x09gyxCvUn6STxjAo+9vtXdPJtsYT3iMna9dQ+EvfQ72zuvxk69GAonPU8KdjsNPAU96e/8veN7lrwgyzk8HA5vvYE1Rz3gpZ484MsLPUKkxTzM54U81ECwvcbFHzv8gT08T6/7POCqBT2fv5E8fvsXPfZiJrzEhme8dg8kPR+mKTutQOU822maPMlMDb1x/IS93+6KvdyThzwhry880JBqvRVOhjzZods8SD08PLpObTn/0wk9BnAwvWiiz72EWgS9RpcjvV4VR73ZqJW9PoUFvfZYYb1h26S98levPHZbTjxH6qU9RPfoPHmJu70mSNo8ztJmvWgMBj0IX8i7TE3lPINY2DzoEma9wMObvTwKCT3pObe8t9KEvaWixjzc5fI8hj6MvaKv4Txl4h09d2a+PHCvTDxorJ69ekRrPeoPjz1JPfI7rUH7PIaJgz0O1YW9JLumvCxDnr1bmMm8GbIFPBX1oL3bRN08oYcXPEaFfL13Vxo9EKfbvTFcOTxdogA9XS3kPEWJoLvChc887BEgPMOvUT2Ba3s8tUDBvYPMZ72dNRG80AuTvQt7d72foTU9qO20O4INEb1u1iE9ibqJvZYaOj2nbYc8lsodvS5HPD1lCqK9EkBYPR0I/rySMIK9plcpPdpJEz2E/DY88d2DPIRTf71ZQZS9b1v5PPseFT2YiJu8OiOwPC8Wnr2QW4Q8n+o7PPQ8PD0QqAg9Vk7APDT6+jzreP88KH6GvTvAKD0AYiO9qOavvORySjvQ6y+9epb5PFvZijxYzlK9BwjUPK0HXL3acWc7dmwmPc/kXb2VBg68MGYRPR5q9zzmFiS9al2IvdVTfDwJOa88SzVkvVlrPD0WvJQ8Vm76PMUAQDzNgyK8QQZVPdMoibxrCBc9BgKTPDLoV70Iu6g7k+kBPZ3lhTy6sOU8OGkVvFaLRD14oqa9a4UVO4z4Gr1eYlO9u5BgPWS1ZL3kFPE8JGEwPQFTl71tHso8g+ElPd9Rgr2XCtc8axudvWC2IL09wSg9E7ZzPT6uBz2XmK09A1HcPJK8rTxK8Zu8GuMTPTuINTyRAhS9OSqDPDralLza3q48EgtePPf797rIWKo9NtkrvbO34zxKZ6m97l0GPQYVlL2igDA9UyfEPJhZyjx4/2Q8ggBpPYcAkzzIVu08ykYNPESdZr3uqmq8fS/zPKUYvzv67x49cUkqvXDlJj1us/88gASuvcs6G7sUshY9SgWiOqu4OD1WQ7k7/sLoPKuLJjwZYFm9an+zPOnfNry9Jh49/XX3vN1sc731fBM9TnBDPHzOAD26/dS9mg57vY+TA7wVJCw9pPb1PE30l7019la9UyRTPXFqljyRDnw9eZ6nvU03kTtS9907L+wavIBtab3k6cs8KVr6vPZ5zTxy+Zs8VuopPQTTUj0tNxg96qZyPY69lTzQEp48BXGJvVopBDvskUg9G2dOPaJMXDylJZU8FxcMvBQkNzzjPKs8FYUpvepYYj1AQsK9upQsvS4037xDcO48GhmIvWb1iT1gJhy9TG7iPHKAG70cuCQ8F1ZwPYqtj7300T89rTujPbXy2r3/cK69FtBNvY3iMT0DoqI4KK0QPYKEqr2Z6RU9ni0UPUNDLb3BsCi8+GttvZYp9zwUaHe9TqrFPOnlH7yCXJC9U8vDu8u2MjxA8xs9SAGxvPpphr29y2e9y2AYvTv+Eb1Elus9DdpGPSfmNL39Ggu85RVXPZbLh70Jvna7XkLGvR230DtGjpu7Ih8HPJKnIz1o35i8x5NVvXwFNDzs/ZM8+kw8PfFJSTwdlJA9ZJ+tvaoVZ7zTvVi8p6wluwh/IT0Kmg088o1rPRhiwjxpWIe9a+LuvYuYtjwAxE09WkPJPBuFh73UotY820JjvXpnQD3fJ/w8TM3JPOz0pTnbTim9tpe6PBHzJT1HEb66SkAKPasLgr1l/Mm8IOGgvM2pZbzwd4a9znOIO4d4Bb1DW5I8EZXzOxvBKDqKpHG9UwCHvd/Epb2cDRi9V1ztPNPBNTrLXHa8FdGHPPo+hb3DnJ08G+SvvVPQBL6zzrC8Omksvc+eIjyvGfU8eG9nvaVkdL1HBvs8eaeGPfcbVD1/Pfw8+TUFvU6aTL2JN5W8HXDNvGKFEj1i+T09UiCIOySbDD2x2/y7VTmnvTe3gb0ZhJw8WrKIuU5RGT09mKU7eFGtPFpr6DzaoyI9hsItPKU+YzuQlXK8f9IePSmUxTwXdoo9W6FJPV2kLzwkU1o8fGnfPInxg70rEVe9H7sNPWJDbbxSqLY8cQAOPUdpAD2YknK9ykFXPeVALz1mq3W96kO/PLERzjyXIRC7jxsXPRnLzjyUEoU7gTKvu+stlb1D1g45IH+2u5sOIj0wXPA8yTqDvT6mV72NsFq8ExeuPJlGyDxvjgk9lJeJvWSF8DwFvaW7oZ9GvHq1Rr1FJsk83zxVvfyGqTz7thG9fslpPF5RPb1Q6BQ9iXGovTeDeb2cmic8oBsRPYeni72TPcI8EKcPvfCJUbyQJqW9fCAYPRk8qT2q6rk8mEw2PfDeXL0=';
109
110
 
110
111
  let transformersModule = null;
111
112
  let sttPipeline = null;
@@ -116,13 +117,17 @@ let sttLoading = false;
116
117
  let ttsLoading = false;
117
118
  let speakerEmbeddingLoading = false;
118
119
  let ttsLoadError = null;
120
+ let ttsLoadErrorTime = 0;
119
121
  let sttLoadError = null;
120
122
  const voiceEmbeddingsCache = new Map();
121
123
  const SAMPLE_RATE_STT = 16000;
122
124
  const SAMPLE_RATE_TTS = 16000;
125
+ const TTS_ERROR_RETRY_MS = 30000;
123
126
 
124
- const TTS_CACHE_MAX = 100;
127
+ const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
128
+ let ttsCacheBytes = 0;
125
129
  const ttsCache = new Map();
130
+ const ttsInflight = new Map();
126
131
 
127
132
  async function loadTransformers() {
128
133
  if (transformersModule) return transformersModule;
@@ -139,16 +144,36 @@ function whisperModelPath() {
139
144
  return 'onnx-community/whisper-base';
140
145
  }
141
146
 
147
+ function defaultEmbedding() {
148
+ const buf = Buffer.from(DEFAULT_EMBEDDING_B64, 'base64');
149
+ return new Float32Array(new Uint8Array(buf).buffer);
150
+ }
151
+
142
152
  async function ensureSpeakerEmbeddings() {
143
153
  if (speakerEmbeddings) return speakerEmbeddings;
144
154
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
145
- if (!fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
146
- const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
147
- if (!resp.ok) throw new Error('Failed to download speaker embeddings');
148
- fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, Buffer.from(await resp.arrayBuffer()));
155
+ if (fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
156
+ const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
157
+ if (buf.length === 2048) {
158
+ speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
159
+ return speakerEmbeddings;
160
+ }
149
161
  }
150
- const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
151
- speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
162
+ try {
163
+ const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
164
+ if (resp.ok) {
165
+ const data = Buffer.from(await resp.arrayBuffer());
166
+ if (data.length >= 2048) {
167
+ fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, data);
168
+ speakerEmbeddings = new Float32Array(new Uint8Array(data).buffer);
169
+ return speakerEmbeddings;
170
+ }
171
+ }
172
+ } catch (_) {}
173
+ console.log('[TTS] Using bundled default speaker embedding');
174
+ speakerEmbeddings = defaultEmbedding();
175
+ const buf = Buffer.from(speakerEmbeddings.buffer);
176
+ fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, buf);
152
177
  return speakerEmbeddings;
153
178
  }
154
179
 
@@ -167,25 +192,30 @@ async function loadVoiceEmbedding(voiceId) {
167
192
  }
168
193
  const offset = SPEAKER_OFFSETS[voiceId];
169
194
  if (offset === undefined) return ensureSpeakerEmbeddings();
170
- const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
171
- const resp = await fetch(url);
172
- if (!resp.ok) throw new Error('Failed to fetch voice embeddings for ' + voiceId);
173
- const data = await resp.json();
174
- const avg = new Float32Array(512);
175
- let count = 0;
176
- for (const item of data.rows) {
177
- const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
178
- if (match && match[1] === voiceId) {
179
- for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
180
- count++;
195
+ try {
196
+ const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
197
+ const resp = await fetch(url);
198
+ if (!resp.ok) throw new Error('HTTP ' + resp.status);
199
+ const data = await resp.json();
200
+ const avg = new Float32Array(512);
201
+ let count = 0;
202
+ for (const item of data.rows) {
203
+ const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
204
+ if (match && match[1] === voiceId) {
205
+ for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
206
+ count++;
207
+ }
181
208
  }
209
+ if (count === 0) return ensureSpeakerEmbeddings();
210
+ for (let i = 0; i < 512; i++) avg[i] /= count;
211
+ if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
212
+ fs.writeFileSync(binPath, Buffer.from(avg.buffer));
213
+ voiceEmbeddingsCache.set(voiceId, avg);
214
+ return avg;
215
+ } catch (err) {
216
+ console.error('[TTS] Failed to fetch voice embedding for ' + voiceId + ':', err.message);
217
+ return ensureSpeakerEmbeddings();
182
218
  }
183
- if (count === 0) return ensureSpeakerEmbeddings();
184
- for (let i = 0; i < 512; i++) avg[i] /= count;
185
- if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
186
- fs.writeFileSync(binPath, Buffer.from(avg.buffer));
187
- voiceEmbeddingsCache.set(voiceId, avg);
188
- return avg;
189
219
  }
190
220
 
191
221
  async function getSpeakerEmbeddingPipeline() {
@@ -230,6 +260,12 @@ async function decodeAudioFile(filePath) {
230
260
  const decoded = decodeWavToFloat32(buf);
231
261
  return resampleTo16k(decoded.audio, decoded.sampleRate);
232
262
  }
263
+ const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
264
+ if (fs.existsSync(wavPath)) {
265
+ const wavBuf = fs.readFileSync(wavPath);
266
+ const decoded = decodeWavToFloat32(wavBuf);
267
+ return resampleTo16k(decoded.audio, decoded.sampleRate);
268
+ }
233
269
  const decode = (await import('audio-decode')).default;
234
270
  const audioBuffer = await decode(buf);
235
271
  const mono = audioBuffer.getChannelData(0);
@@ -242,23 +278,28 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
242
278
  console.error('[VOICES] Custom voice file not found for:', voiceId);
243
279
  return ensureSpeakerEmbeddings();
244
280
  }
245
- console.log('[VOICES] Generating embedding from:', audioFile);
246
- const audio = await decodeAudioFile(audioFile);
247
- if (audio.length < SAMPLE_RATE_STT * 0.5) {
248
- throw new Error('Audio too short for embedding extraction (need at least 0.5 seconds)');
249
- }
250
- const pipe = await getSpeakerEmbeddingPipeline();
251
- const output = await pipe(audio, { pooling: 'mean', normalize: true });
252
- const embedding = new Float32Array(512);
253
- for (let i = 0; i < Math.min(512, output.data.length); i++) {
254
- embedding[i] = output.data[i];
281
+ try {
282
+ console.log('[VOICES] Generating embedding from:', audioFile);
283
+ const audio = await decodeAudioFile(audioFile);
284
+ if (audio.length < SAMPLE_RATE_STT * 0.5) {
285
+ throw new Error('Audio too short for embedding extraction');
286
+ }
287
+ const pipe = await getSpeakerEmbeddingPipeline();
288
+ const output = await pipe(audio, { pooling: 'mean', normalize: true });
289
+ const embedding = new Float32Array(512);
290
+ for (let i = 0; i < Math.min(512, output.data.length); i++) {
291
+ embedding[i] = output.data[i];
292
+ }
293
+ if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
294
+ const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
295
+ fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
296
+ voiceEmbeddingsCache.set(voiceId, embedding);
297
+ console.log('[VOICES] Generated embedding for custom voice:', voiceId);
298
+ return embedding;
299
+ } catch (err) {
300
+ console.error('[VOICES] Failed to generate embedding for', voiceId + ':', err.message);
301
+ return ensureSpeakerEmbeddings();
255
302
  }
256
- if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
257
- const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
258
- fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
259
- voiceEmbeddingsCache.set(voiceId, embedding);
260
- console.log('[VOICES] Generated embedding for custom voice:', voiceId);
261
- return embedding;
262
303
  }
263
304
 
264
305
  async function getSTT() {
@@ -295,7 +336,11 @@ async function getSTT() {
295
336
 
296
337
  async function getTTS() {
297
338
  if (ttsPipeline) return ttsPipeline;
298
- if (ttsLoadError) throw ttsLoadError;
339
+ if (ttsLoadError) {
340
+ if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
341
+ ttsLoadError = null;
342
+ ttsLoadErrorTime = 0;
343
+ }
299
344
  if (ttsLoading) {
300
345
  while (ttsLoading) await new Promise(r => setTimeout(r, 100));
301
346
  if (ttsLoadError) throw ttsLoadError;
@@ -312,10 +357,12 @@ async function getTTS() {
312
357
  });
313
358
  await ensureSpeakerEmbeddings();
314
359
  ttsLoadError = null;
360
+ ttsLoadErrorTime = 0;
315
361
  return ttsPipeline;
316
362
  } catch (err) {
317
363
  ttsPipeline = null;
318
364
  ttsLoadError = new Error('TTS model load failed: ' + err.message);
365
+ ttsLoadErrorTime = Date.now();
319
366
  throw ttsLoadError;
320
367
  } finally {
321
368
  ttsLoading = false;
@@ -443,11 +490,17 @@ function splitSentences(text) {
443
490
  }
444
491
 
445
492
  function cachePut(key, buf) {
446
- if (ttsCache.size >= TTS_CACHE_MAX) {
493
+ if (ttsCache.has(key)) {
494
+ ttsCacheBytes -= ttsCache.get(key).length;
495
+ ttsCache.delete(key);
496
+ }
497
+ while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
447
498
  const oldest = ttsCache.keys().next().value;
499
+ ttsCacheBytes -= ttsCache.get(oldest).length;
448
500
  ttsCache.delete(oldest);
449
501
  }
450
502
  ttsCache.set(key, buf);
503
+ ttsCacheBytes += buf.length;
451
504
  }
452
505
 
453
506
  async function synthesize(text, voiceId) {
@@ -458,12 +511,18 @@ async function synthesize(text, voiceId) {
458
511
  ttsCache.set(cacheKey, cached);
459
512
  return cached;
460
513
  }
461
- const tts = await getTTS();
462
- const embeddings = await loadVoiceEmbedding(voiceId);
463
- const result = await tts(text, { speaker_embeddings: embeddings });
464
- const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
465
- cachePut(cacheKey, wav);
466
- return wav;
514
+ const inflight = ttsInflight.get(cacheKey);
515
+ if (inflight) return inflight;
516
+ const promise = (async () => {
517
+ const tts = await getTTS();
518
+ const embeddings = await loadVoiceEmbedding(voiceId);
519
+ const result = await tts(text, { speaker_embeddings: embeddings });
520
+ const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
521
+ cachePut(cacheKey, wav);
522
+ return wav;
523
+ })();
524
+ ttsInflight.set(cacheKey, promise);
525
+ try { return await promise; } finally { ttsInflight.delete(cacheKey); }
467
526
  }
468
527
 
469
528
  async function* synthesizeStream(text, voiceId) {
@@ -487,14 +546,29 @@ async function* synthesizeStream(text, voiceId) {
487
546
  }
488
547
 
489
548
  function getStatus() {
549
+ const ttsRetryExpired = ttsLoadError && (Date.now() - ttsLoadErrorTime >= TTS_ERROR_RETRY_MS);
490
550
  return {
491
551
  sttReady: !!sttPipeline,
492
552
  ttsReady: !!ttsPipeline,
493
553
  sttLoading,
494
554
  ttsLoading,
495
555
  sttError: sttLoadError ? sttLoadError.message : null,
496
- ttsError: ttsLoadError ? ttsLoadError.message : null,
556
+ ttsError: (ttsLoadError && !ttsRetryExpired) ? ttsLoadError.message : null,
497
557
  };
498
558
  }
499
559
 
500
- export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
560
+ function preloadTTS() {
561
+ getTTS().catch(err => console.error('[TTS] Preload failed:', err.message));
562
+ }
563
+
564
+ function ttsCacheKey(text, voiceId) {
565
+ return (voiceId || 'default') + ':' + text;
566
+ }
567
+
568
+ function ttsCacheGet(key) {
569
+ const cached = ttsCache.get(key);
570
+ if (cached) { ttsCache.delete(key); ttsCache.set(key, cached); }
571
+ return cached || null;
572
+ }
573
+
574
+ export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.175",
3
+ "version": "1.0.177",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -15,6 +15,49 @@ async function getSpeech() {
15
15
  return speechModule;
16
16
  }
17
17
 
18
+ function eagerTTS(text, conversationId, sessionId) {
19
+ getSpeech().then(speech => {
20
+ const status = speech.getStatus();
21
+ if (!status.ttsReady || status.ttsError) return;
22
+ const voices = new Set();
23
+ for (const ws of syncClients) {
24
+ const vid = ws.ttsVoiceId || 'default';
25
+ const convKey = `conv-${conversationId}`;
26
+ if (ws.subscriptions && (ws.subscriptions.has(sessionId) || ws.subscriptions.has(convKey))) {
27
+ voices.add(vid);
28
+ }
29
+ }
30
+ if (voices.size === 0) return;
31
+ const sentences = speech.splitSentences(text);
32
+ for (const vid of voices) {
33
+ for (const sentence of sentences) {
34
+ const cacheKey = speech.ttsCacheKey(sentence, vid);
35
+ const cached = speech.ttsCacheGet(cacheKey);
36
+ if (cached) {
37
+ pushTTSAudio(cacheKey, cached, conversationId, sessionId, vid);
38
+ continue;
39
+ }
40
+ speech.synthesize(sentence, vid).then(wav => {
41
+ pushTTSAudio(cacheKey, wav, conversationId, sessionId, vid);
42
+ }).catch(() => {});
43
+ }
44
+ }
45
+ }).catch(() => {});
46
+ }
47
+
48
+ function pushTTSAudio(cacheKey, wav, conversationId, sessionId, voiceId) {
49
+ const b64 = wav.toString('base64');
50
+ broadcastSync({
51
+ type: 'tts_audio',
52
+ cacheKey,
53
+ audio: b64,
54
+ voiceId,
55
+ conversationId,
56
+ sessionId,
57
+ timestamp: Date.now()
58
+ });
59
+ }
60
+
18
61
  const require = createRequire(import.meta.url);
19
62
  const express = require('express');
20
63
  const Busboy = require('busboy');
@@ -852,6 +895,10 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
852
895
  blockIndex: allBlocks.length - 1,
853
896
  timestamp: Date.now()
854
897
  });
898
+
899
+ if (block.type === 'text' && block.text) {
900
+ eagerTTS(block.text, conversationId, sessionId);
901
+ }
855
902
  }
856
903
  } else if (parsed.type === 'user' && parsed.message?.content) {
857
904
  for (const block of parsed.message.content) {
@@ -900,6 +947,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
900
947
  timestamp: Date.now()
901
948
  });
902
949
 
950
+ if (parsed.result) {
951
+ const resultText = typeof parsed.result === 'string' ? parsed.result : JSON.stringify(parsed.result);
952
+ if (resultText) eagerTTS(resultText, conversationId, sessionId);
953
+ }
954
+
903
955
  if (parsed.result && allBlocks.length === 0) {
904
956
  allBlocks.push({ type: 'text', text: String(parsed.result) });
905
957
  }
@@ -1128,6 +1180,8 @@ wss.on('connection', (ws, req) => {
1128
1180
  subscriptions: Array.from(ws.subscriptions),
1129
1181
  timestamp: Date.now()
1130
1182
  }));
1183
+ } else if (data.type === 'set_voice') {
1184
+ ws.ttsVoiceId = data.voiceId || 'default';
1131
1185
  } else if (data.type === 'ping') {
1132
1186
  ws.send(JSON.stringify({
1133
1187
  type: 'pong',
@@ -15,6 +15,8 @@
15
15
  var spokenChunks = new Set();
16
16
  var isLoadingHistory = false;
17
17
  var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
18
+ var ttsAudioCache = new Map();
19
+ var TTS_CLIENT_CACHE_MAX = 50;
18
20
 
19
21
  function init() {
20
22
  setupTTSToggle();
@@ -69,6 +71,7 @@
69
71
  selector.addEventListener('change', function() {
70
72
  selectedVoiceId = selector.value;
71
73
  localStorage.setItem('voice-selected-id', selectedVoiceId);
74
+ sendVoiceToServer();
72
75
  });
73
76
  }
74
77
 
@@ -295,12 +298,29 @@
295
298
  processQueue();
296
299
  }
297
300
 
301
+ function cacheTTSAudio(cacheKey, b64) {
302
+ if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
303
+ var oldest = ttsAudioCache.keys().next().value;
304
+ ttsAudioCache.delete(oldest);
305
+ }
306
+ var binary = atob(b64);
307
+ var bytes = new Uint8Array(binary.length);
308
+ for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
309
+ ttsAudioCache.set(cacheKey, new Blob([bytes], { type: 'audio/wav' }));
310
+ }
311
+
312
+ function getCachedTTSBlob(text) {
313
+ var key = selectedVoiceId + ':' + text;
314
+ return ttsAudioCache.get(key) || null;
315
+ }
316
+
298
317
  var audioChunkQueue = [];
299
318
  var isPlayingChunk = false;
300
319
  var streamDone = false;
301
320
  var ttsConsecutiveFailures = 0;
302
321
  var TTS_MAX_FAILURES = 3;
303
322
  var ttsDisabledUntilReset = false;
323
+ var streamingSupported = true;
304
324
 
305
325
  function playNextChunk() {
306
326
  if (audioChunkQueue.length === 0) {
@@ -344,6 +364,15 @@
344
364
  audioChunkQueue = [];
345
365
  isPlayingChunk = false;
346
366
 
367
+ var cachedBlob = getCachedTTSBlob(text);
368
+ if (cachedBlob) {
369
+ ttsConsecutiveFailures = 0;
370
+ audioChunkQueue.push(cachedBlob);
371
+ streamDone = true;
372
+ if (!isPlayingChunk) playNextChunk();
373
+ return;
374
+ }
375
+
347
376
  function onTtsSuccess() {
348
377
  ttsConsecutiveFailures = 0;
349
378
  }
@@ -363,12 +392,16 @@
363
392
  }
364
393
 
365
394
  function tryStreaming() {
395
+ if (!streamingSupported) { tryNonStreaming(text); return; }
366
396
  fetch(BASE + '/api/tts-stream', {
367
397
  method: 'POST',
368
398
  headers: { 'Content-Type': 'application/json' },
369
399
  body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
370
400
  }).then(function(resp) {
371
- if (!resp.ok) throw new Error('TTS stream failed: ' + resp.status);
401
+ if (!resp.ok) {
402
+ streamingSupported = false;
403
+ throw new Error('TTS stream failed: ' + resp.status);
404
+ }
372
405
  var reader = resp.body.getReader();
373
406
  var buffer = new Uint8Array(0);
374
407
 
@@ -532,11 +565,23 @@
532
565
  }
533
566
  }
534
567
 
568
+ function sendVoiceToServer() {
569
+ if (typeof agentGUIClient !== 'undefined' && agentGUIClient && agentGUIClient.wsManager && agentGUIClient.wsManager.isConnected) {
570
+ agentGUIClient.wsManager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
571
+ }
572
+ }
573
+
535
574
  function setupStreamingListener() {
536
575
  window.addEventListener('ws-message', function(e) {
537
- if (!voiceActive) return;
538
576
  var data = e.detail;
539
577
  if (!data) return;
578
+ if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
579
+ cacheTTSAudio(data.cacheKey, data.audio);
580
+ }
581
+ if (data.type === 'sync_connected') {
582
+ sendVoiceToServer();
583
+ }
584
+ if (!voiceActive) return;
540
585
  if (data.type === 'streaming_progress' && data.block) {
541
586
  handleVoiceBlock(data.block, true);
542
587
  }