agentgui 1.0.166 → 1.0.167

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -21,6 +21,9 @@ let speakerEmbeddings = null;
21
21
  let sttLoading = false;
22
22
  let ttsLoading = false;
23
23
 
24
+ const TTS_CACHE_MAX = 100;
25
+ const ttsCache = new Map();
26
+
24
27
  async function loadTransformers() {
25
28
  if (transformersModule) return transformersModule;
26
29
  transformersModule = await import('@huggingface/transformers');
@@ -216,11 +219,52 @@ async function transcribe(audioBuffer) {
216
219
  return result.text;
217
220
  }
218
221
 
222
+ function splitSentences(text) {
223
+ const raw = text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g);
224
+ if (!raw) return [text];
225
+ return raw.map(s => s.trim()).filter(s => s.length > 0);
226
+ }
227
+
228
+ function cachePut(key, buf) {
229
+ if (ttsCache.size >= TTS_CACHE_MAX) {
230
+ const oldest = ttsCache.keys().next().value;
231
+ ttsCache.delete(oldest);
232
+ }
233
+ ttsCache.set(key, buf);
234
+ }
235
+
219
236
  async function synthesize(text) {
237
+ const cached = ttsCache.get(text);
238
+ if (cached) {
239
+ ttsCache.delete(text);
240
+ ttsCache.set(text, cached);
241
+ return cached;
242
+ }
220
243
  const tts = await getTTS();
221
244
  const embeddings = await ensureSpeakerEmbeddings();
222
245
  const result = await tts(text, { speaker_embeddings: embeddings });
223
- return encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
246
+ const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
247
+ cachePut(text, wav);
248
+ return wav;
249
+ }
250
+
251
+ async function* synthesizeStream(text) {
252
+ const sentences = splitSentences(text);
253
+ const tts = await getTTS();
254
+ const embeddings = await ensureSpeakerEmbeddings();
255
+ for (const sentence of sentences) {
256
+ const cached = ttsCache.get(sentence);
257
+ if (cached) {
258
+ ttsCache.delete(sentence);
259
+ ttsCache.set(sentence, cached);
260
+ yield cached;
261
+ continue;
262
+ }
263
+ const result = await tts(sentence, { speaker_embeddings: embeddings });
264
+ const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
265
+ cachePut(sentence, wav);
266
+ yield wav;
267
+ }
224
268
  }
225
269
 
226
270
  function getStatus() {
@@ -232,4 +276,4 @@ function getStatus() {
232
276
  };
233
277
  }
234
278
 
235
- export { transcribe, synthesize, getSTT, getTTS, getStatus };
279
+ export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.166",
3
+ "version": "1.0.167",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -554,6 +554,36 @@ const server = http.createServer(async (req, res) => {
554
554
  return;
555
555
  }
556
556
 
557
+ if (routePath === '/api/tts-stream' && req.method === 'POST') {
558
+ try {
559
+ const body = await parseBody(req);
560
+ const text = body.text || '';
561
+ if (!text) {
562
+ sendJSON(req, res, 400, { error: 'No text provided' });
563
+ return;
564
+ }
565
+ const { synthesizeStream } = await getSpeech();
566
+ res.writeHead(200, {
567
+ 'Content-Type': 'application/octet-stream',
568
+ 'Transfer-Encoding': 'chunked',
569
+ 'X-Content-Type': 'audio/wav-stream',
570
+ 'Cache-Control': 'no-cache'
571
+ });
572
+ for await (const wavChunk of synthesizeStream(text)) {
573
+ const lenBuf = Buffer.alloc(4);
574
+ lenBuf.writeUInt32BE(wavChunk.length, 0);
575
+ res.write(lenBuf);
576
+ res.write(wavChunk);
577
+ }
578
+ res.end();
579
+ } catch (err) {
580
+ debugLog('[TTS-STREAM] Error: ' + err.message);
581
+ if (!res.headersSent) sendJSON(req, res, 500, { error: err.message || 'TTS stream failed' });
582
+ else res.end();
583
+ }
584
+ return;
585
+ }
586
+
557
587
  if (routePath === '/api/speech-status' && req.method === 'GET') {
558
588
  try {
559
589
  const { getStatus } = await getSpeech();
@@ -1304,7 +1334,8 @@ function onServerReady() {
1304
1334
  // Recover stale active sessions from previous run
1305
1335
  recoverStaleSessions();
1306
1336
 
1307
- // Run auto-import immediately
1337
+ getSpeech().then(s => s.getTTS()).then(() => debugLog('[TTS] Model preloaded')).catch(e => debugLog('[TTS] Preload failed: ' + e.message));
1338
+
1308
1339
  performAutoImport();
1309
1340
 
1310
1341
  // Then run it every 30 seconds (constant automatic importing)
@@ -245,39 +245,91 @@
245
245
  processQueue();
246
246
  }
247
247
 
248
+ var audioChunkQueue = [];
249
+ var isPlayingChunk = false;
250
+ var streamDone = false;
251
+
252
+ function playNextChunk() {
253
+ if (audioChunkQueue.length === 0) {
254
+ isPlayingChunk = false;
255
+ if (streamDone) {
256
+ isSpeaking = false;
257
+ processQueue();
258
+ }
259
+ return;
260
+ }
261
+ isPlayingChunk = true;
262
+ var blob = audioChunkQueue.shift();
263
+ var url = URL.createObjectURL(blob);
264
+ currentAudio = new Audio(url);
265
+ currentAudio.onended = function() {
266
+ URL.revokeObjectURL(url);
267
+ currentAudio = null;
268
+ playNextChunk();
269
+ };
270
+ currentAudio.onerror = function() {
271
+ URL.revokeObjectURL(url);
272
+ currentAudio = null;
273
+ playNextChunk();
274
+ };
275
+ currentAudio.play().catch(function() {
276
+ URL.revokeObjectURL(url);
277
+ currentAudio = null;
278
+ playNextChunk();
279
+ });
280
+ }
281
+
248
282
  function processQueue() {
249
283
  if (isSpeaking || speechQueue.length === 0) return;
250
284
  isSpeaking = true;
285
+ streamDone = false;
251
286
  var text = speechQueue.shift();
252
- fetch(BASE + '/api/tts', {
287
+ audioChunkQueue = [];
288
+ isPlayingChunk = false;
289
+ fetch(BASE + '/api/tts-stream', {
253
290
  method: 'POST',
254
291
  headers: { 'Content-Type': 'application/json' },
255
292
  body: JSON.stringify({ text: text })
256
293
  }).then(function(resp) {
257
294
  if (!resp.ok) throw new Error('TTS failed');
258
- return resp.blob();
259
- }).then(function(blob) {
260
- var url = URL.createObjectURL(blob);
261
- currentAudio = new Audio(url);
262
- currentAudio.onended = function() {
263
- URL.revokeObjectURL(url);
264
- currentAudio = null;
265
- isSpeaking = false;
266
- processQueue();
267
- };
268
- currentAudio.onerror = function() {
269
- URL.revokeObjectURL(url);
270
- currentAudio = null;
271
- isSpeaking = false;
272
- processQueue();
273
- };
274
- currentAudio.play().catch(function() {
275
- URL.revokeObjectURL(url);
276
- currentAudio = null;
277
- isSpeaking = false;
278
- processQueue();
279
- });
295
+ var reader = resp.body.getReader();
296
+ var buffer = new Uint8Array(0);
297
+
298
+ function concat(a, b) {
299
+ var c = new Uint8Array(a.length + b.length);
300
+ c.set(a, 0);
301
+ c.set(b, a.length);
302
+ return c;
303
+ }
304
+
305
+ function pump() {
306
+ return reader.read().then(function(result) {
307
+ if (result.done) {
308
+ streamDone = true;
309
+ if (!isPlayingChunk && audioChunkQueue.length === 0) {
310
+ isSpeaking = false;
311
+ processQueue();
312
+ }
313
+ return;
314
+ }
315
+ buffer = concat(buffer, result.value);
316
+ while (buffer.length >= 4) {
317
+ var view = new DataView(buffer.buffer, buffer.byteOffset, 4);
318
+ var chunkLen = view.getUint32(0, false);
319
+ if (buffer.length < 4 + chunkLen) break;
320
+ var wavData = buffer.slice(4, 4 + chunkLen);
321
+ buffer = buffer.slice(4 + chunkLen);
322
+ var blob = new Blob([wavData], { type: 'audio/wav' });
323
+ audioChunkQueue.push(blob);
324
+ if (!isPlayingChunk) playNextChunk();
325
+ }
326
+ return pump();
327
+ });
328
+ }
329
+
330
+ return pump();
280
331
  }).catch(function() {
332
+ streamDone = true;
281
333
  isSpeaking = false;
282
334
  processQueue();
283
335
  });
@@ -285,6 +337,8 @@
285
337
 
286
338
  function stopSpeaking() {
287
339
  speechQueue = [];
340
+ audioChunkQueue = [];
341
+ isPlayingChunk = false;
288
342
  isSpeaking = false;
289
343
  if (currentAudio) {
290
344
  currentAudio.pause();