agentgui 1.0.174 → 1.0.175
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +14 -2
- package/package.json +1 -1
- package/server.js +20 -6
- package/static/js/voice.js +35 -8
package/lib/speech.js
CHANGED
|
@@ -115,6 +115,8 @@ let speakerEmbeddingPipeline = null;
|
|
|
115
115
|
let sttLoading = false;
|
|
116
116
|
let ttsLoading = false;
|
|
117
117
|
let speakerEmbeddingLoading = false;
|
|
118
|
+
let ttsLoadError = null;
|
|
119
|
+
let sttLoadError = null;
|
|
118
120
|
const voiceEmbeddingsCache = new Map();
|
|
119
121
|
const SAMPLE_RATE_STT = 16000;
|
|
120
122
|
const SAMPLE_RATE_TTS = 16000;
|
|
@@ -261,8 +263,10 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
|
261
263
|
|
|
262
264
|
async function getSTT() {
|
|
263
265
|
if (sttPipeline) return sttPipeline;
|
|
266
|
+
if (sttLoadError) throw sttLoadError;
|
|
264
267
|
if (sttLoading) {
|
|
265
268
|
while (sttLoading) await new Promise(r => setTimeout(r, 100));
|
|
269
|
+
if (sttLoadError) throw sttLoadError;
|
|
266
270
|
if (!sttPipeline) throw new Error('STT pipeline failed to load');
|
|
267
271
|
return sttPipeline;
|
|
268
272
|
}
|
|
@@ -278,10 +282,12 @@ async function getSTT() {
|
|
|
278
282
|
device: 'cpu',
|
|
279
283
|
local_files_only: isLocal,
|
|
280
284
|
});
|
|
285
|
+
sttLoadError = null;
|
|
281
286
|
return sttPipeline;
|
|
282
287
|
} catch (err) {
|
|
283
288
|
sttPipeline = null;
|
|
284
|
-
|
|
289
|
+
sttLoadError = new Error('STT model load failed: ' + err.message);
|
|
290
|
+
throw sttLoadError;
|
|
285
291
|
} finally {
|
|
286
292
|
sttLoading = false;
|
|
287
293
|
}
|
|
@@ -289,8 +295,10 @@ async function getSTT() {
|
|
|
289
295
|
|
|
290
296
|
async function getTTS() {
|
|
291
297
|
if (ttsPipeline) return ttsPipeline;
|
|
298
|
+
if (ttsLoadError) throw ttsLoadError;
|
|
292
299
|
if (ttsLoading) {
|
|
293
300
|
while (ttsLoading) await new Promise(r => setTimeout(r, 100));
|
|
301
|
+
if (ttsLoadError) throw ttsLoadError;
|
|
294
302
|
if (!ttsPipeline) throw new Error('TTS pipeline failed to load');
|
|
295
303
|
return ttsPipeline;
|
|
296
304
|
}
|
|
@@ -303,10 +311,12 @@ async function getTTS() {
|
|
|
303
311
|
dtype: 'fp32',
|
|
304
312
|
});
|
|
305
313
|
await ensureSpeakerEmbeddings();
|
|
314
|
+
ttsLoadError = null;
|
|
306
315
|
return ttsPipeline;
|
|
307
316
|
} catch (err) {
|
|
308
317
|
ttsPipeline = null;
|
|
309
|
-
|
|
318
|
+
ttsLoadError = new Error('TTS model load failed: ' + err.message);
|
|
319
|
+
throw ttsLoadError;
|
|
310
320
|
} finally {
|
|
311
321
|
ttsLoading = false;
|
|
312
322
|
}
|
|
@@ -482,6 +492,8 @@ function getStatus() {
|
|
|
482
492
|
ttsReady: !!ttsPipeline,
|
|
483
493
|
sttLoading,
|
|
484
494
|
ttsLoading,
|
|
495
|
+
sttError: sttLoadError ? sttLoadError.message : null,
|
|
496
|
+
ttsError: ttsLoadError ? ttsLoadError.message : null,
|
|
485
497
|
};
|
|
486
498
|
}
|
|
487
499
|
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -554,13 +554,20 @@ const server = http.createServer(async (req, res) => {
|
|
|
554
554
|
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
555
555
|
return;
|
|
556
556
|
}
|
|
557
|
-
const
|
|
558
|
-
const
|
|
557
|
+
const speech = await getSpeech();
|
|
558
|
+
const status = speech.getStatus();
|
|
559
|
+
if (status.ttsError) {
|
|
560
|
+
sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
|
|
561
|
+
return;
|
|
562
|
+
}
|
|
563
|
+
const wavBuffer = await speech.synthesize(text, voiceId);
|
|
559
564
|
res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
|
|
560
565
|
res.end(wavBuffer);
|
|
561
566
|
} catch (err) {
|
|
562
567
|
debugLog('[TTS] Error: ' + err.message);
|
|
563
|
-
|
|
568
|
+
const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
|
|
569
|
+
const statusCode = isModelError ? 503 : 500;
|
|
570
|
+
if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS failed', retryable: !isModelError });
|
|
564
571
|
}
|
|
565
572
|
return;
|
|
566
573
|
}
|
|
@@ -574,14 +581,19 @@ const server = http.createServer(async (req, res) => {
|
|
|
574
581
|
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
575
582
|
return;
|
|
576
583
|
}
|
|
577
|
-
const
|
|
584
|
+
const speech = await getSpeech();
|
|
585
|
+
const status = speech.getStatus();
|
|
586
|
+
if (status.ttsError) {
|
|
587
|
+
sendJSON(req, res, 503, { error: status.ttsError, retryable: false });
|
|
588
|
+
return;
|
|
589
|
+
}
|
|
578
590
|
res.writeHead(200, {
|
|
579
591
|
'Content-Type': 'application/octet-stream',
|
|
580
592
|
'Transfer-Encoding': 'chunked',
|
|
581
593
|
'X-Content-Type': 'audio/wav-stream',
|
|
582
594
|
'Cache-Control': 'no-cache'
|
|
583
595
|
});
|
|
584
|
-
for await (const wavChunk of synthesizeStream(text, voiceId)) {
|
|
596
|
+
for await (const wavChunk of speech.synthesizeStream(text, voiceId)) {
|
|
585
597
|
const lenBuf = Buffer.alloc(4);
|
|
586
598
|
lenBuf.writeUInt32BE(wavChunk.length, 0);
|
|
587
599
|
res.write(lenBuf);
|
|
@@ -590,7 +602,9 @@ const server = http.createServer(async (req, res) => {
|
|
|
590
602
|
res.end();
|
|
591
603
|
} catch (err) {
|
|
592
604
|
debugLog('[TTS-STREAM] Error: ' + err.message);
|
|
593
|
-
|
|
605
|
+
const isModelError = /model.*load|pipeline.*failed|failed to load/i.test(err.message);
|
|
606
|
+
const statusCode = isModelError ? 503 : 500;
|
|
607
|
+
if (!res.headersSent) sendJSON(req, res, statusCode, { error: err.message || 'TTS stream failed', retryable: !isModelError });
|
|
594
608
|
else res.end();
|
|
595
609
|
}
|
|
596
610
|
return;
|
package/static/js/voice.js
CHANGED
|
@@ -298,6 +298,9 @@
|
|
|
298
298
|
var audioChunkQueue = [];
|
|
299
299
|
var isPlayingChunk = false;
|
|
300
300
|
var streamDone = false;
|
|
301
|
+
var ttsConsecutiveFailures = 0;
|
|
302
|
+
var TTS_MAX_FAILURES = 3;
|
|
303
|
+
var ttsDisabledUntilReset = false;
|
|
301
304
|
|
|
302
305
|
function playNextChunk() {
|
|
303
306
|
if (audioChunkQueue.length === 0) {
|
|
@@ -331,19 +334,41 @@
|
|
|
331
334
|
|
|
332
335
|
function processQueue() {
|
|
333
336
|
if (isSpeaking || speechQueue.length === 0) return;
|
|
337
|
+
if (ttsDisabledUntilReset) {
|
|
338
|
+
speechQueue = [];
|
|
339
|
+
return;
|
|
340
|
+
}
|
|
334
341
|
isSpeaking = true;
|
|
335
342
|
streamDone = false;
|
|
336
343
|
var text = speechQueue.shift();
|
|
337
344
|
audioChunkQueue = [];
|
|
338
345
|
isPlayingChunk = false;
|
|
339
|
-
|
|
346
|
+
|
|
347
|
+
function onTtsSuccess() {
|
|
348
|
+
ttsConsecutiveFailures = 0;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
function onTtsFailed() {
|
|
352
|
+
ttsConsecutiveFailures++;
|
|
353
|
+
if (ttsConsecutiveFailures >= TTS_MAX_FAILURES) {
|
|
354
|
+
console.warn('[Voice] TTS failed ' + ttsConsecutiveFailures + ' times consecutively, disabling until reset');
|
|
355
|
+
ttsDisabledUntilReset = true;
|
|
356
|
+
speechQueue = [];
|
|
357
|
+
}
|
|
358
|
+
streamDone = true;
|
|
359
|
+
isSpeaking = false;
|
|
360
|
+
if (!ttsDisabledUntilReset) {
|
|
361
|
+
processQueue();
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
|
|
340
365
|
function tryStreaming() {
|
|
341
366
|
fetch(BASE + '/api/tts-stream', {
|
|
342
367
|
method: 'POST',
|
|
343
368
|
headers: { 'Content-Type': 'application/json' },
|
|
344
369
|
body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
|
|
345
370
|
}).then(function(resp) {
|
|
346
|
-
if (!resp.ok) throw new Error('TTS stream failed');
|
|
371
|
+
if (!resp.ok) throw new Error('TTS stream failed: ' + resp.status);
|
|
347
372
|
var reader = resp.body.getReader();
|
|
348
373
|
var buffer = new Uint8Array(0);
|
|
349
374
|
|
|
@@ -357,6 +382,7 @@
|
|
|
357
382
|
function pump() {
|
|
358
383
|
return reader.read().then(function(result) {
|
|
359
384
|
if (result.done) {
|
|
385
|
+
onTtsSuccess();
|
|
360
386
|
streamDone = true;
|
|
361
387
|
if (!isPlayingChunk && audioChunkQueue.length === 0) {
|
|
362
388
|
isSpeaking = false;
|
|
@@ -384,16 +410,17 @@
|
|
|
384
410
|
tryNonStreaming(text);
|
|
385
411
|
});
|
|
386
412
|
}
|
|
387
|
-
|
|
413
|
+
|
|
388
414
|
function tryNonStreaming(txt) {
|
|
389
415
|
fetch(BASE + '/api/tts', {
|
|
390
416
|
method: 'POST',
|
|
391
417
|
headers: { 'Content-Type': 'application/json' },
|
|
392
418
|
body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
|
|
393
419
|
}).then(function(resp) {
|
|
394
|
-
if (!resp.ok) throw new Error('TTS failed');
|
|
420
|
+
if (!resp.ok) throw new Error('TTS failed: ' + resp.status);
|
|
395
421
|
return resp.arrayBuffer();
|
|
396
422
|
}).then(function(buf) {
|
|
423
|
+
onTtsSuccess();
|
|
397
424
|
var blob = new Blob([buf], { type: 'audio/wav' });
|
|
398
425
|
audioChunkQueue.push(blob);
|
|
399
426
|
if (!isPlayingChunk) playNextChunk();
|
|
@@ -401,12 +428,10 @@
|
|
|
401
428
|
isSpeaking = false;
|
|
402
429
|
processQueue();
|
|
403
430
|
}).catch(function() {
|
|
404
|
-
|
|
405
|
-
isSpeaking = false;
|
|
406
|
-
processQueue();
|
|
431
|
+
onTtsFailed();
|
|
407
432
|
});
|
|
408
433
|
}
|
|
409
|
-
|
|
434
|
+
|
|
410
435
|
tryStreaming();
|
|
411
436
|
}
|
|
412
437
|
|
|
@@ -415,6 +440,8 @@
|
|
|
415
440
|
audioChunkQueue = [];
|
|
416
441
|
isPlayingChunk = false;
|
|
417
442
|
isSpeaking = false;
|
|
443
|
+
ttsConsecutiveFailures = 0;
|
|
444
|
+
ttsDisabledUntilReset = false;
|
|
418
445
|
if (currentAudio) {
|
|
419
446
|
currentAudio.pause();
|
|
420
447
|
currentAudio = null;
|