agentgui 1.0.166 → 1.0.168
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +46 -2
- package/package.json +1 -1
- package/server.js +32 -1
- package/static/js/client.js +1 -2
- package/static/js/voice.js +77 -23
package/lib/speech.js
CHANGED
|
@@ -21,6 +21,9 @@ let speakerEmbeddings = null;
|
|
|
21
21
|
let sttLoading = false;
|
|
22
22
|
let ttsLoading = false;
|
|
23
23
|
|
|
24
|
+
const TTS_CACHE_MAX = 100;
|
|
25
|
+
const ttsCache = new Map();
|
|
26
|
+
|
|
24
27
|
async function loadTransformers() {
|
|
25
28
|
if (transformersModule) return transformersModule;
|
|
26
29
|
transformersModule = await import('@huggingface/transformers');
|
|
@@ -216,11 +219,52 @@ async function transcribe(audioBuffer) {
|
|
|
216
219
|
return result.text;
|
|
217
220
|
}
|
|
218
221
|
|
|
222
|
+
// Break a text into sentence-sized pieces for incremental TTS synthesis.
// A trailing fragment without terminal punctuation is kept as its own piece;
// text with no match at all (e.g. empty string) is returned unchanged in a
// single-element array.
function splitSentences(text) {
  const pieces = text.match(/[^.!?]+[.!?]+\s?|[^.!?]+$/g);
  if (pieces === null) return [text];
  const sentences = [];
  for (const piece of pieces) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) sentences.push(trimmed);
  }
  return sentences;
}
|
|
227
|
+
|
|
228
|
+
// Insert a synthesized WAV buffer into the bounded TTS cache.
//
// The Map's insertion order doubles as LRU order: the oldest key is the
// first one yielded by keys(). Re-inserting an existing key deletes it
// first so it moves to the newest position — consistent with the
// delete/re-set refresh done on cache hits in synthesize()/synthesizeStream().
// Previously a re-put of an existing key while the cache was full evicted
// an unrelated oldest entry (losing data) without refreshing recency.
function cachePut(key, buf) {
  if (ttsCache.has(key)) {
    // Updating an existing entry never grows the map; just refresh recency.
    ttsCache.delete(key);
  } else if (ttsCache.size >= TTS_CACHE_MAX) {
    // Full and inserting a new key: evict the least recently used entry.
    const oldest = ttsCache.keys().next().value;
    ttsCache.delete(oldest);
  }
  ttsCache.set(key, buf);
}
|
|
235
|
+
|
|
219
236
|
// Synthesize a full utterance to a single WAV buffer.
// Serves repeats from the LRU cache, refreshing recency on a hit by
// deleting and re-inserting the key (Map iteration order is insertion order).
async function synthesize(text) {
  const hit = ttsCache.get(text);
  if (hit) {
    ttsCache.delete(text);
    ttsCache.set(text, hit);
    return hit;
  }
  const model = await getTTS();
  const speaker = await ensureSpeakerEmbeddings();
  const output = await model(text, { speaker_embeddings: speaker });
  // Fall back to the module default rate when the model omits sampling_rate.
  const wav = encodeWav(output.audio, output.sampling_rate || SAMPLE_RATE_TTS);
  cachePut(text, wav);
  return wav;
}
|
|
250
|
+
|
|
251
|
+
// Yield one WAV buffer per sentence so playback can begin before the whole
// utterance is synthesized. Per-sentence results share the LRU cache with
// synthesize(); hits are re-inserted to mark them most recently used.
async function* synthesizeStream(text) {
  const parts = splitSentences(text);
  const model = await getTTS();
  const speaker = await ensureSpeakerEmbeddings();
  for (const part of parts) {
    const hit = ttsCache.get(part);
    if (hit) {
      // Refresh recency, then emit the cached buffer without re-synthesis.
      ttsCache.delete(part);
      ttsCache.set(part, hit);
      yield hit;
      continue;
    }
    const output = await model(part, { speaker_embeddings: speaker });
    const wav = encodeWav(output.audio, output.sampling_rate || SAMPLE_RATE_TTS);
    cachePut(part, wav);
    yield wav;
  }
}
|
|
225
269
|
|
|
226
270
|
function getStatus() {
|
|
@@ -232,4 +276,4 @@ function getStatus() {
|
|
|
232
276
|
};
|
|
233
277
|
}
|
|
234
278
|
|
|
235
|
-
export { transcribe, synthesize, getSTT, getTTS, getStatus };
|
|
279
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -554,6 +554,36 @@ const server = http.createServer(async (req, res) => {
|
|
|
554
554
|
return;
|
|
555
555
|
}
|
|
556
556
|
|
|
557
|
+
// POST /api/tts-stream — stream synthesized speech as length-prefixed frames.
// Wire format: for each sentence, a 4-byte big-endian length followed by a
// complete standalone WAV file of that many bytes. The client splits frames
// on the prefix and plays each WAV independently (see voice.js pump()).
if (routePath === '/api/tts-stream' && req.method === 'POST') {
  try {
    const body = await parseBody(req);
    const text = body.text || '';
    if (!text) {
      sendJSON(req, res, 400, { error: 'No text provided' });
      return;
    }
    // Speech module is loaded lazily; getSpeech() resolves the module object.
    const { synthesizeStream } = await getSpeech();
    res.writeHead(200, {
      'Content-Type': 'application/octet-stream',
      'Transfer-Encoding': 'chunked',
      // Custom header hints the real payload framing to clients, since the
      // Content-Type must stay octet-stream for the raw binary frames.
      'X-Content-Type': 'audio/wav-stream',
      'Cache-Control': 'no-cache'
    });
    // Each yielded wavChunk is a complete WAV Buffer for one sentence.
    // NOTE(review): res.write return value is ignored — no backpressure
    // handling; presumably acceptable for short speech payloads. Confirm.
    for await (const wavChunk of synthesizeStream(text)) {
      const lenBuf = Buffer.alloc(4);
      lenBuf.writeUInt32BE(wavChunk.length, 0);
      res.write(lenBuf);
      res.write(wavChunk);
    }
    res.end();
  } catch (err) {
    debugLog('[TTS-STREAM] Error: ' + err.message);
    // Once headers are sent we can only truncate the stream; the client's
    // frame parser tolerates a short final read.
    if (!res.headersSent) sendJSON(req, res, 500, { error: err.message || 'TTS stream failed' });
    else res.end();
  }
  return;
}
|
|
586
|
+
|
|
557
587
|
if (routePath === '/api/speech-status' && req.method === 'GET') {
|
|
558
588
|
try {
|
|
559
589
|
const { getStatus } = await getSpeech();
|
|
@@ -1304,7 +1334,8 @@ function onServerReady() {
|
|
|
1304
1334
|
// Recover stale active sessions from previous run
|
|
1305
1335
|
recoverStaleSessions();
|
|
1306
1336
|
|
|
1307
|
-
|
|
1337
|
+
getSpeech().then(s => s.getTTS()).then(() => debugLog('[TTS] Model preloaded')).catch(e => debugLog('[TTS] Preload failed: ' + e.message));
|
|
1338
|
+
|
|
1308
1339
|
performAutoImport();
|
|
1309
1340
|
|
|
1310
1341
|
// Then run it every 30 seconds (constant automatic importing)
|
package/static/js/client.js
CHANGED
|
@@ -1288,12 +1288,11 @@ class AgentGUIClient {
|
|
|
1288
1288
|
}
|
|
1289
1289
|
|
|
1290
1290
|
const { conversation } = await response.json();
|
|
1291
|
-
this.state.currentConversation = conversation;
|
|
1292
1291
|
|
|
1293
1292
|
await this.loadConversations();
|
|
1294
1293
|
|
|
1295
1294
|
if (window.conversationManager) {
|
|
1296
|
-
window.conversationManager.loadConversations();
|
|
1295
|
+
await window.conversationManager.loadConversations();
|
|
1297
1296
|
window.conversationManager.select(conversation.id);
|
|
1298
1297
|
}
|
|
1299
1298
|
|
package/static/js/voice.js
CHANGED
|
@@ -245,39 +245,91 @@
|
|
|
245
245
|
processQueue();
|
|
246
246
|
}
|
|
247
247
|
|
|
248
|
+
// Per-utterance playback state for the streamed-TTS pipeline.
var audioChunkQueue = [];   // decoded WAV Blobs waiting to be played
var isPlayingChunk = false; // a clip is currently playing
var streamDone = false;     // server finished sending frames for this utterance

// Play queued WAV blobs back-to-back. When the queue drains AND the server
// stream has ended, release the speaking lock and let processQueue() start
// the next utterance.
function playNextChunk() {
  if (audioChunkQueue.length === 0) {
    isPlayingChunk = false;
    if (streamDone) {
      isSpeaking = false;
      processQueue();
    }
    return;
  }
  isPlayingChunk = true;
  var blob = audioChunkQueue.shift();
  var url = URL.createObjectURL(blob);
  var advanced = false;
  // Single advance point. onerror and play()'s rejection handler can BOTH
  // fire for the same failed clip, which previously revoked the object URL
  // twice and double-advanced the queue (skipping a chunk). The guard makes
  // each clip move playback forward exactly once.
  function advance() {
    if (advanced) return;
    advanced = true;
    URL.revokeObjectURL(url);
    currentAudio = null;
    playNextChunk();
  }
  currentAudio = new Audio(url);
  currentAudio.onended = advance;
  currentAudio.onerror = advance;
  currentAudio.play().catch(advance);
}
|
|
281
|
+
|
|
248
282
|
// Pop the next queued utterance, request streamed TTS for it, and parse the
// server's length-prefixed frames into playable WAV Blobs as they arrive.
// Frame format matches server.js /api/tts-stream: 4-byte big-endian length,
// then that many bytes of a complete WAV file.
function processQueue() {
  if (isSpeaking || speechQueue.length === 0) return;
  isSpeaking = true;
  streamDone = false;
  var text = speechQueue.shift();
  // Reset per-utterance playback state before the new stream starts.
  audioChunkQueue = [];
  isPlayingChunk = false;
  fetch(BASE + '/api/tts-stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: text })
  }).then(function(resp) {
    if (!resp.ok) throw new Error('TTS failed');
    var reader = resp.body.getReader();
    // Accumulates bytes across reads, since a frame may span network chunks.
    var buffer = new Uint8Array(0);

    function concat(a, b) {
      var c = new Uint8Array(a.length + b.length);
      c.set(a, 0);
      c.set(b, a.length);
      return c;
    }

    // Recursively read the response body, extracting every complete frame.
    function pump() {
      return reader.read().then(function(result) {
        if (result.done) {
          streamDone = true;
          // If playback already drained the queue, release the lock here;
          // otherwise playNextChunk() will do it when the last clip ends.
          if (!isPlayingChunk && audioChunkQueue.length === 0) {
            isSpeaking = false;
            processQueue();
          }
          return;
        }
        buffer = concat(buffer, result.value);
        // Drain every complete [len][wav] frame currently in the buffer.
        while (buffer.length >= 4) {
          var view = new DataView(buffer.buffer, buffer.byteOffset, 4);
          var chunkLen = view.getUint32(0, false); // big-endian length prefix
          if (buffer.length < 4 + chunkLen) break; // frame not complete yet
          var wavData = buffer.slice(4, 4 + chunkLen);
          buffer = buffer.slice(4 + chunkLen);
          var blob = new Blob([wavData], { type: 'audio/wav' });
          audioChunkQueue.push(blob);
          // Kick off playback as soon as the first frame lands.
          if (!isPlayingChunk) playNextChunk();
        }
        return pump();
      });
    }

    return pump();
  }).catch(function() {
    // Network/HTTP failure: abandon this utterance and try the next one.
    streamDone = true;
    isSpeaking = false;
    processQueue();
  });
|
|
@@ -285,6 +337,8 @@
|
|
|
285
337
|
|
|
286
338
|
function stopSpeaking() {
|
|
287
339
|
speechQueue = [];
|
|
340
|
+
audioChunkQueue = [];
|
|
341
|
+
isPlayingChunk = false;
|
|
288
342
|
isSpeaking = false;
|
|
289
343
|
if (currentAudio) {
|
|
290
344
|
currentAudio.pause();
|