agentgui 1.0.168 → 1.0.169
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +68 -13
- package/package.json +1 -1
- package/server.js +16 -4
- package/static/index.html +27 -0
- package/static/js/voice.js +37 -1
package/lib/speech.js
CHANGED
|
@@ -13,6 +13,21 @@ const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
|
|
|
13
13
|
const SAMPLE_RATE_TTS = 16000;
|
|
14
14
|
const SAMPLE_RATE_STT = 16000;
|
|
15
15
|
const MIN_WAV_SIZE = 44;
|
|
16
|
+
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Matthijs%2Fcmu-arctic-xvectors&config=default&split=validation';
|
|
17
|
+
const SAMPLES_TO_AVERAGE = 10;
|
|
18
|
+
|
|
19
|
+
const VOICE_CATALOG = [
|
|
20
|
+
{ id: 'default', name: 'Default', gender: 'male', accent: 'US' },
|
|
21
|
+
{ id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
|
|
22
|
+
{ id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
|
|
23
|
+
{ id: 'clb', name: 'CLB', gender: 'female', accent: 'US' },
|
|
24
|
+
{ id: 'rms', name: 'RMS', gender: 'male', accent: 'US' },
|
|
25
|
+
{ id: 'awb', name: 'AWB', gender: 'male', accent: 'Scottish' },
|
|
26
|
+
{ id: 'jmk', name: 'JMK', gender: 'male', accent: 'Canadian' },
|
|
27
|
+
{ id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
|
|
28
|
+
];
|
|
29
|
+
|
|
30
|
+
const SPEAKER_OFFSETS = { awb: 0, bdl: 1200, clb: 2300, jmk: 3500, ksp: 4700, rms: 5900, slt: 7100 };
|
|
16
31
|
|
|
17
32
|
let transformersModule = null;
|
|
18
33
|
let sttPipeline = null;
|
|
@@ -20,6 +35,7 @@ let ttsPipeline = null;
|
|
|
20
35
|
let speakerEmbeddings = null;
|
|
21
36
|
let sttLoading = false;
|
|
22
37
|
let ttsLoading = false;
|
|
38
|
+
const voiceEmbeddingsCache = new Map();
|
|
23
39
|
|
|
24
40
|
const TTS_CACHE_MAX = 100;
|
|
25
41
|
const ttsCache = new Map();
|
|
@@ -52,6 +68,43 @@ async function ensureSpeakerEmbeddings() {
|
|
|
52
68
|
return speakerEmbeddings;
|
|
53
69
|
}
|
|
54
70
|
|
|
71
|
+
/**
 * Resolve the speaker embedding to use for a given voice.
 *
 * Lookup order:
 *   1. the in-memory `voiceEmbeddingsCache`,
 *   2. a `speaker_<voiceId>.bin` file inside `DATA_DIR`,
 *   3. the remote dataset at `DATASET_API`, averaging the returned rows whose
 *      filename names the requested speaker, then persisting the average to
 *      the `.bin` file and the in-memory cache.
 *
 * Falls back to `ensureSpeakerEmbeddings()` (the default voice) when `voiceId`
 * is empty, `'default'`, not a plain identifier, has neither a cached file nor
 * an entry in `SPEAKER_OFFSETS`, or when the dataset yields no usable rows.
 *
 * @param {string} [voiceId] - Voice identifier; callers pass it straight from
 *   request bodies, so it is treated as untrusted input.
 * @returns {Promise<Float32Array|*>} The embedding for the voice, or whatever
 *   `ensureSpeakerEmbeddings()` resolves to for the default voice.
 * @throws {Error} If the dataset request answers with a non-OK status.
 */
async function loadVoiceEmbedding(voiceId) {
  const EMBEDDING_DIM = 512;

  if (!voiceId || voiceId === 'default') return ensureSpeakerEmbeddings();

  // voiceId ends up inside a file name. Restricting it to identifier characters
  // stops values such as "x/../../secret" from escaping DATA_DIR via path.join.
  if (typeof voiceId !== 'string' || !/^[A-Za-z0-9_-]+$/.test(voiceId)) {
    return ensureSpeakerEmbeddings();
  }

  if (voiceEmbeddingsCache.has(voiceId)) return voiceEmbeddingsCache.get(voiceId);

  const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
  if (fs.existsSync(binPath)) {
    const buf = fs.readFileSync(binPath);
    // Copy the bytes into a standalone ArrayBuffer so the float view starts at offset 0.
    const emb = new Float32Array(new Uint8Array(buf).buffer);
    voiceEmbeddingsCache.set(voiceId, emb);
    return emb;
  }

  // Own-property check: a plain lookup would also "find" inherited keys such
  // as "constructor" and turn them into a bogus dataset offset.
  if (!Object.prototype.hasOwnProperty.call(SPEAKER_OFFSETS, voiceId)) {
    return ensureSpeakerEmbeddings();
  }
  const offset = SPEAKER_OFFSETS[voiceId];

  const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error('Failed to fetch voice embeddings for ' + voiceId);
  const data = await resp.json();

  // Treat a response without a usable `rows` array like "no matching rows".
  const rows = data && Array.isArray(data.rows) ? data.rows : [];
  const avg = new Float32Array(EMBEDDING_DIM);
  let count = 0;
  for (const item of rows) {
    const row = item && item.row;
    if (!row || typeof row.filename !== 'string') continue;
    if (!row.xvector || !(row.xvector.length >= EMBEDDING_DIM)) continue;
    // Only rows recorded by the requested speaker contribute to the average.
    const match = row.filename.match(/cmu_us_(\w+)_arctic/);
    if (!match || match[1] !== voiceId) continue;
    for (let i = 0; i < EMBEDDING_DIM; i++) avg[i] += row.xvector[i];
    count++;
  }
  if (count === 0) return ensureSpeakerEmbeddings();
  for (let i = 0; i < EMBEDDING_DIM; i++) avg[i] /= count;

  // Persist the average so later lookups can skip the network round trip.
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
  fs.writeFileSync(binPath, Buffer.from(avg.buffer));
  voiceEmbeddingsCache.set(voiceId, avg);
  return avg;
}
|
|
103
|
+
|
|
104
|
+
/**
 * List the voices that can be requested for speech synthesis.
 *
 * @returns {Array<{id: string, name: string, gender: string, accent: string}>}
 *   The module-level VOICE_CATALOG array itself (not a copy).
 */
function getVoices() {
  const catalog = VOICE_CATALOG;
  return catalog;
}
|
|
107
|
+
|
|
55
108
|
async function getSTT() {
|
|
56
109
|
if (sttPipeline) return sttPipeline;
|
|
57
110
|
if (sttLoading) {
|
|
@@ -233,36 +286,38 @@ function cachePut(key, buf) {
|
|
|
233
286
|
ttsCache.set(key, buf);
|
|
234
287
|
}
|
|
235
288
|
|
|
236
|
-
async function synthesize(text) {
|
|
237
|
-
const
|
|
289
|
+
/**
 * Synthesize `text` into WAV audio with the requested voice.
 *
 * Audio is cached per voice and text; a cache hit is returned without
 * touching the TTS pipeline.
 *
 * @param {string} text - Text to speak.
 * @param {string} [voiceId] - Voice identifier; a falsy value is keyed as 'default'.
 * @returns {Promise<*>} The value produced by `encodeWav` (served by callers as WAV bytes).
 */
async function synthesize(text, voiceId) {
  const voiceKey = voiceId || 'default';
  const key = voiceKey + ':' + text;

  const hit = ttsCache.get(key);
  if (hit) {
    // Delete + set moves the entry to the end of the Map's insertion order.
    ttsCache.delete(key);
    ttsCache.set(key, hit);
    return hit;
  }

  const pipeline = await getTTS();
  const speaker = await loadVoiceEmbedding(voiceId);
  const output = await pipeline(text, { speaker_embeddings: speaker });

  // Use the default TTS sample rate when the pipeline reports none.
  const rate = output.sampling_rate || SAMPLE_RATE_TTS;
  const wavBuffer = encodeWav(output.audio, rate);
  cachePut(key, wavBuffer);
  return wavBuffer;
}
|
|
250
304
|
|
|
251
|
-
async function* synthesizeStream(text) {
|
|
305
|
+
/**
 * Synthesize `text` sentence by sentence, yielding one WAV chunk per sentence.
 *
 * Each sentence is cached per voice. The TTS pipeline and the voice embedding
 * are loaded lazily on the first cache miss, so text that is fully cached is
 * served without loading the model or resolving the embedding.
 *
 * @param {string} text - Text to speak; split with `splitSentences`.
 * @param {string} [voiceId] - Voice identifier; a falsy value is keyed as 'default'.
 * @yields {*} The `encodeWav` result (or cached value) for each sentence, in order.
 */
async function* synthesizeStream(text, voiceId) {
  const sentences = splitSentences(text);
  const keyPrefix = (voiceId || 'default') + ':';

  // Populated together on the first sentence that is not already cached.
  let tts = null;
  let embeddings = null;

  for (const sentence of sentences) {
    const cacheKey = keyPrefix + sentence;
    const cached = ttsCache.get(cacheKey);
    if (cached) {
      // Delete + set moves the entry to the end of the Map's insertion order.
      ttsCache.delete(cacheKey);
      ttsCache.set(cacheKey, cached);
      yield cached;
      continue;
    }

    if (tts === null) {
      tts = await getTTS();
      embeddings = await loadVoiceEmbedding(voiceId);
    }

    const result = await tts(sentence, { speaker_embeddings: embeddings });
    const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
    cachePut(cacheKey, wav);
    yield wav;
  }
}
|
|
@@ -276,4 +331,4 @@ function getStatus() {
|
|
|
276
331
|
};
|
|
277
332
|
}
|
|
278
333
|
|
|
279
|
-
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus };
|
|
334
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -535,16 +535,27 @@ const server = http.createServer(async (req, res) => {
|
|
|
535
535
|
return;
|
|
536
536
|
}
|
|
537
537
|
|
|
538
|
+
if (routePath === '/api/voices' && req.method === 'GET') {
|
|
539
|
+
try {
|
|
540
|
+
const { getVoices } = await getSpeech();
|
|
541
|
+
sendJSON(req, res, 200, { ok: true, voices: getVoices() });
|
|
542
|
+
} catch (err) {
|
|
543
|
+
sendJSON(req, res, 200, { ok: true, voices: [] });
|
|
544
|
+
}
|
|
545
|
+
return;
|
|
546
|
+
}
|
|
547
|
+
|
|
538
548
|
if (routePath === '/api/tts' && req.method === 'POST') {
|
|
539
549
|
try {
|
|
540
550
|
const body = await parseBody(req);
|
|
541
551
|
const text = body.text || '';
|
|
552
|
+
const voiceId = body.voiceId || null;
|
|
542
553
|
if (!text) {
|
|
543
|
-
|
|
554
|
+
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
544
555
|
return;
|
|
545
556
|
}
|
|
546
557
|
const { synthesize } = await getSpeech();
|
|
547
|
-
const wavBuffer = await synthesize(text);
|
|
558
|
+
const wavBuffer = await synthesize(text, voiceId);
|
|
548
559
|
res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
|
|
549
560
|
res.end(wavBuffer);
|
|
550
561
|
} catch (err) {
|
|
@@ -558,6 +569,7 @@ const server = http.createServer(async (req, res) => {
|
|
|
558
569
|
try {
|
|
559
570
|
const body = await parseBody(req);
|
|
560
571
|
const text = body.text || '';
|
|
572
|
+
const voiceId = body.voiceId || null;
|
|
561
573
|
if (!text) {
|
|
562
574
|
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
563
575
|
return;
|
|
@@ -569,7 +581,7 @@ const server = http.createServer(async (req, res) => {
|
|
|
569
581
|
'X-Content-Type': 'audio/wav-stream',
|
|
570
582
|
'Cache-Control': 'no-cache'
|
|
571
583
|
});
|
|
572
|
-
for await (const wavChunk of synthesizeStream(text)) {
|
|
584
|
+
for await (const wavChunk of synthesizeStream(text, voiceId)) {
|
|
573
585
|
const lenBuf = Buffer.alloc(4);
|
|
574
586
|
lenBuf.writeUInt32BE(wavChunk.length, 0);
|
|
575
587
|
res.write(lenBuf);
|
|
@@ -589,7 +601,7 @@ const server = http.createServer(async (req, res) => {
|
|
|
589
601
|
const { getStatus } = await getSpeech();
|
|
590
602
|
sendJSON(req, res, 200, getStatus());
|
|
591
603
|
} catch (err) {
|
|
592
|
-
|
|
604
|
+
sendJSON(req, res, 200, { sttReady: false, ttsReady: false, sttLoading: false, ttsLoading: false });
|
|
593
605
|
}
|
|
594
606
|
return;
|
|
595
607
|
}
|
package/static/index.html
CHANGED
|
@@ -1056,6 +1056,28 @@
|
|
|
1056
1056
|
border-color: var(--color-error);
|
|
1057
1057
|
}
|
|
1058
1058
|
|
|
1059
|
+
.voice-selector-wrapper {
|
|
1060
|
+
display: flex;
|
|
1061
|
+
align-items: center;
|
|
1062
|
+
gap: 0.25rem;
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
.voice-selector {
|
|
1066
|
+
padding: 0.2rem 0.5rem;
|
|
1067
|
+
border: 1px solid var(--color-border);
|
|
1068
|
+
border-radius: 0.375rem;
|
|
1069
|
+
background: var(--color-bg-secondary);
|
|
1070
|
+
color: var(--color-text-primary);
|
|
1071
|
+
font-size: 0.75rem;
|
|
1072
|
+
cursor: pointer;
|
|
1073
|
+
max-width: 160px;
|
|
1074
|
+
}
|
|
1075
|
+
|
|
1076
|
+
.voice-selector:focus {
|
|
1077
|
+
outline: none;
|
|
1078
|
+
border-color: var(--color-primary);
|
|
1079
|
+
}
|
|
1080
|
+
|
|
1059
1081
|
.voice-empty {
|
|
1060
1082
|
text-align: center;
|
|
1061
1083
|
color: var(--color-text-secondary);
|
|
@@ -2146,6 +2168,11 @@
|
|
|
2146
2168
|
<input type="checkbox" id="voiceTTSToggle" checked>
|
|
2147
2169
|
<span>Auto-speak responses</span>
|
|
2148
2170
|
</label>
|
|
2171
|
+
<div class="voice-selector-wrapper">
|
|
2172
|
+
<select class="voice-selector" id="voiceSelector" title="Select voice">
|
|
2173
|
+
<option value="default">Default</option>
|
|
2174
|
+
</select>
|
|
2175
|
+
</div>
|
|
2149
2176
|
<button class="voice-stop-btn" id="voiceStopSpeaking" title="Stop speaking">Stop</button>
|
|
2150
2177
|
</div>
|
|
2151
2178
|
</div>
|
package/static/js/voice.js
CHANGED
|
@@ -14,12 +14,48 @@
|
|
|
14
14
|
var TARGET_SAMPLE_RATE = 16000;
|
|
15
15
|
var spokenChunks = new Set();
|
|
16
16
|
var isLoadingHistory = false;
|
|
17
|
+
var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
|
|
17
18
|
|
|
18
19
|
/**
 * Run the one-time setup steps of the voice UI, in a fixed order.
 */
function init() {
  // Steps are listed in execution order; each is invoked once with no arguments.
  var steps = [
    setupTTSToggle,
    setupUI,
    setupStreamingListener,
    setupAgentSelector,
    setupVoiceSelector
  ];
  for (var i = 0; i < steps.length; i++) {
    var step = steps[i];
    step();
  }
}
|
|
26
|
+
|
|
27
|
+
/**
 * Wire up the voice <select> element (#voiceSelector).
 *
 * Restores the voice id saved under the 'voice-selected-id' localStorage key,
 * replaces the select's options with the list served by `/api/voices`, and
 * persists every change the user makes. Does nothing when the element is
 * missing. If the voice list cannot be loaded, or comes back empty, the
 * options already present in the page are left untouched.
 *
 * @returns {void}
 */
function setupVoiceSelector() {
  var selector = document.getElementById('voiceSelector');
  if (!selector) return;
  var saved = localStorage.getItem('voice-selected-id');
  if (saved) selectedVoiceId = saved;
  fetch(BASE + '/api/voices')
    .then(function(res) { return res.json(); })
    .then(function(data) {
      // An "ok" reply with an empty list must not wipe the options that are
      // already in the page, otherwise the dropdown ends up with no entries.
      if (!data || !data.ok || !Array.isArray(data.voices) || data.voices.length === 0) return;
      // Read the current choice here (not the value captured at setup time) so
      // a change made while the request was in flight is respected.
      var wanted = selectedVoiceId;
      var wantedIsAvailable = false;
      selector.innerHTML = '';
      data.voices.forEach(function(voice) {
        var opt = document.createElement('option');
        opt.value = voice.id;
        var label = voice.name;
        if (voice.gender || voice.accent) {
          var parts = [];
          if (voice.gender) parts.push(voice.gender);
          if (voice.accent) parts.push(voice.accent);
          label += ' (' + parts.join(', ') + ')';
        }
        opt.textContent = label;
        selector.appendChild(opt);
        // Compare values directly instead of building a CSS selector from a
        // stored string, which throws on ids containing quotes or brackets.
        if (String(voice.id) === wanted) wantedIsAvailable = true;
      });
      if (wantedIsAvailable) {
        selector.value = wanted;
      } else {
        // The remembered voice is no longer offered: follow what the select
        // now shows so TTS requests match the visible choice.
        selectedVoiceId = selector.value || 'default';
      }
    })
    .catch(function(err) {
      // Best effort: keep the existing options, but do not hide the failure.
      console.warn('Failed to load voice list', err);
    });
  selector.addEventListener('change', function() {
    selectedVoiceId = selector.value;
    localStorage.setItem('voice-selected-id', selectedVoiceId);
  });
}
|
|
24
60
|
|
|
25
61
|
function syncVoiceSelector() {
|
|
@@ -289,7 +325,7 @@
|
|
|
289
325
|
fetch(BASE + '/api/tts-stream', {
|
|
290
326
|
method: 'POST',
|
|
291
327
|
headers: { 'Content-Type': 'application/json' },
|
|
292
|
-
body: JSON.stringify({ text: text })
|
|
328
|
+
body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
|
|
293
329
|
}).then(function(resp) {
|
|
294
330
|
if (!resp.ok) throw new Error('TTS failed');
|
|
295
331
|
var reader = resp.body.getReader();
|