agentgui 1.0.168 → 1.0.170
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +184 -17
- package/package.json +2 -1
- package/server.js +16 -4
- package/static/index.html +27 -0
- package/static/js/voice.js +119 -44
package/lib/speech.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { createRequire } from 'module';
|
|
2
2
|
import fs from 'fs';
|
|
3
|
+
import os from 'os';
|
|
3
4
|
import path from 'path';
|
|
4
5
|
import { fileURLToPath } from 'url';
|
|
5
6
|
|
|
@@ -7,19 +8,74 @@ const require = createRequire(import.meta.url);
|
|
|
7
8
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
8
9
|
const ROOT = path.dirname(__dirname);
|
|
9
10
|
const DATA_DIR = path.join(ROOT, 'data');
|
|
11
|
+
const VOICES_DIR = path.join(ROOT, 'voices');
|
|
12
|
+
const HOME_VOICES_DIR = path.join(os.homedir(), 'voices');
|
|
13
|
+
const AUDIO_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.flac', '.m4a'];
|
|
14
|
+
const MIN_WAV_SIZE = 1000;
|
|
10
15
|
|
|
11
|
-
const
|
|
16
|
+
const BASE_VOICES = [
|
|
17
|
+
{ id: 'default', name: 'Default', gender: 'male', accent: 'US' },
|
|
18
|
+
{ id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
|
|
19
|
+
{ id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
|
|
20
|
+
{ id: 'clb', name: 'CLB', gender: 'female', accent: 'US' },
|
|
21
|
+
{ id: 'rms', name: 'RMS', gender: 'male', accent: 'US' },
|
|
22
|
+
{ id: 'awb', name: 'AWB', gender: 'male', accent: 'Scottish' },
|
|
23
|
+
{ id: 'jmk', name: 'JMK', gender: 'male', accent: 'Canadian' },
|
|
24
|
+
{ id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
|
|
25
|
+
];
|
|
26
|
+
|
|
27
|
+
/**
 * Lists the custom voice samples stored in one directory.
 *
 * Every entry whose extension is in AUDIO_EXTENSIONS (compared
 * case-insensitively) yields one voice descriptor. The id is "custom_"
 * followed by the file name without extension, with every character outside
 * [a-zA-Z0-9_-] replaced by "_"; the display name is the same base name with
 * underscores turned into spaces.
 *
 * @param {string} dir - Directory to scan; it does not have to exist.
 * @returns {Object[]} Descriptors of the form
 *   { id, name, gender: 'custom', accent: 'custom', isCustom: true, sourceDir }.
 *   Empty when the directory is missing; on a read error the error is logged
 *   and whatever was collected so far is returned.
 */
function scanVoiceDir(dir) {
  const voices = [];
  try {
    if (!fs.existsSync(dir)) return voices;
    for (const file of fs.readdirSync(dir)) {
      const rawExt = path.extname(file);
      if (!AUDIO_EXTENSIONS.includes(rawExt.toLowerCase())) continue;
      // Strip the extension exactly as spelled on disk: path.basename()
      // matches the suffix case-sensitively, so passing the lower-cased
      // extension would leave ".MP3" glued to both the id and the name.
      const baseName = path.basename(file, rawExt);
      const id = 'custom_' + baseName.replace(/[^a-zA-Z0-9_-]/g, '_');
      const name = baseName.replace(/_/g, ' ');
      voices.push({ id, name, gender: 'custom', accent: 'custom', isCustom: true, sourceDir: dir });
    }
  } catch (err) {
    // Best effort: a broken voices directory must not take voice listing down.
    console.error('[VOICES] Error scanning', dir + ':', err.message);
  }
  return voices;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Collects the custom voices from the package-level and the home-level
 * voices directories, in that order.
 *
 * When the same id shows up more than once, the first occurrence wins, so a
 * voice in VOICES_DIR shadows one with the same id in HOME_VOICES_DIR.
 *
 * @returns {Object[]} De-duplicated voice descriptors in discovery order.
 */
function loadCustomVoices() {
  // A Map keeps insertion order, which gives "first seen wins" for free.
  const firstById = new Map();
  [VOICES_DIR, HOME_VOICES_DIR].forEach((directory) => {
    scanVoiceDir(directory).forEach((voice) => {
      if (!firstById.has(voice.id)) firstById.set(voice.id, voice);
    });
  });
  return Array.from(firstById.values());
}
|
|
57
|
+
|
|
58
|
+
/**
 * Returns every selectable voice: the built-in ones first, then the custom
 * voices discovered on disk at call time.
 *
 * @returns {Object[]} A new array; BASE_VOICES itself is not modified.
 */
function getVoices() {
  const customVoices = loadCustomVoices();
  return BASE_VOICES.concat(customVoices);
}
|
|
61
|
+
|
|
62
|
+
const SPEAKER_OFFSETS = { awb: 0, bdl: 1200, clb: 2300, jmk: 3500, ksp: 4700, rms: 5900, slt: 7100 };
|
|
63
|
+
const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_embeddings/resolve/main/spkrec-xvectors-voxceleb.hf';
|
|
12
64
|
const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
|
|
13
|
-
const
|
|
14
|
-
const
|
|
15
|
-
const MIN_WAV_SIZE = 44;
|
|
65
|
+
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
|
|
66
|
+
const SAMPLES_TO_AVERAGE = 30;
|
|
16
67
|
|
|
17
68
|
let transformersModule = null;
|
|
18
69
|
let sttPipeline = null;
|
|
19
70
|
let ttsPipeline = null;
|
|
20
71
|
let speakerEmbeddings = null;
|
|
72
|
+
let speakerEmbeddingPipeline = null;
|
|
21
73
|
let sttLoading = false;
|
|
22
74
|
let ttsLoading = false;
|
|
75
|
+
let speakerEmbeddingLoading = false;
|
|
76
|
+
const voiceEmbeddingsCache = new Map();
|
|
77
|
+
const SAMPLE_RATE_STT = 16000;
|
|
78
|
+
const SAMPLE_RATE_TTS = 16000;
|
|
23
79
|
|
|
24
80
|
const TTS_CACHE_MAX = 100;
|
|
25
81
|
const ttsCache = new Map();
|
|
@@ -52,6 +108,115 @@ async function ensureSpeakerEmbeddings() {
|
|
|
52
108
|
return speakerEmbeddings;
|
|
53
109
|
}
|
|
54
110
|
|
|
111
|
+
/**
 * Resolves the speaker embedding used to synthesize speech for a voice.
 *
 * Resolution order:
 *   1. no id, 'default', or an id that is not a plain [a-zA-Z0-9_-]+ string:
 *      the default embedding from ensureSpeakerEmbeddings();
 *   2. the in-memory cache;
 *   3. a previously written DATA_DIR/speaker_ID.bin file;
 *   4. ids starting with 'custom_': generateEmbeddingFromCustomVoice();
 *   5. ids listed in SPEAKER_OFFSETS: the average of the matching x-vectors
 *      fetched from DATASET_API, which is then written to disk and cached;
 *   6. anything else: the default embedding.
 *
 * @param {string} [voiceId] - Voice identifier as sent by the client.
 * @returns {Promise<Float32Array>} The embedding (or whatever
 *   ensureSpeakerEmbeddings() resolves to for the fallback cases).
 * @throws {Error} When the dataset request returns a non-OK status.
 */
async function loadVoiceEmbedding(voiceId) {
  if (!voiceId || voiceId === 'default') return ensureSpeakerEmbeddings();
  // voiceId comes from the request body and ends up inside a file name.
  // Ids produced by this module only use [a-zA-Z0-9_-]; anything else (path
  // separators, '..', non-strings) cannot be a real voice, so treat it like
  // any other unknown voice instead of letting it reach path.join().
  if (typeof voiceId !== 'string' || !/^[a-zA-Z0-9_-]+$/.test(voiceId)) {
    return ensureSpeakerEmbeddings();
  }
  if (voiceEmbeddingsCache.has(voiceId)) return voiceEmbeddingsCache.get(voiceId);

  const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
  if (fs.existsSync(binPath)) {
    const buf = fs.readFileSync(binPath);
    // A truncated file would make the Float32Array constructor throw; skip it
    // and fall through so the embedding is regenerated and rewritten.
    if (buf.length > 0 && buf.length % Float32Array.BYTES_PER_ELEMENT === 0) {
      // Copy into a fresh Uint8Array first: a Buffer may be a view into a
      // shared pool at an offset that is not 4-byte aligned.
      const emb = new Float32Array(new Uint8Array(buf).buffer);
      voiceEmbeddingsCache.set(voiceId, emb);
      return emb;
    }
  }

  if (voiceId.startsWith('custom_')) {
    return generateEmbeddingFromCustomVoice(voiceId);
  }

  // Own-property check so inherited keys such as 'constructor' or 'toString'
  // are not mistaken for a dataset offset.
  if (!Object.hasOwn(SPEAKER_OFFSETS, voiceId)) return ensureSpeakerEmbeddings();
  const offset = SPEAKER_OFFSETS[voiceId];

  const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
  const resp = await fetch(url);
  if (!resp.ok) throw new Error('Failed to fetch voice embeddings for ' + voiceId);
  const data = await resp.json();
  const rows = Array.isArray(data?.rows) ? data.rows : [];

  const avg = new Float32Array(512);
  let count = 0;
  for (const item of rows) {
    const filename = item?.row?.filename;
    const xvector = item?.row?.xvector;
    if (typeof filename !== 'string' || !xvector) continue;
    // Only rows recorded by the requested speaker take part in the average.
    const match = filename.match(/cmu_us_(\w+)_arctic/);
    if (match && match[1] === voiceId) {
      for (let i = 0; i < 512; i++) avg[i] += xvector[i];
      count++;
    }
  }
  if (count === 0) return ensureSpeakerEmbeddings();
  for (let i = 0; i < 512; i++) avg[i] /= count;

  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
  fs.writeFileSync(binPath, Buffer.from(avg.buffer));
  voiceEmbeddingsCache.set(voiceId, avg);
  return avg;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Lazily creates, then reuses, the feature-extraction pipeline used to turn
 * a custom voice sample into a speaker embedding.
 *
 * Only one load runs at a time: while `speakerEmbeddingLoading` is set,
 * other callers poll every 100 ms until the in-flight load settles and then
 * either return the shared pipeline or fail.
 *
 * @returns {Promise<Function>} The loaded pipeline.
 * @throws {Error} 'Speaker embedding pipeline failed to load' for callers
 *   that waited on a load that failed; 'Speaker embedding model load
 *   failed: ...' (with the original error attached as `cause`) for the
 *   caller that performed the load.
 */
async function getSpeakerEmbeddingPipeline() {
  if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
  if (speakerEmbeddingLoading) {
    while (speakerEmbeddingLoading) await new Promise((resolve) => setTimeout(resolve, 100));
    if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding pipeline failed to load');
    return speakerEmbeddingPipeline;
  }
  speakerEmbeddingLoading = true;
  try {
    const { pipeline, env } = await loadTransformers();
    env.allowRemoteModels = true;
    speakerEmbeddingPipeline = await pipeline('feature-extraction', 'speechbrain/spkrec-xvectors-voxceleb', {
      device: 'cpu',
      dtype: 'fp32',
    });
    return speakerEmbeddingPipeline;
  } catch (err) {
    speakerEmbeddingPipeline = null;
    // Keep the original error (and its stack) reachable through `cause`
    // instead of flattening it into the message only.
    throw new Error('Speaker embedding model load failed: ' + err.message, { cause: err });
  } finally {
    // Always release the flag so pollers stop waiting, success or failure.
    speakerEmbeddingLoading = false;
  }
}
|
|
170
|
+
|
|
171
|
+
/**
 * Locates the audio file behind a custom voice id.
 *
 * VOICES_DIR is searched before HOME_VOICES_DIR. Inside each directory an
 * exact "NAME.ext" match (extensions tried in AUDIO_EXTENSIONS order) is
 * preferred; otherwise the directory is listed and the first audio file whose
 * sanitized base name equals NAME is used. The second step is what finds
 * files such as "my voice (1).mp3", whose id is "custom_my_voice__1_".
 *
 * @param {string} voiceId - Id of the form "custom_NAME".
 * @returns {string|null} Path of the matching file, or null when the id is
 *   malformed or no file matches.
 */
function findCustomVoiceFile(voiceId) {
  if (typeof voiceId !== 'string') return null;
  const baseName = voiceId.replace(/^custom_/, '');
  // Voice ids only ever contain [a-zA-Z0-9_-]. Rejecting everything else
  // keeps path separators and '..' (the id is client-supplied) out of
  // path.join(), so a lookup can never leave the voices directories.
  if (!/^[a-zA-Z0-9_-]+$/.test(baseName)) return null;

  for (const dir of [VOICES_DIR, HOME_VOICES_DIR]) {
    for (const ext of AUDIO_EXTENSIONS) {
      const candidate = path.join(dir, baseName + ext);
      if (fs.existsSync(candidate)) return candidate;
    }

    let entries;
    try {
      entries = fs.readdirSync(dir);
    } catch {
      // Missing or unreadable directory: nothing to match here.
      continue;
    }
    for (const file of entries) {
      const rawExt = path.extname(file);
      if (!AUDIO_EXTENSIONS.includes(rawExt.toLowerCase())) continue;
      const sanitized = path.basename(file, rawExt).replace(/[^a-zA-Z0-9_-]/g, '_');
      if (sanitized === baseName) return path.join(dir, file);
    }
  }
  return null;
}
|
|
181
|
+
|
|
182
|
+
/**
 * Reads an audio file and hands its samples to resampleTo16k().
 *
 * ".wav" files (extension compared case-insensitively) are decoded with the
 * in-file decodeWavToFloat32() helper; every other format is decoded through
 * the `audio-decode` package, which is imported on first use.
 *
 * @param {string} filePath - Path of the audio file to decode.
 * @returns {Promise<*>} Whatever resampleTo16k() returns for the decoded
 *   samples (presumably mono 16 kHz Float32 samples; confirm against that
 *   helper).
 * @throws {Error} When the file cannot be read or decoded.
 */
async function decodeAudioFile(filePath) {
  // Asynchronous read: this runs while the HTTP server is handling requests,
  // and a synchronous read of a large sample would stall the event loop.
  const buf = await fs.promises.readFile(filePath);
  const ext = path.extname(filePath).toLowerCase();
  if (ext === '.wav') {
    const decoded = decodeWavToFloat32(buf);
    return resampleTo16k(decoded.audio, decoded.sampleRate);
  }
  const decode = (await import('audio-decode')).default;
  const audioBuffer = await decode(buf);
  // Only channel 0 is used; further channels of a multi-channel file are ignored.
  const mono = audioBuffer.getChannelData(0);
  return resampleTo16k(mono, audioBuffer.sampleRate);
}
|
|
194
|
+
|
|
195
|
+
/**
 * Builds a 512-value speaker embedding from the audio sample of a custom
 * voice, writes it to DATA_DIR/speaker_ID.bin and stores it in the in-memory
 * cache.
 *
 * When no sample file exists for the id, the problem is logged and the
 * default embedding from ensureSpeakerEmbeddings() is returned instead
 * (nothing is cached or written in that case).
 *
 * @param {string} voiceId - Id of the custom voice ("custom_NAME").
 * @returns {Promise<Float32Array>} The new embedding, or the default one.
 * @throws {Error} When the decoded sample holds fewer than
 *   SAMPLE_RATE_STT * 0.5 samples, or when decoding / model loading fails.
 */
async function generateEmbeddingFromCustomVoice(voiceId) {
  const samplePath = findCustomVoiceFile(voiceId);
  if (!samplePath) {
    console.error('[VOICES] Custom voice file not found for:', voiceId);
    return ensureSpeakerEmbeddings();
  }
  console.log('[VOICES] Generating embedding from:', samplePath);

  const samples = await decodeAudioFile(samplePath);
  const minimumSamples = SAMPLE_RATE_STT * 0.5;
  if (samples.length < minimumSamples) {
    throw new Error('Audio too short for embedding extraction (need at least 0.5 seconds)');
  }

  const extractor = await getSpeakerEmbeddingPipeline();
  const features = await extractor(samples, { pooling: 'mean', normalize: true });

  // Copy at most 512 values; a shorter model output leaves the tail at zero.
  const vector = new Float32Array(512);
  const usable = Math.min(512, features.data.length);
  for (let idx = 0; idx < usable; idx += 1) {
    vector[idx] = features.data[idx];
  }

  const outputPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
  fs.writeFileSync(outputPath, Buffer.from(vector.buffer));
  voiceEmbeddingsCache.set(voiceId, vector);
  console.log('[VOICES] Generated embedding for custom voice:', voiceId);
  return vector;
}
|
|
219
|
+
|
|
55
220
|
async function getSTT() {
|
|
56
221
|
if (sttPipeline) return sttPipeline;
|
|
57
222
|
if (sttLoading) {
|
|
@@ -233,36 +398,38 @@ function cachePut(key, buf) {
|
|
|
233
398
|
ttsCache.set(key, buf);
|
|
234
399
|
}
|
|
235
400
|
|
|
236
|
-
async function synthesize(text) {
|
|
237
|
-
const
|
|
401
|
+
/**
 * Synthesizes `text` with the given voice and returns the encoded WAV.
 *
 * Results are cached per voice and text. A cache hit is re-inserted so it
 * becomes the most recently used entry of `ttsCache`.
 *
 * @param {string} text - Text to speak.
 * @param {string} [voiceId] - Voice to use; falsy values mean 'default'.
 * @returns {Promise<*>} The WAV data produced by encodeWav().
 */
async function synthesize(text, voiceId) {
  const voiceKey = voiceId || 'default';
  const key = voiceKey + ':' + text;
  const hit = ttsCache.get(key);

  if (!hit) {
    const engine = await getTTS();
    const speaker = await loadVoiceEmbedding(voiceId);
    const output = await engine(text, { speaker_embeddings: speaker });
    const encoded = encodeWav(output.audio, output.sampling_rate || SAMPLE_RATE_TTS);
    cachePut(key, encoded);
    return encoded;
  }

  // Delete + set moves the entry to the end of the Map's insertion order.
  ttsCache.delete(key);
  ttsCache.set(key, hit);
  return hit;
}
|
|
250
416
|
|
|
251
|
-
async function* synthesizeStream(text) {
|
|
417
|
+
/**
 * Synthesizes `text` sentence by sentence, yielding one encoded WAV per
 * sentence as soon as it is available.
 *
 * The TTS pipeline and the voice embedding are resolved once, before the
 * first sentence. Each sentence is looked up in `ttsCache` under its own
 * voice + sentence key; hits are refreshed and yielded without synthesis.
 *
 * @param {string} text - Text to speak; split with splitSentences().
 * @param {string} [voiceId] - Voice to use; falsy values mean 'default'.
 * @yields {*} The WAV data produced by encodeWav() for each sentence.
 */
async function* synthesizeStream(text, voiceId) {
  const parts = splitSentences(text);
  const engine = await getTTS();
  const speaker = await loadVoiceEmbedding(voiceId);
  const voiceKey = voiceId || 'default';

  for (const part of parts) {
    const key = voiceKey + ':' + part;
    const hit = ttsCache.get(key);

    if (!hit) {
      const output = await engine(part, { speaker_embeddings: speaker });
      const encoded = encodeWav(output.audio, output.sampling_rate || SAMPLE_RATE_TTS);
      cachePut(key, encoded);
      yield encoded;
    } else {
      // Delete + set moves the entry to the end of the Map's insertion order.
      ttsCache.delete(key);
      ttsCache.set(key, hit);
      yield hit;
    }
  }
}
|
|
@@ -276,4 +443,4 @@ function getStatus() {
|
|
|
276
443
|
};
|
|
277
444
|
}
|
|
278
445
|
|
|
279
|
-
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus };
|
|
446
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getTTS, getStatus, getVoices };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgui",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.170",
|
|
4
4
|
"description": "Multi-agent ACP client with real-time communication",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "server.js",
|
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
"dependencies": {
|
|
24
24
|
"@anthropic-ai/claude-code": "^2.1.37",
|
|
25
25
|
"@huggingface/transformers": "^3.8.1",
|
|
26
|
+
"audio-decode": "^2.2.3",
|
|
26
27
|
"better-sqlite3": "^12.6.2",
|
|
27
28
|
"busboy": "^1.6.0",
|
|
28
29
|
"express": "^5.2.1",
|
package/server.js
CHANGED
|
@@ -535,16 +535,27 @@ const server = http.createServer(async (req, res) => {
|
|
|
535
535
|
return;
|
|
536
536
|
}
|
|
537
537
|
|
|
538
|
+
if (routePath === '/api/voices' && req.method === 'GET') {
|
|
539
|
+
try {
|
|
540
|
+
const { getVoices } = await getSpeech();
|
|
541
|
+
sendJSON(req, res, 200, { ok: true, voices: getVoices() });
|
|
542
|
+
} catch (err) {
|
|
543
|
+
sendJSON(req, res, 200, { ok: true, voices: [] });
|
|
544
|
+
}
|
|
545
|
+
return;
|
|
546
|
+
}
|
|
547
|
+
|
|
538
548
|
if (routePath === '/api/tts' && req.method === 'POST') {
|
|
539
549
|
try {
|
|
540
550
|
const body = await parseBody(req);
|
|
541
551
|
const text = body.text || '';
|
|
552
|
+
const voiceId = body.voiceId || null;
|
|
542
553
|
if (!text) {
|
|
543
|
-
|
|
554
|
+
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
544
555
|
return;
|
|
545
556
|
}
|
|
546
557
|
const { synthesize } = await getSpeech();
|
|
547
|
-
const wavBuffer = await synthesize(text);
|
|
558
|
+
const wavBuffer = await synthesize(text, voiceId);
|
|
548
559
|
res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
|
|
549
560
|
res.end(wavBuffer);
|
|
550
561
|
} catch (err) {
|
|
@@ -558,6 +569,7 @@ const server = http.createServer(async (req, res) => {
|
|
|
558
569
|
try {
|
|
559
570
|
const body = await parseBody(req);
|
|
560
571
|
const text = body.text || '';
|
|
572
|
+
const voiceId = body.voiceId || null;
|
|
561
573
|
if (!text) {
|
|
562
574
|
sendJSON(req, res, 400, { error: 'No text provided' });
|
|
563
575
|
return;
|
|
@@ -569,7 +581,7 @@ const server = http.createServer(async (req, res) => {
|
|
|
569
581
|
'X-Content-Type': 'audio/wav-stream',
|
|
570
582
|
'Cache-Control': 'no-cache'
|
|
571
583
|
});
|
|
572
|
-
for await (const wavChunk of synthesizeStream(text)) {
|
|
584
|
+
for await (const wavChunk of synthesizeStream(text, voiceId)) {
|
|
573
585
|
const lenBuf = Buffer.alloc(4);
|
|
574
586
|
lenBuf.writeUInt32BE(wavChunk.length, 0);
|
|
575
587
|
res.write(lenBuf);
|
|
@@ -589,7 +601,7 @@ const server = http.createServer(async (req, res) => {
|
|
|
589
601
|
const { getStatus } = await getSpeech();
|
|
590
602
|
sendJSON(req, res, 200, getStatus());
|
|
591
603
|
} catch (err) {
|
|
592
|
-
|
|
604
|
+
sendJSON(req, res, 200, { sttReady: false, ttsReady: false, sttLoading: false, ttsLoading: false });
|
|
593
605
|
}
|
|
594
606
|
return;
|
|
595
607
|
}
|
package/static/index.html
CHANGED
|
@@ -1056,6 +1056,28 @@
|
|
|
1056
1056
|
border-color: var(--color-error);
|
|
1057
1057
|
}
|
|
1058
1058
|
|
|
1059
|
+
.voice-selector-wrapper {
|
|
1060
|
+
display: flex;
|
|
1061
|
+
align-items: center;
|
|
1062
|
+
gap: 0.25rem;
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
.voice-selector {
|
|
1066
|
+
padding: 0.2rem 0.5rem;
|
|
1067
|
+
border: 1px solid var(--color-border);
|
|
1068
|
+
border-radius: 0.375rem;
|
|
1069
|
+
background: var(--color-bg-secondary);
|
|
1070
|
+
color: var(--color-text-primary);
|
|
1071
|
+
font-size: 0.75rem;
|
|
1072
|
+
cursor: pointer;
|
|
1073
|
+
max-width: 160px;
|
|
1074
|
+
}
|
|
1075
|
+
|
|
1076
|
+
.voice-selector:focus {
|
|
1077
|
+
outline: none;
|
|
1078
|
+
border-color: var(--color-primary);
|
|
1079
|
+
}
|
|
1080
|
+
|
|
1059
1081
|
.voice-empty {
|
|
1060
1082
|
text-align: center;
|
|
1061
1083
|
color: var(--color-text-secondary);
|
|
@@ -2146,6 +2168,11 @@
|
|
|
2146
2168
|
<input type="checkbox" id="voiceTTSToggle" checked>
|
|
2147
2169
|
<span>Auto-speak responses</span>
|
|
2148
2170
|
</label>
|
|
2171
|
+
<div class="voice-selector-wrapper">
|
|
2172
|
+
<select class="voice-selector" id="voiceSelector" title="Select voice">
|
|
2173
|
+
<option value="default">Default</option>
|
|
2174
|
+
</select>
|
|
2175
|
+
</div>
|
|
2149
2176
|
<button class="voice-stop-btn" id="voiceStopSpeaking" title="Stop speaking">Stop</button>
|
|
2150
2177
|
</div>
|
|
2151
2178
|
</div>
|
package/static/js/voice.js
CHANGED
|
@@ -14,12 +14,62 @@
|
|
|
14
14
|
var TARGET_SAMPLE_RATE = 16000;
|
|
15
15
|
var spokenChunks = new Set();
|
|
16
16
|
var isLoadingHistory = false;
|
|
17
|
+
var selectedVoiceId = localStorage.getItem('voice-selected-id') || 'default';
|
|
17
18
|
|
|
18
19
|
// Runs every voice-panel setup step once, in a fixed order.
function init() {
  var setupSteps = [
    setupTTSToggle,
    setupUI,
    setupStreamingListener,
    setupAgentSelector,
    setupVoiceSelector
  ];
  for (var i = 0; i < setupSteps.length; i++) {
    // Call through a plain local so each step runs as an ordinary function call.
    var runStep = setupSteps[i];
    runStep();
  }
}
|
|
26
|
+
|
|
27
|
+
/**
 * Fills the voice dropdown (#voiceSelector) from GET /api/voices and keeps
 * `selectedVoiceId` and localStorage ('voice-selected-id') in sync with the
 * user's choice.
 *
 * Voices are grouped into "Built-in Voices" and "Custom Voices". If the
 * request fails, or the server reports no voices, the options already present
 * in the markup are left untouched. If the remembered voice is not in the
 * list any more, the in-memory selection falls back to a listed voice so the
 * dropdown and the voice actually requested agree; the stored preference is
 * not overwritten, so it comes back once that voice is available again.
 */
function setupVoiceSelector() {
  var selector = document.getElementById('voiceSelector');
  if (!selector) return;
  var saved = localStorage.getItem('voice-selected-id');
  if (saved) selectedVoiceId = saved;

  // Label for a built-in voice: "Name (gender, accent)".
  function describeBuiltIn(voice) {
    var parts = [];
    if (voice.gender) parts.push(voice.gender);
    if (voice.accent) parts.push(voice.accent);
    return voice.name + (parts.length ? ' (' + parts.join(', ') + ')' : '');
  }

  // Label for a custom voice: just its name.
  function describeCustom(voice) {
    return voice.name;
  }

  // Appends one optgroup with an option per voice and records every id added.
  function appendGroup(label, voices, describe, knownIds) {
    if (!voices.length) return;
    var group = document.createElement('optgroup');
    group.label = label;
    voices.forEach(function(voice) {
      var opt = document.createElement('option');
      opt.value = voice.id;
      opt.textContent = describe(voice);
      group.appendChild(opt);
      knownIds.push(voice.id);
    });
    selector.appendChild(group);
  }

  fetch(BASE + '/api/voices')
    .then(function(res) {
      if (!res.ok) throw new Error('Voice list request failed: ' + res.status);
      return res.json();
    })
    .then(function(data) {
      if (!data || !data.ok || !Array.isArray(data.voices)) return;
      // The server answers { ok: true, voices: [] } when it cannot load its
      // speech module; clearing the dropdown then would leave it empty.
      if (data.voices.length === 0) return;

      selector.innerHTML = '';
      var knownIds = [];
      appendGroup('Built-in Voices', data.voices.filter(function(v) { return !v.isCustom; }), describeBuiltIn, knownIds);
      appendGroup('Custom Voices', data.voices.filter(function(v) { return v.isCustom; }), describeCustom, knownIds);

      // Compare ids directly instead of building a CSS selector from the
      // stored value: a quote or bracket in it would make querySelector throw.
      if (knownIds.indexOf(selectedVoiceId) === -1) {
        selectedVoiceId = knownIds.indexOf('default') !== -1 ? 'default' : knownIds[0];
      }
      selector.value = selectedVoiceId;
    })
    .catch(function(err) {
      // Best effort: keep the static options, but leave a trace for debugging.
      console.warn('[voice] Could not load voice list:', err);
    });

  selector.addEventListener('change', function() {
    selectedVoiceId = selector.value;
    localStorage.setItem('voice-selected-id', selectedVoiceId);
  });
}
|
|
24
74
|
|
|
25
75
|
function syncVoiceSelector() {
|
|
@@ -286,53 +336,78 @@
|
|
|
286
336
|
var text = speechQueue.shift();
|
|
287
337
|
audioChunkQueue = [];
|
|
288
338
|
isPlayingChunk = false;
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
var c = new Uint8Array(a.length + b.length);
|
|
300
|
-
c.set(a, 0);
|
|
301
|
-
c.set(b, a.length);
|
|
302
|
-
return c;
|
|
303
|
-
}
|
|
339
|
+
|
|
340
|
+
// Requests speech for `text` from /api/tts-stream and queues every audio
// chunk for playback as soon as it is complete.
//
// The response body is a sequence of frames: a 4-byte big-endian length
// followed by that many bytes of WAV data. If the request fails before any
// audio was queued, the whole text is retried through tryNonStreaming(). If
// it fails after audio was already queued, the stream is simply treated as
// finished: retrying would synthesize and speak the text from the start
// again, repeating what the listener has already heard.
function tryStreaming() {
  // Becomes true once at least one chunk has been handed to the play queue.
  var receivedAudio = false;

  // Marks the stream as finished and, when nothing is playing or queued,
  // moves on to the next queued utterance.
  function finishStream() {
    streamDone = true;
    if (!isPlayingChunk && audioChunkQueue.length === 0) {
      isSpeaking = false;
      processQueue();
    }
  }

  fetch(BASE + '/api/tts-stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
  }).then(function(resp) {
    if (!resp.ok) throw new Error('TTS stream failed');
    var reader = resp.body.getReader();
    // Bytes received so far that do not yet form a complete frame.
    var buffer = new Uint8Array(0);

    function concat(a, b) {
      var c = new Uint8Array(a.length + b.length);
      c.set(a, 0);
      c.set(b, a.length);
      return c;
    }

    function pump() {
      return reader.read().then(function(result) {
        if (result.done) {
          finishStream();
          return;
        }
        buffer = concat(buffer, result.value);
        // Peel off every complete frame; a partial frame waits for more data.
        while (buffer.length >= 4) {
          var view = new DataView(buffer.buffer, buffer.byteOffset, 4);
          var chunkLen = view.getUint32(0, false);
          if (buffer.length < 4 + chunkLen) break;
          var wavData = buffer.slice(4, 4 + chunkLen);
          buffer = buffer.slice(4 + chunkLen);
          var blob = new Blob([wavData], { type: 'audio/wav' });
          receivedAudio = true;
          audioChunkQueue.push(blob);
          if (!isPlayingChunk) playNextChunk();
        }
        return pump();
      });
    }

    return pump();
  }).catch(function() {
    if (receivedAudio) {
      finishStream();
      return;
    }
    tryNonStreaming(text);
  });
}
|
|
387
|
+
|
|
388
|
+
// Fallback for when streaming is unavailable: requests the whole utterance
// from /api/tts as a single WAV and queues it for playback.
//
// On success the utterance is handled exactly like a one-chunk stream: the
// chunk is queued, playback is started, the stream is marked finished, and
// the speech queue only advances right away when nothing is playing or
// queued. Advancing unconditionally would start the next utterance while
// this one is still playing.
// NOTE(review): like the streaming path, this assumes the end-of-playback
// handling behind playNextChunk() moves the queue on once streamDone is set;
// confirm against playNextChunk.
//
// On failure the utterance is dropped and the queue moves on.
function tryNonStreaming(txt) {
  fetch(BASE + '/api/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
  }).then(function(resp) {
    if (!resp.ok) throw new Error('TTS failed');
    return resp.arrayBuffer();
  }).then(function(buf) {
    var blob = new Blob([buf], { type: 'audio/wav' });
    audioChunkQueue.push(blob);
    if (!isPlayingChunk) playNextChunk();
    streamDone = true;
    if (!isPlayingChunk && audioChunkQueue.length === 0) {
      isSpeaking = false;
      processQueue();
    }
  }).catch(function() {
    streamDone = true;
    isSpeaking = false;
    processQueue();
  });
}
|
|
409
|
+
|
|
410
|
+
tryStreaming();
|
|
336
411
|
}
|
|
337
412
|
|
|
338
413
|
function stopSpeaking() {
|