agentgui 1.0.176 → 1.0.178
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +89 -44
- package/package.json +1 -1
- package/static/js/voice.js +6 -1
package/lib/speech.js
CHANGED
|
@@ -8,7 +8,7 @@ const require = createRequire(import.meta.url);
|
|
|
8
8
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
9
9
|
const ROOT = path.dirname(__dirname);
|
|
10
10
|
const DATA_DIR = path.join(ROOT, 'data');
|
|
11
|
-
const AUDIO_EXTENSIONS = ['.
|
|
11
|
+
const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
|
|
12
12
|
|
|
13
13
|
function getVoiceDirs() {
|
|
14
14
|
const dirs = [];
|
|
@@ -106,6 +106,7 @@ const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_e
|
|
|
106
106
|
const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
|
|
107
107
|
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
|
|
108
108
|
const SAMPLES_TO_AVERAGE = 30;
|
|
109
|
+
const DEFAULT_EMBEDDING_B64 = 'xhibvao34LylqXQ8cNg7Pd1cCTw0keG8awRRvRqje7070G48AtOgPMFbnr1oeKC9I4ZuPZzqGT1DjWs8y3iMPB/SZLzdl7E6b9QaPKSpHTwYuh49FrMlO9YnebwmTzu9/3CPvQuvCbxsSWC9Sb2bO+tvXj0Cjpo8mTMxu/FDrjzQ4x09gyxCvUn6STxjAo+9vtXdPJtsYT3iMna9dQ+EvfQ72zuvxk69GAonPU8KdjsNPAU96e/8veN7lrwgyzk8HA5vvYE1Rz3gpZ484MsLPUKkxTzM54U81ECwvcbFHzv8gT08T6/7POCqBT2fv5E8fvsXPfZiJrzEhme8dg8kPR+mKTutQOU822maPMlMDb1x/IS93+6KvdyThzwhry880JBqvRVOhjzZods8SD08PLpObTn/0wk9BnAwvWiiz72EWgS9RpcjvV4VR73ZqJW9PoUFvfZYYb1h26S98levPHZbTjxH6qU9RPfoPHmJu70mSNo8ztJmvWgMBj0IX8i7TE3lPINY2DzoEma9wMObvTwKCT3pObe8t9KEvaWixjzc5fI8hj6MvaKv4Txl4h09d2a+PHCvTDxorJ69ekRrPeoPjz1JPfI7rUH7PIaJgz0O1YW9JLumvCxDnr1bmMm8GbIFPBX1oL3bRN08oYcXPEaFfL13Vxo9EKfbvTFcOTxdogA9XS3kPEWJoLvChc887BEgPMOvUT2Ba3s8tUDBvYPMZ72dNRG80AuTvQt7d72foTU9qO20O4INEb1u1iE9ibqJvZYaOj2nbYc8lsodvS5HPD1lCqK9EkBYPR0I/rySMIK9plcpPdpJEz2E/DY88d2DPIRTf71ZQZS9b1v5PPseFT2YiJu8OiOwPC8Wnr2QW4Q8n+o7PPQ8PD0QqAg9Vk7APDT6+jzreP88KH6GvTvAKD0AYiO9qOavvORySjvQ6y+9epb5PFvZijxYzlK9BwjUPK0HXL3acWc7dmwmPc/kXb2VBg68MGYRPR5q9zzmFiS9al2IvdVTfDwJOa88SzVkvVlrPD0WvJQ8Vm76PMUAQDzNgyK8QQZVPdMoibxrCBc9BgKTPDLoV70Iu6g7k+kBPZ3lhTy6sOU8OGkVvFaLRD14oqa9a4UVO4z4Gr1eYlO9u5BgPWS1ZL3kFPE8JGEwPQFTl71tHso8g+ElPd9Rgr2XCtc8axudvWC2IL09wSg9E7ZzPT6uBz2XmK09A1HcPJK8rTxK8Zu8GuMTPTuINTyRAhS9OSqDPDralLza3q48EgtePPf797rIWKo9NtkrvbO34zxKZ6m97l0GPQYVlL2igDA9UyfEPJhZyjx4/2Q8ggBpPYcAkzzIVu08ykYNPESdZr3uqmq8fS/zPKUYvzv67x49cUkqvXDlJj1us/88gASuvcs6G7sUshY9SgWiOqu4OD1WQ7k7/sLoPKuLJjwZYFm9an+zPOnfNry9Jh49/XX3vN1sc731fBM9TnBDPHzOAD26/dS9mg57vY+TA7wVJCw9pPb1PE30l7019la9UyRTPXFqljyRDnw9eZ6nvU03kTtS9907L+wavIBtab3k6cs8KVr6vPZ5zTxy+Zs8VuopPQTTUj0tNxg96qZyPY69lTzQEp48BXGJvVopBDvskUg9G2dOPaJMXDylJZU8FxcMvBQkNzzjPKs8FYUpvepYYj1AQsK9upQsvS4037xDcO48GhmIvWb1iT1gJhy9TG7iPHKAG70cuCQ8F1ZwPYqtj7300T89rTujPbXy2r3/cK69FtBNvY3iMT0DoqI4KK0QPYKEqr2Z6RU9ni0UPUNDLb3BsCi8+GttvZYp9zwUaHe9TqrFPOnlH7yCXJC9U8vDu8u2MjxA8xs9SAGxvPpphr29y2e9y2AYvTv+Eb1Elus9DdpGPSfmNL39Ggu85RVXPZbLh70Jvna7XkLGvR230DtGjpu7Ih8HPJKnIz1o35i8x5NVvXwFNDzs/ZM8+
kw8PfFJSTwdlJA9ZJ+tvaoVZ7zTvVi8p6wluwh/IT0Kmg088o1rPRhiwjxpWIe9a+LuvYuYtjwAxE09WkPJPBuFh73UotY820JjvXpnQD3fJ/w8TM3JPOz0pTnbTim9tpe6PBHzJT1HEb66SkAKPasLgr1l/Mm8IOGgvM2pZbzwd4a9znOIO4d4Bb1DW5I8EZXzOxvBKDqKpHG9UwCHvd/Epb2cDRi9V1ztPNPBNTrLXHa8FdGHPPo+hb3DnJ08G+SvvVPQBL6zzrC8Omksvc+eIjyvGfU8eG9nvaVkdL1HBvs8eaeGPfcbVD1/Pfw8+TUFvU6aTL2JN5W8HXDNvGKFEj1i+T09UiCIOySbDD2x2/y7VTmnvTe3gb0ZhJw8WrKIuU5RGT09mKU7eFGtPFpr6DzaoyI9hsItPKU+YzuQlXK8f9IePSmUxTwXdoo9W6FJPV2kLzwkU1o8fGnfPInxg70rEVe9H7sNPWJDbbxSqLY8cQAOPUdpAD2YknK9ykFXPeVALz1mq3W96kO/PLERzjyXIRC7jxsXPRnLzjyUEoU7gTKvu+stlb1D1g45IH+2u5sOIj0wXPA8yTqDvT6mV72NsFq8ExeuPJlGyDxvjgk9lJeJvWSF8DwFvaW7oZ9GvHq1Rr1FJsk83zxVvfyGqTz7thG9fslpPF5RPb1Q6BQ9iXGovTeDeb2cmic8oBsRPYeni72TPcI8EKcPvfCJUbyQJqW9fCAYPRk8qT2q6rk8mEw2PfDeXL0=';
|
|
109
110
|
|
|
110
111
|
let transformersModule = null;
|
|
111
112
|
let sttPipeline = null;
|
|
@@ -143,16 +144,36 @@ function whisperModelPath() {
|
|
|
143
144
|
return 'onnx-community/whisper-base';
|
|
144
145
|
}
|
|
145
146
|
|
|
147
|
+
/**
 * Decodes the bundled base64 speaker embedding into a Float32Array.
 *
 * The decoded bytes are copied into a standalone ArrayBuffer before being
 * reinterpreted as 32-bit floats, so the result never aliases the Buffer
 * produced by the base64 decode.
 *
 * @returns {Float32Array} The embedding stored in DEFAULT_EMBEDDING_B64.
 */
function defaultEmbedding() {
  const decoded = Buffer.from(DEFAULT_EMBEDDING_B64, 'base64');
  const firstByte = decoded.byteOffset;
  const standalone = decoded.buffer.slice(firstByte, firstByte + decoded.byteLength);
  return new Float32Array(standalone);
}
|
|
151
|
+
|
|
146
152
|
/**
 * Returns the default speaker embedding, loading it at most once per process.
 *
 * Lookup order:
 *   1. the in-memory `speakerEmbeddings` value,
 *   2. the cache file at SPEAKER_EMBEDDINGS_PATH (only if it is exactly one
 *      512-float vector, i.e. 2048 bytes),
 *   3. a download from SPEAKER_EMBEDDINGS_URL,
 *   4. the bundled embedding returned by defaultEmbedding().
 *
 * Whatever is obtained in steps 3 or 4 is written to the cache file.
 * Download problems are logged and never thrown; file-system errors from
 * creating DATA_DIR or writing the bundled fallback still propagate.
 *
 * @returns {Promise<Float32Array>} The speaker embedding.
 */
async function ensureSpeakerEmbeddings() {
  if (speakerEmbeddings) return speakerEmbeddings;

  // Size of the single vector that the cache check accepts: 512 float32 values.
  const EMBEDDING_BYTES = 512 * Float32Array.BYTES_PER_ELEMENT;
  // Copy into a fresh ArrayBuffer first: a Buffer can be a view at a non-zero
  // byteOffset of a larger ArrayBuffer, which must not be reinterpreted directly.
  const toFloat32 = (bytes) => new Float32Array(new Uint8Array(bytes).buffer);

  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });

  if (fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
    const cached = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
    if (cached.length === EMBEDDING_BYTES) {
      speakerEmbeddings = toFloat32(cached);
      return speakerEmbeddings;
    }
  }

  try {
    const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
    if (resp.ok) {
      const data = Buffer.from(await resp.arrayBuffer());
      if (data.length >= EMBEDDING_BYTES) {
        // Keep exactly one vector. Writing the raw payload would cache a file
        // the size check above rejects on every later start, and a length that
        // is not a multiple of 4 would make the Float32Array constructor throw
        // after the bad file had already been written.
        const vectorBytes = data.subarray(0, EMBEDDING_BYTES);
        fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, vectorBytes);
        speakerEmbeddings = toFloat32(vectorBytes);
        return speakerEmbeddings;
      }
      console.error('[TTS] Speaker embedding download too small: ' + data.length + ' bytes');
    } else {
      console.error('[TTS] Speaker embedding download failed: HTTP ' + resp.status);
    }
  } catch (err) {
    // Best effort: any download or cache-write problem falls through to the
    // bundled embedding, but the reason is no longer silently discarded.
    console.error('[TTS] Speaker embedding download failed:', err.message);
  }

  console.log('[TTS] Using bundled default speaker embedding');
  speakerEmbeddings = defaultEmbedding();
  fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, Buffer.from(speakerEmbeddings.buffer));
  return speakerEmbeddings;
}
|
|
158
179
|
|
|
@@ -171,45 +192,55 @@ async function loadVoiceEmbedding(voiceId) {
|
|
|
171
192
|
}
|
|
172
193
|
const offset = SPEAKER_OFFSETS[voiceId];
|
|
173
194
|
if (offset === undefined) return ensureSpeakerEmbeddings();
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
195
|
+
try {
|
|
196
|
+
const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
|
|
197
|
+
const resp = await fetch(url);
|
|
198
|
+
if (!resp.ok) throw new Error('HTTP ' + resp.status);
|
|
199
|
+
const data = await resp.json();
|
|
200
|
+
const avg = new Float32Array(512);
|
|
201
|
+
let count = 0;
|
|
202
|
+
for (const item of data.rows) {
|
|
203
|
+
const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
|
|
204
|
+
if (match && match[1] === voiceId) {
|
|
205
|
+
for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
|
|
206
|
+
count++;
|
|
207
|
+
}
|
|
185
208
|
}
|
|
209
|
+
if (count === 0) return ensureSpeakerEmbeddings();
|
|
210
|
+
for (let i = 0; i < 512; i++) avg[i] /= count;
|
|
211
|
+
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
212
|
+
fs.writeFileSync(binPath, Buffer.from(avg.buffer));
|
|
213
|
+
voiceEmbeddingsCache.set(voiceId, avg);
|
|
214
|
+
return avg;
|
|
215
|
+
} catch (err) {
|
|
216
|
+
console.error('[TTS] Failed to fetch voice embedding for ' + voiceId + ':', err.message);
|
|
217
|
+
return ensureSpeakerEmbeddings();
|
|
186
218
|
}
|
|
187
|
-
if (count === 0) return ensureSpeakerEmbeddings();
|
|
188
|
-
for (let i = 0; i < 512; i++) avg[i] /= count;
|
|
189
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
190
|
-
fs.writeFileSync(binPath, Buffer.from(avg.buffer));
|
|
191
|
-
voiceEmbeddingsCache.set(voiceId, avg);
|
|
192
|
-
return avg;
|
|
193
219
|
}
|
|
194
220
|
|
|
221
|
+
let speakerFeatureExtractor = null;
|
|
222
|
+
|
|
195
223
|
async function getSpeakerEmbeddingPipeline() {
|
|
196
224
|
if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
|
|
197
225
|
if (speakerEmbeddingLoading) {
|
|
198
226
|
while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
|
|
199
|
-
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding
|
|
227
|
+
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
|
|
200
228
|
return speakerEmbeddingPipeline;
|
|
201
229
|
}
|
|
202
230
|
speakerEmbeddingLoading = true;
|
|
203
231
|
try {
|
|
204
|
-
const {
|
|
232
|
+
const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
|
|
205
233
|
env.allowRemoteModels = true;
|
|
206
|
-
|
|
234
|
+
const modelId = 'Xenova/wavlm-base-plus-sv';
|
|
235
|
+
speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
|
|
207
236
|
device: 'cpu',
|
|
208
237
|
dtype: 'fp32',
|
|
209
238
|
});
|
|
239
|
+
speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
|
|
210
240
|
return speakerEmbeddingPipeline;
|
|
211
241
|
} catch (err) {
|
|
212
242
|
speakerEmbeddingPipeline = null;
|
|
243
|
+
speakerFeatureExtractor = null;
|
|
213
244
|
throw new Error('Speaker embedding model load failed: ' + err.message);
|
|
214
245
|
} finally {
|
|
215
246
|
speakerEmbeddingLoading = false;
|
|
@@ -234,6 +265,12 @@ async function decodeAudioFile(filePath) {
|
|
|
234
265
|
const decoded = decodeWavToFloat32(buf);
|
|
235
266
|
return resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
236
267
|
}
|
|
268
|
+
const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
|
|
269
|
+
if (fs.existsSync(wavPath)) {
|
|
270
|
+
const wavBuf = fs.readFileSync(wavPath);
|
|
271
|
+
const decoded = decodeWavToFloat32(wavBuf);
|
|
272
|
+
return resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
273
|
+
}
|
|
237
274
|
const decode = (await import('audio-decode')).default;
|
|
238
275
|
const audioBuffer = await decode(buf);
|
|
239
276
|
const mono = audioBuffer.getChannelData(0);
|
|
@@ -246,23 +283,30 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
|
246
283
|
console.error('[VOICES] Custom voice file not found for:', voiceId);
|
|
247
284
|
return ensureSpeakerEmbeddings();
|
|
248
285
|
}
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
286
|
+
try {
|
|
287
|
+
console.log('[VOICES] Generating embedding from:', audioFile);
|
|
288
|
+
const audio = await decodeAudioFile(audioFile);
|
|
289
|
+
if (audio.length < SAMPLE_RATE_STT * 0.5) {
|
|
290
|
+
throw new Error('Audio too short for embedding extraction');
|
|
291
|
+
}
|
|
292
|
+
const model = await getSpeakerEmbeddingPipeline();
|
|
293
|
+
const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
|
|
294
|
+
const output = await model(inputs);
|
|
295
|
+
const embData = output.embeddings.data;
|
|
296
|
+
const embedding = new Float32Array(512);
|
|
297
|
+
for (let i = 0; i < Math.min(512, embData.length); i++) {
|
|
298
|
+
embedding[i] = embData[i];
|
|
299
|
+
}
|
|
300
|
+
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
301
|
+
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
302
|
+
fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
|
|
303
|
+
voiceEmbeddingsCache.set(voiceId, embedding);
|
|
304
|
+
console.log('[VOICES] Generated embedding for custom voice:', voiceId);
|
|
305
|
+
return embedding;
|
|
306
|
+
} catch (err) {
|
|
307
|
+
console.error('[VOICES] Failed to generate embedding for', voiceId + ':', err.message);
|
|
308
|
+
return ensureSpeakerEmbeddings();
|
|
259
309
|
}
|
|
260
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
261
|
-
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
262
|
-
fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
|
|
263
|
-
voiceEmbeddingsCache.set(voiceId, embedding);
|
|
264
|
-
console.log('[VOICES] Generated embedding for custom voice:', voiceId);
|
|
265
|
-
return embedding;
|
|
266
310
|
}
|
|
267
311
|
|
|
268
312
|
async function getSTT() {
|
|
@@ -509,13 +553,14 @@ async function* synthesizeStream(text, voiceId) {
|
|
|
509
553
|
}
|
|
510
554
|
|
|
511
555
|
/**
 * Builds a snapshot of the speech subsystem's load state.
 *
 * A recorded TTS load error is reported only while fewer than
 * TTS_ERROR_RETRY_MS milliseconds have passed since ttsLoadErrorTime;
 * after that `ttsError` is null even though the error is still stored.
 *
 * @returns {{sttReady: boolean, ttsReady: boolean, sttLoading: *, ttsLoading: *,
 *            sttError: (string|null), ttsError: (string|null)}}
 */
function getStatus() {
  let sttErrorMessage = null;
  if (sttLoadError) {
    sttErrorMessage = sttLoadError.message;
  }

  let ttsErrorMessage = null;
  if (ttsLoadError) {
    const elapsedSinceError = Date.now() - ttsLoadErrorTime;
    // Hide the error once the retry window has elapsed.
    if (!(elapsedSinceError >= TTS_ERROR_RETRY_MS)) {
      ttsErrorMessage = ttsLoadError.message;
    }
  }

  return {
    sttReady: Boolean(sttPipeline),
    ttsReady: Boolean(ttsPipeline),
    sttLoading,
    ttsLoading,
    sttError: sttErrorMessage,
    ttsError: ttsErrorMessage,
  };
}
|
|
521
566
|
|
package/package.json
CHANGED
package/static/js/voice.js
CHANGED
|
@@ -320,6 +320,7 @@
|
|
|
320
320
|
var ttsConsecutiveFailures = 0;
|
|
321
321
|
var TTS_MAX_FAILURES = 3;
|
|
322
322
|
var ttsDisabledUntilReset = false;
|
|
323
|
+
var streamingSupported = true;
|
|
323
324
|
|
|
324
325
|
function playNextChunk() {
|
|
325
326
|
if (audioChunkQueue.length === 0) {
|
|
@@ -391,12 +392,16 @@
|
|
|
391
392
|
}
|
|
392
393
|
|
|
393
394
|
function tryStreaming() {
|
|
395
|
+
if (!streamingSupported) { tryNonStreaming(text); return; }
|
|
394
396
|
fetch(BASE + '/api/tts-stream', {
|
|
395
397
|
method: 'POST',
|
|
396
398
|
headers: { 'Content-Type': 'application/json' },
|
|
397
399
|
body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
|
|
398
400
|
}).then(function(resp) {
|
|
399
|
-
if (!resp.ok)
|
|
401
|
+
if (!resp.ok) {
|
|
402
|
+
streamingSupported = false;
|
|
403
|
+
throw new Error('TTS stream failed: ' + resp.status);
|
|
404
|
+
}
|
|
400
405
|
var reader = resp.body.getReader();
|
|
401
406
|
var buffer = new Uint8Array(0);
|
|
402
407
|
|