agentgui 1.0.169 → 1.0.171

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -1,5 +1,6 @@
1
1
  import { createRequire } from 'module';
2
2
  import fs from 'fs';
3
+ import os from 'os';
3
4
  import path from 'path';
4
5
  import { fileURLToPath } from 'url';
5
6
 
@@ -7,16 +8,22 @@ const require = createRequire(import.meta.url);
7
8
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
8
9
  const ROOT = path.dirname(__dirname);
9
10
  const DATA_DIR = path.join(ROOT, 'data');
11
// File extensions recognized as custom voice samples when scanning voice dirs.
const AUDIO_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.flac', '.m4a'];
10
12
 
11
- const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
12
- const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
13
- const SAMPLE_RATE_TTS = 16000;
14
- const SAMPLE_RATE_STT = 16000;
15
- const MIN_WAV_SIZE = 44;
16
- const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Matthijs%2Fcmu-arctic-xvectors&config=default&split=validation';
17
- const SAMPLES_TO_AVERAGE = 10;
13
// Ordered, de-duplicated list of directories searched for custom voice files:
// the startup working directory, the package root, then the user's home dir.
function getVoiceDirs() {
  const startupCwd = process.env.STARTUP_CWD || process.cwd();
  const candidates = [
    path.join(startupCwd, 'voices'),
    path.join(ROOT, 'voices'),
    path.join(os.homedir(), 'voices'),
  ].map((dir) => path.resolve(dir));
  // Set preserves insertion order, so search priority is kept.
  return [...new Set(candidates)];
}
18
23
 
19
- const VOICE_CATALOG = [
24
// Minimum plausible WAV byte size (raised from the bare 44-byte RIFF header).
const MIN_WAV_SIZE = 1000;
25
+
26
+ const BASE_VOICES = [
20
27
  { id: 'default', name: 'Default', gender: 'male', accent: 'US' },
21
28
  { id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
22
29
  { id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
@@ -27,15 +34,88 @@ const VOICE_CATALOG = [
27
34
  { id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
28
35
  ];
29
36
 
37
// Convert an audio file (mp3/ogg/flac/m4a) to a sibling .wav file.
// Returns the .wav path, or null if decoding/encoding failed.
async function convertToWav(filePath) {
  // Target: same basename with a .wav extension.
  const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
  // A previously converted copy short-circuits the work.
  if (fs.existsSync(wavPath)) return wavPath;
  try {
    console.log('[VOICES] Converting to WAV:', filePath);
    const samples = await decodeAudioFile(filePath);
    fs.writeFileSync(wavPath, encodeWav(samples, SAMPLE_RATE_STT));
    console.log('[VOICES] Converted:', path.basename(wavPath));
    return wavPath;
  } catch (error) {
    // Best-effort: a bad source file must not break the scan.
    console.error('[VOICES] Conversion failed for', filePath + ':', error.message);
    return null;
  }
}
52
+
53
// In-flight audio→WAV conversions keyed by absolute source path, so repeated
// scans do not start duplicate conversions for the same file.
const pendingConversions = new Map();
54
+
55
// Scan one directory for custom voice files. WAV files become voice entries;
// other audio formats are queued for background conversion to WAV and will
// appear as voices on a later scan. Never throws; returns [] on any error.
function scanVoiceDir(dir) {
  const found = [];
  try {
    if (!fs.existsSync(dir)) return found;
    for (const entry of fs.readdirSync(dir)) {
      const ext = path.extname(entry).toLowerCase();
      if (!AUDIO_EXTENSIONS.includes(ext)) continue;
      const baseName = path.basename(entry, ext);
      if (ext === '.wav') {
        found.push({
          // Ids are sanitized to a safe token; display name keeps words.
          id: 'custom_' + baseName.replace(/[^a-zA-Z0-9_-]/g, '_'),
          name: baseName.replace(/_/g, ' '),
          gender: 'custom',
          accent: 'custom',
          isCustom: true,
          sourceDir: dir,
        });
        continue;
      }
      // Non-WAV source: skip if a WAV sibling already exists, otherwise
      // kick off (at most one) background conversion for this file.
      if (fs.existsSync(path.join(dir, baseName + '.wav'))) continue;
      const fullPath = path.join(dir, entry);
      if (pendingConversions.has(fullPath)) continue;
      const job = convertToWav(fullPath).then(result => {
        pendingConversions.delete(fullPath);
        return result;
      });
      pendingConversions.set(fullPath, job);
    }
  } catch (err) {
    console.error('[VOICES] Error scanning', dir + ':', err.message);
  }
  return found;
}
84
+
85
// Collect custom voices from every voice directory, de-duplicating by id.
// Earlier directories win (getVoiceDirs returns them in priority order).
function loadCustomVoices() {
  const byId = new Map();
  for (const dir of getVoiceDirs()) {
    for (const voice of scanVoiceDir(dir)) {
      if (!byId.has(voice.id)) byId.set(voice.id, voice);
    }
  }
  return [...byId.values()];
}
97
+
98
// Full voice catalog: the built-in speakers followed by any custom voices
// discovered on disk. Returns a fresh array on every call.
function getVoices() {
  return BASE_VOICES.concat(loadCustomVoices());
}
101
+
30
102
// Row offsets into the embeddings dataset for each built-in speaker id
// (used as `offset` in the DATASET_API query below).
const SPEAKER_OFFSETS = { awb: 0, bdl: 1200, clb: 2300, jmk: 3500, ksp: 4700, rms: 5900, slt: 7100 };
// Remote binary of default speaker embeddings, cached locally at
// SPEAKER_EMBEDDINGS_PATH under the package data directory.
const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_embeddings/resolve/main/spkrec-xvectors-voxceleb.hf';
const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
// HF datasets-server "rows" endpoint for fetching embedding rows per voice.
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
// Number of dataset rows averaged when building one voice embedding.
const SAMPLES_TO_AVERAGE = 30;
31
107
 
32
108
// --- Lazily-initialized speech pipeline state (populated on first use) ---
let transformersModule = null;
let sttPipeline = null;
let ttsPipeline = null;
let speakerEmbeddings = null;          // default embedding fallback
let speakerEmbeddingPipeline = null;   // x-vector extractor for custom voices
let sttLoading = false;                // true while the STT pipeline is loading
let ttsLoading = false;                // true while the TTS pipeline is loading
let speakerEmbeddingLoading = false;   // true while the extractor is loading
// Per-voice speaker embeddings, keyed by voice id.
const voiceEmbeddingsCache = new Map();
// Both pipelines operate on 16 kHz mono audio.
const SAMPLE_RATE_STT = 16000;
const SAMPLE_RATE_TTS = 16000;

// Bounded cache of synthesized audio.
const TTS_CACHE_MAX = 100;
const ttsCache = new Map();
@@ -78,6 +158,9 @@ async function loadVoiceEmbedding(voiceId) {
78
158
  voiceEmbeddingsCache.set(voiceId, emb);
79
159
  return emb;
80
160
  }
161
+ if (voiceId.startsWith('custom_')) {
162
+ return generateEmbeddingFromCustomVoice(voiceId);
163
+ }
81
164
  const offset = SPEAKER_OFFSETS[voiceId];
82
165
  if (offset === undefined) return ensureSpeakerEmbeddings();
83
166
  const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
@@ -101,8 +184,77 @@ async function loadVoiceEmbedding(voiceId) {
101
184
  return avg;
102
185
  }
103
186
 
104
- function getVoices() {
105
- return VOICE_CATALOG;
187
// Lazily construct (once) the feature-extraction pipeline used to derive
// speaker embeddings from custom voice samples. Concurrent callers poll the
// module-level `speakerEmbeddingLoading` flag instead of racing a second
// model download.
// Throws if the model fails to load.
async function getSpeakerEmbeddingPipeline() {
  if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
  if (speakerEmbeddingLoading) {
    // Another caller is loading: wait for the flag to clear (it is reset in
    // the loader's `finally`), then surface that load's success or failure.
    while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
    if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding pipeline failed to load');
    return speakerEmbeddingPipeline;
  }
  speakerEmbeddingLoading = true;
  try {
    const { pipeline, env } = await loadTransformers();
    // Model weights are fetched remotely on first use.
    env.allowRemoteModels = true;
    speakerEmbeddingPipeline = await pipeline('feature-extraction', 'speechbrain/spkrec-xvectors-voxceleb', {
      device: 'cpu',
      dtype: 'fp32',
    });
    return speakerEmbeddingPipeline;
  } catch (err) {
    // Leave the cache empty so waiting pollers throw rather than hang.
    speakerEmbeddingPipeline = null;
    throw new Error('Speaker embedding model load failed: ' + err.message);
  } finally {
    // Always clear the flag, even on failure, to release pollers.
    speakerEmbeddingLoading = false;
  }
}
210
+
211
// Locate the source audio file for a custom voice id.
// BUG FIX: scanVoiceDir builds ids from a SANITIZED base name
// (non [a-zA-Z0-9_-] chars replaced with '_'), so simply stripping the
// 'custom_' prefix cannot find files like "My Voice.wav" (id
// "custom_My_Voice"). After the fast exact-name probe, re-scan each
// directory and compare sanitized base names.
// Returns the full path, or null when no matching file exists.
function findCustomVoiceFile(voiceId) {
  const target = voiceId.replace(/^custom_/, '');
  for (const dir of getVoiceDirs()) {
    // Fast path: the filename matches the id verbatim.
    for (const ext of AUDIO_EXTENSIONS) {
      const candidate = path.join(dir, target + ext);
      if (fs.existsSync(candidate)) return candidate;
    }
    // Slow path: match by sanitized base name (mirrors scanVoiceDir's id rule).
    try {
      for (const file of fs.readdirSync(dir)) {
        const ext = path.extname(file).toLowerCase();
        if (!AUDIO_EXTENSIONS.includes(ext)) continue;
        const base = path.basename(file, ext);
        if (base.replace(/[^a-zA-Z0-9_-]/g, '_') === target) {
          return path.join(dir, file);
        }
      }
    } catch {
      // Missing/unreadable directory: fall through to the next search root.
    }
  }
  return null;
}
221
+
222
// Decode an audio file to 16 kHz mono Float32 samples.
// WAV files are decoded in-process; other formats go through audio-decode.
async function decodeAudioFile(filePath) {
  const fileBytes = fs.readFileSync(filePath);
  if (path.extname(filePath).toLowerCase() === '.wav') {
    const { audio, sampleRate } = decodeWavToFloat32(fileBytes);
    return resampleTo16k(audio, sampleRate);
  }
  // Lazy import: the decoder is only needed for non-WAV sources.
  const decode = (await import('audio-decode')).default;
  const decoded = await decode(fileBytes);
  // First channel only.
  return resampleTo16k(decoded.getChannelData(0), decoded.sampleRate);
}
234
+
235
// Build a 512-dim speaker embedding from a custom voice's audio file,
// persist it to the data dir, cache it, and return it. Falls back to the
// default embeddings when the source file cannot be found; throws when the
// audio is shorter than half a second.
async function generateEmbeddingFromCustomVoice(voiceId) {
  const audioFile = findCustomVoiceFile(voiceId);
  if (!audioFile) {
    console.error('[VOICES] Custom voice file not found for:', voiceId);
    return ensureSpeakerEmbeddings();
  }
  console.log('[VOICES] Generating embedding from:', audioFile);
  const audio = await decodeAudioFile(audioFile);
  if (audio.length < SAMPLE_RATE_STT * 0.5) {
    throw new Error('Audio too short for embedding extraction (need at least 0.5 seconds)');
  }
  const pipe = await getSpeakerEmbeddingPipeline();
  const output = await pipe(audio, { pooling: 'mean', normalize: true });
  // Fixed 512-dim buffer: shorter extractor outputs are zero-padded,
  // longer ones truncated.
  const embedding = new Float32Array(512);
  const count = Math.min(512, output.data.length);
  for (let i = 0; i < count; i++) embedding[i] = output.data[i];
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
  const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
  fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
  voiceEmbeddingsCache.set(voiceId, embedding);
  console.log('[VOICES] Generated embedding for custom voice:', voiceId);
  return embedding;
}
107
259
 
108
260
  async function getSTT() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.169",
3
+ "version": "1.0.171",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
@@ -23,6 +23,7 @@
23
23
  "dependencies": {
24
24
  "@anthropic-ai/claude-code": "^2.1.37",
25
25
  "@huggingface/transformers": "^3.8.1",
26
+ "audio-decode": "^2.2.3",
26
27
  "better-sqlite3": "^12.6.2",
27
28
  "busboy": "^1.6.0",
28
29
  "express": "^5.2.1",
@@ -34,19 +34,33 @@
34
34
  .then(function(data) {
35
35
  if (!data.ok || !Array.isArray(data.voices)) return;
36
36
  selector.innerHTML = '';
37
- data.voices.forEach(function(voice) {
38
- var opt = document.createElement('option');
39
- opt.value = voice.id;
40
- var label = voice.name;
41
- if (voice.gender || voice.accent) {
37
+ var builtIn = data.voices.filter(function(v) { return !v.isCustom; });
38
+ var custom = data.voices.filter(function(v) { return v.isCustom; });
39
+ if (builtIn.length) {
40
+ var grp1 = document.createElement('optgroup');
41
+ grp1.label = 'Built-in Voices';
42
+ builtIn.forEach(function(voice) {
43
+ var opt = document.createElement('option');
44
+ opt.value = voice.id;
42
45
  var parts = [];
43
46
  if (voice.gender) parts.push(voice.gender);
44
47
  if (voice.accent) parts.push(voice.accent);
45
- label += ' (' + parts.join(', ') + ')';
46
- }
47
- opt.textContent = label;
48
- selector.appendChild(opt);
49
- });
48
+ opt.textContent = voice.name + (parts.length ? ' (' + parts.join(', ') + ')' : '');
49
+ grp1.appendChild(opt);
50
+ });
51
+ selector.appendChild(grp1);
52
+ }
53
+ if (custom.length) {
54
+ var grp2 = document.createElement('optgroup');
55
+ grp2.label = 'Custom Voices';
56
+ custom.forEach(function(voice) {
57
+ var opt = document.createElement('option');
58
+ opt.value = voice.id;
59
+ opt.textContent = voice.name;
60
+ grp2.appendChild(opt);
61
+ });
62
+ selector.appendChild(grp2);
63
+ }
50
64
  if (saved && selector.querySelector('option[value="' + saved + '"]')) {
51
65
  selector.value = saved;
52
66
  }
@@ -322,53 +336,78 @@
322
336
  var text = speechQueue.shift();
323
337
  audioChunkQueue = [];
324
338
  isPlayingChunk = false;
325
- fetch(BASE + '/api/tts-stream', {
326
- method: 'POST',
327
- headers: { 'Content-Type': 'application/json' },
328
- body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
329
- }).then(function(resp) {
330
- if (!resp.ok) throw new Error('TTS failed');
331
- var reader = resp.body.getReader();
332
- var buffer = new Uint8Array(0);
333
-
334
- function concat(a, b) {
335
- var c = new Uint8Array(a.length + b.length);
336
- c.set(a, 0);
337
- c.set(b, a.length);
338
- return c;
339
- }
339
+
340
// Stream TTS audio for `text`. The response is a sequence of frames:
// [u32 big-endian length][WAV bytes]. Complete frames are queued for
// playback as they arrive; on any failure, fall back to tryNonStreaming.
function tryStreaming() {
  fetch(BASE + '/api/tts-stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
  }).then(function(resp) {
    if (!resp.ok) throw new Error('TTS stream failed');
    var reader = resp.body.getReader();
    var pending = new Uint8Array(0);

    // Append b to a (reassembly buffer for partial frames).
    function appendBytes(a, b) {
      var joined = new Uint8Array(a.length + b.length);
      joined.set(a, 0);
      joined.set(b, a.length);
      return joined;
    }

    // Extract every complete frame currently in `pending` and queue it.
    function drainFrames() {
      while (pending.length >= 4) {
        var header = new DataView(pending.buffer, pending.byteOffset, 4);
        var frameLen = header.getUint32(0, false);
        if (pending.length < 4 + frameLen) break;
        var wavBytes = pending.slice(4, 4 + frameLen);
        pending = pending.slice(4 + frameLen);
        audioChunkQueue.push(new Blob([wavBytes], { type: 'audio/wav' }));
        if (!isPlayingChunk) playNextChunk();
      }
    }

    function pump() {
      return reader.read().then(function(result) {
        if (result.done) {
          streamDone = true;
          // Nothing queued and nothing playing: advance the speech queue.
          if (!isPlayingChunk && audioChunkQueue.length === 0) {
            isSpeaking = false;
            processQueue();
          }
          return;
        }
        pending = appendBytes(pending, result.value);
        drainFrames();
        return pump();
      });
    }

    return pump();
  }).catch(function() {
    tryNonStreaming(text);
  });
}
387
+
388
// Non-streaming TTS fallback: fetch the whole WAV in one request, queue it
// for playback, then advance the speech queue (also advanced on failure).
function tryNonStreaming(txt) {
  function finish() {
    streamDone = true;
    isSpeaking = false;
    processQueue();
  }
  fetch(BASE + '/api/tts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
  }).then(function(resp) {
    if (!resp.ok) throw new Error('TTS failed');
    return resp.arrayBuffer();
  }).then(function(buf) {
    var blob = new Blob([buf], { type: 'audio/wav' });
    audioChunkQueue.push(blob);
    if (!isPlayingChunk) playNextChunk();
    finish();
  }).catch(finish);
}
+
410
+ tryStreaming();
372
411
  }
373
412
 
374
413
  function stopSpeaking() {