agentgui 1.0.176 → 1.0.178

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/speech.js CHANGED
@@ -8,7 +8,7 @@ const require = createRequire(import.meta.url);
8
8
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
9
9
  const ROOT = path.dirname(__dirname);
10
10
  const DATA_DIR = path.join(ROOT, 'data');
11
- const AUDIO_EXTENSIONS = ['.mp3', '.wav', '.ogg', '.flac', '.m4a'];
11
+ const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
12
12
 
13
13
  function getVoiceDirs() {
14
14
  const dirs = [];
@@ -106,6 +106,7 @@ const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_e
106
106
  const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
107
107
  const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
108
108
  const SAMPLES_TO_AVERAGE = 30;
109
+ const DEFAULT_EMBEDDING_B64 = 'xhibvao34LylqXQ8cNg7Pd1cCTw0keG8awRRvRqje7070G48AtOgPMFbnr1oeKC9I4ZuPZzqGT1DjWs8y3iMPB/SZLzdl7E6b9QaPKSpHTwYuh49FrMlO9YnebwmTzu9/3CPvQuvCbxsSWC9Sb2bO+tvXj0Cjpo8mTMxu/FDrjzQ4x09gyxCvUn6STxjAo+9vtXdPJtsYT3iMna9dQ+EvfQ72zuvxk69GAonPU8KdjsNPAU96e/8veN7lrwgyzk8HA5vvYE1Rz3gpZ484MsLPUKkxTzM54U81ECwvcbFHzv8gT08T6/7POCqBT2fv5E8fvsXPfZiJrzEhme8dg8kPR+mKTutQOU822maPMlMDb1x/IS93+6KvdyThzwhry880JBqvRVOhjzZods8SD08PLpObTn/0wk9BnAwvWiiz72EWgS9RpcjvV4VR73ZqJW9PoUFvfZYYb1h26S98levPHZbTjxH6qU9RPfoPHmJu70mSNo8ztJmvWgMBj0IX8i7TE3lPINY2DzoEma9wMObvTwKCT3pObe8t9KEvaWixjzc5fI8hj6MvaKv4Txl4h09d2a+PHCvTDxorJ69ekRrPeoPjz1JPfI7rUH7PIaJgz0O1YW9JLumvCxDnr1bmMm8GbIFPBX1oL3bRN08oYcXPEaFfL13Vxo9EKfbvTFcOTxdogA9XS3kPEWJoLvChc887BEgPMOvUT2Ba3s8tUDBvYPMZ72dNRG80AuTvQt7d72foTU9qO20O4INEb1u1iE9ibqJvZYaOj2nbYc8lsodvS5HPD1lCqK9EkBYPR0I/rySMIK9plcpPdpJEz2E/DY88d2DPIRTf71ZQZS9b1v5PPseFT2YiJu8OiOwPC8Wnr2QW4Q8n+o7PPQ8PD0QqAg9Vk7APDT6+jzreP88KH6GvTvAKD0AYiO9qOavvORySjvQ6y+9epb5PFvZijxYzlK9BwjUPK0HXL3acWc7dmwmPc/kXb2VBg68MGYRPR5q9zzmFiS9al2IvdVTfDwJOa88SzVkvVlrPD0WvJQ8Vm76PMUAQDzNgyK8QQZVPdMoibxrCBc9BgKTPDLoV70Iu6g7k+kBPZ3lhTy6sOU8OGkVvFaLRD14oqa9a4UVO4z4Gr1eYlO9u5BgPWS1ZL3kFPE8JGEwPQFTl71tHso8g+ElPd9Rgr2XCtc8axudvWC2IL09wSg9E7ZzPT6uBz2XmK09A1HcPJK8rTxK8Zu8GuMTPTuINTyRAhS9OSqDPDralLza3q48EgtePPf797rIWKo9NtkrvbO34zxKZ6m97l0GPQYVlL2igDA9UyfEPJhZyjx4/2Q8ggBpPYcAkzzIVu08ykYNPESdZr3uqmq8fS/zPKUYvzv67x49cUkqvXDlJj1us/88gASuvcs6G7sUshY9SgWiOqu4OD1WQ7k7/sLoPKuLJjwZYFm9an+zPOnfNry9Jh49/XX3vN1sc731fBM9TnBDPHzOAD26/dS9mg57vY+TA7wVJCw9pPb1PE30l7019la9UyRTPXFqljyRDnw9eZ6nvU03kTtS9907L+wavIBtab3k6cs8KVr6vPZ5zTxy+Zs8VuopPQTTUj0tNxg96qZyPY69lTzQEp48BXGJvVopBDvskUg9G2dOPaJMXDylJZU8FxcMvBQkNzzjPKs8FYUpvepYYj1AQsK9upQsvS4037xDcO48GhmIvWb1iT1gJhy9TG7iPHKAG70cuCQ8F1ZwPYqtj7300T89rTujPbXy2r3/cK69FtBNvY3iMT0DoqI4KK0QPYKEqr2Z6RU9ni0UPUNDLb3BsCi8+GttvZYp9zwUaHe9TqrFPOnlH7yCXJC9U8vDu8u2MjxA8xs9SAGxvPpphr29y2e9y2AYvTv+Eb1Elus9DdpGPSfmNL39Ggu85RVXPZbLh70Jvna7XkLGvR230DtGjpu7Ih8HPJKnIz1o35i8x5NVvXwFNDzs/ZM8+kw8PfFJSTwdlJA9ZJ+tvaoVZ7zTvVi8p6wluwh/IT0Kmg088o1rPRhiwjxpWIe9a+LuvYuYtjwAxE09WkPJPBuFh73UotY820JjvXpnQD3fJ/w8TM3JPOz0pTnbTim9tpe6PBHzJT1HEb66SkAKPasLgr1l/Mm8IOGgvM2pZbzwd4a9znOIO4d4Bb1DW5I8EZXzOxvBKDqKpHG9UwCHvd/Epb2cDRi9V1ztPNPBNTrLXHa8FdGHPPo+hb3DnJ08G+SvvVPQBL6zzrC8Omksvc+eIjyvGfU8eG9nvaVkdL1HBvs8eaeGPfcbVD1/Pfw8+TUFvU6aTL2JN5W8HXDNvGKFEj1i+T09UiCIOySbDD2x2/y7VTmnvTe3gb0ZhJw8WrKIuU5RGT09mKU7eFGtPFpr6DzaoyI9hsItPKU+YzuQlXK8f9IePSmUxTwXdoo9W6FJPV2kLzwkU1o8fGnfPInxg70rEVe9H7sNPWJDbbxSqLY8cQAOPUdpAD2YknK9ykFXPeVALz1mq3W96kO/PLERzjyXIRC7jxsXPRnLzjyUEoU7gTKvu+stlb1D1g45IH+2u5sOIj0wXPA8yTqDvT6mV72NsFq8ExeuPJlGyDxvjgk9lJeJvWSF8DwFvaW7oZ9GvHq1Rr1FJsk83zxVvfyGqTz7thG9fslpPF5RPb1Q6BQ9iXGovTeDeb2cmic8oBsRPYeni72TPcI8EKcPvfCJUbyQJqW9fCAYPRk8qT2q6rk8mEw2PfDeXL0=';
109
110
 
110
111
  let transformersModule = null;
111
112
  let sttPipeline = null;
@@ -143,16 +144,36 @@ function whisperModelPath() {
143
144
  return 'onnx-community/whisper-base';
144
145
  }
145
146
 
147
+ function defaultEmbedding() {
148
+ const buf = Buffer.from(DEFAULT_EMBEDDING_B64, 'base64');
149
+ return new Float32Array(new Uint8Array(buf).buffer);
150
+ }
151
+
146
152
  async function ensureSpeakerEmbeddings() {
147
153
  if (speakerEmbeddings) return speakerEmbeddings;
148
154
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
149
- if (!fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
150
- const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
151
- if (!resp.ok) throw new Error('Failed to download speaker embeddings');
152
- fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, Buffer.from(await resp.arrayBuffer()));
155
+ if (fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
156
+ const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
157
+ if (buf.length === 2048) {
158
+ speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
159
+ return speakerEmbeddings;
160
+ }
153
161
  }
154
- const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
155
- speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
162
+ try {
163
+ const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
164
+ if (resp.ok) {
165
+ const data = Buffer.from(await resp.arrayBuffer());
166
+ if (data.length >= 2048) {
167
+ fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, data);
168
+ speakerEmbeddings = new Float32Array(new Uint8Array(data).buffer);
169
+ return speakerEmbeddings;
170
+ }
171
+ }
172
+ } catch (_) {}
173
+ console.log('[TTS] Using bundled default speaker embedding');
174
+ speakerEmbeddings = defaultEmbedding();
175
+ const buf = Buffer.from(speakerEmbeddings.buffer);
176
+ fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, buf);
156
177
  return speakerEmbeddings;
157
178
  }
158
179
 
@@ -171,45 +192,55 @@ async function loadVoiceEmbedding(voiceId) {
171
192
  }
172
193
  const offset = SPEAKER_OFFSETS[voiceId];
173
194
  if (offset === undefined) return ensureSpeakerEmbeddings();
174
- const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
175
- const resp = await fetch(url);
176
- if (!resp.ok) throw new Error('Failed to fetch voice embeddings for ' + voiceId);
177
- const data = await resp.json();
178
- const avg = new Float32Array(512);
179
- let count = 0;
180
- for (const item of data.rows) {
181
- const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
182
- if (match && match[1] === voiceId) {
183
- for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
184
- count++;
195
+ try {
196
+ const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
197
+ const resp = await fetch(url);
198
+ if (!resp.ok) throw new Error('HTTP ' + resp.status);
199
+ const data = await resp.json();
200
+ const avg = new Float32Array(512);
201
+ let count = 0;
202
+ for (const item of data.rows) {
203
+ const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
204
+ if (match && match[1] === voiceId) {
205
+ for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
206
+ count++;
207
+ }
185
208
  }
209
+ if (count === 0) return ensureSpeakerEmbeddings();
210
+ for (let i = 0; i < 512; i++) avg[i] /= count;
211
+ if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
212
+ fs.writeFileSync(binPath, Buffer.from(avg.buffer));
213
+ voiceEmbeddingsCache.set(voiceId, avg);
214
+ return avg;
215
+ } catch (err) {
216
+ console.error('[TTS] Failed to fetch voice embedding for ' + voiceId + ':', err.message);
217
+ return ensureSpeakerEmbeddings();
186
218
  }
187
- if (count === 0) return ensureSpeakerEmbeddings();
188
- for (let i = 0; i < 512; i++) avg[i] /= count;
189
- if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
190
- fs.writeFileSync(binPath, Buffer.from(avg.buffer));
191
- voiceEmbeddingsCache.set(voiceId, avg);
192
- return avg;
193
219
  }
194
220
 
221
+ let speakerFeatureExtractor = null;
222
+
195
223
  async function getSpeakerEmbeddingPipeline() {
196
224
  if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
197
225
  if (speakerEmbeddingLoading) {
198
226
  while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
199
- if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding pipeline failed to load');
227
+ if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
200
228
  return speakerEmbeddingPipeline;
201
229
  }
202
230
  speakerEmbeddingLoading = true;
203
231
  try {
204
- const { pipeline, env } = await loadTransformers();
232
+ const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
205
233
  env.allowRemoteModels = true;
206
- speakerEmbeddingPipeline = await pipeline('feature-extraction', 'speechbrain/spkrec-xvectors-voxceleb', {
234
+ const modelId = 'Xenova/wavlm-base-plus-sv';
235
+ speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
207
236
  device: 'cpu',
208
237
  dtype: 'fp32',
209
238
  });
239
+ speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
210
240
  return speakerEmbeddingPipeline;
211
241
  } catch (err) {
212
242
  speakerEmbeddingPipeline = null;
243
+ speakerFeatureExtractor = null;
213
244
  throw new Error('Speaker embedding model load failed: ' + err.message);
214
245
  } finally {
215
246
  speakerEmbeddingLoading = false;
@@ -234,6 +265,12 @@ async function decodeAudioFile(filePath) {
234
265
  const decoded = decodeWavToFloat32(buf);
235
266
  return resampleTo16k(decoded.audio, decoded.sampleRate);
236
267
  }
268
+ const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
269
+ if (fs.existsSync(wavPath)) {
270
+ const wavBuf = fs.readFileSync(wavPath);
271
+ const decoded = decodeWavToFloat32(wavBuf);
272
+ return resampleTo16k(decoded.audio, decoded.sampleRate);
273
+ }
237
274
  const decode = (await import('audio-decode')).default;
238
275
  const audioBuffer = await decode(buf);
239
276
  const mono = audioBuffer.getChannelData(0);
@@ -246,23 +283,30 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
246
283
  console.error('[VOICES] Custom voice file not found for:', voiceId);
247
284
  return ensureSpeakerEmbeddings();
248
285
  }
249
- console.log('[VOICES] Generating embedding from:', audioFile);
250
- const audio = await decodeAudioFile(audioFile);
251
- if (audio.length < SAMPLE_RATE_STT * 0.5) {
252
- throw new Error('Audio too short for embedding extraction (need at least 0.5 seconds)');
253
- }
254
- const pipe = await getSpeakerEmbeddingPipeline();
255
- const output = await pipe(audio, { pooling: 'mean', normalize: true });
256
- const embedding = new Float32Array(512);
257
- for (let i = 0; i < Math.min(512, output.data.length); i++) {
258
- embedding[i] = output.data[i];
286
+ try {
287
+ console.log('[VOICES] Generating embedding from:', audioFile);
288
+ const audio = await decodeAudioFile(audioFile);
289
+ if (audio.length < SAMPLE_RATE_STT * 0.5) {
290
+ throw new Error('Audio too short for embedding extraction');
291
+ }
292
+ const model = await getSpeakerEmbeddingPipeline();
293
+ const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
294
+ const output = await model(inputs);
295
+ const embData = output.embeddings.data;
296
+ const embedding = new Float32Array(512);
297
+ for (let i = 0; i < Math.min(512, embData.length); i++) {
298
+ embedding[i] = embData[i];
299
+ }
300
+ if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
301
+ const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
302
+ fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
303
+ voiceEmbeddingsCache.set(voiceId, embedding);
304
+ console.log('[VOICES] Generated embedding for custom voice:', voiceId);
305
+ return embedding;
306
+ } catch (err) {
307
+ console.error('[VOICES] Failed to generate embedding for', voiceId + ':', err.message);
308
+ return ensureSpeakerEmbeddings();
259
309
  }
260
- if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
261
- const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
262
- fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
263
- voiceEmbeddingsCache.set(voiceId, embedding);
264
- console.log('[VOICES] Generated embedding for custom voice:', voiceId);
265
- return embedding;
266
310
  }
267
311
 
268
312
  async function getSTT() {
@@ -509,13 +553,14 @@ async function* synthesizeStream(text, voiceId) {
509
553
  }
510
554
 
511
555
  function getStatus() {
556
+ const ttsRetryExpired = ttsLoadError && (Date.now() - ttsLoadErrorTime >= TTS_ERROR_RETRY_MS);
512
557
  return {
513
558
  sttReady: !!sttPipeline,
514
559
  ttsReady: !!ttsPipeline,
515
560
  sttLoading,
516
561
  ttsLoading,
517
562
  sttError: sttLoadError ? sttLoadError.message : null,
518
- ttsError: ttsLoadError ? ttsLoadError.message : null,
563
+ ttsError: (ttsLoadError && !ttsRetryExpired) ? ttsLoadError.message : null,
519
564
  };
520
565
  }
521
566
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.176",
3
+ "version": "1.0.178",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
@@ -320,6 +320,7 @@
320
320
  var ttsConsecutiveFailures = 0;
321
321
  var TTS_MAX_FAILURES = 3;
322
322
  var ttsDisabledUntilReset = false;
323
+ var streamingSupported = true;
323
324
 
324
325
  function playNextChunk() {
325
326
  if (audioChunkQueue.length === 0) {
@@ -391,12 +392,16 @@
391
392
  }
392
393
 
393
394
  function tryStreaming() {
395
+ if (!streamingSupported) { tryNonStreaming(text); return; }
394
396
  fetch(BASE + '/api/tts-stream', {
395
397
  method: 'POST',
396
398
  headers: { 'Content-Type': 'application/json' },
397
399
  body: JSON.stringify({ text: text, voiceId: selectedVoiceId })
398
400
  }).then(function(resp) {
399
- if (!resp.ok) throw new Error('TTS stream failed: ' + resp.status);
401
+ if (!resp.ok) {
402
+ streamingSupported = false;
403
+ throw new Error('TTS stream failed: ' + resp.status);
404
+ }
400
405
  var reader = resp.body.getReader();
401
406
  var buffer = new Uint8Array(0);
402
407