agentgui 1.0.177 → 1.0.179

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ import { spawn } from 'child_process';
2
+ import path from 'path';
3
+ import fs from 'fs';
4
+ import os from 'os';
5
+ import { fileURLToPath } from 'url';
6
+ import http from 'http';
7
+
8
// Package root: two directory levels above this module's own file.
const ROOT = path.resolve(fileURLToPath(import.meta.url), '..', '..');

// Expected location of the pocket-tts executable (inside a venv under data/).
const POCKET_BIN = path.join(ROOT, 'data', 'pocket-venv', 'bin', 'pocket-tts');

// TCP port the sidecar is asked to serve on and that this module connects to.
const PORT = 8787;

// Voice name used when starting with the requested voice does not succeed.
const FALLBACK_VOICE = 'alba';

// Mutable bookkeeping for the sidecar, shared by every function in this module.
const state = {
  process: null,
  port: PORT,
  status: 'stopped',
  pid: null,
  restartCount: 0,
  failureCount: 0,
  lastError: null,
  healthy: false,
  voicePath: null,
  starting: false,
  shutdownRequested: false,
  healthTimer: null,
  restartTimer: null,
  voiceCloning: false,
};

// The same object is also published on globalThis under a fixed key.
globalThis.__pocketSidecar = state;
21
+
22
/**
 * Reports whether the pocket-tts executable is present on disk.
 *
 * @returns {boolean} true when something exists at POCKET_BIN.
 */
function isInstalled() {
  const binaryPresent = fs.existsSync(POCKET_BIN);
  return binaryPresent;
}
23
+
24
/**
 * Locates the audio file that belongs to a voice id.
 *
 * A leading "custom_" prefix is stripped from the id, then each known voices
 * directory is probed for "<name>.wav", ".mp3", ".ogg" and ".flac" in that
 * order; the first existing file wins.
 *
 * The id must be a plain file name: ids that are not strings, are empty, or
 * contain a path separator / NUL / are "." or ".." are rejected, so a caller
 * supplied id can never make the lookup leave the voices directories.
 *
 * @param {string|null|undefined} voiceId - Voice identifier, e.g. "custom_alice".
 * @returns {string|null} Path of the first matching file, or null when the id
 *   is missing, "default", invalid, or no file exists.
 */
function findVoiceFile(voiceId) {
  if (typeof voiceId !== 'string' || voiceId === '' || voiceId === 'default') return null;

  const baseName = voiceId.replace(/^custom_/, '');
  const isPlainFileName =
    baseName !== '' && baseName !== '.' && baseName !== '..' && !/[\\\/\0]/.test(baseName);
  // Anything else (e.g. "../secret") would resolve outside the directories below.
  if (!isPlainFileName) return null;

  const extensions = ['.wav', '.mp3', '.ogg', '.flac'];
  const dirs = [
    path.join(process.env.STARTUP_CWD || process.cwd(), 'voices'),
    path.join(ROOT, 'voices'),
    path.join(os.homedir(), 'voices'),
    '/config/voices',
  ];
  for (const dir of dirs) {
    for (const ext of extensions) {
      const candidate = path.join(dir, baseName + ext);
      if (fs.existsSync(candidate)) return candidate;
    }
  }
  return null;
}
38
+
39
/**
 * Probes the sidecar's /health endpoint on 127.0.0.1:PORT.
 *
 * The outcome is stored in state.healthy and also returned. Only an HTTP 200
 * whose body was received completely counts as healthy; connection errors,
 * the 3 second timeout, and responses that are cut off before they finish
 * all count as unhealthy.
 *
 * @returns {Promise<boolean>} Resolves (never rejects) with the health flag.
 */
function healthCheck() {
  return new Promise((resolve) => {
    let settled = false;
    // Several events can report the same failure; only the first one counts.
    const settle = (healthy) => {
      if (settled) return;
      settled = true;
      state.healthy = healthy;
      resolve(healthy);
    };

    const req = http.get(`http://127.0.0.1:${PORT}/health`, { timeout: 3000 }, (res) => {
      res.on('end', () => settle(res.statusCode === 200));
      // If the connection drops mid-response, 'end' never fires. Without these
      // two handlers the promise would stay pending forever.
      res.on('error', () => settle(false));
      res.on('close', () => settle(false));
      res.resume(); // drain the body so 'end' can fire
    });
    req.on('error', () => settle(false));
    req.on('timeout', () => {
      req.destroy();
      settle(false);
    });
  });
}
49
+
50
/**
 * Sends SIGTERM to the tracked sidecar process (if there is one) and resets
 * the process-related fields of state to their "stopped" values.
 *
 * Signalling is best-effort: an exception thrown by kill() is ignored.
 */
function killProcess() {
  const child = state.process;
  if (child) {
    try {
      child.kill('SIGTERM');
    } catch {
      // Deliberately ignored: the state is reset regardless.
    }
  }
  Object.assign(state, {
    process: null,
    pid: null,
    healthy: false,
    status: 'stopped',
  });
}
54
+
55
/**
 * Kills the current sidecar and arranges for start() to run again after an
 * exponential back-off (1s, 2s, 4s, ... capped at 30s, based on
 * state.restartCount).
 *
 * Does nothing once shutdown has been requested. If a restart is already
 * pending, the existing timer is kept: calling this twice for the same
 * failure (for example once from the health monitor and once from the killed
 * process's exit handler) must not double the back-off or leave a second
 * timer running that would call start() again.
 */
function scheduleRestart() {
  if (state.shutdownRequested) return;
  killProcess();
  if (state.restartTimer) return; // a restart is already on its way

  const delay = Math.min(1000 * 2 ** state.restartCount, 30000);
  state.restartCount++;
  console.log(`[POCKET-TTS] Restart in ${delay}ms (attempt ${state.restartCount})`);
  state.restartTimer = setTimeout(() => {
    state.restartTimer = null;
    start(state.voicePath).catch((e) => console.error('[POCKET-TTS] Restart failed:', e.message));
  }, delay);
}
66
+
67
/**
 * Launches the pocket-tts executable in "serve" mode on PORT.
 *
 * The server is bound to the loopback interface by default because every
 * client in this module connects to 127.0.0.1 and the endpoint has no
 * authentication of its own here; listening on all interfaces would only
 * expose it to the network. Set the POCKET_TTS_HOST environment variable
 * (e.g. to "0.0.0.0") to bind elsewhere.
 *
 * @param {string|null|undefined} voice - Value forwarded via "--voice"; the
 *   flag is omitted when this is falsy.
 * @returns {import('child_process').ChildProcess} The spawned process, with
 *   stdin ignored and stdout/stderr piped.
 */
function spawnSidecar(voice) {
  // `||` on purpose: an empty variable should fall back to loopback as well.
  const host = process.env.POCKET_TTS_HOST || '127.0.0.1';
  const args = ['serve', '--host', host, '--port', String(PORT)];
  if (voice) args.push('--voice', voice);
  console.log('[POCKET-TTS] Starting:', POCKET_BIN, args.join(' '));
  return spawn(POCKET_BIN, args, {
    stdio: ['ignore', 'pipe', 'pipe'],
    // Unbuffered Python output so log lines arrive as they are written.
    env: { ...process.env, PYTHONUNBUFFERED: '1' },
  });
}
76
+
77
/**
 * Registers a freshly spawned sidecar process in state and wires up logging.
 *
 * state.process / state.pid are pointed at the process and the status becomes
 * 'starting'. Each non-blank chunk from stdout is logged with console.log and
 * each non-blank chunk from stderr with console.error, trimmed and prefixed
 * with "[POCKET-TTS]". A process 'error' event is recorded in state.lastError.
 *
 * @param {import('child_process').ChildProcess} proc - Process whose stdout
 *   and stderr are readable streams.
 */
function attachProc(proc) {
  Object.assign(state, { process: proc, pid: proc.pid, status: 'starting' });

  const forward = (stream, emit) => {
    stream.on('data', (chunk) => {
      const line = chunk.toString().trim();
      if (line) emit(line);
    });
  };
  forward(proc.stdout, (line) => console.log('[POCKET-TTS]', line));
  forward(proc.stderr, (line) => console.error('[POCKET-TTS]', line));

  proc.on('error', (err) => {
    state.lastError = err.message;
  });
}
83
+
84
/**
 * Polls healthCheck() until the sidecar reports healthy, the process is known
 * to be gone, or the attempts run out.
 *
 * The process counts as gone when it has emitted 'exit' or 'error' (a failed
 * spawn emits 'error' without a following 'exit'), or when it already carries
 * an exit code / signal. This is checked again after every sleep so that a
 * health response is never attributed to a process that has just died.
 * The listeners added here are removed again before returning.
 *
 * @param {import('child_process').ChildProcess} proc - Process being awaited.
 * @param {number} timeoutSec - Maximum number of polling attempts; with the
 *   default interval this is the timeout in seconds.
 * @param {number} [pollMs=1000] - Pause before each health probe, in ms.
 * @returns {Promise<boolean>} true once a health probe succeeds, else false.
 */
async function waitForReady(proc, timeoutSec, pollMs = 1000) {
  let gone = proc.exitCode != null || proc.signalCode != null;
  const markGone = () => { gone = true; };
  proc.once('exit', markGone);
  proc.once('error', markGone);
  try {
    for (let attempt = 0; attempt < timeoutSec && !gone; attempt++) {
      await new Promise((r) => setTimeout(r, pollMs));
      if (gone) return false;
      if (await healthCheck()) return true;
    }
    return false;
  } finally {
    proc.removeListener('exit', markGone);
    proc.removeListener('error', markGone);
  }
}
94
+
95
// Voice names treated as predefined: starting with one of them does not count
// as voice cloning.
const BUILTIN_VOICES = new Set([
  'alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma',
]);

/**
 * Starts the pocket-tts sidecar and waits until it answers health checks.
 *
 * Flow:
 *  1. Bail out when the binary is missing ('unavailable'), another start is
 *     in progress, or the sidecar is already running and healthy.
 *  2. Spawn with the requested voice (or the last known one) and wait.
 *  3. If that fails and the voice was not FALLBACK_VOICE, retry once with
 *     FALLBACK_VOICE.
 *  4. On success mark the sidecar 'running', watch for its exit, and make
 *     sure the periodic health monitor is installed (three consecutive
 *     failed probes trigger a restart).
 *
 * On failure the child is killed first and the status is set to 'error'
 * afterwards, because killProcess() itself resets the status to 'stopped'.
 * If stop() is called while this function is waiting, no fallback process is
 * spawned and the start is abandoned.
 *
 * @param {string|null|undefined} voicePath - Voice to pass to the sidecar;
 *   falls back to state.voicePath when falsy.
 * @returns {Promise<boolean>} true when the sidecar is (already) running,
 *   false otherwise. Errors are recorded in state.lastError, not thrown.
 */
async function start(voicePath) {
  if (!isInstalled()) {
    state.lastError = 'not installed';
    state.status = 'unavailable';
    return false;
  }
  if (state.starting) return false;
  if (state.status === 'running' && state.healthy) return true;

  state.starting = true;
  state.shutdownRequested = false;
  const requestedVoice = voicePath || state.voicePath;
  try {
    killProcess();
    let proc = spawnSidecar(requestedVoice);
    attachProc(proc);
    let activeVoice = requestedVoice;
    let ready = await waitForReady(proc, 120);

    const shouldFallBack =
      !ready && !state.shutdownRequested && requestedVoice && requestedVoice !== FALLBACK_VOICE;
    if (shouldFallBack) {
      console.log('[POCKET-TTS] Custom voice failed, trying predefined voice:', FALLBACK_VOICE);
      killProcess();
      proc = spawnSidecar(FALLBACK_VOICE);
      attachProc(proc);
      activeVoice = FALLBACK_VOICE;
      state.voiceCloning = false;
      ready = await waitForReady(proc, 120);
    }

    // stop() ran while we were waiting: do not bring the sidecar up after all.
    if (state.shutdownRequested) {
      killProcess();
      return false;
    }
    if (!ready) {
      // Kill first: killProcess() sets status to 'stopped' and would
      // otherwise overwrite the 'error' status.
      killProcess();
      state.lastError = 'Start timeout';
      state.status = 'error';
      return false;
    }

    state.voicePath = activeVoice;
    state.voiceCloning = Boolean(activeVoice) && !BUILTIN_VOICES.has(activeVoice);
    state.status = 'running';
    state.restartCount = 0;
    state.failureCount = 0;
    state.lastError = null;

    proc.once('exit', (code, sig) => {
      console.log(`[POCKET-TTS] Exited: code=${code} signal=${sig}`);
      // A process that was already replaced or deliberately killed is no
      // longer state.process; its late exit must not wipe the state of its
      // successor or schedule another restart.
      if (state.process !== proc) return;
      state.process = null;
      state.pid = null;
      state.healthy = false;
      state.status = 'stopped';
      if (!state.shutdownRequested) scheduleRestart();
    });

    if (!state.healthTimer) {
      state.healthTimer = setInterval(async () => {
        if (state.status !== 'running') return;
        const ok = await healthCheck();
        if (ok) {
          state.failureCount = 0;
        } else if (!state.shutdownRequested) {
          state.failureCount++;
          if (state.failureCount >= 3) scheduleRestart();
        }
      }, 10000);
    }

    console.log('[POCKET-TTS] Ready on port', PORT, '(voice cloning:', state.voiceCloning + ')');
    return true;
  } catch (err) {
    // Do not leave a half-started child behind.
    killProcess();
    state.lastError = err.message;
    state.status = 'error';
    return false;
  } finally {
    state.starting = false;
  }
}
141
+
142
/**
 * Shuts the sidecar down: flags the shutdown so no restart is scheduled,
 * cancels the health monitor and any pending restart timer, and kills the
 * process.
 *
 * @returns {Promise<void>} Resolves once the steps above have been carried
 *   out; it does not wait for the process to actually exit.
 */
async function stop() {
  state.shutdownRequested = true;

  const { healthTimer, restartTimer } = state;
  if (healthTimer) {
    clearInterval(healthTimer);
    state.healthTimer = null;
  }
  if (restartTimer) {
    clearTimeout(restartTimer);
    state.restartTimer = null;
  }

  killProcess();
}
148
+
149
/**
 * Sends text to the sidecar's /tts endpoint as multipart/form-data and
 * returns the raw response body.
 *
 * When voice cloning is active and `voicePath` differs from the voice the
 * sidecar was started with, the file at `voicePath` is attached as the
 * "voice_wav" form field.
 *
 * @param {string} text - Text to synthesize.
 * @param {string|null|undefined} voicePath - Optional per-request voice file.
 * @returns {Promise<Buffer>} Body of the HTTP 200 response.
 * @throws {Error} 'pocket-tts not ready' when the sidecar is not healthy;
 *   'pocket-tts HTTP <status>: <body>' for non-200 responses;
 *   'pocket-tts timeout' after 60 seconds; connection errors, a response cut
 *   off before completion, and failures reading the voice file also reject.
 */
async function synthesize(text, voicePath) {
  if (!state.healthy) throw new Error('pocket-tts not ready');

  // Timestamp plus random suffix, so the boundary is not guessable from the
  // time alone and is very unlikely to occur inside the payload.
  const boundary = `----PocketTTS${Date.now()}${Math.random().toString(16).slice(2)}`;
  const parts = [
    `--${boundary}\r\nContent-Disposition: form-data; name="text"\r\n\r\n${text}\r\n`,
  ];
  if (state.voiceCloning && voicePath && voicePath !== state.voicePath) {
    // Asynchronous read: does not block the event loop while loading audio.
    const data = await fs.promises.readFile(voicePath);
    // Quotes or line breaks in the name would break out of the header value.
    const name = path.basename(voicePath).replace(/["\r\n]/g, '_');
    parts.push(
      `--${boundary}\r\nContent-Disposition: form-data; name="voice_wav"; filename="${name}"\r\nContent-Type: audio/wav\r\n\r\n`,
      data,
      '\r\n',
    );
  }
  parts.push(`--${boundary}--\r\n`);
  const body = Buffer.concat(parts.map((p) => (Buffer.isBuffer(p) ? p : Buffer.from(p))));

  return new Promise((resolve, reject) => {
    const req = http.request({
      hostname: '127.0.0.1',
      port: PORT,
      path: '/tts',
      method: 'POST',
      headers: {
        'Content-Type': `multipart/form-data; boundary=${boundary}`,
        'Content-Length': body.length,
      },
      timeout: 60000,
    }, (res) => {
      const chunks = [];
      res.on('data', (d) => chunks.push(d));
      res.on('end', () => {
        const payload = Buffer.concat(chunks);
        if (res.statusCode !== 200) {
          reject(new Error(`pocket-tts HTTP ${res.statusCode}: ${payload.toString()}`));
        } else {
          resolve(payload);
        }
      });
      // A connection dropped mid-response never emits 'end'; reject instead
      // of leaving the caller waiting forever.
      res.on('error', reject);
      res.on('close', () => {
        if (!res.complete) reject(new Error('pocket-tts response aborted'));
      });
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('pocket-tts timeout'));
    });
    req.write(body);
    req.end();
  });
}
181
+
182
/**
 * Builds a plain snapshot of the sidecar's externally relevant state.
 *
 * Only scalar bookkeeping fields are copied (the process handle and timers
 * are left out), plus an `installed` flag computed by isInstalled() at call
 * time.
 *
 * @returns {{status: string, healthy: boolean, pid: (number|null), port: number,
 *   restartCount: number, failureCount: number, lastError: (string|null),
 *   installed: boolean, voiceCloning: boolean}} A new object on every call.
 */
function getState() {
  const {
    status, healthy, pid, port, restartCount, failureCount, lastError, voiceCloning,
  } = state;
  return {
    status,
    healthy,
    pid,
    port,
    restartCount,
    failureCount,
    lastError,
    installed: isInstalled(),
    voiceCloning,
  };
}
189
+
190
+ export { start, stop, synthesize, healthCheck, getState, isInstalled, findVoiceFile };
package/lib/speech.js CHANGED
@@ -3,6 +3,7 @@ import fs from 'fs';
3
3
  import os from 'os';
4
4
  import path from 'path';
5
5
  import { fileURLToPath } from 'url';
6
+ import * as pocket from './pocket-sidecar.js';
6
7
 
7
8
  const require = createRequire(import.meta.url);
8
9
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -218,24 +219,29 @@ async function loadVoiceEmbedding(voiceId) {
218
219
  }
219
220
  }
220
221
 
222
+ let speakerFeatureExtractor = null;
223
+
221
224
  async function getSpeakerEmbeddingPipeline() {
222
225
  if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
223
226
  if (speakerEmbeddingLoading) {
224
227
  while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
225
- if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding pipeline failed to load');
228
+ if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
226
229
  return speakerEmbeddingPipeline;
227
230
  }
228
231
  speakerEmbeddingLoading = true;
229
232
  try {
230
- const { pipeline, env } = await loadTransformers();
233
+ const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
231
234
  env.allowRemoteModels = true;
232
- speakerEmbeddingPipeline = await pipeline('feature-extraction', 'speechbrain/spkrec-xvectors-voxceleb', {
235
+ const modelId = 'Xenova/wavlm-base-plus-sv';
236
+ speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
233
237
  device: 'cpu',
234
238
  dtype: 'fp32',
235
239
  });
240
+ speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
236
241
  return speakerEmbeddingPipeline;
237
242
  } catch (err) {
238
243
  speakerEmbeddingPipeline = null;
244
+ speakerFeatureExtractor = null;
239
245
  throw new Error('Speaker embedding model load failed: ' + err.message);
240
246
  } finally {
241
247
  speakerEmbeddingLoading = false;
@@ -284,11 +290,13 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
284
290
  if (audio.length < SAMPLE_RATE_STT * 0.5) {
285
291
  throw new Error('Audio too short for embedding extraction');
286
292
  }
287
- const pipe = await getSpeakerEmbeddingPipeline();
288
- const output = await pipe(audio, { pooling: 'mean', normalize: true });
293
+ const model = await getSpeakerEmbeddingPipeline();
294
+ const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
295
+ const output = await model(inputs);
296
+ const embData = output.embeddings.data;
289
297
  const embedding = new Float32Array(512);
290
- for (let i = 0; i < Math.min(512, output.data.length); i++) {
291
- embedding[i] = output.data[i];
298
+ for (let i = 0; i < Math.min(512, embData.length); i++) {
299
+ embedding[i] = embData[i];
292
300
  }
293
301
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
294
302
  const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
@@ -503,6 +511,24 @@ function cachePut(key, buf) {
503
511
  ttsCacheBytes += buf.length;
504
512
  }
505
513
 
514
/**
 * Maps a voice id to an audio file path.
 *
 * The sidecar module's lookup is tried first; findCustomVoiceFile() is used
 * only when that yields nothing.
 *
 * @param {string|null|undefined} voiceId - Voice identifier.
 * @returns {string|null} null for a missing or 'default' id; otherwise
 *   whatever the first successful lookup returned (or the second lookup's
 *   result when the first one found nothing).
 */
function resolveVoicePath(voiceId) {
  const wantsDefault = !voiceId || voiceId === 'default';
  if (wantsDefault) return null;

  const fromSidecar = pocket.findVoiceFile(voiceId);
  if (fromSidecar) return fromSidecar;
  return findCustomVoiceFile(voiceId);
}
518
+
519
/**
 * Attempts synthesis through the pocket-tts sidecar.
 *
 * Never throws: any failure is logged and reported as null so the caller can
 * fall back to another engine.
 *
 * @param {string} text - Text to synthesize.
 * @param {string|null|undefined} voiceId - Voice identifier, resolved to a
 *   file path via resolveVoicePath().
 * @returns {Promise<Buffer|null>} The sidecar's output when it is longer than
 *   44 bytes; null when the sidecar is not healthy, the call failed, or the
 *   output was too short.
 */
async function synthesizeViaPocket(text, voiceId) {
  if (!pocket.getState().healthy) return null;

  let wav;
  try {
    wav = await pocket.synthesize(text, resolveVoicePath(voiceId));
  } catch (err) {
    console.error('[TTS] pocket-tts failed, falling back:', err.message);
    return null;
  }
  // 44 bytes or fewer is rejected as unusable (presumably a bare WAV header — TODO confirm).
  return wav && wav.length > 44 ? wav : null;
}
531
+
506
532
  async function synthesize(text, voiceId) {
507
533
  const cacheKey = (voiceId || 'default') + ':' + text;
508
534
  const cached = ttsCache.get(cacheKey);
@@ -514,6 +540,8 @@ async function synthesize(text, voiceId) {
514
540
  const inflight = ttsInflight.get(cacheKey);
515
541
  if (inflight) return inflight;
516
542
  const promise = (async () => {
543
+ const pocketWav = await synthesizeViaPocket(text, voiceId);
544
+ if (pocketWav) { cachePut(cacheKey, pocketWav); return pocketWav; }
517
545
  const tts = await getTTS();
518
546
  const embeddings = await loadVoiceEmbedding(voiceId);
519
547
  const result = await tts(text, { speaker_embeddings: embeddings });
@@ -527,8 +555,12 @@ async function synthesize(text, voiceId) {
527
555
 
528
556
  async function* synthesizeStream(text, voiceId) {
529
557
  const sentences = splitSentences(text);
530
- const tts = await getTTS();
531
- const embeddings = await loadVoiceEmbedding(voiceId);
558
+ const usePocket = pocket.getState().healthy;
559
+ let tts, embeddings;
560
+ if (!usePocket) {
561
+ tts = await getTTS();
562
+ embeddings = await loadVoiceEmbedding(voiceId);
563
+ }
532
564
  for (const sentence of sentences) {
533
565
  const cacheKey = (voiceId || 'default') + ':' + sentence;
534
566
  const cached = ttsCache.get(cacheKey);
@@ -538,6 +570,11 @@ async function* synthesizeStream(text, voiceId) {
538
570
  yield cached;
539
571
  continue;
540
572
  }
573
+ if (usePocket) {
574
+ const pocketWav = await synthesizeViaPocket(sentence, voiceId);
575
+ if (pocketWav) { cachePut(cacheKey, pocketWav); yield pocketWav; continue; }
576
+ }
577
+ if (!tts) { tts = await getTTS(); embeddings = await loadVoiceEmbedding(voiceId); }
541
578
  const result = await tts(sentence, { speaker_embeddings: embeddings });
542
579
  const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
543
580
  cachePut(cacheKey, wav);
@@ -547,18 +584,30 @@ async function* synthesizeStream(text, voiceId) {
547
584
 
548
585
  function getStatus() {
549
586
  const ttsRetryExpired = ttsLoadError && (Date.now() - ttsLoadErrorTime >= TTS_ERROR_RETRY_MS);
587
+ const pState = pocket.getState();
550
588
  return {
551
589
  sttReady: !!sttPipeline,
552
- ttsReady: !!ttsPipeline,
590
+ ttsReady: !!ttsPipeline || pState.healthy,
553
591
  sttLoading,
554
592
  ttsLoading,
555
593
  sttError: sttLoadError ? sttLoadError.message : null,
556
- ttsError: (ttsLoadError && !ttsRetryExpired) ? ttsLoadError.message : null,
594
+ ttsError: (ttsLoadError && !ttsRetryExpired && !pState.healthy) ? ttsLoadError.message : null,
595
+ pocketTts: pState,
557
596
  };
558
597
  }
559
598
 
560
599
  function preloadTTS() {
561
- getTTS().catch(err => console.error('[TTS] Preload failed:', err.message));
600
+ const defaultVoice = findCustomVoiceFile('custom_cleetus') || '/config/voices/cleetus.wav';
601
+ const voicePath = fs.existsSync(defaultVoice) ? defaultVoice : null;
602
+ pocket.start(voicePath).then(ok => {
603
+ if (ok) console.log('[TTS] pocket-tts sidecar started');
604
+ else {
605
+ console.log('[TTS] pocket-tts unavailable, falling back to SpeechT5');
606
+ getTTS().catch(err => console.error('[TTS] SpeechT5 preload failed:', err.message));
607
+ }
608
+ }).catch(() => {
609
+ getTTS().catch(err => console.error('[TTS] SpeechT5 preload failed:', err.message));
610
+ });
562
611
  }
563
612
 
564
613
  function ttsCacheKey(text, voiceId) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.177",
3
+ "version": "1.0.179",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
package/server.js CHANGED
@@ -1414,7 +1414,7 @@ function onServerReady() {
1414
1414
  // Recover stale active sessions from previous run
1415
1415
  recoverStaleSessions();
1416
1416
 
1417
- getSpeech().then(s => s.getTTS()).then(() => debugLog('[TTS] Model preloaded')).catch(e => debugLog('[TTS] Preload failed: ' + e.message));
1417
+ getSpeech().then(s => s.preloadTTS()).catch(e => debugLog('[TTS] Preload failed: ' + e.message));
1418
1418
 
1419
1419
  performAutoImport();
1420
1420