agentgui 1.0.177 → 1.0.179
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/pocket-sidecar.js +190 -0
- package/lib/speech.js +61 -12
- package/package.json +1 -1
- package/server.js +1 -1
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import fs from 'fs';
|
|
4
|
+
import os from 'os';
|
|
5
|
+
import { fileURLToPath } from 'url';
|
|
6
|
+
import http from 'http';
|
|
7
|
+
|
|
8
|
+
// Package root: two directory levels above this module's file.
const ROOT = path.dirname(path.dirname(fileURLToPath(import.meta.url)));
// Location where the pocket-tts executable is expected (see isInstalled()).
const POCKET_BIN = path.join(ROOT, 'data', 'pocket-venv', 'bin', 'pocket-tts');
// TCP port the sidecar is started on and that health checks / TTS requests target.
const PORT = 8787;

// Voice name tried when starting with the requested voice does not become ready.
const FALLBACK_VOICE = 'alba';
// Mutable singleton describing the sidecar lifecycle. It is mutated in place
// by the functions in this module; getState() exposes a snapshot of it.
const state = {
  process: null, port: PORT, status: 'stopped', pid: null,
  restartCount: 0, failureCount: 0, lastError: null,
  healthy: false, voicePath: null, starting: false,
  shutdownRequested: false, healthTimer: null, restartTimer: null,
  voiceCloning: false,
};
// The same object is also published on globalThis as __pocketSidecar.
globalThis.__pocketSidecar = state;
|
|
21
|
+
|
|
22
|
+
/**
 * Reports whether the pocket-tts executable is present on disk.
 *
 * @returns {boolean} true when something exists at POCKET_BIN.
 */
function isInstalled() {
  const binaryPresent = fs.existsSync(POCKET_BIN);
  return binaryPresent;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Locates the audio sample for a voice ID in the known voice directories.
 *
 * An optional `custom_` prefix is stripped, then `<name>.wav|.mp3|.ogg|.flac`
 * is looked up in, in order: `$STARTUP_CWD/voices` (or `<cwd>/voices`),
 * `<ROOT>/voices`, `~/voices` and `/config/voices`.
 *
 * @param {string} voiceId - Voice identifier, e.g. `custom_alice` or `alice`.
 * @returns {string|null} Path of the first matching file, or null when the ID
 *   is empty, `default`, not a string, not a plain file name, or not found.
 */
function findVoiceFile(voiceId) {
  if (typeof voiceId !== 'string' || !voiceId || voiceId === 'default') return null;
  const baseName = voiceId.replace(/^custom_/, '');
  // Voice IDs can originate from clients: only a bare file name is accepted so
  // that "../" or absolute paths cannot escape the voices directories.
  if (!baseName || baseName === '.' || baseName === '..' || /[\\/\0]/.test(baseName)) return null;

  const dirs = [
    path.join(process.env.STARTUP_CWD || process.cwd(), 'voices'),
    path.join(ROOT, 'voices'),
    path.join(os.homedir(), 'voices'),
    '/config/voices',
  ];
  const extensions = ['.wav', '.mp3', '.ogg', '.flac'];
  for (const dir of dirs) {
    for (const ext of extensions) {
      const candidate = path.join(dir, baseName + ext);
      if (fs.existsSync(candidate)) return candidate;
    }
  }
  return null;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Probes the sidecar's `/health` endpoint on 127.0.0.1:PORT and records the
 * outcome in `state.healthy`.
 *
 * The promise never rejects. It resolves exactly once: true for a completed
 * HTTP 200 response, false for any other status, a connection error, a
 * 3 second socket timeout, or a response that is cut off before it ends.
 *
 * @returns {Promise<boolean>} Whether the sidecar answered healthy.
 */
function healthCheck() {
  return new Promise((resolve) => {
    let settled = false;
    // Several events can fire for one request (timeout then error, end then
    // close); only the first one decides the outcome.
    const finish = (ok) => {
      if (settled) return;
      settled = true;
      state.healthy = ok;
      resolve(ok);
    };
    const req = http.get(`http://127.0.0.1:${PORT}/health`, { timeout: 3000 }, (res) => {
      res.on('end', () => finish(res.statusCode === 200));
      res.on('error', () => finish(false));
      // A connection dropped mid-body emits 'close' without 'end'; without
      // this the promise would stay pending forever.
      res.on('close', () => finish(false));
      res.resume();
    });
    req.on('error', () => finish(false));
    req.on('timeout', () => { req.destroy(); finish(false); });
  });
}
|
|
49
|
+
|
|
50
|
+
/**
 * Sends SIGTERM to the tracked sidecar process (if any) and resets the
 * process-related fields of `state` to their stopped values.
 *
 * Errors thrown by `kill` are ignored on purpose: termination is best effort.
 */
function killProcess() {
  const child = state.process;
  if (child) {
    try {
      child.kill('SIGTERM');
    } catch (_) {
      // Best effort: a failing kill() must not prevent the state reset below.
    }
  }
  Object.assign(state, { process: null, pid: null, healthy: false, status: 'stopped' });
}
|
|
54
|
+
|
|
55
|
+
/**
 * Kills the current sidecar and schedules a new `start()` with exponential
 * backoff (1s, 2s, 4s ... capped at 30s, based on `state.restartCount`).
 *
 * Does nothing when shutdown was requested, or when a restart is already
 * pending. Several triggers can report the same failure (for example a failed
 * health check followed by the exit of the killed process); they are coalesced
 * into one attempt so the pending timer is not leaked and the backoff counter
 * is not inflated.
 */
function scheduleRestart() {
  if (state.shutdownRequested) return;
  if (state.restartTimer) return;
  killProcess();
  const delay = Math.min(1000 * Math.pow(2, state.restartCount), 30000);
  state.restartCount++;
  console.log(`[POCKET-TTS] Restart in ${delay}ms (attempt ${state.restartCount})`);
  state.restartTimer = setTimeout(() => {
    state.restartTimer = null;
    start(state.voicePath).catch(e => console.error('[POCKET-TTS] Restart failed:', e.message));
  }, delay);
}
|
|
66
|
+
|
|
67
|
+
/**
 * Spawns `pocket-tts serve` on PORT with piped stdout/stderr.
 *
 * The server is bound to the loopback interface by default, because this
 * module only ever talks to it via 127.0.0.1 and the endpoint has no
 * authentication. Set the POCKET_TTS_HOST environment variable (e.g. to
 * `0.0.0.0`) to expose it on other interfaces.
 *
 * @param {string} [voice] - Voice name or sample path passed as `--voice`;
 *   omitted from the command line when falsy.
 * @returns {import('child_process').ChildProcess} The spawned process.
 */
function spawnSidecar(voice) {
  const host = process.env.POCKET_TTS_HOST || '127.0.0.1';
  const args = ['serve', '--host', host, '--port', String(PORT)];
  if (voice) args.push('--voice', voice);
  console.log('[POCKET-TTS] Starting:', POCKET_BIN, args.join(' '));
  return spawn(POCKET_BIN, args, {
    stdio: ['ignore', 'pipe', 'pipe'],
    // Unbuffered output so the sidecar's log lines are relayed as they occur.
    env: { ...process.env, PYTHONUNBUFFERED: '1' },
  });
}
|
|
76
|
+
|
|
77
|
+
/**
 * Registers a freshly spawned sidecar in `state` (status `starting`) and
 * relays its output: trimmed, non-empty stdout chunks go to console.log and
 * stderr chunks to console.error, each prefixed with `[POCKET-TTS]`.
 * Process-level `error` events are recorded in `state.lastError`.
 *
 * @param {import('child_process').ChildProcess} proc - Process with piped
 *   stdout and stderr.
 */
function attachProc(proc) {
  Object.assign(state, { process: proc, pid: proc.pid, status: 'starting' });

  // Builds a 'data' listener that forwards a chunk to the given console method.
  const forwardTo = (method) => (chunk) => {
    const text = chunk.toString().trim();
    if (text) console[method]('[POCKET-TTS]', text);
  };
  proc.stdout.on('data', forwardTo('log'));
  proc.stderr.on('data', forwardTo('error'));

  proc.on('error', (err) => {
    state.lastError = err.message;
  });
}
|
|
83
|
+
|
|
84
|
+
/**
 * Polls `healthCheck()` once per second until the sidecar answers healthy,
 * the process is gone, or `timeoutSec` polls have been made.
 *
 * A process counts as gone when it failed to spawn (no pid), had already
 * exited before this call, or emits `exit` while waiting. In those cases the
 * function returns without probing, so a different listener on the same port
 * is not mistaken for this process.
 *
 * @param {import('child_process').ChildProcess} proc - Process being started.
 * @param {number} timeoutSec - Maximum number of one-second polls.
 * @returns {Promise<boolean>} true once healthy, false on exit or timeout.
 */
async function waitForReady(proc, timeoutSec) {
  // spawn() leaves pid undefined when the executable could not be started; in
  // that case Node emits 'error' and may never emit 'exit'.
  let exited = proc.pid == null || proc.exitCode != null || proc.signalCode != null;
  const markExited = () => { exited = true; };
  proc.on('exit', markExited);
  try {
    for (let i = 0; i < timeoutSec; i++) {
      if (exited) return false;
      await new Promise(r => setTimeout(r, 1000));
      // Re-check: the process may have died during the sleep.
      if (exited) return false;
      if (await healthCheck()) return true;
    }
    return false;
  } finally {
    // Do not leave a listener behind on every start attempt.
    proc.off('exit', markExited);
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Voice names passed to the sidecar by name. Any other non-empty voice value
 * is treated as a sample to clone (`state.voiceCloning = true`).
 */
const PREDEFINED_VOICES = new Set(['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']);

/**
 * Starts the pocket-tts sidecar and waits (up to 120 polls per attempt) until
 * it is healthy.
 *
 * If starting with the requested voice fails and that voice is not
 * FALLBACK_VOICE, one more attempt is made with FALLBACK_VOICE. On success an
 * exit handler (auto-restart) and a 10 second health-check interval (restart
 * after 3 consecutive failures) are installed.
 *
 * @param {string} [voicePath] - Voice name or sample path; defaults to the
 *   voice recorded in `state.voicePath`.
 * @returns {Promise<boolean>} true when the sidecar is running and healthy;
 *   false when it is not installed, another start is in progress, `stop()` was
 *   called meanwhile, or startup failed (`state.lastError` holds the reason).
 */
async function start(voicePath) {
  if (!isInstalled()) {
    state.lastError = 'not installed';
    state.status = 'unavailable';
    return false;
  }
  if (state.starting) return false;
  if (state.status === 'running' && state.healthy) return true;

  state.starting = true;
  state.shutdownRequested = false;
  const requestedVoice = voicePath || state.voicePath;
  try {
    killProcess();
    let proc = spawnSidecar(requestedVoice);
    attachProc(proc);
    let ready = await waitForReady(proc, 120);

    // Never respawn after stop() was called while we were waiting.
    const tryFallback = !ready && !state.shutdownRequested
      && requestedVoice && requestedVoice !== FALLBACK_VOICE;
    if (tryFallback) {
      console.log('[POCKET-TTS] Custom voice failed, trying predefined voice:', FALLBACK_VOICE);
      killProcess();
      proc = spawnSidecar(FALLBACK_VOICE);
      attachProc(proc);
      state.voiceCloning = false;
      ready = await waitForReady(proc, 120);
      if (ready) state.voicePath = FALLBACK_VOICE;
    } else if (ready) {
      state.voicePath = requestedVoice;
      state.voiceCloning = !!requestedVoice && !PREDEFINED_VOICES.has(requestedVoice);
    }

    if (state.shutdownRequested) {
      killProcess();
      return false;
    }
    if (!ready) {
      // Kill first: killProcess() resets status to 'stopped', so the error
      // status has to be written afterwards to remain visible.
      killProcess();
      state.lastError = 'Start timeout';
      state.status = 'error';
      return false;
    }

    state.status = 'running';
    state.restartCount = 0;
    state.failureCount = 0;
    state.lastError = null;

    proc.on('exit', (code, sig) => {
      console.log(`[POCKET-TTS] Exited: code=${code} signal=${sig}`);
      // A process that was killed on purpose or already replaced is no longer
      // the tracked one; its late exit must not wipe the state of its
      // successor or trigger another restart.
      if (state.process !== proc) return;
      state.process = null;
      state.pid = null;
      state.healthy = false;
      state.status = 'stopped';
      if (!state.shutdownRequested) scheduleRestart();
    });

    if (!state.healthTimer) {
      state.healthTimer = setInterval(async () => {
        if (state.status !== 'running') return;
        const ok = await healthCheck();
        // The sidecar may have been stopped or restarted during the probe.
        if (state.status !== 'running') return;
        if (!ok && !state.shutdownRequested) {
          state.failureCount++;
          if (state.failureCount >= 3) scheduleRestart();
        } else if (ok) {
          state.failureCount = 0;
        }
      }, 10000);
    }

    console.log('[POCKET-TTS] Ready on port', PORT, '(voice cloning:', state.voiceCloning + ')');
    return true;
  } catch (err) {
    // Do not leave a half-started child behind when startup throws.
    killProcess();
    state.lastError = err.message;
    state.status = 'error';
    return false;
  } finally {
    state.starting = false;
  }
}
|
|
141
|
+
|
|
142
|
+
/**
 * Shuts the sidecar down: flags the shutdown so that no restart is scheduled,
 * cancels the health-check interval and any pending restart timer, then kills
 * the process.
 *
 * @returns {Promise<void>}
 */
async function stop() {
  state.shutdownRequested = true;

  const { healthTimer, restartTimer } = state;
  if (healthTimer) {
    clearInterval(healthTimer);
    state.healthTimer = null;
  }
  if (restartTimer) {
    clearTimeout(restartTimer);
    state.restartTimer = null;
  }

  killProcess();
}
|
|
148
|
+
|
|
149
|
+
/** Content types for the voice sample extensions handled by findVoiceFile(). */
const VOICE_SAMPLE_TYPES = {
  '.wav': 'audio/wav',
  '.mp3': 'audio/mpeg',
  '.ogg': 'audio/ogg',
  '.flac': 'audio/flac',
};

/**
 * Sends `text` to the sidecar's `/tts` endpoint as multipart/form-data and
 * resolves with the raw response body.
 *
 * When voice cloning is active and `voicePath` differs from the voice the
 * sidecar was started with, the file is uploaded as the `voice_wav` part.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} [voicePath] - Path of a voice sample to use for this call.
 * @returns {Promise<Buffer>} Body of the HTTP 200 response.
 * @throws {Error} When the sidecar is not healthy, the sample cannot be read,
 *   the request fails or times out (60s), the response is cut off, or the
 *   status is not 200 (message `pocket-tts HTTP <status>: <body>`).
 */
async function synthesize(text, voicePath) {
  if (!state.healthy) throw new Error('pocket-tts not ready');

  // Random suffix keeps the boundary unpredictable so it cannot collide with
  // content of the text part.
  const boundary = `----PocketTTS${Date.now()}${Math.random().toString(16).slice(2)}`;
  const parts = [];
  parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="text"\r\n\r\n${text}\r\n`);
  if (state.voiceCloning && voicePath && voicePath !== state.voicePath) {
    const data = await fs.promises.readFile(voicePath);
    // Escape characters that would terminate the quoted filename or the header
    // line (same escaping browsers apply to multipart file names).
    const name = path.basename(voicePath)
      .replace(/"/g, '%22')
      .replace(/\r/g, '%0D')
      .replace(/\n/g, '%0A');
    const type = VOICE_SAMPLE_TYPES[path.extname(voicePath).toLowerCase()] || 'audio/wav';
    parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="voice_wav"; filename="${name}"\r\nContent-Type: ${type}\r\n\r\n`);
    parts.push(data);
    parts.push('\r\n');
  }
  parts.push(`--${boundary}--\r\n`);
  const body = Buffer.concat(parts.map(p => (Buffer.isBuffer(p) ? p : Buffer.from(p))));

  return new Promise((resolve, reject) => {
    const req = http.request({
      hostname: '127.0.0.1', port: PORT, path: '/tts', method: 'POST',
      headers: { 'Content-Type': `multipart/form-data; boundary=${boundary}`, 'Content-Length': body.length },
      timeout: 60000,
    }, (res) => {
      const chunks = [];
      res.on('data', d => chunks.push(d));
      res.on('error', reject);
      // A dropped connection emits 'close' without 'end'; reject instead of
      // leaving the caller waiting forever. (No-op once already settled.)
      res.on('close', () => {
        if (!res.complete) reject(new Error('pocket-tts response aborted'));
      });
      res.on('end', () => {
        const payload = Buffer.concat(chunks);
        if (res.statusCode !== 200) {
          reject(new Error(`pocket-tts HTTP ${res.statusCode}: ${payload.toString()}`));
          return;
        }
        resolve(payload);
      });
    });
    req.on('error', reject);
    req.on('timeout', () => { req.destroy(); reject(new Error('pocket-tts timeout')); });
    req.end(body);
  });
}
|
|
181
|
+
|
|
182
|
+
/**
 * Builds a plain snapshot of the sidecar status for callers outside this
 * module. Internal handles (process object, timers) are not included.
 *
 * @returns {{status: string, healthy: boolean, pid: (number|null), port: number,
 *   restartCount: number, failureCount: number, lastError: (string|null),
 *   installed: boolean, voiceCloning: boolean}}
 */
function getState() {
  const {
    status, healthy, pid, port,
    restartCount, failureCount, lastError, voiceCloning,
  } = state;
  const installed = isInstalled();
  return {
    status,
    healthy,
    pid,
    port,
    restartCount,
    failureCount,
    lastError,
    installed,
    voiceCloning,
  };
}
|
|
189
|
+
|
|
190
|
+
export { start, stop, synthesize, healthCheck, getState, isInstalled, findVoiceFile };
|
package/lib/speech.js
CHANGED
|
@@ -3,6 +3,7 @@ import fs from 'fs';
|
|
|
3
3
|
import os from 'os';
|
|
4
4
|
import path from 'path';
|
|
5
5
|
import { fileURLToPath } from 'url';
|
|
6
|
+
import * as pocket from './pocket-sidecar.js';
|
|
6
7
|
|
|
7
8
|
const require = createRequire(import.meta.url);
|
|
8
9
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
@@ -218,24 +219,29 @@ async function loadVoiceEmbedding(voiceId) {
|
|
|
218
219
|
}
|
|
219
220
|
}
|
|
220
221
|
|
|
222
|
+
let speakerFeatureExtractor = null;
|
|
223
|
+
|
|
221
224
|
async function getSpeakerEmbeddingPipeline() {
|
|
222
225
|
if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
|
|
223
226
|
if (speakerEmbeddingLoading) {
|
|
224
227
|
while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
|
|
225
|
-
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding
|
|
228
|
+
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
|
|
226
229
|
return speakerEmbeddingPipeline;
|
|
227
230
|
}
|
|
228
231
|
speakerEmbeddingLoading = true;
|
|
229
232
|
try {
|
|
230
|
-
const {
|
|
233
|
+
const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
|
|
231
234
|
env.allowRemoteModels = true;
|
|
232
|
-
|
|
235
|
+
const modelId = 'Xenova/wavlm-base-plus-sv';
|
|
236
|
+
speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
|
|
233
237
|
device: 'cpu',
|
|
234
238
|
dtype: 'fp32',
|
|
235
239
|
});
|
|
240
|
+
speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
|
|
236
241
|
return speakerEmbeddingPipeline;
|
|
237
242
|
} catch (err) {
|
|
238
243
|
speakerEmbeddingPipeline = null;
|
|
244
|
+
speakerFeatureExtractor = null;
|
|
239
245
|
throw new Error('Speaker embedding model load failed: ' + err.message);
|
|
240
246
|
} finally {
|
|
241
247
|
speakerEmbeddingLoading = false;
|
|
@@ -284,11 +290,13 @@ async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
|
284
290
|
if (audio.length < SAMPLE_RATE_STT * 0.5) {
|
|
285
291
|
throw new Error('Audio too short for embedding extraction');
|
|
286
292
|
}
|
|
287
|
-
const
|
|
288
|
-
const
|
|
293
|
+
const model = await getSpeakerEmbeddingPipeline();
|
|
294
|
+
const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
|
|
295
|
+
const output = await model(inputs);
|
|
296
|
+
const embData = output.embeddings.data;
|
|
289
297
|
const embedding = new Float32Array(512);
|
|
290
|
-
for (let i = 0; i < Math.min(512,
|
|
291
|
-
embedding[i] =
|
|
298
|
+
for (let i = 0; i < Math.min(512, embData.length); i++) {
|
|
299
|
+
embedding[i] = embData[i];
|
|
292
300
|
}
|
|
293
301
|
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
294
302
|
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
@@ -503,6 +511,24 @@ function cachePut(key, buf) {
|
|
|
503
511
|
ttsCacheBytes += buf.length;
|
|
504
512
|
}
|
|
505
513
|
|
|
514
|
+
/**
 * Maps a voice ID to a voice sample path for the pocket-tts sidecar.
 *
 * The sidecar module's lookup is consulted first; findCustomVoiceFile() is
 * only used when that lookup yields nothing.
 *
 * @param {string} voiceId - Voice identifier selected by the caller.
 * @returns {string|null|undefined} null for an empty or `default` voice,
 *   otherwise whatever the first successful lookup (or the last one) returns.
 */
function resolveVoicePath(voiceId) {
  const wantsDefaultVoice = !voiceId || voiceId === 'default';
  if (wantsDefaultVoice) return null;

  const sidecarMatch = pocket.findVoiceFile(voiceId);
  return sidecarMatch ? sidecarMatch : findCustomVoiceFile(voiceId);
}
|
|
518
|
+
|
|
519
|
+
/**
 * Tries to synthesize `text` through the pocket-tts sidecar.
 *
 * Never throws: any failure is logged and reported as null so the caller can
 * fall back to another TTS path.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} voiceId - Voice identifier, resolved via resolveVoicePath().
 * @returns {Promise<Buffer|null>} The sidecar's audio, or null when the
 *   sidecar is not healthy, the call failed, or the result is 44 bytes or
 *   shorter (presumably a header-only WAV — confirm against the sidecar).
 */
async function synthesizeViaPocket(text, voiceId) {
  if (!pocket.getState().healthy) return null;

  let audio = null;
  try {
    audio = await pocket.synthesize(text, resolveVoicePath(voiceId));
  } catch (err) {
    console.error('[TTS] pocket-tts failed, falling back:', err.message);
    return null;
  }
  return audio && audio.length > 44 ? audio : null;
}
|
|
531
|
+
|
|
506
532
|
async function synthesize(text, voiceId) {
|
|
507
533
|
const cacheKey = (voiceId || 'default') + ':' + text;
|
|
508
534
|
const cached = ttsCache.get(cacheKey);
|
|
@@ -514,6 +540,8 @@ async function synthesize(text, voiceId) {
|
|
|
514
540
|
const inflight = ttsInflight.get(cacheKey);
|
|
515
541
|
if (inflight) return inflight;
|
|
516
542
|
const promise = (async () => {
|
|
543
|
+
const pocketWav = await synthesizeViaPocket(text, voiceId);
|
|
544
|
+
if (pocketWav) { cachePut(cacheKey, pocketWav); return pocketWav; }
|
|
517
545
|
const tts = await getTTS();
|
|
518
546
|
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
519
547
|
const result = await tts(text, { speaker_embeddings: embeddings });
|
|
@@ -527,8 +555,12 @@ async function synthesize(text, voiceId) {
|
|
|
527
555
|
|
|
528
556
|
async function* synthesizeStream(text, voiceId) {
|
|
529
557
|
const sentences = splitSentences(text);
|
|
530
|
-
const
|
|
531
|
-
|
|
558
|
+
const usePocket = pocket.getState().healthy;
|
|
559
|
+
let tts, embeddings;
|
|
560
|
+
if (!usePocket) {
|
|
561
|
+
tts = await getTTS();
|
|
562
|
+
embeddings = await loadVoiceEmbedding(voiceId);
|
|
563
|
+
}
|
|
532
564
|
for (const sentence of sentences) {
|
|
533
565
|
const cacheKey = (voiceId || 'default') + ':' + sentence;
|
|
534
566
|
const cached = ttsCache.get(cacheKey);
|
|
@@ -538,6 +570,11 @@ async function* synthesizeStream(text, voiceId) {
|
|
|
538
570
|
yield cached;
|
|
539
571
|
continue;
|
|
540
572
|
}
|
|
573
|
+
if (usePocket) {
|
|
574
|
+
const pocketWav = await synthesizeViaPocket(sentence, voiceId);
|
|
575
|
+
if (pocketWav) { cachePut(cacheKey, pocketWav); yield pocketWav; continue; }
|
|
576
|
+
}
|
|
577
|
+
if (!tts) { tts = await getTTS(); embeddings = await loadVoiceEmbedding(voiceId); }
|
|
541
578
|
const result = await tts(sentence, { speaker_embeddings: embeddings });
|
|
542
579
|
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
543
580
|
cachePut(cacheKey, wav);
|
|
@@ -547,18 +584,30 @@ async function* synthesizeStream(text, voiceId) {
|
|
|
547
584
|
|
|
548
585
|
function getStatus() {
|
|
549
586
|
const ttsRetryExpired = ttsLoadError && (Date.now() - ttsLoadErrorTime >= TTS_ERROR_RETRY_MS);
|
|
587
|
+
const pState = pocket.getState();
|
|
550
588
|
return {
|
|
551
589
|
sttReady: !!sttPipeline,
|
|
552
|
-
ttsReady: !!ttsPipeline,
|
|
590
|
+
ttsReady: !!ttsPipeline || pState.healthy,
|
|
553
591
|
sttLoading,
|
|
554
592
|
ttsLoading,
|
|
555
593
|
sttError: sttLoadError ? sttLoadError.message : null,
|
|
556
|
-
ttsError: (ttsLoadError && !ttsRetryExpired) ? ttsLoadError.message : null,
|
|
594
|
+
ttsError: (ttsLoadError && !ttsRetryExpired && !pState.healthy) ? ttsLoadError.message : null,
|
|
595
|
+
pocketTts: pState,
|
|
557
596
|
};
|
|
558
597
|
}
|
|
559
598
|
|
|
560
599
|
function preloadTTS() {
|
|
561
|
-
|
|
600
|
+
const defaultVoice = findCustomVoiceFile('custom_cleetus') || '/config/voices/cleetus.wav';
|
|
601
|
+
const voicePath = fs.existsSync(defaultVoice) ? defaultVoice : null;
|
|
602
|
+
pocket.start(voicePath).then(ok => {
|
|
603
|
+
if (ok) console.log('[TTS] pocket-tts sidecar started');
|
|
604
|
+
else {
|
|
605
|
+
console.log('[TTS] pocket-tts unavailable, falling back to SpeechT5');
|
|
606
|
+
getTTS().catch(err => console.error('[TTS] SpeechT5 preload failed:', err.message));
|
|
607
|
+
}
|
|
608
|
+
}).catch(() => {
|
|
609
|
+
getTTS().catch(err => console.error('[TTS] SpeechT5 preload failed:', err.message));
|
|
610
|
+
});
|
|
562
611
|
}
|
|
563
612
|
|
|
564
613
|
function ttsCacheKey(text, voiceId) {
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -1414,7 +1414,7 @@ function onServerReady() {
|
|
|
1414
1414
|
// Recover stale active sessions from previous run
|
|
1415
1415
|
recoverStaleSessions();
|
|
1416
1416
|
|
|
1417
|
-
getSpeech().then(s => s.
|
|
1417
|
+
getSpeech().then(s => s.preloadTTS()).catch(e => debugLog('[TTS] Preload failed: ' + e.message));
|
|
1418
1418
|
|
|
1419
1419
|
performAutoImport();
|
|
1420
1420
|
|