agentgui 1.0.178 → 1.0.180
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/pocket-sidecar.js +190 -0
- package/lib/speech.js +31 -202
- package/package.json +1 -1
- package/server.js +1 -1
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import fs from 'fs';
|
|
4
|
+
import os from 'os';
|
|
5
|
+
import { fileURLToPath } from 'url';
|
|
6
|
+
import http from 'http';
|
|
7
|
+
|
|
8
|
+
// Package root: the directory one level above the folder containing this module.
const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');

// Executable of the pocket-tts CLI inside the bundled virtualenv under data/.
const POCKET_BIN = path.join(ROOT, 'data', 'pocket-venv', 'bin', 'pocket-tts');

// TCP port the sidecar is asked to serve on and that this module connects to.
const PORT = 8787;

// Voice name retried by start() when the requested voice does not come up.
const FALLBACK_VOICE = 'alba';

// Mutable bookkeeping for the single sidecar process managed by this module.
const state = {
  process: null, // ChildProcess currently tracked, or null
  port: PORT,
  status: 'stopped', // values written in this file: stopped | starting | running | error | unavailable
  pid: null,
  restartCount: 0, // drives the exponential backoff in scheduleRestart()
  failureCount: 0, // consecutive failed periodic health probes
  lastError: null,
  healthy: false, // outcome of the most recent health probe
  voicePath: null, // voice the running sidecar was started with
  starting: false, // guards against overlapping start() calls
  shutdownRequested: false, // set by stop() to suppress automatic restarts
  healthTimer: null, // setInterval handle of the periodic health probe
  restartTimer: null, // setTimeout handle of a pending restart
  voiceCloning: false,
};

// Also published on globalThis under __pocketSidecar.
globalThis.__pocketSidecar = state;
|
|
21
|
+
|
|
22
|
+
/**
 * Reports whether the pocket-tts executable is present on disk.
 * @returns {boolean} true when a file exists at POCKET_BIN.
 */
function isInstalled() {
  const binaryPresent = fs.existsSync(POCKET_BIN);
  return binaryPresent;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Locates the audio sample that backs a voice id.
 *
 * A leading "custom_" prefix is stripped, then `<name><ext>` is probed in each
 * search directory (STARTUP_CWD or cwd, package root, home directory and
 * /config, each with a `voices` sub-folder) for .wav, .mp3, .ogg and .flac,
 * in that order. The first existing file wins.
 *
 * @param {string} voiceId - Voice identifier, e.g. "custom_myvoice".
 * @returns {string|null} Path of the matching file, or null when the id is
 *   missing, not a string, "default", empty after prefix stripping, or when
 *   no file is found.
 */
function findVoiceFile(voiceId) {
  // Non-string ids used to crash on .replace(); treat them as "no voice".
  if (typeof voiceId !== 'string' || voiceId === '' || voiceId === 'default') return null;

  const baseName = voiceId.replace(/^custom_/, '');
  if (baseName === '') return null;

  const searchDirs = [
    path.join(process.env.STARTUP_CWD || process.cwd(), 'voices'),
    path.join(ROOT, 'voices'),
    path.join(os.homedir(), 'voices'),
    '/config/voices',
  ];
  const extensions = ['.wav', '.mp3', '.ogg', '.flac'];

  for (const dir of searchDirs) {
    for (const ext of extensions) {
      const candidate = path.join(dir, baseName + ext);
      // Reject ids such as "../secret" that would resolve outside the voice
      // directory; nested names inside the directory remain allowed.
      const relative = path.relative(dir, candidate);
      const escapesDir =
        relative === '..' || relative.startsWith('..' + path.sep) || path.isAbsolute(relative);
      if (escapesDir) continue;
      if (fs.existsSync(candidate)) return candidate;
    }
  }
  return null;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Probes the sidecar's /health endpoint on the loopback interface.
 *
 * The outcome is recorded in `state.healthy` and also returned. The promise
 * never rejects: any connection error, a timeout (3 s), a non-200 status or a
 * response that is cut off before completion all count as "unhealthy".
 *
 * @returns {Promise<boolean>} true only when the endpoint answered 200.
 */
function healthCheck() {
  return new Promise((resolve) => {
    let settled = false;
    // Several events can report the same failure (e.g. 'timeout' followed by
    // the 'error' raised by destroy()); only the first one counts.
    const settle = (healthy) => {
      if (settled) return;
      settled = true;
      state.healthy = healthy;
      resolve(healthy);
    };

    const req = http.get(`http://127.0.0.1:${PORT}/health`, { timeout: 3000 }, (res) => {
      res.resume(); // drain the body so 'end' can fire
      res.on('end', () => settle(res.statusCode === 200));
      // An aborted response never emits 'end'; without these the promise
      // would stay pending forever and stall waitForReady()/start().
      res.on('error', () => settle(false));
      res.on('close', () => settle(false));
    });
    req.on('error', () => settle(false));
    req.on('timeout', () => {
      req.destroy();
      settle(false);
    });
  });
}
|
|
49
|
+
|
|
50
|
+
/**
 * Asks the tracked sidecar process (if any) to terminate with SIGTERM and
 * resets the process-related fields of `state` to their "stopped" values.
 * Errors raised while signalling are deliberately ignored.
 */
function killProcess() {
  const child = state.process;
  if (child) {
    try {
      child.kill('SIGTERM');
    } catch (_) {
      // Best effort only: a failed signal must not prevent the state reset.
    }
  }
  Object.assign(state, { process: null, pid: null, healthy: false, status: 'stopped' });
}
|
|
54
|
+
|
|
55
|
+
/**
 * Kills the current sidecar and schedules a new start() after an exponential
 * backoff (1 s, 2 s, 4 s ... capped at 30 s, based on `state.restartCount`).
 *
 * Does nothing when shutdown was requested, or when a restart is already
 * pending: both the health timer and the process 'exit' handler can call
 * this for the same failure, and a second timer would start the sidecar
 * twice and inflate the backoff counter.
 */
function scheduleRestart() {
  if (state.shutdownRequested) return;
  if (state.restartTimer) return; // a restart is already queued

  killProcess();
  const delay = Math.min(1000 * Math.pow(2, state.restartCount), 30000);
  state.restartCount++;
  console.log(`[POCKET-TTS] Restart in ${delay}ms (attempt ${state.restartCount})`);
  state.restartTimer = setTimeout(() => {
    state.restartTimer = null;
    start(state.voicePath).catch((e) => console.error('[POCKET-TTS] Restart failed:', e.message));
  }, delay);
}
|
|
66
|
+
|
|
67
|
+
/**
 * Launches `pocket-tts serve` as a child process.
 *
 * The server is bound to the loopback interface because every client in this
 * module talks to 127.0.0.1; binding to 0.0.0.0 exposed the unauthenticated
 * TTS endpoint on all network interfaces. Set POCKET_TTS_HOST to override the
 * bind address when remote access is really wanted.
 *
 * @param {string|null|undefined} voice - Voice name or sample path passed via
 *   `--voice`; omitted from the command line when falsy.
 * @returns {import('child_process').ChildProcess} The spawned process, with
 *   stdin ignored and stdout/stderr piped.
 */
function spawnSidecar(voice) {
  const host = process.env.POCKET_TTS_HOST || '127.0.0.1';
  const args = ['serve', '--host', host, '--port', String(PORT)];
  if (voice) args.push('--voice', voice);
  console.log('[POCKET-TTS] Starting:', POCKET_BIN, args.join(' '));
  return spawn(POCKET_BIN, args, {
    stdio: ['ignore', 'pipe', 'pipe'],
    // Unbuffered Python output so log lines reach the stdout/stderr pipes promptly.
    env: { ...process.env, PYTHONUNBUFFERED: '1' },
  });
}
|
|
76
|
+
|
|
77
|
+
/**
 * Registers a freshly spawned sidecar as the tracked process and wires up
 * its output: stdout chunks go to console.log, stderr chunks to
 * console.error (both trimmed, empty chunks skipped, prefixed with
 * "[POCKET-TTS]"). Process-level errors are stored in `state.lastError`.
 *
 * @param {import('child_process').ChildProcess} proc - Process to track.
 */
function attachProc(proc) {
  Object.assign(state, { process: proc, pid: proc.pid, status: 'starting' });

  const relay = (emit) => (chunk) => {
    const line = chunk.toString().trim();
    if (line) emit('[POCKET-TTS]', line);
  };
  proc.stdout.on('data', relay((...parts) => console.log(...parts)));
  proc.stderr.on('data', relay((...parts) => console.error(...parts)));

  proc.on('error', (err) => {
    state.lastError = err.message;
  });
}
|
|
83
|
+
|
|
84
|
+
/**
 * Polls the health endpoint about once per second until the sidecar answers,
 * the process goes away, or the attempt budget is used up.
 *
 * Compared with a plain sleep-then-probe loop this version
 *  - stops as soon as the process exits or fails to spawn ('error' is emitted
 *    instead of 'exit' in that case) rather than sleeping out the interval,
 *  - never probes the port after the process is gone, so another server on
 *    the same port cannot be mistaken for this process,
 *  - removes its listeners again when done.
 *
 * @param {import('events').EventEmitter} proc - The spawned child process.
 * @param {number} timeoutSec - Maximum number of one-second attempts.
 * @returns {Promise<boolean>} true once a health probe succeeded, false on
 *   process exit/error or when all attempts failed.
 */
async function waitForReady(proc, timeoutSec) {
  let gone = false;
  let wakeUp = null;
  const onGone = () => {
    gone = true;
    if (wakeUp) wakeUp(); // cut the current sleep short
  };
  proc.once('exit', onGone);
  proc.once('error', onGone);

  try {
    for (let attempt = 0; attempt < timeoutSec; attempt++) {
      if (gone) return false;
      await new Promise((resolve) => {
        const timer = setTimeout(resolve, 1000);
        wakeUp = () => {
          clearTimeout(timer);
          resolve();
        };
      });
      wakeUp = null;
      if (gone) return false;
      if (await healthCheck()) return true;
    }
    return false;
  } finally {
    proc.removeListener('exit', onGone);
    proc.removeListener('error', onGone);
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Starts the pocket-tts sidecar and waits until it reports healthy.
 *
 * Flow: any tracked process is killed, a new one is spawned with the
 * requested voice (or the previously used one), and readiness is awaited for
 * up to 120 attempts. If that fails and the voice was not already the
 * fallback voice, a second attempt is made with FALLBACK_VOICE. On success a
 * process 'exit' handler and the periodic health timer are installed.
 *
 * Fixes over the earlier behaviour:
 *  - the failure status 'error' is set after killProcess(), which resets the
 *    status to 'stopped' and previously wiped it out;
 *  - the 'exit' handler ignores processes that are no longer the tracked one,
 *    so killing an old process can neither clear the bookkeeping of its
 *    replacement nor schedule a spurious restart;
 *  - when stop() is called while start-up is in progress, no fallback
 *    process is spawned and nothing is left running.
 *
 * @param {string|null|undefined} voicePath - Voice name or sample path; when
 *   falsy, `state.voicePath` is used.
 * @returns {Promise<boolean>} true when the sidecar is running and healthy
 *   (including the case where it already was); false when not installed,
 *   when another start is in progress, when shutdown was requested, or when
 *   start-up failed.
 */
async function start(voicePath) {
  if (!isInstalled()) {
    state.lastError = 'not installed';
    state.status = 'unavailable';
    return false;
  }
  if (state.starting) return false;
  if (state.status === 'running' && state.healthy) return true;

  // Voice names treated as predefined; any other value counts as voice cloning.
  const predefinedVoices = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma'];

  state.starting = true;
  state.shutdownRequested = false;
  const requestedVoice = voicePath || state.voicePath;
  try {
    killProcess();
    let proc = spawnSidecar(requestedVoice);
    attachProc(proc);
    let ready = await waitForReady(proc, 120);
    if (state.shutdownRequested) {
      // stop() ran while we were waiting: do not resurrect the sidecar.
      killProcess();
      return false;
    }

    if (!ready && requestedVoice && requestedVoice !== FALLBACK_VOICE) {
      console.log('[POCKET-TTS] Custom voice failed, trying predefined voice:', FALLBACK_VOICE);
      killProcess();
      proc = spawnSidecar(FALLBACK_VOICE);
      attachProc(proc);
      state.voiceCloning = false;
      ready = await waitForReady(proc, 120);
      if (state.shutdownRequested) {
        killProcess();
        return false;
      }
      if (ready) state.voicePath = FALLBACK_VOICE;
    } else if (ready) {
      state.voicePath = requestedVoice;
      state.voiceCloning = !!requestedVoice && !predefinedVoices.includes(requestedVoice);
    }

    if (!ready) {
      // Kill first: killProcess() resets status to 'stopped'.
      killProcess();
      state.lastError = 'Start timeout';
      state.status = 'error';
      return false;
    }

    state.status = 'running';
    state.restartCount = 0;
    state.failureCount = 0;
    state.lastError = null;

    proc.on('exit', (code, sig) => {
      console.log(`[POCKET-TTS] Exited: code=${code} signal=${sig}`);
      // A process we killed or replaced ourselves is not a crash.
      if (state.process !== proc) return;
      state.process = null;
      state.pid = null;
      state.healthy = false;
      state.status = 'stopped';
      if (!state.shutdownRequested) scheduleRestart();
    });

    if (!state.healthTimer) {
      // Probe every 10 s; three consecutive failures trigger a restart.
      state.healthTimer = setInterval(async () => {
        if (state.status !== 'running') return;
        const ok = await healthCheck();
        if (!ok && !state.shutdownRequested) {
          state.failureCount++;
          if (state.failureCount >= 3) scheduleRestart();
        } else if (ok) {
          state.failureCount = 0;
        }
      }, 10000);
    }

    console.log('[POCKET-TTS] Ready on port', PORT, '(voice cloning:', state.voiceCloning + ')');
    return true;
  } catch (err) {
    state.lastError = err.message;
    state.status = 'error';
    return false;
  } finally {
    state.starting = false;
  }
}
|
|
141
|
+
|
|
142
|
+
/**
 * Shuts the sidecar down: flags the shutdown so no automatic restart is
 * scheduled, cancels the periodic health probe and any pending restart, and
 * terminates the tracked process.
 *
 * @returns {Promise<void>}
 */
async function stop() {
  state.shutdownRequested = true;

  const { healthTimer, restartTimer } = state;
  if (healthTimer) {
    clearInterval(healthTimer);
    state.healthTimer = null;
  }
  if (restartTimer) {
    clearTimeout(restartTimer);
    state.restartTimer = null;
  }

  killProcess();
}
|
|
148
|
+
|
|
149
|
+
/**
 * Sends text to the sidecar's /tts endpoint as multipart/form-data and
 * returns the response body.
 *
 * A voice sample is attached (field "voice_wav") only when voice cloning is
 * active and `voicePath` differs from the voice the sidecar was started with.
 *
 * Fixes over the earlier behaviour: the promise now settles exactly once and
 * also when the response stream errors or is cut off (previously it stayed
 * pending forever), and the attachment filename is sanitised before being
 * placed in the part header.
 *
 * @param {string} text - Text to speak.
 * @param {string|null|undefined} voicePath - Optional path of a voice sample.
 * @returns {Promise<Buffer>} Raw response body (audio data).
 * @throws {Error} 'pocket-tts not ready' when the sidecar is not healthy; an
 *   error carrying the HTTP status and body for non-200 answers; a timeout
 *   error after 60 s; connection/stream errors; file-system errors when the
 *   voice sample cannot be read.
 */
async function synthesize(text, voicePath) {
  if (!state.healthy) throw new Error('pocket-tts not ready');

  const boundary = '----PocketTTS' + Date.now();
  const parts = [
    `--${boundary}\r\nContent-Disposition: form-data; name="text"\r\n\r\n${text}\r\n`,
  ];
  if (state.voiceCloning && voicePath && voicePath !== state.voicePath) {
    const data = fs.readFileSync(voicePath);
    // Quotes or line breaks in the name would corrupt the part header.
    const name = path.basename(voicePath).replace(/["\r\n]/g, '_');
    parts.push(
      `--${boundary}\r\nContent-Disposition: form-data; name="voice_wav"; filename="${name}"\r\nContent-Type: audio/wav\r\n\r\n`,
    );
    parts.push(data);
    parts.push('\r\n');
  }
  parts.push(`--${boundary}--\r\n`);
  const body = Buffer.concat(parts.map((p) => (Buffer.isBuffer(p) ? p : Buffer.from(p))));

  return new Promise((resolve, reject) => {
    let settled = false;
    const succeed = (value) => {
      if (settled) return;
      settled = true;
      resolve(value);
    };
    const fail = (err) => {
      if (settled) return;
      settled = true;
      reject(err);
    };

    const req = http.request({
      hostname: '127.0.0.1', port: PORT, path: '/tts', method: 'POST',
      headers: { 'Content-Type': `multipart/form-data; boundary=${boundary}`, 'Content-Length': body.length },
      timeout: 60000,
    }, (res) => {
      const chunks = [];
      res.on('data', (d) => chunks.push(d));
      res.on('error', fail);
      // 'close' without a complete message means the response was cut off.
      res.on('close', () => {
        if (!res.complete) fail(new Error('pocket-tts response aborted'));
      });
      res.on('end', () => {
        const payload = Buffer.concat(chunks);
        if (res.statusCode !== 200) {
          fail(new Error(`pocket-tts HTTP ${res.statusCode}: ${payload}`));
        } else {
          succeed(payload);
        }
      });
    });
    req.on('error', fail);
    req.on('timeout', () => {
      req.destroy();
      fail(new Error('pocket-tts timeout'));
    });
    req.write(body);
    req.end();
  });
}
|
|
181
|
+
|
|
182
|
+
/**
 * Builds a snapshot of the sidecar's bookkeeping for status reporting.
 * Internal handles (the process object and the timers) are not included;
 * `installed` is evaluated at call time.
 *
 * @returns {{status: string, healthy: boolean, pid: (number|null), port: number,
 *   restartCount: number, failureCount: number, lastError: (string|null),
 *   installed: boolean, voiceCloning: boolean}}
 */
function getState() {
  const {
    status, healthy, pid, port, restartCount, failureCount, lastError, voiceCloning,
  } = state;
  return {
    status,
    healthy,
    pid,
    port,
    restartCount,
    failureCount,
    lastError,
    installed: isInstalled(),
    voiceCloning,
  };
}
|
|
189
|
+
|
|
190
|
+
export { start, stop, synthesize, healthCheck, getState, isInstalled, findVoiceFile };
|
package/lib/speech.js
CHANGED
|
@@ -3,11 +3,11 @@ import fs from 'fs';
|
|
|
3
3
|
import os from 'os';
|
|
4
4
|
import path from 'path';
|
|
5
5
|
import { fileURLToPath } from 'url';
|
|
6
|
+
import * as pocket from './pocket-sidecar.js';
|
|
6
7
|
|
|
7
8
|
const require = createRequire(import.meta.url);
|
|
8
9
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
9
10
|
const ROOT = path.dirname(__dirname);
|
|
10
|
-
const DATA_DIR = path.join(ROOT, 'data');
|
|
11
11
|
const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
|
|
12
12
|
|
|
13
13
|
function getVoiceDirs() {
|
|
@@ -101,28 +101,11 @@ function getVoices() {
|
|
|
101
101
|
return [...BASE_VOICES, ...loadCustomVoices()];
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
-
const SPEAKER_OFFSETS = { awb: 0, bdl: 1200, clb: 2300, jmk: 3500, ksp: 4700, rms: 5900, slt: 7100 };
|
|
105
|
-
const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_embeddings/resolve/main/spkrec-xvectors-voxceleb.hf';
|
|
106
|
-
const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
|
|
107
|
-
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
|
|
108
|
-
const SAMPLES_TO_AVERAGE = 30;
|
|
109
|
-
const DEFAULT_EMBEDDING_B64 = 'xhibvao34LylqXQ8cNg7Pd1cCTw0keG8awRRvRqje7070G48AtOgPMFbnr1oeKC9I4ZuPZzqGT1DjWs8y3iMPB/SZLzdl7E6b9QaPKSpHTwYuh49FrMlO9YnebwmTzu9/3CPvQuvCbxsSWC9Sb2bO+tvXj0Cjpo8mTMxu/FDrjzQ4x09gyxCvUn6STxjAo+9vtXdPJtsYT3iMna9dQ+EvfQ72zuvxk69GAonPU8KdjsNPAU96e/8veN7lrwgyzk8HA5vvYE1Rz3gpZ484MsLPUKkxTzM54U81ECwvcbFHzv8gT08T6/7POCqBT2fv5E8fvsXPfZiJrzEhme8dg8kPR+mKTutQOU822maPMlMDb1x/IS93+6KvdyThzwhry880JBqvRVOhjzZods8SD08PLpObTn/0wk9BnAwvWiiz72EWgS9RpcjvV4VR73ZqJW9PoUFvfZYYb1h26S98levPHZbTjxH6qU9RPfoPHmJu70mSNo8ztJmvWgMBj0IX8i7TE3lPINY2DzoEma9wMObvTwKCT3pObe8t9KEvaWixjzc5fI8hj6MvaKv4Txl4h09d2a+PHCvTDxorJ69ekRrPeoPjz1JPfI7rUH7PIaJgz0O1YW9JLumvCxDnr1bmMm8GbIFPBX1oL3bRN08oYcXPEaFfL13Vxo9EKfbvTFcOTxdogA9XS3kPEWJoLvChc887BEgPMOvUT2Ba3s8tUDBvYPMZ72dNRG80AuTvQt7d72foTU9qO20O4INEb1u1iE9ibqJvZYaOj2nbYc8lsodvS5HPD1lCqK9EkBYPR0I/rySMIK9plcpPdpJEz2E/DY88d2DPIRTf71ZQZS9b1v5PPseFT2YiJu8OiOwPC8Wnr2QW4Q8n+o7PPQ8PD0QqAg9Vk7APDT6+jzreP88KH6GvTvAKD0AYiO9qOavvORySjvQ6y+9epb5PFvZijxYzlK9BwjUPK0HXL3acWc7dmwmPc/kXb2VBg68MGYRPR5q9zzmFiS9al2IvdVTfDwJOa88SzVkvVlrPD0WvJQ8Vm76PMUAQDzNgyK8QQZVPdMoibxrCBc9BgKTPDLoV70Iu6g7k+kBPZ3lhTy6sOU8OGkVvFaLRD14oqa9a4UVO4z4Gr1eYlO9u5BgPWS1ZL3kFPE8JGEwPQFTl71tHso8g+ElPd9Rgr2XCtc8axudvWC2IL09wSg9E7ZzPT6uBz2XmK09A1HcPJK8rTxK8Zu8GuMTPTuINTyRAhS9OSqDPDralLza3q48EgtePPf797rIWKo9NtkrvbO34zxKZ6m97l0GPQYVlL2igDA9UyfEPJhZyjx4/2Q8ggBpPYcAkzzIVu08ykYNPESdZr3uqmq8fS/zPKUYvzv67x49cUkqvXDlJj1us/88gASuvcs6G7sUshY9SgWiOqu4OD1WQ7k7/sLoPKuLJjwZYFm9an+zPOnfNry9Jh49/XX3vN1sc731fBM9TnBDPHzOAD26/dS9mg57vY+TA7wVJCw9pPb1PE30l7019la9UyRTPXFqljyRDnw9eZ6nvU03kTtS9907L+wavIBtab3k6cs8KVr6vPZ5zTxy+Zs8VuopPQTTUj0tNxg96qZyPY69lTzQEp48BXGJvVopBDvskUg9G2dOPaJMXDylJZU8FxcMvBQkNzzjPKs8FYUpvepYYj1AQsK9upQsvS4037xDcO48GhmIvWb1iT1gJhy9TG7iPHKAG70cuCQ8F1ZwPYqtj7300T89rTujPbXy2r3/cK69FtBNvY3iMT0DoqI4KK0QPYKEqr2Z6RU9ni0UPUNDLb3BsCi8+GttvZYp9zwUaHe9TqrFPOnlH7yCXJC9U8vDu8u2MjxA8xs9SAGxvPpphr29y2e9y2AYvTv+Eb1Elus9DdpGPSfmNL39Ggu85RVXPZbLh70Jvna7XkLGvR230DtGjpu7Ih8HPJKnIz1o35i8x5NVvXwFNDzs/ZM8+
kw8PfFJSTwdlJA9ZJ+tvaoVZ7zTvVi8p6wluwh/IT0Kmg088o1rPRhiwjxpWIe9a+LuvYuYtjwAxE09WkPJPBuFh73UotY820JjvXpnQD3fJ/w8TM3JPOz0pTnbTim9tpe6PBHzJT1HEb66SkAKPasLgr1l/Mm8IOGgvM2pZbzwd4a9znOIO4d4Bb1DW5I8EZXzOxvBKDqKpHG9UwCHvd/Epb2cDRi9V1ztPNPBNTrLXHa8FdGHPPo+hb3DnJ08G+SvvVPQBL6zzrC8Omksvc+eIjyvGfU8eG9nvaVkdL1HBvs8eaeGPfcbVD1/Pfw8+TUFvU6aTL2JN5W8HXDNvGKFEj1i+T09UiCIOySbDD2x2/y7VTmnvTe3gb0ZhJw8WrKIuU5RGT09mKU7eFGtPFpr6DzaoyI9hsItPKU+YzuQlXK8f9IePSmUxTwXdoo9W6FJPV2kLzwkU1o8fGnfPInxg70rEVe9H7sNPWJDbbxSqLY8cQAOPUdpAD2YknK9ykFXPeVALz1mq3W96kO/PLERzjyXIRC7jxsXPRnLzjyUEoU7gTKvu+stlb1D1g45IH+2u5sOIj0wXPA8yTqDvT6mV72NsFq8ExeuPJlGyDxvjgk9lJeJvWSF8DwFvaW7oZ9GvHq1Rr1FJsk83zxVvfyGqTz7thG9fslpPF5RPb1Q6BQ9iXGovTeDeb2cmic8oBsRPYeni72TPcI8EKcPvfCJUbyQJqW9fCAYPRk8qT2q6rk8mEw2PfDeXL0=';
|
|
110
|
-
|
|
111
104
|
let transformersModule = null;
|
|
112
105
|
let sttPipeline = null;
|
|
113
|
-
let ttsPipeline = null;
|
|
114
|
-
let speakerEmbeddings = null;
|
|
115
|
-
let speakerEmbeddingPipeline = null;
|
|
116
106
|
let sttLoading = false;
|
|
117
|
-
let ttsLoading = false;
|
|
118
|
-
let speakerEmbeddingLoading = false;
|
|
119
|
-
let ttsLoadError = null;
|
|
120
|
-
let ttsLoadErrorTime = 0;
|
|
121
107
|
let sttLoadError = null;
|
|
122
|
-
const voiceEmbeddingsCache = new Map();
|
|
123
108
|
const SAMPLE_RATE_STT = 16000;
|
|
124
|
-
const SAMPLE_RATE_TTS = 16000;
|
|
125
|
-
const TTS_ERROR_RETRY_MS = 30000;
|
|
126
109
|
|
|
127
110
|
const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
|
|
128
111
|
let ttsCacheBytes = 0;
|
|
@@ -144,109 +127,6 @@ function whisperModelPath() {
|
|
|
144
127
|
return 'onnx-community/whisper-base';
|
|
145
128
|
}
|
|
146
129
|
|
|
147
|
-
function defaultEmbedding() {
|
|
148
|
-
const buf = Buffer.from(DEFAULT_EMBEDDING_B64, 'base64');
|
|
149
|
-
return new Float32Array(new Uint8Array(buf).buffer);
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
async function ensureSpeakerEmbeddings() {
|
|
153
|
-
if (speakerEmbeddings) return speakerEmbeddings;
|
|
154
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
155
|
-
if (fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
|
|
156
|
-
const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
|
|
157
|
-
if (buf.length === 2048) {
|
|
158
|
-
speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
|
|
159
|
-
return speakerEmbeddings;
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
try {
|
|
163
|
-
const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
|
|
164
|
-
if (resp.ok) {
|
|
165
|
-
const data = Buffer.from(await resp.arrayBuffer());
|
|
166
|
-
if (data.length >= 2048) {
|
|
167
|
-
fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, data);
|
|
168
|
-
speakerEmbeddings = new Float32Array(new Uint8Array(data).buffer);
|
|
169
|
-
return speakerEmbeddings;
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
} catch (_) {}
|
|
173
|
-
console.log('[TTS] Using bundled default speaker embedding');
|
|
174
|
-
speakerEmbeddings = defaultEmbedding();
|
|
175
|
-
const buf = Buffer.from(speakerEmbeddings.buffer);
|
|
176
|
-
fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, buf);
|
|
177
|
-
return speakerEmbeddings;
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
async function loadVoiceEmbedding(voiceId) {
|
|
181
|
-
if (!voiceId || voiceId === 'default') return ensureSpeakerEmbeddings();
|
|
182
|
-
if (voiceEmbeddingsCache.has(voiceId)) return voiceEmbeddingsCache.get(voiceId);
|
|
183
|
-
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
184
|
-
if (fs.existsSync(binPath)) {
|
|
185
|
-
const buf = fs.readFileSync(binPath);
|
|
186
|
-
const emb = new Float32Array(new Uint8Array(buf).buffer);
|
|
187
|
-
voiceEmbeddingsCache.set(voiceId, emb);
|
|
188
|
-
return emb;
|
|
189
|
-
}
|
|
190
|
-
if (voiceId.startsWith('custom_')) {
|
|
191
|
-
return generateEmbeddingFromCustomVoice(voiceId);
|
|
192
|
-
}
|
|
193
|
-
const offset = SPEAKER_OFFSETS[voiceId];
|
|
194
|
-
if (offset === undefined) return ensureSpeakerEmbeddings();
|
|
195
|
-
try {
|
|
196
|
-
const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
|
|
197
|
-
const resp = await fetch(url);
|
|
198
|
-
if (!resp.ok) throw new Error('HTTP ' + resp.status);
|
|
199
|
-
const data = await resp.json();
|
|
200
|
-
const avg = new Float32Array(512);
|
|
201
|
-
let count = 0;
|
|
202
|
-
for (const item of data.rows) {
|
|
203
|
-
const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
|
|
204
|
-
if (match && match[1] === voiceId) {
|
|
205
|
-
for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
|
|
206
|
-
count++;
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
if (count === 0) return ensureSpeakerEmbeddings();
|
|
210
|
-
for (let i = 0; i < 512; i++) avg[i] /= count;
|
|
211
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
212
|
-
fs.writeFileSync(binPath, Buffer.from(avg.buffer));
|
|
213
|
-
voiceEmbeddingsCache.set(voiceId, avg);
|
|
214
|
-
return avg;
|
|
215
|
-
} catch (err) {
|
|
216
|
-
console.error('[TTS] Failed to fetch voice embedding for ' + voiceId + ':', err.message);
|
|
217
|
-
return ensureSpeakerEmbeddings();
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
let speakerFeatureExtractor = null;
|
|
222
|
-
|
|
223
|
-
async function getSpeakerEmbeddingPipeline() {
|
|
224
|
-
if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
|
|
225
|
-
if (speakerEmbeddingLoading) {
|
|
226
|
-
while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
|
|
227
|
-
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
|
|
228
|
-
return speakerEmbeddingPipeline;
|
|
229
|
-
}
|
|
230
|
-
speakerEmbeddingLoading = true;
|
|
231
|
-
try {
|
|
232
|
-
const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
|
|
233
|
-
env.allowRemoteModels = true;
|
|
234
|
-
const modelId = 'Xenova/wavlm-base-plus-sv';
|
|
235
|
-
speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
|
|
236
|
-
device: 'cpu',
|
|
237
|
-
dtype: 'fp32',
|
|
238
|
-
});
|
|
239
|
-
speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
|
|
240
|
-
return speakerEmbeddingPipeline;
|
|
241
|
-
} catch (err) {
|
|
242
|
-
speakerEmbeddingPipeline = null;
|
|
243
|
-
speakerFeatureExtractor = null;
|
|
244
|
-
throw new Error('Speaker embedding model load failed: ' + err.message);
|
|
245
|
-
} finally {
|
|
246
|
-
speakerEmbeddingLoading = false;
|
|
247
|
-
}
|
|
248
|
-
}
|
|
249
|
-
|
|
250
130
|
function findCustomVoiceFile(voiceId) {
|
|
251
131
|
const baseName = voiceId.replace(/^custom_/, '');
|
|
252
132
|
for (const dir of getVoiceDirs()) {
|
|
@@ -277,38 +157,6 @@ async function decodeAudioFile(filePath) {
|
|
|
277
157
|
return resampleTo16k(mono, audioBuffer.sampleRate);
|
|
278
158
|
}
|
|
279
159
|
|
|
280
|
-
async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
281
|
-
const audioFile = findCustomVoiceFile(voiceId);
|
|
282
|
-
if (!audioFile) {
|
|
283
|
-
console.error('[VOICES] Custom voice file not found for:', voiceId);
|
|
284
|
-
return ensureSpeakerEmbeddings();
|
|
285
|
-
}
|
|
286
|
-
try {
|
|
287
|
-
console.log('[VOICES] Generating embedding from:', audioFile);
|
|
288
|
-
const audio = await decodeAudioFile(audioFile);
|
|
289
|
-
if (audio.length < SAMPLE_RATE_STT * 0.5) {
|
|
290
|
-
throw new Error('Audio too short for embedding extraction');
|
|
291
|
-
}
|
|
292
|
-
const model = await getSpeakerEmbeddingPipeline();
|
|
293
|
-
const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
|
|
294
|
-
const output = await model(inputs);
|
|
295
|
-
const embData = output.embeddings.data;
|
|
296
|
-
const embedding = new Float32Array(512);
|
|
297
|
-
for (let i = 0; i < Math.min(512, embData.length); i++) {
|
|
298
|
-
embedding[i] = embData[i];
|
|
299
|
-
}
|
|
300
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
301
|
-
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
302
|
-
fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
|
|
303
|
-
voiceEmbeddingsCache.set(voiceId, embedding);
|
|
304
|
-
console.log('[VOICES] Generated embedding for custom voice:', voiceId);
|
|
305
|
-
return embedding;
|
|
306
|
-
} catch (err) {
|
|
307
|
-
console.error('[VOICES] Failed to generate embedding for', voiceId + ':', err.message);
|
|
308
|
-
return ensureSpeakerEmbeddings();
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
|
|
312
160
|
async function getSTT() {
|
|
313
161
|
if (sttPipeline) return sttPipeline;
|
|
314
162
|
if (sttLoadError) throw sttLoadError;
|
|
@@ -341,41 +189,6 @@ async function getSTT() {
|
|
|
341
189
|
}
|
|
342
190
|
}
|
|
343
191
|
|
|
344
|
-
async function getTTS() {
|
|
345
|
-
if (ttsPipeline) return ttsPipeline;
|
|
346
|
-
if (ttsLoadError) {
|
|
347
|
-
if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
|
|
348
|
-
ttsLoadError = null;
|
|
349
|
-
ttsLoadErrorTime = 0;
|
|
350
|
-
}
|
|
351
|
-
if (ttsLoading) {
|
|
352
|
-
while (ttsLoading) await new Promise(r => setTimeout(r, 100));
|
|
353
|
-
if (ttsLoadError) throw ttsLoadError;
|
|
354
|
-
if (!ttsPipeline) throw new Error('TTS pipeline failed to load');
|
|
355
|
-
return ttsPipeline;
|
|
356
|
-
}
|
|
357
|
-
ttsLoading = true;
|
|
358
|
-
try {
|
|
359
|
-
const { pipeline, env } = await loadTransformers();
|
|
360
|
-
env.allowRemoteModels = true;
|
|
361
|
-
ttsPipeline = await pipeline('text-to-speech', 'Xenova/speecht5_tts', {
|
|
362
|
-
device: 'cpu',
|
|
363
|
-
dtype: 'fp32',
|
|
364
|
-
});
|
|
365
|
-
await ensureSpeakerEmbeddings();
|
|
366
|
-
ttsLoadError = null;
|
|
367
|
-
ttsLoadErrorTime = 0;
|
|
368
|
-
return ttsPipeline;
|
|
369
|
-
} catch (err) {
|
|
370
|
-
ttsPipeline = null;
|
|
371
|
-
ttsLoadError = new Error('TTS model load failed: ' + err.message);
|
|
372
|
-
ttsLoadErrorTime = Date.now();
|
|
373
|
-
throw ttsLoadError;
|
|
374
|
-
} finally {
|
|
375
|
-
ttsLoading = false;
|
|
376
|
-
}
|
|
377
|
-
}
|
|
378
|
-
|
|
379
192
|
function decodeWavToFloat32(buffer) {
|
|
380
193
|
const view = new DataView(buffer.buffer || buffer);
|
|
381
194
|
const riff = String.fromCharCode(view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3));
|
|
@@ -510,6 +323,20 @@ function cachePut(key, buf) {
|
|
|
510
323
|
ttsCacheBytes += buf.length;
|
|
511
324
|
}
|
|
512
325
|
|
|
326
|
+
/**
 * Maps a voice id to the audio file backing it.
 *
 * @param {string|null|undefined} voiceId - Requested voice id.
 * @returns {string|null} null for a missing id or "default"; otherwise the
 *   sidecar module's lookup result, falling back to findCustomVoiceFile().
 */
function resolveVoicePath(voiceId) {
  const wantsDefault = !voiceId || voiceId === 'default';
  if (wantsDefault) return null;

  const fromSidecar = pocket.findVoiceFile(voiceId);
  return fromSidecar || findCustomVoiceFile(voiceId);
}
|
|
330
|
+
|
|
331
|
+
/**
 * Synthesizes speech through the pocket-tts sidecar.
 *
 * @param {string} text - Text to speak.
 * @param {string|null|undefined} voiceId - Voice id, resolved to a file path
 *   via resolveVoicePath().
 * @returns {Promise<Buffer>} The audio returned by the sidecar.
 * @throws {Error} 'pocket-tts not healthy' when the sidecar reports
 *   unhealthy; 'pocket-tts returned empty audio' when the result is missing
 *   or at most 44 bytes long (presumably a bare WAV header; TODO confirm).
 */
async function synthesizeViaPocket(text, voiceId) {
  const { healthy } = pocket.getState();
  if (!healthy) throw new Error('pocket-tts not healthy');

  const voicePath = resolveVoicePath(voiceId);
  const wav = await pocket.synthesize(text, voicePath);

  const hasAudio = Boolean(wav) && wav.length > 44;
  if (!hasAudio) throw new Error('pocket-tts returned empty audio');
  return wav;
}
|
|
339
|
+
|
|
513
340
|
async function synthesize(text, voiceId) {
|
|
514
341
|
const cacheKey = (voiceId || 'default') + ':' + text;
|
|
515
342
|
const cached = ttsCache.get(cacheKey);
|
|
@@ -521,10 +348,7 @@ async function synthesize(text, voiceId) {
|
|
|
521
348
|
const inflight = ttsInflight.get(cacheKey);
|
|
522
349
|
if (inflight) return inflight;
|
|
523
350
|
const promise = (async () => {
|
|
524
|
-
const
|
|
525
|
-
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
526
|
-
const result = await tts(text, { speaker_embeddings: embeddings });
|
|
527
|
-
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
351
|
+
const wav = await synthesizeViaPocket(text, voiceId);
|
|
528
352
|
cachePut(cacheKey, wav);
|
|
529
353
|
return wav;
|
|
530
354
|
})();
|
|
@@ -534,8 +358,6 @@ async function synthesize(text, voiceId) {
|
|
|
534
358
|
|
|
535
359
|
async function* synthesizeStream(text, voiceId) {
|
|
536
360
|
const sentences = splitSentences(text);
|
|
537
|
-
const tts = await getTTS();
|
|
538
|
-
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
539
361
|
for (const sentence of sentences) {
|
|
540
362
|
const cacheKey = (voiceId || 'default') + ':' + sentence;
|
|
541
363
|
const cached = ttsCache.get(cacheKey);
|
|
@@ -545,27 +367,34 @@ async function* synthesizeStream(text, voiceId) {
|
|
|
545
367
|
yield cached;
|
|
546
368
|
continue;
|
|
547
369
|
}
|
|
548
|
-
const
|
|
549
|
-
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
370
|
+
const wav = await synthesizeViaPocket(sentence, voiceId);
|
|
550
371
|
cachePut(cacheKey, wav);
|
|
551
372
|
yield wav;
|
|
552
373
|
}
|
|
553
374
|
}
|
|
554
375
|
|
|
555
376
|
/**
 * Reports readiness of the speech subsystems.
 *
 * TTS readiness and errors come from the pocket-tts sidecar state: `ttsReady`
 * mirrors its `healthy` flag, `ttsLoading` is always false, and `ttsError` is
 * null while healthy, otherwise the sidecar's last error or a generic
 * "not running" message. The full sidecar state is included as `pocketTts`.
 *
 * @returns {{sttReady: boolean, ttsReady: boolean, sttLoading: boolean,
 *   ttsLoading: boolean, sttError: (string|null), ttsError: (string|null),
 *   pocketTts: object}}
 */
function getStatus() {
  const pState = pocket.getState();

  let ttsError = null;
  if (!pState.healthy) {
    ttsError = pState.lastError || 'pocket-tts not running';
  }

  let sttError = null;
  if (sttLoadError) {
    sttError = sttLoadError.message;
  }

  return {
    sttReady: !!sttPipeline,
    ttsReady: pState.healthy,
    sttLoading,
    ttsLoading: false,
    sttError,
    ttsError,
    pocketTts: pState,
  };
}
|
|
566
388
|
|
|
567
389
|
/**
 * Kicks off the pocket-tts sidecar in the background (fire and forget).
 *
 * The "cleetus" custom voice is used when its file can be found, either via
 * findCustomVoiceFile() or at /config/voices/cleetus.wav; otherwise the
 * sidecar is started without an explicit voice. The outcome is only logged.
 */
function preloadTTS() {
  const candidate = findCustomVoiceFile('custom_cleetus') || '/config/voices/cleetus.wav';
  const voicePath = fs.existsSync(candidate) ? candidate : null;

  pocket
    .start(voicePath)
    .then((ok) => {
      if (ok) {
        console.log('[TTS] pocket-tts sidecar started');
      } else {
        console.log('[TTS] pocket-tts failed to start');
      }
    })
    .catch((err) => {
      console.error('[TTS] pocket-tts start error:', err.message);
    });
}
|
|
570
399
|
|
|
571
400
|
function ttsCacheKey(text, voiceId) {
|
|
@@ -578,4 +407,4 @@ function ttsCacheGet(key) {
|
|
|
578
407
|
return cached || null;
|
|
579
408
|
}
|
|
580
409
|
|
|
581
|
-
export { transcribe, synthesize, synthesizeStream, getSTT,
|
|
410
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|
package/package.json
CHANGED
package/server.js
CHANGED
|
@@ -1414,7 +1414,7 @@ function onServerReady() {
|
|
|
1414
1414
|
// Recover stale active sessions from previous run
|
|
1415
1415
|
recoverStaleSessions();
|
|
1416
1416
|
|
|
1417
|
-
getSpeech().then(s => s.
|
|
1417
|
+
getSpeech().then(s => s.preloadTTS()).catch(e => debugLog('[TTS] Preload failed: ' + e.message));
|
|
1418
1418
|
|
|
1419
1419
|
performAutoImport();
|
|
1420
1420
|
|