agentgui 1.0.179 → 1.0.181
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/pocket-sidecar.js +38 -7
- package/lib/speech.js +14 -227
- package/package.json +1 -1
package/lib/pocket-sidecar.js
CHANGED
|
@@ -6,7 +6,6 @@ import { fileURLToPath } from 'url';
|
|
|
6
6
|
import http from 'http';
|
|
7
7
|
|
|
8
8
|
const ROOT = path.dirname(path.dirname(fileURLToPath(import.meta.url)));
|
|
9
|
-
const POCKET_BIN = path.join(ROOT, 'data', 'pocket-venv', 'bin', 'pocket-tts');
|
|
10
9
|
const PORT = 8787;
|
|
11
10
|
|
|
12
11
|
const FALLBACK_VOICE = 'alba';
|
|
@@ -15,11 +14,21 @@ const state = {
|
|
|
15
14
|
restartCount: 0, failureCount: 0, lastError: null,
|
|
16
15
|
healthy: false, voicePath: null, starting: false,
|
|
17
16
|
shutdownRequested: false, healthTimer: null, restartTimer: null,
|
|
18
|
-
voiceCloning: false,
|
|
17
|
+
voiceCloning: false, adopted: false,
|
|
19
18
|
};
|
|
20
19
|
globalThis.__pocketSidecar = state;
|
|
21
20
|
|
|
22
|
-
function
|
|
21
|
+
/**
 * Locate the pocket-tts executable on disk.
 *
 * Three virtual-environment directories are probed, in priority order:
 *   1. `<ROOT>/data/pocket-venv`
 *   2. `/config/workspace/agentgui/data/pocket-venv`
 *   3. `<home>/.gmgui/pocket-venv`
 *
 * The POSIX venv layout (`bin/pocket-tts`) is tried for all three directories
 * first, so every path that was found before still wins. The Windows venv
 * layout (`Scripts/pocket-tts.exe`) is tried afterwards so an install created
 * on Windows is detected as well.
 *
 * @returns {string|null} Path of the first candidate that exists, or `null`
 *   when none of them does.
 */
function findBinary() {
  const venvDirs = [
    path.join(ROOT, 'data', 'pocket-venv'),
    '/config/workspace/agentgui/data/pocket-venv',
    path.join(os.homedir(), '.gmgui', 'pocket-venv'),
  ];
  const candidates = [
    ...venvDirs.map((dir) => path.join(dir, 'bin', 'pocket-tts')),
    // Python venvs on Windows place console scripts under Scripts\ as .exe launchers.
    ...venvDirs.map((dir) => path.join(dir, 'Scripts', 'pocket-tts.exe')),
  ];
  return candidates.find((candidate) => fs.existsSync(candidate)) ?? null;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Report whether a pocket-tts binary can be located.
 *
 * @returns {boolean} `true` when `findBinary()` yields a truthy path.
 */
function isInstalled() {
  const located = findBinary();
  return Boolean(located);
}
|
|
23
32
|
|
|
24
33
|
function findVoiceFile(voiceId) {
|
|
25
34
|
if (!voiceId || voiceId === 'default') return null;
|
|
@@ -54,21 +63,24 @@ function killProcess() {
|
|
|
54
63
|
|
|
55
64
|
function scheduleRestart() {
|
|
56
65
|
if (state.shutdownRequested) return;
|
|
57
|
-
killProcess();
|
|
66
|
+
if (!state.adopted) killProcess();
|
|
58
67
|
const delay = Math.min(1000 * Math.pow(2, state.restartCount), 30000);
|
|
59
68
|
state.restartCount++;
|
|
60
69
|
console.log(`[POCKET-TTS] Restart in ${delay}ms (attempt ${state.restartCount})`);
|
|
61
70
|
state.restartTimer = setTimeout(() => {
|
|
62
71
|
state.restartTimer = null;
|
|
72
|
+
state.adopted = false;
|
|
63
73
|
start(state.voicePath).catch(e => console.error('[POCKET-TTS] Restart failed:', e.message));
|
|
64
74
|
}, delay);
|
|
65
75
|
}
|
|
66
76
|
|
|
67
77
|
function spawnSidecar(voice) {
|
|
78
|
+
const bin = findBinary();
|
|
79
|
+
if (!bin) throw new Error('pocket-tts binary not found');
|
|
68
80
|
const args = ['serve', '--host', '0.0.0.0', '--port', String(PORT)];
|
|
69
81
|
if (voice) args.push('--voice', voice);
|
|
70
|
-
console.log('[POCKET-TTS] Starting:',
|
|
71
|
-
return spawn(
|
|
82
|
+
console.log('[POCKET-TTS] Starting:', bin, args.join(' '));
|
|
83
|
+
return spawn(bin, args, {
|
|
72
84
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
73
85
|
env: { ...process.env, PYTHONUNBUFFERED: '1' },
|
|
74
86
|
});
|
|
@@ -92,10 +104,29 @@ async function waitForReady(proc, timeoutSec) {
|
|
|
92
104
|
return false;
|
|
93
105
|
}
|
|
94
106
|
|
|
107
|
+
/**
 * Adopt a pocket-tts server that is already answering health checks instead
 * of spawning a new one.
 *
 * On success the shared `state` is marked running/healthy/adopted, the
 * restart and failure counters and `lastError` are reset, and, if no health
 * timer exists yet, a 10-second health poll is installed.
 *
 * The poll counts consecutive failed probes. On the third failure it:
 *   - clears `state.healthy`, so `start()`'s "already running and healthy"
 *     fast path cannot short-circuit the restart and status reporting stops
 *     claiming the service is ready;
 *   - clears `state.adopted`, so `scheduleRestart()` is free to call
 *     `killProcess()`;
 *   - calls `scheduleRestart()`, unless a restart timer is already pending
 *     (avoids stacking timers and inflating the backoff counter while the
 *     status is still 'running').
 *
 * @returns {Promise<boolean>} `true` when an existing instance was adopted,
 *   `false` when the initial health check did not pass.
 */
async function adoptRunning() {
  if (!(await healthCheck())) return false;

  state.status = 'running';
  state.healthy = true;
  state.adopted = true;
  state.restartCount = 0;
  state.failureCount = 0;
  state.lastError = null;

  if (!state.healthTimer) {
    state.healthTimer = setInterval(async () => {
      if (state.status !== 'running') return;

      let ok = false;
      try {
        ok = await healthCheck();
      } catch (err) {
        // A rejected probe counts as a failed probe; letting it escape the
        // timer callback would surface as an unhandled rejection.
        console.error('[POCKET-TTS] Health check error:', err.message);
      }

      if (ok) {
        state.failureCount = 0;
        return;
      }
      if (state.shutdownRequested) return;

      state.failureCount++;
      if (state.failureCount < 3) return;

      state.healthy = false;
      state.adopted = false;
      if (!state.restartTimer) scheduleRestart();
    }, 10000);
  }

  console.log('[POCKET-TTS] Adopted existing instance on port', PORT);
  return true;
}
|
|
124
|
+
|
|
95
125
|
async function start(voicePath) {
|
|
96
|
-
if (!isInstalled()) { state.lastError = 'not installed'; state.status = 'unavailable'; return false; }
|
|
97
126
|
if (state.starting) return false;
|
|
98
127
|
if (state.status === 'running' && state.healthy) return true;
|
|
128
|
+
if (await adoptRunning()) return true;
|
|
129
|
+
if (!isInstalled()) { state.lastError = 'not installed'; state.status = 'unavailable'; return false; }
|
|
99
130
|
state.starting = true; state.shutdownRequested = false;
|
|
100
131
|
const requestedVoice = voicePath || state.voicePath;
|
|
101
132
|
try {
|
package/lib/speech.js
CHANGED
|
@@ -8,7 +8,6 @@ import * as pocket from './pocket-sidecar.js';
|
|
|
8
8
|
const require = createRequire(import.meta.url);
|
|
9
9
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
10
|
const ROOT = path.dirname(__dirname);
|
|
11
|
-
const DATA_DIR = path.join(ROOT, 'data');
|
|
12
11
|
const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
|
|
13
12
|
|
|
14
13
|
function getVoiceDirs() {
|
|
@@ -102,28 +101,11 @@ function getVoices() {
|
|
|
102
101
|
return [...BASE_VOICES, ...loadCustomVoices()];
|
|
103
102
|
}
|
|
104
103
|
|
|
105
|
-
const SPEAKER_OFFSETS = { awb: 0, bdl: 1200, clb: 2300, jmk: 3500, ksp: 4700, rms: 5900, slt: 7100 };
|
|
106
|
-
const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/speaker_embeddings/resolve/main/spkrec-xvectors-voxceleb.hf';
|
|
107
|
-
const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
|
|
108
|
-
const DATASET_API = 'https://datasets-server.huggingface.co/rows?dataset=Xenova%2Fspeaker_embeddings&config=default&split=train';
|
|
109
|
-
const SAMPLES_TO_AVERAGE = 30;
|
|
110
|
-
const DEFAULT_EMBEDDING_B64 = 'xhibvao34LylqXQ8cNg7Pd1cCTw0keG8awRRvRqje7070G48AtOgPMFbnr1oeKC9I4ZuPZzqGT1DjWs8y3iMPB/SZLzdl7E6b9QaPKSpHTwYuh49FrMlO9YnebwmTzu9/3CPvQuvCbxsSWC9Sb2bO+tvXj0Cjpo8mTMxu/FDrjzQ4x09gyxCvUn6STxjAo+9vtXdPJtsYT3iMna9dQ+EvfQ72zuvxk69GAonPU8KdjsNPAU96e/8veN7lrwgyzk8HA5vvYE1Rz3gpZ484MsLPUKkxTzM54U81ECwvcbFHzv8gT08T6/7POCqBT2fv5E8fvsXPfZiJrzEhme8dg8kPR+mKTutQOU822maPMlMDb1x/IS93+6KvdyThzwhry880JBqvRVOhjzZods8SD08PLpObTn/0wk9BnAwvWiiz72EWgS9RpcjvV4VR73ZqJW9PoUFvfZYYb1h26S98levPHZbTjxH6qU9RPfoPHmJu70mSNo8ztJmvWgMBj0IX8i7TE3lPINY2DzoEma9wMObvTwKCT3pObe8t9KEvaWixjzc5fI8hj6MvaKv4Txl4h09d2a+PHCvTDxorJ69ekRrPeoPjz1JPfI7rUH7PIaJgz0O1YW9JLumvCxDnr1bmMm8GbIFPBX1oL3bRN08oYcXPEaFfL13Vxo9EKfbvTFcOTxdogA9XS3kPEWJoLvChc887BEgPMOvUT2Ba3s8tUDBvYPMZ72dNRG80AuTvQt7d72foTU9qO20O4INEb1u1iE9ibqJvZYaOj2nbYc8lsodvS5HPD1lCqK9EkBYPR0I/rySMIK9plcpPdpJEz2E/DY88d2DPIRTf71ZQZS9b1v5PPseFT2YiJu8OiOwPC8Wnr2QW4Q8n+o7PPQ8PD0QqAg9Vk7APDT6+jzreP88KH6GvTvAKD0AYiO9qOavvORySjvQ6y+9epb5PFvZijxYzlK9BwjUPK0HXL3acWc7dmwmPc/kXb2VBg68MGYRPR5q9zzmFiS9al2IvdVTfDwJOa88SzVkvVlrPD0WvJQ8Vm76PMUAQDzNgyK8QQZVPdMoibxrCBc9BgKTPDLoV70Iu6g7k+kBPZ3lhTy6sOU8OGkVvFaLRD14oqa9a4UVO4z4Gr1eYlO9u5BgPWS1ZL3kFPE8JGEwPQFTl71tHso8g+ElPd9Rgr2XCtc8axudvWC2IL09wSg9E7ZzPT6uBz2XmK09A1HcPJK8rTxK8Zu8GuMTPTuINTyRAhS9OSqDPDralLza3q48EgtePPf797rIWKo9NtkrvbO34zxKZ6m97l0GPQYVlL2igDA9UyfEPJhZyjx4/2Q8ggBpPYcAkzzIVu08ykYNPESdZr3uqmq8fS/zPKUYvzv67x49cUkqvXDlJj1us/88gASuvcs6G7sUshY9SgWiOqu4OD1WQ7k7/sLoPKuLJjwZYFm9an+zPOnfNry9Jh49/XX3vN1sc731fBM9TnBDPHzOAD26/dS9mg57vY+TA7wVJCw9pPb1PE30l7019la9UyRTPXFqljyRDnw9eZ6nvU03kTtS9907L+wavIBtab3k6cs8KVr6vPZ5zTxy+Zs8VuopPQTTUj0tNxg96qZyPY69lTzQEp48BXGJvVopBDvskUg9G2dOPaJMXDylJZU8FxcMvBQkNzzjPKs8FYUpvepYYj1AQsK9upQsvS4037xDcO48GhmIvWb1iT1gJhy9TG7iPHKAG70cuCQ8F1ZwPYqtj7300T89rTujPbXy2r3/cK69FtBNvY3iMT0DoqI4KK0QPYKEqr2Z6RU9ni0UPUNDLb3BsCi8+GttvZYp9zwUaHe9TqrFPOnlH7yCXJC9U8vDu8u2MjxA8xs9SAGxvPpphr29y2e9y2AYvTv+Eb1Elus9DdpGPSfmNL39Ggu85RVXPZbLh70Jvna7XkLGvR230DtGjpu7Ih8HPJKnIz1o35i8x5NVvXwFNDzs/ZM8+
kw8PfFJSTwdlJA9ZJ+tvaoVZ7zTvVi8p6wluwh/IT0Kmg088o1rPRhiwjxpWIe9a+LuvYuYtjwAxE09WkPJPBuFh73UotY820JjvXpnQD3fJ/w8TM3JPOz0pTnbTim9tpe6PBHzJT1HEb66SkAKPasLgr1l/Mm8IOGgvM2pZbzwd4a9znOIO4d4Bb1DW5I8EZXzOxvBKDqKpHG9UwCHvd/Epb2cDRi9V1ztPNPBNTrLXHa8FdGHPPo+hb3DnJ08G+SvvVPQBL6zzrC8Omksvc+eIjyvGfU8eG9nvaVkdL1HBvs8eaeGPfcbVD1/Pfw8+TUFvU6aTL2JN5W8HXDNvGKFEj1i+T09UiCIOySbDD2x2/y7VTmnvTe3gb0ZhJw8WrKIuU5RGT09mKU7eFGtPFpr6DzaoyI9hsItPKU+YzuQlXK8f9IePSmUxTwXdoo9W6FJPV2kLzwkU1o8fGnfPInxg70rEVe9H7sNPWJDbbxSqLY8cQAOPUdpAD2YknK9ykFXPeVALz1mq3W96kO/PLERzjyXIRC7jxsXPRnLzjyUEoU7gTKvu+stlb1D1g45IH+2u5sOIj0wXPA8yTqDvT6mV72NsFq8ExeuPJlGyDxvjgk9lJeJvWSF8DwFvaW7oZ9GvHq1Rr1FJsk83zxVvfyGqTz7thG9fslpPF5RPb1Q6BQ9iXGovTeDeb2cmic8oBsRPYeni72TPcI8EKcPvfCJUbyQJqW9fCAYPRk8qT2q6rk8mEw2PfDeXL0=';
|
|
111
|
-
|
|
112
104
|
let transformersModule = null;
|
|
113
105
|
let sttPipeline = null;
|
|
114
|
-
let ttsPipeline = null;
|
|
115
|
-
let speakerEmbeddings = null;
|
|
116
|
-
let speakerEmbeddingPipeline = null;
|
|
117
106
|
let sttLoading = false;
|
|
118
|
-
let ttsLoading = false;
|
|
119
|
-
let speakerEmbeddingLoading = false;
|
|
120
|
-
let ttsLoadError = null;
|
|
121
|
-
let ttsLoadErrorTime = 0;
|
|
122
107
|
let sttLoadError = null;
|
|
123
|
-
const voiceEmbeddingsCache = new Map();
|
|
124
108
|
const SAMPLE_RATE_STT = 16000;
|
|
125
|
-
const SAMPLE_RATE_TTS = 16000;
|
|
126
|
-
const TTS_ERROR_RETRY_MS = 30000;
|
|
127
109
|
|
|
128
110
|
const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
|
|
129
111
|
let ttsCacheBytes = 0;
|
|
@@ -145,109 +127,6 @@ function whisperModelPath() {
|
|
|
145
127
|
return 'onnx-community/whisper-base';
|
|
146
128
|
}
|
|
147
129
|
|
|
148
|
-
function defaultEmbedding() {
|
|
149
|
-
const buf = Buffer.from(DEFAULT_EMBEDDING_B64, 'base64');
|
|
150
|
-
return new Float32Array(new Uint8Array(buf).buffer);
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
async function ensureSpeakerEmbeddings() {
|
|
154
|
-
if (speakerEmbeddings) return speakerEmbeddings;
|
|
155
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
156
|
-
if (fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
|
|
157
|
-
const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
|
|
158
|
-
if (buf.length === 2048) {
|
|
159
|
-
speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
|
|
160
|
-
return speakerEmbeddings;
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
try {
|
|
164
|
-
const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
|
|
165
|
-
if (resp.ok) {
|
|
166
|
-
const data = Buffer.from(await resp.arrayBuffer());
|
|
167
|
-
if (data.length >= 2048) {
|
|
168
|
-
fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, data);
|
|
169
|
-
speakerEmbeddings = new Float32Array(new Uint8Array(data).buffer);
|
|
170
|
-
return speakerEmbeddings;
|
|
171
|
-
}
|
|
172
|
-
}
|
|
173
|
-
} catch (_) {}
|
|
174
|
-
console.log('[TTS] Using bundled default speaker embedding');
|
|
175
|
-
speakerEmbeddings = defaultEmbedding();
|
|
176
|
-
const buf = Buffer.from(speakerEmbeddings.buffer);
|
|
177
|
-
fs.writeFileSync(SPEAKER_EMBEDDINGS_PATH, buf);
|
|
178
|
-
return speakerEmbeddings;
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
async function loadVoiceEmbedding(voiceId) {
|
|
182
|
-
if (!voiceId || voiceId === 'default') return ensureSpeakerEmbeddings();
|
|
183
|
-
if (voiceEmbeddingsCache.has(voiceId)) return voiceEmbeddingsCache.get(voiceId);
|
|
184
|
-
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
185
|
-
if (fs.existsSync(binPath)) {
|
|
186
|
-
const buf = fs.readFileSync(binPath);
|
|
187
|
-
const emb = new Float32Array(new Uint8Array(buf).buffer);
|
|
188
|
-
voiceEmbeddingsCache.set(voiceId, emb);
|
|
189
|
-
return emb;
|
|
190
|
-
}
|
|
191
|
-
if (voiceId.startsWith('custom_')) {
|
|
192
|
-
return generateEmbeddingFromCustomVoice(voiceId);
|
|
193
|
-
}
|
|
194
|
-
const offset = SPEAKER_OFFSETS[voiceId];
|
|
195
|
-
if (offset === undefined) return ensureSpeakerEmbeddings();
|
|
196
|
-
try {
|
|
197
|
-
const url = `${DATASET_API}&offset=${offset}&length=${SAMPLES_TO_AVERAGE}`;
|
|
198
|
-
const resp = await fetch(url);
|
|
199
|
-
if (!resp.ok) throw new Error('HTTP ' + resp.status);
|
|
200
|
-
const data = await resp.json();
|
|
201
|
-
const avg = new Float32Array(512);
|
|
202
|
-
let count = 0;
|
|
203
|
-
for (const item of data.rows) {
|
|
204
|
-
const match = item.row.filename.match(/cmu_us_(\w+)_arctic/);
|
|
205
|
-
if (match && match[1] === voiceId) {
|
|
206
|
-
for (let i = 0; i < 512; i++) avg[i] += item.row.xvector[i];
|
|
207
|
-
count++;
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
if (count === 0) return ensureSpeakerEmbeddings();
|
|
211
|
-
for (let i = 0; i < 512; i++) avg[i] /= count;
|
|
212
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
213
|
-
fs.writeFileSync(binPath, Buffer.from(avg.buffer));
|
|
214
|
-
voiceEmbeddingsCache.set(voiceId, avg);
|
|
215
|
-
return avg;
|
|
216
|
-
} catch (err) {
|
|
217
|
-
console.error('[TTS] Failed to fetch voice embedding for ' + voiceId + ':', err.message);
|
|
218
|
-
return ensureSpeakerEmbeddings();
|
|
219
|
-
}
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
let speakerFeatureExtractor = null;
|
|
223
|
-
|
|
224
|
-
async function getSpeakerEmbeddingPipeline() {
|
|
225
|
-
if (speakerEmbeddingPipeline) return speakerEmbeddingPipeline;
|
|
226
|
-
if (speakerEmbeddingLoading) {
|
|
227
|
-
while (speakerEmbeddingLoading) await new Promise(r => setTimeout(r, 100));
|
|
228
|
-
if (!speakerEmbeddingPipeline) throw new Error('Speaker embedding model failed to load');
|
|
229
|
-
return speakerEmbeddingPipeline;
|
|
230
|
-
}
|
|
231
|
-
speakerEmbeddingLoading = true;
|
|
232
|
-
try {
|
|
233
|
-
const { AutoModelForXVector, AutoFeatureExtractor, env } = await loadTransformers();
|
|
234
|
-
env.allowRemoteModels = true;
|
|
235
|
-
const modelId = 'Xenova/wavlm-base-plus-sv';
|
|
236
|
-
speakerEmbeddingPipeline = await AutoModelForXVector.from_pretrained(modelId, {
|
|
237
|
-
device: 'cpu',
|
|
238
|
-
dtype: 'fp32',
|
|
239
|
-
});
|
|
240
|
-
speakerFeatureExtractor = await AutoFeatureExtractor.from_pretrained(modelId);
|
|
241
|
-
return speakerEmbeddingPipeline;
|
|
242
|
-
} catch (err) {
|
|
243
|
-
speakerEmbeddingPipeline = null;
|
|
244
|
-
speakerFeatureExtractor = null;
|
|
245
|
-
throw new Error('Speaker embedding model load failed: ' + err.message);
|
|
246
|
-
} finally {
|
|
247
|
-
speakerEmbeddingLoading = false;
|
|
248
|
-
}
|
|
249
|
-
}
|
|
250
|
-
|
|
251
130
|
function findCustomVoiceFile(voiceId) {
|
|
252
131
|
const baseName = voiceId.replace(/^custom_/, '');
|
|
253
132
|
for (const dir of getVoiceDirs()) {
|
|
@@ -278,38 +157,6 @@ async function decodeAudioFile(filePath) {
|
|
|
278
157
|
return resampleTo16k(mono, audioBuffer.sampleRate);
|
|
279
158
|
}
|
|
280
159
|
|
|
281
|
-
async function generateEmbeddingFromCustomVoice(voiceId) {
|
|
282
|
-
const audioFile = findCustomVoiceFile(voiceId);
|
|
283
|
-
if (!audioFile) {
|
|
284
|
-
console.error('[VOICES] Custom voice file not found for:', voiceId);
|
|
285
|
-
return ensureSpeakerEmbeddings();
|
|
286
|
-
}
|
|
287
|
-
try {
|
|
288
|
-
console.log('[VOICES] Generating embedding from:', audioFile);
|
|
289
|
-
const audio = await decodeAudioFile(audioFile);
|
|
290
|
-
if (audio.length < SAMPLE_RATE_STT * 0.5) {
|
|
291
|
-
throw new Error('Audio too short for embedding extraction');
|
|
292
|
-
}
|
|
293
|
-
const model = await getSpeakerEmbeddingPipeline();
|
|
294
|
-
const inputs = await speakerFeatureExtractor(audio, { sampling_rate: SAMPLE_RATE_STT });
|
|
295
|
-
const output = await model(inputs);
|
|
296
|
-
const embData = output.embeddings.data;
|
|
297
|
-
const embedding = new Float32Array(512);
|
|
298
|
-
for (let i = 0; i < Math.min(512, embData.length); i++) {
|
|
299
|
-
embedding[i] = embData[i];
|
|
300
|
-
}
|
|
301
|
-
if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });
|
|
302
|
-
const binPath = path.join(DATA_DIR, `speaker_${voiceId}.bin`);
|
|
303
|
-
fs.writeFileSync(binPath, Buffer.from(embedding.buffer));
|
|
304
|
-
voiceEmbeddingsCache.set(voiceId, embedding);
|
|
305
|
-
console.log('[VOICES] Generated embedding for custom voice:', voiceId);
|
|
306
|
-
return embedding;
|
|
307
|
-
} catch (err) {
|
|
308
|
-
console.error('[VOICES] Failed to generate embedding for', voiceId + ':', err.message);
|
|
309
|
-
return ensureSpeakerEmbeddings();
|
|
310
|
-
}
|
|
311
|
-
}
|
|
312
|
-
|
|
313
160
|
async function getSTT() {
|
|
314
161
|
if (sttPipeline) return sttPipeline;
|
|
315
162
|
if (sttLoadError) throw sttLoadError;
|
|
@@ -342,41 +189,6 @@ async function getSTT() {
|
|
|
342
189
|
}
|
|
343
190
|
}
|
|
344
191
|
|
|
345
|
-
async function getTTS() {
|
|
346
|
-
if (ttsPipeline) return ttsPipeline;
|
|
347
|
-
if (ttsLoadError) {
|
|
348
|
-
if (Date.now() - ttsLoadErrorTime < TTS_ERROR_RETRY_MS) throw ttsLoadError;
|
|
349
|
-
ttsLoadError = null;
|
|
350
|
-
ttsLoadErrorTime = 0;
|
|
351
|
-
}
|
|
352
|
-
if (ttsLoading) {
|
|
353
|
-
while (ttsLoading) await new Promise(r => setTimeout(r, 100));
|
|
354
|
-
if (ttsLoadError) throw ttsLoadError;
|
|
355
|
-
if (!ttsPipeline) throw new Error('TTS pipeline failed to load');
|
|
356
|
-
return ttsPipeline;
|
|
357
|
-
}
|
|
358
|
-
ttsLoading = true;
|
|
359
|
-
try {
|
|
360
|
-
const { pipeline, env } = await loadTransformers();
|
|
361
|
-
env.allowRemoteModels = true;
|
|
362
|
-
ttsPipeline = await pipeline('text-to-speech', 'Xenova/speecht5_tts', {
|
|
363
|
-
device: 'cpu',
|
|
364
|
-
dtype: 'fp32',
|
|
365
|
-
});
|
|
366
|
-
await ensureSpeakerEmbeddings();
|
|
367
|
-
ttsLoadError = null;
|
|
368
|
-
ttsLoadErrorTime = 0;
|
|
369
|
-
return ttsPipeline;
|
|
370
|
-
} catch (err) {
|
|
371
|
-
ttsPipeline = null;
|
|
372
|
-
ttsLoadError = new Error('TTS model load failed: ' + err.message);
|
|
373
|
-
ttsLoadErrorTime = Date.now();
|
|
374
|
-
throw ttsLoadError;
|
|
375
|
-
} finally {
|
|
376
|
-
ttsLoading = false;
|
|
377
|
-
}
|
|
378
|
-
}
|
|
379
|
-
|
|
380
192
|
function decodeWavToFloat32(buffer) {
|
|
381
193
|
const view = new DataView(buffer.buffer || buffer);
|
|
382
194
|
const riff = String.fromCharCode(view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3));
|
|
@@ -518,15 +330,11 @@ function resolveVoicePath(voiceId) {
|
|
|
518
330
|
|
|
519
331
|
async function synthesizeViaPocket(text, voiceId) {
|
|
520
332
|
const pState = pocket.getState();
|
|
521
|
-
if (!pState.healthy)
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
} catch (err) {
|
|
527
|
-
console.error('[TTS] pocket-tts failed, falling back:', err.message);
|
|
528
|
-
}
|
|
529
|
-
return null;
|
|
333
|
+
if (!pState.healthy) throw new Error('pocket-tts not healthy');
|
|
334
|
+
const voicePath = resolveVoicePath(voiceId);
|
|
335
|
+
const wav = await pocket.synthesize(text, voicePath);
|
|
336
|
+
if (wav && wav.length > 44) return wav;
|
|
337
|
+
throw new Error('pocket-tts returned empty audio');
|
|
530
338
|
}
|
|
531
339
|
|
|
532
340
|
async function synthesize(text, voiceId) {
|
|
@@ -540,12 +348,7 @@ async function synthesize(text, voiceId) {
|
|
|
540
348
|
const inflight = ttsInflight.get(cacheKey);
|
|
541
349
|
if (inflight) return inflight;
|
|
542
350
|
const promise = (async () => {
|
|
543
|
-
const
|
|
544
|
-
if (pocketWav) { cachePut(cacheKey, pocketWav); return pocketWav; }
|
|
545
|
-
const tts = await getTTS();
|
|
546
|
-
const embeddings = await loadVoiceEmbedding(voiceId);
|
|
547
|
-
const result = await tts(text, { speaker_embeddings: embeddings });
|
|
548
|
-
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
351
|
+
const wav = await synthesizeViaPocket(text, voiceId);
|
|
549
352
|
cachePut(cacheKey, wav);
|
|
550
353
|
return wav;
|
|
551
354
|
})();
|
|
@@ -555,12 +358,6 @@ async function synthesize(text, voiceId) {
|
|
|
555
358
|
|
|
556
359
|
async function* synthesizeStream(text, voiceId) {
|
|
557
360
|
const sentences = splitSentences(text);
|
|
558
|
-
const usePocket = pocket.getState().healthy;
|
|
559
|
-
let tts, embeddings;
|
|
560
|
-
if (!usePocket) {
|
|
561
|
-
tts = await getTTS();
|
|
562
|
-
embeddings = await loadVoiceEmbedding(voiceId);
|
|
563
|
-
}
|
|
564
361
|
for (const sentence of sentences) {
|
|
565
362
|
const cacheKey = (voiceId || 'default') + ':' + sentence;
|
|
566
363
|
const cached = ttsCache.get(cacheKey);
|
|
@@ -570,28 +367,21 @@ async function* synthesizeStream(text, voiceId) {
|
|
|
570
367
|
yield cached;
|
|
571
368
|
continue;
|
|
572
369
|
}
|
|
573
|
-
|
|
574
|
-
const pocketWav = await synthesizeViaPocket(sentence, voiceId);
|
|
575
|
-
if (pocketWav) { cachePut(cacheKey, pocketWav); yield pocketWav; continue; }
|
|
576
|
-
}
|
|
577
|
-
if (!tts) { tts = await getTTS(); embeddings = await loadVoiceEmbedding(voiceId); }
|
|
578
|
-
const result = await tts(sentence, { speaker_embeddings: embeddings });
|
|
579
|
-
const wav = encodeWav(result.audio, result.sampling_rate || SAMPLE_RATE_TTS);
|
|
370
|
+
const wav = await synthesizeViaPocket(sentence, voiceId);
|
|
580
371
|
cachePut(cacheKey, wav);
|
|
581
372
|
yield wav;
|
|
582
373
|
}
|
|
583
374
|
}
|
|
584
375
|
|
|
585
376
|
function getStatus() {
|
|
586
|
-
const ttsRetryExpired = ttsLoadError && (Date.now() - ttsLoadErrorTime >= TTS_ERROR_RETRY_MS);
|
|
587
377
|
const pState = pocket.getState();
|
|
588
378
|
return {
|
|
589
379
|
sttReady: !!sttPipeline,
|
|
590
|
-
ttsReady:
|
|
380
|
+
ttsReady: pState.healthy,
|
|
591
381
|
sttLoading,
|
|
592
|
-
ttsLoading,
|
|
382
|
+
ttsLoading: false,
|
|
593
383
|
sttError: sttLoadError ? sttLoadError.message : null,
|
|
594
|
-
ttsError:
|
|
384
|
+
ttsError: pState.healthy ? null : (pState.lastError || 'pocket-tts not running'),
|
|
595
385
|
pocketTts: pState,
|
|
596
386
|
};
|
|
597
387
|
}
|
|
@@ -601,12 +391,9 @@ function preloadTTS() {
|
|
|
601
391
|
const voicePath = fs.existsSync(defaultVoice) ? defaultVoice : null;
|
|
602
392
|
pocket.start(voicePath).then(ok => {
|
|
603
393
|
if (ok) console.log('[TTS] pocket-tts sidecar started');
|
|
604
|
-
else
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
}
|
|
608
|
-
}).catch(() => {
|
|
609
|
-
getTTS().catch(err => console.error('[TTS] SpeechT5 preload failed:', err.message));
|
|
394
|
+
else console.log('[TTS] pocket-tts failed to start');
|
|
395
|
+
}).catch(err => {
|
|
396
|
+
console.error('[TTS] pocket-tts start error:', err.message);
|
|
610
397
|
});
|
|
611
398
|
}
|
|
612
399
|
|
|
@@ -620,4 +407,4 @@ function ttsCacheGet(key) {
|
|
|
620
407
|
return cached || null;
|
|
621
408
|
}
|
|
622
409
|
|
|
623
|
-
export { transcribe, synthesize, synthesizeStream, getSTT,
|
|
410
|
+
export { transcribe, synthesize, synthesizeStream, getSTT, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|