agentgui 1.0.190 → 1.0.192
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +102 -361
- package/package.json +2 -2
- package/server.js +1 -1
- package/static/index.html +13 -0
- package/lib/pocket-sidecar.js +0 -221
package/lib/speech.js
CHANGED
|
@@ -1,404 +1,139 @@
|
|
|
1
1
|
import { createRequire } from 'module';
|
|
2
2
|
import fs from 'fs';
|
|
3
|
-
import os from 'os';
|
|
4
3
|
import path from 'path';
|
|
4
|
+
import http from 'http';
|
|
5
5
|
import { fileURLToPath } from 'url';
|
|
6
|
-
import * as pocket from './pocket-sidecar.js';
|
|
7
6
|
|
|
8
7
|
const require = createRequire(import.meta.url);
|
|
9
8
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
9
|
const ROOT = path.dirname(__dirname);
|
|
11
|
-
const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
|
|
12
10
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
const seen = new Set();
|
|
16
|
-
const add = (d) => { const r = path.resolve(d); if (!seen.has(r)) { seen.add(r); dirs.push(r); } };
|
|
17
|
-
const startupCwd = process.env.STARTUP_CWD || process.cwd();
|
|
18
|
-
add(path.join(startupCwd, 'voices'));
|
|
19
|
-
add(path.join(ROOT, 'voices'));
|
|
20
|
-
add(path.join(os.homedir(), 'voices'));
|
|
21
|
-
return dirs;
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
const MIN_WAV_SIZE = 1000;
|
|
25
|
-
|
|
26
|
-
const BASE_VOICES = [
|
|
27
|
-
{ id: 'default', name: 'Default', gender: 'male', accent: 'US' },
|
|
28
|
-
{ id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
|
|
29
|
-
{ id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
|
|
30
|
-
{ id: 'clb', name: 'CLB', gender: 'female', accent: 'US' },
|
|
31
|
-
{ id: 'rms', name: 'RMS', gender: 'male', accent: 'US' },
|
|
32
|
-
{ id: 'awb', name: 'AWB', gender: 'male', accent: 'Scottish' },
|
|
33
|
-
{ id: 'jmk', name: 'JMK', gender: 'male', accent: 'Canadian' },
|
|
34
|
-
{ id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
|
|
35
|
-
];
|
|
36
|
-
|
|
37
|
-
async function convertToWav(filePath) {
|
|
38
|
-
const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
|
|
39
|
-
if (fs.existsSync(wavPath)) return wavPath;
|
|
40
|
-
try {
|
|
41
|
-
console.log('[VOICES] Converting to WAV:', filePath);
|
|
42
|
-
const audio = await decodeAudioFile(filePath);
|
|
43
|
-
const wav = encodeWav(audio, SAMPLE_RATE_STT);
|
|
44
|
-
fs.writeFileSync(wavPath, wav);
|
|
45
|
-
console.log('[VOICES] Converted:', path.basename(wavPath));
|
|
46
|
-
return wavPath;
|
|
47
|
-
} catch (err) {
|
|
48
|
-
console.error('[VOICES] Conversion failed for', filePath + ':', err.message);
|
|
49
|
-
return null;
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
const pendingConversions = new Map();
|
|
54
|
-
|
|
55
|
-
function scanVoiceDir(dir) {
|
|
56
|
-
const voices = [];
|
|
57
|
-
try {
|
|
58
|
-
if (!fs.existsSync(dir)) return voices;
|
|
59
|
-
const listed = new Set();
|
|
60
|
-
for (const file of fs.readdirSync(dir)) {
|
|
61
|
-
const ext = path.extname(file).toLowerCase();
|
|
62
|
-
if (!AUDIO_EXTENSIONS.includes(ext)) continue;
|
|
63
|
-
const baseName = path.basename(file, ext);
|
|
64
|
-
if (ext !== '.wav') {
|
|
65
|
-
const wavExists = fs.existsSync(path.join(dir, baseName + '.wav'));
|
|
66
|
-
if (wavExists) continue;
|
|
67
|
-
const fullPath = path.join(dir, file);
|
|
68
|
-
if (!pendingConversions.has(fullPath)) {
|
|
69
|
-
pendingConversions.set(fullPath, convertToWav(fullPath).then(result => {
|
|
70
|
-
pendingConversions.delete(fullPath);
|
|
71
|
-
return result;
|
|
72
|
-
}));
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
if (listed.has(baseName)) continue;
|
|
76
|
-
listed.add(baseName);
|
|
77
|
-
const id = 'custom_' + baseName.replace(/[^a-zA-Z0-9_-]/g, '_');
|
|
78
|
-
const name = baseName.replace(/_/g, ' ');
|
|
79
|
-
voices.push({ id, name, gender: 'custom', accent: 'custom', isCustom: true, sourceDir: dir });
|
|
80
|
-
}
|
|
81
|
-
} catch (err) {
|
|
82
|
-
console.error('[VOICES] Error scanning', dir + ':', err.message);
|
|
83
|
-
}
|
|
84
|
-
return voices;
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
function loadCustomVoices() {
|
|
88
|
-
const seen = new Set();
|
|
89
|
-
const voices = [];
|
|
90
|
-
for (const dir of getVoiceDirs()) {
|
|
91
|
-
for (const v of scanVoiceDir(dir)) {
|
|
92
|
-
if (seen.has(v.id)) continue;
|
|
93
|
-
seen.add(v.id);
|
|
94
|
-
voices.push(v);
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
return voices;
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
function getVoices() {
|
|
101
|
-
return [...BASE_VOICES, ...loadCustomVoices()];
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
let transformersModule = null;
|
|
105
|
-
let sttPipeline = null;
|
|
106
|
-
let sttLoading = false;
|
|
107
|
-
let sttLoadError = null;
|
|
108
|
-
let sttLoadErrorTime = 0;
|
|
109
|
-
const STT_RETRY_MS = 30000;
|
|
110
|
-
const SAMPLE_RATE_STT = 16000;
|
|
11
|
+
const serverSTT = require('webtalk/server-stt');
|
|
12
|
+
const serverTTS = require('webtalk/server-tts');
|
|
111
13
|
|
|
112
|
-
const
|
|
113
|
-
|
|
114
|
-
const ttsCache = new Map();
|
|
115
|
-
const ttsInflight = new Map();
|
|
14
|
+
const EXTRA_VOICE_DIRS = [path.join(ROOT, 'voices')];
|
|
15
|
+
const TTS_PORT = 8787;
|
|
116
16
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
const PERSISTENT_CACHE = path.join(os.homedir(), '.gmgui', 'models');
|
|
17
|
+
const TTS_CACHE_MAX = 10 * 1024 * 1024;
|
|
18
|
+
let cacheBytes = 0;
|
|
19
|
+
const cache = new Map();
|
|
20
|
+
const inflight = new Map();
|
|
124
21
|
|
|
125
|
-
function
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
const p = path.join(webtalkDir, 'models', 'onnx-community', 'whisper-base');
|
|
129
|
-
if (fs.existsSync(p)) return p;
|
|
130
|
-
} catch (_) {}
|
|
131
|
-
const cached = path.join(PERSISTENT_CACHE, 'onnx-community', 'whisper-base');
|
|
132
|
-
if (fs.existsSync(cached)) return cached;
|
|
133
|
-
return 'onnx-community/whisper-base';
|
|
22
|
+
function resolveVoice(voiceId) {
|
|
23
|
+
if (!voiceId || voiceId === 'default') return null;
|
|
24
|
+
return serverTTS.findVoiceFile(voiceId, EXTRA_VOICE_DIRS);
|
|
134
25
|
}
|
|
135
26
|
|
|
136
|
-
function
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
27
|
+
function cachePut(key, buf) {
|
|
28
|
+
if (cache.has(key)) { cacheBytes -= cache.get(key).length; cache.delete(key); }
|
|
29
|
+
while (cacheBytes + buf.length > TTS_CACHE_MAX && cache.size > 0) {
|
|
30
|
+
const oldest = cache.keys().next().value;
|
|
31
|
+
cacheBytes -= cache.get(oldest).length;
|
|
32
|
+
cache.delete(oldest);
|
|
33
|
+
}
|
|
34
|
+
cache.set(key, buf);
|
|
35
|
+
cacheBytes += buf.length;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
function sendToPocket(text, voicePath) {
|
|
39
|
+
return new Promise((resolve, reject) => {
|
|
40
|
+
const boundary = '----PocketTTS' + Date.now();
|
|
41
|
+
const parts = [];
|
|
42
|
+
parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="text"\r\n\r\n${text}\r\n`);
|
|
43
|
+
if (voicePath) {
|
|
44
|
+
const data = fs.readFileSync(voicePath);
|
|
45
|
+
const name = path.basename(voicePath);
|
|
46
|
+
parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="voice_wav"; filename="${name}"\r\nContent-Type: audio/wav\r\n\r\n`);
|
|
47
|
+
parts.push(data);
|
|
48
|
+
parts.push('\r\n');
|
|
142
49
|
}
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
}
|
|
160
|
-
const decode = (await import('audio-decode')).default;
|
|
161
|
-
const audioBuffer = await decode(buf);
|
|
162
|
-
const mono = audioBuffer.getChannelData(0);
|
|
163
|
-
return resampleTo16k(mono, audioBuffer.sampleRate);
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
async function getSTT() {
|
|
167
|
-
if (sttPipeline) return sttPipeline;
|
|
168
|
-
if (sttLoadError && (Date.now() - sttLoadErrorTime < STT_RETRY_MS)) throw sttLoadError;
|
|
169
|
-
if (sttLoading) {
|
|
170
|
-
while (sttLoading) await new Promise(r => setTimeout(r, 100));
|
|
171
|
-
if (sttLoadError && (Date.now() - sttLoadErrorTime < STT_RETRY_MS)) throw sttLoadError;
|
|
172
|
-
if (!sttPipeline) throw new Error('STT pipeline failed to load');
|
|
173
|
-
return sttPipeline;
|
|
174
|
-
}
|
|
175
|
-
sttLoading = true;
|
|
176
|
-
try {
|
|
177
|
-
const { pipeline, env } = await loadTransformers();
|
|
178
|
-
const modelPath = whisperModelPath();
|
|
179
|
-
const isLocal = !modelPath.includes('/') || fs.existsSync(modelPath);
|
|
180
|
-
env.allowLocalModels = true;
|
|
181
|
-
env.allowRemoteModels = !isLocal;
|
|
182
|
-
env.cacheDir = PERSISTENT_CACHE;
|
|
183
|
-
if (isLocal) env.localModelPath = '';
|
|
184
|
-
sttPipeline = await pipeline('automatic-speech-recognition', modelPath, {
|
|
185
|
-
device: 'cpu',
|
|
186
|
-
cache_dir: PERSISTENT_CACHE,
|
|
187
|
-
local_files_only: isLocal,
|
|
50
|
+
parts.push(`--${boundary}--\r\n`);
|
|
51
|
+
const body = Buffer.concat(parts.map(p => Buffer.isBuffer(p) ? p : Buffer.from(p)));
|
|
52
|
+
const req = http.request({
|
|
53
|
+
hostname: '127.0.0.1', port: TTS_PORT, path: '/tts', method: 'POST',
|
|
54
|
+
headers: { 'Content-Type': `multipart/form-data; boundary=${boundary}`, 'Content-Length': body.length },
|
|
55
|
+
timeout: 60000,
|
|
56
|
+
}, res => {
|
|
57
|
+
if (res.statusCode !== 200) {
|
|
58
|
+
let e = '';
|
|
59
|
+
res.on('data', d => e += d);
|
|
60
|
+
res.on('end', () => reject(new Error(`pocket-tts HTTP ${res.statusCode}: ${e}`)));
|
|
61
|
+
return;
|
|
62
|
+
}
|
|
63
|
+
const chunks = [];
|
|
64
|
+
res.on('data', d => chunks.push(d));
|
|
65
|
+
res.on('end', () => resolve(Buffer.concat(chunks)));
|
|
188
66
|
});
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
sttLoadErrorTime = Date.now();
|
|
195
|
-
throw sttLoadError;
|
|
196
|
-
} finally {
|
|
197
|
-
sttLoading = false;
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
function decodeWavToFloat32(buffer) {
|
|
202
|
-
const view = new DataView(buffer.buffer || buffer);
|
|
203
|
-
const riff = String.fromCharCode(view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3));
|
|
204
|
-
if (riff !== 'RIFF') throw new Error('Not a WAV file');
|
|
205
|
-
const numChannels = view.getUint16(22, true);
|
|
206
|
-
const sampleRate = view.getUint32(24, true);
|
|
207
|
-
const bitsPerSample = view.getUint16(34, true);
|
|
208
|
-
let dataOffset = 44;
|
|
209
|
-
for (let i = 36; i < view.byteLength - 8; i++) {
|
|
210
|
-
if (view.getUint8(i) === 0x64 && view.getUint8(i+1) === 0x61 &&
|
|
211
|
-
view.getUint8(i+2) === 0x74 && view.getUint8(i+3) === 0x61) {
|
|
212
|
-
dataOffset = i + 8;
|
|
213
|
-
break;
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
const bytesPerSample = bitsPerSample / 8;
|
|
217
|
-
const numSamples = Math.floor((view.byteLength - dataOffset) / (bytesPerSample * numChannels));
|
|
218
|
-
const audio = new Float32Array(numSamples);
|
|
219
|
-
for (let i = 0; i < numSamples; i++) {
|
|
220
|
-
const offset = dataOffset + i * bytesPerSample * numChannels;
|
|
221
|
-
if (bitsPerSample === 16) {
|
|
222
|
-
audio[i] = view.getInt16(offset, true) / 32768;
|
|
223
|
-
} else if (bitsPerSample === 32) {
|
|
224
|
-
audio[i] = view.getFloat32(offset, true);
|
|
225
|
-
} else {
|
|
226
|
-
audio[i] = (view.getUint8(offset) - 128) / 128;
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
return { audio, sampleRate };
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
function resampleTo16k(audio, fromRate) {
|
|
233
|
-
if (fromRate === SAMPLE_RATE_STT) return audio;
|
|
234
|
-
const ratio = fromRate / SAMPLE_RATE_STT;
|
|
235
|
-
const newLen = Math.round(audio.length / ratio);
|
|
236
|
-
const result = new Float32Array(newLen);
|
|
237
|
-
for (let i = 0; i < newLen; i++) {
|
|
238
|
-
const srcIdx = i * ratio;
|
|
239
|
-
const lo = Math.floor(srcIdx);
|
|
240
|
-
const hi = Math.min(lo + 1, audio.length - 1);
|
|
241
|
-
const frac = srcIdx - lo;
|
|
242
|
-
result[i] = audio[lo] * (1 - frac) + audio[hi] * frac;
|
|
243
|
-
}
|
|
244
|
-
return result;
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
function encodeWav(float32Audio, sampleRate) {
|
|
248
|
-
const numSamples = float32Audio.length;
|
|
249
|
-
const bytesPerSample = 2;
|
|
250
|
-
const dataSize = numSamples * bytesPerSample;
|
|
251
|
-
const buffer = new ArrayBuffer(44 + dataSize);
|
|
252
|
-
const view = new DataView(buffer);
|
|
253
|
-
const writeStr = (off, str) => { for (let i = 0; i < str.length; i++) view.setUint8(off + i, str.charCodeAt(i)); };
|
|
254
|
-
writeStr(0, 'RIFF');
|
|
255
|
-
view.setUint32(4, 36 + dataSize, true);
|
|
256
|
-
writeStr(8, 'WAVE');
|
|
257
|
-
writeStr(12, 'fmt ');
|
|
258
|
-
view.setUint32(16, 16, true);
|
|
259
|
-
view.setUint16(20, 1, true);
|
|
260
|
-
view.setUint16(22, 1, true);
|
|
261
|
-
view.setUint32(24, sampleRate, true);
|
|
262
|
-
view.setUint32(28, sampleRate * bytesPerSample, true);
|
|
263
|
-
view.setUint16(32, bytesPerSample, true);
|
|
264
|
-
view.setUint16(34, 16, true);
|
|
265
|
-
writeStr(36, 'data');
|
|
266
|
-
view.setUint32(40, dataSize, true);
|
|
267
|
-
for (let i = 0; i < numSamples; i++) {
|
|
268
|
-
const s = Math.max(-1, Math.min(1, float32Audio[i]));
|
|
269
|
-
view.setInt16(44 + i * 2, s < 0 ? s * 32768 : s * 32767, true);
|
|
270
|
-
}
|
|
271
|
-
return Buffer.from(buffer);
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
async function transcribe(audioBuffer) {
|
|
275
|
-
const buf = Buffer.isBuffer(audioBuffer) ? audioBuffer : Buffer.from(audioBuffer);
|
|
276
|
-
if (buf.length < MIN_WAV_SIZE) {
|
|
277
|
-
throw new Error('Audio too short (' + buf.length + ' bytes)');
|
|
278
|
-
}
|
|
279
|
-
let audio;
|
|
280
|
-
const isWav = buf.length > 4 && buf.toString('ascii', 0, 4) === 'RIFF';
|
|
281
|
-
if (isWav) {
|
|
282
|
-
let decoded;
|
|
283
|
-
try {
|
|
284
|
-
decoded = decodeWavToFloat32(buf);
|
|
285
|
-
} catch (err) {
|
|
286
|
-
throw new Error('WAV decode failed: ' + err.message);
|
|
287
|
-
}
|
|
288
|
-
if (!decoded.audio || decoded.audio.length === 0) {
|
|
289
|
-
throw new Error('WAV contains no audio samples');
|
|
290
|
-
}
|
|
291
|
-
audio = resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
292
|
-
} else {
|
|
293
|
-
const sampleCount = Math.floor(buf.byteLength / 4);
|
|
294
|
-
if (sampleCount === 0) throw new Error('Audio buffer too small');
|
|
295
|
-
const aligned = new ArrayBuffer(sampleCount * 4);
|
|
296
|
-
new Uint8Array(aligned).set(buf.subarray(0, sampleCount * 4));
|
|
297
|
-
audio = new Float32Array(aligned);
|
|
298
|
-
}
|
|
299
|
-
if (audio.length < 100) {
|
|
300
|
-
throw new Error('Audio too short for transcription');
|
|
301
|
-
}
|
|
302
|
-
const stt = await getSTT();
|
|
303
|
-
let result;
|
|
304
|
-
try {
|
|
305
|
-
result = await stt(audio);
|
|
306
|
-
} catch (err) {
|
|
307
|
-
throw new Error('Transcription engine error: ' + err.message);
|
|
308
|
-
}
|
|
309
|
-
if (!result || typeof result.text !== 'string') {
|
|
310
|
-
return '';
|
|
311
|
-
}
|
|
312
|
-
return result.text;
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
function splitSentences(text) {
|
|
316
|
-
const raw = text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g);
|
|
317
|
-
if (!raw) return [text];
|
|
318
|
-
return raw.map(s => s.trim()).filter(s => s.length > 0);
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
function cachePut(key, buf) {
|
|
322
|
-
if (ttsCache.has(key)) {
|
|
323
|
-
ttsCacheBytes -= ttsCache.get(key).length;
|
|
324
|
-
ttsCache.delete(key);
|
|
325
|
-
}
|
|
326
|
-
while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
|
|
327
|
-
const oldest = ttsCache.keys().next().value;
|
|
328
|
-
ttsCacheBytes -= ttsCache.get(oldest).length;
|
|
329
|
-
ttsCache.delete(oldest);
|
|
330
|
-
}
|
|
331
|
-
ttsCache.set(key, buf);
|
|
332
|
-
ttsCacheBytes += buf.length;
|
|
67
|
+
req.on('error', reject);
|
|
68
|
+
req.on('timeout', () => { req.destroy(); reject(new Error('pocket-tts timeout')); });
|
|
69
|
+
req.write(body);
|
|
70
|
+
req.end();
|
|
71
|
+
});
|
|
333
72
|
}
|
|
334
73
|
|
|
335
|
-
function
|
|
336
|
-
|
|
337
|
-
return pocket.findVoiceFile(voiceId) || findCustomVoiceFile(voiceId);
|
|
74
|
+
function transcribe(audioBuffer) {
|
|
75
|
+
return serverSTT.transcribe(audioBuffer);
|
|
338
76
|
}
|
|
339
77
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
if (!pState.healthy) throw new Error('pocket-tts not healthy');
|
|
343
|
-
const voicePath = resolveVoicePath(voiceId);
|
|
344
|
-
const wav = await pocket.synthesize(text, voicePath);
|
|
345
|
-
if (wav && wav.length > 44) return wav;
|
|
346
|
-
throw new Error('pocket-tts returned empty audio');
|
|
78
|
+
function getSTT() {
|
|
79
|
+
return serverSTT.getSTT();
|
|
347
80
|
}
|
|
348
81
|
|
|
349
82
|
async function synthesize(text, voiceId) {
|
|
350
|
-
const
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
const inflight = ttsInflight.get(cacheKey);
|
|
358
|
-
if (inflight) return inflight;
|
|
83
|
+
const status = serverTTS.getStatus();
|
|
84
|
+
if (!status.ready) throw new Error('pocket-tts not healthy');
|
|
85
|
+
const key = (voiceId || 'default') + ':' + text;
|
|
86
|
+
const cached = cache.get(key);
|
|
87
|
+
if (cached) { cache.delete(key); cache.set(key, cached); return cached; }
|
|
88
|
+
const existing = inflight.get(key);
|
|
89
|
+
if (existing) return existing;
|
|
359
90
|
const promise = (async () => {
|
|
360
|
-
const
|
|
361
|
-
|
|
91
|
+
const voicePath = resolveVoice(voiceId);
|
|
92
|
+
const wav = await sendToPocket(text, voicePath);
|
|
93
|
+
if (!wav || wav.length <= 44) throw new Error('pocket-tts returned empty audio');
|
|
94
|
+
cachePut(key, wav);
|
|
362
95
|
return wav;
|
|
363
96
|
})();
|
|
364
|
-
|
|
365
|
-
try { return await promise; } finally {
|
|
97
|
+
inflight.set(key, promise);
|
|
98
|
+
try { return await promise; } finally { inflight.delete(key); }
|
|
366
99
|
}
|
|
367
100
|
|
|
368
101
|
async function* synthesizeStream(text, voiceId) {
|
|
102
|
+
const status = serverTTS.getStatus();
|
|
103
|
+
if (!status.ready) throw new Error('pocket-tts not healthy');
|
|
369
104
|
const sentences = splitSentences(text);
|
|
370
105
|
for (const sentence of sentences) {
|
|
371
|
-
const
|
|
372
|
-
const cached =
|
|
373
|
-
if (cached) {
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
continue;
|
|
378
|
-
}
|
|
379
|
-
const wav = await synthesizeViaPocket(sentence, voiceId);
|
|
380
|
-
cachePut(cacheKey, wav);
|
|
381
|
-
yield wav;
|
|
106
|
+
const key = (voiceId || 'default') + ':' + sentence;
|
|
107
|
+
const cached = cache.get(key);
|
|
108
|
+
if (cached) { cache.delete(key); cache.set(key, cached); yield cached; continue; }
|
|
109
|
+
const voicePath = resolveVoice(voiceId);
|
|
110
|
+
const wav = await sendToPocket(sentence, voicePath);
|
|
111
|
+
if (wav && wav.length > 44) { cachePut(key, wav); yield wav; }
|
|
382
112
|
}
|
|
383
113
|
}
|
|
384
114
|
|
|
115
|
+
function getVoices() {
|
|
116
|
+
return serverTTS.getVoices(EXTRA_VOICE_DIRS);
|
|
117
|
+
}
|
|
118
|
+
|
|
385
119
|
function getStatus() {
|
|
386
|
-
const
|
|
120
|
+
const sttStatus = serverSTT.getStatus();
|
|
121
|
+
const ttsStatus = serverTTS.getStatus();
|
|
387
122
|
return {
|
|
388
|
-
sttReady:
|
|
389
|
-
ttsReady:
|
|
390
|
-
sttLoading,
|
|
123
|
+
sttReady: sttStatus.ready,
|
|
124
|
+
ttsReady: ttsStatus.ready,
|
|
125
|
+
sttLoading: sttStatus.loading,
|
|
391
126
|
ttsLoading: false,
|
|
392
|
-
sttError:
|
|
393
|
-
ttsError:
|
|
394
|
-
pocketTts:
|
|
127
|
+
sttError: sttStatus.error,
|
|
128
|
+
ttsError: ttsStatus.ready ? null : (ttsStatus.lastError || 'pocket-tts not running'),
|
|
129
|
+
pocketTts: ttsStatus,
|
|
395
130
|
};
|
|
396
131
|
}
|
|
397
132
|
|
|
398
133
|
function preloadTTS() {
|
|
399
|
-
const defaultVoice =
|
|
134
|
+
const defaultVoice = serverTTS.findVoiceFile('custom_cleetus', EXTRA_VOICE_DIRS) || '/config/voices/cleetus.wav';
|
|
400
135
|
const voicePath = fs.existsSync(defaultVoice) ? defaultVoice : null;
|
|
401
|
-
|
|
136
|
+
serverTTS.start(voicePath).then(ok => {
|
|
402
137
|
if (ok) console.log('[TTS] pocket-tts sidecar started');
|
|
403
138
|
else console.log('[TTS] pocket-tts failed to start');
|
|
404
139
|
}).catch(err => {
|
|
@@ -411,9 +146,15 @@ function ttsCacheKey(text, voiceId) {
|
|
|
411
146
|
}
|
|
412
147
|
|
|
413
148
|
function ttsCacheGet(key) {
|
|
414
|
-
const cached =
|
|
415
|
-
if (cached) {
|
|
149
|
+
const cached = cache.get(key);
|
|
150
|
+
if (cached) { cache.delete(key); cache.set(key, cached); }
|
|
416
151
|
return cached || null;
|
|
417
152
|
}
|
|
418
153
|
|
|
154
|
+
function splitSentences(text) {
|
|
155
|
+
const raw = text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g);
|
|
156
|
+
if (!raw) return [text];
|
|
157
|
+
return raw.map(s => s.trim()).filter(s => s.length > 0);
|
|
158
|
+
}
|
|
159
|
+
|
|
419
160
|
export { transcribe, synthesize, synthesizeStream, getSTT, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgui",
|
|
3
|
-
"version": "1.0.190",
|
|
3
|
+
"version": "1.0.192",
|
|
4
4
|
"description": "Multi-agent ACP client with real-time communication",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "server.js",
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
"express": "^5.2.1",
|
|
30
30
|
"fsbrowse": "^0.2.13",
|
|
31
31
|
"onnxruntime-node": "^1.24.1",
|
|
32
|
-
"webtalk": "github:
|
|
32
|
+
"webtalk": "github:AnEntrypoint/webtalk",
|
|
33
33
|
"ws": "^8.14.2"
|
|
34
34
|
}
|
|
35
35
|
}
|
package/server.js
CHANGED
|
@@ -63,7 +63,7 @@ const express = require('express');
|
|
|
63
63
|
const Busboy = require('busboy');
|
|
64
64
|
const fsbrowse = require('fsbrowse');
|
|
65
65
|
|
|
66
|
-
const SYSTEM_PROMPT = `Write
|
|
66
|
+
const SYSTEM_PROMPT = `Your output will be spoken aloud by a text-to-speech system. Write ONLY plain conversational sentences that sound natural when read aloud. Never use markdown, bold, italics, headers, bullet points, numbered lists, tables, or any formatting. Never use colons to introduce lists or options. Never use labels like "Option A" or "1." followed by a title. Instead of listing options, describe them conversationally in flowing sentences. For example, instead of "**Option 1**: Do X" say "One approach would be to do X." Keep sentences short and simple. Use transition words like "also", "another option", "or alternatively" to connect ideas. Write as if you are speaking to someone in a casual conversation.`;
|
|
67
67
|
|
|
68
68
|
const activeExecutions = new Map();
|
|
69
69
|
const messageQueues = new Map();
|
package/static/index.html
CHANGED
|
@@ -435,6 +435,19 @@
|
|
|
435
435
|
border-bottom-left-radius: 0.25rem;
|
|
436
436
|
}
|
|
437
437
|
|
|
438
|
+
/* Consecutive assistant messages: join them visually */
|
|
439
|
+
.message-assistant + .message-assistant {
|
|
440
|
+
border-top-left-radius: 0;
|
|
441
|
+
border-top-right-radius: 0;
|
|
442
|
+
margin-top: -0.125rem;
|
|
443
|
+
padding-top: 0.25rem;
|
|
444
|
+
}
|
|
445
|
+
.message-assistant:has(+ .message-assistant) {
|
|
446
|
+
border-bottom-left-radius: 0;
|
|
447
|
+
border-bottom-right-radius: 0;
|
|
448
|
+
padding-bottom: 0.25rem;
|
|
449
|
+
}
|
|
450
|
+
|
|
438
451
|
.message-role {
|
|
439
452
|
font-weight: 600;
|
|
440
453
|
font-size: 0.7rem;
|
package/lib/pocket-sidecar.js
DELETED
|
@@ -1,221 +0,0 @@
|
|
|
1
|
-
import { spawn } from 'child_process';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import fs from 'fs';
|
|
4
|
-
import os from 'os';
|
|
5
|
-
import { fileURLToPath } from 'url';
|
|
6
|
-
import http from 'http';
|
|
7
|
-
|
|
8
|
-
const ROOT = path.dirname(path.dirname(fileURLToPath(import.meta.url)));
|
|
9
|
-
const PORT = 8787;
|
|
10
|
-
|
|
11
|
-
const FALLBACK_VOICE = 'alba';
|
|
12
|
-
const state = {
|
|
13
|
-
process: null, port: PORT, status: 'stopped', pid: null,
|
|
14
|
-
restartCount: 0, failureCount: 0, lastError: null,
|
|
15
|
-
healthy: false, voicePath: null, starting: false,
|
|
16
|
-
shutdownRequested: false, healthTimer: null, restartTimer: null,
|
|
17
|
-
voiceCloning: false, adopted: false,
|
|
18
|
-
};
|
|
19
|
-
globalThis.__pocketSidecar = state;
|
|
20
|
-
|
|
21
|
-
function findBinary() {
|
|
22
|
-
const candidates = [
|
|
23
|
-
path.join(ROOT, 'data', 'pocket-venv', 'bin', 'pocket-tts'),
|
|
24
|
-
'/config/workspace/agentgui/data/pocket-venv/bin/pocket-tts',
|
|
25
|
-
path.join(os.homedir(), '.gmgui', 'pocket-venv', 'bin', 'pocket-tts'),
|
|
26
|
-
];
|
|
27
|
-
for (const p of candidates) if (fs.existsSync(p)) return p;
|
|
28
|
-
return null;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
function isInstalled() { return !!findBinary(); }
|
|
32
|
-
|
|
33
|
-
function findVoiceFile(voiceId) {
|
|
34
|
-
if (!voiceId || voiceId === 'default') return null;
|
|
35
|
-
const baseName = voiceId.replace(/^custom_/, '');
|
|
36
|
-
const dirs = [
|
|
37
|
-
path.join(process.env.STARTUP_CWD || process.cwd(), 'voices'),
|
|
38
|
-
path.join(ROOT, 'voices'), path.join(os.homedir(), 'voices'), '/config/voices',
|
|
39
|
-
];
|
|
40
|
-
for (const dir of dirs)
|
|
41
|
-
for (const ext of ['.wav', '.mp3', '.ogg', '.flac']) {
|
|
42
|
-
const p = path.join(dir, baseName + ext);
|
|
43
|
-
if (fs.existsSync(p)) return p;
|
|
44
|
-
}
|
|
45
|
-
return null;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
function healthCheck() {
|
|
49
|
-
return new Promise((resolve) => {
|
|
50
|
-
const req = http.get(`http://127.0.0.1:${PORT}/health`, { timeout: 3000 }, (res) => {
|
|
51
|
-
res.resume();
|
|
52
|
-
res.on('end', () => { state.healthy = res.statusCode === 200; resolve(state.healthy); });
|
|
53
|
-
});
|
|
54
|
-
req.on('error', () => { state.healthy = false; resolve(false); });
|
|
55
|
-
req.on('timeout', () => { req.destroy(); state.healthy = false; resolve(false); });
|
|
56
|
-
});
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
function killProcess() {
|
|
60
|
-
if (state.process) { try { state.process.kill('SIGTERM'); } catch (_) {} }
|
|
61
|
-
state.process = null; state.pid = null; state.healthy = false; state.status = 'stopped';
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
function scheduleRestart() {
|
|
65
|
-
if (state.shutdownRequested) return;
|
|
66
|
-
if (!state.adopted) killProcess();
|
|
67
|
-
const delay = Math.min(1000 * Math.pow(2, state.restartCount), 30000);
|
|
68
|
-
state.restartCount++;
|
|
69
|
-
console.log(`[POCKET-TTS] Restart in ${delay}ms (attempt ${state.restartCount})`);
|
|
70
|
-
state.restartTimer = setTimeout(() => {
|
|
71
|
-
state.restartTimer = null;
|
|
72
|
-
state.adopted = false;
|
|
73
|
-
start(state.voicePath).catch(e => console.error('[POCKET-TTS] Restart failed:', e.message));
|
|
74
|
-
}, delay);
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
function spawnSidecar(voice) {
|
|
78
|
-
const bin = findBinary();
|
|
79
|
-
if (!bin) throw new Error('pocket-tts binary not found');
|
|
80
|
-
const args = ['serve', '--host', '0.0.0.0', '--port', String(PORT)];
|
|
81
|
-
if (voice) args.push('--voice', voice);
|
|
82
|
-
console.log('[POCKET-TTS] Starting:', bin, args.join(' '));
|
|
83
|
-
return spawn(bin, args, {
|
|
84
|
-
stdio: ['ignore', 'pipe', 'pipe'],
|
|
85
|
-
env: { ...process.env, PYTHONUNBUFFERED: '1' },
|
|
86
|
-
});
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
function attachProc(proc) {
|
|
90
|
-
state.process = proc; state.pid = proc.pid; state.status = 'starting';
|
|
91
|
-
proc.stdout.on('data', d => { const l = d.toString().trim(); if (l) console.log('[POCKET-TTS]', l); });
|
|
92
|
-
proc.stderr.on('data', d => { const l = d.toString().trim(); if (l) console.error('[POCKET-TTS]', l); });
|
|
93
|
-
proc.on('error', e => { state.lastError = e.message; });
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
async function waitForReady(proc, timeoutSec) {
|
|
97
|
-
let exited = false;
|
|
98
|
-
proc.on('exit', () => { exited = true; });
|
|
99
|
-
for (let i = 0; i < timeoutSec; i++) {
|
|
100
|
-
if (exited) return false;
|
|
101
|
-
await new Promise(r => setTimeout(r, 1000));
|
|
102
|
-
if (await healthCheck()) return true;
|
|
103
|
-
}
|
|
104
|
-
return false;
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
async function adoptRunning() {
|
|
108
|
-
if (await healthCheck()) {
|
|
109
|
-
state.status = 'running'; state.healthy = true; state.adopted = true;
|
|
110
|
-
state.restartCount = 0; state.failureCount = 0; state.lastError = null;
|
|
111
|
-
if (!state.healthTimer) state.healthTimer = setInterval(async () => {
|
|
112
|
-
if (state.status !== 'running') return;
|
|
113
|
-
const ok = await healthCheck();
|
|
114
|
-
if (!ok && !state.shutdownRequested) {
|
|
115
|
-
state.failureCount++;
|
|
116
|
-
if (state.failureCount >= 3) { state.adopted = false; scheduleRestart(); }
|
|
117
|
-
} else if (ok) state.failureCount = 0;
|
|
118
|
-
}, 10000);
|
|
119
|
-
console.log('[POCKET-TTS] Adopted existing instance on port', PORT);
|
|
120
|
-
return true;
|
|
121
|
-
}
|
|
122
|
-
return false;
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
async function start(voicePath) {
|
|
126
|
-
if (state.starting) return false;
|
|
127
|
-
if (state.status === 'running' && state.healthy) return true;
|
|
128
|
-
if (await adoptRunning()) return true;
|
|
129
|
-
if (!isInstalled()) { state.lastError = 'not installed'; state.status = 'unavailable'; return false; }
|
|
130
|
-
state.starting = true; state.shutdownRequested = false;
|
|
131
|
-
const requestedVoice = voicePath || state.voicePath;
|
|
132
|
-
try {
|
|
133
|
-
killProcess();
|
|
134
|
-
let proc = spawnSidecar(requestedVoice);
|
|
135
|
-
attachProc(proc);
|
|
136
|
-
let ready = await waitForReady(proc, 120);
|
|
137
|
-
if (!ready && requestedVoice && requestedVoice !== FALLBACK_VOICE) {
|
|
138
|
-
console.log('[POCKET-TTS] Custom voice failed, trying predefined voice:', FALLBACK_VOICE);
|
|
139
|
-
killProcess();
|
|
140
|
-
proc = spawnSidecar(FALLBACK_VOICE);
|
|
141
|
-
attachProc(proc);
|
|
142
|
-
state.voiceCloning = false;
|
|
143
|
-
ready = await waitForReady(proc, 120);
|
|
144
|
-
if (ready) state.voicePath = FALLBACK_VOICE;
|
|
145
|
-
} else if (ready) {
|
|
146
|
-
state.voicePath = requestedVoice;
|
|
147
|
-
state.voiceCloning = !!requestedVoice && !['alba','marius','javert','jean','fantine','cosette','eponine','azelma'].includes(requestedVoice);
|
|
148
|
-
}
|
|
149
|
-
if (ready) {
|
|
150
|
-
state.status = 'running'; state.restartCount = 0; state.failureCount = 0; state.lastError = null;
|
|
151
|
-
proc.on('exit', (code, sig) => {
|
|
152
|
-
console.log(`[POCKET-TTS] Exited: code=${code} signal=${sig}`);
|
|
153
|
-
state.process = null; state.pid = null; state.healthy = false; state.status = 'stopped';
|
|
154
|
-
if (!state.shutdownRequested) scheduleRestart();
|
|
155
|
-
});
|
|
156
|
-
if (!state.healthTimer) state.healthTimer = setInterval(async () => {
|
|
157
|
-
if (state.status !== 'running') return;
|
|
158
|
-
const ok = await healthCheck();
|
|
159
|
-
if (!ok && !state.shutdownRequested) {
|
|
160
|
-
state.failureCount++;
|
|
161
|
-
if (state.failureCount >= 3) scheduleRestart();
|
|
162
|
-
} else if (ok) state.failureCount = 0;
|
|
163
|
-
}, 10000);
|
|
164
|
-
console.log('[POCKET-TTS] Ready on port', PORT, '(voice cloning:', state.voiceCloning + ')');
|
|
165
|
-
return true;
|
|
166
|
-
}
|
|
167
|
-
state.lastError = 'Start timeout'; state.status = 'error'; killProcess(); return false;
|
|
168
|
-
} catch (err) {
|
|
169
|
-
state.lastError = err.message; state.status = 'error'; return false;
|
|
170
|
-
} finally { state.starting = false; }
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
async function stop() {
|
|
174
|
-
state.shutdownRequested = true;
|
|
175
|
-
if (state.healthTimer) { clearInterval(state.healthTimer); state.healthTimer = null; }
|
|
176
|
-
if (state.restartTimer) { clearTimeout(state.restartTimer); state.restartTimer = null; }
|
|
177
|
-
killProcess();
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
async function synthesize(text, voicePath) {
|
|
181
|
-
if (!state.healthy) throw new Error('pocket-tts not ready');
|
|
182
|
-
const boundary = '----PocketTTS' + Date.now();
|
|
183
|
-
const parts = [];
|
|
184
|
-
parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="text"\r\n\r\n${text}\r\n`);
|
|
185
|
-
if (state.voiceCloning && voicePath && voicePath !== state.voicePath) {
|
|
186
|
-
const data = fs.readFileSync(voicePath);
|
|
187
|
-
const name = path.basename(voicePath);
|
|
188
|
-
parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="voice_wav"; filename="${name}"\r\nContent-Type: audio/wav\r\n\r\n`);
|
|
189
|
-
parts.push(data); parts.push('\r\n');
|
|
190
|
-
}
|
|
191
|
-
parts.push(`--${boundary}--\r\n`);
|
|
192
|
-
const body = Buffer.concat(parts.map(p => Buffer.isBuffer(p) ? p : Buffer.from(p)));
|
|
193
|
-
return new Promise((resolve, reject) => {
|
|
194
|
-
const req = http.request({
|
|
195
|
-
hostname: '127.0.0.1', port: PORT, path: '/tts', method: 'POST',
|
|
196
|
-
headers: { 'Content-Type': `multipart/form-data; boundary=${boundary}`, 'Content-Length': body.length },
|
|
197
|
-
timeout: 60000,
|
|
198
|
-
}, res => {
|
|
199
|
-
if (res.statusCode !== 200) {
|
|
200
|
-
let e = ''; res.on('data', d => e += d);
|
|
201
|
-
res.on('end', () => reject(new Error(`pocket-tts HTTP ${res.statusCode}: ${e}`)));
|
|
202
|
-
return;
|
|
203
|
-
}
|
|
204
|
-
const chunks = []; res.on('data', d => chunks.push(d));
|
|
205
|
-
res.on('end', () => resolve(Buffer.concat(chunks)));
|
|
206
|
-
});
|
|
207
|
-
req.on('error', reject);
|
|
208
|
-
req.on('timeout', () => { req.destroy(); reject(new Error('pocket-tts timeout')); });
|
|
209
|
-
req.write(body); req.end();
|
|
210
|
-
});
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
function getState() {
|
|
214
|
-
return {
|
|
215
|
-
status: state.status, healthy: state.healthy, pid: state.pid, port: state.port,
|
|
216
|
-
restartCount: state.restartCount, failureCount: state.failureCount,
|
|
217
|
-
lastError: state.lastError, installed: isInstalled(), voiceCloning: state.voiceCloning,
|
|
218
|
-
};
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
export { start, stop, synthesize, healthCheck, getState, isInstalled, findVoiceFile };
|