agentgui 1.0.190 → 1.0.191
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +30 -376
- package/package.json +2 -2
- package/server.js +1 -1
- package/static/index.html +13 -0
package/lib/speech.js
CHANGED
|
@@ -1,404 +1,56 @@
|
|
|
1
1
|
import { createRequire } from 'module';
|
|
2
2
|
import fs from 'fs';
|
|
3
|
-
import os from 'os';
|
|
4
3
|
import path from 'path';
|
|
5
4
|
import { fileURLToPath } from 'url';
|
|
6
|
-
import * as pocket from './pocket-sidecar.js';
|
|
7
5
|
|
|
8
6
|
const require = createRequire(import.meta.url);
|
|
9
7
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
8
|
const ROOT = path.dirname(__dirname);
|
|
11
|
-
const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
|
|
12
9
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
const add = (d) => { const r = path.resolve(d); if (!seen.has(r)) { seen.add(r); dirs.push(r); } };
|
|
17
|
-
const startupCwd = process.env.STARTUP_CWD || process.cwd();
|
|
18
|
-
add(path.join(startupCwd, 'voices'));
|
|
19
|
-
add(path.join(ROOT, 'voices'));
|
|
20
|
-
add(path.join(os.homedir(), 'voices'));
|
|
21
|
-
return dirs;
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
const MIN_WAV_SIZE = 1000;
|
|
25
|
-
|
|
26
|
-
const BASE_VOICES = [
|
|
27
|
-
{ id: 'default', name: 'Default', gender: 'male', accent: 'US' },
|
|
28
|
-
{ id: 'bdl', name: 'BDL', gender: 'male', accent: 'US' },
|
|
29
|
-
{ id: 'slt', name: 'SLT', gender: 'female', accent: 'US' },
|
|
30
|
-
{ id: 'clb', name: 'CLB', gender: 'female', accent: 'US' },
|
|
31
|
-
{ id: 'rms', name: 'RMS', gender: 'male', accent: 'US' },
|
|
32
|
-
{ id: 'awb', name: 'AWB', gender: 'male', accent: 'Scottish' },
|
|
33
|
-
{ id: 'jmk', name: 'JMK', gender: 'male', accent: 'Canadian' },
|
|
34
|
-
{ id: 'ksp', name: 'KSP', gender: 'male', accent: 'Indian' },
|
|
35
|
-
];
|
|
36
|
-
|
|
37
|
-
async function convertToWav(filePath) {
|
|
38
|
-
const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
|
|
39
|
-
if (fs.existsSync(wavPath)) return wavPath;
|
|
40
|
-
try {
|
|
41
|
-
console.log('[VOICES] Converting to WAV:', filePath);
|
|
42
|
-
const audio = await decodeAudioFile(filePath);
|
|
43
|
-
const wav = encodeWav(audio, SAMPLE_RATE_STT);
|
|
44
|
-
fs.writeFileSync(wavPath, wav);
|
|
45
|
-
console.log('[VOICES] Converted:', path.basename(wavPath));
|
|
46
|
-
return wavPath;
|
|
47
|
-
} catch (err) {
|
|
48
|
-
console.error('[VOICES] Conversion failed for', filePath + ':', err.message);
|
|
49
|
-
return null;
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
const pendingConversions = new Map();
|
|
54
|
-
|
|
55
|
-
function scanVoiceDir(dir) {
|
|
56
|
-
const voices = [];
|
|
57
|
-
try {
|
|
58
|
-
if (!fs.existsSync(dir)) return voices;
|
|
59
|
-
const listed = new Set();
|
|
60
|
-
for (const file of fs.readdirSync(dir)) {
|
|
61
|
-
const ext = path.extname(file).toLowerCase();
|
|
62
|
-
if (!AUDIO_EXTENSIONS.includes(ext)) continue;
|
|
63
|
-
const baseName = path.basename(file, ext);
|
|
64
|
-
if (ext !== '.wav') {
|
|
65
|
-
const wavExists = fs.existsSync(path.join(dir, baseName + '.wav'));
|
|
66
|
-
if (wavExists) continue;
|
|
67
|
-
const fullPath = path.join(dir, file);
|
|
68
|
-
if (!pendingConversions.has(fullPath)) {
|
|
69
|
-
pendingConversions.set(fullPath, convertToWav(fullPath).then(result => {
|
|
70
|
-
pendingConversions.delete(fullPath);
|
|
71
|
-
return result;
|
|
72
|
-
}));
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
if (listed.has(baseName)) continue;
|
|
76
|
-
listed.add(baseName);
|
|
77
|
-
const id = 'custom_' + baseName.replace(/[^a-zA-Z0-9_-]/g, '_');
|
|
78
|
-
const name = baseName.replace(/_/g, ' ');
|
|
79
|
-
voices.push({ id, name, gender: 'custom', accent: 'custom', isCustom: true, sourceDir: dir });
|
|
80
|
-
}
|
|
81
|
-
} catch (err) {
|
|
82
|
-
console.error('[VOICES] Error scanning', dir + ':', err.message);
|
|
83
|
-
}
|
|
84
|
-
return voices;
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
function loadCustomVoices() {
|
|
88
|
-
const seen = new Set();
|
|
89
|
-
const voices = [];
|
|
90
|
-
for (const dir of getVoiceDirs()) {
|
|
91
|
-
for (const v of scanVoiceDir(dir)) {
|
|
92
|
-
if (seen.has(v.id)) continue;
|
|
93
|
-
seen.add(v.id);
|
|
94
|
-
voices.push(v);
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
return voices;
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
function getVoices() {
|
|
101
|
-
return [...BASE_VOICES, ...loadCustomVoices()];
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
let transformersModule = null;
|
|
105
|
-
let sttPipeline = null;
|
|
106
|
-
let sttLoading = false;
|
|
107
|
-
let sttLoadError = null;
|
|
108
|
-
let sttLoadErrorTime = 0;
|
|
109
|
-
const STT_RETRY_MS = 30000;
|
|
110
|
-
const SAMPLE_RATE_STT = 16000;
|
|
111
|
-
|
|
112
|
-
const TTS_CACHE_MAX_BYTES = 10 * 1024 * 1024;
|
|
113
|
-
let ttsCacheBytes = 0;
|
|
114
|
-
const ttsCache = new Map();
|
|
115
|
-
const ttsInflight = new Map();
|
|
116
|
-
|
|
117
|
-
async function loadTransformers() {
|
|
118
|
-
if (transformersModule) return transformersModule;
|
|
119
|
-
transformersModule = await import('@huggingface/transformers');
|
|
120
|
-
return transformersModule;
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
const PERSISTENT_CACHE = path.join(os.homedir(), '.gmgui', 'models');
|
|
124
|
-
|
|
125
|
-
function whisperModelPath() {
|
|
126
|
-
try {
|
|
127
|
-
const webtalkDir = path.dirname(require.resolve('webtalk'));
|
|
128
|
-
const p = path.join(webtalkDir, 'models', 'onnx-community', 'whisper-base');
|
|
129
|
-
if (fs.existsSync(p)) return p;
|
|
130
|
-
} catch (_) {}
|
|
131
|
-
const cached = path.join(PERSISTENT_CACHE, 'onnx-community', 'whisper-base');
|
|
132
|
-
if (fs.existsSync(cached)) return cached;
|
|
133
|
-
return 'onnx-community/whisper-base';
|
|
134
|
-
}
|
|
10
|
+
// Use webtalk's server-side modules
|
|
11
|
+
const serverSTT = require('webtalk/server-stt');
|
|
12
|
+
const serverTTS = require('webtalk/server-tts');
|
|
135
13
|
|
|
136
|
-
|
|
137
|
-
const baseName = voiceId.replace(/^custom_/, '');
|
|
138
|
-
for (const dir of getVoiceDirs()) {
|
|
139
|
-
for (const ext of AUDIO_EXTENSIONS) {
|
|
140
|
-
const candidate = path.join(dir, baseName + ext);
|
|
141
|
-
if (fs.existsSync(candidate)) return candidate;
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
return null;
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
async function decodeAudioFile(filePath) {
|
|
148
|
-
const buf = fs.readFileSync(filePath);
|
|
149
|
-
const ext = path.extname(filePath).toLowerCase();
|
|
150
|
-
if (ext === '.wav') {
|
|
151
|
-
const decoded = decodeWavToFloat32(buf);
|
|
152
|
-
return resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
153
|
-
}
|
|
154
|
-
const wavPath = filePath.replace(/\.[^.]+$/, '.wav');
|
|
155
|
-
if (fs.existsSync(wavPath)) {
|
|
156
|
-
const wavBuf = fs.readFileSync(wavPath);
|
|
157
|
-
const decoded = decodeWavToFloat32(wavBuf);
|
|
158
|
-
return resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
159
|
-
}
|
|
160
|
-
const decode = (await import('audio-decode')).default;
|
|
161
|
-
const audioBuffer = await decode(buf);
|
|
162
|
-
const mono = audioBuffer.getChannelData(0);
|
|
163
|
-
return resampleTo16k(mono, audioBuffer.sampleRate);
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
async function getSTT() {
|
|
167
|
-
if (sttPipeline) return sttPipeline;
|
|
168
|
-
if (sttLoadError && (Date.now() - sttLoadErrorTime < STT_RETRY_MS)) throw sttLoadError;
|
|
169
|
-
if (sttLoading) {
|
|
170
|
-
while (sttLoading) await new Promise(r => setTimeout(r, 100));
|
|
171
|
-
if (sttLoadError && (Date.now() - sttLoadErrorTime < STT_RETRY_MS)) throw sttLoadError;
|
|
172
|
-
if (!sttPipeline) throw new Error('STT pipeline failed to load');
|
|
173
|
-
return sttPipeline;
|
|
174
|
-
}
|
|
175
|
-
sttLoading = true;
|
|
176
|
-
try {
|
|
177
|
-
const { pipeline, env } = await loadTransformers();
|
|
178
|
-
const modelPath = whisperModelPath();
|
|
179
|
-
const isLocal = !modelPath.includes('/') || fs.existsSync(modelPath);
|
|
180
|
-
env.allowLocalModels = true;
|
|
181
|
-
env.allowRemoteModels = !isLocal;
|
|
182
|
-
env.cacheDir = PERSISTENT_CACHE;
|
|
183
|
-
if (isLocal) env.localModelPath = '';
|
|
184
|
-
sttPipeline = await pipeline('automatic-speech-recognition', modelPath, {
|
|
185
|
-
device: 'cpu',
|
|
186
|
-
cache_dir: PERSISTENT_CACHE,
|
|
187
|
-
local_files_only: isLocal,
|
|
188
|
-
});
|
|
189
|
-
sttLoadError = null;
|
|
190
|
-
return sttPipeline;
|
|
191
|
-
} catch (err) {
|
|
192
|
-
sttPipeline = null;
|
|
193
|
-
sttLoadError = new Error('STT model load failed: ' + err.message);
|
|
194
|
-
sttLoadErrorTime = Date.now();
|
|
195
|
-
throw sttLoadError;
|
|
196
|
-
} finally {
|
|
197
|
-
sttLoading = false;
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
function decodeWavToFloat32(buffer) {
|
|
202
|
-
const view = new DataView(buffer.buffer || buffer);
|
|
203
|
-
const riff = String.fromCharCode(view.getUint8(0), view.getUint8(1), view.getUint8(2), view.getUint8(3));
|
|
204
|
-
if (riff !== 'RIFF') throw new Error('Not a WAV file');
|
|
205
|
-
const numChannels = view.getUint16(22, true);
|
|
206
|
-
const sampleRate = view.getUint32(24, true);
|
|
207
|
-
const bitsPerSample = view.getUint16(34, true);
|
|
208
|
-
let dataOffset = 44;
|
|
209
|
-
for (let i = 36; i < view.byteLength - 8; i++) {
|
|
210
|
-
if (view.getUint8(i) === 0x64 && view.getUint8(i+1) === 0x61 &&
|
|
211
|
-
view.getUint8(i+2) === 0x74 && view.getUint8(i+3) === 0x61) {
|
|
212
|
-
dataOffset = i + 8;
|
|
213
|
-
break;
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
const bytesPerSample = bitsPerSample / 8;
|
|
217
|
-
const numSamples = Math.floor((view.byteLength - dataOffset) / (bytesPerSample * numChannels));
|
|
218
|
-
const audio = new Float32Array(numSamples);
|
|
219
|
-
for (let i = 0; i < numSamples; i++) {
|
|
220
|
-
const offset = dataOffset + i * bytesPerSample * numChannels;
|
|
221
|
-
if (bitsPerSample === 16) {
|
|
222
|
-
audio[i] = view.getInt16(offset, true) / 32768;
|
|
223
|
-
} else if (bitsPerSample === 32) {
|
|
224
|
-
audio[i] = view.getFloat32(offset, true);
|
|
225
|
-
} else {
|
|
226
|
-
audio[i] = (view.getUint8(offset) - 128) / 128;
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
return { audio, sampleRate };
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
function resampleTo16k(audio, fromRate) {
|
|
233
|
-
if (fromRate === SAMPLE_RATE_STT) return audio;
|
|
234
|
-
const ratio = fromRate / SAMPLE_RATE_STT;
|
|
235
|
-
const newLen = Math.round(audio.length / ratio);
|
|
236
|
-
const result = new Float32Array(newLen);
|
|
237
|
-
for (let i = 0; i < newLen; i++) {
|
|
238
|
-
const srcIdx = i * ratio;
|
|
239
|
-
const lo = Math.floor(srcIdx);
|
|
240
|
-
const hi = Math.min(lo + 1, audio.length - 1);
|
|
241
|
-
const frac = srcIdx - lo;
|
|
242
|
-
result[i] = audio[lo] * (1 - frac) + audio[hi] * frac;
|
|
243
|
-
}
|
|
244
|
-
return result;
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
function encodeWav(float32Audio, sampleRate) {
|
|
248
|
-
const numSamples = float32Audio.length;
|
|
249
|
-
const bytesPerSample = 2;
|
|
250
|
-
const dataSize = numSamples * bytesPerSample;
|
|
251
|
-
const buffer = new ArrayBuffer(44 + dataSize);
|
|
252
|
-
const view = new DataView(buffer);
|
|
253
|
-
const writeStr = (off, str) => { for (let i = 0; i < str.length; i++) view.setUint8(off + i, str.charCodeAt(i)); };
|
|
254
|
-
writeStr(0, 'RIFF');
|
|
255
|
-
view.setUint32(4, 36 + dataSize, true);
|
|
256
|
-
writeStr(8, 'WAVE');
|
|
257
|
-
writeStr(12, 'fmt ');
|
|
258
|
-
view.setUint32(16, 16, true);
|
|
259
|
-
view.setUint16(20, 1, true);
|
|
260
|
-
view.setUint16(22, 1, true);
|
|
261
|
-
view.setUint32(24, sampleRate, true);
|
|
262
|
-
view.setUint32(28, sampleRate * bytesPerSample, true);
|
|
263
|
-
view.setUint16(32, bytesPerSample, true);
|
|
264
|
-
view.setUint16(34, 16, true);
|
|
265
|
-
writeStr(36, 'data');
|
|
266
|
-
view.setUint32(40, dataSize, true);
|
|
267
|
-
for (let i = 0; i < numSamples; i++) {
|
|
268
|
-
const s = Math.max(-1, Math.min(1, float32Audio[i]));
|
|
269
|
-
view.setInt16(44 + i * 2, s < 0 ? s * 32768 : s * 32767, true);
|
|
270
|
-
}
|
|
271
|
-
return Buffer.from(buffer);
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
async function transcribe(audioBuffer) {
|
|
275
|
-
const buf = Buffer.isBuffer(audioBuffer) ? audioBuffer : Buffer.from(audioBuffer);
|
|
276
|
-
if (buf.length < MIN_WAV_SIZE) {
|
|
277
|
-
throw new Error('Audio too short (' + buf.length + ' bytes)');
|
|
278
|
-
}
|
|
279
|
-
let audio;
|
|
280
|
-
const isWav = buf.length > 4 && buf.toString('ascii', 0, 4) === 'RIFF';
|
|
281
|
-
if (isWav) {
|
|
282
|
-
let decoded;
|
|
283
|
-
try {
|
|
284
|
-
decoded = decodeWavToFloat32(buf);
|
|
285
|
-
} catch (err) {
|
|
286
|
-
throw new Error('WAV decode failed: ' + err.message);
|
|
287
|
-
}
|
|
288
|
-
if (!decoded.audio || decoded.audio.length === 0) {
|
|
289
|
-
throw new Error('WAV contains no audio samples');
|
|
290
|
-
}
|
|
291
|
-
audio = resampleTo16k(decoded.audio, decoded.sampleRate);
|
|
292
|
-
} else {
|
|
293
|
-
const sampleCount = Math.floor(buf.byteLength / 4);
|
|
294
|
-
if (sampleCount === 0) throw new Error('Audio buffer too small');
|
|
295
|
-
const aligned = new ArrayBuffer(sampleCount * 4);
|
|
296
|
-
new Uint8Array(aligned).set(buf.subarray(0, sampleCount * 4));
|
|
297
|
-
audio = new Float32Array(aligned);
|
|
298
|
-
}
|
|
299
|
-
if (audio.length < 100) {
|
|
300
|
-
throw new Error('Audio too short for transcription');
|
|
301
|
-
}
|
|
302
|
-
const stt = await getSTT();
|
|
303
|
-
let result;
|
|
304
|
-
try {
|
|
305
|
-
result = await stt(audio);
|
|
306
|
-
} catch (err) {
|
|
307
|
-
throw new Error('Transcription engine error: ' + err.message);
|
|
308
|
-
}
|
|
309
|
-
if (!result || typeof result.text !== 'string') {
|
|
310
|
-
return '';
|
|
311
|
-
}
|
|
312
|
-
return result.text;
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
function splitSentences(text) {
|
|
316
|
-
const raw = text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g);
|
|
317
|
-
if (!raw) return [text];
|
|
318
|
-
return raw.map(s => s.trim()).filter(s => s.length > 0);
|
|
319
|
-
}
|
|
14
|
+
const EXTRA_VOICE_DIRS = [path.join(ROOT, 'voices')];
|
|
320
15
|
|
|
321
|
-
function cachePut(key, buf) {
|
|
322
|
-
|
|
323
|
-
ttsCacheBytes -= ttsCache.get(key).length;
|
|
324
|
-
ttsCache.delete(key);
|
|
325
|
-
}
|
|
326
|
-
while (ttsCacheBytes + buf.length > TTS_CACHE_MAX_BYTES && ttsCache.size > 0) {
|
|
327
|
-
const oldest = ttsCache.keys().next().value;
|
|
328
|
-
ttsCacheBytes -= ttsCache.get(oldest).length;
|
|
329
|
-
ttsCache.delete(oldest);
|
|
330
|
-
}
|
|
331
|
-
ttsCache.set(key, buf);
|
|
332
|
-
ttsCacheBytes += buf.length;
|
|
16
|
+
function transcribe(audioBuffer) {
|
|
17
|
+
return serverSTT.transcribe(audioBuffer);
|
|
333
18
|
}
|
|
334
19
|
|
|
335
|
-
function resolveVoicePath(voiceId) {
|
|
336
|
-
|
|
337
|
-
return pocket.findVoiceFile(voiceId) || findCustomVoiceFile(voiceId);
|
|
20
|
+
function getSTT() {
|
|
21
|
+
return serverSTT.getSTT();
|
|
338
22
|
}
|
|
339
23
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
if (!pState.healthy) throw new Error('pocket-tts not healthy');
|
|
343
|
-
const voicePath = resolveVoicePath(voiceId);
|
|
344
|
-
const wav = await pocket.synthesize(text, voicePath);
|
|
345
|
-
if (wav && wav.length > 44) return wav;
|
|
346
|
-
throw new Error('pocket-tts returned empty audio');
|
|
24
|
+
function synthesize(text, voiceId) {
|
|
25
|
+
return serverTTS.synthesize(text, voiceId, EXTRA_VOICE_DIRS);
|
|
347
26
|
}
|
|
348
27
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
const cached = ttsCache.get(cacheKey);
|
|
352
|
-
if (cached) {
|
|
353
|
-
ttsCache.delete(cacheKey);
|
|
354
|
-
ttsCache.set(cacheKey, cached);
|
|
355
|
-
return cached;
|
|
356
|
-
}
|
|
357
|
-
const inflight = ttsInflight.get(cacheKey);
|
|
358
|
-
if (inflight) return inflight;
|
|
359
|
-
const promise = (async () => {
|
|
360
|
-
const wav = await synthesizeViaPocket(text, voiceId);
|
|
361
|
-
cachePut(cacheKey, wav);
|
|
362
|
-
return wav;
|
|
363
|
-
})();
|
|
364
|
-
ttsInflight.set(cacheKey, promise);
|
|
365
|
-
try { return await promise; } finally { ttsInflight.delete(cacheKey); }
|
|
28
|
+
function synthesizeStream(text, voiceId) {
|
|
29
|
+
return serverTTS.synthesizeStream(text, voiceId, EXTRA_VOICE_DIRS);
|
|
366
30
|
}
|
|
367
31
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
for (const sentence of sentences) {
|
|
371
|
-
const cacheKey = (voiceId || 'default') + ':' + sentence;
|
|
372
|
-
const cached = ttsCache.get(cacheKey);
|
|
373
|
-
if (cached) {
|
|
374
|
-
ttsCache.delete(cacheKey);
|
|
375
|
-
ttsCache.set(cacheKey, cached);
|
|
376
|
-
yield cached;
|
|
377
|
-
continue;
|
|
378
|
-
}
|
|
379
|
-
const wav = await synthesizeViaPocket(sentence, voiceId);
|
|
380
|
-
cachePut(cacheKey, wav);
|
|
381
|
-
yield wav;
|
|
382
|
-
}
|
|
32
|
+
function getVoices() {
|
|
33
|
+
return serverTTS.getVoices(EXTRA_VOICE_DIRS);
|
|
383
34
|
}
|
|
384
35
|
|
|
385
36
|
function getStatus() {
|
|
386
|
-
const
|
|
37
|
+
const sttStatus = serverSTT.getStatus();
|
|
38
|
+
const ttsStatus = serverTTS.getStatus();
|
|
387
39
|
return {
|
|
388
|
-
sttReady:
|
|
389
|
-
ttsReady:
|
|
390
|
-
sttLoading,
|
|
40
|
+
sttReady: sttStatus.ready,
|
|
41
|
+
ttsReady: ttsStatus.ready,
|
|
42
|
+
sttLoading: sttStatus.loading,
|
|
391
43
|
ttsLoading: false,
|
|
392
|
-
sttError:
|
|
393
|
-
ttsError:
|
|
394
|
-
pocketTts:
|
|
44
|
+
sttError: sttStatus.error,
|
|
45
|
+
ttsError: ttsStatus.ready ? null : (ttsStatus.lastError || 'pocket-tts not running'),
|
|
46
|
+
pocketTts: ttsStatus,
|
|
395
47
|
};
|
|
396
48
|
}
|
|
397
49
|
|
|
398
50
|
function preloadTTS() {
|
|
399
|
-
const defaultVoice =
|
|
51
|
+
const defaultVoice = serverTTS.findVoiceFile('custom_cleetus', EXTRA_VOICE_DIRS) || '/config/voices/cleetus.wav';
|
|
400
52
|
const voicePath = fs.existsSync(defaultVoice) ? defaultVoice : null;
|
|
401
|
-
|
|
53
|
+
serverTTS.start(voicePath).then(ok => {
|
|
402
54
|
if (ok) console.log('[TTS] pocket-tts sidecar started');
|
|
403
55
|
else console.log('[TTS] pocket-tts failed to start');
|
|
404
56
|
}).catch(err => {
|
|
@@ -407,13 +59,15 @@ function preloadTTS() {
|
|
|
407
59
|
}
|
|
408
60
|
|
|
409
61
|
function ttsCacheKey(text, voiceId) {
|
|
410
|
-
return (voiceId
|
|
62
|
+
return serverTTS.ttsCacheKey(text, voiceId);
|
|
411
63
|
}
|
|
412
64
|
|
|
413
65
|
function ttsCacheGet(key) {
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
66
|
+
return serverTTS.ttsCacheGet(key);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
function splitSentences(text) {
|
|
70
|
+
return serverTTS.splitSentences(text);
|
|
417
71
|
}
|
|
418
72
|
|
|
419
73
|
export { transcribe, synthesize, synthesizeStream, getSTT, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgui",
|
|
3
|
-
"version": "1.0.190",
|
|
3
|
+
"version": "1.0.191",
|
|
4
4
|
"description": "Multi-agent ACP client with real-time communication",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "server.js",
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
"express": "^5.2.1",
|
|
30
30
|
"fsbrowse": "^0.2.13",
|
|
31
31
|
"onnxruntime-node": "^1.24.1",
|
|
32
|
-
"webtalk": "github:
|
|
32
|
+
"webtalk": "github:AnEntrypoint/webtalk",
|
|
33
33
|
"ws": "^8.14.2"
|
|
34
34
|
}
|
|
35
35
|
}
|
package/server.js
CHANGED
|
@@ -63,7 +63,7 @@ const express = require('express');
|
|
|
63
63
|
const Busboy = require('busboy');
|
|
64
64
|
const fsbrowse = require('fsbrowse');
|
|
65
65
|
|
|
66
|
-
const SYSTEM_PROMPT = `Write
|
|
66
|
+
const SYSTEM_PROMPT = `Your output will be spoken aloud by a text-to-speech system. Write ONLY plain conversational sentences that sound natural when read aloud. Never use markdown, bold, italics, headers, bullet points, numbered lists, tables, or any formatting. Never use colons to introduce lists or options. Never use labels like "Option A" or "1." followed by a title. Instead of listing options, describe them conversationally in flowing sentences. For example, instead of "**Option 1**: Do X" say "One approach would be to do X." Keep sentences short and simple. Use transition words like "also", "another option", "or alternatively" to connect ideas. Write as if you are speaking to someone in a casual conversation.`;
|
|
67
67
|
|
|
68
68
|
const activeExecutions = new Map();
|
|
69
69
|
const messageQueues = new Map();
|
package/static/index.html
CHANGED
|
@@ -435,6 +435,19 @@
|
|
|
435
435
|
border-bottom-left-radius: 0.25rem;
|
|
436
436
|
}
|
|
437
437
|
|
|
438
|
+
/* Consecutive assistant messages: join them visually */
|
|
439
|
+
.message-assistant + .message-assistant {
|
|
440
|
+
border-top-left-radius: 0;
|
|
441
|
+
border-top-right-radius: 0;
|
|
442
|
+
margin-top: -0.125rem;
|
|
443
|
+
padding-top: 0.25rem;
|
|
444
|
+
}
|
|
445
|
+
.message-assistant:has(+ .message-assistant) {
|
|
446
|
+
border-bottom-left-radius: 0;
|
|
447
|
+
border-bottom-right-radius: 0;
|
|
448
|
+
padding-bottom: 0.25rem;
|
|
449
|
+
}
|
|
450
|
+
|
|
438
451
|
.message-role {
|
|
439
452
|
font-weight: 600;
|
|
440
453
|
font-size: 0.7rem;
|