agentgui 1.0.291 → 1.0.293

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/lib/speech.js +188 -54
  2. package/package.json +1 -1
package/lib/speech.js CHANGED
@@ -1,18 +1,39 @@
1
1
  import { createRequire } from 'module';
2
2
  import fs from 'fs';
3
3
  import path from 'path';
4
- import http from 'http';
4
+ import os from 'os';
5
5
  import { fileURLToPath } from 'url';
6
6
 
7
7
  const require = createRequire(import.meta.url);
8
8
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
9
9
  const ROOT = path.dirname(__dirname);
10
10
 
11
+ // Load modules
12
+ let serverTTS = null;
11
13
  let serverSTT = null;
14
+ let audioDecode = null;
15
+ let sttttsmodels = null;
16
+ let ttsUtils = null;
17
+
18
+ try { serverTTS = require('webtalk/server-tts'); } catch(e) { console.warn('[TTS] webtalk/server-tts unavailable:', e.message); }
12
19
  try { serverSTT = require('webtalk/server-stt'); } catch(e) { console.warn('[STT] webtalk/server-stt unavailable:', e.message); }
20
+ try { audioDecode = require('audio-decode'); } catch(e) { console.warn('[TTS] audio-decode unavailable:', e.message); }
21
+ try { sttttsmodels = require('sttttsmodels'); } catch(e) { console.warn('[TTS] sttttsmodels unavailable:', e.message); }
22
+ try { ttsUtils = require('webtalk/tts-utils'); } catch(e) {}
23
+
24
+ // Detect webtalk API type: old (server-tts.js with getVoices/synthesizeViaPocket)
25
+ // vs new ONNX (server-tts-onnx.js with encodeVoiceAudio)
26
+ const isOnnxApi = serverTTS && typeof serverTTS.encodeVoiceAudio === 'function';
27
+ const isPocketApi = serverTTS && typeof serverTTS.getVoices === 'function';
28
+
29
+ // Voice directories to scan
30
+ const VOICE_DIRS = [
31
+ path.join(os.homedir(), 'voices'),
32
+ path.join(ROOT, 'voices'),
33
+ '/config/voices',
34
+ ];
13
35
 
14
- const VOICE_DIRS = [path.join(ROOT, 'voices')];
15
- const POCKET_PORT = 8787;
36
+ const AUDIO_EXTENSIONS = ['.wav', '.mp3', '.ogg', '.flac', '.m4a'];
16
37
 
17
38
  const POCKET_TTS_VOICES = [
18
39
  { id: 'default', name: 'Default', gender: 'female', accent: 'French' },
@@ -26,7 +47,104 @@ const POCKET_TTS_VOICES = [
26
47
  { id: 'azelma', name: 'Azelma', gender: 'female', accent: 'French' },
27
48
  ];
28
49
 
29
- const PREDEFINED_IDS = new Set(POCKET_TTS_VOICES.filter(v => v.id !== 'default').map(v => v.id));
50
+ const SAMPLE_RATE = 24000;
51
+
52
+ // Embedding cache: voiceId -> {data, shape}
53
+ const voiceEmbeddingCache = new Map();
54
+
55
+ function getModelDir() {
56
+ if (sttttsmodels && sttttsmodels.ttsDir && fs.existsSync(sttttsmodels.ttsDir)) {
57
+ return sttttsmodels.ttsDir;
58
+ }
59
+ // Fallback to persistent cache dir
60
+ return path.join(os.homedir(), '.gmgui', 'models', 'tts');
61
+ }
62
+
63
+ function findVoiceFile(voiceId) {
64
+ if (!voiceId || voiceId === 'default') return null;
65
+ const baseName = voiceId.replace(/^custom_/, '');
66
+ for (const dir of VOICE_DIRS) {
67
+ for (const ext of AUDIO_EXTENSIONS) {
68
+ const p = path.join(dir, baseName + ext);
69
+ if (fs.existsSync(p)) return p;
70
+ }
71
+ }
72
+ return null;
73
+ }
74
+
75
+ function scanVoiceDir(dir) {
76
+ const voices = [];
77
+ try {
78
+ if (!fs.existsSync(dir)) return voices;
79
+ const seen = new Set();
80
+ for (const file of fs.readdirSync(dir)) {
81
+ const ext = path.extname(file).toLowerCase();
82
+ if (!AUDIO_EXTENSIONS.includes(ext)) continue;
83
+ const baseName = path.basename(file, ext);
84
+ if (seen.has(baseName)) continue;
85
+ seen.add(baseName);
86
+ voices.push({
87
+ id: 'custom_' + baseName.replace(/[^a-zA-Z0-9_-]/g, '_'),
88
+ name: baseName.replace(/_/g, ' '),
89
+ gender: 'custom', accent: 'custom', isCustom: true,
90
+ });
91
+ }
92
+ } catch (_) {}
93
+ return voices;
94
+ }
95
+
96
+ // Encode a voice WAV file to an ONNX voice embedding
97
+ async function getVoiceEmbedding(voiceId) {
98
+ if (voiceEmbeddingCache.has(voiceId)) return voiceEmbeddingCache.get(voiceId);
99
+ const voicePath = findVoiceFile(voiceId);
100
+ if (!voicePath) return null;
101
+ if (!audioDecode || !serverTTS || !isOnnxApi) return null;
102
+
103
+ const modelDir = getModelDir();
104
+ if (serverTTS.loadModels) await serverTTS.loadModels(modelDir);
105
+
106
+ const raw = fs.readFileSync(voicePath);
107
+ const decoded = await audioDecode.default(raw);
108
+ let pcm = decoded.getChannelData(0);
109
+ if (decoded.sampleRate !== SAMPLE_RATE) {
110
+ pcm = ttsUtils ? ttsUtils.resample(pcm, decoded.sampleRate, SAMPLE_RATE)
111
+ : (() => {
112
+ const ratio = decoded.sampleRate / SAMPLE_RATE;
113
+ const out = new Float32Array(Math.round(pcm.length / ratio));
114
+ for (let i = 0; i < out.length; i++) out[i] = pcm[Math.floor(i * ratio)];
115
+ return out;
116
+ })();
117
+ }
118
+
119
+ const embedding = await serverTTS.encodeVoiceAudio(pcm);
120
+ voiceEmbeddingCache.set(voiceId, embedding);
121
+ return embedding;
122
+ }
123
+
124
+ // Convert Float32Array PCM to WAV buffer
125
+ function pcmToWav(samples, sampleRate = SAMPLE_RATE) {
126
+ const numSamples = samples.length;
127
+ const numChannels = 1;
128
+ const bitsPerSample = 16;
129
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
130
+ const blockAlign = numChannels * bitsPerSample / 8;
131
+ const dataSize = numSamples * blockAlign;
132
+ const buf = Buffer.alloc(44 + dataSize);
133
+
134
+ buf.write('RIFF', 0); buf.writeUInt32LE(36 + dataSize, 4);
135
+ buf.write('WAVE', 8); buf.write('fmt ', 12);
136
+ buf.writeUInt32LE(16, 16); buf.writeUInt16LE(1, 20);
137
+ buf.writeUInt16LE(numChannels, 22); buf.writeUInt32LE(sampleRate, 24);
138
+ buf.writeUInt32LE(byteRate, 28); buf.writeUInt16LE(blockAlign, 32);
139
+ buf.writeUInt16LE(bitsPerSample, 34); buf.write('data', 36);
140
+ buf.writeUInt32LE(dataSize, 40);
141
+
142
+ for (let i = 0; i < numSamples; i++) {
143
+ const s = Math.max(-1, Math.min(1, samples[i]));
144
+ buf.writeInt16LE(Math.round(s * 32767), 44 + i * 2);
145
+ }
146
+ return buf;
147
+ }
30
148
 
31
149
  function getSttOptions() {
32
150
  if (process.env.PORTABLE_EXE_DIR) {
@@ -38,56 +156,40 @@ function getSttOptions() {
38
156
  return {};
39
157
  }
40
158
 
41
- function findVoiceFile(voiceId) {
42
- for (const dir of VOICE_DIRS) {
43
- const p = path.join(dir, `custom_${voiceId}.wav`);
44
- if (fs.existsSync(p)) return p;
159
+ async function synthesize(text, voiceId) {
160
+ if (isOnnxApi) {
161
+ // Node.js ONNX TTS - no Python required
162
+ const modelDir = getModelDir();
163
+ const embedding = voiceId ? await getVoiceEmbedding(voiceId) : null;
164
+ const pcm = await serverTTS.synthesize(text, embedding, modelDir);
165
+ return pcmToWav(pcm);
45
166
  }
46
- return null;
47
- }
48
167
 
49
- function synthesize(text, voiceId) {
50
- const voicePath = voiceId ? findVoiceFile(voiceId) : null;
51
- const isPredefined = voiceId && PREDEFINED_IDS.has(voiceId);
52
- const boundary = '----PocketTTS' + Date.now();
53
- const parts = [];
54
- parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="text"\r\n\r\n${text}\r\n`);
55
- if (voicePath) {
56
- const data = fs.readFileSync(voicePath);
57
- const name = path.basename(voicePath);
58
- parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="voice_wav"; filename="${name}"\r\nContent-Type: audio/wav\r\n\r\n`);
59
- parts.push(data);
60
- parts.push('\r\n');
61
- } else if (isPredefined) {
62
- parts.push(`--${boundary}\r\nContent-Disposition: form-data; name="voice_url"\r\n\r\n${voiceId}\r\n`);
168
+ if (isPocketApi) {
169
+ // Old server-tts.js with pocket-tts sidecar
170
+ return serverTTS.synthesize(text, voiceId, VOICE_DIRS);
63
171
  }
64
- parts.push(`--${boundary}--\r\n`);
65
- const body = Buffer.concat(parts.map(p => Buffer.isBuffer(p) ? p : Buffer.from(p)));
66
- return new Promise((resolve, reject) => {
67
- const req = http.request({
68
- hostname: '127.0.0.1', port: POCKET_PORT, path: '/tts', method: 'POST',
69
- headers: { 'Content-Type': `multipart/form-data; boundary=${boundary}`, 'Content-Length': body.length },
70
- timeout: 60000,
71
- }, res => {
72
- if (res.statusCode !== 200) {
73
- let e = '';
74
- res.on('data', d => e += d);
75
- res.on('end', () => reject(new Error(`pocket-tts HTTP ${res.statusCode}: ${e}`)));
76
- return;
77
- }
78
- const chunks = [];
79
- res.on('data', d => chunks.push(d));
80
- res.on('end', () => resolve(Buffer.concat(chunks)));
81
- });
82
- req.on('error', reject);
83
- req.on('timeout', () => { req.destroy(); reject(new Error('pocket-tts timeout')); });
84
- req.write(body);
85
- req.end();
86
- });
172
+
173
+ throw new Error('No TTS backend available');
87
174
  }
88
175
 
89
176
  async function* synthesizeStream(text, voiceId) {
90
- yield await synthesize(text, voiceId);
177
+ if (isOnnxApi) {
178
+ const modelDir = getModelDir();
179
+ const embedding = voiceId ? await getVoiceEmbedding(voiceId) : null;
180
+ const pcm = await serverTTS.synthesize(text, embedding, modelDir);
181
+ yield pcmToWav(pcm);
182
+ return;
183
+ }
184
+
185
+ if (isPocketApi) {
186
+ for await (const chunk of serverTTS.synthesizeStream(text, voiceId, VOICE_DIRS)) {
187
+ yield chunk;
188
+ }
189
+ return;
190
+ }
191
+
192
+ throw new Error('No TTS backend available');
91
193
  }
92
194
 
93
195
  function transcribe(audioBuffer) {
@@ -101,29 +203,61 @@ function getSTT() {
101
203
  }
102
204
 
103
205
  function getVoices() {
104
- return POCKET_TTS_VOICES;
206
+ const seen = new Set();
207
+ const custom = [];
208
+ for (const dir of VOICE_DIRS) {
209
+ for (const v of scanVoiceDir(dir)) {
210
+ if (seen.has(v.id)) continue;
211
+ seen.add(v.id);
212
+ custom.push(v);
213
+ }
214
+ }
215
+ // Include built-in voices from old server-tts if available
216
+ if (isPocketApi) {
217
+ const upstream = serverTTS.getVoices(VOICE_DIRS).filter(v => v.isCustom);
218
+ for (const v of upstream) {
219
+ if (!seen.has(v.id)) { seen.add(v.id); custom.push(v); }
220
+ }
221
+ }
222
+ return [...POCKET_TTS_VOICES, ...custom];
105
223
  }
106
224
 
107
225
  function getStatus() {
108
226
  const sttStatus = serverSTT ? serverSTT.getStatus() : { ready: false, loading: false, error: 'STT unavailable' };
227
+ const ttsBackend = isOnnxApi ? 'onnx-node' : isPocketApi ? 'pocket-tts' : 'none';
109
228
  return {
110
229
  sttReady: sttStatus.ready,
111
- ttsReady: true,
230
+ ttsReady: isOnnxApi || isPocketApi,
112
231
  sttLoading: sttStatus.loading,
113
232
  ttsLoading: false,
114
233
  sttError: sttStatus.error,
115
- ttsError: null,
234
+ ttsError: (!isOnnxApi && !isPocketApi) ? 'No TTS backend available' : null,
235
+ ttsBackend,
116
236
  };
117
237
  }
118
238
 
119
239
  function preloadTTS() {
120
- // pocket-tts is managed externally; nothing to preload
240
+ if (isOnnxApi) {
241
+ // Pre-load ONNX models in background
242
+ const modelDir = getModelDir();
243
+ if (serverTTS.loadModels) {
244
+ serverTTS.loadModels(modelDir).catch(e => console.warn('[TTS] ONNX preload failed:', e.message));
245
+ }
246
+ } else if (isPocketApi && serverTTS.preload) {
247
+ serverTTS.preload(null, {});
248
+ }
121
249
  }
122
250
 
123
- function ttsCacheKey(text, voiceId) { return null; }
124
- function ttsCacheGet(key) { return null; }
251
+ function ttsCacheKey(text, voiceId) {
252
+ return isPocketApi && serverTTS.ttsCacheKey ? serverTTS.ttsCacheKey(text, voiceId) : null;
253
+ }
254
+
255
+ function ttsCacheGet(key) {
256
+ return isPocketApi && serverTTS.ttsCacheGet ? serverTTS.ttsCacheGet(key) : null;
257
+ }
125
258
 
126
259
  function splitSentences(text) {
260
+ if (isPocketApi && serverTTS.splitSentences) return serverTTS.splitSentences(text);
127
261
  return text.match(/[^.!?]+[.!?]*/g)?.map(s => s.trim()).filter(Boolean) || [text];
128
262
  }
129
263
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.291",
3
+ "version": "1.0.293",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",