agentgui 1.0.282 → 1.0.284
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/speech.js +92 -40
- package/package.json +1 -1
- package/server.js +3813 -3767
- package/wave1-test-results.json +7 -0
package/lib/speech.js
CHANGED
|
@@ -9,8 +9,10 @@ const require = createRequire(import.meta.url);
|
|
|
9
9
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
10
|
const ROOT = path.dirname(__dirname);
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
let serverSTT = null, serverTTS = null, edgeTTS = null;
|
|
13
|
+
try { serverSTT = require('webtalk/server-stt'); } catch(e) { console.warn('[STT] webtalk/server-stt unavailable:', e.message); }
|
|
14
|
+
try { serverTTS = require('webtalk/server-tts'); } catch(e) { console.warn('[TTS] webtalk/server-tts unavailable:', e.message); }
|
|
15
|
+
try { edgeTTS = require('edge-tts-universal'); } catch(e) { console.warn('[TTS] edge-tts-universal unavailable:', e.message); }
|
|
14
16
|
|
|
15
17
|
const EXTRA_VOICE_DIRS = [path.join(ROOT, 'voices')];
|
|
16
18
|
|
|
@@ -26,20 +28,50 @@ const POCKET_TTS_VOICES = [
|
|
|
26
28
|
{ id: 'azelma', name: 'Azelma', gender: 'female', accent: 'French' },
|
|
27
29
|
];
|
|
28
30
|
|
|
31
|
+
const EDGE_VOICE_MAP = {
|
|
32
|
+
default: 'fr-FR-DeniseNeural', alba: 'fr-FR-DeniseNeural',
|
|
33
|
+
marius: 'fr-FR-HenriNeural', javert: 'fr-FR-HenriNeural',
|
|
34
|
+
jean: 'fr-FR-HenriNeural', fantine: 'fr-FR-DeniseNeural',
|
|
35
|
+
cosette: 'fr-FR-DeniseNeural', eponine: 'fr-FR-DeniseNeural',
|
|
36
|
+
azelma: 'fr-FR-DeniseNeural',
|
|
37
|
+
};
|
|
38
|
+
|
|
29
39
|
const PREDEFINED_IDS = new Set(POCKET_TTS_VOICES.filter(v => v.id !== 'default').map(v => v.id));
|
|
30
40
|
const POCKET_PORT = 8787;
|
|
31
41
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
42
|
+
let needsPatch = true;
|
|
43
|
+
try {
|
|
44
|
+
if (serverTTS && typeof serverTTS.getVoices === 'function') {
|
|
45
|
+
needsPatch = !serverTTS.getVoices(EXTRA_VOICE_DIRS).some(v => v.id === 'alba' && !v.isCustom);
|
|
46
|
+
}
|
|
47
|
+
} catch(e) { needsPatch = true; }
|
|
48
|
+
|
|
49
|
+
function getSttOptions() {
|
|
50
|
+
if (process.env.PORTABLE_DATA_DIR) {
|
|
51
|
+
return { cacheDir: path.join(process.env.PORTABLE_DATA_DIR, 'models') };
|
|
35
52
|
}
|
|
36
|
-
return
|
|
53
|
+
return {};
|
|
37
54
|
}
|
|
38
55
|
|
|
39
|
-
|
|
56
|
+
/**
 * Synthesize speech with the edge-tts-universal fallback engine.
 *
 * Every chunk of type `audio` produced by `Communicate#stream()` is collected
 * and the concatenation is returned. The wait is bounded by a 30 second
 * timeout; the timer is always cleared once the race settles so a finished
 * request does not leave a pending timer behind.
 *
 * @param {string} text - Text to speak.
 * @param {string} [voiceId] - Voice id looked up in EDGE_VOICE_MAP; unknown or
 *   missing ids use the map's `default` entry.
 * @returns {Promise<Buffer>} Concatenated audio bytes.
 * @throws {Error} When edge-tts-universal is not loaded, when the timeout
 *   fires first, or when the stream ends without any audio chunk.
 */
async function edgeSynthesize(text, voiceId) {
  if (!edgeTTS) throw new Error('edge-tts-universal not available');

  // Own-property lookup only: ids such as "constructor" or "__proto__" must
  // not resolve to inherited Object.prototype members.
  const isKnownVoice = typeof voiceId === 'string'
    && Object.prototype.hasOwnProperty.call(EDGE_VOICE_MAP, voiceId);
  const voice = (isKnownVoice && EDGE_VOICE_MAP[voiceId]) || EDGE_VOICE_MAP.default;

  const communicate = new edgeTTS.Communicate(text, voice);
  const chunks = [];

  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error('edge-tts timeout')), 30000);
  });
  const collect = (async () => {
    for await (const chunk of communicate.stream()) {
      if (chunk.type === 'audio' && chunk.data) chunks.push(Buffer.from(chunk.data));
    }
  })();

  try {
    await Promise.race([collect, timeout]);
  } finally {
    // Without this the 30 s timer stays scheduled after a successful run.
    clearTimeout(timer);
  }

  if (!chunks.length) throw new Error('edge-tts returned no audio');
  return Buffer.concat(chunks);
}
|
|
40
71
|
|
|
41
72
|
function synthesizeDirect(text, voiceId) {
|
|
42
|
-
const voicePath = serverTTS.findVoiceFile
|
|
73
|
+
const voicePath = serverTTS && typeof serverTTS.findVoiceFile === 'function'
|
|
74
|
+
? serverTTS.findVoiceFile(voiceId, EXTRA_VOICE_DIRS) : null;
|
|
43
75
|
const isPredefined = voiceId && PREDEFINED_IDS.has(voiceId);
|
|
44
76
|
const boundary = '----PocketTTS' + Date.now();
|
|
45
77
|
const parts = [];
|
|
@@ -78,60 +110,75 @@ function synthesizeDirect(text, voiceId) {
|
|
|
78
110
|
});
|
|
79
111
|
}
|
|
80
112
|
|
|
81
|
-
function getSttOptions() {
|
|
82
|
-
if (process.env.PORTABLE_DATA_DIR) {
|
|
83
|
-
return { cacheDir: path.join(process.env.PORTABLE_DATA_DIR, 'models') };
|
|
84
|
-
}
|
|
85
|
-
return {};
|
|
86
|
-
}
|
|
87
|
-
|
|
88
113
|
function transcribe(audioBuffer) {
|
|
114
|
+
if (!serverSTT) throw new Error('STT not available');
|
|
89
115
|
return serverSTT.transcribe(audioBuffer, getSttOptions());
|
|
90
116
|
}
|
|
91
117
|
|
|
92
118
|
function getSTT() {
|
|
119
|
+
if (!serverSTT) throw new Error('STT not available');
|
|
93
120
|
return serverSTT.getSTT(getSttOptions());
|
|
94
121
|
}
|
|
95
122
|
|
|
96
|
-
function synthesize(text, voiceId) {
|
|
97
|
-
if (
|
|
98
|
-
|
|
123
|
+
/**
 * Synthesize `text` to audio, preferring the webtalk TTS module and falling
 * back to edge-tts when webtalk is unavailable or throws.
 *
 * When `needsPatch` is set and `voiceId` is one of the predefined voice ids,
 * the request goes through `synthesizeDirect` instead of
 * `serverTTS.synthesize`.
 *
 * @param {string} text - Text to speak.
 * @param {string} [voiceId] - Voice id.
 * @returns {Promise<*>} Whatever the engine that served the request returns.
 * @throws {Error} The edge-tts error when the fallback fails too; if webtalk
 *   had failed first, that error is attached as `cause` so the root problem is
 *   not lost.
 */
async function synthesize(text, voiceId) {
  let primaryError = null;

  if (serverTTS) {
    try {
      if (needsPatch && voiceId && PREDEFINED_IDS.has(voiceId)) {
        return await synthesizeDirect(text, voiceId);
      }
      return await serverTTS.synthesize(text, voiceId, EXTRA_VOICE_DIRS);
    } catch (e) {
      primaryError = e;
      // A thrown non-Error value has no .message; log the value itself then.
      console.warn('[TTS] webtalk synthesize failed, falling back to edge-tts:', e instanceof Error ? e.message : e);
    }
  }

  try {
    return await edgeSynthesize(text, voiceId);
  } catch (fallbackError) {
    if (primaryError !== null && fallbackError instanceof Error && fallbackError.cause === undefined) {
      fallbackError.cause = primaryError;
    }
    throw fallbackError;
  }
}
|
|
102
136
|
|
|
103
|
-
function synthesizeStream(text, voiceId) {
|
|
104
|
-
if (
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
137
|
+
/**
 * Stream synthesized audio for `text`, preferring the webtalk TTS module and
 * falling back to a single edge-tts buffer.
 *
 * The fallback is only taken while nothing has been yielded yet. Once a chunk
 * has reached the consumer, a later failure is rethrown: appending a complete
 * edge-tts rendering after partial webtalk audio would make the consumer hear
 * the beginning of the text twice.
 *
 * @param {string} text - Text to speak.
 * @param {string} [voiceId] - Voice id.
 * @yields {*} Audio chunks from webtalk, or one buffer from
 *   `synthesizeDirect` / `edgeSynthesize`.
 * @throws {Error} A webtalk error raised after output started, or the
 *   edge-tts error when the fallback fails.
 */
async function* synthesizeStream(text, voiceId) {
  if (serverTTS) {
    let emitted = false;
    try {
      if (needsPatch && voiceId && PREDEFINED_IDS.has(voiceId)) {
        const audio = await synthesizeDirect(text, voiceId);
        emitted = true;
        yield audio;
        return;
      }
      for await (const chunk of serverTTS.synthesizeStream(text, voiceId, EXTRA_VOICE_DIRS)) {
        emitted = true;
        yield chunk;
      }
      return;
    } catch (e) {
      // Too late to switch engines once the consumer has received audio.
      if (emitted) throw e;
      console.warn('[TTS] webtalk stream failed, falling back to edge-tts:', e instanceof Error ? e.message : e);
    }
  }
  yield await edgeSynthesize(text, voiceId);
}
|
|
111
154
|
|
|
112
155
|
/**
 * List the voices offered to clients: the built-in POCKET_TTS_VOICES followed
 * by any voices the webtalk module reports with `isCustom` set.
 *
 * Discovery of custom voices is best effort: if webtalk is missing, throws, or
 * returns something that is not an array, only the built-in list is returned.
 *
 * @returns {Array<object>} A new array on every call, so callers may modify
 *   the result without touching the shared POCKET_TTS_VOICES constant.
 */
function getVoices() {
  let custom = [];
  try {
    const upstream = serverTTS && typeof serverTTS.getVoices === 'function'
      ? serverTTS.getVoices(EXTRA_VOICE_DIRS) : [];
    if (Array.isArray(upstream)) {
      custom = upstream.filter(v => v && v.isCustom);
    }
  } catch (e) {
    // Deliberately ignored: fall through to the built-in voices only.
    custom = [];
  }
  return [...POCKET_TTS_VOICES, ...custom];
}
|
|
117
163
|
|
|
118
164
|
/**
 * Report the readiness of the speech engines.
 *
 * STT and TTS status come from the webtalk modules' `getStatus()` when those
 * modules are loaded and expose it; otherwise a placeholder "unavailable"
 * status is used. TTS counts as ready when either webtalk reports ready or
 * the edge-tts fallback module is loaded.
 *
 * @returns {{sttReady: *, ttsReady: *, sttLoading: *, ttsLoading: boolean,
 *   sttError: *, ttsError: (string|null), pocketTts: object,
 *   edgeTtsAvailable: boolean}} Status snapshot.
 */
function getStatus() {
  const sttFallback = { ready: false, loading: false, error: 'STT unavailable' };
  const ttsFallback = { ready: false, lastError: 'TTS unavailable' };

  // Same typeof guard the file applies to every other optional webtalk method;
  // `|| fallback` also covers a getStatus() that returns null/undefined.
  const sttStatus = (serverSTT && typeof serverSTT.getStatus === 'function'
    ? serverSTT.getStatus() : null) || sttFallback;
  const ttsStatus = (serverTTS && typeof serverTTS.getStatus === 'function'
    ? serverTTS.getStatus() : null) || ttsFallback;

  return {
    sttReady: sttStatus.ready,
    ttsReady: ttsStatus.ready || !!edgeTTS,
    sttLoading: sttStatus.loading,
    ttsLoading: false,
    sttError: sttStatus.error,
    ttsError: (ttsStatus.ready || edgeTTS) ? null : (ttsStatus.lastError || 'TTS not available'),
    pocketTts: ttsStatus,
    edgeTtsAvailable: !!edgeTTS,
  };
}
|
|
131
178
|
|
|
132
179
|
function preloadTTS() {
|
|
133
|
-
if (
|
|
134
|
-
console.log('[TTS]
|
|
180
|
+
if (!serverTTS || typeof serverTTS.start !== 'function') {
|
|
181
|
+
if (edgeTTS) console.log('[TTS] Using edge-tts fallback');
|
|
135
182
|
return;
|
|
136
183
|
}
|
|
137
184
|
if (typeof serverTTS.isInstalled === 'function' && !serverTTS.isInstalled()) {
|
|
@@ -143,26 +190,31 @@ function preloadTTS() {
|
|
|
143
190
|
path.join(portableDataDir, 'pocket-venv', 'Scripts', 'pocket-tts.exe'),
|
|
144
191
|
path.join(portableDataDir, 'pocket-venv', 'bin', 'pocket-tts'),
|
|
145
192
|
] : undefined;
|
|
146
|
-
|
|
147
|
-
|
|
193
|
+
let voicePath = null;
|
|
194
|
+
try {
|
|
195
|
+
const defaultVoice = typeof serverTTS.findVoiceFile === 'function'
|
|
196
|
+
? (serverTTS.findVoiceFile('custom_cleetus', EXTRA_VOICE_DIRS) || '/config/voices/cleetus.wav')
|
|
197
|
+
: '/config/voices/cleetus.wav';
|
|
198
|
+
voicePath = fs.existsSync(defaultVoice) ? defaultVoice : null;
|
|
199
|
+
} catch(e) {}
|
|
148
200
|
serverTTS.start(voicePath, binaryPaths ? { binaryPaths } : {}).then(ok => {
|
|
149
201
|
if (ok) console.log('[TTS] pocket-tts sidecar started');
|
|
150
|
-
else console.log('[TTS] pocket-tts
|
|
202
|
+
else console.log('[TTS] pocket-tts unavailable, edge-tts fallback active:', !!edgeTTS);
|
|
151
203
|
}).catch(err => {
|
|
152
204
|
console.error('[TTS] pocket-tts start error:', err.message);
|
|
153
205
|
});
|
|
154
206
|
}
|
|
155
207
|
|
|
156
208
|
function ttsCacheKey(text, voiceId) {
|
|
157
|
-
return typeof serverTTS.ttsCacheKey === 'function' ? serverTTS.ttsCacheKey(text, voiceId) : null;
|
|
209
|
+
return serverTTS && typeof serverTTS.ttsCacheKey === 'function' ? serverTTS.ttsCacheKey(text, voiceId) : null;
|
|
158
210
|
}
|
|
159
211
|
|
|
160
212
|
function ttsCacheGet(key) {
|
|
161
|
-
return typeof serverTTS.ttsCacheGet === 'function' ? serverTTS.ttsCacheGet(key) : null;
|
|
213
|
+
return serverTTS && typeof serverTTS.ttsCacheGet === 'function' ? serverTTS.ttsCacheGet(key) : null;
|
|
162
214
|
}
|
|
163
215
|
|
|
164
216
|
function splitSentences(text) {
|
|
165
|
-
return typeof serverTTS.splitSentences === 'function' ? serverTTS.splitSentences(text) : [text];
|
|
217
|
+
return serverTTS && typeof serverTTS.splitSentences === 'function' ? serverTTS.splitSentences(text) : [text];
|
|
166
218
|
}
|
|
167
219
|
|
|
168
220
|
export { transcribe, synthesize, synthesizeStream, getSTT, getStatus, getVoices, preloadTTS, ttsCacheKey, ttsCacheGet, splitSentences };
|