@mooncompany/uplink-chat 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @mooncompany/uplink-chat might be problematic. Click here for more details.
- package/LICENSE +21 -0
- package/README.md +185 -0
- package/bin/uplink.js +279 -0
- package/middleware/error-handler.js +69 -0
- package/package.json +93 -0
- package/public/css/agents.36b98c0f.css +1469 -0
- package/public/css/agents.css +1469 -0
- package/public/css/app.a6a7f8f5.css +2731 -0
- package/public/css/app.css +2731 -0
- package/public/css/artifacts.css +444 -0
- package/public/css/commands.css +55 -0
- package/public/css/connection.css +131 -0
- package/public/css/dashboard.css +233 -0
- package/public/css/developer.css +328 -0
- package/public/css/files.css +123 -0
- package/public/css/markdown.css +156 -0
- package/public/css/message-actions.css +278 -0
- package/public/css/mobile.css +614 -0
- package/public/css/panels-unified.css +483 -0
- package/public/css/premium.css +415 -0
- package/public/css/realtime.css +189 -0
- package/public/css/satellites.css +401 -0
- package/public/css/shortcuts.css +185 -0
- package/public/css/split-view.4def0262.css +673 -0
- package/public/css/split-view.css +673 -0
- package/public/css/theme-generator.css +391 -0
- package/public/css/themes.css +387 -0
- package/public/css/timestamps.css +54 -0
- package/public/css/variables.css +78 -0
- package/public/dist/bundle.b55050c4.js +15757 -0
- package/public/favicon.svg +24 -0
- package/public/img/agents/ada.png +0 -0
- package/public/img/agents/clarice.png +0 -0
- package/public/img/agents/dennis-nedry.png +0 -0
- package/public/img/agents/elliot-alderson.png +0 -0
- package/public/img/agents/main.png +0 -0
- package/public/img/agents/scotty.png +0 -0
- package/public/img/agents/top-flight-security.png +0 -0
- package/public/index.html +1083 -0
- package/public/js/agents-data.js +234 -0
- package/public/js/agents-ui.js +72 -0
- package/public/js/agents.js +1525 -0
- package/public/js/app.js +79 -0
- package/public/js/appearance-settings.js +111 -0
- package/public/js/artifacts.js +432 -0
- package/public/js/audio-queue.js +168 -0
- package/public/js/bootstrap.js +54 -0
- package/public/js/chat.js +1211 -0
- package/public/js/commands.js +581 -0
- package/public/js/connection-api.js +121 -0
- package/public/js/connection.js +1231 -0
- package/public/js/context-tracker.js +271 -0
- package/public/js/core.js +172 -0
- package/public/js/dashboard.js +452 -0
- package/public/js/developer.js +432 -0
- package/public/js/encryption.js +124 -0
- package/public/js/errors.js +122 -0
- package/public/js/event-bus.js +77 -0
- package/public/js/fetch-utils.js +171 -0
- package/public/js/file-handler.js +229 -0
- package/public/js/files.js +352 -0
- package/public/js/gateway-chat.js +538 -0
- package/public/js/logger.js +112 -0
- package/public/js/markdown.js +190 -0
- package/public/js/message-actions.js +431 -0
- package/public/js/message-renderer.js +288 -0
- package/public/js/missed-messages.js +235 -0
- package/public/js/mobile-debug.js +95 -0
- package/public/js/notifications.js +367 -0
- package/public/js/offline-queue.js +178 -0
- package/public/js/onboarding.js +543 -0
- package/public/js/panels.js +156 -0
- package/public/js/premium.js +412 -0
- package/public/js/realtime-voice.js +844 -0
- package/public/js/satellite-sync.js +256 -0
- package/public/js/satellite-ui.js +175 -0
- package/public/js/satellites.js +1516 -0
- package/public/js/settings.js +1087 -0
- package/public/js/shortcuts.js +381 -0
- package/public/js/split-chat.js +1234 -0
- package/public/js/split-resize.js +211 -0
- package/public/js/splitview.js +340 -0
- package/public/js/storage.js +408 -0
- package/public/js/streaming-handler.js +324 -0
- package/public/js/stt-settings.js +316 -0
- package/public/js/theme-generator.js +661 -0
- package/public/js/themes.js +164 -0
- package/public/js/timestamps.js +198 -0
- package/public/js/tts-settings.js +575 -0
- package/public/js/ui.js +267 -0
- package/public/js/update-notifier.js +143 -0
- package/public/js/utils/constants.js +165 -0
- package/public/js/utils/sanitize.js +93 -0
- package/public/js/utils/sse-parser.js +195 -0
- package/public/js/voice.js +883 -0
- package/public/manifest.json +58 -0
- package/public/moon_texture.jpg +0 -0
- package/public/sw.js +221 -0
- package/public/three.min.js +6 -0
- package/server/channel.js +529 -0
- package/server/chat.js +270 -0
- package/server/config-store.js +362 -0
- package/server/config.js +159 -0
- package/server/context.js +131 -0
- package/server/gateway-commands.js +211 -0
- package/server/gateway-proxy.js +318 -0
- package/server/index.js +22 -0
- package/server/logger.js +89 -0
- package/server/middleware/auth.js +188 -0
- package/server/middleware.js +218 -0
- package/server/openclaw-discover.js +308 -0
- package/server/premium/index.js +156 -0
- package/server/premium/license.js +140 -0
- package/server/realtime/bridge.js +837 -0
- package/server/realtime/index.js +349 -0
- package/server/realtime/tts-stream.js +446 -0
- package/server/routes/agents.js +564 -0
- package/server/routes/artifacts.js +174 -0
- package/server/routes/chat.js +311 -0
- package/server/routes/config-settings.js +345 -0
- package/server/routes/config.js +603 -0
- package/server/routes/files.js +307 -0
- package/server/routes/index.js +18 -0
- package/server/routes/media.js +451 -0
- package/server/routes/missed-messages.js +107 -0
- package/server/routes/premium.js +75 -0
- package/server/routes/push.js +156 -0
- package/server/routes/satellite.js +406 -0
- package/server/routes/status.js +251 -0
- package/server/routes/stt.js +35 -0
- package/server/routes/voice.js +260 -0
- package/server/routes/webhooks.js +203 -0
- package/server/routes.js +206 -0
- package/server/runtime-config.js +336 -0
- package/server/share.js +305 -0
- package/server/stt/faster-whisper.js +72 -0
- package/server/stt/groq.js +51 -0
- package/server/stt/index.js +196 -0
- package/server/stt/openai.js +49 -0
- package/server/sync.js +244 -0
- package/server/tailscale-https.js +175 -0
- package/server/tts.js +646 -0
- package/server/update-checker.js +172 -0
- package/server/utils/filename.js +129 -0
- package/server/utils.js +147 -0
- package/server/watchdog.js +318 -0
- package/server/websocket/broadcast.js +359 -0
- package/server/websocket/connections.js +339 -0
- package/server/websocket/index.js +215 -0
- package/server/websocket/routing.js +277 -0
- package/server/websocket/sync.js +102 -0
- package/server.js +404 -0
- package/utils/detect-tool-usage.js +93 -0
- package/utils/errors.js +158 -0
- package/utils/html-escape.js +84 -0
- package/utils/id-sanitize.js +94 -0
- package/utils/response.js +130 -0
- package/utils/with-retry.js +105 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TTS Streaming Module — Streams text-to-speech as PCM 24kHz 16-bit mono chunks
|
|
3
|
+
*
|
|
4
|
+
* Supports two engines:
|
|
5
|
+
* - 'openai' — OpenAI TTS API (POST /v1/audio/speech, response_format=pcm)
|
|
6
|
+
* - 'edge' — Microsoft Edge TTS via raw WebSocket (no npm dependency)
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* import { streamTTS, splitSentences } from './tts-stream.js';
|
|
10
|
+
*
|
|
11
|
+
* for await (const pcmChunk of streamTTS('Hello world', config)) {
|
|
12
|
+
* // pcmChunk is a Buffer of PCM 24kHz 16-bit mono audio
|
|
13
|
+
* }
|
|
14
|
+
*
|
|
15
|
+
* Config shape:
|
|
16
|
+
* {
|
|
17
|
+
* engine: 'openai' | 'edge',
|
|
18
|
+
* openaiApiKey: string,
|
|
19
|
+
* openaiTtsVoice: string, // alloy, echo, fable, onyx, nova, shimmer
|
|
20
|
+
* openaiTtsModel: string, // tts-1, tts-1-hd
|
|
21
|
+
* edgeTtsVoice: string, // e.g. 'en-US-AriaNeural'
|
|
22
|
+
* }
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import { WebSocket } from 'ws';
|
|
26
|
+
import { randomUUID } from 'crypto';
|
|
27
|
+
import { createLogger } from '../logger.js';
|
|
28
|
+
|
|
29
|
+
// Module-scoped logger, namespaced so tts-stream output can be filtered.
const log = createLogger('tts-stream');

// ─── Constants ──────────────────────────────────────────────────────────────

// OpenAI speech-synthesis endpoint (POST, JSON body, streamed audio response).
const OPENAI_TTS_URL = 'https://api.openai.com/v1/audio/speech';
// Abort the OpenAI request if the response has not arrived within this window.
const OPENAI_TTS_TIMEOUT_MS = 30_000;

// Edge TTS WebSocket endpoint (Microsoft Cognitive Services)
const EDGE_TTS_WS_URL = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1';
// Publicly known client token used by the Edge read-aloud feature.
// NOTE(review): hard-coded upstream value — synthesis will break if Microsoft rotates it.
const EDGE_TTS_TRUSTED_TOKEN = '6A5AA1D4EAFF4E9FB37E23D68491D6F4';
// Give up on an Edge TTS synthesis that has not completed within this window.
const EDGE_TTS_TIMEOUT_MS = 30_000;
|
|
40
|
+
|
|
41
|
+
// ─── Sentence Splitting ────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
/**
 * Break a block of text into sentence-sized chunks suitable for
 * incremental streaming TTS.
 *
 * Newlines act as hard boundaries; within each paragraph the text is
 * divided at sentence-ending punctuation (. ! ?), which stays attached
 * to its sentence. Empty fragments are discarded.
 *
 * @param {string} text - Full response text
 * @returns {string[]} Non-empty sentence chunks, in order
 */
export function splitSentences(text) {
  if (!text || typeof text !== 'string') return [];

  const result = [];

  // Paragraph boundaries first: one or more consecutive newlines.
  for (const paragraph of text.split(/\n+/)) {
    const body = paragraph.trim();
    if (!body) continue;

    // Capture runs ending in sentence punctuation (plus one trailing
    // whitespace char), or a final unpunctuated tail. Ellipses ("...")
    // are covered by the `[.!?]+` run. Falls back to the whole paragraph
    // when no sentence-ending punctuation exists at all.
    const pieces = body.match(/[^.!?]*[.!?]+[\s]?|[^.!?]+$/g) ?? [body];

    for (const piece of pieces) {
      const sentence = piece.trim();
      if (sentence) result.push(sentence);
    }
  }

  return result;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Normalize text for speech synthesis by stripping markdown syntax,
 * emoji code points, and redundant whitespace.
 * @param {string} text - Raw (possibly markdown-formatted) text
 * @returns {string} Plain text suitable for a TTS engine
 */
function cleanForTTS(text) {
  // Ordered rewrite rules — order matters (e.g. ** before *).
  const rules = [
    [/```[\s\S]*?```/g, ''],           // fenced code blocks dropped entirely
    [/`([^`]+)`/g, '$1'],              // inline code keeps just the text
    [/\*\*([^*]+)\*\*/g, '$1'],        // **bold**
    [/\*([^*]+)\*/g, '$1'],            // *italic*
    [/__([^_]+)__/g, '$1'],            // __bold__ (alt)
    [/_([^_]+)_/g, '$1'],              // _italic_ (alt)
    [/~~([^~]+)~~/g, '$1'],            // ~~strikethrough~~
    [/\[([^\]]*)\]\([^)]*\)/g, '$1'],  // [text](url) keeps the link text
    [/#{1,6}\s/g, ''],                 // heading markers
    [/^[-*+]\s/gm, ''],                // bullet-list markers
    [/^\d+\.\s/gm, ''],                // numbered-list markers
    [/^>\s?/gm, ''],                   // block-quote markers
    [/\|/g, ' '],                      // table pipes become spaces
    [/---+/g, ''],                     // horizontal rules
    [/[\u{1F600}-\u{1F64F}]/gu, ''],   // emoticons
    [/[\u{1F300}-\u{1F5FF}]/gu, ''],   // symbols & pictographs
    [/[\u{1F680}-\u{1F6FF}]/gu, ''],   // transport & map symbols
    [/[\u{2600}-\u{26FF}]/gu, ''],     // misc symbols
    [/[\u{2700}-\u{27BF}]/gu, ''],     // dingbats
    [/\s+/g, ' '],                     // collapse all whitespace runs
  ];

  let cleaned = text;
  for (const [pattern, replacement] of rules) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return cleaned.trim();
}
|
|
110
|
+
|
|
111
|
+
// ─── OpenAI TTS Streaming ──────────────────────────────────────────────────
|
|
112
|
+
|
|
113
|
+
/**
 * Stream TTS from the OpenAI API as PCM 24kHz 16-bit mono chunks.
 *
 * OpenAI's /v1/audio/speech endpoint with response_format=pcm returns
 * raw PCM 24000 Hz, 16-bit signed little-endian mono audio. Samples are
 * 2 bytes wide, so chunks are re-aligned to an even byte count before
 * being yielded (a trailing odd byte is carried into the next chunk, or
 * zero-padded at end of stream).
 *
 * The abort timeout covers only the request/headers phase; once the
 * response starts streaming, the body read is not time-limited (this
 * matches the original behavior — the timer is cleared when headers arrive).
 *
 * @param {string} text - Text to synthesize
 * @param {Object} config - TTS configuration (openaiApiKey, openaiTtsVoice, openaiTtsModel)
 * @yields {Buffer} PCM audio chunks
 * @throws {Error} When the API key is missing, the request times out,
 *   or the API responds with a non-2xx status
 */
async function* streamOpenAITTS(text, config) {
  const apiKey = config.openaiApiKey;
  if (!apiKey) {
    throw new Error('OpenAI API key required for OpenAI TTS');
  }

  const voice = config.openaiTtsVoice || 'nova';
  const model = config.openaiTtsModel || 'tts-1';
  const cleanText = cleanForTTS(text);

  if (!cleanText) {
    log.warn('Empty text after cleaning — skipping TTS');
    return;
  }

  log.debug(`OpenAI TTS: voice=${voice}, model=${model}, text="${cleanText.substring(0, 80)}..."`);

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), OPENAI_TTS_TIMEOUT_MS);

  let response;
  try {
    response = await fetch(OPENAI_TTS_URL, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model,
        input: cleanText,
        voice,
        response_format: 'pcm', // Raw PCM 24kHz 16-bit mono
      }),
      signal: controller.signal,
    });
  } catch (err) {
    if (err.name === 'AbortError') {
      throw new Error('OpenAI TTS request timed out');
    }
    throw err;
  } finally {
    // Clear exactly once, on success or failure (the original cleared the
    // timer on two separate code paths).
    clearTimeout(timeout);
  }

  if (!response.ok) {
    const errorText = await response.text().catch(() => 'Unknown error');
    throw new Error(`OpenAI TTS API error ${response.status}: ${errorText}`);
  }

  if (!response.body) {
    throw new Error('OpenAI TTS response has no body');
  }

  // Stream the response body, maintaining 2-byte sample alignment.
  const reader = response.body.getReader();
  let remainder = null; // odd trailing byte carried over from the last chunk
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      if (!value || value.length === 0) continue;

      let chunk = Buffer.from(value);

      // Re-attach the byte left over from the previous chunk.
      if (remainder) {
        chunk = Buffer.concat([remainder, chunk]);
        remainder = null;
      }

      // An odd-length chunk would split a 16-bit sample; hold the last byte.
      if (chunk.length % 2 !== 0) {
        remainder = chunk.subarray(-1);
        chunk = chunk.subarray(0, -1);
      }

      if (chunk.length > 0) {
        yield chunk;
      }
    }
    // End of stream with a dangling byte: zero-pad to a full sample.
    if (remainder && remainder.length > 0) {
      yield Buffer.concat([remainder, Buffer.alloc(1)]);
    }
  } finally {
    reader.releaseLock();
  }
}
|
|
213
|
+
|
|
214
|
+
// ─── Edge TTS Streaming ────────────────────────────────────────────────────
|
|
215
|
+
|
|
216
|
+
/**
 * Build the SSML payload for an Edge TTS synthesis request.
 *
 * The input text is XML-escaped so that user-provided content cannot
 * break out of the SSML document or inject markup. (The previous
 * implementation "replaced" each special character with itself, leaving
 * the text unescaped — any '<' or '&' produced malformed SSML.)
 *
 * @param {string} text - Text to speak (plain text; will be XML-escaped)
 * @param {string} voice - Voice name (e.g. 'en-US-AriaNeural'); trusted config value
 * @returns {string} SSML document string
 */
function buildSSML(text, voice) {
  // Escape XML special characters. '&' must be escaped first so the
  // other entity replacements are not double-escaped.
  const escaped = text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');

  return `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>` +
    `<voice name='${voice}'>${escaped}</voice></speak>`;
}
|
|
234
|
+
|
|
235
|
+
/**
 * Produce a request/connection identifier in the format Edge TTS
 * expects: a UUID with the dashes removed (32 hex characters).
 * @returns {string} 32-character lowercase hex ID
 */
function edgeRequestId() {
  return randomUUID().split('-').join('');
}
|
|
242
|
+
|
|
243
|
+
/**
 * Build the Edge TTS WebSocket URL, attaching the trusted-client token
 * and a fresh connection ID as query parameters.
 * @returns {string} Fully-qualified wss:// URL
 */
function buildEdgeTTSUrl() {
  const query = `TrustedClientToken=${EDGE_TTS_TRUSTED_TOKEN}&ConnectionId=${edgeRequestId()}`;
  return `${EDGE_TTS_WS_URL}?${query}`;
}
|
|
250
|
+
|
|
251
|
+
/**
 * Stream TTS from Edge TTS via raw WebSocket as PCM 24kHz 16-bit mono.
 *
 * Edge TTS natively outputs MP3. To get PCM we request the
 * `raw-24khz-16bit-mono-pcm` output format in the speech.config message.
 *
 * Wire protocol (as implemented here):
 *  1. open the WebSocket with the trusted-client token in the URL
 *  2. send a `speech.config` text message selecting the output format
 *  3. send an `ssml` text message carrying an X-RequestId
 *  4. binary frames arrive as: 2-byte big-endian header length, header
 *     text, then raw audio bytes
 *  5. a text frame containing `Path:turn.end` marks completion
 *
 * Fixes over the previous revision: removed the unused `chunks`,
 * `resolve`, `reject` locals, and added a `finally` around the pull loop
 * so the socket and timeout are released even if the consumer abandons
 * the generator early (break / return / throw).
 *
 * @param {string} text - Text to synthesize
 * @param {Object} config - TTS configuration (uses `edgeTtsVoice`)
 * @yields {Buffer} PCM audio chunks (24kHz, 16-bit signed LE, mono)
 * @throws {Error} On connection timeout or WebSocket error
 */
async function* streamEdgeTTS(text, config) {
  const voice = config.edgeTtsVoice || 'en-US-AriaNeural';
  const cleanText = cleanForTTS(text);

  if (!cleanText) {
    log.warn('Empty text after cleaning — skipping Edge TTS');
    return;
  }

  log.debug(`Edge TTS: voice=${voice}, text="${cleanText.substring(0, 80)}..."`);

  const requestId = edgeRequestId();

  // Bridge the event-driven WebSocket into this pull-based async generator:
  // audio buffers pile up in `queue`; when the consumer is already awaiting,
  // `waiter` holds its promise resolver so the next chunk is handed straight over.
  const queue = [];
  let waiter = null;   // pending resolver from the consumer, if any
  let done = false;    // stream finished (turn.end, close, or error)
  let error = null;    // terminal error, surfaced after the queue drains

  /** Hand a chunk to a waiting consumer, or buffer it for later. */
  function enqueue(chunk) {
    if (waiter) {
      const w = waiter;
      waiter = null;
      w({ value: chunk, done: false });
    } else {
      queue.push(chunk);
    }
  }

  /** Mark the stream finished, optionally with a terminal error. */
  function finish(err) {
    done = true;
    error = err || null;
    if (waiter) {
      const w = waiter;
      waiter = null;
      if (err) {
        // Resolving with a rejected promise makes the consumer's `await` throw.
        w(Promise.reject(err));
      } else {
        w({ value: undefined, done: true });
      }
    }
  }

  // Connect. Edge TTS only accepts connections that look like the Edge
  // browser / its read-aloud extension, hence the spoofed headers.
  const ws = new WebSocket(buildEdgeTTSUrl(), {
    headers: {
      'Origin': 'chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
    },
  });

  // Abort the whole synthesis if it does not complete in time.
  const connectionTimeout = setTimeout(() => {
    ws.close();
    finish(new Error('Edge TTS connection timed out'));
  }, EDGE_TTS_TIMEOUT_MS);

  ws.on('open', () => {
    log.debug('Edge TTS WebSocket connected');

    // 1. Send speech config selecting raw PCM output.
    const configMessage =
      `Content-Type:application/json; charset=utf-8\r\n` +
      `Path:speech.config\r\n\r\n` +
      JSON.stringify({
        context: {
          synthesis: {
            audio: {
              metadataOptions: { sentenceBoundaryEnabled: false, wordBoundaryEnabled: false },
              outputFormat: 'raw-24khz-16bit-mono-pcm',
            },
          },
        },
      });
    ws.send(configMessage);

    // 2. Send SSML synthesis request.
    const ssml = buildSSML(cleanText, voice);
    const synthMessage =
      `X-RequestId:${requestId}\r\n` +
      `Content-Type:application/ssml+xml\r\n` +
      `Path:ssml\r\n\r\n` +
      ssml;
    ws.send(synthMessage);
  });

  ws.on('message', (data, isBinary) => {
    if (isBinary) {
      // Binary frame: 2-byte big-endian header length, header text, audio.
      const buf = Buffer.isBuffer(data) ? data : Buffer.from(data);

      if (buf.length > 2) {
        const headerLen = buf.readUInt16BE(0);
        if (headerLen > 0 && headerLen < buf.length) {
          const audioData = buf.subarray(2 + headerLen);
          if (audioData.length > 0) {
            enqueue(audioData);
          }
        }
      }
    } else {
      // Text frame — `turn.end` signals the synthesis is complete.
      const msg = data.toString();
      if (msg.includes('Path:turn.end')) {
        clearTimeout(connectionTimeout);
        ws.close();
        finish(null);
      }
    }
  });

  ws.on('error', (err) => {
    clearTimeout(connectionTimeout);
    log.error('Edge TTS WebSocket error:', err.message);
    finish(new Error(`Edge TTS error: ${err.message}`));
  });

  ws.on('close', () => {
    clearTimeout(connectionTimeout);
    if (!done) {
      finish(null);
    }
  });

  // Pull loop: drain buffered chunks, then wait for more until finished.
  try {
    while (true) {
      if (queue.length > 0) {
        yield queue.shift();
      } else if (done) {
        if (error) throw error;
        return;
      } else {
        const result = await new Promise((res) => {
          waiter = res;
        });
        if (result.done) return;
        yield result.value;
      }
    }
  } finally {
    // The consumer may abandon the generator early — make sure the timer
    // and socket do not leak. ws.close() is a no-op on a closed socket.
    clearTimeout(connectionTimeout);
    ws.close();
  }
}
|
|
411
|
+
|
|
412
|
+
// ─── Public API ─────────────────────────────────────────────────────────────
|
|
413
|
+
|
|
414
|
+
/**
 * Stream TTS audio as PCM 24kHz 16-bit mono chunks, dispatching to the
 * configured engine.
 *
 * @param {string} text - Text to synthesize (single sentence or short chunk)
 * @param {Object} [config] - TTS configuration (defaults to {}; previously
 *   an omitted config threw a TypeError on `config.engine`)
 * @param {string} [config.engine] - 'openai' or 'edge' (default: 'openai')
 * @param {string} [config.openaiApiKey] - Required for OpenAI engine
 * @param {string} [config.openaiTtsVoice] - OpenAI voice (default: 'nova')
 * @param {string} [config.openaiTtsModel] - OpenAI model (default: 'tts-1')
 * @param {string} [config.edgeTtsVoice] - Edge TTS voice (default: 'en-US-AriaNeural')
 * @yields {Buffer} PCM audio chunks (24kHz, 16-bit signed LE, mono)
 * @throws {Error} When `config.engine` names an unknown engine
 */
export async function* streamTTS(text, config = {}) {
  if (!text || typeof text !== 'string') {
    log.warn('streamTTS called with empty text');
    return;
  }

  // `||` (not `??`) so an empty-string engine also falls back to the default.
  const engine = config.engine || 'openai';

  switch (engine) {
    case 'openai':
      yield* streamOpenAITTS(text, config);
      break;

    case 'edge':
      yield* streamEdgeTTS(text, config);
      break;

    default:
      throw new Error(`Unknown TTS engine: ${engine}`);
  }
}
|