@acpfx/tts-deepgram 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/manifest.yaml +12 -0
- package/package.json +16 -0
- package/src/index.ts +316 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# @acpfx/tts-deepgram
|
|
2
|
+
|
|
3
|
+
## 0.2.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
|
|
8
|
+
|
|
9
|
+
- Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
|
|
10
|
+
- Node manifests (manifest.yaml) declaring consumes/emits contracts
|
|
11
|
+
- Orchestrator event filtering: nodes only receive declared events
|
|
12
|
+
- Rust orchestrator with ratatui TUI (--ui flag)
|
|
13
|
+
- node-sdk with structured logging helpers
|
|
14
|
+
- CI/CD with GitHub Actions and changesets
|
|
15
|
+
- Platform-specific npm packages for Rust binaries (esbuild-style distribution)
|
|
16
|
+
|
|
17
|
+
### Patch Changes
|
|
18
|
+
|
|
19
|
+
- Updated dependencies [d757640]
|
|
20
|
+
- @acpfx/core@0.2.0
|
|
21
|
+
- @acpfx/node-sdk@0.2.0
|
package/manifest.yaml
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@acpfx/tts-deepgram",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"bin": {
|
|
6
|
+
"acpfx-tts-deepgram": "./dist/index.js"
|
|
7
|
+
},
|
|
8
|
+
"main": "./dist/index.js",
|
|
9
|
+
"dependencies": {
|
|
10
|
+
"@acpfx/core": "0.2.0",
|
|
11
|
+
"@acpfx/node-sdk": "0.2.0"
|
|
12
|
+
},
|
|
13
|
+
"scripts": {
|
|
14
|
+
"build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external"
|
|
15
|
+
}
|
|
16
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tts-deepgram node — Deepgram Aura streaming TTS via WebSocket.
|
|
3
|
+
*
|
|
4
|
+
* Reads agent.delta events, streams text tokens to Deepgram WebSocket,
|
|
5
|
+
* emits audio.chunk events as audio arrives.
|
|
6
|
+
*
|
|
7
|
+
* True streaming: sends each delta token as it arrives via {"type":"Speak","text":"..."}.
|
|
8
|
+
* Explicit segment control:
|
|
9
|
+
* - Flush on agent.tool_start and agent.complete (finalize current segment)
|
|
10
|
+
* - Clear on control.interrupt (discard buffered text)
|
|
11
|
+
*
|
|
12
|
+
* Settings (via ACPFX_SETTINGS):
|
|
13
|
+
* voice?: string — Deepgram voice model (default: aura-2-apollo-en)
|
|
14
|
+
* apiKey?: string — API key (falls back to DEEPGRAM_API_KEY env)
|
|
15
|
+
* sampleRate?: number — output sample rate (default: 16000)
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { createInterface } from "node:readline";
|
|
19
|
+
import { emit, log, handleManifestFlag } from "@acpfx/node-sdk";
|
|
20
|
+
|
|
21
|
+
// Must run before any configuration is read.
// NOTE(review): presumably prints the manifest and exits when the manifest
// flag is present — behavior is defined in @acpfx/node-sdk; confirm there.
handleManifestFlag();

const WS_URL = "wss://api.deepgram.com/v1/speak";
const DEFAULT_VOICE = "aura-2-apollo-en";
const DEFAULT_SAMPLE_RATE = 16000;
const TRACK_ID = "tts";

/** Node settings, supplied as a JSON object in the ACPFX_SETTINGS env var. */
type Settings = {
  voice?: string;
  apiKey?: string;
  sampleRate?: number;
};

/**
 * Parse ACPFX_SETTINGS.
 *
 * An unset or empty variable yields `{}`. Malformed JSON or a non-object value
 * is reported through `log.error` and terminates the process with exit code 1
 * (the same way a missing API key is handled) instead of surfacing as an
 * uncaught exception at module load.
 */
function loadSettings(): Settings {
  const raw = process.env.ACPFX_SETTINGS;
  if (!raw) return {};
  try {
    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
      return parsed as Settings;
    }
    log.error("ACPFX_SETTINGS must be a JSON object");
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    log.error(`ACPFX_SETTINGS is not valid JSON: ${reason}`);
  }
  process.exit(1);
}

/**
 * Resolve the output sample rate.
 *
 * Numeric strings are accepted (they were usable before through implicit
 * coercion). Anything that is not a positive finite number falls back to the
 * default: a rate of 0 would make CHUNK_SIZE 0 and the chunking loop would
 * never terminate.
 */
function resolveSampleRate(value: unknown): number {
  if (value === undefined || value === null) return DEFAULT_SAMPLE_RATE;
  const rate = Number(value);
  if (Number.isFinite(rate) && rate > 0) return rate;
  log.warn(
    `Ignoring invalid sampleRate ${JSON.stringify(value)}; using ${DEFAULT_SAMPLE_RATE}`,
  );
  return DEFAULT_SAMPLE_RATE;
}

const settings: Settings = loadSettings();
// `||` rather than `??`: an empty string in settings must not mask the fallback.
const API_KEY = settings.apiKey || process.env.DEEPGRAM_API_KEY || "";
const VOICE = settings.voice || DEFAULT_VOICE;
const SAMPLE_RATE = resolveSampleRate(settings.sampleRate);
const CHANNELS = 1;
const BYTES_PER_SAMPLE = 2; // 16-bit samples (linear16 / pcm_s16le)
const CHUNK_DURATION_MS = 100;
// Number of PCM bytes in one emitted audio.chunk of CHUNK_DURATION_MS.
const CHUNK_SIZE = Math.floor(
  (SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE * CHUNK_DURATION_MS) / 1000,
);

if (!API_KEY) {
  log.error("No API key. Set DEEPGRAM_API_KEY or settings.apiKey");
  process.exit(1);
}
|
|
48
|
+
|
|
49
|
+
// Current Deepgram socket; set by openWebSocket() and reset to null by closeWebSocket().
let ws: WebSocket | null = null;
// True between the socket's "open" event and its "close" event / closeWebSocket().
let connected = false;
// Set on control.interrupt and cleared when the next agent.delta reconnects;
// while set, incoming socket messages are ignored.
let interrupted = false;
// PCM bytes received from Deepgram that have not yet been emitted as an audio.chunk.
let pcmBuffer = Buffer.alloc(0);
// requestId of the latest agent.delta; cleared on agent.complete and control.interrupt.
// NOTE(review): written but never read in this file.
let currentRequestId: string | null = null;
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
/**
 * Open the Deepgram streaming-TTS WebSocket and attach its handlers.
 *
 * No-op when a socket is already open. Resolves once the socket's "open"
 * event has fired; rejects with an Error if the connection attempt fails.
 *
 * Every handler is bound to the socket created by this call and ignores
 * events once that socket is no longer the module's current `ws`. Without
 * this guard, the delayed "close" event of a socket that was just replaced
 * would flip `connected` back to false for the new connection, and late audio
 * from the old socket would leak into `pcmBuffer`.
 */
async function openWebSocket(): Promise<void> {
  if (ws && connected) return;

  const url =
    `${WS_URL}?model=${encodeURIComponent(VOICE)}` +
    `&encoding=linear16` +
    `&sample_rate=${SAMPLE_RATE}`;

  // The API key is passed as a WebSocket subprotocol pair.
  const socket = new WebSocket(url, ["token", API_KEY]);
  // Deliver binary frames synchronously as ArrayBuffers. With the default
  // Blob delivery the bytes are only available after an async read, so a
  // "Flushed" text frame could be handled before the audio that preceded it.
  socket.binaryType = "arraybuffer";
  ws = socket;

  const isCurrent = (): boolean => ws === socket;

  await new Promise<void>((resolve, reject) => {
    socket.addEventListener(
      "open",
      () => {
        if (isCurrent()) connected = true;
        log.info("Connected to Deepgram TTS");
        resolve();
      },
      { once: true },
    );
    socket.addEventListener(
      "error",
      () => {
        // Do not keep a reference to a socket that never opened.
        if (isCurrent() && !connected) ws = null;
        reject(new Error("TTS WebSocket connection failed"));
      },
      { once: true },
    );
  });

  socket.addEventListener("message", (event: MessageEvent) => {
    if (interrupted || !isCurrent()) return;

    const data: unknown = event.data;

    // Binary frame: raw PCM audio.
    if (data instanceof ArrayBuffer) {
      handleAudioData(Buffer.from(data));
      return;
    }
    if (Buffer.isBuffer(data)) {
      handleAudioData(Buffer.from(data));
      return;
    }

    // Fallback for implementations that still hand out Blobs for binary frames.
    if (
      typeof data === "object" &&
      data !== null &&
      typeof (data as { arrayBuffer?: unknown }).arrayBuffer === "function"
    ) {
      (data as Blob)
        .arrayBuffer()
        .then((ab) => {
          if (interrupted || !isCurrent()) return;
          handleAudioData(Buffer.from(ab));
        })
        .catch((err: unknown) => {
          const reason = err instanceof Error ? err.message : String(err);
          log.warn(`Failed to read audio frame: ${reason}`);
        });
      return;
    }

    // Text frame — metadata/control message.
    if (typeof data === "string") {
      try {
        const msg = JSON.parse(data);
        if (msg.type === "Flushed") {
          // End of a segment: emit whatever is left, even if shorter than CHUNK_SIZE.
          if (pcmBuffer.length > 0) {
            emitAudioChunk(pcmBuffer);
            pcmBuffer = Buffer.alloc(0);
          }
        } else if (msg.type === "Warning") {
          log.warn(`Deepgram warning: ${msg.description ?? msg.code ?? "unknown"}`);
        }
      } catch {
        // Non-JSON text frames carry nothing we act on.
      }
    }
  });

  /** Append PCM bytes and emit every complete CHUNK_SIZE-byte chunk. */
  function handleAudioData(rawPcm: Buffer): void {
    pcmBuffer = Buffer.concat([pcmBuffer, rawPcm]);
    while (pcmBuffer.length >= CHUNK_SIZE) {
      const chunk = pcmBuffer.subarray(0, CHUNK_SIZE);
      pcmBuffer = pcmBuffer.subarray(CHUNK_SIZE);
      emitAudioChunk(chunk);
    }
  }

  socket.addEventListener("error", (event: Event) => {
    if (!isCurrent()) return;
    log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "tts-deepgram",
      message: "TTS WebSocket error",
      fatal: false,
    });
  });

  socket.addEventListener("close", (event: CloseEvent) => {
    log.info(`WebSocket closed (code=${event.code})`);
    // Only the current socket may change the shared connection state.
    if (isCurrent()) connected = false;
  });
}
|
|
145
|
+
|
|
146
|
+
/**
 * Emit one `audio.chunk` event for the given PCM bytes.
 *
 * The payload is base64-encoded and the duration (rounded to whole
 * milliseconds) is derived from the byte count and the configured
 * sample rate, channel count and sample width.
 */
function emitAudioChunk(pcm: Buffer): void {
  const seconds = pcm.length / (SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE);
  const encoded = pcm.toString("base64");

  emit({
    type: "audio.chunk",
    trackId: TRACK_ID,
    format: "pcm_s16le",
    sampleRate: SAMPLE_RATE,
    channels: CHANNELS,
    data: encoded,
    durationMs: Math.round(seconds * 1000),
  });
}
|
|
160
|
+
|
|
161
|
+
/**
 * Strip markdown from streaming tokens.
 *
 * Tokens arrive fragmented (e.g. "**" then "bold" then "**", or "``" then
 * "`"), so pattern-based regexes cannot be used. The text is scanned one
 * character at a time and all parser state is kept in module-level variables
 * so that constructs split across tokens are still recognized.
 */
// Inside a markdown link target: after "](" until the closing ")".
let inUrl = false;
// Inside a fenced code block (between two ``` fences).
let inCodeBlock = false;
// Consecutive backticks seen so far; may span token boundaries.
let backtickRun = 0;
// The previous character was "]", so a following "(" starts a link target.
let sawClosingBracket = false;
// No character has been seen on the current line except heading "#"s.
let atLineStart = true;

/**
 * Return the speakable part of `text`.
 *
 * - Backticks are always removed; every third consecutive backtick toggles
 *   code-block mode, and everything inside a code block is dropped. Text
 *   before and after a fence in the same token is kept.
 * - `[label](target)` keeps `label` and drops `target`, including when "]"
 *   and "(" arrive in different tokens.
 * - `*`, `~`, `[` and `]` are removed.
 * - A run of `#` at the start of a line (heading marker) is removed; a `#`
 *   elsewhere is kept.
 *
 * @param text the next token of the stream
 * @returns the token with markdown syntax removed (possibly empty)
 */
function stripMarkdown(text: string): string {
  let result = "";

  for (const ch of text) {
    if (ch === "`") {
      backtickRun++;
      if (backtickRun === 3) {
        inCodeBlock = !inCodeBlock;
        backtickRun = 0;
      }
      sawClosingBracket = false;
      atLineStart = false;
      continue;
    }
    backtickRun = 0;

    if (inCodeBlock) {
      atLineStart = ch === "\n";
      continue;
    }

    if (inUrl) {
      if (ch === ")") inUrl = false;
      continue; // skip link-target characters
    }

    const followsBracket = sawClosingBracket;
    sawClosingBracket = false;

    if (ch === "(" && followsBracket) {
      inUrl = true;
      continue;
    }

    // Heading marker: keep atLineStart set so "##", "###", ... are removed too.
    if (ch === "#" && atLineStart) continue;
    atLineStart = ch === "\n";

    if (ch === "]") {
      sawClosingBracket = true;
      continue;
    }
    if (ch === "[" || ch === "*" || ch === "~") continue;

    result += ch;
  }

  return result;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Send one text token to Deepgram as a "Speak" message.
 *
 * The token is dropped with a warning when there is no open socket, and
 * silently skipped when nothing remains after markdown stripping.
 */
function sendText(text: string): void {
  const socket = ws;
  if (socket === null || !connected) {
    log.warn(`sendText dropped (connected=${connected}): "${text.slice(0, 30)}"`);
    return;
  }

  const speakable = stripMarkdown(text);
  if (speakable.length > 0) {
    const payload = JSON.stringify({ type: "Speak", text: speakable });
    socket.send(payload);
  }
}
|
|
208
|
+
|
|
209
|
+
/** Send a "Flush" message on the open socket; does nothing when disconnected. */
function flushStream(): void {
  const socket = ws;
  if (socket === null || !connected) {
    return;
  }
  log.debug("Sending Flush");
  const payload = JSON.stringify({ type: "Flush" });
  socket.send(payload);
}
|
|
214
|
+
|
|
215
|
+
/** Send a "Clear" message on the open socket; does nothing when disconnected. */
function clearStream(): void {
  const socket = ws;
  if (socket === null || !connected) {
    return;
  }
  log.debug("Sending Clear");
  const payload = JSON.stringify({ type: "Clear" });
  socket.send(payload);
}
|
|
220
|
+
|
|
221
|
+
/**
 * Tear down the current socket (best effort) and reset connection state.
 *
 * Marks the node disconnected, discards buffered PCM, and clears `ws`.
 * The "Close" message is only sent on an open socket, and `close()` runs in
 * its own guard so that a failing `send()` (e.g. on a socket that is still
 * connecting) can no longer skip the actual close and leak the socket.
 */
function closeWebSocket(): void {
  connected = false;
  pcmBuffer = Buffer.alloc(0);

  const socket = ws;
  ws = null;
  if (!socket) return;

  try {
    if (socket.readyState === WebSocket.OPEN) {
      socket.send(JSON.stringify({ type: "Close" }));
    }
  } catch {
    // Best effort: the socket is being discarded anyway.
  }

  try {
    socket.close();
  } catch {
    // Best effort: nothing more can be done with this socket.
  }
}
|
|
234
|
+
|
|
235
|
+
// --- Main ---
|
|
236
|
+
|
|
237
|
+
/**
 * Entry point: connect to Deepgram, announce readiness, then process
 * newline-delimited JSON events from stdin strictly in arrival order.
 *
 * Handled events:
 * - agent.delta       — send the text token (reconnecting first if the socket
 *                       is closed or the previous response was interrupted)
 * - agent.tool_start  — flush the current segment
 * - agent.complete    — flush the remaining text
 * - control.interrupt — clear buffered text and drop the connection
 *
 * Rejects if the initial connection fails.
 */
async function main(): Promise<void> {
  await openWebSocket();

  emit({ type: "lifecycle.ready", component: "tts-deepgram" });

  const rl = createInterface({ input: process.stdin });

  const eventQueue: string[] = [];
  let processing = false;

  /**
   * Forget markdown parser state at a response boundary, so an unterminated
   * code fence or link in one response cannot mute the next one.
   */
  function resetMarkdownState(): void {
    inUrl = false;
    inCodeBlock = false;
  }

  /** Drain the queue one event at a time; re-entrant calls return immediately. */
  async function processQueue(): Promise<void> {
    if (processing) return;
    processing = true;

    try {
      for (;;) {
        const line = eventQueue.shift();
        if (line === undefined) break;
        try {
          const event: unknown = JSON.parse(line);
          if (typeof event === "object" && event !== null) {
            await handleEvent(event as Record<string, unknown>);
          }
        } catch (err) {
          // Keep the node alive, but make failures (bad JSON, failed
          // reconnect, send errors) visible instead of silently dropping them.
          const reason = err instanceof Error ? err.message : String(err);
          log.warn(`Failed to handle event: ${reason}`);
        }
      }
    } finally {
      processing = false;
    }
  }

  async function handleEvent(event: Record<string, unknown>): Promise<void> {
    if (event.type === "agent.delta") {
      const delta = event.delta;
      if (typeof delta !== "string" || delta.length === 0) return;

      if (interrupted || !connected) {
        log.info(`Reconnecting (interrupted=${interrupted}, connected=${connected})`);
        interrupted = false;
        closeWebSocket();
        await openWebSocket();
      }
      currentRequestId = typeof event.requestId === "string" ? event.requestId : null;
      sendText(delta);
    } else if (event.type === "agent.tool_start" && !interrupted) {
      // Tool call started — flush current segment
      if (connected) {
        log.info("Tool started — flushing TTS segment");
        flushStream();
      }
    } else if (event.type === "agent.complete" && !interrupted) {
      // Agent done — flush remaining text
      flushStream();
      resetMarkdownState();
      currentRequestId = null;
    } else if (event.type === "control.interrupt") {
      interrupted = true;
      // Clear discards buffered text immediately
      clearStream();
      closeWebSocket();
      resetMarkdownState();
      currentRequestId = null;
    }
  }

  rl.on("line", (line) => {
    if (!line.trim()) return;
    eventQueue.push(line);
    // processQueue handles its own errors; the promise is intentionally not awaited.
    void processQueue();
  });

  rl.on("close", () => {
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "tts-deepgram" });
    process.exit(0);
  });

  process.on("SIGTERM", () => {
    closeWebSocket();
    process.exit(0);
  });
}
|
|
312
|
+
|
|
313
|
+
// Startup failure (e.g. the initial connection was rejected): log and exit non-zero.
main().catch((err: unknown) => {
  // A rejection is not guaranteed to be an Error; avoid logging "undefined".
  const reason = err instanceof Error ? err.message : String(err);
  log.error(`Fatal: ${reason}`);
  process.exit(1);
});
|