@acpfx/tts-elevenlabs 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,15 @@
1
+ ISC License
2
+
3
+ Copyright (c) 2024-2026 acpfx contributors
4
+
5
+ Permission to use, copy, modify, and/or distribute this software for any
6
+ purpose with or without fee is hereby granted, provided that the above
7
+ copyright notice and this permission notice appear in all copies.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10
+ REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11
+ AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
12
+ INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
14
+ OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15
+ PERFORMANCE OF THIS SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # @acpfx/tts-elevenlabs
2
+
3
+ Text-to-speech via ElevenLabs streaming API. Converts agent text deltas into audio chunks in real time.
4
+
5
+ ## Usage
6
+
7
+ This package is a pipeline node for [@acpfx/cli](../orchestrator/README.md). See the CLI package for installation and usage.
8
+
9
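+ Requires an ElevenLabs API key, supplied either through the `ELEVENLABS_API_KEY` environment variable or through the `apiKey` setting (the setting takes precedence).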
10
+
11
+ ## Manifest
12
+
13
+ - **Consumes:** `agent.delta`, `agent.complete`, `agent.tool_start`, `control.interrupt`
14
+ - **Emits:** `audio.chunk`, `lifecycle.ready`, `lifecycle.done`, `control.error`
15
+
16
+ ## Settings
17
+
18
+ | Name | Type | Default | Description |
19
+ |------|------|---------|-------------|
20
+ | `voiceId` | string | `21m00Tcm4TlvDq8ikWAM` (Rachel) | ElevenLabs voice ID |
21
+ | `model` | string | `eleven_turbo_v2_5` | ElevenLabs model name |
22
+ | `apiKey` | string | | Overrides `ELEVENLABS_API_KEY` env var |
23
+
24
+ ## Pipeline Example
25
+
26
+ ```yaml
27
+ nodes:
28
+ tts:
29
+ use: "@acpfx/tts-elevenlabs"
30
+ settings: { voiceId: "your-voice-id" }
31
+ outputs: [player]
32
+ env:
33
+ ELEVENLABS_API_KEY: ${ELEVENLABS_API_KEY}
34
+ ```
35
+
36
+ ## External Links
37
+
38
+ - [ElevenLabs](https://elevenlabs.io) -- AI voice platform
39
+ - [ElevenLabs API Docs](https://elevenlabs.io/docs/api-reference) -- API reference
40
+
41
+ ## License
42
+
43
+ ISC
package/dist/index.js ADDED
@@ -0,0 +1,349 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/index.ts
4
+ import { createInterface } from "node:readline";
5
+
6
+ // ../core/src/config.ts
7
+ import { parse as parseYaml } from "yaml";
8
+
9
+ // ../core/src/manifest.ts
10
import { readFileSync, realpathSync } from "node:fs";
11
+ import { join, dirname } from "node:path";
12
+ import { z as z2 } from "zod";
13
+
14
+ // ../core/src/acpfx-flags.ts
15
+ import { z } from "zod";
16
// Reply shape for `--acpfx-setup-check`: whether setup is needed, plus an
// optional human-readable description.
var SetupCheckResponseSchema = z.object({
  needed: z.boolean(),
  description: z.string().optional()
});

// Builds a `{ type, message }` variant for the setup-progress union.
var setupStatusVariant = (kind) => z.object({ type: z.literal(kind), message: z.string() });

// Setup progress messages, discriminated on `type`:
// "progress" (with an optional `pct`), "complete", or "error".
var SetupProgressSchema = z.discriminatedUnion("type", [
  z.object({
    type: z.literal("progress"),
    message: z.string(),
    pct: z.number().optional()
  }),
  setupStatusVariant("complete"),
  setupStatusVariant("error")
]);

// Reply shape used when a node is asked for a flag it does not implement.
var UnsupportedFlagResponseSchema = z.object({
  unsupported: z.boolean(),
  flag: z.string()
});
33
+
34
+ // ../core/src/manifest.ts
35
// Small factories so every field gets its own fresh optional schema instance.
var optionalText = () => z2.string().optional();
var optionalFlag = () => z2.boolean().optional();

// Primitive types a manifest argument may declare.
var ArgumentTypeSchema = z2.enum(["string", "number", "boolean"]);

// One entry of a manifest's `arguments` map.
var ManifestArgumentSchema = z2.object({
  type: ArgumentTypeSchema,
  default: z2.unknown().optional(),
  description: optionalText(),
  required: optionalFlag(),
  enum: z2.array(z2.unknown()).optional()
});

// One entry of a manifest's `env` map.
var ManifestEnvFieldSchema = z2.object({
  required: optionalFlag(),
  description: optionalText()
});

// Full node manifest: identity, consumed/emitted event types, and the
// optional argument and environment declarations.
var NodeManifestSchema = z2.object({
  name: z2.string(),
  description: optionalText(),
  consumes: z2.array(z2.string()),
  emits: z2.array(z2.string()),
  arguments: z2.record(z2.string(), ManifestArgumentSchema).optional(),
  additional_arguments: optionalFlag(),
  env: z2.record(z2.string(), ManifestEnvFieldSchema).optional()
});
56
/**
 * Answers `--acpfx-*` command-line flags (and the legacy `--manifest` flag).
 *
 * Returns without doing anything when none of those flags is present.
 * Otherwise writes one reply to stdout and terminates the process:
 *   - `--acpfx-manifest` / `--manifest`: prints the manifest via printManifest
 *   - `--acpfx-setup-check`: prints `{"needed":false}`
 *   - any other `--acpfx-*` flag: prints `{"unsupported":true,"flag":...}`
 *
 * @param manifestPath optional explicit manifest location, forwarded to printManifest
 */
function handleAcpfxFlags(manifestPath) {
  const argv = process.argv;
  const prefixedFlag = argv.find((arg) => arg.startsWith("--acpfx-"));
  if (prefixedFlag === undefined && !argv.includes("--manifest")) {
    return;
  }
  // A bare legacy `--manifest` is treated as `--acpfx-manifest`.
  const requested = prefixedFlag ?? "--acpfx-manifest";
  if (requested === "--acpfx-manifest") {
    printManifest(manifestPath);
    return;
  }
  const reply = requested === "--acpfx-setup-check" ? { needed: false } : { unsupported: true, flag: requested };
  process.stdout.write(JSON.stringify(reply) + "\n");
  process.exit(0);
}
76
/**
 * Alias for handleAcpfxFlags; forwards the optional manifest path unchanged.
 *
 * @param manifestPath optional explicit manifest location
 */
function handleManifestFlag(manifestPath) {
  return handleAcpfxFlags(manifestPath);
}
79
+ function printManifest(manifestPath) {
80
+ if (!manifestPath) {
81
+ const script = process.argv[1];
82
+ const scriptDir = dirname(script);
83
+ const scriptBase = script.replace(/\.[^.]+$/, "");
84
+ const colocated = `${scriptBase}.manifest.json`;
85
+ try {
86
+ readFileSync(colocated);
87
+ manifestPath = colocated;
88
+ } catch {
89
+ manifestPath = join(scriptDir, "manifest.json");
90
+ }
91
+ }
92
+ try {
93
+ const content = readFileSync(manifestPath, "utf8");
94
+ process.stdout.write(content.trim() + "\n");
95
+ process.exit(0);
96
+ } catch (err) {
97
+ process.stderr.write(`Failed to read manifest: ${err}
98
+ `);
99
+ process.exit(1);
100
+ }
101
+ }
102
+
103
+ // ../node-sdk/src/index.ts
104
+ var NODE_NAME = process.env.ACPFX_NODE_NAME ?? "unknown";
105
+ function emit(event) {
106
+ process.stdout.write(JSON.stringify(event) + "\n");
107
+ }
108
+ function log(level, message) {
109
+ emit({ type: "log", level, component: NODE_NAME, message });
110
+ }
111
+ log.info = (message) => log("info", message);
112
+ log.warn = (message) => log("warn", message);
113
+ log.error = (message) => log("error", message);
114
+ log.debug = (message) => log("debug", message);
115
+
116
+ // src/index.ts
117
// Answer --acpfx-* / --manifest flags first; when one is present the process
// exits here and nothing below runs.
handleManifestFlag();

var WS_BASE_URL = "wss://api.elevenlabs.io/v1/text-to-speech";
var DEFAULT_MODEL = "eleven_turbo_v2_5";
var DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM";
var OUTPUT_FORMAT = "pcm_16000";
// Audio geometry used for chunking and for the emitted audio.chunk events.
var SAMPLE_RATE = 16e3;
var CHANNELS = 1;
var BYTES_PER_SAMPLE = 2;
var CHUNK_DURATION_MS = 100;
// Bytes in one CHUNK_DURATION_MS slice of audio.
var CHUNK_SIZE = Math.floor(
  SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE * CHUNK_DURATION_MS / 1e3
);
var TRACK_ID = "tts";

/**
 * Reads node settings from the ACPFX_SETTINGS environment variable.
 *
 * Returns an empty object when the variable is unset or empty. Malformed JSON
 * is reported through the log channel and ends the process with code 1
 * (instead of an uncaught SyntaxError). A JSON value that is not a plain
 * object (null, array, number, ...) is ignored with a warning so that the
 * defaults apply.
 */
function readSettings() {
  const raw = process.env.ACPFX_SETTINGS;
  if (!raw) return {};
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    log.error(`Invalid ACPFX_SETTINGS: ${err instanceof Error ? err.message : String(err)}`);
    process.exit(1);
  }
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    log.warn("ACPFX_SETTINGS is not a JSON object; using defaults");
    return {};
  }
  return parsed;
}

var settings = readSettings();
// `||` rather than `??`: an empty-string setting counts as "not provided" and
// must not block the environment variable or the built-in default.
var API_KEY = settings.apiKey || process.env.ELEVENLABS_API_KEY || "";
var VOICE_ID = settings.voiceId || DEFAULT_VOICE_ID;
var MODEL = settings.model || DEFAULT_MODEL;
if (!API_KEY) {
  log.error("No API key. Set ELEVENLABS_API_KEY or settings.apiKey");
  process.exit(1);
}

// Connection state shared by the functions below.
var ws = null; // current WebSocket, or null when none is open
var connected = false; // true between the current socket's open and close
var interrupted = false; // set by control.interrupt; incoming audio is dropped while true
var pcmBuffer = Buffer.alloc(0); // PCM bytes received but not yet emitted as a chunk
var currentRequestId = null; // requestId of the most recent agent.delta
143
/**
 * Opens the ElevenLabs streaming TTS WebSocket, unless one is already
 * connected, and wires up its handlers.
 *
 * Resolves after the socket is open and the initial message (voice settings
 * and generation config) has been sent. Rejects with
 * "TTS WebSocket connection failed" if the socket errors before opening.
 *
 * Every handler is bound to the socket created by this call. A socket that has
 * been replaced or closed in the meantime must not touch shared state:
 * `close()` completes asynchronously, so the old socket's `close` event can
 * arrive after a newer socket is already connected.
 */
async function openWebSocket() {
  if (ws && connected) return;
  // NOTE(review): the API key is sent both as a query parameter and in the
  // first message below. URLs are more likely to end up in logs; confirm
  // whether the query parameter is actually required.
  const url = `${WS_BASE_URL}/${encodeURIComponent(VOICE_ID)}/stream-input?model_id=${encodeURIComponent(MODEL)}&output_format=${OUTPUT_FORMAT}&xi_api_key=${encodeURIComponent(API_KEY)}`;
  const socket = new WebSocket(url);
  ws = socket;
  await new Promise((resolve, reject) => {
    socket.addEventListener(
      "open",
      () => {
        connected = true;
        log.info("Connected to ElevenLabs TTS");
        resolve();
      },
      { once: true }
    );
    socket.addEventListener(
      "error",
      () => reject(new Error("TTS WebSocket connection failed")),
      { once: true }
    );
  });
  // Initial message: a single space as text plus voice/generation settings.
  // Uses the local `socket`, since the global `ws` may have been cleared
  // while this function was awaiting the connection.
  socket.send(
    JSON.stringify({
      text: " ",
      xi_api_key: API_KEY,
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.75
      },
      generation_config: {
        chunk_length_schedule: [50]
      }
    })
  );
  socket.addEventListener("message", (event) => {
    // Drop audio after an interrupt, and audio from a superseded socket, so
    // it cannot leak into the buffer of the current stream.
    if (interrupted || ws !== socket) return;
    try {
      const data = typeof event.data === "string" ? event.data : Buffer.from(event.data).toString("utf-8");
      const msg = JSON.parse(data);
      if (msg.audio) {
        const rawPcm = Buffer.from(msg.audio, "base64");
        pcmBuffer = Buffer.concat([pcmBuffer, rawPcm]);
        // Emit fixed-size chunks; any remainder waits for more audio.
        while (pcmBuffer.length >= CHUNK_SIZE) {
          const chunk = pcmBuffer.subarray(0, CHUNK_SIZE);
          pcmBuffer = pcmBuffer.subarray(CHUNK_SIZE);
          emitAudioChunk(chunk);
        }
      }
      if (msg.isFinal) {
        // End of generation: flush whatever is left, even if short.
        if (pcmBuffer.length > 0) {
          emitAudioChunk(pcmBuffer);
          pcmBuffer = Buffer.alloc(0);
        }
      }
    } catch (err) {
      // Best effort: a malformed frame must not take the node down.
      log.debug(`Ignoring unreadable TTS message: ${err instanceof Error ? err.message : String(err)}`);
    }
  });
  socket.addEventListener("error", (event) => {
    log.error(`WebSocket error: ${event.message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "tts-elevenlabs",
      message: "TTS WebSocket error",
      fatal: false
    });
  });
  socket.addEventListener("close", (event) => {
    log.info(`WebSocket closed (code=${event.code}, reason=${event.reason || "none"})`);
    // Only the current socket may mark the node disconnected.
    if (ws === socket) connected = false;
  });
}
213
/**
 * Emits one `audio.chunk` event carrying the given PCM bytes, base64-encoded.
 *
 * @param pcm raw PCM bytes; the duration is derived from the byte length and
 *            the configured sample rate, channel count and sample width
 */
function emitAudioChunk(pcm) {
  const bytesPerSecond = SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE;
  const durationMs = Math.round(pcm.length / bytesPerSecond * 1e3);
  const encoded = Buffer.from(pcm).toString("base64");
  emit({
    type: "audio.chunk",
    trackId: TRACK_ID,
    format: "pcm_s16le",
    sampleRate: SAMPLE_RATE,
    channels: CHANNELS,
    data: encoded,
    durationMs
  });
}
227
// Streaming state: deltas arrive fragmented, so "inside a link target" and
// "inside a fenced code block" must survive from one call to the next.
var inUrl = false;
var inCodeBlock = false;

/**
 * Removes inline markdown from text that lies outside any code fence.
 *
 * Drops emphasis/strike/backtick characters, link brackets, link targets
 * (`](...)`), and runs of `#` at the start of the text or of a line.
 * Reads and updates `inUrl`.
 */
function stripInlineMarkdown(text) {
  let result = "";
  let inHashRun = false;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    const continuesHashRun = inHashRun;
    inHashRun = false;
    if (inUrl) {
      if (ch === ")") {
        inUrl = false;
        continue;
      }
      if (ch !== "\n") continue;
      // A link target never spans lines: abandon an unterminated one here
      // (keeping the newline) instead of muting everything that follows.
      inUrl = false;
    }
    if (ch === "]" && text[i + 1] === "(") {
      inUrl = true;
      i++;
      continue;
    }
    if (ch === "[" || ch === "]") continue;
    if (ch === "*" || ch === "~" || ch === "`") continue;
    // Heading markers: drop the whole run ("##", "###"), not just the first "#".
    if (ch === "#" && (continuesHashRun || i === 0 || text[i - 1] === "\n")) {
      inHashRun = true;
      continue;
    }
    result += ch;
  }
  return result;
}

/**
 * Strips markdown from one streamed text fragment so it can be spoken.
 *
 * Fenced code blocks are omitted entirely. Every ``` in the fragment toggles
 * the fence state, so a fragment containing both an opening and a closing
 * fence leaves the state unchanged, and text before or after a fence in the
 * same fragment is still returned.
 *
 * @param text one fragment of agent output
 * @returns the speakable text; may be empty
 */
function stripMarkdown(text) {
  const segments = text.split("```");
  let result = "";
  for (let s = 0; s < segments.length; s++) {
    // Each boundary between segments is one fence marker.
    if (s > 0) inCodeBlock = !inCodeBlock;
    if (!inCodeBlock) result += stripInlineMarkdown(segments[s]);
  }
  return result;
}
254
/**
 * Sends one text fragment to the TTS socket after stripping markdown.
 *
 * When no socket is connected the fragment is dropped and a warning with its
 * first 30 characters is logged. Fragments that strip down to nothing are
 * not sent.
 *
 * @param text raw fragment of agent output
 */
function sendText(text) {
  const ready = Boolean(ws) && connected;
  if (!ready) {
    log.warn(`sendText dropped (connected=${connected}): "${text.slice(0, 30)}"`);
    return;
  }
  const speakable = stripMarkdown(text);
  if (speakable) {
    ws.send(JSON.stringify({ text: speakable }));
  }
}
263
/**
 * Signals end of input by sending an empty-text message on the connected
 * socket. Does nothing when no socket is connected; the socket is not closed.
 */
function endStream() {
  if (ws && connected) {
    log.debug("Sending EOS");
    ws.send(JSON.stringify({ text: "" }));
  }
}
268
/**
 * Drops the current connection: marks the node disconnected, discards any
 * buffered PCM, and closes the socket if there is one. Errors thrown by
 * `close()` are ignored.
 */
function closeWebSocket() {
  const socket = ws;
  ws = null;
  connected = false;
  pcmBuffer = Buffer.alloc(0);
  if (!socket) return;
  try {
    socket.close();
  } catch {
    // best effort: the socket is being discarded either way
  }
}
279
/**
 * Node entry point: connects to the TTS service, announces readiness, then
 * consumes newline-delimited JSON events from stdin until it closes.
 *
 * Handled events:
 *   - agent.delta: (re)opens the stream when needed and sends the text
 *   - agent.tool_start: ends the current segment, then closes its socket
 *   - agent.complete: signals end of text
 *   - control.interrupt: closes the socket and drops audio until the next delta
 *
 * Events are processed strictly one at a time, so a reconnect triggered by
 * one event finishes before the next event is looked at.
 */
async function main() {
  await openWebSocket();
  emit({ type: "lifecycle.ready", component: "tts-elevenlabs" });
  const rl = createInterface({ input: process.stdin });
  const eventQueue = [];
  let processing = false;
  let afterTool = false;
  // Pending delayed close scheduled by agent.tool_start, if any.
  let segmentCloseTimer = null;

  function cancelSegmentClose() {
    if (segmentCloseTimer !== null) {
      clearTimeout(segmentCloseTimer);
      segmentCloseTimer = null;
    }
  }

  async function processQueue() {
    if (processing) return;
    processing = true;
    try {
      while (eventQueue.length > 0) {
        const line = eventQueue.shift();
        try {
          await handleEvent(JSON.parse(line));
        } catch (err) {
          // Keep the node alive, but leave a trace: this covers malformed
          // input lines as well as failed reconnects.
          log.warn(`Dropped event: ${err instanceof Error ? err.message : String(err)}`);
        }
      }
    } finally {
      processing = false;
    }
  }

  async function handleEvent(event) {
    if (event.type === "agent.delta") {
      if (event.delta) {
        if (interrupted || !connected || afterTool) {
          log.info(`Opening TTS stream (interrupted=${interrupted}, connected=${connected}, afterTool=${afterTool})`);
          interrupted = false;
          afterTool = false;
          // The previous segment's socket is closed right here; its delayed
          // close must not fire later against the socket opened below.
          cancelSegmentClose();
          closeWebSocket();
          await openWebSocket();
        }
        currentRequestId = event.requestId;
        sendText(event.delta);
      }
    } else if (event.type === "agent.tool_start" && !interrupted) {
      if (connected) {
        log.info("Tool started \u2014 closing TTS stream for segment break");
        endStream();
        // Allow 500 ms for trailing audio, then close this segment's socket.
        // The identity check keeps the timer from closing a newer socket.
        const segmentSocket = ws;
        cancelSegmentClose();
        segmentCloseTimer = setTimeout(() => {
          segmentCloseTimer = null;
          if (ws === segmentSocket) closeWebSocket();
        }, 500);
        afterTool = true;
      }
    } else if (event.type === "agent.complete" && !interrupted) {
      endStream();
      currentRequestId = null;
    } else if (event.type === "control.interrupt") {
      interrupted = true;
      afterTool = false;
      cancelSegmentClose();
      closeWebSocket();
      currentRequestId = null;
    }
  }

  rl.on("line", (line) => {
    if (!line.trim()) return;
    eventQueue.push(line);
    void processQueue();
  });
  rl.on("close", () => {
    cancelSegmentClose();
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "tts-elevenlabs" });
    process.exit(0);
  });
  process.on("SIGTERM", () => {
    cancelSegmentClose();
    closeWebSocket();
    process.exit(0);
  });
}
346
// Start the node; a startup failure is logged and ends the process with code 1.
main().catch((failure) => {
  const { message } = failure;
  log.error(`Fatal: ${message}`);
  process.exit(1);
});
@@ -0,0 +1 @@
1
+ {"name":"tts-elevenlabs","description":"Text-to-speech via ElevenLabs streaming API","consumes":["agent.delta","agent.complete","agent.tool_start","control.interrupt"],"emits":["audio.chunk","lifecycle.ready","lifecycle.done","control.error"],"arguments":{"voiceId":{"type":"string","description":"ElevenLabs voice ID"},"model":{"type":"string","description":"ElevenLabs model name"},"apiKey":{"type":"string","description":"ElevenLabs API key (overrides ELEVENLABS_API_KEY env var)"}},"env":{"ELEVENLABS_API_KEY":{"required":true,"description":"ElevenLabs API key for TTS"}}}
package/package.json CHANGED
@@ -1,16 +1,20 @@
1
1
  {
2
2
  "name": "@acpfx/tts-elevenlabs",
3
- "version": "0.2.2",
3
+ "version": "0.2.4",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "acpfx-tts-elevenlabs": "./dist/index.js"
7
7
  },
8
8
  "main": "./dist/index.js",
9
+ "files": [
10
+ "dist",
11
+ "manifest.yaml"
12
+ ],
9
13
  "dependencies": {
10
14
  "@acpfx/core": "0.4.0",
11
15
  "@acpfx/node-sdk": "0.3.0"
12
16
  },
13
17
  "scripts": {
14
- "build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external"
18
+ "build": "esbuild src/index.ts --bundle --banner:js=\"#!/usr/bin/env node\" --platform=node --format=esm --outfile=dist/index.js --packages=external && node ../../scripts/copy-manifest.js"
15
19
  }
16
20
  }
package/CHANGELOG.md DELETED
@@ -1,38 +0,0 @@
1
- # @acpfx/tts-elevenlabs
2
-
3
- ## 0.2.2
4
-
5
- ### Patch Changes
6
-
7
- - Updated dependencies [0e6838e]
8
- - @acpfx/core@0.4.0
9
- - @acpfx/node-sdk@0.3.0
10
-
11
- ## 0.2.1
12
-
13
- ### Patch Changes
14
-
15
- - Updated dependencies [79c6694]
16
- - Updated dependencies [a0320a1]
17
- - @acpfx/core@0.3.0
18
- - @acpfx/node-sdk@0.2.1
19
-
20
- ## 0.2.0
21
-
22
- ### Minor Changes
23
-
24
- - d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
25
-
26
- - Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
27
- - Node manifests (manifest.yaml) declaring consumes/emits contracts
28
- - Orchestrator event filtering: nodes only receive declared events
29
- - Rust orchestrator with ratatui TUI (--ui flag)
30
- - node-sdk with structured logging helpers
31
- - CI/CD with GitHub Actions and changesets
32
- - Platform-specific npm packages for Rust binaries (esbuild-style distribution)
33
-
34
- ### Patch Changes
35
-
36
- - Updated dependencies [d757640]
37
- - @acpfx/core@0.2.0
38
- - @acpfx/node-sdk@0.2.0
package/src/index.ts DELETED
@@ -1,322 +0,0 @@
1
- /**
2
- * tts-elevenlabs node — reads agent.delta events, streams text to ElevenLabs
3
- * WebSocket TTS, emits audio.chunk events as audio arrives.
4
- *
5
- * True streaming: sends each delta token to the WebSocket as it arrives,
6
- * so audio generation starts before the full response is complete.
7
- *
8
- * Settings (via ACPFX_SETTINGS):
9
- * voiceId?: string — ElevenLabs voice ID (default: Rachel)
10
- * model?: string — TTS model (default: eleven_turbo_v2_5)
11
- * apiKey?: string — API key (falls back to ELEVENLABS_API_KEY env)
12
- */
13
-
14
- import { createInterface } from "node:readline";
15
- import { emit, log, handleManifestFlag } from "@acpfx/node-sdk";
16
-
17
// Answer --acpfx-* / --manifest flags first; when one is present the process
// exits here and nothing below runs.
handleManifestFlag();

const WS_BASE_URL = "wss://api.elevenlabs.io/v1/text-to-speech";
const DEFAULT_MODEL = "eleven_turbo_v2_5";
const DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"; // Rachel
const OUTPUT_FORMAT = "pcm_16000";
// Audio geometry used for chunking and for the emitted audio.chunk events.
const SAMPLE_RATE = 16000;
const CHANNELS = 1;
const BYTES_PER_SAMPLE = 2;
const CHUNK_DURATION_MS = 100;
// Bytes in one CHUNK_DURATION_MS slice of audio.
const CHUNK_SIZE = Math.floor(
  (SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE * CHUNK_DURATION_MS) / 1000,
);
const TRACK_ID = "tts";

type Settings = {
  voiceId?: string;
  model?: string;
  apiKey?: string;
};

/**
 * Reads node settings from the ACPFX_SETTINGS environment variable.
 *
 * Returns an empty object when the variable is unset or empty. Malformed JSON
 * is reported through the log channel and ends the process with code 1
 * (instead of an uncaught SyntaxError). A JSON value that is not a plain
 * object (null, array, number, ...) is ignored with a warning so that the
 * defaults apply.
 */
function readSettings(): Settings {
  const raw = process.env.ACPFX_SETTINGS;
  if (!raw) return {};
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err: unknown) {
    log.error(`Invalid ACPFX_SETTINGS: ${err instanceof Error ? err.message : String(err)}`);
    process.exit(1);
  }
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    log.warn("ACPFX_SETTINGS is not a JSON object; using defaults");
    return {};
  }
  return parsed as Settings;
}

const settings: Settings = readSettings();
// `||` rather than `??`: an empty-string setting counts as "not provided" and
// must not block the environment variable or the built-in default.
const API_KEY = settings.apiKey || process.env.ELEVENLABS_API_KEY || "";
const VOICE_ID = settings.voiceId || DEFAULT_VOICE_ID;
const MODEL = settings.model || DEFAULT_MODEL;

if (!API_KEY) {
  log.error("No API key. Set ELEVENLABS_API_KEY or settings.apiKey");
  process.exit(1);
}

// Connection state shared by the functions below.
let ws: WebSocket | null = null; // current WebSocket, or null when none is open
let connected = false; // true between the current socket's open and close
let interrupted = false; // set by control.interrupt; incoming audio is dropped while true
let pcmBuffer = Buffer.alloc(0); // PCM bytes received but not yet emitted as a chunk
let currentRequestId: string | null = null; // requestId of the most recent agent.delta
53
-
54
-
55
/**
 * Opens the ElevenLabs streaming TTS WebSocket, unless one is already
 * connected, and wires up its handlers.
 *
 * Resolves after the socket is open and the BOS message (voice settings and
 * generation config) has been sent. Rejects with
 * "TTS WebSocket connection failed" if the socket errors before opening.
 *
 * Every handler is bound to the socket created by this call. A socket that has
 * been replaced or closed in the meantime must not touch shared state:
 * `close()` completes asynchronously, so the old socket's `close` event can
 * arrive after a newer socket is already connected.
 */
async function openWebSocket(): Promise<void> {
  if (ws && connected) return;

  // NOTE(review): the API key is sent both as a query parameter and in the
  // BOS message below. URLs are more likely to end up in logs; confirm
  // whether the query parameter is actually required.
  const url =
    `${WS_BASE_URL}/${encodeURIComponent(VOICE_ID)}/stream-input` +
    `?model_id=${encodeURIComponent(MODEL)}` +
    `&output_format=${OUTPUT_FORMAT}` +
    `&xi_api_key=${encodeURIComponent(API_KEY)}`;

  const socket = new WebSocket(url);
  ws = socket;

  await new Promise<void>((resolve, reject) => {
    socket.addEventListener(
      "open",
      () => {
        connected = true;
        log.info("Connected to ElevenLabs TTS");
        resolve();
      },
      { once: true },
    );
    socket.addEventListener(
      "error",
      () => reject(new Error("TTS WebSocket connection failed")),
      { once: true },
    );
  });

  // Send BOS (beginning of stream) with voice settings. Uses the local
  // `socket`, since the global `ws` may have been cleared while this function
  // was awaiting the connection.
  socket.send(
    JSON.stringify({
      text: " ",
      xi_api_key: API_KEY,
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.75,
      },
      generation_config: {
        chunk_length_schedule: [50],
      },
    }),
  );

  socket.addEventListener("message", (event: MessageEvent) => {
    // Drop audio after an interrupt, and audio from a superseded socket, so
    // it cannot leak into the buffer of the current stream.
    if (interrupted || ws !== socket) return;
    try {
      const data =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(event.data as ArrayBuffer).toString("utf-8");
      const msg = JSON.parse(data);

      if (msg.audio) {
        const rawPcm = Buffer.from(msg.audio, "base64");
        pcmBuffer = Buffer.concat([pcmBuffer, rawPcm]);

        // Emit fixed-size audio chunks; any remainder waits for more audio.
        while (pcmBuffer.length >= CHUNK_SIZE) {
          const chunk = pcmBuffer.subarray(0, CHUNK_SIZE);
          pcmBuffer = pcmBuffer.subarray(CHUNK_SIZE);
          emitAudioChunk(chunk);
        }
      }

      if (msg.isFinal) {
        // Flush remaining buffer, even if shorter than a full chunk.
        if (pcmBuffer.length > 0) {
          emitAudioChunk(pcmBuffer);
          pcmBuffer = Buffer.alloc(0);
        }
      }
    } catch (err: unknown) {
      // Best effort: a malformed frame must not take the node down.
      log.debug(`Ignoring unreadable TTS message: ${err instanceof Error ? err.message : String(err)}`);
    }
  });

  socket.addEventListener("error", (event: Event) => {
    log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "tts-elevenlabs",
      message: "TTS WebSocket error",
      fatal: false,
    });
  });

  socket.addEventListener("close", (event: CloseEvent) => {
    log.info(`WebSocket closed (code=${event.code}, reason=${event.reason || "none"})`);
    // Only the current socket may mark the node disconnected.
    if (ws === socket) connected = false;
  });
}
146
-
147
/**
 * Emits one `audio.chunk` event carrying the given PCM bytes, base64-encoded.
 *
 * @param pcm raw PCM bytes; the duration is derived from the byte length and
 *            the configured sample rate, channel count and sample width
 */
function emitAudioChunk(pcm: Buffer): void {
  const bytesPerSecond = SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE;
  const durationMs = Math.round((pcm.length / bytesPerSecond) * 1000);
  const encoded = Buffer.from(pcm).toString("base64");
  emit({
    type: "audio.chunk",
    trackId: TRACK_ID,
    format: "pcm_s16le",
    sampleRate: SAMPLE_RATE,
    channels: CHANNELS,
    data: encoded,
    durationMs,
  });
}
161
-
162
// Streaming state: tokens arrive fragmented, so "inside a link target" and
// "inside a fenced code block" must survive from one call to the next.
let inUrl = false;
let inCodeBlock = false;

/**
 * Removes inline markdown from text that lies outside any code fence.
 *
 * Drops emphasis/strike/backtick characters, link brackets, link targets
 * (`](...)`), and runs of `#` at the start of the text or of a line.
 * Reads and updates `inUrl`.
 */
function stripInlineMarkdown(text: string): string {
  let result = "";
  let inHashRun = false;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    const continuesHashRun = inHashRun;
    inHashRun = false;
    if (inUrl) {
      if (ch === ")") {
        inUrl = false;
        continue;
      }
      if (ch !== "\n") continue;
      // A link target never spans lines: abandon an unterminated one here
      // (keeping the newline) instead of muting everything that follows.
      inUrl = false;
    }
    if (ch === "]" && text[i + 1] === "(") {
      inUrl = true;
      i++;
      continue;
    }
    if (ch === "[" || ch === "]") continue;
    if (ch === "*" || ch === "~" || ch === "`") continue;
    // Heading markers: drop the whole run ("##", "###"), not just the first "#".
    if (ch === "#" && (continuesHashRun || i === 0 || text[i - 1] === "\n")) {
      inHashRun = true;
      continue;
    }
    result += ch;
  }
  return result;
}

/**
 * Strip markdown from one streamed token so it can be spoken.
 *
 * Fenced code blocks are omitted entirely. Every ``` in the token toggles the
 * fence state, so a token containing both an opening and a closing fence
 * leaves the state unchanged, and text before or after a fence in the same
 * token is still returned.
 *
 * @param text one fragment of agent output
 * @returns the speakable text; may be empty
 */
function stripMarkdown(text: string): string {
  const segments = text.split("```");
  let result = "";
  for (let s = 0; s < segments.length; s++) {
    // Each boundary between segments is one fence marker.
    if (s > 0) inCodeBlock = !inCodeBlock;
    if (!inCodeBlock) result += stripInlineMarkdown(segments[s]);
  }
  return result;
}
196
-
197
/**
 * Sends one text fragment to the TTS socket after stripping markdown.
 *
 * When no socket is connected the fragment is dropped and a warning with its
 * first 30 characters is logged. Fragments that strip down to nothing are
 * not sent.
 */
function sendText(text: string): void {
  if (ws === null || !connected) {
    log.warn(`sendText dropped (connected=${connected}): "${text.slice(0, 30)}"`);
    return;
  }
  const speakable = stripMarkdown(text);
  if (speakable) {
    ws.send(JSON.stringify({ text: speakable }));
  }
}
206
-
207
/**
 * Signals EOS (end of stream) by sending an empty-text message on the
 * connected socket. Does nothing when no socket is connected.
 *
 * The WebSocket is deliberately left open so that ElevenLabs can close it
 * after isFinal.
 */
function endStream(): void {
  if (ws !== null && connected) {
    log.debug("Sending EOS");
    ws.send(JSON.stringify({ text: "" }));
  }
}
214
-
215
/**
 * Drops the current connection: marks the node disconnected, discards any
 * buffered PCM, and closes the socket if there is one. Errors thrown by
 * `close()` are ignored.
 */
function closeWebSocket(): void {
  const socket = ws;
  ws = null;
  connected = false;
  pcmBuffer = Buffer.alloc(0);
  if (socket === null) return;
  try {
    socket.close();
  } catch {
    // best effort: the socket is being discarded either way
  }
}
227
-
228
- // --- Main ---
229
-
230
/**
 * Node entry point: connects to the TTS service, announces readiness, then
 * consumes newline-delimited JSON events from stdin until it closes.
 *
 * Handled events:
 *   - agent.delta: (re)opens the stream when needed and sends the text
 *   - agent.tool_start: ends the current segment, then closes its socket
 *   - agent.complete: signals end of text
 *   - control.interrupt: closes the socket and drops audio until the next delta
 */
async function main(): Promise<void> {
  await openWebSocket();

  // Emit lifecycle.ready after WS connected
  emit({ type: "lifecycle.ready", component: "tts-elevenlabs" });

  const rl = createInterface({ input: process.stdin });

  // Queue events and process sequentially to avoid async races
  const eventQueue: string[] = [];
  let processing = false;
  let afterTool = false;
  // Pending delayed close scheduled by agent.tool_start, if any.
  let segmentCloseTimer: ReturnType<typeof setTimeout> | null = null;

  function cancelSegmentClose(): void {
    if (segmentCloseTimer !== null) {
      clearTimeout(segmentCloseTimer);
      segmentCloseTimer = null;
    }
  }

  async function processQueue(): Promise<void> {
    if (processing) return;
    processing = true;
    try {
      while (eventQueue.length > 0) {
        const line = eventQueue.shift()!;
        try {
          await handleEvent(JSON.parse(line));
        } catch (err: unknown) {
          // Keep the node alive, but leave a trace: this covers malformed
          // input lines as well as failed reconnects.
          log.warn(`Dropped event: ${err instanceof Error ? err.message : String(err)}`);
        }
      }
    } finally {
      processing = false;
    }
  }

  async function handleEvent(event: Record<string, unknown>): Promise<void> {
    if (event.type === "agent.delta") {
      if (event.delta) {
        // Reconnect if WebSocket is down, we were interrupted, or we're
        // starting a new segment after a tool call.
        if (interrupted || !connected || afterTool) {
          log.info(`Opening TTS stream (interrupted=${interrupted}, connected=${connected}, afterTool=${afterTool})`);
          interrupted = false;
          afterTool = false;
          // The previous segment's socket is closed right here; its delayed
          // close must not fire later against the socket opened below.
          cancelSegmentClose();
          closeWebSocket();
          await openWebSocket();
        }
        currentRequestId = event.requestId as string;
        sendText(event.delta as string);
      }
    } else if (event.type === "agent.tool_start" && !interrupted) {
      // Tool call started — close the WebSocket to force ElevenLabs to
      // finalize audio for the text sent so far. EOS alone may not work
      // if the text was mid-sentence.
      if (connected) {
        log.info("Tool started — closing TTS stream for segment break");
        endStream();
        // Give ElevenLabs a moment to send final audio, then force close.
        // The identity check keeps the timer from closing a newer socket.
        const segmentSocket = ws;
        cancelSegmentClose();
        segmentCloseTimer = setTimeout(() => {
          segmentCloseTimer = null;
          if (ws === segmentSocket) closeWebSocket();
        }, 500);
        afterTool = true;
      }
    } else if (event.type === "agent.complete" && !interrupted) {
      // Agent is done — signal end of text stream so TTS can finalize
      endStream();
      currentRequestId = null;
    } else if (event.type === "control.interrupt") {
      interrupted = true;
      afterTool = false;
      cancelSegmentClose();
      closeWebSocket();
      currentRequestId = null;
    }
  }

  rl.on("line", (line) => {
    if (!line.trim()) return;
    eventQueue.push(line);
    void processQueue();
  });

  rl.on("close", () => {
    cancelSegmentClose();
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "tts-elevenlabs" });
    process.exit(0);
  });

  process.on("SIGTERM", () => {
    cancelSegmentClose();
    closeWebSocket();
    process.exit(0);
  });
}
318
-
319
// Start the node; a startup failure is logged and ends the process with code 1.
main().catch((failure) => {
  const { message } = failure;
  log.error(`Fatal: ${message}`);
  process.exit(1);
});