@acpfx/stt-elevenlabs 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,21 @@
1
+ # @acpfx/stt-elevenlabs
2
+
3
+ ## 0.2.0
4
+
5
+ ### Minor Changes
6
+
7
+ - d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
8
+
9
+ - Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
10
+ - Node manifests (manifest.yaml) declaring consumes/emits contracts
11
+ - Orchestrator event filtering: nodes only receive declared events
12
+ - Rust orchestrator with ratatui TUI (--ui flag)
13
+ - node-sdk with structured logging helpers
14
+ - CI/CD with GitHub Actions and changesets
15
+ - Platform-specific npm packages for Rust binaries (esbuild-style distribution)
16
+
17
+ ### Patch Changes
18
+
19
+ - Updated dependencies [d757640]
20
+ - @acpfx/core@0.2.0
21
+ - @acpfx/node-sdk@0.2.0
package/manifest.yaml ADDED
@@ -0,0 +1,12 @@
1
+ name: stt-elevenlabs
2
+ description: Speech-to-text via ElevenLabs streaming API
3
+ consumes:
4
+ - audio.chunk
5
+ emits:
6
+ - speech.partial
7
+ - speech.delta
8
+ - speech.final
9
+ - speech.pause
10
+ - lifecycle.ready
11
+ - lifecycle.done
12
+ - control.error
package/package.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "@acpfx/stt-elevenlabs",
3
+ "version": "0.2.0",
4
+ "type": "module",
5
+ "bin": {
6
+ "acpfx-stt-elevenlabs": "./dist/index.js"
7
+ },
8
+ "main": "./dist/index.js",
9
+ "dependencies": {
10
+ "@acpfx/core": "0.2.0",
11
+ "@acpfx/node-sdk": "0.2.0"
12
+ },
13
+ "scripts": {
14
+ "build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external --banner:js='#!/usr/bin/env node'"
15
+ }
16
+ }
package/src/index.ts ADDED
@@ -0,0 +1,273 @@
1
/**
 * stt-elevenlabs node — ElevenLabs Scribe v2 Realtime STT with built-in VAD.
 *
 * Reads audio.chunk events from stdin, streams to ElevenLabs WebSocket,
 * emits speech.partial, speech.delta, speech.final, and speech.pause events.
 *
 * Uses commit_strategy=vad so ElevenLabs handles pause detection server-side.
 *
 * Settings (via ACPFX_SETTINGS):
 *   language?: string — language code (default: "en")
 *   apiKey?: string — ElevenLabs API key (falls back to ELEVENLABS_API_KEY env)
 *   pauseMs?: number — VAD silence threshold hint (default: 600)
 *   vadThreshold?: number — 0-1, higher = less sensitive (default: 0.5)
 *   minSpeechDurationMs?: number — ignore shorter noise bursts (default: 250)
 *   minSilenceDurationMs?: number — minimum silence duration (default: 100)
 */

import { emit, log, onEvent, handleManifestFlag } from "@acpfx/node-sdk";

// Presumably prints manifest.yaml and exits when the manifest flag is passed;
// implementation lives in node-sdk — NOTE(review): confirm flag name/behavior.
handleManifestFlag();

const WS_URL = "wss://api.elevenlabs.io/v1/speech-to-text/realtime";
const MODEL = "scribe_v2_realtime";

// Shape of the ACPFX_SETTINGS JSON blob; every field is optional.
type Settings = {
  language?: string;
  apiKey?: string;
  pauseMs?: number;
  vadThreshold?: number; // 0-1, default 0.5 (higher = less sensitive)
  minSpeechDurationMs?: number; // default 250 (ignore short noise bursts)
  minSilenceDurationMs?: number; // default 100
};

// NOTE(review): JSON.parse output is trusted without validation; a malformed
// ACPFX_SETTINGS value throws here and kills the process at startup.
const settings: Settings = JSON.parse(process.env.ACPFX_SETTINGS || "{}");
const LANGUAGE = settings.language ?? "en";
const API_KEY = settings.apiKey ?? process.env.ELEVENLABS_API_KEY ?? "";
const TRACK_ID = "stt"; // single fixed track id for all emitted speech events

// Fail fast: nothing below works without credentials.
if (!API_KEY) {
  log.error("No API key. Set ELEVENLABS_API_KEY or settings.apiKey");
  process.exit(1);
}

// --- Mutable connection/transcription state (single event-loop thread) ---
let ws: WebSocket | null = null; // live socket, or null after teardown
let connected = false; // true between the "open" and "close" events
let reconnecting = false; // a reconnect attempt is currently in flight
let interrupted = false; // while set, incoming transcripts are dropped
let lastPartialText = ""; // previous partial, used to detect corrections
let accumulatedText = ""; // committed finals joined since the last pause
let partialStaleTimer: ReturnType<typeof setTimeout> | null = null;
const PARTIAL_STALE_MS = 3000; // force a commit if a partial stalls this long
51
/**
 * Open the ElevenLabs realtime STT WebSocket and install all handlers.
 *
 * Resolves once the socket's "open" event fires; rejects if the first
 * "error" event arrives before that. After connecting, installs persistent
 * message/error/close handlers. Reconnection itself is NOT done here — the
 * "close" handler only flips `connected`, and main() reconnects lazily on
 * the next audio chunk.
 */
async function connectWebSocket(): Promise<void> {
  // VAD tuning goes in the query string; the API takes the silence threshold
  // in seconds while settings.pauseMs is milliseconds.
  const vadSilenceSecs = (settings.pauseMs ?? 600) / 1000;
  const vadThreshold = settings.vadThreshold ?? 0.5;
  const minSpeechMs = settings.minSpeechDurationMs ?? 250;
  const minSilenceMs = settings.minSilenceDurationMs ?? 100;
  const url =
    `${WS_URL}?model_id=${MODEL}` +
    `&language_code=${encodeURIComponent(LANGUAGE)}` +
    `&sample_rate=16000` +
    `&encoding=pcm_s16le` +
    `&commit_strategy=vad` +
    `&vad_silence_threshold_secs=${vadSilenceSecs}` +
    `&vad_threshold=${vadThreshold}` +
    `&min_speech_duration_ms=${minSpeechMs}` +
    `&min_silence_duration_ms=${minSilenceMs}`;

  // NOTE(review): the standard WebSocket constructor signature is
  // (url, protocols); the double cast smuggles a { headers } options object
  // past the type checker. Runtimes that accept an options bag (e.g. Bun)
  // honor the header; Node's built-in (undici) WebSocket does not — confirm
  // the intended runtime before changing this.
  ws = new WebSocket(url, {
    headers: { "xi-api-key": API_KEY },
  } as unknown as string[]);

  // Settle on the first "open" or first "error"; both listeners are
  // { once: true } so a later event cannot re-settle the promise.
  await new Promise<void>((resolve, reject) => {
    ws!.addEventListener(
      "open",
      () => {
        connected = true;
        log.info("Connected to ElevenLabs STT");
        resolve();
      },
      { once: true },
    );

    ws!.addEventListener(
      "error",
      () => {
        reject(new Error("WebSocket connection failed"));
      },
      { once: true },
    );
  });

  // Server frames may be text or binary; both are decoded as UTF-8 JSON.
  // Unparseable frames are dropped on purpose.
  ws.addEventListener("message", (event: MessageEvent) => {
    try {
      const data =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(event.data as ArrayBuffer).toString("utf-8");
      const msg = JSON.parse(data);
      handleServerMessage(msg);
    } catch {
      // ignore parse errors
    }
  });

  // Post-connect socket errors are reported as non-fatal control.error events.
  ws.addEventListener("error", (event: Event) => {
    log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "stt-elevenlabs",
      message: "WebSocket error",
      fatal: false,
    });
  });

  // Lazy reconnect: just mark disconnected; main() reconnects on next audio.
  ws.addEventListener("close", () => {
    connected = false;
    log.info("WebSocket closed — will reconnect on next audio");
  });
}
119
+
120
+ function handleServerMessage(msg: Record<string, unknown>): void {
121
+ const msgType = msg.message_type as string;
122
+
123
+ if (interrupted) return;
124
+
125
+ if (msgType === "partial_transcript") {
126
+ const text = (msg.text as string) ?? "";
127
+ if (!text) return;
128
+
129
+ // Check if this is a correction of a previous partial
130
+ if (lastPartialText && text !== lastPartialText && !text.startsWith(lastPartialText)) {
131
+ // This is a correction — emit speech.delta with replaces
132
+ emit({
133
+ type: "speech.delta",
134
+ trackId: TRACK_ID,
135
+ text,
136
+ replaces: lastPartialText,
137
+ });
138
+ } else {
139
+ emit({
140
+ type: "speech.partial",
141
+ trackId: TRACK_ID,
142
+ text,
143
+ });
144
+ }
145
+ lastPartialText = text;
146
+
147
+ // If partial never gets committed, force a commit after timeout.
148
+ // Continuous audio stream means the API may never see "end of speech."
149
+ if (partialStaleTimer) clearTimeout(partialStaleTimer);
150
+ partialStaleTimer = setTimeout(() => {
151
+ if (lastPartialText && !interrupted && ws && connected) {
152
+ log.info(`Stale partial: forcing commit`);
153
+ ws.send(JSON.stringify({
154
+ message_type: "input_audio_chunk",
155
+ audio_base_64: "",
156
+ commit: true,
157
+ sample_rate: 16000,
158
+ }));
159
+ }
160
+ partialStaleTimer = null;
161
+ }, PARTIAL_STALE_MS);
162
+ } else if (
163
+ msgType === "committed_transcript" ||
164
+ msgType === "committed_transcript_with_timestamps"
165
+ ) {
166
+ const text = (msg.text as string) ?? "";
167
+ if (!text) return;
168
+ // Clear stale timer — proper commit arrived
169
+ if (partialStaleTimer) { clearTimeout(partialStaleTimer); partialStaleTimer = null; }
170
+ lastPartialText = "";
171
+
172
+ // Emit speech.final
173
+ emit({
174
+ type: "speech.final",
175
+ trackId: TRACK_ID,
176
+ text,
177
+ });
178
+
179
+ accumulatedText = accumulatedText ? `${accumulatedText} ${text}` : text;
180
+
181
+ // When using VAD commit_strategy, a committed_transcript means
182
+ // ElevenLabs detected a pause. Emit speech.pause.
183
+ emit({
184
+ type: "speech.pause",
185
+ trackId: TRACK_ID,
186
+ pendingText: accumulatedText,
187
+ silenceMs: settings.pauseMs ?? 600,
188
+ });
189
+
190
+ // Reset for next utterance
191
+ lastPartialText = "";
192
+ accumulatedText = "";
193
+ } else if (msgType === "auth_error" || msgType === "error") {
194
+ const errMsg =
195
+ (msg.message as string) ?? (msg.error as string) ?? msgType;
196
+ log.error(`Server error: ${errMsg}`);
197
+ emit({
198
+ type: "control.error",
199
+ component: "stt-elevenlabs",
200
+ message: errMsg,
201
+ fatal: msgType === "auth_error",
202
+ });
203
+ }
204
+ }
205
+
206
+ function sendAudio(base64Data: string): void {
207
+ if (!ws || !connected) return;
208
+ ws.send(
209
+ JSON.stringify({
210
+ message_type: "input_audio_chunk",
211
+ audio_base_64: base64Data,
212
+ commit: false,
213
+ sample_rate: 16000,
214
+ }),
215
+ );
216
+ }
217
+
218
+ function closeWebSocket(): void {
219
+ connected = false;
220
+ if (ws) {
221
+ try {
222
+ ws.close();
223
+ } catch {
224
+ // ignore
225
+ }
226
+ ws = null;
227
+ }
228
+ }
229
+
230
+ // --- Main ---
231
+
232
+ async function main(): Promise<void> {
233
+ await connectWebSocket();
234
+
235
+ // Emit lifecycle.ready after WS is connected
236
+ emit({ type: "lifecycle.ready", component: "stt-elevenlabs" });
237
+
238
+ const rl = onEvent((event) => {
239
+ if (event.type === "audio.chunk") {
240
+ if (!connected && !reconnecting) {
241
+ reconnecting = true;
242
+ interrupted = false;
243
+ log.info("Reconnecting...");
244
+ connectWebSocket().then(() => {
245
+ reconnecting = false;
246
+ sendAudio(event.data as string);
247
+ }).catch(() => {
248
+ reconnecting = false;
249
+ });
250
+ } else if (connected && !interrupted) {
251
+ sendAudio(event.data as string);
252
+ }
253
+ } else if (event.type === "control.interrupt") {
254
+ // Don't close WebSocket — STT should keep listening for barge-in.
255
+ }
256
+ });
257
+
258
+ rl.on("close", () => {
259
+ closeWebSocket();
260
+ emit({ type: "lifecycle.done", component: "stt-elevenlabs" });
261
+ process.exit(0);
262
+ });
263
+
264
+ process.on("SIGTERM", () => {
265
+ closeWebSocket();
266
+ process.exit(0);
267
+ });
268
+ }
269
+
270
+ main().catch((err) => {
271
+ log.error(`Fatal: ${err.message}`);
272
+ process.exit(1);
273
+ });