@acpfx/stt-elevenlabs 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +46 -0
- package/dist/index.js +319 -0
- package/dist/manifest.json +1 -0
- package/package.json +6 -2
- package/CHANGELOG.md +0 -38
- package/src/index.ts +0 -273
package/LICENSE
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
ISC License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 acpfx contributors
|
|
4
|
+
|
|
5
|
+
Permission to use, copy, modify, and/or distribute this software for any
|
|
6
|
+
purpose with or without fee is hereby granted, provided that the above
|
|
7
|
+
copyright notice and this permission notice appear in all copies.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
|
10
|
+
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
11
|
+
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
|
12
|
+
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
|
13
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
|
14
|
+
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
15
|
+
PERFORMANCE OF THIS SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# @acpfx/stt-elevenlabs
|
|
2
|
+
|
|
3
|
+
Speech-to-text via ElevenLabs streaming API. Streams partial and delta transcriptions in real time.
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
|
|
7
|
+
This package is a pipeline node for [@acpfx/cli](../orchestrator/README.md). See the CLI package for installation and usage.
|
|
8
|
+
|
|
9
|
+
Requires an `ELEVENLABS_API_KEY` environment variable.
|
|
10
|
+
|
|
11
|
+
## Manifest
|
|
12
|
+
|
|
13
|
+
- **Consumes:** `audio.chunk`
|
|
14
|
+
- **Emits:** `speech.partial`, `speech.delta`, `speech.final`, `speech.pause`, `lifecycle.ready`, `lifecycle.done`, `control.error`
|
|
15
|
+
|
|
16
|
+
## Settings
|
|
17
|
+
|
|
18
|
+
| Name | Type | Default | Description |
|
|
19
|
+
|------|------|---------|-------------|
|
|
20
|
+
| `language` | string | `en` | Language code |
|
|
21
|
+
| `apiKey` | string | | Overrides `ELEVENLABS_API_KEY` env var |
|
|
22
|
+
| `pauseMs` | number | `600` | Pause duration threshold in ms |
|
|
23
|
+
| `vadThreshold` | number | `0.5` | VAD threshold 0-1 (higher = less sensitive) |
|
|
24
|
+
| `minSpeechDurationMs` | number | `250` | Minimum speech duration in ms |
|
|
25
|
+
| `minSilenceDurationMs` | number | `100` | Minimum silence duration in ms |
|
|
26
|
+
|
|
27
|
+
## Pipeline Example
|
|
28
|
+
|
|
29
|
+
```yaml
|
|
30
|
+
nodes:
|
|
31
|
+
stt:
|
|
32
|
+
use: "@acpfx/stt-elevenlabs"
|
|
33
|
+
settings: { language: en }
|
|
34
|
+
outputs: [bridge]
|
|
35
|
+
env:
|
|
36
|
+
ELEVENLABS_API_KEY: ${ELEVENLABS_API_KEY}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## External Links
|
|
40
|
+
|
|
41
|
+
- [ElevenLabs](https://elevenlabs.io) -- AI voice platform
|
|
42
|
+
- [ElevenLabs API Docs](https://elevenlabs.io/docs/api-reference) -- API reference
|
|
43
|
+
|
|
44
|
+
## License
|
|
45
|
+
|
|
46
|
+
ISC
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// ../node-sdk/src/index.ts
|
|
4
|
+
import { createInterface } from "node:readline";
|
|
5
|
+
|
|
6
|
+
// ../core/src/config.ts
|
|
7
|
+
import { parse as parseYaml } from "yaml";
|
|
8
|
+
|
|
9
|
+
// ../core/src/manifest.ts
|
|
10
|
+
import { readFileSync } from "node:fs";
|
|
11
|
+
import { join, dirname } from "node:path";
|
|
12
|
+
import { z as z2 } from "zod";
|
|
13
|
+
|
|
14
|
+
// ../core/src/acpfx-flags.ts
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
/**
 * Shape of the JSON reply to `--acpfx-setup-check`: a required `needed`
 * boolean and an optional free-text `description`.
 */
var SetupCheckResponseSchema = z.object({
  needed: z.boolean(),
  description: z.string().optional()
});

/**
 * Setup progress message, discriminated on `type`:
 *   - "progress": `message` plus an optional numeric `pct`
 *   - "complete" / "error": `message` only
 */
var SetupProgressSchema = z.discriminatedUnion("type", [
  z.object({
    type: z.literal("progress"),
    message: z.string(),
    pct: z.number().optional()
  }),
  // The two terminal variants share one shape and differ only in their tag.
  ...["complete", "error"].map(
    (kind) => z.object({ type: z.literal(kind), message: z.string() })
  )
]);

/**
 * Shape of the JSON reply for an `--acpfx-*` flag this node does not
 * implement: the `unsupported` marker and the offending `flag`.
 */
var UnsupportedFlagResponseSchema = z.object({
  unsupported: z.boolean(),
  flag: z.string()
});
|
|
33
|
+
|
|
34
|
+
// ../core/src/manifest.ts
|
|
35
|
+
// Small factories so every field still gets its own schema instance.
var optionalText = () => z2.string().optional();
var optionalFlag = () => z2.boolean().optional();
var optionalRecordOf = (valueSchema) => z2.record(z2.string(), valueSchema).optional();

/** Value types a manifest argument may declare. */
var ArgumentTypeSchema = z2.enum(["string", "number", "boolean"]);

/** One entry of a manifest's `arguments` map. */
var ManifestArgumentSchema = z2.object({
  type: ArgumentTypeSchema,
  default: z2.unknown().optional(),
  description: optionalText(),
  required: optionalFlag(),
  enum: z2.array(z2.unknown()).optional()
});

/** One entry of a manifest's `env` map. */
var ManifestEnvFieldSchema = z2.object({
  required: optionalFlag(),
  description: optionalText()
});

/**
 * Node manifest: the node's name, the event types it consumes and emits,
 * and optional argument / environment-variable declarations.
 */
var NodeManifestSchema = z2.object({
  name: z2.string(),
  description: optionalText(),
  consumes: z2.array(z2.string()),
  emits: z2.array(z2.string()),
  arguments: optionalRecordOf(ManifestArgumentSchema),
  additional_arguments: optionalFlag(),
  env: optionalRecordOf(ManifestEnvFieldSchema)
});
|
|
56
|
+
function handleAcpfxFlags(manifestPath) {
|
|
57
|
+
const acpfxFlag = process.argv.find((a) => a.startsWith("--acpfx-"));
|
|
58
|
+
const legacyManifest = process.argv.includes("--manifest");
|
|
59
|
+
if (!acpfxFlag && !legacyManifest) return;
|
|
60
|
+
const flag = acpfxFlag ?? "--acpfx-manifest";
|
|
61
|
+
switch (flag) {
|
|
62
|
+
case "--acpfx-manifest":
|
|
63
|
+
printManifest(manifestPath);
|
|
64
|
+
break;
|
|
65
|
+
case "--acpfx-setup-check":
|
|
66
|
+
process.stdout.write(JSON.stringify({ needed: false }) + "\n");
|
|
67
|
+
process.exit(0);
|
|
68
|
+
break;
|
|
69
|
+
default:
|
|
70
|
+
process.stdout.write(
|
|
71
|
+
JSON.stringify({ unsupported: true, flag }) + "\n"
|
|
72
|
+
);
|
|
73
|
+
process.exit(0);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
function handleManifestFlag(manifestPath) {
|
|
77
|
+
handleAcpfxFlags(manifestPath);
|
|
78
|
+
}
|
|
79
|
+
function printManifest(manifestPath) {
|
|
80
|
+
if (!manifestPath) {
|
|
81
|
+
const script = process.argv[1];
|
|
82
|
+
const scriptDir = dirname(script);
|
|
83
|
+
const scriptBase = script.replace(/\.[^.]+$/, "");
|
|
84
|
+
const colocated = `${scriptBase}.manifest.json`;
|
|
85
|
+
try {
|
|
86
|
+
readFileSync(colocated);
|
|
87
|
+
manifestPath = colocated;
|
|
88
|
+
} catch {
|
|
89
|
+
manifestPath = join(scriptDir, "manifest.json");
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
try {
|
|
93
|
+
const content = readFileSync(manifestPath, "utf8");
|
|
94
|
+
process.stdout.write(content.trim() + "\n");
|
|
95
|
+
process.exit(0);
|
|
96
|
+
} catch (err) {
|
|
97
|
+
process.stderr.write(`Failed to read manifest: ${err}
|
|
98
|
+
`);
|
|
99
|
+
process.exit(1);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// ../node-sdk/src/index.ts
|
|
104
|
+
var NODE_NAME = process.env.ACPFX_NODE_NAME ?? "unknown";
|
|
105
|
+
function emit(event) {
|
|
106
|
+
process.stdout.write(JSON.stringify(event) + "\n");
|
|
107
|
+
}
|
|
108
|
+
function log(level, message) {
|
|
109
|
+
emit({ type: "log", level, component: NODE_NAME, message });
|
|
110
|
+
}
|
|
111
|
+
log.info = (message) => log("info", message);
|
|
112
|
+
log.warn = (message) => log("warn", message);
|
|
113
|
+
log.error = (message) => log("error", message);
|
|
114
|
+
log.debug = (message) => log("debug", message);
|
|
115
|
+
function onEvent(handler) {
|
|
116
|
+
const rl = createInterface({ input: process.stdin });
|
|
117
|
+
rl.on("line", (line) => {
|
|
118
|
+
if (!line.trim()) return;
|
|
119
|
+
try {
|
|
120
|
+
const event = JSON.parse(line);
|
|
121
|
+
handler(event);
|
|
122
|
+
} catch {
|
|
123
|
+
}
|
|
124
|
+
});
|
|
125
|
+
return rl;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// src/index.ts
|
|
129
|
+
// Answer --manifest / --acpfx-* flags first: they must work even when no API
// key or settings are configured.
handleManifestFlag();

var WS_URL = "wss://api.elevenlabs.io/v1/speech-to-text/realtime";
var MODEL = "scribe_v2_realtime";

/**
 * Parses the ACPFX_SETTINGS environment value into a settings object.
 *
 * An unset or empty value yields `{}`. Anything that is not a JSON object
 * (malformed JSON, `null`, a bare number or string) is reported through the
 * structured log and terminates the process with exit code 1, instead of
 * surfacing later as a raw SyntaxError/TypeError stack trace.
 *
 * @param raw Raw value of ACPFX_SETTINGS (may be undefined).
 * @returns The parsed settings object.
 */
function readSettings(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw || "{}");
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    log.error(`Invalid ACPFX_SETTINGS (not valid JSON): ${reason}`);
    process.exit(1);
  }
  if (parsed === null || typeof parsed !== "object") {
    log.error("Invalid ACPFX_SETTINGS: expected a JSON object");
    process.exit(1);
  }
  return parsed;
}

var settings = readSettings(process.env.ACPFX_SETTINGS);
var LANGUAGE = settings.language ?? "en";
// settings.apiKey takes precedence over the ELEVENLABS_API_KEY env var.
var API_KEY = settings.apiKey ?? process.env.ELEVENLABS_API_KEY ?? "";
var TRACK_ID = "stt";

if (!API_KEY) {
  log.error("No API key. Set ELEVENLABS_API_KEY or settings.apiKey");
  process.exit(1);
}

// --- Connection and transcript state shared by the functions below ---
var ws = null;                // current WebSocket, or null when closed
var connected = false;        // true between the socket's "open" and "close"
var reconnecting = false;     // true while a reconnect attempt is in flight
var interrupted = false;      // while true, server messages and audio are ignored
var lastPartialText = "";     // most recent partial transcript of the open segment
var accumulatedText = "";     // committed text reported in the next speech.pause
var partialStaleTimer = null; // timer that forces a commit for a stuck partial
var PARTIAL_STALE_MS = 3e3;   // how long a partial may sit uncommitted
|
|
148
|
+
/**
 * Opens the ElevenLabs realtime speech-to-text WebSocket and wires up its
 * message / error / close handling.
 *
 * The connection is configured through query parameters: model, language,
 * 16 kHz `pcm_s16le` audio, and `commit_strategy=vad` with thresholds taken
 * from `settings` (defaults: pauseMs 600, vadThreshold 0.5,
 * minSpeechDurationMs 250, minSilenceDurationMs 100).
 *
 * Resolves once the socket is open (`connected` is then true). Rejects with
 * "WebSocket connection failed" if the socket reports an error OR closes
 * before it ever opened, so callers can never be left awaiting a handshake
 * that will not complete.
 */
async function connectWebSocket() {
  // URLSearchParams percent-encodes every value, so an unexpected setting
  // value cannot corrupt or inject query parameters.
  const query = new URLSearchParams({
    model_id: MODEL,
    language_code: LANGUAGE,
    sample_rate: "16000",
    encoding: "pcm_s16le",
    commit_strategy: "vad",
    vad_silence_threshold_secs: String((settings.pauseMs ?? 600) / 1e3),
    vad_threshold: String(settings.vadThreshold ?? 0.5),
    min_speech_duration_ms: String(settings.minSpeechDurationMs ?? 250),
    min_silence_duration_ms: String(settings.minSilenceDurationMs ?? 100)
  });

  const socket = new WebSocket(`${WS_URL}?${query.toString()}`, {
    headers: { "xi-api-key": API_KEY }
  });
  ws = socket;

  await new Promise((resolve, reject) => {
    const detach = () => {
      socket.removeEventListener("open", onOpen);
      socket.removeEventListener("error", onFailure);
      socket.removeEventListener("close", onFailure);
    };
    const onOpen = () => {
      detach();
      connected = true;
      log.info("Connected to ElevenLabs STT");
      resolve();
    };
    const onFailure = () => {
      detach();
      reject(new Error("WebSocket connection failed"));
    };
    socket.addEventListener("open", onOpen);
    socket.addEventListener("error", onFailure);
    // A close without a preceding error must also fail the handshake.
    socket.addEventListener("close", onFailure);
  });

  socket.addEventListener("message", (event) => {
    let msg;
    try {
      const data = typeof event.data === "string" ? event.data : Buffer.from(event.data).toString("utf-8");
      msg = JSON.parse(data);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      log.debug(`Ignoring unparseable server frame: ${reason}`);
      return;
    }
    try {
      handleServerMessage(msg);
    } catch (err) {
      // Keep the socket alive, but do not hide the failure.
      const reason = err instanceof Error ? err.message : String(err);
      log.error(`Failed to handle server message: ${reason}`);
    }
  });

  socket.addEventListener("error", (event) => {
    log.error(`WebSocket error: ${event.message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "stt-elevenlabs",
      message: "WebSocket error",
      fatal: false
    });
  });

  socket.addEventListener("close", () => {
    connected = false;
    // A pending partial can never be committed once its socket is gone. Drop
    // it (and its forced-commit timer) so it is neither sent to, nor compared
    // against, transcripts of the next connection.
    if (partialStaleTimer) {
      clearTimeout(partialStaleTimer);
      partialStaleTimer = null;
    }
    lastPartialText = "";
    accumulatedText = "";
    log.info("WebSocket closed \u2014 will reconnect on next audio");
  });
}
|
|
197
|
+
function handleServerMessage(msg) {
|
|
198
|
+
const msgType = msg.message_type;
|
|
199
|
+
if (interrupted) return;
|
|
200
|
+
if (msgType === "partial_transcript") {
|
|
201
|
+
const text = msg.text ?? "";
|
|
202
|
+
if (!text) return;
|
|
203
|
+
if (lastPartialText && text !== lastPartialText && !text.startsWith(lastPartialText)) {
|
|
204
|
+
emit({
|
|
205
|
+
type: "speech.delta",
|
|
206
|
+
trackId: TRACK_ID,
|
|
207
|
+
text,
|
|
208
|
+
replaces: lastPartialText
|
|
209
|
+
});
|
|
210
|
+
} else {
|
|
211
|
+
emit({
|
|
212
|
+
type: "speech.partial",
|
|
213
|
+
trackId: TRACK_ID,
|
|
214
|
+
text
|
|
215
|
+
});
|
|
216
|
+
}
|
|
217
|
+
lastPartialText = text;
|
|
218
|
+
if (partialStaleTimer) clearTimeout(partialStaleTimer);
|
|
219
|
+
partialStaleTimer = setTimeout(() => {
|
|
220
|
+
if (lastPartialText && !interrupted && ws && connected) {
|
|
221
|
+
log.info(`Stale partial: forcing commit`);
|
|
222
|
+
ws.send(JSON.stringify({
|
|
223
|
+
message_type: "input_audio_chunk",
|
|
224
|
+
audio_base_64: "",
|
|
225
|
+
commit: true,
|
|
226
|
+
sample_rate: 16e3
|
|
227
|
+
}));
|
|
228
|
+
}
|
|
229
|
+
partialStaleTimer = null;
|
|
230
|
+
}, PARTIAL_STALE_MS);
|
|
231
|
+
} else if (msgType === "committed_transcript" || msgType === "committed_transcript_with_timestamps") {
|
|
232
|
+
const text = msg.text ?? "";
|
|
233
|
+
if (!text) return;
|
|
234
|
+
if (partialStaleTimer) {
|
|
235
|
+
clearTimeout(partialStaleTimer);
|
|
236
|
+
partialStaleTimer = null;
|
|
237
|
+
}
|
|
238
|
+
lastPartialText = "";
|
|
239
|
+
emit({
|
|
240
|
+
type: "speech.final",
|
|
241
|
+
trackId: TRACK_ID,
|
|
242
|
+
text
|
|
243
|
+
});
|
|
244
|
+
accumulatedText = accumulatedText ? `${accumulatedText} ${text}` : text;
|
|
245
|
+
emit({
|
|
246
|
+
type: "speech.pause",
|
|
247
|
+
trackId: TRACK_ID,
|
|
248
|
+
pendingText: accumulatedText,
|
|
249
|
+
silenceMs: settings.pauseMs ?? 600
|
|
250
|
+
});
|
|
251
|
+
lastPartialText = "";
|
|
252
|
+
accumulatedText = "";
|
|
253
|
+
} else if (msgType === "auth_error" || msgType === "error") {
|
|
254
|
+
const errMsg = msg.message ?? msg.error ?? msgType;
|
|
255
|
+
log.error(`Server error: ${errMsg}`);
|
|
256
|
+
emit({
|
|
257
|
+
type: "control.error",
|
|
258
|
+
component: "stt-elevenlabs",
|
|
259
|
+
message: errMsg,
|
|
260
|
+
fatal: msgType === "auth_error"
|
|
261
|
+
});
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
function sendAudio(base64Data) {
|
|
265
|
+
if (!ws || !connected) return;
|
|
266
|
+
ws.send(
|
|
267
|
+
JSON.stringify({
|
|
268
|
+
message_type: "input_audio_chunk",
|
|
269
|
+
audio_base_64: base64Data,
|
|
270
|
+
commit: false,
|
|
271
|
+
sample_rate: 16e3
|
|
272
|
+
})
|
|
273
|
+
);
|
|
274
|
+
}
|
|
275
|
+
function closeWebSocket() {
|
|
276
|
+
connected = false;
|
|
277
|
+
if (ws) {
|
|
278
|
+
try {
|
|
279
|
+
ws.close();
|
|
280
|
+
} catch {
|
|
281
|
+
}
|
|
282
|
+
ws = null;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
/**
 * Node entry point.
 *
 * 1. Connects to the STT WebSocket, then announces `lifecycle.ready`.
 * 2. Streams every incoming `audio.chunk` to the server. If the socket is
 *    down, the first chunk triggers a single reconnect attempt; chunks that
 *    arrive while that attempt is in flight are dropped.
 * 3. When stdin closes: closes the socket, emits `lifecycle.done`, exits 0.
 *    On SIGTERM: closes the socket and exits 0.
 *
 * All other event types, including `control.interrupt`, are deliberately
 * ignored: the socket stays open.
 */
async function main() {
  await connectWebSocket();
  emit({ type: "lifecycle.ready", component: "stt-elevenlabs" });

  const rl = onEvent((event) => {
    if (event.type !== "audio.chunk") return;

    if (connected) {
      if (!interrupted) sendAudio(event.data);
      return;
    }
    if (reconnecting) return;

    reconnecting = true;
    interrupted = false;
    log.info("Reconnecting...");
    connectWebSocket()
      .then(() => {
        reconnecting = false;
        // Deliver the chunk that triggered the reconnect.
        sendAudio(event.data);
      })
      .catch((err) => {
        reconnecting = false;
        // Report the failure instead of swallowing it: otherwise the node
        // looks healthy while every audio chunk is being dropped. The next
        // chunk will trigger another attempt.
        const reason = err instanceof Error ? err.message : String(err);
        log.error(`Reconnect failed: ${reason}`);
      });
  });

  rl.on("close", () => {
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "stt-elevenlabs" });
    process.exit(0);
  });

  process.on("SIGTERM", () => {
    closeWebSocket();
    process.exit(0);
  });
}
|
|
316
|
+
// Start the node. Any startup failure (e.g. the initial connection) is logged
// and turns into exit code 1. The rejection value is not assumed to be an
// Error, so a thrown string or null still produces a readable message.
main().catch((err) => {
  const reason = err instanceof Error ? err.message : String(err);
  log.error(`Fatal: ${reason}`);
  process.exit(1);
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"name":"stt-elevenlabs","description":"Speech-to-text via ElevenLabs streaming API","consumes":["audio.chunk"],"emits":["speech.partial","speech.delta","speech.final","speech.pause","lifecycle.ready","lifecycle.done","control.error"],"arguments":{"language":{"type":"string","default":"en","description":"Language code for transcription"},"apiKey":{"type":"string","description":"ElevenLabs API key (overrides ELEVENLABS_API_KEY env var)"},"pauseMs":{"type":"number","description":"Pause duration threshold in ms"},"vadThreshold":{"type":"number","description":"VAD threshold 0-1 (higher = less sensitive)"},"minSpeechDurationMs":{"type":"number","description":"Minimum speech duration in ms (ignore short noise bursts)"},"minSilenceDurationMs":{"type":"number","description":"Minimum silence duration in ms"}},"env":{"ELEVENLABS_API_KEY":{"required":true,"description":"ElevenLabs API key for STT"}}}
|
package/package.json
CHANGED
|
@@ -1,16 +1,20 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@acpfx/stt-elevenlabs",
|
|
3
|
-
"version": "0.2.2",
|
|
3
|
+
"version": "0.2.4",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"acpfx-stt-elevenlabs": "./dist/index.js"
|
|
7
7
|
},
|
|
8
8
|
"main": "./dist/index.js",
|
|
9
|
+
"files": [
|
|
10
|
+
"dist",
|
|
11
|
+
"manifest.yaml"
|
|
12
|
+
],
|
|
9
13
|
"dependencies": {
|
|
10
14
|
"@acpfx/core": "0.4.0",
|
|
11
15
|
"@acpfx/node-sdk": "0.3.0"
|
|
12
16
|
},
|
|
13
17
|
"scripts": {
|
|
14
|
-
"build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external"
|
|
18
|
+
"build": "esbuild src/index.ts --bundle --banner:js=\"#!/usr/bin/env node\" --platform=node --format=esm --outfile=dist/index.js --packages=external && node ../../scripts/copy-manifest.js"
|
|
15
19
|
}
|
|
16
20
|
}
|
package/CHANGELOG.md
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
# @acpfx/stt-elevenlabs
|
|
2
|
-
|
|
3
|
-
## 0.2.2
|
|
4
|
-
|
|
5
|
-
### Patch Changes
|
|
6
|
-
|
|
7
|
-
- Updated dependencies [0e6838e]
|
|
8
|
-
- @acpfx/core@0.4.0
|
|
9
|
-
- @acpfx/node-sdk@0.3.0
|
|
10
|
-
|
|
11
|
-
## 0.2.1
|
|
12
|
-
|
|
13
|
-
### Patch Changes
|
|
14
|
-
|
|
15
|
-
- Updated dependencies [79c6694]
|
|
16
|
-
- Updated dependencies [a0320a1]
|
|
17
|
-
- @acpfx/core@0.3.0
|
|
18
|
-
- @acpfx/node-sdk@0.2.1
|
|
19
|
-
|
|
20
|
-
## 0.2.0
|
|
21
|
-
|
|
22
|
-
### Minor Changes
|
|
23
|
-
|
|
24
|
-
- d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
|
|
25
|
-
|
|
26
|
-
- Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
|
|
27
|
-
- Node manifests (manifest.yaml) declaring consumes/emits contracts
|
|
28
|
-
- Orchestrator event filtering: nodes only receive declared events
|
|
29
|
-
- Rust orchestrator with ratatui TUI (--ui flag)
|
|
30
|
-
- node-sdk with structured logging helpers
|
|
31
|
-
- CI/CD with GitHub Actions and changesets
|
|
32
|
-
- Platform-specific npm packages for Rust binaries (esbuild-style distribution)
|
|
33
|
-
|
|
34
|
-
### Patch Changes
|
|
35
|
-
|
|
36
|
-
- Updated dependencies [d757640]
|
|
37
|
-
- @acpfx/core@0.2.0
|
|
38
|
-
- @acpfx/node-sdk@0.2.0
|
package/src/index.ts
DELETED
|
@@ -1,273 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* stt-elevenlabs node — ElevenLabs Scribe v2 Realtime STT with built-in VAD.
|
|
3
|
-
*
|
|
4
|
-
* Reads audio.chunk events from stdin, streams to ElevenLabs WebSocket,
|
|
5
|
-
* emits speech.partial, speech.delta, speech.final, and speech.pause events.
|
|
6
|
-
*
|
|
7
|
-
* Uses commit_strategy=vad so ElevenLabs handles pause detection server-side.
|
|
8
|
-
*
|
|
9
|
-
* Settings (via ACPFX_SETTINGS):
|
|
10
|
-
* language?: string — language code (default: "en")
|
|
11
|
-
* apiKey?: string — ElevenLabs API key (falls back to ELEVENLABS_API_KEY env)
|
|
12
|
-
* pauseMs?: number — VAD silence threshold hint (default: 600)
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
import { emit, log, onEvent, handleManifestFlag } from "@acpfx/node-sdk";
|
|
16
|
-
|
|
17
|
-
handleManifestFlag();
|
|
18
|
-
|
|
19
|
-
const WS_URL = "wss://api.elevenlabs.io/v1/speech-to-text/realtime";
|
|
20
|
-
const MODEL = "scribe_v2_realtime";
|
|
21
|
-
|
|
22
|
-
type Settings = {
|
|
23
|
-
language?: string;
|
|
24
|
-
apiKey?: string;
|
|
25
|
-
pauseMs?: number;
|
|
26
|
-
vadThreshold?: number; // 0-1, default 0.5 (higher = less sensitive)
|
|
27
|
-
minSpeechDurationMs?: number; // default 250 (ignore short noise bursts)
|
|
28
|
-
minSilenceDurationMs?: number; // default 100
|
|
29
|
-
};
|
|
30
|
-
|
|
31
|
-
const settings: Settings = JSON.parse(process.env.ACPFX_SETTINGS || "{}");
|
|
32
|
-
const LANGUAGE = settings.language ?? "en";
|
|
33
|
-
const API_KEY = settings.apiKey ?? process.env.ELEVENLABS_API_KEY ?? "";
|
|
34
|
-
const TRACK_ID = "stt";
|
|
35
|
-
|
|
36
|
-
if (!API_KEY) {
|
|
37
|
-
log.error("No API key. Set ELEVENLABS_API_KEY or settings.apiKey");
|
|
38
|
-
process.exit(1);
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
let ws: WebSocket | null = null;
|
|
42
|
-
let connected = false;
|
|
43
|
-
let reconnecting = false;
|
|
44
|
-
let interrupted = false;
|
|
45
|
-
let lastPartialText = "";
|
|
46
|
-
let accumulatedText = "";
|
|
47
|
-
let partialStaleTimer: ReturnType<typeof setTimeout> | null = null;
|
|
48
|
-
const PARTIAL_STALE_MS = 3000;
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
async function connectWebSocket(): Promise<void> {
|
|
52
|
-
const vadSilenceSecs = (settings.pauseMs ?? 600) / 1000;
|
|
53
|
-
const vadThreshold = settings.vadThreshold ?? 0.5;
|
|
54
|
-
const minSpeechMs = settings.minSpeechDurationMs ?? 250;
|
|
55
|
-
const minSilenceMs = settings.minSilenceDurationMs ?? 100;
|
|
56
|
-
const url =
|
|
57
|
-
`${WS_URL}?model_id=${MODEL}` +
|
|
58
|
-
`&language_code=${encodeURIComponent(LANGUAGE)}` +
|
|
59
|
-
`&sample_rate=16000` +
|
|
60
|
-
`&encoding=pcm_s16le` +
|
|
61
|
-
`&commit_strategy=vad` +
|
|
62
|
-
`&vad_silence_threshold_secs=${vadSilenceSecs}` +
|
|
63
|
-
`&vad_threshold=${vadThreshold}` +
|
|
64
|
-
`&min_speech_duration_ms=${minSpeechMs}` +
|
|
65
|
-
`&min_silence_duration_ms=${minSilenceMs}`;
|
|
66
|
-
|
|
67
|
-
ws = new WebSocket(url, {
|
|
68
|
-
headers: { "xi-api-key": API_KEY },
|
|
69
|
-
} as unknown as string[]);
|
|
70
|
-
|
|
71
|
-
await new Promise<void>((resolve, reject) => {
|
|
72
|
-
ws!.addEventListener(
|
|
73
|
-
"open",
|
|
74
|
-
() => {
|
|
75
|
-
connected = true;
|
|
76
|
-
log.info("Connected to ElevenLabs STT");
|
|
77
|
-
resolve();
|
|
78
|
-
},
|
|
79
|
-
{ once: true },
|
|
80
|
-
);
|
|
81
|
-
|
|
82
|
-
ws!.addEventListener(
|
|
83
|
-
"error",
|
|
84
|
-
() => {
|
|
85
|
-
reject(new Error("WebSocket connection failed"));
|
|
86
|
-
},
|
|
87
|
-
{ once: true },
|
|
88
|
-
);
|
|
89
|
-
});
|
|
90
|
-
|
|
91
|
-
ws.addEventListener("message", (event: MessageEvent) => {
|
|
92
|
-
try {
|
|
93
|
-
const data =
|
|
94
|
-
typeof event.data === "string"
|
|
95
|
-
? event.data
|
|
96
|
-
: Buffer.from(event.data as ArrayBuffer).toString("utf-8");
|
|
97
|
-
const msg = JSON.parse(data);
|
|
98
|
-
handleServerMessage(msg);
|
|
99
|
-
} catch {
|
|
100
|
-
// ignore parse errors
|
|
101
|
-
}
|
|
102
|
-
});
|
|
103
|
-
|
|
104
|
-
ws.addEventListener("error", (event: Event) => {
|
|
105
|
-
log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
|
|
106
|
-
emit({
|
|
107
|
-
type: "control.error",
|
|
108
|
-
component: "stt-elevenlabs",
|
|
109
|
-
message: "WebSocket error",
|
|
110
|
-
fatal: false,
|
|
111
|
-
});
|
|
112
|
-
});
|
|
113
|
-
|
|
114
|
-
ws.addEventListener("close", () => {
|
|
115
|
-
connected = false;
|
|
116
|
-
log.info("WebSocket closed — will reconnect on next audio");
|
|
117
|
-
});
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
function handleServerMessage(msg: Record<string, unknown>): void {
|
|
121
|
-
const msgType = msg.message_type as string;
|
|
122
|
-
|
|
123
|
-
if (interrupted) return;
|
|
124
|
-
|
|
125
|
-
if (msgType === "partial_transcript") {
|
|
126
|
-
const text = (msg.text as string) ?? "";
|
|
127
|
-
if (!text) return;
|
|
128
|
-
|
|
129
|
-
// Check if this is a correction of a previous partial
|
|
130
|
-
if (lastPartialText && text !== lastPartialText && !text.startsWith(lastPartialText)) {
|
|
131
|
-
// This is a correction — emit speech.delta with replaces
|
|
132
|
-
emit({
|
|
133
|
-
type: "speech.delta",
|
|
134
|
-
trackId: TRACK_ID,
|
|
135
|
-
text,
|
|
136
|
-
replaces: lastPartialText,
|
|
137
|
-
});
|
|
138
|
-
} else {
|
|
139
|
-
emit({
|
|
140
|
-
type: "speech.partial",
|
|
141
|
-
trackId: TRACK_ID,
|
|
142
|
-
text,
|
|
143
|
-
});
|
|
144
|
-
}
|
|
145
|
-
lastPartialText = text;
|
|
146
|
-
|
|
147
|
-
// If partial never gets committed, force a commit after timeout.
|
|
148
|
-
// Continuous audio stream means the API may never see "end of speech."
|
|
149
|
-
if (partialStaleTimer) clearTimeout(partialStaleTimer);
|
|
150
|
-
partialStaleTimer = setTimeout(() => {
|
|
151
|
-
if (lastPartialText && !interrupted && ws && connected) {
|
|
152
|
-
log.info(`Stale partial: forcing commit`);
|
|
153
|
-
ws.send(JSON.stringify({
|
|
154
|
-
message_type: "input_audio_chunk",
|
|
155
|
-
audio_base_64: "",
|
|
156
|
-
commit: true,
|
|
157
|
-
sample_rate: 16000,
|
|
158
|
-
}));
|
|
159
|
-
}
|
|
160
|
-
partialStaleTimer = null;
|
|
161
|
-
}, PARTIAL_STALE_MS);
|
|
162
|
-
} else if (
|
|
163
|
-
msgType === "committed_transcript" ||
|
|
164
|
-
msgType === "committed_transcript_with_timestamps"
|
|
165
|
-
) {
|
|
166
|
-
const text = (msg.text as string) ?? "";
|
|
167
|
-
if (!text) return;
|
|
168
|
-
// Clear stale timer — proper commit arrived
|
|
169
|
-
if (partialStaleTimer) { clearTimeout(partialStaleTimer); partialStaleTimer = null; }
|
|
170
|
-
lastPartialText = "";
|
|
171
|
-
|
|
172
|
-
// Emit speech.final
|
|
173
|
-
emit({
|
|
174
|
-
type: "speech.final",
|
|
175
|
-
trackId: TRACK_ID,
|
|
176
|
-
text,
|
|
177
|
-
});
|
|
178
|
-
|
|
179
|
-
accumulatedText = accumulatedText ? `${accumulatedText} ${text}` : text;
|
|
180
|
-
|
|
181
|
-
// When using VAD commit_strategy, a committed_transcript means
|
|
182
|
-
// ElevenLabs detected a pause. Emit speech.pause.
|
|
183
|
-
emit({
|
|
184
|
-
type: "speech.pause",
|
|
185
|
-
trackId: TRACK_ID,
|
|
186
|
-
pendingText: accumulatedText,
|
|
187
|
-
silenceMs: settings.pauseMs ?? 600,
|
|
188
|
-
});
|
|
189
|
-
|
|
190
|
-
// Reset for next utterance
|
|
191
|
-
lastPartialText = "";
|
|
192
|
-
accumulatedText = "";
|
|
193
|
-
} else if (msgType === "auth_error" || msgType === "error") {
|
|
194
|
-
const errMsg =
|
|
195
|
-
(msg.message as string) ?? (msg.error as string) ?? msgType;
|
|
196
|
-
log.error(`Server error: ${errMsg}`);
|
|
197
|
-
emit({
|
|
198
|
-
type: "control.error",
|
|
199
|
-
component: "stt-elevenlabs",
|
|
200
|
-
message: errMsg,
|
|
201
|
-
fatal: msgType === "auth_error",
|
|
202
|
-
});
|
|
203
|
-
}
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
function sendAudio(base64Data: string): void {
|
|
207
|
-
if (!ws || !connected) return;
|
|
208
|
-
ws.send(
|
|
209
|
-
JSON.stringify({
|
|
210
|
-
message_type: "input_audio_chunk",
|
|
211
|
-
audio_base_64: base64Data,
|
|
212
|
-
commit: false,
|
|
213
|
-
sample_rate: 16000,
|
|
214
|
-
}),
|
|
215
|
-
);
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
function closeWebSocket(): void {
|
|
219
|
-
connected = false;
|
|
220
|
-
if (ws) {
|
|
221
|
-
try {
|
|
222
|
-
ws.close();
|
|
223
|
-
} catch {
|
|
224
|
-
// ignore
|
|
225
|
-
}
|
|
226
|
-
ws = null;
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
// --- Main ---
|
|
231
|
-
|
|
232
|
-
async function main(): Promise<void> {
|
|
233
|
-
await connectWebSocket();
|
|
234
|
-
|
|
235
|
-
// Emit lifecycle.ready after WS is connected
|
|
236
|
-
emit({ type: "lifecycle.ready", component: "stt-elevenlabs" });
|
|
237
|
-
|
|
238
|
-
const rl = onEvent((event) => {
|
|
239
|
-
if (event.type === "audio.chunk") {
|
|
240
|
-
if (!connected && !reconnecting) {
|
|
241
|
-
reconnecting = true;
|
|
242
|
-
interrupted = false;
|
|
243
|
-
log.info("Reconnecting...");
|
|
244
|
-
connectWebSocket().then(() => {
|
|
245
|
-
reconnecting = false;
|
|
246
|
-
sendAudio(event.data as string);
|
|
247
|
-
}).catch(() => {
|
|
248
|
-
reconnecting = false;
|
|
249
|
-
});
|
|
250
|
-
} else if (connected && !interrupted) {
|
|
251
|
-
sendAudio(event.data as string);
|
|
252
|
-
}
|
|
253
|
-
} else if (event.type === "control.interrupt") {
|
|
254
|
-
// Don't close WebSocket — STT should keep listening for barge-in.
|
|
255
|
-
}
|
|
256
|
-
});
|
|
257
|
-
|
|
258
|
-
rl.on("close", () => {
|
|
259
|
-
closeWebSocket();
|
|
260
|
-
emit({ type: "lifecycle.done", component: "stt-elevenlabs" });
|
|
261
|
-
process.exit(0);
|
|
262
|
-
});
|
|
263
|
-
|
|
264
|
-
process.on("SIGTERM", () => {
|
|
265
|
-
closeWebSocket();
|
|
266
|
-
process.exit(0);
|
|
267
|
-
});
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
main().catch((err) => {
|
|
271
|
-
log.error(`Fatal: ${err.message}`);
|
|
272
|
-
process.exit(1);
|
|
273
|
-
});
|