@acpfx/stt-deepgram 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +45 -0
- package/dist/index.js +289 -0
- package/dist/manifest.json +1 -0
- package/package.json +6 -2
- package/CHANGELOG.md +0 -38
- package/src/index.ts +0 -244
package/LICENSE
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
ISC License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 acpfx contributors
|
|
4
|
+
|
|
5
|
+
Permission to use, copy, modify, and/or distribute this software for any
|
|
6
|
+
purpose with or without fee is hereby granted, provided that the above
|
|
7
|
+
copyright notice and this permission notice appear in all copies.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
|
10
|
+
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
11
|
+
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
|
12
|
+
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
|
13
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
|
14
|
+
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
15
|
+
PERFORMANCE OF THIS SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# @acpfx/stt-deepgram
|
|
2
|
+
|
|
3
|
+
Speech-to-text via Deepgram streaming API. Streams partial transcriptions in real time with configurable VAD and endpointing.
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
|
|
7
|
+
This package is a pipeline node for [@acpfx/cli](../orchestrator/README.md). See the CLI package for installation and usage.
|
|
8
|
+
|
|
9
|
+
Requires a `DEEPGRAM_API_KEY` environment variable.
|
|
10
|
+
|
|
11
|
+
## Manifest
|
|
12
|
+
|
|
13
|
+
- **Consumes:** `audio.chunk`
|
|
14
|
+
- **Emits:** `speech.partial`, `speech.final`, `speech.pause`, `lifecycle.ready`, `lifecycle.done`, `control.error`
|
|
15
|
+
|
|
16
|
+
## Settings
|
|
17
|
+
|
|
18
|
+
| Name | Type | Default | Description |
|
|
19
|
+
|------|------|---------|-------------|
|
|
20
|
+
| `language` | string | `en` | Language code |
|
|
21
|
+
| `model` | string | `nova-3` | Deepgram model name |
|
|
22
|
+
| `utteranceEndMs` | number | `1000` | Silence ms before utterance end |
|
|
23
|
+
| `endpointing` | number | `300` | VAD endpointing threshold in ms |
|
|
24
|
+
| `apiKey` | string | | Overrides `DEEPGRAM_API_KEY` env var |
|
|
25
|
+
|
|
26
|
+
## Pipeline Example
|
|
27
|
+
|
|
28
|
+
```yaml
|
|
29
|
+
nodes:
|
|
30
|
+
stt:
|
|
31
|
+
use: "@acpfx/stt-deepgram"
|
|
32
|
+
settings: { language: en, model: nova-3 }
|
|
33
|
+
outputs: [bridge]
|
|
34
|
+
env:
|
|
35
|
+
DEEPGRAM_API_KEY: ${DEEPGRAM_API_KEY}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## External Links
|
|
39
|
+
|
|
40
|
+
- [Deepgram](https://deepgram.com) -- Speech AI platform
|
|
41
|
+
- [Deepgram Developer Docs](https://developers.deepgram.com) -- API reference and guides
|
|
42
|
+
|
|
43
|
+
## License
|
|
44
|
+
|
|
45
|
+
ISC
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// ../node-sdk/src/index.ts
|
|
4
|
+
import { createInterface } from "node:readline";
|
|
5
|
+
|
|
6
|
+
// ../core/src/config.ts
|
|
7
|
+
import { parse as parseYaml } from "yaml";
|
|
8
|
+
|
|
9
|
+
// ../core/src/manifest.ts
|
|
10
|
+
import { readFileSync } from "node:fs";
|
|
11
|
+
import { join, dirname } from "node:path";
|
|
12
|
+
import { z as z2 } from "zod";
|
|
13
|
+
|
|
14
|
+
// ../core/src/acpfx-flags.ts
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
// JSON shape with a required boolean `needed` and an optional description;
// handleAcpfxFlags writes `{ needed: false }` for `--acpfx-setup-check`.
var SetupCheckResponseSchema = z.object({
  needed: z.boolean(),
  description: z.string().optional()
});

// Setup status message, discriminated on `type`:
// "progress" (optional `pct`), "complete", or "error".
var SetupProgressSchema = z.discriminatedUnion("type", [
  z.object({ type: z.literal("progress"), message: z.string(), pct: z.number().optional() }),
  z.object({ type: z.literal("complete"), message: z.string() }),
  z.object({ type: z.literal("error"), message: z.string() })
]);

// JSON shape matching what handleAcpfxFlags writes for an unrecognised
// `--acpfx-*` flag: `{ unsupported: true, flag }`.
var UnsupportedFlagResponseSchema = z.object({
  unsupported: z.boolean(),
  flag: z.string()
});
|
|
33
|
+
|
|
34
|
+
// ../core/src/manifest.ts
|
|
35
|
+
// Primitive types a manifest argument may declare.
var ArgumentTypeSchema = z2.enum(["string", "number", "boolean"]);

// One entry of a manifest's `arguments` map; only `type` is required.
var ManifestArgumentSchema = z2.object({
  type: ArgumentTypeSchema,
  default: z2.unknown().optional(),
  description: z2.string().optional(),
  required: z2.boolean().optional(),
  enum: z2.array(z2.unknown()).optional()
});

// One entry of a manifest's `env` map.
var ManifestEnvFieldSchema = z2.object({
  required: z2.boolean().optional(),
  description: z2.string().optional()
});

// Whole node manifest: name, the event types it consumes and emits, and
// optional argument / environment-variable declarations.
var NodeManifestSchema = z2.object({
  name: z2.string(),
  description: z2.string().optional(),
  consumes: z2.array(z2.string()),
  emits: z2.array(z2.string()),
  arguments: z2.record(z2.string(), ManifestArgumentSchema).optional(),
  additional_arguments: z2.boolean().optional(),
  env: z2.record(z2.string(), ManifestEnvFieldSchema).optional()
});
|
|
56
|
+
/**
 * Answers `--acpfx-*` command-line queries (and the legacy `--manifest` flag).
 *
 * Returns without side effects when process.argv contains no such flag.
 * Otherwise:
 *   - `--acpfx-manifest` / `--manifest`: delegates to printManifest(manifestPath);
 *   - `--acpfx-setup-check`: writes `{"needed":false}` to stdout and exits 0;
 *   - any other `--acpfx-*` flag: writes `{"unsupported":true,"flag":…}` and exits 0.
 *
 * @param manifestPath optional manifest location, passed through to printManifest.
 */
function handleAcpfxFlags(manifestPath) {
  // The first `--acpfx-*` argument wins; the legacy flag is only a fallback.
  const requested =
    process.argv.find((arg) => arg.startsWith("--acpfx-")) ??
    (process.argv.includes("--manifest") ? "--acpfx-manifest" : void 0);
  if (requested === void 0) return;

  if (requested === "--acpfx-manifest") {
    printManifest(manifestPath);
    return;
  }

  const reply =
    requested === "--acpfx-setup-check"
      ? { needed: false }
      : { unsupported: true, flag: requested };
  process.stdout.write(JSON.stringify(reply) + "\n");
  process.exit(0);
}
|
|
76
|
+
/**
 * Thin wrapper that forwards to handleAcpfxFlags.
 *
 * @param manifestPath optional manifest location, forwarded unchanged.
 */
function handleManifestFlag(manifestPath) {
  return handleAcpfxFlags(manifestPath);
}
|
|
79
|
+
function printManifest(manifestPath) {
|
|
80
|
+
if (!manifestPath) {
|
|
81
|
+
const script = process.argv[1];
|
|
82
|
+
const scriptDir = dirname(script);
|
|
83
|
+
const scriptBase = script.replace(/\.[^.]+$/, "");
|
|
84
|
+
const colocated = `${scriptBase}.manifest.json`;
|
|
85
|
+
try {
|
|
86
|
+
readFileSync(colocated);
|
|
87
|
+
manifestPath = colocated;
|
|
88
|
+
} catch {
|
|
89
|
+
manifestPath = join(scriptDir, "manifest.json");
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
try {
|
|
93
|
+
const content = readFileSync(manifestPath, "utf8");
|
|
94
|
+
process.stdout.write(content.trim() + "\n");
|
|
95
|
+
process.exit(0);
|
|
96
|
+
} catch (err) {
|
|
97
|
+
process.stderr.write(`Failed to read manifest: ${err}
|
|
98
|
+
`);
|
|
99
|
+
process.exit(1);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// ../node-sdk/src/index.ts
|
|
104
|
+
var NODE_NAME = process.env.ACPFX_NODE_NAME ?? "unknown";
|
|
105
|
+
function emit(event) {
|
|
106
|
+
process.stdout.write(JSON.stringify(event) + "\n");
|
|
107
|
+
}
|
|
108
|
+
function log(level, message) {
|
|
109
|
+
emit({ type: "log", level, component: NODE_NAME, message });
|
|
110
|
+
}
|
|
111
|
+
log.info = (message) => log("info", message);
|
|
112
|
+
log.warn = (message) => log("warn", message);
|
|
113
|
+
log.error = (message) => log("error", message);
|
|
114
|
+
log.debug = (message) => log("debug", message);
|
|
115
|
+
/**
 * Subscribes `handler` to newline-delimited JSON events read from stdin.
 *
 * Blank lines are skipped. A line that is not valid JSON is reported with
 * log.warn and skipped; an exception thrown by `handler` is reported with
 * log.error. Neither stops the reader, so one bad event cannot take the node
 * down, but failures are no longer invisible.
 *
 * @param handler called with each parsed event.
 * @returns the readline interface, so callers can listen for its "close" event.
 */
function onEvent(handler) {
  const rl = createInterface({ input: process.stdin });
  rl.on("line", (line) => {
    if (!line.trim()) return;

    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Report only the size: the payload may be large (e.g. base64 audio).
      log.warn(`Ignoring malformed event line (${line.length} chars)`);
      return;
    }

    // Kept separate from parsing so a handler bug is not mistaken for bad input.
    try {
      handler(event);
    } catch (err) {
      log.error(`Event handler failed: ${err instanceof Error ? err.message : String(err)}`);
    }
  });
  return rl;
}
|
|
127
|
+
|
|
128
|
+
// src/index.ts
|
|
129
|
+
// Answer --acpfx-* / --manifest queries first; those paths exit the process,
// so a manifest query never needs an API key or valid settings.
handleManifestFlag();

var WS_URL = "wss://api.deepgram.com/v1/listen";

// Node settings arrive as JSON in ACPFX_SETTINGS (empty/unset means "{}").
// Invalid JSON or a non-object value (e.g. `null`) is reported through the
// structured log and exits 1, rather than crashing with a raw exception.
var settings = (() => {
  try {
    const parsed = JSON.parse(process.env.ACPFX_SETTINGS || "{}");
    if (parsed !== null && typeof parsed === "object") return parsed;
    log.error("ACPFX_SETTINGS must be a JSON object");
  } catch (err) {
    log.error(`ACPFX_SETTINGS is not valid JSON: ${err instanceof Error ? err.message : String(err)}`);
  }
  process.exit(1);
})();

// settings.apiKey takes precedence over the DEEPGRAM_API_KEY environment variable.
var API_KEY = settings.apiKey ?? process.env.DEEPGRAM_API_KEY ?? "";
var LANGUAGE = settings.language ?? "en";
var MODEL = settings.model ?? "nova-3";
var UTTERANCE_END_MS = settings.utteranceEndMs ?? 1e3;
var ENDPOINTING = settings.endpointing ?? 300;
var TRACK_ID = "stt";

if (!API_KEY) {
  log.error("No API key. Set DEEPGRAM_API_KEY or settings.apiKey");
  process.exit(1);
}

// Connection state shared by the functions below.
var ws = null; // current WebSocket, or null when none
var connected = false; // true between the socket's "open" and "close" events
var lastFinalText = ""; // transcript of the most recent final result
var pendingText = ""; // final transcripts accumulated since the last speech.pause
|
|
146
|
+
// Connection attempt currently in progress, shared by concurrent callers.
var connectInFlight = null;

/**
 * Opens the Deepgram streaming WebSocket and resolves once it is open.
 *
 * Calls made while an attempt is already in progress share that attempt
 * instead of opening additional sockets. Rejects when the socket reports an
 * error, or closes, before opening.
 *
 * @returns a promise that settles when the connection attempt finishes.
 */
function connectWebSocket() {
  if (connectInFlight) return connectInFlight;
  connectInFlight = openDeepgramSocket().finally(() => {
    connectInFlight = null;
  });
  return connectInFlight;
}

// Creates the socket, waits for it to open, then wires message/error/close handlers.
async function openDeepgramSocket() {
  // MODEL comes from user settings, so it is encoded just like LANGUAGE.
  const url = `${WS_URL}?model=${encodeURIComponent(MODEL)}&language=${encodeURIComponent(LANGUAGE)}&encoding=linear16&sample_rate=16000&channels=1&interim_results=true&punctuate=true&smart_format=true&utterance_end_ms=${UTTERANCE_END_MS}&endpointing=${ENDPOINTING}&vad_events=true`;

  // Work on a local reference: the module-level `ws` may be replaced or
  // cleared while this socket's events are still being delivered.
  const socket = new WebSocket(url, ["token", API_KEY]);
  ws = socket;

  await new Promise((resolve, reject) => {
    socket.addEventListener(
      "open",
      () => {
        connected = true;
        log.info("Connected to Deepgram STT");
        resolve(undefined);
      },
      { once: true }
    );
    socket.addEventListener(
      "error",
      () => {
        reject(new Error("WebSocket connection failed"));
      },
      { once: true }
    );
    // A close without a preceding error must not leave the promise pending.
    // After a successful open this reject is a no-op.
    socket.addEventListener(
      "close",
      () => {
        reject(new Error("WebSocket closed before opening"));
      },
      { once: true }
    );
  });

  socket.addEventListener("message", (event) => {
    let msg;
    try {
      const data = typeof event.data === "string" ? event.data : Buffer.from(event.data).toString("utf-8");
      msg = JSON.parse(data);
    } catch {
      // Frames that are not JSON are ignored.
      return;
    }
    try {
      handleServerMessage(msg);
    } catch (err) {
      log.error(`Failed to handle Deepgram message: ${err instanceof Error ? err.message : String(err)}`);
    }
  });

  socket.addEventListener("error", (event) => {
    log.error(`WebSocket error: ${event.message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "stt-deepgram",
      message: "STT WebSocket error",
      fatal: false
    });
  });

  socket.addEventListener("close", (event) => {
    log.info(`WebSocket closed (code=${event.code})`);
    // A late close from a superseded socket must not mark the current one as down.
    if (ws === socket || ws === null) connected = false;
  });
}
|
|
189
|
+
function handleServerMessage(msg) {
|
|
190
|
+
const type = msg.type;
|
|
191
|
+
if (type === "UtteranceEnd") {
|
|
192
|
+
if (pendingText) {
|
|
193
|
+
emit({
|
|
194
|
+
type: "speech.pause",
|
|
195
|
+
trackId: TRACK_ID,
|
|
196
|
+
pendingText,
|
|
197
|
+
silenceMs: UTTERANCE_END_MS
|
|
198
|
+
});
|
|
199
|
+
pendingText = "";
|
|
200
|
+
}
|
|
201
|
+
return;
|
|
202
|
+
}
|
|
203
|
+
if (type === "SpeechStarted") {
|
|
204
|
+
return;
|
|
205
|
+
}
|
|
206
|
+
if (type === "Results") {
|
|
207
|
+
const channel = msg.channel;
|
|
208
|
+
const alternatives = channel?.alternatives ?? [];
|
|
209
|
+
if (alternatives.length === 0) return;
|
|
210
|
+
const transcript = alternatives[0].transcript ?? "";
|
|
211
|
+
const isFinal = msg.is_final === true;
|
|
212
|
+
const speechFinal = msg.speech_final === true;
|
|
213
|
+
if (!transcript) return;
|
|
214
|
+
if (isFinal) {
|
|
215
|
+
lastFinalText = transcript;
|
|
216
|
+
pendingText = pendingText ? pendingText + " " + transcript : transcript;
|
|
217
|
+
emit({
|
|
218
|
+
type: "speech.final",
|
|
219
|
+
trackId: TRACK_ID,
|
|
220
|
+
text: transcript,
|
|
221
|
+
confidence: alternatives[0].confidence ?? void 0
|
|
222
|
+
});
|
|
223
|
+
if (speechFinal) {
|
|
224
|
+
emit({
|
|
225
|
+
type: "speech.pause",
|
|
226
|
+
trackId: TRACK_ID,
|
|
227
|
+
pendingText,
|
|
228
|
+
silenceMs: ENDPOINTING
|
|
229
|
+
});
|
|
230
|
+
pendingText = "";
|
|
231
|
+
}
|
|
232
|
+
} else {
|
|
233
|
+
emit({
|
|
234
|
+
type: "speech.partial",
|
|
235
|
+
trackId: TRACK_ID,
|
|
236
|
+
text: transcript
|
|
237
|
+
});
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
function sendAudio(base64Pcm) {
|
|
242
|
+
if (!ws || !connected) return;
|
|
243
|
+
const pcm = Buffer.from(base64Pcm, "base64");
|
|
244
|
+
try {
|
|
245
|
+
ws.send(pcm);
|
|
246
|
+
} catch {
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
function closeWebSocket() {
|
|
250
|
+
connected = false;
|
|
251
|
+
if (ws) {
|
|
252
|
+
try {
|
|
253
|
+
ws.send(JSON.stringify({ type: "CloseStream" }));
|
|
254
|
+
ws.close();
|
|
255
|
+
} catch {
|
|
256
|
+
}
|
|
257
|
+
ws = null;
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
/**
 * Node entry point: connects to Deepgram, announces readiness, then forwards
 * audio.chunk events from stdin to the socket until stdin closes or SIGTERM
 * arrives.
 *
 * While the socket is down, the first chunk starts a single reconnect attempt
 * and later chunks wait on that same attempt (rather than each opening a
 * socket). Queued chunks are sent in arrival order once it succeeds and are
 * dropped by sendAudio if it fails. Reconnect failures are logged.
 */
async function main() {
  await connectWebSocket();
  emit({ type: "lifecycle.ready", component: "stt-deepgram" });

  // Reconnect attempt in progress, if any.
  let reconnecting = null;

  const rl = onEvent((event) => {
    // Other events (e.g. control.interrupt) are deliberately ignored: the
    // socket stays open so STT keeps listening.
    if (event.type !== "audio.chunk") return;

    if (connected) {
      sendAudio(event.data);
      return;
    }
    if (!reconnecting) {
      reconnecting = connectWebSocket()
        .catch((err) => {
          log.warn(`Deepgram reconnect failed: ${err instanceof Error ? err.message : String(err)}`);
        })
        .finally(() => {
          reconnecting = null;
        });
    }
    reconnecting.then(() => {
      sendAudio(event.data);
    });
  });

  rl.on("close", () => {
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "stt-deepgram" });
    process.exit(0);
  });

  process.on("SIGTERM", () => {
    closeWebSocket();
    process.exit(0);
  });
}

main().catch((err) => {
  log.error(`Fatal: ${err instanceof Error ? err.message : String(err)}`);
  process.exit(1);
});
|
|
package/dist/manifest.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"name":"stt-deepgram","description":"Speech-to-text via Deepgram streaming API","consumes":["audio.chunk"],"emits":["speech.partial","speech.final","speech.pause","lifecycle.ready","lifecycle.done","control.error"],"arguments":{"language":{"type":"string","default":"en","description":"Language code for transcription"},"model":{"type":"string","default":"nova-3","description":"Deepgram model name"},"utteranceEndMs":{"type":"number","default":1000,"description":"Milliseconds of silence before utterance end"},"endpointing":{"type":"number","default":300,"description":"VAD endpointing threshold in ms"},"apiKey":{"type":"string","description":"Deepgram API key (overrides DEEPGRAM_API_KEY env var)"}},"env":{"DEEPGRAM_API_KEY":{"required":true,"description":"Deepgram API key for STT"}}}
|
package/package.json
CHANGED
|
@@ -1,16 +1,20 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@acpfx/stt-deepgram",
|
|
3
|
-
"version": "0.2.2",
|
|
3
|
+
"version": "0.2.4",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"acpfx-stt-deepgram": "./dist/index.js"
|
|
7
7
|
},
|
|
8
8
|
"main": "./dist/index.js",
|
|
9
|
+
"files": [
|
|
10
|
+
"dist",
|
|
11
|
+
"manifest.yaml"
|
|
12
|
+
],
|
|
9
13
|
"dependencies": {
|
|
10
14
|
"@acpfx/core": "0.4.0",
|
|
11
15
|
"@acpfx/node-sdk": "0.3.0"
|
|
12
16
|
},
|
|
13
17
|
"scripts": {
|
|
14
|
-
"build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external"
|
|
18
|
+
"build": "esbuild src/index.ts --bundle --banner:js=\"#!/usr/bin/env node\" --platform=node --format=esm --outfile=dist/index.js --packages=external && node ../../scripts/copy-manifest.js"
|
|
15
19
|
}
|
|
16
20
|
}
|
package/CHANGELOG.md
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
# @acpfx/stt-deepgram
|
|
2
|
-
|
|
3
|
-
## 0.2.2
|
|
4
|
-
|
|
5
|
-
### Patch Changes
|
|
6
|
-
|
|
7
|
-
- Updated dependencies [0e6838e]
|
|
8
|
-
- @acpfx/core@0.4.0
|
|
9
|
-
- @acpfx/node-sdk@0.3.0
|
|
10
|
-
|
|
11
|
-
## 0.2.1
|
|
12
|
-
|
|
13
|
-
### Patch Changes
|
|
14
|
-
|
|
15
|
-
- Updated dependencies [79c6694]
|
|
16
|
-
- Updated dependencies [a0320a1]
|
|
17
|
-
- @acpfx/core@0.3.0
|
|
18
|
-
- @acpfx/node-sdk@0.2.1
|
|
19
|
-
|
|
20
|
-
## 0.2.0
|
|
21
|
-
|
|
22
|
-
### Minor Changes
|
|
23
|
-
|
|
24
|
-
- d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
|
|
25
|
-
|
|
26
|
-
- Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
|
|
27
|
-
- Node manifests (manifest.yaml) declaring consumes/emits contracts
|
|
28
|
-
- Orchestrator event filtering: nodes only receive declared events
|
|
29
|
-
- Rust orchestrator with ratatui TUI (--ui flag)
|
|
30
|
-
- node-sdk with structured logging helpers
|
|
31
|
-
- CI/CD with GitHub Actions and changesets
|
|
32
|
-
- Platform-specific npm packages for Rust binaries (esbuild-style distribution)
|
|
33
|
-
|
|
34
|
-
### Patch Changes
|
|
35
|
-
|
|
36
|
-
- Updated dependencies [d757640]
|
|
37
|
-
- @acpfx/core@0.2.0
|
|
38
|
-
- @acpfx/node-sdk@0.2.0
|
package/src/index.ts
DELETED
|
@@ -1,244 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* stt-deepgram node — Deepgram Nova-3 Realtime STT with UtteranceEnd detection.
|
|
3
|
-
*
|
|
4
|
-
* Reads audio.chunk events from stdin, streams to Deepgram WebSocket,
|
|
5
|
-
* emits speech.partial, speech.final, and speech.pause events.
|
|
6
|
-
*
|
|
7
|
-
* Uses UtteranceEnd for end-of-turn detection — analyzes word timing gaps,
|
|
8
|
-
* ignores non-speech audio (won't false-trigger on SFX sounds).
|
|
9
|
-
*
|
|
10
|
-
* Settings (via ACPFX_SETTINGS):
|
|
11
|
-
* language?: string — language code (default: "en")
|
|
12
|
-
* apiKey?: string — Deepgram API key (falls back to DEEPGRAM_API_KEY env)
|
|
13
|
-
* model?: string — STT model (default: "nova-3")
|
|
14
|
-
* utteranceEndMs?: number — ms gap for utterance end (default: 1000)
|
|
15
|
-
* endpointing?: number — VAD endpointing ms (default: 300)
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
import { emit, log, onEvent, handleManifestFlag } from "@acpfx/node-sdk";
|
|
19
|
-
|
|
20
|
-
// Answer manifest/flag queries first; that path exits the process, so a
// manifest query never needs an API key or valid settings.
handleManifestFlag();

const WS_URL = "wss://api.deepgram.com/v1/listen";

type Settings = {
  language?: string;
  apiKey?: string;
  model?: string;
  utteranceEndMs?: number;
  endpointing?: number;
};

/**
 * Reads node settings from ACPFX_SETTINGS (empty/unset means "{}").
 * Invalid JSON or a non-object value (e.g. `null`) is reported through the
 * structured log and exits 1, rather than crashing with a raw exception.
 */
function loadSettings(): Settings {
  try {
    const parsed: unknown = JSON.parse(process.env.ACPFX_SETTINGS || "{}");
    if (parsed !== null && typeof parsed === "object") return parsed as Settings;
    log.error("ACPFX_SETTINGS must be a JSON object");
  } catch (err) {
    log.error(`ACPFX_SETTINGS is not valid JSON: ${err instanceof Error ? err.message : String(err)}`);
  }
  process.exit(1);
}

const settings: Settings = loadSettings();
// settings.apiKey takes precedence over the DEEPGRAM_API_KEY environment variable.
const API_KEY = settings.apiKey ?? process.env.DEEPGRAM_API_KEY ?? "";
const LANGUAGE = settings.language ?? "en";
const MODEL = settings.model ?? "nova-3";
const UTTERANCE_END_MS = settings.utteranceEndMs ?? 1000;
const ENDPOINTING = settings.endpointing ?? 300;
const TRACK_ID = "stt";

if (!API_KEY) {
  log.error("No API key. Set DEEPGRAM_API_KEY or settings.apiKey");
  process.exit(1);
}

// Connection state shared by the functions below.
let ws: WebSocket | null = null; // current WebSocket, or null when none
let connected = false; // true between the socket's "open" and "close" events
let lastFinalText = ""; // transcript of the most recent final result
let pendingText = ""; // final transcripts accumulated since the last speech.pause
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
// Connection attempt currently in progress, shared by concurrent callers.
let connectInFlight: Promise<void> | null = null;

/**
 * Opens the Deepgram streaming WebSocket and resolves once it is open.
 *
 * Calls made while an attempt is already in progress share that attempt
 * instead of opening additional sockets. Rejects when the socket reports an
 * error, or closes, before opening.
 */
function connectWebSocket(): Promise<void> {
  if (connectInFlight) return connectInFlight;
  connectInFlight = openDeepgramSocket().finally(() => {
    connectInFlight = null;
  });
  return connectInFlight;
}

// Creates the socket, waits for it to open, then wires message/error/close handlers.
async function openDeepgramSocket(): Promise<void> {
  // MODEL comes from user settings, so it is encoded just like LANGUAGE.
  const url =
    `${WS_URL}?model=${encodeURIComponent(MODEL)}` +
    `&language=${encodeURIComponent(LANGUAGE)}` +
    `&encoding=linear16` +
    `&sample_rate=16000` +
    `&channels=1` +
    `&interim_results=true` +
    `&punctuate=true` +
    `&smart_format=true` +
    `&utterance_end_ms=${UTTERANCE_END_MS}` +
    `&endpointing=${ENDPOINTING}` +
    `&vad_events=true`;

  // Work on a local reference: the module-level `ws` may be replaced or
  // cleared while this socket's events are still being delivered.
  const socket = new WebSocket(url, ["token", API_KEY]);
  ws = socket;

  await new Promise<void>((resolve, reject) => {
    socket.addEventListener(
      "open",
      () => {
        connected = true;
        log.info("Connected to Deepgram STT");
        resolve();
      },
      { once: true },
    );
    socket.addEventListener(
      "error",
      () => {
        reject(new Error("WebSocket connection failed"));
      },
      { once: true },
    );
    // A close without a preceding error must not leave the promise pending.
    // After a successful open this reject is a no-op.
    socket.addEventListener(
      "close",
      () => {
        reject(new Error("WebSocket closed before opening"));
      },
      { once: true },
    );
  });

  socket.addEventListener("message", (event: MessageEvent) => {
    let msg: Record<string, unknown>;
    try {
      const data =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(event.data as ArrayBuffer).toString("utf-8");
      msg = JSON.parse(data);
    } catch {
      // Frames that are not JSON are ignored.
      return;
    }
    try {
      handleServerMessage(msg);
    } catch (err) {
      log.error(`Failed to handle Deepgram message: ${err instanceof Error ? err.message : String(err)}`);
    }
  });

  socket.addEventListener("error", (event: Event) => {
    log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "stt-deepgram",
      message: "STT WebSocket error",
      fatal: false,
    });
  });

  socket.addEventListener("close", (event: CloseEvent) => {
    log.info(`WebSocket closed (code=${event.code})`);
    // A late close from a superseded socket must not mark the current one as down.
    if (ws === socket || ws === null) connected = false;
  });
}
|
|
115
|
-
|
|
116
|
-
// Emits speech.pause for the accumulated final text (if any) and clears it.
function emitPendingPause(silenceMs: number): void {
  if (!pendingText) return;
  emit({
    type: "speech.pause",
    trackId: TRACK_ID,
    pendingText,
    silenceMs,
  });
  pendingText = "";
}

/**
 * Translates one parsed Deepgram message into pipeline events.
 *
 *   - "UtteranceEnd": flushes accumulated final text as speech.pause
 *     (silenceMs = UTTERANCE_END_MS).
 *   - "Results", interim: emits speech.partial.
 *   - "Results", is_final: appends the transcript to the pending text and emits
 *     speech.final; when speech_final is also set, flushes the pending text as
 *     speech.pause (silenceMs = ENDPOINTING).
 *   - Anything else (including "SpeechStarted") is ignored.
 *
 * A final result flagged speech_final with an empty transcript still flushes
 * text accumulated from earlier final segments, so the pause is not delayed
 * until a later UtteranceEnd.
 */
function handleServerMessage(msg: Record<string, unknown>): void {
  const type = msg.type;
  if (type === "UtteranceEnd") {
    emitPendingPause(UTTERANCE_END_MS);
    return;
  }
  if (type !== "Results") return;

  // Tolerate a missing or malformed alternatives list instead of throwing.
  const channel = msg.channel as { alternatives?: unknown } | null | undefined;
  const alternatives = channel?.alternatives;
  const first: unknown = Array.isArray(alternatives) ? alternatives[0] : undefined;
  const best =
    typeof first === "object" && first !== null ? (first as Record<string, unknown>) : undefined;

  const rawTranscript = best?.transcript;
  const transcript = typeof rawTranscript === "string" ? rawTranscript : "";
  const rawConfidence = best?.confidence;
  const confidence = typeof rawConfidence === "number" ? rawConfidence : undefined;
  const isFinal = msg.is_final === true;
  const speechFinal = msg.speech_final === true;

  if (transcript) {
    if (!isFinal) {
      emit({
        type: "speech.partial",
        trackId: TRACK_ID,
        text: transcript,
      });
      return;
    }
    lastFinalText = transcript;
    pendingText = pendingText ? pendingText + " " + transcript : transcript;
    emit({
      type: "speech.final",
      trackId: TRACK_ID,
      text: transcript,
      confidence,
    });
  }

  if (isFinal && speechFinal) {
    emitPendingPause(ENDPOINTING);
  }
}
|
|
183
|
-
|
|
184
|
-
/**
 * Decodes a base64 PCM chunk and sends it to Deepgram as a binary frame.
 *
 * Does nothing when there is no open connection, when the payload is missing
 * or not a string, or when it decodes to zero bytes. Empty frames are skipped
 * deliberately: Deepgram's streaming docs advise against sending empty
 * binary messages (TODO confirm against the current docs).
 */
function sendAudio(base64Pcm: string): void {
  if (!ws || !connected) return;
  // Event payloads are untyped at runtime; Buffer.from would throw on a non-string.
  if (typeof base64Pcm !== "string" || base64Pcm.length === 0) return;

  const pcm = Buffer.from(base64Pcm, "base64");
  if (pcm.length === 0) return;

  try {
    ws.send(pcm);
  } catch {
    // The socket may have closed between the check above and the send;
    // the chunk is dropped.
  }
}
|
|
193
|
-
|
|
194
|
-
/**
 * Shuts down the Deepgram connection, if there is one.
 *
 * Sends a `CloseStream` message (per the Deepgram protocol) and then closes
 * the socket. The two steps are guarded separately: `send` throws when the
 * socket is not open (for example while still connecting), and that must not
 * prevent `close()` from running.
 */
function closeWebSocket(): void {
  connected = false;
  const socket = ws;
  ws = null;
  if (!socket) return;

  try {
    socket.send(JSON.stringify({ type: "CloseStream" }));
  } catch {
    // Socket not open; nothing to flush.
  }
  try {
    socket.close();
  } catch {
    // Best effort during shutdown.
  }
}
|
|
207
|
-
|
|
208
|
-
// --- Main ---
|
|
209
|
-
|
|
210
|
-
/**
 * Node entry point: connects to Deepgram, announces readiness, then forwards
 * audio.chunk events from stdin to the socket until stdin closes or SIGTERM
 * arrives.
 *
 * While the socket is down, the first chunk starts a single reconnect attempt
 * and later chunks wait on that same attempt (rather than each opening a
 * socket). Queued chunks are sent in arrival order once it succeeds and are
 * dropped by sendAudio if it fails. Reconnect failures are logged.
 */
async function main(): Promise<void> {
  await connectWebSocket();

  emit({ type: "lifecycle.ready", component: "stt-deepgram" });

  // Reconnect attempt in progress, if any.
  let reconnecting: Promise<void> | null = null;

  const rl = onEvent((event) => {
    // Other events (e.g. control.interrupt) are deliberately ignored: don't
    // close the WebSocket — STT should keep listening for barge-in.
    if (event.type !== "audio.chunk") return;

    if (connected) {
      sendAudio(event.data as string);
      return;
    }
    if (!reconnecting) {
      reconnecting = connectWebSocket()
        .catch((err: unknown) => {
          log.warn(`Deepgram reconnect failed: ${err instanceof Error ? err.message : String(err)}`);
        })
        .finally(() => {
          reconnecting = null;
        });
    }
    void reconnecting.then(() => {
      sendAudio(event.data as string);
    });
  });

  rl.on("close", () => {
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "stt-deepgram" });
    process.exit(0);
  });

  process.on("SIGTERM", () => {
    closeWebSocket();
    process.exit(0);
  });
}

main().catch((err: unknown) => {
  log.error(`Fatal: ${err instanceof Error ? err.message : String(err)}`);
  process.exit(1);
});
|