@acpfx/stt-deepgram 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md ADDED
@@ -0,0 +1,21 @@
1
+ # @acpfx/stt-deepgram
2
+
3
+ ## 0.2.0
4
+
5
+ ### Minor Changes
6
+
7
+ - d757640: Initial release: type-safe contracts, Rust orchestrator, manifest-driven event filtering
8
+
9
+ - Rust schema crate as canonical event type source of truth with codegen to TypeScript + Zod
10
+ - Node manifests (manifest.yaml) declaring consumes/emits contracts
11
+ - Orchestrator event filtering: nodes only receive declared events
12
+ - Rust orchestrator with ratatui TUI (--ui flag)
13
+ - node-sdk with structured logging helpers
14
+ - CI/CD with GitHub Actions and changesets
15
+ - Platform-specific npm packages for Rust binaries (esbuild-style distribution)
16
+
17
+ ### Patch Changes
18
+
19
+ - Updated dependencies [d757640]
20
+ - @acpfx/core@0.2.0
21
+ - @acpfx/node-sdk@0.2.0
package/manifest.yaml ADDED
@@ -0,0 +1,11 @@
1
+ name: stt-deepgram
2
+ description: Speech-to-text via Deepgram streaming API
3
+ consumes:
4
+ - audio.chunk
5
+ emits:
6
+ - speech.partial
7
+ - speech.final
8
+ - speech.pause
9
+ - lifecycle.ready
10
+ - lifecycle.done
11
+ - control.error
package/package.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "@acpfx/stt-deepgram",
3
+ "version": "0.2.0",
4
+ "type": "module",
5
+ "bin": {
6
+ "acpfx-stt-deepgram": "./dist/index.js"
7
+ },
8
+ "main": "./dist/index.js",
9
+ "dependencies": {
10
+ "@acpfx/core": "0.2.0",
11
+ "@acpfx/node-sdk": "0.2.0"
12
+ },
13
+ "scripts": {
14
+ "build": "esbuild src/index.ts --bundle --platform=node --format=esm --outfile=dist/index.js --packages=external"
15
+ }
16
+ }
package/src/index.ts ADDED
@@ -0,0 +1,244 @@
1
+ /**
2
+ * stt-deepgram node — Deepgram Nova-3 Realtime STT with UtteranceEnd detection.
3
+ *
4
+ * Reads audio.chunk events from stdin, streams to Deepgram WebSocket,
5
+ * emits speech.partial, speech.final, and speech.pause events.
6
+ *
7
+ * Uses UtteranceEnd for end-of-turn detection — analyzes word timing gaps,
8
+ * ignores non-speech audio (won't false-trigger on SFX sounds).
9
+ *
10
+ * Settings (via ACPFX_SETTINGS):
11
+ * language?: string — language code (default: "en")
12
+ * apiKey?: string — Deepgram API key (falls back to DEEPGRAM_API_KEY env)
13
+ * model?: string — STT model (default: "nova-3")
14
+ * utteranceEndMs?: number — ms gap for utterance end (default: 1000)
15
+ * endpointing?: number — VAD endpointing ms (default: 300)
16
+ */
17
+
18
+ import { emit, log, onEvent, handleManifestFlag } from "@acpfx/node-sdk";
19
+
20
// Runs before any configuration checks below.
// NOTE(review): presumably handles a manifest CLI flag and exits — confirm in @acpfx/node-sdk.
handleManifestFlag();

/** Deepgram streaming speech-to-text WebSocket endpoint. */
const WS_URL = "wss://api.deepgram.com/v1/listen";

/** Optional node settings, supplied as a JSON object in the ACPFX_SETTINGS env variable. */
type Settings = {
  /** Language code sent to Deepgram (default: "en"). */
  language?: string;
  /** Deepgram API key; falls back to the DEEPGRAM_API_KEY env variable. */
  apiKey?: string;
  /** STT model name (default: "nova-3"). */
  model?: string;
  /** Value sent as `utterance_end_ms` (default: 1000). */
  utteranceEndMs?: number;
  /** Value sent as `endpointing` (default: 300). */
  endpointing?: number;
};

/**
 * Reads and validates ACPFX_SETTINGS.
 *
 * An unset or empty variable yields an empty settings object. Malformed JSON or
 * a non-object value (e.g. `null`, an array, a number) is reported through
 * `log.error` and terminates the process with exit code 1, instead of crashing
 * with an unhandled SyntaxError/TypeError at module load.
 */
function loadSettings(): Settings {
  const raw = process.env.ACPFX_SETTINGS || "{}";

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    log.error(`Invalid ACPFX_SETTINGS (not valid JSON): ${reason}`);
    process.exit(1);
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    log.error("Invalid ACPFX_SETTINGS: expected a JSON object");
    process.exit(1);
  }

  return parsed as Settings;
}

const settings: Settings = loadSettings();
const API_KEY = settings.apiKey ?? process.env.DEEPGRAM_API_KEY ?? "";
const LANGUAGE = settings.language ?? "en";
const MODEL = settings.model ?? "nova-3";
const UTTERANCE_END_MS = settings.utteranceEndMs ?? 1000;
const ENDPOINTING = settings.endpointing ?? 300;
/** Track identifier attached to every speech.* event this node emits. */
const TRACK_ID = "stt";

// Without a key the Deepgram connection cannot be authenticated, so fail fast.
if (!API_KEY) {
  log.error("No API key. Set DEEPGRAM_API_KEY or settings.apiKey");
  process.exit(1);
}

/** Current Deepgram socket, or null when none has been opened / it was closed. */
let ws: WebSocket | null = null;
/** True between the socket's "open" and "close" events. */
let connected = false;
/** Transcript of the most recent final (is_final) result. */
let lastFinalText = "";
/** Finalized text not yet reported through a speech.pause event. */
let pendingText = "";
49
+
50
+
51
/**
 * Opens a streaming WebSocket to Deepgram and wires up its event handlers.
 *
 * Resolves once the socket is open and `connected` has been set to true.
 * Rejects with `Error("WebSocket connection failed")` if the socket errors
 * before opening; in that case the failed socket is closed and released so it
 * does not linger in the module-level `ws` reference.
 */
async function connectWebSocket(): Promise<void> {
  // Every user-configurable value is URL-encoded so that unusual model or
  // language strings cannot corrupt the query string.
  const url =
    `${WS_URL}?model=${encodeURIComponent(MODEL)}` +
    `&language=${encodeURIComponent(LANGUAGE)}` +
    `&encoding=linear16` +
    `&sample_rate=16000` +
    `&channels=1` +
    `&interim_results=true` +
    `&punctuate=true` +
    `&smart_format=true` +
    `&utterance_end_ms=${encodeURIComponent(String(UTTERANCE_END_MS))}` +
    `&endpointing=${encodeURIComponent(String(ENDPOINTING))}` +
    `&vad_events=true`;

  // The API key is passed as a WebSocket subprotocol pair.
  const socket = new WebSocket(url, ["token", API_KEY]);
  ws = socket;

  try {
    await new Promise<void>((resolve, reject) => {
      socket.addEventListener("open", () => resolve(), { once: true });
      socket.addEventListener(
        "error",
        () => reject(new Error("WebSocket connection failed")),
        { once: true },
      );
    });
  } catch (err: unknown) {
    // Release the failed socket instead of leaving it half-open.
    if (ws === socket) ws = null;
    try {
      socket.close();
    } catch {
      // already closed — nothing to clean up
    }
    throw err;
  }

  // The socket was replaced or closed while it was still connecting.
  if (ws !== socket) {
    try {
      socket.close();
    } catch {
      // already closed — nothing to clean up
    }
    return;
  }

  connected = true;
  log.info("Connected to Deepgram STT");

  socket.addEventListener("message", (event: MessageEvent) => {
    let msg: unknown;
    try {
      const data =
        typeof event.data === "string"
          ? event.data
          : Buffer.from(event.data as ArrayBuffer).toString("utf-8");
      msg = JSON.parse(data);
    } catch {
      // Deliberate best-effort: frames that are not valid JSON are ignored.
      return;
    }
    if (typeof msg !== "object" || msg === null) return;

    // Handler failures are real bugs, not parse noise — surface them in the log.
    try {
      handleServerMessage(msg as Record<string, unknown>);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      log.error(`Failed to handle STT message: ${reason}`);
    }
  });

  socket.addEventListener("error", (event: Event) => {
    log.error(`WebSocket error: ${(event as ErrorEvent).message ?? "unknown"}`);
    emit({
      type: "control.error",
      component: "stt-deepgram",
      message: "STT WebSocket error",
      fatal: false,
    });
  });

  socket.addEventListener("close", (event: CloseEvent) => {
    log.info(`WebSocket closed (code=${event.code})`);
    // Only the current socket may clear the flag; a stale socket closing late
    // must not mark a newer connection as disconnected.
    if (ws === socket) connected = false;
  });
}
115
+
116
/**
 * Translates one decoded Deepgram server message into node events.
 *
 * - `UtteranceEnd`: emits `speech.pause` with any finalized text not yet reported.
 * - `SpeechStarted`: ignored.
 * - `Results`: emits `speech.partial` for interim transcripts and `speech.final`
 *   for finalized ones; when `speech_final` is set, also emits `speech.pause`.
 * - Any other message type is ignored.
 *
 * Finalized segments are accumulated in `pendingText` until a pause is emitted,
 * so an utterance split across several `is_final` results is reported in full
 * rather than as its last segment only.
 *
 * @param msg Parsed JSON message received from the Deepgram socket.
 */
function handleServerMessage(msg: Record<string, unknown>): void {
  const type = msg.type as string | undefined;

  // UtteranceEnd — flush whatever finalized text is still waiting.
  if (type === "UtteranceEnd") {
    if (pendingText) {
      emit({
        type: "speech.pause",
        trackId: TRACK_ID,
        pendingText,
        silenceMs: UTTERANCE_END_MS,
      });
      pendingText = "";
    }
    return;
  }

  // SpeechStarted carries nothing this node forwards.
  if (type === "SpeechStarted") {
    return;
  }

  if (type !== "Results") {
    return;
  }

  const channel = msg.channel as Record<string, unknown> | undefined;
  const rawAlternatives = channel?.alternatives;
  const alternatives: Array<Record<string, unknown>> = Array.isArray(rawAlternatives)
    ? (rawAlternatives as Array<Record<string, unknown>>)
    : [];

  const best = alternatives[0];
  if (!best) return;

  const transcript = (best.transcript as string) ?? "";
  if (!transcript) return;

  const isFinal = msg.is_final === true;
  const speechFinal = msg.speech_final === true;

  if (!isFinal) {
    // Interim result — partial transcript.
    emit({
      type: "speech.partial",
      trackId: TRACK_ID,
      text: transcript,
    });
    return;
  }

  // Final transcript for this segment.
  lastFinalText = transcript;
  // Append rather than overwrite: earlier finalized segments of the same
  // utterance have not been reported through speech.pause yet.
  pendingText = pendingText ? `${pendingText} ${transcript}` : transcript;

  emit({
    type: "speech.final",
    trackId: TRACK_ID,
    text: transcript,
    confidence: (best.confidence as number) ?? undefined,
  });

  // speech_final set on a final result: also report the pause and reset.
  if (speechFinal) {
    emit({
      type: "speech.pause",
      trackId: TRACK_ID,
      pendingText,
      silenceMs: ENDPOINTING,
    });
    pendingText = "";
  }
}
183
+
184
/**
 * Forwards one audio chunk to the Deepgram socket.
 *
 * The chunk is dropped silently when there is no socket or it is not currently
 * connected; send failures are swallowed as well.
 *
 * @param base64Pcm Base64-encoded audio payload; it is decoded to raw bytes
 *   before being sent.
 */
function sendAudio(base64Pcm: string): void {
  const socket = ws;
  if (socket === null || connected === false) {
    return;
  }

  const audioBytes = Buffer.from(base64Pcm, "base64");
  try {
    socket.send(audioBytes);
  } catch {
    // The socket may have closed between the check above and the send.
  }
}
193
+
194
/**
 * Shuts down the current Deepgram socket, if any, and clears the module state.
 *
 * Sends a `CloseStream` message when the socket is open, then always closes
 * the socket. The two steps are guarded separately so that a failing send can
 * never prevent `close()` from running and leak the connection.
 */
function closeWebSocket(): void {
  connected = false;

  const socket = ws;
  ws = null;
  if (!socket) return;

  // send() throws on a socket that is still connecting, so only announce the
  // end of the stream when the socket is actually open.
  if (socket.readyState === WebSocket.OPEN) {
    try {
      socket.send(JSON.stringify({ type: "CloseStream" }));
    } catch {
      // best-effort notification; the close() below still runs
    }
  }

  try {
    socket.close();
  } catch {
    // ignore — the socket is already gone
  }
}
207
+
208
+ // --- Main ---
209
+
210
/**
 * Node entry point: connects to Deepgram, announces readiness, then forwards
 * incoming `audio.chunk` events to the socket until the event stream closes.
 *
 * If the socket has dropped, the next audio chunk triggers a single reconnect
 * attempt. Chunks that arrive while that attempt is in flight wait on the same
 * attempt instead of each opening a socket of their own, and are sent in
 * arrival order once it succeeds. A failed reconnect is logged and the waiting
 * chunks are dropped.
 *
 * Rejects if the initial connection fails.
 */
async function main(): Promise<void> {
  await connectWebSocket();

  emit({ type: "lifecycle.ready", component: "stt-deepgram" });

  // Shared in-flight reconnect attempt; null when none is running.
  let reconnecting: Promise<void> | null = null;

  const rl = onEvent((event) => {
    if (event.type === "audio.chunk") {
      const audio = event.data as string;

      if (connected) {
        sendAudio(audio);
        return;
      }

      if (!reconnecting) {
        reconnecting = connectWebSocket()
          .catch((err: unknown) => {
            const reason = err instanceof Error ? err.message : String(err);
            log.error(`Reconnect to Deepgram failed: ${reason}`);
          })
          .finally(() => {
            reconnecting = null;
          });
      }

      // sendAudio drops the chunk itself if the reconnect did not succeed.
      void reconnecting.then(() => sendAudio(audio));
    } else if (event.type === "control.interrupt") {
      // Don't close WebSocket — STT should keep listening for barge-in.
    }
  });

  // Event stream ended: shut down cleanly and report completion.
  rl.on("close", () => {
    closeWebSocket();
    emit({ type: "lifecycle.done", component: "stt-deepgram" });
    process.exit(0);
  });

  process.on("SIGTERM", () => {
    closeWebSocket();
    process.exit(0);
  });
}
240
+
241
// Start the node. Any startup failure is logged and the process exits with
// code 1. The rejection reason is narrowed first so that a non-Error value is
// still reported meaningfully instead of as "undefined".
main().catch((err: unknown) => {
  const reason = err instanceof Error ? err.message : String(err);
  log.error(`Fatal: ${reason}`);
  process.exit(1);
});