@cheeko-ai/esp32-voice 2026.2.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ /**
2
+ * Deepgram streaming Speech-to-Text provider.
3
+ *
4
+ * Ported from cheekoclaw_bridge/deepgram_stt.py
5
+ *
6
+ * Uses Deepgram's WebSocket API for real-time speech recognition.
7
+ * Receives Opus frames from the ESP32 and streams them directly
8
+ * to Deepgram (Deepgram supports Opus natively).
9
+ *
10
+ * WebSocket URL: wss://api.deepgram.com/v1/listen
11
+ */
12
+
13
+ import WebSocket from "ws";
14
+ import type { SttProvider, SttProviderConfig, SttProviderMeta, SttTranscriptCallback } from "./stt-provider.js";
15
+ import { sttRegistry } from "./stt-registry.js";
16
+
17
// Deepgram real-time "listen" endpoint; query parameters are appended in connect().
const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
18
+
19
+ export class DeepgramSttProvider implements SttProvider {
20
+ readonly id = "deepgram";
21
+ readonly name = "Deepgram";
22
+ readonly streaming = true;
23
+
24
+ onTranscript: SttTranscriptCallback | null = null;
25
+ onSpeechEnd: (() => void | Promise<void>) | null = null;
26
+
27
+ private apiKey: string;
28
+ private model: string;
29
+ private language: string;
30
+ private ws: WebSocket | null = null;
31
+ private finalTranscript = "";
32
+ private finalizeResolve: ((value: string) => void) | null = null;
33
+ private finalizePromise: Promise<string> | null = null;
34
+
35
+ constructor(config: SttProviderConfig) {
36
+ this.apiKey = config.apiKey;
37
+ this.model = config.model ?? "nova-2";
38
+ this.language = config.language ?? "en";
39
+ }
40
+
41
+ async connect(): Promise<void> {
42
+ const params = new URLSearchParams({
43
+ encoding: "opus",
44
+ sample_rate: "16000",
45
+ channels: "1",
46
+ model: this.model,
47
+ language: this.language,
48
+ interim_results: "true",
49
+ punctuate: "true",
50
+ // Enable server-side VAD: fires speech_final when speech ends
51
+ endpointing: "300",
52
+ utterance_end_ms: "1000",
53
+ });
54
+
55
+ const url = `${DEEPGRAM_WS_URL}?${params.toString()}`;
56
+
57
+ return new Promise<void>((resolve, reject) => {
58
+ this.ws = new WebSocket(url, {
59
+ headers: { Authorization: `Token ${this.apiKey}` },
60
+ });
61
+
62
+ this.finalTranscript = "";
63
+ this.finalizePromise = new Promise<string>((res) => {
64
+ this.finalizeResolve = res;
65
+ });
66
+
67
+ this.ws.on("open", () => {
68
+ console.log("[deepgram-stt] Connected");
69
+ resolve();
70
+ });
71
+
72
+ this.ws.on("message", (data: Buffer) => {
73
+ this.handleMessage(data);
74
+ });
75
+
76
+ this.ws.on("error", (err) => {
77
+ console.error("[deepgram-stt] WebSocket error:", err.message);
78
+ reject(err);
79
+ });
80
+
81
+ this.ws.on("close", () => {
82
+ console.log("[deepgram-stt] Connection closed");
83
+ // Resolve finalize if still pending
84
+ if (this.finalizeResolve) {
85
+ this.finalizeResolve(this.finalTranscript);
86
+ this.finalizeResolve = null;
87
+ }
88
+ });
89
+ });
90
+ }
91
+
92
+ async sendAudio(audioData: Buffer): Promise<void> {
93
+ if (this.ws?.readyState === WebSocket.OPEN) {
94
+ this.ws.send(audioData);
95
+ }
96
+ }
97
+
98
+ async finalize(): Promise<string> {
99
+ // Send Deepgram's CloseStream message to trigger final results
100
+ if (this.ws?.readyState === WebSocket.OPEN) {
101
+ this.ws.send(JSON.stringify({ type: "CloseStream" }));
102
+ }
103
+
104
+ // Wait for the final transcript (with timeout)
105
+ if (this.finalizePromise) {
106
+ const timeoutPromise = new Promise<string>((resolve) => {
107
+ setTimeout(() => {
108
+ console.warn("[deepgram-stt] Timeout waiting for final transcript");
109
+ resolve(this.finalTranscript);
110
+ }, 5000);
111
+ });
112
+ return Promise.race([this.finalizePromise, timeoutPromise]);
113
+ }
114
+
115
+ return this.finalTranscript;
116
+ }
117
+
118
+ async close(): Promise<void> {
119
+ if (this.ws) {
120
+ try {
121
+ this.ws.close();
122
+ } catch {
123
+ // Ignore close errors
124
+ }
125
+ this.ws = null;
126
+ }
127
+ }
128
+
129
+ private handleMessage(data: Buffer): void {
130
+ try {
131
+ const msg = JSON.parse(data.toString());
132
+ const msgType = msg.type;
133
+
134
+ if (msgType === "Results") {
135
+ const alternatives = msg.channel?.alternatives ?? [];
136
+ if (alternatives.length === 0) return;
137
+
138
+ const text: string = alternatives[0].transcript ?? "";
139
+ const isFinal: boolean = msg.is_final ?? false;
140
+ const speechFinal: boolean = msg.speech_final ?? false;
141
+
142
+ if (text.trim()) {
143
+ // Fire transcript callback
144
+ if (this.onTranscript) {
145
+ const result = this.onTranscript(text, isFinal);
146
+ if (result instanceof Promise) {
147
+ result.catch((err) => console.error("[deepgram-stt] Transcript callback error:", err));
148
+ }
149
+ }
150
+
151
+ // Accumulate final segments
152
+ if (isFinal) {
153
+ if (this.finalTranscript) {
154
+ this.finalTranscript += " " + text;
155
+ } else {
156
+ this.finalTranscript = text;
157
+ }
158
+ }
159
+ }
160
+
161
+ // speech_final = server-side VAD detected end of utterance
162
+ // Resolve finalize promise immediately so processUtterance can proceed
163
+ if (speechFinal && this.finalizeResolve) {
164
+ console.log(`[deepgram-stt] speech_final — triggering utterance end (transcript: "${this.finalTranscript}")`);
165
+ this.finalizeResolve(this.finalTranscript);
166
+ this.finalizeResolve = null;
167
+ // Fire onSpeechEnd so the session calls processUtterance
168
+ if (this.onSpeechEnd) {
169
+ const result = this.onSpeechEnd();
170
+ if (result instanceof Promise) {
171
+ result.catch((err) => console.error("[deepgram-stt] onSpeechEnd error:", err));
172
+ }
173
+ }
174
+ }
175
+ } else if (msgType === "UtteranceEnd") {
176
+ // Fallback: Deepgram UtteranceEnd event (requires utterance_end_ms param)
177
+ console.log("[deepgram-stt] UtteranceEnd received");
178
+ if (this.finalizeResolve) {
179
+ this.finalizeResolve(this.finalTranscript);
180
+ this.finalizeResolve = null;
181
+ }
182
+ if (this.onSpeechEnd) {
183
+ const result = this.onSpeechEnd();
184
+ if (result instanceof Promise) {
185
+ result.catch((err) => console.error("[deepgram-stt] onSpeechEnd (UtteranceEnd) error:", err));
186
+ }
187
+ }
188
+ } else if (msgType === "Error") {
189
+ console.error("[deepgram-stt] Error:", msg);
190
+ }
191
+ // Ignore "Metadata" and other message types
192
+ } catch {
193
+ // Ignore parse errors
194
+ }
195
+ }
196
+ }
197
+
198
/**
 * Deepgram STT provider metadata, used by the registry for discovery.
 * The API key is read from the DEEPGRAM_API_KEY environment variable.
 */
export const deepgramMeta: SttProviderMeta = {
  id: "deepgram",
  name: "Deepgram",
  description: "Fast, accurate streaming STT with Nova-2 model. Supports Opus input natively.",
  streaming: true,
  envVar: "DEEPGRAM_API_KEY",
  defaultModel: "nova-2",
  docsUrl: "https://developers.deepgram.com/docs/streaming",
};
208
+
209
+ /** Factory function for creating Deepgram STT instances. */
210
+ function createDeepgramStt(config: SttProviderConfig): SttProvider {
211
+ return new DeepgramSttProvider(config);
212
+ }
213
+
214
+ // Auto-register with the STT registry
215
+ sttRegistry.register(deepgramMeta, createDeepgramStt);
@@ -0,0 +1,107 @@
1
+ /**
2
+ * Speech-to-Text (STT) provider interface.
3
+ *
4
+ * All STT providers must implement this interface. Providers can be
5
+ * streaming (WebSocket-based, sending audio chunks in real time) or
6
+ * batch (send complete audio, receive transcript).
7
+ *
8
+ * Modeled after OpenClaw's multi-provider architecture — new providers
9
+ * can be added by implementing this interface and registering with
10
+ * the STT registry.
11
+ */
12
+
13
/**
 * Called for every transcript update.
 * Receives the segment text and whether it is final (no longer subject to
 * revision). May return a promise; providers do not await it on the hot path.
 */
export type SttTranscriptCallback = (text: string, isFinal: boolean) => void | Promise<void>;

/** Called when the STT provider detects end of speech (server-side VAD). */
export type SttSpeechEndCallback = () => void | Promise<void>;
17
+
18
/** Configuration passed to an STT provider factory when creating an instance. */
export interface SttProviderConfig {
  /** Provider-specific API key. */
  apiKey: string;
  /** Model identifier (e.g., "nova-2" for Deepgram). */
  model?: string;
  /** Language code (e.g., "en", "es"). */
  language?: string;
  /** Additional provider-specific options. */
  options?: Record<string, unknown>;
}
28
+
29
/**
 * Contract every STT provider implements.
 * Typical lifecycle: set callbacks → connect() → sendAudio()* → finalize() → close().
 */
export interface SttProvider {
  /** Unique provider identifier (e.g., "deepgram", "google", "whisper"). */
  readonly id: string;

  /** Human-readable provider name. */
  readonly name: string;

  /** Whether this provider supports real-time streaming via WebSocket. */
  readonly streaming: boolean;

  /**
   * Called when a transcript (partial or final) is received.
   * Set this before calling `connect()`.
   */
  onTranscript: SttTranscriptCallback | null;

  /**
   * Called when the STT provider detects end of speech via server-side VAD
   * (e.g., Deepgram speech_final). Used to trigger utterance processing
   * automatically when the firmware doesn't send a speech_end message.
   * Optional: batch providers may not implement it.
   */
  onSpeechEnd?: SttSpeechEndCallback | null;

  /**
   * Open a connection to the STT service.
   * For streaming providers, this opens a WebSocket.
   * For batch providers, this may be a no-op.
   */
  connect(): Promise<void>;

  /**
   * Send an audio chunk to the STT service.
   * For streaming providers: sends Opus/PCM frames in real time.
   * For batch providers: buffers audio until `finalize()` is called.
   *
   * @param audioData - Raw audio bytes (Opus frames from ESP32)
   */
  sendAudio(audioData: Buffer): Promise<void>;

  /**
   * Signal end of audio and retrieve the final transcript.
   * For streaming providers: sends a "close stream" signal and waits.
   * For batch providers: sends buffered audio and waits for result.
   *
   * @returns The complete, final transcript text.
   */
  finalize(): Promise<string>;

  /**
   * Close the connection and release resources.
   * Implementations should make this safe to call more than once.
   */
  close(): Promise<void>;
}
82
+
83
/**
 * Factory function type for creating STT provider instances.
 * Each call creates a fresh provider for one utterance/session;
 * instances are not reused across sessions.
 */
export type SttProviderFactory = (config: SttProviderConfig) => SttProvider;
88
+
89
/**
 * Metadata about a registered STT provider.
 * Describes the provider for discovery/listing without instantiating it.
 */
export interface SttProviderMeta {
  /** Provider ID (matches SttProvider.id). */
  id: string;
  /** Human-readable name. */
  name: string;
  /** Short description. */
  description: string;
  /** Whether it supports streaming. */
  streaming: boolean;
  /** Required environment variable for the API key. */
  envVar: string;
  /** Default model identifier. */
  defaultModel?: string;
  /** Documentation URL. */
  docsUrl?: string;
}
@@ -0,0 +1,71 @@
1
+ /**
2
+ * STT Provider Registry.
3
+ *
4
+ * Central registry for speech-to-text providers. Providers register
5
+ * themselves with a factory function, and the voice session creates
6
+ * instances as needed.
7
+ *
8
+ * Usage:
9
+ * sttRegistry.register(deepgramMeta, createDeepgramStt);
10
+ * const provider = sttRegistry.create("deepgram", { apiKey: "..." });
11
+ */
12
+
13
+ import type { SttProvider, SttProviderConfig, SttProviderFactory, SttProviderMeta } from "./stt-provider.js";
14
+
15
/** Internal registry entry pairing a provider's metadata with its factory. */
interface RegisteredSttProvider {
  meta: SttProviderMeta;
  factory: SttProviderFactory;
}
19
+
20
+ class SttRegistry {
21
+ private providers = new Map<string, RegisteredSttProvider>();
22
+
23
+ /**
24
+ * Register a new STT provider.
25
+ */
26
+ register(meta: SttProviderMeta, factory: SttProviderFactory): void {
27
+ if (this.providers.has(meta.id)) {
28
+ console.warn(`[stt-registry] Provider "${meta.id}" is already registered, overwriting.`);
29
+ }
30
+ this.providers.set(meta.id, { meta, factory });
31
+ console.log(`[stt-registry] Registered STT provider: ${meta.name} (${meta.id})`);
32
+ }
33
+
34
+ /**
35
+ * Create an instance of a registered STT provider.
36
+ */
37
+ create(providerId: string, config: SttProviderConfig): SttProvider {
38
+ const registered = this.providers.get(providerId);
39
+ if (!registered) {
40
+ const available = [...this.providers.keys()].join(", ");
41
+ throw new Error(
42
+ `STT provider "${providerId}" not found. Available: ${available || "none"}`,
43
+ );
44
+ }
45
+ return registered.factory(config);
46
+ }
47
+
48
+ /**
49
+ * Get metadata for a registered provider.
50
+ */
51
+ getMeta(providerId: string): SttProviderMeta | undefined {
52
+ return this.providers.get(providerId)?.meta;
53
+ }
54
+
55
+ /**
56
+ * List all registered providers.
57
+ */
58
+ list(): SttProviderMeta[] {
59
+ return [...this.providers.values()].map((p) => p.meta);
60
+ }
61
+
62
+ /**
63
+ * Check if a provider is registered.
64
+ */
65
+ has(providerId: string): boolean {
66
+ return this.providers.has(providerId);
67
+ }
68
+ }
69
+
70
+ /** Global STT provider registry. */
71
+ export const sttRegistry = new SttRegistry();
@@ -0,0 +1,215 @@
1
+ /**
2
+ * ElevenLabs streaming Text-to-Speech provider.
3
+ *
4
+ * Ported from cheekoclaw_bridge/elevenlabs_tts.py
5
+ *
6
+ * Uses ElevenLabs' WebSocket API for real-time text-to-speech.
7
+ * Sends text and receives base64-encoded PCM audio (24kHz, 16-bit mono).
8
+ *
9
+ * WebSocket URL: wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input
10
+ */
11
+
12
+ import WebSocket from "ws";
13
+ import type { TtsProvider, TtsProviderConfig, TtsProviderMeta, TtsAudioCallback, TtsDoneCallback } from "./tts-provider.js";
14
+ import { ttsRegistry } from "./tts-registry.js";
15
+
16
// stream-input endpoint; the "{voice_id}" placeholder is substituted in connect().
const ELEVENLABS_WS_URL = "wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input";

// Fallback voice when config.voiceId is not provided.
const DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"; // Rachel
// Fallback model when config.model is not provided.
const DEFAULT_MODEL_ID = "eleven_turbo_v2_5";
20
+
21
+ export class ElevenLabsTtsProvider implements TtsProvider {
22
+ readonly id = "elevenlabs";
23
+ readonly name = "ElevenLabs";
24
+ readonly streaming = true;
25
+ readonly outputSampleRate = 24000;
26
+
27
+ onAudio: TtsAudioCallback | null = null;
28
+ onDone: TtsDoneCallback | null = null;
29
+
30
+ private apiKey: string;
31
+ private voiceId: string;
32
+ private modelId: string;
33
+ private ws: WebSocket | null = null;
34
+ private doneResolve: (() => void) | null = null;
35
+ private donePromise: Promise<void> | null = null;
36
+ // Serialises onAudio calls: each chunk waits for the previous one to finish
37
+ // (including any pacing sleeps) before being dispatched. Without this, all
38
+ // chunks are fired in parallel and the pacing in the callback is bypassed.
39
+ private audioChain: Promise<void> = Promise.resolve();
40
+ private isFinalReceived = false;
41
+
42
+ constructor(config: TtsProviderConfig) {
43
+ this.apiKey = config.apiKey;
44
+ this.voiceId = config.voiceId ?? DEFAULT_VOICE_ID;
45
+ this.modelId = config.model ?? DEFAULT_MODEL_ID;
46
+ }
47
+
48
+ async connect(): Promise<void> {
49
+ const url =
50
+ ELEVENLABS_WS_URL.replace("{voice_id}", this.voiceId) +
51
+ `?model_id=${this.modelId}&output_format=pcm_24000`;
52
+
53
+ return new Promise<void>((resolve, reject) => {
54
+ this.ws = new WebSocket(url, {
55
+ headers: { "xi-api-key": this.apiKey },
56
+ });
57
+
58
+ this.donePromise = new Promise<void>((res) => {
59
+ this.doneResolve = res;
60
+ });
61
+ // Reset chain and final flag for this connection
62
+ this.audioChain = Promise.resolve();
63
+ this.isFinalReceived = false;
64
+
65
+ this.ws.on("open", async () => {
66
+ console.log("[elevenlabs-tts] Connected");
67
+
68
+ // Send BOS (beginning of stream) message
69
+ const bos = {
70
+ text: " ",
71
+ voice_settings: {
72
+ stability: 0.5,
73
+ similarity_boost: 0.75,
74
+ },
75
+ generation_config: {
76
+ flush: true,
77
+ },
78
+ };
79
+
80
+ this.ws!.send(JSON.stringify(bos));
81
+ resolve();
82
+ });
83
+
84
+ this.ws.on("message", (data: Buffer) => {
85
+ this.handleMessage(data);
86
+ });
87
+
88
+ this.ws.on("error", (err) => {
89
+ console.error("[elevenlabs-tts] WebSocket error:", err.message);
90
+ reject(err);
91
+ });
92
+
93
+ this.ws.on("close", () => {
94
+ console.log("[elevenlabs-tts] Connection closed");
95
+ // If isFinal already triggered the chain drain, fireDone will be a no-op.
96
+ // Otherwise (unexpected close) drain the chain first then fire done.
97
+ if (!this.isFinalReceived) {
98
+ this.audioChain
99
+ .then(() => this.fireDone())
100
+ .catch(() => this.fireDone());
101
+ }
102
+ });
103
+ });
104
+ }
105
+
106
+ async synthesize(text: string): Promise<void> {
107
+ if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
108
+ throw new Error("[elevenlabs-tts] Not connected");
109
+ }
110
+
111
+ const msg = {
112
+ text,
113
+ generation_config: { flush: true },
114
+ };
115
+
116
+ this.ws.send(JSON.stringify(msg));
117
+ }
118
+
119
+ async flush(): Promise<void> {
120
+ // Send EOS (end of stream) — empty text signals end of input
121
+ if (this.ws?.readyState === WebSocket.OPEN) {
122
+ this.ws.send(JSON.stringify({ text: "" }));
123
+ }
124
+
125
+ // Wait for all audio to be delivered
126
+ if (this.donePromise) {
127
+ const timeoutPromise = new Promise<void>((resolve) => {
128
+ setTimeout(() => {
129
+ console.warn("[elevenlabs-tts] Timeout waiting for audio completion");
130
+ resolve();
131
+ }, 30000);
132
+ });
133
+ await Promise.race([this.donePromise, timeoutPromise]);
134
+ }
135
+ }
136
+
137
+ async close(): Promise<void> {
138
+ if (this.ws) {
139
+ try {
140
+ this.ws.close();
141
+ } catch {
142
+ // Ignore close errors
143
+ }
144
+ this.ws = null;
145
+ }
146
+ }
147
+
148
+ private handleMessage(data: Buffer): void {
149
+ try {
150
+ const msg = JSON.parse(data.toString());
151
+
152
+ // Audio chunk: base64-encoded PCM.
153
+ // Chain onto audioChain so each chunk is processed *after* the previous
154
+ // one finishes — including any pacing sleeps in the onAudio callback.
155
+ const audioB64: string | undefined = msg.audio;
156
+ if (audioB64) {
157
+ const pcmBytes = Buffer.from(audioB64, "base64");
158
+ if (pcmBytes.length > 0 && this.onAudio) {
159
+ const cb = this.onAudio;
160
+ this.audioChain = this.audioChain
161
+ .then(() => cb(pcmBytes))
162
+ .catch((err) => console.error("[elevenlabs-tts] Audio callback error:", err));
163
+ }
164
+ }
165
+
166
+ // isFinal: ElevenLabs signals all audio has been sent.
167
+ // We must wait for the entire audioChain to drain before firing done,
168
+ // so the caller (flush) only unblocks after all pacing sleeps complete.
169
+ if (msg.isFinal) {
170
+ console.log("[elevenlabs-tts] Stream complete (isFinal)");
171
+ this.isFinalReceived = true;
172
+ this.audioChain
173
+ .then(() => this.fireDone())
174
+ .catch(() => this.fireDone());
175
+ }
176
+ } catch {
177
+ // Ignore parse errors
178
+ }
179
+ }
180
+
181
+ private fireDone(): void {
182
+ if (this.onDone) {
183
+ const result = this.onDone();
184
+ if (result instanceof Promise) {
185
+ result.catch((err) => console.error("[elevenlabs-tts] Done callback error:", err));
186
+ }
187
+ }
188
+ if (this.doneResolve) {
189
+ this.doneResolve();
190
+ this.doneResolve = null;
191
+ }
192
+ }
193
+ }
194
+
195
/**
 * ElevenLabs TTS provider metadata, used by the registry for discovery.
 * The API key is read from the ELEVENLABS_API_KEY environment variable.
 */
export const elevenlabsMeta: TtsProviderMeta = {
  id: "elevenlabs",
  name: "ElevenLabs",
  description:
    "High-quality streaming TTS with natural-sounding voices. Supports WebSocket streaming for low latency.",
  streaming: true,
  envVar: "ELEVENLABS_API_KEY",
  defaultVoiceId: DEFAULT_VOICE_ID,
  defaultModel: DEFAULT_MODEL_ID,
  outputSampleRate: 24000,
  docsUrl: "https://elevenlabs.io/docs/api-reference/text-to-speech-websockets",
};
208
+
209
+ /** Factory function for creating ElevenLabs TTS instances. */
210
+ function createElevenLabsTts(config: TtsProviderConfig): TtsProvider {
211
+ return new ElevenLabsTtsProvider(config);
212
+ }
213
+
214
+ // Auto-register with the TTS registry
215
+ ttsRegistry.register(elevenlabsMeta, createElevenLabsTts);