voicecc 1.1.36 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/bin/voicecc.js +94 -1
  2. package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
  3. package/dashboard/dist/index.html +1 -1
  4. package/dashboard/routes/agents.ts +28 -8
  5. package/dashboard/routes/browser-call.ts +3 -2
  6. package/dashboard/routes/chat.ts +75 -55
  7. package/dashboard/routes/providers.ts +5 -74
  8. package/dashboard/routes/twilio.ts +104 -5
  9. package/dashboard/routes/voice.ts +98 -0
  10. package/dashboard/server.ts +48 -1
  11. package/package.json +2 -3
  12. package/server/index.ts +96 -8
  13. package/server/services/twilio-manager.ts +29 -10
  14. package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
  15. package/dashboard/dist/audio-processor.js +0 -126
  16. package/server/services/heartbeat.ts +0 -403
  17. package/server/voice/assets/chime.wav +0 -0
  18. package/server/voice/assets/startup.pcm +0 -0
  19. package/server/voice/audio-adapter.ts +0 -60
  20. package/server/voice/audio-inactivity.test.ts +0 -108
  21. package/server/voice/audio-inactivity.ts +0 -91
  22. package/server/voice/browser-audio-playback.test.ts +0 -149
  23. package/server/voice/browser-audio.ts +0 -147
  24. package/server/voice/browser-server.ts +0 -311
  25. package/server/voice/chat-server.ts +0 -236
  26. package/server/voice/chime.test.ts +0 -69
  27. package/server/voice/chime.ts +0 -36
  28. package/server/voice/claude-session.ts +0 -293
  29. package/server/voice/endpointing.ts +0 -163
  30. package/server/voice/mic-vpio +0 -0
  31. package/server/voice/narration.ts +0 -204
  32. package/server/voice/prompt-builder.ts +0 -108
  33. package/server/voice/session-lock.ts +0 -123
  34. package/server/voice/stt-elevenlabs.ts +0 -210
  35. package/server/voice/stt-provider.ts +0 -106
  36. package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
  37. package/server/voice/tts-elevenlabs.ts +0 -397
  38. package/server/voice/tts-provider.ts +0 -155
  39. package/server/voice/twilio-audio.ts +0 -338
  40. package/server/voice/twilio-server.ts +0 -540
  41. package/server/voice/types.ts +0 -282
  42. package/server/voice/vad.ts +0 -101
  43. package/server/voice/voice-loop-bugs.test.ts +0 -348
  44. package/server/voice/voice-server.ts +0 -129
  45. package/server/voice/voice-session.ts +0 -539
@@ -1,397 +0,0 @@
1
- /**
2
- * ElevenLabs TTS provider via streaming HTTP API.
3
- *
4
- * Calls the ElevenLabs text-to-speech streaming endpoint to generate audio,
5
- * then writes raw PCM chunks to the speaker stream for playback. No subprocess
6
- * is needed -- audio is fetched over HTTP and piped directly into the pipeline.
7
- *
8
- * Responsibilities:
9
- * - POST text to the ElevenLabs TTS streaming API and receive chunked PCM audio
10
- * - Buffer streaming text deltas into sentences via shared bufferSentences utility
11
- * - Write PCM audio to the speaker stream with backpressure handling
12
- * - Track playback timing and wait for audio to finish before resolving
13
- * - Support interruption by cancelling in-flight requests and clearing playback
14
- */
15
-
16
- import type { Writable } from "stream";
17
- import type { TtsPlayer, TextChunk } from "./types.js";
18
-
19
- // ============================================================================
20
- // CONSTANTS
21
- // ============================================================================
22
-
/** Base endpoint of the ElevenLabs text-to-speech API; the voice ID and `/stream` are appended per request. */
const ELEVENLABS_TTS_BASE_URL = "https://api.elevenlabs.io/v1/text-to-speech";

/** Sample rate, in Hz, of the PCM audio (has to match the speaker pipeline). */
const TTS_SAMPLE_RATE = 24000;

/** Bits per PCM sample written to the speaker. */
const SPEAKER_BIT_DEPTH = 16;

/** Number of audio channels written to the speaker (mono). */
const SPEAKER_CHANNELS = 1;

/** PCM throughput in bytes per second: sample rate x channels x bytes per sample. */
const BYTES_PER_SECOND = TTS_SAMPLE_RATE * SPEAKER_CHANNELS * (SPEAKER_BIT_DEPTH / 8);

/** How often, in milliseconds, the interrupt flag is polled while waiting for playback to end. */
const INTERRUPT_CHECK_INTERVAL_MS = 50;
40
-
41
- // ============================================================================
42
- // INTERFACES
43
- // ============================================================================
44
-
/**
 * Settings accepted by createElevenlabsTts.
 */
export interface ElevenlabsTtsConfig {
  /** API key used to authenticate against ElevenLabs */
  apiKey: string;
  /** ID of the ElevenLabs voice that generates the audio */
  voiceId: string;
  /** ID of the ElevenLabs model (e.g. "eleven_monolingual_v1") */
  modelId: string;
  /** Destination stream that receives the PCM audio */
  speakerInput: Writable;
  /** Invoked on interruption so the playback buffer can be cleared */
  interruptPlayback: () => void;
  /** Invoked to resume playback once an interruption is over */
  resumePlayback: () => void;
}
62
-
63
- // ============================================================================
64
- // MAIN HANDLERS
65
- // ============================================================================
66
-
/**
 * Create a TtsPlayer that uses the ElevenLabs streaming TTS API.
 *
 * Text is POSTed to the streaming endpoint with `output_format=pcm_24000`; the
 * returned PCM chunks are written to the speaker stream as they arrive, without
 * any format conversion.
 *
 * Player state lives in four closure flags:
 * - `destroyed`      - set once by destroy(); speak()/speakStream() then throw
 * - `speaking`       - true while a speak()/speakStream() call is in progress
 * - `interruptFlag`  - set by interrupt(); polled by the streaming loops
 * - `wasInterrupted` - remembers that resumePlayback() must run before the next utterance
 *
 * @param config - ElevenLabs TTS configuration (API key, voice, model, speaker stream)
 * @returns A TtsPlayer instance ready for playback
 */
export async function createElevenlabsTts(config: ElevenlabsTtsConfig): Promise<TtsPlayer> {
  const { apiKey, voiceId, modelId, speakerInput, interruptPlayback, resumePlayback } = config;

  let destroyed = false;
  let speaking = false;
  let interruptFlag = false;
  let wasInterrupted = false;

  /**
   * Reset the per-utterance state shared by speak() and speakStream().
   * Clears the interrupt flag, marks the player as speaking and resumes
   * playback if the previous utterance was interrupted.
   */
  function beginSpeaking(): void {
    interruptFlag = false;
    speaking = true;

    if (wasInterrupted) {
      resumePlayback();
      wasInterrupted = false;
    }
  }

  /**
   * POST text to the ElevenLabs TTS streaming endpoint and stream PCM chunks
   * to the speaker.
   *
   * @param text - The text to synthesize
   * @param onChunkWritten - Optional hook called with the byte length of every chunk
   *                         after it has been written to the speaker stream
   * @returns Total PCM bytes written to the speaker stream
   * @throws Error when the API answers with a non-2xx status
   */
  async function streamTtsToSpeaker(
    text: string,
    onChunkWritten?: (byteLength: number) => void,
  ): Promise<number> {
    const url = `${ELEVENLABS_TTS_BASE_URL}/${voiceId}/stream?output_format=pcm_24000`;

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": apiKey,
      },
      body: JSON.stringify({ text, model_id: modelId }),
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => "unknown error");
      throw new Error(`ElevenLabs TTS API error ${response.status}: ${errorText}`);
    }

    // The interrupt may have arrived while the request was in flight: discard the
    // response instead of leaving its body unread.
    if (interruptFlag) {
      if (response.body) {
        await response.body.cancel().catch(() => undefined);
      }
      return 0;
    }

    let totalBytes = 0;

    for await (const chunk of readResponseChunks(response)) {
      if (interruptFlag) break;

      const pcmBuffer = Buffer.from(chunk);
      totalBytes += pcmBuffer.length;
      await writePcm(speakerInput, pcmBuffer);

      if (onChunkWritten) onChunkWritten(pcmBuffer.length);
    }

    return totalBytes;
  }

  /**
   * Wait for the estimated remaining playback time, allowing interruption to cancel.
   * Both the timeout and the polling interval are cleared on whichever path
   * finishes first, so no timer outlives the returned promise.
   *
   * @param remainingMs - Milliseconds to wait for playback to finish
   */
  function waitForPlayback(remainingMs: number): Promise<void> {
    return new Promise<void>((resolve) => {
      const finish = (): void => {
        clearTimeout(timer);
        clearInterval(poll);
        resolve();
      };

      const timer = setTimeout(finish, remainingMs);

      // Poll the interrupt flag to allow early cancellation
      const poll = setInterval(() => {
        if (interruptFlag) finish();
      }, INTERRUPT_CHECK_INTERVAL_MS);
    });
  }

  /**
   * Generate audio for a single text string via ElevenLabs API and play it.
   * Resolves once all audio has been written to the speaker stream; unlike
   * speakStream() it does not wait for the estimated playback time.
   *
   * @param text - The text to speak
   * @throws Error if the player has been destroyed or the API request fails
   */
  async function speak(text: string): Promise<void> {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    beginSpeaking();

    try {
      await streamTtsToSpeaker(text);
    } finally {
      speaking = false;
    }
  }

  /**
   * Stream text chunks into TTS for pipelined playback.
   * Buffers text deltas into sentences, generates audio per sentence via
   * the ElevenLabs API, writes PCM to the speaker stream and finally waits
   * for the estimated playback time of the written audio.
   *
   * @param texts - Async iterable of text chunks from the narrator
   * @throws Error if the player has been destroyed or an API request fails
   */
  async function speakStream(texts: AsyncIterable<TextChunk>): Promise<void> {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    const t0 = Date.now();
    let firstTextLogged = false;
    let playbackFinishAt = 0;

    beginSpeaking();

    try {
      for await (const sentence of bufferSentences(texts)) {
        if (interruptFlag) break;

        if (!firstTextLogged) {
          console.log(`[tts-elevenlabs] first sentence at +${Date.now() - t0}ms: "${sentence.slice(0, 50)}${sentence.length > 50 ? "..." : ""}"`);
          firstTextLogged = true;
        }

        await streamTtsToSpeaker(sentence, (byteLength) => {
          const audioDurationMs = (byteLength / BYTES_PER_SECOND) * 1000;

          // Track estimated playback end. If the speaker buffer drained during a
          // gap (e.g. tool call), new audio starts from now, not after previous audio.
          playbackFinishAt = Math.max(playbackFinishAt, Date.now()) + audioDurationMs;
        });

        // Checked here as well so an interrupt is honoured before asking the
        // text source for its next sentence.
        if (interruptFlag) break;
      }

      // Wait for buffered audio to finish playing through the speakers
      if (!interruptFlag && playbackFinishAt > 0) {
        const remainingMs = playbackFinishAt - Date.now();
        if (remainingMs > 0) {
          console.log(`[tts-elevenlabs] waiting ${(remainingMs / 1000).toFixed(1)}s for playback to finish`);
          await waitForPlayback(remainingMs);
        }
      }
    } finally {
      speaking = false;
    }
  }

  /**
   * Interrupt current playback: sets the interrupt flag polled by the streaming
   * loops and asks the host to clear its playback buffer. No-op once destroyed.
   */
  function interrupt(): void {
    if (destroyed) return;
    interruptFlag = true;
    wasInterrupted = true;
    interruptPlayback();
  }

  /**
   * Check whether TTS is currently active.
   * @returns true if a speak/speakStream call is in progress
   */
  function checkIsSpeaking(): boolean {
    return speaking;
  }

  /**
   * Stop any ongoing playback and prevent further usage.
   */
  function destroyPlayer(): void {
    if (destroyed) return;
    try {
      // interrupt() is a no-op once `destroyed` is set, so it has to run first.
      interrupt();
    } finally {
      destroyed = true;
    }
  }

  return {
    speak,
    speakStream,
    interrupt,
    isSpeaking: checkIsSpeaking,
    destroy: destroyPlayer,
  };
}
288
-
289
- // ============================================================================
290
- // HELPER FUNCTIONS
291
- // ============================================================================
292
-
/**
 * Write a PCM buffer to the speaker stream, respecting backpressure.
 *
 * Resolves immediately when the stream accepts the chunk without signalling
 * backpressure, otherwise once the stream emits "drain". Rejects when the
 * write callback reports an error before the promise has settled; in that case
 * the pending "drain" listener is removed so failed writes do not accumulate
 * listeners on the stream.
 *
 * @param stream - The speaker writable stream
 * @param pcmBuffer - Raw PCM bytes to write
 * @returns Promise that settles when it is safe to write the next chunk
 */
function writePcm(stream: Writable, pcmBuffer: Buffer): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    let failed = false;
    const onDrain = (): void => resolve();

    const accepted = stream.write(pcmBuffer, (err: Error | null | undefined) => {
      if (!err) return;
      failed = true;
      // Without this the once("drain") listener would stay registered forever,
      // because an errored stream never drains.
      stream.removeListener("drain", onDrain);
      reject(err);
    });

    if (accepted) {
      resolve();
    } else if (!failed) {
      stream.once("drain", onDrain);
    }
  });
}
310
-
/**
 * Sentence boundary: `.`, `!` or `?` followed by at least one whitespace character.
 * Punctuation at the very end of the buffer is NOT a boundary (more text may
 * still arrive); whatever is left over is emitted when the input ends.
 */
const SENTENCE_END_RE = /[.!?][\s]+/;

/** Minimum sentence length before we'll split on punctuation */
const MIN_SENTENCE_LENGTH = 20;

/**
 * Buffer streaming text deltas into complete sentences for TTS generation.
 *
 * - Non-string chunks (e.g. tool narration tagged `{ flush: true }`) first flush
 *   any buffered text and are then yielded as-is via their `text` field.
 *   Chunks whose text is empty or whitespace-only are dropped, since they
 *   contain nothing to synthesize.
 * - Plain string chunks are appended to a buffer that is split at sentence
 *   boundaries once it holds at least MIN_SENTENCE_LENGTH characters; a boundary
 *   is only accepted if the resulting sentence reaches that minimum length.
 * - Text remaining when the input ends is yielded trimmed, if non-empty.
 *
 * @param texts - Async iterable of TextChunk from the narrator
 * @yields Complete sentences ready for TTS
 */
async function* bufferSentences(texts: AsyncIterable<TextChunk>): AsyncGenerator<string> {
  let buffer = "";

  for await (const raw of texts) {
    if (typeof raw !== "string") {
      if (buffer.trim()) {
        yield buffer.trim();
        buffer = "";
      }
      // Skip blank flush chunks: there is nothing to speak.
      if (raw.text.trim()) yield raw.text;
      continue;
    }

    buffer += raw;

    while (buffer.length >= MIN_SENTENCE_LENGTH) {
      // Start searching at the last character a minimum-length sentence may end on.
      const match = SENTENCE_END_RE.exec(buffer.slice(MIN_SENTENCE_LENGTH - 1));
      if (!match) break;

      const splitIndex = MIN_SENTENCE_LENGTH - 1 + match.index + match[0].length;
      const sentence = buffer.slice(0, splitIndex).trim();
      buffer = buffer.slice(splitIndex);

      if (sentence) yield sentence;
    }
  }

  const remaining = buffer.trim();
  if (remaining) yield remaining;
}
354
-
/**
 * Read chunks from a fetch Response body as an async iterable, keeping yielded
 * chunks aligned to 16-bit sample boundaries (even byte count).
 *
 * HTTP streaming splits the byte stream at arbitrary boundaries. A chunk with an
 * odd byte count splits a 16-bit PCM sample in half, so the trailing byte of an
 * odd-length chunk is held back and prepended to the next chunk. The only chunk
 * that can have an odd length is a final single byte, emitted when the whole
 * body has an odd size (malformed PCM).
 *
 * If the consumer stops iterating early, or reading fails, the body is cancelled
 * so the underlying HTTP stream is torn down rather than left open and unread.
 *
 * @param response - The fetch Response to read from
 * @yields Buffer chunks of sample-aligned raw PCM audio data
 * @throws Error if the response has no body
 */
async function* readResponseChunks(response: Response): AsyncGenerator<Buffer> {
  const body = response.body;
  if (!body) throw new Error("ElevenLabs TTS response has no body");

  const reader = body.getReader();
  let leftover: Buffer | null = null;
  let exhausted = false;

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        exhausted = true;
        break;
      }
      if (!value) continue;

      let chunk: Buffer = leftover ? Buffer.concat([leftover, value]) : Buffer.from(value);
      leftover = null;

      // Hold back the last byte if odd length (split sample)
      if (chunk.byteLength % 2 !== 0) {
        leftover = Buffer.from(chunk.subarray(chunk.byteLength - 1));
        chunk = chunk.subarray(0, chunk.byteLength - 1);
      }

      if (chunk.byteLength > 0) yield chunk;
    }

    // Flush any remaining byte (only happens with malformed PCM)
    if (leftover && leftover.byteLength > 0) yield leftover;
  } finally {
    if (!exhausted) {
      // Early exit (consumer break/return or a read error): releasing the lock
      // alone would leave the response streaming in the background.
      try {
        await reader.cancel();
      } catch {
        // The stream is already errored; the original error is what propagates.
      }
    }
    reader.releaseLock();
  }
}
@@ -1,155 +0,0 @@
1
- /**
2
- * TTS provider factory and readiness checks.
3
- *
4
- * Routes TTS creation to the ElevenLabs provider implementation.
5
- * Checks provider readiness (API keys) for dashboard status.
6
- *
7
- * Responsibilities:
8
- * - Create a TtsPlayer for the configured provider
9
- * - Check provider readiness (API keys set)
10
- * - Provide static metadata about available TTS providers
11
- */
12
-
13
- import { createElevenlabsTts } from "./tts-elevenlabs.js";
14
- import { readEnv } from "../services/env.js";
15
-
16
- import type { Writable } from "stream";
17
- import type { TtsPlayer, TtsProviderType, TtsProviderConfig, ProviderStatus } from "./types.js";
18
-
19
- // ============================================================================
20
- // CONSTANTS
21
- // ============================================================================
22
-
/** Root URL of the ElevenLabs REST API (v1); endpoint paths are appended to it. */
const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1";
25
-
26
- // ============================================================================
27
- // INTERFACES
28
- // ============================================================================
29
-
/**
 * One selectable voice, as produced by listVoicesForProvider.
 */
export interface VoiceOption {
  /** Voice identifier as reported by the provider */
  id: string;
  /** Voice name as reported by the provider */
  name: string;
}
37
-
/**
 * Descriptive metadata about a TTS provider, shown in the dashboard.
 */
export interface TtsProviderInfo {
  /** Identifier of the provider type */
  type: TtsProviderType;
  /** Provider name in human-readable form */
  name: string;
  /** Brief description of what the provider is */
  description: string;
  /** Name of the environment variable holding the API key; undefined when no key is needed */
  requiresApiKey?: string;
}
51
-
/**
 * Arguments accepted by the createTtsForProvider factory.
 */
export interface CreateTtsOptions {
  /** Selects the provider and carries the per-provider settings */
  providerConfig: TtsProviderConfig;
  /** Destination stream that receives the PCM audio */
  speakerInput: Writable;
  /** Invoked on interruption so the playback buffer can be cleared */
  interruptPlayback: () => void;
  /** Invoked to resume playback once an interruption is over */
  resumePlayback: () => void;
}
65
-
66
- // ============================================================================
67
- // MAIN HANDLERS
68
- // ============================================================================
69
-
/**
 * Build the TtsPlayer that matches the configured provider.
 *
 * Only "elevenlabs" is handled: its API key, voice ID and model ID are taken
 * from `providerConfig.elevenlabs` and passed on together with the speaker
 * stream and playback callbacks.
 *
 * @param options - Provider config, speaker stream, and playback callbacks
 * @returns A TtsPlayer instance ready for playback
 * @throws Error if the configured provider is not a known one
 */
export async function createTtsForProvider(options: CreateTtsOptions): Promise<TtsPlayer> {
  const { providerConfig } = options;

  if (providerConfig.provider === "elevenlabs") {
    const settings = providerConfig.elevenlabs;

    return createElevenlabsTts({
      apiKey: settings.apiKey,
      voiceId: settings.voiceId,
      modelId: settings.modelId,
      speakerInput: options.speakerInput,
      interruptPlayback: options.interruptPlayback,
      resumePlayback: options.resumePlayback,
    });
  }

  throw new Error(`Unknown TTS provider: ${providerConfig.provider}`);
}
95
-
/**
 * Report whether a TTS provider can be used right now.
 * For "elevenlabs" this means ELEVENLABS_API_KEY is present in the values
 * returned by readEnv().
 *
 * @param providerType - The provider to check
 * @returns `{ ready: true }`, or a not-ready status carrying a reason and detail
 * @throws Error if the provider type is not a known one
 */
export async function getTtsProviderStatus(providerType: TtsProviderType): Promise<ProviderStatus> {
  if (providerType !== "elevenlabs") {
    throw new Error(`Unknown TTS provider: ${providerType}`);
  }

  const env = await readEnv();
  const hasApiKey = Boolean(env.ELEVENLABS_API_KEY);

  if (hasApiKey) {
    return { ready: true };
  }
  return { ready: false, reason: "missing_api_key", detail: "ELEVENLABS_API_KEY is not set in .env" };
}
117
-
/**
 * List available voices for a TTS provider.
 * For "elevenlabs" the voices are fetched from `${ELEVENLABS_API_BASE}/voices`.
 *
 * The response payload is validated before use: a payload without a `voices`
 * array is reported as an API error, and entries lacking a string `voice_id`
 * or `name` are skipped rather than surfacing as broken options.
 *
 * NOTE(review): the request is sent without an `xi-api-key` header - confirm
 * that unauthenticated access to the voices endpoint is intended.
 *
 * @param providerType - The provider to list voices for
 * @returns Array of voice options
 * @throws Error if the provider is unknown, the API answers with a non-2xx
 *         status, or the payload has no `voices` array
 */
export async function listVoicesForProvider(
  providerType: TtsProviderType,
): Promise<VoiceOption[]> {
  switch (providerType) {
    case "elevenlabs": {
      const res = await fetch(`${ELEVENLABS_API_BASE}/voices`);
      if (!res.ok) throw new Error(`ElevenLabs API error: ${res.status}`);

      const payload: unknown = await res.json();
      const voices =
        typeof payload === "object" && payload !== null
          ? (payload as { voices?: unknown }).voices
          : undefined;
      if (!Array.isArray(voices)) {
        throw new Error('ElevenLabs API error: response has no "voices" array');
      }

      const isVoice = (entry: unknown): entry is { voice_id: string; name: string } =>
        typeof entry === "object" &&
        entry !== null &&
        typeof (entry as { voice_id?: unknown }).voice_id === "string" &&
        typeof (entry as { name?: unknown }).name === "string";

      return voices.filter(isVoice).map((v) => ({ id: v.voice_id, name: v.name }));
    }

    default:
      throw new Error(`Unknown TTS provider: ${providerType}`);
  }
}
140
-
/**
 * Describe every TTS provider known to this module.
 * The list is static and currently has a single entry (ElevenLabs); a new
 * array is built on each call.
 *
 * @returns Array of TTS provider info
 */
export function getAvailableTtsProviders(): TtsProviderInfo[] {
  const elevenlabs: TtsProviderInfo = {
    type: "elevenlabs",
    name: "ElevenLabs",
    description: "Cloud TTS via ElevenLabs streaming API",
    requiresApiKey: "ELEVENLABS_API_KEY",
  };

  return [elevenlabs];
}