voicecc 1.1.36 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/bin/voicecc.js +94 -1
  2. package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
  3. package/dashboard/dist/index.html +1 -1
  4. package/dashboard/routes/agents.ts +28 -8
  5. package/dashboard/routes/browser-call.ts +3 -2
  6. package/dashboard/routes/chat.ts +75 -55
  7. package/dashboard/routes/providers.ts +5 -74
  8. package/dashboard/routes/twilio.ts +104 -5
  9. package/dashboard/routes/voice.ts +98 -0
  10. package/dashboard/server.ts +48 -1
  11. package/package.json +2 -3
  12. package/server/index.ts +96 -8
  13. package/server/services/twilio-manager.ts +29 -10
  14. package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
  15. package/dashboard/dist/audio-processor.js +0 -126
  16. package/server/services/heartbeat.ts +0 -403
  17. package/server/voice/assets/chime.wav +0 -0
  18. package/server/voice/assets/startup.pcm +0 -0
  19. package/server/voice/audio-adapter.ts +0 -60
  20. package/server/voice/audio-inactivity.test.ts +0 -108
  21. package/server/voice/audio-inactivity.ts +0 -91
  22. package/server/voice/browser-audio-playback.test.ts +0 -149
  23. package/server/voice/browser-audio.ts +0 -147
  24. package/server/voice/browser-server.ts +0 -311
  25. package/server/voice/chat-server.ts +0 -236
  26. package/server/voice/chime.test.ts +0 -69
  27. package/server/voice/chime.ts +0 -36
  28. package/server/voice/claude-session.ts +0 -293
  29. package/server/voice/endpointing.ts +0 -163
  30. package/server/voice/mic-vpio +0 -0
  31. package/server/voice/narration.ts +0 -204
  32. package/server/voice/prompt-builder.ts +0 -108
  33. package/server/voice/session-lock.ts +0 -123
  34. package/server/voice/stt-elevenlabs.ts +0 -210
  35. package/server/voice/stt-provider.ts +0 -106
  36. package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
  37. package/server/voice/tts-elevenlabs.ts +0 -397
  38. package/server/voice/tts-provider.ts +0 -155
  39. package/server/voice/twilio-audio.ts +0 -338
  40. package/server/voice/twilio-server.ts +0 -540
  41. package/server/voice/types.ts +0 -282
  42. package/server/voice/vad.ts +0 -101
  43. package/server/voice/voice-loop-bugs.test.ts +0 -348
  44. package/server/voice/voice-server.ts +0 -129
  45. package/server/voice/voice-session.ts +0 -539
@@ -1,282 +0,0 @@
1
- /**
2
- * Shared types for the Claude Code voice server.
3
- *
4
- * Defines all DTOs and interfaces used across the voice pipeline modules:
5
- * - Voice loop configuration and state
6
- * - Audio frame representation
7
- * - VAD (voice activity detection) events
8
- * - STT (speech-to-text) results
9
- * - Endpointing decisions for turn detection
10
- * - Claude session streaming events
11
- * - TTS (text-to-speech) configuration
12
- * - Narration configuration
13
- */
14
-
15
- // ============================================================================
16
- // CONFIGURATION INTERFACES
17
- // ============================================================================
18
-
19
- /**
20
- * Top-level configuration for the voice loop.
21
- * Passed to `startVoiceLoop` to initialize all modules.
22
- */
23
- export interface VoiceLoopConfig {
24
- /** Endpointing configuration for turn detection */
25
- endpointing: EndpointingConfig;
26
- /** Narration configuration for Claude response processing */
27
- narration: NarrationConfig;
28
- /** Claude Agent SDK session configuration */
29
- claudeSession: ClaudeSessionConfig;
30
- /** Phrase that stops the voice loop when spoken */
31
- stopPhrase: string;
32
- }
33
-
34
- /**
35
- * Configuration for the endpointing module.
36
- * Controls how the system decides when the user is done speaking.
37
- */
38
- export interface EndpointingConfig {
39
- /** Silence duration (ms) before considering speech complete */
40
- silenceThresholdMs: number;
41
- /** Maximum silence duration (ms) before forcing completion regardless */
42
- maxSilenceBeforeTimeoutMs: number;
43
- /** Minimum word count for the VAD fast path (skips Haiku check) */
44
- minWordCountForFastPath: number;
45
- /** Whether to use Haiku API for ambiguous short utterances */
46
- enableHaikuFallback: boolean;
47
- }
48
-
49
- /**
50
- * Configuration for the Claude Agent SDK session.
51
- */
52
- export interface ClaudeSessionConfig {
53
- /** List of allowed tool names (empty array means all tools allowed) */
54
- allowedTools: string[];
55
- /** Permission mode -- must be "bypassPermissions" for voice loop */
56
- permissionMode: string;
57
- /** System prompt appended to the default (includes CLAUDE.md) */
58
- systemPrompt: string;
59
- /** If set, replaces the entire system prompt (skips CLAUDE.md). Used for agent calls. */
60
- customSystemPrompt?: string;
61
- /** Working directory for the Claude Code session */
62
- cwd?: string;
63
- }
64
-
65
- /**
66
- * Configuration for the narration module.
67
- * Controls how Claude's streaming output is processed into speakable text.
68
- */
69
- export interface NarrationConfig {
70
- /** Interval (ms) between "still working..." summaries during long tool runs */
71
- summaryIntervalMs: number;
72
- }
73
-
74
- /**
75
- * TTS player instance that converts text to spoken audio output.
76
- */
77
- export interface TtsPlayer {
78
- /**
79
- * Convert text to audio and play it through the speakers.
80
- * @param text - The text to speak
81
- */
82
- speak(text: string): Promise<void>;
83
-
84
- /**
85
- * Stream text chunks into TTS for incremental playback.
86
- * @param texts - Async iterable of text chunks
87
- */
88
- speakStream(texts: AsyncIterable<TextChunk>): Promise<void>;
89
-
90
- /** Interrupt current playback immediately. */
91
- interrupt(): void;
92
-
93
- /** Check whether TTS is currently generating and playing audio. */
94
- isSpeaking(): boolean;
95
-
96
- /** Free all TTS resources. */
97
- destroy(): void;
98
- }
99
-
100
- /**
101
- * STT processor instance that converts speech audio to text.
102
- */
103
- export interface SttProcessor {
104
- /**
105
- * Appends audio samples to the internal buffer.
106
- * @param samples - Float32Array of audio samples (16kHz, normalized -1.0 to 1.0)
107
- */
108
- accumulate(samples: Float32Array): void;
109
-
110
- /**
111
- * Batch-transcribes the accumulated audio buffer. Clears the buffer afterward.
112
- * @returns Transcription result with text, isFinal flag, and timestamp
113
- */
114
- transcribe(): Promise<TranscriptionResult>;
115
-
116
- /** Clears the accumulated audio buffer without transcribing. */
117
- clearBuffer(): void;
118
-
119
- /** Frees underlying resources. */
120
- destroy(): void;
121
- }
122
-
123
- // ============================================================================
124
- // AUDIO TYPES
125
- // ============================================================================
126
-
127
- /**
128
- * A single frame of audio data from the microphone.
129
- */
130
- export interface AudioFrame {
131
- /** PCM audio samples normalized to -1.0 to 1.0 range */
132
- pcm: Float32Array;
133
- /** Sample rate in Hz */
134
- sampleRate: number;
135
- /** Timestamp in milliseconds when this frame was captured */
136
- timestamp: number;
137
- }
138
-
139
- // ============================================================================
140
- // VAD TYPES
141
- // ============================================================================
142
-
143
- /** Possible VAD event types indicating speech activity state */
144
- export type VadEventType = "SPEECH_START" | "SPEECH_CONTINUE" | "SPEECH_END" | "SILENCE";
145
-
146
- /**
147
- * Event emitted by the VAD processor after analyzing an audio frame.
148
- */
149
- export interface VadEvent {
150
- /** The detected speech activity state */
151
- type: VadEventType;
152
- /** Speech probability from the VAD model (0.0 to 1.0) */
153
- probability: number;
154
- /** Timestamp in milliseconds */
155
- timestamp: number;
156
- }
157
-
158
- // ============================================================================
159
- // STT TYPES
160
- // ============================================================================
161
-
162
- /**
163
- * Result from the speech-to-text transcription.
164
- */
165
- export interface TranscriptionResult {
166
- /** The transcribed text */
167
- text: string;
168
- /** Whether this is a final transcription (always true for batch/offline mode) */
169
- isFinal: boolean;
170
- /** Timestamp in milliseconds when transcription completed */
171
- timestamp: number;
172
- }
173
-
174
- // ============================================================================
175
- // ENDPOINTING TYPES
176
- // ============================================================================
177
-
178
- /** Method used to determine that the user finished speaking */
179
- export type EndpointMethod = "vad_fast" | "haiku_semantic" | "timeout";
180
-
181
- /**
182
- * Decision from the endpointing module on whether the user has finished speaking.
183
- */
184
- export interface EndpointDecision {
185
- /** Whether the user's turn is considered complete */
186
- isComplete: boolean;
187
- /** The current accumulated transcript */
188
- transcript: string;
189
- /** Which method was used to make the decision */
190
- method: EndpointMethod;
191
- }
192
-
193
- // ============================================================================
194
- // CLAUDE SESSION TYPES
195
- // ============================================================================
196
-
197
- /** Possible event types from the Claude streaming response */
198
- export type ClaudeStreamEventType = "text_delta" | "tool_start" | "tool_end" | "result" | "error";
199
-
200
- /**
201
- * Simplified streaming event from the Claude Agent SDK session.
202
- * Mapped from the raw SDKMessage types for downstream consumption.
203
- */
204
- export interface ClaudeStreamEvent {
205
- /** The type of streaming event */
206
- type: ClaudeStreamEventType;
207
- /** Text content (for text_delta events) or error message (for error events) */
208
- content: string;
209
- /** Tool name (only present for tool_start events) */
210
- toolName?: string;
211
- }
212
-
213
- // ============================================================================
214
- // TTS TEXT CHUNK TYPES
215
- // ============================================================================
216
-
217
- /** A text chunk for TTS. Plain string = streaming fragment (buffer it).
218
- * Object with flush = complete sentence (speak immediately). */
219
- export type TextChunk = string | { text: string; flush: true };
220
-
221
- // ============================================================================
222
- // VOICE LOOP STATE
223
- // ============================================================================
224
-
225
- /** Possible states of the voice loop state machine */
226
- export type VoiceLoopStatus = "idle" | "listening" | "processing" | "speaking";
227
-
228
- /**
229
- * Current state of the voice loop.
230
- * Used by the state machine in index.ts.
231
- */
232
- export interface VoiceLoopState {
233
- /** Current state of the voice loop */
234
- status: VoiceLoopStatus;
235
- /** Active Claude session ID, or null if no session is active */
236
- sessionId: string | null;
237
- }
238
-
239
- // ============================================================================
240
- // PROVIDER TYPES
241
- // ============================================================================
242
-
243
- /** Available TTS provider backends */
244
- export type TtsProviderType = "elevenlabs";
245
-
246
- /** Available STT provider backends */
247
- export type SttProviderType = "elevenlabs";
248
-
249
- /**
250
- * Readiness status for a provider.
251
- * Returned by getTtsProviderStatus / getSttProviderStatus.
252
- */
253
- export interface ProviderStatus {
254
- /** Whether the provider is ready to use */
255
- ready: boolean;
256
- /** Reason the provider is not ready (only present when ready is false) */
257
- reason?: "missing_api_key";
258
- /** Human-readable detail about why the provider is not ready */
259
- detail?: string;
260
- }
261
-
262
- /**
263
- * Configuration that selects a TTS provider and holds per-provider settings.
264
- * Built from environment variables in each entry point.
265
- */
266
- export interface TtsProviderConfig {
267
- /** Which TTS provider to use */
268
- provider: TtsProviderType;
269
- /** Settings for the ElevenLabs TTS provider */
270
- elevenlabs: { apiKey: string; voiceId: string; modelId: string };
271
- }
272
-
273
- /**
274
- * Configuration that selects an STT provider and holds per-provider settings.
275
- * Built from environment variables in each entry point.
276
- */
277
- export interface SttProviderConfig {
278
- /** Which STT provider to use */
279
- provider: SttProviderType;
280
- /** Settings for the ElevenLabs STT provider */
281
- elevenlabs: { apiKey: string; modelId: string };
282
- }
@@ -1,101 +0,0 @@
1
- /**
2
- * Voice Activity Detection (VAD) via avr-vad (Silero VAD v5).
3
- *
4
- * Wraps the avr-vad callback-based API into a simpler event queue model.
5
- * avr-vad handles its own framing internally (512-sample frames at 16kHz).
6
- * We feed raw audio via processAudio and collect speech events from callbacks.
7
- *
8
- * Responsibilities:
9
- * - Initialize the Silero VAD v5 model
10
- * - Feed raw audio and collect speech start/end events
11
- * - Expose per-frame probability via onFrameProcessed callback
12
- * - Manage model lifecycle (reset between utterances, destroy on shutdown)
13
- */
14
-
15
- import type { VadEvent, VadEventType } from "./types.js";
16
-
17
- // ============================================================================
18
- // INTERFACES
19
- // ============================================================================
20
-
21
- /** Callback invoked for each VAD event detected in the audio stream. */
22
- type VadEventCallback = (event: VadEvent) => void;
23
-
24
- /** Internal interface for the VAD processor returned by createVad. */
25
- interface VadProcessor {
26
- /**
27
- * Feed raw audio samples to the VAD. avr-vad handles framing internally.
28
- * Events are emitted via the callback provided at creation.
29
- *
30
- * @param samples - Float32Array of audio samples (16kHz, normalized -1.0 to 1.0)
31
- */
32
- processAudio(samples: Float32Array): Promise<void>;
33
-
34
- /**
35
- * Resets internal VAD state. Call between utterances to avoid
36
- * state leakage across speech segments.
37
- */
38
- reset(): void;
39
-
40
- /**
41
- * Frees the underlying ONNX model resources.
42
- * Call on shutdown to prevent resource leaks.
43
- */
44
- destroy(): void;
45
- }
46
-
47
- // ============================================================================
48
- // MAIN HANDLERS
49
- // ============================================================================
50
-
51
- /**
52
- * Initializes the Silero VAD v5 model via avr-vad and returns a VadProcessor.
53
- * Events (SPEECH_START, SPEECH_END, etc.) are delivered via the onEvent callback.
54
- *
55
- * @param onEvent - Callback invoked for each detected VAD event
56
- * @returns Promise resolving to a VadProcessor instance
57
- * @throws Error if the ONNX model fails to load
58
- */
59
- async function createVad(onEvent: VadEventCallback): Promise<VadProcessor> {
60
- // Dynamic import for avr-vad.
61
- const { RealTimeVAD } = await import("avr-vad");
62
-
63
- let lastProbability = 0;
64
-
65
- const vad = await RealTimeVAD.new({
66
- onSpeechStart: () => {
67
- onEvent({ type: "SPEECH_START", probability: lastProbability, timestamp: Date.now() });
68
- },
69
- onSpeechRealStart: () => {
70
- // Emitted after minSpeechFrames confirm real speech.
71
- // We treat this as SPEECH_CONTINUE to signal sustained speech.
72
- onEvent({ type: "SPEECH_CONTINUE", probability: lastProbability, timestamp: Date.now() });
73
- },
74
- onSpeechEnd: () => {
75
- onEvent({ type: "SPEECH_END", probability: lastProbability, timestamp: Date.now() });
76
- },
77
- onFrameProcessed: (probs: { isSpeech: number }) => {
78
- lastProbability = probs.isSpeech;
79
- },
80
- });
81
-
82
- // Must call start() to activate processing
83
- vad.start();
84
-
85
- return {
86
- async processAudio(samples: Float32Array): Promise<void> {
87
- await vad.processAudio(samples);
88
- },
89
-
90
- reset(): void {
91
- vad.reset();
92
- },
93
-
94
- destroy(): void {
95
- vad.destroy();
96
- },
97
- };
98
- }
99
-
100
- export { createVad };
101
- export type { VadProcessor, VadEventCallback };