voicecc 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/.claude-plugin/plugin.json +6 -0
  2. package/README.md +48 -0
  3. package/bin/voicecc.js +39 -0
  4. package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
  5. package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
  6. package/dashboard/dist/audio-processor.js +126 -0
  7. package/dashboard/dist/index.html +13 -0
  8. package/dashboard/routes/auth.ts +119 -0
  9. package/dashboard/routes/browser-call.ts +87 -0
  10. package/dashboard/routes/claude-md.ts +50 -0
  11. package/dashboard/routes/conversations.ts +203 -0
  12. package/dashboard/routes/integrations.ts +154 -0
  13. package/dashboard/routes/mcp-servers.ts +198 -0
  14. package/dashboard/routes/settings.ts +64 -0
  15. package/dashboard/routes/tunnel.ts +66 -0
  16. package/dashboard/routes/twilio.ts +120 -0
  17. package/dashboard/routes/voice.ts +48 -0
  18. package/dashboard/routes/webrtc.ts +85 -0
  19. package/dashboard/server.ts +130 -0
  20. package/dashboard/tsconfig.json +13 -0
  21. package/init/CLAUDE.md +18 -0
  22. package/package.json +59 -0
  23. package/run.ts +68 -0
  24. package/scripts/postinstall.js +228 -0
  25. package/services/browser-call-manager.ts +106 -0
  26. package/services/device-pairing.ts +176 -0
  27. package/services/env.ts +88 -0
  28. package/services/tunnel.ts +204 -0
  29. package/services/twilio-manager.ts +126 -0
  30. package/sidecar/assets/startup.pcm +0 -0
  31. package/sidecar/audio-adapter.ts +60 -0
  32. package/sidecar/audio-capture.ts +220 -0
  33. package/sidecar/browser-audio-playback.test.ts +149 -0
  34. package/sidecar/browser-audio.ts +147 -0
  35. package/sidecar/browser-server.ts +331 -0
  36. package/sidecar/chime.test.ts +69 -0
  37. package/sidecar/chime.ts +54 -0
  38. package/sidecar/claude-session.ts +295 -0
  39. package/sidecar/endpointing.ts +163 -0
  40. package/sidecar/index.ts +83 -0
  41. package/sidecar/local-audio.ts +126 -0
  42. package/sidecar/mic-vpio +0 -0
  43. package/sidecar/mic-vpio.swift +484 -0
  44. package/sidecar/mock-tts-server-tagged.mjs +132 -0
  45. package/sidecar/narration.ts +204 -0
  46. package/sidecar/scripts/generate-startup-audio.py +79 -0
  47. package/sidecar/session-lock.ts +123 -0
  48. package/sidecar/sherpa-onnx-node.d.ts +4 -0
  49. package/sidecar/stt.ts +199 -0
  50. package/sidecar/tts-server.py +193 -0
  51. package/sidecar/tts.ts +481 -0
  52. package/sidecar/twilio-audio.ts +338 -0
  53. package/sidecar/twilio-server.ts +436 -0
  54. package/sidecar/types.ts +210 -0
  55. package/sidecar/vad.ts +101 -0
  56. package/sidecar/voice-loop-bugs.test.ts +522 -0
  57. package/sidecar/voice-session.ts +523 -0
  58. package/skills/voice/SKILL.md +26 -0
  59. package/tsconfig.json +22 -0
@@ -0,0 +1,338 @@
1
+ /**
2
+ * Twilio audio adapter for WebSocket-based media streams.
3
+ *
4
+ * Implements the AudioAdapter interface for Twilio phone calls by converting
5
+ * between Twilio's mulaw 8kHz format and the pipeline's float32 16kHz (input)
6
+ * and int16 24kHz (output) formats.
7
+ *
8
+ * Responsibilities:
9
+ * - Encode/decode G.711 mu-law audio (ITU-T standard)
10
+ * - Resample between 8kHz, 16kHz, and 24kHz (integer-ratio conversions)
11
+ * - Convert Twilio base64 media payloads to Float32Array for VAD/STT
12
+ * - Convert 24kHz PCM from TTS to Twilio base64 media payloads
13
+ * - Manage WebSocket message I/O with close-state tracking
14
+ * - Cache the ready chime as 24kHz PCM for playback over the call
15
+ */
16
+
17
+ import type { WebSocket } from "ws";
18
+ import type { AudioAdapter } from "./audio-adapter.js";
19
+
20
+ import { decodeChimeToPcm } from "./chime.js";
21
+
22
+ // ============================================================================
23
+ // CONSTANTS
24
+ // ============================================================================
25
+
26
+ /** Bias (0x84 = 132) that mulawEncode() adds to the sample magnitude before searching for the segment */
27
+ const MULAW_BIAS = 0x84;
28
+
29
+ /** Upper bound mulawEncode() applies to the biased magnitude so it stays below 0x8000 (32635 = 32767 - 132) */
30
+ const MULAW_CLIP = 32635;
31
+
32
+ // ============================================================================
33
+ // TYPES
34
+ // ============================================================================
35
+
36
+ /** Inputs required by createTwilioAudioAdapter() to bind an adapter to one Twilio media stream */
37
+ export interface TwilioAudioAdapterConfig {
38
+ /** WebSocket carrying the media stream; the adapter listens for its "message" and "close" events and sends outgoing frames on it */
39
+ ws: WebSocket;
40
+ /** Twilio stream identifier, included as `streamSid` in every outgoing "media" and "clear" message */
41
+ streamSid: string;
42
+ }
43
+
44
+ // ============================================================================
45
+ // MAIN ENTRYPOINT
46
+ // ============================================================================
47
+
48
/**
 * Create an AudioAdapter that reads/writes audio over a Twilio WebSocket media stream.
 *
 * The ready chime is decoded once via decodeChimeToPcm() while the adapter is
 * being created and the result is cached for playChime(). The socket's
 * "close" event flips an internal flag; once set, incoming frames are ignored
 * and outgoing writes are dropped without error.
 *
 * Incoming frames are handled defensively: a frame that is not valid JSON, or
 * a "media" event that carries no string payload, is skipped instead of
 * throwing out of the WebSocket event handler.
 *
 * @param config - Twilio WebSocket and stream identifier
 * @returns An AudioAdapter for Twilio phone call I/O
 */
export function createTwilioAudioAdapter(config: TwilioAudioAdapterConfig): AudioAdapter {
  const { ws, streamSid } = config;

  let wsClosed = false;
  let audioCallback: ((samples: Float32Array) => void) | null = null;
  // Guards against stacking several "message" listeners when onAudio() is
  // called more than once (each extra listener would re-deliver every frame).
  let messageListenerAttached = false;

  // Track WebSocket close state
  ws.on("close", () => {
    wsClosed = true;
  });

  // Decode chime to raw PCM once and cache it for playChime()
  const chimePcm = decodeChimeToPcm();

  // --------------------------------------------------------------------------
  // Internal helpers
  // --------------------------------------------------------------------------

  /**
   * Handle one raw WebSocket frame: parse it, keep only "media" events and
   * forward the decoded samples to the current audio callback.
   *
   * @param data - Raw frame as delivered by the "message" event
   */
  function handleMessage(data: Buffer | string): void {
    if (wsClosed) return;

    let parsed: unknown;
    try {
      parsed = JSON.parse(typeof data === "string" ? data : data.toString("utf-8"));
    } catch (err: unknown) {
      // A malformed frame must not take down the event handler (and with it
      // the process); report it and wait for the next frame.
      console.warn("[twilio-audio] ignoring WebSocket frame that is not valid JSON:", err);
      return;
    }

    if (typeof parsed !== "object" || parsed === null) return;

    const { event, media } = parsed as {
      event?: unknown;
      media?: { payload?: unknown } | null;
    };
    if (event !== "media") return;

    // A media event without a usable payload has nothing to decode.
    const payload = media?.payload;
    if (typeof payload !== "string") return;

    audioCallback?.(twilioPayloadToFloat32(payload));
  }

  // --------------------------------------------------------------------------
  // AudioAdapter methods
  // --------------------------------------------------------------------------

  /**
   * Subscribe to incoming audio chunks from the Twilio media stream.
   * The callback receives the Float32Array produced by
   * twilioPayloadToFloat32() for every "media" event.
   *
   * May be called more than once: the WebSocket listener is attached on the
   * first call only, and later calls simply replace the callback.
   *
   * @param callback - Called with each audio chunk as Float32Array
   */
  function onAudio(callback: (samples: Float32Array) => void): void {
    audioCallback = callback;

    if (messageListenerAttached) return;
    messageListenerAttached = true;
    ws.on("message", handleMessage);
  }

  /**
   * Write PCM audio to the Twilio media stream.
   * Converts the buffer with pcm24kToTwilioPayload() and sends it as a
   * "media" message. Returns without sending if the WebSocket has closed.
   *
   * @param pcm - Raw PCM buffer (16-bit signed, 24kHz mono)
   * @returns Resolves once the message has been handed to ws.send(); no
   *          backpressure handling is performed
   */
  async function writeSpeaker(pcm: Buffer): Promise<void> {
    if (wsClosed) return;

    const payload = pcm24kToTwilioPayload(pcm);
    const message = JSON.stringify({
      event: "media",
      streamSid,
      media: { payload },
    });

    ws.send(message);
  }

  /**
   * Ask Twilio to drop its buffered playback audio (user interruption) by
   * sending a "clear" event. Does nothing once the WebSocket has closed.
   */
  function interrupt(): void {
    if (wsClosed) return;

    ws.send(JSON.stringify({ event: "clear", streamSid }));
  }

  /**
   * Resume output after an interrupt. Intentionally empty for this adapter.
   */
  function resume(): void {
    // No-op: Twilio accepts audio immediately after clear
  }

  /**
   * Play the ready chime by sending the cached PCM through writeSpeaker().
   * Fire-and-forget: a failed send is logged rather than left as an
   * unhandled promise rejection.
   */
  function playChime(): void {
    void writeSpeaker(chimePcm).catch((err: unknown) => {
      console.warn("[twilio-audio] failed to send ready chime:", err);
    });
  }

  /**
   * Clean up resources. Intentionally empty: this adapter does not own the
   * WebSocket it was given.
   */
  function destroy(): void {
    // No-op: WebSocket lifecycle managed by twilio-server.ts
  }

  return {
    onAudio,
    writeSpeaker,
    interrupt,
    resume,
    playChime,
    destroy,
  };
}
160
+
161
+ // ============================================================================
162
+ // CODEC FUNCTIONS
163
+ // ============================================================================
164
+
165
/**
 * Decode a single G.711 mu-law byte to a 16-bit linear PCM sample.
 *
 * Mu-law bytes are stored bit-inverted. After undoing the inversion, bit 7 is
 * the sign, bits 6-4 the segment (exponent) and bits 3-0 the mantissa. The
 * magnitude is rebuilt on the same 16-bit scale that mulawEncode() works on
 * (bias 0x84, mantissa aligned at bit 3), so a byte produced by mulawEncode()
 * decodes back to the original sample up to quantization error.
 *
 * @param byte - Mu-law encoded byte (0-255); only the low 8 bits are used
 * @returns 16-bit signed PCM sample in the range -32124 to 32124
 */
export function mulawDecode(byte: number): number {
  // Same value as the bias the encoder adds before segment search.
  const bias = 0x84;

  // Complement the byte (mu-law stores inverted)
  const inverted = ~byte & 0xff;

  const negative = (inverted & 0x80) !== 0;
  const exponent = (inverted >> 4) & 0x07;
  const mantissa = inverted & 0x0f;

  // Put the mantissa back at bit 3, restore the bias bit above it, scale by
  // the segment, then remove the bias that was added during encoding.
  const magnitude = (((mantissa << 3) + bias) << exponent) - bias;

  return negative ? -magnitude : magnitude;
}
189
+
190
/**
 * Compress one linear PCM sample into a G.711 mu-law byte.
 *
 * Steps: split off the sign, add MULAW_BIAS to the magnitude, cap the result
 * at MULAW_CLIP, derive the 3-bit segment from the position of the highest
 * set bit, take the 4 bits below that bit as the mantissa, and finally invert
 * the assembled byte.
 *
 * @param sample - 16-bit signed PCM sample (-32768 to 32767)
 * @returns Mu-law encoded byte (0-255)
 */
export function mulawEncode(sample: number): number {
  const isNegative = sample < 0;
  const signBit = isNegative ? 0x80 : 0;
  const magnitude = isNegative ? -sample : sample;

  // Biased magnitude, capped so it never reaches bit 15.
  const biased = Math.min(magnitude + MULAW_BIAS, MULAW_CLIP);

  // Highest set bit sits at index 31 - clz32(biased); bit 7 maps to segment 0
  // and bit 14 to segment 7. The lower bound keeps the segment at 0 when no
  // bit from 7 upwards is set.
  const segment = Math.max(0, 24 - Math.clz32(biased));

  // The four bits immediately below the leading bit.
  const mantissa = (biased >> (segment + 3)) & 0x0f;

  // Assemble sign | segment | mantissa and store it inverted.
  return ~(signBit | (segment << 4) | mantissa) & 0xff;
}
226
+
227
+ // ============================================================================
228
+ // RESAMPLING FUNCTIONS
229
+ // ============================================================================
230
+
231
/**
 * Double the sample rate (8kHz to 16kHz) by linear interpolation.
 *
 * Every input sample is emitted unchanged, followed by the midpoint between
 * it and its successor (computed with an arithmetic right shift, so the
 * midpoint rounds toward negative infinity). The final sample has no
 * successor and is therefore emitted twice.
 *
 * @param input - Int16 PCM samples at 8kHz
 * @returns Int16 PCM samples at 16kHz (2x length)
 */
function upsample8to16(input: Int16Array): Int16Array {
  const count = input.length;
  const output = new Int16Array(count * 2);

  let write = 0;
  for (let read = 0; read < count; read++) {
    const current = input[read];
    // At the end of the chunk, interpolate against the sample itself.
    const following = read + 1 < count ? input[read + 1] : current;

    output[write++] = current;
    output[write++] = (current + following) >> 1;
  }

  return output;
}
254
+
255
/**
 * Reduce the sample rate by 3x (24kHz to 8kHz) by replacing each run of three
 * consecutive samples with their rounded mean. The averaging serves as a
 * crude low-pass filter ahead of decimation.
 *
 * Only complete groups of three are used; one or two trailing samples that do
 * not fill a group are dropped.
 *
 * @param input - Int16 PCM samples at 24kHz
 * @returns Int16 PCM samples at 8kHz (floor(length / 3) samples)
 */
function downsample24to8(input: Int16Array): Int16Array {
  const groups = Math.floor(input.length / 3);
  const output = new Int16Array(groups);

  let read = 0;
  for (let write = 0; write < groups; write++) {
    const sum = input[read] + input[read + 1] + input[read + 2];
    output[write] = Math.round(sum / 3);
    read += 3;
  }

  return output;
}
274
+
275
+ // ============================================================================
276
+ // PAYLOAD CONVERSION FUNCTIONS
277
+ // ============================================================================
278
+
279
/**
 * Turn one Twilio media payload into normalized float samples for VAD/STT.
 *
 * Stages: base64 -> mu-law bytes -> int16 PCM via mulawDecode() ->
 * 2x upsampling via upsample8to16() -> division by 32768 to obtain floats.
 *
 * @param base64Payload - Base64-encoded mulaw audio at 8kHz from Twilio
 * @returns Float32Array of normalized samples at 16kHz (two per input byte)
 */
export function twilioPayloadToFloat32(base64Payload: string): Float32Array {
  const encoded = Buffer.from(base64Payload, "base64");

  // One decoded int16 sample per mu-law byte.
  const narrowband = Int16Array.from(encoded, (code) => mulawDecode(code));

  const wideband = upsample8to16(narrowband);

  // Scale the int16 range down to floats.
  return Float32Array.from(wideband, (value) => value / 32768);
}
309
+
310
/**
 * Convert TTS PCM output to a Twilio media payload.
 *
 * Pipeline: read little-endian int16 samples from the buffer -> downsample
 * 3x via downsample24to8() -> mu-law encode each sample via mulawEncode()
 * -> base64.
 *
 * Only whole 16-bit samples are read. If the buffer has an odd number of
 * bytes, the dangling last byte is ignored rather than causing an
 * out-of-range read.
 *
 * @param pcmBuffer - Raw PCM buffer (16-bit signed, 24kHz mono) from TTS
 * @returns Base64-encoded mulaw audio at 8kHz for Twilio
 */
export function pcm24kToTwilioPayload(pcmBuffer: Buffer): string {
  // Floor so a trailing odd byte never produces a fractional sample count
  // (which would drive the loop one step past the end of the buffer).
  const sampleCount = Math.floor(pcmBuffer.length / 2);
  const pcm24k = new Int16Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    pcm24k[i] = pcmBuffer.readInt16LE(i * 2);
  }

  // Downsample from 24kHz to 8kHz
  const pcm8k = downsample24to8(pcm24k);

  // Encode each sample to mulaw
  const mulawBytes = Buffer.alloc(pcm8k.length);
  for (let i = 0; i < pcm8k.length; i++) {
    mulawBytes[i] = mulawEncode(pcm8k[i]);
  }

  // Base64 encode
  return mulawBytes.toString("base64");
}