npm - voicecc - Versions diffs - 1.1.36 → 1.2.0 - Mend

voicecc 1.1.36 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/bin/voicecc.js +94 -1
package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
package/dashboard/dist/index.html +1 -1
package/dashboard/routes/agents.ts +28 -8
package/dashboard/routes/browser-call.ts +3 -2
package/dashboard/routes/chat.ts +75 -55
package/dashboard/routes/providers.ts +5 -74
package/dashboard/routes/twilio.ts +104 -5
package/dashboard/routes/voice.ts +98 -0
package/dashboard/server.ts +48 -1
package/package.json +2 -3
package/server/index.ts +96 -8
package/server/services/twilio-manager.ts +29 -10
package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
package/dashboard/dist/audio-processor.js +0 -126
package/server/services/heartbeat.ts +0 -403
package/server/voice/assets/chime.wav +0 -0
package/server/voice/assets/startup.pcm +0 -0
package/server/voice/audio-adapter.ts +0 -60
package/server/voice/audio-inactivity.test.ts +0 -108
package/server/voice/audio-inactivity.ts +0 -91
package/server/voice/browser-audio-playback.test.ts +0 -149
package/server/voice/browser-audio.ts +0 -147
package/server/voice/browser-server.ts +0 -311
package/server/voice/chat-server.ts +0 -236
package/server/voice/chime.test.ts +0 -69
package/server/voice/chime.ts +0 -36
package/server/voice/claude-session.ts +0 -293
package/server/voice/endpointing.ts +0 -163
package/server/voice/mic-vpio +0 -0
package/server/voice/narration.ts +0 -204
package/server/voice/prompt-builder.ts +0 -108
package/server/voice/session-lock.ts +0 -123
package/server/voice/stt-elevenlabs.ts +0 -210
package/server/voice/stt-provider.ts +0 -106
package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
package/server/voice/tts-elevenlabs.ts +0 -397
package/server/voice/tts-provider.ts +0 -155
package/server/voice/twilio-audio.ts +0 -338
package/server/voice/twilio-server.ts +0 -540
package/server/voice/types.ts +0 -282
package/server/voice/vad.ts +0 -101
package/server/voice/voice-loop-bugs.test.ts +0 -348
package/server/voice/voice-server.ts +0 -129
package/server/voice/voice-session.ts +0 -539

package/server/voice/types.ts DELETED Viewed

@@ -1,282 +0,0 @@
-/**
- * Shared types for the Claude Code voice server.
- *
- * Defines all DTOs and interfaces used across the voice pipeline modules:
- * - Voice loop configuration and state
- * - Audio frame representation
- * - VAD (voice activity detection) events
- * - STT (speech-to-text) results
- * - Endpointing decisions for turn detection
- * - Claude session streaming events
- * - TTS (text-to-speech) configuration
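- * - Narration configuration
- * - TTS player / STT processor runtime interfaces
- * - TTS/STT provider selection and readiness status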
- */
-// ============================================================================
-// CONFIGURATION INTERFACES
-// ============================================================================
-/**
- * Top-level configuration for the voice loop.
- * Passed to `startVoiceLoop` to initialize all modules.
- *
- * Bundles the per-module settings (endpointing, narration, Claude session)
- * with the spoken stop phrase. Every field is required.
- */
-export interface VoiceLoopConfig {
-  /** Endpointing configuration for turn detection (see EndpointingConfig) */
-  endpointing: EndpointingConfig;
-  /** Narration configuration for Claude response processing (see NarrationConfig) */
-  narration: NarrationConfig;
-  /** Claude Agent SDK session configuration (see ClaudeSessionConfig) */
-  claudeSession: ClaudeSessionConfig;
-  /** Phrase that stops the voice loop when spoken.
-   * NOTE(review): how the phrase is matched (case, punctuation, exact vs. substring)
-   * is decided by the consumer and is not visible in this file. */
-  stopPhrase: string;
-}
-/**
- * Configuration for the endpointing module.
- * Controls how the system decides when the user is done speaking.
- * Both duration fields are expressed in milliseconds.
- */
-export interface EndpointingConfig {
-  /** Silence duration (ms) before considering speech complete */
-  silenceThresholdMs: number;
-  /** Maximum silence duration (ms) before forcing completion regardless.
-   * NOTE(review): presumably expected to be >= silenceThresholdMs; nothing here enforces it. */
-  maxSilenceBeforeTimeoutMs: number;
-  /** Minimum word count for the VAD fast path (skips Haiku check) */
-  minWordCountForFastPath: number;
-  /** Whether to use Haiku API for ambiguous short utterances.
-   * NOTE(review): presumably pairs with the "haiku_semantic" EndpointMethod; confirm in the endpointing module. */
-  enableHaikuFallback: boolean;
-}
-/**
- * Configuration for the Claude Agent SDK session.
- *
- * `allowedTools`, `permissionMode` and `systemPrompt` are required;
- * `customSystemPrompt` and `cwd` are optional.
- */
-export interface ClaudeSessionConfig {
-  /** List of allowed tool names (empty array means all tools allowed) */
-  allowedTools: string[];
-  /** Permission mode -- must be "bypassPermissions" for voice loop.
-   * The field is typed as a plain string, so this requirement is NOT enforced
-   * by the compiler; callers must supply the correct value themselves. */
-  permissionMode: string;
-  /** System prompt appended to the default (includes CLAUDE.md) */
-  systemPrompt: string;
-  /** If set, replaces the entire system prompt (skips CLAUDE.md). Used for agent calls.
-   * NOTE(review): `systemPrompt` is still a required field when this is set; whether it
-   * is then ignored is decided by the session module, not shown here. */
-  customSystemPrompt?: string;
-  /** Working directory for the Claude Code session.
-   * NOTE(review): the fallback used when omitted is not visible in this file. */
-  cwd?: string;
-}
-/**
- * Configuration for the narration module.
- * Controls how Claude's streaming output is processed into speakable text.
- * Currently holds a single timing setting.
- */
-export interface NarrationConfig {
-  /** Interval (ms) between "still working..." summaries during long tool runs */
-  summaryIntervalMs: number;
-}
-/**
- * TTS player instance that converts text to spoken audio output.
- *
- * Offers a one-shot call (`speak`) and a streaming call (`speakStream`),
- * plus playback control (`interrupt`, `isSpeaking`) and cleanup (`destroy`).
- */
-export interface TtsPlayer {
-  /**
-   * Convert text to audio and play it through the speakers.
-   * @param text - The text to speak
-   * @returns Promise with no value. NOTE(review): whether it settles when playback
-   *          finishes or when audio is merely queued depends on the implementation.
-   */
-  speak(text: string): Promise<void>;
-  /**
-   * Stream text chunks into TTS for incremental playback.
-   * Each chunk is a TextChunk: a plain string is a streaming fragment to buffer,
-   * while an object carrying `flush: true` is a complete sentence to speak immediately.
-   * @param texts - Async iterable of text chunks
-   */
-  speakStream(texts: AsyncIterable<TextChunk>): Promise<void>;
-  /** Interrupt current playback immediately. */
-  interrupt(): void;
-  /** Check whether TTS is currently generating and playing audio. */
-  isSpeaking(): boolean;
-  /** Free all TTS resources. */
-  destroy(): void;
-}
-/**
- * STT processor instance that converts speech audio to text.
- *
- * Works in batch mode: audio is buffered with `accumulate` and converted
- * in one go by `transcribe`.
- */
-export interface SttProcessor {
-  /**
-   * Appends audio samples to the internal buffer.
-   * @param samples - Float32Array of audio samples (16kHz, normalized -1.0 to 1.0)
-   */
-  accumulate(samples: Float32Array): void;
-  /**
-   * Batch-transcribes the accumulated audio buffer. Clears the buffer afterward.
-   * @returns Transcription result with text, isFinal flag, and timestamp
-   *          (see TranscriptionResult)
-   */
-  transcribe(): Promise<TranscriptionResult>;
-  /** Clears the accumulated audio buffer without transcribing. */
-  clearBuffer(): void;
-  /** Frees underlying resources. */
-  destroy(): void;
-}
-// ============================================================================
-// AUDIO TYPES
-// ============================================================================
-/**
- * A single frame of audio data from the microphone.
- * Carries the samples together with the rate and capture time needed to interpret them.
- */
-export interface AudioFrame {
-  /** PCM audio samples normalized to -1.0 to 1.0 range */
-  pcm: Float32Array;
-  /** Sample rate in Hz.
-   * NOTE(review): the STT and VAD interfaces document 16kHz input; confirm frames match. */
-  sampleRate: number;
-  /** Timestamp in milliseconds when this frame was captured */
-  timestamp: number;
-}
-// ============================================================================
-// VAD TYPES
-// ============================================================================
-/** Possible VAD event types indicating speech activity state.
- * NOTE(review): the createVad wrapper in vad.ts emits only SPEECH_START,
- * SPEECH_CONTINUE and SPEECH_END; confirm whether "SILENCE" has any producer. */
-export type VadEventType = "SPEECH_START" | "SPEECH_CONTINUE" | "SPEECH_END" | "SILENCE";
-/**
- * Event emitted by the VAD processor after analyzing an audio frame.
- * All three fields are always present.
- */
-export interface VadEvent {
-  /** The detected speech activity state */
-  type: VadEventType;
-  /** Speech probability from the VAD model (0.0 to 1.0) */
-  probability: number;
-  /** Timestamp in milliseconds */
-  timestamp: number;
-}
-// ============================================================================
-// STT TYPES
-// ============================================================================
-/**
- * Result from the speech-to-text transcription.
- * This is the value resolved by `SttProcessor.transcribe()`.
- */
-export interface TranscriptionResult {
-  /** The transcribed text */
-  text: string;
-  /** Whether this is a final transcription (always true for batch/offline mode) */
-  isFinal: boolean;
-  /** Timestamp in milliseconds when transcription completed */
-  timestamp: number;
-}
-// ============================================================================
-// ENDPOINTING TYPES
-// ============================================================================
-/** Method used to determine that the user finished speaking.
- * NOTE(review): the three values presumably line up with EndpointingConfig's
- * fast-path, Haiku-fallback and max-silence settings; confirm in the endpointing module. */
-export type EndpointMethod = "vad_fast" | "haiku_semantic" | "timeout";
-/**
- * Decision from the endpointing module on whether the user has finished speaking.
- * `method` is a required field, so it is present even when `isComplete` is false.
- */
-export interface EndpointDecision {
-  /** Whether the user's turn is considered complete */
-  isComplete: boolean;
-  /** The current accumulated transcript */
-  transcript: string;
-  /** Which method was used to make the decision */
-  method: EndpointMethod;
-}
-// ============================================================================
-// CLAUDE SESSION TYPES
-// ============================================================================
-/** Possible event types from the Claude streaming response */
-export type ClaudeStreamEventType = "text_delta" | "tool_start" | "tool_end" | "result" | "error";
-/**
- * Simplified streaming event from the Claude Agent SDK session.
- * Mapped from the raw SDKMessage types for downstream consumption.
- *
- * This is one flat shape shared by every event type rather than a discriminated
- * union, so field meaning depends on `type`.
- */
-export interface ClaudeStreamEvent {
-  /** The type of streaming event */
-  type: ClaudeStreamEventType;
-  /** Text content (for text_delta events) or error message (for error events).
-   * The field is required for every event type; its value for tool_start, tool_end
-   * and result events is not specified here. */
-  content: string;
-  /** Tool name (only present for tool_start events) */
-  toolName?: string;
-}
-// ============================================================================
-// TTS TEXT CHUNK TYPES
-// ============================================================================
-/** A text chunk for TTS, in one of two forms:
- * a plain string is a streaming fragment (buffer it); an object `{ text, flush: true }` is a complete sentence (speak immediately). */
-export type TextChunk = string | { text: string; flush: true };
-// ============================================================================
-// VOICE LOOP STATE
-// ============================================================================
-/** Possible states of the voice loop state machine */
-export type VoiceLoopStatus = "idle" | "listening" | "processing" | "speaking";
-/**
- * Current state of the voice loop.
- * Used by the state machine in index.ts.
- * Pairs the current status with the Claude session it belongs to, if any.
- */
-export interface VoiceLoopState {
-  /** Current state of the voice loop */
-  status: VoiceLoopStatus;
-  /** Active Claude session ID, or null if no session is active */
-  sessionId: string | null;
-}
-// ============================================================================
-// PROVIDER TYPES
-// ============================================================================
-/** Available TTS provider backends. "elevenlabs" is currently the only member. */
-export type TtsProviderType = "elevenlabs";
-/** Available STT provider backends. "elevenlabs" is currently the only member. */
-export type SttProviderType = "elevenlabs";
-/**
- * Readiness status for a provider.
- * Returned by getTtsProviderStatus / getSttProviderStatus.
- * `reason` and `detail` are optional and only meaningful when `ready` is false.
- */
-export interface ProviderStatus {
-  /** Whether the provider is ready to use */
-  ready: boolean;
-  /** Reason the provider is not ready (only present when ready is false).
-   * "missing_api_key" is the only reason code defined. */
-  reason?: "missing_api_key";
-  /** Human-readable detail about why the provider is not ready */
-  detail?: string;
-}
-/**
- * Configuration that selects a TTS provider and holds per-provider settings.
- * Built from environment variables in each entry point.
- * Both fields are required.
- */
-export interface TtsProviderConfig {
-  /** Which TTS provider to use */
-  provider: TtsProviderType;
-  /** Settings for the ElevenLabs TTS provider: API key, voice ID and model ID (all strings, all required) */
-  elevenlabs: { apiKey: string; voiceId: string; modelId: string };
-}
-/**
- * Configuration that selects an STT provider and holds per-provider settings.
- * Built from environment variables in each entry point.
- * Both fields are required.
- */
-export interface SttProviderConfig {
-  /** Which STT provider to use */
-  provider: SttProviderType;
-  /** Settings for the ElevenLabs STT provider: API key and model ID (both strings, both required) */
-  elevenlabs: { apiKey: string; modelId: string };
-}

package/server/voice/vad.ts DELETED Viewed

@@ -1,101 +0,0 @@
-/**
- * Voice Activity Detection (VAD) via avr-vad (Silero VAD v5).
- *
- * Wraps the avr-vad callback-based API behind a single typed event callback.
- * avr-vad handles its own framing internally (512-sample frames at 16kHz).
- * We feed raw audio via processAudio and collect speech events from callbacks.
- *
- * Responsibilities:
- * - Initialize the Silero VAD v5 model
- * - Feed raw audio and collect speech start/end events
- * - Track the latest per-frame probability (onFrameProcessed) and attach it to each emitted event
- * - Manage model lifecycle (reset between utterances, destroy on shutdown)
- */
-import type { VadEvent, VadEventType } from "./types.js";
-// ============================================================================
-// INTERFACES
-// ============================================================================
-/** Callback invoked for each VAD event detected in the audio stream.
- * Its return value is ignored by the caller. */
-type VadEventCallback = (event: VadEvent) => void;
-/** Internal interface for the VAD processor returned by createVad.
- * It has no event getters: detected events arrive only through the callback given to createVad. */
-interface VadProcessor {
-  /**
-   * Feed raw audio samples to the VAD. avr-vad handles framing internally.
-   * Events are emitted via the callback provided at creation.
-   *
-   * @param samples - Float32Array of audio samples (16kHz, normalized -1.0 to 1.0)
-   * @returns Promise with no value; events are not returned from this call
-   */
-  processAudio(samples: Float32Array): Promise<void>;
-  /**
-   * Resets internal VAD state. Call between utterances to avoid
-   * state leakage across speech segments.
-   */
-  reset(): void;
-  /**
-   * Frees the underlying ONNX model resources.
-   * Call on shutdown to prevent resource leaks.
-   */
-  destroy(): void;
-}
-// ============================================================================
-// MAIN HANDLERS
-// ============================================================================
-/**
- * Initializes the Silero VAD v5 model via avr-vad and returns a VadProcessor.
- * Events (SPEECH_START, SPEECH_END, etc.) are delivered via the onEvent callback,
- * which is invoked directly from inside the avr-vad callbacks registered below.
- *
- * Callback-to-event mapping:
- * - onSpeechStart     -> SPEECH_START
- * - onSpeechRealStart -> SPEECH_CONTINUE
- * - onSpeechEnd       -> SPEECH_END
- * - onFrameProcessed  -> no event; only refreshes the cached probability
- * No code path here emits the "SILENCE" event type.
- *
- * Every event carries the probability of the most recently processed frame
- * (0 if no frame has been processed yet) and a Date.now() timestamp taken
- * when the callback fires.
- *
- * The returned processor is a thin pass-through: processAudio, reset and destroy
- * each forward to the same-named method on the avr-vad instance.
- *
- * NOTE(review): only the four callbacks above are registered. If avr-vad reports
- * aborted or too-short segments through another callback, a SPEECH_START could
- * arrive with no matching SPEECH_END; confirm against the library docs.
- *
- * @param onEvent - Callback invoked for each detected VAD event
- * @returns Promise resolving to a VadProcessor instance
- * @throws Error if the ONNX model fails to load
- */
-async function createVad(onEvent: VadEventCallback): Promise<VadProcessor> {
-  // Dynamic import: avr-vad is loaded when createVad is called, not at module load.
-  const { RealTimeVAD } = await import("avr-vad");
-  let lastProbability = 0; // refreshed by onFrameProcessed; stays 0 until the first frame
-  const vad = await RealTimeVAD.new({
-    onSpeechStart: () => {
-      onEvent({ type: "SPEECH_START", probability: lastProbability, timestamp: Date.now() });
-    },
-    onSpeechRealStart: () => {
-      // Emitted after minSpeechFrames confirm real speech.
-      // We treat this as SPEECH_CONTINUE to signal sustained speech.
-      onEvent({ type: "SPEECH_CONTINUE", probability: lastProbability, timestamp: Date.now() });
-    },
-    onSpeechEnd: () => {
-      onEvent({ type: "SPEECH_END", probability: lastProbability, timestamp: Date.now() });
-    },
-    onFrameProcessed: (probs: { isSpeech: number }) => {
-      lastProbability = probs.isSpeech; // cached so the speech callbacks above can report it
-    },
-  });
-  // Must call start() to activate processing
-  vad.start();
-  return {
-    async processAudio(samples: Float32Array): Promise<void> {
-      await vad.processAudio(samples);
-    },
-    reset(): void {
-      vad.reset();
-    },
-    destroy(): void {
-      vad.destroy();
-    },
-  };
-}
-export { createVad };
-export type { VadProcessor, VadEventCallback };