@hamsa-ai/voice-agents-sdk 0.4.5 → 0.4.6

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@hamsa-ai/voice-agents-sdk",
-  "version": "0.4.5",
+  "version": "0.4.6",
   "description": "Hamsa AI - Voice Agents JavaScript SDK",
   "main": "dist/index.cjs.js",
   "module": "dist/index.esm.js",
@@ -155,7 +155,7 @@
  */
 import { EventEmitter } from 'events';
 import { type RemoteParticipant, type RemoteTrack, type RemoteTrackPublication, type Room } from 'livekit-client';
-import type { TrackStatsData, TrackStatsResult } from './types';
+import type { AudioCaptureOptions, TrackStatsData, TrackStatsResult } from './types';
 /**
  * LiveKitAudioManager class for comprehensive audio stream management
  *
@@ -176,6 +176,13 @@ export declare class LiveKitAudioManager extends EventEmitter {
     private audioContext;
     private inputAnalyser;
     private outputAnalyser;
+    /** Audio capture state */
+    private audioCaptureEnabled;
+    private audioCaptureOptions;
+    private readonly recorders;
+    private readonly processors;
+    /** Map of track IDs to their capture state */
+    private readonly trackCaptureMap;
     /**
      * Provides the LiveKit Room to the audio manager for microphone control.
      */
@@ -734,5 +741,57 @@ export declare class LiveKitAudioManager extends EventEmitter {
     * audioManager.cleanup(); // Still safe to call
     * ```
     */
+    /**
+     * Enables audio capture with the specified options
+     *
+     * This method sets up audio capture from the agent, user, or both, allowing
+     * clients to receive raw audio data for forwarding to third-party services,
+     * recording, or custom processing.
+     *
+     * @param options - Configuration options for audio capture
+     *
+     * @example Capture agent audio in Opus format
+     * ```typescript
+     * audioManager.enableAudioCapture({
+     *   source: 'agent',
+     *   format: 'opus-webm',
+     *   chunkSize: 100,
+     *   callback: (audioData, metadata) => {
+     *     console.log(`Audio from ${metadata.participant}:`, audioData.byteLength, 'bytes');
+     *     sendToThirdParty(audioData);
+     *   }
+     * });
+     * ```
+     *
+     * @example Capture both user and agent in PCM format
+     * ```typescript
+     * audioManager.enableAudioCapture({
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   bufferSize: 4096,
+     *   callback: (audioData, metadata) => {
+     *     if (metadata.source === 'agent') {
+     *       processAgentAudio(audioData as Float32Array);
+     *     } else {
+     *       processUserAudio(audioData as Float32Array);
+     *     }
+     *   }
+     * });
+     * ```
+     */
+    enableAudioCapture(options: AudioCaptureOptions): void;
+    /**
+     * Disables audio capture and cleans up all capture resources
+     *
+     * Stops all active MediaRecorders and ScriptProcessorNodes, releases
+     * audio capture resources, and clears capture state.
+     *
+     * @example
+     * ```typescript
+     * // Stop capturing audio
+     * audioManager.disableAudioCapture();
+     * ```
+     */
+    disableAudioCapture(): void;
     cleanup(): void;
 }
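
A minimal lifecycle sketch of this manager-level surface. How a `LiveKitAudioManager` instance is obtained is not shown in this diff, so the `declare const` below is a placeholder assumption; only the `enableAudioCapture`/`disableAudioCapture` calls come from the declarations above.

```typescript
import type { AudioCaptureOptions } from '@hamsa-ai/voice-agents-sdk';

// Placeholder: instance acquisition is outside this diff.
declare const audioManager: {
  enableAudioCapture(options: AudioCaptureOptions): void;
  disableAudioCapture(): void;
};

const options: AudioCaptureOptions = {
  source: 'agent',
  format: 'opus-webm',
  chunkSize: 100, // 100ms encoded chunks
  callback: (audioData, metadata) => {
    // 'opus-webm' delivers ArrayBuffer chunks per AudioCaptureCallback
    console.log(metadata.participant, (audioData as ArrayBuffer).byteLength);
  },
};

audioManager.enableAudioCapture(options);
// Later: stops recorders/processors and clears capture state.
audioManager.disableAudioCapture();
```
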
@@ -316,6 +316,54 @@ export type CustomEventMetadata = {
     /** The original raw message data from LiveKit */
     rawMessage: Record<string, unknown>;
 };
+/**
+ * Audio format types supported for audio capture
+ */
+export type AudioCaptureFormat = 'opus-webm' | 'pcm-f32' | 'pcm-i16';
+/**
+ * Source of audio to capture
+ */
+export type AudioCaptureSource = 'agent' | 'user' | 'both';
+/**
+ * Metadata provided with each audio data chunk
+ */
+export type AudioCaptureMetadata = {
+    /** Identity of the participant this audio is from */
+    participant: string;
+    /** Type of participant ('agent' or 'user') */
+    source: 'agent' | 'user';
+    /** Unix timestamp when this audio chunk was captured */
+    timestamp: number;
+    /** Track ID associated with this audio */
+    trackId: string;
+    /** Audio format of this chunk */
+    format: AudioCaptureFormat;
+    /** Sample rate in Hz (for PCM formats) */
+    sampleRate?: number;
+    /** Number of channels (typically 1 for mono) */
+    channels?: number;
+};
+/**
+ * Callback function type for audio data capture
+ */
+export type AudioCaptureCallback = (audioData: ArrayBuffer | Float32Array | Int16Array, metadata: AudioCaptureMetadata) => void;
+/**
+ * Options for configuring audio capture
+ */
+export type AudioCaptureOptions = {
+    /** Source of audio to capture (default: 'agent') */
+    source?: AudioCaptureSource;
+    /** Audio format to deliver (default: 'opus-webm') */
+    format?: AudioCaptureFormat;
+    /** Chunk size in milliseconds for encoded formats (default: 100ms) */
+    chunkSize?: number;
+    /** Buffer size for PCM formats in samples (default: 4096) */
+    bufferSize?: number;
+    /** Callback function to receive audio data (Level 3 API - Full control) */
+    callback?: AudioCaptureCallback;
+    /** Simpler callback alias (Level 2 API - recommended for inline usage) */
+    onData?: AudioCaptureCallback;
+};
 /**
  * LiveKit access token payload structure used by Hamsa backend.
  * Represents the decoded JWT payload fields relevant for SDK logic.
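
As a quick orientation to these new types, a sketch (not part of the diff): the callback payload is a union, and `metadata.format` tells you which concrete type arrived. The handler functions are illustrative, and the package-level import assumes the re-export added in the `main.d.ts` hunk below.

```typescript
import type {
  AudioCaptureCallback,
  AudioCaptureMetadata,
} from '@hamsa-ai/voice-agents-sdk';

// Illustrative handlers, not SDK functions.
declare function handleEncoded(chunk: ArrayBuffer, meta: AudioCaptureMetadata): void;
declare function handlePcm(samples: Float32Array | Int16Array, meta: AudioCaptureMetadata): void;

const onData: AudioCaptureCallback = (audioData, metadata) => {
  if (metadata.format === 'opus-webm') {
    // Encoded formats arrive as ArrayBuffer
    handleEncoded(audioData as ArrayBuffer, metadata);
  } else {
    // 'pcm-f32' delivers Float32Array, 'pcm-i16' delivers Int16Array
    handlePcm(audioData as Float32Array | Int16Array, metadata);
  }
};
```
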
package/types/main.d.ts CHANGED
@@ -2,8 +2,9 @@ import { EventEmitter } from 'events';
 import type { ConnectionState, LocalTrack, LocalTrackPublication, Participant, RemoteParticipant, RemoteTrack, Room } from 'livekit-client';
 import LiveKitManager, { type AgentState, type AudioLevelsResult, type CallAnalyticsResult, type ConnectionStatsResult, type ParticipantData, type PerformanceMetricsResult, type TrackStatsResult } from './classes/livekit-manager';
 import ScreenWakeLock from './classes/screen-wake-lock';
-import type { ConnectionQualityData, TrackSubscriptionData, TrackUnsubscriptionData } from './classes/types';
+import type { AudioCaptureCallback, AudioCaptureOptions, ConnectionQualityData, TrackSubscriptionData, TrackUnsubscriptionData } from './classes/types';
 export type { AgentState } from './classes/livekit-manager';
+export type { AudioCaptureCallback, AudioCaptureFormat, AudioCaptureMetadata, AudioCaptureOptions, AudioCaptureSource, } from './classes/types';
 /**
  * Custom error class that includes both human-readable message and machine-readable messageKey
  * for internationalization and programmatic error handling
@@ -60,6 +61,32 @@ type StartOptions = {
     connectionDelay?: ConnectionDelays;
     /** Disable wake lock to allow device sleep during conversation */
     disableWakeLock?: boolean;
+    /**
+     * Simple callback to receive agent audio data (Level 1 API - Simplest)
+     * Automatically captures agent audio in opus-webm format with 100ms chunks
+     * @example
+     * ```typescript
+     * onAudioData: (audioData) => {
+     *   thirdPartySocket.send(audioData);
+     * }
+     * ```
+     */
+    onAudioData?: AudioCaptureCallback;
+    /**
+     * Advanced audio capture configuration (Level 2 API - More Control)
+     * Use this if you need to specify format, source, or other options
+     * @example
+     * ```typescript
+     * captureAudio: {
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   onData: (audioData, metadata) => {
+     *     processAudio(audioData, metadata);
+     *   }
+     * }
+     * ```
+     */
+    captureAudio?: AudioCaptureOptions;
 };
 /**
  * Definition of a client-side tool that can be called by the voice agent
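
To make the new `StartOptions` fields concrete, a minimal Level 1 sketch. The `HamsaVoiceAgent` named export and constructor arguments are assumed (neither appears in this diff), and `myWebSocket` is a stand-in transport; `agentId` and `voiceEnablement` follow the existing `start` example further down.

```typescript
import { HamsaVoiceAgent } from '@hamsa-ai/voice-agents-sdk';

declare const agent: HamsaVoiceAgent; // construction not shown in this diff
declare const myWebSocket: WebSocket; // stand-in transport

async function startWithCapture(): Promise<void> {
  await agent.start({
    agentId: 'my_agent',
    voiceEnablement: true,
    // Level 1: agent audio only, opus-webm format, 100ms chunks.
    onAudioData: (audioData) => {
      myWebSocket.send(audioData as ArrayBuffer);
    },
  });
}
```
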
@@ -523,6 +550,131 @@ declare class HamsaVoiceAgent extends EventEmitter {
     * ```
     */
    getOutputByteFrequencyData(): Uint8Array;
+    /**
+     * Enables real-time audio capture from the conversation
+     *
+     * This method allows clients to receive raw audio data from the agent, user, or both,
+     * enabling use cases like forwarding audio to third-party services, custom recording,
+     * real-time transcription, or audio analysis.
+     *
+     * The audio can be captured in three formats:
+     * - `opus-webm`: Efficient Opus codec in WebM container (default, recommended)
+     * - `pcm-f32`: Raw PCM audio as Float32Array for advanced processing
+     * - `pcm-i16`: Raw PCM audio as Int16Array for compatibility
+     *
+     * @param options - Configuration options for audio capture
+     * @param options.source - Which audio to capture: 'agent' (default), 'user', or 'both'
+     * @param options.format - Audio format to receive (default: 'opus-webm')
+     * @param options.chunkSize - Chunk size in milliseconds for encoded formats (default: 100ms)
+     * @param options.bufferSize - Buffer size in samples for PCM formats (default: 4096)
+     * @param options.callback - Function called with each audio chunk
+     *
+     * @example Forward agent audio to third-party service
+     * ```typescript
+     * // Start capturing agent audio when call begins
+     * agent.on('callStarted', () => {
+     *   agent.enableAudioCapture({
+     *     source: 'agent',
+     *     format: 'opus-webm',
+     *     chunkSize: 100, // 100ms chunks
+     *     callback: (audioData, metadata) => {
+     *       console.log(`Audio from ${metadata.participant}: ${audioData.byteLength} bytes`);
+     *
+     *       // Send to third-party service via WebSocket
+     *       thirdPartyWebSocket.send(audioData);
+     *
+     *       // Or via HTTP
+     *       fetch('https://api.example.com/audio', {
+     *         method: 'POST',
+     *         body: audioData,
+     *         headers: {
+     *           'Content-Type': 'audio/webm',
+     *           'X-Participant': metadata.participant,
+     *           'X-Timestamp': metadata.timestamp.toString()
+     *         }
+     *       });
+     *     }
+     *   });
+     * });
+     * ```
+     *
+     * @example Capture both agent and user for custom processing
+     * ```typescript
+     * agent.enableAudioCapture({
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   bufferSize: 4096,
+     *   callback: (audioData, metadata) => {
+     *     const samples = audioData as Float32Array;
+     *
+     *     if (metadata.source === 'agent') {
+     *       // Process agent audio
+     *       analyzeAgentVoice(samples, metadata.sampleRate);
+     *     } else {
+     *       // Process user audio
+     *       analyzeUserVoice(samples, metadata.sampleRate);
+     *     }
+     *
+     *     // Save to custom recorder
+     *     customRecorder.addAudioChunk({
+     *       source: metadata.source,
+     *       samples,
+     *       sampleRate: metadata.sampleRate,
+     *       timestamp: metadata.timestamp
+     *     });
+     *   }
+     * });
+     * ```
+     *
+     * @example Real-time transcription integration
+     * ```typescript
+     * import { AudioCaptureOptions } from '@hamsa-ai/voice-agents-sdk';
+     *
+     * const transcriptionService = new WebSocket('wss://transcription.example.com');
+     *
+     * agent.enableAudioCapture({
+     *   source: 'user',
+     *   format: 'opus-webm',
+     *   chunkSize: 50, // Lower latency for real-time transcription
+     *   callback: (audioData, metadata) => {
+     *     // Forward user audio to transcription service
+     *     transcriptionService.send(JSON.stringify({
+     *       audio: Array.from(new Uint8Array(audioData as ArrayBuffer)),
+     *       timestamp: metadata.timestamp,
+     *       sampleRate: metadata.sampleRate
+     *     }));
+     *   }
+     * });
+     *
+     * transcriptionService.onmessage = (event) => {
+     *   const transcription = JSON.parse(event.data);
+     *   console.log('Real-time transcription:', transcription.text);
+     *   displayUserSpeech(transcription.text);
+     * };
+     * ```
+     */
+    enableAudioCapture(options: AudioCaptureOptions): void;
+    /**
+     * Disables audio capture and releases all capture resources
+     *
+     * Stops all active audio capture, cleans up MediaRecorders and audio processors,
+     * and releases associated resources. Safe to call even if audio capture is not enabled.
+     *
+     * @example
+     * ```typescript
+     * // Stop capturing when call ends
+     * agent.on('callEnded', () => {
+     *   agent.disableAudioCapture();
+     *   console.log('Audio capture stopped');
+     * });
+     *
+     * // Or stop manually
+     * stopCaptureButton.addEventListener('click', () => {
+     *   agent.disableAudioCapture();
+     * });
+     * ```
+     */
+    disableAudioCapture(): void;
     /**
      * Initiates a new voice agent conversation
      *
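
Pulling the JSDoc patterns above together, a sketch that scopes capture to the call lifetime using the documented `callStarted`/`callEnded` events; `recorder` is a hypothetical sink for PCM frames, and `agent` is the instance from the earlier sketch.

```typescript
// `recorder` is a hypothetical sink, not an SDK type.
declare const recorder: { push(samples: Int16Array, sampleRate?: number): void };

agent.on('callStarted', () => {
  agent.enableAudioCapture({
    source: 'user',
    format: 'pcm-i16',
    bufferSize: 4096, // samples delivered per PCM callback
    callback: (audioData, metadata) => {
      // 'pcm-i16' delivers Int16Array frames; the rate arrives via metadata
      recorder.push(audioData as Int16Array, metadata.sampleRate);
    },
  });
});

agent.on('callEnded', () => {
  agent.disableAudioCapture(); // safe even if capture was never enabled
});
```
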
@@ -595,7 +747,7 @@ declare class HamsaVoiceAgent extends EventEmitter {
     * await agent.start({ agentId: 'my_agent', voiceEnablement: true });
     * ```
     */
-    start({ agentId, params, voiceEnablement, tools, userId: _userId, preferHeadphonesForIosDevices: _preferHeadphonesForIosDevices, connectionDelay: _connectionDelay, disableWakeLock: _disableWakeLock, }: StartOptions): Promise<void>;
+    start({ agentId, params, voiceEnablement, tools, userId: _userId, preferHeadphonesForIosDevices: _preferHeadphonesForIosDevices, connectionDelay: _connectionDelay, disableWakeLock: _disableWakeLock, onAudioData, captureAudio, }: StartOptions): Promise<void>;
     /**
      * Terminates the current voice agent conversation
      *
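
The updated `start` signature also accepts the Level 2 `captureAudio` object directly, as a sketch under the same assumptions as above; `mixdown` is a hypothetical consumer that keys streams by source.

```typescript
// `mixdown` is a hypothetical consumer, not an SDK type.
declare const mixdown: { write(source: 'agent' | 'user', samples: Float32Array): void };

async function startWithPcmCapture(): Promise<void> {
  await agent.start({
    agentId: 'my_agent',
    voiceEnablement: true,
    // Level 2: full capture options at call start, no separate
    // enableAudioCapture() call needed.
    captureAudio: {
      source: 'both',
      format: 'pcm-f32',
      onData: (audioData, metadata) => {
        // metadata.source distinguishes agent and user streams
        mixdown.write(metadata.source, audioData as Float32Array);
      },
    },
  });
}
```
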