@hamsa-ai/voice-agents-sdk 0.4.5 → 0.4.6

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@hamsa-ai/voice-agents-sdk",
-  "version": "0.4.5",
+  "version": "0.4.6",
   "description": "Hamsa AI - Voice Agents JavaScript SDK",
   "main": "dist/index.cjs.js",
   "module": "dist/index.esm.js",
@@ -155,7 +155,7 @@
  */
 import { EventEmitter } from 'events';
 import { type RemoteParticipant, type RemoteTrack, type RemoteTrackPublication, type Room } from 'livekit-client';
-import type { TrackStatsData, TrackStatsResult } from './types';
+import type { AudioCaptureOptions, TrackStatsData, TrackStatsResult } from './types';
 /**
  * LiveKitAudioManager class for comprehensive audio stream management
  *
@@ -176,6 +176,13 @@ export declare class LiveKitAudioManager extends EventEmitter {
     private audioContext;
     private inputAnalyser;
     private outputAnalyser;
+    /** Audio capture state */
+    private audioCaptureEnabled;
+    private audioCaptureOptions;
+    private readonly recorders;
+    private readonly processors;
+    /** Map of track IDs to their capture state */
+    private readonly trackCaptureMap;
     /**
      * Provides the LiveKit Room to the audio manager for microphone control.
      */
@@ -734,5 +741,57 @@ export declare class LiveKitAudioManager extends EventEmitter {
     * audioManager.cleanup(); // Still safe to call
     * ```
     */
+    /**
+     * Enables audio capture with the specified options
+     *
+     * This method sets up audio capture from the agent, user, or both, allowing
+     * clients to receive raw audio data for forwarding to third-party services,
+     * recording, or custom processing.
+     *
+     * @param options - Configuration options for audio capture
+     *
+     * @example Capture agent audio in Opus format
+     * ```typescript
+     * audioManager.enableAudioCapture({
+     *   source: 'agent',
+     *   format: 'opus-webm',
+     *   chunkSize: 100,
+     *   callback: (audioData, metadata) => {
+     *     console.log(`Audio from ${metadata.participant}:`, audioData.byteLength, 'bytes');
+     *     sendToThirdParty(audioData);
+     *   }
+     * });
+     * ```
+     *
+     * @example Capture both user and agent in PCM format
+     * ```typescript
+     * audioManager.enableAudioCapture({
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   bufferSize: 4096,
+     *   callback: (audioData, metadata) => {
+     *     if (metadata.source === 'agent') {
+     *       processAgentAudio(audioData as Float32Array);
+     *     } else {
+     *       processUserAudio(audioData as Float32Array);
+     *     }
+     *   }
+     * });
+     * ```
+     */
+    enableAudioCapture(options: AudioCaptureOptions): void;
+    /**
+     * Disables audio capture and cleans up all capture resources
+     *
+     * Stops all active MediaRecorders and ScriptProcessorNodes, releases
+     * audio capture resources, and clears capture state.
+     *
+     * @example
+     * ```typescript
+     * // Stop capturing audio
+     * audioManager.disableAudioCapture();
+     * ```
+     */
+    disableAudioCapture(): void;
     cleanup(): void;
 }
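
A minimal lifecycle sketch of this manager-level surface. How a `LiveKitAudioManager` instance is obtained is not shown in this diff, so the `declare const` below is a placeholder assumption; only the `enableAudioCapture`/`disableAudioCapture` calls come from the declarations above.

```typescript
import type { AudioCaptureOptions } from '@hamsa-ai/voice-agents-sdk';

// Placeholder: instance acquisition is outside this diff.
declare const audioManager: {
  enableAudioCapture(options: AudioCaptureOptions): void;
  disableAudioCapture(): void;
};

const options: AudioCaptureOptions = {
  source: 'agent',
  format: 'opus-webm',
  chunkSize: 100, // 100ms encoded chunks
  callback: (audioData, metadata) => {
    // 'opus-webm' delivers ArrayBuffer chunks per AudioCaptureCallback
    console.log(metadata.participant, (audioData as ArrayBuffer).byteLength);
  },
};

audioManager.enableAudioCapture(options);
// Later: stops recorders/processors and clears capture state.
audioManager.disableAudioCapture();
```
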
@@ -316,6 +316,54 @@ export type CustomEventMetadata = {
     /** The original raw message data from LiveKit */
     rawMessage: Record<string, unknown>;
 };
+/**
+ * Audio format types supported for audio capture
+ */
+export type AudioCaptureFormat = 'opus-webm' | 'pcm-f32' | 'pcm-i16';
+/**
+ * Source of audio to capture
+ */
+export type AudioCaptureSource = 'agent' | 'user' | 'both';
+/**
+ * Metadata provided with each audio data chunk
+ */
+export type AudioCaptureMetadata = {
+    /** Identity of the participant this audio is from */
+    participant: string;
+    /** Type of participant ('agent' or 'user') */
+    source: 'agent' | 'user';
+    /** Unix timestamp when this audio chunk was captured */
+    timestamp: number;
+    /** Track ID associated with this audio */
+    trackId: string;
+    /** Audio format of this chunk */
+    format: AudioCaptureFormat;
+    /** Sample rate in Hz (for PCM formats) */
+    sampleRate?: number;
+    /** Number of channels (typically 1 for mono) */
+    channels?: number;
+};
+/**
+ * Callback function type for audio data capture
+ */
+export type AudioCaptureCallback = (audioData: ArrayBuffer | Float32Array | Int16Array, metadata: AudioCaptureMetadata) => void;
+/**
+ * Options for configuring audio capture
+ */
+export type AudioCaptureOptions = {
+    /** Source of audio to capture (default: 'agent') */
+    source?: AudioCaptureSource;
+    /** Audio format to deliver (default: 'opus-webm') */
+    format?: AudioCaptureFormat;
+    /** Chunk size in milliseconds for encoded formats (default: 100ms) */
+    chunkSize?: number;
+    /** Buffer size for PCM formats in samples (default: 4096) */
+    bufferSize?: number;
+    /** Callback function to receive audio data (Level 3 API - Full control) */
+    callback?: AudioCaptureCallback;
+    /** Simpler callback alias (Level 2 API - recommended for inline usage) */
+    onData?: AudioCaptureCallback;
+};
 /**
  * LiveKit access token payload structure used by Hamsa backend.
  * Represents the decoded JWT payload fields relevant for SDK logic.
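
As a quick orientation to these new types, a sketch (not part of the diff): the callback payload is a union, and `metadata.format` tells you which concrete type arrived. The handler functions are illustrative, and the package-level import assumes the re-export added in the `main.d.ts` hunk below.

```typescript
import type {
  AudioCaptureCallback,
  AudioCaptureMetadata,
} from '@hamsa-ai/voice-agents-sdk';

// Illustrative handlers, not SDK functions.
declare function handleEncoded(chunk: ArrayBuffer, meta: AudioCaptureMetadata): void;
declare function handlePcm(samples: Float32Array | Int16Array, meta: AudioCaptureMetadata): void;

const onData: AudioCaptureCallback = (audioData, metadata) => {
  if (metadata.format === 'opus-webm') {
    // Encoded formats arrive as ArrayBuffer
    handleEncoded(audioData as ArrayBuffer, metadata);
  } else {
    // 'pcm-f32' delivers Float32Array, 'pcm-i16' delivers Int16Array
    handlePcm(audioData as Float32Array | Int16Array, metadata);
  }
};
```
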
package/types/main.d.ts CHANGED
@@ -2,8 +2,9 @@ import { EventEmitter } from 'events';
 import type { ConnectionState, LocalTrack, LocalTrackPublication, Participant, RemoteParticipant, RemoteTrack, Room } from 'livekit-client';
 import LiveKitManager, { type AgentState, type AudioLevelsResult, type CallAnalyticsResult, type ConnectionStatsResult, type ParticipantData, type PerformanceMetricsResult, type TrackStatsResult } from './classes/livekit-manager';
 import ScreenWakeLock from './classes/screen-wake-lock';
-import type { ConnectionQualityData, TrackSubscriptionData, TrackUnsubscriptionData } from './classes/types';
+import type { AudioCaptureCallback, AudioCaptureOptions, ConnectionQualityData, TrackSubscriptionData, TrackUnsubscriptionData } from './classes/types';
 export type { AgentState } from './classes/livekit-manager';
+export type { AudioCaptureCallback, AudioCaptureFormat, AudioCaptureMetadata, AudioCaptureOptions, AudioCaptureSource, } from './classes/types';
 /**
  * Custom error class that includes both human-readable message and machine-readable messageKey
  * for internationalization and programmatic error handling
@@ -60,6 +61,32 @@ type StartOptions = {
     connectionDelay?: ConnectionDelays;
     /** Disable wake lock to allow device sleep during conversation */
     disableWakeLock?: boolean;
+    /**
+     * Simple callback to receive agent audio data (Level 1 API - Simplest)
+     * Automatically captures agent audio in opus-webm format with 100ms chunks
+     * @example
+     * ```typescript
+     * onAudioData: (audioData) => {
+     *   thirdPartySocket.send(audioData);
+     * }
+     * ```
+     */
+    onAudioData?: AudioCaptureCallback;
+    /**
+     * Advanced audio capture configuration (Level 2 API - More Control)
+     * Use this if you need to specify format, source, or other options
+     * @example
+     * ```typescript
+     * captureAudio: {
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   onData: (audioData, metadata) => {
+     *     processAudio(audioData, metadata);
+     *   }
+     * }
+     * ```
+     */
+    captureAudio?: AudioCaptureOptions;
 };
 /**
  * Definition of a client-side tool that can be called by the voice agent
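
To make the new `StartOptions` fields concrete, a minimal Level 1 sketch. The `HamsaVoiceAgent` named export and constructor arguments are assumed (neither appears in this diff), and `myWebSocket` is a stand-in transport; `agentId` and `voiceEnablement` follow the existing `start` example further down.

```typescript
import { HamsaVoiceAgent } from '@hamsa-ai/voice-agents-sdk';

declare const agent: HamsaVoiceAgent; // construction not shown in this diff
declare const myWebSocket: WebSocket; // stand-in transport

async function startWithCapture(): Promise<void> {
  await agent.start({
    agentId: 'my_agent',
    voiceEnablement: true,
    // Level 1: agent audio only, opus-webm format, 100ms chunks.
    onAudioData: (audioData) => {
      myWebSocket.send(audioData as ArrayBuffer);
    },
  });
}
```
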
@@ -523,6 +550,131 @@ declare class HamsaVoiceAgent extends EventEmitter {
     * ```
     */
    getOutputByteFrequencyData(): Uint8Array;
+    /**
+     * Enables real-time audio capture from the conversation
+     *
+     * This method allows clients to receive raw audio data from the agent, user, or both,
+     * enabling use cases like forwarding audio to third-party services, custom recording,
+     * real-time transcription, or audio analysis.
+     *
+     * The audio can be captured in three formats:
+     * - `opus-webm`: Efficient Opus codec in WebM container (default, recommended)
+     * - `pcm-f32`: Raw PCM audio as Float32Array for advanced processing
+     * - `pcm-i16`: Raw PCM audio as Int16Array for compatibility
+     *
+     * @param options - Configuration options for audio capture
+     * @param options.source - Which audio to capture: 'agent' (default), 'user', or 'both'
+     * @param options.format - Audio format to receive (default: 'opus-webm')
+     * @param options.chunkSize - Chunk size in milliseconds for encoded formats (default: 100ms)
+     * @param options.bufferSize - Buffer size in samples for PCM formats (default: 4096)
+     * @param options.callback - Function called with each audio chunk
+     *
+     * @example Forward agent audio to third-party service
+     * ```typescript
+     * // Start capturing agent audio when call begins
+     * agent.on('callStarted', () => {
+     *   agent.enableAudioCapture({
+     *     source: 'agent',
+     *     format: 'opus-webm',
+     *     chunkSize: 100, // 100ms chunks
+     *     callback: (audioData, metadata) => {
+     *       console.log(`Audio from ${metadata.participant}: ${audioData.byteLength} bytes`);
+     *
+     *       // Send to third-party service via WebSocket
+     *       thirdPartyWebSocket.send(audioData);
+     *
+     *       // Or via HTTP
+     *       fetch('https://api.example.com/audio', {
+     *         method: 'POST',
+     *         body: audioData,
+     *         headers: {
+     *           'Content-Type': 'audio/webm',
+     *           'X-Participant': metadata.participant,
+     *           'X-Timestamp': metadata.timestamp.toString()
+     *         }
+     *       });
+     *     }
+     *   });
+     * });
+     * ```
+     *
+     * @example Capture both agent and user for custom processing
+     * ```typescript
+     * agent.enableAudioCapture({
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   bufferSize: 4096,
+     *   callback: (audioData, metadata) => {
+     *     const samples = audioData as Float32Array;
+     *
+     *     if (metadata.source === 'agent') {
+     *       // Process agent audio
+     *       analyzeAgentVoice(samples, metadata.sampleRate);
+     *     } else {
+     *       // Process user audio
+     *       analyzeUserVoice(samples, metadata.sampleRate);
+     *     }
+     *
+     *     // Save to custom recorder
+     *     customRecorder.addAudioChunk({
+     *       source: metadata.source,
+     *       samples,
+     *       sampleRate: metadata.sampleRate,
+     *       timestamp: metadata.timestamp
+     *     });
+     *   }
+     * });
+     * ```
+     *
+     * @example Real-time transcription integration
+     * ```typescript
+     * import { AudioCaptureOptions } from '@hamsa-ai/voice-agents-sdk';
+     *
+     * const transcriptionService = new WebSocket('wss://transcription.example.com');
+     *
+     * agent.enableAudioCapture({
+     *   source: 'user',
+     *   format: 'opus-webm',
+     *   chunkSize: 50, // Lower latency for real-time transcription
+     *   callback: (audioData, metadata) => {
+     *     // Forward user audio to transcription service
+     *     transcriptionService.send(JSON.stringify({
+     *       audio: Array.from(new Uint8Array(audioData as ArrayBuffer)),
+     *       timestamp: metadata.timestamp,
+     *       sampleRate: metadata.sampleRate
+     *     }));
+     *   }
+     * });
+     *
+     * transcriptionService.onmessage = (event) => {
+     *   const transcription = JSON.parse(event.data);
+     *   console.log('Real-time transcription:', transcription.text);
+     *   displayUserSpeech(transcription.text);
+     * };
+     * ```
+     */
+    enableAudioCapture(options: AudioCaptureOptions): void;
+    /**
+     * Disables audio capture and releases all capture resources
+     *
+     * Stops all active audio capture, cleans up MediaRecorders and audio processors,
+     * and releases associated resources. Safe to call even if audio capture is not enabled.
+     *
+     * @example
+     * ```typescript
+     * // Stop capturing when call ends
+     * agent.on('callEnded', () => {
+     *   agent.disableAudioCapture();
+     *   console.log('Audio capture stopped');
+     * });
+     *
+     * // Or stop manually
+     * stopCaptureButton.addEventListener('click', () => {
+     *   agent.disableAudioCapture();
+     * });
+     * ```
+     */
+    disableAudioCapture(): void;
     /**
      * Initiates a new voice agent conversation
      *
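
Pulling the JSDoc patterns above together, a sketch that scopes capture to the call lifetime using the documented `callStarted`/`callEnded` events; `recorder` is a hypothetical sink for PCM frames, and `agent` is the instance from the earlier sketch.

```typescript
// `recorder` is a hypothetical sink, not an SDK type.
declare const recorder: { push(samples: Int16Array, sampleRate?: number): void };

agent.on('callStarted', () => {
  agent.enableAudioCapture({
    source: 'user',
    format: 'pcm-i16',
    bufferSize: 4096, // samples delivered per PCM callback
    callback: (audioData, metadata) => {
      // 'pcm-i16' delivers Int16Array frames; the rate arrives via metadata
      recorder.push(audioData as Int16Array, metadata.sampleRate);
    },
  });
});

agent.on('callEnded', () => {
  agent.disableAudioCapture(); // safe even if capture was never enabled
});
```
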
@@ -595,7 +747,7 @@ declare class HamsaVoiceAgent extends EventEmitter {
     * await agent.start({ agentId: 'my_agent', voiceEnablement: true });
     * ```
     */
-    start({ agentId, params, voiceEnablement, tools, userId: _userId, preferHeadphonesForIosDevices: _preferHeadphonesForIosDevices, connectionDelay: _connectionDelay, disableWakeLock: _disableWakeLock, }: StartOptions): Promise<void>;
+    start({ agentId, params, voiceEnablement, tools, userId: _userId, preferHeadphonesForIosDevices: _preferHeadphonesForIosDevices, connectionDelay: _connectionDelay, disableWakeLock: _disableWakeLock, onAudioData, captureAudio, }: StartOptions): Promise<void>;
     /**
      * Terminates the current voice agent conversation
      *
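
The updated `start` signature also accepts the Level 2 `captureAudio` object directly, as a sketch under the same assumptions as above; `mixdown` is a hypothetical consumer that keys streams by source.

```typescript
// `mixdown` is a hypothetical consumer, not an SDK type.
declare const mixdown: { write(source: 'agent' | 'user', samples: Float32Array): void };

async function startWithPcmCapture(): Promise<void> {
  await agent.start({
    agentId: 'my_agent',
    voiceEnablement: true,
    // Level 2: full capture options at call start, no separate
    // enableAudioCapture() call needed.
    captureAudio: {
      source: 'both',
      format: 'pcm-f32',
      onData: (audioData, metadata) => {
        // metadata.source distinguishes agent and user streams
        mixdown.write(metadata.source, audioData as Float32Array);
      },
    },
  });
}
```
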