@hamsa-ai/voice-agents-sdk 0.4.5 → 0.4.6
- package/README.md +216 -0
- package/dist/index.cjs.js +1 -1
- package/dist/index.cjs.js.map +1 -1
- package/dist/index.esm.js +1 -1
- package/dist/index.esm.js.map +1 -1
- package/dist/index.umd.js +1 -1
- package/dist/index.umd.js.map +1 -1
- package/package.json +1 -1
- package/types/classes/livekit-audio-manager.d.ts +60 -1
- package/types/classes/types.d.ts +48 -0
- package/types/main.d.ts +154 -2
package/types/classes/livekit-audio-manager.d.ts
CHANGED
@@ -155,7 +155,7 @@
  */
 import { EventEmitter } from 'events';
 import { type RemoteParticipant, type RemoteTrack, type RemoteTrackPublication, type Room } from 'livekit-client';
-import type { TrackStatsData, TrackStatsResult } from './types';
+import type { AudioCaptureOptions, TrackStatsData, TrackStatsResult } from './types';
 /**
  * LiveKitAudioManager class for comprehensive audio stream management
  *
@@ -176,6 +176,13 @@ export declare class LiveKitAudioManager extends EventEmitter {
     private audioContext;
     private inputAnalyser;
     private outputAnalyser;
+    /** Audio capture state */
+    private audioCaptureEnabled;
+    private audioCaptureOptions;
+    private readonly recorders;
+    private readonly processors;
+    /** Map of track IDs to their capture state */
+    private readonly trackCaptureMap;
     /**
      * Provides the LiveKit Room to the audio manager for microphone control.
      */
@@ -734,5 +741,57 @@ export declare class LiveKitAudioManager extends EventEmitter {
      * audioManager.cleanup(); // Still safe to call
      * ```
      */
+    /**
+     * Enables audio capture with specified options
+     *
+     * This method sets up audio capture from the agent, user, or both, allowing
+     * clients to receive raw audio data for forwarding to third-party services,
+     * recording, or custom processing.
+     *
+     * @param options - Configuration options for audio capture
+     *
+     * @example Capture agent audio in Opus format
+     * ```typescript
+     * audioManager.enableAudioCapture({
+     *   source: 'agent',
+     *   format: 'opus-webm',
+     *   chunkSize: 100,
+     *   callback: (audioData, metadata) => {
+     *     console.log(`Audio from ${metadata.participant}:`, audioData.byteLength, 'bytes');
+     *     sendToThirdParty(audioData);
+     *   }
+     * });
+     * ```
+     *
+     * @example Capture both user and agent in PCM format
+     * ```typescript
+     * audioManager.enableAudioCapture({
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   bufferSize: 4096,
+     *   callback: (audioData, metadata) => {
+     *     if (metadata.source === 'agent') {
+     *       processAgentAudio(audioData as Float32Array);
+     *     } else {
+     *       processUserAudio(audioData as Float32Array);
+     *     }
+     *   }
+     * });
+     * ```
+     */
+    enableAudioCapture(options: AudioCaptureOptions): void;
+    /**
+     * Disables audio capture and cleans up all capture resources
+     *
+     * Stops all active MediaRecorders and ScriptProcessorNodes, releases
+     * audio capture resources, and clears capture state.
+     *
+     * @example
+     * ```typescript
+     * // Stop capturing audio
+     * audioManager.disableAudioCapture();
+     * ```
+     */
+    disableAudioCapture(): void;
     cleanup(): void;
 }
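The JSDoc above shows worked examples for `opus-webm` and `pcm-f32` but not for `pcm-i16`. Below is a minimal collector sketch for that third format, assuming `pcm-i16` chunks arrive as `Int16Array` (the callback union and format docs suggest this, though the types alone don't enforce it) and that `AudioCaptureCallback` is importable from the package root, as a later hunk in this diff shows. All other names are illustrative, not SDK API.

```typescript
import type { AudioCaptureCallback } from '@hamsa-ai/voice-agents-sdk';

// Accumulates raw PCM chunks as they arrive from the capture callback.
const pcmChunks: Int16Array[] = [];

const collectPcm: AudioCaptureCallback = (audioData, metadata) => {
  if (metadata.format === 'pcm-i16') {
    pcmChunks.push(audioData as Int16Array);
  }
};

// Merge collected chunks into one buffer, e.g. before writing a WAV file.
function mergeChunks(parts: Int16Array[]): Int16Array {
  const total = parts.reduce((sum, part) => sum + part.length, 0);
  const merged = new Int16Array(total);
  let offset = 0;
  for (const part of parts) {
    merged.set(part, offset);
    offset += part.length;
  }
  return merged;
}
```

This would pair with `enableAudioCapture({ source: 'user', format: 'pcm-i16', callback: collectPcm })` and a final `mergeChunks(pcmChunks)` once capture is disabled.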
package/types/classes/types.d.ts
CHANGED
@@ -316,6 +316,54 @@ export type CustomEventMetadata = {
     /** The original raw message data from LiveKit */
     rawMessage: Record<string, unknown>;
 };
+/**
+ * Audio format types supported for audio capture
+ */
+export type AudioCaptureFormat = 'opus-webm' | 'pcm-f32' | 'pcm-i16';
+/**
+ * Source of audio to capture
+ */
+export type AudioCaptureSource = 'agent' | 'user' | 'both';
+/**
+ * Metadata provided with each audio data chunk
+ */
+export type AudioCaptureMetadata = {
+    /** Identity of the participant this audio is from */
+    participant: string;
+    /** Type of participant ('agent' or 'user') */
+    source: 'agent' | 'user';
+    /** Unix timestamp when this audio chunk was captured */
+    timestamp: number;
+    /** Track ID associated with this audio */
+    trackId: string;
+    /** Audio format of this chunk */
+    format: AudioCaptureFormat;
+    /** Sample rate in Hz (for PCM formats) */
+    sampleRate?: number;
+    /** Number of channels (typically 1 for mono) */
+    channels?: number;
+};
+/**
+ * Callback function type for audio data capture
+ */
+export type AudioCaptureCallback = (audioData: ArrayBuffer | Float32Array | Int16Array, metadata: AudioCaptureMetadata) => void;
+/**
+ * Options for configuring audio capture
+ */
+export type AudioCaptureOptions = {
+    /** Source of audio to capture (default: 'agent') */
+    source?: AudioCaptureSource;
+    /** Audio format to deliver (default: 'opus-webm') */
+    format?: AudioCaptureFormat;
+    /** Chunk size in milliseconds for encoded formats (default: 100ms) */
+    chunkSize?: number;
+    /** Buffer size for PCM formats in samples (default: 4096) */
+    bufferSize?: number;
+    /** Callback function to receive audio data (Level 3 API - Full control) */
+    callback?: AudioCaptureCallback;
+    /** Simpler callback alias (Level 2 API - recommended for inline usage) */
+    onData?: AudioCaptureCallback;
+};
 /**
  * LiveKit access token payload structure used by Hamsa backend.
  * Represents the decoded JWT payload fields relevant for SDK logic.
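Because `AudioCaptureCallback` receives a union (`ArrayBuffer | Float32Array | Int16Array`), a handler that supports every format has to narrow on `metadata.format`. A minimal sketch, assuming the format-to-type mapping the docs imply (`opus-webm` → `ArrayBuffer`, `pcm-f32` → `Float32Array`, `pcm-i16` → `Int16Array`); the downstream handlers are hypothetical:

```typescript
import type {
  AudioCaptureCallback,
  AudioCaptureMetadata,
} from '@hamsa-ai/voice-agents-sdk';

// Hypothetical downstream handlers, declared for a self-contained sketch.
declare function forwardEncoded(buf: ArrayBuffer, meta: AudioCaptureMetadata): void;
declare function processSamples(samples: Float32Array, sampleRate: number): void;
declare function processSamples16(samples: Int16Array, sampleRate: number): void;

// Assumed mapping: 'opus-webm' -> ArrayBuffer, 'pcm-f32' -> Float32Array,
// 'pcm-i16' -> Int16Array (implied by the docs, not enforced by the types).
const handleAudio: AudioCaptureCallback = (audioData, metadata) => {
  switch (metadata.format) {
    case 'opus-webm':
      forwardEncoded(audioData as ArrayBuffer, metadata);
      break;
    case 'pcm-f32':
      processSamples(audioData as Float32Array, metadata.sampleRate ?? 48000);
      break;
    case 'pcm-i16':
      processSamples16(audioData as Int16Array, metadata.sampleRate ?? 48000);
      break;
  }
};
```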
package/types/main.d.ts
CHANGED
@@ -2,8 +2,9 @@ import { EventEmitter } from 'events';
 import type { ConnectionState, LocalTrack, LocalTrackPublication, Participant, RemoteParticipant, RemoteTrack, Room } from 'livekit-client';
 import LiveKitManager, { type AgentState, type AudioLevelsResult, type CallAnalyticsResult, type ConnectionStatsResult, type ParticipantData, type PerformanceMetricsResult, type TrackStatsResult } from './classes/livekit-manager';
 import ScreenWakeLock from './classes/screen-wake-lock';
-import type { ConnectionQualityData, TrackSubscriptionData, TrackUnsubscriptionData } from './classes/types';
+import type { AudioCaptureCallback, AudioCaptureOptions, ConnectionQualityData, TrackSubscriptionData, TrackUnsubscriptionData } from './classes/types';
 export type { AgentState } from './classes/livekit-manager';
+export type { AudioCaptureCallback, AudioCaptureFormat, AudioCaptureMetadata, AudioCaptureOptions, AudioCaptureSource, } from './classes/types';
 /**
  * Custom error class that includes both human-readable message and machine-readable messageKey
  * for internationalization and programmatic error handling
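One practical effect of the new root-level re-export: capture options can be typed against the package entry point instead of deep-importing from `classes/types`. A small sketch (the callback body is illustrative):

```typescript
import type { AudioCaptureOptions } from '@hamsa-ai/voice-agents-sdk';

// Declared once and type-checked ahead of time; reusable for
// enableAudioCapture() or the start() captureAudio field.
const captureOpts: AudioCaptureOptions = {
  source: 'agent',
  format: 'opus-webm',
  chunkSize: 100,
  onData: (audioData, metadata) => {
    console.log(metadata.participant, metadata.timestamp, audioData);
  },
};
```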
@@ -60,6 +61,32 @@ type StartOptions = {
     connectionDelay?: ConnectionDelays;
     /** Disable wake lock to allow device sleep during conversation */
     disableWakeLock?: boolean;
+    /**
+     * Simple callback to receive agent audio data (Level 1 API - Simplest)
+     * Automatically captures agent audio in opus-webm format with 100ms chunks
+     * @example
+     * ```typescript
+     * onAudioData: (audioData) => {
+     *   thirdPartySocket.send(audioData);
+     * }
+     * ```
+     */
+    onAudioData?: AudioCaptureCallback;
+    /**
+     * Advanced audio capture configuration (Level 2 API - More Control)
+     * Use this if you need to specify format, source, or other options
+     * @example
+     * ```typescript
+     * captureAudio: {
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   onData: (audioData, metadata) => {
+     *     processAudio(audioData, metadata);
+     *   }
+     * }
+     * ```
+     */
+    captureAudio?: AudioCaptureOptions;
 };
 /**
  * Definition of a client-side tool that can be called by the voice agent
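A runnable sketch of the Level 1 path through `start()`. The `agentId` value and the destination socket are placeholders, and the named `HamsaVoiceAgent` import is assumed from the SDK's public entry point (the diff shows the class declaration but not its export form):

```typescript
import { HamsaVoiceAgent } from '@hamsa-ai/voice-agents-sdk';

// Hypothetical destination for the captured audio.
declare const thirdPartySocket: WebSocket;

async function startWithCapture(agent: HamsaVoiceAgent): Promise<void> {
  await agent.start({
    agentId: 'YOUR_AGENT_ID', // placeholder
    voiceEnablement: true,
    // Level 1 API: agent audio as opus-webm in 100ms chunks, per the docs above.
    onAudioData: (audioData) => {
      thirdPartySocket.send(audioData as ArrayBuffer);
    },
  });
}
```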
@@ -523,6 +550,131 @@ declare class HamsaVoiceAgent extends EventEmitter {
      * ```
      */
     getOutputByteFrequencyData(): Uint8Array;
+    /**
+     * Enables real-time audio capture from the conversation
+     *
+     * This method allows clients to receive raw audio data from the agent, user, or both,
+     * enabling use cases like forwarding audio to third-party services, custom recording,
+     * real-time transcription, or audio analysis.
+     *
+     * The audio can be captured in three formats:
+     * - `opus-webm`: Efficient Opus codec in WebM container (default, recommended)
+     * - `pcm-f32`: Raw PCM audio as Float32Array for advanced processing
+     * - `pcm-i16`: Raw PCM audio as Int16Array for compatibility
+     *
+     * @param options - Configuration options for audio capture
+     * @param options.source - Which audio to capture: 'agent' (default), 'user', or 'both'
+     * @param options.format - Audio format to receive (default: 'opus-webm')
+     * @param options.chunkSize - Chunk size in milliseconds for encoded formats (default: 100ms)
+     * @param options.bufferSize - Buffer size in samples for PCM formats (default: 4096)
+     * @param options.callback - Function called with each audio chunk
+     *
+     * @example Forward agent audio to third-party service
+     * ```typescript
+     * // Start capturing agent audio when call begins
+     * agent.on('callStarted', () => {
+     *   agent.enableAudioCapture({
+     *     source: 'agent',
+     *     format: 'opus-webm',
+     *     chunkSize: 100, // 100ms chunks
+     *     callback: (audioData, metadata) => {
+     *       console.log(`Audio from ${metadata.participant}: ${audioData.byteLength} bytes`);
+     *
+     *       // Send to third-party service via WebSocket
+     *       thirdPartyWebSocket.send(audioData);
+     *
+     *       // Or via HTTP
+     *       fetch('https://api.example.com/audio', {
+     *         method: 'POST',
+     *         body: audioData,
+     *         headers: {
+     *           'Content-Type': 'audio/webm',
+     *           'X-Participant': metadata.participant,
+     *           'X-Timestamp': metadata.timestamp.toString()
+     *         }
+     *       });
+     *     }
+     *   });
+     * });
+     * ```
+     *
+     * @example Capture both agent and user for custom processing
+     * ```typescript
+     * agent.enableAudioCapture({
+     *   source: 'both',
+     *   format: 'pcm-f32',
+     *   bufferSize: 4096,
+     *   callback: (audioData, metadata) => {
+     *     const samples = audioData as Float32Array;
+     *
+     *     if (metadata.source === 'agent') {
+     *       // Process agent audio
+     *       analyzeAgentVoice(samples, metadata.sampleRate);
+     *     } else {
+     *       // Process user audio
+     *       analyzeUserVoice(samples, metadata.sampleRate);
+     *     }
+     *
+     *     // Save to custom recorder
+     *     customRecorder.addAudioChunk({
+     *       source: metadata.source,
+     *       samples,
+     *       sampleRate: metadata.sampleRate,
+     *       timestamp: metadata.timestamp
+     *     });
+     *   }
+     * });
+     * ```
+     *
+     * @example Real-time transcription integration
+     * ```typescript
+     * import { AudioCaptureOptions } from '@hamsa-ai/voice-agents-sdk';
+     *
+     * const transcriptionService = new WebSocket('wss://transcription.example.com');
+     *
+     * agent.enableAudioCapture({
+     *   source: 'user',
+     *   format: 'opus-webm',
+     *   chunkSize: 50, // Lower latency for real-time transcription
+     *   callback: (audioData, metadata) => {
+     *     // Forward user audio to transcription service
+     *     transcriptionService.send(JSON.stringify({
+     *       audio: Array.from(new Uint8Array(audioData as ArrayBuffer)),
+     *       timestamp: metadata.timestamp,
+     *       sampleRate: metadata.sampleRate
+     *     }));
+     *   }
+     * });
+     *
+     * transcriptionService.onmessage = (event) => {
+     *   const transcription = JSON.parse(event.data);
+     *   console.log('Real-time transcription:', transcription.text);
+     *   displayUserSpeech(transcription.text);
+     * };
+     * ```
+     */
+    enableAudioCapture(options: AudioCaptureOptions): void;
+    /**
+     * Disables audio capture and releases all capture resources
+     *
+     * Stops all active audio capture, cleans up MediaRecorders and audio processors,
+     * and releases associated resources. Safe to call even if audio capture is not enabled.
+     *
+     * @example
+     * ```typescript
+     * // Stop capturing when call ends
+     * agent.on('callEnded', () => {
+     *   agent.disableAudioCapture();
+     *   console.log('Audio capture stopped');
+     * });
+     *
+     * // Or stop manually
+     * stopCaptureButton.addEventListener('click', () => {
+     *   agent.disableAudioCapture();
+     * });
+     * ```
+     */
+    disableAudioCapture(): void;
     /**
      * Initiates a new voice agent conversation
      *
@@ -595,7 +747,7 @@ declare class HamsaVoiceAgent extends EventEmitter {
      * await agent.start({ agentId: 'my_agent', voiceEnablement: true });
      * ```
      */
-    start({ agentId, params, voiceEnablement, tools, userId: _userId, preferHeadphonesForIosDevices: _preferHeadphonesForIosDevices, connectionDelay: _connectionDelay, disableWakeLock: _disableWakeLock, }: StartOptions): Promise<void>;
+    start({ agentId, params, voiceEnablement, tools, userId: _userId, preferHeadphonesForIosDevices: _preferHeadphonesForIosDevices, connectionDelay: _connectionDelay, disableWakeLock: _disableWakeLock, onAudioData, captureAudio, }: StartOptions): Promise<void>;
     /**
      * Terminates the current voice agent conversation
      *
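Taken together, the widened `start()` signature means the Level 2 path needs no separate `enableAudioCapture()` call: capture rides along with `start()` and is torn down on `callEnded`. A minimal end-to-end sketch; the `callEnded` event name and `disableAudioCapture()` teardown come from the JSDoc above, while the `agentId` and agent construction are placeholders:

```typescript
import { HamsaVoiceAgent } from '@hamsa-ai/voice-agents-sdk';

declare const agent: HamsaVoiceAgent; // construction omitted; see the SDK README

async function runWithPcmCapture(): Promise<void> {
  await agent.start({
    agentId: 'YOUR_AGENT_ID', // placeholder
    voiceEnablement: true,
    // Level 2 API: both sides as raw PCM, configured inline with start().
    captureAudio: {
      source: 'both',
      format: 'pcm-f32',
      bufferSize: 4096,
      onData: (audioData, metadata) => {
        const samples = audioData as Float32Array;
        console.log(metadata.source, samples.length, 'samples');
      },
    },
  });

  // Mirror the teardown shown in the disableAudioCapture() docs.
  agent.on('callEnded', () => agent.disableAudioCapture());
}
```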