@hamsa-ai/voice-agents-sdk 0.4.5 → 0.4.6-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -197,6 +197,205 @@ function createOutputVisualizer() {
  }
  ```

+ ### Audio Capture
+
+ Capture raw audio data from the agent or the user for forwarding to third-party services, custom recording, or advanced audio processing.
+
+ The SDK provides **three levels of API** for different use cases:
+
+ #### Level 1: Simple Callback (Recommended for Most Users)
+
+ The easiest way is to pass a callback to `start()`:
+
+ ```javascript
+ // Simplest usage - captures agent audio automatically
+ await agent.start({
+   agentId: 'agent-123',
+   voiceEnablement: true,
+   onAudioData: (audioData) => {
+     // Send to a third-party service
+     thirdPartyWebSocket.send(audioData);
+   }
+ });
+ ```
+
+ This automatically:
+ - ✅ Captures **agent audio** only
+ - ✅ Uses the **opus-webm** format (efficient, compressed)
+ - ✅ Delivers **100ms chunks** (a good balance of latency and efficiency)
+ - ✅ Starts immediately when the call connects
+ - ✅ No manual timing or event handling needed
+
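+ Because the defaults above deliver `opus-webm` chunks as `ArrayBuffer`s, the "custom recording" case can be covered by buffering the chunks and assembling them into a single Blob once the call is over. The following is a minimal sketch, assuming `agent` is a connected `HamsaVoiceAgent` as in the examples above and that the concatenated chunks form one continuous WebM stream; `saveRecording` is only an illustrative helper name:
+
+ ```typescript
+ // Collect every chunk the SDK hands to onAudioData
+ const recordedChunks: BlobPart[] = [];
+
+ await agent.start({
+   agentId: 'agent-123',
+   voiceEnablement: true,
+   onAudioData: (audioData) => {
+     // With the default opus-webm format, audioData is an ArrayBuffer
+     recordedChunks.push(audioData);
+   }
+ });
+
+ // After the call finishes, assemble a playable Blob and offer it for download
+ function saveRecording() {
+   const blob = new Blob(recordedChunks, { type: 'audio/webm' });
+   const url = URL.createObjectURL(blob);
+   const link = document.createElement('a');
+   link.href = url;
+   link.download = 'agent-audio.webm';
+   link.click();
+   URL.revokeObjectURL(url);
+ }
+ ```
+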
+ #### Level 2: Inline Configuration
+
+ Need more control? Use `captureAudio` options:
+
+ ```javascript
+ await agent.start({
+   agentId: 'agent-123',
+   voiceEnablement: true,
+   captureAudio: {
+     source: 'both',     // Capture both agent and user
+     format: 'pcm-f32',  // Raw PCM for processing
+     bufferSize: 4096,
+     onData: (audioData, metadata) => {
+       if (metadata.source === 'agent') {
+         processAgentAudio(audioData);
+       } else {
+         processUserAudio(audioData);
+       }
+     }
+   }
+ });
+ ```
+
+ #### Level 3: Dynamic Control
+
+ For advanced users who need runtime control:
+
+ ```javascript
+ // Start without capture
+ await agent.start({
+   agentId: 'agent-123',
+   voiceEnablement: true
+ });
+
+ // Enable capture later, conditionally
+ if (userWantsRecording) {
+   agent.enableAudioCapture({
+     source: 'agent',
+     format: 'opus-webm',
+     chunkSize: 100,
+     callback: (audioData, metadata) => {
+       thirdPartyWebSocket.send(audioData);
+     }
+   });
+ }
+
+ // Disable when done
+ agent.disableAudioCapture();
+ ```
+
+ #### Audio Capture Formats
+
+ The SDK supports three audio formats:
+
+ 1. **`opus-webm`** (default, recommended)
+    - Efficient Opus codec in a WebM container
+    - Small file size, good quality
+    - Best for forwarding to services or recording
+    - `audioData` is an `ArrayBuffer`
+
+ 2. **`pcm-f32`**
+    - Raw PCM audio as a Float32Array
+    - Values range from -1.0 to 1.0
+    - Best for audio analysis or DSP
+    - `audioData` is a `Float32Array`
+
+ 3. **`pcm-i16`**
+    - Raw PCM audio as an Int16Array
+    - Values range from -32768 to 32767
+    - Best for compatibility with audio APIs that expect 16-bit samples
+    - `audioData` is an `Int16Array`
+
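+ The PCM formats also populate `metadata.sampleRate` and `metadata.channels`, so you can describe the stream to whatever consumes it. If a downstream consumer expects the other PCM representation, converting is a few lines of arithmetic based on the value ranges above. This is a minimal sketch, not an SDK API; `sendToPcm16Consumer` is a hypothetical downstream function:
+
+ ```typescript
+ // Convert pcm-f32 samples (-1.0..1.0) to pcm-i16 samples (-32768..32767)
+ function floatToInt16(samples: Float32Array): Int16Array {
+   const out = new Int16Array(samples.length);
+   for (let i = 0; i < samples.length; i++) {
+     const s = Math.max(-1, Math.min(1, samples[i])); // clamp first
+     out[i] = s < 0 ? s * 32768 : s * 32767;
+   }
+   return out;
+ }
+
+ agent.enableAudioCapture({
+   source: 'agent',
+   format: 'pcm-f32',
+   bufferSize: 4096,
+   callback: (audioData, metadata) => {
+     const pcm16 = floatToInt16(audioData as Float32Array);
+     sendToPcm16Consumer(pcm16, metadata.sampleRate);
+   }
+ });
+ ```
+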
+ #### Common Use Cases
+
+ **Forward agent audio to a third-party service:**
+ ```javascript
+ const socket = new WebSocket('wss://your-service.com/audio');
+
+ agent.enableAudioCapture({
+   source: 'agent',
+   format: 'opus-webm',
+   chunkSize: 100,
+   callback: (audioData, metadata) => {
+     socket.send(audioData);
+   }
+ });
+ ```
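+
+ Note that capture can begin delivering chunks before the WebSocket above has finished connecting. A defensive variant (a sketch, not an SDK requirement) queues chunks until the socket is open:
+
+ ```typescript
+ const pending: ArrayBuffer[] = [];
+
+ // Flush anything captured while the socket was still connecting
+ socket.addEventListener('open', () => {
+   pending.forEach((chunk) => socket.send(chunk));
+   pending.length = 0;
+ });
+
+ agent.enableAudioCapture({
+   source: 'agent',
+   format: 'opus-webm',
+   chunkSize: 100,
+   callback: (audioData) => {
+     if (socket.readyState === WebSocket.OPEN) {
+       socket.send(audioData);
+     } else {
+       pending.push(audioData as ArrayBuffer);
+     }
+   }
+ });
+ ```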
+
+ **Capture both agent and user audio:**
+ ```javascript
+ agent.enableAudioCapture({
+   source: 'both',
+   format: 'opus-webm',
+   chunkSize: 100,
+   callback: (audioData, metadata) => {
+     if (metadata.source === 'agent') {
+       processAgentAudio(audioData);
+     } else {
+       processUserAudio(audioData);
+     }
+   }
+ });
+ ```
+
+ **Advanced: Custom audio analysis with PCM:**
+ ```javascript
+ agent.enableAudioCapture({
+   source: 'agent',
+   format: 'pcm-f32',
+   bufferSize: 4096,
+   callback: (audioData, metadata) => {
+     const samples = audioData; // Float32Array
+
+     // Calculate RMS volume
+     let sum = 0;
+     for (let i = 0; i < samples.length; i++) {
+       sum += samples[i] * samples[i];
+     }
+     const rms = Math.sqrt(sum / samples.length);
+
+     console.log('Agent voice level:', rms);
+
+     // Apply custom DSP, analyze frequencies, etc.
+     customAudioProcessor.process(samples, metadata.sampleRate);
+   }
+ });
+ ```
+
+ **Real-time transcription:**
+ ```javascript
+ const transcriptionWS = new WebSocket('wss://transcription-service.com');
+
+ agent.enableAudioCapture({
+   source: 'user',
+   format: 'opus-webm',
+   chunkSize: 50, // Lower latency
+   callback: (audioData, metadata) => {
+     transcriptionWS.send(JSON.stringify({
+       audio: Array.from(new Uint8Array(audioData)),
+       timestamp: metadata.timestamp,
+       participant: metadata.participant
+     }));
+   }
+ });
+ ```
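+
+ Serializing each chunk as a JSON array of bytes keeps the example transport-agnostic, but it inflates every payload several-fold. If the transcription service accepts binary WebSocket frames (an assumption about the service, not about the SDK), a leaner variant forwards the chunk as-is:
+
+ ```typescript
+ agent.enableAudioCapture({
+   source: 'user',
+   format: 'opus-webm',
+   chunkSize: 50,
+   callback: (audioData) => {
+     // Sent as a binary frame; any metadata the service needs would have to
+     // travel over whatever framing protocol the service defines
+     transcriptionWS.send(audioData);
+   }
+ });
+ ```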
+
+ **TypeScript support:**
+ ```typescript
+ import { AudioCaptureOptions, AudioCaptureMetadata } from '@hamsa-ai/voice-agents-sdk';
+
+ const options: AudioCaptureOptions = {
+   source: 'agent',
+   format: 'pcm-f32',
+   bufferSize: 4096,
+   callback: (audioData: Float32Array | Int16Array | ArrayBuffer, metadata: AudioCaptureMetadata) => {
+     console.log('Audio captured:', {
+       participant: metadata.participant,
+       source: metadata.source,         // 'agent' | 'user'
+       trackId: metadata.trackId,
+       timestamp: metadata.timestamp,
+       sampleRate: metadata.sampleRate, // For PCM formats
+       channels: metadata.channels,     // For PCM formats
+       format: metadata.format
+     });
+   }
+ };
+
+ agent.enableAudioCapture(options);
+ ```
+

  ## Advanced Configuration Options

@@ -586,6 +785,8 @@ The SDK includes comprehensive TypeScript definitions with detailed analytics in
  import {
    HamsaVoiceAgent,
    AgentState,
+   AudioCaptureOptions,
+   AudioCaptureMetadata,
    CallAnalyticsResult,
    ParticipantData,
    CustomEventMetadata,
@@ -609,6 +810,21 @@ const isMuted = agent.isMicMuted(); // boolean
  const inputFreqData = agent.getInputByteFrequencyData(); // Uint8Array
  const outputFreqData = agent.getOutputByteFrequencyData(); // Uint8Array

+ // Audio capture with full type safety
+ agent.enableAudioCapture({
+   source: 'agent',
+   format: 'opus-webm',
+   chunkSize: 100,
+   callback: (audioData: ArrayBuffer | Float32Array | Int16Array, metadata: AudioCaptureMetadata) => {
+     // Full TypeScript autocomplete for metadata
+     console.log(metadata.participant); // string
+     console.log(metadata.source);      // 'agent' | 'user'
+     console.log(metadata.timestamp);   // number
+     console.log(metadata.trackId);     // string
+     console.log(metadata.sampleRate);  // number | undefined
+   }
+ });
+
  // Strongly typed start options with all advanced features
  await agent.start({
    agentId: "agent-id",