@hamsa-ai/voice-agents-sdk 0.4.5 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +216 -0
- package/dist/index.cjs.js +1 -1
- package/dist/index.cjs.js.map +1 -1
- package/dist/index.esm.js +1 -1
- package/dist/index.esm.js.map +1 -1
- package/dist/index.umd.js +1 -1
- package/dist/index.umd.js.map +1 -1
- package/package.json +1 -1
- package/types/classes/livekit-audio-manager.d.ts +60 -1
- package/types/classes/types.d.ts +48 -0
- package/types/main.d.ts +154 -2
package/README.md
CHANGED
|
@@ -197,6 +197,205 @@ function createOutputVisualizer() {
|
|
|
197
197
|
}
|
|
198
198
|
```
|
|
199
199
|
|
|
200
|
+
### Audio Capture
|
|
201
|
+
|
|
202
|
+
Capture audio data from the agent or user, either compressed (`opus-webm`) or as raw PCM, for forwarding to third-party services, custom recording, or advanced audio processing.
|
|
203
|
+
|
|
204
|
+
The SDK provides **three levels of API** for different use cases:
|
|
205
|
+
|
|
206
|
+
#### Level 1: Simple Callback (Recommended for Most Users)
|
|
207
|
+
|
|
208
|
+
The easiest way is to pass an `onAudioData` callback to `start()`:
|
|
209
|
+
|
|
210
|
+
```javascript
|
|
211
|
+
// Dead simple - captures agent audio automatically
|
|
212
|
+
await agent.start({
|
|
213
|
+
agentId: 'agent-123',
|
|
214
|
+
voiceEnablement: true,
|
|
215
|
+
onAudioData: (audioData) => {
|
|
216
|
+
// Send to third-party service
|
|
217
|
+
thirdPartyWebSocket.send(audioData);
|
|
218
|
+
}
|
|
219
|
+
});
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
This automatically:
|
|
223
|
+
- ✅ Captures **agent audio** only
|
|
224
|
+
- ✅ Uses **opus-webm** format (efficient, compressed)
|
|
225
|
+
- ✅ Delivers **100ms chunks** (good balance of latency/efficiency)
|
|
226
|
+
- ✅ Starts immediately when call connects
|
|
227
|
+
- ✅ No timing issues or event handling needed
|
|
228
|
+
|
|
229
|
+
#### Level 2: Inline Configuration
|
|
230
|
+
|
|
231
|
+
Need more control? Pass a `captureAudio` options object to `start()`:
|
|
232
|
+
|
|
233
|
+
```javascript
|
|
234
|
+
await agent.start({
|
|
235
|
+
agentId: 'agent-123',
|
|
236
|
+
voiceEnablement: true,
|
|
237
|
+
captureAudio: {
|
|
238
|
+
source: 'both', // Capture both agent and user
|
|
239
|
+
format: 'pcm-f32', // Raw PCM for processing
|
|
240
|
+
bufferSize: 4096,
|
|
241
|
+
onData: (audioData, metadata) => {
|
|
242
|
+
if (metadata.source === 'agent') {
|
|
243
|
+
processAgentAudio(audioData);
|
|
244
|
+
} else {
|
|
245
|
+
processUserAudio(audioData);
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
});
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
#### Level 3: Dynamic Control
|
|
253
|
+
|
|
254
|
+
For advanced users who need runtime control, call `enableAudioCapture()` and `disableAudioCapture()` after `start()`:
|
|
255
|
+
|
|
256
|
+
```javascript
|
|
257
|
+
// Start without capture
|
|
258
|
+
await agent.start({
|
|
259
|
+
agentId: 'agent-123',
|
|
260
|
+
voiceEnablement: true
|
|
261
|
+
});
|
|
262
|
+
|
|
263
|
+
// Enable capture later, conditionally
|
|
264
|
+
if (userWantsRecording) {
|
|
265
|
+
agent.enableAudioCapture({
|
|
266
|
+
source: 'agent',
|
|
267
|
+
format: 'opus-webm',
|
|
268
|
+
chunkSize: 100,
|
|
269
|
+
callback: (audioData, metadata) => {
|
|
270
|
+
thirdPartyWebSocket.send(audioData);
|
|
271
|
+
}
|
|
272
|
+
});
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Disable when done
|
|
276
|
+
agent.disableAudioCapture();
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
#### Audio Capture Formats
|
|
280
|
+
|
|
281
|
+
The SDK supports three audio formats:
|
|
282
|
+
|
|
283
|
+
1. **`opus-webm`** (default, recommended)
|
|
284
|
+
- Efficient Opus codec in WebM container
|
|
285
|
+
- Small file size, good quality
|
|
286
|
+
- Best for forwarding to services or recording
|
|
287
|
+
- `audioData` is an `ArrayBuffer`
|
|
288
|
+
|
|
289
|
+
2. **`pcm-f32`**
|
|
290
|
+
- Raw PCM audio as Float32Array
|
|
291
|
+
- Values range from -1.0 to 1.0
|
|
292
|
+
- Best for audio analysis or DSP
|
|
293
|
+
- `audioData` is a `Float32Array`
|
|
294
|
+
|
|
295
|
+
3. **`pcm-i16`**
|
|
296
|
+
- Raw PCM audio as Int16Array
|
|
297
|
+
- Values range from -32768 to 32767
|
|
298
|
+
- Best for compatibility with audio APIs
|
|
299
|
+
- `audioData` is an `Int16Array`
|
|
300
|
+
|
|
301
|
+
#### Common Use Cases
|
|
302
|
+
|
|
303
|
+
**Forward agent audio to third-party service:**
|
|
304
|
+
```javascript
|
|
305
|
+
const socket = new WebSocket('wss://your-service.com/audio');
|
|
306
|
+
|
|
307
|
+
agent.enableAudioCapture({
|
|
308
|
+
source: 'agent',
|
|
309
|
+
format: 'opus-webm',
|
|
310
|
+
chunkSize: 100,
|
|
311
|
+
callback: (audioData, metadata) => {
|
|
312
|
+
socket.send(audioData);
|
|
313
|
+
}
|
|
314
|
+
});
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
**Capture both agent and user audio:**
|
|
318
|
+
```javascript
|
|
319
|
+
agent.enableAudioCapture({
|
|
320
|
+
source: 'both',
|
|
321
|
+
format: 'opus-webm',
|
|
322
|
+
chunkSize: 100,
|
|
323
|
+
callback: (audioData, metadata) => {
|
|
324
|
+
if (metadata.source === 'agent') {
|
|
325
|
+
processAgentAudio(audioData);
|
|
326
|
+
} else {
|
|
327
|
+
processUserAudio(audioData);
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
});
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
**Advanced: Custom audio analysis with PCM:**
|
|
334
|
+
```javascript
|
|
335
|
+
agent.enableAudioCapture({
|
|
336
|
+
source: 'agent',
|
|
337
|
+
format: 'pcm-f32',
|
|
338
|
+
bufferSize: 4096,
|
|
339
|
+
callback: (audioData, metadata) => {
|
|
340
|
+
const samples = audioData; // Float32Array
|
|
341
|
+
|
|
342
|
+
// Calculate RMS volume
|
|
343
|
+
let sum = 0;
|
|
344
|
+
for (let i = 0; i < samples.length; i++) {
|
|
345
|
+
sum += samples[i] * samples[i];
|
|
346
|
+
}
|
|
347
|
+
const rms = Math.sqrt(sum / samples.length);
|
|
348
|
+
|
|
349
|
+
console.log('Agent voice level:', rms);
|
|
350
|
+
|
|
351
|
+
// Apply custom DSP, analyze frequencies, etc.
|
|
352
|
+
customAudioProcessor.process(samples, metadata.sampleRate);
|
|
353
|
+
}
|
|
354
|
+
});
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
**Real-time transcription:**
|
|
358
|
+
```javascript
|
|
359
|
+
const transcriptionWS = new WebSocket('wss://transcription-service.com');
|
|
360
|
+
|
|
361
|
+
agent.enableAudioCapture({
|
|
362
|
+
source: 'user',
|
|
363
|
+
format: 'opus-webm',
|
|
364
|
+
chunkSize: 50, // Lower latency
|
|
365
|
+
callback: (audioData, metadata) => {
|
|
366
|
+
transcriptionWS.send(JSON.stringify({
|
|
367
|
+
audio: Array.from(new Uint8Array(audioData)),
|
|
368
|
+
timestamp: metadata.timestamp,
|
|
369
|
+
participant: metadata.participant
|
|
370
|
+
}));
|
|
371
|
+
}
|
|
372
|
+
});
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
**TypeScript support:**
|
|
376
|
+
```typescript
|
|
377
|
+
import { AudioCaptureOptions, AudioCaptureMetadata } from '@hamsa-ai/voice-agents-sdk';
|
|
378
|
+
|
|
379
|
+
const options: AudioCaptureOptions = {
|
|
380
|
+
source: 'agent',
|
|
381
|
+
format: 'pcm-f32',
|
|
382
|
+
bufferSize: 4096,
|
|
383
|
+
callback: (audioData: Float32Array | Int16Array | ArrayBuffer, metadata: AudioCaptureMetadata) => {
|
|
384
|
+
console.log('Audio captured:', {
|
|
385
|
+
participant: metadata.participant,
|
|
386
|
+
source: metadata.source, // 'agent' | 'user'
|
|
387
|
+
trackId: metadata.trackId,
|
|
388
|
+
timestamp: metadata.timestamp,
|
|
389
|
+
sampleRate: metadata.sampleRate, // For PCM formats
|
|
390
|
+
channels: metadata.channels, // For PCM formats
|
|
391
|
+
format: metadata.format
|
|
392
|
+
});
|
|
393
|
+
}
|
|
394
|
+
};
|
|
395
|
+
|
|
396
|
+
agent.enableAudioCapture(options);
|
|
397
|
+
```
|
|
398
|
+
|
|
200
399
|
|
|
201
400
|
## Advanced Configuration Options
|
|
202
401
|
|
|
@@ -586,6 +785,8 @@ The SDK includes comprehensive TypeScript definitions with detailed analytics in
|
|
|
586
785
|
import {
|
|
587
786
|
HamsaVoiceAgent,
|
|
588
787
|
AgentState,
|
|
788
|
+
AudioCaptureOptions,
|
|
789
|
+
AudioCaptureMetadata,
|
|
589
790
|
CallAnalyticsResult,
|
|
590
791
|
ParticipantData,
|
|
591
792
|
CustomEventMetadata,
|
|
@@ -609,6 +810,21 @@ const isMuted = agent.isMicMuted(); // boolean
|
|
|
609
810
|
const inputFreqData = agent.getInputByteFrequencyData(); // Uint8Array
|
|
610
811
|
const outputFreqData = agent.getOutputByteFrequencyData(); // Uint8Array
|
|
611
812
|
|
|
813
|
+
// Audio capture with full type safety
|
|
814
|
+
agent.enableAudioCapture({
|
|
815
|
+
source: 'agent',
|
|
816
|
+
format: 'opus-webm',
|
|
817
|
+
chunkSize: 100,
|
|
818
|
+
callback: (audioData: ArrayBuffer | Float32Array | Int16Array, metadata: AudioCaptureMetadata) => {
|
|
819
|
+
// Full TypeScript autocomplete for metadata
|
|
820
|
+
console.log(metadata.participant); // string
|
|
821
|
+
console.log(metadata.source); // 'agent' | 'user'
|
|
822
|
+
console.log(metadata.timestamp); // number
|
|
823
|
+
console.log(metadata.trackId); // string
|
|
824
|
+
console.log(metadata.sampleRate); // number | undefined
|
|
825
|
+
}
|
|
826
|
+
});
|
|
827
|
+
|
|
612
828
|
// Strongly typed start options with all advanced features
|
|
613
829
|
await agent.start({
|
|
614
830
|
agentId: "agent-id",
|