@telnyx/ai-agent-lib 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -149,6 +149,7 @@ function VoiceChat() {
149
149
  | `versionId` | `string` | ❌ | `"main"` | Agent version to use |
150
150
  | `environment` | `"production" \| "development"` | ❌ | `"production"` | Telnyx environment |
151
151
  | `debug` | `boolean` | ❌ | `false` | Enable debug logging |
152
+ | `vad` | `VADOptions` | ❌ | See below | Voice Activity Detection configuration |
152
153
 
153
154
  ### Debug Logging
154
155
 
@@ -339,21 +340,81 @@ agent.on('conversation.update', (notification) => {
339
340
  The library automatically measures round-trip latency using client-side Voice Activity Detection (VAD). This provides accurate timing from when the user stops speaking until the agent's response audio begins.
340
341
 
341
342
  **How it works:**
342
- 1. **Local VAD (User's microphone)**: Monitors the user's audio stream. After detecting 1 second of silence following speech, the library records `thinkingStartedAt` timestamp and transitions to "thinking" state.
343
- 2. **Remote VAD (Agent's audio)**: Monitors the agent's audio stream. When audio volume crosses the threshold, the library calculates `latencyMs` as the time elapsed since `thinkingStartedAt` and transitions to "speaking" state.
343
+ 1. **Local VAD (User's microphone)**: Monitors the user's audio stream. After detecting silence following speech, the library records `thinkingStartedAt` timestamp and transitions to "thinking" state.
344
+ 2. **Remote VAD (Agent's audio)**: Monitors the agent's audio stream. When audio volume crosses the threshold, the library calculates `userPerceivedLatencyMs` as the time elapsed since the user went silent and transitions to "speaking" state.
345
+ 3. **Greeting Latency**: For the first agent speech (greeting), the library calculates `greetingLatencyMs` from when audio stream monitoring started.
344
346
 
345
- **Configuration constants:**
346
- - Volume threshold: 10 (frequency data average)
347
- - Silence duration: 1000ms (time of silence before triggering "thinking" state)
348
- - Check interval: 20ms (polling frequency for local audio)
347
+ **VAD Configuration:**
348
+
349
+ The VAD behavior can be customized using the `vad` option:
350
+
351
+ ```typescript
352
+ // React
353
+ <TelnyxAIAgentProvider
354
+ agentId="your-agent-id"
355
+ vad={{
356
+ volumeThreshold: 10, // 0-255, audio level to detect speech
357
+ silenceDurationMs: 500, // ms of silence before "thinking" state
358
+ minSpeechDurationMs: 100, // min ms of speech to count as real (filters noise)
359
+ maxLatencyMs: 15000, // ignore latency above this (optional, filters stale)
360
+ }}
361
+ >
362
+
363
+ // Direct usage
364
+ const agent = new TelnyxAIAgent({
365
+ agentId: 'your-agent-id',
366
+ vad: {
367
+ volumeThreshold: 10,
368
+ silenceDurationMs: 500,
369
+ minSpeechDurationMs: 100,
370
+ maxLatencyMs: 15000,
371
+ },
372
+ });
373
+ ```
374
+
375
+ **VAD Options:**
376
+
377
+ | Option | Type | Default | Description |
378
+ |--------|------|---------|-------------|
379
+ | `volumeThreshold` | `number` | `10` | Audio level (0-255) to detect speech |
380
+ | `silenceDurationMs` | `number` | `500` | Silence duration before triggering "thinking" state |
381
+ | `minSpeechDurationMs` | `number` | `100` | Minimum speech duration to count as real user speech (filters brief noise) |
382
+ | `maxLatencyMs` | `number` | `undefined` | Maximum latency to report (values above are ignored as stale) |
383
+
384
+ **Tuning for different scenarios:**
385
+
386
+ ```typescript
387
+ // Fast-paced conversation (aggressive turn detection)
388
+ vad: {
389
+ silenceDurationMs: 500,
390
+ minSpeechDurationMs: 80,
391
+ }
392
+
393
+ // Thoughtful conversation (tolerant of pauses)
394
+ vad: {
395
+ silenceDurationMs: 1500,
396
+ minSpeechDurationMs: 150,
397
+ }
398
+
399
+ // Noisy environment
400
+ vad: {
401
+ volumeThreshold: 20,
402
+ minSpeechDurationMs: 200,
403
+ }
404
+ ```
405
+
406
+ **Note:** Silence-based VAD has inherent tradeoffs. Lower `silenceDurationMs` values detect turn-endings faster but may cut off natural pauses ("I need to... think about that"). Higher values are more tolerant but add latency. For production use cases requiring precise turn detection, consider integrating server-side semantic endpointing.
349
407
 
350
408
  ```tsx
351
409
  const agentState = useAgentState();
352
410
 
353
411
  // Access latency when agent starts speaking
354
412
  useEffect(() => {
355
- if (agentState.latencyMs !== undefined) {
356
- console.log(`Response latency: ${agentState.latencyMs}ms`);
413
+ if (agentState.greetingLatencyMs !== undefined) {
414
+ console.log(`Greeting latency: ${agentState.greetingLatencyMs}ms`);
415
+ }
416
+ if (agentState.userPerceivedLatencyMs !== undefined) {
417
+ console.log(`Response latency: ${agentState.userPerceivedLatencyMs}ms`);
357
418
  }
358
419
  if (agentState.thinkingStartedAt) {
359
420
  console.log(`Started thinking at: ${agentState.thinkingStartedAt}`);
@@ -446,9 +507,12 @@ type TranscriptItem = {
446
507
  // Agent state with optional latency information
447
508
  type AgentStateData = {
448
509
  state: "speaking" | "listening" | "thinking";
449
- // Round-trip latency in ms from when user stopped speaking until agent response began.
510
+ // Latency in ms from when user stopped speaking until agent response began.
450
511
  // Only present when state is "speaking"
451
- latencyMs?: number;
512
+ userPerceivedLatencyMs?: number;
513
+ // Latency in ms for the initial agent greeting (first speech).
514
+ // Only present on first "speaking" state
515
+ greetingLatencyMs?: number;
452
516
  // UTC timestamp (ISO 8601) when user stopped speaking and thinking state began.
453
517
  // Only present when state is "thinking"
454
518
  thinkingStartedAt?: string;
@@ -501,8 +565,11 @@ agent.on('conversation.agent.state', (data) => {
501
565
  break;
502
566
  case 'speaking':
503
567
  // Show speaking indicator (e.g., animated waveform)
504
- if (data.latencyMs !== undefined) {
505
- console.log(`Response latency: ${data.latencyMs}ms`);
568
+ if (data.greetingLatencyMs !== undefined) {
569
+ console.log(`Greeting latency: ${data.greetingLatencyMs}ms`);
570
+ }
571
+ if (data.userPerceivedLatencyMs !== undefined) {
572
+ console.log(`Response latency: ${data.userPerceivedLatencyMs}ms`);
506
573
  // Track latency for analytics or display to user
507
574
  }
508
575
  break;
@@ -585,8 +652,8 @@ function ConversationMonitor() {
585
652
  // Subscribe to events for additional handling beyond the built-in hooks
586
653
  useEffect(() => {
587
654
  const handleAgentState = (data: AgentStateData) => {
588
- if (data.latencyMs !== undefined) {
589
- setLatencyHistory(prev => [...prev, data.latencyMs!]);
655
+ if (data.userPerceivedLatencyMs !== undefined) {
656
+ setLatencyHistory(prev => [...prev, data.userPerceivedLatencyMs!]);
590
657
  }
591
658
  };
592
659
 
@@ -1,3 +1,4 @@
1
+ import type { VADOptions } from "./types";
1
2
  export declare class AudioStreamMonitor {
2
3
  private remoteIntervalId;
3
4
  private localIntervalId;
@@ -13,7 +14,15 @@ export declare class AudioStreamMonitor {
13
14
  private lastState;
14
15
  private userIsSpeaking;
15
16
  private lastUserAudioTime;
16
- constructor();
17
+ private userSpeechStartTime;
18
+ private userSilenceStartTime;
19
+ private isFirstAgentSpeech;
20
+ private monitorStartTime;
21
+ private volumeThreshold;
22
+ private silenceDurationMs;
23
+ private minSpeechDurationMs;
24
+ private maxLatencyMs;
25
+ constructor(options?: VADOptions);
17
26
  private updateAgentState;
18
27
  /**
19
28
  * Set the remote audio stream (agent's voice) to monitor for speech detection
package/dist/client.d.ts CHANGED
@@ -1,12 +1,16 @@
1
1
  import { Call } from "@telnyx/webrtc";
2
2
  import EventEmitter from "eventemitter3";
3
- import type { AIAgentEvents, TranscriptItem } from "./types";
3
+ import type { AIAgentEvents, TranscriptItem, VADOptions } from "./types";
4
4
  export type TelnyxAIAgentConstructorParams = {
5
5
  agentId: string;
6
6
  versionId?: string;
7
7
  environment?: "production" | "development";
8
8
  debug?: boolean;
9
9
  trickleIce?: boolean;
10
+ /**
11
+ * Voice Activity Detection options for controlling speech detection and latency measurement.
12
+ */
13
+ vad?: VADOptions;
10
14
  };
11
15
  export declare class TelnyxAIAgent extends EventEmitter<AIAgentEvents> {
12
16
  private telnyxRTC;