@livekit/agents 1.0.36 → 1.0.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/dist/cli.cjs.map +1 -1
  2. package/dist/inference/api_protos.cjs +68 -0
  3. package/dist/inference/api_protos.cjs.map +1 -1
  4. package/dist/inference/api_protos.d.cts +345 -4
  5. package/dist/inference/api_protos.d.ts +345 -4
  6. package/dist/inference/api_protos.d.ts.map +1 -1
  7. package/dist/inference/api_protos.js +60 -0
  8. package/dist/inference/api_protos.js.map +1 -1
  9. package/dist/inference/stt.cjs +32 -21
  10. package/dist/inference/stt.cjs.map +1 -1
  11. package/dist/inference/stt.d.ts.map +1 -1
  12. package/dist/inference/stt.js +34 -21
  13. package/dist/inference/stt.js.map +1 -1
  14. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  15. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  16. package/dist/stt/stt.cjs +10 -0
  17. package/dist/stt/stt.cjs.map +1 -1
  18. package/dist/stt/stt.d.cts +12 -0
  19. package/dist/stt/stt.d.ts +12 -0
  20. package/dist/stt/stt.d.ts.map +1 -1
  21. package/dist/stt/stt.js +10 -0
  22. package/dist/stt/stt.js.map +1 -1
  23. package/dist/telemetry/traces.cjs +4 -3
  24. package/dist/telemetry/traces.cjs.map +1 -1
  25. package/dist/telemetry/traces.d.cts +2 -0
  26. package/dist/telemetry/traces.d.ts +2 -0
  27. package/dist/telemetry/traces.d.ts.map +1 -1
  28. package/dist/telemetry/traces.js +4 -3
  29. package/dist/telemetry/traces.js.map +1 -1
  30. package/dist/utils.cjs +6 -0
  31. package/dist/utils.cjs.map +1 -1
  32. package/dist/utils.d.cts +2 -0
  33. package/dist/utils.d.ts +2 -0
  34. package/dist/utils.d.ts.map +1 -1
  35. package/dist/utils.js +6 -0
  36. package/dist/utils.js.map +1 -1
  37. package/dist/voice/agent.cjs +5 -0
  38. package/dist/voice/agent.cjs.map +1 -1
  39. package/dist/voice/agent.d.ts.map +1 -1
  40. package/dist/voice/agent.js +5 -0
  41. package/dist/voice/agent.js.map +1 -1
  42. package/dist/voice/agent_activity.cjs +49 -23
  43. package/dist/voice/agent_activity.cjs.map +1 -1
  44. package/dist/voice/agent_activity.d.cts +1 -1
  45. package/dist/voice/agent_activity.d.ts +1 -1
  46. package/dist/voice/agent_activity.d.ts.map +1 -1
  47. package/dist/voice/agent_activity.js +50 -24
  48. package/dist/voice/agent_activity.js.map +1 -1
  49. package/dist/voice/agent_session.cjs +7 -5
  50. package/dist/voice/agent_session.cjs.map +1 -1
  51. package/dist/voice/agent_session.d.cts +5 -2
  52. package/dist/voice/agent_session.d.ts +5 -2
  53. package/dist/voice/agent_session.d.ts.map +1 -1
  54. package/dist/voice/agent_session.js +7 -5
  55. package/dist/voice/agent_session.js.map +1 -1
  56. package/dist/voice/audio_recognition.cjs +3 -1
  57. package/dist/voice/audio_recognition.cjs.map +1 -1
  58. package/dist/voice/audio_recognition.d.ts.map +1 -1
  59. package/dist/voice/audio_recognition.js +3 -1
  60. package/dist/voice/audio_recognition.js.map +1 -1
  61. package/dist/voice/avatar/datastream_io.cjs +6 -0
  62. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  63. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  64. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  65. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  66. package/dist/voice/avatar/datastream_io.js +6 -0
  67. package/dist/voice/avatar/datastream_io.js.map +1 -1
  68. package/dist/voice/background_audio.cjs.map +1 -1
  69. package/dist/voice/generation.cjs +14 -5
  70. package/dist/voice/generation.cjs.map +1 -1
  71. package/dist/voice/generation.d.cts +3 -2
  72. package/dist/voice/generation.d.ts +3 -2
  73. package/dist/voice/generation.d.ts.map +1 -1
  74. package/dist/voice/generation.js +14 -5
  75. package/dist/voice/generation.js.map +1 -1
  76. package/dist/voice/io.cjs +12 -0
  77. package/dist/voice/io.cjs.map +1 -1
  78. package/dist/voice/io.d.cts +19 -1
  79. package/dist/voice/io.d.ts +19 -1
  80. package/dist/voice/io.d.ts.map +1 -1
  81. package/dist/voice/io.js +12 -0
  82. package/dist/voice/io.js.map +1 -1
  83. package/dist/voice/recorder_io/recorder_io.cjs +91 -28
  84. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  85. package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
  86. package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
  87. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  88. package/dist/voice/recorder_io/recorder_io.js +91 -28
  89. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  90. package/dist/voice/room_io/_input.cjs +40 -11
  91. package/dist/voice/room_io/_input.cjs.map +1 -1
  92. package/dist/voice/room_io/_input.d.cts +4 -1
  93. package/dist/voice/room_io/_input.d.ts +4 -1
  94. package/dist/voice/room_io/_input.d.ts.map +1 -1
  95. package/dist/voice/room_io/_input.js +31 -2
  96. package/dist/voice/room_io/_input.js.map +1 -1
  97. package/dist/voice/room_io/_output.cjs +6 -0
  98. package/dist/voice/room_io/_output.cjs.map +1 -1
  99. package/dist/voice/room_io/_output.d.cts +1 -0
  100. package/dist/voice/room_io/_output.d.ts +1 -0
  101. package/dist/voice/room_io/_output.d.ts.map +1 -1
  102. package/dist/voice/room_io/_output.js +6 -0
  103. package/dist/voice/room_io/_output.js.map +1 -1
  104. package/dist/voice/room_io/room_io.cjs.map +1 -1
  105. package/dist/voice/room_io/room_io.d.cts +2 -2
  106. package/dist/voice/room_io/room_io.d.ts +2 -2
  107. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  108. package/dist/voice/room_io/room_io.js.map +1 -1
  109. package/dist/voice/speech_handle.cjs +2 -0
  110. package/dist/voice/speech_handle.cjs.map +1 -1
  111. package/dist/voice/speech_handle.d.cts +3 -0
  112. package/dist/voice/speech_handle.d.ts +3 -0
  113. package/dist/voice/speech_handle.d.ts.map +1 -1
  114. package/dist/voice/speech_handle.js +2 -0
  115. package/dist/voice/speech_handle.js.map +1 -1
  116. package/dist/voice/testing/index.cjs +2 -0
  117. package/dist/voice/testing/index.cjs.map +1 -1
  118. package/dist/voice/testing/index.d.cts +1 -1
  119. package/dist/voice/testing/index.d.ts +1 -1
  120. package/dist/voice/testing/index.d.ts.map +1 -1
  121. package/dist/voice/testing/index.js +2 -0
  122. package/dist/voice/testing/index.js.map +1 -1
  123. package/dist/voice/testing/run_result.cjs +294 -5
  124. package/dist/voice/testing/run_result.cjs.map +1 -1
  125. package/dist/voice/testing/run_result.d.cts +149 -1
  126. package/dist/voice/testing/run_result.d.ts +149 -1
  127. package/dist/voice/testing/run_result.d.ts.map +1 -1
  128. package/dist/voice/testing/run_result.js +293 -5
  129. package/dist/voice/testing/run_result.js.map +1 -1
  130. package/package.json +1 -1
  131. package/src/inference/api_protos.ts +83 -0
  132. package/src/inference/stt.ts +39 -22
  133. package/src/stt/stt.ts +21 -0
  134. package/src/telemetry/traces.ts +6 -2
  135. package/src/utils.ts +7 -0
  136. package/src/voice/agent.ts +9 -0
  137. package/src/voice/agent_activity.ts +72 -26
  138. package/src/voice/agent_session.ts +6 -5
  139. package/src/voice/audio_recognition.ts +2 -0
  140. package/src/voice/avatar/datastream_io.ts +8 -0
  141. package/src/voice/generation.ts +24 -12
  142. package/src/voice/io.ts +27 -5
  143. package/src/voice/recorder_io/recorder_io.ts +123 -31
  144. package/src/voice/room_io/_input.ts +32 -4
  145. package/src/voice/room_io/_output.ts +8 -0
  146. package/src/voice/room_io/room_io.ts +3 -1
  147. package/src/voice/speech_handle.ts +4 -0
  148. package/src/voice/testing/index.ts +1 -0
  149. package/src/voice/testing/run_result.ts +373 -12
package/src/voice/io.ts CHANGED
@@ -30,12 +30,14 @@ export type TTSNode = (
30
30
  ) => Promise<ReadableStream<AudioFrame> | null>;
31
31
 
32
32
  /**
33
- * A string with timing information for word-level alignment.
33
+ *A string with optional start and end timestamps for word-level alignment.
34
34
  */
35
35
  export interface TimedString {
36
36
  text: string;
37
37
  startTime?: number; // seconds
38
38
  endTime?: number; // seconds
39
+ confidence?: number;
40
+ startTimeOffset?: number;
39
41
  }
40
42
 
41
43
  export interface AudioOutputCapabilities {
@@ -57,6 +59,7 @@ export abstract class AudioInput {
57
59
  }
58
60
 
59
61
  export abstract class AudioOutput extends EventEmitter {
62
+ static readonly EVENT_PLAYBACK_STARTED = 'playbackStarted';
60
63
  static readonly EVENT_PLAYBACK_FINISHED = 'playbackFinished';
61
64
 
62
65
  private playbackFinishedFuture: Future<void> = new Future();
@@ -77,7 +80,11 @@ export abstract class AudioOutput extends EventEmitter {
77
80
  ) {
78
81
  super();
79
82
  this.capabilities = capabilities;
83
+
80
84
  if (this.nextInChain) {
85
+ this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_STARTED, (ev: PlaybackStartedEvent) =>
86
+ this.onPlaybackStarted(ev.createdAt),
87
+ );
81
88
  this.nextInChain.on(AudioOutput.EVENT_PLAYBACK_FINISHED, (ev: PlaybackFinishedEvent) =>
82
89
  this.onPlaybackFinished(ev),
83
90
  );
@@ -117,6 +124,14 @@ export abstract class AudioOutput extends EventEmitter {
117
124
  return this.lastPlaybackEvent;
118
125
  }
119
126
 
127
+ /**
128
+ * Called when playback actually starts (first frame is sent to output).
129
+ * Developers building audio sinks should call this when the first frame is captured.
130
+ */
131
+ onPlaybackStarted(createdAt: number): void {
132
+ this.emit(AudioOutput.EVENT_PLAYBACK_STARTED, { createdAt } as PlaybackStartedEvent);
133
+ }
134
+
120
135
  /**
121
136
  * Developers building audio sinks must call this method when a playback/segment is finished.
122
137
  * Segments are segmented by calls to flush() or clearBuffer()
@@ -174,15 +189,22 @@ export abstract class AudioOutput extends EventEmitter {
174
189
  }
175
190
 
176
191
  export interface PlaybackFinishedEvent {
177
- // How much of the audio was played back
192
+ /** How much of the audio was played back, in seconds */
178
193
  playbackPosition: number;
179
- // Interrupted is True if playback was interrupted (clearBuffer() was called)
194
+ /** True if playback was interrupted (clearBuffer() was called) */
180
195
  interrupted: boolean;
181
- // Transcript synced with playback; may be partial if the audio was interrupted
182
- // When null, the transcript is not synchronized with the playback
196
+ /**
197
+ * Transcript synced with playback; may be partial if the audio was interrupted.
198
+ * When undefined, the transcript is not synchronized with the playback.
199
+ */
183
200
  synchronizedTranscript?: string;
184
201
  }
185
202
 
203
+ export interface PlaybackStartedEvent {
204
+ /** The timestamp (Date.now()) when the playback started */
205
+ createdAt: number;
206
+ }
207
+
186
208
  export abstract class TextOutput {
187
209
  constructor(protected readonly nextInChain?: TextOutput) {}
188
210
 
@@ -123,7 +123,7 @@ export class RecorderIO {
123
123
  }
124
124
 
125
125
  private writeCb(buf: AudioFrame[]): void {
126
- const inputBuf = this.inRecord!.takeBuf();
126
+ const inputBuf = this.inRecord!.takeBuf(this.outRecord?._lastSpeechEndTime);
127
127
  this.inChan.write(inputBuf);
128
128
  this.outChan.write(buf);
129
129
  }
@@ -137,8 +137,18 @@ export class RecorderIO {
137
137
  }
138
138
 
139
139
  get recordingStartedAt(): number | undefined {
140
- // Use session start time to align with trace timestamps
141
- return this.session._startedAt;
140
+ const inT = this.inRecord?.startedWallTime;
141
+ const outT = this.outRecord?.startedWallTime;
142
+
143
+ if (inT === undefined) {
144
+ return outT;
145
+ }
146
+
147
+ if (outT === undefined) {
148
+ return inT;
149
+ }
150
+
151
+ return Math.min(inT, outT);
142
152
  }
143
153
 
144
154
  /**
@@ -159,7 +169,7 @@ export class RecorderIO {
159
169
  }
160
170
 
161
171
  // Flush input buffer
162
- const inputBuf = this.inRecord!.takeBuf();
172
+ const inputBuf = this.inRecord!.takeBuf(this.outRecord!._lastSpeechEndTime);
163
173
  this.inChan
164
174
  .write(inputBuf)
165
175
  .catch((err) => this.logger.error({ err }, 'Error writing RecorderIO input buffer'));
@@ -359,6 +369,8 @@ class RecorderAudioInput extends AudioInput {
359
369
  private recorderIO: RecorderIO;
360
370
  private accFrames: AudioFrame[] = [];
361
371
  private _startedWallTime?: number;
372
+ private _padded: boolean = false;
373
+ private logger = log();
362
374
 
363
375
  constructor(recorderIO: RecorderIO, source: AudioInput) {
364
376
  super();
@@ -378,10 +390,46 @@ class RecorderAudioInput extends AudioInput {
378
390
 
379
391
  /**
380
392
  * Take accumulated frames and clear the buffer
393
+ * @param padSince - If provided and input started after this time, pad with silence
381
394
  */
382
- takeBuf(): AudioFrame[] {
383
- const frames = this.accFrames;
395
+ takeBuf(padSince?: number): AudioFrame[] {
396
+ let frames = this.accFrames;
384
397
  this.accFrames = [];
398
+
399
+ if (
400
+ padSince !== undefined &&
401
+ this._startedWallTime !== undefined &&
402
+ this._startedWallTime > padSince &&
403
+ !this._padded &&
404
+ frames.length > 0
405
+ ) {
406
+ const padding = this._startedWallTime - padSince;
407
+ this.logger.warn(
408
+ {
409
+ lastAgentSpeechTime: padSince,
410
+ inputStartedTime: this._startedWallTime,
411
+ },
412
+ 'input speech started after last agent speech ended',
413
+ );
414
+ this._padded = true;
415
+ const firstFrame = frames[0]!;
416
+ frames = [
417
+ createSilenceFrame(padding / 1000, firstFrame.sampleRate, firstFrame.channels),
418
+ ...frames,
419
+ ];
420
+ } else if (
421
+ padSince !== undefined &&
422
+ this._startedWallTime === undefined &&
423
+ !this._padded &&
424
+ frames.length === 0
425
+ ) {
426
+ // We could pad with silence here with some fixed SR and channels,
427
+ // but it's better for the user to know that this is happening
428
+ this.logger.warn(
429
+ "input speech hasn't started yet, skipping silence padding, recording may be inaccurate until the speech starts",
430
+ );
431
+ }
432
+
385
433
  return frames;
386
434
  }
387
435
 
@@ -455,6 +503,10 @@ class RecorderAudioOutput extends AudioOutput {
455
503
  private writeFn: (buf: AudioFrame[]) => void;
456
504
  private accFrames: AudioFrame[] = [];
457
505
  private _startedWallTime?: number;
506
+ private _logger = log();
507
+
508
+ _lastSpeechEndTime?: number;
509
+ private _lastSpeechStartTime?: number;
458
510
 
459
511
  // Pause tracking
460
512
  private currentPauseStart?: number;
@@ -508,9 +560,32 @@ class RecorderAudioOutput extends AudioOutput {
508
560
  }
509
561
 
510
562
  onPlaybackFinished(options: PlaybackFinishedEvent): void {
511
- const finishTime = Date.now();
563
+ const finishTime = this.currentPauseStart ?? Date.now();
564
+ const trailingSilenceDuration = Math.max(0, Date.now() - finishTime);
565
+
566
+ // Convert playbackPosition from seconds to ms for internal calculations
567
+ let playbackPosition = options.playbackPosition * 1000;
568
+
569
+ if (this._lastSpeechStartTime === undefined) {
570
+ this._logger.warn(
571
+ {
572
+ finishTime,
573
+ playbackPosition,
574
+ interrupted: options.interrupted,
575
+ },
576
+ 'playback finished before speech started',
577
+ );
578
+ playbackPosition = 0;
579
+ }
580
+
581
+ // Clamp playbackPosition to actual elapsed time (all in ms)
582
+ playbackPosition = Math.max(
583
+ 0,
584
+ Math.min(finishTime - (this._lastSpeechStartTime ?? 0), playbackPosition),
585
+ );
512
586
 
513
- super.onPlaybackFinished(options);
587
+ // Convert back to seconds for the event
588
+ super.onPlaybackFinished({ ...options, playbackPosition: playbackPosition / 1000 });
514
589
 
515
590
  if (!this.recorderIO.recording) {
516
591
  return;
@@ -523,28 +598,29 @@ class RecorderAudioOutput extends AudioOutput {
523
598
 
524
599
  if (this.accFrames.length === 0) {
525
600
  this.resetPauseState();
601
+ this._lastSpeechEndTime = Date.now();
602
+ this._lastSpeechStartTime = undefined;
526
603
  return;
527
604
  }
528
605
 
529
- const playbackPosition = options.playbackPosition;
530
-
606
+ // pauseEvents stores (position, duration) in ms
531
607
  const pauseEvents: Array<[number, number]> = [];
608
+ let playbackStartTime = finishTime - playbackPosition;
532
609
 
533
610
  if (this.pauseWallTimes.length > 0) {
534
611
  const totalPauseDuration = this.pauseWallTimes.reduce(
535
612
  (sum, [start, end]) => sum + (end - start),
536
613
  0,
537
614
  );
538
- // Convert playbackPosition from seconds to milliseconds for wall time calculations
539
- const playbackStartTime = finishTime - playbackPosition * 1000 - totalPauseDuration;
615
+ playbackStartTime = finishTime - playbackPosition - totalPauseDuration;
540
616
 
541
617
  let accumulatedPause = 0;
542
618
  for (const [pauseStart, pauseEnd] of this.pauseWallTimes) {
543
- let position = (pauseStart - playbackStartTime - accumulatedPause) / 1000; // Convert to seconds
544
- const duration = (pauseEnd - pauseStart) / 1000; // Convert to seconds
619
+ let position = pauseStart - playbackStartTime - accumulatedPause;
620
+ const duration = pauseEnd - pauseStart;
545
621
  position = Math.max(0, Math.min(position, playbackPosition));
546
622
  pauseEvents.push([position, duration]);
547
- accumulatedPause += pauseEnd - pauseStart;
623
+ accumulatedPause += duration;
548
624
  }
549
625
  }
550
626
 
@@ -558,10 +634,10 @@ class RecorderAudioOutput extends AudioOutput {
558
634
 
559
635
  for (const frame of this.accFrames) {
560
636
  let currentFrame = frame;
561
- const frameDuration = frame.samplesPerChannel / frame.sampleRate;
637
+ const frameDuration = (frame.samplesPerChannel / frame.sampleRate) * 1000;
562
638
 
563
639
  if (frameDuration + accDur > playbackPosition) {
564
- const [left] = splitFrame(currentFrame, playbackPosition - accDur);
640
+ const [left] = splitFrame(currentFrame, (playbackPosition - accDur) / 1000);
565
641
  currentFrame = left;
566
642
  shouldBreak = true;
567
643
  }
@@ -569,27 +645,29 @@ class RecorderAudioOutput extends AudioOutput {
569
645
  // Process any pauses before this frame starts
570
646
  while (pauseIdx < pauseEvents.length && pauseEvents[pauseIdx]![0] <= accDur) {
571
647
  const [, pauseDur] = pauseEvents[pauseIdx]!;
572
- buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
648
+ buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
573
649
  pauseIdx++;
574
650
  }
575
651
 
576
652
  // Process any pauses within this frame
577
- const currentFrameDuration = currentFrame.samplesPerChannel / currentFrame.sampleRate;
653
+ const currentFrameDuration =
654
+ (currentFrame.samplesPerChannel / currentFrame.sampleRate) * 1000;
578
655
  while (
579
656
  pauseIdx < pauseEvents.length &&
580
657
  pauseEvents[pauseIdx]![0] < accDur + currentFrameDuration
581
658
  ) {
582
659
  const [pausePos, pauseDur] = pauseEvents[pauseIdx]!;
583
- const [left, right] = splitFrame(currentFrame, pausePos - accDur);
660
+ const [left, right] = splitFrame(currentFrame, (pausePos - accDur) / 1000);
584
661
  buf.push(left);
585
- accDur += left.samplesPerChannel / left.sampleRate;
586
- buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
662
+ accDur += (left.samplesPerChannel / left.sampleRate) * 1000;
663
+ buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
664
+
587
665
  currentFrame = right;
588
666
  pauseIdx++;
589
667
  }
590
668
 
591
669
  buf.push(currentFrame);
592
- accDur += currentFrame.samplesPerChannel / currentFrame.sampleRate;
670
+ accDur += (currentFrame.samplesPerChannel / currentFrame.sampleRate) * 1000;
593
671
 
594
672
  if (shouldBreak) {
595
673
  break;
@@ -600,31 +678,41 @@ class RecorderAudioOutput extends AudioOutput {
600
678
  while (pauseIdx < pauseEvents.length) {
601
679
  const [pausePos, pauseDur] = pauseEvents[pauseIdx]!;
602
680
  if (pausePos <= playbackPosition) {
603
- buf.push(createSilenceFrame(pauseDur, sampleRate, numChannels));
681
+ buf.push(createSilenceFrame(pauseDur / 1000, sampleRate, numChannels));
604
682
  }
605
683
  pauseIdx++;
606
684
  }
607
685
 
608
686
  if (buf.length > 0) {
687
+ if (trailingSilenceDuration > 0) {
688
+ buf.push(createSilenceFrame(trailingSilenceDuration / 1000, sampleRate, numChannels));
689
+ }
609
690
  this.writeFn(buf);
610
691
  }
611
692
 
612
693
  this.accFrames = [];
613
694
  this.resetPauseState();
695
+ this._lastSpeechEndTime = Date.now();
696
+ this._lastSpeechStartTime = undefined;
614
697
  }
615
698
 
616
699
  async captureFrame(frame: AudioFrame): Promise<void> {
700
+ if (this.nextInChain) {
701
+ await this.nextInChain.captureFrame(frame);
702
+ }
703
+
617
704
  await super.captureFrame(frame);
618
705
 
619
706
  if (this.recorderIO.recording) {
620
- if (this._startedWallTime === undefined) {
621
- this._startedWallTime = Date.now();
622
- }
623
707
  this.accFrames.push(frame);
624
708
  }
625
709
 
626
- if (this.nextInChain) {
627
- await this.nextInChain.captureFrame(frame);
710
+ if (this._startedWallTime === undefined) {
711
+ this._startedWallTime = Date.now();
712
+ }
713
+
714
+ if (this._lastSpeechStartTime === undefined) {
715
+ this._lastSpeechStartTime = Date.now();
628
716
  }
629
717
  }
630
718
 
@@ -646,8 +734,12 @@ class RecorderAudioOutput extends AudioOutput {
646
734
  /**
647
735
  * Create a silent audio frame with the given duration
648
736
  */
649
- function createSilenceFrame(duration: number, sampleRate: number, numChannels: number): AudioFrame {
650
- const samples = Math.floor(duration * sampleRate);
737
+ function createSilenceFrame(
738
+ durationInS: number,
739
+ sampleRate: number,
740
+ numChannels: number,
741
+ ): AudioFrame {
742
+ const samples = Math.floor(durationInS * sampleRate);
651
743
  const data = new Int16Array(samples * numChannels); // Zero-filled by default
652
744
  return new AudioFrame(data, sampleRate, numChannels, samples);
653
745
  }
@@ -1,7 +1,7 @@
1
1
  // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import type { AudioFrame } from '@livekit/rtc-node';
4
+ import { type AudioFrame, FrameProcessor } from '@livekit/rtc-node';
5
5
  import {
6
6
  AudioStream,
7
7
  type NoiseCancellationOptions,
@@ -22,6 +22,7 @@ export class ParticipantAudioInputStream extends AudioInput {
22
22
  private sampleRate: number;
23
23
  private numChannels: number;
24
24
  private noiseCancellation?: NoiseCancellationOptions;
25
+ private frameProcessor?: FrameProcessor<AudioFrame>;
25
26
  private publication: RemoteTrackPublication | null = null;
26
27
  private participantIdentity: string | null = null;
27
28
  private logger = log();
@@ -34,16 +35,21 @@ export class ParticipantAudioInputStream extends AudioInput {
34
35
  room: Room;
35
36
  sampleRate: number;
36
37
  numChannels: number;
37
- noiseCancellation?: NoiseCancellationOptions;
38
+ noiseCancellation?: NoiseCancellationOptions | FrameProcessor<AudioFrame>;
38
39
  }) {
39
40
  super();
40
41
  this.room = room;
41
42
  this.sampleRate = sampleRate;
42
43
  this.numChannels = numChannels;
43
- this.noiseCancellation = noiseCancellation;
44
+ if (noiseCancellation instanceof FrameProcessor) {
45
+ this.frameProcessor = noiseCancellation;
46
+ } else {
47
+ this.noiseCancellation = noiseCancellation;
48
+ }
44
49
 
45
50
  this.room.on(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
46
51
  this.room.on(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
52
+ this.room.on(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
47
53
  }
48
54
 
49
55
  setParticipant(participant: RemoteParticipant | string | null) {
@@ -116,6 +122,9 @@ export class ParticipantAudioInputStream extends AudioInput {
116
122
  if (this.deferredStream.isSourceSet) {
117
123
  this.deferredStream.detachSource();
118
124
  }
125
+
126
+ this.frameProcessor?.close();
127
+
119
128
  this.publication = null;
120
129
  }
121
130
 
@@ -140,14 +149,32 @@ export class ParticipantAudioInputStream extends AudioInput {
140
149
  outputRate: this.sampleRate,
141
150
  }),
142
151
  );
152
+ this.frameProcessor?.onStreamInfoUpdated({
153
+ participantIdentity: participant.identity,
154
+ roomName: this.room.name!,
155
+ publicationSid: publication.sid!,
156
+ });
157
+ this.frameProcessor?.onCredentialsUpdated({
158
+ token: this.room.token!,
159
+ url: this.room.serverUrl!,
160
+ });
143
161
  return true;
144
162
  };
145
163
 
164
+ private onTokenRefreshed = () => {
165
+ if (this.room.token && this.room.serverUrl) {
166
+ this.frameProcessor?.onCredentialsUpdated({
167
+ token: this.room.token,
168
+ url: this.room.serverUrl,
169
+ });
170
+ }
171
+ };
172
+
146
173
  private createStream(track: RemoteTrack): ReadableStream<AudioFrame> {
147
174
  return new AudioStream(track, {
148
175
  sampleRate: this.sampleRate,
149
176
  numChannels: this.numChannels,
150
- noiseCancellation: this.noiseCancellation,
177
+ noiseCancellation: this.frameProcessor || this.noiseCancellation,
151
178
  // TODO(AJS-269): resolve compatibility issue with node-sdk to remove the forced type casting
152
179
  }) as unknown as ReadableStream<AudioFrame>;
153
180
  }
@@ -155,6 +182,7 @@ export class ParticipantAudioInputStream extends AudioInput {
155
182
  async close() {
156
183
  this.room.off(RoomEvent.TrackSubscribed, this.onTrackSubscribed);
157
184
  this.room.off(RoomEvent.TrackUnpublished, this.onTrackUnpublished);
185
+ this.room.off(RoomEvent.TokenRefreshed, this.onTokenRefreshed);
158
186
  this.closeStream();
159
187
  // Ignore errors - stream may be locked by RecorderIO or already cancelled
160
188
  await this.deferredStream.stream.cancel().catch(() => {});
@@ -326,6 +326,7 @@ export class ParticipantAudioOutput extends AudioOutput {
326
326
  private pushedDuration: number = 0;
327
327
  private startedFuture: Future<void> = new Future();
328
328
  private interruptedFuture: Future<void> = new Future();
329
+ private firstFrameEmitted: boolean = false;
329
330
 
330
331
  constructor(room: Room, options: AudioOutputOptions) {
331
332
  super(options.sampleRate, undefined, { pause: true });
@@ -347,6 +348,11 @@ export class ParticipantAudioOutput extends AudioOutput {
347
348
 
348
349
  super.captureFrame(frame);
349
350
 
351
+ if (!this.firstFrameEmitted) {
352
+ this.firstFrameEmitted = true;
353
+ this.onPlaybackStarted(Date.now());
354
+ }
355
+
350
356
  // TODO(AJS-102): use frame.durationMs once available in rtc-node
351
357
  this.pushedDuration += frame.samplesPerChannel / frame.sampleRate;
352
358
  await this.audioSource.captureFrame(frame);
@@ -382,6 +388,8 @@ export class ParticipantAudioOutput extends AudioOutput {
382
388
 
383
389
  this.pushedDuration = 0;
384
390
  this.interruptedFuture = new Future();
391
+ this.firstFrameEmitted = false;
392
+
385
393
  this.onPlaybackFinished({
386
394
  playbackPosition: pushedDuration,
387
395
  interrupted,
@@ -2,8 +2,10 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import {
5
+ type AudioFrame,
5
6
  ConnectionState,
6
7
  DisconnectReason,
8
+ type FrameProcessor,
7
9
  type NoiseCancellationOptions,
8
10
  type Participant,
9
11
  ParticipantKind,
@@ -75,7 +77,7 @@ export interface RoomInputOptions {
75
77
  Can be overridden by the `participant` argument of RoomIO constructor or `set_participant`.
76
78
  */
77
79
  participantIdentity?: string;
78
- noiseCancellation?: NoiseCancellationOptions;
80
+ noiseCancellation?: NoiseCancellationOptions | FrameProcessor<AudioFrame>;
79
81
  textInputCallback?: TextInputCallback;
80
82
  /** Participant kinds accepted for auto subscription. If not provided,
81
83
  accept `DEFAULT_PARTICIPANT_KINDS`
@@ -1,6 +1,7 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
+ import type { Context } from '@opentelemetry/api';
4
5
  import type { ChatItem } from '../llm/index.js';
5
6
  import type { Task } from '../utils.js';
6
7
  import { Event, Future, shortuuid } from '../utils.js';
@@ -42,6 +43,9 @@ export class SpeechHandle {
42
43
  /** @internal */
43
44
  _numSteps = 1;
44
45
 
46
+ /** @internal - OpenTelemetry context for the agent turn span */
47
+ _agentTurnContext?: Context;
48
+
45
49
  private itemAddedCallbacks: Set<(item: ChatItem) => void> = new Set();
46
50
  private doneCallbacks: Set<(sh: SpeechHandle) => void> = new Set();
47
51
 
@@ -24,6 +24,7 @@ export {
24
24
  AgentHandoffAssert,
25
25
  AssertionError,
26
26
  EventAssert,
27
+ EventRangeAssert,
27
28
  FunctionCallAssert,
28
29
  FunctionCallOutputAssert,
29
30
  MessageAssert,