@livekit/agents 1.0.35 → 1.0.36-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/dist/index.cjs +3 -1
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +1 -0
  4. package/dist/index.d.ts +1 -0
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +1 -0
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs +152 -0
  9. package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs.map +1 -0
  10. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.cts +50 -0
  11. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts +50 -0
  12. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts.map +1 -0
  13. package/dist/inference/interruption/AdaptiveInterruptionDetector.js +125 -0
  14. package/dist/inference/interruption/AdaptiveInterruptionDetector.js.map +1 -0
  15. package/dist/inference/interruption/InterruptionStream.cjs +310 -0
  16. package/dist/inference/interruption/InterruptionStream.cjs.map +1 -0
  17. package/dist/inference/interruption/InterruptionStream.d.cts +57 -0
  18. package/dist/inference/interruption/InterruptionStream.d.ts +57 -0
  19. package/dist/inference/interruption/InterruptionStream.d.ts.map +1 -0
  20. package/dist/inference/interruption/InterruptionStream.js +288 -0
  21. package/dist/inference/interruption/InterruptionStream.js.map +1 -0
  22. package/dist/inference/interruption/defaults.cjs +76 -0
  23. package/dist/inference/interruption/defaults.cjs.map +1 -0
  24. package/dist/inference/interruption/defaults.d.cts +14 -0
  25. package/dist/inference/interruption/defaults.d.ts +14 -0
  26. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  27. package/dist/inference/interruption/defaults.js +42 -0
  28. package/dist/inference/interruption/defaults.js.map +1 -0
  29. package/dist/inference/interruption/errors.cjs +2 -0
  30. package/dist/inference/interruption/errors.cjs.map +1 -0
  31. package/dist/inference/interruption/errors.d.cts +2 -0
  32. package/dist/inference/interruption/errors.d.ts +2 -0
  33. package/dist/inference/interruption/errors.d.ts.map +1 -0
  34. package/dist/inference/interruption/errors.js +1 -0
  35. package/dist/inference/interruption/errors.js.map +1 -0
  36. package/dist/inference/interruption/http_transport.cjs +57 -0
  37. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  38. package/dist/inference/interruption/http_transport.d.cts +23 -0
  39. package/dist/inference/interruption/http_transport.d.ts +23 -0
  40. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  41. package/dist/inference/interruption/http_transport.js +33 -0
  42. package/dist/inference/interruption/http_transport.js.map +1 -0
  43. package/dist/inference/interruption/index.cjs +34 -0
  44. package/dist/inference/interruption/index.cjs.map +1 -0
  45. package/dist/inference/interruption/index.d.cts +5 -0
  46. package/dist/inference/interruption/index.d.ts +5 -0
  47. package/dist/inference/interruption/index.d.ts.map +1 -0
  48. package/dist/inference/interruption/index.js +7 -0
  49. package/dist/inference/interruption/index.js.map +1 -0
  50. package/dist/inference/interruption/interruption.cjs +85 -0
  51. package/dist/inference/interruption/interruption.cjs.map +1 -0
  52. package/dist/inference/interruption/interruption.d.cts +48 -0
  53. package/dist/inference/interruption/interruption.d.ts +48 -0
  54. package/dist/inference/interruption/interruption.d.ts.map +1 -0
  55. package/dist/inference/interruption/interruption.js +59 -0
  56. package/dist/inference/interruption/interruption.js.map +1 -0
  57. package/dist/inference/utils.cjs +15 -2
  58. package/dist/inference/utils.cjs.map +1 -1
  59. package/dist/inference/utils.d.cts +1 -0
  60. package/dist/inference/utils.d.ts +1 -0
  61. package/dist/inference/utils.d.ts.map +1 -1
  62. package/dist/inference/utils.js +13 -1
  63. package/dist/inference/utils.js.map +1 -1
  64. package/dist/inference/utils.test.cjs +20 -0
  65. package/dist/inference/utils.test.cjs.map +1 -0
  66. package/dist/inference/utils.test.js +19 -0
  67. package/dist/inference/utils.test.js.map +1 -0
  68. package/dist/stream/stream_channel.cjs +3 -0
  69. package/dist/stream/stream_channel.cjs.map +1 -1
  70. package/dist/stream/stream_channel.d.cts +3 -2
  71. package/dist/stream/stream_channel.d.ts +3 -2
  72. package/dist/stream/stream_channel.d.ts.map +1 -1
  73. package/dist/stream/stream_channel.js +3 -0
  74. package/dist/stream/stream_channel.js.map +1 -1
  75. package/dist/telemetry/trace_types.cjs +15 -0
  76. package/dist/telemetry/trace_types.cjs.map +1 -1
  77. package/dist/telemetry/trace_types.d.cts +5 -0
  78. package/dist/telemetry/trace_types.d.ts +5 -0
  79. package/dist/telemetry/trace_types.d.ts.map +1 -1
  80. package/dist/telemetry/trace_types.js +10 -0
  81. package/dist/telemetry/trace_types.js.map +1 -1
  82. package/dist/utils/ws_transport.cjs +51 -0
  83. package/dist/utils/ws_transport.cjs.map +1 -0
  84. package/dist/utils/ws_transport.d.cts +9 -0
  85. package/dist/utils/ws_transport.d.ts +9 -0
  86. package/dist/utils/ws_transport.d.ts.map +1 -0
  87. package/dist/utils/ws_transport.js +17 -0
  88. package/dist/utils/ws_transport.js.map +1 -0
  89. package/dist/utils/ws_transport.test.cjs +212 -0
  90. package/dist/utils/ws_transport.test.cjs.map +1 -0
  91. package/dist/utils/ws_transport.test.js +211 -0
  92. package/dist/utils/ws_transport.test.js.map +1 -0
  93. package/dist/voice/agent_activity.cjs +49 -0
  94. package/dist/voice/agent_activity.cjs.map +1 -1
  95. package/dist/voice/agent_activity.d.cts +14 -0
  96. package/dist/voice/agent_activity.d.ts +14 -0
  97. package/dist/voice/agent_activity.d.ts.map +1 -1
  98. package/dist/voice/agent_activity.js +49 -0
  99. package/dist/voice/agent_activity.js.map +1 -1
  100. package/dist/voice/agent_session.cjs +12 -1
  101. package/dist/voice/agent_session.cjs.map +1 -1
  102. package/dist/voice/agent_session.d.cts +3 -0
  103. package/dist/voice/agent_session.d.ts +3 -0
  104. package/dist/voice/agent_session.d.ts.map +1 -1
  105. package/dist/voice/agent_session.js +12 -1
  106. package/dist/voice/agent_session.js.map +1 -1
  107. package/dist/voice/audio_recognition.cjs +124 -2
  108. package/dist/voice/audio_recognition.cjs.map +1 -1
  109. package/dist/voice/audio_recognition.d.cts +32 -1
  110. package/dist/voice/audio_recognition.d.ts +32 -1
  111. package/dist/voice/audio_recognition.d.ts.map +1 -1
  112. package/dist/voice/audio_recognition.js +127 -2
  113. package/dist/voice/audio_recognition.js.map +1 -1
  114. package/package.json +2 -1
  115. package/src/index.ts +2 -0
  116. package/src/inference/interruption/AdaptiveInterruptionDetector.ts +166 -0
  117. package/src/inference/interruption/InterruptionStream.ts +397 -0
  118. package/src/inference/interruption/defaults.ts +33 -0
  119. package/src/inference/interruption/errors.ts +0 -0
  120. package/src/inference/interruption/http_transport.ts +61 -0
  121. package/src/inference/interruption/index.ts +4 -0
  122. package/src/inference/interruption/interruption.ts +88 -0
  123. package/src/inference/utils.test.ts +31 -0
  124. package/src/inference/utils.ts +15 -0
  125. package/src/stream/stream_channel.ts +6 -2
  126. package/src/telemetry/trace_types.ts +7 -0
  127. package/src/utils/ws_transport.test.ts +282 -0
  128. package/src/utils/ws_transport.ts +22 -0
  129. package/src/voice/agent_activity.ts +61 -0
  130. package/src/voice/agent_session.ts +22 -2
  131. package/src/voice/audio_recognition.ts +161 -1
@@ -5,6 +5,12 @@ import { AudioFrame } from '@livekit/rtc-node';
5
5
  import type { Context, Span } from '@opentelemetry/api';
6
6
  import type { WritableStreamDefaultWriter } from 'node:stream/web';
7
7
  import { ReadableStream } from 'node:stream/web';
8
+ import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
9
+ import {
10
+ InterruptionStreamBase,
11
+ InterruptionStreamSentinel,
12
+ } from '../inference/interruption/InterruptionStream.js';
13
+ import type { InterruptionEvent } from '../inference/interruption/interruption.js';
8
14
  import { type ChatContext } from '../llm/chat_context.js';
9
15
  import { log } from '../log.js';
10
16
  import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
@@ -39,6 +45,7 @@ export interface RecognitionHooks {
39
45
  onFinalTranscript: (ev: SpeechEvent) => void;
40
46
  onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
41
47
  onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
48
+ onInterruption: (ev: InterruptionEvent) => void;
42
49
 
43
50
  retrieveChatCtx: () => ChatContext;
44
51
  }
@@ -53,6 +60,7 @@ export interface AudioRecognitionOptions {
53
60
  recognitionHooks: RecognitionHooks;
54
61
  stt?: STTNode;
55
62
  vad?: VAD;
63
+ interruptionDetector?: AdaptiveInterruptionDetector;
56
64
  turnDetector?: _TurnDetector;
57
65
  turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
58
66
  minEndpointingDelay: number;
@@ -88,6 +96,7 @@ export class AudioRecognition {
88
96
 
89
97
  private vadInputStream: ReadableStream<AudioFrame>;
90
98
  private sttInputStream: ReadableStream<AudioFrame>;
99
+ private interruptionInputStream: ReadableStream<AudioFrame>;
91
100
  private silenceAudioTransform = new IdentityTransform<AudioFrame>();
92
101
  private silenceAudioWriter: WritableStreamDefaultWriter<AudioFrame>;
93
102
 
@@ -96,11 +105,19 @@ export class AudioRecognition {
96
105
  private commitUserTurnTask?: Task<void>;
97
106
  private vadTask?: Task<void>;
98
107
  private sttTask?: Task<void>;
108
+ private interruptionTask?: Task<void>;
109
+
110
+ // interruption detection
111
+ private interruptionDetector?: AdaptiveInterruptionDetector;
112
+ private interruptionStream?: InterruptionStreamBase;
113
+ private interruptionEnabled = false;
114
+ private agentSpeaking = false;
99
115
 
100
116
  constructor(opts: AudioRecognitionOptions) {
101
117
  this.hooks = opts.recognitionHooks;
102
118
  this.stt = opts.stt;
103
119
  this.vad = opts.vad;
120
+ this.interruptionDetector = opts.interruptionDetector;
104
121
  this.turnDetector = opts.turnDetector;
105
122
  this.turnDetectionMode = opts.turnDetectionMode;
106
123
  this.minEndpointingDelay = opts.minEndpointingDelay;
@@ -108,10 +125,15 @@ export class AudioRecognition {
108
125
  this.lastLanguage = undefined;
109
126
  this.rootSpanContext = opts.rootSpanContext;
110
127
 
128
+ // Interruption detection is only enabled if both detector and VAD are provided
129
+ this.interruptionEnabled = this.interruptionDetector !== undefined && this.vad !== undefined;
130
+
111
131
  this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
112
- const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
132
+ const [vadInputStream, rest] = this.deferredInputStream.stream.tee();
133
+ const [sttInputStream, interruptionInputStream] = rest.tee();
113
134
  this.vadInputStream = vadInputStream;
114
135
  this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable);
136
+ this.interruptionInputStream = interruptionInputStream;
115
137
  this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
116
138
  }
117
139
 
@@ -135,6 +157,15 @@ export class AudioRecognition {
135
157
  this.sttTask.result.catch((err) => {
136
158
  this.logger.error(`Error running STT task: ${err}`);
137
159
  });
160
+
161
+ if (this.interruptionEnabled && this.interruptionDetector) {
162
+ this.interruptionTask = Task.from(({ signal }) =>
163
+ this.createInterruptionTask(this.interruptionDetector!, signal),
164
+ );
165
+ this.interruptionTask.result.catch((err) => {
166
+ this.logger.error(`Error running interruption task: ${err}`);
167
+ });
168
+ }
138
169
  }
139
170
 
140
171
  private async onSTTEvent(ev: SpeechEvent) {
@@ -577,6 +608,11 @@ export class AudioRecognition {
577
608
  this.sampleRate = ev.frames[0].sampleRate;
578
609
  }
579
610
 
611
+ // If agent is speaking, user speech is overlap - trigger interruption detection
612
+ if (this.agentSpeaking && this.interruptionEnabled) {
613
+ this.onStartOfOverlapSpeech(ev.speechDuration, this.userTurnSpan);
614
+ }
615
+
580
616
  this.bounceEOUTask?.cancel();
581
617
  break;
582
618
  case VADEventType.INFERENCE_DONE:
@@ -597,6 +633,11 @@ export class AudioRecognition {
597
633
  // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
598
634
  this.speaking = false;
599
635
 
636
+ // If we were in overlap speech (agent speaking + user speaking), end it
637
+ if (this.agentSpeaking && this.interruptionEnabled) {
638
+ this.onEndOfOverlapSpeech();
639
+ }
640
+
600
641
  if (
601
642
  this.vadBaseTurnDetection ||
602
643
  (this.turnDetectionMode === 'stt' && this.userTurnCommitted)
@@ -614,6 +655,123 @@ export class AudioRecognition {
614
655
  }
615
656
  }
616
657
 
658
+ private async createInterruptionTask(
659
+ interruptionDetector: AdaptiveInterruptionDetector,
660
+ signal: AbortSignal,
661
+ ) {
662
+ // Create the interruption stream from the detector
663
+ this.interruptionStream = interruptionDetector.createStream();
664
+
665
+ // Forward audio frames to the interruption stream
666
+ const reader = this.interruptionInputStream.getReader();
667
+
668
+ const forwardTask = (async () => {
669
+ try {
670
+ while (!signal.aborted) {
671
+ const { done, value: frame } = await reader.read();
672
+ if (done) break;
673
+ await this.interruptionStream?.pushFrame(frame);
674
+ }
675
+ } catch (e) {
676
+ if (!signal.aborted) {
677
+ this.logger.error(e, 'Error forwarding audio to interruption stream');
678
+ }
679
+ } finally {
680
+ reader.releaseLock();
681
+ }
682
+ })();
683
+
684
+ // Read interruption events from the stream
685
+ const eventStream = this.interruptionStream.stream;
686
+ const eventReader = eventStream.getReader();
687
+
688
+ const abortHandler = () => {
689
+ eventReader.releaseLock();
690
+ this.interruptionStream?.close();
691
+ signal.removeEventListener('abort', abortHandler);
692
+ };
693
+ signal.addEventListener('abort', abortHandler);
694
+
695
+ try {
696
+ while (!signal.aborted) {
697
+ const { done, value: ev } = await eventReader.read();
698
+ if (done) break;
699
+
700
+ this.logger.debug({ type: ev.type, probability: ev.probability }, 'Interruption event');
701
+ this.hooks.onInterruption(ev);
702
+ }
703
+ } catch (e) {
704
+ if (!signal.aborted) {
705
+ this.logger.error(e, 'Error in interruption task');
706
+ }
707
+ } finally {
708
+ this.logger.debug('Interruption task closed');
709
+ await forwardTask;
710
+ }
711
+ }
712
+
713
+ /**
714
+ * Called when the agent starts speaking.
715
+ * Enables interruption detection by sending the agent-speech-started sentinel.
716
+ */
717
+ onStartOfAgentSpeech(): void {
718
+ this.agentSpeaking = true;
719
+
720
+ if (!this.interruptionEnabled || !this.interruptionStream) {
721
+ return;
722
+ }
723
+
724
+ this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechStarted());
725
+ }
726
+
727
+ /**
728
+ * Called when the agent stops speaking.
729
+ * Disables interruption detection by sending the agent-speech-ended sentinel.
730
+ */
731
+ onEndOfAgentSpeech(): void {
732
+ if (!this.interruptionEnabled || !this.interruptionStream) {
733
+ this.agentSpeaking = false;
734
+ return;
735
+ }
736
+
737
+ this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechEnded());
738
+
739
+ if (this.agentSpeaking) {
740
+ // No interruption was detected, end the overlap inference (idempotent)
741
+ this.onEndOfOverlapSpeech();
742
+ }
743
+
744
+ this.agentSpeaking = false;
745
+ }
746
+
747
+ /**
748
+ * Called when user starts speaking while agent is speaking (overlap speech).
749
+ * This triggers the interruption detection inference.
750
+ */
751
+ onStartOfOverlapSpeech(speechDuration: number, userSpeakingSpan?: Span): void {
752
+ if (!this.interruptionEnabled || !this.interruptionStream) {
753
+ return;
754
+ }
755
+
756
+ if (this.agentSpeaking && userSpeakingSpan) {
757
+ this.interruptionStream.pushFrame(
758
+ InterruptionStreamSentinel.overlapSpeechStarted(speechDuration, userSpeakingSpan),
759
+ );
760
+ }
761
+ }
762
+
763
+ /**
764
+ * Called when user stops speaking during overlap.
765
+ * This ends the interruption detection inference for this overlap period.
766
+ */
767
+ onEndOfOverlapSpeech(): void {
768
+ if (!this.interruptionEnabled || !this.interruptionStream) {
769
+ return;
770
+ }
771
+
772
+ this.interruptionStream.pushFrame(InterruptionStreamSentinel.overlapSpeechEnded());
773
+ }
774
+
617
775
  setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {
618
776
  this.deferredInputStream.setSource(audioStream);
619
777
  }
@@ -686,6 +844,8 @@ export class AudioRecognition {
686
844
  await this.sttTask?.cancelAndWait();
687
845
  await this.vadTask?.cancelAndWait();
688
846
  await this.bounceEOUTask?.cancelAndWait();
847
+ await this.interruptionTask?.cancelAndWait();
848
+ await this.interruptionStream?.close();
689
849
  }
690
850
 
691
851
  private _endUserTurnSpan({