@livekit/agents 1.0.36-dev.0 → 1.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. package/dist/index.cjs +1 -3
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +0 -1
  4. package/dist/index.d.ts +0 -1
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +0 -1
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/utils.cjs +2 -15
  9. package/dist/inference/utils.cjs.map +1 -1
  10. package/dist/inference/utils.d.cts +0 -1
  11. package/dist/inference/utils.d.ts +0 -1
  12. package/dist/inference/utils.d.ts.map +1 -1
  13. package/dist/inference/utils.js +1 -13
  14. package/dist/inference/utils.js.map +1 -1
  15. package/dist/stream/stream_channel.cjs +0 -3
  16. package/dist/stream/stream_channel.cjs.map +1 -1
  17. package/dist/stream/stream_channel.d.cts +2 -3
  18. package/dist/stream/stream_channel.d.ts +2 -3
  19. package/dist/stream/stream_channel.d.ts.map +1 -1
  20. package/dist/stream/stream_channel.js +0 -3
  21. package/dist/stream/stream_channel.js.map +1 -1
  22. package/dist/telemetry/trace_types.cjs +0 -15
  23. package/dist/telemetry/trace_types.cjs.map +1 -1
  24. package/dist/telemetry/trace_types.d.cts +0 -5
  25. package/dist/telemetry/trace_types.d.ts +0 -5
  26. package/dist/telemetry/trace_types.d.ts.map +1 -1
  27. package/dist/telemetry/trace_types.js +0 -10
  28. package/dist/telemetry/trace_types.js.map +1 -1
  29. package/dist/voice/agent_activity.cjs +19 -68
  30. package/dist/voice/agent_activity.cjs.map +1 -1
  31. package/dist/voice/agent_activity.d.cts +0 -14
  32. package/dist/voice/agent_activity.d.ts +0 -14
  33. package/dist/voice/agent_activity.d.ts.map +1 -1
  34. package/dist/voice/agent_activity.js +19 -68
  35. package/dist/voice/agent_activity.js.map +1 -1
  36. package/dist/voice/agent_session.cjs +65 -37
  37. package/dist/voice/agent_session.cjs.map +1 -1
  38. package/dist/voice/agent_session.d.cts +25 -4
  39. package/dist/voice/agent_session.d.ts +25 -4
  40. package/dist/voice/agent_session.d.ts.map +1 -1
  41. package/dist/voice/agent_session.js +65 -37
  42. package/dist/voice/agent_session.js.map +1 -1
  43. package/dist/voice/audio_recognition.cjs +2 -124
  44. package/dist/voice/audio_recognition.cjs.map +1 -1
  45. package/dist/voice/audio_recognition.d.cts +1 -32
  46. package/dist/voice/audio_recognition.d.ts +1 -32
  47. package/dist/voice/audio_recognition.d.ts.map +1 -1
  48. package/dist/voice/audio_recognition.js +2 -127
  49. package/dist/voice/audio_recognition.js.map +1 -1
  50. package/dist/voice/index.cjs +14 -1
  51. package/dist/voice/index.cjs.map +1 -1
  52. package/dist/voice/index.d.cts +1 -0
  53. package/dist/voice/index.d.ts +1 -0
  54. package/dist/voice/index.d.ts.map +1 -1
  55. package/dist/voice/index.js +3 -1
  56. package/dist/voice/index.js.map +1 -1
  57. package/dist/voice/room_io/room_io.cjs +1 -0
  58. package/dist/voice/room_io/room_io.cjs.map +1 -1
  59. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  60. package/dist/voice/room_io/room_io.js +1 -0
  61. package/dist/voice/room_io/room_io.js.map +1 -1
  62. package/dist/voice/speech_handle.cjs +12 -3
  63. package/dist/voice/speech_handle.cjs.map +1 -1
  64. package/dist/voice/speech_handle.d.cts +12 -2
  65. package/dist/voice/speech_handle.d.ts +12 -2
  66. package/dist/voice/speech_handle.d.ts.map +1 -1
  67. package/dist/voice/speech_handle.js +10 -2
  68. package/dist/voice/speech_handle.js.map +1 -1
  69. package/dist/voice/testing/index.cjs +54 -0
  70. package/dist/voice/testing/index.cjs.map +1 -0
  71. package/dist/voice/testing/index.d.cts +20 -0
  72. package/dist/voice/testing/index.d.ts +20 -0
  73. package/dist/voice/testing/index.d.ts.map +1 -0
  74. package/dist/voice/testing/index.js +33 -0
  75. package/dist/voice/testing/index.js.map +1 -0
  76. package/dist/voice/testing/run_result.cjs +766 -0
  77. package/dist/voice/testing/run_result.cjs.map +1 -0
  78. package/dist/voice/testing/run_result.d.cts +374 -0
  79. package/dist/voice/testing/run_result.d.ts +374 -0
  80. package/dist/voice/testing/run_result.d.ts.map +1 -0
  81. package/dist/voice/testing/run_result.js +739 -0
  82. package/dist/voice/testing/run_result.js.map +1 -0
  83. package/dist/{inference/interruption/index.cjs → voice/testing/types.cjs} +24 -12
  84. package/dist/voice/testing/types.cjs.map +1 -0
  85. package/dist/voice/testing/types.d.cts +83 -0
  86. package/dist/voice/testing/types.d.ts +83 -0
  87. package/dist/voice/testing/types.d.ts.map +1 -0
  88. package/dist/voice/testing/types.js +19 -0
  89. package/dist/voice/testing/types.js.map +1 -0
  90. package/package.json +3 -4
  91. package/src/index.ts +0 -2
  92. package/src/inference/utils.ts +0 -15
  93. package/src/stream/stream_channel.ts +2 -6
  94. package/src/telemetry/trace_types.ts +0 -7
  95. package/src/voice/agent_activity.ts +24 -83
  96. package/src/voice/agent_session.ts +74 -49
  97. package/src/voice/audio_recognition.ts +1 -161
  98. package/src/voice/index.ts +1 -0
  99. package/src/voice/room_io/room_io.ts +1 -0
  100. package/src/voice/speech_handle.ts +24 -4
  101. package/src/voice/testing/index.ts +50 -0
  102. package/src/voice/testing/run_result.ts +937 -0
  103. package/src/voice/testing/types.ts +118 -0
  104. package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs +0 -152
  105. package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs.map +0 -1
  106. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.cts +0 -50
  107. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts +0 -50
  108. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts.map +0 -1
  109. package/dist/inference/interruption/AdaptiveInterruptionDetector.js +0 -125
  110. package/dist/inference/interruption/AdaptiveInterruptionDetector.js.map +0 -1
  111. package/dist/inference/interruption/InterruptionStream.cjs +0 -310
  112. package/dist/inference/interruption/InterruptionStream.cjs.map +0 -1
  113. package/dist/inference/interruption/InterruptionStream.d.cts +0 -57
  114. package/dist/inference/interruption/InterruptionStream.d.ts +0 -57
  115. package/dist/inference/interruption/InterruptionStream.d.ts.map +0 -1
  116. package/dist/inference/interruption/InterruptionStream.js +0 -288
  117. package/dist/inference/interruption/InterruptionStream.js.map +0 -1
  118. package/dist/inference/interruption/defaults.cjs +0 -76
  119. package/dist/inference/interruption/defaults.cjs.map +0 -1
  120. package/dist/inference/interruption/defaults.d.cts +0 -14
  121. package/dist/inference/interruption/defaults.d.ts +0 -14
  122. package/dist/inference/interruption/defaults.d.ts.map +0 -1
  123. package/dist/inference/interruption/defaults.js +0 -42
  124. package/dist/inference/interruption/defaults.js.map +0 -1
  125. package/dist/inference/interruption/errors.cjs +0 -2
  126. package/dist/inference/interruption/errors.cjs.map +0 -1
  127. package/dist/inference/interruption/errors.d.cts +0 -2
  128. package/dist/inference/interruption/errors.d.ts +0 -2
  129. package/dist/inference/interruption/errors.d.ts.map +0 -1
  130. package/dist/inference/interruption/errors.js +0 -1
  131. package/dist/inference/interruption/errors.js.map +0 -1
  132. package/dist/inference/interruption/http_transport.cjs +0 -57
  133. package/dist/inference/interruption/http_transport.cjs.map +0 -1
  134. package/dist/inference/interruption/http_transport.d.cts +0 -23
  135. package/dist/inference/interruption/http_transport.d.ts +0 -23
  136. package/dist/inference/interruption/http_transport.d.ts.map +0 -1
  137. package/dist/inference/interruption/http_transport.js +0 -33
  138. package/dist/inference/interruption/http_transport.js.map +0 -1
  139. package/dist/inference/interruption/index.cjs.map +0 -1
  140. package/dist/inference/interruption/index.d.cts +0 -5
  141. package/dist/inference/interruption/index.d.ts +0 -5
  142. package/dist/inference/interruption/index.d.ts.map +0 -1
  143. package/dist/inference/interruption/index.js +0 -7
  144. package/dist/inference/interruption/index.js.map +0 -1
  145. package/dist/inference/interruption/interruption.cjs +0 -85
  146. package/dist/inference/interruption/interruption.cjs.map +0 -1
  147. package/dist/inference/interruption/interruption.d.cts +0 -48
  148. package/dist/inference/interruption/interruption.d.ts +0 -48
  149. package/dist/inference/interruption/interruption.d.ts.map +0 -1
  150. package/dist/inference/interruption/interruption.js +0 -59
  151. package/dist/inference/interruption/interruption.js.map +0 -1
  152. package/dist/inference/utils.test.cjs +0 -20
  153. package/dist/inference/utils.test.cjs.map +0 -1
  154. package/dist/inference/utils.test.js +0 -19
  155. package/dist/inference/utils.test.js.map +0 -1
  156. package/dist/utils/ws_transport.cjs +0 -51
  157. package/dist/utils/ws_transport.cjs.map +0 -1
  158. package/dist/utils/ws_transport.d.cts +0 -9
  159. package/dist/utils/ws_transport.d.ts +0 -9
  160. package/dist/utils/ws_transport.d.ts.map +0 -1
  161. package/dist/utils/ws_transport.js +0 -17
  162. package/dist/utils/ws_transport.js.map +0 -1
  163. package/dist/utils/ws_transport.test.cjs +0 -212
  164. package/dist/utils/ws_transport.test.cjs.map +0 -1
  165. package/dist/utils/ws_transport.test.js +0 -211
  166. package/dist/utils/ws_transport.test.js.map +0 -1
  167. package/src/inference/interruption/AdaptiveInterruptionDetector.ts +0 -166
  168. package/src/inference/interruption/InterruptionStream.ts +0 -397
  169. package/src/inference/interruption/defaults.ts +0 -33
  170. package/src/inference/interruption/errors.ts +0 -0
  171. package/src/inference/interruption/http_transport.ts +0 -61
  172. package/src/inference/interruption/index.ts +0 -4
  173. package/src/inference/interruption/interruption.ts +0 -88
  174. package/src/inference/utils.test.ts +0 -31
  175. package/src/utils/ws_transport.test.ts +0 -282
  176. package/src/utils/ws_transport.ts +0 -22
package/src/voice/agent_activity.ts
@@ -41,8 +41,6 @@ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js
 import { splitWords } from '../tokenize/basic/word.js';
 import { TTS, type TTSError } from '../tts/tts.js';
 import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
-import type { InterruptionEvent } from '../inference/interruption/interruption.js';
-import { InterruptionEventType } from '../inference/interruption/interruption.js';
 import { VAD, type VADEvent } from '../vad.js';
 import type { Agent, ModelSettings } from './agent.js';
 import { StopResponse, asyncLocalStorage } from './agent.js';
@@ -114,24 +112,6 @@ export class AgentActivity implements RecognitionHooks {
   _mainTask?: Task<void>;
   _userTurnCompletedTask?: Promise<void>;
 
-  /**
-   * Notify that agent started speaking.
-   * This enables interruption detection in AudioRecognition.
-   * @internal
-   */
-  notifyAgentSpeechStarted(): void {
-    this.audioRecognition?.onStartOfAgentSpeech();
-  }
-
-  /**
-   * Notify that agent stopped speaking.
-   * This disables interruption detection in AudioRecognition.
-   * @internal
-   */
-  notifyAgentSpeechEnded(): void {
-    this.audioRecognition?.onEndOfAgentSpeech();
-  }
-
   constructor(agent: Agent, agentSession: AgentSession) {
     this.agent = agent;
     this.agentSession = agentSession;
@@ -312,7 +292,6 @@
       // Disable stt node if stt is not provided
       stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
       vad: this.vad,
-      interruptionDetector: this.agentSession.interruptionDetector,
       turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
       turnDetectionMode: this.turnDetectionMode,
       minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
@@ -718,46 +697,6 @@
     }
   }
 
-  onInterruption(ev: InterruptionEvent): void {
-    if (ev.type !== InterruptionEventType.INTERRUPTION) {
-      // Only handle actual interruptions, not overlap_speech_ended events
-      return;
-    }
-
-    this.logger.info(
-      {
-        probability: ev.probability,
-        detectionDelay: ev.detectionDelay,
-        totalDuration: ev.totalDuration,
-      },
-      'adaptive interruption detected',
-    );
-
-    // Similar to onVADInferenceDone but triggered by the adaptive interruption detector
-    if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') {
-      return;
-    }
-
-    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
-      return;
-    }
-
-    this.realtimeSession?.startUserActivity();
-
-    if (
-      this._currentSpeech &&
-      !this._currentSpeech.interrupted &&
-      this._currentSpeech.allowInterruptions
-    ) {
-      this.logger.info(
-        { 'speech id': this._currentSpeech.id },
-        'speech interrupted by adaptive interruption detector',
-      );
-      this.realtimeSession?.interrupt();
-      this._currentSpeech.interrupt();
-    }
-  }
-
   onInterimTranscript(ev: SpeechEvent): void {
     if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
       // skip stt transcription if userTranscription is enabled on the realtime model
@@ -1411,11 +1350,14 @@
     );
     tasks.push(llmTask);
 
-    const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
-
     let ttsTask: Task<void> | null = null;
     let ttsStream: ReadableStream<AudioFrame> | null = null;
+    let llmOutput: ReadableStream<string>;
+
     if (audioOutput) {
+      // Only tee the stream when we need TTS
+      const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
+      llmOutput = textOutput;
       [ttsTask, ttsStream] = performTTSInference(
         (...args) => this.agent.ttsNode(...args),
         ttsTextInput,
@@ -1423,6 +1365,9 @@
         replyAbortController,
       );
       tasks.push(ttsTask);
+    } else {
+      // No TTS needed, use the stream directly
+      llmOutput = llmGenData.textStream;
     }
 
     await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
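The inline comments above carry the rationale: with WHATWG streams, each branch of a `tee()` buffers chunks until that branch is read, so teeing the LLM text stream while only one branch is ever consumed would retain the entire response in memory. A minimal, self-contained sketch of the pattern (the sample stream, `needsTTS` flag, and `consumeForTTS` helper are illustrative, not part of the package):

```typescript
import { ReadableStream } from 'node:stream/web';

// Stand-in for llmGenData.textStream in the hunk above.
const textStream = new ReadableStream<string>({
  start(controller) {
    for (const chunk of ['Hello', ', ', 'world']) controller.enqueue(chunk);
    controller.close();
  },
});

// Hypothetical second consumer, playing the role of the TTS node.
async function consumeForTTS(stream: ReadableStream<string>): Promise<void> {
  for await (const _chunk of stream) {
    // would feed the TTS engine here
  }
}

const needsTTS = false; // plays the role of `audioOutput`
let llmOutput: ReadableStream<string>;
if (needsTTS) {
  // tee() only when there really are two consumers; each branch buffers independently
  const [ttsTextInput, textOutput] = textStream.tee();
  void consumeForTTS(ttsTextInput);
  llmOutput = textOutput;
} else {
  llmOutput = textStream; // single consumer: pass the stream through untouched
}

for await (const chunk of llmOutput) process.stdout.write(chunk);
```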
@@ -1482,12 +1427,16 @@
     //TODO(AJS-272): before executing tools, make sure we generated all the text
     // (this ensure everything is kept ordered)
 
-    const onToolExecutionStarted = (_: FunctionCall) => {
-      // TODO(brian): handle speech_handle item_added
+    const onToolExecutionStarted = (f: FunctionCall) => {
+      speechHandle._itemAdded([f]);
+      this.agent._chatCtx.items.push(f);
+      this.agentSession._toolItemsAdded([f]);
     };
 
-    const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
-      // TODO(brian): handle speech_handle item_added
+    const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
+      if (out.toolCallOutput) {
+        speechHandle._itemAdded([out.toolCallOutput]);
+      }
     };
 
     const [executeToolsTask, toolOutput] = performToolExecutions({
@@ -1562,6 +1511,7 @@
       });
       chatCtx.insert(message);
       this.agent._chatCtx.insert(message);
+      speechHandle._itemAdded([message]);
       this.agentSession._conversationItemAdded(message);
     }
 
@@ -1589,6 +1539,7 @@
       });
       chatCtx.insert(message);
       this.agent._chatCtx.insert(message);
+      speechHandle._itemAdded([message]);
       this.agentSession._conversationItemAdded(message);
       this.logger.info(
         { speech_id: speechHandle.id, message: textOut.text },
@@ -1673,28 +1624,18 @@
     if (shouldGenerateToolReply) {
       chatCtx.insert(toolMessages);
 
-      const handle = SpeechHandle.create({
-        allowInterruptions: speechHandle.allowInterruptions,
-        stepIndex: speechHandle._stepIndex + 1,
-        parent: speechHandle,
-      });
-      this.agentSession.emit(
-        AgentSessionEventTypes.SpeechCreated,
-        createSpeechCreatedEvent({
-          userInitiated: false,
-          source: 'tool_response',
-          speechHandle: handle,
-        }),
-      );
+      // Increment step count on SAME handle (parity with Python agent_activity.py L2081)
+      speechHandle._numSteps += 1;
 
       // Avoid setting tool_choice to "required" or a specific function when
       // passing tool response back to the LLM
       const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
 
+      // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
       const toolResponseTask = this.createSpeechTask({
         task: Task.from(() =>
           this.pipelineReplyTask(
-            handle,
+            speechHandle,
             chatCtx,
             toolCtx,
             { toolChoice: respondToolChoice },
@@ -1704,13 +1645,13 @@
             toolMessages,
           ),
         ),
-        ownedSpeechHandle: handle,
+        ownedSpeechHandle: speechHandle,
         name: 'AgentActivity.pipelineReply',
       });
 
       toolResponseTask.finally(() => this.onPipelineReplyDone());
 
-      this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
+      this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
     } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
       for (const msg of toolMessages) {
         msg.createdAt = replyStartedAt;
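One consequence of reusing the handle, worth noting for event consumers: a reply that chains tool steps now emits a single `speech_created` event for the whole reply instead of one per step. A hedged sketch of observing this (it assumes `AgentSession` and `AgentSessionEventTypes` are re-exported under the `voice` namespace, which this diff's imports suggest but do not show; the model string is illustrative):

```typescript
import { voice } from '@livekit/agents';

const session = new voice.AgentSession({ llm: 'openai/gpt-4o-mini' });

session.on(voice.AgentSessionEventTypes.SpeechCreated, (ev) => {
  // With the change above, tool-response steps reuse the parent handle,
  // so this fires once per reply rather than once per LLM step.
  console.log('speech created:', ev.speechHandle.id, 'source:', ev.source);
});
```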
package/src/voice/agent_session.ts
@@ -15,7 +15,6 @@ import {
   type STTModelString,
   type TTSModelString,
 } from '../inference/index.js';
-import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
 import { type JobContext, getJobContext } from '../job.js';
 import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
 import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';
@@ -62,6 +61,7 @@ import { RecorderIO } from './recorder_io/index.js';
 import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js';
 import type { UnknownUserData } from './run_context.js';
 import type { SpeechHandle } from './speech_handle.js';
+import { RunResult } from './testing/run_result.js';
 
 export interface VoiceOptions {
   allowInterruptions: boolean;
@@ -107,7 +107,6 @@ export type AgentSessionOptions<UserData = UnknownUserData> = {
   vad?: VAD;
   llm?: LLM | RealtimeModel | LLMModels;
   tts?: TTS | TTSModelString;
-  interruptionDetector?: AdaptiveInterruptionDetector;
   userData?: UserData;
   voiceOptions?: Partial<VoiceOptions>;
   connOptions?: SessionConnectOptions;
@@ -169,7 +168,8 @@ export class AgentSession<
   /** @internal - Timestamp when the session started (milliseconds) */
   _startedAt?: number;
 
-  interruptionDetector?: AdaptiveInterruptionDetector;
+  /** @internal - Current run state for testing */
+  _globalRunState?: RunResult;
 
   constructor(opts: AgentSessionOptions<UserData>) {
     super();
@@ -180,7 +180,6 @@
       llm,
       tts,
       turnDetection,
-      interruptionDetector,
       userData,
       voiceOptions = defaultVoiceOptions,
       connOptions,
@@ -217,7 +216,6 @@
     }
 
     this.turnDetection = turnDetection;
-    this.interruptionDetector = interruptionDetector;
     this._userData = userData;
 
     // configurable IO
@@ -278,7 +276,7 @@
     span,
   }: {
     agent: Agent;
-    room: Room;
+    room?: Room;
     inputOptions?: Partial<RoomInputOptions>;
     outputOptions?: Partial<RoomOutputOptions>;
     span: Span;
@@ -289,41 +287,45 @@
     this._updateAgentState('initializing');
 
     const tasks: Promise<void>[] = [];
-    // Check for existing input/output configuration and warn if needed
-    if (this.input.audio && inputOptions?.audioEnabled !== false) {
-      this.logger.warn('RoomIO audio input is enabled but input.audio is already set, ignoring..');
-    }
 
-    if (this.output.audio && outputOptions?.audioEnabled !== false) {
-      this.logger.warn(
-        'RoomIO audio output is enabled but output.audio is already set, ignoring..',
-      );
-    }
+    if (room && !this.roomIO) {
+      // Check for existing input/output configuration and warn if needed
+      if (this.input.audio && inputOptions?.audioEnabled !== false) {
+        this.logger.warn(
+          'RoomIO audio input is enabled but input.audio is already set, ignoring..',
+        );
+      }
 
-    if (this.output.transcription && outputOptions?.transcriptionEnabled !== false) {
-      this.logger.warn(
-        'RoomIO transcription output is enabled but output.transcription is already set, ignoring..',
-      );
-    }
+      if (this.output.audio && outputOptions?.audioEnabled !== false) {
+        this.logger.warn(
+          'RoomIO audio output is enabled but output.audio is already set, ignoring..',
+        );
+      }
 
-    this.roomIO = new RoomIO({
-      agentSession: this,
-      room,
-      inputOptions,
-      outputOptions,
-    });
-    this.roomIO.start();
+      if (this.output.transcription && outputOptions?.transcriptionEnabled !== false) {
+        this.logger.warn(
+          'RoomIO transcription output is enabled but output.transcription is already set, ignoring..',
+        );
+      }
+
+      this.roomIO = new RoomIO({
+        agentSession: this,
+        room,
+        inputOptions,
+        outputOptions,
+      });
+      this.roomIO.start();
+    }
 
     let ctx: JobContext | undefined = undefined;
     try {
       ctx = getJobContext();
-    } catch (error) {
+    } catch {
       // JobContext is not available in evals
-      this.logger.warn('JobContext is not available');
     }
 
     if (ctx) {
-      if (ctx.room === room && !room.isConnected) {
+      if (room && ctx.room === room && !room.isConnected) {
         this.logger.debug('Auto-connecting to room via job context');
         tasks.push(ctx.connect());
       }
@@ -376,7 +378,7 @@
     record,
   }: {
     agent: Agent;
-    room: Room;
+    room?: Room;
     inputOptions?: Partial<RoomInputOptions>;
     outputOptions?: Partial<RoomOutputOptions>;
     record?: boolean;
@@ -503,13 +505,50 @@
 
     // attach to the session span if called outside of the AgentSession
     const activeSpan = trace.getActiveSpan();
+    let handle: SpeechHandle;
     if (!activeSpan && this.rootSpanContext) {
-      return otelContext.with(this.rootSpanContext, () =>
+      handle = otelContext.with(this.rootSpanContext, () =>
        doGenerateReply(this.activity!, this.nextActivity),
      );
+    } else {
+      handle = doGenerateReply(this.activity!, this.nextActivity);
     }
 
-    return doGenerateReply(this.activity!, this.nextActivity);
+    if (this._globalRunState) {
+      this._globalRunState._watchHandle(handle);
+    }
+
+    return handle;
+  }
+
+  /**
+   * Run a test with user input and return a result for assertions.
+   *
+   * This method is primarily used for testing agent behavior without
+   * requiring a real room connection.
+   *
+   * @example
+   * ```typescript
+   * const result = await session.run({ userInput: 'Hello' });
+   * result.expect.nextEvent().isMessage({ role: 'assistant' });
+   * result.expect.noMoreEvents();
+   * ```
+   *
+   * @param options - Run options including user input
+   * @returns A RunResult that resolves when the agent finishes responding
+   *
+   * TODO: Add outputType parameter for typed outputs (parity with Python)
+   */
+  run(options: { userInput: string }): RunResult {
+    if (this._globalRunState && !this._globalRunState.done()) {
+      throw new Error('nested runs are not supported');
+    }
+
+    const runState = new RunResult({ userInput: options.userInput });
+    this._globalRunState = runState;
+    this.generateReply({ userInput: options.userInput });
+
+    return runState;
   }
 
   private async updateActivity(agent: Agent): Promise<void> {
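Combined with the now-optional `room`, this is enough to exercise an agent end to end in a test. A sketch following the JSDoc example above (the import shape, model string, and agent instructions are assumptions for illustration; only `run()`, the `expect` assertions, and room-less `start()` come from this diff):

```typescript
import { voice } from '@livekit/agents';

const session = new voice.AgentSession({ llm: 'openai/gpt-4o-mini' }); // illustrative model
// No Room argument: with this release, start() simply skips RoomIO setup.
await session.start({ agent: new voice.Agent({ instructions: 'You are a terse assistant.' }) });

// run() guards against nested runs, registers a RunResult as _globalRunState,
// and resolves once the reply's SpeechHandle settles.
const result = await session.run({ userInput: 'Hello' });
result.expect.nextEvent().isMessage({ role: 'assistant' });
result.expect.noMoreEvents();
```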
@@ -643,8 +682,6 @@
       return;
     }
 
-    const oldState = this._agentState;
-
     if (state === 'speaking') {
       // Reset error counts when agent starts speaking
       this.llmErrorCounts = 0;
@@ -659,25 +696,13 @@
         // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
         // (Ref: Python agent_session.py line 1161-1164)
       }
-
-      // Notify AudioRecognition that agent started speaking (for interruption detection)
-      this.activity?.notifyAgentSpeechStarted();
-    } else if (oldState === 'speaking') {
-      // Agent stopped speaking
-      if (this.agentSpeakingSpan !== undefined) {
-        // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
-        this.agentSpeakingSpan.end();
-        this.agentSpeakingSpan = undefined;
-      }
-
-      // Notify AudioRecognition that agent stopped speaking (for interruption detection)
-      this.activity?.notifyAgentSpeechEnded();
     } else if (this.agentSpeakingSpan !== undefined) {
-      // Non-speaking to non-speaking transition but span is still open
+      // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
       this.agentSpeakingSpan.end();
       this.agentSpeakingSpan = undefined;
     }
 
+    const oldState = this._agentState;
     this._agentState = state;
 
     // Handle user away timer based on state changes
package/src/voice/audio_recognition.ts
@@ -5,12 +5,6 @@ import { AudioFrame } from '@livekit/rtc-node';
 import type { Context, Span } from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
-import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
-import {
-  InterruptionStreamBase,
-  InterruptionStreamSentinel,
-} from '../inference/interruption/InterruptionStream.js';
-import type { InterruptionEvent } from '../inference/interruption/interruption.js';
 import { type ChatContext } from '../llm/chat_context.js';
 import { log } from '../log.js';
 import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
@@ -45,7 +39,6 @@
   onFinalTranscript: (ev: SpeechEvent) => void;
   onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
   onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
-  onInterruption: (ev: InterruptionEvent) => void;
 
   retrieveChatCtx: () => ChatContext;
 }
@@ -60,7 +53,6 @@
   recognitionHooks: RecognitionHooks;
   stt?: STTNode;
   vad?: VAD;
-  interruptionDetector?: AdaptiveInterruptionDetector;
   turnDetector?: _TurnDetector;
   turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
   minEndpointingDelay: number;
@@ -96,7 +88,6 @@ export class AudioRecognition {
 
   private vadInputStream: ReadableStream<AudioFrame>;
   private sttInputStream: ReadableStream<AudioFrame>;
-  private interruptionInputStream: ReadableStream<AudioFrame>;
   private silenceAudioTransform = new IdentityTransform<AudioFrame>();
   private silenceAudioWriter: WritableStreamDefaultWriter<AudioFrame>;
 
@@ -105,19 +96,11 @@
   private commitUserTurnTask?: Task<void>;
   private vadTask?: Task<void>;
   private sttTask?: Task<void>;
-  private interruptionTask?: Task<void>;
-
-  // interruption detection
-  private interruptionDetector?: AdaptiveInterruptionDetector;
-  private interruptionStream?: InterruptionStreamBase;
-  private interruptionEnabled = false;
-  private agentSpeaking = false;
 
   constructor(opts: AudioRecognitionOptions) {
     this.hooks = opts.recognitionHooks;
     this.stt = opts.stt;
     this.vad = opts.vad;
-    this.interruptionDetector = opts.interruptionDetector;
     this.turnDetector = opts.turnDetector;
     this.turnDetectionMode = opts.turnDetectionMode;
     this.minEndpointingDelay = opts.minEndpointingDelay;
@@ -125,15 +108,10 @@
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;
 
-    // Interruption detection is only enabled if both detector and VAD are provided
-    this.interruptionEnabled = this.interruptionDetector !== undefined && this.vad !== undefined;
-
     this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
-    const [vadInputStream, rest] = this.deferredInputStream.stream.tee();
-    const [sttInputStream, interruptionInputStream] = rest.tee();
+    const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
     this.vadInputStream = vadInputStream;
     this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable);
-    this.interruptionInputStream = interruptionInputStream;
     this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
   }
 
@@ -157,15 +135,6 @@
     this.sttTask.result.catch((err) => {
       this.logger.error(`Error running STT task: ${err}`);
     });
-
-    if (this.interruptionEnabled && this.interruptionDetector) {
-      this.interruptionTask = Task.from(({ signal }) =>
-        this.createInterruptionTask(this.interruptionDetector!, signal),
-      );
-      this.interruptionTask.result.catch((err) => {
-        this.logger.error(`Error running interruption task: ${err}`);
-      });
-    }
   }
 
   private async onSTTEvent(ev: SpeechEvent) {
@@ -608,11 +577,6 @@
           this.sampleRate = ev.frames[0].sampleRate;
         }
 
-        // If agent is speaking, user speech is overlap - trigger interruption detection
-        if (this.agentSpeaking && this.interruptionEnabled) {
-          this.onStartOfOverlapSpeech(ev.speechDuration, this.userTurnSpan);
-        }
-
         this.bounceEOUTask?.cancel();
         break;
       case VADEventType.INFERENCE_DONE:
@@ -633,11 +597,6 @@
         // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
        this.speaking = false;
 
-        // If we were in overlap speech (agent speaking + user speaking), end it
-        if (this.agentSpeaking && this.interruptionEnabled) {
-          this.onEndOfOverlapSpeech();
-        }
-
        if (
          this.vadBaseTurnDetection ||
          (this.turnDetectionMode === 'stt' && this.userTurnCommitted)
@@ -655,123 +614,6 @@
     }
   }
 
-  private async createInterruptionTask(
-    interruptionDetector: AdaptiveInterruptionDetector,
-    signal: AbortSignal,
-  ) {
-    // Create the interruption stream from the detector
-    this.interruptionStream = interruptionDetector.createStream();
-
-    // Forward audio frames to the interruption stream
-    const reader = this.interruptionInputStream.getReader();
-
-    const forwardTask = (async () => {
-      try {
-        while (!signal.aborted) {
-          const { done, value: frame } = await reader.read();
-          if (done) break;
-          await this.interruptionStream?.pushFrame(frame);
-        }
-      } catch (e) {
-        if (!signal.aborted) {
-          this.logger.error(e, 'Error forwarding audio to interruption stream');
-        }
-      } finally {
-        reader.releaseLock();
-      }
-    })();
-
-    // Read interruption events from the stream
-    const eventStream = this.interruptionStream.stream;
-    const eventReader = eventStream.getReader();
-
-    const abortHandler = () => {
-      eventReader.releaseLock();
-      this.interruptionStream?.close();
-      signal.removeEventListener('abort', abortHandler);
-    };
-    signal.addEventListener('abort', abortHandler);
-
-    try {
-      while (!signal.aborted) {
-        const { done, value: ev } = await eventReader.read();
-        if (done) break;
-
-        this.logger.debug({ type: ev.type, probability: ev.probability }, 'Interruption event');
-        this.hooks.onInterruption(ev);
-      }
-    } catch (e) {
-      if (!signal.aborted) {
-        this.logger.error(e, 'Error in interruption task');
-      }
-    } finally {
-      this.logger.debug('Interruption task closed');
-      await forwardTask;
-    }
-  }
-
-  /**
-   * Called when the agent starts speaking.
-   * Enables interruption detection by sending the agent-speech-started sentinel.
-   */
-  onStartOfAgentSpeech(): void {
-    this.agentSpeaking = true;
-
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      return;
-    }
-
-    this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechStarted());
-  }
-
-  /**
-   * Called when the agent stops speaking.
-   * Disables interruption detection by sending the agent-speech-ended sentinel.
-   */
-  onEndOfAgentSpeech(): void {
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      this.agentSpeaking = false;
-      return;
-    }
-
-    this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechEnded());
-
-    if (this.agentSpeaking) {
-      // No interruption was detected, end the overlap inference (idempotent)
-      this.onEndOfOverlapSpeech();
-    }
-
-    this.agentSpeaking = false;
-  }
-
-  /**
-   * Called when user starts speaking while agent is speaking (overlap speech).
-   * This triggers the interruption detection inference.
-   */
-  onStartOfOverlapSpeech(speechDuration: number, userSpeakingSpan?: Span): void {
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      return;
-    }
-
-    if (this.agentSpeaking && userSpeakingSpan) {
-      this.interruptionStream.pushFrame(
-        InterruptionStreamSentinel.overlapSpeechStarted(speechDuration, userSpeakingSpan),
-      );
-    }
-  }
-
-  /**
-   * Called when user stops speaking during overlap.
-   * This ends the interruption detection inference for this overlap period.
-   */
-  onEndOfOverlapSpeech(): void {
-    if (!this.interruptionEnabled || !this.interruptionStream) {
-      return;
-    }
-
-    this.interruptionStream.pushFrame(InterruptionStreamSentinel.overlapSpeechEnded());
-  }
-
   setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {
     this.deferredInputStream.setSource(audioStream);
   }
@@ -844,8 +686,6 @@
     await this.sttTask?.cancelAndWait();
     await this.vadTask?.cancelAndWait();
     await this.bounceEOUTask?.cancelAndWait();
-    await this.interruptionTask?.cancelAndWait();
-    await this.interruptionStream?.close();
   }
 
   private _endUserTurnSpan({
package/src/voice/index.ts
@@ -10,3 +10,4 @@ export { type TimedString } from './io.js';
 export * from './report.js';
 export * from './room_io/index.js';
 export { RunContext } from './run_context.js';
+export * as testing from './testing/index.js';
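With this export, the test helpers land under a `testing` namespace rather than on the top-level voice surface. A small sketch of reaching them (assuming `voice` itself is re-exported from the package root, as in the existing public API; the constructor shape is taken from the run() implementation above):

```typescript
import { voice } from '@livekit/agents';

// RunResult (defined in run_result.ts above) is also what session.run() returns.
const { RunResult } = voice.testing;

const result = new RunResult({ userInput: 'Hello' });
console.log(result.done()); // a freshly created run has not completed yet
```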
package/src/voice/room_io/room_io.ts
@@ -51,6 +51,7 @@ const DEFAULT_TEXT_INPUT_CALLBACK: TextInputCallback = (sess: AgentSession, ev:
 };
 
 const DEFAULT_PARTICIPANT_KINDS: ParticipantKind[] = [
+  ParticipantKind.CONNECTOR,
   ParticipantKind.SIP,
   ParticipantKind.STANDARD,
 ];