@livekit/agents 1.1.0-dev.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (292)
  1. package/dist/cli.cjs +2 -0
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +2 -0
  5. package/dist/cli.js.map +1 -1
  6. package/dist/constants.cjs +3 -0
  7. package/dist/constants.cjs.map +1 -1
  8. package/dist/constants.d.cts +1 -0
  9. package/dist/constants.d.ts +1 -0
  10. package/dist/constants.d.ts.map +1 -1
  11. package/dist/constants.js +2 -0
  12. package/dist/constants.js.map +1 -1
  13. package/dist/cpu.cjs +189 -0
  14. package/dist/cpu.cjs.map +1 -0
  15. package/dist/cpu.d.cts +24 -0
  16. package/dist/cpu.d.ts +24 -0
  17. package/dist/cpu.d.ts.map +1 -0
  18. package/dist/cpu.js +152 -0
  19. package/dist/cpu.js.map +1 -0
  20. package/dist/cpu.test.cjs +227 -0
  21. package/dist/cpu.test.cjs.map +1 -0
  22. package/dist/cpu.test.js +204 -0
  23. package/dist/cpu.test.js.map +1 -0
  24. package/dist/index.cjs +12 -10
  25. package/dist/index.cjs.map +1 -1
  26. package/dist/index.d.cts +13 -13
  27. package/dist/index.d.ts +13 -13
  28. package/dist/index.d.ts.map +1 -1
  29. package/dist/index.js +11 -10
  30. package/dist/index.js.map +1 -1
  31. package/dist/inference/interruption/defaults.cjs +1 -1
  32. package/dist/inference/interruption/defaults.cjs.map +1 -1
  33. package/dist/inference/interruption/defaults.d.cts +1 -1
  34. package/dist/inference/interruption/defaults.d.ts +1 -1
  35. package/dist/inference/interruption/defaults.d.ts.map +1 -1
  36. package/dist/inference/interruption/defaults.js +1 -1
  37. package/dist/inference/interruption/defaults.js.map +1 -1
  38. package/dist/inference/interruption/http_transport.cjs +44 -28
  39. package/dist/inference/interruption/http_transport.cjs.map +1 -1
  40. package/dist/inference/interruption/http_transport.d.ts.map +1 -1
  41. package/dist/inference/interruption/http_transport.js +45 -29
  42. package/dist/inference/interruption/http_transport.js.map +1 -1
  43. package/dist/inference/interruption/interruption_detector.cjs +22 -5
  44. package/dist/inference/interruption/interruption_detector.cjs.map +1 -1
  45. package/dist/inference/interruption/interruption_detector.d.cts +2 -2
  46. package/dist/inference/interruption/interruption_detector.d.ts +2 -2
  47. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -1
  48. package/dist/inference/interruption/interruption_detector.js +22 -5
  49. package/dist/inference/interruption/interruption_detector.js.map +1 -1
  50. package/dist/inference/interruption/interruption_stream.cjs +4 -4
  51. package/dist/inference/interruption/interruption_stream.cjs.map +1 -1
  52. package/dist/inference/interruption/interruption_stream.js +4 -4
  53. package/dist/inference/interruption/interruption_stream.js.map +1 -1
  54. package/dist/inference/interruption/types.cjs.map +1 -1
  55. package/dist/inference/interruption/types.d.cts +2 -2
  56. package/dist/inference/interruption/types.d.ts +2 -2
  57. package/dist/inference/interruption/types.d.ts.map +1 -1
  58. package/dist/inference/interruption/ws_transport.cjs +60 -47
  59. package/dist/inference/interruption/ws_transport.cjs.map +1 -1
  60. package/dist/inference/interruption/ws_transport.d.ts.map +1 -1
  61. package/dist/inference/interruption/ws_transport.js +60 -47
  62. package/dist/inference/interruption/ws_transport.js.map +1 -1
  63. package/dist/inference/llm.cjs.map +1 -1
  64. package/dist/inference/llm.d.cts +1 -1
  65. package/dist/inference/llm.d.ts +1 -1
  66. package/dist/inference/llm.d.ts.map +1 -1
  67. package/dist/inference/llm.js.map +1 -1
  68. package/dist/inference/stt.cjs +20 -12
  69. package/dist/inference/stt.cjs.map +1 -1
  70. package/dist/inference/stt.d.cts +3 -2
  71. package/dist/inference/stt.d.ts +3 -2
  72. package/dist/inference/stt.d.ts.map +1 -1
  73. package/dist/inference/stt.js +20 -12
  74. package/dist/inference/stt.js.map +1 -1
  75. package/dist/inference/stt.test.cjs +14 -0
  76. package/dist/inference/stt.test.cjs.map +1 -1
  77. package/dist/inference/stt.test.js +14 -0
  78. package/dist/inference/stt.test.js.map +1 -1
  79. package/dist/inference/tts.cjs +13 -4
  80. package/dist/inference/tts.cjs.map +1 -1
  81. package/dist/inference/tts.d.cts +8 -1
  82. package/dist/inference/tts.d.ts +8 -1
  83. package/dist/inference/tts.d.ts.map +1 -1
  84. package/dist/inference/tts.js +13 -4
  85. package/dist/inference/tts.js.map +1 -1
  86. package/dist/inference/tts.test.cjs +10 -0
  87. package/dist/inference/tts.test.cjs.map +1 -1
  88. package/dist/inference/tts.test.js +10 -0
  89. package/dist/inference/tts.test.js.map +1 -1
  90. package/dist/ipc/job_proc_lazy_main.cjs +41 -23
  91. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  92. package/dist/ipc/job_proc_lazy_main.js +41 -23
  93. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  94. package/dist/job.cjs +1 -1
  95. package/dist/job.cjs.map +1 -1
  96. package/dist/job.js +1 -1
  97. package/dist/job.js.map +1 -1
  98. package/dist/language.cjs +394 -0
  99. package/dist/language.cjs.map +1 -0
  100. package/dist/language.d.cts +15 -0
  101. package/dist/language.d.ts +15 -0
  102. package/dist/language.d.ts.map +1 -0
  103. package/dist/language.js +363 -0
  104. package/dist/language.js.map +1 -0
  105. package/dist/language.test.cjs +43 -0
  106. package/dist/language.test.cjs.map +1 -0
  107. package/dist/language.test.js +49 -0
  108. package/dist/language.test.js.map +1 -0
  109. package/dist/llm/index.cjs +2 -0
  110. package/dist/llm/index.cjs.map +1 -1
  111. package/dist/llm/index.d.cts +1 -1
  112. package/dist/llm/index.d.ts +1 -1
  113. package/dist/llm/index.d.ts.map +1 -1
  114. package/dist/llm/index.js +2 -0
  115. package/dist/llm/index.js.map +1 -1
  116. package/dist/stream/deferred_stream.cjs +6 -2
  117. package/dist/stream/deferred_stream.cjs.map +1 -1
  118. package/dist/stream/deferred_stream.d.ts.map +1 -1
  119. package/dist/stream/deferred_stream.js +6 -2
  120. package/dist/stream/deferred_stream.js.map +1 -1
  121. package/dist/stt/stt.cjs.map +1 -1
  122. package/dist/stt/stt.d.cts +2 -1
  123. package/dist/stt/stt.d.ts +2 -1
  124. package/dist/stt/stt.d.ts.map +1 -1
  125. package/dist/stt/stt.js.map +1 -1
  126. package/dist/utils.cjs +15 -0
  127. package/dist/utils.cjs.map +1 -1
  128. package/dist/utils.d.cts +8 -0
  129. package/dist/utils.d.ts +8 -0
  130. package/dist/utils.d.ts.map +1 -1
  131. package/dist/utils.js +13 -0
  132. package/dist/utils.js.map +1 -1
  133. package/dist/version.cjs +1 -1
  134. package/dist/version.js +1 -1
  135. package/dist/voice/agent.cjs +14 -17
  136. package/dist/voice/agent.cjs.map +1 -1
  137. package/dist/voice/agent.d.cts +10 -11
  138. package/dist/voice/agent.d.ts +10 -11
  139. package/dist/voice/agent.d.ts.map +1 -1
  140. package/dist/voice/agent.js +15 -18
  141. package/dist/voice/agent.js.map +1 -1
  142. package/dist/voice/agent.test.cjs +194 -0
  143. package/dist/voice/agent.test.cjs.map +1 -1
  144. package/dist/voice/agent.test.js +195 -1
  145. package/dist/voice/agent.test.js.map +1 -1
  146. package/dist/voice/agent_activity.cjs +116 -39
  147. package/dist/voice/agent_activity.cjs.map +1 -1
  148. package/dist/voice/agent_activity.d.cts +2 -0
  149. package/dist/voice/agent_activity.d.ts +2 -0
  150. package/dist/voice/agent_activity.d.ts.map +1 -1
  151. package/dist/voice/agent_activity.js +117 -40
  152. package/dist/voice/agent_activity.js.map +1 -1
  153. package/dist/voice/agent_activity.test.cjs +135 -0
  154. package/dist/voice/agent_activity.test.cjs.map +1 -0
  155. package/dist/voice/agent_activity.test.js +134 -0
  156. package/dist/voice/agent_activity.test.js.map +1 -0
  157. package/dist/voice/agent_session.cjs +38 -38
  158. package/dist/voice/agent_session.cjs.map +1 -1
  159. package/dist/voice/agent_session.d.cts +65 -56
  160. package/dist/voice/agent_session.d.ts +65 -56
  161. package/dist/voice/agent_session.d.ts.map +1 -1
  162. package/dist/voice/agent_session.js +37 -37
  163. package/dist/voice/agent_session.js.map +1 -1
  164. package/dist/voice/audio_recognition.cjs +106 -52
  165. package/dist/voice/audio_recognition.cjs.map +1 -1
  166. package/dist/voice/audio_recognition.d.cts +4 -2
  167. package/dist/voice/audio_recognition.d.ts +4 -2
  168. package/dist/voice/audio_recognition.d.ts.map +1 -1
  169. package/dist/voice/audio_recognition.js +106 -52
  170. package/dist/voice/audio_recognition.js.map +1 -1
  171. package/dist/voice/audio_recognition_span.test.cjs +84 -22
  172. package/dist/voice/audio_recognition_span.test.cjs.map +1 -1
  173. package/dist/voice/audio_recognition_span.test.js +90 -23
  174. package/dist/voice/audio_recognition_span.test.js.map +1 -1
  175. package/dist/voice/events.cjs +1 -1
  176. package/dist/voice/events.cjs.map +1 -1
  177. package/dist/voice/events.d.cts +4 -3
  178. package/dist/voice/events.d.ts +4 -3
  179. package/dist/voice/events.d.ts.map +1 -1
  180. package/dist/voice/events.js +1 -1
  181. package/dist/voice/events.js.map +1 -1
  182. package/dist/voice/index.cjs +9 -1
  183. package/dist/voice/index.cjs.map +1 -1
  184. package/dist/voice/index.d.cts +1 -1
  185. package/dist/voice/index.d.ts +1 -1
  186. package/dist/voice/index.d.ts.map +1 -1
  187. package/dist/voice/index.js +10 -1
  188. package/dist/voice/index.js.map +1 -1
  189. package/dist/voice/remote_session.cjs +922 -0
  190. package/dist/voice/remote_session.cjs.map +1 -0
  191. package/dist/voice/remote_session.d.cts +108 -0
  192. package/dist/voice/remote_session.d.ts +108 -0
  193. package/dist/voice/remote_session.d.ts.map +1 -0
  194. package/dist/voice/remote_session.js +887 -0
  195. package/dist/voice/remote_session.js.map +1 -0
  196. package/dist/voice/report.cjs +11 -10
  197. package/dist/voice/report.cjs.map +1 -1
  198. package/dist/voice/report.d.cts +5 -3
  199. package/dist/voice/report.d.ts +5 -3
  200. package/dist/voice/report.d.ts.map +1 -1
  201. package/dist/voice/report.js +11 -10
  202. package/dist/voice/report.js.map +1 -1
  203. package/dist/voice/report.test.cjs +15 -0
  204. package/dist/voice/report.test.cjs.map +1 -1
  205. package/dist/voice/report.test.js +15 -0
  206. package/dist/voice/report.test.js.map +1 -1
  207. package/dist/voice/room_io/room_io.cjs +39 -0
  208. package/dist/voice/room_io/room_io.cjs.map +1 -1
  209. package/dist/voice/room_io/room_io.d.cts +3 -1
  210. package/dist/voice/room_io/room_io.d.ts +3 -1
  211. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  212. package/dist/voice/room_io/room_io.js +40 -1
  213. package/dist/voice/room_io/room_io.js.map +1 -1
  214. package/dist/voice/turn_config/interruption.cjs.map +1 -1
  215. package/dist/voice/turn_config/interruption.d.cts +1 -1
  216. package/dist/voice/turn_config/interruption.d.ts +1 -1
  217. package/dist/voice/turn_config/interruption.d.ts.map +1 -1
  218. package/dist/voice/turn_config/interruption.js.map +1 -1
  219. package/dist/voice/turn_config/utils.cjs +95 -35
  220. package/dist/voice/turn_config/utils.cjs.map +1 -1
  221. package/dist/voice/turn_config/utils.d.cts +17 -5
  222. package/dist/voice/turn_config/utils.d.ts +17 -5
  223. package/dist/voice/turn_config/utils.d.ts.map +1 -1
  224. package/dist/voice/turn_config/utils.js +93 -35
  225. package/dist/voice/turn_config/utils.js.map +1 -1
  226. package/dist/voice/turn_config/utils.test.cjs +83 -41
  227. package/dist/voice/turn_config/utils.test.cjs.map +1 -1
  228. package/dist/voice/turn_config/utils.test.js +84 -42
  229. package/dist/voice/turn_config/utils.test.js.map +1 -1
  230. package/dist/worker.cjs +6 -29
  231. package/dist/worker.cjs.map +1 -1
  232. package/dist/worker.d.ts.map +1 -1
  233. package/dist/worker.js +6 -19
  234. package/dist/worker.js.map +1 -1
  235. package/package.json +3 -2
  236. package/src/cli.ts +2 -0
  237. package/src/constants.ts +1 -0
  238. package/src/cpu.test.ts +239 -0
  239. package/src/cpu.ts +173 -0
  240. package/src/index.ts +13 -15
  241. package/src/inference/interruption/defaults.ts +1 -1
  242. package/src/inference/interruption/http_transport.ts +49 -30
  243. package/src/inference/interruption/interruption_detector.ts +22 -6
  244. package/src/inference/interruption/interruption_stream.ts +4 -4
  245. package/src/inference/interruption/types.ts +2 -2
  246. package/src/inference/interruption/ws_transport.ts +63 -59
  247. package/src/inference/llm.ts +3 -1
  248. package/src/inference/stt.test.ts +17 -0
  249. package/src/inference/stt.ts +22 -14
  250. package/src/inference/tts.test.ts +12 -0
  251. package/src/inference/tts.ts +22 -6
  252. package/src/ipc/job_proc_lazy_main.ts +44 -24
  253. package/src/job.ts +1 -1
  254. package/src/language.test.ts +62 -0
  255. package/src/language.ts +380 -0
  256. package/src/llm/index.ts +2 -0
  257. package/src/stream/deferred_stream.ts +5 -1
  258. package/src/stt/stt.ts +2 -1
  259. package/src/utils.ts +20 -0
  260. package/src/voice/agent.test.ts +208 -1
  261. package/src/voice/agent.ts +21 -22
  262. package/src/voice/agent_activity.test.ts +194 -0
  263. package/src/voice/agent_activity.ts +161 -43
  264. package/src/voice/agent_session.ts +103 -92
  265. package/src/voice/audio_recognition.ts +124 -61
  266. package/src/voice/audio_recognition_span.test.ts +115 -35
  267. package/src/voice/events.ts +4 -3
  268. package/src/voice/index.ts +10 -1
  269. package/src/voice/remote_session.ts +1083 -0
  270. package/src/voice/report.test.ts +22 -3
  271. package/src/voice/report.ts +31 -14
  272. package/src/voice/room_io/room_io.ts +52 -2
  273. package/src/voice/turn_config/interruption.ts +1 -1
  274. package/src/voice/turn_config/utils.test.ts +91 -43
  275. package/src/voice/turn_config/utils.ts +120 -56
  276. package/src/worker.ts +34 -50
  277. package/dist/voice/client_events.cjs +0 -554
  278. package/dist/voice/client_events.cjs.map +0 -1
  279. package/dist/voice/client_events.d.cts +0 -195
  280. package/dist/voice/client_events.d.ts +0 -195
  281. package/dist/voice/client_events.d.ts.map +0 -1
  282. package/dist/voice/client_events.js +0 -548
  283. package/dist/voice/client_events.js.map +0 -1
  284. package/dist/voice/wire_format.cjs +0 -798
  285. package/dist/voice/wire_format.cjs.map +0 -1
  286. package/dist/voice/wire_format.d.cts +0 -5503
  287. package/dist/voice/wire_format.d.ts +0 -5503
  288. package/dist/voice/wire_format.d.ts.map +0 -1
  289. package/dist/voice/wire_format.js +0 -728
  290. package/dist/voice/wire_format.js.map +0 -1
  291. package/src/voice/client_events.ts +0 -838
  292. package/src/voice/wire_format.ts +0 -827
@@ -45,7 +45,7 @@ import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
45
45
  import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
46
46
  import { splitWords } from '../tokenize/basic/word.js';
47
47
  import { TTS, type TTSError } from '../tts/tts.js';
48
- import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
48
+ import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from '../utils.js';
49
49
  import { VAD, type VADEvent } from '../vad.js';
50
50
  import type { Agent, ModelSettings } from './agent.js';
51
51
  import {
@@ -152,10 +152,11 @@ export class AgentActivity implements RecognitionHooks {
152
152
  this.onError(ev);
153
153
 
154
154
  private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
155
- this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
155
+ this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
156
156
  };
157
157
 
158
158
  private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
159
+ this.agentSession._usageCollector.collect(ev);
159
160
  this.agentSession.emit(
160
161
  AgentSessionEventTypes.MetricsCollected,
161
162
  createMetricsCollectedEvent({ metrics: ev }),
@@ -165,6 +166,13 @@ export class AgentActivity implements RecognitionHooks {
165
166
  private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
166
167
  const errorEvent = createErrorEvent(ev, this.interruptionDetector);
167
168
  this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
169
+
170
+ if (!ev.recoverable) {
171
+ this.agentSession._onError(ev);
172
+ this.fallbackToVadInterruption();
173
+ return;
174
+ }
175
+
168
176
  this.agentSession._onError(ev);
169
177
  };
170
178
 
@@ -390,8 +398,12 @@ export class AgentActivity implements RecognitionHooks {
390
398
  turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
391
399
  turnDetectionMode: this.turnDetectionMode,
392
400
  interruptionDetection: this.interruptionDetector,
393
- minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
394
- maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
401
+ minEndpointingDelay:
402
+ this.agent.turnHandling?.endpointing?.minDelay ??
403
+ this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
404
+ maxEndpointingDelay:
405
+ this.agent.turnHandling?.endpointing?.maxDelay ??
406
+ this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
395
407
  rootSpanContext: this.agentSession.rootSpanContext,
396
408
  sttModel: this.stt?.label,
397
409
  sttProvider: this.getSttProvider(),
@@ -464,8 +476,10 @@ export class AgentActivity implements RecognitionHooks {
464
476
  }
465
477
 
466
478
  get allowInterruptions(): boolean {
467
- // TODO(AJS-51): Allow options to be defined in Agent class
468
- return this.agentSession.options.turnHandling.interruption?.mode !== false;
479
+ return (
480
+ this.agent.turnHandling?.interruption?.enabled ??
481
+ this.agentSession.sessionOptions.turnHandling.interruption.enabled
482
+ );
469
483
  }
470
484
 
471
485
  get useTtsAlignedTranscript(): boolean {
@@ -474,10 +488,27 @@ export class AgentActivity implements RecognitionHooks {
474
488
  }
475
489
 
476
490
  get turnDetection(): TurnDetectionMode | undefined {
477
- // TODO(brian): prioritize using agent.turn_detection
478
- return this.agentSession.turnDetection;
491
+ return this.agent.turnHandling?.turnDetection ?? this.agentSession.turnDetection;
492
+ }
493
+
494
+ get turnHandling() {
495
+ return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
479
496
  }
480
497
 
498
+ // get minEndpointingDelay(): number {
499
+ // return (
500
+ // this.agent.turnHandling?.endpointing?.minDelay ??
501
+ // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
502
+ // );
503
+ // }
504
+
505
+ // get maxEndpointingDelay(): number {
506
+ // return (
507
+ // this.agent.turnHandling?.endpointing?.maxDelay ??
508
+ // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
509
+ // );
510
+ // }
511
+
481
512
  get toolCtx(): ToolContext {
482
513
  return this.agent.toolCtx;
483
514
  }
@@ -569,16 +600,21 @@ export class AgentActivity implements RecognitionHooks {
569
600
  });
570
601
 
571
602
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
572
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
573
- .pipeThrough(aecWarmupAudioFilter)
574
- .tee();
575
603
 
576
- if (this.realtimeSession) {
604
+ if (this.realtimeSession && this.audioRecognition) {
605
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
606
+ .pipeThrough(aecWarmupAudioFilter)
607
+ .tee();
577
608
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
578
- }
579
-
580
- if (this.audioRecognition) {
581
609
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
610
+ } else if (this.realtimeSession) {
611
+ this.realtimeSession.setInputAudioStream(
612
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
613
+ );
614
+ } else if (this.audioRecognition) {
615
+ this.audioRecognition.setInputAudioStream(
616
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
617
+ );
582
618
  }
583
619
  }
584
620
 
@@ -693,6 +729,8 @@ export class AgentActivity implements RecognitionHooks {
693
729
  }
694
730
  }
695
731
 
732
+ this.agentSession._usageCollector.collect(ev);
733
+
696
734
  this.agentSession.emit(
697
735
  AgentSessionEventTypes.MetricsCollected,
698
736
  createMetricsCollectedEvent({ metrics: ev }),
@@ -828,7 +866,10 @@ export class AgentActivity implements RecognitionHooks {
828
866
  // Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
829
867
  speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
830
868
  }
831
- this.agentSession._updateUserState('speaking', speechStartTime);
869
+ this.agentSession._updateUserState('speaking', {
870
+ lastSpeakingTime: speechStartTime,
871
+ otelContext: otelContext.active(),
872
+ });
832
873
  if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
833
874
  // Pass speechStartTime as the absolute startedAt timestamp.
834
875
  this.audioRecognition.onStartOfOverlapSpeech(
@@ -852,7 +893,10 @@ export class AgentActivity implements RecognitionHooks {
852
893
  this.agentSession._userSpeakingSpan,
853
894
  );
854
895
  }
855
- this.agentSession._updateUserState('listening', speechEndTime);
896
+ this.agentSession._updateUserState('listening', {
897
+ lastSpeakingTime: speechEndTime,
898
+ otelContext: otelContext.active(),
899
+ });
856
900
  }
857
901
 
858
902
  onVADInferenceDone(ev: VADEvent): void {
@@ -861,7 +905,9 @@ export class AgentActivity implements RecognitionHooks {
861
905
  return;
862
906
  }
863
907
 
864
- if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) {
908
+ if (
909
+ ev.speechDuration >= this.agentSession.sessionOptions.turnHandling.interruption?.minDuration
910
+ ) {
865
911
  this.interruptByAudioActivity();
866
912
  }
867
913
  }
@@ -887,7 +933,7 @@ export class AgentActivity implements RecognitionHooks {
887
933
  // - This ensures consistent behavior across all interruption scenarios
888
934
  if (
889
935
  this.stt &&
890
- this.agentSession.options.turnHandling.interruption?.minWords > 0 &&
936
+ this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 &&
891
937
  this.audioRecognition
892
938
  ) {
893
939
  const text = this.audioRecognition.currentTranscript;
@@ -899,7 +945,7 @@ export class AgentActivity implements RecognitionHooks {
899
945
 
900
946
  // Only allow interruption if word count meets or exceeds minInterruptionWords
901
947
  // This applies to all cases: empty strings, partial speech, and full speech
902
- if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
948
+ if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
903
949
  return;
904
950
  }
905
951
  }
@@ -924,7 +970,7 @@ export class AgentActivity implements RecognitionHooks {
924
970
  this.restoreInterruptionByAudioActivity();
925
971
  this.interruptByAudioActivity();
926
972
  if (this.audioRecognition) {
927
- this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
973
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
928
974
  }
929
975
  }
930
976
 
@@ -982,7 +1028,7 @@ export class AgentActivity implements RecognitionHooks {
982
1028
 
983
1029
  onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
984
1030
  if (
985
- !this.agentSession.options.preemptiveGeneration ||
1031
+ !this.agentSession.sessionOptions.preemptiveGeneration ||
986
1032
  this.schedulingPaused ||
987
1033
  (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
988
1034
  !(this.llm instanceof LLM)
@@ -1099,16 +1145,17 @@ export class AgentActivity implements RecognitionHooks {
1099
1145
  this._currentSpeech &&
1100
1146
  this._currentSpeech.allowInterruptions &&
1101
1147
  !this._currentSpeech.interrupted &&
1102
- this.agentSession.options.turnHandling.interruption?.minWords > 0
1148
+ this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0
1103
1149
  ) {
1104
1150
  const wordCount = splitWords(info.newTranscript, true).length;
1105
- if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
1151
+ if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
1106
1152
  // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
1107
1153
  this.cancelPreemptiveGeneration();
1108
1154
  this.logger.info(
1109
1155
  {
1110
1156
  wordCount,
1111
- minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords,
1157
+ minInterruptionWords:
1158
+ this.agentSession.sessionOptions.turnHandling.interruption.minWords,
1112
1159
  },
1113
1160
  'skipping user input, word count below minimum interruption threshold',
1114
1161
  );
@@ -1148,9 +1195,19 @@ export class AgentActivity implements RecognitionHooks {
1148
1195
  throw new Error('Speech queue is empty');
1149
1196
  }
1150
1197
  const speechHandle = heapItem[2];
1198
+
1199
+ // Skip speech handles that were already interrupted/done before being
1200
+ // picked up from the queue (e.g. interrupted during shutdown before the
1201
+ // main loop had a chance to process them). Calling _authorizeGeneration
1202
+ // on a done handle would create a generation Future that nobody resolves,
1203
+ // causing the main loop to hang forever.
1204
+ if (speechHandle.interrupted || speechHandle.done()) {
1205
+ continue;
1206
+ }
1207
+
1151
1208
  this._currentSpeech = speechHandle;
1152
1209
  speechHandle._authorizeGeneration();
1153
- await speechHandle._waitForGeneration();
1210
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
1154
1211
  this._currentSpeech = undefined;
1155
1212
  }
1156
1213
 
@@ -1344,7 +1401,24 @@ export class AgentActivity implements RecognitionHooks {
1344
1401
 
1345
1402
  this.realtimeSession?.interrupt();
1346
1403
 
1347
- if (currentSpeech === undefined) {
1404
+ if (force) {
1405
+ // Force-interrupt (used during shutdown): cancel all speech tasks so they
1406
+ // don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
1407
+ // when the room is disconnected). Mark the current speech as done immediately
1408
+ // so the interrupt future resolves without waiting for tasks to finish.
1409
+ // Clear the queue so mainTask doesn't dequeue already-interrupted handles
1410
+ // and hang on _waitForGeneration() (the generation future created by
1411
+ // _authorizeGeneration would never resolve since _markDone is a no-op
1412
+ // once doneFut is already settled).
1413
+ for (const task of this.speechTasks) {
1414
+ task.cancel();
1415
+ }
1416
+ if (currentSpeech && !currentSpeech.done()) {
1417
+ currentSpeech._markDone();
1418
+ }
1419
+ this.speechQueue.clear();
1420
+ future.resolve();
1421
+ } else if (currentSpeech === undefined) {
1348
1422
  future.resolve();
1349
1423
  } else {
1350
1424
  currentSpeech.addDoneCallback(() => {
@@ -1942,9 +2016,7 @@ export class AgentActivity implements RecognitionHooks {
1942
2016
  }
1943
2017
 
1944
2018
  replyAbortController.abort();
1945
- await Promise.allSettled(
1946
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
1947
- );
2019
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1948
2020
 
1949
2021
  let forwardedText = textOut?.text || '';
1950
2022
 
@@ -2038,7 +2110,7 @@ export class AgentActivity implements RecognitionHooks {
2038
2110
  if (toolOutput.output.length === 0) return;
2039
2111
 
2040
2112
  // important: no agent output should be used after this point
2041
- const { maxToolSteps } = this.agentSession.options;
2113
+ const { maxToolSteps } = this.agentSession.sessionOptions;
2042
2114
  if (speechHandle.numSteps >= maxToolSteps) {
2043
2115
  this.logger.warn(
2044
2116
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -2505,7 +2577,7 @@ export class AgentActivity implements RecognitionHooks {
2505
2577
  }
2506
2578
 
2507
2579
  // important: no agent output should be used after this point
2508
- const { maxToolSteps } = this.agentSession.options;
2580
+ const { maxToolSteps } = this.agentSession.sessionOptions;
2509
2581
  if (speechHandle.numSteps >= maxToolSteps) {
2510
2582
  this.logger.warn(
2511
2583
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -2793,16 +2865,20 @@ export class AgentActivity implements RecognitionHooks {
2793
2865
  const unlock = await this.lock.lock();
2794
2866
  try {
2795
2867
  this.cancelPreemptiveGeneration();
2868
+
2869
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2870
+
2871
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2872
+ this._currentSpeech._markDone();
2873
+ }
2874
+
2796
2875
  await this._closeSessionResources();
2797
2876
 
2798
2877
  if (this._mainTask) {
2799
2878
  await this._mainTask.cancelAndWait();
2800
2879
  }
2801
2880
  if (this.interruptionDetector) {
2802
- this.interruptionDetector.off(
2803
- 'user_overlapping_speech',
2804
- this.onInterruptionOverlappingSpeech,
2805
- );
2881
+ this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
2806
2882
  this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2807
2883
  this.interruptionDetector.off('error', this.onInterruptionError);
2808
2884
  }
@@ -2814,8 +2890,8 @@ export class AgentActivity implements RecognitionHooks {
2814
2890
  }
2815
2891
 
2816
2892
  private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
2817
- const interruptionDetection =
2818
- this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2893
+ const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode;
2894
+ const sessionInterruptionDetection = this.agentSession.interruptionDetection;
2819
2895
  if (
2820
2896
  !(
2821
2897
  this.stt &&
@@ -2827,25 +2903,43 @@ export class AgentActivity implements RecognitionHooks {
2827
2903
  !(this.llm instanceof RealtimeModel)
2828
2904
  )
2829
2905
  ) {
2830
- if (interruptionDetection === 'adaptive') {
2906
+ if (
2907
+ agentInterruptionDetection === 'adaptive' ||
2908
+ sessionInterruptionDetection === 'adaptive'
2909
+ ) {
2831
2910
  this.logger.warn(
2832
2911
  "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
2833
2912
  );
2834
- return undefined;
2835
2913
  }
2914
+ return undefined;
2915
+ }
2916
+
2917
+ if (!this.allowInterruptions) {
2918
+ return undefined;
2919
+ }
2920
+
2921
+ if (agentInterruptionDetection === 'vad') {
2922
+ return undefined;
2923
+ }
2924
+
2925
+ if (sessionInterruptionDetection === 'vad') {
2926
+ return undefined;
2836
2927
  }
2837
2928
 
2838
2929
  if (
2839
- (interruptionDetection !== undefined && interruptionDetection === false) ||
2840
- interruptionDetection === 'vad'
2930
+ agentInterruptionDetection === undefined &&
2931
+ sessionInterruptionDetection === undefined &&
2932
+ !isHosted() &&
2933
+ !isDevMode()
2841
2934
  ) {
2935
+ this.logger.info('adaptive interruption is disabled by default in production mode');
2842
2936
  return undefined;
2843
2937
  }
2844
2938
 
2845
2939
  try {
2846
2940
  const detector = new AdaptiveInterruptionDetector();
2847
2941
 
2848
- detector.on('user_overlapping_speech', this.onInterruptionOverlappingSpeech);
2942
+ detector.on('overlapping_speech', this.onInterruptionOverlappingSpeech);
2849
2943
  detector.on('metrics_collected', this.onInterruptionMetricsCollected);
2850
2944
  detector.on('error', this.onInterruptionError);
2851
2945
 
@@ -2860,6 +2954,30 @@ export class AgentActivity implements RecognitionHooks {
2860
2954
  this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2861
2955
  }
2862
2956
 
2957
+ private fallbackToVadInterruption(): void {
2958
+ if (!this.isInterruptionDetectionEnabled) return;
2959
+
2960
+ this.isInterruptionDetectionEnabled = false;
2961
+ this.restoreInterruptionByAudioActivity();
2962
+
2963
+ if (this.interruptionDetector) {
2964
+ this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
2965
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2966
+ this.interruptionDetector.off('error', this.onInterruptionError);
2967
+ this.interruptionDetector = undefined;
2968
+ }
2969
+
2970
+ if (this.audioRecognition) {
2971
+ this.audioRecognition.disableInterruptionDetection().catch((err) => {
2972
+ this.logger.warn({ err }, 'error while disabling interruption detection');
2973
+ });
2974
+ }
2975
+
2976
+ this.logger.warn(
2977
+ 'adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption',
2978
+ );
2979
+ }
2980
+
2863
2981
  private async _closeSessionResources(): Promise<void> {
2864
2982
  // Unregister event handlers to prevent duplicate metrics
2865
2983
  if (this.llm instanceof LLM) {