@livekit/agents 1.0.48 → 1.1.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373)
  1. package/dist/constants.cjs +27 -0
  2. package/dist/constants.cjs.map +1 -1
  3. package/dist/constants.d.cts +9 -0
  4. package/dist/constants.d.ts +9 -0
  5. package/dist/constants.d.ts.map +1 -1
  6. package/dist/constants.js +18 -0
  7. package/dist/constants.js.map +1 -1
  8. package/dist/inference/api_protos.d.cts +71 -71
  9. package/dist/inference/api_protos.d.ts +71 -71
  10. package/dist/inference/interruption/defaults.cjs +81 -0
  11. package/dist/inference/interruption/defaults.cjs.map +1 -0
  12. package/dist/inference/interruption/defaults.d.cts +19 -0
  13. package/dist/inference/interruption/defaults.d.ts +19 -0
  14. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  15. package/dist/inference/interruption/defaults.js +46 -0
  16. package/dist/inference/interruption/defaults.js.map +1 -0
  17. package/dist/inference/interruption/errors.cjs +44 -0
  18. package/dist/inference/interruption/errors.cjs.map +1 -0
  19. package/dist/inference/interruption/errors.d.cts +12 -0
  20. package/dist/inference/interruption/errors.d.ts +12 -0
  21. package/dist/inference/interruption/errors.d.ts.map +1 -0
  22. package/dist/inference/interruption/errors.js +20 -0
  23. package/dist/inference/interruption/errors.js.map +1 -0
  24. package/dist/inference/interruption/http_transport.cjs +147 -0
  25. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  26. package/dist/inference/interruption/http_transport.d.cts +63 -0
  27. package/dist/inference/interruption/http_transport.d.ts +63 -0
  28. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  29. package/dist/inference/interruption/http_transport.js +121 -0
  30. package/dist/inference/interruption/http_transport.js.map +1 -0
  31. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  32. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  33. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  34. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  35. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  36. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  37. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  38. package/dist/inference/interruption/interruption_detector.cjs +181 -0
  39. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  40. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  41. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  42. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  43. package/dist/inference/interruption/interruption_detector.js +147 -0
  44. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  45. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  46. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  47. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  48. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  49. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  50. package/dist/inference/interruption/interruption_stream.js +344 -0
  51. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  52. package/dist/inference/interruption/types.cjs +17 -0
  53. package/dist/inference/interruption/types.cjs.map +1 -0
  54. package/dist/inference/interruption/types.d.cts +66 -0
  55. package/dist/inference/interruption/types.d.ts +66 -0
  56. package/dist/inference/interruption/types.d.ts.map +1 -0
  57. package/dist/inference/interruption/types.js +1 -0
  58. package/dist/inference/interruption/types.js.map +1 -0
  59. package/dist/inference/interruption/utils.cjs +130 -0
  60. package/dist/inference/interruption/utils.cjs.map +1 -0
  61. package/dist/inference/interruption/utils.d.cts +41 -0
  62. package/dist/inference/interruption/utils.d.ts +41 -0
  63. package/dist/inference/interruption/utils.d.ts.map +1 -0
  64. package/dist/inference/interruption/utils.js +105 -0
  65. package/dist/inference/interruption/utils.js.map +1 -0
  66. package/dist/inference/interruption/utils.test.cjs +105 -0
  67. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  68. package/dist/inference/interruption/utils.test.js +104 -0
  69. package/dist/inference/interruption/utils.test.js.map +1 -0
  70. package/dist/inference/interruption/ws_transport.cjs +329 -0
  71. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  72. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  73. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  74. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  75. package/dist/inference/interruption/ws_transport.js +295 -0
  76. package/dist/inference/interruption/ws_transport.js.map +1 -0
  77. package/dist/inference/llm.cjs +14 -10
  78. package/dist/inference/llm.cjs.map +1 -1
  79. package/dist/inference/llm.d.cts +2 -1
  80. package/dist/inference/llm.d.ts +2 -1
  81. package/dist/inference/llm.d.ts.map +1 -1
  82. package/dist/inference/llm.js +8 -10
  83. package/dist/inference/llm.js.map +1 -1
  84. package/dist/inference/stt.cjs +7 -2
  85. package/dist/inference/stt.cjs.map +1 -1
  86. package/dist/inference/stt.d.cts +2 -0
  87. package/dist/inference/stt.d.ts +2 -0
  88. package/dist/inference/stt.d.ts.map +1 -1
  89. package/dist/inference/stt.js +8 -3
  90. package/dist/inference/stt.js.map +1 -1
  91. package/dist/inference/tts.cjs +7 -2
  92. package/dist/inference/tts.cjs.map +1 -1
  93. package/dist/inference/tts.d.cts +2 -0
  94. package/dist/inference/tts.d.ts +2 -0
  95. package/dist/inference/tts.d.ts.map +1 -1
  96. package/dist/inference/tts.js +8 -3
  97. package/dist/inference/tts.js.map +1 -1
  98. package/dist/inference/utils.cjs +26 -7
  99. package/dist/inference/utils.cjs.map +1 -1
  100. package/dist/inference/utils.d.cts +13 -0
  101. package/dist/inference/utils.d.ts +13 -0
  102. package/dist/inference/utils.d.ts.map +1 -1
  103. package/dist/inference/utils.js +18 -2
  104. package/dist/inference/utils.js.map +1 -1
  105. package/dist/llm/chat_context.cjs +20 -2
  106. package/dist/llm/chat_context.cjs.map +1 -1
  107. package/dist/llm/chat_context.d.cts +19 -1
  108. package/dist/llm/chat_context.d.ts +19 -1
  109. package/dist/llm/chat_context.d.ts.map +1 -1
  110. package/dist/llm/chat_context.js +20 -2
  111. package/dist/llm/chat_context.js.map +1 -1
  112. package/dist/llm/index.cjs.map +1 -1
  113. package/dist/llm/index.d.cts +1 -1
  114. package/dist/llm/index.d.ts +1 -1
  115. package/dist/llm/index.d.ts.map +1 -1
  116. package/dist/llm/index.js.map +1 -1
  117. package/dist/llm/llm.cjs +16 -1
  118. package/dist/llm/llm.cjs.map +1 -1
  119. package/dist/llm/llm.d.cts +9 -0
  120. package/dist/llm/llm.d.ts +9 -0
  121. package/dist/llm/llm.d.ts.map +1 -1
  122. package/dist/llm/llm.js +16 -1
  123. package/dist/llm/llm.js.map +1 -1
  124. package/dist/llm/realtime.cjs +3 -0
  125. package/dist/llm/realtime.cjs.map +1 -1
  126. package/dist/llm/realtime.d.cts +1 -0
  127. package/dist/llm/realtime.d.ts +1 -0
  128. package/dist/llm/realtime.d.ts.map +1 -1
  129. package/dist/llm/realtime.js +3 -0
  130. package/dist/llm/realtime.js.map +1 -1
  131. package/dist/metrics/base.cjs.map +1 -1
  132. package/dist/metrics/base.d.cts +45 -1
  133. package/dist/metrics/base.d.ts +45 -1
  134. package/dist/metrics/base.d.ts.map +1 -1
  135. package/dist/metrics/index.cjs +5 -0
  136. package/dist/metrics/index.cjs.map +1 -1
  137. package/dist/metrics/index.d.cts +2 -1
  138. package/dist/metrics/index.d.ts +2 -1
  139. package/dist/metrics/index.d.ts.map +1 -1
  140. package/dist/metrics/index.js +6 -0
  141. package/dist/metrics/index.js.map +1 -1
  142. package/dist/metrics/model_usage.cjs +189 -0
  143. package/dist/metrics/model_usage.cjs.map +1 -0
  144. package/dist/metrics/model_usage.d.cts +92 -0
  145. package/dist/metrics/model_usage.d.ts +92 -0
  146. package/dist/metrics/model_usage.d.ts.map +1 -0
  147. package/dist/metrics/model_usage.js +164 -0
  148. package/dist/metrics/model_usage.js.map +1 -0
  149. package/dist/metrics/model_usage.test.cjs +474 -0
  150. package/dist/metrics/model_usage.test.cjs.map +1 -0
  151. package/dist/metrics/model_usage.test.js +476 -0
  152. package/dist/metrics/model_usage.test.js.map +1 -0
  153. package/dist/metrics/usage_collector.cjs +3 -0
  154. package/dist/metrics/usage_collector.cjs.map +1 -1
  155. package/dist/metrics/usage_collector.d.cts +9 -0
  156. package/dist/metrics/usage_collector.d.ts +9 -0
  157. package/dist/metrics/usage_collector.d.ts.map +1 -1
  158. package/dist/metrics/usage_collector.js +3 -0
  159. package/dist/metrics/usage_collector.js.map +1 -1
  160. package/dist/metrics/utils.cjs +9 -0
  161. package/dist/metrics/utils.cjs.map +1 -1
  162. package/dist/metrics/utils.d.ts.map +1 -1
  163. package/dist/metrics/utils.js +9 -0
  164. package/dist/metrics/utils.js.map +1 -1
  165. package/dist/stream/multi_input_stream.test.cjs +4 -0
  166. package/dist/stream/multi_input_stream.test.cjs.map +1 -1
  167. package/dist/stream/multi_input_stream.test.js +5 -1
  168. package/dist/stream/multi_input_stream.test.js.map +1 -1
  169. package/dist/stream/stream_channel.cjs +31 -0
  170. package/dist/stream/stream_channel.cjs.map +1 -1
  171. package/dist/stream/stream_channel.d.cts +4 -2
  172. package/dist/stream/stream_channel.d.ts +4 -2
  173. package/dist/stream/stream_channel.d.ts.map +1 -1
  174. package/dist/stream/stream_channel.js +31 -0
  175. package/dist/stream/stream_channel.js.map +1 -1
  176. package/dist/stt/stt.cjs +34 -2
  177. package/dist/stt/stt.cjs.map +1 -1
  178. package/dist/stt/stt.d.cts +22 -0
  179. package/dist/stt/stt.d.ts +22 -0
  180. package/dist/stt/stt.d.ts.map +1 -1
  181. package/dist/stt/stt.js +34 -2
  182. package/dist/stt/stt.js.map +1 -1
  183. package/dist/telemetry/otel_http_exporter.cjs +24 -5
  184. package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
  185. package/dist/telemetry/otel_http_exporter.d.cts +1 -0
  186. package/dist/telemetry/otel_http_exporter.d.ts +1 -0
  187. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
  188. package/dist/telemetry/otel_http_exporter.js +24 -5
  189. package/dist/telemetry/otel_http_exporter.js.map +1 -1
  190. package/dist/telemetry/trace_types.cjs +5 -5
  191. package/dist/telemetry/trace_types.cjs.map +1 -1
  192. package/dist/telemetry/trace_types.d.cts +9 -5
  193. package/dist/telemetry/trace_types.d.ts +9 -5
  194. package/dist/telemetry/trace_types.d.ts.map +1 -1
  195. package/dist/telemetry/trace_types.js +5 -5
  196. package/dist/telemetry/trace_types.js.map +1 -1
  197. package/dist/telemetry/traces.cjs +47 -8
  198. package/dist/telemetry/traces.cjs.map +1 -1
  199. package/dist/telemetry/traces.d.ts.map +1 -1
  200. package/dist/telemetry/traces.js +47 -8
  201. package/dist/telemetry/traces.js.map +1 -1
  202. package/dist/tts/tts.cjs +64 -2
  203. package/dist/tts/tts.cjs.map +1 -1
  204. package/dist/tts/tts.d.cts +34 -0
  205. package/dist/tts/tts.d.ts +34 -0
  206. package/dist/tts/tts.d.ts.map +1 -1
  207. package/dist/tts/tts.js +64 -2
  208. package/dist/tts/tts.js.map +1 -1
  209. package/dist/version.cjs +1 -1
  210. package/dist/version.js +1 -1
  211. package/dist/voice/agent.cjs +25 -4
  212. package/dist/voice/agent.cjs.map +1 -1
  213. package/dist/voice/agent.d.cts +10 -2
  214. package/dist/voice/agent.d.ts +10 -2
  215. package/dist/voice/agent.d.ts.map +1 -1
  216. package/dist/voice/agent.js +25 -4
  217. package/dist/voice/agent.js.map +1 -1
  218. package/dist/voice/agent_activity.cjs +261 -36
  219. package/dist/voice/agent_activity.cjs.map +1 -1
  220. package/dist/voice/agent_activity.d.cts +20 -6
  221. package/dist/voice/agent_activity.d.ts +20 -6
  222. package/dist/voice/agent_activity.d.ts.map +1 -1
  223. package/dist/voice/agent_activity.js +262 -37
  224. package/dist/voice/agent_activity.js.map +1 -1
  225. package/dist/voice/agent_session.cjs +105 -48
  226. package/dist/voice/agent_session.cjs.map +1 -1
  227. package/dist/voice/agent_session.d.cts +90 -20
  228. package/dist/voice/agent_session.d.ts +90 -20
  229. package/dist/voice/agent_session.d.ts.map +1 -1
  230. package/dist/voice/agent_session.js +105 -46
  231. package/dist/voice/agent_session.js.map +1 -1
  232. package/dist/voice/audio_recognition.cjs +287 -6
  233. package/dist/voice/audio_recognition.cjs.map +1 -1
  234. package/dist/voice/audio_recognition.d.cts +42 -3
  235. package/dist/voice/audio_recognition.d.ts +42 -3
  236. package/dist/voice/audio_recognition.d.ts.map +1 -1
  237. package/dist/voice/audio_recognition.js +289 -7
  238. package/dist/voice/audio_recognition.js.map +1 -1
  239. package/dist/voice/client_events.cjs +554 -0
  240. package/dist/voice/client_events.cjs.map +1 -0
  241. package/dist/voice/client_events.d.cts +195 -0
  242. package/dist/voice/client_events.d.ts +195 -0
  243. package/dist/voice/client_events.d.ts.map +1 -0
  244. package/dist/voice/client_events.js +548 -0
  245. package/dist/voice/client_events.js.map +1 -0
  246. package/dist/voice/events.cjs +1 -0
  247. package/dist/voice/events.cjs.map +1 -1
  248. package/dist/voice/events.d.cts +8 -5
  249. package/dist/voice/events.d.ts +8 -5
  250. package/dist/voice/events.d.ts.map +1 -1
  251. package/dist/voice/events.js +1 -0
  252. package/dist/voice/events.js.map +1 -1
  253. package/dist/voice/generation.cjs +43 -8
  254. package/dist/voice/generation.cjs.map +1 -1
  255. package/dist/voice/generation.d.cts +3 -3
  256. package/dist/voice/generation.d.ts +3 -3
  257. package/dist/voice/generation.d.ts.map +1 -1
  258. package/dist/voice/generation.js +43 -8
  259. package/dist/voice/generation.js.map +1 -1
  260. package/dist/voice/index.cjs +1 -0
  261. package/dist/voice/index.cjs.map +1 -1
  262. package/dist/voice/index.d.cts +1 -0
  263. package/dist/voice/index.d.ts +1 -0
  264. package/dist/voice/index.d.ts.map +1 -1
  265. package/dist/voice/index.js +1 -0
  266. package/dist/voice/index.js.map +1 -1
  267. package/dist/voice/report.cjs +20 -8
  268. package/dist/voice/report.cjs.map +1 -1
  269. package/dist/voice/report.d.cts +5 -0
  270. package/dist/voice/report.d.ts +5 -0
  271. package/dist/voice/report.d.ts.map +1 -1
  272. package/dist/voice/report.js +20 -8
  273. package/dist/voice/report.js.map +1 -1
  274. package/dist/voice/report.test.cjs +106 -0
  275. package/dist/voice/report.test.cjs.map +1 -0
  276. package/dist/voice/report.test.js +105 -0
  277. package/dist/voice/report.test.js.map +1 -0
  278. package/dist/voice/room_io/room_io.cjs +5 -39
  279. package/dist/voice/room_io/room_io.cjs.map +1 -1
  280. package/dist/voice/room_io/room_io.d.cts +4 -9
  281. package/dist/voice/room_io/room_io.d.ts +4 -9
  282. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  283. package/dist/voice/room_io/room_io.js +5 -40
  284. package/dist/voice/room_io/room_io.js.map +1 -1
  285. package/dist/voice/turn_config/endpointing.cjs +33 -0
  286. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  287. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  288. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  289. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  290. package/dist/voice/turn_config/endpointing.js +9 -0
  291. package/dist/voice/turn_config/endpointing.js.map +1 -0
  292. package/dist/voice/turn_config/interruption.cjs +37 -0
  293. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  294. package/dist/voice/turn_config/interruption.d.cts +53 -0
  295. package/dist/voice/turn_config/interruption.d.ts +53 -0
  296. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  297. package/dist/voice/turn_config/interruption.js +13 -0
  298. package/dist/voice/turn_config/interruption.js.map +1 -0
  299. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  300. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  301. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  302. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  303. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  304. package/dist/voice/turn_config/turn_handling.js +11 -0
  305. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  306. package/dist/voice/turn_config/utils.cjs +97 -0
  307. package/dist/voice/turn_config/utils.cjs.map +1 -0
  308. package/dist/voice/turn_config/utils.d.cts +25 -0
  309. package/dist/voice/turn_config/utils.d.ts +25 -0
  310. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  311. package/dist/voice/turn_config/utils.js +73 -0
  312. package/dist/voice/turn_config/utils.js.map +1 -0
  313. package/dist/voice/turn_config/utils.test.cjs +86 -0
  314. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  315. package/dist/voice/turn_config/utils.test.js +85 -0
  316. package/dist/voice/turn_config/utils.test.js.map +1 -0
  317. package/dist/voice/wire_format.cjs +798 -0
  318. package/dist/voice/wire_format.cjs.map +1 -0
  319. package/dist/voice/wire_format.d.cts +5503 -0
  320. package/dist/voice/wire_format.d.ts +5503 -0
  321. package/dist/voice/wire_format.d.ts.map +1 -0
  322. package/dist/voice/wire_format.js +728 -0
  323. package/dist/voice/wire_format.js.map +1 -0
  324. package/package.json +2 -1
  325. package/src/constants.ts +13 -0
  326. package/src/inference/interruption/defaults.ts +51 -0
  327. package/src/inference/interruption/errors.ts +25 -0
  328. package/src/inference/interruption/http_transport.ts +187 -0
  329. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  330. package/src/inference/interruption/interruption_detector.ts +188 -0
  331. package/src/inference/interruption/interruption_stream.ts +467 -0
  332. package/src/inference/interruption/types.ts +84 -0
  333. package/src/inference/interruption/utils.test.ts +132 -0
  334. package/src/inference/interruption/utils.ts +137 -0
  335. package/src/inference/interruption/ws_transport.ts +402 -0
  336. package/src/inference/llm.ts +9 -12
  337. package/src/inference/stt.ts +10 -3
  338. package/src/inference/tts.ts +10 -3
  339. package/src/inference/utils.ts +29 -1
  340. package/src/llm/chat_context.ts +40 -2
  341. package/src/llm/index.ts +1 -0
  342. package/src/llm/llm.ts +16 -0
  343. package/src/llm/realtime.ts +4 -0
  344. package/src/metrics/base.ts +48 -1
  345. package/src/metrics/index.ts +11 -0
  346. package/src/metrics/model_usage.test.ts +545 -0
  347. package/src/metrics/model_usage.ts +262 -0
  348. package/src/metrics/usage_collector.ts +11 -0
  349. package/src/metrics/utils.ts +11 -0
  350. package/src/stream/multi_input_stream.test.ts +6 -1
  351. package/src/stream/stream_channel.ts +34 -2
  352. package/src/stt/stt.ts +38 -0
  353. package/src/telemetry/otel_http_exporter.ts +28 -5
  354. package/src/telemetry/trace_types.ts +11 -8
  355. package/src/telemetry/traces.ts +111 -54
  356. package/src/tts/tts.ts +69 -1
  357. package/src/voice/agent.ts +30 -3
  358. package/src/voice/agent_activity.ts +327 -28
  359. package/src/voice/agent_session.ts +207 -59
  360. package/src/voice/audio_recognition.ts +385 -9
  361. package/src/voice/client_events.ts +838 -0
  362. package/src/voice/events.ts +14 -4
  363. package/src/voice/generation.ts +52 -9
  364. package/src/voice/index.ts +1 -0
  365. package/src/voice/report.test.ts +117 -0
  366. package/src/voice/report.ts +29 -6
  367. package/src/voice/room_io/room_io.ts +7 -61
  368. package/src/voice/turn_config/endpointing.ts +33 -0
  369. package/src/voice/turn_config/interruption.ts +56 -0
  370. package/src/voice/turn_config/turn_handling.ts +45 -0
  371. package/src/voice/turn_config/utils.test.ts +100 -0
  372. package/src/voice/turn_config/utils.ts +103 -0
  373. package/src/voice/wire_format.ts +827 -0
@@ -7,8 +7,11 @@ import type { Span } from '@opentelemetry/api';
7
7
  import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
8
8
  import { Heap } from 'heap-js';
9
9
  import { AsyncLocalStorage } from 'node:async_hooks';
10
- import { ReadableStream } from 'node:stream/web';
11
- import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
+ import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
12
+ import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
13
+ import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
14
+ import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
12
15
  import {
13
16
  type ChatItem,
14
17
  type FunctionCall,
@@ -30,6 +33,7 @@ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
30
33
  import { log } from '../log.js';
31
34
  import type {
32
35
  EOUMetrics,
36
+ InterruptionMetrics,
33
37
  LLMMetrics,
34
38
  RealtimeModelMetrics,
35
39
  STTMetrics,
@@ -57,7 +61,6 @@ import {
57
61
  type EndOfTurnInfo,
58
62
  type PreemptiveGenerationInfo,
59
63
  type RecognitionHooks,
60
- type _TurnDetector,
61
64
  } from './audio_recognition.js';
62
65
  import {
63
66
  AgentSessionEventTypes,
@@ -101,6 +104,7 @@ interface PreemptiveGeneration {
101
104
  createdAt: number;
102
105
  }
103
106
 
107
+ // TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
104
108
  export class AgentActivity implements RecognitionHooks {
105
109
  agent: Agent;
106
110
  agentSession: AgentSession;
@@ -111,7 +115,7 @@ export class AgentActivity implements RecognitionHooks {
111
115
  private audioRecognition?: AudioRecognition;
112
116
  private realtimeSession?: RealtimeSession;
113
117
  private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
114
- private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
118
+ private turnDetectionMode?: TurnDetectionMode;
115
119
  private logger = log();
116
120
  private _schedulingPaused = true;
117
121
  private _drainBlockedTasks: Task<any>[] = [];
@@ -126,6 +130,43 @@ export class AgentActivity implements RecognitionHooks {
126
130
  // default to null as None, which maps to the default provider tool choice value
127
131
  private toolChoice: ToolChoice | null = null;
128
132
  private _preemptiveGeneration?: PreemptiveGeneration;
133
+ private interruptionDetector?: AdaptiveInterruptionDetector;
134
+ private isInterruptionDetectionEnabled: boolean;
135
+ private isInterruptionByAudioActivityEnabled: boolean;
136
+ private isDefaultInterruptionByAudioActivityEnabled: boolean;
137
+
138
+ private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
139
+ this.onGenerationCreated(ev);
140
+
141
+ private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
142
+ this.onInputSpeechStarted(ev);
143
+
144
+ private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
145
+ this.onInputSpeechStopped(ev);
146
+
147
+ private readonly onRealtimeInputAudioTranscriptionCompleted = (
148
+ ev: InputTranscriptionCompleted,
149
+ ): void => this.onInputAudioTranscriptionCompleted(ev);
150
+
151
+ private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
152
+ this.onError(ev);
153
+
154
+ private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
155
+ this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
156
+ };
157
+
158
+ private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
159
+ this.agentSession.emit(
160
+ AgentSessionEventTypes.MetricsCollected,
161
+ createMetricsCollectedEvent({ metrics: ev }),
162
+ );
163
+ };
164
+
165
+ private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
166
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
167
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
168
+ this.agentSession._onError(ev);
169
+ };
129
170
 
130
171
  /** @internal */
131
172
  _mainTask?: Task<void>;
@@ -133,16 +174,6 @@ export class AgentActivity implements RecognitionHooks {
133
174
  _onExitTask?: Task<void>;
134
175
  _userTurnCompletedTask?: Task<void>;
135
176
 
136
- private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
137
- this.onGenerationCreated(ev);
138
- private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
139
- this.onInputSpeechStarted(ev);
140
- private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
141
- this.onInputSpeechStopped(ev);
142
- private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
143
- this.onInputAudioTranscriptionCompleted(ev);
144
- private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
145
- this.onError(ev);
146
177
  constructor(agent: Agent, agentSession: AgentSession) {
147
178
  this.agent = agent;
148
179
  this.agentSession = agentSession;
@@ -235,6 +266,16 @@ export class AgentActivity implements RecognitionHooks {
235
266
  'for more responsive interruption handling.',
236
267
  );
237
268
  }
269
+
270
+ this.interruptionDetector = this.resolveInterruptionDetector();
271
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
272
+
273
+ // this allows taking over audio interruption temporarily until interruption is detected
274
+ // by default it is true unless turnDetection is manual or realtime_llm
275
+ this.isInterruptionByAudioActivityEnabled =
276
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
277
+
278
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
238
279
  }
239
280
 
240
281
  async start(): Promise<void> {
@@ -348,8 +389,9 @@ export class AgentActivity implements RecognitionHooks {
348
389
  vad: this.vad,
349
390
  turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
350
391
  turnDetectionMode: this.turnDetectionMode,
351
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
352
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
392
+ interruptionDetection: this.interruptionDetector,
393
+ minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
394
+ maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
353
395
  rootSpanContext: this.agentSession.rootSpanContext,
354
396
  sttModel: this.stt?.label,
355
397
  sttProvider: this.getSttProvider(),
@@ -423,7 +465,7 @@ export class AgentActivity implements RecognitionHooks {
423
465
 
424
466
  get allowInterruptions(): boolean {
425
467
  // TODO(AJS-51): Allow options to be defined in Agent class
426
- return this.agentSession.options.allowInterruptions;
468
+ return this.agentSession.options.turnHandling.interruption?.mode !== false;
427
469
  }
428
470
 
429
471
  get useTtsAlignedTranscript(): boolean {
@@ -440,6 +482,11 @@ export class AgentActivity implements RecognitionHooks {
440
482
  return this.agent.toolCtx;
441
483
  }
442
484
 
485
+ /** @internal */
486
+ get inputStartedAt() {
487
+ return this.audioRecognition?.inputStartedAt;
488
+ }
489
+
443
490
  async updateChatCtx(chatCtx: ChatContext): Promise<void> {
444
491
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
445
492
 
@@ -471,7 +518,13 @@ export class AgentActivity implements RecognitionHooks {
471
518
  }
472
519
  }
473
520
 
474
- updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
521
+ updateOptions({
522
+ toolChoice,
523
+ turnDetection,
524
+ }: {
525
+ toolChoice?: ToolChoice | null;
526
+ turnDetection?: TurnDetectionMode;
527
+ }): void {
475
528
  if (toolChoice !== undefined) {
476
529
  this.toolChoice = toolChoice;
477
530
  }
@@ -479,14 +532,46 @@ export class AgentActivity implements RecognitionHooks {
479
532
  if (this.realtimeSession) {
480
533
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
481
534
  }
535
+
536
+ if (turnDetection !== undefined) {
537
+ this.turnDetectionMode = turnDetection;
538
+ this.isDefaultInterruptionByAudioActivityEnabled =
539
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
540
+
541
+ // sync live flag immediately when not speaking so the change takes effect right away
542
+ if (this.agentSession.agentState !== 'speaking') {
543
+ this.isInterruptionByAudioActivityEnabled =
544
+ this.isDefaultInterruptionByAudioActivityEnabled;
545
+ }
546
+ }
547
+
548
+ if (this.audioRecognition) {
549
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
550
+ }
482
551
  }
483
552
 
484
553
  attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
485
554
  void this.audioStream.close();
486
555
  this.audioStream = new MultiInputStream<AudioFrame>();
487
556
 
557
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
558
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
559
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
560
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
561
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
562
+ transform: (frame, controller) => {
563
+ const shouldDiscardForAecWarmup =
564
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
565
+ if (!shouldDiscardForAecWarmup) {
566
+ controller.enqueue(frame);
567
+ }
568
+ },
569
+ });
570
+
488
571
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
489
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
572
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
573
+ .pipeThrough(aecWarmupAudioFilter)
574
+ .tee();
490
575
 
491
576
  if (this.realtimeSession) {
492
577
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -639,6 +724,13 @@ export class AgentActivity implements RecognitionHooks {
639
724
 
640
725
  if (!this.vad) {
641
726
  this.agentSession._updateUserState('speaking');
727
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
728
+ this.audioRecognition.onStartOfOverlapSpeech(
729
+ 0,
730
+ Date.now(),
731
+ this.agentSession._userSpeakingSpan,
732
+ );
733
+ }
642
734
  }
643
735
 
644
736
  // this.interrupt() is going to raise when allow_interruptions is False,
@@ -657,6 +749,9 @@ export class AgentActivity implements RecognitionHooks {
657
749
  this.logger.info(ev, 'onInputSpeechStopped');
658
750
 
659
751
  if (!this.vad) {
752
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
753
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
754
+ }
660
755
  this.agentSession._updateUserState('listening');
661
756
  }
662
757
 
@@ -730,15 +825,32 @@ export class AgentActivity implements RecognitionHooks {
730
825
  onStartOfSpeech(ev: VADEvent): void {
731
826
  let speechStartTime = Date.now();
732
827
  if (ev) {
733
- speechStartTime = speechStartTime - ev.speechDuration;
828
+ // Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
829
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
734
830
  }
735
831
  this.agentSession._updateUserState('speaking', speechStartTime);
832
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
833
+ // Pass speechStartTime as the absolute startedAt timestamp.
834
+ this.audioRecognition.onStartOfOverlapSpeech(
835
+ ev.speechDuration,
836
+ speechStartTime,
837
+ this.agentSession._userSpeakingSpan,
838
+ );
839
+ }
736
840
  }
737
841
 
738
842
  onEndOfSpeech(ev: VADEvent): void {
739
843
  let speechEndTime = Date.now();
740
844
  if (ev) {
741
- speechEndTime = speechEndTime - ev.silenceDuration;
845
+ // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
846
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
847
+ }
848
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
849
+ // Pass speechEndTime as the absolute endedAt timestamp.
850
+ this.audioRecognition.onEndOfOverlapSpeech(
851
+ speechEndTime,
852
+ this.agentSession._userSpeakingSpan,
853
+ );
742
854
  }
743
855
  this.agentSession._updateUserState('listening', speechEndTime);
744
856
  }
@@ -749,12 +861,21 @@ export class AgentActivity implements RecognitionHooks {
749
861
  return;
750
862
  }
751
863
 
752
- if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
864
+ if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) {
753
865
  this.interruptByAudioActivity();
754
866
  }
755
867
  }
756
868
 
757
869
  private interruptByAudioActivity(): void {
870
+ if (!this.isInterruptionByAudioActivityEnabled) {
871
+ return;
872
+ }
873
+
874
+ if (this.agentSession._aecWarmupRemaining > 0) {
875
+ // Disable interruption from audio activity while AEC warmup is active.
876
+ return;
877
+ }
878
+
758
879
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
759
880
  // skip speech handle interruption if server side turn detection is enabled
760
881
  return;
@@ -764,7 +885,11 @@ export class AgentActivity implements RecognitionHooks {
764
885
  // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
765
886
  // - Apply check to all STT results: empty string, undefined, or any length
766
887
  // - This ensures consistent behavior across all interruption scenarios
767
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
888
+ if (
889
+ this.stt &&
890
+ this.agentSession.options.turnHandling.interruption?.minWords > 0 &&
891
+ this.audioRecognition
892
+ ) {
768
893
  const text = this.audioRecognition.currentTranscript;
769
894
  // TODO(shubhra): better word splitting for multi-language
770
895
 
@@ -774,7 +899,7 @@ export class AgentActivity implements RecognitionHooks {
774
899
 
775
900
  // Only allow interruption if word count meets or exceeds minInterruptionWords
776
901
  // This applies to all cases: empty strings, partial speech, and full speech
777
- if (wordCount < this.agentSession.options.minInterruptionWords) {
902
+ if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
778
903
  return;
779
904
  }
780
905
  }
@@ -795,6 +920,14 @@ export class AgentActivity implements RecognitionHooks {
795
920
  }
796
921
  }
797
922
 
923
+ onInterruption(ev: OverlappingSpeechEvent) {
924
+ this.restoreInterruptionByAudioActivity();
925
+ this.interruptByAudioActivity();
926
+ if (this.audioRecognition) {
927
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
928
+ }
929
+ }
930
+
798
931
  onInterimTranscript(ev: SpeechEvent): void {
799
932
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
800
933
  // skip stt transcription if userTranscription is enabled on the realtime model
@@ -870,6 +1003,7 @@ export class AgentActivity implements RecognitionHooks {
870
1003
  const userMessage = ChatMessage.create({
871
1004
  role: 'user',
872
1005
  content: info.newTranscript,
1006
+ transcriptConfidence: info.transcriptConfidence,
873
1007
  });
874
1008
  const chatCtx = this.agent.chatCtx.copy();
875
1009
  const speechHandle = this.generateReply({
@@ -965,16 +1099,16 @@ export class AgentActivity implements RecognitionHooks {
965
1099
  this._currentSpeech &&
966
1100
  this._currentSpeech.allowInterruptions &&
967
1101
  !this._currentSpeech.interrupted &&
968
- this.agentSession.options.minInterruptionWords > 0
1102
+ this.agentSession.options.turnHandling.interruption?.minWords > 0
969
1103
  ) {
970
1104
  const wordCount = splitWords(info.newTranscript, true).length;
971
- if (wordCount < this.agentSession.options.minInterruptionWords) {
1105
+ if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
972
1106
  // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
973
1107
  this.cancelPreemptiveGeneration();
974
1108
  this.logger.info(
975
1109
  {
976
1110
  wordCount,
977
- minInterruptionWords: this.agentSession.options.minInterruptionWords,
1111
+ minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords,
978
1112
  },
979
1113
  'skipping user input, word count below minimum interruption threshold',
980
1114
  );
@@ -1272,6 +1406,7 @@ export class AgentActivity implements RecognitionHooks {
1272
1406
  let userMessage: ChatMessage | undefined = ChatMessage.create({
1273
1407
  role: 'user',
1274
1408
  content: info.newTranscript,
1409
+ transcriptConfidence: info.transcriptConfidence,
1275
1410
  });
1276
1411
 
1277
1412
  // create a temporary mutable chat context to pass to onUserTurnCompleted
@@ -1298,6 +1433,24 @@ export class AgentActivity implements RecognitionHooks {
1298
1433
  return;
1299
1434
  }
1300
1435
 
1436
+ const userMetricsReport: MetricsReport = {};
1437
+ if (info.startedSpeakingAt !== undefined) {
1438
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
1439
+ }
1440
+ if (info.stoppedSpeakingAt !== undefined) {
1441
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
1442
+ }
1443
+ if (info.transcriptionDelay !== undefined) {
1444
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
1445
+ }
1446
+ if (info.endOfUtteranceDelay !== undefined) {
1447
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
1448
+ }
1449
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
1450
+ if (userMessage) {
1451
+ userMessage.metrics = userMetricsReport;
1452
+ }
1453
+
1301
1454
  let speechHandle: SpeechHandle | undefined;
1302
1455
  if (this._preemptiveGeneration !== undefined) {
1303
1456
  const preemptive = this._preemptiveGeneration;
@@ -1310,6 +1463,14 @@ export class AgentActivity implements RecognitionHooks {
1310
1463
  isSameToolChoice(preemptive.toolChoice, this.toolChoice)
1311
1464
  ) {
1312
1465
  speechHandle = preemptive.speechHandle;
1466
+ // The preemptive userMessage was created without metrics.
1467
+ // Copy the metrics and transcriptConfidence from the new userMessage
1468
+ // to the preemptive message BEFORE scheduling (so the pipeline inserts
1469
+ // the message with metrics already set).
1470
+ if (preemptive.userMessage && userMessage) {
1471
+ preemptive.userMessage.metrics = userMetricsReport;
1472
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1473
+ }
1313
1474
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1314
1475
  this.logger.debug(
1315
1476
  {
@@ -1403,11 +1564,19 @@ export class AgentActivity implements RecognitionHooks {
1403
1564
  tasks.push(textForwardTask);
1404
1565
  }
1405
1566
 
1567
+ let replyStartedSpeakingAt: number | undefined;
1568
+ let replyTtsGenData: _TTSGenerationData | null = null;
1569
+
1406
1570
  const onFirstFrame = (startedSpeakingAt?: number) => {
1571
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1407
1572
  this.agentSession._updateAgentState('speaking', {
1408
1573
  startTime: startedSpeakingAt,
1409
1574
  otelContext: speechHandle._agentTurnContext,
1410
1575
  });
1576
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1577
+ this.audioRecognition.onStartOfAgentSpeech();
1578
+ this.isInterruptionByAudioActivityEnabled = false;
1579
+ }
1411
1580
  };
1412
1581
 
1413
1582
  if (!audioOutput) {
@@ -1425,8 +1594,11 @@ export class AgentActivity implements RecognitionHooks {
1425
1594
  audioSource,
1426
1595
  modelSettings,
1427
1596
  replyAbortController,
1597
+ this.tts?.model,
1598
+ this.tts?.provider,
1428
1599
  );
1429
1600
  tasks.push(ttsTask);
1601
+ replyTtsGenData = ttsGenData;
1430
1602
 
1431
1603
  const [forwardTask, _audioOut] = performAudioForwarding(
1432
1604
  ttsGenData.audioStream,
@@ -1466,10 +1638,21 @@ export class AgentActivity implements RecognitionHooks {
1466
1638
  }
1467
1639
 
1468
1640
  if (addToChatCtx) {
1641
+ const replyStoppedSpeakingAt = Date.now();
1642
+ const replyAssistantMetrics: MetricsReport = {};
1643
+ if (replyTtsGenData?.ttfb !== undefined) {
1644
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1645
+ }
1646
+ if (replyStartedSpeakingAt !== undefined) {
1647
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
1648
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
1649
+ }
1650
+
1469
1651
  const message = ChatMessage.create({
1470
1652
  role: 'assistant',
1471
1653
  content: textOut?.text || '',
1472
1654
  interrupted: speechHandle.interrupted,
1655
+ metrics: replyAssistantMetrics,
1473
1656
  });
1474
1657
  this.agent._chatCtx.insert(message);
1475
1658
  this.agentSession._conversationItemAdded(message);
@@ -1477,6 +1660,10 @@ export class AgentActivity implements RecognitionHooks {
1477
1660
 
1478
1661
  if (this.agentSession.agentState === 'speaking') {
1479
1662
  this.agentSession._updateAgentState('listening');
1663
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1664
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1665
+ }
1666
+ this.restoreInterruptionByAudioActivity();
1480
1667
  }
1481
1668
  }
1482
1669
 
@@ -1490,6 +1677,7 @@ export class AgentActivity implements RecognitionHooks {
1490
1677
  newMessage,
1491
1678
  toolsMessages,
1492
1679
  span,
1680
+ _previousUserMetrics,
1493
1681
  }: {
1494
1682
  speechHandle: SpeechHandle;
1495
1683
  chatCtx: ChatContext;
@@ -1500,6 +1688,7 @@ export class AgentActivity implements RecognitionHooks {
1500
1688
  newMessage?: ChatMessage;
1501
1689
  toolsMessages?: ChatItem[];
1502
1690
  span: Span;
1691
+ _previousUserMetrics?: MetricsReport;
1503
1692
  }): Promise<void> => {
1504
1693
  speechHandle._agentTurnContext = otelContext.active();
1505
1694
 
@@ -1552,6 +1741,8 @@ export class AgentActivity implements RecognitionHooks {
1552
1741
  toolCtx,
1553
1742
  modelSettings,
1554
1743
  replyAbortController,
1744
+ this.llm?.model,
1745
+ this.llm?.provider,
1555
1746
  );
1556
1747
  tasks.push(llmTask);
1557
1748
 
@@ -1568,6 +1759,8 @@ export class AgentActivity implements RecognitionHooks {
1568
1759
  ttsTextInput,
1569
1760
  modelSettings,
1570
1761
  replyAbortController,
1762
+ this.tts?.model,
1763
+ this.tts?.provider,
1571
1764
  );
1572
1765
  tasks.push(ttsTask);
1573
1766
  } else {
@@ -1577,10 +1770,12 @@ export class AgentActivity implements RecognitionHooks {
1577
1770
 
1578
1771
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1579
1772
 
1773
+ let userMetrics: MetricsReport | undefined = _previousUserMetrics;
1580
1774
  // Add new message to actual chat context if the speech is scheduled
1581
1775
  if (newMessage && speechHandle.scheduled) {
1582
1776
  this.agent._chatCtx.insert(newMessage);
1583
1777
  this.agentSession._conversationItemAdded(newMessage);
1778
+ userMetrics = newMessage.metrics;
1584
1779
  }
1585
1780
 
1586
1781
  if (speechHandle.interrupted) {
@@ -1626,11 +1821,17 @@ export class AgentActivity implements RecognitionHooks {
1626
1821
  textOut = _textOut;
1627
1822
  }
1628
1823
 
1824
+ let agentStartedSpeakingAt: number | undefined;
1629
1825
  const onFirstFrame = (startedSpeakingAt?: number) => {
1826
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1630
1827
  this.agentSession._updateAgentState('speaking', {
1631
1828
  startTime: startedSpeakingAt,
1632
1829
  otelContext: speechHandle._agentTurnContext,
1633
1830
  });
1831
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1832
+ this.audioRecognition.onStartOfAgentSpeech();
1833
+ this.isInterruptionByAudioActivityEnabled = false;
1834
+ }
1634
1835
  };
1635
1836
 
1636
1837
  let audioOut: _AudioOut | null = null;
@@ -1687,6 +1888,29 @@ export class AgentActivity implements RecognitionHooks {
1687
1888
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1688
1889
  }
1689
1890
 
1891
+ const agentStoppedSpeakingAt = Date.now();
1892
+ const assistantMetrics: MetricsReport = {};
1893
+
1894
+ if (llmGenData.ttft !== undefined) {
1895
+ assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
1896
+ }
1897
+ if (ttsGenData?.ttfb !== undefined) {
1898
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
1899
+ }
1900
+ if (agentStartedSpeakingAt !== undefined) {
1901
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
1902
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
1903
+
1904
+ if (userMetrics?.stoppedSpeakingAt !== undefined) {
1905
+ const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
1906
+ assistantMetrics.e2eLatency = e2eLatency;
1907
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1908
+ }
1909
+ }
1910
+
1911
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1912
+ let hasSpeechMessage = false;
1913
+
1690
1914
  // add the tools messages that triggers this reply to the chat context
1691
1915
  if (toolsMessages) {
1692
1916
  for (const msg of toolsMessages) {
@@ -1741,45 +1965,54 @@ export class AgentActivity implements RecognitionHooks {
1741
1965
  }
1742
1966
 
1743
1967
  if (forwardedText) {
1968
+ hasSpeechMessage = true;
1744
1969
  const message = ChatMessage.create({
1745
1970
  role: 'assistant',
1746
1971
  content: forwardedText,
1747
1972
  id: llmGenData.id,
1748
1973
  interrupted: true,
1749
1974
  createdAt: replyStartedAt,
1975
+ metrics: assistantMetrics,
1750
1976
  });
1751
1977
  chatCtx.insert(message);
1752
1978
  this.agent._chatCtx.insert(message);
1753
1979
  speechHandle._itemAdded([message]);
1754
1980
  this.agentSession._conversationItemAdded(message);
1981
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1755
1982
  }
1756
1983
 
1757
1984
  if (this.agentSession.agentState === 'speaking') {
1758
1985
  this.agentSession._updateAgentState('listening');
1986
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1987
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1988
+ this.restoreInterruptionByAudioActivity();
1989
+ }
1759
1990
  }
1760
1991
 
1761
1992
  this.logger.info(
1762
1993
  { speech_id: speechHandle.id, message: forwardedText },
1763
1994
  'playout completed with interrupt',
1764
1995
  );
1765
- // TODO(shubhra) add chat message to speech handle
1766
1996
  speechHandle._markGenerationDone();
1767
1997
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1768
1998
  return;
1769
1999
  }
1770
2000
 
1771
2001
  if (textOut && textOut.text) {
2002
+ hasSpeechMessage = true;
1772
2003
  const message = ChatMessage.create({
1773
2004
  role: 'assistant',
1774
2005
  id: llmGenData.id,
1775
2006
  interrupted: false,
1776
2007
  createdAt: replyStartedAt,
1777
2008
  content: textOut.text,
2009
+ metrics: assistantMetrics,
1778
2010
  });
1779
2011
  chatCtx.insert(message);
1780
2012
  this.agent._chatCtx.insert(message);
1781
2013
  speechHandle._itemAdded([message]);
1782
2014
  this.agentSession._conversationItemAdded(message);
2015
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1783
2016
  this.logger.info(
1784
2017
  { speech_id: speechHandle.id, message: textOut.text },
1785
2018
  'playout completed without interruption',
@@ -1790,6 +2023,12 @@ export class AgentActivity implements RecognitionHooks {
1790
2023
  this.agentSession._updateAgentState('thinking');
1791
2024
  } else if (this.agentSession.agentState === 'speaking') {
1792
2025
  this.agentSession._updateAgentState('listening');
2026
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
2027
+ {
2028
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
2029
+ this.restoreInterruptionByAudioActivity();
2030
+ }
2031
+ }
1793
2032
  }
1794
2033
 
1795
2034
  // mark the playout done before waiting for the tool execution
@@ -1849,6 +2088,7 @@ export class AgentActivity implements RecognitionHooks {
1849
2088
  instructions,
1850
2089
  undefined,
1851
2090
  toolMessages,
2091
+ hasSpeechMessage ? undefined : userMetrics,
1852
2092
  ),
1853
2093
  ownedSpeechHandle: speechHandle,
1854
2094
  name: 'AgentActivity.pipelineReply',
@@ -1882,6 +2122,7 @@ export class AgentActivity implements RecognitionHooks {
1882
2122
  instructions?: string,
1883
2123
  newMessage?: ChatMessage,
1884
2124
  toolsMessages?: ChatItem[],
2125
+ _previousUserMetrics?: MetricsReport,
1885
2126
  ): Promise<void> =>
1886
2127
  tracer.startActiveSpan(
1887
2128
  async (span) =>
@@ -1895,6 +2136,7 @@ export class AgentActivity implements RecognitionHooks {
1895
2136
  newMessage,
1896
2137
  toolsMessages,
1897
2138
  span,
2139
+ _previousUserMetrics,
1898
2140
  }),
1899
2141
  {
1900
2142
  name: 'agent_turn',
@@ -2045,6 +2287,8 @@ export class AgentActivity implements RecognitionHooks {
2045
2287
  ttsTextInput,
2046
2288
  modelSettings,
2047
2289
  abortController,
2290
+ this.tts?.model,
2291
+ this.tts?.provider,
2048
2292
  );
2049
2293
  tasks.push(ttsTask);
2050
2294
  realtimeAudioResult = ttsGenData.audioStream;
@@ -2554,6 +2798,14 @@ export class AgentActivity implements RecognitionHooks {
2554
2798
  if (this._mainTask) {
2555
2799
  await this._mainTask.cancelAndWait();
2556
2800
  }
2801
+ if (this.interruptionDetector) {
2802
+ this.interruptionDetector.off(
2803
+ 'user_overlapping_speech',
2804
+ this.onInterruptionOverlappingSpeech,
2805
+ );
2806
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2807
+ this.interruptionDetector.off('error', this.onInterruptionError);
2808
+ }
2557
2809
 
2558
2810
  this.agent._agentActivity = undefined;
2559
2811
  } finally {
@@ -2561,6 +2813,53 @@ export class AgentActivity implements RecognitionHooks {
2561
2813
  }
2562
2814
  }
2563
2815
 
2816
+ private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
2817
+ const interruptionDetection =
2818
+ this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2819
+ if (
2820
+ !(
2821
+ this.stt &&
2822
+ this.stt.capabilities.alignedTranscript &&
2823
+ this.stt.capabilities.streaming &&
2824
+ this.vad &&
2825
+ this.turnDetection !== 'manual' &&
2826
+ this.turnDetection !== 'realtime_llm' &&
2827
+ !(this.llm instanceof RealtimeModel)
2828
+ )
2829
+ ) {
2830
+ if (interruptionDetection === 'adaptive') {
2831
+ this.logger.warn(
2832
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
2833
+ );
2834
+ return undefined;
2835
+ }
2836
+ }
2837
+
2838
+ if (
2839
+ (interruptionDetection !== undefined && interruptionDetection === false) ||
2840
+ interruptionDetection === 'vad'
2841
+ ) {
2842
+ return undefined;
2843
+ }
2844
+
2845
+ try {
2846
+ const detector = new AdaptiveInterruptionDetector();
2847
+
2848
+ detector.on('user_overlapping_speech', this.onInterruptionOverlappingSpeech);
2849
+ detector.on('metrics_collected', this.onInterruptionMetricsCollected);
2850
+ detector.on('error', this.onInterruptionError);
2851
+
2852
+ return detector;
2853
+ } catch (error: unknown) {
2854
+ this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
2855
+ }
2856
+ return undefined;
2857
+ }
2858
+
2859
+ private restoreInterruptionByAudioActivity(): void {
2860
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2861
+ }
2862
+
2564
2863
  private async _closeSessionResources(): Promise<void> {
2565
2864
  // Unregister event handlers to prevent duplicate metrics
2566
2865
  if (this.llm instanceof LLM) {