@livekit/agents 1.0.48 → 1.1.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373):
  1. package/dist/constants.cjs +27 -0
  2. package/dist/constants.cjs.map +1 -1
  3. package/dist/constants.d.cts +9 -0
  4. package/dist/constants.d.ts +9 -0
  5. package/dist/constants.d.ts.map +1 -1
  6. package/dist/constants.js +18 -0
  7. package/dist/constants.js.map +1 -1
  8. package/dist/inference/api_protos.d.cts +71 -71
  9. package/dist/inference/api_protos.d.ts +71 -71
  10. package/dist/inference/interruption/defaults.cjs +81 -0
  11. package/dist/inference/interruption/defaults.cjs.map +1 -0
  12. package/dist/inference/interruption/defaults.d.cts +19 -0
  13. package/dist/inference/interruption/defaults.d.ts +19 -0
  14. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  15. package/dist/inference/interruption/defaults.js +46 -0
  16. package/dist/inference/interruption/defaults.js.map +1 -0
  17. package/dist/inference/interruption/errors.cjs +44 -0
  18. package/dist/inference/interruption/errors.cjs.map +1 -0
  19. package/dist/inference/interruption/errors.d.cts +12 -0
  20. package/dist/inference/interruption/errors.d.ts +12 -0
  21. package/dist/inference/interruption/errors.d.ts.map +1 -0
  22. package/dist/inference/interruption/errors.js +20 -0
  23. package/dist/inference/interruption/errors.js.map +1 -0
  24. package/dist/inference/interruption/http_transport.cjs +147 -0
  25. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  26. package/dist/inference/interruption/http_transport.d.cts +63 -0
  27. package/dist/inference/interruption/http_transport.d.ts +63 -0
  28. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  29. package/dist/inference/interruption/http_transport.js +121 -0
  30. package/dist/inference/interruption/http_transport.js.map +1 -0
  31. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  32. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  33. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  34. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  35. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  36. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  37. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  38. package/dist/inference/interruption/interruption_detector.cjs +181 -0
  39. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  40. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  41. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  42. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  43. package/dist/inference/interruption/interruption_detector.js +147 -0
  44. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  45. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  46. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  47. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  48. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  49. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  50. package/dist/inference/interruption/interruption_stream.js +344 -0
  51. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  52. package/dist/inference/interruption/types.cjs +17 -0
  53. package/dist/inference/interruption/types.cjs.map +1 -0
  54. package/dist/inference/interruption/types.d.cts +66 -0
  55. package/dist/inference/interruption/types.d.ts +66 -0
  56. package/dist/inference/interruption/types.d.ts.map +1 -0
  57. package/dist/inference/interruption/types.js +1 -0
  58. package/dist/inference/interruption/types.js.map +1 -0
  59. package/dist/inference/interruption/utils.cjs +130 -0
  60. package/dist/inference/interruption/utils.cjs.map +1 -0
  61. package/dist/inference/interruption/utils.d.cts +41 -0
  62. package/dist/inference/interruption/utils.d.ts +41 -0
  63. package/dist/inference/interruption/utils.d.ts.map +1 -0
  64. package/dist/inference/interruption/utils.js +105 -0
  65. package/dist/inference/interruption/utils.js.map +1 -0
  66. package/dist/inference/interruption/utils.test.cjs +105 -0
  67. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  68. package/dist/inference/interruption/utils.test.js +104 -0
  69. package/dist/inference/interruption/utils.test.js.map +1 -0
  70. package/dist/inference/interruption/ws_transport.cjs +329 -0
  71. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  72. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  73. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  74. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  75. package/dist/inference/interruption/ws_transport.js +295 -0
  76. package/dist/inference/interruption/ws_transport.js.map +1 -0
  77. package/dist/inference/llm.cjs +14 -10
  78. package/dist/inference/llm.cjs.map +1 -1
  79. package/dist/inference/llm.d.cts +2 -1
  80. package/dist/inference/llm.d.ts +2 -1
  81. package/dist/inference/llm.d.ts.map +1 -1
  82. package/dist/inference/llm.js +8 -10
  83. package/dist/inference/llm.js.map +1 -1
  84. package/dist/inference/stt.cjs +7 -2
  85. package/dist/inference/stt.cjs.map +1 -1
  86. package/dist/inference/stt.d.cts +2 -0
  87. package/dist/inference/stt.d.ts +2 -0
  88. package/dist/inference/stt.d.ts.map +1 -1
  89. package/dist/inference/stt.js +8 -3
  90. package/dist/inference/stt.js.map +1 -1
  91. package/dist/inference/tts.cjs +7 -2
  92. package/dist/inference/tts.cjs.map +1 -1
  93. package/dist/inference/tts.d.cts +2 -0
  94. package/dist/inference/tts.d.ts +2 -0
  95. package/dist/inference/tts.d.ts.map +1 -1
  96. package/dist/inference/tts.js +8 -3
  97. package/dist/inference/tts.js.map +1 -1
  98. package/dist/inference/utils.cjs +26 -7
  99. package/dist/inference/utils.cjs.map +1 -1
  100. package/dist/inference/utils.d.cts +13 -0
  101. package/dist/inference/utils.d.ts +13 -0
  102. package/dist/inference/utils.d.ts.map +1 -1
  103. package/dist/inference/utils.js +18 -2
  104. package/dist/inference/utils.js.map +1 -1
  105. package/dist/llm/chat_context.cjs +20 -2
  106. package/dist/llm/chat_context.cjs.map +1 -1
  107. package/dist/llm/chat_context.d.cts +19 -1
  108. package/dist/llm/chat_context.d.ts +19 -1
  109. package/dist/llm/chat_context.d.ts.map +1 -1
  110. package/dist/llm/chat_context.js +20 -2
  111. package/dist/llm/chat_context.js.map +1 -1
  112. package/dist/llm/index.cjs.map +1 -1
  113. package/dist/llm/index.d.cts +1 -1
  114. package/dist/llm/index.d.ts +1 -1
  115. package/dist/llm/index.d.ts.map +1 -1
  116. package/dist/llm/index.js.map +1 -1
  117. package/dist/llm/llm.cjs +16 -1
  118. package/dist/llm/llm.cjs.map +1 -1
  119. package/dist/llm/llm.d.cts +9 -0
  120. package/dist/llm/llm.d.ts +9 -0
  121. package/dist/llm/llm.d.ts.map +1 -1
  122. package/dist/llm/llm.js +16 -1
  123. package/dist/llm/llm.js.map +1 -1
  124. package/dist/llm/realtime.cjs +3 -0
  125. package/dist/llm/realtime.cjs.map +1 -1
  126. package/dist/llm/realtime.d.cts +1 -0
  127. package/dist/llm/realtime.d.ts +1 -0
  128. package/dist/llm/realtime.d.ts.map +1 -1
  129. package/dist/llm/realtime.js +3 -0
  130. package/dist/llm/realtime.js.map +1 -1
  131. package/dist/metrics/base.cjs.map +1 -1
  132. package/dist/metrics/base.d.cts +45 -1
  133. package/dist/metrics/base.d.ts +45 -1
  134. package/dist/metrics/base.d.ts.map +1 -1
  135. package/dist/metrics/index.cjs +5 -0
  136. package/dist/metrics/index.cjs.map +1 -1
  137. package/dist/metrics/index.d.cts +2 -1
  138. package/dist/metrics/index.d.ts +2 -1
  139. package/dist/metrics/index.d.ts.map +1 -1
  140. package/dist/metrics/index.js +6 -0
  141. package/dist/metrics/index.js.map +1 -1
  142. package/dist/metrics/model_usage.cjs +189 -0
  143. package/dist/metrics/model_usage.cjs.map +1 -0
  144. package/dist/metrics/model_usage.d.cts +92 -0
  145. package/dist/metrics/model_usage.d.ts +92 -0
  146. package/dist/metrics/model_usage.d.ts.map +1 -0
  147. package/dist/metrics/model_usage.js +164 -0
  148. package/dist/metrics/model_usage.js.map +1 -0
  149. package/dist/metrics/model_usage.test.cjs +474 -0
  150. package/dist/metrics/model_usage.test.cjs.map +1 -0
  151. package/dist/metrics/model_usage.test.js +476 -0
  152. package/dist/metrics/model_usage.test.js.map +1 -0
  153. package/dist/metrics/usage_collector.cjs +3 -0
  154. package/dist/metrics/usage_collector.cjs.map +1 -1
  155. package/dist/metrics/usage_collector.d.cts +9 -0
  156. package/dist/metrics/usage_collector.d.ts +9 -0
  157. package/dist/metrics/usage_collector.d.ts.map +1 -1
  158. package/dist/metrics/usage_collector.js +3 -0
  159. package/dist/metrics/usage_collector.js.map +1 -1
  160. package/dist/metrics/utils.cjs +9 -0
  161. package/dist/metrics/utils.cjs.map +1 -1
  162. package/dist/metrics/utils.d.ts.map +1 -1
  163. package/dist/metrics/utils.js +9 -0
  164. package/dist/metrics/utils.js.map +1 -1
  165. package/dist/stream/multi_input_stream.test.cjs +4 -0
  166. package/dist/stream/multi_input_stream.test.cjs.map +1 -1
  167. package/dist/stream/multi_input_stream.test.js +5 -1
  168. package/dist/stream/multi_input_stream.test.js.map +1 -1
  169. package/dist/stream/stream_channel.cjs +31 -0
  170. package/dist/stream/stream_channel.cjs.map +1 -1
  171. package/dist/stream/stream_channel.d.cts +4 -2
  172. package/dist/stream/stream_channel.d.ts +4 -2
  173. package/dist/stream/stream_channel.d.ts.map +1 -1
  174. package/dist/stream/stream_channel.js +31 -0
  175. package/dist/stream/stream_channel.js.map +1 -1
  176. package/dist/stt/stt.cjs +34 -2
  177. package/dist/stt/stt.cjs.map +1 -1
  178. package/dist/stt/stt.d.cts +22 -0
  179. package/dist/stt/stt.d.ts +22 -0
  180. package/dist/stt/stt.d.ts.map +1 -1
  181. package/dist/stt/stt.js +34 -2
  182. package/dist/stt/stt.js.map +1 -1
  183. package/dist/telemetry/otel_http_exporter.cjs +24 -5
  184. package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
  185. package/dist/telemetry/otel_http_exporter.d.cts +1 -0
  186. package/dist/telemetry/otel_http_exporter.d.ts +1 -0
  187. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
  188. package/dist/telemetry/otel_http_exporter.js +24 -5
  189. package/dist/telemetry/otel_http_exporter.js.map +1 -1
  190. package/dist/telemetry/trace_types.cjs +5 -5
  191. package/dist/telemetry/trace_types.cjs.map +1 -1
  192. package/dist/telemetry/trace_types.d.cts +9 -5
  193. package/dist/telemetry/trace_types.d.ts +9 -5
  194. package/dist/telemetry/trace_types.d.ts.map +1 -1
  195. package/dist/telemetry/trace_types.js +5 -5
  196. package/dist/telemetry/trace_types.js.map +1 -1
  197. package/dist/telemetry/traces.cjs +47 -8
  198. package/dist/telemetry/traces.cjs.map +1 -1
  199. package/dist/telemetry/traces.d.ts.map +1 -1
  200. package/dist/telemetry/traces.js +47 -8
  201. package/dist/telemetry/traces.js.map +1 -1
  202. package/dist/tts/tts.cjs +64 -2
  203. package/dist/tts/tts.cjs.map +1 -1
  204. package/dist/tts/tts.d.cts +34 -0
  205. package/dist/tts/tts.d.ts +34 -0
  206. package/dist/tts/tts.d.ts.map +1 -1
  207. package/dist/tts/tts.js +64 -2
  208. package/dist/tts/tts.js.map +1 -1
  209. package/dist/version.cjs +1 -1
  210. package/dist/version.js +1 -1
  211. package/dist/voice/agent.cjs +25 -4
  212. package/dist/voice/agent.cjs.map +1 -1
  213. package/dist/voice/agent.d.cts +10 -2
  214. package/dist/voice/agent.d.ts +10 -2
  215. package/dist/voice/agent.d.ts.map +1 -1
  216. package/dist/voice/agent.js +25 -4
  217. package/dist/voice/agent.js.map +1 -1
  218. package/dist/voice/agent_activity.cjs +261 -36
  219. package/dist/voice/agent_activity.cjs.map +1 -1
  220. package/dist/voice/agent_activity.d.cts +20 -6
  221. package/dist/voice/agent_activity.d.ts +20 -6
  222. package/dist/voice/agent_activity.d.ts.map +1 -1
  223. package/dist/voice/agent_activity.js +262 -37
  224. package/dist/voice/agent_activity.js.map +1 -1
  225. package/dist/voice/agent_session.cjs +105 -48
  226. package/dist/voice/agent_session.cjs.map +1 -1
  227. package/dist/voice/agent_session.d.cts +90 -20
  228. package/dist/voice/agent_session.d.ts +90 -20
  229. package/dist/voice/agent_session.d.ts.map +1 -1
  230. package/dist/voice/agent_session.js +105 -46
  231. package/dist/voice/agent_session.js.map +1 -1
  232. package/dist/voice/audio_recognition.cjs +287 -6
  233. package/dist/voice/audio_recognition.cjs.map +1 -1
  234. package/dist/voice/audio_recognition.d.cts +42 -3
  235. package/dist/voice/audio_recognition.d.ts +42 -3
  236. package/dist/voice/audio_recognition.d.ts.map +1 -1
  237. package/dist/voice/audio_recognition.js +289 -7
  238. package/dist/voice/audio_recognition.js.map +1 -1
  239. package/dist/voice/client_events.cjs +554 -0
  240. package/dist/voice/client_events.cjs.map +1 -0
  241. package/dist/voice/client_events.d.cts +195 -0
  242. package/dist/voice/client_events.d.ts +195 -0
  243. package/dist/voice/client_events.d.ts.map +1 -0
  244. package/dist/voice/client_events.js +548 -0
  245. package/dist/voice/client_events.js.map +1 -0
  246. package/dist/voice/events.cjs +1 -0
  247. package/dist/voice/events.cjs.map +1 -1
  248. package/dist/voice/events.d.cts +8 -5
  249. package/dist/voice/events.d.ts +8 -5
  250. package/dist/voice/events.d.ts.map +1 -1
  251. package/dist/voice/events.js +1 -0
  252. package/dist/voice/events.js.map +1 -1
  253. package/dist/voice/generation.cjs +43 -8
  254. package/dist/voice/generation.cjs.map +1 -1
  255. package/dist/voice/generation.d.cts +3 -3
  256. package/dist/voice/generation.d.ts +3 -3
  257. package/dist/voice/generation.d.ts.map +1 -1
  258. package/dist/voice/generation.js +43 -8
  259. package/dist/voice/generation.js.map +1 -1
  260. package/dist/voice/index.cjs +1 -0
  261. package/dist/voice/index.cjs.map +1 -1
  262. package/dist/voice/index.d.cts +1 -0
  263. package/dist/voice/index.d.ts +1 -0
  264. package/dist/voice/index.d.ts.map +1 -1
  265. package/dist/voice/index.js +1 -0
  266. package/dist/voice/index.js.map +1 -1
  267. package/dist/voice/report.cjs +20 -8
  268. package/dist/voice/report.cjs.map +1 -1
  269. package/dist/voice/report.d.cts +5 -0
  270. package/dist/voice/report.d.ts +5 -0
  271. package/dist/voice/report.d.ts.map +1 -1
  272. package/dist/voice/report.js +20 -8
  273. package/dist/voice/report.js.map +1 -1
  274. package/dist/voice/report.test.cjs +106 -0
  275. package/dist/voice/report.test.cjs.map +1 -0
  276. package/dist/voice/report.test.js +105 -0
  277. package/dist/voice/report.test.js.map +1 -0
  278. package/dist/voice/room_io/room_io.cjs +5 -39
  279. package/dist/voice/room_io/room_io.cjs.map +1 -1
  280. package/dist/voice/room_io/room_io.d.cts +4 -9
  281. package/dist/voice/room_io/room_io.d.ts +4 -9
  282. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  283. package/dist/voice/room_io/room_io.js +5 -40
  284. package/dist/voice/room_io/room_io.js.map +1 -1
  285. package/dist/voice/turn_config/endpointing.cjs +33 -0
  286. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  287. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  288. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  289. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  290. package/dist/voice/turn_config/endpointing.js +9 -0
  291. package/dist/voice/turn_config/endpointing.js.map +1 -0
  292. package/dist/voice/turn_config/interruption.cjs +37 -0
  293. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  294. package/dist/voice/turn_config/interruption.d.cts +53 -0
  295. package/dist/voice/turn_config/interruption.d.ts +53 -0
  296. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  297. package/dist/voice/turn_config/interruption.js +13 -0
  298. package/dist/voice/turn_config/interruption.js.map +1 -0
  299. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  300. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  301. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  302. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  303. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  304. package/dist/voice/turn_config/turn_handling.js +11 -0
  305. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  306. package/dist/voice/turn_config/utils.cjs +97 -0
  307. package/dist/voice/turn_config/utils.cjs.map +1 -0
  308. package/dist/voice/turn_config/utils.d.cts +25 -0
  309. package/dist/voice/turn_config/utils.d.ts +25 -0
  310. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  311. package/dist/voice/turn_config/utils.js +73 -0
  312. package/dist/voice/turn_config/utils.js.map +1 -0
  313. package/dist/voice/turn_config/utils.test.cjs +86 -0
  314. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  315. package/dist/voice/turn_config/utils.test.js +85 -0
  316. package/dist/voice/turn_config/utils.test.js.map +1 -0
  317. package/dist/voice/wire_format.cjs +798 -0
  318. package/dist/voice/wire_format.cjs.map +1 -0
  319. package/dist/voice/wire_format.d.cts +5503 -0
  320. package/dist/voice/wire_format.d.ts +5503 -0
  321. package/dist/voice/wire_format.d.ts.map +1 -0
  322. package/dist/voice/wire_format.js +728 -0
  323. package/dist/voice/wire_format.js.map +1 -0
  324. package/package.json +2 -1
  325. package/src/constants.ts +13 -0
  326. package/src/inference/interruption/defaults.ts +51 -0
  327. package/src/inference/interruption/errors.ts +25 -0
  328. package/src/inference/interruption/http_transport.ts +187 -0
  329. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  330. package/src/inference/interruption/interruption_detector.ts +188 -0
  331. package/src/inference/interruption/interruption_stream.ts +467 -0
  332. package/src/inference/interruption/types.ts +84 -0
  333. package/src/inference/interruption/utils.test.ts +132 -0
  334. package/src/inference/interruption/utils.ts +137 -0
  335. package/src/inference/interruption/ws_transport.ts +402 -0
  336. package/src/inference/llm.ts +9 -12
  337. package/src/inference/stt.ts +10 -3
  338. package/src/inference/tts.ts +10 -3
  339. package/src/inference/utils.ts +29 -1
  340. package/src/llm/chat_context.ts +40 -2
  341. package/src/llm/index.ts +1 -0
  342. package/src/llm/llm.ts +16 -0
  343. package/src/llm/realtime.ts +4 -0
  344. package/src/metrics/base.ts +48 -1
  345. package/src/metrics/index.ts +11 -0
  346. package/src/metrics/model_usage.test.ts +545 -0
  347. package/src/metrics/model_usage.ts +262 -0
  348. package/src/metrics/usage_collector.ts +11 -0
  349. package/src/metrics/utils.ts +11 -0
  350. package/src/stream/multi_input_stream.test.ts +6 -1
  351. package/src/stream/stream_channel.ts +34 -2
  352. package/src/stt/stt.ts +38 -0
  353. package/src/telemetry/otel_http_exporter.ts +28 -5
  354. package/src/telemetry/trace_types.ts +11 -8
  355. package/src/telemetry/traces.ts +111 -54
  356. package/src/tts/tts.ts +69 -1
  357. package/src/voice/agent.ts +30 -3
  358. package/src/voice/agent_activity.ts +327 -28
  359. package/src/voice/agent_session.ts +207 -59
  360. package/src/voice/audio_recognition.ts +385 -9
  361. package/src/voice/client_events.ts +838 -0
  362. package/src/voice/events.ts +14 -4
  363. package/src/voice/generation.ts +52 -9
  364. package/src/voice/index.ts +1 -0
  365. package/src/voice/report.test.ts +117 -0
  366. package/src/voice/report.ts +29 -6
  367. package/src/voice/room_io/room_io.ts +7 -61
  368. package/src/voice/turn_config/endpointing.ts +33 -0
  369. package/src/voice/turn_config/interruption.ts +56 -0
  370. package/src/voice/turn_config/turn_handling.ts +45 -0
  371. package/src/voice/turn_config/utils.test.ts +100 -0
  372. package/src/voice/turn_config/utils.ts +103 -0
  373. package/src/voice/wire_format.ts +827 -0
@@ -2,7 +2,8 @@ import { Mutex } from "@livekit/mutex";
2
2
  import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
3
3
  import { Heap } from "heap-js";
4
4
  import { AsyncLocalStorage } from "node:async_hooks";
5
- import { ReadableStream } from "node:stream/web";
5
+ import { ReadableStream, TransformStream } from "node:stream/web";
6
+ import { AdaptiveInterruptionDetector } from "../inference/interruption/interruption_detector.js";
6
7
  import { ChatMessage } from "../llm/chat_context.js";
7
8
  import {
8
9
  LLM,
@@ -74,16 +75,34 @@ class AgentActivity {
74
75
  // default to null as None, which maps to the default provider tool choice value
75
76
  toolChoice = null;
76
77
  _preemptiveGeneration;
77
- /** @internal */
78
- _mainTask;
79
- _onEnterTask;
80
- _onExitTask;
81
- _userTurnCompletedTask;
78
+ interruptionDetector;
79
+ isInterruptionDetectionEnabled;
80
+ isInterruptionByAudioActivityEnabled;
81
+ isDefaultInterruptionByAudioActivityEnabled;
82
82
  onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
83
83
  onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
84
84
  onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
85
85
  onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
86
86
  onModelError = (ev) => this.onError(ev);
87
+ onInterruptionOverlappingSpeech = (ev) => {
88
+ this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
89
+ };
90
+ onInterruptionMetricsCollected = (ev) => {
91
+ this.agentSession.emit(
92
+ AgentSessionEventTypes.MetricsCollected,
93
+ createMetricsCollectedEvent({ metrics: ev })
94
+ );
95
+ };
96
+ onInterruptionError = (ev) => {
97
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
98
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
99
+ this.agentSession._onError(ev);
100
+ };
101
+ /** @internal */
102
+ _mainTask;
103
+ _onEnterTask;
104
+ _onExitTask;
105
+ _userTurnCompletedTask;
87
106
  constructor(agent, agentSession) {
88
107
  this.agent = agent;
89
108
  this.agentSession = agentSession;
@@ -142,6 +161,10 @@ class AgentActivity {
142
161
  "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
143
162
  );
144
163
  }
164
+ this.interruptionDetector = this.resolveInterruptionDetector();
165
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
166
+ this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
167
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
145
168
  }
146
169
  async start() {
147
170
  const unlock = await this.lock.lock();
@@ -234,8 +257,9 @@ class AgentActivity {
234
257
  vad: this.vad,
235
258
  turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
236
259
  turnDetectionMode: this.turnDetectionMode,
237
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
238
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
260
+ interruptionDetection: this.interruptionDetector,
261
+ minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
262
+ maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
239
263
  rootSpanContext: this.agentSession.rootSpanContext,
240
264
  sttModel: (_a = this.stt) == null ? void 0 : _a.label,
241
265
  sttProvider: this.getSttProvider(),
@@ -297,7 +321,8 @@ class AgentActivity {
297
321
  return this.realtimeSession;
298
322
  }
299
323
  get allowInterruptions() {
300
- return this.agentSession.options.allowInterruptions;
324
+ var _a;
325
+ return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
301
326
  }
302
327
  get useTtsAlignedTranscript() {
303
328
  return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
@@ -308,6 +333,11 @@ class AgentActivity {
308
333
  get toolCtx() {
309
334
  return this.agent.toolCtx;
310
335
  }
336
+ /** @internal */
337
+ get inputStartedAt() {
338
+ var _a;
339
+ return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
340
+ }
311
341
  async updateChatCtx(chatCtx) {
312
342
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
313
343
  this.agent._chatCtx = chatCtx;
@@ -332,19 +362,40 @@ class AgentActivity {
332
362
  await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
333
363
  }
334
364
  }
335
- updateOptions({ toolChoice }) {
365
+ updateOptions({
366
+ toolChoice,
367
+ turnDetection
368
+ }) {
336
369
  if (toolChoice !== void 0) {
337
370
  this.toolChoice = toolChoice;
338
371
  }
339
372
  if (this.realtimeSession) {
340
373
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
341
374
  }
375
+ if (turnDetection !== void 0) {
376
+ this.turnDetectionMode = turnDetection;
377
+ this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
378
+ if (this.agentSession.agentState !== "speaking") {
379
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
380
+ }
381
+ }
382
+ if (this.audioRecognition) {
383
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
384
+ }
342
385
  }
343
386
  attachAudioInput(audioStream) {
344
387
  void this.audioStream.close();
345
388
  this.audioStream = new MultiInputStream();
389
+ const aecWarmupAudioFilter = new TransformStream({
390
+ transform: (frame, controller) => {
391
+ const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
392
+ if (!shouldDiscardForAecWarmup) {
393
+ controller.enqueue(frame);
394
+ }
395
+ }
396
+ });
346
397
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
347
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
398
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
348
399
  if (this.realtimeSession) {
349
400
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
350
401
  }
@@ -450,6 +501,13 @@ class AgentActivity {
450
501
  this.logger.info("onInputSpeechStarted");
451
502
  if (!this.vad) {
452
503
  this.agentSession._updateUserState("speaking");
504
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
505
+ this.audioRecognition.onStartOfOverlapSpeech(
506
+ 0,
507
+ Date.now(),
508
+ this.agentSession._userSpeakingSpan
509
+ );
510
+ }
453
511
  }
454
512
  try {
455
513
  this.interrupt();
@@ -463,6 +521,9 @@ class AgentActivity {
463
521
  onInputSpeechStopped(ev) {
464
522
  this.logger.info(ev, "onInputSpeechStopped");
465
523
  if (!this.vad) {
524
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
525
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
526
+ }
466
527
  this.agentSession._updateUserState("listening");
467
528
  }
468
529
  if (ev.userTranscriptionEnabled) {
@@ -524,48 +585,75 @@ class AgentActivity {
524
585
  onStartOfSpeech(ev) {
525
586
  let speechStartTime = Date.now();
526
587
  if (ev) {
527
- speechStartTime = speechStartTime - ev.speechDuration;
588
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
528
589
  }
529
590
  this.agentSession._updateUserState("speaking", speechStartTime);
591
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
592
+ this.audioRecognition.onStartOfOverlapSpeech(
593
+ ev.speechDuration,
594
+ speechStartTime,
595
+ this.agentSession._userSpeakingSpan
596
+ );
597
+ }
530
598
  }
531
599
  onEndOfSpeech(ev) {
532
600
  let speechEndTime = Date.now();
533
601
  if (ev) {
534
- speechEndTime = speechEndTime - ev.silenceDuration;
602
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
603
+ }
604
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
605
+ this.audioRecognition.onEndOfOverlapSpeech(
606
+ speechEndTime,
607
+ this.agentSession._userSpeakingSpan
608
+ );
535
609
  }
536
610
  this.agentSession._updateUserState("listening", speechEndTime);
537
611
  }
538
612
  onVADInferenceDone(ev) {
613
+ var _a;
539
614
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
540
615
  return;
541
616
  }
542
- if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
617
+ if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
543
618
  this.interruptByAudioActivity();
544
619
  }
545
620
  }
546
621
  interruptByAudioActivity() {
547
- var _a, _b;
622
+ var _a, _b, _c, _d;
623
+ if (!this.isInterruptionByAudioActivityEnabled) {
624
+ return;
625
+ }
626
+ if (this.agentSession._aecWarmupRemaining > 0) {
627
+ return;
628
+ }
548
629
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
549
630
  return;
550
631
  }
551
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
632
+ if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
552
633
  const text = this.audioRecognition.currentTranscript;
553
634
  const normalizedText = text ?? "";
554
635
  const wordCount = splitWords(normalizedText, true).length;
555
- if (wordCount < this.agentSession.options.minInterruptionWords) {
636
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
556
637
  return;
557
638
  }
558
639
  }
559
- (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
640
+ (_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
560
641
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
561
642
  this.logger.info(
562
643
  { "speech id": this._currentSpeech.id },
563
644
  "speech interrupted by audio activity"
564
645
  );
565
- (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
646
+ (_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
566
647
  this._currentSpeech.interrupt();
567
648
  }
568
649
  }
650
+ onInterruption(ev) {
651
+ this.restoreInterruptionByAudioActivity();
652
+ this.interruptByAudioActivity();
653
+ if (this.audioRecognition) {
654
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
655
+ }
656
+ }
569
657
  onInterimTranscript(ev) {
570
658
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
571
659
  return;
@@ -614,7 +702,8 @@ class AgentActivity {
614
702
  );
615
703
  const userMessage = ChatMessage.create({
616
704
  role: "user",
617
- content: info.newTranscript
705
+ content: info.newTranscript,
706
+ transcriptConfidence: info.transcriptConfidence
618
707
  });
619
708
  const chatCtx = this.agent.chatCtx.copy();
620
709
  const speechHandle = this.generateReply({
@@ -672,6 +761,7 @@ class AgentActivity {
672
761
  return task;
673
762
  }
674
763
  async onEndOfTurn(info) {
764
+ var _a, _b;
675
765
  if (this.schedulingPaused) {
676
766
  this.cancelPreemptiveGeneration();
677
767
  this.logger.warn(
@@ -680,14 +770,14 @@ class AgentActivity {
680
770
  );
681
771
  return true;
682
772
  }
683
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
773
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
684
774
  const wordCount = splitWords(info.newTranscript, true).length;
685
- if (wordCount < this.agentSession.options.minInterruptionWords) {
775
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
686
776
  this.cancelPreemptiveGeneration();
687
777
  this.logger.info(
688
778
  {
689
779
  wordCount,
690
- minInterruptionWords: this.agentSession.options.minInterruptionWords
780
+ minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
691
781
  },
692
782
  "skipping user input, word count below minimum interruption threshold"
693
783
  );
@@ -906,7 +996,8 @@ ${instructions}`;
906
996
  }
907
997
  let userMessage = ChatMessage.create({
908
998
  role: "user",
909
- content: info.newTranscript
999
+ content: info.newTranscript,
1000
+ transcriptConfidence: info.transcriptConfidence
910
1001
  });
911
1002
  const chatCtx = this.agent.chatCtx.copy();
912
1003
  const startTime = Date.now();
@@ -924,11 +1015,32 @@ ${instructions}`;
924
1015
  } else if (this.llm === void 0) {
925
1016
  return;
926
1017
  }
1018
+ const userMetricsReport = {};
1019
+ if (info.startedSpeakingAt !== void 0) {
1020
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
1021
+ }
1022
+ if (info.stoppedSpeakingAt !== void 0) {
1023
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
1024
+ }
1025
+ if (info.transcriptionDelay !== void 0) {
1026
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
1027
+ }
1028
+ if (info.endOfUtteranceDelay !== void 0) {
1029
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
1030
+ }
1031
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
1032
+ if (userMessage) {
1033
+ userMessage.metrics = userMetricsReport;
1034
+ }
927
1035
  let speechHandle;
928
1036
  if (this._preemptiveGeneration !== void 0) {
929
1037
  const preemptive = this._preemptiveGeneration;
930
1038
  if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && isSameToolContext(preemptive.tools, this.tools) && isSameToolChoice(preemptive.toolChoice, this.toolChoice)) {
931
1039
  speechHandle = preemptive.speechHandle;
1040
+ if (preemptive.userMessage && userMessage) {
1041
+ preemptive.userMessage.metrics = userMetricsReport;
1042
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1043
+ }
932
1044
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
933
1045
  this.logger.debug(
934
1046
  {
@@ -962,6 +1074,7 @@ ${instructions}`;
962
1074
  );
963
1075
  }
964
1076
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
1077
+ var _a, _b;
965
1078
  speechHandle._agentTurnContext = otelContext.active();
966
1079
  speechHandleStorage.enterWith(speechHandle);
967
1080
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
@@ -994,11 +1107,18 @@ ${instructions}`;
994
1107
  textOut = _textOut;
995
1108
  tasks.push(textForwardTask);
996
1109
  }
1110
+ let replyStartedSpeakingAt;
1111
+ let replyTtsGenData = null;
997
1112
  const onFirstFrame = (startedSpeakingAt) => {
1113
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
998
1114
  this.agentSession._updateAgentState("speaking", {
999
1115
  startTime: startedSpeakingAt,
1000
1116
  otelContext: speechHandle._agentTurnContext
1001
1117
  });
1118
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1119
+ this.audioRecognition.onStartOfAgentSpeech();
1120
+ this.isInterruptionByAudioActivityEnabled = false;
1121
+ }
1002
1122
  };
1003
1123
  if (!audioOutput) {
1004
1124
  if (textOut) {
@@ -1011,9 +1131,12 @@ ${instructions}`;
1011
1131
  (...args) => this.agent.ttsNode(...args),
1012
1132
  audioSource,
1013
1133
  modelSettings,
1014
- replyAbortController
1134
+ replyAbortController,
1135
+ (_a = this.tts) == null ? void 0 : _a.model,
1136
+ (_b = this.tts) == null ? void 0 : _b.provider
1015
1137
  );
1016
1138
  tasks.push(ttsTask);
1139
+ replyTtsGenData = ttsGenData;
1017
1140
  const [forwardTask, _audioOut] = performAudioForwarding(
1018
1141
  ttsGenData.audioStream,
1019
1142
  audioOutput,
@@ -1045,16 +1168,30 @@ ${instructions}`;
1045
1168
  }
1046
1169
  }
1047
1170
  if (addToChatCtx) {
1171
+ const replyStoppedSpeakingAt = Date.now();
1172
+ const replyAssistantMetrics = {};
1173
+ if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
1174
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1175
+ }
1176
+ if (replyStartedSpeakingAt !== void 0) {
1177
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
1178
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
1179
+ }
1048
1180
  const message = ChatMessage.create({
1049
1181
  role: "assistant",
1050
1182
  content: (textOut == null ? void 0 : textOut.text) || "",
1051
- interrupted: speechHandle.interrupted
1183
+ interrupted: speechHandle.interrupted,
1184
+ metrics: replyAssistantMetrics
1052
1185
  });
1053
1186
  this.agent._chatCtx.insert(message);
1054
1187
  this.agentSession._conversationItemAdded(message);
1055
1188
  }
1056
1189
  if (this.agentSession.agentState === "speaking") {
1057
1190
  this.agentSession._updateAgentState("listening");
1191
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1192
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1193
+ }
1194
+ this.restoreInterruptionByAudioActivity();
1058
1195
  }
1059
1196
  }
1060
1197
  _pipelineReplyTaskImpl = async ({
@@ -1066,9 +1203,10 @@ ${instructions}`;
1066
1203
  instructions,
1067
1204
  newMessage,
1068
1205
  toolsMessages,
1069
- span
1206
+ span,
1207
+ _previousUserMetrics
1070
1208
  }) => {
1071
- var _a, _b;
1209
+ var _a, _b, _c, _d, _e, _f;
1072
1210
  speechHandle._agentTurnContext = otelContext.active();
1073
1211
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1074
1212
  if (instructions) {
@@ -1106,7 +1244,9 @@ ${instructions}`;
1106
1244
  chatCtx,
1107
1245
  toolCtx,
1108
1246
  modelSettings,
1109
- replyAbortController
1247
+ replyAbortController,
1248
+ (_b = this.llm) == null ? void 0 : _b.model,
1249
+ (_c = this.llm) == null ? void 0 : _c.provider
1110
1250
  );
1111
1251
  tasks.push(llmTask);
1112
1252
  let ttsTask = null;
@@ -1119,16 +1259,20 @@ ${instructions}`;
1119
1259
  (...args) => this.agent.ttsNode(...args),
1120
1260
  ttsTextInput,
1121
1261
  modelSettings,
1122
- replyAbortController
1262
+ replyAbortController,
1263
+ (_d = this.tts) == null ? void 0 : _d.model,
1264
+ (_e = this.tts) == null ? void 0 : _e.provider
1123
1265
  );
1124
1266
  tasks.push(ttsTask);
1125
1267
  } else {
1126
1268
  llmOutput = llmGenData.textStream;
1127
1269
  }
1128
1270
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1271
+ let userMetrics = _previousUserMetrics;
1129
1272
  if (newMessage && speechHandle.scheduled) {
1130
1273
  this.agent._chatCtx.insert(newMessage);
1131
1274
  this.agentSession._conversationItemAdded(newMessage);
1275
+ userMetrics = newMessage.metrics;
1132
1276
  }
1133
1277
  if (speechHandle.interrupted) {
1134
1278
  replyAbortController.abort();
@@ -1140,7 +1284,7 @@ ${instructions}`;
1140
1284
  speechHandle._clearAuthorization();
1141
1285
  const replyStartedAt = Date.now();
1142
1286
  let transcriptionInput = llmOutput;
1143
- if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
1287
+ if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
1144
1288
  const timedTextsStream = await Promise.race([
1145
1289
  ttsGenData.timedTextsFut.await,
1146
1290
  (ttsTask == null ? void 0 : ttsTask.result.catch(
@@ -1163,11 +1307,17 @@ ${instructions}`;
1163
1307
  tasks.push(textForwardTask);
1164
1308
  textOut = _textOut;
1165
1309
  }
1310
+ let agentStartedSpeakingAt;
1166
1311
  const onFirstFrame = (startedSpeakingAt) => {
1312
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1167
1313
  this.agentSession._updateAgentState("speaking", {
1168
1314
  startTime: startedSpeakingAt,
1169
1315
  otelContext: speechHandle._agentTurnContext
1170
1316
  });
1317
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1318
+ this.audioRecognition.onStartOfAgentSpeech();
1319
+ this.isInterruptionByAudioActivityEnabled = false;
1320
+ }
1171
1321
  };
1172
1322
  let audioOut = null;
1173
1323
  if (audioOutput) {
@@ -1210,6 +1360,25 @@ ${instructions}`;
1210
1360
  if (audioOutput) {
1211
1361
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1212
1362
  }
1363
+ const agentStoppedSpeakingAt = Date.now();
1364
+ const assistantMetrics = {};
1365
+ if (llmGenData.ttft !== void 0) {
1366
+ assistantMetrics.llmNodeTtft = llmGenData.ttft;
1367
+ }
1368
+ if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
1369
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
1370
+ }
1371
+ if (agentStartedSpeakingAt !== void 0) {
1372
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
1373
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
1374
+ if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
1375
+ const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
1376
+ assistantMetrics.e2eLatency = e2eLatency;
1377
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1378
+ }
1379
+ }
1380
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1381
+ let hasSpeechMessage = false;
1213
1382
  if (toolsMessages) {
1214
1383
  for (const msg of toolsMessages) {
1215
1384
  msg.createdAt = replyStartedAt;
@@ -1250,20 +1419,27 @@ ${instructions}`;
1250
1419
  }
1251
1420
  }
1252
1421
  if (forwardedText) {
1422
+ hasSpeechMessage = true;
1253
1423
  const message = ChatMessage.create({
1254
1424
  role: "assistant",
1255
1425
  content: forwardedText,
1256
1426
  id: llmGenData.id,
1257
1427
  interrupted: true,
1258
- createdAt: replyStartedAt
1428
+ createdAt: replyStartedAt,
1429
+ metrics: assistantMetrics
1259
1430
  });
1260
1431
  chatCtx.insert(message);
1261
1432
  this.agent._chatCtx.insert(message);
1262
1433
  speechHandle._itemAdded([message]);
1263
1434
  this.agentSession._conversationItemAdded(message);
1435
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1264
1436
  }
1265
1437
  if (this.agentSession.agentState === "speaking") {
1266
1438
  this.agentSession._updateAgentState("listening");
1439
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1440
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1441
+ this.restoreInterruptionByAudioActivity();
1442
+ }
1267
1443
  }
1268
1444
  this.logger.info(
1269
1445
  { speech_id: speechHandle.id, message: forwardedText },
@@ -1274,17 +1450,20 @@ ${instructions}`;
1274
1450
  return;
1275
1451
  }
1276
1452
  if (textOut && textOut.text) {
1453
+ hasSpeechMessage = true;
1277
1454
  const message = ChatMessage.create({
1278
1455
  role: "assistant",
1279
1456
  id: llmGenData.id,
1280
1457
  interrupted: false,
1281
1458
  createdAt: replyStartedAt,
1282
- content: textOut.text
1459
+ content: textOut.text,
1460
+ metrics: assistantMetrics
1283
1461
  });
1284
1462
  chatCtx.insert(message);
1285
1463
  this.agent._chatCtx.insert(message);
1286
1464
  speechHandle._itemAdded([message]);
1287
1465
  this.agentSession._conversationItemAdded(message);
1466
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1288
1467
  this.logger.info(
1289
1468
  { speech_id: speechHandle.id, message: textOut.text },
1290
1469
  "playout completed without interruption"
@@ -1294,6 +1473,12 @@ ${instructions}`;
1294
1473
  this.agentSession._updateAgentState("thinking");
1295
1474
  } else if (this.agentSession.agentState === "speaking") {
1296
1475
  this.agentSession._updateAgentState("listening");
1476
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1477
+ {
1478
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1479
+ this.restoreInterruptionByAudioActivity();
1480
+ }
1481
+ }
1297
1482
  }
1298
1483
  speechHandle._markGenerationDone();
1299
1484
  await executeToolsTask.result;
@@ -1333,7 +1518,8 @@ ${instructions}`;
1333
1518
  replyAbortController,
1334
1519
  instructions,
1335
1520
  void 0,
1336
- toolMessages
1521
+ toolMessages,
1522
+ hasSpeechMessage ? void 0 : userMetrics
1337
1523
  ),
1338
1524
  ownedSpeechHandle: speechHandle,
1339
1525
  name: "AgentActivity.pipelineReply"
@@ -1353,7 +1539,7 @@ ${instructions}`;
1353
1539
  }
1354
1540
  }
1355
1541
  };
1356
- pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
1542
+ pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => tracer.startActiveSpan(
1357
1543
  async (span) => this._pipelineReplyTaskImpl({
1358
1544
  speechHandle,
1359
1545
  chatCtx,
@@ -1363,7 +1549,8 @@ ${instructions}`;
1363
1549
  instructions,
1364
1550
  newMessage,
1365
1551
  toolsMessages,
1366
- span
1552
+ span,
1553
+ _previousUserMetrics
1367
1554
  }),
1368
1555
  {
1369
1556
  name: "agent_turn",
@@ -1429,6 +1616,7 @@ ${instructions}`;
1429
1616
  });
1430
1617
  };
1431
1618
  const readMessages = async (abortController, outputs) => {
1619
+ var _a2, _b;
1432
1620
  replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
1433
1621
  once: true
1434
1622
  });
@@ -1475,7 +1663,9 @@ ${instructions}`;
1475
1663
  (...args) => this.agent.ttsNode(...args),
1476
1664
  ttsTextInput,
1477
1665
  modelSettings,
1478
- abortController
1666
+ abortController,
1667
+ (_a2 = this.tts) == null ? void 0 : _a2.model,
1668
+ (_b = this.tts) == null ? void 0 : _b.provider
1479
1669
  );
1480
1670
  tasks.push(ttsTask);
1481
1671
  realtimeAudioResult = ttsGenData.audioStream;
@@ -1867,11 +2057,46 @@ ${instructions}`;
1867
2057
  if (this._mainTask) {
1868
2058
  await this._mainTask.cancelAndWait();
1869
2059
  }
2060
+ if (this.interruptionDetector) {
2061
+ this.interruptionDetector.off(
2062
+ "user_overlapping_speech",
2063
+ this.onInterruptionOverlappingSpeech
2064
+ );
2065
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2066
+ this.interruptionDetector.off("error", this.onInterruptionError);
2067
+ }
1870
2068
  this.agent._agentActivity = void 0;
1871
2069
  } finally {
1872
2070
  unlock();
1873
2071
  }
1874
2072
  }
2073
+ resolveInterruptionDetector() {
2074
+ const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2075
+ if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof RealtimeModel))) {
2076
+ if (interruptionDetection === "adaptive") {
2077
+ this.logger.warn(
2078
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
2079
+ );
2080
+ return void 0;
2081
+ }
2082
+ }
2083
+ if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
2084
+ return void 0;
2085
+ }
2086
+ try {
2087
+ const detector = new AdaptiveInterruptionDetector();
2088
+ detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
2089
+ detector.on("metrics_collected", this.onInterruptionMetricsCollected);
2090
+ detector.on("error", this.onInterruptionError);
2091
+ return detector;
2092
+ } catch (error) {
2093
+ this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
2094
+ }
2095
+ return void 0;
2096
+ }
2097
+ restoreInterruptionByAudioActivity() {
2098
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2099
+ }
1875
2100
  async _closeSessionResources() {
1876
2101
  var _a, _b, _c;
1877
2102
  if (this.llm instanceof LLM) {