@livekit/agents 1.0.48 → 1.1.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373):
  1. package/dist/constants.cjs +27 -0
  2. package/dist/constants.cjs.map +1 -1
  3. package/dist/constants.d.cts +9 -0
  4. package/dist/constants.d.ts +9 -0
  5. package/dist/constants.d.ts.map +1 -1
  6. package/dist/constants.js +18 -0
  7. package/dist/constants.js.map +1 -1
  8. package/dist/inference/api_protos.d.cts +71 -71
  9. package/dist/inference/api_protos.d.ts +71 -71
  10. package/dist/inference/interruption/defaults.cjs +81 -0
  11. package/dist/inference/interruption/defaults.cjs.map +1 -0
  12. package/dist/inference/interruption/defaults.d.cts +19 -0
  13. package/dist/inference/interruption/defaults.d.ts +19 -0
  14. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  15. package/dist/inference/interruption/defaults.js +46 -0
  16. package/dist/inference/interruption/defaults.js.map +1 -0
  17. package/dist/inference/interruption/errors.cjs +44 -0
  18. package/dist/inference/interruption/errors.cjs.map +1 -0
  19. package/dist/inference/interruption/errors.d.cts +12 -0
  20. package/dist/inference/interruption/errors.d.ts +12 -0
  21. package/dist/inference/interruption/errors.d.ts.map +1 -0
  22. package/dist/inference/interruption/errors.js +20 -0
  23. package/dist/inference/interruption/errors.js.map +1 -0
  24. package/dist/inference/interruption/http_transport.cjs +147 -0
  25. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  26. package/dist/inference/interruption/http_transport.d.cts +63 -0
  27. package/dist/inference/interruption/http_transport.d.ts +63 -0
  28. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  29. package/dist/inference/interruption/http_transport.js +121 -0
  30. package/dist/inference/interruption/http_transport.js.map +1 -0
  31. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  32. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  33. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  34. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  35. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  36. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  37. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  38. package/dist/inference/interruption/interruption_detector.cjs +181 -0
  39. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  40. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  41. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  42. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  43. package/dist/inference/interruption/interruption_detector.js +147 -0
  44. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  45. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  46. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  47. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  48. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  49. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  50. package/dist/inference/interruption/interruption_stream.js +344 -0
  51. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  52. package/dist/inference/interruption/types.cjs +17 -0
  53. package/dist/inference/interruption/types.cjs.map +1 -0
  54. package/dist/inference/interruption/types.d.cts +66 -0
  55. package/dist/inference/interruption/types.d.ts +66 -0
  56. package/dist/inference/interruption/types.d.ts.map +1 -0
  57. package/dist/inference/interruption/types.js +1 -0
  58. package/dist/inference/interruption/types.js.map +1 -0
  59. package/dist/inference/interruption/utils.cjs +130 -0
  60. package/dist/inference/interruption/utils.cjs.map +1 -0
  61. package/dist/inference/interruption/utils.d.cts +41 -0
  62. package/dist/inference/interruption/utils.d.ts +41 -0
  63. package/dist/inference/interruption/utils.d.ts.map +1 -0
  64. package/dist/inference/interruption/utils.js +105 -0
  65. package/dist/inference/interruption/utils.js.map +1 -0
  66. package/dist/inference/interruption/utils.test.cjs +105 -0
  67. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  68. package/dist/inference/interruption/utils.test.js +104 -0
  69. package/dist/inference/interruption/utils.test.js.map +1 -0
  70. package/dist/inference/interruption/ws_transport.cjs +329 -0
  71. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  72. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  73. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  74. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  75. package/dist/inference/interruption/ws_transport.js +295 -0
  76. package/dist/inference/interruption/ws_transport.js.map +1 -0
  77. package/dist/inference/llm.cjs +14 -10
  78. package/dist/inference/llm.cjs.map +1 -1
  79. package/dist/inference/llm.d.cts +2 -1
  80. package/dist/inference/llm.d.ts +2 -1
  81. package/dist/inference/llm.d.ts.map +1 -1
  82. package/dist/inference/llm.js +8 -10
  83. package/dist/inference/llm.js.map +1 -1
  84. package/dist/inference/stt.cjs +7 -2
  85. package/dist/inference/stt.cjs.map +1 -1
  86. package/dist/inference/stt.d.cts +2 -0
  87. package/dist/inference/stt.d.ts +2 -0
  88. package/dist/inference/stt.d.ts.map +1 -1
  89. package/dist/inference/stt.js +8 -3
  90. package/dist/inference/stt.js.map +1 -1
  91. package/dist/inference/tts.cjs +7 -2
  92. package/dist/inference/tts.cjs.map +1 -1
  93. package/dist/inference/tts.d.cts +2 -0
  94. package/dist/inference/tts.d.ts +2 -0
  95. package/dist/inference/tts.d.ts.map +1 -1
  96. package/dist/inference/tts.js +8 -3
  97. package/dist/inference/tts.js.map +1 -1
  98. package/dist/inference/utils.cjs +26 -7
  99. package/dist/inference/utils.cjs.map +1 -1
  100. package/dist/inference/utils.d.cts +13 -0
  101. package/dist/inference/utils.d.ts +13 -0
  102. package/dist/inference/utils.d.ts.map +1 -1
  103. package/dist/inference/utils.js +18 -2
  104. package/dist/inference/utils.js.map +1 -1
  105. package/dist/llm/chat_context.cjs +20 -2
  106. package/dist/llm/chat_context.cjs.map +1 -1
  107. package/dist/llm/chat_context.d.cts +19 -1
  108. package/dist/llm/chat_context.d.ts +19 -1
  109. package/dist/llm/chat_context.d.ts.map +1 -1
  110. package/dist/llm/chat_context.js +20 -2
  111. package/dist/llm/chat_context.js.map +1 -1
  112. package/dist/llm/index.cjs.map +1 -1
  113. package/dist/llm/index.d.cts +1 -1
  114. package/dist/llm/index.d.ts +1 -1
  115. package/dist/llm/index.d.ts.map +1 -1
  116. package/dist/llm/index.js.map +1 -1
  117. package/dist/llm/llm.cjs +16 -1
  118. package/dist/llm/llm.cjs.map +1 -1
  119. package/dist/llm/llm.d.cts +9 -0
  120. package/dist/llm/llm.d.ts +9 -0
  121. package/dist/llm/llm.d.ts.map +1 -1
  122. package/dist/llm/llm.js +16 -1
  123. package/dist/llm/llm.js.map +1 -1
  124. package/dist/llm/realtime.cjs +3 -0
  125. package/dist/llm/realtime.cjs.map +1 -1
  126. package/dist/llm/realtime.d.cts +1 -0
  127. package/dist/llm/realtime.d.ts +1 -0
  128. package/dist/llm/realtime.d.ts.map +1 -1
  129. package/dist/llm/realtime.js +3 -0
  130. package/dist/llm/realtime.js.map +1 -1
  131. package/dist/metrics/base.cjs.map +1 -1
  132. package/dist/metrics/base.d.cts +45 -1
  133. package/dist/metrics/base.d.ts +45 -1
  134. package/dist/metrics/base.d.ts.map +1 -1
  135. package/dist/metrics/index.cjs +5 -0
  136. package/dist/metrics/index.cjs.map +1 -1
  137. package/dist/metrics/index.d.cts +2 -1
  138. package/dist/metrics/index.d.ts +2 -1
  139. package/dist/metrics/index.d.ts.map +1 -1
  140. package/dist/metrics/index.js +6 -0
  141. package/dist/metrics/index.js.map +1 -1
  142. package/dist/metrics/model_usage.cjs +189 -0
  143. package/dist/metrics/model_usage.cjs.map +1 -0
  144. package/dist/metrics/model_usage.d.cts +92 -0
  145. package/dist/metrics/model_usage.d.ts +92 -0
  146. package/dist/metrics/model_usage.d.ts.map +1 -0
  147. package/dist/metrics/model_usage.js +164 -0
  148. package/dist/metrics/model_usage.js.map +1 -0
  149. package/dist/metrics/model_usage.test.cjs +474 -0
  150. package/dist/metrics/model_usage.test.cjs.map +1 -0
  151. package/dist/metrics/model_usage.test.js +476 -0
  152. package/dist/metrics/model_usage.test.js.map +1 -0
  153. package/dist/metrics/usage_collector.cjs +3 -0
  154. package/dist/metrics/usage_collector.cjs.map +1 -1
  155. package/dist/metrics/usage_collector.d.cts +9 -0
  156. package/dist/metrics/usage_collector.d.ts +9 -0
  157. package/dist/metrics/usage_collector.d.ts.map +1 -1
  158. package/dist/metrics/usage_collector.js +3 -0
  159. package/dist/metrics/usage_collector.js.map +1 -1
  160. package/dist/metrics/utils.cjs +9 -0
  161. package/dist/metrics/utils.cjs.map +1 -1
  162. package/dist/metrics/utils.d.ts.map +1 -1
  163. package/dist/metrics/utils.js +9 -0
  164. package/dist/metrics/utils.js.map +1 -1
  165. package/dist/stream/multi_input_stream.test.cjs +4 -0
  166. package/dist/stream/multi_input_stream.test.cjs.map +1 -1
  167. package/dist/stream/multi_input_stream.test.js +5 -1
  168. package/dist/stream/multi_input_stream.test.js.map +1 -1
  169. package/dist/stream/stream_channel.cjs +31 -0
  170. package/dist/stream/stream_channel.cjs.map +1 -1
  171. package/dist/stream/stream_channel.d.cts +4 -2
  172. package/dist/stream/stream_channel.d.ts +4 -2
  173. package/dist/stream/stream_channel.d.ts.map +1 -1
  174. package/dist/stream/stream_channel.js +31 -0
  175. package/dist/stream/stream_channel.js.map +1 -1
  176. package/dist/stt/stt.cjs +34 -2
  177. package/dist/stt/stt.cjs.map +1 -1
  178. package/dist/stt/stt.d.cts +22 -0
  179. package/dist/stt/stt.d.ts +22 -0
  180. package/dist/stt/stt.d.ts.map +1 -1
  181. package/dist/stt/stt.js +34 -2
  182. package/dist/stt/stt.js.map +1 -1
  183. package/dist/telemetry/otel_http_exporter.cjs +24 -5
  184. package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
  185. package/dist/telemetry/otel_http_exporter.d.cts +1 -0
  186. package/dist/telemetry/otel_http_exporter.d.ts +1 -0
  187. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
  188. package/dist/telemetry/otel_http_exporter.js +24 -5
  189. package/dist/telemetry/otel_http_exporter.js.map +1 -1
  190. package/dist/telemetry/trace_types.cjs +5 -5
  191. package/dist/telemetry/trace_types.cjs.map +1 -1
  192. package/dist/telemetry/trace_types.d.cts +9 -5
  193. package/dist/telemetry/trace_types.d.ts +9 -5
  194. package/dist/telemetry/trace_types.d.ts.map +1 -1
  195. package/dist/telemetry/trace_types.js +5 -5
  196. package/dist/telemetry/trace_types.js.map +1 -1
  197. package/dist/telemetry/traces.cjs +47 -8
  198. package/dist/telemetry/traces.cjs.map +1 -1
  199. package/dist/telemetry/traces.d.ts.map +1 -1
  200. package/dist/telemetry/traces.js +47 -8
  201. package/dist/telemetry/traces.js.map +1 -1
  202. package/dist/tts/tts.cjs +64 -2
  203. package/dist/tts/tts.cjs.map +1 -1
  204. package/dist/tts/tts.d.cts +34 -0
  205. package/dist/tts/tts.d.ts +34 -0
  206. package/dist/tts/tts.d.ts.map +1 -1
  207. package/dist/tts/tts.js +64 -2
  208. package/dist/tts/tts.js.map +1 -1
  209. package/dist/version.cjs +1 -1
  210. package/dist/version.js +1 -1
  211. package/dist/voice/agent.cjs +25 -4
  212. package/dist/voice/agent.cjs.map +1 -1
  213. package/dist/voice/agent.d.cts +10 -2
  214. package/dist/voice/agent.d.ts +10 -2
  215. package/dist/voice/agent.d.ts.map +1 -1
  216. package/dist/voice/agent.js +25 -4
  217. package/dist/voice/agent.js.map +1 -1
  218. package/dist/voice/agent_activity.cjs +261 -36
  219. package/dist/voice/agent_activity.cjs.map +1 -1
  220. package/dist/voice/agent_activity.d.cts +20 -6
  221. package/dist/voice/agent_activity.d.ts +20 -6
  222. package/dist/voice/agent_activity.d.ts.map +1 -1
  223. package/dist/voice/agent_activity.js +262 -37
  224. package/dist/voice/agent_activity.js.map +1 -1
  225. package/dist/voice/agent_session.cjs +105 -48
  226. package/dist/voice/agent_session.cjs.map +1 -1
  227. package/dist/voice/agent_session.d.cts +90 -20
  228. package/dist/voice/agent_session.d.ts +90 -20
  229. package/dist/voice/agent_session.d.ts.map +1 -1
  230. package/dist/voice/agent_session.js +105 -46
  231. package/dist/voice/agent_session.js.map +1 -1
  232. package/dist/voice/audio_recognition.cjs +287 -6
  233. package/dist/voice/audio_recognition.cjs.map +1 -1
  234. package/dist/voice/audio_recognition.d.cts +42 -3
  235. package/dist/voice/audio_recognition.d.ts +42 -3
  236. package/dist/voice/audio_recognition.d.ts.map +1 -1
  237. package/dist/voice/audio_recognition.js +289 -7
  238. package/dist/voice/audio_recognition.js.map +1 -1
  239. package/dist/voice/client_events.cjs +554 -0
  240. package/dist/voice/client_events.cjs.map +1 -0
  241. package/dist/voice/client_events.d.cts +195 -0
  242. package/dist/voice/client_events.d.ts +195 -0
  243. package/dist/voice/client_events.d.ts.map +1 -0
  244. package/dist/voice/client_events.js +548 -0
  245. package/dist/voice/client_events.js.map +1 -0
  246. package/dist/voice/events.cjs +1 -0
  247. package/dist/voice/events.cjs.map +1 -1
  248. package/dist/voice/events.d.cts +8 -5
  249. package/dist/voice/events.d.ts +8 -5
  250. package/dist/voice/events.d.ts.map +1 -1
  251. package/dist/voice/events.js +1 -0
  252. package/dist/voice/events.js.map +1 -1
  253. package/dist/voice/generation.cjs +43 -8
  254. package/dist/voice/generation.cjs.map +1 -1
  255. package/dist/voice/generation.d.cts +3 -3
  256. package/dist/voice/generation.d.ts +3 -3
  257. package/dist/voice/generation.d.ts.map +1 -1
  258. package/dist/voice/generation.js +43 -8
  259. package/dist/voice/generation.js.map +1 -1
  260. package/dist/voice/index.cjs +1 -0
  261. package/dist/voice/index.cjs.map +1 -1
  262. package/dist/voice/index.d.cts +1 -0
  263. package/dist/voice/index.d.ts +1 -0
  264. package/dist/voice/index.d.ts.map +1 -1
  265. package/dist/voice/index.js +1 -0
  266. package/dist/voice/index.js.map +1 -1
  267. package/dist/voice/report.cjs +20 -8
  268. package/dist/voice/report.cjs.map +1 -1
  269. package/dist/voice/report.d.cts +5 -0
  270. package/dist/voice/report.d.ts +5 -0
  271. package/dist/voice/report.d.ts.map +1 -1
  272. package/dist/voice/report.js +20 -8
  273. package/dist/voice/report.js.map +1 -1
  274. package/dist/voice/report.test.cjs +106 -0
  275. package/dist/voice/report.test.cjs.map +1 -0
  276. package/dist/voice/report.test.js +105 -0
  277. package/dist/voice/report.test.js.map +1 -0
  278. package/dist/voice/room_io/room_io.cjs +5 -39
  279. package/dist/voice/room_io/room_io.cjs.map +1 -1
  280. package/dist/voice/room_io/room_io.d.cts +4 -9
  281. package/dist/voice/room_io/room_io.d.ts +4 -9
  282. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  283. package/dist/voice/room_io/room_io.js +5 -40
  284. package/dist/voice/room_io/room_io.js.map +1 -1
  285. package/dist/voice/turn_config/endpointing.cjs +33 -0
  286. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  287. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  288. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  289. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  290. package/dist/voice/turn_config/endpointing.js +9 -0
  291. package/dist/voice/turn_config/endpointing.js.map +1 -0
  292. package/dist/voice/turn_config/interruption.cjs +37 -0
  293. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  294. package/dist/voice/turn_config/interruption.d.cts +53 -0
  295. package/dist/voice/turn_config/interruption.d.ts +53 -0
  296. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  297. package/dist/voice/turn_config/interruption.js +13 -0
  298. package/dist/voice/turn_config/interruption.js.map +1 -0
  299. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  300. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  301. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  302. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  303. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  304. package/dist/voice/turn_config/turn_handling.js +11 -0
  305. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  306. package/dist/voice/turn_config/utils.cjs +97 -0
  307. package/dist/voice/turn_config/utils.cjs.map +1 -0
  308. package/dist/voice/turn_config/utils.d.cts +25 -0
  309. package/dist/voice/turn_config/utils.d.ts +25 -0
  310. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  311. package/dist/voice/turn_config/utils.js +73 -0
  312. package/dist/voice/turn_config/utils.js.map +1 -0
  313. package/dist/voice/turn_config/utils.test.cjs +86 -0
  314. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  315. package/dist/voice/turn_config/utils.test.js +85 -0
  316. package/dist/voice/turn_config/utils.test.js.map +1 -0
  317. package/dist/voice/wire_format.cjs +798 -0
  318. package/dist/voice/wire_format.cjs.map +1 -0
  319. package/dist/voice/wire_format.d.cts +5503 -0
  320. package/dist/voice/wire_format.d.ts +5503 -0
  321. package/dist/voice/wire_format.d.ts.map +1 -0
  322. package/dist/voice/wire_format.js +728 -0
  323. package/dist/voice/wire_format.js.map +1 -0
  324. package/package.json +2 -1
  325. package/src/constants.ts +13 -0
  326. package/src/inference/interruption/defaults.ts +51 -0
  327. package/src/inference/interruption/errors.ts +25 -0
  328. package/src/inference/interruption/http_transport.ts +187 -0
  329. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  330. package/src/inference/interruption/interruption_detector.ts +188 -0
  331. package/src/inference/interruption/interruption_stream.ts +467 -0
  332. package/src/inference/interruption/types.ts +84 -0
  333. package/src/inference/interruption/utils.test.ts +132 -0
  334. package/src/inference/interruption/utils.ts +137 -0
  335. package/src/inference/interruption/ws_transport.ts +402 -0
  336. package/src/inference/llm.ts +9 -12
  337. package/src/inference/stt.ts +10 -3
  338. package/src/inference/tts.ts +10 -3
  339. package/src/inference/utils.ts +29 -1
  340. package/src/llm/chat_context.ts +40 -2
  341. package/src/llm/index.ts +1 -0
  342. package/src/llm/llm.ts +16 -0
  343. package/src/llm/realtime.ts +4 -0
  344. package/src/metrics/base.ts +48 -1
  345. package/src/metrics/index.ts +11 -0
  346. package/src/metrics/model_usage.test.ts +545 -0
  347. package/src/metrics/model_usage.ts +262 -0
  348. package/src/metrics/usage_collector.ts +11 -0
  349. package/src/metrics/utils.ts +11 -0
  350. package/src/stream/multi_input_stream.test.ts +6 -1
  351. package/src/stream/stream_channel.ts +34 -2
  352. package/src/stt/stt.ts +38 -0
  353. package/src/telemetry/otel_http_exporter.ts +28 -5
  354. package/src/telemetry/trace_types.ts +11 -8
  355. package/src/telemetry/traces.ts +111 -54
  356. package/src/tts/tts.ts +69 -1
  357. package/src/voice/agent.ts +30 -3
  358. package/src/voice/agent_activity.ts +327 -28
  359. package/src/voice/agent_session.ts +207 -59
  360. package/src/voice/audio_recognition.ts +385 -9
  361. package/src/voice/client_events.ts +838 -0
  362. package/src/voice/events.ts +14 -4
  363. package/src/voice/generation.ts +52 -9
  364. package/src/voice/index.ts +1 -0
  365. package/src/voice/report.test.ts +117 -0
  366. package/src/voice/report.ts +29 -6
  367. package/src/voice/room_io/room_io.ts +7 -61
  368. package/src/voice/turn_config/endpointing.ts +33 -0
  369. package/src/voice/turn_config/interruption.ts +56 -0
  370. package/src/voice/turn_config/turn_handling.ts +45 -0
  371. package/src/voice/turn_config/utils.test.ts +100 -0
  372. package/src/voice/turn_config/utils.ts +103 -0
  373. package/src/voice/wire_format.ts +827 -0
@@ -28,6 +28,7 @@ var import_api = require("@opentelemetry/api");
28
28
  var import_heap_js = require("heap-js");
29
29
  var import_node_async_hooks = require("node:async_hooks");
30
30
  var import_web = require("node:stream/web");
31
+ var import_interruption_detector = require("../inference/interruption/interruption_detector.cjs");
31
32
  var import_chat_context = require("../llm/chat_context.cjs");
32
33
  var import_llm = require("../llm/index.cjs");
33
34
  var import_tool_context = require("../llm/tool_context.cjs");
@@ -72,16 +73,34 @@ class AgentActivity {
72
73
  // default to null as None, which maps to the default provider tool choice value
73
74
  toolChoice = null;
74
75
  _preemptiveGeneration;
75
- /** @internal */
76
- _mainTask;
77
- _onEnterTask;
78
- _onExitTask;
79
- _userTurnCompletedTask;
76
+ interruptionDetector;
77
+ isInterruptionDetectionEnabled;
78
+ isInterruptionByAudioActivityEnabled;
79
+ isDefaultInterruptionByAudioActivityEnabled;
80
80
  onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
81
81
  onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
82
82
  onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
83
83
  onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
84
84
  onModelError = (ev) => this.onError(ev);
85
+ onInterruptionOverlappingSpeech = (ev) => {
86
+ this.agentSession.emit(import_events.AgentSessionEventTypes.UserOverlappingSpeech, ev);
87
+ };
88
+ onInterruptionMetricsCollected = (ev) => {
89
+ this.agentSession.emit(
90
+ import_events.AgentSessionEventTypes.MetricsCollected,
91
+ (0, import_events.createMetricsCollectedEvent)({ metrics: ev })
92
+ );
93
+ };
94
+ onInterruptionError = (ev) => {
95
+ const errorEvent = (0, import_events.createErrorEvent)(ev, this.interruptionDetector);
96
+ this.agentSession.emit(import_events.AgentSessionEventTypes.Error, errorEvent);
97
+ this.agentSession._onError(ev);
98
+ };
99
+ /** @internal */
100
+ _mainTask;
101
+ _onEnterTask;
102
+ _onExitTask;
103
+ _userTurnCompletedTask;
85
104
  constructor(agent, agentSession) {
86
105
  this.agent = agent;
87
106
  this.agentSession = agentSession;
@@ -140,6 +159,10 @@ class AgentActivity {
140
159
  "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
141
160
  );
142
161
  }
162
+ this.interruptionDetector = this.resolveInterruptionDetector();
163
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
164
+ this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
165
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
143
166
  }
144
167
  async start() {
145
168
  const unlock = await this.lock.lock();
@@ -232,8 +255,9 @@ class AgentActivity {
232
255
  vad: this.vad,
233
256
  turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
234
257
  turnDetectionMode: this.turnDetectionMode,
235
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
236
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
258
+ interruptionDetection: this.interruptionDetector,
259
+ minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
260
+ maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
237
261
  rootSpanContext: this.agentSession.rootSpanContext,
238
262
  sttModel: (_a = this.stt) == null ? void 0 : _a.label,
239
263
  sttProvider: this.getSttProvider(),
@@ -295,7 +319,8 @@ class AgentActivity {
295
319
  return this.realtimeSession;
296
320
  }
297
321
  get allowInterruptions() {
298
- return this.agentSession.options.allowInterruptions;
322
+ var _a;
323
+ return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
299
324
  }
300
325
  get useTtsAlignedTranscript() {
301
326
  return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
@@ -306,6 +331,11 @@ class AgentActivity {
306
331
  get toolCtx() {
307
332
  return this.agent.toolCtx;
308
333
  }
334
+ /** @internal */
335
+ get inputStartedAt() {
336
+ var _a;
337
+ return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
338
+ }
309
339
  async updateChatCtx(chatCtx) {
310
340
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
311
341
  this.agent._chatCtx = chatCtx;
@@ -330,19 +360,40 @@ class AgentActivity {
330
360
  await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
331
361
  }
332
362
  }
333
- updateOptions({ toolChoice }) {
363
+ updateOptions({
364
+ toolChoice,
365
+ turnDetection
366
+ }) {
334
367
  if (toolChoice !== void 0) {
335
368
  this.toolChoice = toolChoice;
336
369
  }
337
370
  if (this.realtimeSession) {
338
371
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
339
372
  }
373
+ if (turnDetection !== void 0) {
374
+ this.turnDetectionMode = turnDetection;
375
+ this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
376
+ if (this.agentSession.agentState !== "speaking") {
377
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
378
+ }
379
+ }
380
+ if (this.audioRecognition) {
381
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
382
+ }
340
383
  }
341
384
  attachAudioInput(audioStream) {
342
385
  void this.audioStream.close();
343
386
  this.audioStream = new import_multi_input_stream.MultiInputStream();
387
+ const aecWarmupAudioFilter = new import_web.TransformStream({
388
+ transform: (frame, controller) => {
389
+ const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
390
+ if (!shouldDiscardForAecWarmup) {
391
+ controller.enqueue(frame);
392
+ }
393
+ }
394
+ });
344
395
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
345
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
396
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
346
397
  if (this.realtimeSession) {
347
398
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
348
399
  }
@@ -448,6 +499,13 @@ class AgentActivity {
448
499
  this.logger.info("onInputSpeechStarted");
449
500
  if (!this.vad) {
450
501
  this.agentSession._updateUserState("speaking");
502
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
503
+ this.audioRecognition.onStartOfOverlapSpeech(
504
+ 0,
505
+ Date.now(),
506
+ this.agentSession._userSpeakingSpan
507
+ );
508
+ }
451
509
  }
452
510
  try {
453
511
  this.interrupt();
@@ -461,6 +519,9 @@ class AgentActivity {
461
519
  onInputSpeechStopped(ev) {
462
520
  this.logger.info(ev, "onInputSpeechStopped");
463
521
  if (!this.vad) {
522
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
523
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
524
+ }
464
525
  this.agentSession._updateUserState("listening");
465
526
  }
466
527
  if (ev.userTranscriptionEnabled) {
@@ -522,48 +583,75 @@ class AgentActivity {
522
583
  onStartOfSpeech(ev) {
523
584
  let speechStartTime = Date.now();
524
585
  if (ev) {
525
- speechStartTime = speechStartTime - ev.speechDuration;
586
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
526
587
  }
527
588
  this.agentSession._updateUserState("speaking", speechStartTime);
589
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
590
+ this.audioRecognition.onStartOfOverlapSpeech(
591
+ ev.speechDuration,
592
+ speechStartTime,
593
+ this.agentSession._userSpeakingSpan
594
+ );
595
+ }
528
596
  }
529
597
  onEndOfSpeech(ev) {
530
598
  let speechEndTime = Date.now();
531
599
  if (ev) {
532
- speechEndTime = speechEndTime - ev.silenceDuration;
600
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
601
+ }
602
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
603
+ this.audioRecognition.onEndOfOverlapSpeech(
604
+ speechEndTime,
605
+ this.agentSession._userSpeakingSpan
606
+ );
533
607
  }
534
608
  this.agentSession._updateUserState("listening", speechEndTime);
535
609
  }
536
610
  onVADInferenceDone(ev) {
611
+ var _a;
537
612
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
538
613
  return;
539
614
  }
540
- if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
615
+ if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
541
616
  this.interruptByAudioActivity();
542
617
  }
543
618
  }
544
619
  interruptByAudioActivity() {
545
- var _a, _b;
620
+ var _a, _b, _c, _d;
621
+ if (!this.isInterruptionByAudioActivityEnabled) {
622
+ return;
623
+ }
624
+ if (this.agentSession._aecWarmupRemaining > 0) {
625
+ return;
626
+ }
546
627
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
547
628
  return;
548
629
  }
549
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
630
+ if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
550
631
  const text = this.audioRecognition.currentTranscript;
551
632
  const normalizedText = text ?? "";
552
633
  const wordCount = (0, import_word.splitWords)(normalizedText, true).length;
553
- if (wordCount < this.agentSession.options.minInterruptionWords) {
634
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
554
635
  return;
555
636
  }
556
637
  }
557
- (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
638
+ (_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
558
639
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
559
640
  this.logger.info(
560
641
  { "speech id": this._currentSpeech.id },
561
642
  "speech interrupted by audio activity"
562
643
  );
563
- (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
644
+ (_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
564
645
  this._currentSpeech.interrupt();
565
646
  }
566
647
  }
648
+ onInterruption(ev) {
649
+ this.restoreInterruptionByAudioActivity();
650
+ this.interruptByAudioActivity();
651
+ if (this.audioRecognition) {
652
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
653
+ }
654
+ }
567
655
  onInterimTranscript(ev) {
568
656
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
569
657
  return;
@@ -612,7 +700,8 @@ class AgentActivity {
612
700
  );
613
701
  const userMessage = import_chat_context.ChatMessage.create({
614
702
  role: "user",
615
- content: info.newTranscript
703
+ content: info.newTranscript,
704
+ transcriptConfidence: info.transcriptConfidence
616
705
  });
617
706
  const chatCtx = this.agent.chatCtx.copy();
618
707
  const speechHandle = this.generateReply({
@@ -670,6 +759,7 @@ class AgentActivity {
670
759
  return task;
671
760
  }
672
761
  async onEndOfTurn(info) {
762
+ var _a, _b;
673
763
  if (this.schedulingPaused) {
674
764
  this.cancelPreemptiveGeneration();
675
765
  this.logger.warn(
@@ -678,14 +768,14 @@ class AgentActivity {
678
768
  );
679
769
  return true;
680
770
  }
681
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
771
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
682
772
  const wordCount = (0, import_word.splitWords)(info.newTranscript, true).length;
683
- if (wordCount < this.agentSession.options.minInterruptionWords) {
773
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
684
774
  this.cancelPreemptiveGeneration();
685
775
  this.logger.info(
686
776
  {
687
777
  wordCount,
688
- minInterruptionWords: this.agentSession.options.minInterruptionWords
778
+ minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
689
779
  },
690
780
  "skipping user input, word count below minimum interruption threshold"
691
781
  );
@@ -904,7 +994,8 @@ ${instructions}`;
904
994
  }
905
995
  let userMessage = import_chat_context.ChatMessage.create({
906
996
  role: "user",
907
- content: info.newTranscript
997
+ content: info.newTranscript,
998
+ transcriptConfidence: info.transcriptConfidence
908
999
  });
909
1000
  const chatCtx = this.agent.chatCtx.copy();
910
1001
  const startTime = Date.now();
@@ -922,11 +1013,32 @@ ${instructions}`;
922
1013
  } else if (this.llm === void 0) {
923
1014
  return;
924
1015
  }
1016
+ const userMetricsReport = {};
1017
+ if (info.startedSpeakingAt !== void 0) {
1018
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
1019
+ }
1020
+ if (info.stoppedSpeakingAt !== void 0) {
1021
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
1022
+ }
1023
+ if (info.transcriptionDelay !== void 0) {
1024
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
1025
+ }
1026
+ if (info.endOfUtteranceDelay !== void 0) {
1027
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
1028
+ }
1029
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
1030
+ if (userMessage) {
1031
+ userMessage.metrics = userMetricsReport;
1032
+ }
925
1033
  let speechHandle;
926
1034
  if (this._preemptiveGeneration !== void 0) {
927
1035
  const preemptive = this._preemptiveGeneration;
928
1036
  if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
929
1037
  speechHandle = preemptive.speechHandle;
1038
+ if (preemptive.userMessage && userMessage) {
1039
+ preemptive.userMessage.metrics = userMetricsReport;
1040
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1041
+ }
930
1042
  this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
931
1043
  this.logger.debug(
932
1044
  {
@@ -960,6 +1072,7 @@ ${instructions}`;
960
1072
  );
961
1073
  }
962
1074
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
1075
+ var _a, _b;
963
1076
  speechHandle._agentTurnContext = import_api.context.active();
964
1077
  import_agent.speechHandleStorage.enterWith(speechHandle);
965
1078
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
@@ -992,11 +1105,18 @@ ${instructions}`;
992
1105
  textOut = _textOut;
993
1106
  tasks.push(textForwardTask);
994
1107
  }
1108
+ let replyStartedSpeakingAt;
1109
+ let replyTtsGenData = null;
995
1110
  const onFirstFrame = (startedSpeakingAt) => {
1111
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
996
1112
  this.agentSession._updateAgentState("speaking", {
997
1113
  startTime: startedSpeakingAt,
998
1114
  otelContext: speechHandle._agentTurnContext
999
1115
  });
1116
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1117
+ this.audioRecognition.onStartOfAgentSpeech();
1118
+ this.isInterruptionByAudioActivityEnabled = false;
1119
+ }
1000
1120
  };
1001
1121
  if (!audioOutput) {
1002
1122
  if (textOut) {
@@ -1009,9 +1129,12 @@ ${instructions}`;
1009
1129
  (...args) => this.agent.ttsNode(...args),
1010
1130
  audioSource,
1011
1131
  modelSettings,
1012
- replyAbortController
1132
+ replyAbortController,
1133
+ (_a = this.tts) == null ? void 0 : _a.model,
1134
+ (_b = this.tts) == null ? void 0 : _b.provider
1013
1135
  );
1014
1136
  tasks.push(ttsTask);
1137
+ replyTtsGenData = ttsGenData;
1015
1138
  const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
1016
1139
  ttsGenData.audioStream,
1017
1140
  audioOutput,
@@ -1043,16 +1166,30 @@ ${instructions}`;
1043
1166
  }
1044
1167
  }
1045
1168
  if (addToChatCtx) {
1169
+ const replyStoppedSpeakingAt = Date.now();
1170
+ const replyAssistantMetrics = {};
1171
+ if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
1172
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1173
+ }
1174
+ if (replyStartedSpeakingAt !== void 0) {
1175
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
1176
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
1177
+ }
1046
1178
  const message = import_chat_context.ChatMessage.create({
1047
1179
  role: "assistant",
1048
1180
  content: (textOut == null ? void 0 : textOut.text) || "",
1049
- interrupted: speechHandle.interrupted
1181
+ interrupted: speechHandle.interrupted,
1182
+ metrics: replyAssistantMetrics
1050
1183
  });
1051
1184
  this.agent._chatCtx.insert(message);
1052
1185
  this.agentSession._conversationItemAdded(message);
1053
1186
  }
1054
1187
  if (this.agentSession.agentState === "speaking") {
1055
1188
  this.agentSession._updateAgentState("listening");
1189
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1190
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1191
+ }
1192
+ this.restoreInterruptionByAudioActivity();
1056
1193
  }
1057
1194
  }
1058
1195
  _pipelineReplyTaskImpl = async ({
@@ -1064,9 +1201,10 @@ ${instructions}`;
1064
1201
  instructions,
1065
1202
  newMessage,
1066
1203
  toolsMessages,
1067
- span
1204
+ span,
1205
+ _previousUserMetrics
1068
1206
  }) => {
1069
- var _a, _b;
1207
+ var _a, _b, _c, _d, _e, _f;
1070
1208
  speechHandle._agentTurnContext = import_api.context.active();
1071
1209
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1072
1210
  if (instructions) {
@@ -1104,7 +1242,9 @@ ${instructions}`;
1104
1242
  chatCtx,
1105
1243
  toolCtx,
1106
1244
  modelSettings,
1107
- replyAbortController
1245
+ replyAbortController,
1246
+ (_b = this.llm) == null ? void 0 : _b.model,
1247
+ (_c = this.llm) == null ? void 0 : _c.provider
1108
1248
  );
1109
1249
  tasks.push(llmTask);
1110
1250
  let ttsTask = null;
@@ -1117,16 +1257,20 @@ ${instructions}`;
1117
1257
  (...args) => this.agent.ttsNode(...args),
1118
1258
  ttsTextInput,
1119
1259
  modelSettings,
1120
- replyAbortController
1260
+ replyAbortController,
1261
+ (_d = this.tts) == null ? void 0 : _d.model,
1262
+ (_e = this.tts) == null ? void 0 : _e.provider
1121
1263
  );
1122
1264
  tasks.push(ttsTask);
1123
1265
  } else {
1124
1266
  llmOutput = llmGenData.textStream;
1125
1267
  }
1126
1268
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1269
+ let userMetrics = _previousUserMetrics;
1127
1270
  if (newMessage && speechHandle.scheduled) {
1128
1271
  this.agent._chatCtx.insert(newMessage);
1129
1272
  this.agentSession._conversationItemAdded(newMessage);
1273
+ userMetrics = newMessage.metrics;
1130
1274
  }
1131
1275
  if (speechHandle.interrupted) {
1132
1276
  replyAbortController.abort();
@@ -1138,7 +1282,7 @@ ${instructions}`;
1138
1282
  speechHandle._clearAuthorization();
1139
1283
  const replyStartedAt = Date.now();
1140
1284
  let transcriptionInput = llmOutput;
1141
- if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
1285
+ if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
1142
1286
  const timedTextsStream = await Promise.race([
1143
1287
  ttsGenData.timedTextsFut.await,
1144
1288
  (ttsTask == null ? void 0 : ttsTask.result.catch(
@@ -1161,11 +1305,17 @@ ${instructions}`;
1161
1305
  tasks.push(textForwardTask);
1162
1306
  textOut = _textOut;
1163
1307
  }
1308
+ let agentStartedSpeakingAt;
1164
1309
  const onFirstFrame = (startedSpeakingAt) => {
1310
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1165
1311
  this.agentSession._updateAgentState("speaking", {
1166
1312
  startTime: startedSpeakingAt,
1167
1313
  otelContext: speechHandle._agentTurnContext
1168
1314
  });
1315
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1316
+ this.audioRecognition.onStartOfAgentSpeech();
1317
+ this.isInterruptionByAudioActivityEnabled = false;
1318
+ }
1169
1319
  };
1170
1320
  let audioOut = null;
1171
1321
  if (audioOutput) {
@@ -1208,6 +1358,25 @@ ${instructions}`;
1208
1358
  if (audioOutput) {
1209
1359
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1210
1360
  }
1361
+ const agentStoppedSpeakingAt = Date.now();
1362
+ const assistantMetrics = {};
1363
+ if (llmGenData.ttft !== void 0) {
1364
+ assistantMetrics.llmNodeTtft = llmGenData.ttft;
1365
+ }
1366
+ if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
1367
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
1368
+ }
1369
+ if (agentStartedSpeakingAt !== void 0) {
1370
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
1371
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
1372
+ if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
1373
+ const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
1374
+ assistantMetrics.e2eLatency = e2eLatency;
1375
+ span.setAttribute(import_telemetry.traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1376
+ }
1377
+ }
1378
+ span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1379
+ let hasSpeechMessage = false;
1211
1380
  if (toolsMessages) {
1212
1381
  for (const msg of toolsMessages) {
1213
1382
  msg.createdAt = replyStartedAt;
@@ -1248,20 +1417,27 @@ ${instructions}`;
1248
1417
  }
1249
1418
  }
1250
1419
  if (forwardedText) {
1420
+ hasSpeechMessage = true;
1251
1421
  const message = import_chat_context.ChatMessage.create({
1252
1422
  role: "assistant",
1253
1423
  content: forwardedText,
1254
1424
  id: llmGenData.id,
1255
1425
  interrupted: true,
1256
- createdAt: replyStartedAt
1426
+ createdAt: replyStartedAt,
1427
+ metrics: assistantMetrics
1257
1428
  });
1258
1429
  chatCtx.insert(message);
1259
1430
  this.agent._chatCtx.insert(message);
1260
1431
  speechHandle._itemAdded([message]);
1261
1432
  this.agentSession._conversationItemAdded(message);
1433
+ span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1262
1434
  }
1263
1435
  if (this.agentSession.agentState === "speaking") {
1264
1436
  this.agentSession._updateAgentState("listening");
1437
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1438
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1439
+ this.restoreInterruptionByAudioActivity();
1440
+ }
1265
1441
  }
1266
1442
  this.logger.info(
1267
1443
  { speech_id: speechHandle.id, message: forwardedText },
@@ -1272,17 +1448,20 @@ ${instructions}`;
1272
1448
  return;
1273
1449
  }
1274
1450
  if (textOut && textOut.text) {
1451
+ hasSpeechMessage = true;
1275
1452
  const message = import_chat_context.ChatMessage.create({
1276
1453
  role: "assistant",
1277
1454
  id: llmGenData.id,
1278
1455
  interrupted: false,
1279
1456
  createdAt: replyStartedAt,
1280
- content: textOut.text
1457
+ content: textOut.text,
1458
+ metrics: assistantMetrics
1281
1459
  });
1282
1460
  chatCtx.insert(message);
1283
1461
  this.agent._chatCtx.insert(message);
1284
1462
  speechHandle._itemAdded([message]);
1285
1463
  this.agentSession._conversationItemAdded(message);
1464
+ span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1286
1465
  this.logger.info(
1287
1466
  { speech_id: speechHandle.id, message: textOut.text },
1288
1467
  "playout completed without interruption"
@@ -1292,6 +1471,12 @@ ${instructions}`;
1292
1471
  this.agentSession._updateAgentState("thinking");
1293
1472
  } else if (this.agentSession.agentState === "speaking") {
1294
1473
  this.agentSession._updateAgentState("listening");
1474
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1475
+ {
1476
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1477
+ this.restoreInterruptionByAudioActivity();
1478
+ }
1479
+ }
1295
1480
  }
1296
1481
  speechHandle._markGenerationDone();
1297
1482
  await executeToolsTask.result;
@@ -1331,7 +1516,8 @@ ${instructions}`;
1331
1516
  replyAbortController,
1332
1517
  instructions,
1333
1518
  void 0,
1334
- toolMessages
1519
+ toolMessages,
1520
+ hasSpeechMessage ? void 0 : userMetrics
1335
1521
  ),
1336
1522
  ownedSpeechHandle: speechHandle,
1337
1523
  name: "AgentActivity.pipelineReply"
@@ -1351,7 +1537,7 @@ ${instructions}`;
1351
1537
  }
1352
1538
  }
1353
1539
  };
1354
- pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
1540
+ pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => import_telemetry.tracer.startActiveSpan(
1355
1541
  async (span) => this._pipelineReplyTaskImpl({
1356
1542
  speechHandle,
1357
1543
  chatCtx,
@@ -1361,7 +1547,8 @@ ${instructions}`;
1361
1547
  instructions,
1362
1548
  newMessage,
1363
1549
  toolsMessages,
1364
- span
1550
+ span,
1551
+ _previousUserMetrics
1365
1552
  }),
1366
1553
  {
1367
1554
  name: "agent_turn",
@@ -1427,6 +1614,7 @@ ${instructions}`;
1427
1614
  });
1428
1615
  };
1429
1616
  const readMessages = async (abortController, outputs) => {
1617
+ var _a2, _b;
1430
1618
  replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
1431
1619
  once: true
1432
1620
  });
@@ -1473,7 +1661,9 @@ ${instructions}`;
1473
1661
  (...args) => this.agent.ttsNode(...args),
1474
1662
  ttsTextInput,
1475
1663
  modelSettings,
1476
- abortController
1664
+ abortController,
1665
+ (_a2 = this.tts) == null ? void 0 : _a2.model,
1666
+ (_b = this.tts) == null ? void 0 : _b.provider
1477
1667
  );
1478
1668
  tasks.push(ttsTask);
1479
1669
  realtimeAudioResult = ttsGenData.audioStream;
@@ -1865,11 +2055,46 @@ ${instructions}`;
1865
2055
  if (this._mainTask) {
1866
2056
  await this._mainTask.cancelAndWait();
1867
2057
  }
2058
+ if (this.interruptionDetector) {
2059
+ this.interruptionDetector.off(
2060
+ "user_overlapping_speech",
2061
+ this.onInterruptionOverlappingSpeech
2062
+ );
2063
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2064
+ this.interruptionDetector.off("error", this.onInterruptionError);
2065
+ }
1868
2066
  this.agent._agentActivity = void 0;
1869
2067
  } finally {
1870
2068
  unlock();
1871
2069
  }
1872
2070
  }
2071
+ resolveInterruptionDetector() {
2072
+ const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2073
+ if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof import_llm.RealtimeModel))) {
2074
+ if (interruptionDetection === "adaptive") {
2075
+ this.logger.warn(
2076
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
2077
+ );
2078
+ return void 0;
2079
+ }
2080
+ }
2081
+ if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
2082
+ return void 0;
2083
+ }
2084
+ try {
2085
+ const detector = new import_interruption_detector.AdaptiveInterruptionDetector();
2086
+ detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
2087
+ detector.on("metrics_collected", this.onInterruptionMetricsCollected);
2088
+ detector.on("error", this.onInterruptionError);
2089
+ return detector;
2090
+ } catch (error) {
2091
+ this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
2092
+ }
2093
+ return void 0;
2094
+ }
2095
+ restoreInterruptionByAudioActivity() {
2096
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2097
+ }
1873
2098
  async _closeSessionResources() {
1874
2099
  var _a, _b, _c;
1875
2100
  if (this.llm instanceof import_llm.LLM) {