@livekit/agents 1.0.47 → 1.1.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444) hide show
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/constants.cjs +27 -0
  23. package/dist/constants.cjs.map +1 -1
  24. package/dist/constants.d.cts +9 -0
  25. package/dist/constants.d.ts +9 -0
  26. package/dist/constants.d.ts.map +1 -1
  27. package/dist/constants.js +18 -0
  28. package/dist/constants.js.map +1 -1
  29. package/dist/index.cjs +3 -0
  30. package/dist/index.cjs.map +1 -1
  31. package/dist/index.d.cts +2 -1
  32. package/dist/index.d.ts +2 -1
  33. package/dist/index.d.ts.map +1 -1
  34. package/dist/index.js +2 -0
  35. package/dist/index.js.map +1 -1
  36. package/dist/inference/api_protos.d.cts +12 -12
  37. package/dist/inference/api_protos.d.ts +12 -12
  38. package/dist/inference/interruption/defaults.cjs +81 -0
  39. package/dist/inference/interruption/defaults.cjs.map +1 -0
  40. package/dist/inference/interruption/defaults.d.cts +19 -0
  41. package/dist/inference/interruption/defaults.d.ts +19 -0
  42. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  43. package/dist/inference/interruption/defaults.js +46 -0
  44. package/dist/inference/interruption/defaults.js.map +1 -0
  45. package/dist/inference/interruption/errors.cjs +44 -0
  46. package/dist/inference/interruption/errors.cjs.map +1 -0
  47. package/dist/inference/interruption/errors.d.cts +12 -0
  48. package/dist/inference/interruption/errors.d.ts +12 -0
  49. package/dist/inference/interruption/errors.d.ts.map +1 -0
  50. package/dist/inference/interruption/errors.js +20 -0
  51. package/dist/inference/interruption/errors.js.map +1 -0
  52. package/dist/inference/interruption/http_transport.cjs +147 -0
  53. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  54. package/dist/inference/interruption/http_transport.d.cts +63 -0
  55. package/dist/inference/interruption/http_transport.d.ts +63 -0
  56. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  57. package/dist/inference/interruption/http_transport.js +121 -0
  58. package/dist/inference/interruption/http_transport.js.map +1 -0
  59. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  60. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  61. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  62. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  63. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  64. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  65. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  66. package/dist/inference/interruption/interruption_detector.cjs +181 -0
  67. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  68. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  69. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  70. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  71. package/dist/inference/interruption/interruption_detector.js +147 -0
  72. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  73. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  74. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  75. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  76. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  77. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  78. package/dist/inference/interruption/interruption_stream.js +344 -0
  79. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  80. package/dist/inference/interruption/types.cjs +17 -0
  81. package/dist/inference/interruption/types.cjs.map +1 -0
  82. package/dist/inference/interruption/types.d.cts +66 -0
  83. package/dist/inference/interruption/types.d.ts +66 -0
  84. package/dist/inference/interruption/types.d.ts.map +1 -0
  85. package/dist/inference/interruption/types.js +1 -0
  86. package/dist/inference/interruption/types.js.map +1 -0
  87. package/dist/inference/interruption/utils.cjs +130 -0
  88. package/dist/inference/interruption/utils.cjs.map +1 -0
  89. package/dist/inference/interruption/utils.d.cts +41 -0
  90. package/dist/inference/interruption/utils.d.ts +41 -0
  91. package/dist/inference/interruption/utils.d.ts.map +1 -0
  92. package/dist/inference/interruption/utils.js +105 -0
  93. package/dist/inference/interruption/utils.js.map +1 -0
  94. package/dist/inference/interruption/utils.test.cjs +105 -0
  95. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  96. package/dist/inference/interruption/utils.test.js +104 -0
  97. package/dist/inference/interruption/utils.test.js.map +1 -0
  98. package/dist/inference/interruption/ws_transport.cjs +329 -0
  99. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  100. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  101. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  102. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  103. package/dist/inference/interruption/ws_transport.js +295 -0
  104. package/dist/inference/interruption/ws_transport.js.map +1 -0
  105. package/dist/inference/llm.cjs +14 -10
  106. package/dist/inference/llm.cjs.map +1 -1
  107. package/dist/inference/llm.d.cts +2 -1
  108. package/dist/inference/llm.d.ts +2 -1
  109. package/dist/inference/llm.d.ts.map +1 -1
  110. package/dist/inference/llm.js +8 -10
  111. package/dist/inference/llm.js.map +1 -1
  112. package/dist/inference/stt.cjs +7 -2
  113. package/dist/inference/stt.cjs.map +1 -1
  114. package/dist/inference/stt.d.cts +2 -0
  115. package/dist/inference/stt.d.ts +2 -0
  116. package/dist/inference/stt.d.ts.map +1 -1
  117. package/dist/inference/stt.js +8 -3
  118. package/dist/inference/stt.js.map +1 -1
  119. package/dist/inference/tts.cjs +7 -2
  120. package/dist/inference/tts.cjs.map +1 -1
  121. package/dist/inference/tts.d.cts +2 -0
  122. package/dist/inference/tts.d.ts +2 -0
  123. package/dist/inference/tts.d.ts.map +1 -1
  124. package/dist/inference/tts.js +8 -3
  125. package/dist/inference/tts.js.map +1 -1
  126. package/dist/inference/utils.cjs +26 -7
  127. package/dist/inference/utils.cjs.map +1 -1
  128. package/dist/inference/utils.d.cts +13 -0
  129. package/dist/inference/utils.d.ts +13 -0
  130. package/dist/inference/utils.d.ts.map +1 -1
  131. package/dist/inference/utils.js +18 -2
  132. package/dist/inference/utils.js.map +1 -1
  133. package/dist/llm/chat_context.cjs +108 -2
  134. package/dist/llm/chat_context.cjs.map +1 -1
  135. package/dist/llm/chat_context.d.cts +28 -1
  136. package/dist/llm/chat_context.d.ts +28 -1
  137. package/dist/llm/chat_context.d.ts.map +1 -1
  138. package/dist/llm/chat_context.js +108 -2
  139. package/dist/llm/chat_context.js.map +1 -1
  140. package/dist/llm/chat_context.test.cjs +43 -0
  141. package/dist/llm/chat_context.test.cjs.map +1 -1
  142. package/dist/llm/chat_context.test.js +43 -0
  143. package/dist/llm/chat_context.test.js.map +1 -1
  144. package/dist/llm/index.cjs +2 -0
  145. package/dist/llm/index.cjs.map +1 -1
  146. package/dist/llm/index.d.cts +2 -2
  147. package/dist/llm/index.d.ts +2 -2
  148. package/dist/llm/index.d.ts.map +1 -1
  149. package/dist/llm/index.js +3 -1
  150. package/dist/llm/index.js.map +1 -1
  151. package/dist/llm/llm.cjs +16 -1
  152. package/dist/llm/llm.cjs.map +1 -1
  153. package/dist/llm/llm.d.cts +9 -0
  154. package/dist/llm/llm.d.ts +9 -0
  155. package/dist/llm/llm.d.ts.map +1 -1
  156. package/dist/llm/llm.js +16 -1
  157. package/dist/llm/llm.js.map +1 -1
  158. package/dist/llm/provider_format/index.d.cts +1 -1
  159. package/dist/llm/provider_format/index.d.ts +1 -1
  160. package/dist/llm/realtime.cjs +3 -0
  161. package/dist/llm/realtime.cjs.map +1 -1
  162. package/dist/llm/realtime.d.cts +1 -0
  163. package/dist/llm/realtime.d.ts +1 -0
  164. package/dist/llm/realtime.d.ts.map +1 -1
  165. package/dist/llm/realtime.js +3 -0
  166. package/dist/llm/realtime.js.map +1 -1
  167. package/dist/llm/tool_context.cjs +7 -0
  168. package/dist/llm/tool_context.cjs.map +1 -1
  169. package/dist/llm/tool_context.d.cts +10 -2
  170. package/dist/llm/tool_context.d.ts +10 -2
  171. package/dist/llm/tool_context.d.ts.map +1 -1
  172. package/dist/llm/tool_context.js +6 -0
  173. package/dist/llm/tool_context.js.map +1 -1
  174. package/dist/metrics/base.cjs.map +1 -1
  175. package/dist/metrics/base.d.cts +45 -1
  176. package/dist/metrics/base.d.ts +45 -1
  177. package/dist/metrics/base.d.ts.map +1 -1
  178. package/dist/metrics/index.cjs +5 -0
  179. package/dist/metrics/index.cjs.map +1 -1
  180. package/dist/metrics/index.d.cts +2 -1
  181. package/dist/metrics/index.d.ts +2 -1
  182. package/dist/metrics/index.d.ts.map +1 -1
  183. package/dist/metrics/index.js +6 -0
  184. package/dist/metrics/index.js.map +1 -1
  185. package/dist/metrics/model_usage.cjs +189 -0
  186. package/dist/metrics/model_usage.cjs.map +1 -0
  187. package/dist/metrics/model_usage.d.cts +92 -0
  188. package/dist/metrics/model_usage.d.ts +92 -0
  189. package/dist/metrics/model_usage.d.ts.map +1 -0
  190. package/dist/metrics/model_usage.js +164 -0
  191. package/dist/metrics/model_usage.js.map +1 -0
  192. package/dist/metrics/model_usage.test.cjs +474 -0
  193. package/dist/metrics/model_usage.test.cjs.map +1 -0
  194. package/dist/metrics/model_usage.test.js +476 -0
  195. package/dist/metrics/model_usage.test.js.map +1 -0
  196. package/dist/metrics/usage_collector.cjs +3 -0
  197. package/dist/metrics/usage_collector.cjs.map +1 -1
  198. package/dist/metrics/usage_collector.d.cts +9 -0
  199. package/dist/metrics/usage_collector.d.ts +9 -0
  200. package/dist/metrics/usage_collector.d.ts.map +1 -1
  201. package/dist/metrics/usage_collector.js +3 -0
  202. package/dist/metrics/usage_collector.js.map +1 -1
  203. package/dist/metrics/utils.cjs +9 -0
  204. package/dist/metrics/utils.cjs.map +1 -1
  205. package/dist/metrics/utils.d.ts.map +1 -1
  206. package/dist/metrics/utils.js +9 -0
  207. package/dist/metrics/utils.js.map +1 -1
  208. package/dist/stream/multi_input_stream.test.cjs +4 -0
  209. package/dist/stream/multi_input_stream.test.cjs.map +1 -1
  210. package/dist/stream/multi_input_stream.test.js +5 -1
  211. package/dist/stream/multi_input_stream.test.js.map +1 -1
  212. package/dist/stream/stream_channel.cjs +31 -0
  213. package/dist/stream/stream_channel.cjs.map +1 -1
  214. package/dist/stream/stream_channel.d.cts +4 -2
  215. package/dist/stream/stream_channel.d.ts +4 -2
  216. package/dist/stream/stream_channel.d.ts.map +1 -1
  217. package/dist/stream/stream_channel.js +31 -0
  218. package/dist/stream/stream_channel.js.map +1 -1
  219. package/dist/stt/stt.cjs +34 -2
  220. package/dist/stt/stt.cjs.map +1 -1
  221. package/dist/stt/stt.d.cts +22 -0
  222. package/dist/stt/stt.d.ts +22 -0
  223. package/dist/stt/stt.d.ts.map +1 -1
  224. package/dist/stt/stt.js +34 -2
  225. package/dist/stt/stt.js.map +1 -1
  226. package/dist/telemetry/otel_http_exporter.cjs +24 -5
  227. package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
  228. package/dist/telemetry/otel_http_exporter.d.cts +1 -0
  229. package/dist/telemetry/otel_http_exporter.d.ts +1 -0
  230. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
  231. package/dist/telemetry/otel_http_exporter.js +24 -5
  232. package/dist/telemetry/otel_http_exporter.js.map +1 -1
  233. package/dist/telemetry/trace_types.cjs +5 -5
  234. package/dist/telemetry/trace_types.cjs.map +1 -1
  235. package/dist/telemetry/trace_types.d.cts +9 -5
  236. package/dist/telemetry/trace_types.d.ts +9 -5
  237. package/dist/telemetry/trace_types.d.ts.map +1 -1
  238. package/dist/telemetry/trace_types.js +5 -5
  239. package/dist/telemetry/trace_types.js.map +1 -1
  240. package/dist/telemetry/traces.cjs +47 -8
  241. package/dist/telemetry/traces.cjs.map +1 -1
  242. package/dist/telemetry/traces.d.ts.map +1 -1
  243. package/dist/telemetry/traces.js +47 -8
  244. package/dist/telemetry/traces.js.map +1 -1
  245. package/dist/tts/tts.cjs +64 -2
  246. package/dist/tts/tts.cjs.map +1 -1
  247. package/dist/tts/tts.d.cts +34 -0
  248. package/dist/tts/tts.d.ts +34 -0
  249. package/dist/tts/tts.d.ts.map +1 -1
  250. package/dist/tts/tts.js +64 -2
  251. package/dist/tts/tts.js.map +1 -1
  252. package/dist/utils.cjs +1 -0
  253. package/dist/utils.cjs.map +1 -1
  254. package/dist/utils.d.ts.map +1 -1
  255. package/dist/utils.js +1 -0
  256. package/dist/utils.js.map +1 -1
  257. package/dist/version.cjs +1 -1
  258. package/dist/version.js +1 -1
  259. package/dist/voice/agent.cjs +34 -4
  260. package/dist/voice/agent.cjs.map +1 -1
  261. package/dist/voice/agent.d.cts +11 -2
  262. package/dist/voice/agent.d.ts +11 -2
  263. package/dist/voice/agent.d.ts.map +1 -1
  264. package/dist/voice/agent.js +34 -4
  265. package/dist/voice/agent.js.map +1 -1
  266. package/dist/voice/agent_activity.cjs +292 -44
  267. package/dist/voice/agent_activity.cjs.map +1 -1
  268. package/dist/voice/agent_activity.d.cts +27 -6
  269. package/dist/voice/agent_activity.d.ts +27 -6
  270. package/dist/voice/agent_activity.d.ts.map +1 -1
  271. package/dist/voice/agent_activity.js +293 -45
  272. package/dist/voice/agent_activity.js.map +1 -1
  273. package/dist/voice/agent_session.cjs +105 -48
  274. package/dist/voice/agent_session.cjs.map +1 -1
  275. package/dist/voice/agent_session.d.cts +90 -20
  276. package/dist/voice/agent_session.d.ts +90 -20
  277. package/dist/voice/agent_session.d.ts.map +1 -1
  278. package/dist/voice/agent_session.js +105 -46
  279. package/dist/voice/agent_session.js.map +1 -1
  280. package/dist/voice/audio_recognition.cjs +287 -6
  281. package/dist/voice/audio_recognition.cjs.map +1 -1
  282. package/dist/voice/audio_recognition.d.cts +42 -3
  283. package/dist/voice/audio_recognition.d.ts +42 -3
  284. package/dist/voice/audio_recognition.d.ts.map +1 -1
  285. package/dist/voice/audio_recognition.js +289 -7
  286. package/dist/voice/audio_recognition.js.map +1 -1
  287. package/dist/voice/client_events.cjs +554 -0
  288. package/dist/voice/client_events.cjs.map +1 -0
  289. package/dist/voice/client_events.d.cts +195 -0
  290. package/dist/voice/client_events.d.ts +195 -0
  291. package/dist/voice/client_events.d.ts.map +1 -0
  292. package/dist/voice/client_events.js +548 -0
  293. package/dist/voice/client_events.js.map +1 -0
  294. package/dist/voice/events.cjs +1 -0
  295. package/dist/voice/events.cjs.map +1 -1
  296. package/dist/voice/events.d.cts +8 -5
  297. package/dist/voice/events.d.ts +8 -5
  298. package/dist/voice/events.d.ts.map +1 -1
  299. package/dist/voice/events.js +1 -0
  300. package/dist/voice/events.js.map +1 -1
  301. package/dist/voice/generation.cjs +43 -8
  302. package/dist/voice/generation.cjs.map +1 -1
  303. package/dist/voice/generation.d.cts +3 -3
  304. package/dist/voice/generation.d.ts +3 -3
  305. package/dist/voice/generation.d.ts.map +1 -1
  306. package/dist/voice/generation.js +43 -8
  307. package/dist/voice/generation.js.map +1 -1
  308. package/dist/voice/index.cjs +1 -0
  309. package/dist/voice/index.cjs.map +1 -1
  310. package/dist/voice/index.d.cts +1 -0
  311. package/dist/voice/index.d.ts +1 -0
  312. package/dist/voice/index.d.ts.map +1 -1
  313. package/dist/voice/index.js +1 -0
  314. package/dist/voice/index.js.map +1 -1
  315. package/dist/voice/report.cjs +20 -8
  316. package/dist/voice/report.cjs.map +1 -1
  317. package/dist/voice/report.d.cts +5 -0
  318. package/dist/voice/report.d.ts +5 -0
  319. package/dist/voice/report.d.ts.map +1 -1
  320. package/dist/voice/report.js +20 -8
  321. package/dist/voice/report.js.map +1 -1
  322. package/dist/voice/report.test.cjs +106 -0
  323. package/dist/voice/report.test.cjs.map +1 -0
  324. package/dist/voice/report.test.js +105 -0
  325. package/dist/voice/report.test.js.map +1 -0
  326. package/dist/voice/room_io/room_io.cjs +16 -41
  327. package/dist/voice/room_io/room_io.cjs.map +1 -1
  328. package/dist/voice/room_io/room_io.d.cts +4 -9
  329. package/dist/voice/room_io/room_io.d.ts +4 -9
  330. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  331. package/dist/voice/room_io/room_io.js +17 -43
  332. package/dist/voice/room_io/room_io.js.map +1 -1
  333. package/dist/voice/testing/fake_llm.cjs +127 -0
  334. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  335. package/dist/voice/testing/fake_llm.d.cts +30 -0
  336. package/dist/voice/testing/fake_llm.d.ts +30 -0
  337. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  338. package/dist/voice/testing/fake_llm.js +103 -0
  339. package/dist/voice/testing/fake_llm.js.map +1 -0
  340. package/dist/voice/testing/index.cjs +3 -0
  341. package/dist/voice/testing/index.cjs.map +1 -1
  342. package/dist/voice/testing/index.d.cts +1 -0
  343. package/dist/voice/testing/index.d.ts +1 -0
  344. package/dist/voice/testing/index.d.ts.map +1 -1
  345. package/dist/voice/testing/index.js +2 -0
  346. package/dist/voice/testing/index.js.map +1 -1
  347. package/dist/voice/turn_config/endpointing.cjs +33 -0
  348. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  349. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  350. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  351. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  352. package/dist/voice/turn_config/endpointing.js +9 -0
  353. package/dist/voice/turn_config/endpointing.js.map +1 -0
  354. package/dist/voice/turn_config/interruption.cjs +37 -0
  355. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  356. package/dist/voice/turn_config/interruption.d.cts +53 -0
  357. package/dist/voice/turn_config/interruption.d.ts +53 -0
  358. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  359. package/dist/voice/turn_config/interruption.js +13 -0
  360. package/dist/voice/turn_config/interruption.js.map +1 -0
  361. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  362. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  363. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  364. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  365. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  366. package/dist/voice/turn_config/turn_handling.js +11 -0
  367. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  368. package/dist/voice/turn_config/utils.cjs +97 -0
  369. package/dist/voice/turn_config/utils.cjs.map +1 -0
  370. package/dist/voice/turn_config/utils.d.cts +25 -0
  371. package/dist/voice/turn_config/utils.d.ts +25 -0
  372. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  373. package/dist/voice/turn_config/utils.js +73 -0
  374. package/dist/voice/turn_config/utils.js.map +1 -0
  375. package/dist/voice/turn_config/utils.test.cjs +86 -0
  376. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  377. package/dist/voice/turn_config/utils.test.js +85 -0
  378. package/dist/voice/turn_config/utils.test.js.map +1 -0
  379. package/dist/voice/wire_format.cjs +798 -0
  380. package/dist/voice/wire_format.cjs.map +1 -0
  381. package/dist/voice/wire_format.d.cts +5503 -0
  382. package/dist/voice/wire_format.d.ts +5503 -0
  383. package/dist/voice/wire_format.d.ts.map +1 -0
  384. package/dist/voice/wire_format.js +728 -0
  385. package/dist/voice/wire_format.js.map +1 -0
  386. package/package.json +2 -1
  387. package/src/beta/index.ts +9 -0
  388. package/src/beta/workflows/index.ts +9 -0
  389. package/src/beta/workflows/task_group.ts +194 -0
  390. package/src/constants.ts +13 -0
  391. package/src/index.ts +2 -1
  392. package/src/inference/interruption/defaults.ts +51 -0
  393. package/src/inference/interruption/errors.ts +25 -0
  394. package/src/inference/interruption/http_transport.ts +187 -0
  395. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  396. package/src/inference/interruption/interruption_detector.ts +188 -0
  397. package/src/inference/interruption/interruption_stream.ts +467 -0
  398. package/src/inference/interruption/types.ts +84 -0
  399. package/src/inference/interruption/utils.test.ts +132 -0
  400. package/src/inference/interruption/utils.ts +137 -0
  401. package/src/inference/interruption/ws_transport.ts +402 -0
  402. package/src/inference/llm.ts +9 -12
  403. package/src/inference/stt.ts +10 -3
  404. package/src/inference/tts.ts +10 -3
  405. package/src/inference/utils.ts +29 -1
  406. package/src/llm/chat_context.test.ts +48 -0
  407. package/src/llm/chat_context.ts +161 -0
  408. package/src/llm/index.ts +2 -0
  409. package/src/llm/llm.ts +16 -0
  410. package/src/llm/realtime.ts +4 -0
  411. package/src/llm/tool_context.ts +14 -0
  412. package/src/metrics/base.ts +48 -1
  413. package/src/metrics/index.ts +11 -0
  414. package/src/metrics/model_usage.test.ts +545 -0
  415. package/src/metrics/model_usage.ts +262 -0
  416. package/src/metrics/usage_collector.ts +11 -0
  417. package/src/metrics/utils.ts +11 -0
  418. package/src/stream/multi_input_stream.test.ts +6 -1
  419. package/src/stream/stream_channel.ts +34 -2
  420. package/src/stt/stt.ts +38 -0
  421. package/src/telemetry/otel_http_exporter.ts +28 -5
  422. package/src/telemetry/trace_types.ts +11 -8
  423. package/src/telemetry/traces.ts +111 -54
  424. package/src/tts/tts.ts +69 -1
  425. package/src/utils.ts +5 -0
  426. package/src/voice/agent.ts +41 -3
  427. package/src/voice/agent_activity.ts +371 -34
  428. package/src/voice/agent_session.ts +207 -59
  429. package/src/voice/audio_recognition.ts +385 -9
  430. package/src/voice/client_events.ts +838 -0
  431. package/src/voice/events.ts +14 -4
  432. package/src/voice/generation.ts +52 -9
  433. package/src/voice/index.ts +1 -0
  434. package/src/voice/report.test.ts +117 -0
  435. package/src/voice/report.ts +29 -6
  436. package/src/voice/room_io/room_io.ts +21 -64
  437. package/src/voice/testing/fake_llm.ts +138 -0
  438. package/src/voice/testing/index.ts +2 -0
  439. package/src/voice/turn_config/endpointing.ts +33 -0
  440. package/src/voice/turn_config/interruption.ts +56 -0
  441. package/src/voice/turn_config/turn_handling.ts +45 -0
  442. package/src/voice/turn_config/utils.test.ts +100 -0
  443. package/src/voice/turn_config/utils.ts +103 -0
  444. package/src/voice/wire_format.ts +827 -0
@@ -7,8 +7,11 @@ import type { Span } from '@opentelemetry/api';
7
7
  import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
8
8
  import { Heap } from 'heap-js';
9
9
  import { AsyncLocalStorage } from 'node:async_hooks';
10
- import { ReadableStream } from 'node:stream/web';
11
- import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
+ import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
12
+ import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
13
+ import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
14
+ import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
12
15
  import {
13
16
  type ChatItem,
14
17
  type FunctionCall,
@@ -23,12 +26,14 @@ import {
23
26
  type RealtimeSession,
24
27
  type ToolChoice,
25
28
  type ToolContext,
29
+ ToolFlag,
26
30
  } from '../llm/index.js';
27
31
  import type { LLMError } from '../llm/llm.js';
28
32
  import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
29
33
  import { log } from '../log.js';
30
34
  import type {
31
35
  EOUMetrics,
36
+ InterruptionMetrics,
32
37
  LLMMetrics,
33
38
  RealtimeModelMetrics,
34
39
  STTMetrics,
@@ -56,7 +61,6 @@ import {
56
61
  type EndOfTurnInfo,
57
62
  type PreemptiveGenerationInfo,
58
63
  type RecognitionHooks,
59
- type _TurnDetector,
60
64
  } from './audio_recognition.js';
61
65
  import {
62
66
  AgentSessionEventTypes,
@@ -83,6 +87,12 @@ import { SpeechHandle } from './speech_handle.js';
83
87
  import { setParticipantSpanAttributes } from './utils.js';
84
88
 
85
89
  export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
90
+ export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
91
+
92
+ interface OnEnterData {
93
+ session: AgentSession;
94
+ agent: Agent;
95
+ }
86
96
 
87
97
  interface PreemptiveGeneration {
88
98
  speechHandle: SpeechHandle;
@@ -94,6 +104,7 @@ interface PreemptiveGeneration {
94
104
  createdAt: number;
95
105
  }
96
106
 
107
+ // TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
97
108
  export class AgentActivity implements RecognitionHooks {
98
109
  agent: Agent;
99
110
  agentSession: AgentSession;
@@ -104,7 +115,7 @@ export class AgentActivity implements RecognitionHooks {
104
115
  private audioRecognition?: AudioRecognition;
105
116
  private realtimeSession?: RealtimeSession;
106
117
  private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
107
- private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
118
+ private turnDetectionMode?: TurnDetectionMode;
108
119
  private logger = log();
109
120
  private _schedulingPaused = true;
110
121
  private _drainBlockedTasks: Task<any>[] = [];
@@ -119,6 +130,43 @@ export class AgentActivity implements RecognitionHooks {
119
130
  // default to null as None, which maps to the default provider tool choice value
120
131
  private toolChoice: ToolChoice | null = null;
121
132
  private _preemptiveGeneration?: PreemptiveGeneration;
133
+ private interruptionDetector?: AdaptiveInterruptionDetector;
134
+ private isInterruptionDetectionEnabled: boolean;
135
+ private isInterruptionByAudioActivityEnabled: boolean;
136
+ private isDefaultInterruptionByAudioActivityEnabled: boolean;
137
+
138
+ private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
139
+ this.onGenerationCreated(ev);
140
+
141
+ private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
142
+ this.onInputSpeechStarted(ev);
143
+
144
+ private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
145
+ this.onInputSpeechStopped(ev);
146
+
147
+ private readonly onRealtimeInputAudioTranscriptionCompleted = (
148
+ ev: InputTranscriptionCompleted,
149
+ ): void => this.onInputAudioTranscriptionCompleted(ev);
150
+
151
+ private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
152
+ this.onError(ev);
153
+
154
+ private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
155
+ this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
156
+ };
157
+
158
+ private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
159
+ this.agentSession.emit(
160
+ AgentSessionEventTypes.MetricsCollected,
161
+ createMetricsCollectedEvent({ metrics: ev }),
162
+ );
163
+ };
164
+
165
+ private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
166
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
167
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
168
+ this.agentSession._onError(ev);
169
+ };
122
170
 
123
171
  /** @internal */
124
172
  _mainTask?: Task<void>;
@@ -126,16 +174,6 @@ export class AgentActivity implements RecognitionHooks {
126
174
  _onExitTask?: Task<void>;
127
175
  _userTurnCompletedTask?: Task<void>;
128
176
 
129
- private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) =>
130
- this.onGenerationCreated(ev);
131
- private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) =>
132
- this.onInputSpeechStarted(ev);
133
- private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) =>
134
- this.onInputSpeechStopped(ev);
135
- private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) =>
136
- this.onInputAudioTranscriptionCompleted(ev);
137
- private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) =>
138
- this.onError(ev);
139
177
  constructor(agent: Agent, agentSession: AgentSession) {
140
178
  this.agent = agent;
141
179
  this.agentSession = agentSession;
@@ -228,6 +266,16 @@ export class AgentActivity implements RecognitionHooks {
228
266
  'for more responsive interruption handling.',
229
267
  );
230
268
  }
269
+
270
+ this.interruptionDetector = this.resolveInterruptionDetector();
271
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
272
+
273
+ // this allows taking over audio interruption temporarily until interruption is detected
274
+ // by default it is true unless turnDetection is manual or realtime_llm
275
+ this.isInterruptionByAudioActivityEnabled =
276
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
277
+
278
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
231
279
  }
232
280
 
233
281
  async start(): Promise<void> {
@@ -312,6 +360,8 @@ export class AgentActivity implements RecognitionHooks {
312
360
  }
313
361
  }
314
362
 
363
+ // TODO(parity): Record initial AgentConfigUpdate in chat context
364
+
315
365
  // metrics and error handling
316
366
  if (this.llm instanceof LLM) {
317
367
  this.llm.on('metrics_collected', this.onMetricsCollected);
@@ -339,8 +389,9 @@ export class AgentActivity implements RecognitionHooks {
339
389
  vad: this.vad,
340
390
  turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
341
391
  turnDetectionMode: this.turnDetectionMode,
342
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
343
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
392
+ interruptionDetection: this.interruptionDetector,
393
+ minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
394
+ maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
344
395
  rootSpanContext: this.agentSession.rootSpanContext,
345
396
  sttModel: this.stt?.label,
346
397
  sttProvider: this.getSttProvider(),
@@ -354,11 +405,13 @@ export class AgentActivity implements RecognitionHooks {
354
405
  if (runOnEnter) {
355
406
  this._onEnterTask = this.createSpeechTask({
356
407
  taskFn: () =>
357
- tracer.startActiveSpan(async () => this.agent.onEnter(), {
358
- name: 'on_enter',
359
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
360
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
361
- }),
408
+ onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
409
+ tracer.startActiveSpan(async () => this.agent.onEnter(), {
410
+ name: 'on_enter',
411
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
412
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
413
+ }),
414
+ ),
362
415
  inlineTask: true,
363
416
  name: 'AgentActivity_onEnter',
364
417
  });
@@ -412,7 +465,7 @@ export class AgentActivity implements RecognitionHooks {
412
465
 
413
466
  get allowInterruptions(): boolean {
414
467
  // TODO(AJS-51): Allow options to be defined in Agent class
415
- return this.agentSession.options.allowInterruptions;
468
+ return this.agentSession.options.turnHandling.interruption?.mode !== false;
416
469
  }
417
470
 
418
471
  get useTtsAlignedTranscript(): boolean {
@@ -429,6 +482,11 @@ export class AgentActivity implements RecognitionHooks {
429
482
  return this.agent.toolCtx;
430
483
  }
431
484
 
485
+ /** @internal */
486
+ get inputStartedAt() {
487
+ return this.audioRecognition?.inputStartedAt;
488
+ }
489
+
432
490
  async updateChatCtx(chatCtx: ChatContext): Promise<void> {
433
491
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
434
492
 
@@ -446,7 +504,27 @@ export class AgentActivity implements RecognitionHooks {
446
504
  }
447
505
  }
448
506
 
449
- updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
507
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
508
+ async updateTools(tools: ToolContext): Promise<void> {
509
+ this.agent._tools = { ...tools };
510
+
511
+ if (this.realtimeSession) {
512
+ await this.realtimeSession.updateTools(tools);
513
+ }
514
+
515
+ if (this.llm instanceof LLM) {
516
+ // for realtime LLM, we assume the server will remove unvalid tool messages
517
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
518
+ }
519
+ }
520
+
521
+ updateOptions({
522
+ toolChoice,
523
+ turnDetection,
524
+ }: {
525
+ toolChoice?: ToolChoice | null;
526
+ turnDetection?: TurnDetectionMode;
527
+ }): void {
450
528
  if (toolChoice !== undefined) {
451
529
  this.toolChoice = toolChoice;
452
530
  }
@@ -454,14 +532,46 @@ export class AgentActivity implements RecognitionHooks {
454
532
  if (this.realtimeSession) {
455
533
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
456
534
  }
535
+
536
+ if (turnDetection !== undefined) {
537
+ this.turnDetectionMode = turnDetection;
538
+ this.isDefaultInterruptionByAudioActivityEnabled =
539
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
540
+
541
+ // sync live flag immediately when not speaking so the change takes effect right away
542
+ if (this.agentSession.agentState !== 'speaking') {
543
+ this.isInterruptionByAudioActivityEnabled =
544
+ this.isDefaultInterruptionByAudioActivityEnabled;
545
+ }
546
+ }
547
+
548
+ if (this.audioRecognition) {
549
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
550
+ }
457
551
  }
458
552
 
459
553
  attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
460
554
  void this.audioStream.close();
461
555
  this.audioStream = new MultiInputStream<AudioFrame>();
462
556
 
557
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
558
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
559
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
560
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
561
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
562
+ transform: (frame, controller) => {
563
+ const shouldDiscardForAecWarmup =
564
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
565
+ if (!shouldDiscardForAecWarmup) {
566
+ controller.enqueue(frame);
567
+ }
568
+ },
569
+ });
570
+
463
571
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
464
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
572
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
573
+ .pipeThrough(aecWarmupAudioFilter)
574
+ .tee();
465
575
 
466
576
  if (this.realtimeSession) {
467
577
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -614,6 +724,13 @@ export class AgentActivity implements RecognitionHooks {
614
724
 
615
725
  if (!this.vad) {
616
726
  this.agentSession._updateUserState('speaking');
727
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
728
+ this.audioRecognition.onStartOfOverlapSpeech(
729
+ 0,
730
+ Date.now(),
731
+ this.agentSession._userSpeakingSpan,
732
+ );
733
+ }
617
734
  }
618
735
 
619
736
  // this.interrupt() is going to raise when allow_interruptions is False,
@@ -632,6 +749,9 @@ export class AgentActivity implements RecognitionHooks {
632
749
  this.logger.info(ev, 'onInputSpeechStopped');
633
750
 
634
751
  if (!this.vad) {
752
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
753
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
754
+ }
635
755
  this.agentSession._updateUserState('listening');
636
756
  }
637
757
 
@@ -705,15 +825,32 @@ export class AgentActivity implements RecognitionHooks {
705
825
  onStartOfSpeech(ev: VADEvent): void {
706
826
  let speechStartTime = Date.now();
707
827
  if (ev) {
708
- speechStartTime = speechStartTime - ev.speechDuration;
828
+ // Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
829
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
709
830
  }
710
831
  this.agentSession._updateUserState('speaking', speechStartTime);
832
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
833
+ // Pass speechStartTime as the absolute startedAt timestamp.
834
+ this.audioRecognition.onStartOfOverlapSpeech(
835
+ ev.speechDuration,
836
+ speechStartTime,
837
+ this.agentSession._userSpeakingSpan,
838
+ );
839
+ }
711
840
  }
712
841
 
713
842
  onEndOfSpeech(ev: VADEvent): void {
714
843
  let speechEndTime = Date.now();
715
844
  if (ev) {
716
- speechEndTime = speechEndTime - ev.silenceDuration;
845
+ // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
846
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
847
+ }
848
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
849
+ // Pass speechEndTime as the absolute endedAt timestamp.
850
+ this.audioRecognition.onEndOfOverlapSpeech(
851
+ speechEndTime,
852
+ this.agentSession._userSpeakingSpan,
853
+ );
717
854
  }
718
855
  this.agentSession._updateUserState('listening', speechEndTime);
719
856
  }
@@ -724,12 +861,21 @@ export class AgentActivity implements RecognitionHooks {
724
861
  return;
725
862
  }
726
863
 
727
- if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
864
+ if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) {
728
865
  this.interruptByAudioActivity();
729
866
  }
730
867
  }
731
868
 
732
869
  private interruptByAudioActivity(): void {
870
+ if (!this.isInterruptionByAudioActivityEnabled) {
871
+ return;
872
+ }
873
+
874
+ if (this.agentSession._aecWarmupRemaining > 0) {
875
+ // Disable interruption from audio activity while AEC warmup is active.
876
+ return;
877
+ }
878
+
733
879
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
734
880
  // skip speech handle interruption if server side turn detection is enabled
735
881
  return;
@@ -739,7 +885,11 @@ export class AgentActivity implements RecognitionHooks {
739
885
  // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
740
886
  // - Apply check to all STT results: empty string, undefined, or any length
741
887
  // - This ensures consistent behavior across all interruption scenarios
742
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
888
+ if (
889
+ this.stt &&
890
+ this.agentSession.options.turnHandling.interruption?.minWords > 0 &&
891
+ this.audioRecognition
892
+ ) {
743
893
  const text = this.audioRecognition.currentTranscript;
744
894
  // TODO(shubhra): better word splitting for multi-language
745
895
 
@@ -749,7 +899,7 @@ export class AgentActivity implements RecognitionHooks {
749
899
 
750
900
  // Only allow interruption if word count meets or exceeds minInterruptionWords
751
901
  // This applies to all cases: empty strings, partial speech, and full speech
752
- if (wordCount < this.agentSession.options.minInterruptionWords) {
902
+ if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
753
903
  return;
754
904
  }
755
905
  }
@@ -770,6 +920,14 @@ export class AgentActivity implements RecognitionHooks {
770
920
  }
771
921
  }
772
922
 
923
+ onInterruption(ev: OverlappingSpeechEvent) {
924
+ this.restoreInterruptionByAudioActivity();
925
+ this.interruptByAudioActivity();
926
+ if (this.audioRecognition) {
927
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
928
+ }
929
+ }
930
+
773
931
  onInterimTranscript(ev: SpeechEvent): void {
774
932
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
775
933
  // skip stt transcription if userTranscription is enabled on the realtime model
@@ -845,6 +1003,7 @@ export class AgentActivity implements RecognitionHooks {
845
1003
  const userMessage = ChatMessage.create({
846
1004
  role: 'user',
847
1005
  content: info.newTranscript,
1006
+ transcriptConfidence: info.transcriptConfidence,
848
1007
  });
849
1008
  const chatCtx = this.agent.chatCtx.copy();
850
1009
  const speechHandle = this.generateReply({
@@ -940,16 +1099,16 @@ export class AgentActivity implements RecognitionHooks {
940
1099
  this._currentSpeech &&
941
1100
  this._currentSpeech.allowInterruptions &&
942
1101
  !this._currentSpeech.interrupted &&
943
- this.agentSession.options.minInterruptionWords > 0
1102
+ this.agentSession.options.turnHandling.interruption?.minWords > 0
944
1103
  ) {
945
1104
  const wordCount = splitWords(info.newTranscript, true).length;
946
- if (wordCount < this.agentSession.options.minInterruptionWords) {
1105
+ if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) {
947
1106
  // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
948
1107
  this.cancelPreemptiveGeneration();
949
1108
  this.logger.info(
950
1109
  {
951
1110
  wordCount,
952
- minInterruptionWords: this.agentSession.options.minInterruptionWords,
1111
+ minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords,
953
1112
  },
954
1113
  'skipping user input, word count below minimum interruption threshold',
955
1114
  );
@@ -1129,12 +1288,25 @@ export class AgentActivity implements RecognitionHooks {
1129
1288
  instructions = `${this.agent.instructions}\n${instructions}`;
1130
1289
  }
1131
1290
 
1291
+ // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
1292
+ const onEnterData = onEnterStorage.getStore();
1293
+ const shouldFilterTools =
1294
+ onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
1295
+
1296
+ const tools = shouldFilterTools
1297
+ ? Object.fromEntries(
1298
+ Object.entries(this.agent.toolCtx).filter(
1299
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
1300
+ ),
1301
+ )
1302
+ : this.agent.toolCtx;
1303
+
1132
1304
  const task = this.createSpeechTask({
1133
1305
  taskFn: (abortController: AbortController) =>
1134
1306
  this.pipelineReplyTask(
1135
1307
  handle,
1136
1308
  chatCtx ?? this.agent.chatCtx,
1137
- this.agent.toolCtx,
1309
+ tools,
1138
1310
  {
1139
1311
  toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
1140
1312
  },
@@ -1234,6 +1406,7 @@ export class AgentActivity implements RecognitionHooks {
1234
1406
  let userMessage: ChatMessage | undefined = ChatMessage.create({
1235
1407
  role: 'user',
1236
1408
  content: info.newTranscript,
1409
+ transcriptConfidence: info.transcriptConfidence,
1237
1410
  });
1238
1411
 
1239
1412
  // create a temporary mutable chat context to pass to onUserTurnCompleted
@@ -1260,6 +1433,24 @@ export class AgentActivity implements RecognitionHooks {
1260
1433
  return;
1261
1434
  }
1262
1435
 
1436
+ const userMetricsReport: MetricsReport = {};
1437
+ if (info.startedSpeakingAt !== undefined) {
1438
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
1439
+ }
1440
+ if (info.stoppedSpeakingAt !== undefined) {
1441
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
1442
+ }
1443
+ if (info.transcriptionDelay !== undefined) {
1444
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
1445
+ }
1446
+ if (info.endOfUtteranceDelay !== undefined) {
1447
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
1448
+ }
1449
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
1450
+ if (userMessage) {
1451
+ userMessage.metrics = userMetricsReport;
1452
+ }
1453
+
1263
1454
  let speechHandle: SpeechHandle | undefined;
1264
1455
  if (this._preemptiveGeneration !== undefined) {
1265
1456
  const preemptive = this._preemptiveGeneration;
@@ -1272,6 +1463,14 @@ export class AgentActivity implements RecognitionHooks {
1272
1463
  isSameToolChoice(preemptive.toolChoice, this.toolChoice)
1273
1464
  ) {
1274
1465
  speechHandle = preemptive.speechHandle;
1466
+ // The preemptive userMessage was created without metrics.
1467
+ // Copy the metrics and transcriptConfidence from the new userMessage
1468
+ // to the preemptive message BEFORE scheduling (so the pipeline inserts
1469
+ // the message with metrics already set).
1470
+ if (preemptive.userMessage && userMessage) {
1471
+ preemptive.userMessage.metrics = userMetricsReport;
1472
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1473
+ }
1275
1474
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1276
1475
  this.logger.debug(
1277
1476
  {
@@ -1365,11 +1564,19 @@ export class AgentActivity implements RecognitionHooks {
1365
1564
  tasks.push(textForwardTask);
1366
1565
  }
1367
1566
 
1567
+ let replyStartedSpeakingAt: number | undefined;
1568
+ let replyTtsGenData: _TTSGenerationData | null = null;
1569
+
1368
1570
  const onFirstFrame = (startedSpeakingAt?: number) => {
1571
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1369
1572
  this.agentSession._updateAgentState('speaking', {
1370
1573
  startTime: startedSpeakingAt,
1371
1574
  otelContext: speechHandle._agentTurnContext,
1372
1575
  });
1576
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1577
+ this.audioRecognition.onStartOfAgentSpeech();
1578
+ this.isInterruptionByAudioActivityEnabled = false;
1579
+ }
1373
1580
  };
1374
1581
 
1375
1582
  if (!audioOutput) {
@@ -1387,8 +1594,11 @@ export class AgentActivity implements RecognitionHooks {
1387
1594
  audioSource,
1388
1595
  modelSettings,
1389
1596
  replyAbortController,
1597
+ this.tts?.model,
1598
+ this.tts?.provider,
1390
1599
  );
1391
1600
  tasks.push(ttsTask);
1601
+ replyTtsGenData = ttsGenData;
1392
1602
 
1393
1603
  const [forwardTask, _audioOut] = performAudioForwarding(
1394
1604
  ttsGenData.audioStream,
@@ -1428,10 +1638,21 @@ export class AgentActivity implements RecognitionHooks {
1428
1638
  }
1429
1639
 
1430
1640
  if (addToChatCtx) {
1641
+ const replyStoppedSpeakingAt = Date.now();
1642
+ const replyAssistantMetrics: MetricsReport = {};
1643
+ if (replyTtsGenData?.ttfb !== undefined) {
1644
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1645
+ }
1646
+ if (replyStartedSpeakingAt !== undefined) {
1647
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
1648
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
1649
+ }
1650
+
1431
1651
  const message = ChatMessage.create({
1432
1652
  role: 'assistant',
1433
1653
  content: textOut?.text || '',
1434
1654
  interrupted: speechHandle.interrupted,
1655
+ metrics: replyAssistantMetrics,
1435
1656
  });
1436
1657
  this.agent._chatCtx.insert(message);
1437
1658
  this.agentSession._conversationItemAdded(message);
@@ -1439,6 +1660,10 @@ export class AgentActivity implements RecognitionHooks {
1439
1660
 
1440
1661
  if (this.agentSession.agentState === 'speaking') {
1441
1662
  this.agentSession._updateAgentState('listening');
1663
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1664
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1665
+ }
1666
+ this.restoreInterruptionByAudioActivity();
1442
1667
  }
1443
1668
  }
1444
1669
 
@@ -1452,6 +1677,7 @@ export class AgentActivity implements RecognitionHooks {
1452
1677
  newMessage,
1453
1678
  toolsMessages,
1454
1679
  span,
1680
+ _previousUserMetrics,
1455
1681
  }: {
1456
1682
  speechHandle: SpeechHandle;
1457
1683
  chatCtx: ChatContext;
@@ -1462,6 +1688,7 @@ export class AgentActivity implements RecognitionHooks {
1462
1688
  newMessage?: ChatMessage;
1463
1689
  toolsMessages?: ChatItem[];
1464
1690
  span: Span;
1691
+ _previousUserMetrics?: MetricsReport;
1465
1692
  }): Promise<void> => {
1466
1693
  speechHandle._agentTurnContext = otelContext.active();
1467
1694
 
@@ -1514,6 +1741,8 @@ export class AgentActivity implements RecognitionHooks {
1514
1741
  toolCtx,
1515
1742
  modelSettings,
1516
1743
  replyAbortController,
1744
+ this.llm?.model,
1745
+ this.llm?.provider,
1517
1746
  );
1518
1747
  tasks.push(llmTask);
1519
1748
 
@@ -1530,6 +1759,8 @@ export class AgentActivity implements RecognitionHooks {
1530
1759
  ttsTextInput,
1531
1760
  modelSettings,
1532
1761
  replyAbortController,
1762
+ this.tts?.model,
1763
+ this.tts?.provider,
1533
1764
  );
1534
1765
  tasks.push(ttsTask);
1535
1766
  } else {
@@ -1539,10 +1770,12 @@ export class AgentActivity implements RecognitionHooks {
1539
1770
 
1540
1771
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1541
1772
 
1773
+ let userMetrics: MetricsReport | undefined = _previousUserMetrics;
1542
1774
  // Add new message to actual chat context if the speech is scheduled
1543
1775
  if (newMessage && speechHandle.scheduled) {
1544
1776
  this.agent._chatCtx.insert(newMessage);
1545
1777
  this.agentSession._conversationItemAdded(newMessage);
1778
+ userMetrics = newMessage.metrics;
1546
1779
  }
1547
1780
 
1548
1781
  if (speechHandle.interrupted) {
@@ -1588,11 +1821,17 @@ export class AgentActivity implements RecognitionHooks {
1588
1821
  textOut = _textOut;
1589
1822
  }
1590
1823
 
1824
+ let agentStartedSpeakingAt: number | undefined;
1591
1825
  const onFirstFrame = (startedSpeakingAt?: number) => {
1826
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1592
1827
  this.agentSession._updateAgentState('speaking', {
1593
1828
  startTime: startedSpeakingAt,
1594
1829
  otelContext: speechHandle._agentTurnContext,
1595
1830
  });
1831
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1832
+ this.audioRecognition.onStartOfAgentSpeech();
1833
+ this.isInterruptionByAudioActivityEnabled = false;
1834
+ }
1596
1835
  };
1597
1836
 
1598
1837
  let audioOut: _AudioOut | null = null;
@@ -1649,6 +1888,29 @@ export class AgentActivity implements RecognitionHooks {
1649
1888
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1650
1889
  }
1651
1890
 
1891
+ const agentStoppedSpeakingAt = Date.now();
1892
+ const assistantMetrics: MetricsReport = {};
1893
+
1894
+ if (llmGenData.ttft !== undefined) {
1895
+ assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
1896
+ }
1897
+ if (ttsGenData?.ttfb !== undefined) {
1898
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
1899
+ }
1900
+ if (agentStartedSpeakingAt !== undefined) {
1901
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
1902
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
1903
+
1904
+ if (userMetrics?.stoppedSpeakingAt !== undefined) {
1905
+ const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
1906
+ assistantMetrics.e2eLatency = e2eLatency;
1907
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1908
+ }
1909
+ }
1910
+
1911
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1912
+ let hasSpeechMessage = false;
1913
+
1652
1914
  // add the tools messages that triggers this reply to the chat context
1653
1915
  if (toolsMessages) {
1654
1916
  for (const msg of toolsMessages) {
@@ -1703,45 +1965,54 @@ export class AgentActivity implements RecognitionHooks {
1703
1965
  }
1704
1966
 
1705
1967
  if (forwardedText) {
1968
+ hasSpeechMessage = true;
1706
1969
  const message = ChatMessage.create({
1707
1970
  role: 'assistant',
1708
1971
  content: forwardedText,
1709
1972
  id: llmGenData.id,
1710
1973
  interrupted: true,
1711
1974
  createdAt: replyStartedAt,
1975
+ metrics: assistantMetrics,
1712
1976
  });
1713
1977
  chatCtx.insert(message);
1714
1978
  this.agent._chatCtx.insert(message);
1715
1979
  speechHandle._itemAdded([message]);
1716
1980
  this.agentSession._conversationItemAdded(message);
1981
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1717
1982
  }
1718
1983
 
1719
1984
  if (this.agentSession.agentState === 'speaking') {
1720
1985
  this.agentSession._updateAgentState('listening');
1986
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1987
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1988
+ this.restoreInterruptionByAudioActivity();
1989
+ }
1721
1990
  }
1722
1991
 
1723
1992
  this.logger.info(
1724
1993
  { speech_id: speechHandle.id, message: forwardedText },
1725
1994
  'playout completed with interrupt',
1726
1995
  );
1727
- // TODO(shubhra) add chat message to speech handle
1728
1996
  speechHandle._markGenerationDone();
1729
1997
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1730
1998
  return;
1731
1999
  }
1732
2000
 
1733
2001
  if (textOut && textOut.text) {
2002
+ hasSpeechMessage = true;
1734
2003
  const message = ChatMessage.create({
1735
2004
  role: 'assistant',
1736
2005
  id: llmGenData.id,
1737
2006
  interrupted: false,
1738
2007
  createdAt: replyStartedAt,
1739
2008
  content: textOut.text,
2009
+ metrics: assistantMetrics,
1740
2010
  });
1741
2011
  chatCtx.insert(message);
1742
2012
  this.agent._chatCtx.insert(message);
1743
2013
  speechHandle._itemAdded([message]);
1744
2014
  this.agentSession._conversationItemAdded(message);
2015
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1745
2016
  this.logger.info(
1746
2017
  { speech_id: speechHandle.id, message: textOut.text },
1747
2018
  'playout completed without interruption',
@@ -1752,6 +2023,12 @@ export class AgentActivity implements RecognitionHooks {
1752
2023
  this.agentSession._updateAgentState('thinking');
1753
2024
  } else if (this.agentSession.agentState === 'speaking') {
1754
2025
  this.agentSession._updateAgentState('listening');
2026
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
2027
+ {
2028
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
2029
+ this.restoreInterruptionByAudioActivity();
2030
+ }
2031
+ }
1755
2032
  }
1756
2033
 
1757
2034
  // mark the playout done before waiting for the tool execution
@@ -1811,6 +2088,7 @@ export class AgentActivity implements RecognitionHooks {
1811
2088
  instructions,
1812
2089
  undefined,
1813
2090
  toolMessages,
2091
+ hasSpeechMessage ? undefined : userMetrics,
1814
2092
  ),
1815
2093
  ownedSpeechHandle: speechHandle,
1816
2094
  name: 'AgentActivity.pipelineReply',
@@ -1844,6 +2122,7 @@ export class AgentActivity implements RecognitionHooks {
1844
2122
  instructions?: string,
1845
2123
  newMessage?: ChatMessage,
1846
2124
  toolsMessages?: ChatItem[],
2125
+ _previousUserMetrics?: MetricsReport,
1847
2126
  ): Promise<void> =>
1848
2127
  tracer.startActiveSpan(
1849
2128
  async (span) =>
@@ -1857,6 +2136,7 @@ export class AgentActivity implements RecognitionHooks {
1857
2136
  newMessage,
1858
2137
  toolsMessages,
1859
2138
  span,
2139
+ _previousUserMetrics,
1860
2140
  }),
1861
2141
  {
1862
2142
  name: 'agent_turn',
@@ -2007,6 +2287,8 @@ export class AgentActivity implements RecognitionHooks {
2007
2287
  ttsTextInput,
2008
2288
  modelSettings,
2009
2289
  abortController,
2290
+ this.tts?.model,
2291
+ this.tts?.provider,
2010
2292
  );
2011
2293
  tasks.push(ttsTask);
2012
2294
  realtimeAudioResult = ttsGenData.audioStream;
@@ -2516,6 +2798,14 @@ export class AgentActivity implements RecognitionHooks {
2516
2798
  if (this._mainTask) {
2517
2799
  await this._mainTask.cancelAndWait();
2518
2800
  }
2801
+ if (this.interruptionDetector) {
2802
+ this.interruptionDetector.off(
2803
+ 'user_overlapping_speech',
2804
+ this.onInterruptionOverlappingSpeech,
2805
+ );
2806
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2807
+ this.interruptionDetector.off('error', this.onInterruptionError);
2808
+ }
2519
2809
 
2520
2810
  this.agent._agentActivity = undefined;
2521
2811
  } finally {
@@ -2523,6 +2813,53 @@ export class AgentActivity implements RecognitionHooks {
2523
2813
  }
2524
2814
  }
2525
2815
 
2816
+ private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
2817
+ const interruptionDetection =
2818
+ this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2819
+ if (
2820
+ !(
2821
+ this.stt &&
2822
+ this.stt.capabilities.alignedTranscript &&
2823
+ this.stt.capabilities.streaming &&
2824
+ this.vad &&
2825
+ this.turnDetection !== 'manual' &&
2826
+ this.turnDetection !== 'realtime_llm' &&
2827
+ !(this.llm instanceof RealtimeModel)
2828
+ )
2829
+ ) {
2830
+ if (interruptionDetection === 'adaptive') {
2831
+ this.logger.warn(
2832
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
2833
+ );
2834
+ return undefined;
2835
+ }
2836
+ }
2837
+
2838
+ if (
2839
+ (interruptionDetection !== undefined && interruptionDetection === false) ||
2840
+ interruptionDetection === 'vad'
2841
+ ) {
2842
+ return undefined;
2843
+ }
2844
+
2845
+ try {
2846
+ const detector = new AdaptiveInterruptionDetector();
2847
+
2848
+ detector.on('user_overlapping_speech', this.onInterruptionOverlappingSpeech);
2849
+ detector.on('metrics_collected', this.onInterruptionMetricsCollected);
2850
+ detector.on('error', this.onInterruptionError);
2851
+
2852
+ return detector;
2853
+ } catch (error: unknown) {
2854
+ this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
2855
+ }
2856
+ return undefined;
2857
+ }
2858
+
2859
+ private restoreInterruptionByAudioActivity(): void {
2860
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2861
+ }
2862
+
2526
2863
  private async _closeSessionResources(): Promise<void> {
2527
2864
  // Unregister event handlers to prevent duplicate metrics
2528
2865
  if (this.llm instanceof LLM) {