@livekit/agents 1.0.47 → 1.1.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444)
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/constants.cjs +27 -0
  23. package/dist/constants.cjs.map +1 -1
  24. package/dist/constants.d.cts +9 -0
  25. package/dist/constants.d.ts +9 -0
  26. package/dist/constants.d.ts.map +1 -1
  27. package/dist/constants.js +18 -0
  28. package/dist/constants.js.map +1 -1
  29. package/dist/index.cjs +3 -0
  30. package/dist/index.cjs.map +1 -1
  31. package/dist/index.d.cts +2 -1
  32. package/dist/index.d.ts +2 -1
  33. package/dist/index.d.ts.map +1 -1
  34. package/dist/index.js +2 -0
  35. package/dist/index.js.map +1 -1
  36. package/dist/inference/api_protos.d.cts +12 -12
  37. package/dist/inference/api_protos.d.ts +12 -12
  38. package/dist/inference/interruption/defaults.cjs +81 -0
  39. package/dist/inference/interruption/defaults.cjs.map +1 -0
  40. package/dist/inference/interruption/defaults.d.cts +19 -0
  41. package/dist/inference/interruption/defaults.d.ts +19 -0
  42. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  43. package/dist/inference/interruption/defaults.js +46 -0
  44. package/dist/inference/interruption/defaults.js.map +1 -0
  45. package/dist/inference/interruption/errors.cjs +44 -0
  46. package/dist/inference/interruption/errors.cjs.map +1 -0
  47. package/dist/inference/interruption/errors.d.cts +12 -0
  48. package/dist/inference/interruption/errors.d.ts +12 -0
  49. package/dist/inference/interruption/errors.d.ts.map +1 -0
  50. package/dist/inference/interruption/errors.js +20 -0
  51. package/dist/inference/interruption/errors.js.map +1 -0
  52. package/dist/inference/interruption/http_transport.cjs +147 -0
  53. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  54. package/dist/inference/interruption/http_transport.d.cts +63 -0
  55. package/dist/inference/interruption/http_transport.d.ts +63 -0
  56. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  57. package/dist/inference/interruption/http_transport.js +121 -0
  58. package/dist/inference/interruption/http_transport.js.map +1 -0
  59. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  60. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  61. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  62. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  63. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  64. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  65. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  66. package/dist/inference/interruption/interruption_detector.cjs +181 -0
  67. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  68. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  69. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  70. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  71. package/dist/inference/interruption/interruption_detector.js +147 -0
  72. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  73. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  74. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  75. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  76. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  77. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  78. package/dist/inference/interruption/interruption_stream.js +344 -0
  79. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  80. package/dist/inference/interruption/types.cjs +17 -0
  81. package/dist/inference/interruption/types.cjs.map +1 -0
  82. package/dist/inference/interruption/types.d.cts +66 -0
  83. package/dist/inference/interruption/types.d.ts +66 -0
  84. package/dist/inference/interruption/types.d.ts.map +1 -0
  85. package/dist/inference/interruption/types.js +1 -0
  86. package/dist/inference/interruption/types.js.map +1 -0
  87. package/dist/inference/interruption/utils.cjs +130 -0
  88. package/dist/inference/interruption/utils.cjs.map +1 -0
  89. package/dist/inference/interruption/utils.d.cts +41 -0
  90. package/dist/inference/interruption/utils.d.ts +41 -0
  91. package/dist/inference/interruption/utils.d.ts.map +1 -0
  92. package/dist/inference/interruption/utils.js +105 -0
  93. package/dist/inference/interruption/utils.js.map +1 -0
  94. package/dist/inference/interruption/utils.test.cjs +105 -0
  95. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  96. package/dist/inference/interruption/utils.test.js +104 -0
  97. package/dist/inference/interruption/utils.test.js.map +1 -0
  98. package/dist/inference/interruption/ws_transport.cjs +329 -0
  99. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  100. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  101. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  102. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  103. package/dist/inference/interruption/ws_transport.js +295 -0
  104. package/dist/inference/interruption/ws_transport.js.map +1 -0
  105. package/dist/inference/llm.cjs +14 -10
  106. package/dist/inference/llm.cjs.map +1 -1
  107. package/dist/inference/llm.d.cts +2 -1
  108. package/dist/inference/llm.d.ts +2 -1
  109. package/dist/inference/llm.d.ts.map +1 -1
  110. package/dist/inference/llm.js +8 -10
  111. package/dist/inference/llm.js.map +1 -1
  112. package/dist/inference/stt.cjs +7 -2
  113. package/dist/inference/stt.cjs.map +1 -1
  114. package/dist/inference/stt.d.cts +2 -0
  115. package/dist/inference/stt.d.ts +2 -0
  116. package/dist/inference/stt.d.ts.map +1 -1
  117. package/dist/inference/stt.js +8 -3
  118. package/dist/inference/stt.js.map +1 -1
  119. package/dist/inference/tts.cjs +7 -2
  120. package/dist/inference/tts.cjs.map +1 -1
  121. package/dist/inference/tts.d.cts +2 -0
  122. package/dist/inference/tts.d.ts +2 -0
  123. package/dist/inference/tts.d.ts.map +1 -1
  124. package/dist/inference/tts.js +8 -3
  125. package/dist/inference/tts.js.map +1 -1
  126. package/dist/inference/utils.cjs +26 -7
  127. package/dist/inference/utils.cjs.map +1 -1
  128. package/dist/inference/utils.d.cts +13 -0
  129. package/dist/inference/utils.d.ts +13 -0
  130. package/dist/inference/utils.d.ts.map +1 -1
  131. package/dist/inference/utils.js +18 -2
  132. package/dist/inference/utils.js.map +1 -1
  133. package/dist/llm/chat_context.cjs +108 -2
  134. package/dist/llm/chat_context.cjs.map +1 -1
  135. package/dist/llm/chat_context.d.cts +28 -1
  136. package/dist/llm/chat_context.d.ts +28 -1
  137. package/dist/llm/chat_context.d.ts.map +1 -1
  138. package/dist/llm/chat_context.js +108 -2
  139. package/dist/llm/chat_context.js.map +1 -1
  140. package/dist/llm/chat_context.test.cjs +43 -0
  141. package/dist/llm/chat_context.test.cjs.map +1 -1
  142. package/dist/llm/chat_context.test.js +43 -0
  143. package/dist/llm/chat_context.test.js.map +1 -1
  144. package/dist/llm/index.cjs +2 -0
  145. package/dist/llm/index.cjs.map +1 -1
  146. package/dist/llm/index.d.cts +2 -2
  147. package/dist/llm/index.d.ts +2 -2
  148. package/dist/llm/index.d.ts.map +1 -1
  149. package/dist/llm/index.js +3 -1
  150. package/dist/llm/index.js.map +1 -1
  151. package/dist/llm/llm.cjs +16 -1
  152. package/dist/llm/llm.cjs.map +1 -1
  153. package/dist/llm/llm.d.cts +9 -0
  154. package/dist/llm/llm.d.ts +9 -0
  155. package/dist/llm/llm.d.ts.map +1 -1
  156. package/dist/llm/llm.js +16 -1
  157. package/dist/llm/llm.js.map +1 -1
  158. package/dist/llm/provider_format/index.d.cts +1 -1
  159. package/dist/llm/provider_format/index.d.ts +1 -1
  160. package/dist/llm/realtime.cjs +3 -0
  161. package/dist/llm/realtime.cjs.map +1 -1
  162. package/dist/llm/realtime.d.cts +1 -0
  163. package/dist/llm/realtime.d.ts +1 -0
  164. package/dist/llm/realtime.d.ts.map +1 -1
  165. package/dist/llm/realtime.js +3 -0
  166. package/dist/llm/realtime.js.map +1 -1
  167. package/dist/llm/tool_context.cjs +7 -0
  168. package/dist/llm/tool_context.cjs.map +1 -1
  169. package/dist/llm/tool_context.d.cts +10 -2
  170. package/dist/llm/tool_context.d.ts +10 -2
  171. package/dist/llm/tool_context.d.ts.map +1 -1
  172. package/dist/llm/tool_context.js +6 -0
  173. package/dist/llm/tool_context.js.map +1 -1
  174. package/dist/metrics/base.cjs.map +1 -1
  175. package/dist/metrics/base.d.cts +45 -1
  176. package/dist/metrics/base.d.ts +45 -1
  177. package/dist/metrics/base.d.ts.map +1 -1
  178. package/dist/metrics/index.cjs +5 -0
  179. package/dist/metrics/index.cjs.map +1 -1
  180. package/dist/metrics/index.d.cts +2 -1
  181. package/dist/metrics/index.d.ts +2 -1
  182. package/dist/metrics/index.d.ts.map +1 -1
  183. package/dist/metrics/index.js +6 -0
  184. package/dist/metrics/index.js.map +1 -1
  185. package/dist/metrics/model_usage.cjs +189 -0
  186. package/dist/metrics/model_usage.cjs.map +1 -0
  187. package/dist/metrics/model_usage.d.cts +92 -0
  188. package/dist/metrics/model_usage.d.ts +92 -0
  189. package/dist/metrics/model_usage.d.ts.map +1 -0
  190. package/dist/metrics/model_usage.js +164 -0
  191. package/dist/metrics/model_usage.js.map +1 -0
  192. package/dist/metrics/model_usage.test.cjs +474 -0
  193. package/dist/metrics/model_usage.test.cjs.map +1 -0
  194. package/dist/metrics/model_usage.test.js +476 -0
  195. package/dist/metrics/model_usage.test.js.map +1 -0
  196. package/dist/metrics/usage_collector.cjs +3 -0
  197. package/dist/metrics/usage_collector.cjs.map +1 -1
  198. package/dist/metrics/usage_collector.d.cts +9 -0
  199. package/dist/metrics/usage_collector.d.ts +9 -0
  200. package/dist/metrics/usage_collector.d.ts.map +1 -1
  201. package/dist/metrics/usage_collector.js +3 -0
  202. package/dist/metrics/usage_collector.js.map +1 -1
  203. package/dist/metrics/utils.cjs +9 -0
  204. package/dist/metrics/utils.cjs.map +1 -1
  205. package/dist/metrics/utils.d.ts.map +1 -1
  206. package/dist/metrics/utils.js +9 -0
  207. package/dist/metrics/utils.js.map +1 -1
  208. package/dist/stream/multi_input_stream.test.cjs +4 -0
  209. package/dist/stream/multi_input_stream.test.cjs.map +1 -1
  210. package/dist/stream/multi_input_stream.test.js +5 -1
  211. package/dist/stream/multi_input_stream.test.js.map +1 -1
  212. package/dist/stream/stream_channel.cjs +31 -0
  213. package/dist/stream/stream_channel.cjs.map +1 -1
  214. package/dist/stream/stream_channel.d.cts +4 -2
  215. package/dist/stream/stream_channel.d.ts +4 -2
  216. package/dist/stream/stream_channel.d.ts.map +1 -1
  217. package/dist/stream/stream_channel.js +31 -0
  218. package/dist/stream/stream_channel.js.map +1 -1
  219. package/dist/stt/stt.cjs +34 -2
  220. package/dist/stt/stt.cjs.map +1 -1
  221. package/dist/stt/stt.d.cts +22 -0
  222. package/dist/stt/stt.d.ts +22 -0
  223. package/dist/stt/stt.d.ts.map +1 -1
  224. package/dist/stt/stt.js +34 -2
  225. package/dist/stt/stt.js.map +1 -1
  226. package/dist/telemetry/otel_http_exporter.cjs +24 -5
  227. package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
  228. package/dist/telemetry/otel_http_exporter.d.cts +1 -0
  229. package/dist/telemetry/otel_http_exporter.d.ts +1 -0
  230. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
  231. package/dist/telemetry/otel_http_exporter.js +24 -5
  232. package/dist/telemetry/otel_http_exporter.js.map +1 -1
  233. package/dist/telemetry/trace_types.cjs +5 -5
  234. package/dist/telemetry/trace_types.cjs.map +1 -1
  235. package/dist/telemetry/trace_types.d.cts +9 -5
  236. package/dist/telemetry/trace_types.d.ts +9 -5
  237. package/dist/telemetry/trace_types.d.ts.map +1 -1
  238. package/dist/telemetry/trace_types.js +5 -5
  239. package/dist/telemetry/trace_types.js.map +1 -1
  240. package/dist/telemetry/traces.cjs +47 -8
  241. package/dist/telemetry/traces.cjs.map +1 -1
  242. package/dist/telemetry/traces.d.ts.map +1 -1
  243. package/dist/telemetry/traces.js +47 -8
  244. package/dist/telemetry/traces.js.map +1 -1
  245. package/dist/tts/tts.cjs +64 -2
  246. package/dist/tts/tts.cjs.map +1 -1
  247. package/dist/tts/tts.d.cts +34 -0
  248. package/dist/tts/tts.d.ts +34 -0
  249. package/dist/tts/tts.d.ts.map +1 -1
  250. package/dist/tts/tts.js +64 -2
  251. package/dist/tts/tts.js.map +1 -1
  252. package/dist/utils.cjs +1 -0
  253. package/dist/utils.cjs.map +1 -1
  254. package/dist/utils.d.ts.map +1 -1
  255. package/dist/utils.js +1 -0
  256. package/dist/utils.js.map +1 -1
  257. package/dist/version.cjs +1 -1
  258. package/dist/version.js +1 -1
  259. package/dist/voice/agent.cjs +34 -4
  260. package/dist/voice/agent.cjs.map +1 -1
  261. package/dist/voice/agent.d.cts +11 -2
  262. package/dist/voice/agent.d.ts +11 -2
  263. package/dist/voice/agent.d.ts.map +1 -1
  264. package/dist/voice/agent.js +34 -4
  265. package/dist/voice/agent.js.map +1 -1
  266. package/dist/voice/agent_activity.cjs +292 -44
  267. package/dist/voice/agent_activity.cjs.map +1 -1
  268. package/dist/voice/agent_activity.d.cts +27 -6
  269. package/dist/voice/agent_activity.d.ts +27 -6
  270. package/dist/voice/agent_activity.d.ts.map +1 -1
  271. package/dist/voice/agent_activity.js +293 -45
  272. package/dist/voice/agent_activity.js.map +1 -1
  273. package/dist/voice/agent_session.cjs +105 -48
  274. package/dist/voice/agent_session.cjs.map +1 -1
  275. package/dist/voice/agent_session.d.cts +90 -20
  276. package/dist/voice/agent_session.d.ts +90 -20
  277. package/dist/voice/agent_session.d.ts.map +1 -1
  278. package/dist/voice/agent_session.js +105 -46
  279. package/dist/voice/agent_session.js.map +1 -1
  280. package/dist/voice/audio_recognition.cjs +287 -6
  281. package/dist/voice/audio_recognition.cjs.map +1 -1
  282. package/dist/voice/audio_recognition.d.cts +42 -3
  283. package/dist/voice/audio_recognition.d.ts +42 -3
  284. package/dist/voice/audio_recognition.d.ts.map +1 -1
  285. package/dist/voice/audio_recognition.js +289 -7
  286. package/dist/voice/audio_recognition.js.map +1 -1
  287. package/dist/voice/client_events.cjs +554 -0
  288. package/dist/voice/client_events.cjs.map +1 -0
  289. package/dist/voice/client_events.d.cts +195 -0
  290. package/dist/voice/client_events.d.ts +195 -0
  291. package/dist/voice/client_events.d.ts.map +1 -0
  292. package/dist/voice/client_events.js +548 -0
  293. package/dist/voice/client_events.js.map +1 -0
  294. package/dist/voice/events.cjs +1 -0
  295. package/dist/voice/events.cjs.map +1 -1
  296. package/dist/voice/events.d.cts +8 -5
  297. package/dist/voice/events.d.ts +8 -5
  298. package/dist/voice/events.d.ts.map +1 -1
  299. package/dist/voice/events.js +1 -0
  300. package/dist/voice/events.js.map +1 -1
  301. package/dist/voice/generation.cjs +43 -8
  302. package/dist/voice/generation.cjs.map +1 -1
  303. package/dist/voice/generation.d.cts +3 -3
  304. package/dist/voice/generation.d.ts +3 -3
  305. package/dist/voice/generation.d.ts.map +1 -1
  306. package/dist/voice/generation.js +43 -8
  307. package/dist/voice/generation.js.map +1 -1
  308. package/dist/voice/index.cjs +1 -0
  309. package/dist/voice/index.cjs.map +1 -1
  310. package/dist/voice/index.d.cts +1 -0
  311. package/dist/voice/index.d.ts +1 -0
  312. package/dist/voice/index.d.ts.map +1 -1
  313. package/dist/voice/index.js +1 -0
  314. package/dist/voice/index.js.map +1 -1
  315. package/dist/voice/report.cjs +20 -8
  316. package/dist/voice/report.cjs.map +1 -1
  317. package/dist/voice/report.d.cts +5 -0
  318. package/dist/voice/report.d.ts +5 -0
  319. package/dist/voice/report.d.ts.map +1 -1
  320. package/dist/voice/report.js +20 -8
  321. package/dist/voice/report.js.map +1 -1
  322. package/dist/voice/report.test.cjs +106 -0
  323. package/dist/voice/report.test.cjs.map +1 -0
  324. package/dist/voice/report.test.js +105 -0
  325. package/dist/voice/report.test.js.map +1 -0
  326. package/dist/voice/room_io/room_io.cjs +16 -41
  327. package/dist/voice/room_io/room_io.cjs.map +1 -1
  328. package/dist/voice/room_io/room_io.d.cts +4 -9
  329. package/dist/voice/room_io/room_io.d.ts +4 -9
  330. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  331. package/dist/voice/room_io/room_io.js +17 -43
  332. package/dist/voice/room_io/room_io.js.map +1 -1
  333. package/dist/voice/testing/fake_llm.cjs +127 -0
  334. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  335. package/dist/voice/testing/fake_llm.d.cts +30 -0
  336. package/dist/voice/testing/fake_llm.d.ts +30 -0
  337. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  338. package/dist/voice/testing/fake_llm.js +103 -0
  339. package/dist/voice/testing/fake_llm.js.map +1 -0
  340. package/dist/voice/testing/index.cjs +3 -0
  341. package/dist/voice/testing/index.cjs.map +1 -1
  342. package/dist/voice/testing/index.d.cts +1 -0
  343. package/dist/voice/testing/index.d.ts +1 -0
  344. package/dist/voice/testing/index.d.ts.map +1 -1
  345. package/dist/voice/testing/index.js +2 -0
  346. package/dist/voice/testing/index.js.map +1 -1
  347. package/dist/voice/turn_config/endpointing.cjs +33 -0
  348. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  349. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  350. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  351. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  352. package/dist/voice/turn_config/endpointing.js +9 -0
  353. package/dist/voice/turn_config/endpointing.js.map +1 -0
  354. package/dist/voice/turn_config/interruption.cjs +37 -0
  355. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  356. package/dist/voice/turn_config/interruption.d.cts +53 -0
  357. package/dist/voice/turn_config/interruption.d.ts +53 -0
  358. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  359. package/dist/voice/turn_config/interruption.js +13 -0
  360. package/dist/voice/turn_config/interruption.js.map +1 -0
  361. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  362. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  363. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  364. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  365. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  366. package/dist/voice/turn_config/turn_handling.js +11 -0
  367. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  368. package/dist/voice/turn_config/utils.cjs +97 -0
  369. package/dist/voice/turn_config/utils.cjs.map +1 -0
  370. package/dist/voice/turn_config/utils.d.cts +25 -0
  371. package/dist/voice/turn_config/utils.d.ts +25 -0
  372. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  373. package/dist/voice/turn_config/utils.js +73 -0
  374. package/dist/voice/turn_config/utils.js.map +1 -0
  375. package/dist/voice/turn_config/utils.test.cjs +86 -0
  376. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  377. package/dist/voice/turn_config/utils.test.js +85 -0
  378. package/dist/voice/turn_config/utils.test.js.map +1 -0
  379. package/dist/voice/wire_format.cjs +798 -0
  380. package/dist/voice/wire_format.cjs.map +1 -0
  381. package/dist/voice/wire_format.d.cts +5503 -0
  382. package/dist/voice/wire_format.d.ts +5503 -0
  383. package/dist/voice/wire_format.d.ts.map +1 -0
  384. package/dist/voice/wire_format.js +728 -0
  385. package/dist/voice/wire_format.js.map +1 -0
  386. package/package.json +2 -1
  387. package/src/beta/index.ts +9 -0
  388. package/src/beta/workflows/index.ts +9 -0
  389. package/src/beta/workflows/task_group.ts +194 -0
  390. package/src/constants.ts +13 -0
  391. package/src/index.ts +2 -1
  392. package/src/inference/interruption/defaults.ts +51 -0
  393. package/src/inference/interruption/errors.ts +25 -0
  394. package/src/inference/interruption/http_transport.ts +187 -0
  395. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  396. package/src/inference/interruption/interruption_detector.ts +188 -0
  397. package/src/inference/interruption/interruption_stream.ts +467 -0
  398. package/src/inference/interruption/types.ts +84 -0
  399. package/src/inference/interruption/utils.test.ts +132 -0
  400. package/src/inference/interruption/utils.ts +137 -0
  401. package/src/inference/interruption/ws_transport.ts +402 -0
  402. package/src/inference/llm.ts +9 -12
  403. package/src/inference/stt.ts +10 -3
  404. package/src/inference/tts.ts +10 -3
  405. package/src/inference/utils.ts +29 -1
  406. package/src/llm/chat_context.test.ts +48 -0
  407. package/src/llm/chat_context.ts +161 -0
  408. package/src/llm/index.ts +2 -0
  409. package/src/llm/llm.ts +16 -0
  410. package/src/llm/realtime.ts +4 -0
  411. package/src/llm/tool_context.ts +14 -0
  412. package/src/metrics/base.ts +48 -1
  413. package/src/metrics/index.ts +11 -0
  414. package/src/metrics/model_usage.test.ts +545 -0
  415. package/src/metrics/model_usage.ts +262 -0
  416. package/src/metrics/usage_collector.ts +11 -0
  417. package/src/metrics/utils.ts +11 -0
  418. package/src/stream/multi_input_stream.test.ts +6 -1
  419. package/src/stream/stream_channel.ts +34 -2
  420. package/src/stt/stt.ts +38 -0
  421. package/src/telemetry/otel_http_exporter.ts +28 -5
  422. package/src/telemetry/trace_types.ts +11 -8
  423. package/src/telemetry/traces.ts +111 -54
  424. package/src/tts/tts.ts +69 -1
  425. package/src/utils.ts +5 -0
  426. package/src/voice/agent.ts +41 -3
  427. package/src/voice/agent_activity.ts +371 -34
  428. package/src/voice/agent_session.ts +207 -59
  429. package/src/voice/audio_recognition.ts +385 -9
  430. package/src/voice/client_events.ts +838 -0
  431. package/src/voice/events.ts +14 -4
  432. package/src/voice/generation.ts +52 -9
  433. package/src/voice/index.ts +1 -0
  434. package/src/voice/report.test.ts +117 -0
  435. package/src/voice/report.ts +29 -6
  436. package/src/voice/room_io/room_io.ts +21 -64
  437. package/src/voice/testing/fake_llm.ts +138 -0
  438. package/src/voice/testing/index.ts +2 -0
  439. package/src/voice/turn_config/endpointing.ts +33 -0
  440. package/src/voice/turn_config/interruption.ts +56 -0
  441. package/src/voice/turn_config/turn_handling.ts +45 -0
  442. package/src/voice/turn_config/utils.test.ts +100 -0
  443. package/src/voice/turn_config/utils.ts +103 -0
  444. package/src/voice/wire_format.ts +827 -0
@@ -2,11 +2,13 @@ import { Mutex } from "@livekit/mutex";
2
2
  import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
3
3
  import { Heap } from "heap-js";
4
4
  import { AsyncLocalStorage } from "node:async_hooks";
5
- import { ReadableStream } from "node:stream/web";
5
+ import { ReadableStream, TransformStream } from "node:stream/web";
6
+ import { AdaptiveInterruptionDetector } from "../inference/interruption/interruption_detector.js";
6
7
  import { ChatMessage } from "../llm/chat_context.js";
7
8
  import {
8
9
  LLM,
9
- RealtimeModel
10
+ RealtimeModel,
11
+ ToolFlag
10
12
  } from "../llm/index.js";
11
13
  import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
12
14
  import { log } from "../log.js";
@@ -48,6 +50,7 @@ import {
48
50
  import { SpeechHandle } from "./speech_handle.js";
49
51
  import { setParticipantSpanAttributes } from "./utils.js";
50
52
  const agentActivityStorage = new AsyncLocalStorage();
53
+ const onEnterStorage = new AsyncLocalStorage();
51
54
  class AgentActivity {
52
55
  agent;
53
56
  agentSession;
@@ -72,16 +75,34 @@ class AgentActivity {
72
75
  // default to null as None, which maps to the default provider tool choice value
73
76
  toolChoice = null;
74
77
  _preemptiveGeneration;
75
- /** @internal */
76
- _mainTask;
77
- _onEnterTask;
78
- _onExitTask;
79
- _userTurnCompletedTask;
78
+ interruptionDetector;
79
+ isInterruptionDetectionEnabled;
80
+ isInterruptionByAudioActivityEnabled;
81
+ isDefaultInterruptionByAudioActivityEnabled;
80
82
  onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
81
83
  onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
82
84
  onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
83
85
  onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
84
86
  onModelError = (ev) => this.onError(ev);
87
+ onInterruptionOverlappingSpeech = (ev) => {
88
+ this.agentSession.emit(AgentSessionEventTypes.UserOverlappingSpeech, ev);
89
+ };
90
+ onInterruptionMetricsCollected = (ev) => {
91
+ this.agentSession.emit(
92
+ AgentSessionEventTypes.MetricsCollected,
93
+ createMetricsCollectedEvent({ metrics: ev })
94
+ );
95
+ };
96
+ onInterruptionError = (ev) => {
97
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
98
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
99
+ this.agentSession._onError(ev);
100
+ };
101
+ /** @internal */
102
+ _mainTask;
103
+ _onEnterTask;
104
+ _onExitTask;
105
+ _userTurnCompletedTask;
85
106
  constructor(agent, agentSession) {
86
107
  this.agent = agent;
87
108
  this.agentSession = agentSession;
@@ -140,6 +161,10 @@ class AgentActivity {
140
161
  "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
141
162
  );
142
163
  }
164
+ this.interruptionDetector = this.resolveInterruptionDetector();
165
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
166
+ this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
167
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
143
168
  }
144
169
  async start() {
145
170
  const unlock = await this.lock.lock();
@@ -232,8 +257,9 @@ class AgentActivity {
232
257
  vad: this.vad,
233
258
  turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
234
259
  turnDetectionMode: this.turnDetectionMode,
235
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
236
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
260
+ interruptionDetection: this.interruptionDetector,
261
+ minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
262
+ maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
237
263
  rootSpanContext: this.agentSession.rootSpanContext,
238
264
  sttModel: (_a = this.stt) == null ? void 0 : _a.label,
239
265
  sttProvider: this.getSttProvider(),
@@ -247,11 +273,14 @@ class AgentActivity {
247
273
  this._resumeSchedulingTask();
248
274
  if (runOnEnter) {
249
275
  this._onEnterTask = this.createSpeechTask({
250
- taskFn: () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
251
- name: "on_enter",
252
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
253
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
254
- }),
276
+ taskFn: () => onEnterStorage.run(
277
+ { session: this.agentSession, agent: this.agent },
278
+ () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
279
+ name: "on_enter",
280
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
281
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
282
+ })
283
+ ),
255
284
  inlineTask: true,
256
285
  name: "AgentActivity_onEnter"
257
286
  });
@@ -292,7 +321,8 @@ class AgentActivity {
292
321
  return this.realtimeSession;
293
322
  }
294
323
  get allowInterruptions() {
295
- return this.agentSession.options.allowInterruptions;
324
+ var _a;
325
+ return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
296
326
  }
297
327
  get useTtsAlignedTranscript() {
298
328
  return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
@@ -303,6 +333,11 @@ class AgentActivity {
303
333
  get toolCtx() {
304
334
  return this.agent.toolCtx;
305
335
  }
336
+ /** @internal */
337
+ get inputStartedAt() {
338
+ var _a;
339
+ return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
340
+ }
306
341
  async updateChatCtx(chatCtx) {
307
342
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
308
343
  this.agent._chatCtx = chatCtx;
@@ -317,19 +352,50 @@ class AgentActivity {
317
352
  });
318
353
  }
319
354
  }
320
- updateOptions({ toolChoice }) {
355
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
356
+ async updateTools(tools) {
357
+ this.agent._tools = { ...tools };
358
+ if (this.realtimeSession) {
359
+ await this.realtimeSession.updateTools(tools);
360
+ }
361
+ if (this.llm instanceof LLM) {
362
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
363
+ }
364
+ }
365
+ updateOptions({
366
+ toolChoice,
367
+ turnDetection
368
+ }) {
321
369
  if (toolChoice !== void 0) {
322
370
  this.toolChoice = toolChoice;
323
371
  }
324
372
  if (this.realtimeSession) {
325
373
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
326
374
  }
375
+ if (turnDetection !== void 0) {
376
+ this.turnDetectionMode = turnDetection;
377
+ this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
378
+ if (this.agentSession.agentState !== "speaking") {
379
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
380
+ }
381
+ }
382
+ if (this.audioRecognition) {
383
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
384
+ }
327
385
  }
328
386
  attachAudioInput(audioStream) {
329
387
  void this.audioStream.close();
330
388
  this.audioStream = new MultiInputStream();
389
+ const aecWarmupAudioFilter = new TransformStream({
390
+ transform: (frame, controller) => {
391
+ const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
392
+ if (!shouldDiscardForAecWarmup) {
393
+ controller.enqueue(frame);
394
+ }
395
+ }
396
+ });
331
397
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
332
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
398
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
333
399
  if (this.realtimeSession) {
334
400
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
335
401
  }
@@ -435,6 +501,13 @@ class AgentActivity {
435
501
  this.logger.info("onInputSpeechStarted");
436
502
  if (!this.vad) {
437
503
  this.agentSession._updateUserState("speaking");
504
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
505
+ this.audioRecognition.onStartOfOverlapSpeech(
506
+ 0,
507
+ Date.now(),
508
+ this.agentSession._userSpeakingSpan
509
+ );
510
+ }
438
511
  }
439
512
  try {
440
513
  this.interrupt();
@@ -448,6 +521,9 @@ class AgentActivity {
448
521
  onInputSpeechStopped(ev) {
449
522
  this.logger.info(ev, "onInputSpeechStopped");
450
523
  if (!this.vad) {
524
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
525
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
526
+ }
451
527
  this.agentSession._updateUserState("listening");
452
528
  }
453
529
  if (ev.userTranscriptionEnabled) {
@@ -509,48 +585,75 @@ class AgentActivity {
509
585
  onStartOfSpeech(ev) {
510
586
  let speechStartTime = Date.now();
511
587
  if (ev) {
512
- speechStartTime = speechStartTime - ev.speechDuration;
588
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
513
589
  }
514
590
  this.agentSession._updateUserState("speaking", speechStartTime);
591
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
592
+ this.audioRecognition.onStartOfOverlapSpeech(
593
+ ev.speechDuration,
594
+ speechStartTime,
595
+ this.agentSession._userSpeakingSpan
596
+ );
597
+ }
515
598
  }
516
599
  onEndOfSpeech(ev) {
517
600
  let speechEndTime = Date.now();
518
601
  if (ev) {
519
- speechEndTime = speechEndTime - ev.silenceDuration;
602
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
603
+ }
604
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
605
+ this.audioRecognition.onEndOfOverlapSpeech(
606
+ speechEndTime,
607
+ this.agentSession._userSpeakingSpan
608
+ );
520
609
  }
521
610
  this.agentSession._updateUserState("listening", speechEndTime);
522
611
  }
523
612
  onVADInferenceDone(ev) {
613
+ var _a;
524
614
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
525
615
  return;
526
616
  }
527
- if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
617
+ if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
528
618
  this.interruptByAudioActivity();
529
619
  }
530
620
  }
531
621
  interruptByAudioActivity() {
532
- var _a, _b;
622
+ var _a, _b, _c, _d;
623
+ if (!this.isInterruptionByAudioActivityEnabled) {
624
+ return;
625
+ }
626
+ if (this.agentSession._aecWarmupRemaining > 0) {
627
+ return;
628
+ }
533
629
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
534
630
  return;
535
631
  }
536
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
632
+ if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
537
633
  const text = this.audioRecognition.currentTranscript;
538
634
  const normalizedText = text ?? "";
539
635
  const wordCount = splitWords(normalizedText, true).length;
540
- if (wordCount < this.agentSession.options.minInterruptionWords) {
636
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
541
637
  return;
542
638
  }
543
639
  }
544
- (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
640
+ (_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
545
641
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
546
642
  this.logger.info(
547
643
  { "speech id": this._currentSpeech.id },
548
644
  "speech interrupted by audio activity"
549
645
  );
550
- (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
646
+ (_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
551
647
  this._currentSpeech.interrupt();
552
648
  }
553
649
  }
650
+ onInterruption(ev) {
651
+ this.restoreInterruptionByAudioActivity();
652
+ this.interruptByAudioActivity();
653
+ if (this.audioRecognition) {
654
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
655
+ }
656
+ }
554
657
  onInterimTranscript(ev) {
555
658
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
556
659
  return;
@@ -599,7 +702,8 @@ class AgentActivity {
599
702
  );
600
703
  const userMessage = ChatMessage.create({
601
704
  role: "user",
602
- content: info.newTranscript
705
+ content: info.newTranscript,
706
+ transcriptConfidence: info.transcriptConfidence
603
707
  });
604
708
  const chatCtx = this.agent.chatCtx.copy();
605
709
  const speechHandle = this.generateReply({
@@ -657,6 +761,7 @@ class AgentActivity {
657
761
  return task;
658
762
  }
659
763
  async onEndOfTurn(info) {
764
+ var _a, _b;
660
765
  if (this.schedulingPaused) {
661
766
  this.cancelPreemptiveGeneration();
662
767
  this.logger.warn(
@@ -665,14 +770,14 @@ class AgentActivity {
665
770
  );
666
771
  return true;
667
772
  }
668
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
773
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
669
774
  const wordCount = splitWords(info.newTranscript, true).length;
670
- if (wordCount < this.agentSession.options.minInterruptionWords) {
775
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
671
776
  this.cancelPreemptiveGeneration();
672
777
  this.logger.info(
673
778
  {
674
779
  wordCount,
675
- minInterruptionWords: this.agentSession.options.minInterruptionWords
780
+ minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
676
781
  },
677
782
  "skipping user input, word count below minimum interruption threshold"
678
783
  );
@@ -808,11 +913,18 @@ class AgentActivity {
808
913
  instructions = `${this.agent.instructions}
809
914
  ${instructions}`;
810
915
  }
916
+ const onEnterData = onEnterStorage.getStore();
917
+ const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
918
+ const tools = shouldFilterTools ? Object.fromEntries(
919
+ Object.entries(this.agent.toolCtx).filter(
920
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER)
921
+ )
922
+ ) : this.agent.toolCtx;
811
923
  const task = this.createSpeechTask({
812
924
  taskFn: (abortController) => this.pipelineReplyTask(
813
925
  handle,
814
926
  chatCtx ?? this.agent.chatCtx,
815
- this.agent.toolCtx,
927
+ tools,
816
928
  {
817
929
  toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
818
930
  },
@@ -884,7 +996,8 @@ ${instructions}`;
884
996
  }
885
997
  let userMessage = ChatMessage.create({
886
998
  role: "user",
887
- content: info.newTranscript
999
+ content: info.newTranscript,
1000
+ transcriptConfidence: info.transcriptConfidence
888
1001
  });
889
1002
  const chatCtx = this.agent.chatCtx.copy();
890
1003
  const startTime = Date.now();
@@ -902,11 +1015,32 @@ ${instructions}`;
902
1015
  } else if (this.llm === void 0) {
903
1016
  return;
904
1017
  }
1018
+ const userMetricsReport = {};
1019
+ if (info.startedSpeakingAt !== void 0) {
1020
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
1021
+ }
1022
+ if (info.stoppedSpeakingAt !== void 0) {
1023
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
1024
+ }
1025
+ if (info.transcriptionDelay !== void 0) {
1026
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
1027
+ }
1028
+ if (info.endOfUtteranceDelay !== void 0) {
1029
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
1030
+ }
1031
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
1032
+ if (userMessage) {
1033
+ userMessage.metrics = userMetricsReport;
1034
+ }
905
1035
  let speechHandle;
906
1036
  if (this._preemptiveGeneration !== void 0) {
907
1037
  const preemptive = this._preemptiveGeneration;
908
1038
  if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && isSameToolContext(preemptive.tools, this.tools) && isSameToolChoice(preemptive.toolChoice, this.toolChoice)) {
909
1039
  speechHandle = preemptive.speechHandle;
1040
+ if (preemptive.userMessage && userMessage) {
1041
+ preemptive.userMessage.metrics = userMetricsReport;
1042
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1043
+ }
910
1044
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
911
1045
  this.logger.debug(
912
1046
  {
@@ -940,6 +1074,7 @@ ${instructions}`;
940
1074
  );
941
1075
  }
942
1076
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
1077
+ var _a, _b;
943
1078
  speechHandle._agentTurnContext = otelContext.active();
944
1079
  speechHandleStorage.enterWith(speechHandle);
945
1080
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
@@ -972,11 +1107,18 @@ ${instructions}`;
972
1107
  textOut = _textOut;
973
1108
  tasks.push(textForwardTask);
974
1109
  }
1110
+ let replyStartedSpeakingAt;
1111
+ let replyTtsGenData = null;
975
1112
  const onFirstFrame = (startedSpeakingAt) => {
1113
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
976
1114
  this.agentSession._updateAgentState("speaking", {
977
1115
  startTime: startedSpeakingAt,
978
1116
  otelContext: speechHandle._agentTurnContext
979
1117
  });
1118
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1119
+ this.audioRecognition.onStartOfAgentSpeech();
1120
+ this.isInterruptionByAudioActivityEnabled = false;
1121
+ }
980
1122
  };
981
1123
  if (!audioOutput) {
982
1124
  if (textOut) {
@@ -989,9 +1131,12 @@ ${instructions}`;
989
1131
  (...args) => this.agent.ttsNode(...args),
990
1132
  audioSource,
991
1133
  modelSettings,
992
- replyAbortController
1134
+ replyAbortController,
1135
+ (_a = this.tts) == null ? void 0 : _a.model,
1136
+ (_b = this.tts) == null ? void 0 : _b.provider
993
1137
  );
994
1138
  tasks.push(ttsTask);
1139
+ replyTtsGenData = ttsGenData;
995
1140
  const [forwardTask, _audioOut] = performAudioForwarding(
996
1141
  ttsGenData.audioStream,
997
1142
  audioOutput,
@@ -1023,16 +1168,30 @@ ${instructions}`;
1023
1168
  }
1024
1169
  }
1025
1170
  if (addToChatCtx) {
1171
+ const replyStoppedSpeakingAt = Date.now();
1172
+ const replyAssistantMetrics = {};
1173
+ if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
1174
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1175
+ }
1176
+ if (replyStartedSpeakingAt !== void 0) {
1177
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
1178
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
1179
+ }
1026
1180
  const message = ChatMessage.create({
1027
1181
  role: "assistant",
1028
1182
  content: (textOut == null ? void 0 : textOut.text) || "",
1029
- interrupted: speechHandle.interrupted
1183
+ interrupted: speechHandle.interrupted,
1184
+ metrics: replyAssistantMetrics
1030
1185
  });
1031
1186
  this.agent._chatCtx.insert(message);
1032
1187
  this.agentSession._conversationItemAdded(message);
1033
1188
  }
1034
1189
  if (this.agentSession.agentState === "speaking") {
1035
1190
  this.agentSession._updateAgentState("listening");
1191
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1192
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1193
+ }
1194
+ this.restoreInterruptionByAudioActivity();
1036
1195
  }
1037
1196
  }
1038
1197
  _pipelineReplyTaskImpl = async ({
@@ -1044,9 +1203,10 @@ ${instructions}`;
1044
1203
  instructions,
1045
1204
  newMessage,
1046
1205
  toolsMessages,
1047
- span
1206
+ span,
1207
+ _previousUserMetrics
1048
1208
  }) => {
1049
- var _a, _b;
1209
+ var _a, _b, _c, _d, _e, _f;
1050
1210
  speechHandle._agentTurnContext = otelContext.active();
1051
1211
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1052
1212
  if (instructions) {
@@ -1084,7 +1244,9 @@ ${instructions}`;
1084
1244
  chatCtx,
1085
1245
  toolCtx,
1086
1246
  modelSettings,
1087
- replyAbortController
1247
+ replyAbortController,
1248
+ (_b = this.llm) == null ? void 0 : _b.model,
1249
+ (_c = this.llm) == null ? void 0 : _c.provider
1088
1250
  );
1089
1251
  tasks.push(llmTask);
1090
1252
  let ttsTask = null;
@@ -1097,16 +1259,20 @@ ${instructions}`;
1097
1259
  (...args) => this.agent.ttsNode(...args),
1098
1260
  ttsTextInput,
1099
1261
  modelSettings,
1100
- replyAbortController
1262
+ replyAbortController,
1263
+ (_d = this.tts) == null ? void 0 : _d.model,
1264
+ (_e = this.tts) == null ? void 0 : _e.provider
1101
1265
  );
1102
1266
  tasks.push(ttsTask);
1103
1267
  } else {
1104
1268
  llmOutput = llmGenData.textStream;
1105
1269
  }
1106
1270
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1271
+ let userMetrics = _previousUserMetrics;
1107
1272
  if (newMessage && speechHandle.scheduled) {
1108
1273
  this.agent._chatCtx.insert(newMessage);
1109
1274
  this.agentSession._conversationItemAdded(newMessage);
1275
+ userMetrics = newMessage.metrics;
1110
1276
  }
1111
1277
  if (speechHandle.interrupted) {
1112
1278
  replyAbortController.abort();
@@ -1118,7 +1284,7 @@ ${instructions}`;
1118
1284
  speechHandle._clearAuthorization();
1119
1285
  const replyStartedAt = Date.now();
1120
1286
  let transcriptionInput = llmOutput;
1121
- if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
1287
+ if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
1122
1288
  const timedTextsStream = await Promise.race([
1123
1289
  ttsGenData.timedTextsFut.await,
1124
1290
  (ttsTask == null ? void 0 : ttsTask.result.catch(
@@ -1141,11 +1307,17 @@ ${instructions}`;
1141
1307
  tasks.push(textForwardTask);
1142
1308
  textOut = _textOut;
1143
1309
  }
1310
+ let agentStartedSpeakingAt;
1144
1311
  const onFirstFrame = (startedSpeakingAt) => {
1312
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1145
1313
  this.agentSession._updateAgentState("speaking", {
1146
1314
  startTime: startedSpeakingAt,
1147
1315
  otelContext: speechHandle._agentTurnContext
1148
1316
  });
1317
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1318
+ this.audioRecognition.onStartOfAgentSpeech();
1319
+ this.isInterruptionByAudioActivityEnabled = false;
1320
+ }
1149
1321
  };
1150
1322
  let audioOut = null;
1151
1323
  if (audioOutput) {
@@ -1188,6 +1360,25 @@ ${instructions}`;
1188
1360
  if (audioOutput) {
1189
1361
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1190
1362
  }
1363
+ const agentStoppedSpeakingAt = Date.now();
1364
+ const assistantMetrics = {};
1365
+ if (llmGenData.ttft !== void 0) {
1366
+ assistantMetrics.llmNodeTtft = llmGenData.ttft;
1367
+ }
1368
+ if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
1369
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
1370
+ }
1371
+ if (agentStartedSpeakingAt !== void 0) {
1372
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
1373
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
1374
+ if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
1375
+ const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
1376
+ assistantMetrics.e2eLatency = e2eLatency;
1377
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1378
+ }
1379
+ }
1380
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1381
+ let hasSpeechMessage = false;
1191
1382
  if (toolsMessages) {
1192
1383
  for (const msg of toolsMessages) {
1193
1384
  msg.createdAt = replyStartedAt;
@@ -1228,20 +1419,27 @@ ${instructions}`;
1228
1419
  }
1229
1420
  }
1230
1421
  if (forwardedText) {
1422
+ hasSpeechMessage = true;
1231
1423
  const message = ChatMessage.create({
1232
1424
  role: "assistant",
1233
1425
  content: forwardedText,
1234
1426
  id: llmGenData.id,
1235
1427
  interrupted: true,
1236
- createdAt: replyStartedAt
1428
+ createdAt: replyStartedAt,
1429
+ metrics: assistantMetrics
1237
1430
  });
1238
1431
  chatCtx.insert(message);
1239
1432
  this.agent._chatCtx.insert(message);
1240
1433
  speechHandle._itemAdded([message]);
1241
1434
  this.agentSession._conversationItemAdded(message);
1435
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1242
1436
  }
1243
1437
  if (this.agentSession.agentState === "speaking") {
1244
1438
  this.agentSession._updateAgentState("listening");
1439
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1440
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1441
+ this.restoreInterruptionByAudioActivity();
1442
+ }
1245
1443
  }
1246
1444
  this.logger.info(
1247
1445
  { speech_id: speechHandle.id, message: forwardedText },
@@ -1252,17 +1450,20 @@ ${instructions}`;
1252
1450
  return;
1253
1451
  }
1254
1452
  if (textOut && textOut.text) {
1453
+ hasSpeechMessage = true;
1255
1454
  const message = ChatMessage.create({
1256
1455
  role: "assistant",
1257
1456
  id: llmGenData.id,
1258
1457
  interrupted: false,
1259
1458
  createdAt: replyStartedAt,
1260
- content: textOut.text
1459
+ content: textOut.text,
1460
+ metrics: assistantMetrics
1261
1461
  });
1262
1462
  chatCtx.insert(message);
1263
1463
  this.agent._chatCtx.insert(message);
1264
1464
  speechHandle._itemAdded([message]);
1265
1465
  this.agentSession._conversationItemAdded(message);
1466
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1266
1467
  this.logger.info(
1267
1468
  { speech_id: speechHandle.id, message: textOut.text },
1268
1469
  "playout completed without interruption"
@@ -1272,6 +1473,12 @@ ${instructions}`;
1272
1473
  this.agentSession._updateAgentState("thinking");
1273
1474
  } else if (this.agentSession.agentState === "speaking") {
1274
1475
  this.agentSession._updateAgentState("listening");
1476
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1477
+ {
1478
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1479
+ this.restoreInterruptionByAudioActivity();
1480
+ }
1481
+ }
1275
1482
  }
1276
1483
  speechHandle._markGenerationDone();
1277
1484
  await executeToolsTask.result;
@@ -1311,7 +1518,8 @@ ${instructions}`;
1311
1518
  replyAbortController,
1312
1519
  instructions,
1313
1520
  void 0,
1314
- toolMessages
1521
+ toolMessages,
1522
+ hasSpeechMessage ? void 0 : userMetrics
1315
1523
  ),
1316
1524
  ownedSpeechHandle: speechHandle,
1317
1525
  name: "AgentActivity.pipelineReply"
@@ -1331,7 +1539,7 @@ ${instructions}`;
1331
1539
  }
1332
1540
  }
1333
1541
  };
1334
- pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
1542
+ pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => tracer.startActiveSpan(
1335
1543
  async (span) => this._pipelineReplyTaskImpl({
1336
1544
  speechHandle,
1337
1545
  chatCtx,
@@ -1341,7 +1549,8 @@ ${instructions}`;
1341
1549
  instructions,
1342
1550
  newMessage,
1343
1551
  toolsMessages,
1344
- span
1552
+ span,
1553
+ _previousUserMetrics
1345
1554
  }),
1346
1555
  {
1347
1556
  name: "agent_turn",
@@ -1407,6 +1616,7 @@ ${instructions}`;
1407
1616
  });
1408
1617
  };
1409
1618
  const readMessages = async (abortController, outputs) => {
1619
+ var _a2, _b;
1410
1620
  replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
1411
1621
  once: true
1412
1622
  });
@@ -1453,7 +1663,9 @@ ${instructions}`;
1453
1663
  (...args) => this.agent.ttsNode(...args),
1454
1664
  ttsTextInput,
1455
1665
  modelSettings,
1456
- abortController
1666
+ abortController,
1667
+ (_a2 = this.tts) == null ? void 0 : _a2.model,
1668
+ (_b = this.tts) == null ? void 0 : _b.provider
1457
1669
  );
1458
1670
  tasks.push(ttsTask);
1459
1671
  realtimeAudioResult = ttsGenData.audioStream;
@@ -1845,11 +2057,46 @@ ${instructions}`;
1845
2057
  if (this._mainTask) {
1846
2058
  await this._mainTask.cancelAndWait();
1847
2059
  }
2060
+ if (this.interruptionDetector) {
2061
+ this.interruptionDetector.off(
2062
+ "user_overlapping_speech",
2063
+ this.onInterruptionOverlappingSpeech
2064
+ );
2065
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2066
+ this.interruptionDetector.off("error", this.onInterruptionError);
2067
+ }
1848
2068
  this.agent._agentActivity = void 0;
1849
2069
  } finally {
1850
2070
  unlock();
1851
2071
  }
1852
2072
  }
2073
+ resolveInterruptionDetector() {
2074
+ const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2075
+ if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof RealtimeModel))) {
2076
+ if (interruptionDetection === "adaptive") {
2077
+ this.logger.warn(
2078
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
2079
+ );
2080
+ return void 0;
2081
+ }
2082
+ }
2083
+ if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
2084
+ return void 0;
2085
+ }
2086
+ try {
2087
+ const detector = new AdaptiveInterruptionDetector();
2088
+ detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
2089
+ detector.on("metrics_collected", this.onInterruptionMetricsCollected);
2090
+ detector.on("error", this.onInterruptionError);
2091
+ return detector;
2092
+ } catch (error) {
2093
+ this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
2094
+ }
2095
+ return void 0;
2096
+ }
2097
+ restoreInterruptionByAudioActivity() {
2098
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2099
+ }
1853
2100
  async _closeSessionResources() {
1854
2101
  var _a, _b, _c;
1855
2102
  if (this.llm instanceof LLM) {
@@ -1891,6 +2138,7 @@ function toOaiToolChoice(toolChoice) {
1891
2138
  }
1892
2139
  export {
1893
2140
  AgentActivity,
1894
- agentActivityStorage
2141
+ agentActivityStorage,
2142
+ onEnterStorage
1895
2143
  };
1896
2144
  //# sourceMappingURL=agent_activity.js.map