@livekit/agents 1.0.47 → 1.1.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444) hide show
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/constants.cjs +27 -0
  23. package/dist/constants.cjs.map +1 -1
  24. package/dist/constants.d.cts +9 -0
  25. package/dist/constants.d.ts +9 -0
  26. package/dist/constants.d.ts.map +1 -1
  27. package/dist/constants.js +18 -0
  28. package/dist/constants.js.map +1 -1
  29. package/dist/index.cjs +3 -0
  30. package/dist/index.cjs.map +1 -1
  31. package/dist/index.d.cts +2 -1
  32. package/dist/index.d.ts +2 -1
  33. package/dist/index.d.ts.map +1 -1
  34. package/dist/index.js +2 -0
  35. package/dist/index.js.map +1 -1
  36. package/dist/inference/api_protos.d.cts +12 -12
  37. package/dist/inference/api_protos.d.ts +12 -12
  38. package/dist/inference/interruption/defaults.cjs +81 -0
  39. package/dist/inference/interruption/defaults.cjs.map +1 -0
  40. package/dist/inference/interruption/defaults.d.cts +19 -0
  41. package/dist/inference/interruption/defaults.d.ts +19 -0
  42. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  43. package/dist/inference/interruption/defaults.js +46 -0
  44. package/dist/inference/interruption/defaults.js.map +1 -0
  45. package/dist/inference/interruption/errors.cjs +44 -0
  46. package/dist/inference/interruption/errors.cjs.map +1 -0
  47. package/dist/inference/interruption/errors.d.cts +12 -0
  48. package/dist/inference/interruption/errors.d.ts +12 -0
  49. package/dist/inference/interruption/errors.d.ts.map +1 -0
  50. package/dist/inference/interruption/errors.js +20 -0
  51. package/dist/inference/interruption/errors.js.map +1 -0
  52. package/dist/inference/interruption/http_transport.cjs +147 -0
  53. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  54. package/dist/inference/interruption/http_transport.d.cts +63 -0
  55. package/dist/inference/interruption/http_transport.d.ts +63 -0
  56. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  57. package/dist/inference/interruption/http_transport.js +121 -0
  58. package/dist/inference/interruption/http_transport.js.map +1 -0
  59. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  60. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  61. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  62. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  63. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  64. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  65. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  66. package/dist/inference/interruption/interruption_detector.cjs +181 -0
  67. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  68. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  69. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  70. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  71. package/dist/inference/interruption/interruption_detector.js +147 -0
  72. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  73. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  74. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  75. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  76. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  77. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  78. package/dist/inference/interruption/interruption_stream.js +344 -0
  79. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  80. package/dist/inference/interruption/types.cjs +17 -0
  81. package/dist/inference/interruption/types.cjs.map +1 -0
  82. package/dist/inference/interruption/types.d.cts +66 -0
  83. package/dist/inference/interruption/types.d.ts +66 -0
  84. package/dist/inference/interruption/types.d.ts.map +1 -0
  85. package/dist/inference/interruption/types.js +1 -0
  86. package/dist/inference/interruption/types.js.map +1 -0
  87. package/dist/inference/interruption/utils.cjs +130 -0
  88. package/dist/inference/interruption/utils.cjs.map +1 -0
  89. package/dist/inference/interruption/utils.d.cts +41 -0
  90. package/dist/inference/interruption/utils.d.ts +41 -0
  91. package/dist/inference/interruption/utils.d.ts.map +1 -0
  92. package/dist/inference/interruption/utils.js +105 -0
  93. package/dist/inference/interruption/utils.js.map +1 -0
  94. package/dist/inference/interruption/utils.test.cjs +105 -0
  95. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  96. package/dist/inference/interruption/utils.test.js +104 -0
  97. package/dist/inference/interruption/utils.test.js.map +1 -0
  98. package/dist/inference/interruption/ws_transport.cjs +329 -0
  99. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  100. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  101. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  102. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  103. package/dist/inference/interruption/ws_transport.js +295 -0
  104. package/dist/inference/interruption/ws_transport.js.map +1 -0
  105. package/dist/inference/llm.cjs +14 -10
  106. package/dist/inference/llm.cjs.map +1 -1
  107. package/dist/inference/llm.d.cts +2 -1
  108. package/dist/inference/llm.d.ts +2 -1
  109. package/dist/inference/llm.d.ts.map +1 -1
  110. package/dist/inference/llm.js +8 -10
  111. package/dist/inference/llm.js.map +1 -1
  112. package/dist/inference/stt.cjs +7 -2
  113. package/dist/inference/stt.cjs.map +1 -1
  114. package/dist/inference/stt.d.cts +2 -0
  115. package/dist/inference/stt.d.ts +2 -0
  116. package/dist/inference/stt.d.ts.map +1 -1
  117. package/dist/inference/stt.js +8 -3
  118. package/dist/inference/stt.js.map +1 -1
  119. package/dist/inference/tts.cjs +7 -2
  120. package/dist/inference/tts.cjs.map +1 -1
  121. package/dist/inference/tts.d.cts +2 -0
  122. package/dist/inference/tts.d.ts +2 -0
  123. package/dist/inference/tts.d.ts.map +1 -1
  124. package/dist/inference/tts.js +8 -3
  125. package/dist/inference/tts.js.map +1 -1
  126. package/dist/inference/utils.cjs +26 -7
  127. package/dist/inference/utils.cjs.map +1 -1
  128. package/dist/inference/utils.d.cts +13 -0
  129. package/dist/inference/utils.d.ts +13 -0
  130. package/dist/inference/utils.d.ts.map +1 -1
  131. package/dist/inference/utils.js +18 -2
  132. package/dist/inference/utils.js.map +1 -1
  133. package/dist/llm/chat_context.cjs +108 -2
  134. package/dist/llm/chat_context.cjs.map +1 -1
  135. package/dist/llm/chat_context.d.cts +28 -1
  136. package/dist/llm/chat_context.d.ts +28 -1
  137. package/dist/llm/chat_context.d.ts.map +1 -1
  138. package/dist/llm/chat_context.js +108 -2
  139. package/dist/llm/chat_context.js.map +1 -1
  140. package/dist/llm/chat_context.test.cjs +43 -0
  141. package/dist/llm/chat_context.test.cjs.map +1 -1
  142. package/dist/llm/chat_context.test.js +43 -0
  143. package/dist/llm/chat_context.test.js.map +1 -1
  144. package/dist/llm/index.cjs +2 -0
  145. package/dist/llm/index.cjs.map +1 -1
  146. package/dist/llm/index.d.cts +2 -2
  147. package/dist/llm/index.d.ts +2 -2
  148. package/dist/llm/index.d.ts.map +1 -1
  149. package/dist/llm/index.js +3 -1
  150. package/dist/llm/index.js.map +1 -1
  151. package/dist/llm/llm.cjs +16 -1
  152. package/dist/llm/llm.cjs.map +1 -1
  153. package/dist/llm/llm.d.cts +9 -0
  154. package/dist/llm/llm.d.ts +9 -0
  155. package/dist/llm/llm.d.ts.map +1 -1
  156. package/dist/llm/llm.js +16 -1
  157. package/dist/llm/llm.js.map +1 -1
  158. package/dist/llm/provider_format/index.d.cts +1 -1
  159. package/dist/llm/provider_format/index.d.ts +1 -1
  160. package/dist/llm/realtime.cjs +3 -0
  161. package/dist/llm/realtime.cjs.map +1 -1
  162. package/dist/llm/realtime.d.cts +1 -0
  163. package/dist/llm/realtime.d.ts +1 -0
  164. package/dist/llm/realtime.d.ts.map +1 -1
  165. package/dist/llm/realtime.js +3 -0
  166. package/dist/llm/realtime.js.map +1 -1
  167. package/dist/llm/tool_context.cjs +7 -0
  168. package/dist/llm/tool_context.cjs.map +1 -1
  169. package/dist/llm/tool_context.d.cts +10 -2
  170. package/dist/llm/tool_context.d.ts +10 -2
  171. package/dist/llm/tool_context.d.ts.map +1 -1
  172. package/dist/llm/tool_context.js +6 -0
  173. package/dist/llm/tool_context.js.map +1 -1
  174. package/dist/metrics/base.cjs.map +1 -1
  175. package/dist/metrics/base.d.cts +45 -1
  176. package/dist/metrics/base.d.ts +45 -1
  177. package/dist/metrics/base.d.ts.map +1 -1
  178. package/dist/metrics/index.cjs +5 -0
  179. package/dist/metrics/index.cjs.map +1 -1
  180. package/dist/metrics/index.d.cts +2 -1
  181. package/dist/metrics/index.d.ts +2 -1
  182. package/dist/metrics/index.d.ts.map +1 -1
  183. package/dist/metrics/index.js +6 -0
  184. package/dist/metrics/index.js.map +1 -1
  185. package/dist/metrics/model_usage.cjs +189 -0
  186. package/dist/metrics/model_usage.cjs.map +1 -0
  187. package/dist/metrics/model_usage.d.cts +92 -0
  188. package/dist/metrics/model_usage.d.ts +92 -0
  189. package/dist/metrics/model_usage.d.ts.map +1 -0
  190. package/dist/metrics/model_usage.js +164 -0
  191. package/dist/metrics/model_usage.js.map +1 -0
  192. package/dist/metrics/model_usage.test.cjs +474 -0
  193. package/dist/metrics/model_usage.test.cjs.map +1 -0
  194. package/dist/metrics/model_usage.test.js +476 -0
  195. package/dist/metrics/model_usage.test.js.map +1 -0
  196. package/dist/metrics/usage_collector.cjs +3 -0
  197. package/dist/metrics/usage_collector.cjs.map +1 -1
  198. package/dist/metrics/usage_collector.d.cts +9 -0
  199. package/dist/metrics/usage_collector.d.ts +9 -0
  200. package/dist/metrics/usage_collector.d.ts.map +1 -1
  201. package/dist/metrics/usage_collector.js +3 -0
  202. package/dist/metrics/usage_collector.js.map +1 -1
  203. package/dist/metrics/utils.cjs +9 -0
  204. package/dist/metrics/utils.cjs.map +1 -1
  205. package/dist/metrics/utils.d.ts.map +1 -1
  206. package/dist/metrics/utils.js +9 -0
  207. package/dist/metrics/utils.js.map +1 -1
  208. package/dist/stream/multi_input_stream.test.cjs +4 -0
  209. package/dist/stream/multi_input_stream.test.cjs.map +1 -1
  210. package/dist/stream/multi_input_stream.test.js +5 -1
  211. package/dist/stream/multi_input_stream.test.js.map +1 -1
  212. package/dist/stream/stream_channel.cjs +31 -0
  213. package/dist/stream/stream_channel.cjs.map +1 -1
  214. package/dist/stream/stream_channel.d.cts +4 -2
  215. package/dist/stream/stream_channel.d.ts +4 -2
  216. package/dist/stream/stream_channel.d.ts.map +1 -1
  217. package/dist/stream/stream_channel.js +31 -0
  218. package/dist/stream/stream_channel.js.map +1 -1
  219. package/dist/stt/stt.cjs +34 -2
  220. package/dist/stt/stt.cjs.map +1 -1
  221. package/dist/stt/stt.d.cts +22 -0
  222. package/dist/stt/stt.d.ts +22 -0
  223. package/dist/stt/stt.d.ts.map +1 -1
  224. package/dist/stt/stt.js +34 -2
  225. package/dist/stt/stt.js.map +1 -1
  226. package/dist/telemetry/otel_http_exporter.cjs +24 -5
  227. package/dist/telemetry/otel_http_exporter.cjs.map +1 -1
  228. package/dist/telemetry/otel_http_exporter.d.cts +1 -0
  229. package/dist/telemetry/otel_http_exporter.d.ts +1 -0
  230. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -1
  231. package/dist/telemetry/otel_http_exporter.js +24 -5
  232. package/dist/telemetry/otel_http_exporter.js.map +1 -1
  233. package/dist/telemetry/trace_types.cjs +5 -5
  234. package/dist/telemetry/trace_types.cjs.map +1 -1
  235. package/dist/telemetry/trace_types.d.cts +9 -5
  236. package/dist/telemetry/trace_types.d.ts +9 -5
  237. package/dist/telemetry/trace_types.d.ts.map +1 -1
  238. package/dist/telemetry/trace_types.js +5 -5
  239. package/dist/telemetry/trace_types.js.map +1 -1
  240. package/dist/telemetry/traces.cjs +47 -8
  241. package/dist/telemetry/traces.cjs.map +1 -1
  242. package/dist/telemetry/traces.d.ts.map +1 -1
  243. package/dist/telemetry/traces.js +47 -8
  244. package/dist/telemetry/traces.js.map +1 -1
  245. package/dist/tts/tts.cjs +64 -2
  246. package/dist/tts/tts.cjs.map +1 -1
  247. package/dist/tts/tts.d.cts +34 -0
  248. package/dist/tts/tts.d.ts +34 -0
  249. package/dist/tts/tts.d.ts.map +1 -1
  250. package/dist/tts/tts.js +64 -2
  251. package/dist/tts/tts.js.map +1 -1
  252. package/dist/utils.cjs +1 -0
  253. package/dist/utils.cjs.map +1 -1
  254. package/dist/utils.d.ts.map +1 -1
  255. package/dist/utils.js +1 -0
  256. package/dist/utils.js.map +1 -1
  257. package/dist/version.cjs +1 -1
  258. package/dist/version.js +1 -1
  259. package/dist/voice/agent.cjs +34 -4
  260. package/dist/voice/agent.cjs.map +1 -1
  261. package/dist/voice/agent.d.cts +11 -2
  262. package/dist/voice/agent.d.ts +11 -2
  263. package/dist/voice/agent.d.ts.map +1 -1
  264. package/dist/voice/agent.js +34 -4
  265. package/dist/voice/agent.js.map +1 -1
  266. package/dist/voice/agent_activity.cjs +292 -44
  267. package/dist/voice/agent_activity.cjs.map +1 -1
  268. package/dist/voice/agent_activity.d.cts +27 -6
  269. package/dist/voice/agent_activity.d.ts +27 -6
  270. package/dist/voice/agent_activity.d.ts.map +1 -1
  271. package/dist/voice/agent_activity.js +293 -45
  272. package/dist/voice/agent_activity.js.map +1 -1
  273. package/dist/voice/agent_session.cjs +105 -48
  274. package/dist/voice/agent_session.cjs.map +1 -1
  275. package/dist/voice/agent_session.d.cts +90 -20
  276. package/dist/voice/agent_session.d.ts +90 -20
  277. package/dist/voice/agent_session.d.ts.map +1 -1
  278. package/dist/voice/agent_session.js +105 -46
  279. package/dist/voice/agent_session.js.map +1 -1
  280. package/dist/voice/audio_recognition.cjs +287 -6
  281. package/dist/voice/audio_recognition.cjs.map +1 -1
  282. package/dist/voice/audio_recognition.d.cts +42 -3
  283. package/dist/voice/audio_recognition.d.ts +42 -3
  284. package/dist/voice/audio_recognition.d.ts.map +1 -1
  285. package/dist/voice/audio_recognition.js +289 -7
  286. package/dist/voice/audio_recognition.js.map +1 -1
  287. package/dist/voice/client_events.cjs +554 -0
  288. package/dist/voice/client_events.cjs.map +1 -0
  289. package/dist/voice/client_events.d.cts +195 -0
  290. package/dist/voice/client_events.d.ts +195 -0
  291. package/dist/voice/client_events.d.ts.map +1 -0
  292. package/dist/voice/client_events.js +548 -0
  293. package/dist/voice/client_events.js.map +1 -0
  294. package/dist/voice/events.cjs +1 -0
  295. package/dist/voice/events.cjs.map +1 -1
  296. package/dist/voice/events.d.cts +8 -5
  297. package/dist/voice/events.d.ts +8 -5
  298. package/dist/voice/events.d.ts.map +1 -1
  299. package/dist/voice/events.js +1 -0
  300. package/dist/voice/events.js.map +1 -1
  301. package/dist/voice/generation.cjs +43 -8
  302. package/dist/voice/generation.cjs.map +1 -1
  303. package/dist/voice/generation.d.cts +3 -3
  304. package/dist/voice/generation.d.ts +3 -3
  305. package/dist/voice/generation.d.ts.map +1 -1
  306. package/dist/voice/generation.js +43 -8
  307. package/dist/voice/generation.js.map +1 -1
  308. package/dist/voice/index.cjs +1 -0
  309. package/dist/voice/index.cjs.map +1 -1
  310. package/dist/voice/index.d.cts +1 -0
  311. package/dist/voice/index.d.ts +1 -0
  312. package/dist/voice/index.d.ts.map +1 -1
  313. package/dist/voice/index.js +1 -0
  314. package/dist/voice/index.js.map +1 -1
  315. package/dist/voice/report.cjs +20 -8
  316. package/dist/voice/report.cjs.map +1 -1
  317. package/dist/voice/report.d.cts +5 -0
  318. package/dist/voice/report.d.ts +5 -0
  319. package/dist/voice/report.d.ts.map +1 -1
  320. package/dist/voice/report.js +20 -8
  321. package/dist/voice/report.js.map +1 -1
  322. package/dist/voice/report.test.cjs +106 -0
  323. package/dist/voice/report.test.cjs.map +1 -0
  324. package/dist/voice/report.test.js +105 -0
  325. package/dist/voice/report.test.js.map +1 -0
  326. package/dist/voice/room_io/room_io.cjs +16 -41
  327. package/dist/voice/room_io/room_io.cjs.map +1 -1
  328. package/dist/voice/room_io/room_io.d.cts +4 -9
  329. package/dist/voice/room_io/room_io.d.ts +4 -9
  330. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  331. package/dist/voice/room_io/room_io.js +17 -43
  332. package/dist/voice/room_io/room_io.js.map +1 -1
  333. package/dist/voice/testing/fake_llm.cjs +127 -0
  334. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  335. package/dist/voice/testing/fake_llm.d.cts +30 -0
  336. package/dist/voice/testing/fake_llm.d.ts +30 -0
  337. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  338. package/dist/voice/testing/fake_llm.js +103 -0
  339. package/dist/voice/testing/fake_llm.js.map +1 -0
  340. package/dist/voice/testing/index.cjs +3 -0
  341. package/dist/voice/testing/index.cjs.map +1 -1
  342. package/dist/voice/testing/index.d.cts +1 -0
  343. package/dist/voice/testing/index.d.ts +1 -0
  344. package/dist/voice/testing/index.d.ts.map +1 -1
  345. package/dist/voice/testing/index.js +2 -0
  346. package/dist/voice/testing/index.js.map +1 -1
  347. package/dist/voice/turn_config/endpointing.cjs +33 -0
  348. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  349. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  350. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  351. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  352. package/dist/voice/turn_config/endpointing.js +9 -0
  353. package/dist/voice/turn_config/endpointing.js.map +1 -0
  354. package/dist/voice/turn_config/interruption.cjs +37 -0
  355. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  356. package/dist/voice/turn_config/interruption.d.cts +53 -0
  357. package/dist/voice/turn_config/interruption.d.ts +53 -0
  358. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  359. package/dist/voice/turn_config/interruption.js +13 -0
  360. package/dist/voice/turn_config/interruption.js.map +1 -0
  361. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  362. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  363. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  364. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  365. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  366. package/dist/voice/turn_config/turn_handling.js +11 -0
  367. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  368. package/dist/voice/turn_config/utils.cjs +97 -0
  369. package/dist/voice/turn_config/utils.cjs.map +1 -0
  370. package/dist/voice/turn_config/utils.d.cts +25 -0
  371. package/dist/voice/turn_config/utils.d.ts +25 -0
  372. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  373. package/dist/voice/turn_config/utils.js +73 -0
  374. package/dist/voice/turn_config/utils.js.map +1 -0
  375. package/dist/voice/turn_config/utils.test.cjs +86 -0
  376. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  377. package/dist/voice/turn_config/utils.test.js +85 -0
  378. package/dist/voice/turn_config/utils.test.js.map +1 -0
  379. package/dist/voice/wire_format.cjs +798 -0
  380. package/dist/voice/wire_format.cjs.map +1 -0
  381. package/dist/voice/wire_format.d.cts +5503 -0
  382. package/dist/voice/wire_format.d.ts +5503 -0
  383. package/dist/voice/wire_format.d.ts.map +1 -0
  384. package/dist/voice/wire_format.js +728 -0
  385. package/dist/voice/wire_format.js.map +1 -0
  386. package/package.json +2 -1
  387. package/src/beta/index.ts +9 -0
  388. package/src/beta/workflows/index.ts +9 -0
  389. package/src/beta/workflows/task_group.ts +194 -0
  390. package/src/constants.ts +13 -0
  391. package/src/index.ts +2 -1
  392. package/src/inference/interruption/defaults.ts +51 -0
  393. package/src/inference/interruption/errors.ts +25 -0
  394. package/src/inference/interruption/http_transport.ts +187 -0
  395. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  396. package/src/inference/interruption/interruption_detector.ts +188 -0
  397. package/src/inference/interruption/interruption_stream.ts +467 -0
  398. package/src/inference/interruption/types.ts +84 -0
  399. package/src/inference/interruption/utils.test.ts +132 -0
  400. package/src/inference/interruption/utils.ts +137 -0
  401. package/src/inference/interruption/ws_transport.ts +402 -0
  402. package/src/inference/llm.ts +9 -12
  403. package/src/inference/stt.ts +10 -3
  404. package/src/inference/tts.ts +10 -3
  405. package/src/inference/utils.ts +29 -1
  406. package/src/llm/chat_context.test.ts +48 -0
  407. package/src/llm/chat_context.ts +161 -0
  408. package/src/llm/index.ts +2 -0
  409. package/src/llm/llm.ts +16 -0
  410. package/src/llm/realtime.ts +4 -0
  411. package/src/llm/tool_context.ts +14 -0
  412. package/src/metrics/base.ts +48 -1
  413. package/src/metrics/index.ts +11 -0
  414. package/src/metrics/model_usage.test.ts +545 -0
  415. package/src/metrics/model_usage.ts +262 -0
  416. package/src/metrics/usage_collector.ts +11 -0
  417. package/src/metrics/utils.ts +11 -0
  418. package/src/stream/multi_input_stream.test.ts +6 -1
  419. package/src/stream/stream_channel.ts +34 -2
  420. package/src/stt/stt.ts +38 -0
  421. package/src/telemetry/otel_http_exporter.ts +28 -5
  422. package/src/telemetry/trace_types.ts +11 -8
  423. package/src/telemetry/traces.ts +111 -54
  424. package/src/tts/tts.ts +69 -1
  425. package/src/utils.ts +5 -0
  426. package/src/voice/agent.ts +41 -3
  427. package/src/voice/agent_activity.ts +371 -34
  428. package/src/voice/agent_session.ts +207 -59
  429. package/src/voice/audio_recognition.ts +385 -9
  430. package/src/voice/client_events.ts +838 -0
  431. package/src/voice/events.ts +14 -4
  432. package/src/voice/generation.ts +52 -9
  433. package/src/voice/index.ts +1 -0
  434. package/src/voice/report.test.ts +117 -0
  435. package/src/voice/report.ts +29 -6
  436. package/src/voice/room_io/room_io.ts +21 -64
  437. package/src/voice/testing/fake_llm.ts +138 -0
  438. package/src/voice/testing/index.ts +2 -0
  439. package/src/voice/turn_config/endpointing.ts +33 -0
  440. package/src/voice/turn_config/interruption.ts +56 -0
  441. package/src/voice/turn_config/turn_handling.ts +45 -0
  442. package/src/voice/turn_config/utils.test.ts +100 -0
  443. package/src/voice/turn_config/utils.ts +103 -0
  444. package/src/voice/wire_format.ts +827 -0
@@ -19,7 +19,8 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
19
19
  var agent_activity_exports = {};
20
20
  __export(agent_activity_exports, {
21
21
  AgentActivity: () => AgentActivity,
22
- agentActivityStorage: () => agentActivityStorage
22
+ agentActivityStorage: () => agentActivityStorage,
23
+ onEnterStorage: () => onEnterStorage
23
24
  });
24
25
  module.exports = __toCommonJS(agent_activity_exports);
25
26
  var import_mutex = require("@livekit/mutex");
@@ -27,6 +28,7 @@ var import_api = require("@opentelemetry/api");
27
28
  var import_heap_js = require("heap-js");
28
29
  var import_node_async_hooks = require("node:async_hooks");
29
30
  var import_web = require("node:stream/web");
31
+ var import_interruption_detector = require("../inference/interruption/interruption_detector.cjs");
30
32
  var import_chat_context = require("../llm/chat_context.cjs");
31
33
  var import_llm = require("../llm/index.cjs");
32
34
  var import_tool_context = require("../llm/tool_context.cjs");
@@ -46,6 +48,7 @@ var import_generation = require("./generation.cjs");
46
48
  var import_speech_handle = require("./speech_handle.cjs");
47
49
  var import_utils2 = require("./utils.cjs");
48
50
  const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
51
+ const onEnterStorage = new import_node_async_hooks.AsyncLocalStorage();
49
52
  class AgentActivity {
50
53
  agent;
51
54
  agentSession;
@@ -70,16 +73,34 @@ class AgentActivity {
70
73
  // default to null as None, which maps to the default provider tool choice value
71
74
  toolChoice = null;
72
75
  _preemptiveGeneration;
73
- /** @internal */
74
- _mainTask;
75
- _onEnterTask;
76
- _onExitTask;
77
- _userTurnCompletedTask;
76
+ interruptionDetector;
77
+ isInterruptionDetectionEnabled;
78
+ isInterruptionByAudioActivityEnabled;
79
+ isDefaultInterruptionByAudioActivityEnabled;
78
80
  onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
79
81
  onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
80
82
  onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
81
83
  onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
82
84
  onModelError = (ev) => this.onError(ev);
85
+ onInterruptionOverlappingSpeech = (ev) => {
86
+ this.agentSession.emit(import_events.AgentSessionEventTypes.UserOverlappingSpeech, ev);
87
+ };
88
+ onInterruptionMetricsCollected = (ev) => {
89
+ this.agentSession.emit(
90
+ import_events.AgentSessionEventTypes.MetricsCollected,
91
+ (0, import_events.createMetricsCollectedEvent)({ metrics: ev })
92
+ );
93
+ };
94
+ onInterruptionError = (ev) => {
95
+ const errorEvent = (0, import_events.createErrorEvent)(ev, this.interruptionDetector);
96
+ this.agentSession.emit(import_events.AgentSessionEventTypes.Error, errorEvent);
97
+ this.agentSession._onError(ev);
98
+ };
99
+ /** @internal */
100
+ _mainTask;
101
+ _onEnterTask;
102
+ _onExitTask;
103
+ _userTurnCompletedTask;
83
104
  constructor(agent, agentSession) {
84
105
  this.agent = agent;
85
106
  this.agentSession = agentSession;
@@ -138,6 +159,10 @@ class AgentActivity {
138
159
  "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
139
160
  );
140
161
  }
162
+ this.interruptionDetector = this.resolveInterruptionDetector();
163
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
164
+ this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
165
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
141
166
  }
142
167
  async start() {
143
168
  const unlock = await this.lock.lock();
@@ -230,8 +255,9 @@ class AgentActivity {
230
255
  vad: this.vad,
231
256
  turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
232
257
  turnDetectionMode: this.turnDetectionMode,
233
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
234
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
258
+ interruptionDetection: this.interruptionDetector,
259
+ minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay,
260
+ maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay,
235
261
  rootSpanContext: this.agentSession.rootSpanContext,
236
262
  sttModel: (_a = this.stt) == null ? void 0 : _a.label,
237
263
  sttProvider: this.getSttProvider(),
@@ -245,11 +271,14 @@ class AgentActivity {
245
271
  this._resumeSchedulingTask();
246
272
  if (runOnEnter) {
247
273
  this._onEnterTask = this.createSpeechTask({
248
- taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
249
- name: "on_enter",
250
- context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
251
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
252
- }),
274
+ taskFn: () => onEnterStorage.run(
275
+ { session: this.agentSession, agent: this.agent },
276
+ () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
277
+ name: "on_enter",
278
+ context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
279
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
280
+ })
281
+ ),
253
282
  inlineTask: true,
254
283
  name: "AgentActivity_onEnter"
255
284
  });
@@ -290,7 +319,8 @@ class AgentActivity {
290
319
  return this.realtimeSession;
291
320
  }
292
321
  get allowInterruptions() {
293
- return this.agentSession.options.allowInterruptions;
322
+ var _a;
323
+ return ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.mode) !== false;
294
324
  }
295
325
  get useTtsAlignedTranscript() {
296
326
  return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
@@ -301,6 +331,11 @@ class AgentActivity {
301
331
  get toolCtx() {
302
332
  return this.agent.toolCtx;
303
333
  }
334
+ /** @internal */
335
+ get inputStartedAt() {
336
+ var _a;
337
+ return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
338
+ }
304
339
  async updateChatCtx(chatCtx) {
305
340
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
306
341
  this.agent._chatCtx = chatCtx;
@@ -315,19 +350,50 @@ class AgentActivity {
315
350
  });
316
351
  }
317
352
  }
318
- updateOptions({ toolChoice }) {
353
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
354
+ async updateTools(tools) {
355
+ this.agent._tools = { ...tools };
356
+ if (this.realtimeSession) {
357
+ await this.realtimeSession.updateTools(tools);
358
+ }
359
+ if (this.llm instanceof import_llm.LLM) {
360
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
361
+ }
362
+ }
363
+ updateOptions({
364
+ toolChoice,
365
+ turnDetection
366
+ }) {
319
367
  if (toolChoice !== void 0) {
320
368
  this.toolChoice = toolChoice;
321
369
  }
322
370
  if (this.realtimeSession) {
323
371
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
324
372
  }
373
+ if (turnDetection !== void 0) {
374
+ this.turnDetectionMode = turnDetection;
375
+ this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
376
+ if (this.agentSession.agentState !== "speaking") {
377
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
378
+ }
379
+ }
380
+ if (this.audioRecognition) {
381
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
382
+ }
325
383
  }
326
384
  attachAudioInput(audioStream) {
327
385
  void this.audioStream.close();
328
386
  this.audioStream = new import_multi_input_stream.MultiInputStream();
387
+ const aecWarmupAudioFilter = new import_web.TransformStream({
388
+ transform: (frame, controller) => {
389
+ const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
390
+ if (!shouldDiscardForAecWarmup) {
391
+ controller.enqueue(frame);
392
+ }
393
+ }
394
+ });
329
395
  this.audioStreamId = this.audioStream.addInputStream(audioStream);
330
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
396
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
331
397
  if (this.realtimeSession) {
332
398
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
333
399
  }
@@ -433,6 +499,13 @@ class AgentActivity {
433
499
  this.logger.info("onInputSpeechStarted");
434
500
  if (!this.vad) {
435
501
  this.agentSession._updateUserState("speaking");
502
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
503
+ this.audioRecognition.onStartOfOverlapSpeech(
504
+ 0,
505
+ Date.now(),
506
+ this.agentSession._userSpeakingSpan
507
+ );
508
+ }
436
509
  }
437
510
  try {
438
511
  this.interrupt();
@@ -446,6 +519,9 @@ class AgentActivity {
446
519
  onInputSpeechStopped(ev) {
447
520
  this.logger.info(ev, "onInputSpeechStopped");
448
521
  if (!this.vad) {
522
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
523
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
524
+ }
449
525
  this.agentSession._updateUserState("listening");
450
526
  }
451
527
  if (ev.userTranscriptionEnabled) {
@@ -507,48 +583,75 @@ class AgentActivity {
507
583
  onStartOfSpeech(ev) {
508
584
  let speechStartTime = Date.now();
509
585
  if (ev) {
510
- speechStartTime = speechStartTime - ev.speechDuration;
586
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
511
587
  }
512
588
  this.agentSession._updateUserState("speaking", speechStartTime);
589
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
590
+ this.audioRecognition.onStartOfOverlapSpeech(
591
+ ev.speechDuration,
592
+ speechStartTime,
593
+ this.agentSession._userSpeakingSpan
594
+ );
595
+ }
513
596
  }
514
597
  onEndOfSpeech(ev) {
515
598
  let speechEndTime = Date.now();
516
599
  if (ev) {
517
- speechEndTime = speechEndTime - ev.silenceDuration;
600
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
601
+ }
602
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
603
+ this.audioRecognition.onEndOfOverlapSpeech(
604
+ speechEndTime,
605
+ this.agentSession._userSpeakingSpan
606
+ );
518
607
  }
519
608
  this.agentSession._updateUserState("listening", speechEndTime);
520
609
  }
521
610
  onVADInferenceDone(ev) {
611
+ var _a;
522
612
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
523
613
  return;
524
614
  }
525
- if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
615
+ if (ev.speechDuration >= ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
526
616
  this.interruptByAudioActivity();
527
617
  }
528
618
  }
529
619
  interruptByAudioActivity() {
530
- var _a, _b;
620
+ var _a, _b, _c, _d;
621
+ if (!this.isInterruptionByAudioActivityEnabled) {
622
+ return;
623
+ }
624
+ if (this.agentSession._aecWarmupRemaining > 0) {
625
+ return;
626
+ }
531
627
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
532
628
  return;
533
629
  }
534
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
630
+ if (this.stt && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
535
631
  const text = this.audioRecognition.currentTranscript;
536
632
  const normalizedText = text ?? "";
537
633
  const wordCount = (0, import_word.splitWords)(normalizedText, true).length;
538
- if (wordCount < this.agentSession.options.minInterruptionWords) {
634
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
539
635
  return;
540
636
  }
541
637
  }
542
- (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
638
+ (_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
543
639
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
544
640
  this.logger.info(
545
641
  { "speech id": this._currentSpeech.id },
546
642
  "speech interrupted by audio activity"
547
643
  );
548
- (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
644
+ (_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
549
645
  this._currentSpeech.interrupt();
550
646
  }
551
647
  }
648
+ onInterruption(ev) {
649
+ this.restoreInterruptionByAudioActivity();
650
+ this.interruptByAudioActivity();
651
+ if (this.audioRecognition) {
652
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.timestamp);
653
+ }
654
+ }
552
655
  onInterimTranscript(ev) {
553
656
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
554
657
  return;
@@ -597,7 +700,8 @@ class AgentActivity {
597
700
  );
598
701
  const userMessage = import_chat_context.ChatMessage.create({
599
702
  role: "user",
600
- content: info.newTranscript
703
+ content: info.newTranscript,
704
+ transcriptConfidence: info.transcriptConfidence
601
705
  });
602
706
  const chatCtx = this.agent.chatCtx.copy();
603
707
  const speechHandle = this.generateReply({
@@ -655,6 +759,7 @@ class AgentActivity {
655
759
  return task;
656
760
  }
657
761
  async onEndOfTurn(info) {
762
+ var _a, _b;
658
763
  if (this.schedulingPaused) {
659
764
  this.cancelPreemptiveGeneration();
660
765
  this.logger.warn(
@@ -663,14 +768,14 @@ class AgentActivity {
663
768
  );
664
769
  return true;
665
770
  }
666
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
771
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
667
772
  const wordCount = (0, import_word.splitWords)(info.newTranscript, true).length;
668
- if (wordCount < this.agentSession.options.minInterruptionWords) {
773
+ if (wordCount < ((_b = this.agentSession.options.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
669
774
  this.cancelPreemptiveGeneration();
670
775
  this.logger.info(
671
776
  {
672
777
  wordCount,
673
- minInterruptionWords: this.agentSession.options.minInterruptionWords
778
+ minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords
674
779
  },
675
780
  "skipping user input, word count below minimum interruption threshold"
676
781
  );
@@ -806,11 +911,18 @@ class AgentActivity {
806
911
  instructions = `${this.agent.instructions}
807
912
  ${instructions}`;
808
913
  }
914
+ const onEnterData = onEnterStorage.getStore();
915
+ const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
916
+ const tools = shouldFilterTools ? Object.fromEntries(
917
+ Object.entries(this.agent.toolCtx).filter(
918
+ ([, fnTool]) => !(fnTool.flags & import_llm.ToolFlag.IGNORE_ON_ENTER)
919
+ )
920
+ ) : this.agent.toolCtx;
809
921
  const task = this.createSpeechTask({
810
922
  taskFn: (abortController) => this.pipelineReplyTask(
811
923
  handle,
812
924
  chatCtx ?? this.agent.chatCtx,
813
- this.agent.toolCtx,
925
+ tools,
814
926
  {
815
927
  toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
816
928
  },
@@ -882,7 +994,8 @@ ${instructions}`;
882
994
  }
883
995
  let userMessage = import_chat_context.ChatMessage.create({
884
996
  role: "user",
885
- content: info.newTranscript
997
+ content: info.newTranscript,
998
+ transcriptConfidence: info.transcriptConfidence
886
999
  });
887
1000
  const chatCtx = this.agent.chatCtx.copy();
888
1001
  const startTime = Date.now();
@@ -900,11 +1013,32 @@ ${instructions}`;
900
1013
  } else if (this.llm === void 0) {
901
1014
  return;
902
1015
  }
1016
+ const userMetricsReport = {};
1017
+ if (info.startedSpeakingAt !== void 0) {
1018
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
1019
+ }
1020
+ if (info.stoppedSpeakingAt !== void 0) {
1021
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
1022
+ }
1023
+ if (info.transcriptionDelay !== void 0) {
1024
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
1025
+ }
1026
+ if (info.endOfUtteranceDelay !== void 0) {
1027
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
1028
+ }
1029
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
1030
+ if (userMessage) {
1031
+ userMessage.metrics = userMetricsReport;
1032
+ }
903
1033
  let speechHandle;
904
1034
  if (this._preemptiveGeneration !== void 0) {
905
1035
  const preemptive = this._preemptiveGeneration;
906
1036
  if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
907
1037
  speechHandle = preemptive.speechHandle;
1038
+ if (preemptive.userMessage && userMessage) {
1039
+ preemptive.userMessage.metrics = userMetricsReport;
1040
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1041
+ }
908
1042
  this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
909
1043
  this.logger.debug(
910
1044
  {
@@ -938,6 +1072,7 @@ ${instructions}`;
938
1072
  );
939
1073
  }
940
1074
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
1075
+ var _a, _b;
941
1076
  speechHandle._agentTurnContext = import_api.context.active();
942
1077
  import_agent.speechHandleStorage.enterWith(speechHandle);
943
1078
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
@@ -970,11 +1105,18 @@ ${instructions}`;
970
1105
  textOut = _textOut;
971
1106
  tasks.push(textForwardTask);
972
1107
  }
1108
+ let replyStartedSpeakingAt;
1109
+ let replyTtsGenData = null;
973
1110
  const onFirstFrame = (startedSpeakingAt) => {
1111
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
974
1112
  this.agentSession._updateAgentState("speaking", {
975
1113
  startTime: startedSpeakingAt,
976
1114
  otelContext: speechHandle._agentTurnContext
977
1115
  });
1116
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1117
+ this.audioRecognition.onStartOfAgentSpeech();
1118
+ this.isInterruptionByAudioActivityEnabled = false;
1119
+ }
978
1120
  };
979
1121
  if (!audioOutput) {
980
1122
  if (textOut) {
@@ -987,9 +1129,12 @@ ${instructions}`;
987
1129
  (...args) => this.agent.ttsNode(...args),
988
1130
  audioSource,
989
1131
  modelSettings,
990
- replyAbortController
1132
+ replyAbortController,
1133
+ (_a = this.tts) == null ? void 0 : _a.model,
1134
+ (_b = this.tts) == null ? void 0 : _b.provider
991
1135
  );
992
1136
  tasks.push(ttsTask);
1137
+ replyTtsGenData = ttsGenData;
993
1138
  const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
994
1139
  ttsGenData.audioStream,
995
1140
  audioOutput,
@@ -1021,16 +1166,30 @@ ${instructions}`;
1021
1166
  }
1022
1167
  }
1023
1168
  if (addToChatCtx) {
1169
+ const replyStoppedSpeakingAt = Date.now();
1170
+ const replyAssistantMetrics = {};
1171
+ if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
1172
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1173
+ }
1174
+ if (replyStartedSpeakingAt !== void 0) {
1175
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
1176
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
1177
+ }
1024
1178
  const message = import_chat_context.ChatMessage.create({
1025
1179
  role: "assistant",
1026
1180
  content: (textOut == null ? void 0 : textOut.text) || "",
1027
- interrupted: speechHandle.interrupted
1181
+ interrupted: speechHandle.interrupted,
1182
+ metrics: replyAssistantMetrics
1028
1183
  });
1029
1184
  this.agent._chatCtx.insert(message);
1030
1185
  this.agentSession._conversationItemAdded(message);
1031
1186
  }
1032
1187
  if (this.agentSession.agentState === "speaking") {
1033
1188
  this.agentSession._updateAgentState("listening");
1189
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1190
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1191
+ }
1192
+ this.restoreInterruptionByAudioActivity();
1034
1193
  }
1035
1194
  }
1036
1195
  _pipelineReplyTaskImpl = async ({
@@ -1042,9 +1201,10 @@ ${instructions}`;
1042
1201
  instructions,
1043
1202
  newMessage,
1044
1203
  toolsMessages,
1045
- span
1204
+ span,
1205
+ _previousUserMetrics
1046
1206
  }) => {
1047
- var _a, _b;
1207
+ var _a, _b, _c, _d, _e, _f;
1048
1208
  speechHandle._agentTurnContext = import_api.context.active();
1049
1209
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1050
1210
  if (instructions) {
@@ -1082,7 +1242,9 @@ ${instructions}`;
1082
1242
  chatCtx,
1083
1243
  toolCtx,
1084
1244
  modelSettings,
1085
- replyAbortController
1245
+ replyAbortController,
1246
+ (_b = this.llm) == null ? void 0 : _b.model,
1247
+ (_c = this.llm) == null ? void 0 : _c.provider
1086
1248
  );
1087
1249
  tasks.push(llmTask);
1088
1250
  let ttsTask = null;
@@ -1095,16 +1257,20 @@ ${instructions}`;
1095
1257
  (...args) => this.agent.ttsNode(...args),
1096
1258
  ttsTextInput,
1097
1259
  modelSettings,
1098
- replyAbortController
1260
+ replyAbortController,
1261
+ (_d = this.tts) == null ? void 0 : _d.model,
1262
+ (_e = this.tts) == null ? void 0 : _e.provider
1099
1263
  );
1100
1264
  tasks.push(ttsTask);
1101
1265
  } else {
1102
1266
  llmOutput = llmGenData.textStream;
1103
1267
  }
1104
1268
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1269
+ let userMetrics = _previousUserMetrics;
1105
1270
  if (newMessage && speechHandle.scheduled) {
1106
1271
  this.agent._chatCtx.insert(newMessage);
1107
1272
  this.agentSession._conversationItemAdded(newMessage);
1273
+ userMetrics = newMessage.metrics;
1108
1274
  }
1109
1275
  if (speechHandle.interrupted) {
1110
1276
  replyAbortController.abort();
@@ -1116,7 +1282,7 @@ ${instructions}`;
1116
1282
  speechHandle._clearAuthorization();
1117
1283
  const replyStartedAt = Date.now();
1118
1284
  let transcriptionInput = llmOutput;
1119
- if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
1285
+ if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
1120
1286
  const timedTextsStream = await Promise.race([
1121
1287
  ttsGenData.timedTextsFut.await,
1122
1288
  (ttsTask == null ? void 0 : ttsTask.result.catch(
@@ -1139,11 +1305,17 @@ ${instructions}`;
1139
1305
  tasks.push(textForwardTask);
1140
1306
  textOut = _textOut;
1141
1307
  }
1308
+ let agentStartedSpeakingAt;
1142
1309
  const onFirstFrame = (startedSpeakingAt) => {
1310
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1143
1311
  this.agentSession._updateAgentState("speaking", {
1144
1312
  startTime: startedSpeakingAt,
1145
1313
  otelContext: speechHandle._agentTurnContext
1146
1314
  });
1315
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1316
+ this.audioRecognition.onStartOfAgentSpeech();
1317
+ this.isInterruptionByAudioActivityEnabled = false;
1318
+ }
1147
1319
  };
1148
1320
  let audioOut = null;
1149
1321
  if (audioOutput) {
@@ -1186,6 +1358,25 @@ ${instructions}`;
1186
1358
  if (audioOutput) {
1187
1359
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1188
1360
  }
1361
+ const agentStoppedSpeakingAt = Date.now();
1362
+ const assistantMetrics = {};
1363
+ if (llmGenData.ttft !== void 0) {
1364
+ assistantMetrics.llmNodeTtft = llmGenData.ttft;
1365
+ }
1366
+ if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
1367
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
1368
+ }
1369
+ if (agentStartedSpeakingAt !== void 0) {
1370
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
1371
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
1372
+ if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
1373
+ const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
1374
+ assistantMetrics.e2eLatency = e2eLatency;
1375
+ span.setAttribute(import_telemetry.traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1376
+ }
1377
+ }
1378
+ span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1379
+ let hasSpeechMessage = false;
1189
1380
  if (toolsMessages) {
1190
1381
  for (const msg of toolsMessages) {
1191
1382
  msg.createdAt = replyStartedAt;
@@ -1226,20 +1417,27 @@ ${instructions}`;
1226
1417
  }
1227
1418
  }
1228
1419
  if (forwardedText) {
1420
+ hasSpeechMessage = true;
1229
1421
  const message = import_chat_context.ChatMessage.create({
1230
1422
  role: "assistant",
1231
1423
  content: forwardedText,
1232
1424
  id: llmGenData.id,
1233
1425
  interrupted: true,
1234
- createdAt: replyStartedAt
1426
+ createdAt: replyStartedAt,
1427
+ metrics: assistantMetrics
1235
1428
  });
1236
1429
  chatCtx.insert(message);
1237
1430
  this.agent._chatCtx.insert(message);
1238
1431
  speechHandle._itemAdded([message]);
1239
1432
  this.agentSession._conversationItemAdded(message);
1433
+ span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1240
1434
  }
1241
1435
  if (this.agentSession.agentState === "speaking") {
1242
1436
  this.agentSession._updateAgentState("listening");
1437
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1438
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1439
+ this.restoreInterruptionByAudioActivity();
1440
+ }
1243
1441
  }
1244
1442
  this.logger.info(
1245
1443
  { speech_id: speechHandle.id, message: forwardedText },
@@ -1250,17 +1448,20 @@ ${instructions}`;
1250
1448
  return;
1251
1449
  }
1252
1450
  if (textOut && textOut.text) {
1451
+ hasSpeechMessage = true;
1253
1452
  const message = import_chat_context.ChatMessage.create({
1254
1453
  role: "assistant",
1255
1454
  id: llmGenData.id,
1256
1455
  interrupted: false,
1257
1456
  createdAt: replyStartedAt,
1258
- content: textOut.text
1457
+ content: textOut.text,
1458
+ metrics: assistantMetrics
1259
1459
  });
1260
1460
  chatCtx.insert(message);
1261
1461
  this.agent._chatCtx.insert(message);
1262
1462
  speechHandle._itemAdded([message]);
1263
1463
  this.agentSession._conversationItemAdded(message);
1464
+ span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1264
1465
  this.logger.info(
1265
1466
  { speech_id: speechHandle.id, message: textOut.text },
1266
1467
  "playout completed without interruption"
@@ -1270,6 +1471,12 @@ ${instructions}`;
1270
1471
  this.agentSession._updateAgentState("thinking");
1271
1472
  } else if (this.agentSession.agentState === "speaking") {
1272
1473
  this.agentSession._updateAgentState("listening");
1474
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1475
+ {
1476
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1477
+ this.restoreInterruptionByAudioActivity();
1478
+ }
1479
+ }
1273
1480
  }
1274
1481
  speechHandle._markGenerationDone();
1275
1482
  await executeToolsTask.result;
@@ -1309,7 +1516,8 @@ ${instructions}`;
1309
1516
  replyAbortController,
1310
1517
  instructions,
1311
1518
  void 0,
1312
- toolMessages
1519
+ toolMessages,
1520
+ hasSpeechMessage ? void 0 : userMetrics
1313
1521
  ),
1314
1522
  ownedSpeechHandle: speechHandle,
1315
1523
  name: "AgentActivity.pipelineReply"
@@ -1329,7 +1537,7 @@ ${instructions}`;
1329
1537
  }
1330
1538
  }
1331
1539
  };
1332
- pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => import_telemetry.tracer.startActiveSpan(
1540
+ pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => import_telemetry.tracer.startActiveSpan(
1333
1541
  async (span) => this._pipelineReplyTaskImpl({
1334
1542
  speechHandle,
1335
1543
  chatCtx,
@@ -1339,7 +1547,8 @@ ${instructions}`;
1339
1547
  instructions,
1340
1548
  newMessage,
1341
1549
  toolsMessages,
1342
- span
1550
+ span,
1551
+ _previousUserMetrics
1343
1552
  }),
1344
1553
  {
1345
1554
  name: "agent_turn",
@@ -1405,6 +1614,7 @@ ${instructions}`;
1405
1614
  });
1406
1615
  };
1407
1616
  const readMessages = async (abortController, outputs) => {
1617
+ var _a2, _b;
1408
1618
  replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
1409
1619
  once: true
1410
1620
  });
@@ -1451,7 +1661,9 @@ ${instructions}`;
1451
1661
  (...args) => this.agent.ttsNode(...args),
1452
1662
  ttsTextInput,
1453
1663
  modelSettings,
1454
- abortController
1664
+ abortController,
1665
+ (_a2 = this.tts) == null ? void 0 : _a2.model,
1666
+ (_b = this.tts) == null ? void 0 : _b.provider
1455
1667
  );
1456
1668
  tasks.push(ttsTask);
1457
1669
  realtimeAudioResult = ttsGenData.audioStream;
@@ -1843,11 +2055,46 @@ ${instructions}`;
1843
2055
  if (this._mainTask) {
1844
2056
  await this._mainTask.cancelAndWait();
1845
2057
  }
2058
+ if (this.interruptionDetector) {
2059
+ this.interruptionDetector.off(
2060
+ "user_overlapping_speech",
2061
+ this.onInterruptionOverlappingSpeech
2062
+ );
2063
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2064
+ this.interruptionDetector.off("error", this.onInterruptionError);
2065
+ }
1846
2066
  this.agent._agentActivity = void 0;
1847
2067
  } finally {
1848
2068
  unlock();
1849
2069
  }
1850
2070
  }
2071
+ resolveInterruptionDetector() {
2072
+ const interruptionDetection = this.agent.interruptionDetection ?? this.agentSession.interruptionDetection;
2073
+ if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof import_llm.RealtimeModel))) {
2074
+ if (interruptionDetection === "adaptive") {
2075
+ this.logger.warn(
2076
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
2077
+ );
2078
+ return void 0;
2079
+ }
2080
+ }
2081
+ if (interruptionDetection !== void 0 && interruptionDetection === false || interruptionDetection === "vad") {
2082
+ return void 0;
2083
+ }
2084
+ try {
2085
+ const detector = new import_interruption_detector.AdaptiveInterruptionDetector();
2086
+ detector.on("user_overlapping_speech", this.onInterruptionOverlappingSpeech);
2087
+ detector.on("metrics_collected", this.onInterruptionMetricsCollected);
2088
+ detector.on("error", this.onInterruptionError);
2089
+ return detector;
2090
+ } catch (error) {
2091
+ this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
2092
+ }
2093
+ return void 0;
2094
+ }
2095
+ restoreInterruptionByAudioActivity() {
2096
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2097
+ }
1851
2098
  async _closeSessionResources() {
1852
2099
  var _a, _b, _c;
1853
2100
  if (this.llm instanceof import_llm.LLM) {
@@ -1890,6 +2137,7 @@ function toOaiToolChoice(toolChoice) {
1890
2137
  // Annotate the CommonJS export names for ESM import in node:
1891
2138
  0 && (module.exports = {
1892
2139
  AgentActivity,
1893
- agentActivityStorage
2140
+ agentActivityStorage,
2141
+ onEnterStorage
1894
2142
  });
1895
2143
  //# sourceMappingURL=agent_activity.cjs.map