@livekit/agents 1.1.0 → 1.2.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (959):
  1. package/dist/_exceptions.cjs.map +1 -1
  2. package/dist/_exceptions.d.ts.map +1 -1
  3. package/dist/_exceptions.js.map +1 -1
  4. package/dist/audio.cjs +89 -3
  5. package/dist/audio.cjs.map +1 -1
  6. package/dist/audio.d.cts +36 -1
  7. package/dist/audio.d.ts +36 -1
  8. package/dist/audio.d.ts.map +1 -1
  9. package/dist/audio.js +76 -2
  10. package/dist/audio.js.map +1 -1
  11. package/dist/beta/index.cjs +29 -0
  12. package/dist/beta/index.cjs.map +1 -0
  13. package/dist/beta/index.d.cts +2 -0
  14. package/dist/beta/index.d.ts +2 -0
  15. package/dist/beta/index.d.ts.map +1 -0
  16. package/dist/beta/index.js +7 -0
  17. package/dist/beta/index.js.map +1 -0
  18. package/dist/beta/workflows/index.cjs +29 -0
  19. package/dist/beta/workflows/index.cjs.map +1 -0
  20. package/dist/beta/workflows/index.d.cts +2 -0
  21. package/dist/beta/workflows/index.d.ts +2 -0
  22. package/dist/beta/workflows/index.d.ts.map +1 -0
  23. package/dist/beta/workflows/index.js +7 -0
  24. package/dist/beta/workflows/index.js.map +1 -0
  25. package/dist/beta/workflows/task_group.cjs +165 -0
  26. package/dist/beta/workflows/task_group.cjs.map +1 -0
  27. package/dist/beta/workflows/task_group.d.cts +32 -0
  28. package/dist/beta/workflows/task_group.d.ts +32 -0
  29. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  30. package/dist/beta/workflows/task_group.js +141 -0
  31. package/dist/beta/workflows/task_group.js.map +1 -0
  32. package/dist/cli.cjs +44 -46
  33. package/dist/cli.cjs.map +1 -1
  34. package/dist/cli.d.cts +3 -3
  35. package/dist/cli.d.ts +3 -3
  36. package/dist/cli.d.ts.map +1 -1
  37. package/dist/cli.js +45 -47
  38. package/dist/cli.js.map +1 -1
  39. package/dist/connection_pool.cjs +242 -0
  40. package/dist/connection_pool.cjs.map +1 -0
  41. package/dist/connection_pool.d.cts +123 -0
  42. package/dist/connection_pool.d.ts +123 -0
  43. package/dist/connection_pool.d.ts.map +1 -0
  44. package/dist/connection_pool.js +218 -0
  45. package/dist/connection_pool.js.map +1 -0
  46. package/dist/connection_pool.test.cjs +256 -0
  47. package/dist/connection_pool.test.cjs.map +1 -0
  48. package/dist/connection_pool.test.js +255 -0
  49. package/dist/connection_pool.test.js.map +1 -0
  50. package/dist/constants.cjs +30 -0
  51. package/dist/constants.cjs.map +1 -1
  52. package/dist/constants.d.cts +10 -0
  53. package/dist/constants.d.ts +10 -0
  54. package/dist/constants.d.ts.map +1 -1
  55. package/dist/constants.js +20 -0
  56. package/dist/constants.js.map +1 -1
  57. package/dist/cpu.cjs +189 -0
  58. package/dist/cpu.cjs.map +1 -0
  59. package/dist/cpu.d.cts +24 -0
  60. package/dist/cpu.d.ts +24 -0
  61. package/dist/cpu.d.ts.map +1 -0
  62. package/dist/cpu.js +152 -0
  63. package/dist/cpu.js.map +1 -0
  64. package/dist/cpu.test.cjs +227 -0
  65. package/dist/cpu.test.cjs.map +1 -0
  66. package/dist/cpu.test.js +204 -0
  67. package/dist/cpu.test.js.map +1 -0
  68. package/dist/http_server.cjs +9 -6
  69. package/dist/http_server.cjs.map +1 -1
  70. package/dist/http_server.d.cts +5 -1
  71. package/dist/http_server.d.ts +5 -1
  72. package/dist/http_server.d.ts.map +1 -1
  73. package/dist/http_server.js +9 -6
  74. package/dist/http_server.js.map +1 -1
  75. package/dist/index.cjs +24 -9
  76. package/dist/index.cjs.map +1 -1
  77. package/dist/index.d.cts +15 -11
  78. package/dist/index.d.ts +15 -11
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +18 -9
  81. package/dist/index.js.map +1 -1
  82. package/dist/inference/api_protos.cjs +70 -2
  83. package/dist/inference/api_protos.cjs.map +1 -1
  84. package/dist/inference/api_protos.d.cts +373 -32
  85. package/dist/inference/api_protos.d.ts +373 -32
  86. package/dist/inference/api_protos.d.ts.map +1 -1
  87. package/dist/inference/api_protos.js +62 -2
  88. package/dist/inference/api_protos.js.map +1 -1
  89. package/dist/inference/index.cjs +8 -0
  90. package/dist/inference/index.cjs.map +1 -1
  91. package/dist/inference/index.d.cts +3 -4
  92. package/dist/inference/index.d.ts +3 -4
  93. package/dist/inference/index.d.ts.map +1 -1
  94. package/dist/inference/index.js +18 -3
  95. package/dist/inference/index.js.map +1 -1
  96. package/dist/inference/interruption/defaults.cjs +81 -0
  97. package/dist/inference/interruption/defaults.cjs.map +1 -0
  98. package/dist/inference/interruption/defaults.d.cts +19 -0
  99. package/dist/inference/interruption/defaults.d.ts +19 -0
  100. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  101. package/dist/inference/interruption/defaults.js +46 -0
  102. package/dist/inference/interruption/defaults.js.map +1 -0
  103. package/dist/inference/interruption/errors.cjs +44 -0
  104. package/dist/inference/interruption/errors.cjs.map +1 -0
  105. package/dist/inference/interruption/errors.d.cts +12 -0
  106. package/dist/inference/interruption/errors.d.ts +12 -0
  107. package/dist/inference/interruption/errors.d.ts.map +1 -0
  108. package/dist/inference/interruption/errors.js +20 -0
  109. package/dist/inference/interruption/errors.js.map +1 -0
  110. package/dist/inference/interruption/http_transport.cjs +163 -0
  111. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  112. package/dist/inference/interruption/http_transport.d.cts +65 -0
  113. package/dist/inference/interruption/http_transport.d.ts +65 -0
  114. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  115. package/dist/inference/interruption/http_transport.js +137 -0
  116. package/dist/inference/interruption/http_transport.js.map +1 -0
  117. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  118. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  119. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  120. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  121. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  122. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  123. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  124. package/dist/inference/interruption/interruption_detector.cjs +198 -0
  125. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  126. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  127. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  128. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  129. package/dist/inference/interruption/interruption_detector.js +164 -0
  130. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  131. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  132. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  133. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  134. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  135. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  136. package/dist/inference/interruption/interruption_stream.js +344 -0
  137. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  138. package/dist/inference/interruption/types.cjs +17 -0
  139. package/dist/inference/interruption/types.cjs.map +1 -0
  140. package/dist/inference/interruption/types.d.cts +66 -0
  141. package/dist/inference/interruption/types.d.ts +66 -0
  142. package/dist/inference/interruption/types.d.ts.map +1 -0
  143. package/dist/inference/interruption/types.js +1 -0
  144. package/dist/inference/interruption/types.js.map +1 -0
  145. package/dist/inference/interruption/utils.cjs +130 -0
  146. package/dist/inference/interruption/utils.cjs.map +1 -0
  147. package/dist/inference/interruption/utils.d.cts +41 -0
  148. package/dist/inference/interruption/utils.d.ts +41 -0
  149. package/dist/inference/interruption/utils.d.ts.map +1 -0
  150. package/dist/inference/interruption/utils.js +105 -0
  151. package/dist/inference/interruption/utils.js.map +1 -0
  152. package/dist/inference/interruption/utils.test.cjs +105 -0
  153. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  154. package/dist/inference/interruption/utils.test.js +104 -0
  155. package/dist/inference/interruption/utils.test.js.map +1 -0
  156. package/dist/inference/interruption/ws_transport.cjs +347 -0
  157. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  158. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  159. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  160. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  161. package/dist/inference/interruption/ws_transport.js +313 -0
  162. package/dist/inference/interruption/ws_transport.js.map +1 -0
  163. package/dist/inference/llm.cjs +106 -66
  164. package/dist/inference/llm.cjs.map +1 -1
  165. package/dist/inference/llm.d.cts +65 -43
  166. package/dist/inference/llm.d.ts +65 -43
  167. package/dist/inference/llm.d.ts.map +1 -1
  168. package/dist/inference/llm.js +100 -66
  169. package/dist/inference/llm.js.map +1 -1
  170. package/dist/inference/stt.cjs +319 -170
  171. package/dist/inference/stt.cjs.map +1 -1
  172. package/dist/inference/stt.d.cts +64 -15
  173. package/dist/inference/stt.d.ts +64 -15
  174. package/dist/inference/stt.d.ts.map +1 -1
  175. package/dist/inference/stt.js +319 -170
  176. package/dist/inference/stt.js.map +1 -1
  177. package/dist/inference/stt.test.cjs +218 -0
  178. package/dist/inference/stt.test.cjs.map +1 -0
  179. package/dist/inference/stt.test.js +217 -0
  180. package/dist/inference/stt.test.js.map +1 -0
  181. package/dist/inference/tts.cjs +249 -71
  182. package/dist/inference/tts.cjs.map +1 -1
  183. package/dist/inference/tts.d.cts +94 -17
  184. package/dist/inference/tts.d.ts +94 -17
  185. package/dist/inference/tts.d.ts.map +1 -1
  186. package/dist/inference/tts.js +249 -77
  187. package/dist/inference/tts.js.map +1 -1
  188. package/dist/inference/tts.test.cjs +305 -0
  189. package/dist/inference/tts.test.cjs.map +1 -0
  190. package/dist/inference/tts.test.js +304 -0
  191. package/dist/inference/tts.test.js.map +1 -0
  192. package/dist/inference/utils.cjs +26 -7
  193. package/dist/inference/utils.cjs.map +1 -1
  194. package/dist/inference/utils.d.cts +14 -1
  195. package/dist/inference/utils.d.ts +14 -1
  196. package/dist/inference/utils.d.ts.map +1 -1
  197. package/dist/inference/utils.js +18 -2
  198. package/dist/inference/utils.js.map +1 -1
  199. package/dist/ipc/inference_proc_executor.cjs +6 -3
  200. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  201. package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
  202. package/dist/ipc/inference_proc_executor.js +6 -3
  203. package/dist/ipc/inference_proc_executor.js.map +1 -1
  204. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  205. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  206. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  207. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  208. package/dist/ipc/job_proc_executor.cjs +6 -1
  209. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  210. package/dist/ipc/job_proc_executor.d.ts.map +1 -1
  211. package/dist/ipc/job_proc_executor.js +6 -1
  212. package/dist/ipc/job_proc_executor.js.map +1 -1
  213. package/dist/ipc/job_proc_lazy_main.cjs +89 -17
  214. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  215. package/dist/ipc/job_proc_lazy_main.js +68 -18
  216. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  217. package/dist/ipc/supervised_proc.cjs +34 -8
  218. package/dist/ipc/supervised_proc.cjs.map +1 -1
  219. package/dist/ipc/supervised_proc.d.cts +8 -0
  220. package/dist/ipc/supervised_proc.d.ts +8 -0
  221. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  222. package/dist/ipc/supervised_proc.js +34 -8
  223. package/dist/ipc/supervised_proc.js.map +1 -1
  224. package/dist/ipc/supervised_proc.test.cjs +145 -0
  225. package/dist/ipc/supervised_proc.test.cjs.map +1 -0
  226. package/dist/ipc/supervised_proc.test.js +122 -0
  227. package/dist/ipc/supervised_proc.test.js.map +1 -0
  228. package/dist/job.cjs +109 -1
  229. package/dist/job.cjs.map +1 -1
  230. package/dist/job.d.cts +14 -0
  231. package/dist/job.d.ts +14 -0
  232. package/dist/job.d.ts.map +1 -1
  233. package/dist/job.js +99 -1
  234. package/dist/job.js.map +1 -1
  235. package/dist/language.cjs +394 -0
  236. package/dist/language.cjs.map +1 -0
  237. package/dist/language.d.cts +15 -0
  238. package/dist/language.d.ts +15 -0
  239. package/dist/language.d.ts.map +1 -0
  240. package/dist/language.js +363 -0
  241. package/dist/language.js.map +1 -0
  242. package/dist/language.test.cjs +43 -0
  243. package/dist/language.test.cjs.map +1 -0
  244. package/dist/language.test.js +49 -0
  245. package/dist/language.test.js.map +1 -0
  246. package/dist/llm/chat_context.cjs +345 -3
  247. package/dist/llm/chat_context.cjs.map +1 -1
  248. package/dist/llm/chat_context.d.cts +86 -2
  249. package/dist/llm/chat_context.d.ts +86 -2
  250. package/dist/llm/chat_context.d.ts.map +1 -1
  251. package/dist/llm/chat_context.js +344 -3
  252. package/dist/llm/chat_context.js.map +1 -1
  253. package/dist/llm/chat_context.test.cjs +692 -0
  254. package/dist/llm/chat_context.test.cjs.map +1 -1
  255. package/dist/llm/chat_context.test.js +692 -0
  256. package/dist/llm/chat_context.test.js.map +1 -1
  257. package/dist/llm/fallback_adapter.cjs +280 -0
  258. package/dist/llm/fallback_adapter.cjs.map +1 -0
  259. package/dist/llm/fallback_adapter.d.cts +73 -0
  260. package/dist/llm/fallback_adapter.d.ts +73 -0
  261. package/dist/llm/fallback_adapter.d.ts.map +1 -0
  262. package/dist/llm/fallback_adapter.js +256 -0
  263. package/dist/llm/fallback_adapter.js.map +1 -0
  264. package/dist/llm/fallback_adapter.test.cjs +176 -0
  265. package/dist/llm/fallback_adapter.test.cjs.map +1 -0
  266. package/dist/llm/fallback_adapter.test.js +175 -0
  267. package/dist/llm/fallback_adapter.test.js.map +1 -0
  268. package/dist/llm/index.cjs +11 -0
  269. package/dist/llm/index.cjs.map +1 -1
  270. package/dist/llm/index.d.cts +4 -3
  271. package/dist/llm/index.d.ts +4 -3
  272. package/dist/llm/index.d.ts.map +1 -1
  273. package/dist/llm/index.js +13 -1
  274. package/dist/llm/index.js.map +1 -1
  275. package/dist/llm/llm.cjs +65 -11
  276. package/dist/llm/llm.cjs.map +1 -1
  277. package/dist/llm/llm.d.cts +13 -2
  278. package/dist/llm/llm.d.ts +13 -2
  279. package/dist/llm/llm.d.ts.map +1 -1
  280. package/dist/llm/llm.js +65 -11
  281. package/dist/llm/llm.js.map +1 -1
  282. package/dist/llm/provider_format/google.cjs +6 -2
  283. package/dist/llm/provider_format/google.cjs.map +1 -1
  284. package/dist/llm/provider_format/google.d.cts +1 -1
  285. package/dist/llm/provider_format/google.d.ts +1 -1
  286. package/dist/llm/provider_format/google.d.ts.map +1 -1
  287. package/dist/llm/provider_format/google.js +6 -2
  288. package/dist/llm/provider_format/google.js.map +1 -1
  289. package/dist/llm/provider_format/google.test.cjs +48 -0
  290. package/dist/llm/provider_format/google.test.cjs.map +1 -1
  291. package/dist/llm/provider_format/google.test.js +54 -1
  292. package/dist/llm/provider_format/google.test.js.map +1 -1
  293. package/dist/llm/provider_format/index.cjs +2 -0
  294. package/dist/llm/provider_format/index.cjs.map +1 -1
  295. package/dist/llm/provider_format/index.d.cts +2 -2
  296. package/dist/llm/provider_format/index.d.ts +2 -2
  297. package/dist/llm/provider_format/index.d.ts.map +1 -1
  298. package/dist/llm/provider_format/index.js +6 -1
  299. package/dist/llm/provider_format/index.js.map +1 -1
  300. package/dist/llm/provider_format/openai.cjs +126 -24
  301. package/dist/llm/provider_format/openai.cjs.map +1 -1
  302. package/dist/llm/provider_format/openai.d.cts +1 -0
  303. package/dist/llm/provider_format/openai.d.ts +1 -0
  304. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  305. package/dist/llm/provider_format/openai.js +124 -23
  306. package/dist/llm/provider_format/openai.js.map +1 -1
  307. package/dist/llm/provider_format/openai.test.cjs +393 -0
  308. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  309. package/dist/llm/provider_format/openai.test.js +400 -2
  310. package/dist/llm/provider_format/openai.test.js.map +1 -1
  311. package/dist/llm/provider_format/utils.cjs +5 -4
  312. package/dist/llm/provider_format/utils.cjs.map +1 -1
  313. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  314. package/dist/llm/provider_format/utils.js +5 -4
  315. package/dist/llm/provider_format/utils.js.map +1 -1
  316. package/dist/llm/realtime.cjs +3 -0
  317. package/dist/llm/realtime.cjs.map +1 -1
  318. package/dist/llm/realtime.d.cts +15 -1
  319. package/dist/llm/realtime.d.ts +15 -1
  320. package/dist/llm/realtime.d.ts.map +1 -1
  321. package/dist/llm/realtime.js +3 -0
  322. package/dist/llm/realtime.js.map +1 -1
  323. package/dist/llm/remote_chat_context.cjs.map +1 -1
  324. package/dist/llm/remote_chat_context.d.cts +2 -0
  325. package/dist/llm/remote_chat_context.d.ts +2 -0
  326. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  327. package/dist/llm/remote_chat_context.js.map +1 -1
  328. package/dist/llm/tool_context.cjs +50 -2
  329. package/dist/llm/tool_context.cjs.map +1 -1
  330. package/dist/llm/tool_context.d.cts +47 -11
  331. package/dist/llm/tool_context.d.ts +47 -11
  332. package/dist/llm/tool_context.d.ts.map +1 -1
  333. package/dist/llm/tool_context.js +48 -3
  334. package/dist/llm/tool_context.js.map +1 -1
  335. package/dist/llm/tool_context.test.cjs +197 -0
  336. package/dist/llm/tool_context.test.cjs.map +1 -1
  337. package/dist/llm/tool_context.test.js +175 -0
  338. package/dist/llm/tool_context.test.js.map +1 -1
  339. package/dist/llm/utils.cjs +107 -12
  340. package/dist/llm/utils.cjs.map +1 -1
  341. package/dist/llm/utils.d.cts +10 -3
  342. package/dist/llm/utils.d.ts +10 -3
  343. package/dist/llm/utils.d.ts.map +1 -1
  344. package/dist/llm/utils.js +106 -12
  345. package/dist/llm/utils.js.map +1 -1
  346. package/dist/llm/utils.test.cjs +90 -0
  347. package/dist/llm/utils.test.cjs.map +1 -1
  348. package/dist/llm/utils.test.js +98 -2
  349. package/dist/llm/utils.test.js.map +1 -1
  350. package/dist/llm/zod-utils.cjs +102 -0
  351. package/dist/llm/zod-utils.cjs.map +1 -0
  352. package/dist/llm/zod-utils.d.cts +65 -0
  353. package/dist/llm/zod-utils.d.ts +65 -0
  354. package/dist/llm/zod-utils.d.ts.map +1 -0
  355. package/dist/llm/zod-utils.js +64 -0
  356. package/dist/llm/zod-utils.js.map +1 -0
  357. package/dist/llm/zod-utils.test.cjs +472 -0
  358. package/dist/llm/zod-utils.test.cjs.map +1 -0
  359. package/dist/llm/zod-utils.test.js +455 -0
  360. package/dist/llm/zod-utils.test.js.map +1 -0
  361. package/dist/log.cjs +45 -14
  362. package/dist/log.cjs.map +1 -1
  363. package/dist/log.d.cts +8 -1
  364. package/dist/log.d.ts +8 -1
  365. package/dist/log.d.ts.map +1 -1
  366. package/dist/log.js +45 -15
  367. package/dist/log.js.map +1 -1
  368. package/dist/metrics/base.cjs.map +1 -1
  369. package/dist/metrics/base.d.cts +75 -19
  370. package/dist/metrics/base.d.ts +75 -19
  371. package/dist/metrics/base.d.ts.map +1 -1
  372. package/dist/metrics/index.cjs +5 -0
  373. package/dist/metrics/index.cjs.map +1 -1
  374. package/dist/metrics/index.d.cts +2 -1
  375. package/dist/metrics/index.d.ts +2 -1
  376. package/dist/metrics/index.d.ts.map +1 -1
  377. package/dist/metrics/index.js +6 -0
  378. package/dist/metrics/index.js.map +1 -1
  379. package/dist/metrics/model_usage.cjs +189 -0
  380. package/dist/metrics/model_usage.cjs.map +1 -0
  381. package/dist/metrics/model_usage.d.cts +92 -0
  382. package/dist/metrics/model_usage.d.ts +92 -0
  383. package/dist/metrics/model_usage.d.ts.map +1 -0
  384. package/dist/metrics/model_usage.js +164 -0
  385. package/dist/metrics/model_usage.js.map +1 -0
  386. package/dist/metrics/model_usage.test.cjs +474 -0
  387. package/dist/metrics/model_usage.test.cjs.map +1 -0
  388. package/dist/metrics/model_usage.test.js +476 -0
  389. package/dist/metrics/model_usage.test.js.map +1 -0
  390. package/dist/metrics/usage_collector.cjs +5 -2
  391. package/dist/metrics/usage_collector.cjs.map +1 -1
  392. package/dist/metrics/usage_collector.d.cts +10 -1
  393. package/dist/metrics/usage_collector.d.ts +10 -1
  394. package/dist/metrics/usage_collector.d.ts.map +1 -1
  395. package/dist/metrics/usage_collector.js +5 -2
  396. package/dist/metrics/usage_collector.js.map +1 -1
  397. package/dist/metrics/utils.cjs +23 -7
  398. package/dist/metrics/utils.cjs.map +1 -1
  399. package/dist/metrics/utils.d.ts.map +1 -1
  400. package/dist/metrics/utils.js +23 -7
  401. package/dist/metrics/utils.js.map +1 -1
  402. package/dist/stream/deferred_stream.cjs +31 -10
  403. package/dist/stream/deferred_stream.cjs.map +1 -1
  404. package/dist/stream/deferred_stream.d.cts +6 -1
  405. package/dist/stream/deferred_stream.d.ts +6 -1
  406. package/dist/stream/deferred_stream.d.ts.map +1 -1
  407. package/dist/stream/deferred_stream.js +31 -10
  408. package/dist/stream/deferred_stream.js.map +1 -1
  409. package/dist/stream/deferred_stream.test.cjs +2 -2
  410. package/dist/stream/deferred_stream.test.cjs.map +1 -1
  411. package/dist/stream/deferred_stream.test.js +2 -2
  412. package/dist/stream/deferred_stream.test.js.map +1 -1
  413. package/dist/stream/index.cjs +3 -0
  414. package/dist/stream/index.cjs.map +1 -1
  415. package/dist/stream/index.d.cts +1 -0
  416. package/dist/stream/index.d.ts +1 -0
  417. package/dist/stream/index.d.ts.map +1 -1
  418. package/dist/stream/index.js +2 -0
  419. package/dist/stream/index.js.map +1 -1
  420. package/dist/stream/multi_input_stream.cjs +139 -0
  421. package/dist/stream/multi_input_stream.cjs.map +1 -0
  422. package/dist/stream/multi_input_stream.d.cts +55 -0
  423. package/dist/stream/multi_input_stream.d.ts +55 -0
  424. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  425. package/dist/stream/multi_input_stream.js +115 -0
  426. package/dist/stream/multi_input_stream.js.map +1 -0
  427. package/dist/stream/multi_input_stream.test.cjs +344 -0
  428. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  429. package/dist/stream/multi_input_stream.test.js +343 -0
  430. package/dist/stream/multi_input_stream.test.js.map +1 -0
  431. package/dist/stream/stream_channel.cjs +39 -1
  432. package/dist/stream/stream_channel.cjs.map +1 -1
  433. package/dist/stream/stream_channel.d.cts +5 -2
  434. package/dist/stream/stream_channel.d.ts +5 -2
  435. package/dist/stream/stream_channel.d.ts.map +1 -1
  436. package/dist/stream/stream_channel.js +39 -1
  437. package/dist/stream/stream_channel.js.map +1 -1
  438. package/dist/stream/stream_channel.test.cjs +27 -0
  439. package/dist/stream/stream_channel.test.cjs.map +1 -1
  440. package/dist/stream/stream_channel.test.js +27 -0
  441. package/dist/stream/stream_channel.test.js.map +1 -1
  442. package/dist/stt/stream_adapter.cjs +24 -9
  443. package/dist/stt/stream_adapter.cjs.map +1 -1
  444. package/dist/stt/stream_adapter.d.cts +7 -3
  445. package/dist/stt/stream_adapter.d.ts +7 -3
  446. package/dist/stt/stream_adapter.d.ts.map +1 -1
  447. package/dist/stt/stream_adapter.js +24 -9
  448. package/dist/stt/stream_adapter.js.map +1 -1
  449. package/dist/stt/stt.cjs +94 -19
  450. package/dist/stt/stt.cjs.map +1 -1
  451. package/dist/stt/stt.d.cts +68 -5
  452. package/dist/stt/stt.d.ts +68 -5
  453. package/dist/stt/stt.d.ts.map +1 -1
  454. package/dist/stt/stt.js +96 -21
  455. package/dist/stt/stt.js.map +1 -1
  456. package/dist/telemetry/index.cjs +72 -0
  457. package/dist/telemetry/index.cjs.map +1 -0
  458. package/dist/telemetry/index.d.cts +7 -0
  459. package/dist/telemetry/index.d.ts +7 -0
  460. package/dist/telemetry/index.d.ts.map +1 -0
  461. package/dist/telemetry/index.js +37 -0
  462. package/dist/telemetry/index.js.map +1 -0
  463. package/dist/telemetry/logging.cjs +65 -0
  464. package/dist/telemetry/logging.cjs.map +1 -0
  465. package/dist/telemetry/logging.d.cts +21 -0
  466. package/dist/telemetry/logging.d.ts +21 -0
  467. package/dist/telemetry/logging.d.ts.map +1 -0
  468. package/dist/telemetry/logging.js +40 -0
  469. package/dist/telemetry/logging.js.map +1 -0
  470. package/dist/telemetry/otel_http_exporter.cjs +166 -0
  471. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  472. package/dist/telemetry/otel_http_exporter.d.cts +63 -0
  473. package/dist/telemetry/otel_http_exporter.d.ts +63 -0
  474. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  475. package/dist/telemetry/otel_http_exporter.js +142 -0
  476. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  477. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  478. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  479. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  480. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  481. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  482. package/dist/telemetry/pino_otel_transport.js +189 -0
  483. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  484. package/dist/telemetry/trace_types.cjs +233 -0
  485. package/dist/telemetry/trace_types.cjs.map +1 -0
  486. package/dist/telemetry/trace_types.d.cts +74 -0
  487. package/dist/telemetry/trace_types.d.ts +74 -0
  488. package/dist/telemetry/trace_types.d.ts.map +1 -0
  489. package/dist/telemetry/trace_types.js +141 -0
  490. package/dist/telemetry/trace_types.js.map +1 -0
  491. package/dist/telemetry/traces.cjs +484 -0
  492. package/dist/telemetry/traces.cjs.map +1 -0
  493. package/dist/telemetry/traces.d.cts +116 -0
  494. package/dist/telemetry/traces.d.ts +116 -0
  495. package/dist/telemetry/traces.d.ts.map +1 -0
  496. package/dist/telemetry/traces.js +449 -0
  497. package/dist/telemetry/traces.js.map +1 -0
  498. package/dist/telemetry/utils.cjs +86 -0
  499. package/dist/telemetry/utils.cjs.map +1 -0
  500. package/dist/telemetry/utils.d.cts +5 -0
  501. package/dist/telemetry/utils.d.ts +5 -0
  502. package/dist/telemetry/utils.d.ts.map +1 -0
  503. package/dist/telemetry/utils.js +51 -0
  504. package/dist/telemetry/utils.js.map +1 -0
  505. package/dist/tokenize/basic/sentence.cjs +3 -3
  506. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  507. package/dist/tokenize/basic/sentence.js +3 -3
  508. package/dist/tokenize/basic/sentence.js.map +1 -1
  509. package/dist/tokenize/tokenizer.test.cjs +3 -1
  510. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  511. package/dist/tokenize/tokenizer.test.js +3 -1
  512. package/dist/tokenize/tokenizer.test.js.map +1 -1
  513. package/dist/transcription.cjs.map +1 -1
  514. package/dist/transcription.d.cts +6 -0
  515. package/dist/transcription.d.ts +6 -0
  516. package/dist/transcription.d.ts.map +1 -1
  517. package/dist/transcription.js.map +1 -1
  518. package/dist/tts/fallback_adapter.cjs +472 -0
  519. package/dist/tts/fallback_adapter.cjs.map +1 -0
  520. package/dist/tts/fallback_adapter.d.cts +110 -0
  521. package/dist/tts/fallback_adapter.d.ts +110 -0
  522. package/dist/tts/fallback_adapter.d.ts.map +1 -0
  523. package/dist/tts/fallback_adapter.js +448 -0
  524. package/dist/tts/fallback_adapter.js.map +1 -0
  525. package/dist/tts/index.cjs +3 -0
  526. package/dist/tts/index.cjs.map +1 -1
  527. package/dist/tts/index.d.cts +1 -0
  528. package/dist/tts/index.d.ts +1 -0
  529. package/dist/tts/index.d.ts.map +1 -1
  530. package/dist/tts/index.js +2 -0
  531. package/dist/tts/index.js.map +1 -1
  532. package/dist/tts/stream_adapter.cjs +25 -8
  533. package/dist/tts/stream_adapter.cjs.map +1 -1
  534. package/dist/tts/stream_adapter.d.cts +6 -3
  535. package/dist/tts/stream_adapter.d.ts +6 -3
  536. package/dist/tts/stream_adapter.d.ts.map +1 -1
  537. package/dist/tts/stream_adapter.js +25 -8
  538. package/dist/tts/stream_adapter.js.map +1 -1
  539. package/dist/tts/tts.cjs +189 -57
  540. package/dist/tts/tts.cjs.map +1 -1
  541. package/dist/tts/tts.d.cts +58 -6
  542. package/dist/tts/tts.d.ts +58 -6
  543. package/dist/tts/tts.d.ts.map +1 -1
  544. package/dist/tts/tts.js +191 -59
  545. package/dist/tts/tts.js.map +1 -1
  546. package/dist/typed_promise.cjs +48 -0
  547. package/dist/typed_promise.cjs.map +1 -0
  548. package/dist/typed_promise.d.cts +24 -0
  549. package/dist/typed_promise.d.ts +24 -0
  550. package/dist/typed_promise.d.ts.map +1 -0
  551. package/dist/typed_promise.js +28 -0
  552. package/dist/typed_promise.js.map +1 -0
  553. package/dist/types.cjs +24 -32
  554. package/dist/types.cjs.map +1 -1
  555. package/dist/types.d.cts +45 -10
  556. package/dist/types.d.ts +45 -10
  557. package/dist/types.d.ts.map +1 -1
  558. package/dist/types.js +20 -30
  559. package/dist/types.js.map +1 -1
  560. package/dist/utils.cjs +124 -28
  561. package/dist/utils.cjs.map +1 -1
  562. package/dist/utils.d.cts +41 -1
  563. package/dist/utils.d.ts +41 -1
  564. package/dist/utils.d.ts.map +1 -1
  565. package/dist/utils.js +119 -27
  566. package/dist/utils.js.map +1 -1
  567. package/dist/utils.test.cjs +73 -1
  568. package/dist/utils.test.cjs.map +1 -1
  569. package/dist/utils.test.js +74 -10
  570. package/dist/utils.test.js.map +1 -1
  571. package/dist/vad.cjs +35 -15
  572. package/dist/vad.cjs.map +1 -1
  573. package/dist/vad.d.cts +15 -5
  574. package/dist/vad.d.ts +15 -5
  575. package/dist/vad.d.ts.map +1 -1
  576. package/dist/vad.js +35 -15
  577. package/dist/vad.js.map +1 -1
  578. package/dist/version.cjs +1 -1
  579. package/dist/version.cjs.map +1 -1
  580. package/dist/version.d.cts +1 -1
  581. package/dist/version.d.ts +1 -1
  582. package/dist/version.d.ts.map +1 -1
  583. package/dist/version.js +1 -1
  584. package/dist/version.js.map +1 -1
  585. package/dist/voice/agent.cjs +258 -35
  586. package/dist/voice/agent.cjs.map +1 -1
  587. package/dist/voice/agent.d.cts +54 -13
  588. package/dist/voice/agent.d.ts +54 -13
  589. package/dist/voice/agent.d.ts.map +1 -1
  590. package/dist/voice/agent.js +254 -34
  591. package/dist/voice/agent.js.map +1 -1
  592. package/dist/voice/agent.test.cjs +314 -0
  593. package/dist/voice/agent.test.cjs.map +1 -1
  594. package/dist/voice/agent.test.js +316 -2
  595. package/dist/voice/agent.test.js.map +1 -1
  596. package/dist/voice/agent_activity.cjs +1116 -385
  597. package/dist/voice/agent_activity.cjs.map +1 -1
  598. package/dist/voice/agent_activity.d.cts +72 -11
  599. package/dist/voice/agent_activity.d.ts +72 -11
  600. package/dist/voice/agent_activity.d.ts.map +1 -1
  601. package/dist/voice/agent_activity.js +1119 -383
  602. package/dist/voice/agent_activity.js.map +1 -1
  603. package/dist/voice/agent_activity.test.cjs +135 -0
  604. package/dist/voice/agent_activity.test.cjs.map +1 -0
  605. package/dist/voice/agent_activity.test.js +134 -0
  606. package/dist/voice/agent_activity.test.js.map +1 -0
  607. package/dist/voice/agent_session.cjs +550 -90
  608. package/dist/voice/agent_session.cjs.map +1 -1
  609. package/dist/voice/agent_session.d.cts +185 -25
  610. package/dist/voice/agent_session.d.ts +185 -25
  611. package/dist/voice/agent_session.d.ts.map +1 -1
  612. package/dist/voice/agent_session.js +556 -91
  613. package/dist/voice/agent_session.js.map +1 -1
  614. package/dist/voice/audio_recognition.cjs +605 -46
  615. package/dist/voice/audio_recognition.cjs.map +1 -1
  616. package/dist/voice/audio_recognition.d.cts +96 -4
  617. package/dist/voice/audio_recognition.d.ts +96 -4
  618. package/dist/voice/audio_recognition.d.ts.map +1 -1
  619. package/dist/voice/audio_recognition.js +611 -47
  620. package/dist/voice/audio_recognition.js.map +1 -1
  621. package/dist/voice/audio_recognition_span.test.cjs +295 -0
  622. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  623. package/dist/voice/audio_recognition_span.test.js +299 -0
  624. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  625. package/dist/voice/avatar/datastream_io.cjs +7 -1
  626. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  627. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  628. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  629. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  630. package/dist/voice/avatar/datastream_io.js +7 -1
  631. package/dist/voice/avatar/datastream_io.js.map +1 -1
  632. package/dist/voice/background_audio.cjs +367 -0
  633. package/dist/voice/background_audio.cjs.map +1 -0
  634. package/dist/voice/background_audio.d.cts +123 -0
  635. package/dist/voice/background_audio.d.ts +123 -0
  636. package/dist/voice/background_audio.d.ts.map +1 -0
  637. package/dist/voice/background_audio.js +343 -0
  638. package/dist/voice/background_audio.js.map +1 -0
  639. package/dist/voice/events.cjs +3 -0
  640. package/dist/voice/events.cjs.map +1 -1
  641. package/dist/voice/events.d.cts +16 -9
  642. package/dist/voice/events.d.ts +16 -9
  643. package/dist/voice/events.d.ts.map +1 -1
  644. package/dist/voice/events.js +3 -0
  645. package/dist/voice/events.js.map +1 -1
  646. package/dist/voice/generation.cjs +205 -41
  647. package/dist/voice/generation.cjs.map +1 -1
  648. package/dist/voice/generation.d.cts +21 -5
  649. package/dist/voice/generation.d.ts +21 -5
  650. package/dist/voice/generation.d.ts.map +1 -1
  651. package/dist/voice/generation.js +215 -43
  652. package/dist/voice/generation.js.map +1 -1
  653. package/dist/voice/generation_tools.test.cjs +236 -0
  654. package/dist/voice/generation_tools.test.cjs.map +1 -0
  655. package/dist/voice/generation_tools.test.js +235 -0
  656. package/dist/voice/generation_tools.test.js.map +1 -0
  657. package/dist/voice/index.cjs +33 -2
  658. package/dist/voice/index.cjs.map +1 -1
  659. package/dist/voice/index.d.cts +8 -2
  660. package/dist/voice/index.d.ts +8 -2
  661. package/dist/voice/index.d.ts.map +1 -1
  662. package/dist/voice/index.js +19 -2
  663. package/dist/voice/index.js.map +1 -1
  664. package/dist/voice/interruption_detection.test.cjs +114 -0
  665. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  666. package/dist/voice/interruption_detection.test.js +113 -0
  667. package/dist/voice/interruption_detection.test.js.map +1 -0
  668. package/dist/voice/io.cjs +66 -6
  669. package/dist/voice/io.cjs.map +1 -1
  670. package/dist/voice/io.d.cts +67 -7
  671. package/dist/voice/io.d.ts +67 -7
  672. package/dist/voice/io.d.ts.map +1 -1
  673. package/dist/voice/io.js +62 -5
  674. package/dist/voice/io.js.map +1 -1
  675. package/dist/voice/recorder_io/index.cjs +23 -0
  676. package/dist/voice/recorder_io/index.cjs.map +1 -0
  677. package/dist/voice/recorder_io/index.d.cts +2 -0
  678. package/dist/voice/recorder_io/index.d.ts +2 -0
  679. package/dist/voice/recorder_io/index.d.ts.map +1 -0
  680. package/dist/voice/recorder_io/index.js +2 -0
  681. package/dist/voice/recorder_io/index.js.map +1 -0
  682. package/dist/voice/recorder_io/recorder_io.cjs +607 -0
  683. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
  684. package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
  685. package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
  686. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
  687. package/dist/voice/recorder_io/recorder_io.js +573 -0
  688. package/dist/voice/recorder_io/recorder_io.js.map +1 -0
  689. package/dist/voice/remote_session.cjs +922 -0
  690. package/dist/voice/remote_session.cjs.map +1 -0
  691. package/dist/voice/remote_session.d.cts +108 -0
  692. package/dist/voice/remote_session.d.ts +108 -0
  693. package/dist/voice/remote_session.d.ts.map +1 -0
  694. package/dist/voice/remote_session.js +887 -0
  695. package/dist/voice/remote_session.js.map +1 -0
  696. package/dist/voice/report.cjs +88 -0
  697. package/dist/voice/report.cjs.map +1 -0
  698. package/dist/voice/report.d.cts +49 -0
  699. package/dist/voice/report.d.ts +49 -0
  700. package/dist/voice/report.d.ts.map +1 -0
  701. package/dist/voice/report.js +63 -0
  702. package/dist/voice/report.js.map +1 -0
  703. package/dist/voice/report.test.cjs +121 -0
  704. package/dist/voice/report.test.cjs.map +1 -0
  705. package/dist/voice/report.test.js +120 -0
  706. package/dist/voice/report.test.js.map +1 -0
  707. package/dist/voice/room_io/_input.cjs +40 -7
  708. package/dist/voice/room_io/_input.cjs.map +1 -1
  709. package/dist/voice/room_io/_input.d.cts +5 -2
  710. package/dist/voice/room_io/_input.d.ts +5 -2
  711. package/dist/voice/room_io/_input.d.ts.map +1 -1
  712. package/dist/voice/room_io/_input.js +41 -8
  713. package/dist/voice/room_io/_input.js.map +1 -1
  714. package/dist/voice/room_io/_output.cjs +19 -11
  715. package/dist/voice/room_io/_output.cjs.map +1 -1
  716. package/dist/voice/room_io/_output.d.cts +7 -4
  717. package/dist/voice/room_io/_output.d.ts +7 -4
  718. package/dist/voice/room_io/_output.d.ts.map +1 -1
  719. package/dist/voice/room_io/_output.js +20 -12
  720. package/dist/voice/room_io/_output.js.map +1 -1
  721. package/dist/voice/room_io/room_io.cjs +33 -6
  722. package/dist/voice/room_io/room_io.cjs.map +1 -1
  723. package/dist/voice/room_io/room_io.d.cts +29 -9
  724. package/dist/voice/room_io/room_io.d.ts +29 -9
  725. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  726. package/dist/voice/room_io/room_io.js +33 -7
  727. package/dist/voice/room_io/room_io.js.map +1 -1
  728. package/dist/voice/speech_handle.cjs +22 -4
  729. package/dist/voice/speech_handle.cjs.map +1 -1
  730. package/dist/voice/speech_handle.d.cts +17 -2
  731. package/dist/voice/speech_handle.d.ts +17 -2
  732. package/dist/voice/speech_handle.d.ts.map +1 -1
  733. package/dist/voice/speech_handle.js +21 -4
  734. package/dist/voice/speech_handle.js.map +1 -1
  735. package/dist/voice/testing/fake_llm.cjs +127 -0
  736. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  737. package/dist/voice/testing/fake_llm.d.cts +30 -0
  738. package/dist/voice/testing/fake_llm.d.ts +30 -0
  739. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  740. package/dist/voice/testing/fake_llm.js +103 -0
  741. package/dist/voice/testing/fake_llm.js.map +1 -0
  742. package/dist/voice/testing/index.cjs +57 -0
  743. package/dist/voice/testing/index.cjs.map +1 -0
  744. package/dist/voice/testing/index.d.cts +21 -0
  745. package/dist/voice/testing/index.d.ts +21 -0
  746. package/dist/voice/testing/index.d.ts.map +1 -0
  747. package/dist/voice/testing/index.js +35 -0
  748. package/dist/voice/testing/index.js.map +1 -0
  749. package/dist/voice/testing/run_result.cjs +817 -0
  750. package/dist/voice/testing/run_result.cjs.map +1 -0
  751. package/dist/voice/testing/run_result.d.cts +385 -0
  752. package/dist/voice/testing/run_result.d.ts +385 -0
  753. package/dist/voice/testing/run_result.d.ts.map +1 -0
  754. package/dist/voice/testing/run_result.js +790 -0
  755. package/dist/voice/testing/run_result.js.map +1 -0
  756. package/dist/voice/testing/types.cjs +46 -0
  757. package/dist/voice/testing/types.cjs.map +1 -0
  758. package/dist/voice/testing/types.d.cts +83 -0
  759. package/dist/voice/testing/types.d.ts +83 -0
  760. package/dist/voice/testing/types.d.ts.map +1 -0
  761. package/dist/voice/testing/types.js +19 -0
  762. package/dist/voice/testing/types.js.map +1 -0
  763. package/dist/voice/transcription/synchronizer.cjs +139 -15
  764. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  765. package/dist/voice/transcription/synchronizer.d.cts +35 -4
  766. package/dist/voice/transcription/synchronizer.d.ts +35 -4
  767. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  768. package/dist/voice/transcription/synchronizer.js +143 -16
  769. package/dist/voice/transcription/synchronizer.js.map +1 -1
  770. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  771. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  772. package/dist/voice/transcription/synchronizer.test.js +150 -0
  773. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  774. package/dist/voice/turn_config/endpointing.cjs +33 -0
  775. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  776. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  777. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  778. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  779. package/dist/voice/turn_config/endpointing.js +9 -0
  780. package/dist/voice/turn_config/endpointing.js.map +1 -0
  781. package/dist/voice/turn_config/interruption.cjs +37 -0
  782. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  783. package/dist/voice/turn_config/interruption.d.cts +53 -0
  784. package/dist/voice/turn_config/interruption.d.ts +53 -0
  785. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  786. package/dist/voice/turn_config/interruption.js +13 -0
  787. package/dist/voice/turn_config/interruption.js.map +1 -0
  788. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  789. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  790. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  791. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  792. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  793. package/dist/voice/turn_config/turn_handling.js +11 -0
  794. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  795. package/dist/voice/turn_config/utils.cjs +157 -0
  796. package/dist/voice/turn_config/utils.cjs.map +1 -0
  797. package/dist/voice/turn_config/utils.d.cts +37 -0
  798. package/dist/voice/turn_config/utils.d.ts +37 -0
  799. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  800. package/dist/voice/turn_config/utils.js +131 -0
  801. package/dist/voice/turn_config/utils.js.map +1 -0
  802. package/dist/voice/turn_config/utils.test.cjs +128 -0
  803. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  804. package/dist/voice/turn_config/utils.test.js +127 -0
  805. package/dist/voice/turn_config/utils.test.js.map +1 -0
  806. package/dist/voice/utils.cjs +47 -0
  807. package/dist/voice/utils.cjs.map +1 -0
  808. package/dist/voice/utils.d.cts +4 -0
  809. package/dist/voice/utils.d.ts +4 -0
  810. package/dist/voice/utils.d.ts.map +1 -0
  811. package/dist/voice/utils.js +23 -0
  812. package/dist/voice/utils.js.map +1 -0
  813. package/dist/worker.cjs +44 -52
  814. package/dist/worker.cjs.map +1 -1
  815. package/dist/worker.d.cts +18 -8
  816. package/dist/worker.d.ts +18 -8
  817. package/dist/worker.d.ts.map +1 -1
  818. package/dist/worker.js +43 -43
  819. package/dist/worker.js.map +1 -1
  820. package/package.json +35 -13
  821. package/resources/NOTICE +2 -0
  822. package/resources/keyboard-typing.ogg +0 -0
  823. package/resources/keyboard-typing2.ogg +0 -0
  824. package/resources/office-ambience.ogg +0 -0
  825. package/src/_exceptions.ts +5 -0
  826. package/src/audio.ts +132 -1
  827. package/src/beta/index.ts +9 -0
  828. package/src/beta/workflows/index.ts +9 -0
  829. package/src/beta/workflows/task_group.ts +203 -0
  830. package/src/cli.ts +57 -66
  831. package/src/connection_pool.test.ts +346 -0
  832. package/src/connection_pool.ts +307 -0
  833. package/src/constants.ts +14 -0
  834. package/src/cpu.test.ts +239 -0
  835. package/src/cpu.ts +173 -0
  836. package/src/http_server.ts +18 -6
  837. package/src/index.ts +15 -13
  838. package/src/inference/api_protos.ts +85 -2
  839. package/src/inference/index.ts +32 -4
  840. package/src/inference/interruption/defaults.ts +51 -0
  841. package/src/inference/interruption/errors.ts +25 -0
  842. package/src/inference/interruption/http_transport.ts +207 -0
  843. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  844. package/src/inference/interruption/interruption_detector.ts +204 -0
  845. package/src/inference/interruption/interruption_stream.ts +467 -0
  846. package/src/inference/interruption/types.ts +84 -0
  847. package/src/inference/interruption/utils.test.ts +132 -0
  848. package/src/inference/interruption/utils.ts +137 -0
  849. package/src/inference/interruption/ws_transport.ts +416 -0
  850. package/src/inference/llm.ts +214 -163
  851. package/src/inference/stt.test.ts +253 -0
  852. package/src/inference/stt.ts +449 -208
  853. package/src/inference/tts.test.ts +354 -0
  854. package/src/inference/tts.ts +417 -115
  855. package/src/inference/utils.ts +30 -2
  856. package/src/ipc/inference_proc_executor.ts +11 -3
  857. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  858. package/src/ipc/job_proc_executor.ts +11 -1
  859. package/src/ipc/job_proc_lazy_main.ts +86 -20
  860. package/src/ipc/supervised_proc.test.ts +153 -0
  861. package/src/ipc/supervised_proc.ts +39 -10
  862. package/src/job.ts +120 -1
  863. package/src/language.test.ts +62 -0
  864. package/src/language.ts +380 -0
  865. package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
  866. package/src/llm/chat_context.test.ts +787 -0
  867. package/src/llm/chat_context.ts +493 -2
  868. package/src/llm/fallback_adapter.test.ts +238 -0
  869. package/src/llm/fallback_adapter.ts +394 -0
  870. package/src/llm/index.ts +13 -0
  871. package/src/llm/llm.ts +77 -12
  872. package/src/llm/provider_format/google.test.ts +72 -1
  873. package/src/llm/provider_format/google.ts +10 -6
  874. package/src/llm/provider_format/index.ts +7 -2
  875. package/src/llm/provider_format/openai.test.ts +480 -2
  876. package/src/llm/provider_format/openai.ts +152 -21
  877. package/src/llm/provider_format/utils.ts +11 -5
  878. package/src/llm/realtime.ts +23 -2
  879. package/src/llm/remote_chat_context.ts +2 -2
  880. package/src/llm/tool_context.test.ts +210 -1
  881. package/src/llm/tool_context.ts +115 -17
  882. package/src/llm/utils.test.ts +103 -2
  883. package/src/llm/utils.ts +152 -16
  884. package/src/llm/zod-utils.test.ts +577 -0
  885. package/src/llm/zod-utils.ts +153 -0
  886. package/src/log.ts +71 -19
  887. package/src/metrics/base.ts +78 -19
  888. package/src/metrics/index.ts +12 -0
  889. package/src/metrics/model_usage.test.ts +545 -0
  890. package/src/metrics/model_usage.ts +262 -0
  891. package/src/metrics/usage_collector.ts +14 -3
  892. package/src/metrics/utils.ts +27 -7
  893. package/src/stream/deferred_stream.test.ts +3 -3
  894. package/src/stream/deferred_stream.ts +43 -11
  895. package/src/stream/index.ts +1 -0
  896. package/src/stream/multi_input_stream.test.ts +545 -0
  897. package/src/stream/multi_input_stream.ts +172 -0
  898. package/src/stream/stream_channel.test.ts +37 -0
  899. package/src/stream/stream_channel.ts +43 -3
  900. package/src/stt/stream_adapter.ts +30 -9
  901. package/src/stt/stt.ts +140 -23
  902. package/src/telemetry/index.ts +28 -0
  903. package/src/telemetry/logging.ts +55 -0
  904. package/src/telemetry/otel_http_exporter.ts +218 -0
  905. package/src/telemetry/pino_otel_transport.ts +265 -0
  906. package/src/telemetry/trace_types.ts +109 -0
  907. package/src/telemetry/traces.ts +673 -0
  908. package/src/telemetry/utils.ts +61 -0
  909. package/src/tokenize/basic/sentence.ts +3 -3
  910. package/src/tokenize/tokenizer.test.ts +4 -0
  911. package/src/transcription.ts +6 -0
  912. package/src/tts/fallback_adapter.ts +586 -0
  913. package/src/tts/index.ts +1 -0
  914. package/src/tts/stream_adapter.ts +38 -8
  915. package/src/tts/tts.ts +245 -62
  916. package/src/typed_promise.ts +67 -0
  917. package/src/types.ts +62 -33
  918. package/src/utils.test.ts +90 -10
  919. package/src/utils.ts +178 -33
  920. package/src/vad.ts +42 -18
  921. package/src/version.ts +1 -1
  922. package/src/voice/agent.test.ts +347 -2
  923. package/src/voice/agent.ts +346 -44
  924. package/src/voice/agent_activity.test.ts +194 -0
  925. package/src/voice/agent_activity.ts +1457 -388
  926. package/src/voice/agent_session.ts +817 -112
  927. package/src/voice/audio_recognition.ts +845 -70
  928. package/src/voice/audio_recognition_span.test.ts +341 -0
  929. package/src/voice/avatar/datastream_io.ts +9 -1
  930. package/src/voice/background_audio.ts +494 -0
  931. package/src/voice/events.ts +27 -7
  932. package/src/voice/generation.ts +310 -56
  933. package/src/voice/generation_tools.test.ts +268 -0
  934. package/src/voice/index.ts +17 -3
  935. package/src/voice/interruption_detection.test.ts +151 -0
  936. package/src/voice/io.ts +115 -12
  937. package/src/voice/recorder_io/index.ts +4 -0
  938. package/src/voice/recorder_io/recorder_io.ts +783 -0
  939. package/src/voice/remote_session.ts +1083 -0
  940. package/src/voice/report.test.ts +136 -0
  941. package/src/voice/report.ts +140 -0
  942. package/src/voice/room_io/_input.ts +45 -10
  943. package/src/voice/room_io/_output.ts +26 -14
  944. package/src/voice/room_io/room_io.ts +67 -22
  945. package/src/voice/speech_handle.ts +38 -6
  946. package/src/voice/testing/fake_llm.ts +138 -0
  947. package/src/voice/testing/index.ts +52 -0
  948. package/src/voice/testing/run_result.ts +995 -0
  949. package/src/voice/testing/types.ts +118 -0
  950. package/src/voice/transcription/synchronizer.test.ts +206 -0
  951. package/src/voice/transcription/synchronizer.ts +204 -19
  952. package/src/voice/turn_config/endpointing.ts +33 -0
  953. package/src/voice/turn_config/interruption.ts +56 -0
  954. package/src/voice/turn_config/turn_handling.ts +45 -0
  955. package/src/voice/turn_config/utils.test.ts +148 -0
  956. package/src/voice/turn_config/utils.ts +167 -0
  957. package/src/voice/utils.ts +29 -0
  958. package/src/worker.ts +92 -78
  959. package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
@@ -18,18 +18,24 @@ var __copyProps = (to, from, except, desc) => {
18
18
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
19
  var agent_activity_exports = {};
20
20
  __export(agent_activity_exports, {
21
- AgentActivity: () => AgentActivity
21
+ AgentActivity: () => AgentActivity,
22
+ agentActivityStorage: () => agentActivityStorage,
23
+ onEnterStorage: () => onEnterStorage
22
24
  });
23
25
  module.exports = __toCommonJS(agent_activity_exports);
24
26
  var import_mutex = require("@livekit/mutex");
27
+ var import_api = require("@opentelemetry/api");
25
28
  var import_heap_js = require("heap-js");
26
29
  var import_node_async_hooks = require("node:async_hooks");
27
30
  var import_web = require("node:stream/web");
31
+ var import_interruption_detector = require("../inference/interruption/interruption_detector.cjs");
28
32
  var import_chat_context = require("../llm/chat_context.cjs");
29
33
  var import_llm = require("../llm/index.cjs");
34
+ var import_tool_context = require("../llm/tool_context.cjs");
30
35
  var import_log = require("../log.cjs");
31
- var import_deferred_stream = require("../stream/deferred_stream.cjs");
36
+ var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
32
37
  var import_stt = require("../stt/stt.cjs");
38
+ var import_telemetry = require("../telemetry/index.cjs");
33
39
  var import_word = require("../tokenize/basic/word.cjs");
34
40
  var import_tts = require("../tts/tts.cjs");
35
41
  var import_utils = require("../utils.cjs");
@@ -40,28 +46,66 @@ var import_audio_recognition = require("./audio_recognition.cjs");
40
46
  var import_events = require("./events.cjs");
41
47
  var import_generation = require("./generation.cjs");
42
48
  var import_speech_handle = require("./speech_handle.cjs");
43
- const speechHandleStorage = new import_node_async_hooks.AsyncLocalStorage();
49
+ var import_utils2 = require("./utils.cjs");
50
+ const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
51
+ const onEnterStorage = new import_node_async_hooks.AsyncLocalStorage();
44
52
  class AgentActivity {
53
+ agent;
54
+ agentSession;
45
55
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
46
56
  started = false;
47
57
  audioRecognition;
48
58
  realtimeSession;
59
+ realtimeSpans;
60
+ // Maps response_id to OTEL span for metrics recording
49
61
  turnDetectionMode;
50
62
  logger = (0, import_log.log)();
51
- _draining = false;
63
+ _schedulingPaused = true;
64
+ _drainBlockedTasks = [];
52
65
  _currentSpeech;
53
66
  speechQueue;
54
67
  // [priority, timestamp, speechHandle]
55
68
  q_updated;
56
69
  speechTasks = /* @__PURE__ */ new Set();
57
70
  lock = new import_mutex.Mutex();
58
- audioStream = new import_deferred_stream.DeferredReadableStream();
71
+ audioStream = new import_multi_input_stream.MultiInputStream();
72
+ audioStreamId;
59
73
  // default to null as None, which maps to the default provider tool choice value
60
74
  toolChoice = null;
61
- agent;
62
- agentSession;
75
+ _preemptiveGeneration;
76
+ interruptionDetector;
77
+ isInterruptionDetectionEnabled;
78
+ isInterruptionByAudioActivityEnabled;
79
+ isDefaultInterruptionByAudioActivityEnabled;
80
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
81
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
82
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
83
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
84
+ onModelError = (ev) => this.onError(ev);
85
+ onInterruptionOverlappingSpeech = (ev) => {
86
+ this.agentSession.emit(import_events.AgentSessionEventTypes.OverlappingSpeech, ev);
87
+ };
88
+ onInterruptionMetricsCollected = (ev) => {
89
+ this.agentSession._usageCollector.collect(ev);
90
+ this.agentSession.emit(
91
+ import_events.AgentSessionEventTypes.MetricsCollected,
92
+ (0, import_events.createMetricsCollectedEvent)({ metrics: ev })
93
+ );
94
+ };
95
+ onInterruptionError = (ev) => {
96
+ const errorEvent = (0, import_events.createErrorEvent)(ev, this.interruptionDetector);
97
+ this.agentSession.emit(import_events.AgentSessionEventTypes.Error, errorEvent);
98
+ if (!ev.recoverable) {
99
+ this.agentSession._onError(ev);
100
+ this.fallbackToVadInterruption();
101
+ return;
102
+ }
103
+ this.agentSession._onError(ev);
104
+ };
63
105
  /** @internal */
64
106
  _mainTask;
107
+ _onEnterTask;
108
+ _onExitTask;
65
109
  _userTurnCompletedTask;
66
110
  constructor(agent, agentSession) {
67
111
  this.agent = agent;
@@ -73,7 +117,7 @@ class AgentActivity {
73
117
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
74
118
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
75
119
  this.logger.warn(
76
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
120
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
77
121
  );
78
122
  this.turnDetectionMode = void 0;
79
123
  }
@@ -116,89 +160,136 @@ class AgentActivity {
116
160
  );
117
161
  this.turnDetectionMode = void 0;
118
162
  }
119
- if (!this.vad && this.stt && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
163
+ if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof import_llm.LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
120
164
  this.logger.warn(
121
- "VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
165
+ "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
122
166
  );
123
167
  }
168
+ this.interruptionDetector = this.resolveInterruptionDetector();
169
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
170
+ this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
171
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
124
172
  }
125
173
  async start() {
126
174
  const unlock = await this.lock.lock();
127
175
  try {
128
- this.agent._agentActivity = this;
129
- if (this.llm instanceof import_llm.RealtimeModel) {
130
- this.realtimeSession = this.llm.session();
131
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
132
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
133
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
134
- this.realtimeSession.on(
135
- "input_audio_transcription_completed",
136
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
176
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
177
+ } finally {
178
+ unlock();
179
+ }
180
+ }
181
+ async resume() {
182
+ const unlock = await this.lock.lock();
183
+ try {
184
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
185
+ } finally {
186
+ unlock();
187
+ }
188
+ }
189
+ async _startSession(options) {
190
+ var _a, _b, _c, _d, _e;
191
+ const { spanName, runOnEnter } = options;
192
+ const startSpan = import_telemetry.tracer.startSpan({
193
+ name: spanName,
194
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
195
+ context: import_api.ROOT_CONTEXT
196
+ });
197
+ this.agent._agentActivity = this;
198
+ if (this.llm instanceof import_llm.RealtimeModel) {
199
+ this.realtimeSession = this.llm.session();
200
+ this.realtimeSpans = /* @__PURE__ */ new Map();
201
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
202
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
203
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
204
+ this.realtimeSession.on(
205
+ "input_audio_transcription_completed",
206
+ this.onRealtimeInputAudioTranscriptionCompleted
207
+ );
208
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
209
+ this.realtimeSession.on("error", this.onModelError);
210
+ (0, import_generation.removeInstructions)(this.agent._chatCtx);
211
+ try {
212
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
213
+ } catch (error) {
214
+ this.logger.error(error, "failed to update the instructions");
215
+ }
216
+ try {
217
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
218
+ } catch (error) {
219
+ this.logger.error(error, "failed to update the chat context");
220
+ }
221
+ try {
222
+ await this.realtimeSession.updateTools(this.tools);
223
+ } catch (error) {
224
+ this.logger.error(error, "failed to update the tools");
225
+ }
226
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
227
+ this.logger.error(
228
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
137
229
  );
138
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
139
- this.realtimeSession.on("error", (ev) => this.onError(ev));
140
- (0, import_generation.removeInstructions)(this.agent._chatCtx);
141
- try {
142
- await this.realtimeSession.updateInstructions(this.agent.instructions);
143
- } catch (error) {
144
- this.logger.error(error, "failed to update the instructions");
145
- }
146
- try {
147
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
148
- } catch (error) {
149
- this.logger.error(error, "failed to update the chat context");
150
- }
151
- try {
152
- await this.realtimeSession.updateTools(this.tools);
153
- } catch (error) {
154
- this.logger.error(error, "failed to update the tools");
155
- }
156
- } else if (this.llm instanceof import_llm.LLM) {
157
- try {
158
- (0, import_generation.updateInstructions)({
159
- chatCtx: this.agent._chatCtx,
160
- instructions: this.agent.instructions,
161
- addIfMissing: true
162
- });
163
- } catch (error) {
164
- this.logger.error("failed to update the instructions", error);
165
- }
166
230
  }
167
- if (this.llm instanceof import_llm.LLM) {
168
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
169
- this.llm.on("error", (ev) => this.onError(ev));
170
- }
171
- if (this.stt instanceof import_stt.STT) {
172
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
173
- this.stt.on("error", (ev) => this.onError(ev));
174
- }
175
- if (this.tts instanceof import_tts.TTS) {
176
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
177
- this.tts.on("error", (ev) => this.onError(ev));
178
- }
179
- if (this.vad instanceof import_vad.VAD) {
180
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
181
- }
182
- this.audioRecognition = new import_audio_recognition.AudioRecognition({
183
- recognitionHooks: this,
184
- // Disable stt node if stt is not provided
185
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
186
- vad: this.vad,
187
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
188
- turnDetectionMode: this.turnDetectionMode,
189
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
190
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay
191
- });
192
- this.audioRecognition.start();
193
- this.started = true;
194
- this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
195
- this.createSpeechTask({
196
- task: import_utils.Task.from(() => this.agent.onEnter()),
231
+ } else if (this.llm instanceof import_llm.LLM) {
232
+ try {
233
+ (0, import_generation.updateInstructions)({
234
+ chatCtx: this.agent._chatCtx,
235
+ instructions: this.agent.instructions,
236
+ addIfMissing: true
237
+ });
238
+ } catch (error) {
239
+ this.logger.error("failed to update the instructions", error);
240
+ }
241
+ }
242
+ if (this.llm instanceof import_llm.LLM) {
243
+ this.llm.on("metrics_collected", this.onMetricsCollected);
244
+ this.llm.on("error", this.onModelError);
245
+ }
246
+ if (this.stt instanceof import_stt.STT) {
247
+ this.stt.on("metrics_collected", this.onMetricsCollected);
248
+ this.stt.on("error", this.onModelError);
249
+ }
250
+ if (this.tts instanceof import_tts.TTS) {
251
+ this.tts.on("metrics_collected", this.onMetricsCollected);
252
+ this.tts.on("error", this.onModelError);
253
+ }
254
+ if (this.vad instanceof import_vad.VAD) {
255
+ this.vad.on("metrics_collected", this.onMetricsCollected);
256
+ }
257
+ this.audioRecognition = new import_audio_recognition.AudioRecognition({
258
+ recognitionHooks: this,
259
+ // Disable stt node if stt is not provided
260
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
261
+ vad: this.vad,
262
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
263
+ turnDetectionMode: this.turnDetectionMode,
264
+ interruptionDetection: this.interruptionDetector,
265
+ minEndpointingDelay: ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.endpointing) == null ? void 0 : _b.minDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
266
+ maxEndpointingDelay: ((_d = (_c = this.agent.turnHandling) == null ? void 0 : _c.endpointing) == null ? void 0 : _d.maxDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
267
+ rootSpanContext: this.agentSession.rootSpanContext,
268
+ sttModel: (_e = this.stt) == null ? void 0 : _e.label,
269
+ sttProvider: this.getSttProvider(),
270
+ getLinkedParticipant: () => {
271
+ var _a2;
272
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
273
+ }
274
+ });
275
+ this.audioRecognition.start();
276
+ this.started = true;
277
+ this._resumeSchedulingTask();
278
+ if (runOnEnter) {
279
+ this._onEnterTask = this.createSpeechTask({
280
+ taskFn: () => onEnterStorage.run(
281
+ { session: this.agentSession, agent: this.agent },
282
+ () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
283
+ name: "on_enter",
284
+ context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
285
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
286
+ })
287
+ ),
288
+ inlineTask: true,
197
289
  name: "AgentActivity_onEnter"
198
290
  });
199
- } finally {
200
- unlock();
201
291
  }
292
+ startSpan.end();
202
293
  }
203
294
  get currentSpeech() {
204
295
  return this._currentSpeech;
@@ -209,6 +300,15 @@ class AgentActivity {
209
300
  get stt() {
210
301
  return this.agent.stt || this.agentSession.stt;
211
302
  }
303
+ getSttProvider() {
304
+ var _a;
305
+ const label = (_a = this.stt) == null ? void 0 : _a.label;
306
+ if (!label) {
307
+ return void 0;
308
+ }
309
+ const [provider] = label.split("-", 1);
310
+ return provider || label;
311
+ }
212
312
  get llm() {
213
313
  return this.agent.llm || this.agentSession.llm;
214
314
  }
@@ -218,21 +318,46 @@ class AgentActivity {
218
318
  get tools() {
219
319
  return this.agent.toolCtx;
220
320
  }
221
- get draining() {
222
- return this._draining;
321
+ get schedulingPaused() {
322
+ return this._schedulingPaused;
223
323
  }
224
324
  get realtimeLLMSession() {
225
325
  return this.realtimeSession;
226
326
  }
227
327
  get allowInterruptions() {
228
- return this.agentSession.options.allowInterruptions;
328
+ var _a, _b;
329
+ return ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.enabled) ?? this.agentSession.sessionOptions.turnHandling.interruption.enabled;
330
+ }
331
+ get useTtsAlignedTranscript() {
332
+ return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
229
333
  }
230
334
  get turnDetection() {
231
- return this.agentSession.turnDetection;
335
+ var _a;
336
+ return ((_a = this.agent.turnHandling) == null ? void 0 : _a.turnDetection) ?? this.agentSession.turnDetection;
232
337
  }
338
+ get turnHandling() {
339
+ return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
340
+ }
341
+ // get minEndpointingDelay(): number {
342
+ // return (
343
+ // this.agent.turnHandling?.endpointing?.minDelay ??
344
+ // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
345
+ // );
346
+ // }
347
+ // get maxEndpointingDelay(): number {
348
+ // return (
349
+ // this.agent.turnHandling?.endpointing?.maxDelay ??
350
+ // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
351
+ // );
352
+ // }
233
353
  get toolCtx() {
234
354
  return this.agent.toolCtx;
235
355
  }
356
+ /** @internal */
357
+ get inputStartedAt() {
358
+ var _a;
359
+ return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
360
+ }
236
361
  async updateChatCtx(chatCtx) {
237
362
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
238
363
  this.agent._chatCtx = chatCtx;
@@ -247,36 +372,79 @@ class AgentActivity {
247
372
  });
248
373
  }
249
374
  }
250
- updateOptions({ toolChoice }) {
375
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
376
+ async updateTools(tools) {
377
+ this.agent._tools = { ...tools };
378
+ if (this.realtimeSession) {
379
+ await this.realtimeSession.updateTools(tools);
380
+ }
381
+ if (this.llm instanceof import_llm.LLM) {
382
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
383
+ }
384
+ }
385
+ updateOptions({
386
+ toolChoice,
387
+ turnDetection
388
+ }) {
251
389
  if (toolChoice !== void 0) {
252
390
  this.toolChoice = toolChoice;
253
391
  }
254
392
  if (this.realtimeSession) {
255
393
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
256
394
  }
395
+ if (turnDetection !== void 0) {
396
+ this.turnDetectionMode = turnDetection;
397
+ this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
398
+ if (this.agentSession.agentState !== "speaking") {
399
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
400
+ }
401
+ }
402
+ if (this.audioRecognition) {
403
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
404
+ }
257
405
  }
258
406
  attachAudioInput(audioStream) {
259
- if (this.audioStream.isSourceSet) {
260
- this.logger.debug("detaching existing audio input in agent activity");
261
- this.audioStream.detachSource();
262
- }
263
- this.audioStream.setSource(audioStream);
264
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
265
- if (this.realtimeSession) {
407
+ void this.audioStream.close();
408
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
409
+ const aecWarmupAudioFilter = new import_web.TransformStream({
410
+ transform: (frame, controller) => {
411
+ const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
412
+ if (!shouldDiscardForAecWarmup) {
413
+ controller.enqueue(frame);
414
+ }
415
+ }
416
+ });
417
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
418
+ if (this.realtimeSession && this.audioRecognition) {
419
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
266
420
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
267
- }
268
- if (this.audioRecognition) {
269
421
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
422
+ } else if (this.realtimeSession) {
423
+ this.realtimeSession.setInputAudioStream(
424
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
425
+ );
426
+ } else if (this.audioRecognition) {
427
+ this.audioRecognition.setInputAudioStream(
428
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
429
+ );
270
430
  }
271
431
  }
272
432
  detachAudioInput() {
273
- this.audioStream.detachSource();
433
+ if (this.audioStreamId === void 0) {
434
+ return;
435
+ }
436
+ void this.audioStream.close();
437
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
438
+ this.audioStreamId = void 0;
274
439
  }
275
- commitUserTurn() {
440
+ commitUserTurn(options = {}) {
441
+ const { audioDetached = false, throwIfNotReady = true } = options;
276
442
  if (!this.audioRecognition) {
277
- throw new Error("AudioRecognition is not initialized");
443
+ if (throwIfNotReady) {
444
+ throw new Error("AudioRecognition is not initialized");
445
+ }
446
+ return;
278
447
  }
279
- const audioDetached = false;
280
448
  this.audioRecognition.commitUserTurn(audioDetached);
281
449
  }
282
450
  clearUserTurn() {
@@ -312,22 +480,28 @@ class AgentActivity {
312
480
  })
313
481
  );
314
482
  const task = this.createSpeechTask({
315
- task: import_utils.Task.from(
316
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
317
- ),
483
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
318
484
  ownedSpeechHandle: handle,
319
485
  name: "AgentActivity.say_tts"
320
486
  });
321
- task.finally(() => this.onPipelineReplyDone());
487
+ task.result.finally(() => this.onPipelineReplyDone());
322
488
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
323
489
  return handle;
324
490
  }
325
491
  // -- Metrics and errors --
326
492
  onMetricsCollected = (ev) => {
327
- const speechHandle = speechHandleStorage.getStore();
493
+ const speechHandle = import_agent.speechHandleStorage.getStore();
328
494
  if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
329
495
  ev.speechId = speechHandle.id;
330
496
  }
497
+ if (ev.type === "realtime_model_metrics" && this.realtimeSpans) {
498
+ const span = this.realtimeSpans.get(ev.requestId);
499
+ if (span) {
500
+ (0, import_telemetry.recordRealtimeMetrics)(span, ev);
501
+ this.realtimeSpans.delete(ev.requestId);
502
+ }
503
+ }
504
+ this.agentSession._usageCollector.collect(ev);
331
505
  this.agentSession.emit(
332
506
  import_events.AgentSessionEventTypes.MetricsCollected,
333
507
  (0, import_events.createMetricsCollectedEvent)({ metrics: ev })
@@ -354,6 +528,13 @@ class AgentActivity {
354
528
  this.logger.info("onInputSpeechStarted");
355
529
  if (!this.vad) {
356
530
  this.agentSession._updateUserState("speaking");
531
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
532
+ this.audioRecognition.onStartOfOverlapSpeech(
533
+ 0,
534
+ Date.now(),
535
+ this.agentSession._userSpeakingSpan
536
+ );
537
+ }
357
538
  }
358
539
  try {
359
540
  this.interrupt();
@@ -367,6 +548,9 @@ class AgentActivity {
367
548
  onInputSpeechStopped(ev) {
368
549
  this.logger.info(ev, "onInputSpeechStopped");
369
550
  if (!this.vad) {
551
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
552
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
553
+ }
370
554
  this.agentSession._updateUserState("listening");
371
555
  }
372
556
  if (ev.userTranscriptionEnabled) {
@@ -401,8 +585,8 @@ class AgentActivity {
401
585
  if (ev.userInitiated) {
402
586
  return;
403
587
  }
404
- if (this.draining) {
405
- this.logger.warn("skipping new realtime generation, the agent is draining");
588
+ if (this.schedulingPaused) {
589
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
406
590
  return;
407
591
  }
408
592
  const handle = import_speech_handle.SpeechHandle.create({
@@ -418,45 +602,91 @@ class AgentActivity {
418
602
  );
419
603
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
420
604
  this.createSpeechTask({
421
- task: import_utils.Task.from(
422
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
423
- ),
605
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
424
606
  ownedSpeechHandle: handle,
425
607
  name: "AgentActivity.realtimeGeneration"
426
608
  });
427
609
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
428
610
  }
429
611
  // recognition hooks
430
- onStartOfSpeech(_ev) {
431
- this.agentSession._updateUserState("speaking");
612
+ onStartOfSpeech(ev) {
613
+ let speechStartTime = Date.now();
614
+ if (ev) {
615
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
616
+ }
617
+ this.agentSession._updateUserState("speaking", {
618
+ lastSpeakingTime: speechStartTime,
619
+ otelContext: import_api.context.active()
620
+ });
621
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
622
+ this.audioRecognition.onStartOfOverlapSpeech(
623
+ ev.speechDuration,
624
+ speechStartTime,
625
+ this.agentSession._userSpeakingSpan
626
+ );
627
+ }
432
628
  }
433
- onEndOfSpeech(_ev) {
434
- this.agentSession._updateUserState("listening");
629
+ onEndOfSpeech(ev) {
630
+ let speechEndTime = Date.now();
631
+ if (ev) {
632
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
633
+ }
634
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
635
+ this.audioRecognition.onEndOfOverlapSpeech(
636
+ speechEndTime,
637
+ this.agentSession._userSpeakingSpan
638
+ );
639
+ }
640
+ this.agentSession._updateUserState("listening", {
641
+ lastSpeakingTime: speechEndTime,
642
+ otelContext: import_api.context.active()
643
+ });
435
644
  }
436
645
  onVADInferenceDone(ev) {
437
- var _a, _b;
646
+ var _a;
438
647
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
439
648
  return;
440
649
  }
441
- if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
650
+ if (ev.speechDuration >= ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
651
+ this.interruptByAudioActivity();
652
+ }
653
+ }
654
+ interruptByAudioActivity() {
655
+ var _a, _b, _c, _d;
656
+ if (!this.isInterruptionByAudioActivityEnabled) {
442
657
  return;
443
658
  }
444
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
659
+ if (this.agentSession._aecWarmupRemaining > 0) {
660
+ return;
661
+ }
662
+ if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.turnDetection) {
445
663
  return;
446
664
  }
447
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
665
+ if (this.stt && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
448
666
  const text = this.audioRecognition.currentTranscript;
449
- if (text && (0, import_word.splitWords)(text, true).length < this.agentSession.options.minInterruptionWords) {
667
+ const normalizedText = text ?? "";
668
+ const wordCount = (0, import_word.splitWords)(normalizedText, true).length;
669
+ if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
450
670
  return;
451
671
  }
452
672
  }
453
- (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
673
+ (_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
454
674
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
455
- this.logger.info({ "speech id": this._currentSpeech.id }, "speech interrupted by VAD");
456
- (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
675
+ this.logger.info(
676
+ { "speech id": this._currentSpeech.id },
677
+ "speech interrupted by audio activity"
678
+ );
679
+ (_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
457
680
  this._currentSpeech.interrupt();
458
681
  }
459
682
  }
683
+ onInterruption(ev) {
684
+ this.restoreInterruptionByAudioActivity();
685
+ this.interruptByAudioActivity();
686
+ if (this.audioRecognition) {
687
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
688
+ }
689
+ }
460
690
  onInterimTranscript(ev) {
461
691
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
462
692
  return;
@@ -465,10 +695,14 @@ class AgentActivity {
465
695
  import_events.AgentSessionEventTypes.UserInputTranscribed,
466
696
  (0, import_events.createUserInputTranscribedEvent)({
467
697
  transcript: ev.alternatives[0].text,
468
- isFinal: false
698
+ isFinal: false,
699
+ language: ev.alternatives[0].language
469
700
  // TODO(AJS-106): add multi participant support
470
701
  })
471
702
  );
703
+ if (ev.alternatives[0].text) {
704
+ this.interruptByAudioActivity();
705
+ }
472
706
  }
473
707
  onFinalTranscript(ev) {
474
708
  if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.userTranscription) {
@@ -478,13 +712,70 @@ class AgentActivity {
478
712
  import_events.AgentSessionEventTypes.UserInputTranscribed,
479
713
  (0, import_events.createUserInputTranscribedEvent)({
480
714
  transcript: ev.alternatives[0].text,
481
- isFinal: true
715
+ isFinal: true,
716
+ language: ev.alternatives[0].language
482
717
  // TODO(AJS-106): add multi participant support
483
718
  })
484
719
  );
720
+ if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
721
+ this.interruptByAudioActivity();
722
+ }
723
+ }
724
+ onPreemptiveGeneration(info) {
725
+ if (!this.agentSession.sessionOptions.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
726
+ return;
727
+ }
728
+ this.cancelPreemptiveGeneration();
729
+ this.logger.info(
730
+ {
731
+ newTranscript: info.newTranscript,
732
+ transcriptConfidence: info.transcriptConfidence
733
+ },
734
+ "starting preemptive generation"
735
+ );
736
+ const userMessage = import_chat_context.ChatMessage.create({
737
+ role: "user",
738
+ content: info.newTranscript,
739
+ transcriptConfidence: info.transcriptConfidence
740
+ });
741
+ const chatCtx = this.agent.chatCtx.copy();
742
+ const speechHandle = this.generateReply({
743
+ userMessage,
744
+ chatCtx,
745
+ scheduleSpeech: false
746
+ });
747
+ this._preemptiveGeneration = {
748
+ speechHandle,
749
+ userMessage,
750
+ info,
751
+ chatCtx: chatCtx.copy(),
752
+ tools: { ...this.tools },
753
+ toolChoice: this.toolChoice,
754
+ createdAt: Date.now()
755
+ };
756
+ }
757
+ cancelPreemptiveGeneration() {
758
+ if (this._preemptiveGeneration !== void 0) {
759
+ this._preemptiveGeneration.speechHandle._cancel();
760
+ this._preemptiveGeneration = void 0;
761
+ }
485
762
  }
486
763
  createSpeechTask(options) {
487
- const { task, ownedSpeechHandle } = options;
764
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
765
+ const wrappedFn = (ctrl) => {
766
+ return agentActivityStorage.run(this, () => {
767
+ const currentTask = import_utils.Task.current();
768
+ if (currentTask) {
769
+ (0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
770
+ }
771
+ if (ownedSpeechHandle) {
772
+ return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
773
+ }
774
+ return taskFn(ctrl);
775
+ });
776
+ };
777
+ const task = import_utils.Task.from(wrappedFn, controller, name);
778
+ (0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
488
779
  this.speechTasks.add(task);
489
780
  task.addDoneCallback(() => {
490
781
  this.speechTasks.delete(task);
@@ -500,20 +791,35 @@ class AgentActivity {
500
791
  task.addDoneCallback(() => {
501
792
  this.wakeupMainTask();
502
793
  });
503
- return task.result;
794
+ return task;
504
795
  }
505
796
  async onEndOfTurn(info) {
506
- if (this.draining) {
507
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
797
+ var _a, _b;
798
+ if (this.schedulingPaused) {
799
+ this.cancelPreemptiveGeneration();
800
+ this.logger.warn(
801
+ { user_input: info.newTranscript },
802
+ "skipping user input, speech scheduling is paused"
803
+ );
508
804
  return true;
509
805
  }
510
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
511
- this.logger.info("skipping user input, new_transcript is too short");
512
- return false;
806
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
807
+ const wordCount = (0, import_word.splitWords)(info.newTranscript, true).length;
808
+ if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
809
+ this.cancelPreemptiveGeneration();
810
+ this.logger.info(
811
+ {
812
+ wordCount,
813
+ minInterruptionWords: this.agentSession.sessionOptions.turnHandling.interruption.minWords
814
+ },
815
+ "skipping user input, word count below minimum interruption threshold"
816
+ );
817
+ return false;
818
+ }
513
819
  }
514
820
  const oldTask = this._userTurnCompletedTask;
515
821
  this._userTurnCompletedTask = this.createSpeechTask({
516
- task: import_utils.Task.from(() => this.userTurnCompleted(info, oldTask)),
822
+ taskFn: () => this.userTurnCompleted(info, oldTask),
517
823
  name: "AgentActivity.userTurnCompleted"
518
824
  });
519
825
  return true;
@@ -538,19 +844,49 @@ class AgentActivity {
538
844
  throw new Error("Speech queue is empty");
539
845
  }
540
846
  const speechHandle = heapItem[2];
847
+ if (speechHandle.interrupted || speechHandle.done()) {
848
+ continue;
849
+ }
541
850
  this._currentSpeech = speechHandle;
542
851
  speechHandle._authorizeGeneration();
543
- await speechHandle._waitForGeneration();
852
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
544
853
  this._currentSpeech = void 0;
545
854
  }
546
- if (this.draining && this.speechTasks.size === 0) {
547
- this.logger.info("mainTask: draining and no more speech tasks");
855
+ const toWait = this.getDrainPendingSpeechTasks();
856
+ if (this._schedulingPaused && toWait.length === 0) {
857
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
548
858
  break;
549
859
  }
550
860
  this.q_updated = new import_utils.Future();
551
861
  }
552
862
  this.logger.info("AgentActivity mainTask: exiting");
553
863
  }
864
+ getDrainPendingSpeechTasks() {
865
+ const blockedHandles = [];
866
+ for (const task of this._drainBlockedTasks) {
867
+ const info = (0, import_agent._getActivityTaskInfo)(task);
868
+ if (!info) {
869
+ this.logger.error("blocked task without activity info; skipping.");
870
+ continue;
871
+ }
872
+ if (!info.speechHandle) {
873
+ continue;
874
+ }
875
+ blockedHandles.push(info.speechHandle);
876
+ }
877
+ const toWait = [];
878
+ for (const task of this.speechTasks) {
879
+ if (this._drainBlockedTasks.includes(task)) {
880
+ continue;
881
+ }
882
+ const info = (0, import_agent._getActivityTaskInfo)(task);
883
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
884
+ continue;
885
+ }
886
+ toWait.push(task);
887
+ }
888
+ return toWait;
889
+ }
554
890
  wakeupMainTask() {
555
891
  this.q_updated.resolve();
556
892
  }
@@ -561,7 +897,8 @@ class AgentActivity {
561
897
  chatCtx,
562
898
  instructions: defaultInstructions,
563
899
  toolChoice: defaultToolChoice,
564
- allowInterruptions: defaultAllowInterruptions
900
+ allowInterruptions: defaultAllowInterruptions,
901
+ scheduleSpeech = true
565
902
  } = options;
566
903
  let instructions = defaultInstructions;
567
904
  let toolChoice = defaultToolChoice;
@@ -575,7 +912,7 @@ class AgentActivity {
575
912
  if (this.llm === void 0) {
576
913
  throw new Error("trying to generate reply without an LLM model");
577
914
  }
578
- const functionCall = (_a = import_agent.asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
915
+ const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
579
916
  if (toolChoice === void 0 && functionCall !== void 0) {
580
917
  toolChoice = "none";
581
918
  }
@@ -593,19 +930,17 @@ class AgentActivity {
593
930
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
594
931
  if (this.llm instanceof import_llm.RealtimeModel) {
595
932
  this.createSpeechTask({
596
- task: import_utils.Task.from(
597
- (abortController) => this.realtimeReplyTask({
598
- speechHandle: handle,
599
- // TODO(brian): support llm.ChatMessage for the realtime model
600
- userInput: userMessage == null ? void 0 : userMessage.textContent,
601
- instructions,
602
- modelSettings: {
603
- // isGiven(toolChoice) = toolChoice !== undefined
604
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
605
- },
606
- abortController
607
- })
608
- ),
933
+ taskFn: (abortController) => this.realtimeReplyTask({
934
+ speechHandle: handle,
935
+ // TODO(brian): support llm.ChatMessage for the realtime model
936
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
937
+ instructions,
938
+ modelSettings: {
939
+ // isGiven(toolChoice) = toolChoice !== undefined
940
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
941
+ },
942
+ abortController
943
+ }),
609
944
  ownedSpeechHandle: handle,
610
945
  name: "AgentActivity.realtimeReply"
611
946
  });
@@ -614,39 +949,56 @@ class AgentActivity {
614
949
  instructions = `${this.agent.instructions}
615
950
  ${instructions}`;
616
951
  }
952
+ const onEnterData = onEnterStorage.getStore();
953
+ const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
954
+ const tools = shouldFilterTools ? Object.fromEntries(
955
+ Object.entries(this.agent.toolCtx).filter(
956
+ ([, fnTool]) => !(fnTool.flags & import_llm.ToolFlag.IGNORE_ON_ENTER)
957
+ )
958
+ ) : this.agent.toolCtx;
617
959
  const task = this.createSpeechTask({
618
- task: import_utils.Task.from(
619
- (abortController) => this.pipelineReplyTask(
620
- handle,
621
- chatCtx ?? this.agent.chatCtx,
622
- this.agent.toolCtx,
623
- {
624
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
625
- },
626
- abortController,
627
- instructions ? `${this.agent.instructions}
628
- ${instructions}` : instructions,
629
- userMessage
630
- )
960
+ taskFn: (abortController) => this.pipelineReplyTask(
961
+ handle,
962
+ chatCtx ?? this.agent.chatCtx,
963
+ tools,
964
+ {
965
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
966
+ },
967
+ abortController,
968
+ instructions,
969
+ userMessage
631
970
  ),
632
971
  ownedSpeechHandle: handle,
633
972
  name: "AgentActivity.pipelineReply"
634
973
  });
635
- task.finally(() => this.onPipelineReplyDone());
974
+ task.result.finally(() => this.onPipelineReplyDone());
975
+ }
976
+ if (scheduleSpeech) {
977
+ this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
636
978
  }
637
- this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
638
979
  return handle;
639
980
  }
640
- interrupt() {
981
+ interrupt(options = {}) {
641
982
  var _a;
983
+ const { force = false } = options;
984
+ this.cancelPreemptiveGeneration();
642
985
  const future = new import_utils.Future();
643
986
  const currentSpeech = this._currentSpeech;
644
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
987
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
645
988
  for (const [_, __, speech] of this.speechQueue) {
646
- speech.interrupt();
989
+ speech.interrupt(force);
647
990
  }
648
991
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
649
- if (currentSpeech === void 0) {
992
+ if (force) {
993
+ for (const task of this.speechTasks) {
994
+ task.cancel();
995
+ }
996
+ if (currentSpeech && !currentSpeech.done()) {
997
+ currentSpeech._markDone();
998
+ }
999
+ this.speechQueue.clear();
1000
+ future.resolve();
1001
+ } else if (currentSpeech === void 0) {
650
1002
  future.resolve();
651
1003
  } else {
652
1004
  currentSpeech.addDoneCallback(() => {
@@ -664,7 +1016,7 @@ ${instructions}` : instructions,
664
1016
  async userTurnCompleted(info, oldTask) {
665
1017
  var _a, _b;
666
1018
  if (oldTask) {
667
- await oldTask;
1019
+ await oldTask.result;
668
1020
  }
669
1021
  if (this.llm instanceof import_llm.RealtimeModel) {
670
1022
  if (this.llm.capabilities.turnDetection) {
@@ -689,7 +1041,8 @@ ${instructions}` : instructions,
689
1041
  }
690
1042
  let userMessage = import_chat_context.ChatMessage.create({
691
1043
  role: "user",
692
- content: info.newTranscript
1044
+ content: info.newTranscript,
1045
+ transcriptConfidence: info.transcriptConfidence
693
1046
  });
694
1047
  const chatCtx = this.agent.chatCtx.copy();
695
1048
  const startTime = Date.now();
@@ -707,13 +1060,57 @@ ${instructions}` : instructions,
707
1060
  } else if (this.llm === void 0) {
708
1061
  return;
709
1062
  }
710
- const speechHandle = this.generateReply({ userMessage, chatCtx });
1063
+ const userMetricsReport = {};
1064
+ if (info.startedSpeakingAt !== void 0) {
1065
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
1066
+ }
1067
+ if (info.stoppedSpeakingAt !== void 0) {
1068
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
1069
+ }
1070
+ if (info.transcriptionDelay !== void 0) {
1071
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
1072
+ }
1073
+ if (info.endOfUtteranceDelay !== void 0) {
1074
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
1075
+ }
1076
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
1077
+ if (userMessage) {
1078
+ userMessage.metrics = userMetricsReport;
1079
+ }
1080
+ let speechHandle;
1081
+ if (this._preemptiveGeneration !== void 0) {
1082
+ const preemptive = this._preemptiveGeneration;
1083
+ if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && (0, import_tool_context.isSameToolContext)(preemptive.tools, this.tools) && (0, import_tool_context.isSameToolChoice)(preemptive.toolChoice, this.toolChoice)) {
1084
+ speechHandle = preemptive.speechHandle;
1085
+ if (preemptive.userMessage && userMessage) {
1086
+ preemptive.userMessage.metrics = userMetricsReport;
1087
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1088
+ }
1089
+ this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
1090
+ this.logger.debug(
1091
+ {
1092
+ preemptiveLeadTime: Date.now() - preemptive.createdAt
1093
+ },
1094
+ "using preemptive generation"
1095
+ );
1096
+ } else {
1097
+ this.logger.warn(
1098
+ "preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`"
1099
+ );
1100
+ preemptive.speechHandle._cancel();
1101
+ }
1102
+ this._preemptiveGeneration = void 0;
1103
+ }
1104
+ if (speechHandle === void 0) {
1105
+ speechHandle = this.generateReply({ userMessage, chatCtx });
1106
+ }
711
1107
  const eouMetrics = {
712
1108
  type: "eou_metrics",
713
1109
  timestamp: Date.now(),
714
- endOfUtteranceDelay: info.endOfUtteranceDelay,
715
- transcriptionDelay: info.transcriptionDelay,
716
- onUserTurnCompletedDelay: callbackDuration,
1110
+ endOfUtteranceDelayMs: info.endOfUtteranceDelay,
1111
+ transcriptionDelayMs: info.transcriptionDelay,
1112
+ onUserTurnCompletedDelayMs: callbackDuration,
1113
+ lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
717
1114
  speechId: speechHandle.id
718
1115
  };
719
1116
  this.agentSession.emit(
@@ -722,7 +1119,9 @@ ${instructions}` : instructions,
722
1119
  );
723
1120
  }
724
1121
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
725
- speechHandleStorage.enterWith(speechHandle);
1122
+ var _a, _b;
1123
+ speechHandle._agentTurnContext = import_api.context.active();
1124
+ import_agent.speechHandleStorage.enterWith(speechHandle);
726
1125
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
727
1126
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
728
1127
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
@@ -753,25 +1152,38 @@ ${instructions}` : instructions,
753
1152
  textOut = _textOut;
754
1153
  tasks.push(textForwardTask);
755
1154
  }
756
- const onFirstFrame = () => {
757
- this.agentSession._updateAgentState("speaking");
1155
+ let replyStartedSpeakingAt;
1156
+ let replyTtsGenData = null;
1157
+ const onFirstFrame = (startedSpeakingAt) => {
1158
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1159
+ this.agentSession._updateAgentState("speaking", {
1160
+ startTime: startedSpeakingAt,
1161
+ otelContext: speechHandle._agentTurnContext
1162
+ });
1163
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1164
+ this.audioRecognition.onStartOfAgentSpeech();
1165
+ this.isInterruptionByAudioActivityEnabled = false;
1166
+ }
758
1167
  };
759
1168
  if (!audioOutput) {
760
1169
  if (textOut) {
761
- textOut.firstTextFut.await.finally(onFirstFrame);
1170
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
762
1171
  }
763
1172
  } else {
764
1173
  let audioOut = null;
765
1174
  if (!audio) {
766
- const [ttsTask, ttsStream] = (0, import_generation.performTTSInference)(
1175
+ const [ttsTask, ttsGenData] = (0, import_generation.performTTSInference)(
767
1176
  (...args) => this.agent.ttsNode(...args),
768
1177
  audioSource,
769
1178
  modelSettings,
770
- replyAbortController
1179
+ replyAbortController,
1180
+ (_a = this.tts) == null ? void 0 : _a.model,
1181
+ (_b = this.tts) == null ? void 0 : _b.provider
771
1182
  );
772
1183
  tasks.push(ttsTask);
1184
+ replyTtsGenData = ttsGenData;
773
1185
  const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
774
- ttsStream,
1186
+ ttsGenData.audioStream,
775
1187
  audioOutput,
776
1188
  replyAbortController
777
1189
  );
@@ -786,7 +1198,7 @@ ${instructions}` : instructions,
786
1198
  tasks.push(forwardTask);
787
1199
  audioOut = _audioOut;
788
1200
  }
789
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1201
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
790
1202
  }
791
1203
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
792
1204
  if (audioOutput) {
@@ -801,28 +1213,63 @@ ${instructions}` : instructions,
801
1213
  }
802
1214
  }
803
1215
  if (addToChatCtx) {
1216
+ const replyStoppedSpeakingAt = Date.now();
1217
+ const replyAssistantMetrics = {};
1218
+ if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
1219
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1220
+ }
1221
+ if (replyStartedSpeakingAt !== void 0) {
1222
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
1223
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
1224
+ }
804
1225
  const message = import_chat_context.ChatMessage.create({
805
1226
  role: "assistant",
806
1227
  content: (textOut == null ? void 0 : textOut.text) || "",
807
- interrupted: speechHandle.interrupted
1228
+ interrupted: speechHandle.interrupted,
1229
+ metrics: replyAssistantMetrics
808
1230
  });
809
1231
  this.agent._chatCtx.insert(message);
810
1232
  this.agentSession._conversationItemAdded(message);
811
1233
  }
812
1234
  if (this.agentSession.agentState === "speaking") {
813
1235
  this.agentSession._updateAgentState("listening");
1236
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1237
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1238
+ }
1239
+ this.restoreInterruptionByAudioActivity();
814
1240
  }
815
1241
  }
816
- async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) {
817
- var _a, _b, _c;
818
- speechHandleStorage.enterWith(speechHandle);
1242
+ _pipelineReplyTaskImpl = async ({
1243
+ speechHandle,
1244
+ chatCtx,
1245
+ toolCtx,
1246
+ modelSettings,
1247
+ replyAbortController,
1248
+ instructions,
1249
+ newMessage,
1250
+ toolsMessages,
1251
+ span,
1252
+ _previousUserMetrics
1253
+ }) => {
1254
+ var _a, _b, _c, _d, _e, _f;
1255
+ speechHandle._agentTurnContext = import_api.context.active();
1256
+ span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1257
+ if (instructions) {
1258
+ span.setAttribute(import_telemetry.traceTypes.ATTR_INSTRUCTIONS, instructions);
1259
+ }
1260
+ if (newMessage) {
1261
+ span.setAttribute(import_telemetry.traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
1262
+ }
1263
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1264
+ if (localParticipant) {
1265
+ (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
1266
+ }
1267
+ import_agent.speechHandleStorage.enterWith(speechHandle);
819
1268
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
820
1269
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
821
1270
  chatCtx = chatCtx.copy();
822
1271
  if (newMessage) {
823
1272
  chatCtx.insert(newMessage);
824
- this.agent._chatCtx.insert(newMessage);
825
- this.agentSession._conversationItemAdded(newMessage);
826
1273
  }
827
1274
  if (instructions) {
828
1275
  try {
@@ -835,7 +1282,6 @@ ${instructions}` : instructions,
835
1282
  this.logger.error({ error: e }, "error occurred during updateInstructions");
836
1283
  }
837
1284
  }
838
- this.agentSession._updateAgentState("thinking");
839
1285
  const tasks = [];
840
1286
  const [llmTask, llmGenData] = (0, import_generation.performLLMInference)(
841
1287
  // preserve `this` context in llmNode
@@ -843,22 +1289,36 @@ ${instructions}` : instructions,
843
1289
  chatCtx,
844
1290
  toolCtx,
845
1291
  modelSettings,
846
- replyAbortController
1292
+ replyAbortController,
1293
+ (_b = this.llm) == null ? void 0 : _b.model,
1294
+ (_c = this.llm) == null ? void 0 : _c.provider
847
1295
  );
848
1296
  tasks.push(llmTask);
849
- const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
850
1297
  let ttsTask = null;
851
- let ttsStream = null;
1298
+ let ttsGenData = null;
1299
+ let llmOutput;
852
1300
  if (audioOutput) {
853
- [ttsTask, ttsStream] = (0, import_generation.performTTSInference)(
1301
+ const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
1302
+ llmOutput = textOutput;
1303
+ [ttsTask, ttsGenData] = (0, import_generation.performTTSInference)(
854
1304
  (...args) => this.agent.ttsNode(...args),
855
1305
  ttsTextInput,
856
1306
  modelSettings,
857
- replyAbortController
1307
+ replyAbortController,
1308
+ (_d = this.tts) == null ? void 0 : _d.model,
1309
+ (_e = this.tts) == null ? void 0 : _e.provider
858
1310
  );
859
1311
  tasks.push(ttsTask);
1312
+ } else {
1313
+ llmOutput = llmGenData.textStream;
860
1314
  }
861
1315
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1316
+ let userMetrics = _previousUserMetrics;
1317
+ if (newMessage && speechHandle.scheduled) {
1318
+ this.agent._chatCtx.insert(newMessage);
1319
+ this.agentSession._conversationItemAdded(newMessage);
1320
+ userMetrics = newMessage.metrics;
1321
+ }
862
1322
  if (speechHandle.interrupted) {
863
1323
  replyAbortController.abort();
864
1324
  await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -868,7 +1328,20 @@ ${instructions}` : instructions,
868
1328
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
869
1329
  speechHandle._clearAuthorization();
870
1330
  const replyStartedAt = Date.now();
871
- const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1331
+ let transcriptionInput = llmOutput;
1332
+ if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
1333
+ const timedTextsStream = await Promise.race([
1334
+ ttsGenData.timedTextsFut.await,
1335
+ (ttsTask == null ? void 0 : ttsTask.result.catch(
1336
+ () => this.logger.warn("TTS task failed before resolving timedTextsFut")
1337
+ )) ?? Promise.resolve()
1338
+ ]);
1339
+ if (timedTextsStream) {
1340
+ this.logger.debug("Using TTS aligned transcripts for transcription node input");
1341
+ transcriptionInput = timedTextsStream;
1342
+ }
1343
+ }
1344
+ const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
872
1345
  let textOut = null;
873
1346
  if (trNodeResult) {
874
1347
  const [textForwardTask, _textOut] = (0, import_generation.performTextForwarding)(
@@ -879,29 +1352,44 @@ ${instructions}` : instructions,
879
1352
  tasks.push(textForwardTask);
880
1353
  textOut = _textOut;
881
1354
  }
882
- const onFirstFrame = () => {
883
- this.agentSession._updateAgentState("speaking");
1355
+ let agentStartedSpeakingAt;
1356
+ const onFirstFrame = (startedSpeakingAt) => {
1357
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1358
+ this.agentSession._updateAgentState("speaking", {
1359
+ startTime: startedSpeakingAt,
1360
+ otelContext: speechHandle._agentTurnContext
1361
+ });
1362
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1363
+ this.audioRecognition.onStartOfAgentSpeech();
1364
+ this.isInterruptionByAudioActivityEnabled = false;
1365
+ }
884
1366
  };
885
1367
  let audioOut = null;
886
1368
  if (audioOutput) {
887
- if (ttsStream) {
1369
+ if (ttsGenData) {
888
1370
  const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
889
- ttsStream,
1371
+ ttsGenData.audioStream,
890
1372
  audioOutput,
891
1373
  replyAbortController
892
1374
  );
893
1375
  audioOut = _audioOut;
894
1376
  tasks.push(forwardTask);
895
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1377
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
896
1378
  } else {
897
- throw Error("ttsStream is null when audioOutput is enabled");
1379
+ throw Error("ttsGenData is null when audioOutput is enabled");
898
1380
  }
899
1381
  } else {
900
- textOut == null ? void 0 : textOut.firstTextFut.await.finally(onFirstFrame);
1382
+ textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
901
1383
  }
902
- const onToolExecutionStarted = (_) => {
1384
+ const onToolExecutionStarted = (f) => {
1385
+ speechHandle._itemAdded([f]);
1386
+ this.agent._chatCtx.items.push(f);
1387
+ this.agentSession._toolItemsAdded([f]);
903
1388
  };
904
- const onToolExecutionCompleted = (_) => {
1389
+ const onToolExecutionCompleted = (out) => {
1390
+ if (out.toolCallOutput) {
1391
+ speechHandle._itemAdded([out.toolCallOutput]);
1392
+ }
905
1393
  };
906
1394
  const [executeToolsTask, toolOutput] = (0, import_generation.performToolExecutions)({
907
1395
  session: this.agentSession,
@@ -917,28 +1405,53 @@ ${instructions}` : instructions,
917
1405
  if (audioOutput) {
918
1406
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
919
1407
  }
1408
+ const agentStoppedSpeakingAt = Date.now();
1409
+ const assistantMetrics = {};
1410
+ if (llmGenData.ttft !== void 0) {
1411
+ assistantMetrics.llmNodeTtft = llmGenData.ttft;
1412
+ }
1413
+ if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
1414
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
1415
+ }
1416
+ if (agentStartedSpeakingAt !== void 0) {
1417
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
1418
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
1419
+ if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
1420
+ const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
1421
+ assistantMetrics.e2eLatency = e2eLatency;
1422
+ span.setAttribute(import_telemetry.traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1423
+ }
1424
+ }
1425
+ span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1426
+ let hasSpeechMessage = false;
920
1427
  if (toolsMessages) {
921
1428
  for (const msg of toolsMessages) {
922
1429
  msg.createdAt = replyStartedAt;
923
1430
  }
924
- this.agent._chatCtx.insert(toolsMessages);
1431
+ const toolCallOutputs = toolsMessages.filter(
1432
+ (m) => m.type === "function_call_output"
1433
+ );
1434
+ if (toolCallOutputs.length > 0) {
1435
+ this.agent._chatCtx.insert(toolCallOutputs);
1436
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1437
+ }
925
1438
  }
926
1439
  if (speechHandle.interrupted) {
927
1440
  this.logger.debug(
928
1441
  { speech_id: speechHandle.id },
929
1442
  "Aborting all pipeline reply tasks due to interruption"
930
1443
  );
1444
+ if (audioOutput) {
1445
+ audioOutput.clearBuffer();
1446
+ }
931
1447
  replyAbortController.abort();
932
- await Promise.allSettled(
933
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT))
934
- );
1448
+ await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
935
1449
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
936
1450
  if (audioOutput) {
937
- audioOutput.clearBuffer();
938
1451
  const playbackEv = await audioOutput.waitForPlayout();
939
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1452
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
940
1453
  this.logger.info(
941
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1454
+ { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
942
1455
  "playout interrupted"
943
1456
  );
944
1457
  if (playbackEv.synchronizedTranscript) {
@@ -949,19 +1462,27 @@ ${instructions}` : instructions,
949
1462
  }
950
1463
  }
951
1464
  if (forwardedText) {
1465
+ hasSpeechMessage = true;
952
1466
  const message = import_chat_context.ChatMessage.create({
953
1467
  role: "assistant",
954
1468
  content: forwardedText,
955
1469
  id: llmGenData.id,
956
1470
  interrupted: true,
957
- createdAt: replyStartedAt
1471
+ createdAt: replyStartedAt,
1472
+ metrics: assistantMetrics
958
1473
  });
959
1474
  chatCtx.insert(message);
960
1475
  this.agent._chatCtx.insert(message);
1476
+ speechHandle._itemAdded([message]);
961
1477
  this.agentSession._conversationItemAdded(message);
1478
+ span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
962
1479
  }
963
1480
  if (this.agentSession.agentState === "speaking") {
964
1481
  this.agentSession._updateAgentState("listening");
1482
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1483
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1484
+ this.restoreInterruptionByAudioActivity();
1485
+ }
965
1486
  }
966
1487
  this.logger.info(
967
1488
  { speech_id: speechHandle.id, message: forwardedText },
@@ -972,16 +1493,20 @@ ${instructions}` : instructions,
972
1493
  return;
973
1494
  }
974
1495
  if (textOut && textOut.text) {
1496
+ hasSpeechMessage = true;
975
1497
  const message = import_chat_context.ChatMessage.create({
976
1498
  role: "assistant",
977
1499
  id: llmGenData.id,
978
1500
  interrupted: false,
979
1501
  createdAt: replyStartedAt,
980
- content: textOut.text
1502
+ content: textOut.text,
1503
+ metrics: assistantMetrics
981
1504
  });
982
1505
  chatCtx.insert(message);
983
1506
  this.agent._chatCtx.insert(message);
1507
+ speechHandle._itemAdded([message]);
984
1508
  this.agentSession._conversationItemAdded(message);
1509
+ span.setAttribute(import_telemetry.traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
985
1510
  this.logger.info(
986
1511
  { speech_id: speechHandle.id, message: textOut.text },
987
1512
  "playout completed without interruption"
@@ -991,11 +1516,17 @@ ${instructions}` : instructions,
991
1516
  this.agentSession._updateAgentState("thinking");
992
1517
  } else if (this.agentSession.agentState === "speaking") {
993
1518
  this.agentSession._updateAgentState("listening");
1519
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1520
+ {
1521
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1522
+ this.restoreInterruptionByAudioActivity();
1523
+ }
1524
+ }
994
1525
  }
995
1526
  speechHandle._markGenerationDone();
996
1527
  await executeToolsTask.result;
997
1528
  if (toolOutput.output.length === 0) return;
998
- const { maxToolSteps } = this.agentSession.options;
1529
+ const { maxToolSteps } = this.agentSession.sessionOptions;
999
1530
  if (speechHandle.numSteps >= maxToolSteps) {
1000
1531
  this.logger.warn(
1001
1532
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1003,45 +1534,15 @@ ${instructions}` : instructions,
1003
1534
  );
1004
1535
  return;
1005
1536
  }
1006
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1007
- functionCalls: [],
1008
- functionCallOutputs: []
1009
- });
1010
- let shouldGenerateToolReply = false;
1011
- let newAgentTask = null;
1012
- let ignoreTaskSwitch = false;
1013
- for (const sanitizedOut of toolOutput.output) {
1014
- if (sanitizedOut.toolCallOutput !== void 0) {
1015
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1016
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1017
- if (sanitizedOut.replyRequired) {
1018
- shouldGenerateToolReply = true;
1019
- }
1020
- }
1021
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1022
- this.logger.error("expected to receive only one agent task from the tool executions");
1023
- ignoreTaskSwitch = true;
1024
- }
1025
- newAgentTask = sanitizedOut.agentTask ?? null;
1026
- this.logger.debug(
1027
- {
1028
- speechId: speechHandle.id,
1029
- name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1030
- args: sanitizedOut.toolCall.args,
1031
- output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1032
- isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1033
- },
1034
- "Tool call execution finished"
1035
- );
1036
- }
1537
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1037
1538
  this.agentSession.emit(
1038
1539
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1039
1540
  functionToolsExecutedEvent
1040
1541
  );
1041
- let draining = this.draining;
1542
+ let schedulingPaused = this.schedulingPaused;
1042
1543
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1043
1544
  this.agentSession.updateAgent(newAgentTask);
1044
- draining = true;
1545
+ schedulingPaused = true;
1045
1546
  }
1046
1547
  const toolMessages = [
1047
1548
  ...functionToolsExecutedEvent.functionCalls,
@@ -1049,54 +1550,96 @@ ${instructions}` : instructions,
1049
1550
  ];
1050
1551
  if (shouldGenerateToolReply) {
1051
1552
  chatCtx.insert(toolMessages);
1052
- const handle = import_speech_handle.SpeechHandle.create({
1053
- allowInterruptions: speechHandle.allowInterruptions,
1054
- stepIndex: speechHandle._stepIndex + 1,
1055
- parent: speechHandle
1056
- });
1057
- this.agentSession.emit(
1058
- import_events.AgentSessionEventTypes.SpeechCreated,
1059
- (0, import_events.createSpeechCreatedEvent)({
1060
- userInitiated: false,
1061
- source: "tool_response",
1062
- speechHandle: handle
1063
- })
1064
- );
1065
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1553
+ speechHandle._numSteps += 1;
1554
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1066
1555
  const toolResponseTask = this.createSpeechTask({
1067
- task: import_utils.Task.from(
1068
- () => this.pipelineReplyTask(
1069
- handle,
1070
- chatCtx,
1071
- toolCtx,
1072
- { toolChoice: respondToolChoice },
1073
- replyAbortController,
1074
- instructions,
1075
- void 0,
1076
- toolMessages
1077
- )
1556
+ taskFn: () => this.pipelineReplyTask(
1557
+ speechHandle,
1558
+ chatCtx,
1559
+ toolCtx,
1560
+ { toolChoice: respondToolChoice },
1561
+ replyAbortController,
1562
+ instructions,
1563
+ void 0,
1564
+ toolMessages,
1565
+ hasSpeechMessage ? void 0 : userMetrics
1078
1566
  ),
1079
- ownedSpeechHandle: handle,
1567
+ ownedSpeechHandle: speechHandle,
1080
1568
  name: "AgentActivity.pipelineReply"
1081
1569
  });
1082
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1083
- this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1570
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1571
+ this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1084
1572
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1085
1573
  for (const msg of toolMessages) {
1086
1574
  msg.createdAt = replyStartedAt;
1087
1575
  }
1088
- this.agent._chatCtx.insert(toolMessages);
1576
+ const toolCallOutputs = toolMessages.filter(
1577
+ (m) => m.type === "function_call_output"
1578
+ );
1579
+ if (toolCallOutputs.length > 0) {
1580
+ this.agent._chatCtx.insert(toolCallOutputs);
1581
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1582
+ }
1089
1583
  }
1090
- }
1584
+ };
1585
+ pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => import_telemetry.tracer.startActiveSpan(
1586
+ async (span) => this._pipelineReplyTaskImpl({
1587
+ speechHandle,
1588
+ chatCtx,
1589
+ toolCtx,
1590
+ modelSettings,
1591
+ replyAbortController,
1592
+ instructions,
1593
+ newMessage,
1594
+ toolsMessages,
1595
+ span,
1596
+ _previousUserMetrics
1597
+ }),
1598
+ {
1599
+ name: "agent_turn",
1600
+ context: this.agentSession.rootSpanContext
1601
+ }
1602
+ );
1091
1603
  async realtimeGenerationTask(speechHandle, ev, modelSettings, replyAbortController) {
1092
- var _a, _b, _c;
1093
- speechHandleStorage.enterWith(speechHandle);
1604
+ return import_telemetry.tracer.startActiveSpan(
1605
+ async (span) => this._realtimeGenerationTaskImpl({
1606
+ speechHandle,
1607
+ ev,
1608
+ modelSettings,
1609
+ replyAbortController,
1610
+ span
1611
+ }),
1612
+ {
1613
+ name: "agent_turn",
1614
+ context: this.agentSession.rootSpanContext
1615
+ }
1616
+ );
1617
+ }
1618
+ async _realtimeGenerationTaskImpl({
1619
+ speechHandle,
1620
+ ev,
1621
+ modelSettings,
1622
+ replyAbortController,
1623
+ span
1624
+ }) {
1625
+ var _a;
1626
+ speechHandle._agentTurnContext = import_api.context.active();
1627
+ span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1628
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1629
+ if (localParticipant) {
1630
+ (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
1631
+ }
1632
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1094
1633
  if (!this.realtimeSession) {
1095
1634
  throw new Error("realtime session is not initialized");
1096
1635
  }
1097
1636
  if (!(this.llm instanceof import_llm.RealtimeModel)) {
1098
1637
  throw new Error("llm is not a realtime model");
1099
1638
  }
1639
+ span.setAttribute(import_telemetry.traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
1640
+ if (this.realtimeSpans && ev.responseId) {
1641
+ this.realtimeSpans.set(ev.responseId, span);
1642
+ }
1100
1643
  this.logger.debug(
1101
1644
  { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1102
1645
  "realtime generation started"
@@ -1109,10 +1652,17 @@ ${instructions}` : instructions,
1109
1652
  if (speechHandle.interrupted) {
1110
1653
  return;
1111
1654
  }
1112
- const onFirstFrame = () => {
1113
- this.agentSession._updateAgentState("speaking");
1655
+ const onFirstFrame = (startedSpeakingAt) => {
1656
+ this.agentSession._updateAgentState("speaking", {
1657
+ startTime: startedSpeakingAt,
1658
+ otelContext: speechHandle._agentTurnContext
1659
+ });
1114
1660
  };
1115
1661
  const readMessages = async (abortController, outputs) => {
1662
+ var _a2, _b;
1663
+ replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
1664
+ once: true
1665
+ });
1116
1666
  const forwardTasks = [];
1117
1667
  try {
1118
1668
  for await (const msg of ev.messageStream) {
@@ -1122,7 +1672,22 @@ ${instructions}` : instructions,
1122
1672
  );
1123
1673
  break;
1124
1674
  }
1125
- const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
1675
+ const msgModalities = msg.modalities ? await msg.modalities : void 0;
1676
+ let ttsTextInput = null;
1677
+ let trTextInput;
1678
+ if (msgModalities && !msgModalities.includes("audio") && this.tts) {
1679
+ if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.audioOutput) {
1680
+ this.logger.warn(
1681
+ "text response received from realtime API, falling back to use a TTS model."
1682
+ );
1683
+ }
1684
+ const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
1685
+ ttsTextInput = _ttsTextInput;
1686
+ trTextInput = _trTextInput;
1687
+ } else {
1688
+ trTextInput = msg.textStream;
1689
+ }
1690
+ const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
1126
1691
  let textOut = null;
1127
1692
  if (trNodeResult) {
1128
1693
  const [textForwardTask, _textOut] = (0, import_generation.performTextForwarding)(
@@ -1135,28 +1700,46 @@ ${instructions}` : instructions,
1135
1700
  }
1136
1701
  let audioOut = null;
1137
1702
  if (audioOutput) {
1138
- const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1139
- msg.audioStream,
1140
- modelSettings
1141
- );
1142
- if (realtimeAudio) {
1703
+ let realtimeAudioResult = null;
1704
+ if (ttsTextInput) {
1705
+ const [ttsTask, ttsGenData] = (0, import_generation.performTTSInference)(
1706
+ (...args) => this.agent.ttsNode(...args),
1707
+ ttsTextInput,
1708
+ modelSettings,
1709
+ abortController,
1710
+ (_a2 = this.tts) == null ? void 0 : _a2.model,
1711
+ (_b = this.tts) == null ? void 0 : _b.provider
1712
+ );
1713
+ tasks.push(ttsTask);
1714
+ realtimeAudioResult = ttsGenData.audioStream;
1715
+ } else if (msgModalities && msgModalities.includes("audio")) {
1716
+ realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
1717
+ msg.audioStream,
1718
+ modelSettings
1719
+ );
1720
+ } else if (this.llm instanceof import_llm.RealtimeModel && this.llm.capabilities.audioOutput) {
1721
+ this.logger.error(
1722
+ "Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
1723
+ );
1724
+ } else {
1725
+ this.logger.warn(
1726
+ "audio output is enabled but neither tts nor realtime audio is available"
1727
+ );
1728
+ }
1729
+ if (realtimeAudioResult) {
1143
1730
  const [forwardTask, _audioOut] = (0, import_generation.performAudioForwarding)(
1144
- realtimeAudio,
1731
+ realtimeAudioResult,
1145
1732
  audioOutput,
1146
1733
  abortController
1147
1734
  );
1148
1735
  forwardTasks.push(forwardTask);
1149
1736
  audioOut = _audioOut;
1150
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1151
- } else {
1152
- this.logger.warn(
1153
- "audio output is enabled but neither tts nor realtime audio is available"
1154
- );
1737
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
1155
1738
  }
1156
1739
  } else if (textOut) {
1157
- textOut.firstTextFut.await.finally(onFirstFrame);
1740
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
1158
1741
  }
1159
- outputs.push([msg.messageId, textOut, audioOut]);
1742
+ outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1160
1743
  }
1161
1744
  await (0, import_utils.waitFor)(forwardTasks);
1162
1745
  } catch (error) {
@@ -1169,7 +1752,7 @@ ${instructions}` : instructions,
1169
1752
  const tasks = [
1170
1753
  import_utils.Task.from(
1171
1754
  (controller) => readMessages(controller, messageOutputs),
1172
- replyAbortController,
1755
+ void 0,
1173
1756
  "AgentActivity.realtime_generation.read_messages"
1174
1757
  )
1175
1758
  ];
@@ -1197,6 +1780,8 @@ ${instructions}` : instructions,
1197
1780
  );
1198
1781
  const onToolExecutionStarted = (f) => {
1199
1782
  speechHandle._itemAdded([f]);
1783
+ this.agent._chatCtx.items.push(f);
1784
+ this.agentSession._toolItemsAdded([f]);
1200
1785
  };
1201
1786
  const onToolExecutionCompleted = (out) => {
1202
1787
  if (out.toolCallOutput) {
@@ -1216,7 +1801,6 @@ ${instructions}` : instructions,
1216
1801
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1217
1802
  if (audioOutput) {
1218
1803
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1219
- this.agentSession._updateAgentState("listening");
1220
1804
  }
1221
1805
  if (speechHandle.interrupted) {
1222
1806
  this.logger.debug(
@@ -1226,15 +1810,15 @@ ${instructions}` : instructions,
1226
1810
  replyAbortController.abort();
1227
1811
  await (0, import_utils.cancelAndWait)(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1228
1812
  if (messageOutputs.length > 0) {
1229
- const [msgId, textOut, audioOut] = messageOutputs[0];
1813
+ const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
1230
1814
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
1231
1815
  if (audioOutput) {
1232
1816
  audioOutput.clearBuffer();
1233
1817
  const playbackEv = await audioOutput.waitForPlayout();
1234
- let playbackPosition = playbackEv.playbackPosition;
1235
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1818
+ let playbackPositionInS = playbackEv.playbackPosition;
1819
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
1236
1820
  this.logger.info(
1237
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1821
+ { speech_id: speechHandle.id, playbackPositionInS },
1238
1822
  "playout interrupted"
1239
1823
  );
1240
1824
  if (playbackEv.synchronizedTranscript) {
@@ -1242,11 +1826,13 @@ ${instructions}` : instructions,
1242
1826
  }
1243
1827
  } else {
1244
1828
  forwardedText = "";
1245
- playbackPosition = 0;
1829
+ playbackPositionInS = 0;
1246
1830
  }
1247
1831
  this.realtimeSession.truncate({
1248
1832
  messageId: msgId,
1249
- audioEndMs: Math.floor(playbackPosition)
1833
+ audioEndMs: Math.floor(playbackPositionInS * 1e3),
1834
+ modalities: msgModalities,
1835
+ audioTranscript: forwardedText
1250
1836
  });
1251
1837
  }
1252
1838
  if (forwardedText) {
@@ -1270,7 +1856,7 @@ ${instructions}` : instructions,
1270
1856
  return;
1271
1857
  }
1272
1858
  if (messageOutputs.length > 0) {
1273
- const [msgId, textOut, _] = messageOutputs[0];
1859
+ const [msgId, textOut, _, __] = messageOutputs[0];
1274
1860
  const message = import_chat_context.ChatMessage.create({
1275
1861
  role: "assistant",
1276
1862
  content: (textOut == null ? void 0 : textOut.text) || "",
@@ -1282,12 +1868,16 @@ ${instructions}` : instructions,
1282
1868
  this.agentSession._conversationItemAdded(message);
1283
1869
  }
1284
1870
  speechHandle._markGenerationDone();
1285
- toolOutput.firstToolStartedFuture.await.finally(() => {
1286
- this.agentSession._updateAgentState("thinking");
1287
- });
1288
1871
  await executeToolsTask.result;
1289
- if (toolOutput.output.length === 0) return;
1290
- const { maxToolSteps } = this.agentSession.options;
1872
+ if (toolOutput.output.length > 0) {
1873
+ this.agentSession._updateAgentState("thinking");
1874
+ } else if (this.agentSession.agentState === "speaking") {
1875
+ this.agentSession._updateAgentState("listening");
1876
+ }
1877
+ if (toolOutput.output.length === 0) {
1878
+ return;
1879
+ }
1880
+ const { maxToolSteps } = this.agentSession.sessionOptions;
1291
1881
  if (speechHandle.numSteps >= maxToolSteps) {
1292
1882
  this.logger.warn(
1293
1883
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1295,48 +1885,29 @@ ${instructions}` : instructions,
1295
1885
  );
1296
1886
  return;
1297
1887
  }
1298
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1299
- functionCalls: [],
1300
- functionCallOutputs: []
1301
- });
1302
- let shouldGenerateToolReply = false;
1303
- let newAgentTask = null;
1304
- let ignoreTaskSwitch = false;
1305
- for (const sanitizedOut of toolOutput.output) {
1306
- if (sanitizedOut.toolCallOutput !== void 0) {
1307
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1308
- if (sanitizedOut.replyRequired) {
1309
- shouldGenerateToolReply = true;
1310
- }
1311
- }
1312
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1313
- this.logger.error("expected to receive only one agent task from the tool executions");
1314
- ignoreTaskSwitch = true;
1315
- }
1316
- newAgentTask = sanitizedOut.agentTask ?? null;
1317
- this.logger.debug(
1318
- {
1319
- speechId: speechHandle.id,
1320
- name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1321
- args: sanitizedOut.toolCall.args,
1322
- output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1323
- isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1324
- },
1325
- "Tool call execution finished"
1326
- );
1327
- }
1888
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1328
1889
  this.agentSession.emit(
1329
1890
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1330
1891
  functionToolsExecutedEvent
1331
1892
  );
1332
- let draining = this.draining;
1893
+ let schedulingPaused = this.schedulingPaused;
1333
1894
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1334
1895
  this.agentSession.updateAgent(newAgentTask);
1335
- draining = true;
1896
+ schedulingPaused = true;
1336
1897
  }
1337
1898
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1899
+ while (this.currentSpeech || this.speechQueue.size() > 0) {
1900
+ if (this.currentSpeech && !this.currentSpeech.done() && this.currentSpeech !== speechHandle) {
1901
+ await this.currentSpeech.waitForPlayout();
1902
+ } else {
1903
+ await new Promise((resolve) => setImmediate(resolve));
1904
+ }
1905
+ }
1338
1906
  const chatCtx = this.realtimeSession.chatCtx.copy();
1339
1907
  chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
1908
+ this.agentSession._toolItemsAdded(
1909
+ functionToolsExecutedEvent.functionCallOutputs
1910
+ );
1340
1911
  try {
1341
1912
  await this.realtimeSession.updateChatCtx(chatCtx);
1342
1913
  } catch (error) {
@@ -1363,20 +1934,58 @@ ${instructions}` : instructions,
1363
1934
  speechHandle: replySpeechHandle
1364
1935
  })
1365
1936
  );
1366
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1937
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1367
1938
  this.createSpeechTask({
1368
- task: import_utils.Task.from(
1369
- (abortController) => this.realtimeReplyTask({
1370
- speechHandle: replySpeechHandle,
1371
- modelSettings: { toolChoice },
1372
- abortController
1373
- })
1374
- ),
1939
+ taskFn: (abortController) => this.realtimeReplyTask({
1940
+ speechHandle: replySpeechHandle,
1941
+ modelSettings: { toolChoice },
1942
+ abortController
1943
+ }),
1375
1944
  ownedSpeechHandle: replySpeechHandle,
1376
1945
  name: "AgentActivity.realtime_reply"
1377
1946
  });
1378
1947
  this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1379
1948
  }
1949
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1950
+ var _a, _b, _c;
1951
+ const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1952
+ functionCalls: [],
1953
+ functionCallOutputs: []
1954
+ });
1955
+ let shouldGenerateToolReply = false;
1956
+ let newAgentTask = null;
1957
+ let ignoreTaskSwitch = false;
1958
+ for (const sanitizedOut of toolOutput.output) {
1959
+ if (sanitizedOut.toolCallOutput !== void 0) {
1960
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1961
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1962
+ if (sanitizedOut.replyRequired) {
1963
+ shouldGenerateToolReply = true;
1964
+ }
1965
+ }
1966
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1967
+ this.logger.error("expected to receive only one agent task from the tool executions");
1968
+ ignoreTaskSwitch = true;
1969
+ }
1970
+ newAgentTask = sanitizedOut.agentTask ?? null;
1971
+ this.logger.debug(
1972
+ {
1973
+ speechId: speechHandle.id,
1974
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1975
+ args: sanitizedOut.toolCall.args,
1976
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1977
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1978
+ },
1979
+ "Tool call execution finished"
1980
+ );
1981
+ }
1982
+ return {
1983
+ functionToolsExecutedEvent,
1984
+ shouldGenerateToolReply,
1985
+ newAgentTask,
1986
+ ignoreTaskSwitch
1987
+ };
1988
+ }
1380
1989
  async realtimeReplyTask({
1381
1990
  speechHandle,
1382
1991
  modelSettings: { toolChoice },
@@ -1384,7 +1993,7 @@ ${instructions}` : instructions,
1384
1993
  instructions,
1385
1994
  abortController
1386
1995
  }) {
1387
- speechHandleStorage.enterWith(speechHandle);
1996
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1388
1997
  if (!this.realtimeSession) {
1389
1998
  throw new Error("realtime session is not available");
1390
1999
  }
@@ -1418,72 +2027,194 @@ ${instructions}` : instructions,
1418
2027
  }
1419
2028
  }
1420
2029
  scheduleSpeech(speechHandle, priority, force = false) {
1421
- if (this.draining && !force) {
1422
- throw new Error("cannot schedule new speech, the agent is draining");
2030
+ if (this.schedulingPaused && !force) {
2031
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1423
2032
  }
1424
2033
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1425
2034
  speechHandle._markScheduled();
1426
2035
  this.wakeupMainTask();
1427
2036
  }
2037
+ async _pauseSchedulingTask(blockedTasks) {
2038
+ if (this._schedulingPaused) return;
2039
+ this._schedulingPaused = true;
2040
+ this._drainBlockedTasks = blockedTasks;
2041
+ this.wakeupMainTask();
2042
+ if (this._mainTask) {
2043
+ await this._mainTask.result;
2044
+ }
2045
+ }
2046
+ _resumeSchedulingTask() {
2047
+ if (!this._schedulingPaused) return;
2048
+ this._schedulingPaused = false;
2049
+ this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
2050
+ }
2051
+ async pause(options = {}) {
2052
+ const { blockedTasks = [] } = options;
2053
+ const unlock = await this.lock.lock();
2054
+ try {
2055
+ const span = import_telemetry.tracer.startSpan({
2056
+ name: "pause_agent_activity",
2057
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
2058
+ });
2059
+ try {
2060
+ await this._pauseSchedulingTask(blockedTasks);
2061
+ await this._closeSessionResources();
2062
+ } finally {
2063
+ span.end();
2064
+ }
2065
+ } finally {
2066
+ unlock();
2067
+ }
2068
+ }
1428
2069
  async drain() {
1429
- var _a;
2070
+ return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
2071
+ name: "drain_agent_activity",
2072
+ context: import_api.ROOT_CONTEXT
2073
+ });
2074
+ }
2075
+ async _drainImpl(span) {
2076
+ span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1430
2077
  const unlock = await this.lock.lock();
1431
2078
  try {
1432
- if (this._draining) return;
1433
- this.createSpeechTask({
1434
- task: import_utils.Task.from(() => this.agent.onExit()),
2079
+ if (this._schedulingPaused) return;
2080
+ this._onExitTask = this.createSpeechTask({
2081
+ taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
2082
+ name: "on_exit",
2083
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
2084
+ }),
2085
+ inlineTask: true,
1435
2086
  name: "AgentActivity_onExit"
1436
2087
  });
1437
- this.wakeupMainTask();
1438
- this._draining = true;
1439
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
2088
+ this.cancelPreemptiveGeneration();
2089
+ await this._onExitTask.result;
2090
+ await this._pauseSchedulingTask([]);
1440
2091
  } finally {
1441
2092
  unlock();
1442
2093
  }
1443
2094
  }
1444
2095
  async close() {
1445
- var _a, _b, _c;
1446
2096
  const unlock = await this.lock.lock();
1447
2097
  try {
1448
- if (!this._draining) {
1449
- this.logger.warn("task closing without draining");
1450
- }
1451
- if (this.llm instanceof import_llm.LLM) {
1452
- this.llm.off("metrics_collected", this.onMetricsCollected);
1453
- }
1454
- if (this.realtimeSession) {
1455
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1456
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1457
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1458
- this.realtimeSession.off(
1459
- "input_audio_transcription_completed",
1460
- this.onInputAudioTranscriptionCompleted
1461
- );
1462
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
2098
+ this.cancelPreemptiveGeneration();
2099
+ await (0, import_utils.cancelAndWait)(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2100
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2101
+ this._currentSpeech._markDone();
1463
2102
  }
1464
- if (this.stt instanceof import_stt.STT) {
1465
- this.stt.off("metrics_collected", this.onMetricsCollected);
2103
+ await this._closeSessionResources();
2104
+ if (this._mainTask) {
2105
+ await this._mainTask.cancelAndWait();
1466
2106
  }
1467
- if (this.tts instanceof import_tts.TTS) {
1468
- this.tts.off("metrics_collected", this.onMetricsCollected);
2107
+ if (this.interruptionDetector) {
2108
+ this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
2109
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2110
+ this.interruptionDetector.off("error", this.onInterruptionError);
1469
2111
  }
1470
- if (this.vad instanceof import_vad.VAD) {
1471
- this.vad.off("metrics_collected", this.onMetricsCollected);
1472
- }
1473
- this.detachAudioInput();
1474
- await ((_a = this.realtimeSession) == null ? void 0 : _a.close());
1475
- await ((_b = this.audioRecognition) == null ? void 0 : _b.close());
1476
- await ((_c = this._mainTask) == null ? void 0 : _c.cancelAndWait());
2112
+ this.agent._agentActivity = void 0;
1477
2113
  } finally {
1478
2114
  unlock();
1479
2115
  }
1480
2116
  }
2117
+ resolveInterruptionDetector() {
2118
+ var _a, _b;
2119
+ const agentInterruptionDetection = (_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.mode;
2120
+ const sessionInterruptionDetection = this.agentSession.interruptionDetection;
2121
+ if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof import_llm.RealtimeModel))) {
2122
+ if (agentInterruptionDetection === "adaptive" || sessionInterruptionDetection === "adaptive") {
2123
+ this.logger.warn(
2124
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
2125
+ );
2126
+ }
2127
+ return void 0;
2128
+ }
2129
+ if (!this.allowInterruptions) {
2130
+ return void 0;
2131
+ }
2132
+ if (agentInterruptionDetection === "vad") {
2133
+ return void 0;
2134
+ }
2135
+ if (sessionInterruptionDetection === "vad") {
2136
+ return void 0;
2137
+ }
2138
+ if (agentInterruptionDetection === void 0 && sessionInterruptionDetection === void 0 && !(0, import_utils.isHosted)() && !(0, import_utils.isDevMode)()) {
2139
+ this.logger.info("adaptive interruption is disabled by default in production mode");
2140
+ return void 0;
2141
+ }
2142
+ try {
2143
+ const detector = new import_interruption_detector.AdaptiveInterruptionDetector();
2144
+ detector.on("overlapping_speech", this.onInterruptionOverlappingSpeech);
2145
+ detector.on("metrics_collected", this.onInterruptionMetricsCollected);
2146
+ detector.on("error", this.onInterruptionError);
2147
+ return detector;
2148
+ } catch (error) {
2149
+ this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
2150
+ }
2151
+ return void 0;
2152
+ }
2153
+ restoreInterruptionByAudioActivity() {
2154
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2155
+ }
2156
+ fallbackToVadInterruption() {
2157
+ if (!this.isInterruptionDetectionEnabled) return;
2158
+ this.isInterruptionDetectionEnabled = false;
2159
+ this.restoreInterruptionByAudioActivity();
2160
+ if (this.interruptionDetector) {
2161
+ this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
2162
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2163
+ this.interruptionDetector.off("error", this.onInterruptionError);
2164
+ this.interruptionDetector = void 0;
2165
+ }
2166
+ if (this.audioRecognition) {
2167
+ this.audioRecognition.disableInterruptionDetection().catch((err) => {
2168
+ this.logger.warn({ err }, "error while disabling interruption detection");
2169
+ });
2170
+ }
2171
+ this.logger.warn(
2172
+ "adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption"
2173
+ );
2174
+ }
2175
+ async _closeSessionResources() {
2176
+ var _a, _b, _c;
2177
+ if (this.llm instanceof import_llm.LLM) {
2178
+ this.llm.off("metrics_collected", this.onMetricsCollected);
2179
+ this.llm.off("error", this.onModelError);
2180
+ }
2181
+ if (this.realtimeSession) {
2182
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
2183
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
2184
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
2185
+ this.realtimeSession.off(
2186
+ "input_audio_transcription_completed",
2187
+ this.onRealtimeInputAudioTranscriptionCompleted
2188
+ );
2189
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
2190
+ this.realtimeSession.off("error", this.onModelError);
2191
+ }
2192
+ if (this.stt instanceof import_stt.STT) {
2193
+ this.stt.off("metrics_collected", this.onMetricsCollected);
2194
+ this.stt.off("error", this.onModelError);
2195
+ }
2196
+ if (this.tts instanceof import_tts.TTS) {
2197
+ this.tts.off("metrics_collected", this.onMetricsCollected);
2198
+ this.tts.off("error", this.onModelError);
2199
+ }
2200
+ if (this.vad instanceof import_vad.VAD) {
2201
+ this.vad.off("metrics_collected", this.onMetricsCollected);
2202
+ }
2203
+ this.detachAudioInput();
2204
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
2205
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
2206
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
2207
+ this.realtimeSession = void 0;
2208
+ this.audioRecognition = void 0;
2209
+ }
1481
2210
  }
1482
2211
  function toOaiToolChoice(toolChoice) {
1483
2212
  return toolChoice !== null ? toolChoice : void 0;
1484
2213
  }
1485
2214
  // Annotate the CommonJS export names for ESM import in node:
1486
2215
  0 && (module.exports = {
1487
- AgentActivity
2216
+ AgentActivity,
2217
+ agentActivityStorage,
2218
+ onEnterStorage
1488
2219
  });
1489
2220
  //# sourceMappingURL=agent_activity.cjs.map