@livekit/agents 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (959)
  1. package/dist/_exceptions.cjs.map +1 -1
  2. package/dist/_exceptions.d.ts.map +1 -1
  3. package/dist/_exceptions.js.map +1 -1
  4. package/dist/audio.cjs +89 -3
  5. package/dist/audio.cjs.map +1 -1
  6. package/dist/audio.d.cts +36 -1
  7. package/dist/audio.d.ts +36 -1
  8. package/dist/audio.d.ts.map +1 -1
  9. package/dist/audio.js +76 -2
  10. package/dist/audio.js.map +1 -1
  11. package/dist/beta/index.cjs +29 -0
  12. package/dist/beta/index.cjs.map +1 -0
  13. package/dist/beta/index.d.cts +2 -0
  14. package/dist/beta/index.d.ts +2 -0
  15. package/dist/beta/index.d.ts.map +1 -0
  16. package/dist/beta/index.js +7 -0
  17. package/dist/beta/index.js.map +1 -0
  18. package/dist/beta/workflows/index.cjs +29 -0
  19. package/dist/beta/workflows/index.cjs.map +1 -0
  20. package/dist/beta/workflows/index.d.cts +2 -0
  21. package/dist/beta/workflows/index.d.ts +2 -0
  22. package/dist/beta/workflows/index.d.ts.map +1 -0
  23. package/dist/beta/workflows/index.js +7 -0
  24. package/dist/beta/workflows/index.js.map +1 -0
  25. package/dist/beta/workflows/task_group.cjs +165 -0
  26. package/dist/beta/workflows/task_group.cjs.map +1 -0
  27. package/dist/beta/workflows/task_group.d.cts +32 -0
  28. package/dist/beta/workflows/task_group.d.ts +32 -0
  29. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  30. package/dist/beta/workflows/task_group.js +141 -0
  31. package/dist/beta/workflows/task_group.js.map +1 -0
  32. package/dist/cli.cjs +44 -46
  33. package/dist/cli.cjs.map +1 -1
  34. package/dist/cli.d.cts +3 -3
  35. package/dist/cli.d.ts +3 -3
  36. package/dist/cli.d.ts.map +1 -1
  37. package/dist/cli.js +45 -47
  38. package/dist/cli.js.map +1 -1
  39. package/dist/connection_pool.cjs +242 -0
  40. package/dist/connection_pool.cjs.map +1 -0
  41. package/dist/connection_pool.d.cts +123 -0
  42. package/dist/connection_pool.d.ts +123 -0
  43. package/dist/connection_pool.d.ts.map +1 -0
  44. package/dist/connection_pool.js +218 -0
  45. package/dist/connection_pool.js.map +1 -0
  46. package/dist/connection_pool.test.cjs +256 -0
  47. package/dist/connection_pool.test.cjs.map +1 -0
  48. package/dist/connection_pool.test.js +255 -0
  49. package/dist/connection_pool.test.js.map +1 -0
  50. package/dist/constants.cjs +30 -0
  51. package/dist/constants.cjs.map +1 -1
  52. package/dist/constants.d.cts +10 -0
  53. package/dist/constants.d.ts +10 -0
  54. package/dist/constants.d.ts.map +1 -1
  55. package/dist/constants.js +20 -0
  56. package/dist/constants.js.map +1 -1
  57. package/dist/cpu.cjs +189 -0
  58. package/dist/cpu.cjs.map +1 -0
  59. package/dist/cpu.d.cts +24 -0
  60. package/dist/cpu.d.ts +24 -0
  61. package/dist/cpu.d.ts.map +1 -0
  62. package/dist/cpu.js +152 -0
  63. package/dist/cpu.js.map +1 -0
  64. package/dist/cpu.test.cjs +227 -0
  65. package/dist/cpu.test.cjs.map +1 -0
  66. package/dist/cpu.test.js +204 -0
  67. package/dist/cpu.test.js.map +1 -0
  68. package/dist/http_server.cjs +9 -6
  69. package/dist/http_server.cjs.map +1 -1
  70. package/dist/http_server.d.cts +5 -1
  71. package/dist/http_server.d.ts +5 -1
  72. package/dist/http_server.d.ts.map +1 -1
  73. package/dist/http_server.js +9 -6
  74. package/dist/http_server.js.map +1 -1
  75. package/dist/index.cjs +24 -9
  76. package/dist/index.cjs.map +1 -1
  77. package/dist/index.d.cts +15 -11
  78. package/dist/index.d.ts +15 -11
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +18 -9
  81. package/dist/index.js.map +1 -1
  82. package/dist/inference/api_protos.cjs +70 -2
  83. package/dist/inference/api_protos.cjs.map +1 -1
  84. package/dist/inference/api_protos.d.cts +373 -32
  85. package/dist/inference/api_protos.d.ts +373 -32
  86. package/dist/inference/api_protos.d.ts.map +1 -1
  87. package/dist/inference/api_protos.js +62 -2
  88. package/dist/inference/api_protos.js.map +1 -1
  89. package/dist/inference/index.cjs +8 -0
  90. package/dist/inference/index.cjs.map +1 -1
  91. package/dist/inference/index.d.cts +3 -4
  92. package/dist/inference/index.d.ts +3 -4
  93. package/dist/inference/index.d.ts.map +1 -1
  94. package/dist/inference/index.js +18 -3
  95. package/dist/inference/index.js.map +1 -1
  96. package/dist/inference/interruption/defaults.cjs +81 -0
  97. package/dist/inference/interruption/defaults.cjs.map +1 -0
  98. package/dist/inference/interruption/defaults.d.cts +19 -0
  99. package/dist/inference/interruption/defaults.d.ts +19 -0
  100. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  101. package/dist/inference/interruption/defaults.js +46 -0
  102. package/dist/inference/interruption/defaults.js.map +1 -0
  103. package/dist/inference/interruption/errors.cjs +44 -0
  104. package/dist/inference/interruption/errors.cjs.map +1 -0
  105. package/dist/inference/interruption/errors.d.cts +12 -0
  106. package/dist/inference/interruption/errors.d.ts +12 -0
  107. package/dist/inference/interruption/errors.d.ts.map +1 -0
  108. package/dist/inference/interruption/errors.js +20 -0
  109. package/dist/inference/interruption/errors.js.map +1 -0
  110. package/dist/inference/interruption/http_transport.cjs +163 -0
  111. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  112. package/dist/inference/interruption/http_transport.d.cts +65 -0
  113. package/dist/inference/interruption/http_transport.d.ts +65 -0
  114. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  115. package/dist/inference/interruption/http_transport.js +137 -0
  116. package/dist/inference/interruption/http_transport.js.map +1 -0
  117. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  118. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  119. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  120. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  121. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  122. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  123. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  124. package/dist/inference/interruption/interruption_detector.cjs +198 -0
  125. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  126. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  127. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  128. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  129. package/dist/inference/interruption/interruption_detector.js +164 -0
  130. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  131. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  132. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  133. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  134. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  135. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  136. package/dist/inference/interruption/interruption_stream.js +344 -0
  137. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  138. package/dist/inference/interruption/types.cjs +17 -0
  139. package/dist/inference/interruption/types.cjs.map +1 -0
  140. package/dist/inference/interruption/types.d.cts +66 -0
  141. package/dist/inference/interruption/types.d.ts +66 -0
  142. package/dist/inference/interruption/types.d.ts.map +1 -0
  143. package/dist/inference/interruption/types.js +1 -0
  144. package/dist/inference/interruption/types.js.map +1 -0
  145. package/dist/inference/interruption/utils.cjs +130 -0
  146. package/dist/inference/interruption/utils.cjs.map +1 -0
  147. package/dist/inference/interruption/utils.d.cts +41 -0
  148. package/dist/inference/interruption/utils.d.ts +41 -0
  149. package/dist/inference/interruption/utils.d.ts.map +1 -0
  150. package/dist/inference/interruption/utils.js +105 -0
  151. package/dist/inference/interruption/utils.js.map +1 -0
  152. package/dist/inference/interruption/utils.test.cjs +105 -0
  153. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  154. package/dist/inference/interruption/utils.test.js +104 -0
  155. package/dist/inference/interruption/utils.test.js.map +1 -0
  156. package/dist/inference/interruption/ws_transport.cjs +347 -0
  157. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  158. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  159. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  160. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  161. package/dist/inference/interruption/ws_transport.js +313 -0
  162. package/dist/inference/interruption/ws_transport.js.map +1 -0
  163. package/dist/inference/llm.cjs +106 -66
  164. package/dist/inference/llm.cjs.map +1 -1
  165. package/dist/inference/llm.d.cts +65 -43
  166. package/dist/inference/llm.d.ts +65 -43
  167. package/dist/inference/llm.d.ts.map +1 -1
  168. package/dist/inference/llm.js +100 -66
  169. package/dist/inference/llm.js.map +1 -1
  170. package/dist/inference/stt.cjs +319 -170
  171. package/dist/inference/stt.cjs.map +1 -1
  172. package/dist/inference/stt.d.cts +64 -15
  173. package/dist/inference/stt.d.ts +64 -15
  174. package/dist/inference/stt.d.ts.map +1 -1
  175. package/dist/inference/stt.js +319 -170
  176. package/dist/inference/stt.js.map +1 -1
  177. package/dist/inference/stt.test.cjs +218 -0
  178. package/dist/inference/stt.test.cjs.map +1 -0
  179. package/dist/inference/stt.test.js +217 -0
  180. package/dist/inference/stt.test.js.map +1 -0
  181. package/dist/inference/tts.cjs +249 -71
  182. package/dist/inference/tts.cjs.map +1 -1
  183. package/dist/inference/tts.d.cts +94 -17
  184. package/dist/inference/tts.d.ts +94 -17
  185. package/dist/inference/tts.d.ts.map +1 -1
  186. package/dist/inference/tts.js +249 -77
  187. package/dist/inference/tts.js.map +1 -1
  188. package/dist/inference/tts.test.cjs +305 -0
  189. package/dist/inference/tts.test.cjs.map +1 -0
  190. package/dist/inference/tts.test.js +304 -0
  191. package/dist/inference/tts.test.js.map +1 -0
  192. package/dist/inference/utils.cjs +26 -7
  193. package/dist/inference/utils.cjs.map +1 -1
  194. package/dist/inference/utils.d.cts +14 -1
  195. package/dist/inference/utils.d.ts +14 -1
  196. package/dist/inference/utils.d.ts.map +1 -1
  197. package/dist/inference/utils.js +18 -2
  198. package/dist/inference/utils.js.map +1 -1
  199. package/dist/ipc/inference_proc_executor.cjs +6 -3
  200. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  201. package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
  202. package/dist/ipc/inference_proc_executor.js +6 -3
  203. package/dist/ipc/inference_proc_executor.js.map +1 -1
  204. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  205. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  206. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  207. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  208. package/dist/ipc/job_proc_executor.cjs +6 -1
  209. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  210. package/dist/ipc/job_proc_executor.d.ts.map +1 -1
  211. package/dist/ipc/job_proc_executor.js +6 -1
  212. package/dist/ipc/job_proc_executor.js.map +1 -1
  213. package/dist/ipc/job_proc_lazy_main.cjs +89 -17
  214. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  215. package/dist/ipc/job_proc_lazy_main.js +68 -18
  216. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  217. package/dist/ipc/supervised_proc.cjs +34 -8
  218. package/dist/ipc/supervised_proc.cjs.map +1 -1
  219. package/dist/ipc/supervised_proc.d.cts +8 -0
  220. package/dist/ipc/supervised_proc.d.ts +8 -0
  221. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  222. package/dist/ipc/supervised_proc.js +34 -8
  223. package/dist/ipc/supervised_proc.js.map +1 -1
  224. package/dist/ipc/supervised_proc.test.cjs +145 -0
  225. package/dist/ipc/supervised_proc.test.cjs.map +1 -0
  226. package/dist/ipc/supervised_proc.test.js +122 -0
  227. package/dist/ipc/supervised_proc.test.js.map +1 -0
  228. package/dist/job.cjs +109 -1
  229. package/dist/job.cjs.map +1 -1
  230. package/dist/job.d.cts +14 -0
  231. package/dist/job.d.ts +14 -0
  232. package/dist/job.d.ts.map +1 -1
  233. package/dist/job.js +99 -1
  234. package/dist/job.js.map +1 -1
  235. package/dist/language.cjs +394 -0
  236. package/dist/language.cjs.map +1 -0
  237. package/dist/language.d.cts +15 -0
  238. package/dist/language.d.ts +15 -0
  239. package/dist/language.d.ts.map +1 -0
  240. package/dist/language.js +363 -0
  241. package/dist/language.js.map +1 -0
  242. package/dist/language.test.cjs +43 -0
  243. package/dist/language.test.cjs.map +1 -0
  244. package/dist/language.test.js +49 -0
  245. package/dist/language.test.js.map +1 -0
  246. package/dist/llm/chat_context.cjs +345 -3
  247. package/dist/llm/chat_context.cjs.map +1 -1
  248. package/dist/llm/chat_context.d.cts +86 -2
  249. package/dist/llm/chat_context.d.ts +86 -2
  250. package/dist/llm/chat_context.d.ts.map +1 -1
  251. package/dist/llm/chat_context.js +344 -3
  252. package/dist/llm/chat_context.js.map +1 -1
  253. package/dist/llm/chat_context.test.cjs +692 -0
  254. package/dist/llm/chat_context.test.cjs.map +1 -1
  255. package/dist/llm/chat_context.test.js +692 -0
  256. package/dist/llm/chat_context.test.js.map +1 -1
  257. package/dist/llm/fallback_adapter.cjs +280 -0
  258. package/dist/llm/fallback_adapter.cjs.map +1 -0
  259. package/dist/llm/fallback_adapter.d.cts +73 -0
  260. package/dist/llm/fallback_adapter.d.ts +73 -0
  261. package/dist/llm/fallback_adapter.d.ts.map +1 -0
  262. package/dist/llm/fallback_adapter.js +256 -0
  263. package/dist/llm/fallback_adapter.js.map +1 -0
  264. package/dist/llm/fallback_adapter.test.cjs +176 -0
  265. package/dist/llm/fallback_adapter.test.cjs.map +1 -0
  266. package/dist/llm/fallback_adapter.test.js +175 -0
  267. package/dist/llm/fallback_adapter.test.js.map +1 -0
  268. package/dist/llm/index.cjs +11 -0
  269. package/dist/llm/index.cjs.map +1 -1
  270. package/dist/llm/index.d.cts +4 -3
  271. package/dist/llm/index.d.ts +4 -3
  272. package/dist/llm/index.d.ts.map +1 -1
  273. package/dist/llm/index.js +13 -1
  274. package/dist/llm/index.js.map +1 -1
  275. package/dist/llm/llm.cjs +65 -11
  276. package/dist/llm/llm.cjs.map +1 -1
  277. package/dist/llm/llm.d.cts +13 -2
  278. package/dist/llm/llm.d.ts +13 -2
  279. package/dist/llm/llm.d.ts.map +1 -1
  280. package/dist/llm/llm.js +65 -11
  281. package/dist/llm/llm.js.map +1 -1
  282. package/dist/llm/provider_format/google.cjs +6 -2
  283. package/dist/llm/provider_format/google.cjs.map +1 -1
  284. package/dist/llm/provider_format/google.d.cts +1 -1
  285. package/dist/llm/provider_format/google.d.ts +1 -1
  286. package/dist/llm/provider_format/google.d.ts.map +1 -1
  287. package/dist/llm/provider_format/google.js +6 -2
  288. package/dist/llm/provider_format/google.js.map +1 -1
  289. package/dist/llm/provider_format/google.test.cjs +48 -0
  290. package/dist/llm/provider_format/google.test.cjs.map +1 -1
  291. package/dist/llm/provider_format/google.test.js +54 -1
  292. package/dist/llm/provider_format/google.test.js.map +1 -1
  293. package/dist/llm/provider_format/index.cjs +2 -0
  294. package/dist/llm/provider_format/index.cjs.map +1 -1
  295. package/dist/llm/provider_format/index.d.cts +2 -2
  296. package/dist/llm/provider_format/index.d.ts +2 -2
  297. package/dist/llm/provider_format/index.d.ts.map +1 -1
  298. package/dist/llm/provider_format/index.js +6 -1
  299. package/dist/llm/provider_format/index.js.map +1 -1
  300. package/dist/llm/provider_format/openai.cjs +126 -24
  301. package/dist/llm/provider_format/openai.cjs.map +1 -1
  302. package/dist/llm/provider_format/openai.d.cts +1 -0
  303. package/dist/llm/provider_format/openai.d.ts +1 -0
  304. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  305. package/dist/llm/provider_format/openai.js +124 -23
  306. package/dist/llm/provider_format/openai.js.map +1 -1
  307. package/dist/llm/provider_format/openai.test.cjs +393 -0
  308. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  309. package/dist/llm/provider_format/openai.test.js +400 -2
  310. package/dist/llm/provider_format/openai.test.js.map +1 -1
  311. package/dist/llm/provider_format/utils.cjs +5 -4
  312. package/dist/llm/provider_format/utils.cjs.map +1 -1
  313. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  314. package/dist/llm/provider_format/utils.js +5 -4
  315. package/dist/llm/provider_format/utils.js.map +1 -1
  316. package/dist/llm/realtime.cjs +3 -0
  317. package/dist/llm/realtime.cjs.map +1 -1
  318. package/dist/llm/realtime.d.cts +15 -1
  319. package/dist/llm/realtime.d.ts +15 -1
  320. package/dist/llm/realtime.d.ts.map +1 -1
  321. package/dist/llm/realtime.js +3 -0
  322. package/dist/llm/realtime.js.map +1 -1
  323. package/dist/llm/remote_chat_context.cjs.map +1 -1
  324. package/dist/llm/remote_chat_context.d.cts +2 -0
  325. package/dist/llm/remote_chat_context.d.ts +2 -0
  326. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  327. package/dist/llm/remote_chat_context.js.map +1 -1
  328. package/dist/llm/tool_context.cjs +50 -2
  329. package/dist/llm/tool_context.cjs.map +1 -1
  330. package/dist/llm/tool_context.d.cts +47 -11
  331. package/dist/llm/tool_context.d.ts +47 -11
  332. package/dist/llm/tool_context.d.ts.map +1 -1
  333. package/dist/llm/tool_context.js +48 -3
  334. package/dist/llm/tool_context.js.map +1 -1
  335. package/dist/llm/tool_context.test.cjs +197 -0
  336. package/dist/llm/tool_context.test.cjs.map +1 -1
  337. package/dist/llm/tool_context.test.js +175 -0
  338. package/dist/llm/tool_context.test.js.map +1 -1
  339. package/dist/llm/utils.cjs +107 -12
  340. package/dist/llm/utils.cjs.map +1 -1
  341. package/dist/llm/utils.d.cts +10 -3
  342. package/dist/llm/utils.d.ts +10 -3
  343. package/dist/llm/utils.d.ts.map +1 -1
  344. package/dist/llm/utils.js +106 -12
  345. package/dist/llm/utils.js.map +1 -1
  346. package/dist/llm/utils.test.cjs +90 -0
  347. package/dist/llm/utils.test.cjs.map +1 -1
  348. package/dist/llm/utils.test.js +98 -2
  349. package/dist/llm/utils.test.js.map +1 -1
  350. package/dist/llm/zod-utils.cjs +102 -0
  351. package/dist/llm/zod-utils.cjs.map +1 -0
  352. package/dist/llm/zod-utils.d.cts +65 -0
  353. package/dist/llm/zod-utils.d.ts +65 -0
  354. package/dist/llm/zod-utils.d.ts.map +1 -0
  355. package/dist/llm/zod-utils.js +64 -0
  356. package/dist/llm/zod-utils.js.map +1 -0
  357. package/dist/llm/zod-utils.test.cjs +472 -0
  358. package/dist/llm/zod-utils.test.cjs.map +1 -0
  359. package/dist/llm/zod-utils.test.js +455 -0
  360. package/dist/llm/zod-utils.test.js.map +1 -0
  361. package/dist/log.cjs +45 -14
  362. package/dist/log.cjs.map +1 -1
  363. package/dist/log.d.cts +8 -1
  364. package/dist/log.d.ts +8 -1
  365. package/dist/log.d.ts.map +1 -1
  366. package/dist/log.js +45 -15
  367. package/dist/log.js.map +1 -1
  368. package/dist/metrics/base.cjs.map +1 -1
  369. package/dist/metrics/base.d.cts +75 -19
  370. package/dist/metrics/base.d.ts +75 -19
  371. package/dist/metrics/base.d.ts.map +1 -1
  372. package/dist/metrics/index.cjs +5 -0
  373. package/dist/metrics/index.cjs.map +1 -1
  374. package/dist/metrics/index.d.cts +2 -1
  375. package/dist/metrics/index.d.ts +2 -1
  376. package/dist/metrics/index.d.ts.map +1 -1
  377. package/dist/metrics/index.js +6 -0
  378. package/dist/metrics/index.js.map +1 -1
  379. package/dist/metrics/model_usage.cjs +189 -0
  380. package/dist/metrics/model_usage.cjs.map +1 -0
  381. package/dist/metrics/model_usage.d.cts +92 -0
  382. package/dist/metrics/model_usage.d.ts +92 -0
  383. package/dist/metrics/model_usage.d.ts.map +1 -0
  384. package/dist/metrics/model_usage.js +164 -0
  385. package/dist/metrics/model_usage.js.map +1 -0
  386. package/dist/metrics/model_usage.test.cjs +474 -0
  387. package/dist/metrics/model_usage.test.cjs.map +1 -0
  388. package/dist/metrics/model_usage.test.js +476 -0
  389. package/dist/metrics/model_usage.test.js.map +1 -0
  390. package/dist/metrics/usage_collector.cjs +5 -2
  391. package/dist/metrics/usage_collector.cjs.map +1 -1
  392. package/dist/metrics/usage_collector.d.cts +10 -1
  393. package/dist/metrics/usage_collector.d.ts +10 -1
  394. package/dist/metrics/usage_collector.d.ts.map +1 -1
  395. package/dist/metrics/usage_collector.js +5 -2
  396. package/dist/metrics/usage_collector.js.map +1 -1
  397. package/dist/metrics/utils.cjs +23 -7
  398. package/dist/metrics/utils.cjs.map +1 -1
  399. package/dist/metrics/utils.d.ts.map +1 -1
  400. package/dist/metrics/utils.js +23 -7
  401. package/dist/metrics/utils.js.map +1 -1
  402. package/dist/stream/deferred_stream.cjs +31 -10
  403. package/dist/stream/deferred_stream.cjs.map +1 -1
  404. package/dist/stream/deferred_stream.d.cts +6 -1
  405. package/dist/stream/deferred_stream.d.ts +6 -1
  406. package/dist/stream/deferred_stream.d.ts.map +1 -1
  407. package/dist/stream/deferred_stream.js +31 -10
  408. package/dist/stream/deferred_stream.js.map +1 -1
  409. package/dist/stream/deferred_stream.test.cjs +2 -2
  410. package/dist/stream/deferred_stream.test.cjs.map +1 -1
  411. package/dist/stream/deferred_stream.test.js +2 -2
  412. package/dist/stream/deferred_stream.test.js.map +1 -1
  413. package/dist/stream/index.cjs +3 -0
  414. package/dist/stream/index.cjs.map +1 -1
  415. package/dist/stream/index.d.cts +1 -0
  416. package/dist/stream/index.d.ts +1 -0
  417. package/dist/stream/index.d.ts.map +1 -1
  418. package/dist/stream/index.js +2 -0
  419. package/dist/stream/index.js.map +1 -1
  420. package/dist/stream/multi_input_stream.cjs +139 -0
  421. package/dist/stream/multi_input_stream.cjs.map +1 -0
  422. package/dist/stream/multi_input_stream.d.cts +55 -0
  423. package/dist/stream/multi_input_stream.d.ts +55 -0
  424. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  425. package/dist/stream/multi_input_stream.js +115 -0
  426. package/dist/stream/multi_input_stream.js.map +1 -0
  427. package/dist/stream/multi_input_stream.test.cjs +344 -0
  428. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  429. package/dist/stream/multi_input_stream.test.js +343 -0
  430. package/dist/stream/multi_input_stream.test.js.map +1 -0
  431. package/dist/stream/stream_channel.cjs +39 -1
  432. package/dist/stream/stream_channel.cjs.map +1 -1
  433. package/dist/stream/stream_channel.d.cts +5 -2
  434. package/dist/stream/stream_channel.d.ts +5 -2
  435. package/dist/stream/stream_channel.d.ts.map +1 -1
  436. package/dist/stream/stream_channel.js +39 -1
  437. package/dist/stream/stream_channel.js.map +1 -1
  438. package/dist/stream/stream_channel.test.cjs +27 -0
  439. package/dist/stream/stream_channel.test.cjs.map +1 -1
  440. package/dist/stream/stream_channel.test.js +27 -0
  441. package/dist/stream/stream_channel.test.js.map +1 -1
  442. package/dist/stt/stream_adapter.cjs +24 -9
  443. package/dist/stt/stream_adapter.cjs.map +1 -1
  444. package/dist/stt/stream_adapter.d.cts +7 -3
  445. package/dist/stt/stream_adapter.d.ts +7 -3
  446. package/dist/stt/stream_adapter.d.ts.map +1 -1
  447. package/dist/stt/stream_adapter.js +24 -9
  448. package/dist/stt/stream_adapter.js.map +1 -1
  449. package/dist/stt/stt.cjs +94 -19
  450. package/dist/stt/stt.cjs.map +1 -1
  451. package/dist/stt/stt.d.cts +68 -5
  452. package/dist/stt/stt.d.ts +68 -5
  453. package/dist/stt/stt.d.ts.map +1 -1
  454. package/dist/stt/stt.js +96 -21
  455. package/dist/stt/stt.js.map +1 -1
  456. package/dist/telemetry/index.cjs +72 -0
  457. package/dist/telemetry/index.cjs.map +1 -0
  458. package/dist/telemetry/index.d.cts +7 -0
  459. package/dist/telemetry/index.d.ts +7 -0
  460. package/dist/telemetry/index.d.ts.map +1 -0
  461. package/dist/telemetry/index.js +37 -0
  462. package/dist/telemetry/index.js.map +1 -0
  463. package/dist/telemetry/logging.cjs +65 -0
  464. package/dist/telemetry/logging.cjs.map +1 -0
  465. package/dist/telemetry/logging.d.cts +21 -0
  466. package/dist/telemetry/logging.d.ts +21 -0
  467. package/dist/telemetry/logging.d.ts.map +1 -0
  468. package/dist/telemetry/logging.js +40 -0
  469. package/dist/telemetry/logging.js.map +1 -0
  470. package/dist/telemetry/otel_http_exporter.cjs +166 -0
  471. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  472. package/dist/telemetry/otel_http_exporter.d.cts +63 -0
  473. package/dist/telemetry/otel_http_exporter.d.ts +63 -0
  474. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  475. package/dist/telemetry/otel_http_exporter.js +142 -0
  476. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  477. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  478. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  479. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  480. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  481. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  482. package/dist/telemetry/pino_otel_transport.js +189 -0
  483. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  484. package/dist/telemetry/trace_types.cjs +233 -0
  485. package/dist/telemetry/trace_types.cjs.map +1 -0
  486. package/dist/telemetry/trace_types.d.cts +74 -0
  487. package/dist/telemetry/trace_types.d.ts +74 -0
  488. package/dist/telemetry/trace_types.d.ts.map +1 -0
  489. package/dist/telemetry/trace_types.js +141 -0
  490. package/dist/telemetry/trace_types.js.map +1 -0
  491. package/dist/telemetry/traces.cjs +484 -0
  492. package/dist/telemetry/traces.cjs.map +1 -0
  493. package/dist/telemetry/traces.d.cts +116 -0
  494. package/dist/telemetry/traces.d.ts +116 -0
  495. package/dist/telemetry/traces.d.ts.map +1 -0
  496. package/dist/telemetry/traces.js +449 -0
  497. package/dist/telemetry/traces.js.map +1 -0
  498. package/dist/telemetry/utils.cjs +86 -0
  499. package/dist/telemetry/utils.cjs.map +1 -0
  500. package/dist/telemetry/utils.d.cts +5 -0
  501. package/dist/telemetry/utils.d.ts +5 -0
  502. package/dist/telemetry/utils.d.ts.map +1 -0
  503. package/dist/telemetry/utils.js +51 -0
  504. package/dist/telemetry/utils.js.map +1 -0
  505. package/dist/tokenize/basic/sentence.cjs +3 -3
  506. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  507. package/dist/tokenize/basic/sentence.js +3 -3
  508. package/dist/tokenize/basic/sentence.js.map +1 -1
  509. package/dist/tokenize/tokenizer.test.cjs +3 -1
  510. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  511. package/dist/tokenize/tokenizer.test.js +3 -1
  512. package/dist/tokenize/tokenizer.test.js.map +1 -1
  513. package/dist/transcription.cjs.map +1 -1
  514. package/dist/transcription.d.cts +6 -0
  515. package/dist/transcription.d.ts +6 -0
  516. package/dist/transcription.d.ts.map +1 -1
  517. package/dist/transcription.js.map +1 -1
  518. package/dist/tts/fallback_adapter.cjs +472 -0
  519. package/dist/tts/fallback_adapter.cjs.map +1 -0
  520. package/dist/tts/fallback_adapter.d.cts +110 -0
  521. package/dist/tts/fallback_adapter.d.ts +110 -0
  522. package/dist/tts/fallback_adapter.d.ts.map +1 -0
  523. package/dist/tts/fallback_adapter.js +448 -0
  524. package/dist/tts/fallback_adapter.js.map +1 -0
  525. package/dist/tts/index.cjs +3 -0
  526. package/dist/tts/index.cjs.map +1 -1
  527. package/dist/tts/index.d.cts +1 -0
  528. package/dist/tts/index.d.ts +1 -0
  529. package/dist/tts/index.d.ts.map +1 -1
  530. package/dist/tts/index.js +2 -0
  531. package/dist/tts/index.js.map +1 -1
  532. package/dist/tts/stream_adapter.cjs +25 -8
  533. package/dist/tts/stream_adapter.cjs.map +1 -1
  534. package/dist/tts/stream_adapter.d.cts +6 -3
  535. package/dist/tts/stream_adapter.d.ts +6 -3
  536. package/dist/tts/stream_adapter.d.ts.map +1 -1
  537. package/dist/tts/stream_adapter.js +25 -8
  538. package/dist/tts/stream_adapter.js.map +1 -1
  539. package/dist/tts/tts.cjs +189 -57
  540. package/dist/tts/tts.cjs.map +1 -1
  541. package/dist/tts/tts.d.cts +58 -6
  542. package/dist/tts/tts.d.ts +58 -6
  543. package/dist/tts/tts.d.ts.map +1 -1
  544. package/dist/tts/tts.js +191 -59
  545. package/dist/tts/tts.js.map +1 -1
  546. package/dist/typed_promise.cjs +48 -0
  547. package/dist/typed_promise.cjs.map +1 -0
  548. package/dist/typed_promise.d.cts +24 -0
  549. package/dist/typed_promise.d.ts +24 -0
  550. package/dist/typed_promise.d.ts.map +1 -0
  551. package/dist/typed_promise.js +28 -0
  552. package/dist/typed_promise.js.map +1 -0
  553. package/dist/types.cjs +24 -32
  554. package/dist/types.cjs.map +1 -1
  555. package/dist/types.d.cts +45 -10
  556. package/dist/types.d.ts +45 -10
  557. package/dist/types.d.ts.map +1 -1
  558. package/dist/types.js +20 -30
  559. package/dist/types.js.map +1 -1
  560. package/dist/utils.cjs +124 -28
  561. package/dist/utils.cjs.map +1 -1
  562. package/dist/utils.d.cts +41 -1
  563. package/dist/utils.d.ts +41 -1
  564. package/dist/utils.d.ts.map +1 -1
  565. package/dist/utils.js +119 -27
  566. package/dist/utils.js.map +1 -1
  567. package/dist/utils.test.cjs +73 -1
  568. package/dist/utils.test.cjs.map +1 -1
  569. package/dist/utils.test.js +74 -10
  570. package/dist/utils.test.js.map +1 -1
  571. package/dist/vad.cjs +35 -15
  572. package/dist/vad.cjs.map +1 -1
  573. package/dist/vad.d.cts +15 -5
  574. package/dist/vad.d.ts +15 -5
  575. package/dist/vad.d.ts.map +1 -1
  576. package/dist/vad.js +35 -15
  577. package/dist/vad.js.map +1 -1
  578. package/dist/version.cjs +1 -1
  579. package/dist/version.cjs.map +1 -1
  580. package/dist/version.d.cts +1 -1
  581. package/dist/version.d.ts +1 -1
  582. package/dist/version.d.ts.map +1 -1
  583. package/dist/version.js +1 -1
  584. package/dist/version.js.map +1 -1
  585. package/dist/voice/agent.cjs +258 -35
  586. package/dist/voice/agent.cjs.map +1 -1
  587. package/dist/voice/agent.d.cts +54 -13
  588. package/dist/voice/agent.d.ts +54 -13
  589. package/dist/voice/agent.d.ts.map +1 -1
  590. package/dist/voice/agent.js +254 -34
  591. package/dist/voice/agent.js.map +1 -1
  592. package/dist/voice/agent.test.cjs +314 -0
  593. package/dist/voice/agent.test.cjs.map +1 -1
  594. package/dist/voice/agent.test.js +316 -2
  595. package/dist/voice/agent.test.js.map +1 -1
  596. package/dist/voice/agent_activity.cjs +1116 -385
  597. package/dist/voice/agent_activity.cjs.map +1 -1
  598. package/dist/voice/agent_activity.d.cts +72 -11
  599. package/dist/voice/agent_activity.d.ts +72 -11
  600. package/dist/voice/agent_activity.d.ts.map +1 -1
  601. package/dist/voice/agent_activity.js +1119 -383
  602. package/dist/voice/agent_activity.js.map +1 -1
  603. package/dist/voice/agent_activity.test.cjs +135 -0
  604. package/dist/voice/agent_activity.test.cjs.map +1 -0
  605. package/dist/voice/agent_activity.test.js +134 -0
  606. package/dist/voice/agent_activity.test.js.map +1 -0
  607. package/dist/voice/agent_session.cjs +550 -90
  608. package/dist/voice/agent_session.cjs.map +1 -1
  609. package/dist/voice/agent_session.d.cts +185 -25
  610. package/dist/voice/agent_session.d.ts +185 -25
  611. package/dist/voice/agent_session.d.ts.map +1 -1
  612. package/dist/voice/agent_session.js +556 -91
  613. package/dist/voice/agent_session.js.map +1 -1
  614. package/dist/voice/audio_recognition.cjs +605 -46
  615. package/dist/voice/audio_recognition.cjs.map +1 -1
  616. package/dist/voice/audio_recognition.d.cts +96 -4
  617. package/dist/voice/audio_recognition.d.ts +96 -4
  618. package/dist/voice/audio_recognition.d.ts.map +1 -1
  619. package/dist/voice/audio_recognition.js +611 -47
  620. package/dist/voice/audio_recognition.js.map +1 -1
  621. package/dist/voice/audio_recognition_span.test.cjs +295 -0
  622. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  623. package/dist/voice/audio_recognition_span.test.js +299 -0
  624. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  625. package/dist/voice/avatar/datastream_io.cjs +7 -1
  626. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  627. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  628. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  629. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  630. package/dist/voice/avatar/datastream_io.js +7 -1
  631. package/dist/voice/avatar/datastream_io.js.map +1 -1
  632. package/dist/voice/background_audio.cjs +367 -0
  633. package/dist/voice/background_audio.cjs.map +1 -0
  634. package/dist/voice/background_audio.d.cts +123 -0
  635. package/dist/voice/background_audio.d.ts +123 -0
  636. package/dist/voice/background_audio.d.ts.map +1 -0
  637. package/dist/voice/background_audio.js +343 -0
  638. package/dist/voice/background_audio.js.map +1 -0
  639. package/dist/voice/events.cjs +3 -0
  640. package/dist/voice/events.cjs.map +1 -1
  641. package/dist/voice/events.d.cts +16 -9
  642. package/dist/voice/events.d.ts +16 -9
  643. package/dist/voice/events.d.ts.map +1 -1
  644. package/dist/voice/events.js +3 -0
  645. package/dist/voice/events.js.map +1 -1
  646. package/dist/voice/generation.cjs +205 -41
  647. package/dist/voice/generation.cjs.map +1 -1
  648. package/dist/voice/generation.d.cts +21 -5
  649. package/dist/voice/generation.d.ts +21 -5
  650. package/dist/voice/generation.d.ts.map +1 -1
  651. package/dist/voice/generation.js +215 -43
  652. package/dist/voice/generation.js.map +1 -1
  653. package/dist/voice/generation_tools.test.cjs +236 -0
  654. package/dist/voice/generation_tools.test.cjs.map +1 -0
  655. package/dist/voice/generation_tools.test.js +235 -0
  656. package/dist/voice/generation_tools.test.js.map +1 -0
  657. package/dist/voice/index.cjs +33 -2
  658. package/dist/voice/index.cjs.map +1 -1
  659. package/dist/voice/index.d.cts +8 -2
  660. package/dist/voice/index.d.ts +8 -2
  661. package/dist/voice/index.d.ts.map +1 -1
  662. package/dist/voice/index.js +19 -2
  663. package/dist/voice/index.js.map +1 -1
  664. package/dist/voice/interruption_detection.test.cjs +114 -0
  665. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  666. package/dist/voice/interruption_detection.test.js +113 -0
  667. package/dist/voice/interruption_detection.test.js.map +1 -0
  668. package/dist/voice/io.cjs +66 -6
  669. package/dist/voice/io.cjs.map +1 -1
  670. package/dist/voice/io.d.cts +67 -7
  671. package/dist/voice/io.d.ts +67 -7
  672. package/dist/voice/io.d.ts.map +1 -1
  673. package/dist/voice/io.js +62 -5
  674. package/dist/voice/io.js.map +1 -1
  675. package/dist/voice/recorder_io/index.cjs +23 -0
  676. package/dist/voice/recorder_io/index.cjs.map +1 -0
  677. package/dist/voice/recorder_io/index.d.cts +2 -0
  678. package/dist/voice/recorder_io/index.d.ts +2 -0
  679. package/dist/voice/recorder_io/index.d.ts.map +1 -0
  680. package/dist/voice/recorder_io/index.js +2 -0
  681. package/dist/voice/recorder_io/index.js.map +1 -0
  682. package/dist/voice/recorder_io/recorder_io.cjs +607 -0
  683. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
  684. package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
  685. package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
  686. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
  687. package/dist/voice/recorder_io/recorder_io.js +573 -0
  688. package/dist/voice/recorder_io/recorder_io.js.map +1 -0
  689. package/dist/voice/remote_session.cjs +922 -0
  690. package/dist/voice/remote_session.cjs.map +1 -0
  691. package/dist/voice/remote_session.d.cts +108 -0
  692. package/dist/voice/remote_session.d.ts +108 -0
  693. package/dist/voice/remote_session.d.ts.map +1 -0
  694. package/dist/voice/remote_session.js +887 -0
  695. package/dist/voice/remote_session.js.map +1 -0
  696. package/dist/voice/report.cjs +88 -0
  697. package/dist/voice/report.cjs.map +1 -0
  698. package/dist/voice/report.d.cts +49 -0
  699. package/dist/voice/report.d.ts +49 -0
  700. package/dist/voice/report.d.ts.map +1 -0
  701. package/dist/voice/report.js +63 -0
  702. package/dist/voice/report.js.map +1 -0
  703. package/dist/voice/report.test.cjs +121 -0
  704. package/dist/voice/report.test.cjs.map +1 -0
  705. package/dist/voice/report.test.js +120 -0
  706. package/dist/voice/report.test.js.map +1 -0
  707. package/dist/voice/room_io/_input.cjs +40 -7
  708. package/dist/voice/room_io/_input.cjs.map +1 -1
  709. package/dist/voice/room_io/_input.d.cts +5 -2
  710. package/dist/voice/room_io/_input.d.ts +5 -2
  711. package/dist/voice/room_io/_input.d.ts.map +1 -1
  712. package/dist/voice/room_io/_input.js +41 -8
  713. package/dist/voice/room_io/_input.js.map +1 -1
  714. package/dist/voice/room_io/_output.cjs +19 -11
  715. package/dist/voice/room_io/_output.cjs.map +1 -1
  716. package/dist/voice/room_io/_output.d.cts +7 -4
  717. package/dist/voice/room_io/_output.d.ts +7 -4
  718. package/dist/voice/room_io/_output.d.ts.map +1 -1
  719. package/dist/voice/room_io/_output.js +20 -12
  720. package/dist/voice/room_io/_output.js.map +1 -1
  721. package/dist/voice/room_io/room_io.cjs +33 -6
  722. package/dist/voice/room_io/room_io.cjs.map +1 -1
  723. package/dist/voice/room_io/room_io.d.cts +29 -9
  724. package/dist/voice/room_io/room_io.d.ts +29 -9
  725. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  726. package/dist/voice/room_io/room_io.js +33 -7
  727. package/dist/voice/room_io/room_io.js.map +1 -1
  728. package/dist/voice/speech_handle.cjs +22 -4
  729. package/dist/voice/speech_handle.cjs.map +1 -1
  730. package/dist/voice/speech_handle.d.cts +17 -2
  731. package/dist/voice/speech_handle.d.ts +17 -2
  732. package/dist/voice/speech_handle.d.ts.map +1 -1
  733. package/dist/voice/speech_handle.js +21 -4
  734. package/dist/voice/speech_handle.js.map +1 -1
  735. package/dist/voice/testing/fake_llm.cjs +127 -0
  736. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  737. package/dist/voice/testing/fake_llm.d.cts +30 -0
  738. package/dist/voice/testing/fake_llm.d.ts +30 -0
  739. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  740. package/dist/voice/testing/fake_llm.js +103 -0
  741. package/dist/voice/testing/fake_llm.js.map +1 -0
  742. package/dist/voice/testing/index.cjs +57 -0
  743. package/dist/voice/testing/index.cjs.map +1 -0
  744. package/dist/voice/testing/index.d.cts +21 -0
  745. package/dist/voice/testing/index.d.ts +21 -0
  746. package/dist/voice/testing/index.d.ts.map +1 -0
  747. package/dist/voice/testing/index.js +35 -0
  748. package/dist/voice/testing/index.js.map +1 -0
  749. package/dist/voice/testing/run_result.cjs +817 -0
  750. package/dist/voice/testing/run_result.cjs.map +1 -0
  751. package/dist/voice/testing/run_result.d.cts +385 -0
  752. package/dist/voice/testing/run_result.d.ts +385 -0
  753. package/dist/voice/testing/run_result.d.ts.map +1 -0
  754. package/dist/voice/testing/run_result.js +790 -0
  755. package/dist/voice/testing/run_result.js.map +1 -0
  756. package/dist/voice/testing/types.cjs +46 -0
  757. package/dist/voice/testing/types.cjs.map +1 -0
  758. package/dist/voice/testing/types.d.cts +83 -0
  759. package/dist/voice/testing/types.d.ts +83 -0
  760. package/dist/voice/testing/types.d.ts.map +1 -0
  761. package/dist/voice/testing/types.js +19 -0
  762. package/dist/voice/testing/types.js.map +1 -0
  763. package/dist/voice/transcription/synchronizer.cjs +139 -15
  764. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  765. package/dist/voice/transcription/synchronizer.d.cts +35 -4
  766. package/dist/voice/transcription/synchronizer.d.ts +35 -4
  767. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  768. package/dist/voice/transcription/synchronizer.js +143 -16
  769. package/dist/voice/transcription/synchronizer.js.map +1 -1
  770. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  771. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  772. package/dist/voice/transcription/synchronizer.test.js +150 -0
  773. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  774. package/dist/voice/turn_config/endpointing.cjs +33 -0
  775. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  776. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  777. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  778. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  779. package/dist/voice/turn_config/endpointing.js +9 -0
  780. package/dist/voice/turn_config/endpointing.js.map +1 -0
  781. package/dist/voice/turn_config/interruption.cjs +37 -0
  782. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  783. package/dist/voice/turn_config/interruption.d.cts +53 -0
  784. package/dist/voice/turn_config/interruption.d.ts +53 -0
  785. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  786. package/dist/voice/turn_config/interruption.js +13 -0
  787. package/dist/voice/turn_config/interruption.js.map +1 -0
  788. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  789. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  790. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  791. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  792. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  793. package/dist/voice/turn_config/turn_handling.js +11 -0
  794. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  795. package/dist/voice/turn_config/utils.cjs +157 -0
  796. package/dist/voice/turn_config/utils.cjs.map +1 -0
  797. package/dist/voice/turn_config/utils.d.cts +37 -0
  798. package/dist/voice/turn_config/utils.d.ts +37 -0
  799. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  800. package/dist/voice/turn_config/utils.js +131 -0
  801. package/dist/voice/turn_config/utils.js.map +1 -0
  802. package/dist/voice/turn_config/utils.test.cjs +128 -0
  803. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  804. package/dist/voice/turn_config/utils.test.js +127 -0
  805. package/dist/voice/turn_config/utils.test.js.map +1 -0
  806. package/dist/voice/utils.cjs +47 -0
  807. package/dist/voice/utils.cjs.map +1 -0
  808. package/dist/voice/utils.d.cts +4 -0
  809. package/dist/voice/utils.d.ts +4 -0
  810. package/dist/voice/utils.d.ts.map +1 -0
  811. package/dist/voice/utils.js +23 -0
  812. package/dist/voice/utils.js.map +1 -0
  813. package/dist/worker.cjs +44 -52
  814. package/dist/worker.cjs.map +1 -1
  815. package/dist/worker.d.cts +18 -8
  816. package/dist/worker.d.ts +18 -8
  817. package/dist/worker.d.ts.map +1 -1
  818. package/dist/worker.js +43 -43
  819. package/dist/worker.js.map +1 -1
  820. package/package.json +35 -13
  821. package/resources/NOTICE +2 -0
  822. package/resources/keyboard-typing.ogg +0 -0
  823. package/resources/keyboard-typing2.ogg +0 -0
  824. package/resources/office-ambience.ogg +0 -0
  825. package/src/_exceptions.ts +5 -0
  826. package/src/audio.ts +132 -1
  827. package/src/beta/index.ts +9 -0
  828. package/src/beta/workflows/index.ts +9 -0
  829. package/src/beta/workflows/task_group.ts +203 -0
  830. package/src/cli.ts +57 -66
  831. package/src/connection_pool.test.ts +346 -0
  832. package/src/connection_pool.ts +307 -0
  833. package/src/constants.ts +14 -0
  834. package/src/cpu.test.ts +239 -0
  835. package/src/cpu.ts +173 -0
  836. package/src/http_server.ts +18 -6
  837. package/src/index.ts +15 -13
  838. package/src/inference/api_protos.ts +85 -2
  839. package/src/inference/index.ts +32 -4
  840. package/src/inference/interruption/defaults.ts +51 -0
  841. package/src/inference/interruption/errors.ts +25 -0
  842. package/src/inference/interruption/http_transport.ts +207 -0
  843. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  844. package/src/inference/interruption/interruption_detector.ts +204 -0
  845. package/src/inference/interruption/interruption_stream.ts +467 -0
  846. package/src/inference/interruption/types.ts +84 -0
  847. package/src/inference/interruption/utils.test.ts +132 -0
  848. package/src/inference/interruption/utils.ts +137 -0
  849. package/src/inference/interruption/ws_transport.ts +416 -0
  850. package/src/inference/llm.ts +214 -163
  851. package/src/inference/stt.test.ts +253 -0
  852. package/src/inference/stt.ts +449 -208
  853. package/src/inference/tts.test.ts +354 -0
  854. package/src/inference/tts.ts +417 -115
  855. package/src/inference/utils.ts +30 -2
  856. package/src/ipc/inference_proc_executor.ts +11 -3
  857. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  858. package/src/ipc/job_proc_executor.ts +11 -1
  859. package/src/ipc/job_proc_lazy_main.ts +86 -20
  860. package/src/ipc/supervised_proc.test.ts +153 -0
  861. package/src/ipc/supervised_proc.ts +39 -10
  862. package/src/job.ts +120 -1
  863. package/src/language.test.ts +62 -0
  864. package/src/language.ts +380 -0
  865. package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
  866. package/src/llm/chat_context.test.ts +787 -0
  867. package/src/llm/chat_context.ts +493 -2
  868. package/src/llm/fallback_adapter.test.ts +238 -0
  869. package/src/llm/fallback_adapter.ts +394 -0
  870. package/src/llm/index.ts +13 -0
  871. package/src/llm/llm.ts +77 -12
  872. package/src/llm/provider_format/google.test.ts +72 -1
  873. package/src/llm/provider_format/google.ts +10 -6
  874. package/src/llm/provider_format/index.ts +7 -2
  875. package/src/llm/provider_format/openai.test.ts +480 -2
  876. package/src/llm/provider_format/openai.ts +152 -21
  877. package/src/llm/provider_format/utils.ts +11 -5
  878. package/src/llm/realtime.ts +23 -2
  879. package/src/llm/remote_chat_context.ts +2 -2
  880. package/src/llm/tool_context.test.ts +210 -1
  881. package/src/llm/tool_context.ts +115 -17
  882. package/src/llm/utils.test.ts +103 -2
  883. package/src/llm/utils.ts +152 -16
  884. package/src/llm/zod-utils.test.ts +577 -0
  885. package/src/llm/zod-utils.ts +153 -0
  886. package/src/log.ts +71 -19
  887. package/src/metrics/base.ts +78 -19
  888. package/src/metrics/index.ts +12 -0
  889. package/src/metrics/model_usage.test.ts +545 -0
  890. package/src/metrics/model_usage.ts +262 -0
  891. package/src/metrics/usage_collector.ts +14 -3
  892. package/src/metrics/utils.ts +27 -7
  893. package/src/stream/deferred_stream.test.ts +3 -3
  894. package/src/stream/deferred_stream.ts +43 -11
  895. package/src/stream/index.ts +1 -0
  896. package/src/stream/multi_input_stream.test.ts +545 -0
  897. package/src/stream/multi_input_stream.ts +172 -0
  898. package/src/stream/stream_channel.test.ts +37 -0
  899. package/src/stream/stream_channel.ts +43 -3
  900. package/src/stt/stream_adapter.ts +30 -9
  901. package/src/stt/stt.ts +140 -23
  902. package/src/telemetry/index.ts +28 -0
  903. package/src/telemetry/logging.ts +55 -0
  904. package/src/telemetry/otel_http_exporter.ts +218 -0
  905. package/src/telemetry/pino_otel_transport.ts +265 -0
  906. package/src/telemetry/trace_types.ts +109 -0
  907. package/src/telemetry/traces.ts +673 -0
  908. package/src/telemetry/utils.ts +61 -0
  909. package/src/tokenize/basic/sentence.ts +3 -3
  910. package/src/tokenize/tokenizer.test.ts +4 -0
  911. package/src/transcription.ts +6 -0
  912. package/src/tts/fallback_adapter.ts +586 -0
  913. package/src/tts/index.ts +1 -0
  914. package/src/tts/stream_adapter.ts +38 -8
  915. package/src/tts/tts.ts +245 -62
  916. package/src/typed_promise.ts +67 -0
  917. package/src/types.ts +62 -33
  918. package/src/utils.test.ts +90 -10
  919. package/src/utils.ts +178 -33
  920. package/src/vad.ts +42 -18
  921. package/src/version.ts +1 -1
  922. package/src/voice/agent.test.ts +347 -2
  923. package/src/voice/agent.ts +346 -44
  924. package/src/voice/agent_activity.test.ts +194 -0
  925. package/src/voice/agent_activity.ts +1457 -388
  926. package/src/voice/agent_session.ts +817 -112
  927. package/src/voice/audio_recognition.ts +845 -70
  928. package/src/voice/audio_recognition_span.test.ts +341 -0
  929. package/src/voice/avatar/datastream_io.ts +9 -1
  930. package/src/voice/background_audio.ts +494 -0
  931. package/src/voice/events.ts +27 -7
  932. package/src/voice/generation.ts +310 -56
  933. package/src/voice/generation_tools.test.ts +268 -0
  934. package/src/voice/index.ts +17 -3
  935. package/src/voice/interruption_detection.test.ts +151 -0
  936. package/src/voice/io.ts +115 -12
  937. package/src/voice/recorder_io/index.ts +4 -0
  938. package/src/voice/recorder_io/recorder_io.ts +783 -0
  939. package/src/voice/remote_session.ts +1083 -0
  940. package/src/voice/report.test.ts +136 -0
  941. package/src/voice/report.ts +140 -0
  942. package/src/voice/room_io/_input.ts +45 -10
  943. package/src/voice/room_io/_output.ts +26 -14
  944. package/src/voice/room_io/room_io.ts +67 -22
  945. package/src/voice/speech_handle.ts +38 -6
  946. package/src/voice/testing/fake_llm.ts +138 -0
  947. package/src/voice/testing/index.ts +52 -0
  948. package/src/voice/testing/run_result.ts +995 -0
  949. package/src/voice/testing/types.ts +118 -0
  950. package/src/voice/transcription/synchronizer.test.ts +206 -0
  951. package/src/voice/transcription/synchronizer.ts +204 -19
  952. package/src/voice/turn_config/endpointing.ts +33 -0
  953. package/src/voice/turn_config/interruption.ts +56 -0
  954. package/src/voice/turn_config/turn_handling.ts +45 -0
  955. package/src/voice/turn_config/utils.test.ts +148 -0
  956. package/src/voice/turn_config/utils.ts +167 -0
  957. package/src/voice/utils.ts +29 -0
  958. package/src/worker.ts +92 -78
  959. package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
@@ -1,20 +1,31 @@
1
1
  import { Mutex } from "@livekit/mutex";
2
+ import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
2
3
  import { Heap } from "heap-js";
3
4
  import { AsyncLocalStorage } from "node:async_hooks";
4
- import { ReadableStream } from "node:stream/web";
5
+ import { ReadableStream, TransformStream } from "node:stream/web";
6
+ import { AdaptiveInterruptionDetector } from "../inference/interruption/interruption_detector.js";
5
7
  import { ChatMessage } from "../llm/chat_context.js";
6
8
  import {
7
9
  LLM,
8
- RealtimeModel
10
+ RealtimeModel,
11
+ ToolFlag
9
12
  } from "../llm/index.js";
13
+ import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
10
14
  import { log } from "../log.js";
11
- import { DeferredReadableStream } from "../stream/deferred_stream.js";
15
+ import { MultiInputStream } from "../stream/multi_input_stream.js";
12
16
  import { STT } from "../stt/stt.js";
17
+ import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
13
18
  import { splitWords } from "../tokenize/basic/word.js";
14
19
  import { TTS } from "../tts/tts.js";
15
- import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
20
+ import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from "../utils.js";
16
21
  import { VAD } from "../vad.js";
17
- import { StopResponse, asyncLocalStorage } from "./agent.js";
22
+ import {
23
+ StopResponse,
24
+ _getActivityTaskInfo,
25
+ _setActivityTaskInfo,
26
+ functionCallStorage,
27
+ speechHandleStorage
28
+ } from "./agent.js";
18
29
  import {} from "./agent_session.js";
19
30
  import {
20
31
  AudioRecognition
@@ -37,28 +48,66 @@ import {
37
48
  updateInstructions
38
49
  } from "./generation.js";
39
50
  import { SpeechHandle } from "./speech_handle.js";
40
- const speechHandleStorage = new AsyncLocalStorage();
51
+ import { setParticipantSpanAttributes } from "./utils.js";
52
+ const agentActivityStorage = new AsyncLocalStorage();
53
+ const onEnterStorage = new AsyncLocalStorage();
41
54
  class AgentActivity {
55
+ agent;
56
+ agentSession;
42
57
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
43
58
  started = false;
44
59
  audioRecognition;
45
60
  realtimeSession;
61
+ realtimeSpans;
62
+ // Maps response_id to OTEL span for metrics recording
46
63
  turnDetectionMode;
47
64
  logger = log();
48
- _draining = false;
65
+ _schedulingPaused = true;
66
+ _drainBlockedTasks = [];
49
67
  _currentSpeech;
50
68
  speechQueue;
51
69
  // [priority, timestamp, speechHandle]
52
70
  q_updated;
53
71
  speechTasks = /* @__PURE__ */ new Set();
54
72
  lock = new Mutex();
55
- audioStream = new DeferredReadableStream();
73
+ audioStream = new MultiInputStream();
74
+ audioStreamId;
56
75
  // default to null as None, which maps to the default provider tool choice value
57
76
  toolChoice = null;
58
- agent;
59
- agentSession;
77
+ _preemptiveGeneration;
78
+ interruptionDetector;
79
+ isInterruptionDetectionEnabled;
80
+ isInterruptionByAudioActivityEnabled;
81
+ isDefaultInterruptionByAudioActivityEnabled;
82
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
83
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
84
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
85
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
86
+ onModelError = (ev) => this.onError(ev);
87
+ onInterruptionOverlappingSpeech = (ev) => {
88
+ this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
89
+ };
90
+ onInterruptionMetricsCollected = (ev) => {
91
+ this.agentSession._usageCollector.collect(ev);
92
+ this.agentSession.emit(
93
+ AgentSessionEventTypes.MetricsCollected,
94
+ createMetricsCollectedEvent({ metrics: ev })
95
+ );
96
+ };
97
+ onInterruptionError = (ev) => {
98
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
99
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
100
+ if (!ev.recoverable) {
101
+ this.agentSession._onError(ev);
102
+ this.fallbackToVadInterruption();
103
+ return;
104
+ }
105
+ this.agentSession._onError(ev);
106
+ };
60
107
  /** @internal */
61
108
  _mainTask;
109
+ _onEnterTask;
110
+ _onExitTask;
62
111
  _userTurnCompletedTask;
63
112
  constructor(agent, agentSession) {
64
113
  this.agent = agent;
@@ -70,7 +119,7 @@ class AgentActivity {
70
119
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
71
120
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
72
121
  this.logger.warn(
73
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
122
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
74
123
  );
75
124
  this.turnDetectionMode = void 0;
76
125
  }
@@ -113,89 +162,136 @@ class AgentActivity {
113
162
  );
114
163
  this.turnDetectionMode = void 0;
115
164
  }
116
- if (!this.vad && this.stt && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
165
+ if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
117
166
  this.logger.warn(
118
- "VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
167
+ "VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
119
168
  );
120
169
  }
170
+ this.interruptionDetector = this.resolveInterruptionDetector();
171
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
172
+ this.isInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
173
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
121
174
  }
122
175
  async start() {
123
176
  const unlock = await this.lock.lock();
124
177
  try {
125
- this.agent._agentActivity = this;
126
- if (this.llm instanceof RealtimeModel) {
127
- this.realtimeSession = this.llm.session();
128
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
129
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
130
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
131
- this.realtimeSession.on(
132
- "input_audio_transcription_completed",
133
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
178
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
179
+ } finally {
180
+ unlock();
181
+ }
182
+ }
183
+ async resume() {
184
+ const unlock = await this.lock.lock();
185
+ try {
186
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
187
+ } finally {
188
+ unlock();
189
+ }
190
+ }
191
+ async _startSession(options) {
192
+ var _a, _b, _c, _d, _e;
193
+ const { spanName, runOnEnter } = options;
194
+ const startSpan = tracer.startSpan({
195
+ name: spanName,
196
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
197
+ context: ROOT_CONTEXT
198
+ });
199
+ this.agent._agentActivity = this;
200
+ if (this.llm instanceof RealtimeModel) {
201
+ this.realtimeSession = this.llm.session();
202
+ this.realtimeSpans = /* @__PURE__ */ new Map();
203
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
204
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
205
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
206
+ this.realtimeSession.on(
207
+ "input_audio_transcription_completed",
208
+ this.onRealtimeInputAudioTranscriptionCompleted
209
+ );
210
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
211
+ this.realtimeSession.on("error", this.onModelError);
212
+ removeInstructions(this.agent._chatCtx);
213
+ try {
214
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
215
+ } catch (error) {
216
+ this.logger.error(error, "failed to update the instructions");
217
+ }
218
+ try {
219
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
220
+ } catch (error) {
221
+ this.logger.error(error, "failed to update the chat context");
222
+ }
223
+ try {
224
+ await this.realtimeSession.updateTools(this.tools);
225
+ } catch (error) {
226
+ this.logger.error(error, "failed to update the tools");
227
+ }
228
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
229
+ this.logger.error(
230
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
134
231
  );
135
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
136
- this.realtimeSession.on("error", (ev) => this.onError(ev));
137
- removeInstructions(this.agent._chatCtx);
138
- try {
139
- await this.realtimeSession.updateInstructions(this.agent.instructions);
140
- } catch (error) {
141
- this.logger.error(error, "failed to update the instructions");
142
- }
143
- try {
144
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
145
- } catch (error) {
146
- this.logger.error(error, "failed to update the chat context");
147
- }
148
- try {
149
- await this.realtimeSession.updateTools(this.tools);
150
- } catch (error) {
151
- this.logger.error(error, "failed to update the tools");
152
- }
153
- } else if (this.llm instanceof LLM) {
154
- try {
155
- updateInstructions({
156
- chatCtx: this.agent._chatCtx,
157
- instructions: this.agent.instructions,
158
- addIfMissing: true
159
- });
160
- } catch (error) {
161
- this.logger.error("failed to update the instructions", error);
162
- }
163
232
  }
164
- if (this.llm instanceof LLM) {
165
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
166
- this.llm.on("error", (ev) => this.onError(ev));
167
- }
168
- if (this.stt instanceof STT) {
169
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
170
- this.stt.on("error", (ev) => this.onError(ev));
171
- }
172
- if (this.tts instanceof TTS) {
173
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
174
- this.tts.on("error", (ev) => this.onError(ev));
175
- }
176
- if (this.vad instanceof VAD) {
177
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
178
- }
179
- this.audioRecognition = new AudioRecognition({
180
- recognitionHooks: this,
181
- // Disable stt node if stt is not provided
182
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
183
- vad: this.vad,
184
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
185
- turnDetectionMode: this.turnDetectionMode,
186
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
187
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay
188
- });
189
- this.audioRecognition.start();
190
- this.started = true;
191
- this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
192
- this.createSpeechTask({
193
- task: Task.from(() => this.agent.onEnter()),
233
+ } else if (this.llm instanceof LLM) {
234
+ try {
235
+ updateInstructions({
236
+ chatCtx: this.agent._chatCtx,
237
+ instructions: this.agent.instructions,
238
+ addIfMissing: true
239
+ });
240
+ } catch (error) {
241
+ this.logger.error("failed to update the instructions", error);
242
+ }
243
+ }
244
+ if (this.llm instanceof LLM) {
245
+ this.llm.on("metrics_collected", this.onMetricsCollected);
246
+ this.llm.on("error", this.onModelError);
247
+ }
248
+ if (this.stt instanceof STT) {
249
+ this.stt.on("metrics_collected", this.onMetricsCollected);
250
+ this.stt.on("error", this.onModelError);
251
+ }
252
+ if (this.tts instanceof TTS) {
253
+ this.tts.on("metrics_collected", this.onMetricsCollected);
254
+ this.tts.on("error", this.onModelError);
255
+ }
256
+ if (this.vad instanceof VAD) {
257
+ this.vad.on("metrics_collected", this.onMetricsCollected);
258
+ }
259
+ this.audioRecognition = new AudioRecognition({
260
+ recognitionHooks: this,
261
+ // Disable stt node if stt is not provided
262
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
263
+ vad: this.vad,
264
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
265
+ turnDetectionMode: this.turnDetectionMode,
266
+ interruptionDetection: this.interruptionDetector,
267
+ minEndpointingDelay: ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.endpointing) == null ? void 0 : _b.minDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
268
+ maxEndpointingDelay: ((_d = (_c = this.agent.turnHandling) == null ? void 0 : _c.endpointing) == null ? void 0 : _d.maxDelay) ?? this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
269
+ rootSpanContext: this.agentSession.rootSpanContext,
270
+ sttModel: (_e = this.stt) == null ? void 0 : _e.label,
271
+ sttProvider: this.getSttProvider(),
272
+ getLinkedParticipant: () => {
273
+ var _a2;
274
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
275
+ }
276
+ });
277
+ this.audioRecognition.start();
278
+ this.started = true;
279
+ this._resumeSchedulingTask();
280
+ if (runOnEnter) {
281
+ this._onEnterTask = this.createSpeechTask({
282
+ taskFn: () => onEnterStorage.run(
283
+ { session: this.agentSession, agent: this.agent },
284
+ () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
285
+ name: "on_enter",
286
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
287
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
288
+ })
289
+ ),
290
+ inlineTask: true,
194
291
  name: "AgentActivity_onEnter"
195
292
  });
196
- } finally {
197
- unlock();
198
293
  }
294
+ startSpan.end();
199
295
  }
200
296
  get currentSpeech() {
201
297
  return this._currentSpeech;
@@ -206,6 +302,15 @@ class AgentActivity {
206
302
  get stt() {
207
303
  return this.agent.stt || this.agentSession.stt;
208
304
  }
305
+ getSttProvider() {
306
+ var _a;
307
+ const label = (_a = this.stt) == null ? void 0 : _a.label;
308
+ if (!label) {
309
+ return void 0;
310
+ }
311
+ const [provider] = label.split("-", 1);
312
+ return provider || label;
313
+ }
209
314
  get llm() {
210
315
  return this.agent.llm || this.agentSession.llm;
211
316
  }
@@ -215,21 +320,46 @@ class AgentActivity {
215
320
  get tools() {
216
321
  return this.agent.toolCtx;
217
322
  }
218
- get draining() {
219
- return this._draining;
323
+ get schedulingPaused() {
324
+ return this._schedulingPaused;
220
325
  }
221
326
  get realtimeLLMSession() {
222
327
  return this.realtimeSession;
223
328
  }
224
329
  get allowInterruptions() {
225
- return this.agentSession.options.allowInterruptions;
330
+ var _a, _b;
331
+ return ((_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.enabled) ?? this.agentSession.sessionOptions.turnHandling.interruption.enabled;
332
+ }
333
+ get useTtsAlignedTranscript() {
334
+ return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
226
335
  }
227
336
  get turnDetection() {
228
- return this.agentSession.turnDetection;
337
+ var _a;
338
+ return ((_a = this.agent.turnHandling) == null ? void 0 : _a.turnDetection) ?? this.agentSession.turnDetection;
339
+ }
340
+ get turnHandling() {
341
+ return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
229
342
  }
343
+ // get minEndpointingDelay(): number {
344
+ // return (
345
+ // this.agent.turnHandling?.endpointing?.minDelay ??
346
+ // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
347
+ // );
348
+ // }
349
+ // get maxEndpointingDelay(): number {
350
+ // return (
351
+ // this.agent.turnHandling?.endpointing?.maxDelay ??
352
+ // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
353
+ // );
354
+ // }
230
355
  get toolCtx() {
231
356
  return this.agent.toolCtx;
232
357
  }
358
+ /** @internal */
359
+ get inputStartedAt() {
360
+ var _a;
361
+ return (_a = this.audioRecognition) == null ? void 0 : _a.inputStartedAt;
362
+ }
233
363
  async updateChatCtx(chatCtx) {
234
364
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
235
365
  this.agent._chatCtx = chatCtx;
@@ -244,36 +374,79 @@ class AgentActivity {
244
374
  });
245
375
  }
246
376
  }
247
- updateOptions({ toolChoice }) {
377
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
378
+ async updateTools(tools) {
379
+ this.agent._tools = { ...tools };
380
+ if (this.realtimeSession) {
381
+ await this.realtimeSession.updateTools(tools);
382
+ }
383
+ if (this.llm instanceof LLM) {
384
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
385
+ }
386
+ }
387
+ updateOptions({
388
+ toolChoice,
389
+ turnDetection
390
+ }) {
248
391
  if (toolChoice !== void 0) {
249
392
  this.toolChoice = toolChoice;
250
393
  }
251
394
  if (this.realtimeSession) {
252
395
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
253
396
  }
397
+ if (turnDetection !== void 0) {
398
+ this.turnDetectionMode = turnDetection;
399
+ this.isDefaultInterruptionByAudioActivityEnabled = this.turnDetectionMode !== "manual" && this.turnDetectionMode !== "realtime_llm";
400
+ if (this.agentSession.agentState !== "speaking") {
401
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
402
+ }
403
+ }
404
+ if (this.audioRecognition) {
405
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
406
+ }
254
407
  }
255
408
  attachAudioInput(audioStream) {
256
- if (this.audioStream.isSourceSet) {
257
- this.logger.debug("detaching existing audio input in agent activity");
258
- this.audioStream.detachSource();
259
- }
260
- this.audioStream.setSource(audioStream);
261
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
262
- if (this.realtimeSession) {
409
+ void this.audioStream.close();
410
+ this.audioStream = new MultiInputStream();
411
+ const aecWarmupAudioFilter = new TransformStream({
412
+ transform: (frame, controller) => {
413
+ const shouldDiscardForAecWarmup = this.agentSession.agentState === "speaking" && this.agentSession._aecWarmupRemaining > 0;
414
+ if (!shouldDiscardForAecWarmup) {
415
+ controller.enqueue(frame);
416
+ }
417
+ }
418
+ });
419
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
420
+ if (this.realtimeSession && this.audioRecognition) {
421
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.pipeThrough(aecWarmupAudioFilter).tee();
263
422
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
264
- }
265
- if (this.audioRecognition) {
266
423
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
424
+ } else if (this.realtimeSession) {
425
+ this.realtimeSession.setInputAudioStream(
426
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
427
+ );
428
+ } else if (this.audioRecognition) {
429
+ this.audioRecognition.setInputAudioStream(
430
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter)
431
+ );
267
432
  }
268
433
  }
269
434
  detachAudioInput() {
270
- this.audioStream.detachSource();
435
+ if (this.audioStreamId === void 0) {
436
+ return;
437
+ }
438
+ void this.audioStream.close();
439
+ this.audioStream = new MultiInputStream();
440
+ this.audioStreamId = void 0;
271
441
  }
272
- commitUserTurn() {
442
+ commitUserTurn(options = {}) {
443
+ const { audioDetached = false, throwIfNotReady = true } = options;
273
444
  if (!this.audioRecognition) {
274
- throw new Error("AudioRecognition is not initialized");
445
+ if (throwIfNotReady) {
446
+ throw new Error("AudioRecognition is not initialized");
447
+ }
448
+ return;
275
449
  }
276
- const audioDetached = false;
277
450
  this.audioRecognition.commitUserTurn(audioDetached);
278
451
  }
279
452
  clearUserTurn() {
@@ -309,13 +482,11 @@ class AgentActivity {
309
482
  })
310
483
  );
311
484
  const task = this.createSpeechTask({
312
- task: Task.from(
313
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
314
- ),
485
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
315
486
  ownedSpeechHandle: handle,
316
487
  name: "AgentActivity.say_tts"
317
488
  });
318
- task.finally(() => this.onPipelineReplyDone());
489
+ task.result.finally(() => this.onPipelineReplyDone());
319
490
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
320
491
  return handle;
321
492
  }
@@ -325,6 +496,14 @@ class AgentActivity {
325
496
  if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
326
497
  ev.speechId = speechHandle.id;
327
498
  }
499
+ if (ev.type === "realtime_model_metrics" && this.realtimeSpans) {
500
+ const span = this.realtimeSpans.get(ev.requestId);
501
+ if (span) {
502
+ recordRealtimeMetrics(span, ev);
503
+ this.realtimeSpans.delete(ev.requestId);
504
+ }
505
+ }
506
+ this.agentSession._usageCollector.collect(ev);
328
507
  this.agentSession.emit(
329
508
  AgentSessionEventTypes.MetricsCollected,
330
509
  createMetricsCollectedEvent({ metrics: ev })
@@ -351,6 +530,13 @@ class AgentActivity {
351
530
  this.logger.info("onInputSpeechStarted");
352
531
  if (!this.vad) {
353
532
  this.agentSession._updateUserState("speaking");
533
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
534
+ this.audioRecognition.onStartOfOverlapSpeech(
535
+ 0,
536
+ Date.now(),
537
+ this.agentSession._userSpeakingSpan
538
+ );
539
+ }
354
540
  }
355
541
  try {
356
542
  this.interrupt();
@@ -364,6 +550,9 @@ class AgentActivity {
364
550
  onInputSpeechStopped(ev) {
365
551
  this.logger.info(ev, "onInputSpeechStopped");
366
552
  if (!this.vad) {
553
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
554
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
555
+ }
367
556
  this.agentSession._updateUserState("listening");
368
557
  }
369
558
  if (ev.userTranscriptionEnabled) {
@@ -398,8 +587,8 @@ class AgentActivity {
398
587
  if (ev.userInitiated) {
399
588
  return;
400
589
  }
401
- if (this.draining) {
402
- this.logger.warn("skipping new realtime generation, the agent is draining");
590
+ if (this.schedulingPaused) {
591
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
403
592
  return;
404
593
  }
405
594
  const handle = SpeechHandle.create({
@@ -415,45 +604,91 @@ class AgentActivity {
415
604
  );
416
605
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
417
606
  this.createSpeechTask({
418
- task: Task.from(
419
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
420
- ),
607
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
421
608
  ownedSpeechHandle: handle,
422
609
  name: "AgentActivity.realtimeGeneration"
423
610
  });
424
611
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
425
612
  }
426
613
  // recognition hooks
427
- onStartOfSpeech(_ev) {
428
- this.agentSession._updateUserState("speaking");
614
+ onStartOfSpeech(ev) {
615
+ let speechStartTime = Date.now();
616
+ if (ev) {
617
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
618
+ }
619
+ this.agentSession._updateUserState("speaking", {
620
+ lastSpeakingTime: speechStartTime,
621
+ otelContext: otelContext.active()
622
+ });
623
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
624
+ this.audioRecognition.onStartOfOverlapSpeech(
625
+ ev.speechDuration,
626
+ speechStartTime,
627
+ this.agentSession._userSpeakingSpan
628
+ );
629
+ }
429
630
  }
430
- onEndOfSpeech(_ev) {
431
- this.agentSession._updateUserState("listening");
631
+ onEndOfSpeech(ev) {
632
+ let speechEndTime = Date.now();
633
+ if (ev) {
634
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
635
+ }
636
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
637
+ this.audioRecognition.onEndOfOverlapSpeech(
638
+ speechEndTime,
639
+ this.agentSession._userSpeakingSpan
640
+ );
641
+ }
642
+ this.agentSession._updateUserState("listening", {
643
+ lastSpeakingTime: speechEndTime,
644
+ otelContext: otelContext.active()
645
+ });
432
646
  }
433
647
  onVADInferenceDone(ev) {
434
- var _a, _b;
648
+ var _a;
435
649
  if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
436
650
  return;
437
651
  }
438
- if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
652
+ if (ev.speechDuration >= ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minDuration)) {
653
+ this.interruptByAudioActivity();
654
+ }
655
+ }
656
+ interruptByAudioActivity() {
657
+ var _a, _b, _c, _d;
658
+ if (!this.isInterruptionByAudioActivityEnabled) {
659
+ return;
660
+ }
661
+ if (this.agentSession._aecWarmupRemaining > 0) {
439
662
  return;
440
663
  }
441
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
664
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
442
665
  return;
443
666
  }
444
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
667
+ if (this.stt && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0 && this.audioRecognition) {
445
668
  const text = this.audioRecognition.currentTranscript;
446
- if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
669
+ const normalizedText = text ?? "";
670
+ const wordCount = splitWords(normalizedText, true).length;
671
+ if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
447
672
  return;
448
673
  }
449
674
  }
450
- (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
675
+ (_c = this.realtimeSession) == null ? void 0 : _c.startUserActivity();
451
676
  if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
452
- this.logger.info({ "speech id": this._currentSpeech.id }, "speech interrupted by VAD");
453
- (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
677
+ this.logger.info(
678
+ { "speech id": this._currentSpeech.id },
679
+ "speech interrupted by audio activity"
680
+ );
681
+ (_d = this.realtimeSession) == null ? void 0 : _d.interrupt();
454
682
  this._currentSpeech.interrupt();
455
683
  }
456
684
  }
685
+ onInterruption(ev) {
686
+ this.restoreInterruptionByAudioActivity();
687
+ this.interruptByAudioActivity();
688
+ if (this.audioRecognition) {
689
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
690
+ }
691
+ }
457
692
  onInterimTranscript(ev) {
458
693
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
459
694
  return;
@@ -462,10 +697,14 @@ class AgentActivity {
462
697
  AgentSessionEventTypes.UserInputTranscribed,
463
698
  createUserInputTranscribedEvent({
464
699
  transcript: ev.alternatives[0].text,
465
- isFinal: false
700
+ isFinal: false,
701
+ language: ev.alternatives[0].language
466
702
  // TODO(AJS-106): add multi participant support
467
703
  })
468
704
  );
705
+ if (ev.alternatives[0].text) {
706
+ this.interruptByAudioActivity();
707
+ }
469
708
  }
470
709
  onFinalTranscript(ev) {
471
710
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
@@ -475,13 +714,70 @@ class AgentActivity {
475
714
  AgentSessionEventTypes.UserInputTranscribed,
476
715
  createUserInputTranscribedEvent({
477
716
  transcript: ev.alternatives[0].text,
478
- isFinal: true
717
+ isFinal: true,
718
+ language: ev.alternatives[0].language
479
719
  // TODO(AJS-106): add multi participant support
480
720
  })
481
721
  );
722
+ if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
723
+ this.interruptByAudioActivity();
724
+ }
725
+ }
726
+ onPreemptiveGeneration(info) {
727
+ if (!this.agentSession.sessionOptions.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
728
+ return;
729
+ }
730
+ this.cancelPreemptiveGeneration();
731
+ this.logger.info(
732
+ {
733
+ newTranscript: info.newTranscript,
734
+ transcriptConfidence: info.transcriptConfidence
735
+ },
736
+ "starting preemptive generation"
737
+ );
738
+ const userMessage = ChatMessage.create({
739
+ role: "user",
740
+ content: info.newTranscript,
741
+ transcriptConfidence: info.transcriptConfidence
742
+ });
743
+ const chatCtx = this.agent.chatCtx.copy();
744
+ const speechHandle = this.generateReply({
745
+ userMessage,
746
+ chatCtx,
747
+ scheduleSpeech: false
748
+ });
749
+ this._preemptiveGeneration = {
750
+ speechHandle,
751
+ userMessage,
752
+ info,
753
+ chatCtx: chatCtx.copy(),
754
+ tools: { ...this.tools },
755
+ toolChoice: this.toolChoice,
756
+ createdAt: Date.now()
757
+ };
758
+ }
759
+ cancelPreemptiveGeneration() {
760
+ if (this._preemptiveGeneration !== void 0) {
761
+ this._preemptiveGeneration.speechHandle._cancel();
762
+ this._preemptiveGeneration = void 0;
763
+ }
482
764
  }
483
765
  createSpeechTask(options) {
484
- const { task, ownedSpeechHandle } = options;
766
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
767
+ const wrappedFn = (ctrl) => {
768
+ return agentActivityStorage.run(this, () => {
769
+ const currentTask = Task.current();
770
+ if (currentTask) {
771
+ _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
772
+ }
773
+ if (ownedSpeechHandle) {
774
+ return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
775
+ }
776
+ return taskFn(ctrl);
777
+ });
778
+ };
779
+ const task = Task.from(wrappedFn, controller, name);
780
+ _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
485
781
  this.speechTasks.add(task);
486
782
  task.addDoneCallback(() => {
487
783
  this.speechTasks.delete(task);
@@ -497,20 +793,35 @@ class AgentActivity {
497
793
  task.addDoneCallback(() => {
498
794
  this.wakeupMainTask();
499
795
  });
500
- return task.result;
796
+ return task;
501
797
  }
502
798
  async onEndOfTurn(info) {
503
- if (this.draining) {
504
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
799
+ var _a, _b;
800
+ if (this.schedulingPaused) {
801
+ this.cancelPreemptiveGeneration();
802
+ this.logger.warn(
803
+ { user_input: info.newTranscript },
804
+ "skipping user input, speech scheduling is paused"
805
+ );
505
806
  return true;
506
807
  }
507
- if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
508
- this.logger.info("skipping user input, new_transcript is too short");
509
- return false;
808
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && ((_a = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _a.minWords) > 0) {
809
+ const wordCount = splitWords(info.newTranscript, true).length;
810
+ if (wordCount < ((_b = this.agentSession.sessionOptions.turnHandling.interruption) == null ? void 0 : _b.minWords)) {
811
+ this.cancelPreemptiveGeneration();
812
+ this.logger.info(
813
+ {
814
+ wordCount,
815
+ minInterruptionWords: this.agentSession.sessionOptions.turnHandling.interruption.minWords
816
+ },
817
+ "skipping user input, word count below minimum interruption threshold"
818
+ );
819
+ return false;
820
+ }
510
821
  }
511
822
  const oldTask = this._userTurnCompletedTask;
512
823
  this._userTurnCompletedTask = this.createSpeechTask({
513
- task: Task.from(() => this.userTurnCompleted(info, oldTask)),
824
+ taskFn: () => this.userTurnCompleted(info, oldTask),
514
825
  name: "AgentActivity.userTurnCompleted"
515
826
  });
516
827
  return true;
@@ -535,19 +846,49 @@ class AgentActivity {
535
846
  throw new Error("Speech queue is empty");
536
847
  }
537
848
  const speechHandle = heapItem[2];
849
+ if (speechHandle.interrupted || speechHandle.done()) {
850
+ continue;
851
+ }
538
852
  this._currentSpeech = speechHandle;
539
853
  speechHandle._authorizeGeneration();
540
- await speechHandle._waitForGeneration();
854
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
541
855
  this._currentSpeech = void 0;
542
856
  }
543
- if (this.draining && this.speechTasks.size === 0) {
544
- this.logger.info("mainTask: draining and no more speech tasks");
857
+ const toWait = this.getDrainPendingSpeechTasks();
858
+ if (this._schedulingPaused && toWait.length === 0) {
859
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
545
860
  break;
546
861
  }
547
862
  this.q_updated = new Future();
548
863
  }
549
864
  this.logger.info("AgentActivity mainTask: exiting");
550
865
  }
866
+ getDrainPendingSpeechTasks() {
867
+ const blockedHandles = [];
868
+ for (const task of this._drainBlockedTasks) {
869
+ const info = _getActivityTaskInfo(task);
870
+ if (!info) {
871
+ this.logger.error("blocked task without activity info; skipping.");
872
+ continue;
873
+ }
874
+ if (!info.speechHandle) {
875
+ continue;
876
+ }
877
+ blockedHandles.push(info.speechHandle);
878
+ }
879
+ const toWait = [];
880
+ for (const task of this.speechTasks) {
881
+ if (this._drainBlockedTasks.includes(task)) {
882
+ continue;
883
+ }
884
+ const info = _getActivityTaskInfo(task);
885
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
886
+ continue;
887
+ }
888
+ toWait.push(task);
889
+ }
890
+ return toWait;
891
+ }
551
892
  wakeupMainTask() {
552
893
  this.q_updated.resolve();
553
894
  }
@@ -558,7 +899,8 @@ class AgentActivity {
558
899
  chatCtx,
559
900
  instructions: defaultInstructions,
560
901
  toolChoice: defaultToolChoice,
561
- allowInterruptions: defaultAllowInterruptions
902
+ allowInterruptions: defaultAllowInterruptions,
903
+ scheduleSpeech = true
562
904
  } = options;
563
905
  let instructions = defaultInstructions;
564
906
  let toolChoice = defaultToolChoice;
@@ -572,7 +914,7 @@ class AgentActivity {
572
914
  if (this.llm === void 0) {
573
915
  throw new Error("trying to generate reply without an LLM model");
574
916
  }
575
- const functionCall = (_a = asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
917
+ const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
576
918
  if (toolChoice === void 0 && functionCall !== void 0) {
577
919
  toolChoice = "none";
578
920
  }
@@ -590,19 +932,17 @@ class AgentActivity {
590
932
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
591
933
  if (this.llm instanceof RealtimeModel) {
592
934
  this.createSpeechTask({
593
- task: Task.from(
594
- (abortController) => this.realtimeReplyTask({
595
- speechHandle: handle,
596
- // TODO(brian): support llm.ChatMessage for the realtime model
597
- userInput: userMessage == null ? void 0 : userMessage.textContent,
598
- instructions,
599
- modelSettings: {
600
- // isGiven(toolChoice) = toolChoice !== undefined
601
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
602
- },
603
- abortController
604
- })
605
- ),
935
+ taskFn: (abortController) => this.realtimeReplyTask({
936
+ speechHandle: handle,
937
+ // TODO(brian): support llm.ChatMessage for the realtime model
938
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
939
+ instructions,
940
+ modelSettings: {
941
+ // isGiven(toolChoice) = toolChoice !== undefined
942
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
943
+ },
944
+ abortController
945
+ }),
606
946
  ownedSpeechHandle: handle,
607
947
  name: "AgentActivity.realtimeReply"
608
948
  });
@@ -611,39 +951,56 @@ class AgentActivity {
611
951
  instructions = `${this.agent.instructions}
612
952
  ${instructions}`;
613
953
  }
954
+ const onEnterData = onEnterStorage.getStore();
955
+ const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
956
+ const tools = shouldFilterTools ? Object.fromEntries(
957
+ Object.entries(this.agent.toolCtx).filter(
958
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER)
959
+ )
960
+ ) : this.agent.toolCtx;
614
961
  const task = this.createSpeechTask({
615
- task: Task.from(
616
- (abortController) => this.pipelineReplyTask(
617
- handle,
618
- chatCtx ?? this.agent.chatCtx,
619
- this.agent.toolCtx,
620
- {
621
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
622
- },
623
- abortController,
624
- instructions ? `${this.agent.instructions}
625
- ${instructions}` : instructions,
626
- userMessage
627
- )
962
+ taskFn: (abortController) => this.pipelineReplyTask(
963
+ handle,
964
+ chatCtx ?? this.agent.chatCtx,
965
+ tools,
966
+ {
967
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
968
+ },
969
+ abortController,
970
+ instructions,
971
+ userMessage
628
972
  ),
629
973
  ownedSpeechHandle: handle,
630
974
  name: "AgentActivity.pipelineReply"
631
975
  });
632
- task.finally(() => this.onPipelineReplyDone());
976
+ task.result.finally(() => this.onPipelineReplyDone());
977
+ }
978
+ if (scheduleSpeech) {
979
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
633
980
  }
634
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
635
981
  return handle;
636
982
  }
637
- interrupt() {
983
+ interrupt(options = {}) {
638
984
  var _a;
985
+ const { force = false } = options;
986
+ this.cancelPreemptiveGeneration();
639
987
  const future = new Future();
640
988
  const currentSpeech = this._currentSpeech;
641
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
989
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
642
990
  for (const [_, __, speech] of this.speechQueue) {
643
- speech.interrupt();
991
+ speech.interrupt(force);
644
992
  }
645
993
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
646
- if (currentSpeech === void 0) {
994
+ if (force) {
995
+ for (const task of this.speechTasks) {
996
+ task.cancel();
997
+ }
998
+ if (currentSpeech && !currentSpeech.done()) {
999
+ currentSpeech._markDone();
1000
+ }
1001
+ this.speechQueue.clear();
1002
+ future.resolve();
1003
+ } else if (currentSpeech === void 0) {
647
1004
  future.resolve();
648
1005
  } else {
649
1006
  currentSpeech.addDoneCallback(() => {
@@ -661,7 +1018,7 @@ ${instructions}` : instructions,
661
1018
  async userTurnCompleted(info, oldTask) {
662
1019
  var _a, _b;
663
1020
  if (oldTask) {
664
- await oldTask;
1021
+ await oldTask.result;
665
1022
  }
666
1023
  if (this.llm instanceof RealtimeModel) {
667
1024
  if (this.llm.capabilities.turnDetection) {
@@ -686,7 +1043,8 @@ ${instructions}` : instructions,
686
1043
  }
687
1044
  let userMessage = ChatMessage.create({
688
1045
  role: "user",
689
- content: info.newTranscript
1046
+ content: info.newTranscript,
1047
+ transcriptConfidence: info.transcriptConfidence
690
1048
  });
691
1049
  const chatCtx = this.agent.chatCtx.copy();
692
1050
  const startTime = Date.now();
@@ -704,13 +1062,57 @@ ${instructions}` : instructions,
704
1062
  } else if (this.llm === void 0) {
705
1063
  return;
706
1064
  }
707
- const speechHandle = this.generateReply({ userMessage, chatCtx });
1065
+ const userMetricsReport = {};
1066
+ if (info.startedSpeakingAt !== void 0) {
1067
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1e3;
1068
+ }
1069
+ if (info.stoppedSpeakingAt !== void 0) {
1070
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1e3;
1071
+ }
1072
+ if (info.transcriptionDelay !== void 0) {
1073
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1e3;
1074
+ }
1075
+ if (info.endOfUtteranceDelay !== void 0) {
1076
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1e3;
1077
+ }
1078
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1e3;
1079
+ if (userMessage) {
1080
+ userMessage.metrics = userMetricsReport;
1081
+ }
1082
+ let speechHandle;
1083
+ if (this._preemptiveGeneration !== void 0) {
1084
+ const preemptive = this._preemptiveGeneration;
1085
+ if (preemptive.info.newTranscript === (userMessage == null ? void 0 : userMessage.textContent) && preemptive.chatCtx.isEquivalent(chatCtx) && isSameToolContext(preemptive.tools, this.tools) && isSameToolChoice(preemptive.toolChoice, this.toolChoice)) {
1086
+ speechHandle = preemptive.speechHandle;
1087
+ if (preemptive.userMessage && userMessage) {
1088
+ preemptive.userMessage.metrics = userMetricsReport;
1089
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1090
+ }
1091
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1092
+ this.logger.debug(
1093
+ {
1094
+ preemptiveLeadTime: Date.now() - preemptive.createdAt
1095
+ },
1096
+ "using preemptive generation"
1097
+ );
1098
+ } else {
1099
+ this.logger.warn(
1100
+ "preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`"
1101
+ );
1102
+ preemptive.speechHandle._cancel();
1103
+ }
1104
+ this._preemptiveGeneration = void 0;
1105
+ }
1106
+ if (speechHandle === void 0) {
1107
+ speechHandle = this.generateReply({ userMessage, chatCtx });
1108
+ }
708
1109
  const eouMetrics = {
709
1110
  type: "eou_metrics",
710
1111
  timestamp: Date.now(),
711
- endOfUtteranceDelay: info.endOfUtteranceDelay,
712
- transcriptionDelay: info.transcriptionDelay,
713
- onUserTurnCompletedDelay: callbackDuration,
1112
+ endOfUtteranceDelayMs: info.endOfUtteranceDelay,
1113
+ transcriptionDelayMs: info.transcriptionDelay,
1114
+ onUserTurnCompletedDelayMs: callbackDuration,
1115
+ lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
714
1116
  speechId: speechHandle.id
715
1117
  };
716
1118
  this.agentSession.emit(
@@ -719,6 +1121,8 @@ ${instructions}` : instructions,
719
1121
  );
720
1122
  }
721
1123
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
1124
+ var _a, _b;
1125
+ speechHandle._agentTurnContext = otelContext.active();
722
1126
  speechHandleStorage.enterWith(speechHandle);
723
1127
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
724
1128
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
@@ -750,25 +1154,38 @@ ${instructions}` : instructions,
750
1154
  textOut = _textOut;
751
1155
  tasks.push(textForwardTask);
752
1156
  }
753
- const onFirstFrame = () => {
754
- this.agentSession._updateAgentState("speaking");
1157
+ let replyStartedSpeakingAt;
1158
+ let replyTtsGenData = null;
1159
+ const onFirstFrame = (startedSpeakingAt) => {
1160
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1161
+ this.agentSession._updateAgentState("speaking", {
1162
+ startTime: startedSpeakingAt,
1163
+ otelContext: speechHandle._agentTurnContext
1164
+ });
1165
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1166
+ this.audioRecognition.onStartOfAgentSpeech();
1167
+ this.isInterruptionByAudioActivityEnabled = false;
1168
+ }
755
1169
  };
756
1170
  if (!audioOutput) {
757
1171
  if (textOut) {
758
- textOut.firstTextFut.await.finally(onFirstFrame);
1172
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
759
1173
  }
760
1174
  } else {
761
1175
  let audioOut = null;
762
1176
  if (!audio) {
763
- const [ttsTask, ttsStream] = performTTSInference(
1177
+ const [ttsTask, ttsGenData] = performTTSInference(
764
1178
  (...args) => this.agent.ttsNode(...args),
765
1179
  audioSource,
766
1180
  modelSettings,
767
- replyAbortController
1181
+ replyAbortController,
1182
+ (_a = this.tts) == null ? void 0 : _a.model,
1183
+ (_b = this.tts) == null ? void 0 : _b.provider
768
1184
  );
769
1185
  tasks.push(ttsTask);
1186
+ replyTtsGenData = ttsGenData;
770
1187
  const [forwardTask, _audioOut] = performAudioForwarding(
771
- ttsStream,
1188
+ ttsGenData.audioStream,
772
1189
  audioOutput,
773
1190
  replyAbortController
774
1191
  );
@@ -783,7 +1200,7 @@ ${instructions}` : instructions,
783
1200
  tasks.push(forwardTask);
784
1201
  audioOut = _audioOut;
785
1202
  }
786
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1203
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
787
1204
  }
788
1205
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
789
1206
  if (audioOutput) {
@@ -798,28 +1215,63 @@ ${instructions}` : instructions,
798
1215
  }
799
1216
  }
800
1217
  if (addToChatCtx) {
1218
+ const replyStoppedSpeakingAt = Date.now();
1219
+ const replyAssistantMetrics = {};
1220
+ if ((replyTtsGenData == null ? void 0 : replyTtsGenData.ttfb) !== void 0) {
1221
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1222
+ }
1223
+ if (replyStartedSpeakingAt !== void 0) {
1224
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1e3;
1225
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1e3;
1226
+ }
801
1227
  const message = ChatMessage.create({
802
1228
  role: "assistant",
803
1229
  content: (textOut == null ? void 0 : textOut.text) || "",
804
- interrupted: speechHandle.interrupted
1230
+ interrupted: speechHandle.interrupted,
1231
+ metrics: replyAssistantMetrics
805
1232
  });
806
1233
  this.agent._chatCtx.insert(message);
807
1234
  this.agentSession._conversationItemAdded(message);
808
1235
  }
809
1236
  if (this.agentSession.agentState === "speaking") {
810
1237
  this.agentSession._updateAgentState("listening");
1238
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1239
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1240
+ }
1241
+ this.restoreInterruptionByAudioActivity();
811
1242
  }
812
1243
  }
813
- async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) {
814
- var _a, _b, _c;
1244
+ _pipelineReplyTaskImpl = async ({
1245
+ speechHandle,
1246
+ chatCtx,
1247
+ toolCtx,
1248
+ modelSettings,
1249
+ replyAbortController,
1250
+ instructions,
1251
+ newMessage,
1252
+ toolsMessages,
1253
+ span,
1254
+ _previousUserMetrics
1255
+ }) => {
1256
+ var _a, _b, _c, _d, _e, _f;
1257
+ speechHandle._agentTurnContext = otelContext.active();
1258
+ span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1259
+ if (instructions) {
1260
+ span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
1261
+ }
1262
+ if (newMessage) {
1263
+ span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
1264
+ }
1265
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1266
+ if (localParticipant) {
1267
+ setParticipantSpanAttributes(span, localParticipant);
1268
+ }
815
1269
  speechHandleStorage.enterWith(speechHandle);
816
1270
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
817
1271
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
818
1272
  chatCtx = chatCtx.copy();
819
1273
  if (newMessage) {
820
1274
  chatCtx.insert(newMessage);
821
- this.agent._chatCtx.insert(newMessage);
822
- this.agentSession._conversationItemAdded(newMessage);
823
1275
  }
824
1276
  if (instructions) {
825
1277
  try {
@@ -832,7 +1284,6 @@ ${instructions}` : instructions,
832
1284
  this.logger.error({ error: e }, "error occurred during updateInstructions");
833
1285
  }
834
1286
  }
835
- this.agentSession._updateAgentState("thinking");
836
1287
  const tasks = [];
837
1288
  const [llmTask, llmGenData] = performLLMInference(
838
1289
  // preserve `this` context in llmNode
@@ -840,22 +1291,36 @@ ${instructions}` : instructions,
840
1291
  chatCtx,
841
1292
  toolCtx,
842
1293
  modelSettings,
843
- replyAbortController
1294
+ replyAbortController,
1295
+ (_b = this.llm) == null ? void 0 : _b.model,
1296
+ (_c = this.llm) == null ? void 0 : _c.provider
844
1297
  );
845
1298
  tasks.push(llmTask);
846
- const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
847
1299
  let ttsTask = null;
848
- let ttsStream = null;
1300
+ let ttsGenData = null;
1301
+ let llmOutput;
849
1302
  if (audioOutput) {
850
- [ttsTask, ttsStream] = performTTSInference(
1303
+ const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
1304
+ llmOutput = textOutput;
1305
+ [ttsTask, ttsGenData] = performTTSInference(
851
1306
  (...args) => this.agent.ttsNode(...args),
852
1307
  ttsTextInput,
853
1308
  modelSettings,
854
- replyAbortController
1309
+ replyAbortController,
1310
+ (_d = this.tts) == null ? void 0 : _d.model,
1311
+ (_e = this.tts) == null ? void 0 : _e.provider
855
1312
  );
856
1313
  tasks.push(ttsTask);
1314
+ } else {
1315
+ llmOutput = llmGenData.textStream;
857
1316
  }
858
1317
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1318
+ let userMetrics = _previousUserMetrics;
1319
+ if (newMessage && speechHandle.scheduled) {
1320
+ this.agent._chatCtx.insert(newMessage);
1321
+ this.agentSession._conversationItemAdded(newMessage);
1322
+ userMetrics = newMessage.metrics;
1323
+ }
859
1324
  if (speechHandle.interrupted) {
860
1325
  replyAbortController.abort();
861
1326
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -865,7 +1330,20 @@ ${instructions}` : instructions,
865
1330
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
866
1331
  speechHandle._clearAuthorization();
867
1332
  const replyStartedAt = Date.now();
868
- const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1333
+ let transcriptionInput = llmOutput;
1334
+ if (this.useTtsAlignedTranscript && ((_f = this.tts) == null ? void 0 : _f.capabilities.alignedTranscript) && ttsGenData) {
1335
+ const timedTextsStream = await Promise.race([
1336
+ ttsGenData.timedTextsFut.await,
1337
+ (ttsTask == null ? void 0 : ttsTask.result.catch(
1338
+ () => this.logger.warn("TTS task failed before resolving timedTextsFut")
1339
+ )) ?? Promise.resolve()
1340
+ ]);
1341
+ if (timedTextsStream) {
1342
+ this.logger.debug("Using TTS aligned transcripts for transcription node input");
1343
+ transcriptionInput = timedTextsStream;
1344
+ }
1345
+ }
1346
+ const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
869
1347
  let textOut = null;
870
1348
  if (trNodeResult) {
871
1349
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -876,29 +1354,44 @@ ${instructions}` : instructions,
876
1354
  tasks.push(textForwardTask);
877
1355
  textOut = _textOut;
878
1356
  }
879
- const onFirstFrame = () => {
880
- this.agentSession._updateAgentState("speaking");
1357
+ let agentStartedSpeakingAt;
1358
+ const onFirstFrame = (startedSpeakingAt) => {
1359
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1360
+ this.agentSession._updateAgentState("speaking", {
1361
+ startTime: startedSpeakingAt,
1362
+ otelContext: speechHandle._agentTurnContext
1363
+ });
1364
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1365
+ this.audioRecognition.onStartOfAgentSpeech();
1366
+ this.isInterruptionByAudioActivityEnabled = false;
1367
+ }
881
1368
  };
882
1369
  let audioOut = null;
883
1370
  if (audioOutput) {
884
- if (ttsStream) {
1371
+ if (ttsGenData) {
885
1372
  const [forwardTask, _audioOut] = performAudioForwarding(
886
- ttsStream,
1373
+ ttsGenData.audioStream,
887
1374
  audioOutput,
888
1375
  replyAbortController
889
1376
  );
890
1377
  audioOut = _audioOut;
891
1378
  tasks.push(forwardTask);
892
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1379
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
893
1380
  } else {
894
- throw Error("ttsStream is null when audioOutput is enabled");
1381
+ throw Error("ttsGenData is null when audioOutput is enabled");
895
1382
  }
896
1383
  } else {
897
- textOut == null ? void 0 : textOut.firstTextFut.await.finally(onFirstFrame);
1384
+ textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
898
1385
  }
899
- const onToolExecutionStarted = (_) => {
1386
+ const onToolExecutionStarted = (f) => {
1387
+ speechHandle._itemAdded([f]);
1388
+ this.agent._chatCtx.items.push(f);
1389
+ this.agentSession._toolItemsAdded([f]);
900
1390
  };
901
- const onToolExecutionCompleted = (_) => {
1391
+ const onToolExecutionCompleted = (out) => {
1392
+ if (out.toolCallOutput) {
1393
+ speechHandle._itemAdded([out.toolCallOutput]);
1394
+ }
902
1395
  };
903
1396
  const [executeToolsTask, toolOutput] = performToolExecutions({
904
1397
  session: this.agentSession,
@@ -914,28 +1407,53 @@ ${instructions}` : instructions,
914
1407
  if (audioOutput) {
915
1408
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
916
1409
  }
1410
+ const agentStoppedSpeakingAt = Date.now();
1411
+ const assistantMetrics = {};
1412
+ if (llmGenData.ttft !== void 0) {
1413
+ assistantMetrics.llmNodeTtft = llmGenData.ttft;
1414
+ }
1415
+ if ((ttsGenData == null ? void 0 : ttsGenData.ttfb) !== void 0) {
1416
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb;
1417
+ }
1418
+ if (agentStartedSpeakingAt !== void 0) {
1419
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1e3;
1420
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1e3;
1421
+ if ((userMetrics == null ? void 0 : userMetrics.stoppedSpeakingAt) !== void 0) {
1422
+ const e2eLatency = agentStartedSpeakingAt / 1e3 - userMetrics.stoppedSpeakingAt;
1423
+ assistantMetrics.e2eLatency = e2eLatency;
1424
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1425
+ }
1426
+ }
1427
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1428
+ let hasSpeechMessage = false;
917
1429
  if (toolsMessages) {
918
1430
  for (const msg of toolsMessages) {
919
1431
  msg.createdAt = replyStartedAt;
920
1432
  }
921
- this.agent._chatCtx.insert(toolsMessages);
1433
+ const toolCallOutputs = toolsMessages.filter(
1434
+ (m) => m.type === "function_call_output"
1435
+ );
1436
+ if (toolCallOutputs.length > 0) {
1437
+ this.agent._chatCtx.insert(toolCallOutputs);
1438
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1439
+ }
922
1440
  }
923
1441
  if (speechHandle.interrupted) {
924
1442
  this.logger.debug(
925
1443
  { speech_id: speechHandle.id },
926
1444
  "Aborting all pipeline reply tasks due to interruption"
927
1445
  );
1446
+ if (audioOutput) {
1447
+ audioOutput.clearBuffer();
1448
+ }
928
1449
  replyAbortController.abort();
929
- await Promise.allSettled(
930
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT))
931
- );
1450
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
932
1451
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
933
1452
  if (audioOutput) {
934
- audioOutput.clearBuffer();
935
1453
  const playbackEv = await audioOutput.waitForPlayout();
936
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1454
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
937
1455
  this.logger.info(
938
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1456
+ { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
939
1457
  "playout interrupted"
940
1458
  );
941
1459
  if (playbackEv.synchronizedTranscript) {
@@ -946,19 +1464,27 @@ ${instructions}` : instructions,
946
1464
  }
947
1465
  }
948
1466
  if (forwardedText) {
1467
+ hasSpeechMessage = true;
949
1468
  const message = ChatMessage.create({
950
1469
  role: "assistant",
951
1470
  content: forwardedText,
952
1471
  id: llmGenData.id,
953
1472
  interrupted: true,
954
- createdAt: replyStartedAt
1473
+ createdAt: replyStartedAt,
1474
+ metrics: assistantMetrics
955
1475
  });
956
1476
  chatCtx.insert(message);
957
1477
  this.agent._chatCtx.insert(message);
1478
+ speechHandle._itemAdded([message]);
958
1479
  this.agentSession._conversationItemAdded(message);
1480
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
959
1481
  }
960
1482
  if (this.agentSession.agentState === "speaking") {
961
1483
  this.agentSession._updateAgentState("listening");
1484
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1485
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1486
+ this.restoreInterruptionByAudioActivity();
1487
+ }
962
1488
  }
963
1489
  this.logger.info(
964
1490
  { speech_id: speechHandle.id, message: forwardedText },
@@ -969,16 +1495,20 @@ ${instructions}` : instructions,
969
1495
  return;
970
1496
  }
971
1497
  if (textOut && textOut.text) {
1498
+ hasSpeechMessage = true;
972
1499
  const message = ChatMessage.create({
973
1500
  role: "assistant",
974
1501
  id: llmGenData.id,
975
1502
  interrupted: false,
976
1503
  createdAt: replyStartedAt,
977
- content: textOut.text
1504
+ content: textOut.text,
1505
+ metrics: assistantMetrics
978
1506
  });
979
1507
  chatCtx.insert(message);
980
1508
  this.agent._chatCtx.insert(message);
1509
+ speechHandle._itemAdded([message]);
981
1510
  this.agentSession._conversationItemAdded(message);
1511
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
982
1512
  this.logger.info(
983
1513
  { speech_id: speechHandle.id, message: textOut.text },
984
1514
  "playout completed without interruption"
@@ -988,11 +1518,17 @@ ${instructions}` : instructions,
988
1518
  this.agentSession._updateAgentState("thinking");
989
1519
  } else if (this.agentSession.agentState === "speaking") {
990
1520
  this.agentSession._updateAgentState("listening");
1521
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1522
+ {
1523
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1524
+ this.restoreInterruptionByAudioActivity();
1525
+ }
1526
+ }
991
1527
  }
992
1528
  speechHandle._markGenerationDone();
993
1529
  await executeToolsTask.result;
994
1530
  if (toolOutput.output.length === 0) return;
995
- const { maxToolSteps } = this.agentSession.options;
1531
+ const { maxToolSteps } = this.agentSession.sessionOptions;
996
1532
  if (speechHandle.numSteps >= maxToolSteps) {
997
1533
  this.logger.warn(
998
1534
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1000,45 +1536,15 @@ ${instructions}` : instructions,
1000
1536
  );
1001
1537
  return;
1002
1538
  }
1003
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1004
- functionCalls: [],
1005
- functionCallOutputs: []
1006
- });
1007
- let shouldGenerateToolReply = false;
1008
- let newAgentTask = null;
1009
- let ignoreTaskSwitch = false;
1010
- for (const sanitizedOut of toolOutput.output) {
1011
- if (sanitizedOut.toolCallOutput !== void 0) {
1012
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1013
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1014
- if (sanitizedOut.replyRequired) {
1015
- shouldGenerateToolReply = true;
1016
- }
1017
- }
1018
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1019
- this.logger.error("expected to receive only one agent task from the tool executions");
1020
- ignoreTaskSwitch = true;
1021
- }
1022
- newAgentTask = sanitizedOut.agentTask ?? null;
1023
- this.logger.debug(
1024
- {
1025
- speechId: speechHandle.id,
1026
- name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1027
- args: sanitizedOut.toolCall.args,
1028
- output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1029
- isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1030
- },
1031
- "Tool call execution finished"
1032
- );
1033
- }
1539
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1034
1540
  this.agentSession.emit(
1035
1541
  AgentSessionEventTypes.FunctionToolsExecuted,
1036
1542
  functionToolsExecutedEvent
1037
1543
  );
1038
- let draining = this.draining;
1544
+ let schedulingPaused = this.schedulingPaused;
1039
1545
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1040
1546
  this.agentSession.updateAgent(newAgentTask);
1041
- draining = true;
1547
+ schedulingPaused = true;
1042
1548
  }
1043
1549
  const toolMessages = [
1044
1550
  ...functionToolsExecutedEvent.functionCalls,
@@ -1046,47 +1552,85 @@ ${instructions}` : instructions,
1046
1552
  ];
1047
1553
  if (shouldGenerateToolReply) {
1048
1554
  chatCtx.insert(toolMessages);
1049
- const handle = SpeechHandle.create({
1050
- allowInterruptions: speechHandle.allowInterruptions,
1051
- stepIndex: speechHandle._stepIndex + 1,
1052
- parent: speechHandle
1053
- });
1054
- this.agentSession.emit(
1055
- AgentSessionEventTypes.SpeechCreated,
1056
- createSpeechCreatedEvent({
1057
- userInitiated: false,
1058
- source: "tool_response",
1059
- speechHandle: handle
1060
- })
1061
- );
1062
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1555
+ speechHandle._numSteps += 1;
1556
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1063
1557
  const toolResponseTask = this.createSpeechTask({
1064
- task: Task.from(
1065
- () => this.pipelineReplyTask(
1066
- handle,
1067
- chatCtx,
1068
- toolCtx,
1069
- { toolChoice: respondToolChoice },
1070
- replyAbortController,
1071
- instructions,
1072
- void 0,
1073
- toolMessages
1074
- )
1558
+ taskFn: () => this.pipelineReplyTask(
1559
+ speechHandle,
1560
+ chatCtx,
1561
+ toolCtx,
1562
+ { toolChoice: respondToolChoice },
1563
+ replyAbortController,
1564
+ instructions,
1565
+ void 0,
1566
+ toolMessages,
1567
+ hasSpeechMessage ? void 0 : userMetrics
1075
1568
  ),
1076
- ownedSpeechHandle: handle,
1569
+ ownedSpeechHandle: speechHandle,
1077
1570
  name: "AgentActivity.pipelineReply"
1078
1571
  });
1079
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1080
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1572
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1573
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1081
1574
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1082
1575
  for (const msg of toolMessages) {
1083
1576
  msg.createdAt = replyStartedAt;
1084
1577
  }
1085
- this.agent._chatCtx.insert(toolMessages);
1578
+ const toolCallOutputs = toolMessages.filter(
1579
+ (m) => m.type === "function_call_output"
1580
+ );
1581
+ if (toolCallOutputs.length > 0) {
1582
+ this.agent._chatCtx.insert(toolCallOutputs);
1583
+ this.agentSession._toolItemsAdded(toolCallOutputs);
1584
+ }
1086
1585
  }
1087
- }
1586
+ };
1587
+ pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages, _previousUserMetrics) => tracer.startActiveSpan(
1588
+ async (span) => this._pipelineReplyTaskImpl({
1589
+ speechHandle,
1590
+ chatCtx,
1591
+ toolCtx,
1592
+ modelSettings,
1593
+ replyAbortController,
1594
+ instructions,
1595
+ newMessage,
1596
+ toolsMessages,
1597
+ span,
1598
+ _previousUserMetrics
1599
+ }),
1600
+ {
1601
+ name: "agent_turn",
1602
+ context: this.agentSession.rootSpanContext
1603
+ }
1604
+ );
1088
1605
  async realtimeGenerationTask(speechHandle, ev, modelSettings, replyAbortController) {
1089
- var _a, _b, _c;
1606
+ return tracer.startActiveSpan(
1607
+ async (span) => this._realtimeGenerationTaskImpl({
1608
+ speechHandle,
1609
+ ev,
1610
+ modelSettings,
1611
+ replyAbortController,
1612
+ span
1613
+ }),
1614
+ {
1615
+ name: "agent_turn",
1616
+ context: this.agentSession.rootSpanContext
1617
+ }
1618
+ );
1619
+ }
1620
+ async _realtimeGenerationTaskImpl({
1621
+ speechHandle,
1622
+ ev,
1623
+ modelSettings,
1624
+ replyAbortController,
1625
+ span
1626
+ }) {
1627
+ var _a;
1628
+ speechHandle._agentTurnContext = otelContext.active();
1629
+ span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1630
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1631
+ if (localParticipant) {
1632
+ setParticipantSpanAttributes(span, localParticipant);
1633
+ }
1090
1634
  speechHandleStorage.enterWith(speechHandle);
1091
1635
  if (!this.realtimeSession) {
1092
1636
  throw new Error("realtime session is not initialized");
@@ -1094,6 +1638,10 @@ ${instructions}` : instructions,
1094
1638
  if (!(this.llm instanceof RealtimeModel)) {
1095
1639
  throw new Error("llm is not a realtime model");
1096
1640
  }
1641
+ span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
1642
+ if (this.realtimeSpans && ev.responseId) {
1643
+ this.realtimeSpans.set(ev.responseId, span);
1644
+ }
1097
1645
  this.logger.debug(
1098
1646
  { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1099
1647
  "realtime generation started"
@@ -1106,10 +1654,17 @@ ${instructions}` : instructions,
1106
1654
  if (speechHandle.interrupted) {
1107
1655
  return;
1108
1656
  }
1109
- const onFirstFrame = () => {
1110
- this.agentSession._updateAgentState("speaking");
1657
+ const onFirstFrame = (startedSpeakingAt) => {
1658
+ this.agentSession._updateAgentState("speaking", {
1659
+ startTime: startedSpeakingAt,
1660
+ otelContext: speechHandle._agentTurnContext
1661
+ });
1111
1662
  };
1112
1663
  const readMessages = async (abortController, outputs) => {
1664
+ var _a2, _b;
1665
+ replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
1666
+ once: true
1667
+ });
1113
1668
  const forwardTasks = [];
1114
1669
  try {
1115
1670
  for await (const msg of ev.messageStream) {
@@ -1119,7 +1674,22 @@ ${instructions}` : instructions,
1119
1674
  );
1120
1675
  break;
1121
1676
  }
1122
- const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
1677
+ const msgModalities = msg.modalities ? await msg.modalities : void 0;
1678
+ let ttsTextInput = null;
1679
+ let trTextInput;
1680
+ if (msgModalities && !msgModalities.includes("audio") && this.tts) {
1681
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
1682
+ this.logger.warn(
1683
+ "text response received from realtime API, falling back to use a TTS model."
1684
+ );
1685
+ }
1686
+ const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
1687
+ ttsTextInput = _ttsTextInput;
1688
+ trTextInput = _trTextInput;
1689
+ } else {
1690
+ trTextInput = msg.textStream;
1691
+ }
1692
+ const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
1123
1693
  let textOut = null;
1124
1694
  if (trNodeResult) {
1125
1695
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1132,28 +1702,46 @@ ${instructions}` : instructions,
1132
1702
  }
1133
1703
  let audioOut = null;
1134
1704
  if (audioOutput) {
1135
- const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1136
- msg.audioStream,
1137
- modelSettings
1138
- );
1139
- if (realtimeAudio) {
1705
+ let realtimeAudioResult = null;
1706
+ if (ttsTextInput) {
1707
+ const [ttsTask, ttsGenData] = performTTSInference(
1708
+ (...args) => this.agent.ttsNode(...args),
1709
+ ttsTextInput,
1710
+ modelSettings,
1711
+ abortController,
1712
+ (_a2 = this.tts) == null ? void 0 : _a2.model,
1713
+ (_b = this.tts) == null ? void 0 : _b.provider
1714
+ );
1715
+ tasks.push(ttsTask);
1716
+ realtimeAudioResult = ttsGenData.audioStream;
1717
+ } else if (msgModalities && msgModalities.includes("audio")) {
1718
+ realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
1719
+ msg.audioStream,
1720
+ modelSettings
1721
+ );
1722
+ } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
1723
+ this.logger.error(
1724
+ "Text message received from Realtime API with audio modality. This usually happens when text chat context is synced to the API. Try to add a TTS model as fallback or use text modality with TTS instead."
1725
+ );
1726
+ } else {
1727
+ this.logger.warn(
1728
+ "audio output is enabled but neither tts nor realtime audio is available"
1729
+ );
1730
+ }
1731
+ if (realtimeAudioResult) {
1140
1732
  const [forwardTask, _audioOut] = performAudioForwarding(
1141
- realtimeAudio,
1733
+ realtimeAudioResult,
1142
1734
  audioOutput,
1143
1735
  abortController
1144
1736
  );
1145
1737
  forwardTasks.push(forwardTask);
1146
1738
  audioOut = _audioOut;
1147
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1148
- } else {
1149
- this.logger.warn(
1150
- "audio output is enabled but neither tts nor realtime audio is available"
1151
- );
1739
+ audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
1152
1740
  }
1153
1741
  } else if (textOut) {
1154
- textOut.firstTextFut.await.finally(onFirstFrame);
1742
+ textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
1155
1743
  }
1156
- outputs.push([msg.messageId, textOut, audioOut]);
1744
+ outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1157
1745
  }
1158
1746
  await waitFor(forwardTasks);
1159
1747
  } catch (error) {
@@ -1166,7 +1754,7 @@ ${instructions}` : instructions,
1166
1754
  const tasks = [
1167
1755
  Task.from(
1168
1756
  (controller) => readMessages(controller, messageOutputs),
1169
- replyAbortController,
1757
+ void 0,
1170
1758
  "AgentActivity.realtime_generation.read_messages"
1171
1759
  )
1172
1760
  ];
@@ -1194,6 +1782,8 @@ ${instructions}` : instructions,
1194
1782
  );
1195
1783
  const onToolExecutionStarted = (f) => {
1196
1784
  speechHandle._itemAdded([f]);
1785
+ this.agent._chatCtx.items.push(f);
1786
+ this.agentSession._toolItemsAdded([f]);
1197
1787
  };
1198
1788
  const onToolExecutionCompleted = (out) => {
1199
1789
  if (out.toolCallOutput) {
@@ -1213,7 +1803,6 @@ ${instructions}` : instructions,
1213
1803
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1214
1804
  if (audioOutput) {
1215
1805
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1216
- this.agentSession._updateAgentState("listening");
1217
1806
  }
1218
1807
  if (speechHandle.interrupted) {
1219
1808
  this.logger.debug(
@@ -1223,15 +1812,15 @@ ${instructions}` : instructions,
1223
1812
  replyAbortController.abort();
1224
1813
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1225
1814
  if (messageOutputs.length > 0) {
1226
- const [msgId, textOut, audioOut] = messageOutputs[0];
1815
+ const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0];
1227
1816
  let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
1228
1817
  if (audioOutput) {
1229
1818
  audioOutput.clearBuffer();
1230
1819
  const playbackEv = await audioOutput.waitForPlayout();
1231
- let playbackPosition = playbackEv.playbackPosition;
1232
- if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1820
+ let playbackPositionInS = playbackEv.playbackPosition;
1821
+ if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
1233
1822
  this.logger.info(
1234
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1823
+ { speech_id: speechHandle.id, playbackPositionInS },
1235
1824
  "playout interrupted"
1236
1825
  );
1237
1826
  if (playbackEv.synchronizedTranscript) {
@@ -1239,11 +1828,13 @@ ${instructions}` : instructions,
1239
1828
  }
1240
1829
  } else {
1241
1830
  forwardedText = "";
1242
- playbackPosition = 0;
1831
+ playbackPositionInS = 0;
1243
1832
  }
1244
1833
  this.realtimeSession.truncate({
1245
1834
  messageId: msgId,
1246
- audioEndMs: Math.floor(playbackPosition)
1835
+ audioEndMs: Math.floor(playbackPositionInS * 1e3),
1836
+ modalities: msgModalities,
1837
+ audioTranscript: forwardedText
1247
1838
  });
1248
1839
  }
1249
1840
  if (forwardedText) {
@@ -1267,7 +1858,7 @@ ${instructions}` : instructions,
1267
1858
  return;
1268
1859
  }
1269
1860
  if (messageOutputs.length > 0) {
1270
- const [msgId, textOut, _] = messageOutputs[0];
1861
+ const [msgId, textOut, _, __] = messageOutputs[0];
1271
1862
  const message = ChatMessage.create({
1272
1863
  role: "assistant",
1273
1864
  content: (textOut == null ? void 0 : textOut.text) || "",
@@ -1279,12 +1870,16 @@ ${instructions}` : instructions,
1279
1870
  this.agentSession._conversationItemAdded(message);
1280
1871
  }
1281
1872
  speechHandle._markGenerationDone();
1282
- toolOutput.firstToolStartedFuture.await.finally(() => {
1283
- this.agentSession._updateAgentState("thinking");
1284
- });
1285
1873
  await executeToolsTask.result;
1286
- if (toolOutput.output.length === 0) return;
1287
- const { maxToolSteps } = this.agentSession.options;
1874
+ if (toolOutput.output.length > 0) {
1875
+ this.agentSession._updateAgentState("thinking");
1876
+ } else if (this.agentSession.agentState === "speaking") {
1877
+ this.agentSession._updateAgentState("listening");
1878
+ }
1879
+ if (toolOutput.output.length === 0) {
1880
+ return;
1881
+ }
1882
+ const { maxToolSteps } = this.agentSession.sessionOptions;
1288
1883
  if (speechHandle.numSteps >= maxToolSteps) {
1289
1884
  this.logger.warn(
1290
1885
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1292,48 +1887,29 @@ ${instructions}` : instructions,
1292
1887
  );
1293
1888
  return;
1294
1889
  }
1295
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1296
- functionCalls: [],
1297
- functionCallOutputs: []
1298
- });
1299
- let shouldGenerateToolReply = false;
1300
- let newAgentTask = null;
1301
- let ignoreTaskSwitch = false;
1302
- for (const sanitizedOut of toolOutput.output) {
1303
- if (sanitizedOut.toolCallOutput !== void 0) {
1304
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1305
- if (sanitizedOut.replyRequired) {
1306
- shouldGenerateToolReply = true;
1307
- }
1308
- }
1309
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1310
- this.logger.error("expected to receive only one agent task from the tool executions");
1311
- ignoreTaskSwitch = true;
1312
- }
1313
- newAgentTask = sanitizedOut.agentTask ?? null;
1314
- this.logger.debug(
1315
- {
1316
- speechId: speechHandle.id,
1317
- name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1318
- args: sanitizedOut.toolCall.args,
1319
- output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1320
- isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1321
- },
1322
- "Tool call execution finished"
1323
- );
1324
- }
1890
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1325
1891
  this.agentSession.emit(
1326
1892
  AgentSessionEventTypes.FunctionToolsExecuted,
1327
1893
  functionToolsExecutedEvent
1328
1894
  );
1329
- let draining = this.draining;
1895
+ let schedulingPaused = this.schedulingPaused;
1330
1896
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1331
1897
  this.agentSession.updateAgent(newAgentTask);
1332
- draining = true;
1898
+ schedulingPaused = true;
1333
1899
  }
1334
1900
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1901
+ while (this.currentSpeech || this.speechQueue.size() > 0) {
1902
+ if (this.currentSpeech && !this.currentSpeech.done() && this.currentSpeech !== speechHandle) {
1903
+ await this.currentSpeech.waitForPlayout();
1904
+ } else {
1905
+ await new Promise((resolve) => setImmediate(resolve));
1906
+ }
1907
+ }
1335
1908
  const chatCtx = this.realtimeSession.chatCtx.copy();
1336
1909
  chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
1910
+ this.agentSession._toolItemsAdded(
1911
+ functionToolsExecutedEvent.functionCallOutputs
1912
+ );
1337
1913
  try {
1338
1914
  await this.realtimeSession.updateChatCtx(chatCtx);
1339
1915
  } catch (error) {
@@ -1360,20 +1936,58 @@ ${instructions}` : instructions,
1360
1936
  speechHandle: replySpeechHandle
1361
1937
  })
1362
1938
  );
1363
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1939
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1364
1940
  this.createSpeechTask({
1365
- task: Task.from(
1366
- (abortController) => this.realtimeReplyTask({
1367
- speechHandle: replySpeechHandle,
1368
- modelSettings: { toolChoice },
1369
- abortController
1370
- })
1371
- ),
1941
+ taskFn: (abortController) => this.realtimeReplyTask({
1942
+ speechHandle: replySpeechHandle,
1943
+ modelSettings: { toolChoice },
1944
+ abortController
1945
+ }),
1372
1946
  ownedSpeechHandle: replySpeechHandle,
1373
1947
  name: "AgentActivity.realtime_reply"
1374
1948
  });
1375
1949
  this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1376
1950
  }
1951
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1952
+ var _a, _b, _c;
1953
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1954
+ functionCalls: [],
1955
+ functionCallOutputs: []
1956
+ });
1957
+ let shouldGenerateToolReply = false;
1958
+ let newAgentTask = null;
1959
+ let ignoreTaskSwitch = false;
1960
+ for (const sanitizedOut of toolOutput.output) {
1961
+ if (sanitizedOut.toolCallOutput !== void 0) {
1962
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1963
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1964
+ if (sanitizedOut.replyRequired) {
1965
+ shouldGenerateToolReply = true;
1966
+ }
1967
+ }
1968
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1969
+ this.logger.error("expected to receive only one agent task from the tool executions");
1970
+ ignoreTaskSwitch = true;
1971
+ }
1972
+ newAgentTask = sanitizedOut.agentTask ?? null;
1973
+ this.logger.debug(
1974
+ {
1975
+ speechId: speechHandle.id,
1976
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1977
+ args: sanitizedOut.toolCall.args,
1978
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1979
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1980
+ },
1981
+ "Tool call execution finished"
1982
+ );
1983
+ }
1984
+ return {
1985
+ functionToolsExecutedEvent,
1986
+ shouldGenerateToolReply,
1987
+ newAgentTask,
1988
+ ignoreTaskSwitch
1989
+ };
1990
+ }
1377
1991
  async realtimeReplyTask({
1378
1992
  speechHandle,
1379
1993
  modelSettings: { toolChoice },
@@ -1415,71 +2029,193 @@ ${instructions}` : instructions,
1415
2029
  }
1416
2030
  }
1417
2031
  /**
   * Queue a speech handle for playout and wake the main task.
   *
   * Diff rendering: `-` lines are the previous release's body, `+` lines the new one.
   *
   * @param speechHandle handle to enqueue; it is marked as scheduled once queued
   * @param priority     first element of the queue entry
   * @param force        when true, the paused/draining guard is bypassed
   * @throws Error when scheduling is paused (previously: draining) and `force` is false
   */
  scheduleSpeech(speechHandle, priority, force = false) {
1418
- if (this.draining && !force) {
1419
- throw new Error("cannot schedule new speech, the agent is draining");
2032
+ if (this.schedulingPaused && !force) {
2033
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1420
2034
  }
1421
2035
  // queue entry is [priority, high-resolution timestamp, handle]
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1422
2036
  speechHandle._markScheduled();
1423
2037
  this.wakeupMainTask();
1424
2038
  }
2039
+ async _pauseSchedulingTask(blockedTasks) {
2040
+ if (this._schedulingPaused) return;
2041
+ this._schedulingPaused = true;
2042
+ this._drainBlockedTasks = blockedTasks;
2043
+ this.wakeupMainTask();
2044
+ if (this._mainTask) {
2045
+ await this._mainTask.result;
2046
+ }
2047
+ }
2048
+ _resumeSchedulingTask() {
2049
+ if (!this._schedulingPaused) return;
2050
+ this._schedulingPaused = false;
2051
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
2052
+ }
2053
+ async pause(options = {}) {
2054
+ const { blockedTasks = [] } = options;
2055
+ const unlock = await this.lock.lock();
2056
+ try {
2057
+ const span = tracer.startSpan({
2058
+ name: "pause_agent_activity",
2059
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
2060
+ });
2061
+ try {
2062
+ await this._pauseSchedulingTask(blockedTasks);
2063
+ await this._closeSessionResources();
2064
+ } finally {
2065
+ span.end();
2066
+ }
2067
+ } finally {
2068
+ unlock();
2069
+ }
2070
+ }
1425
2071
  /**
   * Drain the activity.
   *
   * Diff rendering: the `-` line belongs to the previous release's body; the
   * `+` lines are the new body, which runs `_drainImpl` inside an active
   * `drain_agent_activity` span created with `ROOT_CONTEXT` and returns its result.
   */
  async drain() {
1426
- var _a;
2072
+ return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
2073
+ name: "drain_agent_activity",
2074
+ context: ROOT_CONTEXT
2075
+ });
2076
+ }
2077
/**
 * Body of `drain()`, executed with the drain span.
 *
 * Diff rendering: `-` lines are the previous release's drain logic, `+` lines the new one.
 * New behaviour, in order, while holding `this.lock`:
 *  1. return immediately if scheduling is already paused;
 *  2. create the agent's `onExit` speech task (wrapped in an `on_exit` span) and keep it on `_onExitTask`;
 *  3. cancel any preemptive generation;
 *  4. await the `onExit` task's result;
 *  5. pause scheduling with an empty blocked-task list.
 * The lock is released in `finally`.
 *
 * @param span span on which the agent label attribute is set
 */
+ async _drainImpl(span) {
2078
+ span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1427
2079
  const unlock = await this.lock.lock();
1428
2080
  try {
1429
- if (this._draining) return;
1430
- this.createSpeechTask({
1431
- task: Task.from(() => this.agent.onExit()),
2081
+ if (this._schedulingPaused) return;
2082
+ this._onExitTask = this.createSpeechTask({
2083
+ taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
2084
+ name: "on_exit",
2085
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
2086
+ }),
2087
+ inlineTask: true,
1432
2088
  name: "AgentActivity_onExit"
1433
2089
  });
1434
- this.wakeupMainTask();
1435
- this._draining = true;
1436
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
2090
+ this.cancelPreemptiveGeneration();
2091
+ await this._onExitTask.result;
2092
+ await this._pauseSchedulingTask([]);
1437
2093
  } finally {
1438
2094
  unlock();
1439
2095
  }
1440
2096
  }
1441
2097
  /**
   * Close the activity and release everything it holds.
   *
   * Diff rendering: `-` lines are the previous release's body (inline listener
   * removal and resource closing), `+` lines the new one.
   * New behaviour, in order, while holding `this.lock`:
   *  1. cancel any preemptive generation;
   *  2. cancel all speech tasks and wait, bounded by `AgentActivity.REPLY_TASK_CANCEL_TIMEOUT`;
   *  3. mark the current speech as done if it has not finished;
   *  4. close session resources via `_closeSessionResources()`;
   *  5. cancel the main task and wait for it;
   *  6. unsubscribe the interruption detector handlers, if a detector exists;
   *  7. clear the agent's `_agentActivity` back-reference.
   * The lock is released in `finally`.
   */
  async close() {
1442
- var _a, _b, _c;
1443
2098
  const unlock = await this.lock.lock();
1444
2099
  try {
1445
- if (!this._draining) {
1446
- this.logger.warn("task closing without draining");
1447
- }
1448
- if (this.llm instanceof LLM) {
1449
- this.llm.off("metrics_collected", this.onMetricsCollected);
1450
- }
1451
- if (this.realtimeSession) {
1452
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1453
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1454
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1455
- this.realtimeSession.off(
1456
- "input_audio_transcription_completed",
1457
- this.onInputAudioTranscriptionCompleted
1458
- );
1459
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
2100
+ this.cancelPreemptiveGeneration();
2101
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2102
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2103
+ this._currentSpeech._markDone();
1460
2104
  }
1461
- if (this.stt instanceof STT) {
1462
- this.stt.off("metrics_collected", this.onMetricsCollected);
2105
+ await this._closeSessionResources();
2106
+ if (this._mainTask) {
2107
+ await this._mainTask.cancelAndWait();
1463
2108
  }
1464
- if (this.tts instanceof TTS) {
1465
- this.tts.off("metrics_collected", this.onMetricsCollected);
2109
+ if (this.interruptionDetector) {
2110
+ this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
2111
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2112
+ this.interruptionDetector.off("error", this.onInterruptionError);
1466
2113
  }
1467
- if (this.vad instanceof VAD) {
1468
- this.vad.off("metrics_collected", this.onMetricsCollected);
1469
- }
1470
- this.detachAudioInput();
1471
- await ((_a = this.realtimeSession) == null ? void 0 : _a.close());
1472
- await ((_b = this.audioRecognition) == null ? void 0 : _b.close());
1473
- await ((_c = this._mainTask) == null ? void 0 : _c.cancelAndWait());
2114
+ this.agent._agentActivity = void 0;
1474
2115
  } finally {
1475
2116
  unlock();
1476
2117
  }
1477
2118
  }
2119
+ resolveInterruptionDetector() {
2120
+ var _a, _b;
2121
+ const agentInterruptionDetection = (_b = (_a = this.agent.turnHandling) == null ? void 0 : _a.interruption) == null ? void 0 : _b.mode;
2122
+ const sessionInterruptionDetection = this.agentSession.interruptionDetection;
2123
+ if (!(this.stt && this.stt.capabilities.alignedTranscript && this.stt.capabilities.streaming && this.vad && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm" && !(this.llm instanceof RealtimeModel))) {
2124
+ if (agentInterruptionDetection === "adaptive" || sessionInterruptionDetection === "adaptive") {
2125
+ this.logger.warn(
2126
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled"
2127
+ );
2128
+ }
2129
+ return void 0;
2130
+ }
2131
+ if (!this.allowInterruptions) {
2132
+ return void 0;
2133
+ }
2134
+ if (agentInterruptionDetection === "vad") {
2135
+ return void 0;
2136
+ }
2137
+ if (sessionInterruptionDetection === "vad") {
2138
+ return void 0;
2139
+ }
2140
+ if (agentInterruptionDetection === void 0 && sessionInterruptionDetection === void 0 && !isHosted() && !isDevMode()) {
2141
+ this.logger.info("adaptive interruption is disabled by default in production mode");
2142
+ return void 0;
2143
+ }
2144
+ try {
2145
+ const detector = new AdaptiveInterruptionDetector();
2146
+ detector.on("overlapping_speech", this.onInterruptionOverlappingSpeech);
2147
+ detector.on("metrics_collected", this.onInterruptionMetricsCollected);
2148
+ detector.on("error", this.onInterruptionError);
2149
+ return detector;
2150
+ } catch (error) {
2151
+ this.logger.warn({ error }, "could not instantiate AdaptiveInterruptionDetector");
2152
+ }
2153
+ return void 0;
2154
+ }
2155
+ restoreInterruptionByAudioActivity() {
2156
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2157
+ }
2158
+ fallbackToVadInterruption() {
2159
+ if (!this.isInterruptionDetectionEnabled) return;
2160
+ this.isInterruptionDetectionEnabled = false;
2161
+ this.restoreInterruptionByAudioActivity();
2162
+ if (this.interruptionDetector) {
2163
+ this.interruptionDetector.off("overlapping_speech", this.onInterruptionOverlappingSpeech);
2164
+ this.interruptionDetector.off("metrics_collected", this.onInterruptionMetricsCollected);
2165
+ this.interruptionDetector.off("error", this.onInterruptionError);
2166
+ this.interruptionDetector = void 0;
2167
+ }
2168
+ if (this.audioRecognition) {
2169
+ this.audioRecognition.disableInterruptionDetection().catch((err) => {
2170
+ this.logger.warn({ err }, "error while disabling interruption detection");
2171
+ });
2172
+ }
2173
+ this.logger.warn(
2174
+ "adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption"
2175
+ );
2176
+ }
2177
+ async _closeSessionResources() {
2178
+ var _a, _b, _c;
2179
+ if (this.llm instanceof LLM) {
2180
+ this.llm.off("metrics_collected", this.onMetricsCollected);
2181
+ this.llm.off("error", this.onModelError);
2182
+ }
2183
+ if (this.realtimeSession) {
2184
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
2185
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
2186
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
2187
+ this.realtimeSession.off(
2188
+ "input_audio_transcription_completed",
2189
+ this.onRealtimeInputAudioTranscriptionCompleted
2190
+ );
2191
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
2192
+ this.realtimeSession.off("error", this.onModelError);
2193
+ }
2194
+ if (this.stt instanceof STT) {
2195
+ this.stt.off("metrics_collected", this.onMetricsCollected);
2196
+ this.stt.off("error", this.onModelError);
2197
+ }
2198
+ if (this.tts instanceof TTS) {
2199
+ this.tts.off("metrics_collected", this.onMetricsCollected);
2200
+ this.tts.off("error", this.onModelError);
2201
+ }
2202
+ if (this.vad instanceof VAD) {
2203
+ this.vad.off("metrics_collected", this.onMetricsCollected);
2204
+ }
2205
+ this.detachAudioInput();
2206
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
2207
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
2208
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
2209
+ this.realtimeSession = void 0;
2210
+ this.audioRecognition = void 0;
2211
+ }
1478
2212
  }
1479
2213
  function toOaiToolChoice(toolChoice) {
1480
2214
  return toolChoice !== null ? toolChoice : void 0;
1481
2215
  }
1482
2216
  export {
1483
- AgentActivity
2217
+ AgentActivity,
2218
+ agentActivityStorage,
2219
+ onEnterStorage
1484
2220
  };
1485
2221
  //# sourceMappingURL=agent_activity.js.map