@livekit/agents 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (959)
  1. package/dist/_exceptions.cjs.map +1 -1
  2. package/dist/_exceptions.d.ts.map +1 -1
  3. package/dist/_exceptions.js.map +1 -1
  4. package/dist/audio.cjs +89 -3
  5. package/dist/audio.cjs.map +1 -1
  6. package/dist/audio.d.cts +36 -1
  7. package/dist/audio.d.ts +36 -1
  8. package/dist/audio.d.ts.map +1 -1
  9. package/dist/audio.js +76 -2
  10. package/dist/audio.js.map +1 -1
  11. package/dist/beta/index.cjs +29 -0
  12. package/dist/beta/index.cjs.map +1 -0
  13. package/dist/beta/index.d.cts +2 -0
  14. package/dist/beta/index.d.ts +2 -0
  15. package/dist/beta/index.d.ts.map +1 -0
  16. package/dist/beta/index.js +7 -0
  17. package/dist/beta/index.js.map +1 -0
  18. package/dist/beta/workflows/index.cjs +29 -0
  19. package/dist/beta/workflows/index.cjs.map +1 -0
  20. package/dist/beta/workflows/index.d.cts +2 -0
  21. package/dist/beta/workflows/index.d.ts +2 -0
  22. package/dist/beta/workflows/index.d.ts.map +1 -0
  23. package/dist/beta/workflows/index.js +7 -0
  24. package/dist/beta/workflows/index.js.map +1 -0
  25. package/dist/beta/workflows/task_group.cjs +165 -0
  26. package/dist/beta/workflows/task_group.cjs.map +1 -0
  27. package/dist/beta/workflows/task_group.d.cts +32 -0
  28. package/dist/beta/workflows/task_group.d.ts +32 -0
  29. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  30. package/dist/beta/workflows/task_group.js +141 -0
  31. package/dist/beta/workflows/task_group.js.map +1 -0
  32. package/dist/cli.cjs +44 -46
  33. package/dist/cli.cjs.map +1 -1
  34. package/dist/cli.d.cts +3 -3
  35. package/dist/cli.d.ts +3 -3
  36. package/dist/cli.d.ts.map +1 -1
  37. package/dist/cli.js +45 -47
  38. package/dist/cli.js.map +1 -1
  39. package/dist/connection_pool.cjs +242 -0
  40. package/dist/connection_pool.cjs.map +1 -0
  41. package/dist/connection_pool.d.cts +123 -0
  42. package/dist/connection_pool.d.ts +123 -0
  43. package/dist/connection_pool.d.ts.map +1 -0
  44. package/dist/connection_pool.js +218 -0
  45. package/dist/connection_pool.js.map +1 -0
  46. package/dist/connection_pool.test.cjs +256 -0
  47. package/dist/connection_pool.test.cjs.map +1 -0
  48. package/dist/connection_pool.test.js +255 -0
  49. package/dist/connection_pool.test.js.map +1 -0
  50. package/dist/constants.cjs +30 -0
  51. package/dist/constants.cjs.map +1 -1
  52. package/dist/constants.d.cts +10 -0
  53. package/dist/constants.d.ts +10 -0
  54. package/dist/constants.d.ts.map +1 -1
  55. package/dist/constants.js +20 -0
  56. package/dist/constants.js.map +1 -1
  57. package/dist/cpu.cjs +189 -0
  58. package/dist/cpu.cjs.map +1 -0
  59. package/dist/cpu.d.cts +24 -0
  60. package/dist/cpu.d.ts +24 -0
  61. package/dist/cpu.d.ts.map +1 -0
  62. package/dist/cpu.js +152 -0
  63. package/dist/cpu.js.map +1 -0
  64. package/dist/cpu.test.cjs +227 -0
  65. package/dist/cpu.test.cjs.map +1 -0
  66. package/dist/cpu.test.js +204 -0
  67. package/dist/cpu.test.js.map +1 -0
  68. package/dist/http_server.cjs +9 -6
  69. package/dist/http_server.cjs.map +1 -1
  70. package/dist/http_server.d.cts +5 -1
  71. package/dist/http_server.d.ts +5 -1
  72. package/dist/http_server.d.ts.map +1 -1
  73. package/dist/http_server.js +9 -6
  74. package/dist/http_server.js.map +1 -1
  75. package/dist/index.cjs +24 -9
  76. package/dist/index.cjs.map +1 -1
  77. package/dist/index.d.cts +15 -11
  78. package/dist/index.d.ts +15 -11
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +18 -9
  81. package/dist/index.js.map +1 -1
  82. package/dist/inference/api_protos.cjs +70 -2
  83. package/dist/inference/api_protos.cjs.map +1 -1
  84. package/dist/inference/api_protos.d.cts +373 -32
  85. package/dist/inference/api_protos.d.ts +373 -32
  86. package/dist/inference/api_protos.d.ts.map +1 -1
  87. package/dist/inference/api_protos.js +62 -2
  88. package/dist/inference/api_protos.js.map +1 -1
  89. package/dist/inference/index.cjs +8 -0
  90. package/dist/inference/index.cjs.map +1 -1
  91. package/dist/inference/index.d.cts +3 -4
  92. package/dist/inference/index.d.ts +3 -4
  93. package/dist/inference/index.d.ts.map +1 -1
  94. package/dist/inference/index.js +18 -3
  95. package/dist/inference/index.js.map +1 -1
  96. package/dist/inference/interruption/defaults.cjs +81 -0
  97. package/dist/inference/interruption/defaults.cjs.map +1 -0
  98. package/dist/inference/interruption/defaults.d.cts +19 -0
  99. package/dist/inference/interruption/defaults.d.ts +19 -0
  100. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  101. package/dist/inference/interruption/defaults.js +46 -0
  102. package/dist/inference/interruption/defaults.js.map +1 -0
  103. package/dist/inference/interruption/errors.cjs +44 -0
  104. package/dist/inference/interruption/errors.cjs.map +1 -0
  105. package/dist/inference/interruption/errors.d.cts +12 -0
  106. package/dist/inference/interruption/errors.d.ts +12 -0
  107. package/dist/inference/interruption/errors.d.ts.map +1 -0
  108. package/dist/inference/interruption/errors.js +20 -0
  109. package/dist/inference/interruption/errors.js.map +1 -0
  110. package/dist/inference/interruption/http_transport.cjs +163 -0
  111. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  112. package/dist/inference/interruption/http_transport.d.cts +65 -0
  113. package/dist/inference/interruption/http_transport.d.ts +65 -0
  114. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  115. package/dist/inference/interruption/http_transport.js +137 -0
  116. package/dist/inference/interruption/http_transport.js.map +1 -0
  117. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  118. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  119. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  120. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  121. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  122. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  123. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  124. package/dist/inference/interruption/interruption_detector.cjs +198 -0
  125. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  126. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  127. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  128. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  129. package/dist/inference/interruption/interruption_detector.js +164 -0
  130. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  131. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  132. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  133. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  134. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  135. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  136. package/dist/inference/interruption/interruption_stream.js +344 -0
  137. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  138. package/dist/inference/interruption/types.cjs +17 -0
  139. package/dist/inference/interruption/types.cjs.map +1 -0
  140. package/dist/inference/interruption/types.d.cts +66 -0
  141. package/dist/inference/interruption/types.d.ts +66 -0
  142. package/dist/inference/interruption/types.d.ts.map +1 -0
  143. package/dist/inference/interruption/types.js +1 -0
  144. package/dist/inference/interruption/types.js.map +1 -0
  145. package/dist/inference/interruption/utils.cjs +130 -0
  146. package/dist/inference/interruption/utils.cjs.map +1 -0
  147. package/dist/inference/interruption/utils.d.cts +41 -0
  148. package/dist/inference/interruption/utils.d.ts +41 -0
  149. package/dist/inference/interruption/utils.d.ts.map +1 -0
  150. package/dist/inference/interruption/utils.js +105 -0
  151. package/dist/inference/interruption/utils.js.map +1 -0
  152. package/dist/inference/interruption/utils.test.cjs +105 -0
  153. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  154. package/dist/inference/interruption/utils.test.js +104 -0
  155. package/dist/inference/interruption/utils.test.js.map +1 -0
  156. package/dist/inference/interruption/ws_transport.cjs +347 -0
  157. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  158. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  159. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  160. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  161. package/dist/inference/interruption/ws_transport.js +313 -0
  162. package/dist/inference/interruption/ws_transport.js.map +1 -0
  163. package/dist/inference/llm.cjs +106 -66
  164. package/dist/inference/llm.cjs.map +1 -1
  165. package/dist/inference/llm.d.cts +65 -43
  166. package/dist/inference/llm.d.ts +65 -43
  167. package/dist/inference/llm.d.ts.map +1 -1
  168. package/dist/inference/llm.js +100 -66
  169. package/dist/inference/llm.js.map +1 -1
  170. package/dist/inference/stt.cjs +319 -170
  171. package/dist/inference/stt.cjs.map +1 -1
  172. package/dist/inference/stt.d.cts +64 -15
  173. package/dist/inference/stt.d.ts +64 -15
  174. package/dist/inference/stt.d.ts.map +1 -1
  175. package/dist/inference/stt.js +319 -170
  176. package/dist/inference/stt.js.map +1 -1
  177. package/dist/inference/stt.test.cjs +218 -0
  178. package/dist/inference/stt.test.cjs.map +1 -0
  179. package/dist/inference/stt.test.js +217 -0
  180. package/dist/inference/stt.test.js.map +1 -0
  181. package/dist/inference/tts.cjs +249 -71
  182. package/dist/inference/tts.cjs.map +1 -1
  183. package/dist/inference/tts.d.cts +94 -17
  184. package/dist/inference/tts.d.ts +94 -17
  185. package/dist/inference/tts.d.ts.map +1 -1
  186. package/dist/inference/tts.js +249 -77
  187. package/dist/inference/tts.js.map +1 -1
  188. package/dist/inference/tts.test.cjs +305 -0
  189. package/dist/inference/tts.test.cjs.map +1 -0
  190. package/dist/inference/tts.test.js +304 -0
  191. package/dist/inference/tts.test.js.map +1 -0
  192. package/dist/inference/utils.cjs +26 -7
  193. package/dist/inference/utils.cjs.map +1 -1
  194. package/dist/inference/utils.d.cts +14 -1
  195. package/dist/inference/utils.d.ts +14 -1
  196. package/dist/inference/utils.d.ts.map +1 -1
  197. package/dist/inference/utils.js +18 -2
  198. package/dist/inference/utils.js.map +1 -1
  199. package/dist/ipc/inference_proc_executor.cjs +6 -3
  200. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  201. package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
  202. package/dist/ipc/inference_proc_executor.js +6 -3
  203. package/dist/ipc/inference_proc_executor.js.map +1 -1
  204. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  205. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  206. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  207. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  208. package/dist/ipc/job_proc_executor.cjs +6 -1
  209. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  210. package/dist/ipc/job_proc_executor.d.ts.map +1 -1
  211. package/dist/ipc/job_proc_executor.js +6 -1
  212. package/dist/ipc/job_proc_executor.js.map +1 -1
  213. package/dist/ipc/job_proc_lazy_main.cjs +89 -17
  214. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  215. package/dist/ipc/job_proc_lazy_main.js +68 -18
  216. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  217. package/dist/ipc/supervised_proc.cjs +34 -8
  218. package/dist/ipc/supervised_proc.cjs.map +1 -1
  219. package/dist/ipc/supervised_proc.d.cts +8 -0
  220. package/dist/ipc/supervised_proc.d.ts +8 -0
  221. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  222. package/dist/ipc/supervised_proc.js +34 -8
  223. package/dist/ipc/supervised_proc.js.map +1 -1
  224. package/dist/ipc/supervised_proc.test.cjs +145 -0
  225. package/dist/ipc/supervised_proc.test.cjs.map +1 -0
  226. package/dist/ipc/supervised_proc.test.js +122 -0
  227. package/dist/ipc/supervised_proc.test.js.map +1 -0
  228. package/dist/job.cjs +109 -1
  229. package/dist/job.cjs.map +1 -1
  230. package/dist/job.d.cts +14 -0
  231. package/dist/job.d.ts +14 -0
  232. package/dist/job.d.ts.map +1 -1
  233. package/dist/job.js +99 -1
  234. package/dist/job.js.map +1 -1
  235. package/dist/language.cjs +394 -0
  236. package/dist/language.cjs.map +1 -0
  237. package/dist/language.d.cts +15 -0
  238. package/dist/language.d.ts +15 -0
  239. package/dist/language.d.ts.map +1 -0
  240. package/dist/language.js +363 -0
  241. package/dist/language.js.map +1 -0
  242. package/dist/language.test.cjs +43 -0
  243. package/dist/language.test.cjs.map +1 -0
  244. package/dist/language.test.js +49 -0
  245. package/dist/language.test.js.map +1 -0
  246. package/dist/llm/chat_context.cjs +345 -3
  247. package/dist/llm/chat_context.cjs.map +1 -1
  248. package/dist/llm/chat_context.d.cts +86 -2
  249. package/dist/llm/chat_context.d.ts +86 -2
  250. package/dist/llm/chat_context.d.ts.map +1 -1
  251. package/dist/llm/chat_context.js +344 -3
  252. package/dist/llm/chat_context.js.map +1 -1
  253. package/dist/llm/chat_context.test.cjs +692 -0
  254. package/dist/llm/chat_context.test.cjs.map +1 -1
  255. package/dist/llm/chat_context.test.js +692 -0
  256. package/dist/llm/chat_context.test.js.map +1 -1
  257. package/dist/llm/fallback_adapter.cjs +280 -0
  258. package/dist/llm/fallback_adapter.cjs.map +1 -0
  259. package/dist/llm/fallback_adapter.d.cts +73 -0
  260. package/dist/llm/fallback_adapter.d.ts +73 -0
  261. package/dist/llm/fallback_adapter.d.ts.map +1 -0
  262. package/dist/llm/fallback_adapter.js +256 -0
  263. package/dist/llm/fallback_adapter.js.map +1 -0
  264. package/dist/llm/fallback_adapter.test.cjs +176 -0
  265. package/dist/llm/fallback_adapter.test.cjs.map +1 -0
  266. package/dist/llm/fallback_adapter.test.js +175 -0
  267. package/dist/llm/fallback_adapter.test.js.map +1 -0
  268. package/dist/llm/index.cjs +11 -0
  269. package/dist/llm/index.cjs.map +1 -1
  270. package/dist/llm/index.d.cts +4 -3
  271. package/dist/llm/index.d.ts +4 -3
  272. package/dist/llm/index.d.ts.map +1 -1
  273. package/dist/llm/index.js +13 -1
  274. package/dist/llm/index.js.map +1 -1
  275. package/dist/llm/llm.cjs +65 -11
  276. package/dist/llm/llm.cjs.map +1 -1
  277. package/dist/llm/llm.d.cts +13 -2
  278. package/dist/llm/llm.d.ts +13 -2
  279. package/dist/llm/llm.d.ts.map +1 -1
  280. package/dist/llm/llm.js +65 -11
  281. package/dist/llm/llm.js.map +1 -1
  282. package/dist/llm/provider_format/google.cjs +6 -2
  283. package/dist/llm/provider_format/google.cjs.map +1 -1
  284. package/dist/llm/provider_format/google.d.cts +1 -1
  285. package/dist/llm/provider_format/google.d.ts +1 -1
  286. package/dist/llm/provider_format/google.d.ts.map +1 -1
  287. package/dist/llm/provider_format/google.js +6 -2
  288. package/dist/llm/provider_format/google.js.map +1 -1
  289. package/dist/llm/provider_format/google.test.cjs +48 -0
  290. package/dist/llm/provider_format/google.test.cjs.map +1 -1
  291. package/dist/llm/provider_format/google.test.js +54 -1
  292. package/dist/llm/provider_format/google.test.js.map +1 -1
  293. package/dist/llm/provider_format/index.cjs +2 -0
  294. package/dist/llm/provider_format/index.cjs.map +1 -1
  295. package/dist/llm/provider_format/index.d.cts +2 -2
  296. package/dist/llm/provider_format/index.d.ts +2 -2
  297. package/dist/llm/provider_format/index.d.ts.map +1 -1
  298. package/dist/llm/provider_format/index.js +6 -1
  299. package/dist/llm/provider_format/index.js.map +1 -1
  300. package/dist/llm/provider_format/openai.cjs +126 -24
  301. package/dist/llm/provider_format/openai.cjs.map +1 -1
  302. package/dist/llm/provider_format/openai.d.cts +1 -0
  303. package/dist/llm/provider_format/openai.d.ts +1 -0
  304. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  305. package/dist/llm/provider_format/openai.js +124 -23
  306. package/dist/llm/provider_format/openai.js.map +1 -1
  307. package/dist/llm/provider_format/openai.test.cjs +393 -0
  308. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  309. package/dist/llm/provider_format/openai.test.js +400 -2
  310. package/dist/llm/provider_format/openai.test.js.map +1 -1
  311. package/dist/llm/provider_format/utils.cjs +5 -4
  312. package/dist/llm/provider_format/utils.cjs.map +1 -1
  313. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  314. package/dist/llm/provider_format/utils.js +5 -4
  315. package/dist/llm/provider_format/utils.js.map +1 -1
  316. package/dist/llm/realtime.cjs +3 -0
  317. package/dist/llm/realtime.cjs.map +1 -1
  318. package/dist/llm/realtime.d.cts +15 -1
  319. package/dist/llm/realtime.d.ts +15 -1
  320. package/dist/llm/realtime.d.ts.map +1 -1
  321. package/dist/llm/realtime.js +3 -0
  322. package/dist/llm/realtime.js.map +1 -1
  323. package/dist/llm/remote_chat_context.cjs.map +1 -1
  324. package/dist/llm/remote_chat_context.d.cts +2 -0
  325. package/dist/llm/remote_chat_context.d.ts +2 -0
  326. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  327. package/dist/llm/remote_chat_context.js.map +1 -1
  328. package/dist/llm/tool_context.cjs +50 -2
  329. package/dist/llm/tool_context.cjs.map +1 -1
  330. package/dist/llm/tool_context.d.cts +47 -11
  331. package/dist/llm/tool_context.d.ts +47 -11
  332. package/dist/llm/tool_context.d.ts.map +1 -1
  333. package/dist/llm/tool_context.js +48 -3
  334. package/dist/llm/tool_context.js.map +1 -1
  335. package/dist/llm/tool_context.test.cjs +197 -0
  336. package/dist/llm/tool_context.test.cjs.map +1 -1
  337. package/dist/llm/tool_context.test.js +175 -0
  338. package/dist/llm/tool_context.test.js.map +1 -1
  339. package/dist/llm/utils.cjs +107 -12
  340. package/dist/llm/utils.cjs.map +1 -1
  341. package/dist/llm/utils.d.cts +10 -3
  342. package/dist/llm/utils.d.ts +10 -3
  343. package/dist/llm/utils.d.ts.map +1 -1
  344. package/dist/llm/utils.js +106 -12
  345. package/dist/llm/utils.js.map +1 -1
  346. package/dist/llm/utils.test.cjs +90 -0
  347. package/dist/llm/utils.test.cjs.map +1 -1
  348. package/dist/llm/utils.test.js +98 -2
  349. package/dist/llm/utils.test.js.map +1 -1
  350. package/dist/llm/zod-utils.cjs +102 -0
  351. package/dist/llm/zod-utils.cjs.map +1 -0
  352. package/dist/llm/zod-utils.d.cts +65 -0
  353. package/dist/llm/zod-utils.d.ts +65 -0
  354. package/dist/llm/zod-utils.d.ts.map +1 -0
  355. package/dist/llm/zod-utils.js +64 -0
  356. package/dist/llm/zod-utils.js.map +1 -0
  357. package/dist/llm/zod-utils.test.cjs +472 -0
  358. package/dist/llm/zod-utils.test.cjs.map +1 -0
  359. package/dist/llm/zod-utils.test.js +455 -0
  360. package/dist/llm/zod-utils.test.js.map +1 -0
  361. package/dist/log.cjs +45 -14
  362. package/dist/log.cjs.map +1 -1
  363. package/dist/log.d.cts +8 -1
  364. package/dist/log.d.ts +8 -1
  365. package/dist/log.d.ts.map +1 -1
  366. package/dist/log.js +45 -15
  367. package/dist/log.js.map +1 -1
  368. package/dist/metrics/base.cjs.map +1 -1
  369. package/dist/metrics/base.d.cts +75 -19
  370. package/dist/metrics/base.d.ts +75 -19
  371. package/dist/metrics/base.d.ts.map +1 -1
  372. package/dist/metrics/index.cjs +5 -0
  373. package/dist/metrics/index.cjs.map +1 -1
  374. package/dist/metrics/index.d.cts +2 -1
  375. package/dist/metrics/index.d.ts +2 -1
  376. package/dist/metrics/index.d.ts.map +1 -1
  377. package/dist/metrics/index.js +6 -0
  378. package/dist/metrics/index.js.map +1 -1
  379. package/dist/metrics/model_usage.cjs +189 -0
  380. package/dist/metrics/model_usage.cjs.map +1 -0
  381. package/dist/metrics/model_usage.d.cts +92 -0
  382. package/dist/metrics/model_usage.d.ts +92 -0
  383. package/dist/metrics/model_usage.d.ts.map +1 -0
  384. package/dist/metrics/model_usage.js +164 -0
  385. package/dist/metrics/model_usage.js.map +1 -0
  386. package/dist/metrics/model_usage.test.cjs +474 -0
  387. package/dist/metrics/model_usage.test.cjs.map +1 -0
  388. package/dist/metrics/model_usage.test.js +476 -0
  389. package/dist/metrics/model_usage.test.js.map +1 -0
  390. package/dist/metrics/usage_collector.cjs +5 -2
  391. package/dist/metrics/usage_collector.cjs.map +1 -1
  392. package/dist/metrics/usage_collector.d.cts +10 -1
  393. package/dist/metrics/usage_collector.d.ts +10 -1
  394. package/dist/metrics/usage_collector.d.ts.map +1 -1
  395. package/dist/metrics/usage_collector.js +5 -2
  396. package/dist/metrics/usage_collector.js.map +1 -1
  397. package/dist/metrics/utils.cjs +23 -7
  398. package/dist/metrics/utils.cjs.map +1 -1
  399. package/dist/metrics/utils.d.ts.map +1 -1
  400. package/dist/metrics/utils.js +23 -7
  401. package/dist/metrics/utils.js.map +1 -1
  402. package/dist/stream/deferred_stream.cjs +31 -10
  403. package/dist/stream/deferred_stream.cjs.map +1 -1
  404. package/dist/stream/deferred_stream.d.cts +6 -1
  405. package/dist/stream/deferred_stream.d.ts +6 -1
  406. package/dist/stream/deferred_stream.d.ts.map +1 -1
  407. package/dist/stream/deferred_stream.js +31 -10
  408. package/dist/stream/deferred_stream.js.map +1 -1
  409. package/dist/stream/deferred_stream.test.cjs +2 -2
  410. package/dist/stream/deferred_stream.test.cjs.map +1 -1
  411. package/dist/stream/deferred_stream.test.js +2 -2
  412. package/dist/stream/deferred_stream.test.js.map +1 -1
  413. package/dist/stream/index.cjs +3 -0
  414. package/dist/stream/index.cjs.map +1 -1
  415. package/dist/stream/index.d.cts +1 -0
  416. package/dist/stream/index.d.ts +1 -0
  417. package/dist/stream/index.d.ts.map +1 -1
  418. package/dist/stream/index.js +2 -0
  419. package/dist/stream/index.js.map +1 -1
  420. package/dist/stream/multi_input_stream.cjs +139 -0
  421. package/dist/stream/multi_input_stream.cjs.map +1 -0
  422. package/dist/stream/multi_input_stream.d.cts +55 -0
  423. package/dist/stream/multi_input_stream.d.ts +55 -0
  424. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  425. package/dist/stream/multi_input_stream.js +115 -0
  426. package/dist/stream/multi_input_stream.js.map +1 -0
  427. package/dist/stream/multi_input_stream.test.cjs +344 -0
  428. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  429. package/dist/stream/multi_input_stream.test.js +343 -0
  430. package/dist/stream/multi_input_stream.test.js.map +1 -0
  431. package/dist/stream/stream_channel.cjs +39 -1
  432. package/dist/stream/stream_channel.cjs.map +1 -1
  433. package/dist/stream/stream_channel.d.cts +5 -2
  434. package/dist/stream/stream_channel.d.ts +5 -2
  435. package/dist/stream/stream_channel.d.ts.map +1 -1
  436. package/dist/stream/stream_channel.js +39 -1
  437. package/dist/stream/stream_channel.js.map +1 -1
  438. package/dist/stream/stream_channel.test.cjs +27 -0
  439. package/dist/stream/stream_channel.test.cjs.map +1 -1
  440. package/dist/stream/stream_channel.test.js +27 -0
  441. package/dist/stream/stream_channel.test.js.map +1 -1
  442. package/dist/stt/stream_adapter.cjs +24 -9
  443. package/dist/stt/stream_adapter.cjs.map +1 -1
  444. package/dist/stt/stream_adapter.d.cts +7 -3
  445. package/dist/stt/stream_adapter.d.ts +7 -3
  446. package/dist/stt/stream_adapter.d.ts.map +1 -1
  447. package/dist/stt/stream_adapter.js +24 -9
  448. package/dist/stt/stream_adapter.js.map +1 -1
  449. package/dist/stt/stt.cjs +94 -19
  450. package/dist/stt/stt.cjs.map +1 -1
  451. package/dist/stt/stt.d.cts +68 -5
  452. package/dist/stt/stt.d.ts +68 -5
  453. package/dist/stt/stt.d.ts.map +1 -1
  454. package/dist/stt/stt.js +96 -21
  455. package/dist/stt/stt.js.map +1 -1
  456. package/dist/telemetry/index.cjs +72 -0
  457. package/dist/telemetry/index.cjs.map +1 -0
  458. package/dist/telemetry/index.d.cts +7 -0
  459. package/dist/telemetry/index.d.ts +7 -0
  460. package/dist/telemetry/index.d.ts.map +1 -0
  461. package/dist/telemetry/index.js +37 -0
  462. package/dist/telemetry/index.js.map +1 -0
  463. package/dist/telemetry/logging.cjs +65 -0
  464. package/dist/telemetry/logging.cjs.map +1 -0
  465. package/dist/telemetry/logging.d.cts +21 -0
  466. package/dist/telemetry/logging.d.ts +21 -0
  467. package/dist/telemetry/logging.d.ts.map +1 -0
  468. package/dist/telemetry/logging.js +40 -0
  469. package/dist/telemetry/logging.js.map +1 -0
  470. package/dist/telemetry/otel_http_exporter.cjs +166 -0
  471. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  472. package/dist/telemetry/otel_http_exporter.d.cts +63 -0
  473. package/dist/telemetry/otel_http_exporter.d.ts +63 -0
  474. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  475. package/dist/telemetry/otel_http_exporter.js +142 -0
  476. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  477. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  478. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  479. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  480. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  481. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  482. package/dist/telemetry/pino_otel_transport.js +189 -0
  483. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  484. package/dist/telemetry/trace_types.cjs +233 -0
  485. package/dist/telemetry/trace_types.cjs.map +1 -0
  486. package/dist/telemetry/trace_types.d.cts +74 -0
  487. package/dist/telemetry/trace_types.d.ts +74 -0
  488. package/dist/telemetry/trace_types.d.ts.map +1 -0
  489. package/dist/telemetry/trace_types.js +141 -0
  490. package/dist/telemetry/trace_types.js.map +1 -0
  491. package/dist/telemetry/traces.cjs +484 -0
  492. package/dist/telemetry/traces.cjs.map +1 -0
  493. package/dist/telemetry/traces.d.cts +116 -0
  494. package/dist/telemetry/traces.d.ts +116 -0
  495. package/dist/telemetry/traces.d.ts.map +1 -0
  496. package/dist/telemetry/traces.js +449 -0
  497. package/dist/telemetry/traces.js.map +1 -0
  498. package/dist/telemetry/utils.cjs +86 -0
  499. package/dist/telemetry/utils.cjs.map +1 -0
  500. package/dist/telemetry/utils.d.cts +5 -0
  501. package/dist/telemetry/utils.d.ts +5 -0
  502. package/dist/telemetry/utils.d.ts.map +1 -0
  503. package/dist/telemetry/utils.js +51 -0
  504. package/dist/telemetry/utils.js.map +1 -0
  505. package/dist/tokenize/basic/sentence.cjs +3 -3
  506. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  507. package/dist/tokenize/basic/sentence.js +3 -3
  508. package/dist/tokenize/basic/sentence.js.map +1 -1
  509. package/dist/tokenize/tokenizer.test.cjs +3 -1
  510. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  511. package/dist/tokenize/tokenizer.test.js +3 -1
  512. package/dist/tokenize/tokenizer.test.js.map +1 -1
  513. package/dist/transcription.cjs.map +1 -1
  514. package/dist/transcription.d.cts +6 -0
  515. package/dist/transcription.d.ts +6 -0
  516. package/dist/transcription.d.ts.map +1 -1
  517. package/dist/transcription.js.map +1 -1
  518. package/dist/tts/fallback_adapter.cjs +472 -0
  519. package/dist/tts/fallback_adapter.cjs.map +1 -0
  520. package/dist/tts/fallback_adapter.d.cts +110 -0
  521. package/dist/tts/fallback_adapter.d.ts +110 -0
  522. package/dist/tts/fallback_adapter.d.ts.map +1 -0
  523. package/dist/tts/fallback_adapter.js +448 -0
  524. package/dist/tts/fallback_adapter.js.map +1 -0
  525. package/dist/tts/index.cjs +3 -0
  526. package/dist/tts/index.cjs.map +1 -1
  527. package/dist/tts/index.d.cts +1 -0
  528. package/dist/tts/index.d.ts +1 -0
  529. package/dist/tts/index.d.ts.map +1 -1
  530. package/dist/tts/index.js +2 -0
  531. package/dist/tts/index.js.map +1 -1
  532. package/dist/tts/stream_adapter.cjs +25 -8
  533. package/dist/tts/stream_adapter.cjs.map +1 -1
  534. package/dist/tts/stream_adapter.d.cts +6 -3
  535. package/dist/tts/stream_adapter.d.ts +6 -3
  536. package/dist/tts/stream_adapter.d.ts.map +1 -1
  537. package/dist/tts/stream_adapter.js +25 -8
  538. package/dist/tts/stream_adapter.js.map +1 -1
  539. package/dist/tts/tts.cjs +189 -57
  540. package/dist/tts/tts.cjs.map +1 -1
  541. package/dist/tts/tts.d.cts +58 -6
  542. package/dist/tts/tts.d.ts +58 -6
  543. package/dist/tts/tts.d.ts.map +1 -1
  544. package/dist/tts/tts.js +191 -59
  545. package/dist/tts/tts.js.map +1 -1
  546. package/dist/typed_promise.cjs +48 -0
  547. package/dist/typed_promise.cjs.map +1 -0
  548. package/dist/typed_promise.d.cts +24 -0
  549. package/dist/typed_promise.d.ts +24 -0
  550. package/dist/typed_promise.d.ts.map +1 -0
  551. package/dist/typed_promise.js +28 -0
  552. package/dist/typed_promise.js.map +1 -0
  553. package/dist/types.cjs +24 -32
  554. package/dist/types.cjs.map +1 -1
  555. package/dist/types.d.cts +45 -10
  556. package/dist/types.d.ts +45 -10
  557. package/dist/types.d.ts.map +1 -1
  558. package/dist/types.js +20 -30
  559. package/dist/types.js.map +1 -1
  560. package/dist/utils.cjs +124 -28
  561. package/dist/utils.cjs.map +1 -1
  562. package/dist/utils.d.cts +41 -1
  563. package/dist/utils.d.ts +41 -1
  564. package/dist/utils.d.ts.map +1 -1
  565. package/dist/utils.js +119 -27
  566. package/dist/utils.js.map +1 -1
  567. package/dist/utils.test.cjs +73 -1
  568. package/dist/utils.test.cjs.map +1 -1
  569. package/dist/utils.test.js +74 -10
  570. package/dist/utils.test.js.map +1 -1
  571. package/dist/vad.cjs +35 -15
  572. package/dist/vad.cjs.map +1 -1
  573. package/dist/vad.d.cts +15 -5
  574. package/dist/vad.d.ts +15 -5
  575. package/dist/vad.d.ts.map +1 -1
  576. package/dist/vad.js +35 -15
  577. package/dist/vad.js.map +1 -1
  578. package/dist/version.cjs +1 -1
  579. package/dist/version.cjs.map +1 -1
  580. package/dist/version.d.cts +1 -1
  581. package/dist/version.d.ts +1 -1
  582. package/dist/version.d.ts.map +1 -1
  583. package/dist/version.js +1 -1
  584. package/dist/version.js.map +1 -1
  585. package/dist/voice/agent.cjs +258 -35
  586. package/dist/voice/agent.cjs.map +1 -1
  587. package/dist/voice/agent.d.cts +54 -13
  588. package/dist/voice/agent.d.ts +54 -13
  589. package/dist/voice/agent.d.ts.map +1 -1
  590. package/dist/voice/agent.js +254 -34
  591. package/dist/voice/agent.js.map +1 -1
  592. package/dist/voice/agent.test.cjs +314 -0
  593. package/dist/voice/agent.test.cjs.map +1 -1
  594. package/dist/voice/agent.test.js +316 -2
  595. package/dist/voice/agent.test.js.map +1 -1
  596. package/dist/voice/agent_activity.cjs +1116 -385
  597. package/dist/voice/agent_activity.cjs.map +1 -1
  598. package/dist/voice/agent_activity.d.cts +72 -11
  599. package/dist/voice/agent_activity.d.ts +72 -11
  600. package/dist/voice/agent_activity.d.ts.map +1 -1
  601. package/dist/voice/agent_activity.js +1119 -383
  602. package/dist/voice/agent_activity.js.map +1 -1
  603. package/dist/voice/agent_activity.test.cjs +135 -0
  604. package/dist/voice/agent_activity.test.cjs.map +1 -0
  605. package/dist/voice/agent_activity.test.js +134 -0
  606. package/dist/voice/agent_activity.test.js.map +1 -0
  607. package/dist/voice/agent_session.cjs +550 -90
  608. package/dist/voice/agent_session.cjs.map +1 -1
  609. package/dist/voice/agent_session.d.cts +185 -25
  610. package/dist/voice/agent_session.d.ts +185 -25
  611. package/dist/voice/agent_session.d.ts.map +1 -1
  612. package/dist/voice/agent_session.js +556 -91
  613. package/dist/voice/agent_session.js.map +1 -1
  614. package/dist/voice/audio_recognition.cjs +605 -46
  615. package/dist/voice/audio_recognition.cjs.map +1 -1
  616. package/dist/voice/audio_recognition.d.cts +96 -4
  617. package/dist/voice/audio_recognition.d.ts +96 -4
  618. package/dist/voice/audio_recognition.d.ts.map +1 -1
  619. package/dist/voice/audio_recognition.js +611 -47
  620. package/dist/voice/audio_recognition.js.map +1 -1
  621. package/dist/voice/audio_recognition_span.test.cjs +295 -0
  622. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  623. package/dist/voice/audio_recognition_span.test.js +299 -0
  624. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  625. package/dist/voice/avatar/datastream_io.cjs +7 -1
  626. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  627. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  628. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  629. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  630. package/dist/voice/avatar/datastream_io.js +7 -1
  631. package/dist/voice/avatar/datastream_io.js.map +1 -1
  632. package/dist/voice/background_audio.cjs +367 -0
  633. package/dist/voice/background_audio.cjs.map +1 -0
  634. package/dist/voice/background_audio.d.cts +123 -0
  635. package/dist/voice/background_audio.d.ts +123 -0
  636. package/dist/voice/background_audio.d.ts.map +1 -0
  637. package/dist/voice/background_audio.js +343 -0
  638. package/dist/voice/background_audio.js.map +1 -0
  639. package/dist/voice/events.cjs +3 -0
  640. package/dist/voice/events.cjs.map +1 -1
  641. package/dist/voice/events.d.cts +16 -9
  642. package/dist/voice/events.d.ts +16 -9
  643. package/dist/voice/events.d.ts.map +1 -1
  644. package/dist/voice/events.js +3 -0
  645. package/dist/voice/events.js.map +1 -1
  646. package/dist/voice/generation.cjs +205 -41
  647. package/dist/voice/generation.cjs.map +1 -1
  648. package/dist/voice/generation.d.cts +21 -5
  649. package/dist/voice/generation.d.ts +21 -5
  650. package/dist/voice/generation.d.ts.map +1 -1
  651. package/dist/voice/generation.js +215 -43
  652. package/dist/voice/generation.js.map +1 -1
  653. package/dist/voice/generation_tools.test.cjs +236 -0
  654. package/dist/voice/generation_tools.test.cjs.map +1 -0
  655. package/dist/voice/generation_tools.test.js +235 -0
  656. package/dist/voice/generation_tools.test.js.map +1 -0
  657. package/dist/voice/index.cjs +33 -2
  658. package/dist/voice/index.cjs.map +1 -1
  659. package/dist/voice/index.d.cts +8 -2
  660. package/dist/voice/index.d.ts +8 -2
  661. package/dist/voice/index.d.ts.map +1 -1
  662. package/dist/voice/index.js +19 -2
  663. package/dist/voice/index.js.map +1 -1
  664. package/dist/voice/interruption_detection.test.cjs +114 -0
  665. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  666. package/dist/voice/interruption_detection.test.js +113 -0
  667. package/dist/voice/interruption_detection.test.js.map +1 -0
  668. package/dist/voice/io.cjs +66 -6
  669. package/dist/voice/io.cjs.map +1 -1
  670. package/dist/voice/io.d.cts +67 -7
  671. package/dist/voice/io.d.ts +67 -7
  672. package/dist/voice/io.d.ts.map +1 -1
  673. package/dist/voice/io.js +62 -5
  674. package/dist/voice/io.js.map +1 -1
  675. package/dist/voice/recorder_io/index.cjs +23 -0
  676. package/dist/voice/recorder_io/index.cjs.map +1 -0
  677. package/dist/voice/recorder_io/index.d.cts +2 -0
  678. package/dist/voice/recorder_io/index.d.ts +2 -0
  679. package/dist/voice/recorder_io/index.d.ts.map +1 -0
  680. package/dist/voice/recorder_io/index.js +2 -0
  681. package/dist/voice/recorder_io/index.js.map +1 -0
  682. package/dist/voice/recorder_io/recorder_io.cjs +607 -0
  683. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
  684. package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
  685. package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
  686. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
  687. package/dist/voice/recorder_io/recorder_io.js +573 -0
  688. package/dist/voice/recorder_io/recorder_io.js.map +1 -0
  689. package/dist/voice/remote_session.cjs +922 -0
  690. package/dist/voice/remote_session.cjs.map +1 -0
  691. package/dist/voice/remote_session.d.cts +108 -0
  692. package/dist/voice/remote_session.d.ts +108 -0
  693. package/dist/voice/remote_session.d.ts.map +1 -0
  694. package/dist/voice/remote_session.js +887 -0
  695. package/dist/voice/remote_session.js.map +1 -0
  696. package/dist/voice/report.cjs +88 -0
  697. package/dist/voice/report.cjs.map +1 -0
  698. package/dist/voice/report.d.cts +49 -0
  699. package/dist/voice/report.d.ts +49 -0
  700. package/dist/voice/report.d.ts.map +1 -0
  701. package/dist/voice/report.js +63 -0
  702. package/dist/voice/report.js.map +1 -0
  703. package/dist/voice/report.test.cjs +121 -0
  704. package/dist/voice/report.test.cjs.map +1 -0
  705. package/dist/voice/report.test.js +120 -0
  706. package/dist/voice/report.test.js.map +1 -0
  707. package/dist/voice/room_io/_input.cjs +40 -7
  708. package/dist/voice/room_io/_input.cjs.map +1 -1
  709. package/dist/voice/room_io/_input.d.cts +5 -2
  710. package/dist/voice/room_io/_input.d.ts +5 -2
  711. package/dist/voice/room_io/_input.d.ts.map +1 -1
  712. package/dist/voice/room_io/_input.js +41 -8
  713. package/dist/voice/room_io/_input.js.map +1 -1
  714. package/dist/voice/room_io/_output.cjs +19 -11
  715. package/dist/voice/room_io/_output.cjs.map +1 -1
  716. package/dist/voice/room_io/_output.d.cts +7 -4
  717. package/dist/voice/room_io/_output.d.ts +7 -4
  718. package/dist/voice/room_io/_output.d.ts.map +1 -1
  719. package/dist/voice/room_io/_output.js +20 -12
  720. package/dist/voice/room_io/_output.js.map +1 -1
  721. package/dist/voice/room_io/room_io.cjs +33 -6
  722. package/dist/voice/room_io/room_io.cjs.map +1 -1
  723. package/dist/voice/room_io/room_io.d.cts +29 -9
  724. package/dist/voice/room_io/room_io.d.ts +29 -9
  725. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  726. package/dist/voice/room_io/room_io.js +33 -7
  727. package/dist/voice/room_io/room_io.js.map +1 -1
  728. package/dist/voice/speech_handle.cjs +22 -4
  729. package/dist/voice/speech_handle.cjs.map +1 -1
  730. package/dist/voice/speech_handle.d.cts +17 -2
  731. package/dist/voice/speech_handle.d.ts +17 -2
  732. package/dist/voice/speech_handle.d.ts.map +1 -1
  733. package/dist/voice/speech_handle.js +21 -4
  734. package/dist/voice/speech_handle.js.map +1 -1
  735. package/dist/voice/testing/fake_llm.cjs +127 -0
  736. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  737. package/dist/voice/testing/fake_llm.d.cts +30 -0
  738. package/dist/voice/testing/fake_llm.d.ts +30 -0
  739. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  740. package/dist/voice/testing/fake_llm.js +103 -0
  741. package/dist/voice/testing/fake_llm.js.map +1 -0
  742. package/dist/voice/testing/index.cjs +57 -0
  743. package/dist/voice/testing/index.cjs.map +1 -0
  744. package/dist/voice/testing/index.d.cts +21 -0
  745. package/dist/voice/testing/index.d.ts +21 -0
  746. package/dist/voice/testing/index.d.ts.map +1 -0
  747. package/dist/voice/testing/index.js +35 -0
  748. package/dist/voice/testing/index.js.map +1 -0
  749. package/dist/voice/testing/run_result.cjs +817 -0
  750. package/dist/voice/testing/run_result.cjs.map +1 -0
  751. package/dist/voice/testing/run_result.d.cts +385 -0
  752. package/dist/voice/testing/run_result.d.ts +385 -0
  753. package/dist/voice/testing/run_result.d.ts.map +1 -0
  754. package/dist/voice/testing/run_result.js +790 -0
  755. package/dist/voice/testing/run_result.js.map +1 -0
  756. package/dist/voice/testing/types.cjs +46 -0
  757. package/dist/voice/testing/types.cjs.map +1 -0
  758. package/dist/voice/testing/types.d.cts +83 -0
  759. package/dist/voice/testing/types.d.ts +83 -0
  760. package/dist/voice/testing/types.d.ts.map +1 -0
  761. package/dist/voice/testing/types.js +19 -0
  762. package/dist/voice/testing/types.js.map +1 -0
  763. package/dist/voice/transcription/synchronizer.cjs +139 -15
  764. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  765. package/dist/voice/transcription/synchronizer.d.cts +35 -4
  766. package/dist/voice/transcription/synchronizer.d.ts +35 -4
  767. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  768. package/dist/voice/transcription/synchronizer.js +143 -16
  769. package/dist/voice/transcription/synchronizer.js.map +1 -1
  770. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  771. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  772. package/dist/voice/transcription/synchronizer.test.js +150 -0
  773. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  774. package/dist/voice/turn_config/endpointing.cjs +33 -0
  775. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  776. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  777. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  778. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  779. package/dist/voice/turn_config/endpointing.js +9 -0
  780. package/dist/voice/turn_config/endpointing.js.map +1 -0
  781. package/dist/voice/turn_config/interruption.cjs +37 -0
  782. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  783. package/dist/voice/turn_config/interruption.d.cts +53 -0
  784. package/dist/voice/turn_config/interruption.d.ts +53 -0
  785. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  786. package/dist/voice/turn_config/interruption.js +13 -0
  787. package/dist/voice/turn_config/interruption.js.map +1 -0
  788. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  789. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  790. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  791. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  792. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  793. package/dist/voice/turn_config/turn_handling.js +11 -0
  794. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  795. package/dist/voice/turn_config/utils.cjs +157 -0
  796. package/dist/voice/turn_config/utils.cjs.map +1 -0
  797. package/dist/voice/turn_config/utils.d.cts +37 -0
  798. package/dist/voice/turn_config/utils.d.ts +37 -0
  799. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  800. package/dist/voice/turn_config/utils.js +131 -0
  801. package/dist/voice/turn_config/utils.js.map +1 -0
  802. package/dist/voice/turn_config/utils.test.cjs +128 -0
  803. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  804. package/dist/voice/turn_config/utils.test.js +127 -0
  805. package/dist/voice/turn_config/utils.test.js.map +1 -0
  806. package/dist/voice/utils.cjs +47 -0
  807. package/dist/voice/utils.cjs.map +1 -0
  808. package/dist/voice/utils.d.cts +4 -0
  809. package/dist/voice/utils.d.ts +4 -0
  810. package/dist/voice/utils.d.ts.map +1 -0
  811. package/dist/voice/utils.js +23 -0
  812. package/dist/voice/utils.js.map +1 -0
  813. package/dist/worker.cjs +44 -52
  814. package/dist/worker.cjs.map +1 -1
  815. package/dist/worker.d.cts +18 -8
  816. package/dist/worker.d.ts +18 -8
  817. package/dist/worker.d.ts.map +1 -1
  818. package/dist/worker.js +43 -43
  819. package/dist/worker.js.map +1 -1
  820. package/package.json +35 -13
  821. package/resources/NOTICE +2 -0
  822. package/resources/keyboard-typing.ogg +0 -0
  823. package/resources/keyboard-typing2.ogg +0 -0
  824. package/resources/office-ambience.ogg +0 -0
  825. package/src/_exceptions.ts +5 -0
  826. package/src/audio.ts +132 -1
  827. package/src/beta/index.ts +9 -0
  828. package/src/beta/workflows/index.ts +9 -0
  829. package/src/beta/workflows/task_group.ts +203 -0
  830. package/src/cli.ts +57 -66
  831. package/src/connection_pool.test.ts +346 -0
  832. package/src/connection_pool.ts +307 -0
  833. package/src/constants.ts +14 -0
  834. package/src/cpu.test.ts +239 -0
  835. package/src/cpu.ts +173 -0
  836. package/src/http_server.ts +18 -6
  837. package/src/index.ts +15 -13
  838. package/src/inference/api_protos.ts +85 -2
  839. package/src/inference/index.ts +32 -4
  840. package/src/inference/interruption/defaults.ts +51 -0
  841. package/src/inference/interruption/errors.ts +25 -0
  842. package/src/inference/interruption/http_transport.ts +207 -0
  843. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  844. package/src/inference/interruption/interruption_detector.ts +204 -0
  845. package/src/inference/interruption/interruption_stream.ts +467 -0
  846. package/src/inference/interruption/types.ts +84 -0
  847. package/src/inference/interruption/utils.test.ts +132 -0
  848. package/src/inference/interruption/utils.ts +137 -0
  849. package/src/inference/interruption/ws_transport.ts +416 -0
  850. package/src/inference/llm.ts +214 -163
  851. package/src/inference/stt.test.ts +253 -0
  852. package/src/inference/stt.ts +449 -208
  853. package/src/inference/tts.test.ts +354 -0
  854. package/src/inference/tts.ts +417 -115
  855. package/src/inference/utils.ts +30 -2
  856. package/src/ipc/inference_proc_executor.ts +11 -3
  857. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  858. package/src/ipc/job_proc_executor.ts +11 -1
  859. package/src/ipc/job_proc_lazy_main.ts +86 -20
  860. package/src/ipc/supervised_proc.test.ts +153 -0
  861. package/src/ipc/supervised_proc.ts +39 -10
  862. package/src/job.ts +120 -1
  863. package/src/language.test.ts +62 -0
  864. package/src/language.ts +380 -0
  865. package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
  866. package/src/llm/chat_context.test.ts +787 -0
  867. package/src/llm/chat_context.ts +493 -2
  868. package/src/llm/fallback_adapter.test.ts +238 -0
  869. package/src/llm/fallback_adapter.ts +394 -0
  870. package/src/llm/index.ts +13 -0
  871. package/src/llm/llm.ts +77 -12
  872. package/src/llm/provider_format/google.test.ts +72 -1
  873. package/src/llm/provider_format/google.ts +10 -6
  874. package/src/llm/provider_format/index.ts +7 -2
  875. package/src/llm/provider_format/openai.test.ts +480 -2
  876. package/src/llm/provider_format/openai.ts +152 -21
  877. package/src/llm/provider_format/utils.ts +11 -5
  878. package/src/llm/realtime.ts +23 -2
  879. package/src/llm/remote_chat_context.ts +2 -2
  880. package/src/llm/tool_context.test.ts +210 -1
  881. package/src/llm/tool_context.ts +115 -17
  882. package/src/llm/utils.test.ts +103 -2
  883. package/src/llm/utils.ts +152 -16
  884. package/src/llm/zod-utils.test.ts +577 -0
  885. package/src/llm/zod-utils.ts +153 -0
  886. package/src/log.ts +71 -19
  887. package/src/metrics/base.ts +78 -19
  888. package/src/metrics/index.ts +12 -0
  889. package/src/metrics/model_usage.test.ts +545 -0
  890. package/src/metrics/model_usage.ts +262 -0
  891. package/src/metrics/usage_collector.ts +14 -3
  892. package/src/metrics/utils.ts +27 -7
  893. package/src/stream/deferred_stream.test.ts +3 -3
  894. package/src/stream/deferred_stream.ts +43 -11
  895. package/src/stream/index.ts +1 -0
  896. package/src/stream/multi_input_stream.test.ts +545 -0
  897. package/src/stream/multi_input_stream.ts +172 -0
  898. package/src/stream/stream_channel.test.ts +37 -0
  899. package/src/stream/stream_channel.ts +43 -3
  900. package/src/stt/stream_adapter.ts +30 -9
  901. package/src/stt/stt.ts +140 -23
  902. package/src/telemetry/index.ts +28 -0
  903. package/src/telemetry/logging.ts +55 -0
  904. package/src/telemetry/otel_http_exporter.ts +218 -0
  905. package/src/telemetry/pino_otel_transport.ts +265 -0
  906. package/src/telemetry/trace_types.ts +109 -0
  907. package/src/telemetry/traces.ts +673 -0
  908. package/src/telemetry/utils.ts +61 -0
  909. package/src/tokenize/basic/sentence.ts +3 -3
  910. package/src/tokenize/tokenizer.test.ts +4 -0
  911. package/src/transcription.ts +6 -0
  912. package/src/tts/fallback_adapter.ts +586 -0
  913. package/src/tts/index.ts +1 -0
  914. package/src/tts/stream_adapter.ts +38 -8
  915. package/src/tts/tts.ts +245 -62
  916. package/src/typed_promise.ts +67 -0
  917. package/src/types.ts +62 -33
  918. package/src/utils.test.ts +90 -10
  919. package/src/utils.ts +178 -33
  920. package/src/vad.ts +42 -18
  921. package/src/version.ts +1 -1
  922. package/src/voice/agent.test.ts +347 -2
  923. package/src/voice/agent.ts +346 -44
  924. package/src/voice/agent_activity.test.ts +194 -0
  925. package/src/voice/agent_activity.ts +1457 -388
  926. package/src/voice/agent_session.ts +817 -112
  927. package/src/voice/audio_recognition.ts +845 -70
  928. package/src/voice/audio_recognition_span.test.ts +341 -0
  929. package/src/voice/avatar/datastream_io.ts +9 -1
  930. package/src/voice/background_audio.ts +494 -0
  931. package/src/voice/events.ts +27 -7
  932. package/src/voice/generation.ts +310 -56
  933. package/src/voice/generation_tools.test.ts +268 -0
  934. package/src/voice/index.ts +17 -3
  935. package/src/voice/interruption_detection.test.ts +151 -0
  936. package/src/voice/io.ts +115 -12
  937. package/src/voice/recorder_io/index.ts +4 -0
  938. package/src/voice/recorder_io/recorder_io.ts +783 -0
  939. package/src/voice/remote_session.ts +1083 -0
  940. package/src/voice/report.test.ts +136 -0
  941. package/src/voice/report.ts +140 -0
  942. package/src/voice/room_io/_input.ts +45 -10
  943. package/src/voice/room_io/_output.ts +26 -14
  944. package/src/voice/room_io/room_io.ts +67 -22
  945. package/src/voice/speech_handle.ts +38 -6
  946. package/src/voice/testing/fake_llm.ts +138 -0
  947. package/src/voice/testing/index.ts +52 -0
  948. package/src/voice/testing/run_result.ts +995 -0
  949. package/src/voice/testing/types.ts +118 -0
  950. package/src/voice/transcription/synchronizer.test.ts +206 -0
  951. package/src/voice/transcription/synchronizer.ts +204 -19
  952. package/src/voice/turn_config/endpointing.ts +33 -0
  953. package/src/voice/turn_config/interruption.ts +56 -0
  954. package/src/voice/turn_config/turn_handling.ts +45 -0
  955. package/src/voice/turn_config/utils.test.ts +148 -0
  956. package/src/voice/turn_config/utils.ts +167 -0
  957. package/src/voice/utils.ts +29 -0
  958. package/src/worker.ts +92 -78
  959. package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
@@ -3,13 +3,19 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import { Mutex } from '@livekit/mutex';
5
5
  import type { AudioFrame } from '@livekit/rtc-node';
6
+ import type { Span } from '@opentelemetry/api';
7
+ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api';
6
8
  import { Heap } from 'heap-js';
7
9
  import { AsyncLocalStorage } from 'node:async_hooks';
8
- import { ReadableStream } from 'node:stream/web';
9
- import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
10
+ import { ReadableStream, TransformStream } from 'node:stream/web';
11
+ import type { InterruptionDetectionError } from '../inference/interruption/errors.js';
12
+ import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
13
+ import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
14
+ import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js';
10
15
  import {
11
16
  type ChatItem,
12
17
  type FunctionCall,
18
+ type FunctionCallOutput,
13
19
  type GenerationCreatedEvent,
14
20
  type InputSpeechStartedEvent,
15
21
  type InputSpeechStoppedEvent,
@@ -20,31 +26,41 @@ import {
20
26
  type RealtimeSession,
21
27
  type ToolChoice,
22
28
  type ToolContext,
29
+ ToolFlag,
23
30
  } from '../llm/index.js';
24
31
  import type { LLMError } from '../llm/llm.js';
32
+ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
25
33
  import { log } from '../log.js';
26
34
  import type {
27
35
  EOUMetrics,
36
+ InterruptionMetrics,
28
37
  LLMMetrics,
29
38
  RealtimeModelMetrics,
30
39
  STTMetrics,
31
40
  TTSMetrics,
32
41
  VADMetrics,
33
42
  } from '../metrics/base.js';
34
- import { DeferredReadableStream } from '../stream/deferred_stream.js';
43
+ import { MultiInputStream } from '../stream/multi_input_stream.js';
35
44
  import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
45
+ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
36
46
  import { splitWords } from '../tokenize/basic/word.js';
37
47
  import { TTS, type TTSError } from '../tts/tts.js';
38
- import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
48
+ import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from '../utils.js';
39
49
  import { VAD, type VADEvent } from '../vad.js';
40
50
  import type { Agent, ModelSettings } from './agent.js';
41
- import { StopResponse, asyncLocalStorage } from './agent.js';
51
+ import {
52
+ StopResponse,
53
+ _getActivityTaskInfo,
54
+ _setActivityTaskInfo,
55
+ functionCallStorage,
56
+ speechHandleStorage,
57
+ } from './agent.js';
42
58
  import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
43
59
  import {
44
60
  AudioRecognition,
45
61
  type EndOfTurnInfo,
62
+ type PreemptiveGenerationInfo,
46
63
  type RecognitionHooks,
47
- type _TurnDetector,
48
64
  } from './audio_recognition.js';
49
65
  import {
50
66
  AgentSessionEventTypes,
@@ -54,7 +70,7 @@ import {
54
70
  createSpeechCreatedEvent,
55
71
  createUserInputTranscribedEvent,
56
72
  } from './events.js';
57
- import type { ToolExecutionOutput } from './generation.js';
73
+ import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js';
58
74
  import {
59
75
  type _AudioOut,
60
76
  type _TextOut,
@@ -66,34 +82,105 @@ import {
66
82
  removeInstructions,
67
83
  updateInstructions,
68
84
  } from './generation.js';
85
+ import type { TimedString } from './io.js';
69
86
  import { SpeechHandle } from './speech_handle.js';
87
+ import { setParticipantSpanAttributes } from './utils.js';
88
+
89
+ export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
90
+ export const onEnterStorage = new AsyncLocalStorage<OnEnterData>();
91
+
92
+ interface OnEnterData {
93
+ session: AgentSession;
94
+ agent: Agent;
95
+ }
70
96
 
71
- // equivalent to Python's contextvars
72
- const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();
97
+ interface PreemptiveGeneration {
98
+ speechHandle: SpeechHandle;
99
+ userMessage: ChatMessage;
100
+ info: PreemptiveGenerationInfo;
101
+ chatCtx: ChatContext;
102
+ tools: ToolContext;
103
+ toolChoice: ToolChoice | null;
104
+ createdAt: number;
105
+ }
73
106
 
107
+ // TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
74
108
  export class AgentActivity implements RecognitionHooks {
109
+ agent: Agent;
110
+ agentSession: AgentSession;
111
+
75
112
  private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
113
+
76
114
  private started = false;
77
115
  private audioRecognition?: AudioRecognition;
78
116
  private realtimeSession?: RealtimeSession;
79
- private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
117
+ private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
118
+ private turnDetectionMode?: TurnDetectionMode;
80
119
  private logger = log();
81
- private _draining = false;
120
+ private _schedulingPaused = true;
121
+ private _drainBlockedTasks: Task<any>[] = [];
82
122
  private _currentSpeech?: SpeechHandle;
83
123
  private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
84
124
  private q_updated: Future;
85
125
  private speechTasks: Set<Task<void>> = new Set();
86
126
  private lock = new Mutex();
87
- private audioStream = new DeferredReadableStream<AudioFrame>();
127
+ private audioStream = new MultiInputStream<AudioFrame>();
128
+ private audioStreamId?: string;
129
+
88
130
  // default to null as None, which maps to the default provider tool choice value
89
131
  private toolChoice: ToolChoice | null = null;
132
+ private _preemptiveGeneration?: PreemptiveGeneration;
133
+ private interruptionDetector?: AdaptiveInterruptionDetector;
134
+ private isInterruptionDetectionEnabled: boolean;
135
+ private isInterruptionByAudioActivityEnabled: boolean;
136
+ private isDefaultInterruptionByAudioActivityEnabled: boolean;
90
137
 
91
- agent: Agent;
92
- agentSession: AgentSession;
138
+ private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void =>
139
+ this.onGenerationCreated(ev);
140
+
141
+ private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void =>
142
+ this.onInputSpeechStarted(ev);
143
+
144
+ private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void =>
145
+ this.onInputSpeechStopped(ev);
146
+
147
+ private readonly onRealtimeInputAudioTranscriptionCompleted = (
148
+ ev: InputTranscriptionCompleted,
149
+ ): void => this.onInputAudioTranscriptionCompleted(ev);
150
+
151
+ private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void =>
152
+ this.onError(ev);
153
+
154
+ private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
155
+ this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
156
+ };
157
+
158
+ private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => {
159
+ this.agentSession._usageCollector.collect(ev);
160
+ this.agentSession.emit(
161
+ AgentSessionEventTypes.MetricsCollected,
162
+ createMetricsCollectedEvent({ metrics: ev }),
163
+ );
164
+ };
165
+
166
+ private readonly onInterruptionError = (ev: InterruptionDetectionError): void => {
167
+ const errorEvent = createErrorEvent(ev, this.interruptionDetector);
168
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
169
+
170
+ if (!ev.recoverable) {
171
+ this.agentSession._onError(ev);
172
+ this.fallbackToVadInterruption();
173
+ return;
174
+ }
175
+
176
+ this.agentSession._onError(ev);
177
+ };
93
178
 
94
179
  /** @internal */
95
180
  _mainTask?: Task<void>;
96
- _userTurnCompletedTask?: Promise<void>;
181
+ _onEnterTask?: Task<void>;
182
+ _onExitTask?: Task<void>;
183
+ _userTurnCompletedTask?: Task<void>;
97
184
 
98
185
  constructor(agent: Agent, agentSession: AgentSession) {
99
186
  this.agent = agent;
@@ -114,7 +201,7 @@ export class AgentActivity implements RecognitionHooks {
114
201
 
115
202
  if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
116
203
  this.logger.warn(
117
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting',
204
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
118
205
  );
119
206
  this.turnDetectionMode = undefined;
120
207
  }
@@ -177,104 +264,172 @@ export class AgentActivity implements RecognitionHooks {
177
264
  if (
178
265
  !this.vad &&
179
266
  this.stt &&
267
+ !this.stt.capabilities.streaming &&
180
268
  this.llm instanceof LLM &&
181
269
  this.allowInterruptions &&
182
270
  this.turnDetectionMode === undefined
183
271
  ) {
184
272
  this.logger.warn(
185
- 'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
273
+ 'VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT ' +
186
274
  'for more responsive interruption handling.',
187
275
  );
188
276
  }
277
+
278
+ this.interruptionDetector = this.resolveInterruptionDetector();
279
+ this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
280
+
281
+ // this allows taking over audio interruption temporarily until interruption is detected
282
+ // by default is is ture unless turnDetection is manual or realtime_llm
283
+ this.isInterruptionByAudioActivityEnabled =
284
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
285
+
286
+ this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled;
189
287
  }
190
288
 
191
289
  async start(): Promise<void> {
192
290
  const unlock = await this.lock.lock();
193
291
  try {
194
- this.agent._agentActivity = this;
195
-
196
- if (this.llm instanceof RealtimeModel) {
197
- this.realtimeSession = this.llm.session();
198
- this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
199
- this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
200
- this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
201
- this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
202
- this.onInputAudioTranscriptionCompleted(ev),
203
- );
204
- this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
205
- this.realtimeSession.on('error', (ev) => this.onError(ev));
206
-
207
- removeInstructions(this.agent._chatCtx);
208
- try {
209
- await this.realtimeSession.updateInstructions(this.agent.instructions);
210
- } catch (error) {
211
- this.logger.error(error, 'failed to update the instructions');
212
- }
292
+ await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true });
293
+ } finally {
294
+ unlock();
295
+ }
296
+ }
213
297
 
214
- try {
215
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
216
- } catch (error) {
217
- this.logger.error(error, 'failed to update the chat context');
218
- }
298
+ async resume(): Promise<void> {
299
+ const unlock = await this.lock.lock();
300
+ try {
301
+ await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false });
302
+ } finally {
303
+ unlock();
304
+ }
305
+ }
219
306
 
220
- try {
221
- await this.realtimeSession.updateTools(this.tools);
222
- } catch (error) {
223
- this.logger.error(error, 'failed to update the tools');
224
- }
225
- } else if (this.llm instanceof LLM) {
226
- try {
227
- updateInstructions({
228
- chatCtx: this.agent._chatCtx,
229
- instructions: this.agent.instructions,
230
- addIfMissing: true,
231
- });
232
- } catch (error) {
233
- this.logger.error('failed to update the instructions', error);
234
- }
235
- }
307
+ private async _startSession(options: {
308
+ spanName: 'start_agent_activity' | 'resume_agent_activity';
309
+ runOnEnter: boolean;
310
+ }): Promise<void> {
311
+ const { spanName, runOnEnter } = options;
312
+ const startSpan = tracer.startSpan({
313
+ name: spanName,
314
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
315
+ context: ROOT_CONTEXT,
316
+ });
317
+
318
+ this.agent._agentActivity = this;
319
+
320
+ if (this.llm instanceof RealtimeModel) {
321
+ this.realtimeSession = this.llm.session();
322
+ this.realtimeSpans = new Map<string, Span>();
323
+ this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated);
324
+ this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted);
325
+ this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped);
326
+ this.realtimeSession.on(
327
+ 'input_audio_transcription_completed',
328
+ this.onRealtimeInputAudioTranscriptionCompleted,
329
+ );
330
+ this.realtimeSession.on('metrics_collected', this.onMetricsCollected);
331
+ this.realtimeSession.on('error', this.onModelError);
236
332
 
237
- // metrics and error handling
238
- if (this.llm instanceof LLM) {
239
- this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
240
- this.llm.on('error', (ev) => this.onError(ev));
333
+ removeInstructions(this.agent._chatCtx);
334
+ try {
335
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
336
+ } catch (error) {
337
+ this.logger.error(error, 'failed to update the instructions');
241
338
  }
242
339
 
243
- if (this.stt instanceof STT) {
244
- this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
245
- this.stt.on('error', (ev) => this.onError(ev));
340
+ try {
341
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
342
+ } catch (error) {
343
+ this.logger.error(error, 'failed to update the chat context');
246
344
  }
247
345
 
248
- if (this.tts instanceof TTS) {
249
- this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
250
- this.tts.on('error', (ev) => this.onError(ev));
346
+ try {
347
+ await this.realtimeSession.updateTools(this.tools);
348
+ } catch (error) {
349
+ this.logger.error(error, 'failed to update the tools');
251
350
  }
252
351
 
253
- if (this.vad instanceof VAD) {
254
- this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
352
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
353
+ this.logger.error(
354
+ 'audio output is enabled but RealtimeModel has no audio modality ' +
355
+ 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
356
+ 'or set a TTS model.',
357
+ );
358
+ }
359
+ } else if (this.llm instanceof LLM) {
360
+ try {
361
+ updateInstructions({
362
+ chatCtx: this.agent._chatCtx,
363
+ instructions: this.agent.instructions,
364
+ addIfMissing: true,
365
+ });
366
+ } catch (error) {
367
+ this.logger.error('failed to update the instructions', error);
255
368
  }
369
+ }
256
370
 
257
- this.audioRecognition = new AudioRecognition({
258
- recognitionHooks: this,
259
- // Disable stt node if stt is not provided
260
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
261
- vad: this.vad,
262
- turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
263
- turnDetectionMode: this.turnDetectionMode,
264
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
265
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
266
- });
267
- this.audioRecognition.start();
268
- this.started = true;
371
+ // TODO(parity): Record initial AgentConfigUpdate in chat context
269
372
 
270
- this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
271
- this.createSpeechTask({
272
- task: Task.from(() => this.agent.onEnter()),
373
+ // metrics and error handling
374
+ if (this.llm instanceof LLM) {
375
+ this.llm.on('metrics_collected', this.onMetricsCollected);
376
+ this.llm.on('error', this.onModelError);
377
+ }
378
+
379
+ if (this.stt instanceof STT) {
380
+ this.stt.on('metrics_collected', this.onMetricsCollected);
381
+ this.stt.on('error', this.onModelError);
382
+ }
383
+
384
+ if (this.tts instanceof TTS) {
385
+ this.tts.on('metrics_collected', this.onMetricsCollected);
386
+ this.tts.on('error', this.onModelError);
387
+ }
388
+
389
+ if (this.vad instanceof VAD) {
390
+ this.vad.on('metrics_collected', this.onMetricsCollected);
391
+ }
392
+
393
+ this.audioRecognition = new AudioRecognition({
394
+ recognitionHooks: this,
395
+ // Disable stt node if stt is not provided
396
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
397
+ vad: this.vad,
398
+ turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
399
+ turnDetectionMode: this.turnDetectionMode,
400
+ interruptionDetection: this.interruptionDetector,
401
+ minEndpointingDelay:
402
+ this.agent.turnHandling?.endpointing?.minDelay ??
403
+ this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
404
+ maxEndpointingDelay:
405
+ this.agent.turnHandling?.endpointing?.maxDelay ??
406
+ this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
407
+ rootSpanContext: this.agentSession.rootSpanContext,
408
+ sttModel: this.stt?.label,
409
+ sttProvider: this.getSttProvider(),
410
+ getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
411
+ });
412
+ this.audioRecognition.start();
413
+ this.started = true;
414
+
415
+ this._resumeSchedulingTask();
416
+
417
+ if (runOnEnter) {
418
+ this._onEnterTask = this.createSpeechTask({
419
+ taskFn: () =>
420
+ onEnterStorage.run({ session: this.agentSession, agent: this.agent }, () =>
421
+ tracer.startActiveSpan(async () => this.agent.onEnter(), {
422
+ name: 'on_enter',
423
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
424
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
425
+ }),
426
+ ),
427
+ inlineTask: true,
273
428
  name: 'AgentActivity_onEnter',
274
429
  });
275
- } finally {
276
- unlock();
277
430
  }
431
+
432
+ startSpan.end();
278
433
  }
279
434
 
280
435
  get currentSpeech(): SpeechHandle | undefined {
@@ -289,6 +444,17 @@ export class AgentActivity implements RecognitionHooks {
289
444
  return this.agent.stt || this.agentSession.stt;
290
445
  }
291
446
 
447
+ private getSttProvider(): string | undefined {
448
+ const label = this.stt?.label;
449
+ if (!label) {
450
+ return undefined;
451
+ }
452
+
453
+ // Heuristic: most labels look like "<provider>-<model>"
454
+ const [provider] = label.split('-', 1);
455
+ return provider || label;
456
+ }
457
+
292
458
  get llm(): LLM | RealtimeModel | undefined {
293
459
  return this.agent.llm || this.agentSession.llm;
294
460
  }
@@ -301,8 +467,8 @@ export class AgentActivity implements RecognitionHooks {
301
467
  return this.agent.toolCtx;
302
468
  }
303
469
 
304
- get draining(): boolean {
305
- return this._draining;
470
+ get schedulingPaused(): boolean {
471
+ return this._schedulingPaused;
306
472
  }
307
473
 
308
474
  get realtimeLLMSession(): RealtimeSession | undefined {
@@ -310,19 +476,48 @@ export class AgentActivity implements RecognitionHooks {
310
476
  }
311
477
 
312
478
  get allowInterruptions(): boolean {
313
- // TODO(AJS-51): Allow options to be defined in Agent class
314
- return this.agentSession.options.allowInterruptions;
479
+ return (
480
+ this.agent.turnHandling?.interruption?.enabled ??
481
+ this.agentSession.sessionOptions.turnHandling.interruption.enabled
482
+ );
483
+ }
484
+
485
+ get useTtsAlignedTranscript(): boolean {
486
+ // Agent setting takes precedence over session setting
487
+ return this.agent.useTtsAlignedTranscript ?? this.agentSession.useTtsAlignedTranscript;
315
488
  }
316
489
 
317
490
  get turnDetection(): TurnDetectionMode | undefined {
318
- // TODO(brian): prioritize using agent.turn_detection
319
- return this.agentSession.turnDetection;
491
+ return this.agent.turnHandling?.turnDetection ?? this.agentSession.turnDetection;
492
+ }
493
+
494
+ get turnHandling() {
495
+ return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
320
496
  }
321
497
 
498
+ // get minEndpointingDelay(): number {
499
+ // return (
500
+ // this.agent.turnHandling?.endpointing?.minDelay ??
501
+ // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
502
+ // );
503
+ // }
504
+
505
+ // get maxEndpointingDelay(): number {
506
+ // return (
507
+ // this.agent.turnHandling?.endpointing?.maxDelay ??
508
+ // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
509
+ // );
510
+ // }
511
+
322
512
  get toolCtx(): ToolContext {
323
513
  return this.agent.toolCtx;
324
514
  }
325
515
 
516
+ /** @internal */
517
+ get inputStartedAt() {
518
+ return this.audioRecognition?.inputStartedAt;
519
+ }
520
+
326
521
  async updateChatCtx(chatCtx: ChatContext): Promise<void> {
327
522
  chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
328
523
 
@@ -340,7 +535,27 @@ export class AgentActivity implements RecognitionHooks {
340
535
  }
341
536
  }
342
537
 
343
- updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
538
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
539
+ async updateTools(tools: ToolContext): Promise<void> {
540
+ this.agent._tools = { ...tools };
541
+
542
+ if (this.realtimeSession) {
543
+ await this.realtimeSession.updateTools(tools);
544
+ }
545
+
546
+ if (this.llm instanceof LLM) {
547
+ // for realtime LLM, we assume the server will remove unvalid tool messages
548
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
549
+ }
550
+ }
551
+
552
+ updateOptions({
553
+ toolChoice,
554
+ turnDetection,
555
+ }: {
556
+ toolChoice?: ToolChoice | null;
557
+ turnDetection?: TurnDetectionMode;
558
+ }): void {
344
559
  if (toolChoice !== undefined) {
345
560
  this.toolChoice = toolChoice;
346
561
  }
@@ -348,43 +563,85 @@ export class AgentActivity implements RecognitionHooks {
348
563
  if (this.realtimeSession) {
349
564
  this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
350
565
  }
566
+
567
+ if (turnDetection !== undefined) {
568
+ this.turnDetectionMode = turnDetection;
569
+ this.isDefaultInterruptionByAudioActivityEnabled =
570
+ this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm';
571
+
572
+ // sync live flag immediately when not speaking so the change takes effect right away
573
+ if (this.agentSession.agentState !== 'speaking') {
574
+ this.isInterruptionByAudioActivityEnabled =
575
+ this.isDefaultInterruptionByAudioActivityEnabled;
576
+ }
577
+ }
578
+
579
+ if (this.audioRecognition) {
580
+ this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
581
+ }
351
582
  }
352
583
 
353
584
  attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
354
- if (this.audioStream.isSourceSet) {
355
- this.logger.debug('detaching existing audio input in agent activity');
356
- this.audioStream.detachSource();
357
- }
585
+ void this.audioStream.close();
586
+ this.audioStream = new MultiInputStream<AudioFrame>();
587
+
588
+ // Filter is applied on this.audioStream.stream (downstream of MultiInputStream) rather
589
+ // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
590
+ // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
591
+ // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
592
+ const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
593
+ transform: (frame, controller) => {
594
+ const shouldDiscardForAecWarmup =
595
+ this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
596
+ if (!shouldDiscardForAecWarmup) {
597
+ controller.enqueue(frame);
598
+ }
599
+ },
600
+ });
358
601
 
359
- /**
360
- * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
361
- * The tee() operation should be applied to the deferred stream, not the original audioStream.
362
- * This is important because teeing the original stream directly makes it very difficult—if not
363
- * impossible—to implement stream unlock logic cleanly.
364
- */
365
- this.audioStream.setSource(audioStream);
366
- const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
602
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
367
603
 
368
- if (this.realtimeSession) {
604
+ if (this.realtimeSession && this.audioRecognition) {
605
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
606
+ .pipeThrough(aecWarmupAudioFilter)
607
+ .tee();
369
608
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
370
- }
371
-
372
- if (this.audioRecognition) {
373
609
  this.audioRecognition.setInputAudioStream(recognitionAudioStream);
610
+ } else if (this.realtimeSession) {
611
+ this.realtimeSession.setInputAudioStream(
612
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
613
+ );
614
+ } else if (this.audioRecognition) {
615
+ this.audioRecognition.setInputAudioStream(
616
+ this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
617
+ );
374
618
  }
375
619
  }
376
620
 
377
621
  detachAudioInput(): void {
378
- this.audioStream.detachSource();
622
+ if (this.audioStreamId === undefined) {
623
+ return;
624
+ }
625
+
626
+ void this.audioStream.close();
627
+ this.audioStream = new MultiInputStream<AudioFrame>();
628
+ this.audioStreamId = undefined;
379
629
  }
380
630
 
381
- commitUserTurn() {
631
+ commitUserTurn(
632
+ options: {
633
+ audioDetached?: boolean;
634
+ throwIfNotReady?: boolean;
635
+ } = {},
636
+ ) {
637
+ const { audioDetached = false, throwIfNotReady = true } = options;
382
638
  if (!this.audioRecognition) {
383
- throw new Error('AudioRecognition is not initialized');
639
+ if (throwIfNotReady) {
640
+ throw new Error('AudioRecognition is not initialized');
641
+ }
642
+ return;
384
643
  }
385
644
 
386
- // TODO(brian): add audio_detached flag
387
- const audioDetached = false;
388
645
  this.audioRecognition.commitUserTurn(audioDetached);
389
646
  }
390
647
 
@@ -442,14 +699,13 @@ export class AgentActivity implements RecognitionHooks {
442
699
  }),
443
700
  );
444
701
  const task = this.createSpeechTask({
445
- task: Task.from((abortController: AbortController) =>
702
+ taskFn: (abortController: AbortController) =>
446
703
  this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
447
- ),
448
704
  ownedSpeechHandle: handle,
449
705
  name: 'AgentActivity.say_tts',
450
706
  });
451
707
 
452
- task.finally(() => this.onPipelineReplyDone());
708
+ task.result.finally(() => this.onPipelineReplyDone());
453
709
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
454
710
  return handle;
455
711
  }
@@ -463,6 +719,18 @@ export class AgentActivity implements RecognitionHooks {
463
719
  if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) {
464
720
  ev.speechId = speechHandle.id;
465
721
  }
722
+
723
+ // Record realtime metrics on the associated span (if available)
724
+ if (ev.type === 'realtime_model_metrics' && this.realtimeSpans) {
725
+ const span = this.realtimeSpans.get(ev.requestId);
726
+ if (span) {
727
+ recordRealtimeMetrics(span, ev);
728
+ this.realtimeSpans.delete(ev.requestId);
729
+ }
730
+ }
731
+
732
+ this.agentSession._usageCollector.collect(ev);
733
+
466
734
  this.agentSession.emit(
467
735
  AgentSessionEventTypes.MetricsCollected,
468
736
  createMetricsCollectedEvent({ metrics: ev }),
@@ -494,6 +762,13 @@ export class AgentActivity implements RecognitionHooks {
494
762
 
495
763
  if (!this.vad) {
496
764
  this.agentSession._updateUserState('speaking');
765
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
766
+ this.audioRecognition.onStartOfOverlapSpeech(
767
+ 0,
768
+ Date.now(),
769
+ this.agentSession._userSpeakingSpan,
770
+ );
771
+ }
497
772
  }
498
773
 
499
774
  // this.interrupt() is going to raise when allow_interruptions is False,
@@ -512,6 +787,9 @@ export class AgentActivity implements RecognitionHooks {
512
787
  this.logger.info(ev, 'onInputSpeechStopped');
513
788
 
514
789
  if (!this.vad) {
790
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
791
+ this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
792
+ }
515
793
  this.agentSession._updateUserState('listening');
516
794
  }
517
795
 
@@ -552,10 +830,9 @@ export class AgentActivity implements RecognitionHooks {
552
830
  return;
553
831
  }
554
832
 
555
- if (this.draining) {
556
- // copied from python:
833
+ if (this.schedulingPaused) {
557
834
  // TODO(shubhra): should we "forward" this new turn to the next agent?
558
- this.logger.warn('skipping new realtime generation, the agent is draining');
835
+ this.logger.warn('skipping new realtime generation, the speech scheduling is not running');
559
836
  return;
560
837
  }
561
838
 
@@ -573,9 +850,8 @@ export class AgentActivity implements RecognitionHooks {
573
850
  this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
574
851
 
575
852
  this.createSpeechTask({
576
- task: Task.from((abortController: AbortController) =>
853
+ taskFn: (abortController: AbortController) =>
577
854
  this.realtimeGenerationTask(handle, ev, {}, abortController),
578
- ),
579
855
  ownedSpeechHandle: handle,
580
856
  name: 'AgentActivity.realtimeGeneration',
581
857
  });
@@ -584,13 +860,43 @@ export class AgentActivity implements RecognitionHooks {
584
860
  }
585
861
 
586
862
  // recognition hooks
587
-
588
- onStartOfSpeech(_ev: VADEvent): void {
589
- this.agentSession._updateUserState('speaking');
863
+ onStartOfSpeech(ev: VADEvent): void {
864
+ let speechStartTime = Date.now();
865
+ if (ev) {
866
+ // Subtract both speechDuration and inferenceDuration to correct for VAD model latency.
867
+ speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration;
868
+ }
869
+ this.agentSession._updateUserState('speaking', {
870
+ lastSpeakingTime: speechStartTime,
871
+ otelContext: otelContext.active(),
872
+ });
873
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
874
+ // Pass speechStartTime as the absolute startedAt timestamp.
875
+ this.audioRecognition.onStartOfOverlapSpeech(
876
+ ev.speechDuration,
877
+ speechStartTime,
878
+ this.agentSession._userSpeakingSpan,
879
+ );
880
+ }
590
881
  }
591
882
 
592
- onEndOfSpeech(_ev: VADEvent): void {
593
- this.agentSession._updateUserState('listening');
883
+ onEndOfSpeech(ev: VADEvent): void {
884
+ let speechEndTime = Date.now();
885
+ if (ev) {
886
+ // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
887
+ speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
888
+ }
889
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
890
+ // Pass speechEndTime as the absolute endedAt timestamp.
891
+ this.audioRecognition.onEndOfOverlapSpeech(
892
+ speechEndTime,
893
+ this.agentSession._userSpeakingSpan,
894
+ );
895
+ }
896
+ this.agentSession._updateUserState('listening', {
897
+ lastSpeakingTime: speechEndTime,
898
+ otelContext: otelContext.active(),
899
+ });
594
900
  }
595
901
 
596
902
  onVADInferenceDone(ev: VADEvent): void {
@@ -599,20 +905,47 @@ export class AgentActivity implements RecognitionHooks {
599
905
  return;
600
906
  }
601
907
 
602
- if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
603
- // skip speech handle interruption if server side turn detection is enabled
908
+ if (
909
+ ev.speechDuration >= this.agentSession.sessionOptions.turnHandling.interruption?.minDuration
910
+ ) {
911
+ this.interruptByAudioActivity();
912
+ }
913
+ }
914
+
915
+ private interruptByAudioActivity(): void {
916
+ if (!this.isInterruptionByAudioActivityEnabled) {
604
917
  return;
605
918
  }
606
919
 
607
- if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
920
+ if (this.agentSession._aecWarmupRemaining > 0) {
921
+ // Disable interruption from audio activity while AEC warmup is active.
608
922
  return;
609
923
  }
610
924
 
611
- if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
612
- const text = this.audioRecognition.currentTranscript;
925
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
926
+ // skip speech handle interruption if server side turn detection is enabled
927
+ return;
928
+ }
613
929
 
930
+ // Refactored interruption word count check:
931
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
932
+ // - Apply check to all STT results: empty string, undefined, or any length
933
+ // - This ensures consistent behavior across all interruption scenarios
934
+ if (
935
+ this.stt &&
936
+ this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 &&
937
+ this.audioRecognition
938
+ ) {
939
+ const text = this.audioRecognition.currentTranscript;
614
940
  // TODO(shubhra): better word splitting for multi-language
615
- if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
941
+
942
+ // Normalize text: convert undefined/null to empty string for consistent word counting
943
+ const normalizedText = text ?? '';
944
+ const wordCount = splitWords(normalizedText, true).length;
945
+
946
+ // Only allow interruption if word count meets or exceeds minInterruptionWords
947
+ // This applies to all cases: empty strings, partial speech, and full speech
948
+ if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
616
949
  return;
617
950
  }
618
951
  }
@@ -624,12 +957,23 @@ export class AgentActivity implements RecognitionHooks {
624
957
  !this._currentSpeech.interrupted &&
625
958
  this._currentSpeech.allowInterruptions
626
959
  ) {
627
- this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
960
+ this.logger.info(
961
+ { 'speech id': this._currentSpeech.id },
962
+ 'speech interrupted by audio activity',
963
+ );
628
964
  this.realtimeSession?.interrupt();
629
965
  this._currentSpeech.interrupt();
630
966
  }
631
967
  }
632
968
 
969
+ onInterruption(ev: OverlappingSpeechEvent) {
970
+ this.restoreInterruptionByAudioActivity();
971
+ this.interruptByAudioActivity();
972
+ if (this.audioRecognition) {
973
+ this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt);
974
+ }
975
+ }
976
+
633
977
  onInterimTranscript(ev: SpeechEvent): void {
634
978
  if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
635
979
  // skip stt transcription if userTranscription is enabled on the realtime model
@@ -641,9 +985,14 @@ export class AgentActivity implements RecognitionHooks {
641
985
  createUserInputTranscribedEvent({
642
986
  transcript: ev.alternatives![0].text,
643
987
  isFinal: false,
988
+ language: ev.alternatives![0].language,
644
989
  // TODO(AJS-106): add multi participant support
645
990
  }),
646
991
  );
992
+
993
+ if (ev.alternatives![0].text) {
994
+ this.interruptByAudioActivity();
995
+ }
647
996
  }
648
997
 
649
998
  onFinalTranscript(ev: SpeechEvent): void {
@@ -657,17 +1006,103 @@ export class AgentActivity implements RecognitionHooks {
657
1006
  createUserInputTranscribedEvent({
658
1007
  transcript: ev.alternatives![0].text,
659
1008
  isFinal: true,
1009
+ language: ev.alternatives![0].language,
660
1010
  // TODO(AJS-106): add multi participant support
661
1011
  }),
662
1012
  );
1013
+
1014
+ // agent speech might not be interrupted if VAD failed and a final transcript is received
1015
+ // we call interruptByAudioActivity (idempotent) to pause the speech, if possible
1016
+ if (
1017
+ this.audioRecognition &&
1018
+ this.turnDetection !== 'manual' &&
1019
+ this.turnDetection !== 'realtime_llm'
1020
+ ) {
1021
+ this.interruptByAudioActivity();
1022
+
1023
+ // TODO: resume false interruption - schedule a resume timer if interrupted after end_of_speech
1024
+ }
1025
+
1026
+ // TODO: resume false interruption - start interrupt paused speech task
1027
+ }
1028
+
1029
+ onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
1030
+ if (
1031
+ !this.agentSession.sessionOptions.preemptiveGeneration ||
1032
+ this.schedulingPaused ||
1033
+ (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
1034
+ !(this.llm instanceof LLM)
1035
+ ) {
1036
+ return;
1037
+ }
1038
+
1039
+ this.cancelPreemptiveGeneration();
1040
+
1041
+ this.logger.info(
1042
+ {
1043
+ newTranscript: info.newTranscript,
1044
+ transcriptConfidence: info.transcriptConfidence,
1045
+ },
1046
+ 'starting preemptive generation',
1047
+ );
1048
+
1049
+ const userMessage = ChatMessage.create({
1050
+ role: 'user',
1051
+ content: info.newTranscript,
1052
+ transcriptConfidence: info.transcriptConfidence,
1053
+ });
1054
+ const chatCtx = this.agent.chatCtx.copy();
1055
+ const speechHandle = this.generateReply({
1056
+ userMessage,
1057
+ chatCtx,
1058
+ scheduleSpeech: false,
1059
+ });
1060
+
1061
+ this._preemptiveGeneration = {
1062
+ speechHandle,
1063
+ userMessage,
1064
+ info,
1065
+ chatCtx: chatCtx.copy(),
1066
+ tools: { ...this.tools },
1067
+ toolChoice: this.toolChoice,
1068
+ createdAt: Date.now(),
1069
+ };
1070
+ }
1071
+
1072
+ private cancelPreemptiveGeneration(): void {
1073
+ if (this._preemptiveGeneration !== undefined) {
1074
+ this._preemptiveGeneration.speechHandle._cancel();
1075
+ this._preemptiveGeneration = undefined;
1076
+ }
663
1077
  }
664
1078
 
665
1079
  private createSpeechTask(options: {
666
- task: Task<void>;
1080
+ taskFn: (controller: AbortController) => Promise<void>;
1081
+ controller?: AbortController;
667
1082
  ownedSpeechHandle?: SpeechHandle;
1083
+ inlineTask?: boolean;
668
1084
  name?: string;
669
- }): Promise<void> {
670
- const { task, ownedSpeechHandle } = options;
1085
+ }): Task<void> {
1086
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
1087
+
1088
+ const wrappedFn = (ctrl: AbortController) => {
1089
+ return agentActivityStorage.run(this, () => {
1090
+ // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes
1091
+ // before post-construction metadata is attached to the Task instance.
1092
+ const currentTask = Task.current();
1093
+ if (currentTask) {
1094
+ _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
1095
+ }
1096
+
1097
+ if (ownedSpeechHandle) {
1098
+ return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
1099
+ }
1100
+ return taskFn(ctrl);
1101
+ });
1102
+ };
1103
+
1104
+ const task = Task.from(wrappedFn, controller, name);
1105
+ _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
671
1106
 
672
1107
  this.speechTasks.add(task);
673
1108
  task.addDoneCallback(() => {
@@ -687,34 +1122,50 @@ export class AgentActivity implements RecognitionHooks {
687
1122
  this.wakeupMainTask();
688
1123
  });
689
1124
 
690
- return task.result;
1125
+ return task;
691
1126
  }
692
1127
 
693
1128
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
694
- if (this.draining) {
695
- this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
696
- // copied from python:
1129
+ if (this.schedulingPaused) {
1130
+ this.cancelPreemptiveGeneration();
1131
+ this.logger.warn(
1132
+ { user_input: info.newTranscript },
1133
+ 'skipping user input, speech scheduling is paused',
1134
+ );
697
1135
  // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
698
1136
  return true;
699
1137
  }
700
1138
 
1139
+ // Refactored interruption word count check for consistency with onVADInferenceDone:
1140
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
1141
+ // - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
701
1142
  if (
702
1143
  this.stt &&
703
1144
  this.turnDetection !== 'manual' &&
704
1145
  this._currentSpeech &&
705
1146
  this._currentSpeech.allowInterruptions &&
706
1147
  !this._currentSpeech.interrupted &&
707
- this.agentSession.options.minInterruptionWords > 0 &&
708
- info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
1148
+ this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0
709
1149
  ) {
710
- // avoid interruption if the new_transcript is too short
711
- this.logger.info('skipping user input, new_transcript is too short');
712
- return false;
1150
+ const wordCount = splitWords(info.newTranscript, true).length;
1151
+ if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) {
1152
+ // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
1153
+ this.cancelPreemptiveGeneration();
1154
+ this.logger.info(
1155
+ {
1156
+ wordCount,
1157
+ minInterruptionWords:
1158
+ this.agentSession.sessionOptions.turnHandling.interruption.minWords,
1159
+ },
1160
+ 'skipping user input, word count below minimum interruption threshold',
1161
+ );
1162
+ return false;
1163
+ }
713
1164
  }
714
1165
 
715
1166
  const oldTask = this._userTurnCompletedTask;
716
1167
  this._userTurnCompletedTask = this.createSpeechTask({
717
- task: Task.from(() => this.userTurnCompleted(info, oldTask)),
1168
+ taskFn: () => this.userTurnCompleted(info, oldTask),
718
1169
  name: 'AgentActivity.userTurnCompleted',
719
1170
  });
720
1171
  return true;
@@ -744,16 +1195,28 @@ export class AgentActivity implements RecognitionHooks {
744
1195
  throw new Error('Speech queue is empty');
745
1196
  }
746
1197
  const speechHandle = heapItem[2];
1198
+
1199
+ // Skip speech handles that were already interrupted/done before being
1200
+ // picked up from the queue (e.g. interrupted during shutdown before the
1201
+ // main loop had a chance to process them). Calling _authorizeGeneration
1202
+ // on a done handle would create a generation Future that nobody resolves,
1203
+ // causing the main loop to hang forever.
1204
+ if (speechHandle.interrupted || speechHandle.done()) {
1205
+ continue;
1206
+ }
1207
+
747
1208
  this._currentSpeech = speechHandle;
748
1209
  speechHandle._authorizeGeneration();
749
- await speechHandle._waitForGeneration();
1210
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForGeneration()]);
750
1211
  this._currentSpeech = undefined;
751
1212
  }
752
1213
 
753
- // If we're draining and there are no more speech tasks, we can exit.
754
- // Only speech tasks can bypass draining to create a tool response
755
- if (this.draining && this.speechTasks.size === 0) {
756
- this.logger.info('mainTask: draining and no more speech tasks');
1214
+ // if we're draining/pausing and there are no more speech tasks, we can exit.
1215
+ // only speech tasks can bypass draining to create a tool response (see scheduleSpeech)
1216
+ const toWait = this.getDrainPendingSpeechTasks();
1217
+
1218
+ if (this._schedulingPaused && toWait.length === 0) {
1219
+ this.logger.info('mainTask: scheduling paused and no more speech tasks to wait');
757
1220
  break;
758
1221
  }
759
1222
 
@@ -763,6 +1226,39 @@ export class AgentActivity implements RecognitionHooks {
763
1226
  this.logger.info('AgentActivity mainTask: exiting');
764
1227
  }
765
1228
 
1229
+ private getDrainPendingSpeechTasks(): Task<void>[] {
1230
+ const blockedHandles: SpeechHandle[] = [];
1231
+
1232
+ for (const task of this._drainBlockedTasks) {
1233
+ const info = _getActivityTaskInfo(task);
1234
+ if (!info) {
1235
+ this.logger.error('blocked task without activity info; skipping.');
1236
+ continue;
1237
+ }
1238
+
1239
+ if (!info.speechHandle) {
1240
+ continue; // onEnter/onExit
1241
+ }
1242
+
1243
+ blockedHandles.push(info.speechHandle);
1244
+ }
1245
+
1246
+ const toWait: Task<void>[] = [];
1247
+ for (const task of this.speechTasks) {
1248
+ if (this._drainBlockedTasks.includes(task)) {
1249
+ continue;
1250
+ }
1251
+
1252
+ const info = _getActivityTaskInfo(task);
1253
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
1254
+ continue;
1255
+ }
1256
+
1257
+ toWait.push(task);
1258
+ }
1259
+ return toWait;
1260
+ }
1261
+
766
1262
  private wakeupMainTask(): void {
767
1263
  this.q_updated.resolve();
768
1264
  }
@@ -773,6 +1269,7 @@ export class AgentActivity implements RecognitionHooks {
773
1269
  instructions?: string;
774
1270
  toolChoice?: ToolChoice | null;
775
1271
  allowInterruptions?: boolean;
1272
+ scheduleSpeech?: boolean;
776
1273
  }): SpeechHandle {
777
1274
  const {
778
1275
  userMessage,
@@ -780,6 +1277,7 @@ export class AgentActivity implements RecognitionHooks {
780
1277
  instructions: defaultInstructions,
781
1278
  toolChoice: defaultToolChoice,
782
1279
  allowInterruptions: defaultAllowInterruptions,
1280
+ scheduleSpeech = true,
783
1281
  } = options;
784
1282
 
785
1283
  let instructions = defaultInstructions;
@@ -802,7 +1300,7 @@ export class AgentActivity implements RecognitionHooks {
802
1300
  throw new Error('trying to generate reply without an LLM model');
803
1301
  }
804
1302
 
805
- const functionCall = asyncLocalStorage.getStore()?.functionCall;
1303
+ const functionCall = functionCallStorage.getStore()?.functionCall;
806
1304
  if (toolChoice === undefined && functionCall !== undefined) {
807
1305
  // when generateReply is called inside a tool, set toolChoice to 'none' by default
808
1306
  toolChoice = 'none';
@@ -824,7 +1322,7 @@ export class AgentActivity implements RecognitionHooks {
824
1322
 
825
1323
  if (this.llm instanceof RealtimeModel) {
826
1324
  this.createSpeechTask({
827
- task: Task.from((abortController: AbortController) =>
1325
+ taskFn: (abortController: AbortController) =>
828
1326
  this.realtimeReplyTask({
829
1327
  speechHandle: handle,
830
1328
  // TODO(brian): support llm.ChatMessage for the realtime model
@@ -836,7 +1334,6 @@ export class AgentActivity implements RecognitionHooks {
836
1334
  },
837
1335
  abortController,
838
1336
  }),
839
- ),
840
1337
  ownedSpeechHandle: handle,
841
1338
  name: 'AgentActivity.realtimeReply',
842
1339
  });
@@ -848,46 +1345,80 @@ export class AgentActivity implements RecognitionHooks {
848
1345
  instructions = `${this.agent.instructions}\n${instructions}`;
849
1346
  }
850
1347
 
1348
+ // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
1349
+ const onEnterData = onEnterStorage.getStore();
1350
+ const shouldFilterTools =
1351
+ onEnterData?.agent === this.agent && onEnterData?.session === this.agentSession;
1352
+
1353
+ const tools = shouldFilterTools
1354
+ ? Object.fromEntries(
1355
+ Object.entries(this.agent.toolCtx).filter(
1356
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER),
1357
+ ),
1358
+ )
1359
+ : this.agent.toolCtx;
1360
+
851
1361
  const task = this.createSpeechTask({
852
- task: Task.from((abortController: AbortController) =>
1362
+ taskFn: (abortController: AbortController) =>
853
1363
  this.pipelineReplyTask(
854
1364
  handle,
855
1365
  chatCtx ?? this.agent.chatCtx,
856
- this.agent.toolCtx,
1366
+ tools,
857
1367
  {
858
1368
  toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
859
1369
  },
860
1370
  abortController,
861
- instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
1371
+ instructions,
862
1372
  userMessage,
863
1373
  ),
864
- ),
865
1374
  ownedSpeechHandle: handle,
866
1375
  name: 'AgentActivity.pipelineReply',
867
1376
  });
868
1377
 
869
- task.finally(() => this.onPipelineReplyDone());
1378
+ task.result.finally(() => this.onPipelineReplyDone());
870
1379
  }
871
1380
 
872
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1381
+ if (scheduleSpeech) {
1382
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1383
+ }
873
1384
  return handle;
874
1385
  }
875
1386
 
876
- interrupt(): Future<void> {
1387
+ interrupt(options: { force?: boolean } = {}): Future<void> {
1388
+ const { force = false } = options;
1389
+ this.cancelPreemptiveGeneration();
1390
+
877
1391
  const future = new Future<void>();
878
1392
  const currentSpeech = this._currentSpeech;
879
1393
 
880
1394
  //TODO(AJS-273): add interrupt for background speeches
881
1395
 
882
- currentSpeech?.interrupt();
1396
+ currentSpeech?.interrupt(force);
883
1397
 
884
1398
  for (const [_, __, speech] of this.speechQueue) {
885
- speech.interrupt();
1399
+ speech.interrupt(force);
886
1400
  }
887
1401
 
888
1402
  this.realtimeSession?.interrupt();
889
1403
 
890
- if (currentSpeech === undefined) {
1404
+ if (force) {
1405
+ // Force-interrupt (used during shutdown): cancel all speech tasks so they
1406
+ // don't block on I/O that will never complete (e.g. audioOutput.waitForPlayout()
1407
+ // when the room is disconnected). Mark the current speech as done immediately
1408
+ // so the interrupt future resolves without waiting for tasks to finish.
1409
+ // Clear the queue so mainTask doesn't dequeue already-interrupted handles
1410
+ // and hang on _waitForGeneration() (the generation future created by
1411
+ // _authorizeGeneration would never resolve since _markDone is a no-op
1412
+ // once doneFut is already settled).
1413
+ for (const task of this.speechTasks) {
1414
+ task.cancel();
1415
+ }
1416
+ if (currentSpeech && !currentSpeech.done()) {
1417
+ currentSpeech._markDone();
1418
+ }
1419
+ this.speechQueue.clear();
1420
+ future.resolve();
1421
+ } else if (currentSpeech === undefined) {
891
1422
  future.resolve();
892
1423
  } else {
893
1424
  currentSpeech.addDoneCallback(() => {
@@ -905,13 +1436,13 @@ export class AgentActivity implements RecognitionHooks {
905
1436
  }
906
1437
  }
907
1438
 
908
- private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
1439
+ private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task<void>): Promise<void> {
909
1440
  if (oldTask) {
910
1441
  // We never cancel user code as this is very confusing.
911
1442
  // So we wait for the old execution of onUserTurnCompleted to finish.
912
1443
  // In practice this is OK because most speeches will be interrupted if a new turn
913
1444
  // is detected. So the previous execution should complete quickly.
914
- await oldTask;
1445
+ await oldTask.result;
915
1446
  }
916
1447
 
917
1448
  // When the audio recognition detects the end of a user turn:
@@ -949,6 +1480,7 @@ export class AgentActivity implements RecognitionHooks {
949
1480
  let userMessage: ChatMessage | undefined = ChatMessage.create({
950
1481
  role: 'user',
951
1482
  content: info.newTranscript,
1483
+ transcriptConfidence: info.transcriptConfidence,
952
1484
  });
953
1485
 
954
1486
  // create a temporary mutable chat context to pass to onUserTurnCompleted
@@ -975,16 +1507,74 @@ export class AgentActivity implements RecognitionHooks {
975
1507
  return;
976
1508
  }
977
1509
 
978
- // Ensure the new message is passed to generateReply
979
- // This preserves the original message id, making it easier for users to track responses
980
- const speechHandle = this.generateReply({ userMessage, chatCtx });
1510
+ const userMetricsReport: MetricsReport = {};
1511
+ if (info.startedSpeakingAt !== undefined) {
1512
+ userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds
1513
+ }
1514
+ if (info.stoppedSpeakingAt !== undefined) {
1515
+ userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds
1516
+ }
1517
+ if (info.transcriptionDelay !== undefined) {
1518
+ userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds
1519
+ }
1520
+ if (info.endOfUtteranceDelay !== undefined) {
1521
+ userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds
1522
+ }
1523
+ userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds
1524
+ if (userMessage) {
1525
+ userMessage.metrics = userMetricsReport;
1526
+ }
981
1527
 
982
- const eouMetrics: EOUMetrics = {
983
- type: 'eou_metrics',
984
- timestamp: Date.now(),
985
- endOfUtteranceDelay: info.endOfUtteranceDelay,
986
- transcriptionDelay: info.transcriptionDelay,
987
- onUserTurnCompletedDelay: callbackDuration,
1528
+ let speechHandle: SpeechHandle | undefined;
1529
+ if (this._preemptiveGeneration !== undefined) {
1530
+ const preemptive = this._preemptiveGeneration;
1531
+ // make sure the onUserTurnCompleted didn't change some request parameters
1532
+ // otherwise invalidate the preemptive generation
1533
+ if (
1534
+ preemptive.info.newTranscript === userMessage?.textContent &&
1535
+ preemptive.chatCtx.isEquivalent(chatCtx) &&
1536
+ isSameToolContext(preemptive.tools, this.tools) &&
1537
+ isSameToolChoice(preemptive.toolChoice, this.toolChoice)
1538
+ ) {
1539
+ speechHandle = preemptive.speechHandle;
1540
+ // The preemptive userMessage was created without metrics.
1541
+ // Copy the metrics and transcriptConfidence from the new userMessage
1542
+ // to the preemptive message BEFORE scheduling (so the pipeline inserts
1543
+ // the message with metrics already set).
1544
+ if (preemptive.userMessage && userMessage) {
1545
+ preemptive.userMessage.metrics = userMetricsReport;
1546
+ preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence;
1547
+ }
1548
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1549
+ this.logger.debug(
1550
+ {
1551
+ preemptiveLeadTime: Date.now() - preemptive.createdAt,
1552
+ },
1553
+ 'using preemptive generation',
1554
+ );
1555
+ } else {
1556
+ this.logger.warn(
1557
+ 'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
1558
+ );
1559
+ preemptive.speechHandle._cancel();
1560
+ }
1561
+
1562
+ this._preemptiveGeneration = undefined;
1563
+ }
1564
+
1565
+ if (speechHandle === undefined) {
1566
+ // Ensure the new message is passed to generateReply
1567
+ // This preserves the original message id, making it easier for users to track responses
1568
+ speechHandle = this.generateReply({ userMessage, chatCtx });
1569
+ }
1570
+
1571
+ const eouMetrics: EOUMetrics = {
1572
+ type: 'eou_metrics',
1573
+ timestamp: Date.now(),
1574
+ endOfUtteranceDelayMs: info.endOfUtteranceDelay,
1575
+ transcriptionDelayMs: info.transcriptionDelay,
1576
+ onUserTurnCompletedDelayMs: callbackDuration,
1577
+ lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
988
1578
  speechId: speechHandle.id,
989
1579
  };
990
1580
 
@@ -1002,6 +1592,8 @@ export class AgentActivity implements RecognitionHooks {
1002
1592
  replyAbortController: AbortController,
1003
1593
  audio?: ReadableStream<AudioFrame> | null,
1004
1594
  ): Promise<void> {
1595
+ speechHandle._agentTurnContext = otelContext.active();
1596
+
1005
1597
  speechHandleStorage.enterWith(speechHandle);
1006
1598
 
1007
1599
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled
@@ -1046,28 +1638,44 @@ export class AgentActivity implements RecognitionHooks {
1046
1638
  tasks.push(textForwardTask);
1047
1639
  }
1048
1640
 
1049
- const onFirstFrame = () => {
1050
- this.agentSession._updateAgentState('speaking');
1641
+ let replyStartedSpeakingAt: number | undefined;
1642
+ let replyTtsGenData: _TTSGenerationData | null = null;
1643
+
1644
+ const onFirstFrame = (startedSpeakingAt?: number) => {
1645
+ replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1646
+ this.agentSession._updateAgentState('speaking', {
1647
+ startTime: startedSpeakingAt,
1648
+ otelContext: speechHandle._agentTurnContext,
1649
+ });
1650
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1651
+ this.audioRecognition.onStartOfAgentSpeech();
1652
+ this.isInterruptionByAudioActivityEnabled = false;
1653
+ }
1051
1654
  };
1052
1655
 
1053
1656
  if (!audioOutput) {
1054
1657
  if (textOut) {
1055
- textOut.firstTextFut.await.finally(onFirstFrame);
1658
+ textOut.firstTextFut.await
1659
+ .then(() => onFirstFrame())
1660
+ .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
1056
1661
  }
1057
1662
  } else {
1058
1663
  let audioOut: _AudioOut | null = null;
1059
1664
  if (!audio) {
1060
1665
  // generate audio using TTS
1061
- const [ttsTask, ttsStream] = performTTSInference(
1666
+ const [ttsTask, ttsGenData] = performTTSInference(
1062
1667
  (...args) => this.agent.ttsNode(...args),
1063
1668
  audioSource,
1064
1669
  modelSettings,
1065
1670
  replyAbortController,
1671
+ this.tts?.model,
1672
+ this.tts?.provider,
1066
1673
  );
1067
1674
  tasks.push(ttsTask);
1675
+ replyTtsGenData = ttsGenData;
1068
1676
 
1069
1677
  const [forwardTask, _audioOut] = performAudioForwarding(
1070
- ttsStream,
1678
+ ttsGenData.audioStream,
1071
1679
  audioOutput,
1072
1680
  replyAbortController,
1073
1681
  );
@@ -1083,7 +1691,9 @@ export class AgentActivity implements RecognitionHooks {
1083
1691
  tasks.push(forwardTask);
1084
1692
  audioOut = _audioOut;
1085
1693
  }
1086
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1694
+ audioOut.firstFrameFut.await
1695
+ .then((ts) => onFirstFrame(ts))
1696
+ .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
1087
1697
  }
1088
1698
 
1089
1699
  await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
@@ -1102,10 +1712,21 @@ export class AgentActivity implements RecognitionHooks {
1102
1712
  }
1103
1713
 
1104
1714
  if (addToChatCtx) {
1715
+ const replyStoppedSpeakingAt = Date.now();
1716
+ const replyAssistantMetrics: MetricsReport = {};
1717
+ if (replyTtsGenData?.ttfb !== undefined) {
1718
+ replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb;
1719
+ }
1720
+ if (replyStartedSpeakingAt !== undefined) {
1721
+ replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
1722
+ replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
1723
+ }
1724
+
1105
1725
  const message = ChatMessage.create({
1106
1726
  role: 'assistant',
1107
1727
  content: textOut?.text || '',
1108
1728
  interrupted: speechHandle.interrupted,
1729
+ metrics: replyAssistantMetrics,
1109
1730
  });
1110
1731
  this.agent._chatCtx.insert(message);
1111
1732
  this.agentSession._conversationItemAdded(message);
@@ -1113,19 +1734,51 @@ export class AgentActivity implements RecognitionHooks {
1113
1734
 
1114
1735
  if (this.agentSession.agentState === 'speaking') {
1115
1736
  this.agentSession._updateAgentState('listening');
1737
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1738
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
1739
+ }
1740
+ this.restoreInterruptionByAudioActivity();
1116
1741
  }
1117
1742
  }
1118
1743
 
1119
- private async pipelineReplyTask(
1120
- speechHandle: SpeechHandle,
1121
- chatCtx: ChatContext,
1122
- toolCtx: ToolContext,
1123
- modelSettings: ModelSettings,
1124
- replyAbortController: AbortController,
1125
- instructions?: string,
1126
- newMessage?: ChatMessage,
1127
- toolsMessages?: ChatItem[],
1128
- ): Promise<void> {
1744
+ private _pipelineReplyTaskImpl = async ({
1745
+ speechHandle,
1746
+ chatCtx,
1747
+ toolCtx,
1748
+ modelSettings,
1749
+ replyAbortController,
1750
+ instructions,
1751
+ newMessage,
1752
+ toolsMessages,
1753
+ span,
1754
+ _previousUserMetrics,
1755
+ }: {
1756
+ speechHandle: SpeechHandle;
1757
+ chatCtx: ChatContext;
1758
+ toolCtx: ToolContext;
1759
+ modelSettings: ModelSettings;
1760
+ replyAbortController: AbortController;
1761
+ instructions?: string;
1762
+ newMessage?: ChatMessage;
1763
+ toolsMessages?: ChatItem[];
1764
+ span: Span;
1765
+ _previousUserMetrics?: MetricsReport;
1766
+ }): Promise<void> => {
1767
+ speechHandle._agentTurnContext = otelContext.active();
1768
+
1769
+ span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1770
+ if (instructions) {
1771
+ span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
1772
+ }
1773
+ if (newMessage) {
1774
+ span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
1775
+ }
1776
+
1777
+ const localParticipant = this.agentSession._roomIO?.localParticipant;
1778
+ if (localParticipant) {
1779
+ setParticipantSpanAttributes(span, localParticipant);
1780
+ }
1781
+
1129
1782
  speechHandleStorage.enterWith(speechHandle);
1130
1783
 
1131
1784
  const audioOutput = this.agentSession.output.audioEnabled
@@ -1137,10 +1790,9 @@ export class AgentActivity implements RecognitionHooks {
1137
1790
 
1138
1791
  chatCtx = chatCtx.copy();
1139
1792
 
1793
+ // Insert new message into temporary chat context for LLM inference
1140
1794
  if (newMessage) {
1141
1795
  chatCtx.insert(newMessage);
1142
- this.agent._chatCtx.insert(newMessage);
1143
- this.agentSession._conversationItemAdded(newMessage);
1144
1796
  }
1145
1797
 
1146
1798
  if (instructions) {
@@ -1155,7 +1807,6 @@ export class AgentActivity implements RecognitionHooks {
1155
1807
  }
1156
1808
  }
1157
1809
 
1158
- this.agentSession._updateAgentState('thinking');
1159
1810
  const tasks: Array<Task<void>> = [];
1160
1811
  const [llmTask, llmGenData] = performLLMInference(
1161
1812
  // preserve `this` context in llmNode
@@ -1164,25 +1815,43 @@ export class AgentActivity implements RecognitionHooks {
1164
1815
  toolCtx,
1165
1816
  modelSettings,
1166
1817
  replyAbortController,
1818
+ this.llm?.model,
1819
+ this.llm?.provider,
1167
1820
  );
1168
1821
  tasks.push(llmTask);
1169
1822
 
1170
- const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
1171
-
1172
1823
  let ttsTask: Task<void> | null = null;
1173
- let ttsStream: ReadableStream<AudioFrame> | null = null;
1824
+ let ttsGenData: _TTSGenerationData | null = null;
1825
+ let llmOutput: ReadableStream<string>;
1826
+
1174
1827
  if (audioOutput) {
1175
- [ttsTask, ttsStream] = performTTSInference(
1828
+ // Only tee the stream when we need TTS
1829
+ const [ttsTextInput, textOutput] = llmGenData.textStream.tee();
1830
+ llmOutput = textOutput;
1831
+ [ttsTask, ttsGenData] = performTTSInference(
1176
1832
  (...args) => this.agent.ttsNode(...args),
1177
1833
  ttsTextInput,
1178
1834
  modelSettings,
1179
1835
  replyAbortController,
1836
+ this.tts?.model,
1837
+ this.tts?.provider,
1180
1838
  );
1181
1839
  tasks.push(ttsTask);
1840
+ } else {
1841
+ // No TTS needed, use the stream directly
1842
+ llmOutput = llmGenData.textStream;
1182
1843
  }
1183
1844
 
1184
1845
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);
1185
1846
 
1847
+ let userMetrics: MetricsReport | undefined = _previousUserMetrics;
1848
+ // Add new message to actual chat context if the speech is scheduled
1849
+ if (newMessage && speechHandle.scheduled) {
1850
+ this.agent._chatCtx.insert(newMessage);
1851
+ this.agentSession._conversationItemAdded(newMessage);
1852
+ userMetrics = newMessage.metrics;
1853
+ }
1854
+
1186
1855
  if (speechHandle.interrupted) {
1187
1856
  replyAbortController.abort();
1188
1857
  await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
@@ -1195,7 +1864,26 @@ export class AgentActivity implements RecognitionHooks {
1195
1864
  speechHandle._clearAuthorization();
1196
1865
 
1197
1866
  const replyStartedAt = Date.now();
1198
- const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
1867
+
1868
+ // Determine the transcription input source
1869
+ let transcriptionInput: ReadableStream<string | TimedString> = llmOutput;
1870
+
1871
+ // Check if we should use TTS aligned transcripts
1872
+ if (this.useTtsAlignedTranscript && this.tts?.capabilities.alignedTranscript && ttsGenData) {
1873
+ // Race timedTextsFut with ttsTask to avoid hanging if TTS fails before resolving the future
1874
+ const timedTextsStream = await Promise.race([
1875
+ ttsGenData.timedTextsFut.await,
1876
+ ttsTask?.result.catch(() =>
1877
+ this.logger.warn('TTS task failed before resolving timedTextsFut'),
1878
+ ) ?? Promise.resolve(),
1879
+ ]);
1880
+ if (timedTextsStream) {
1881
+ this.logger.debug('Using TTS aligned transcripts for transcription node input');
1882
+ transcriptionInput = timedTextsStream;
1883
+ }
1884
+ }
1885
+
1886
+ const trNodeResult = await this.agent.transcriptionNode(transcriptionInput, modelSettings);
1199
1887
  let textOut: _TextOut | null = null;
1200
1888
  if (trNodeResult) {
1201
1889
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1207,37 +1895,54 @@ export class AgentActivity implements RecognitionHooks {
1207
1895
  textOut = _textOut;
1208
1896
  }
1209
1897
 
1210
- const onFirstFrame = () => {
1211
- this.agentSession._updateAgentState('speaking');
1898
+ let agentStartedSpeakingAt: number | undefined;
1899
+ const onFirstFrame = (startedSpeakingAt?: number) => {
1900
+ agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
1901
+ this.agentSession._updateAgentState('speaking', {
1902
+ startTime: startedSpeakingAt,
1903
+ otelContext: speechHandle._agentTurnContext,
1904
+ });
1905
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
1906
+ this.audioRecognition.onStartOfAgentSpeech();
1907
+ this.isInterruptionByAudioActivityEnabled = false;
1908
+ }
1212
1909
  };
1213
1910
 
1214
1911
  let audioOut: _AudioOut | null = null;
1215
1912
  if (audioOutput) {
1216
- if (ttsStream) {
1913
+ if (ttsGenData) {
1217
1914
  const [forwardTask, _audioOut] = performAudioForwarding(
1218
- ttsStream,
1915
+ ttsGenData.audioStream,
1219
1916
  audioOutput,
1220
1917
  replyAbortController,
1221
1918
  );
1222
1919
  audioOut = _audioOut;
1223
1920
  tasks.push(forwardTask);
1224
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1921
+ audioOut.firstFrameFut.await
1922
+ .then((ts) => onFirstFrame(ts))
1923
+ .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
1225
1924
  } else {
1226
- throw Error('ttsStream is null when audioOutput is enabled');
1925
+ throw Error('ttsGenData is null when audioOutput is enabled');
1227
1926
  }
1228
1927
  } else {
1229
- textOut?.firstTextFut.await.finally(onFirstFrame);
1928
+ textOut?.firstTextFut.await
1929
+ .then(() => onFirstFrame())
1930
+ .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
1230
1931
  }
1231
1932
 
1232
1933
  //TODO(AJS-272): before executing tools, make sure we generated all the text
1233
1934
  // (this ensure everything is kept ordered)
1234
1935
 
1235
- const onToolExecutionStarted = (_: FunctionCall) => {
1236
- // TODO(brian): handle speech_handle item_added
1936
+ const onToolExecutionStarted = (f: FunctionCall) => {
1937
+ speechHandle._itemAdded([f]);
1938
+ this.agent._chatCtx.items.push(f);
1939
+ this.agentSession._toolItemsAdded([f]);
1237
1940
  };
1238
1941
 
1239
- const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
1240
- // TODO(brian): handle speech_handle item_added
1942
+ const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
1943
+ if (out.toolCallOutput) {
1944
+ speechHandle._itemAdded([out.toolCallOutput]);
1945
+ }
1241
1946
  };
1242
1947
 
1243
1948
  const [executeToolsTask, toolOutput] = performToolExecutions({
@@ -1257,12 +1962,45 @@ export class AgentActivity implements RecognitionHooks {
1257
1962
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1258
1963
  }
1259
1964
 
1965
+ const agentStoppedSpeakingAt = Date.now();
1966
+ const assistantMetrics: MetricsReport = {};
1967
+
1968
+ if (llmGenData.ttft !== undefined) {
1969
+ assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds
1970
+ }
1971
+ if (ttsGenData?.ttfb !== undefined) {
1972
+ assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds
1973
+ }
1974
+ if (agentStartedSpeakingAt !== undefined) {
1975
+ assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
1976
+ assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
1977
+
1978
+ if (userMetrics?.stoppedSpeakingAt !== undefined) {
1979
+ const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
1980
+ assistantMetrics.e2eLatency = e2eLatency;
1981
+ span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency);
1982
+ }
1983
+ }
1984
+
1985
+ span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted);
1986
+ let hasSpeechMessage = false;
1987
+
1260
1988
  // add the tools messages that triggers this reply to the chat context
1261
1989
  if (toolsMessages) {
1262
1990
  for (const msg of toolsMessages) {
1263
1991
  msg.createdAt = replyStartedAt;
1264
1992
  }
1265
- this.agent._chatCtx.insert(toolsMessages);
1993
+ // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items
1994
+ // were already added by onToolExecutionStarted when the tool execution began.
1995
+ // Inserting function_calls again would create duplicates that break provider APIs
1996
+ // (e.g. Google's "function response parts != function call parts" error).
1997
+ const toolCallOutputs = toolsMessages.filter(
1998
+ (m): m is FunctionCallOutput => m.type === 'function_call_output',
1999
+ );
2000
+ if (toolCallOutputs.length > 0) {
2001
+ this.agent._chatCtx.insert(toolCallOutputs);
2002
+ this.agentSession._toolItemsAdded(toolCallOutputs);
2003
+ }
1266
2004
  }
1267
2005
 
1268
2006
  if (speechHandle.interrupted) {
@@ -1270,20 +2008,24 @@ export class AgentActivity implements RecognitionHooks {
1270
2008
  { speech_id: speechHandle.id },
1271
2009
  'Aborting all pipeline reply tasks due to interruption',
1272
2010
  );
2011
+
2012
+ // Stop playout ASAP (don't wait for cancellations), otherwise the segment may finish and we
2013
+ // will correctly (but undesirably) commit a long transcript even though the user said "stop".
2014
+ if (audioOutput) {
2015
+ audioOutput.clearBuffer();
2016
+ }
2017
+
1273
2018
  replyAbortController.abort();
1274
- await Promise.allSettled(
1275
- tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
1276
- );
2019
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1277
2020
 
1278
2021
  let forwardedText = textOut?.text || '';
1279
2022
 
1280
2023
  if (audioOutput) {
1281
- audioOutput.clearBuffer();
1282
2024
  const playbackEv = await audioOutput.waitForPlayout();
1283
- if (audioOut?.firstFrameFut.done) {
2025
+ if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
1284
2026
  // playback EV is valid only if the first frame was already played
1285
2027
  this.logger.info(
1286
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
2028
+ { speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
1287
2029
  'playout interrupted',
1288
2030
  );
1289
2031
  if (playbackEv.synchronizedTranscript) {
@@ -1295,43 +2037,54 @@ export class AgentActivity implements RecognitionHooks {
1295
2037
  }
1296
2038
 
1297
2039
  if (forwardedText) {
2040
+ hasSpeechMessage = true;
1298
2041
  const message = ChatMessage.create({
1299
2042
  role: 'assistant',
1300
2043
  content: forwardedText,
1301
2044
  id: llmGenData.id,
1302
2045
  interrupted: true,
1303
2046
  createdAt: replyStartedAt,
2047
+ metrics: assistantMetrics,
1304
2048
  });
1305
2049
  chatCtx.insert(message);
1306
2050
  this.agent._chatCtx.insert(message);
2051
+ speechHandle._itemAdded([message]);
1307
2052
  this.agentSession._conversationItemAdded(message);
2053
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText);
1308
2054
  }
1309
2055
 
1310
2056
  if (this.agentSession.agentState === 'speaking') {
1311
2057
  this.agentSession._updateAgentState('listening');
2058
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
2059
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
2060
+ this.restoreInterruptionByAudioActivity();
2061
+ }
1312
2062
  }
1313
2063
 
1314
2064
  this.logger.info(
1315
2065
  { speech_id: speechHandle.id, message: forwardedText },
1316
2066
  'playout completed with interrupt',
1317
2067
  );
1318
- // TODO(shubhra) add chat message to speech handle
1319
2068
  speechHandle._markGenerationDone();
1320
2069
  await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1321
2070
  return;
1322
2071
  }
1323
2072
 
1324
2073
  if (textOut && textOut.text) {
2074
+ hasSpeechMessage = true;
1325
2075
  const message = ChatMessage.create({
1326
2076
  role: 'assistant',
1327
2077
  id: llmGenData.id,
1328
2078
  interrupted: false,
1329
2079
  createdAt: replyStartedAt,
1330
2080
  content: textOut.text,
2081
+ metrics: assistantMetrics,
1331
2082
  });
1332
2083
  chatCtx.insert(message);
1333
2084
  this.agent._chatCtx.insert(message);
2085
+ speechHandle._itemAdded([message]);
1334
2086
  this.agentSession._conversationItemAdded(message);
2087
+ span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text);
1335
2088
  this.logger.info(
1336
2089
  { speech_id: speechHandle.id, message: textOut.text },
1337
2090
  'playout completed without interruption',
@@ -1342,6 +2095,12 @@ export class AgentActivity implements RecognitionHooks {
1342
2095
  this.agentSession._updateAgentState('thinking');
1343
2096
  } else if (this.agentSession.agentState === 'speaking') {
1344
2097
  this.agentSession._updateAgentState('listening');
2098
+ if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
2099
+ {
2100
+ this.audioRecognition.onEndOfAgentSpeech(Date.now());
2101
+ this.restoreInterruptionByAudioActivity();
2102
+ }
2103
+ }
1345
2104
  }
1346
2105
 
1347
2106
  // mark the playout done before waiting for the tool execution
@@ -1351,7 +2110,7 @@ export class AgentActivity implements RecognitionHooks {
1351
2110
  if (toolOutput.output.length === 0) return;
1352
2111
 
1353
2112
  // important: no agent output should be used after this point
1354
- const { maxToolSteps } = this.agentSession.options;
2113
+ const { maxToolSteps } = this.agentSession.sessionOptions;
1355
2114
  if (speechHandle.numSteps >= maxToolSteps) {
1356
2115
  this.logger.warn(
1357
2116
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1360,52 +2119,18 @@ export class AgentActivity implements RecognitionHooks {
1360
2119
  return;
1361
2120
  }
1362
2121
 
1363
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1364
- functionCalls: [],
1365
- functionCallOutputs: [],
1366
- });
1367
- let shouldGenerateToolReply: boolean = false;
1368
- let newAgentTask: Agent | null = null;
1369
- let ignoreTaskSwitch: boolean = false;
1370
-
1371
- for (const sanitizedOut of toolOutput.output) {
1372
- if (sanitizedOut.toolCallOutput !== undefined) {
1373
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1374
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1375
- if (sanitizedOut.replyRequired) {
1376
- shouldGenerateToolReply = true;
1377
- }
1378
- }
1379
-
1380
- if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
1381
- this.logger.error('expected to receive only one agent task from the tool executions');
1382
- ignoreTaskSwitch = true;
1383
- // TODO(brian): should we mark the function call as failed to notify the LLM?
1384
- }
1385
-
1386
- newAgentTask = sanitizedOut.agentTask ?? null;
1387
-
1388
- this.logger.debug(
1389
- {
1390
- speechId: speechHandle.id,
1391
- name: sanitizedOut.toolCall?.name,
1392
- args: sanitizedOut.toolCall.args,
1393
- output: sanitizedOut.toolCallOutput?.output,
1394
- isError: sanitizedOut.toolCallOutput?.isError,
1395
- },
1396
- 'Tool call execution finished',
1397
- );
1398
- }
2122
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
2123
+ this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1399
2124
 
1400
2125
  this.agentSession.emit(
1401
2126
  AgentSessionEventTypes.FunctionToolsExecuted,
1402
2127
  functionToolsExecutedEvent,
1403
2128
  );
1404
2129
 
1405
- let draining = this.draining;
2130
+ let schedulingPaused = this.schedulingPaused;
1406
2131
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1407
2132
  this.agentSession.updateAgent(newAgentTask);
1408
- draining = true;
2133
+ schedulingPaused = true;
1409
2134
  }
1410
2135
 
1411
2136
  const toolMessages = [
@@ -1415,28 +2140,19 @@ export class AgentActivity implements RecognitionHooks {
1415
2140
  if (shouldGenerateToolReply) {
1416
2141
  chatCtx.insert(toolMessages);
1417
2142
 
1418
- const handle = SpeechHandle.create({
1419
- allowInterruptions: speechHandle.allowInterruptions,
1420
- stepIndex: speechHandle._stepIndex + 1,
1421
- parent: speechHandle,
1422
- });
1423
- this.agentSession.emit(
1424
- AgentSessionEventTypes.SpeechCreated,
1425
- createSpeechCreatedEvent({
1426
- userInitiated: false,
1427
- source: 'tool_response',
1428
- speechHandle: handle,
1429
- }),
1430
- );
2143
+ // Increment step count on SAME handle (parity with Python agent_activity.py L2081)
2144
+ speechHandle._numSteps += 1;
1431
2145
 
1432
2146
  // Avoid setting tool_choice to "required" or a specific function when
1433
2147
  // passing tool response back to the LLM
1434
- const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
2148
+ const respondToolChoice =
2149
+ schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1435
2150
 
2151
+ // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140)
1436
2152
  const toolResponseTask = this.createSpeechTask({
1437
- task: Task.from(() =>
2153
+ taskFn: () =>
1438
2154
  this.pipelineReplyTask(
1439
- handle,
2155
+ speechHandle,
1440
2156
  chatCtx,
1441
2157
  toolCtx,
1442
2158
  { toolChoice: respondToolChoice },
@@ -1444,22 +2160,61 @@ export class AgentActivity implements RecognitionHooks {
1444
2160
  instructions,
1445
2161
  undefined,
1446
2162
  toolMessages,
2163
+ hasSpeechMessage ? undefined : userMetrics,
1447
2164
  ),
1448
- ),
1449
- ownedSpeechHandle: handle,
2165
+ ownedSpeechHandle: speechHandle,
1450
2166
  name: 'AgentActivity.pipelineReply',
1451
2167
  });
1452
2168
 
1453
- toolResponseTask.finally(() => this.onPipelineReplyDone());
2169
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1454
2170
 
1455
- this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
2171
+ this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1456
2172
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1457
2173
  for (const msg of toolMessages) {
1458
2174
  msg.createdAt = replyStartedAt;
1459
2175
  }
1460
- this.agent._chatCtx.insert(toolMessages);
2176
+
2177
+ const toolCallOutputs = toolMessages.filter(
2178
+ (m): m is FunctionCallOutput => m.type === 'function_call_output',
2179
+ );
2180
+
2181
+ if (toolCallOutputs.length > 0) {
2182
+ this.agent._chatCtx.insert(toolCallOutputs);
2183
+ this.agentSession._toolItemsAdded(toolCallOutputs);
2184
+ }
1461
2185
  }
1462
- }
2186
+ };
2187
+
2188
+ private pipelineReplyTask = async (
2189
+ speechHandle: SpeechHandle,
2190
+ chatCtx: ChatContext,
2191
+ toolCtx: ToolContext,
2192
+ modelSettings: ModelSettings,
2193
+ replyAbortController: AbortController,
2194
+ instructions?: string,
2195
+ newMessage?: ChatMessage,
2196
+ toolsMessages?: ChatItem[],
2197
+ _previousUserMetrics?: MetricsReport,
2198
+ ): Promise<void> =>
2199
+ tracer.startActiveSpan(
2200
+ async (span) =>
2201
+ this._pipelineReplyTaskImpl({
2202
+ speechHandle,
2203
+ chatCtx,
2204
+ toolCtx,
2205
+ modelSettings,
2206
+ replyAbortController,
2207
+ instructions,
2208
+ newMessage,
2209
+ toolsMessages,
2210
+ span,
2211
+ _previousUserMetrics,
2212
+ }),
2213
+ {
2214
+ name: 'agent_turn',
2215
+ context: this.agentSession.rootSpanContext,
2216
+ },
2217
+ );
1463
2218
 
1464
2219
  private async realtimeGenerationTask(
1465
2220
  speechHandle: SpeechHandle,
@@ -1467,6 +2222,44 @@ export class AgentActivity implements RecognitionHooks {
1467
2222
  modelSettings: ModelSettings,
1468
2223
  replyAbortController: AbortController,
1469
2224
  ): Promise<void> {
2225
+ return tracer.startActiveSpan(
2226
+ async (span) =>
2227
+ this._realtimeGenerationTaskImpl({
2228
+ speechHandle,
2229
+ ev,
2230
+ modelSettings,
2231
+ replyAbortController,
2232
+ span,
2233
+ }),
2234
+ {
2235
+ name: 'agent_turn',
2236
+ context: this.agentSession.rootSpanContext,
2237
+ },
2238
+ );
2239
+ }
2240
+
2241
+ private async _realtimeGenerationTaskImpl({
2242
+ speechHandle,
2243
+ ev,
2244
+ modelSettings,
2245
+ replyAbortController,
2246
+ span,
2247
+ }: {
2248
+ speechHandle: SpeechHandle;
2249
+ ev: GenerationCreatedEvent;
2250
+ modelSettings: ModelSettings;
2251
+ replyAbortController: AbortController;
2252
+ span: Span;
2253
+ }): Promise<void> {
2254
+ speechHandle._agentTurnContext = otelContext.active();
2255
+
2256
+ span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
2257
+
2258
+ const localParticipant = this.agentSession._roomIO?.localParticipant;
2259
+ if (localParticipant) {
2260
+ setParticipantSpanAttributes(span, localParticipant);
2261
+ }
2262
+
1470
2263
  speechHandleStorage.enterWith(speechHandle);
1471
2264
 
1472
2265
  if (!this.realtimeSession) {
@@ -1476,6 +2269,12 @@ export class AgentActivity implements RecognitionHooks {
1476
2269
  throw new Error('llm is not a realtime model');
1477
2270
  }
1478
2271
 
2272
+ // Store span for metrics recording when they arrive later
2273
+ span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
2274
+ if (this.realtimeSpans && ev.responseId) {
2275
+ this.realtimeSpans.set(ev.responseId, span);
2276
+ }
2277
+
1479
2278
  this.logger.debug(
1480
2279
  { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
1481
2280
  'realtime generation started',
@@ -1496,14 +2295,21 @@ export class AgentActivity implements RecognitionHooks {
1496
2295
  return;
1497
2296
  }
1498
2297
 
1499
- const onFirstFrame = () => {
1500
- this.agentSession._updateAgentState('speaking');
2298
+ const onFirstFrame = (startedSpeakingAt?: number) => {
2299
+ this.agentSession._updateAgentState('speaking', {
2300
+ startTime: startedSpeakingAt,
2301
+ otelContext: speechHandle._agentTurnContext,
2302
+ });
1501
2303
  };
1502
2304
 
1503
2305
  const readMessages = async (
1504
2306
  abortController: AbortController,
1505
- outputs: Array<[string, _TextOut | null, _AudioOut | null]>,
2307
+ outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
1506
2308
  ) => {
2309
+ replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
2310
+ once: true,
2311
+ });
2312
+
1507
2313
  const forwardTasks: Array<Task<void>> = [];
1508
2314
  try {
1509
2315
  for await (const msg of ev.messageStream) {
@@ -1513,7 +2319,25 @@ export class AgentActivity implements RecognitionHooks {
1513
2319
  );
1514
2320
  break;
1515
2321
  }
1516
- const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
2322
+
2323
+ const msgModalities = msg.modalities ? await msg.modalities : undefined;
2324
+ let ttsTextInput: ReadableStream<string | TimedString> | null = null;
2325
+ let trTextInput: ReadableStream<string | TimedString>;
2326
+
2327
+ if (msgModalities && !msgModalities.includes('audio') && this.tts) {
2328
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
2329
+ this.logger.warn(
2330
+ 'text response received from realtime API, falling back to use a TTS model.',
2331
+ );
2332
+ }
2333
+ const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
2334
+ ttsTextInput = _ttsTextInput;
2335
+ trTextInput = _trTextInput;
2336
+ } else {
2337
+ trTextInput = msg.textStream;
2338
+ }
2339
+
2340
+ const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
1517
2341
  let textOut: _TextOut | null = null;
1518
2342
  if (trNodeResult) {
1519
2343
  const [textForwardTask, _textOut] = performTextForwarding(
@@ -1524,30 +2348,57 @@ export class AgentActivity implements RecognitionHooks {
1524
2348
  forwardTasks.push(textForwardTask);
1525
2349
  textOut = _textOut;
1526
2350
  }
2351
+
1527
2352
  let audioOut: _AudioOut | null = null;
1528
2353
  if (audioOutput) {
1529
- const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1530
- msg.audioStream,
1531
- modelSettings,
1532
- );
1533
- if (realtimeAudio) {
1534
- const [forwardTask, _audioOut] = performAudioForwarding(
1535
- realtimeAudio,
1536
- audioOutput,
2354
+ let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;
2355
+
2356
+ if (ttsTextInput) {
2357
+ const [ttsTask, ttsGenData] = performTTSInference(
2358
+ (...args) => this.agent.ttsNode(...args),
2359
+ ttsTextInput,
2360
+ modelSettings,
1537
2361
  abortController,
2362
+ this.tts?.model,
2363
+ this.tts?.provider,
2364
+ );
2365
+ tasks.push(ttsTask);
2366
+ realtimeAudioResult = ttsGenData.audioStream;
2367
+ } else if (msgModalities && msgModalities.includes('audio')) {
2368
+ realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
2369
+ msg.audioStream,
2370
+ modelSettings,
2371
+ );
2372
+ } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
2373
+ this.logger.error(
2374
+ 'Text message received from Realtime API with audio modality. ' +
2375
+ 'This usually happens when text chat context is synced to the API. ' +
2376
+ 'Try to add a TTS model as fallback or use text modality with TTS instead.',
1538
2377
  );
1539
- forwardTasks.push(forwardTask);
1540
- audioOut = _audioOut;
1541
- audioOut.firstFrameFut.await.finally(onFirstFrame);
1542
2378
  } else {
1543
2379
  this.logger.warn(
1544
2380
  'audio output is enabled but neither tts nor realtime audio is available',
1545
2381
  );
1546
2382
  }
2383
+
2384
+ if (realtimeAudioResult) {
2385
+ const [forwardTask, _audioOut] = performAudioForwarding(
2386
+ realtimeAudioResult,
2387
+ audioOutput,
2388
+ abortController,
2389
+ );
2390
+ forwardTasks.push(forwardTask);
2391
+ audioOut = _audioOut;
2392
+ audioOut.firstFrameFut.await
2393
+ .then((ts) => onFirstFrame(ts))
2394
+ .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
2395
+ }
1547
2396
  } else if (textOut) {
1548
- textOut.firstTextFut.await.finally(onFirstFrame);
2397
+ textOut.firstTextFut.await
2398
+ .then(() => onFirstFrame())
2399
+ .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
1549
2400
  }
1550
- outputs.push([msg.messageId, textOut, audioOut]);
2401
+ outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
1551
2402
  }
1552
2403
  await waitFor(forwardTasks);
1553
2404
  } catch (error) {
@@ -1557,11 +2408,13 @@ export class AgentActivity implements RecognitionHooks {
1557
2408
  }
1558
2409
  };
1559
2410
 
1560
- const messageOutputs: Array<[string, _TextOut | null, _AudioOut | null]> = [];
2411
+ const messageOutputs: Array<
2412
+ [string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
2413
+ > = [];
1561
2414
  const tasks = [
1562
2415
  Task.from(
1563
2416
  (controller) => readMessages(controller, messageOutputs),
1564
- replyAbortController,
2417
+ undefined,
1565
2418
  'AgentActivity.realtime_generation.read_messages',
1566
2419
  ),
1567
2420
  ];
@@ -1598,6 +2451,8 @@ export class AgentActivity implements RecognitionHooks {
1598
2451
 
1599
2452
  const onToolExecutionStarted = (f: FunctionCall) => {
1600
2453
  speechHandle._itemAdded([f]);
2454
+ this.agent._chatCtx.items.push(f);
2455
+ this.agentSession._toolItemsAdded([f]);
1601
2456
  };
1602
2457
 
1603
2458
  const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
@@ -1623,7 +2478,6 @@ export class AgentActivity implements RecognitionHooks {
1623
2478
 
1624
2479
  if (audioOutput) {
1625
2480
  await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1626
- this.agentSession._updateAgentState('listening');
1627
2481
  }
1628
2482
 
1629
2483
  if (speechHandle.interrupted) {
@@ -1636,17 +2490,17 @@ export class AgentActivity implements RecognitionHooks {
1636
2490
 
1637
2491
  if (messageOutputs.length > 0) {
1638
2492
  // there should be only one message
1639
- const [msgId, textOut, audioOut] = messageOutputs[0]!;
2493
+ const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
1640
2494
  let forwardedText = textOut?.text || '';
1641
2495
 
1642
2496
  if (audioOutput) {
1643
2497
  audioOutput.clearBuffer();
1644
2498
  const playbackEv = await audioOutput.waitForPlayout();
1645
- let playbackPosition = playbackEv.playbackPosition;
1646
- if (audioOut?.firstFrameFut.done) {
2499
+ let playbackPositionInS = playbackEv.playbackPosition;
2500
+ if (audioOut?.firstFrameFut.done && !audioOut.firstFrameFut.rejected) {
1647
2501
  // playback EV is valid only if the first frame was already played
1648
2502
  this.logger.info(
1649
- { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
2503
+ { speech_id: speechHandle.id, playbackPositionInS },
1650
2504
  'playout interrupted',
1651
2505
  );
1652
2506
  if (playbackEv.synchronizedTranscript) {
@@ -1654,13 +2508,15 @@ export class AgentActivity implements RecognitionHooks {
1654
2508
  }
1655
2509
  } else {
1656
2510
  forwardedText = '';
1657
- playbackPosition = 0;
2511
+ playbackPositionInS = 0;
1658
2512
  }
1659
2513
 
1660
2514
  // truncate server-side message
1661
2515
  this.realtimeSession.truncate({
1662
2516
  messageId: msgId,
1663
- audioEndMs: Math.floor(playbackPosition),
2517
+ audioEndMs: Math.floor(playbackPositionInS * 1000),
2518
+ modalities: msgModalities,
2519
+ audioTranscript: forwardedText,
1664
2520
  });
1665
2521
  }
1666
2522
 
@@ -1691,7 +2547,7 @@ export class AgentActivity implements RecognitionHooks {
1691
2547
 
1692
2548
  if (messageOutputs.length > 0) {
1693
2549
  // there should be only one message
1694
- const [msgId, textOut, _] = messageOutputs[0]!;
2550
+ const [msgId, textOut, _, __] = messageOutputs[0]!;
1695
2551
  const message = ChatMessage.create({
1696
2552
  role: 'assistant',
1697
2553
  content: textOut?.text || '',
@@ -1708,16 +2564,20 @@ export class AgentActivity implements RecognitionHooks {
1708
2564
  speechHandle._markGenerationDone();
1709
2565
  // TODO(brian): close tees
1710
2566
 
1711
- toolOutput.firstToolStartedFuture.await.finally(() => {
1712
- this.agentSession._updateAgentState('thinking');
1713
- });
1714
-
1715
2567
  await executeToolsTask.result;
1716
2568
 
1717
- if (toolOutput.output.length === 0) return;
2569
+ if (toolOutput.output.length > 0) {
2570
+ this.agentSession._updateAgentState('thinking');
2571
+ } else if (this.agentSession.agentState === 'speaking') {
2572
+ this.agentSession._updateAgentState('listening');
2573
+ }
2574
+
2575
+ if (toolOutput.output.length === 0) {
2576
+ return;
2577
+ }
1718
2578
 
1719
2579
  // important: no agent ouput should be used after this point
1720
- const { maxToolSteps } = this.agentSession.options;
2580
+ const { maxToolSteps } = this.agentSession.sessionOptions;
1721
2581
  if (speechHandle.numSteps >= maxToolSteps) {
1722
2582
  this.logger.warn(
1723
2583
  { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
@@ -1726,55 +2586,42 @@ export class AgentActivity implements RecognitionHooks {
1726
2586
  return;
1727
2587
  }
1728
2588
 
1729
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1730
- functionCalls: [],
1731
- functionCallOutputs: [],
1732
- });
1733
- let shouldGenerateToolReply: boolean = false;
1734
- let newAgentTask: Agent | null = null;
1735
- let ignoreTaskSwitch: boolean = false;
1736
-
1737
- for (const sanitizedOut of toolOutput.output) {
1738
- if (sanitizedOut.toolCallOutput !== undefined) {
1739
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1740
- if (sanitizedOut.replyRequired) {
1741
- shouldGenerateToolReply = true;
1742
- }
1743
- }
1744
-
1745
- if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
1746
- this.logger.error('expected to receive only one agent task from the tool executions');
1747
- ignoreTaskSwitch = true;
1748
- }
1749
-
1750
- newAgentTask = sanitizedOut.agentTask ?? null;
1751
-
1752
- this.logger.debug(
1753
- {
1754
- speechId: speechHandle.id,
1755
- name: sanitizedOut.toolCall?.name,
1756
- args: sanitizedOut.toolCall.args,
1757
- output: sanitizedOut.toolCallOutput?.output,
1758
- isError: sanitizedOut.toolCallOutput?.isError,
1759
- },
1760
- 'Tool call execution finished',
1761
- );
1762
- }
2589
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } =
2590
+ this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1763
2591
 
1764
2592
  this.agentSession.emit(
1765
2593
  AgentSessionEventTypes.FunctionToolsExecuted,
1766
2594
  functionToolsExecutedEvent,
1767
2595
  );
1768
2596
 
1769
- let draining = this.draining;
2597
+ let schedulingPaused = this.schedulingPaused;
1770
2598
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1771
2599
  this.agentSession.updateAgent(newAgentTask);
1772
- draining = true;
2600
+ schedulingPaused = true;
1773
2601
  }
1774
2602
 
1775
2603
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
2604
+ // wait all speeches played before updating the tool output and generating the response
2605
+ // most realtime models dont support generating multiple responses at the same time
2606
+ while (this.currentSpeech || this.speechQueue.size() > 0) {
2607
+ if (
2608
+ this.currentSpeech &&
2609
+ !this.currentSpeech.done() &&
2610
+ this.currentSpeech !== speechHandle
2611
+ ) {
2612
+ await this.currentSpeech.waitForPlayout();
2613
+ } else {
2614
+ // Don't block the event loop
2615
+ await new Promise((resolve) => setImmediate(resolve));
2616
+ }
2617
+ }
1776
2618
  const chatCtx = this.realtimeSession.chatCtx.copy();
1777
2619
  chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
2620
+
2621
+ this.agentSession._toolItemsAdded(
2622
+ functionToolsExecutedEvent.functionCallOutputs as FunctionCallOutput[],
2623
+ );
2624
+
1778
2625
  try {
1779
2626
  await this.realtimeSession.updateChatCtx(chatCtx);
1780
2627
  } catch (error) {
@@ -1806,15 +2653,14 @@ export class AgentActivity implements RecognitionHooks {
1806
2653
  }),
1807
2654
  );
1808
2655
 
1809
- const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
2656
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
1810
2657
  this.createSpeechTask({
1811
- task: Task.from((abortController: AbortController) =>
2658
+ taskFn: (abortController: AbortController) =>
1812
2659
  this.realtimeReplyTask({
1813
2660
  speechHandle: replySpeechHandle,
1814
2661
  modelSettings: { toolChoice },
1815
2662
  abortController,
1816
2663
  }),
1817
- ),
1818
2664
  ownedSpeechHandle: replySpeechHandle,
1819
2665
  name: 'AgentActivity.realtime_reply',
1820
2666
  });
@@ -1822,6 +2668,53 @@ export class AgentActivity implements RecognitionHooks {
1822
2668
  this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1823
2669
  }
1824
2670
 
2671
+ private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) {
2672
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
2673
+ functionCalls: [],
2674
+ functionCallOutputs: [],
2675
+ });
2676
+
2677
+ let shouldGenerateToolReply = false;
2678
+ let newAgentTask: Agent | null = null;
2679
+ let ignoreTaskSwitch = false;
2680
+
2681
+ for (const sanitizedOut of toolOutput.output) {
2682
+ if (sanitizedOut.toolCallOutput !== undefined) {
2683
+ // Keep event payload symmetric for pipeline + realtime paths.
2684
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
2685
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
2686
+ if (sanitizedOut.replyRequired) {
2687
+ shouldGenerateToolReply = true;
2688
+ }
2689
+ }
2690
+
2691
+ if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
2692
+ this.logger.error('expected to receive only one agent task from the tool executions');
2693
+ ignoreTaskSwitch = true;
2694
+ }
2695
+
2696
+ newAgentTask = sanitizedOut.agentTask ?? null;
2697
+
2698
+ this.logger.debug(
2699
+ {
2700
+ speechId: speechHandle.id,
2701
+ name: sanitizedOut.toolCall?.name,
2702
+ args: sanitizedOut.toolCall.args,
2703
+ output: sanitizedOut.toolCallOutput?.output,
2704
+ isError: sanitizedOut.toolCallOutput?.isError,
2705
+ },
2706
+ 'Tool call execution finished',
2707
+ );
2708
+ }
2709
+
2710
+ return {
2711
+ functionToolsExecutedEvent,
2712
+ shouldGenerateToolReply,
2713
+ newAgentTask,
2714
+ ignoreTaskSwitch,
2715
+ };
2716
+ }
2717
+
1825
2718
  private async realtimeReplyTask({
1826
2719
  speechHandle,
1827
2720
  modelSettings: { toolChoice },
@@ -1880,10 +2773,10 @@ export class AgentActivity implements RecognitionHooks {
1880
2773
  priority: number,
1881
2774
  force: boolean = false,
1882
2775
  ): void {
1883
- // when force=true, we allow tool responses to bypass draining
2776
+ // when force=true, we allow tool responses to bypass scheduling pause
1884
2777
  // This allows for tool responses to be generated before the AgentActivity is finalized
1885
- if (this.draining && !force) {
1886
- throw new Error('cannot schedule new speech, the agent is draining');
2778
+ if (this.schedulingPaused && !force) {
2779
+ throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing');
1887
2780
  }
1888
2781
 
1889
2782
  // Monotonic time to avoid near 0 collisions
@@ -1892,19 +2785,77 @@ export class AgentActivity implements RecognitionHooks {
1892
2785
  this.wakeupMainTask();
1893
2786
  }
1894
2787
 
2788
+ private async _pauseSchedulingTask(blockedTasks: Task<any>[]): Promise<void> {
2789
+ if (this._schedulingPaused) return;
2790
+
2791
+ this._schedulingPaused = true;
2792
+ this._drainBlockedTasks = blockedTasks;
2793
+ this.wakeupMainTask();
2794
+
2795
+ if (this._mainTask) {
2796
+ // When pausing/draining, we ensure that all speech_tasks complete fully.
2797
+ // This means that even if the SpeechHandle themselves have finished,
2798
+ // we still wait for the entire execution (e.g function_tools)
2799
+ await this._mainTask.result;
2800
+ }
2801
+ }
2802
+
2803
+ private _resumeSchedulingTask(): void {
2804
+ if (!this._schedulingPaused) return;
2805
+
2806
+ this._schedulingPaused = false;
2807
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
2808
+ }
2809
+
2810
+ async pause(options: { blockedTasks?: Task<any>[] } = {}): Promise<void> {
2811
+ const { blockedTasks = [] } = options;
2812
+ const unlock = await this.lock.lock();
2813
+
2814
+ try {
2815
+ const span = tracer.startSpan({
2816
+ name: 'pause_agent_activity',
2817
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2818
+ });
2819
+ try {
2820
+ await this._pauseSchedulingTask(blockedTasks);
2821
+ await this._closeSessionResources();
2822
+ } finally {
2823
+ span.end();
2824
+ }
2825
+ } finally {
2826
+ unlock();
2827
+ }
2828
+ }
2829
+
1895
2830
  async drain(): Promise<void> {
2831
+ // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
2832
+ return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
2833
+ name: 'drain_agent_activity',
2834
+ context: ROOT_CONTEXT,
2835
+ });
2836
+ }
2837
+
2838
+ private async _drainImpl(span: Span): Promise<void> {
2839
+ span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
2840
+
1896
2841
  const unlock = await this.lock.lock();
1897
2842
  try {
1898
- if (this._draining) return;
2843
+ if (this._schedulingPaused) return;
1899
2844
 
1900
- this.createSpeechTask({
1901
- task: Task.from(() => this.agent.onExit()),
2845
+ this._onExitTask = this.createSpeechTask({
2846
+ taskFn: () =>
2847
+ tracer.startActiveSpan(async () => this.agent.onExit(), {
2848
+ name: 'on_exit',
2849
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2850
+ }),
2851
+ inlineTask: true,
1902
2852
  name: 'AgentActivity_onExit',
1903
2853
  });
1904
2854
 
1905
- this.wakeupMainTask();
1906
- this._draining = true;
1907
- await this._mainTask?.result;
2855
+ this.cancelPreemptiveGeneration();
2856
+
2857
+ await this._onExitTask.result;
2858
+ await this._pauseSchedulingTask([]);
1908
2859
  } finally {
1909
2860
  unlock();
1910
2861
  }
@@ -1913,42 +2864,160 @@ export class AgentActivity implements RecognitionHooks {
1913
2864
  async close(): Promise<void> {
1914
2865
  const unlock = await this.lock.lock();
1915
2866
  try {
1916
- if (!this._draining) {
1917
- this.logger.warn('task closing without draining');
1918
- }
2867
+ this.cancelPreemptiveGeneration();
1919
2868
 
1920
- // Unregister event handlers to prevent duplicate metrics
1921
- if (this.llm instanceof LLM) {
1922
- this.llm.off('metrics_collected', this.onMetricsCollected);
1923
- }
1924
- if (this.realtimeSession) {
1925
- this.realtimeSession.off('generation_created', this.onGenerationCreated);
1926
- this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
1927
- this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
1928
- this.realtimeSession.off(
1929
- 'input_audio_transcription_completed',
1930
- this.onInputAudioTranscriptionCompleted,
1931
- );
1932
- this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
1933
- }
1934
- if (this.stt instanceof STT) {
1935
- this.stt.off('metrics_collected', this.onMetricsCollected);
2869
+ await cancelAndWait(Array.from(this.speechTasks), AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
2870
+
2871
+ if (this._currentSpeech && !this._currentSpeech.done()) {
2872
+ this._currentSpeech._markDone();
1936
2873
  }
1937
- if (this.tts instanceof TTS) {
1938
- this.tts.off('metrics_collected', this.onMetricsCollected);
2874
+
2875
+ await this._closeSessionResources();
2876
+
2877
+ if (this._mainTask) {
2878
+ await this._mainTask.cancelAndWait();
1939
2879
  }
1940
- if (this.vad instanceof VAD) {
1941
- this.vad.off('metrics_collected', this.onMetricsCollected);
2880
+ if (this.interruptionDetector) {
2881
+ this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
2882
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2883
+ this.interruptionDetector.off('error', this.onInterruptionError);
1942
2884
  }
1943
2885
 
1944
- this.detachAudioInput();
1945
- await this.realtimeSession?.close();
1946
- await this.audioRecognition?.close();
1947
- await this._mainTask?.cancelAndWait();
2886
+ this.agent._agentActivity = undefined;
1948
2887
  } finally {
1949
2888
  unlock();
1950
2889
  }
1951
2890
  }
2891
+
2892
+ private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
2893
+ const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode;
2894
+ const sessionInterruptionDetection = this.agentSession.interruptionDetection;
2895
+ if (
2896
+ !(
2897
+ this.stt &&
2898
+ this.stt.capabilities.alignedTranscript &&
2899
+ this.stt.capabilities.streaming &&
2900
+ this.vad &&
2901
+ this.turnDetection !== 'manual' &&
2902
+ this.turnDetection !== 'realtime_llm' &&
2903
+ !(this.llm instanceof RealtimeModel)
2904
+ )
2905
+ ) {
2906
+ if (
2907
+ agentInterruptionDetection === 'adaptive' ||
2908
+ sessionInterruptionDetection === 'adaptive'
2909
+ ) {
2910
+ this.logger.warn(
2911
+ "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled",
2912
+ );
2913
+ }
2914
+ return undefined;
2915
+ }
2916
+
2917
+ if (!this.allowInterruptions) {
2918
+ return undefined;
2919
+ }
2920
+
2921
+ if (agentInterruptionDetection === 'vad') {
2922
+ return undefined;
2923
+ }
2924
+
2925
+ if (sessionInterruptionDetection === 'vad') {
2926
+ return undefined;
2927
+ }
2928
+
2929
+ if (
2930
+ agentInterruptionDetection === undefined &&
2931
+ sessionInterruptionDetection === undefined &&
2932
+ !isHosted() &&
2933
+ !isDevMode()
2934
+ ) {
2935
+ this.logger.info('adaptive interruption is disabled by default in production mode');
2936
+ return undefined;
2937
+ }
2938
+
2939
+ try {
2940
+ const detector = new AdaptiveInterruptionDetector();
2941
+
2942
+ detector.on('overlapping_speech', this.onInterruptionOverlappingSpeech);
2943
+ detector.on('metrics_collected', this.onInterruptionMetricsCollected);
2944
+ detector.on('error', this.onInterruptionError);
2945
+
2946
+ return detector;
2947
+ } catch (error: unknown) {
2948
+ this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector');
2949
+ }
2950
+ return undefined;
2951
+ }
2952
+
2953
+ private restoreInterruptionByAudioActivity(): void {
2954
+ this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled;
2955
+ }
2956
+
2957
+ private fallbackToVadInterruption(): void {
2958
+ if (!this.isInterruptionDetectionEnabled) return;
2959
+
2960
+ this.isInterruptionDetectionEnabled = false;
2961
+ this.restoreInterruptionByAudioActivity();
2962
+
2963
+ if (this.interruptionDetector) {
2964
+ this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech);
2965
+ this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected);
2966
+ this.interruptionDetector.off('error', this.onInterruptionError);
2967
+ this.interruptionDetector = undefined;
2968
+ }
2969
+
2970
+ if (this.audioRecognition) {
2971
+ this.audioRecognition.disableInterruptionDetection().catch((err) => {
2972
+ this.logger.warn({ err }, 'error while disabling interruption detection');
2973
+ });
2974
+ }
2975
+
2976
+ this.logger.warn(
2977
+ 'adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption',
2978
+ );
2979
+ }
2980
+
2981
+ private async _closeSessionResources(): Promise<void> {
2982
+ // Unregister event handlers to prevent duplicate metrics
2983
+ if (this.llm instanceof LLM) {
2984
+ this.llm.off('metrics_collected', this.onMetricsCollected);
2985
+ this.llm.off('error', this.onModelError);
2986
+ }
2987
+
2988
+ if (this.realtimeSession) {
2989
+ this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated);
2990
+ this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted);
2991
+ this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped);
2992
+ this.realtimeSession.off(
2993
+ 'input_audio_transcription_completed',
2994
+ this.onRealtimeInputAudioTranscriptionCompleted,
2995
+ );
2996
+ this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
2997
+ this.realtimeSession.off('error', this.onModelError);
2998
+ }
2999
+
3000
+ if (this.stt instanceof STT) {
3001
+ this.stt.off('metrics_collected', this.onMetricsCollected);
3002
+ this.stt.off('error', this.onModelError);
3003
+ }
3004
+
3005
+ if (this.tts instanceof TTS) {
3006
+ this.tts.off('metrics_collected', this.onMetricsCollected);
3007
+ this.tts.off('error', this.onModelError);
3008
+ }
3009
+
3010
+ if (this.vad instanceof VAD) {
3011
+ this.vad.off('metrics_collected', this.onMetricsCollected);
3012
+ }
3013
+
3014
+ this.detachAudioInput();
3015
+ this.realtimeSpans?.clear();
3016
+ await this.realtimeSession?.close();
3017
+ await this.audioRecognition?.close();
3018
+ this.realtimeSession = undefined;
3019
+ this.audioRecognition = undefined;
3020
+ }
1952
3021
  }
1953
3022
 
1954
3023
  function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {