@livekit/agents 0.0.0-20260120144724

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
Files changed (987)
  1. package/LICENSE +201 -0
  2. package/README.md +17 -0
  3. package/dist/_exceptions.cjs +109 -0
  4. package/dist/_exceptions.cjs.map +1 -0
  5. package/dist/_exceptions.d.cts +64 -0
  6. package/dist/_exceptions.d.ts +64 -0
  7. package/dist/_exceptions.d.ts.map +1 -0
  8. package/dist/_exceptions.js +80 -0
  9. package/dist/_exceptions.js.map +1 -0
  10. package/dist/audio.cjs +170 -0
  11. package/dist/audio.cjs.map +1 -0
  12. package/dist/audio.d.cts +46 -0
  13. package/dist/audio.d.ts +46 -0
  14. package/dist/audio.d.ts.map +1 -0
  15. package/dist/audio.js +133 -0
  16. package/dist/audio.js.map +1 -0
  17. package/dist/cli.cjs +171 -0
  18. package/dist/cli.cjs.map +1 -0
  19. package/dist/cli.d.cts +14 -0
  20. package/dist/cli.d.ts +14 -0
  21. package/dist/cli.d.ts.map +1 -0
  22. package/dist/cli.js +145 -0
  23. package/dist/cli.js.map +1 -0
  24. package/dist/connection_pool.cjs +242 -0
  25. package/dist/connection_pool.cjs.map +1 -0
  26. package/dist/connection_pool.d.cts +123 -0
  27. package/dist/connection_pool.d.ts +123 -0
  28. package/dist/connection_pool.d.ts.map +1 -0
  29. package/dist/connection_pool.js +218 -0
  30. package/dist/connection_pool.js.map +1 -0
  31. package/dist/connection_pool.test.cjs +256 -0
  32. package/dist/connection_pool.test.cjs.map +1 -0
  33. package/dist/connection_pool.test.js +255 -0
  34. package/dist/connection_pool.test.js.map +1 -0
  35. package/dist/constants.cjs +44 -0
  36. package/dist/constants.cjs.map +1 -0
  37. package/dist/constants.d.cts +7 -0
  38. package/dist/constants.d.ts +7 -0
  39. package/dist/constants.d.ts.map +1 -0
  40. package/dist/constants.js +15 -0
  41. package/dist/constants.js.map +1 -0
  42. package/dist/generator.cjs +36 -0
  43. package/dist/generator.cjs.map +1 -0
  44. package/dist/generator.d.cts +23 -0
  45. package/dist/generator.d.ts +23 -0
  46. package/dist/generator.d.ts.map +1 -0
  47. package/dist/generator.js +11 -0
  48. package/dist/generator.js.map +1 -0
  49. package/dist/http_server.cjs +75 -0
  50. package/dist/http_server.cjs.map +1 -0
  51. package/dist/http_server.d.cts +20 -0
  52. package/dist/http_server.d.ts +20 -0
  53. package/dist/http_server.d.ts.map +1 -0
  54. package/dist/http_server.js +51 -0
  55. package/dist/http_server.js.map +1 -0
  56. package/dist/index.cjs +100 -0
  57. package/dist/index.cjs.map +1 -0
  58. package/dist/index.d.cts +35 -0
  59. package/dist/index.d.ts +35 -0
  60. package/dist/index.d.ts.map +1 -0
  61. package/dist/index.js +40 -0
  62. package/dist/index.js.map +1 -0
  63. package/dist/inference/api_protos.cjs +104 -0
  64. package/dist/inference/api_protos.cjs.map +1 -0
  65. package/dist/inference/api_protos.d.cts +222 -0
  66. package/dist/inference/api_protos.d.ts +222 -0
  67. package/dist/inference/api_protos.d.ts.map +1 -0
  68. package/dist/inference/api_protos.js +70 -0
  69. package/dist/inference/api_protos.js.map +1 -0
  70. package/dist/inference/index.cjs +56 -0
  71. package/dist/inference/index.cjs.map +1 -0
  72. package/dist/inference/index.d.cts +8 -0
  73. package/dist/inference/index.d.ts +8 -0
  74. package/dist/inference/index.d.ts.map +1 -0
  75. package/dist/inference/index.js +23 -0
  76. package/dist/inference/index.js.map +1 -0
  77. package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs +152 -0
  78. package/dist/inference/interruption/AdaptiveInterruptionDetector.cjs.map +1 -0
  79. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.cts +50 -0
  80. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts +50 -0
  81. package/dist/inference/interruption/AdaptiveInterruptionDetector.d.ts.map +1 -0
  82. package/dist/inference/interruption/AdaptiveInterruptionDetector.js +125 -0
  83. package/dist/inference/interruption/AdaptiveInterruptionDetector.js.map +1 -0
  84. package/dist/inference/interruption/InterruptionStream.cjs +310 -0
  85. package/dist/inference/interruption/InterruptionStream.cjs.map +1 -0
  86. package/dist/inference/interruption/InterruptionStream.d.cts +57 -0
  87. package/dist/inference/interruption/InterruptionStream.d.ts +57 -0
  88. package/dist/inference/interruption/InterruptionStream.d.ts.map +1 -0
  89. package/dist/inference/interruption/InterruptionStream.js +288 -0
  90. package/dist/inference/interruption/InterruptionStream.js.map +1 -0
  91. package/dist/inference/interruption/defaults.cjs +76 -0
  92. package/dist/inference/interruption/defaults.cjs.map +1 -0
  93. package/dist/inference/interruption/defaults.d.cts +14 -0
  94. package/dist/inference/interruption/defaults.d.ts +14 -0
  95. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  96. package/dist/inference/interruption/defaults.js +42 -0
  97. package/dist/inference/interruption/defaults.js.map +1 -0
  98. package/dist/inference/interruption/errors.cjs +2 -0
  99. package/dist/inference/interruption/errors.cjs.map +1 -0
  100. package/dist/inference/interruption/errors.d.cts +2 -0
  101. package/dist/inference/interruption/errors.d.ts +2 -0
  102. package/dist/inference/interruption/errors.d.ts.map +1 -0
  103. package/dist/inference/interruption/errors.js +1 -0
  104. package/dist/inference/interruption/errors.js.map +1 -0
  105. package/dist/inference/interruption/http_transport.cjs +57 -0
  106. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  107. package/dist/inference/interruption/http_transport.d.cts +23 -0
  108. package/dist/inference/interruption/http_transport.d.ts +23 -0
  109. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  110. package/dist/inference/interruption/http_transport.js +33 -0
  111. package/dist/inference/interruption/http_transport.js.map +1 -0
  112. package/dist/inference/interruption/index.cjs +34 -0
  113. package/dist/inference/interruption/index.cjs.map +1 -0
  114. package/dist/inference/interruption/index.d.cts +5 -0
  115. package/dist/inference/interruption/index.d.ts +5 -0
  116. package/dist/inference/interruption/index.d.ts.map +1 -0
  117. package/dist/inference/interruption/index.js +7 -0
  118. package/dist/inference/interruption/index.js.map +1 -0
  119. package/dist/inference/interruption/interruption.cjs +85 -0
  120. package/dist/inference/interruption/interruption.cjs.map +1 -0
  121. package/dist/inference/interruption/interruption.d.cts +48 -0
  122. package/dist/inference/interruption/interruption.d.ts +48 -0
  123. package/dist/inference/interruption/interruption.d.ts.map +1 -0
  124. package/dist/inference/interruption/interruption.js +59 -0
  125. package/dist/inference/interruption/interruption.js.map +1 -0
  126. package/dist/inference/llm.cjs +347 -0
  127. package/dist/inference/llm.cjs.map +1 -0
  128. package/dist/inference/llm.d.cts +114 -0
  129. package/dist/inference/llm.d.ts +114 -0
  130. package/dist/inference/llm.d.ts.map +1 -0
  131. package/dist/inference/llm.js +318 -0
  132. package/dist/inference/llm.js.map +1 -0
  133. package/dist/inference/stt.cjs +371 -0
  134. package/dist/inference/stt.cjs.map +1 -0
  135. package/dist/inference/stt.d.cts +91 -0
  136. package/dist/inference/stt.d.ts +91 -0
  137. package/dist/inference/stt.d.ts.map +1 -0
  138. package/dist/inference/stt.js +350 -0
  139. package/dist/inference/stt.js.map +1 -0
  140. package/dist/inference/tts.cjs +439 -0
  141. package/dist/inference/tts.cjs.map +1 -0
  142. package/dist/inference/tts.d.cts +80 -0
  143. package/dist/inference/tts.d.ts +80 -0
  144. package/dist/inference/tts.d.ts.map +1 -0
  145. package/dist/inference/tts.js +417 -0
  146. package/dist/inference/tts.js.map +1 -0
  147. package/dist/inference/utils.cjs +89 -0
  148. package/dist/inference/utils.cjs.map +1 -0
  149. package/dist/inference/utils.d.cts +6 -0
  150. package/dist/inference/utils.d.ts +6 -0
  151. package/dist/inference/utils.d.ts.map +1 -0
  152. package/dist/inference/utils.js +63 -0
  153. package/dist/inference/utils.js.map +1 -0
  154. package/dist/inference/utils.test.cjs +20 -0
  155. package/dist/inference/utils.test.cjs.map +1 -0
  156. package/dist/inference/utils.test.js +19 -0
  157. package/dist/inference/utils.test.js.map +1 -0
  158. package/dist/inference_runner.cjs +37 -0
  159. package/dist/inference_runner.cjs.map +1 -0
  160. package/dist/inference_runner.d.cts +11 -0
  161. package/dist/inference_runner.d.ts +11 -0
  162. package/dist/inference_runner.d.ts.map +1 -0
  163. package/dist/inference_runner.js +13 -0
  164. package/dist/inference_runner.js.map +1 -0
  165. package/dist/ipc/index.cjs +23 -0
  166. package/dist/ipc/index.cjs.map +1 -0
  167. package/dist/ipc/index.d.cts +2 -0
  168. package/dist/ipc/index.d.ts +2 -0
  169. package/dist/ipc/index.d.ts.map +1 -0
  170. package/dist/ipc/index.js +2 -0
  171. package/dist/ipc/index.js.map +1 -0
  172. package/dist/ipc/inference_executor.cjs +17 -0
  173. package/dist/ipc/inference_executor.cjs.map +1 -0
  174. package/dist/ipc/inference_executor.d.cts +4 -0
  175. package/dist/ipc/inference_executor.d.ts +4 -0
  176. package/dist/ipc/inference_executor.d.ts.map +1 -0
  177. package/dist/ipc/inference_executor.js +1 -0
  178. package/dist/ipc/inference_executor.js.map +1 -0
  179. package/dist/ipc/inference_proc_executor.cjs +101 -0
  180. package/dist/ipc/inference_proc_executor.cjs.map +1 -0
  181. package/dist/ipc/inference_proc_executor.d.cts +23 -0
  182. package/dist/ipc/inference_proc_executor.d.ts +23 -0
  183. package/dist/ipc/inference_proc_executor.d.ts.map +1 -0
  184. package/dist/ipc/inference_proc_executor.js +75 -0
  185. package/dist/ipc/inference_proc_executor.js.map +1 -0
  186. package/dist/ipc/inference_proc_lazy_main.cjs +86 -0
  187. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -0
  188. package/dist/ipc/inference_proc_lazy_main.d.cts +2 -0
  189. package/dist/ipc/inference_proc_lazy_main.d.ts +2 -0
  190. package/dist/ipc/inference_proc_lazy_main.d.ts.map +1 -0
  191. package/dist/ipc/inference_proc_lazy_main.js +85 -0
  192. package/dist/ipc/inference_proc_lazy_main.js.map +1 -0
  193. package/dist/ipc/job_executor.cjs +34 -0
  194. package/dist/ipc/job_executor.cjs.map +1 -0
  195. package/dist/ipc/job_executor.d.cts +18 -0
  196. package/dist/ipc/job_executor.d.ts +18 -0
  197. package/dist/ipc/job_executor.d.ts.map +1 -0
  198. package/dist/ipc/job_executor.js +10 -0
  199. package/dist/ipc/job_executor.js.map +1 -0
  200. package/dist/ipc/job_proc_executor.cjs +115 -0
  201. package/dist/ipc/job_proc_executor.cjs.map +1 -0
  202. package/dist/ipc/job_proc_executor.d.cts +19 -0
  203. package/dist/ipc/job_proc_executor.d.ts +19 -0
  204. package/dist/ipc/job_proc_executor.d.ts.map +1 -0
  205. package/dist/ipc/job_proc_executor.js +89 -0
  206. package/dist/ipc/job_proc_executor.js.map +1 -0
  207. package/dist/ipc/job_proc_lazy_main.cjs +210 -0
  208. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -0
  209. package/dist/ipc/job_proc_lazy_main.d.cts +2 -0
  210. package/dist/ipc/job_proc_lazy_main.d.ts +2 -0
  211. package/dist/ipc/job_proc_lazy_main.d.ts.map +1 -0
  212. package/dist/ipc/job_proc_lazy_main.js +187 -0
  213. package/dist/ipc/job_proc_lazy_main.js.map +1 -0
  214. package/dist/ipc/message.cjs +17 -0
  215. package/dist/ipc/message.cjs.map +1 -0
  216. package/dist/ipc/message.d.cts +58 -0
  217. package/dist/ipc/message.d.ts +58 -0
  218. package/dist/ipc/message.d.ts.map +1 -0
  219. package/dist/ipc/message.js +1 -0
  220. package/dist/ipc/message.js.map +1 -0
  221. package/dist/ipc/proc_pool.cjs +164 -0
  222. package/dist/ipc/proc_pool.cjs.map +1 -0
  223. package/dist/ipc/proc_pool.d.cts +31 -0
  224. package/dist/ipc/proc_pool.d.ts +31 -0
  225. package/dist/ipc/proc_pool.d.ts.map +1 -0
  226. package/dist/ipc/proc_pool.js +140 -0
  227. package/dist/ipc/proc_pool.js.map +1 -0
  228. package/dist/ipc/supervised_proc.cjs +229 -0
  229. package/dist/ipc/supervised_proc.cjs.map +1 -0
  230. package/dist/ipc/supervised_proc.d.cts +32 -0
  231. package/dist/ipc/supervised_proc.d.ts +32 -0
  232. package/dist/ipc/supervised_proc.d.ts.map +1 -0
  233. package/dist/ipc/supervised_proc.js +195 -0
  234. package/dist/ipc/supervised_proc.js.map +1 -0
  235. package/dist/ipc/supervised_proc.test.cjs +145 -0
  236. package/dist/ipc/supervised_proc.test.cjs.map +1 -0
  237. package/dist/ipc/supervised_proc.test.js +122 -0
  238. package/dist/ipc/supervised_proc.test.js.map +1 -0
  239. package/dist/job.cjs +373 -0
  240. package/dist/job.cjs.map +1 -0
  241. package/dist/job.d.cts +141 -0
  242. package/dist/job.d.ts +141 -0
  243. package/dist/job.d.ts.map +1 -0
  244. package/dist/job.js +332 -0
  245. package/dist/job.js.map +1 -0
  246. package/dist/llm/chat_context.cjs +527 -0
  247. package/dist/llm/chat_context.cjs.map +1 -0
  248. package/dist/llm/chat_context.d.cts +223 -0
  249. package/dist/llm/chat_context.d.ts +223 -0
  250. package/dist/llm/chat_context.d.ts.map +1 -0
  251. package/dist/llm/chat_context.js +496 -0
  252. package/dist/llm/chat_context.js.map +1 -0
  253. package/dist/llm/chat_context.test.cjs +911 -0
  254. package/dist/llm/chat_context.test.cjs.map +1 -0
  255. package/dist/llm/chat_context.test.js +916 -0
  256. package/dist/llm/chat_context.test.js.map +1 -0
  257. package/dist/llm/fallback_adapter.cjs +278 -0
  258. package/dist/llm/fallback_adapter.cjs.map +1 -0
  259. package/dist/llm/fallback_adapter.d.cts +73 -0
  260. package/dist/llm/fallback_adapter.d.ts +73 -0
  261. package/dist/llm/fallback_adapter.d.ts.map +1 -0
  262. package/dist/llm/fallback_adapter.js +254 -0
  263. package/dist/llm/fallback_adapter.js.map +1 -0
  264. package/dist/llm/fallback_adapter.test.cjs +176 -0
  265. package/dist/llm/fallback_adapter.test.cjs.map +1 -0
  266. package/dist/llm/fallback_adapter.test.js +175 -0
  267. package/dist/llm/fallback_adapter.test.js.map +1 -0
  268. package/dist/llm/index.cjs +79 -0
  269. package/dist/llm/index.cjs.map +1 -0
  270. package/dist/llm/index.d.cts +9 -0
  271. package/dist/llm/index.d.ts +9 -0
  272. package/dist/llm/index.d.ts.map +1 -0
  273. package/dist/llm/index.js +61 -0
  274. package/dist/llm/index.js.map +1 -0
  275. package/dist/llm/llm.cjs +226 -0
  276. package/dist/llm/llm.cjs.map +1 -0
  277. package/dist/llm/llm.d.cts +94 -0
  278. package/dist/llm/llm.d.ts +94 -0
  279. package/dist/llm/llm.d.ts.map +1 -0
  280. package/dist/llm/llm.js +201 -0
  281. package/dist/llm/llm.js.map +1 -0
  282. package/dist/llm/provider_format/google.cjs +132 -0
  283. package/dist/llm/provider_format/google.cjs.map +1 -0
  284. package/dist/llm/provider_format/google.d.cts +6 -0
  285. package/dist/llm/provider_format/google.d.ts +6 -0
  286. package/dist/llm/provider_format/google.d.ts.map +1 -0
  287. package/dist/llm/provider_format/google.js +108 -0
  288. package/dist/llm/provider_format/google.js.map +1 -0
  289. package/dist/llm/provider_format/google.test.cjs +724 -0
  290. package/dist/llm/provider_format/google.test.cjs.map +1 -0
  291. package/dist/llm/provider_format/google.test.js +728 -0
  292. package/dist/llm/provider_format/google.test.js.map +1 -0
  293. package/dist/llm/provider_format/index.cjs +40 -0
  294. package/dist/llm/provider_format/index.cjs.map +1 -0
  295. package/dist/llm/provider_format/index.d.cts +4 -0
  296. package/dist/llm/provider_format/index.d.ts +4 -0
  297. package/dist/llm/provider_format/index.d.ts.map +1 -0
  298. package/dist/llm/provider_format/index.js +16 -0
  299. package/dist/llm/provider_format/index.js.map +1 -0
  300. package/dist/llm/provider_format/openai.cjs +138 -0
  301. package/dist/llm/provider_format/openai.cjs.map +1 -0
  302. package/dist/llm/provider_format/openai.d.cts +3 -0
  303. package/dist/llm/provider_format/openai.d.ts +3 -0
  304. package/dist/llm/provider_format/openai.d.ts.map +1 -0
  305. package/dist/llm/provider_format/openai.js +114 -0
  306. package/dist/llm/provider_format/openai.js.map +1 -0
  307. package/dist/llm/provider_format/openai.test.cjs +557 -0
  308. package/dist/llm/provider_format/openai.test.cjs.map +1 -0
  309. package/dist/llm/provider_format/openai.test.js +561 -0
  310. package/dist/llm/provider_format/openai.test.js.map +1 -0
  311. package/dist/llm/provider_format/utils.cjs +146 -0
  312. package/dist/llm/provider_format/utils.cjs.map +1 -0
  313. package/dist/llm/provider_format/utils.d.cts +38 -0
  314. package/dist/llm/provider_format/utils.d.ts +38 -0
  315. package/dist/llm/provider_format/utils.d.ts.map +1 -0
  316. package/dist/llm/provider_format/utils.js +122 -0
  317. package/dist/llm/provider_format/utils.js.map +1 -0
  318. package/dist/llm/realtime.cjs +77 -0
  319. package/dist/llm/realtime.cjs.map +1 -0
  320. package/dist/llm/realtime.d.cts +106 -0
  321. package/dist/llm/realtime.d.ts +106 -0
  322. package/dist/llm/realtime.d.ts.map +1 -0
  323. package/dist/llm/realtime.js +52 -0
  324. package/dist/llm/realtime.js.map +1 -0
  325. package/dist/llm/remote_chat_context.cjs +112 -0
  326. package/dist/llm/remote_chat_context.cjs.map +1 -0
  327. package/dist/llm/remote_chat_context.d.cts +25 -0
  328. package/dist/llm/remote_chat_context.d.ts +25 -0
  329. package/dist/llm/remote_chat_context.d.ts.map +1 -0
  330. package/dist/llm/remote_chat_context.js +88 -0
  331. package/dist/llm/remote_chat_context.js.map +1 -0
  332. package/dist/llm/remote_chat_context.test.cjs +225 -0
  333. package/dist/llm/remote_chat_context.test.cjs.map +1 -0
  334. package/dist/llm/remote_chat_context.test.js +224 -0
  335. package/dist/llm/remote_chat_context.test.js.map +1 -0
  336. package/dist/llm/tool_context.cjs +152 -0
  337. package/dist/llm/tool_context.cjs.map +1 -0
  338. package/dist/llm/tool_context.d.cts +153 -0
  339. package/dist/llm/tool_context.d.ts +153 -0
  340. package/dist/llm/tool_context.d.ts.map +1 -0
  341. package/dist/llm/tool_context.js +119 -0
  342. package/dist/llm/tool_context.js.map +1 -0
  343. package/dist/llm/tool_context.test.cjs +359 -0
  344. package/dist/llm/tool_context.test.cjs.map +1 -0
  345. package/dist/llm/tool_context.test.js +336 -0
  346. package/dist/llm/tool_context.test.js.map +1 -0
  347. package/dist/llm/tool_context.type.test.cjs +92 -0
  348. package/dist/llm/tool_context.type.test.cjs.map +1 -0
  349. package/dist/llm/tool_context.type.test.js +91 -0
  350. package/dist/llm/tool_context.type.test.js.map +1 -0
  351. package/dist/llm/utils.cjs +267 -0
  352. package/dist/llm/utils.cjs.map +1 -0
  353. package/dist/llm/utils.d.cts +41 -0
  354. package/dist/llm/utils.d.ts +41 -0
  355. package/dist/llm/utils.d.ts.map +1 -0
  356. package/dist/llm/utils.js +230 -0
  357. package/dist/llm/utils.js.map +1 -0
  358. package/dist/llm/utils.test.cjs +513 -0
  359. package/dist/llm/utils.test.cjs.map +1 -0
  360. package/dist/llm/utils.test.js +490 -0
  361. package/dist/llm/utils.test.js.map +1 -0
  362. package/dist/llm/zod-utils.cjs +102 -0
  363. package/dist/llm/zod-utils.cjs.map +1 -0
  364. package/dist/llm/zod-utils.d.cts +65 -0
  365. package/dist/llm/zod-utils.d.ts +65 -0
  366. package/dist/llm/zod-utils.d.ts.map +1 -0
  367. package/dist/llm/zod-utils.js +64 -0
  368. package/dist/llm/zod-utils.js.map +1 -0
  369. package/dist/llm/zod-utils.test.cjs +472 -0
  370. package/dist/llm/zod-utils.test.cjs.map +1 -0
  371. package/dist/llm/zod-utils.test.js +455 -0
  372. package/dist/llm/zod-utils.test.js.map +1 -0
  373. package/dist/log.cjs +81 -0
  374. package/dist/log.cjs.map +1 -0
  375. package/dist/log.d.cts +20 -0
  376. package/dist/log.d.ts +20 -0
  377. package/dist/log.d.ts.map +1 -0
  378. package/dist/log.js +54 -0
  379. package/dist/log.js.map +1 -0
  380. package/dist/metrics/base.cjs +17 -0
  381. package/dist/metrics/base.cjs.map +1 -0
  382. package/dist/metrics/base.d.cts +150 -0
  383. package/dist/metrics/base.d.ts +150 -0
  384. package/dist/metrics/base.d.ts.map +1 -0
  385. package/dist/metrics/base.js +1 -0
  386. package/dist/metrics/base.js.map +1 -0
  387. package/dist/metrics/index.cjs +32 -0
  388. package/dist/metrics/index.cjs.map +1 -0
  389. package/dist/metrics/index.d.cts +4 -0
  390. package/dist/metrics/index.d.ts +4 -0
  391. package/dist/metrics/index.d.ts.map +1 -0
  392. package/dist/metrics/index.js +7 -0
  393. package/dist/metrics/index.js.map +1 -0
  394. package/dist/metrics/usage_collector.cjs +58 -0
  395. package/dist/metrics/usage_collector.cjs.map +1 -0
  396. package/dist/metrics/usage_collector.d.cts +15 -0
  397. package/dist/metrics/usage_collector.d.ts +15 -0
  398. package/dist/metrics/usage_collector.d.ts.map +1 -0
  399. package/dist/metrics/usage_collector.js +34 -0
  400. package/dist/metrics/usage_collector.js.map +1 -0
  401. package/dist/metrics/utils.cjs +74 -0
  402. package/dist/metrics/utils.cjs.map +1 -0
  403. package/dist/metrics/utils.d.cts +3 -0
  404. package/dist/metrics/utils.d.ts +3 -0
  405. package/dist/metrics/utils.d.ts.map +1 -0
  406. package/dist/metrics/utils.js +50 -0
  407. package/dist/metrics/utils.js.map +1 -0
  408. package/dist/plugin.cjs +62 -0
  409. package/dist/plugin.cjs.map +1 -0
  410. package/dist/plugin.d.cts +24 -0
  411. package/dist/plugin.d.ts +24 -0
  412. package/dist/plugin.d.ts.map +1 -0
  413. package/dist/plugin.js +37 -0
  414. package/dist/plugin.js.map +1 -0
  415. package/dist/stream/deferred_stream.cjs +106 -0
  416. package/dist/stream/deferred_stream.cjs.map +1 -0
  417. package/dist/stream/deferred_stream.d.cts +32 -0
  418. package/dist/stream/deferred_stream.d.ts +32 -0
  419. package/dist/stream/deferred_stream.d.ts.map +1 -0
  420. package/dist/stream/deferred_stream.js +81 -0
  421. package/dist/stream/deferred_stream.js.map +1 -0
  422. package/dist/stream/deferred_stream.test.cjs +527 -0
  423. package/dist/stream/deferred_stream.test.cjs.map +1 -0
  424. package/dist/stream/deferred_stream.test.js +526 -0
  425. package/dist/stream/deferred_stream.test.js.map +1 -0
  426. package/dist/stream/identity_transform.cjs +42 -0
  427. package/dist/stream/identity_transform.cjs.map +1 -0
  428. package/dist/stream/identity_transform.d.cts +6 -0
  429. package/dist/stream/identity_transform.d.ts +6 -0
  430. package/dist/stream/identity_transform.d.ts.map +1 -0
  431. package/dist/stream/identity_transform.js +18 -0
  432. package/dist/stream/identity_transform.js.map +1 -0
  433. package/dist/stream/identity_transform.test.cjs +125 -0
  434. package/dist/stream/identity_transform.test.cjs.map +1 -0
  435. package/dist/stream/identity_transform.test.js +124 -0
  436. package/dist/stream/identity_transform.test.js.map +1 -0
  437. package/dist/stream/index.cjs +38 -0
  438. package/dist/stream/index.cjs.map +1 -0
  439. package/dist/stream/index.d.cts +5 -0
  440. package/dist/stream/index.d.ts +5 -0
  441. package/dist/stream/index.d.ts.map +1 -0
  442. package/dist/stream/index.js +11 -0
  443. package/dist/stream/index.js.map +1 -0
  444. package/dist/stream/merge_readable_streams.cjs +59 -0
  445. package/dist/stream/merge_readable_streams.cjs.map +1 -0
  446. package/dist/stream/merge_readable_streams.d.cts +4 -0
  447. package/dist/stream/merge_readable_streams.d.ts +4 -0
  448. package/dist/stream/merge_readable_streams.d.ts.map +1 -0
  449. package/dist/stream/merge_readable_streams.js +35 -0
  450. package/dist/stream/merge_readable_streams.js.map +1 -0
  451. package/dist/stream/stream_channel.cjs +57 -0
  452. package/dist/stream/stream_channel.cjs.map +1 -0
  453. package/dist/stream/stream_channel.d.cts +11 -0
  454. package/dist/stream/stream_channel.d.ts +11 -0
  455. package/dist/stream/stream_channel.d.ts.map +1 -0
  456. package/dist/stream/stream_channel.js +33 -0
  457. package/dist/stream/stream_channel.js.map +1 -0
  458. package/dist/stream/stream_channel.test.cjs +124 -0
  459. package/dist/stream/stream_channel.test.cjs.map +1 -0
  460. package/dist/stream/stream_channel.test.js +123 -0
  461. package/dist/stream/stream_channel.test.js.map +1 -0
  462. package/dist/stt/index.cjs +38 -0
  463. package/dist/stt/index.cjs.map +1 -0
  464. package/dist/stt/index.d.cts +3 -0
  465. package/dist/stt/index.d.ts +3 -0
  466. package/dist/stt/index.d.ts.map +1 -0
  467. package/dist/stt/index.js +14 -0
  468. package/dist/stt/index.js.map +1 -0
  469. package/dist/stt/stream_adapter.cjs +115 -0
  470. package/dist/stt/stream_adapter.cjs.map +1 -0
  471. package/dist/stt/stream_adapter.d.cts +23 -0
  472. package/dist/stt/stream_adapter.d.ts +23 -0
  473. package/dist/stt/stream_adapter.d.ts.map +1 -0
  474. package/dist/stt/stream_adapter.js +90 -0
  475. package/dist/stt/stream_adapter.js.map +1 -0
  476. package/dist/stt/stt.cjs +253 -0
  477. package/dist/stt/stt.cjs.map +1 -0
  478. package/dist/stt/stt.d.cts +158 -0
  479. package/dist/stt/stt.d.ts +158 -0
  480. package/dist/stt/stt.d.ts.map +1 -0
  481. package/dist/stt/stt.js +227 -0
  482. package/dist/stt/stt.js.map +1 -0
  483. package/dist/telemetry/index.cjs +72 -0
  484. package/dist/telemetry/index.cjs.map +1 -0
  485. package/dist/telemetry/index.d.cts +7 -0
  486. package/dist/telemetry/index.d.ts +7 -0
  487. package/dist/telemetry/index.d.ts.map +1 -0
  488. package/dist/telemetry/index.js +37 -0
  489. package/dist/telemetry/index.js.map +1 -0
  490. package/dist/telemetry/logging.cjs +65 -0
  491. package/dist/telemetry/logging.cjs.map +1 -0
  492. package/dist/telemetry/logging.d.cts +21 -0
  493. package/dist/telemetry/logging.d.ts +21 -0
  494. package/dist/telemetry/logging.d.ts.map +1 -0
  495. package/dist/telemetry/logging.js +40 -0
  496. package/dist/telemetry/logging.js.map +1 -0
  497. package/dist/telemetry/otel_http_exporter.cjs +147 -0
  498. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  499. package/dist/telemetry/otel_http_exporter.d.cts +62 -0
  500. package/dist/telemetry/otel_http_exporter.d.ts +62 -0
  501. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  502. package/dist/telemetry/otel_http_exporter.js +123 -0
  503. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  504. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  505. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  506. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  507. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  508. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  509. package/dist/telemetry/pino_otel_transport.js +189 -0
  510. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  511. package/dist/telemetry/trace_types.cjs +206 -0
  512. package/dist/telemetry/trace_types.cjs.map +1 -0
  513. package/dist/telemetry/trace_types.d.cts +61 -0
  514. package/dist/telemetry/trace_types.d.ts +61 -0
  515. package/dist/telemetry/trace_types.d.ts.map +1 -0
  516. package/dist/telemetry/trace_types.js +123 -0
  517. package/dist/telemetry/trace_types.js.map +1 -0
  518. package/dist/telemetry/traces.cjs +444 -0
  519. package/dist/telemetry/traces.cjs.map +1 -0
  520. package/dist/telemetry/traces.d.cts +114 -0
  521. package/dist/telemetry/traces.d.ts +114 -0
  522. package/dist/telemetry/traces.d.ts.map +1 -0
  523. package/dist/telemetry/traces.js +409 -0
  524. package/dist/telemetry/traces.js.map +1 -0
  525. package/dist/telemetry/utils.cjs +86 -0
  526. package/dist/telemetry/utils.cjs.map +1 -0
  527. package/dist/telemetry/utils.d.cts +5 -0
  528. package/dist/telemetry/utils.d.ts +5 -0
  529. package/dist/telemetry/utils.d.ts.map +1 -0
  530. package/dist/telemetry/utils.js +51 -0
  531. package/dist/telemetry/utils.js.map +1 -0
  532. package/dist/tokenize/basic/basic.cjs +105 -0
  533. package/dist/tokenize/basic/basic.cjs.map +1 -0
  534. package/dist/tokenize/basic/basic.d.cts +24 -0
  535. package/dist/tokenize/basic/basic.d.ts +24 -0
  536. package/dist/tokenize/basic/basic.d.ts.map +1 -0
  537. package/dist/tokenize/basic/basic.js +67 -0
  538. package/dist/tokenize/basic/basic.js.map +1 -0
  539. package/dist/tokenize/basic/hyphenator.cjs +425 -0
  540. package/dist/tokenize/basic/hyphenator.cjs.map +1 -0
  541. package/dist/tokenize/basic/hyphenator.d.cts +17 -0
  542. package/dist/tokenize/basic/hyphenator.d.ts +17 -0
  543. package/dist/tokenize/basic/hyphenator.d.ts.map +1 -0
  544. package/dist/tokenize/basic/hyphenator.js +401 -0
  545. package/dist/tokenize/basic/hyphenator.js.map +1 -0
  546. package/dist/tokenize/basic/index.cjs +37 -0
  547. package/dist/tokenize/basic/index.cjs.map +1 -0
  548. package/dist/tokenize/basic/index.d.cts +2 -0
  549. package/dist/tokenize/basic/index.d.ts +2 -0
  550. package/dist/tokenize/basic/index.d.ts.map +1 -0
  551. package/dist/tokenize/basic/index.js +15 -0
  552. package/dist/tokenize/basic/index.js.map +1 -0
  553. package/dist/tokenize/basic/paragraph.cjs +57 -0
  554. package/dist/tokenize/basic/paragraph.cjs.map +1 -0
  555. package/dist/tokenize/basic/paragraph.d.cts +5 -0
  556. package/dist/tokenize/basic/paragraph.d.ts +5 -0
  557. package/dist/tokenize/basic/paragraph.d.ts.map +1 -0
  558. package/dist/tokenize/basic/paragraph.js +33 -0
  559. package/dist/tokenize/basic/paragraph.js.map +1 -0
  560. package/dist/tokenize/basic/sentence.cjs +97 -0
  561. package/dist/tokenize/basic/sentence.cjs.map +1 -0
  562. package/dist/tokenize/basic/sentence.d.cts +5 -0
  563. package/dist/tokenize/basic/sentence.d.ts +5 -0
  564. package/dist/tokenize/basic/sentence.d.ts.map +1 -0
  565. package/dist/tokenize/basic/sentence.js +73 -0
  566. package/dist/tokenize/basic/sentence.js.map +1 -0
  567. package/dist/tokenize/basic/word.cjs +44 -0
  568. package/dist/tokenize/basic/word.cjs.map +1 -0
  569. package/dist/tokenize/basic/word.d.cts +5 -0
  570. package/dist/tokenize/basic/word.d.ts +5 -0
  571. package/dist/tokenize/basic/word.d.ts.map +1 -0
  572. package/dist/tokenize/basic/word.js +20 -0
  573. package/dist/tokenize/basic/word.js.map +1 -0
  574. package/dist/tokenize/index.cjs +55 -0
  575. package/dist/tokenize/index.cjs.map +1 -0
  576. package/dist/tokenize/index.d.cts +5 -0
  577. package/dist/tokenize/index.d.ts +5 -0
  578. package/dist/tokenize/index.d.ts.map +1 -0
  579. package/dist/tokenize/index.js +19 -0
  580. package/dist/tokenize/index.js.map +1 -0
  581. package/dist/tokenize/token_stream.cjs +168 -0
  582. package/dist/tokenize/token_stream.cjs.map +1 -0
  583. package/dist/tokenize/token_stream.d.cts +40 -0
  584. package/dist/tokenize/token_stream.d.ts +40 -0
  585. package/dist/tokenize/token_stream.d.ts.map +1 -0
  586. package/dist/tokenize/token_stream.js +142 -0
  587. package/dist/tokenize/token_stream.js.map +1 -0
  588. package/dist/tokenize/tokenizer.cjs +184 -0
  589. package/dist/tokenize/tokenizer.cjs.map +1 -0
  590. package/dist/tokenize/tokenizer.d.cts +55 -0
  591. package/dist/tokenize/tokenizer.d.ts +55 -0
  592. package/dist/tokenize/tokenizer.d.ts.map +1 -0
  593. package/dist/tokenize/tokenizer.js +156 -0
  594. package/dist/tokenize/tokenizer.js.map +1 -0
  595. package/dist/tokenize/tokenizer.test.cjs +220 -0
  596. package/dist/tokenize/tokenizer.test.cjs.map +1 -0
  597. package/dist/tokenize/tokenizer.test.js +219 -0
  598. package/dist/tokenize/tokenizer.test.js.map +1 -0
  599. package/dist/transcription.cjs +247 -0
  600. package/dist/transcription.cjs.map +1 -0
  601. package/dist/transcription.d.cts +31 -0
  602. package/dist/transcription.d.ts +31 -0
  603. package/dist/transcription.d.ts.map +1 -0
  604. package/dist/transcription.js +222 -0
  605. package/dist/transcription.js.map +1 -0
  606. package/dist/tts/index.cjs +38 -0
  607. package/dist/tts/index.cjs.map +1 -0
  608. package/dist/tts/index.d.cts +3 -0
  609. package/dist/tts/index.d.ts +3 -0
  610. package/dist/tts/index.d.ts.map +1 -0
  611. package/dist/tts/index.js +14 -0
  612. package/dist/tts/index.js.map +1 -0
  613. package/dist/tts/stream_adapter.cjs +105 -0
  614. package/dist/tts/stream_adapter.cjs.map +1 -0
  615. package/dist/tts/stream_adapter.d.cts +20 -0
  616. package/dist/tts/stream_adapter.d.ts +20 -0
  617. package/dist/tts/stream_adapter.d.ts.map +1 -0
  618. package/dist/tts/stream_adapter.js +80 -0
  619. package/dist/tts/stream_adapter.js.map +1 -0
  620. package/dist/tts/tts.cjs +431 -0
  621. package/dist/tts/tts.cjs.map +1 -0
  622. package/dist/tts/tts.d.cts +161 -0
  623. package/dist/tts/tts.d.ts +161 -0
  624. package/dist/tts/tts.d.ts.map +1 -0
  625. package/dist/tts/tts.js +405 -0
  626. package/dist/tts/tts.js.map +1 -0
  627. package/dist/types.cjs +49 -0
  628. package/dist/types.cjs.map +1 -0
  629. package/dist/types.d.cts +44 -0
  630. package/dist/types.d.ts +44 -0
  631. package/dist/types.d.ts.map +1 -0
  632. package/dist/types.js +23 -0
  633. package/dist/types.js.map +1 -0
  634. package/dist/utils/ws_transport.cjs +51 -0
  635. package/dist/utils/ws_transport.cjs.map +1 -0
  636. package/dist/utils/ws_transport.d.cts +9 -0
  637. package/dist/utils/ws_transport.d.ts +9 -0
  638. package/dist/utils/ws_transport.d.ts.map +1 -0
  639. package/dist/utils/ws_transport.js +17 -0
  640. package/dist/utils/ws_transport.js.map +1 -0
  641. package/dist/utils/ws_transport.test.cjs +212 -0
  642. package/dist/utils/ws_transport.test.cjs.map +1 -0
  643. package/dist/utils/ws_transport.test.js +211 -0
  644. package/dist/utils/ws_transport.test.js.map +1 -0
  645. package/dist/utils.cjs +669 -0
  646. package/dist/utils.cjs.map +1 -0
  647. package/dist/utils.d.cts +244 -0
  648. package/dist/utils.d.ts +244 -0
  649. package/dist/utils.d.ts.map +1 -0
  650. package/dist/utils.js +617 -0
  651. package/dist/utils.js.map +1 -0
  652. package/dist/utils.test.cjs +492 -0
  653. package/dist/utils.test.cjs.map +1 -0
  654. package/dist/utils.test.js +491 -0
  655. package/dist/utils.test.js.map +1 -0
  656. package/dist/vad.cjs +211 -0
  657. package/dist/vad.cjs.map +1 -0
  658. package/dist/vad.d.cts +105 -0
  659. package/dist/vad.d.ts +105 -0
  660. package/dist/vad.d.ts.map +1 -0
  661. package/dist/vad.js +185 -0
  662. package/dist/vad.js.map +1 -0
  663. package/dist/version.cjs +29 -0
  664. package/dist/version.cjs.map +1 -0
  665. package/dist/version.d.cts +2 -0
  666. package/dist/version.d.ts +2 -0
  667. package/dist/version.d.ts.map +1 -0
  668. package/dist/version.js +5 -0
  669. package/dist/version.js.map +1 -0
  670. package/dist/voice/agent.cjs +308 -0
  671. package/dist/voice/agent.cjs.map +1 -0
  672. package/dist/voice/agent.d.cts +83 -0
  673. package/dist/voice/agent.d.ts +83 -0
  674. package/dist/voice/agent.d.ts.map +1 -0
  675. package/dist/voice/agent.js +287 -0
  676. package/dist/voice/agent.js.map +1 -0
  677. package/dist/voice/agent.test.cjs +61 -0
  678. package/dist/voice/agent.test.cjs.map +1 -0
  679. package/dist/voice/agent.test.js +60 -0
  680. package/dist/voice/agent.test.js.map +1 -0
  681. package/dist/voice/agent_activity.cjs +1784 -0
  682. package/dist/voice/agent_activity.cjs.map +1 -0
  683. package/dist/voice/agent_activity.d.cts +116 -0
  684. package/dist/voice/agent_activity.d.ts +116 -0
  685. package/dist/voice/agent_activity.d.ts.map +1 -0
  686. package/dist/voice/agent_activity.js +1780 -0
  687. package/dist/voice/agent_activity.js.map +1 -0
  688. package/dist/voice/agent_session.cjs +592 -0
  689. package/dist/voice/agent_session.cjs.map +1 -0
  690. package/dist/voice/agent_session.d.cts +165 -0
  691. package/dist/voice/agent_session.d.ts +165 -0
  692. package/dist/voice/agent_session.d.ts.map +1 -0
  693. package/dist/voice/agent_session.js +582 -0
  694. package/dist/voice/agent_session.js.map +1 -0
  695. package/dist/voice/audio_recognition.cjs +668 -0
  696. package/dist/voice/audio_recognition.cjs.map +1 -0
  697. package/dist/voice/audio_recognition.d.cts +127 -0
  698. package/dist/voice/audio_recognition.d.ts +127 -0
  699. package/dist/voice/audio_recognition.d.ts.map +1 -0
  700. package/dist/voice/audio_recognition.js +647 -0
  701. package/dist/voice/audio_recognition.js.map +1 -0
  702. package/dist/voice/avatar/datastream_io.cjs +204 -0
  703. package/dist/voice/avatar/datastream_io.cjs.map +1 -0
  704. package/dist/voice/avatar/datastream_io.d.cts +37 -0
  705. package/dist/voice/avatar/datastream_io.d.ts +37 -0
  706. package/dist/voice/avatar/datastream_io.d.ts.map +1 -0
  707. package/dist/voice/avatar/datastream_io.js +188 -0
  708. package/dist/voice/avatar/datastream_io.js.map +1 -0
  709. package/dist/voice/avatar/index.cjs +23 -0
  710. package/dist/voice/avatar/index.cjs.map +1 -0
  711. package/dist/voice/avatar/index.d.cts +2 -0
  712. package/dist/voice/avatar/index.d.ts +2 -0
  713. package/dist/voice/avatar/index.d.ts.map +1 -0
  714. package/dist/voice/avatar/index.js +2 -0
  715. package/dist/voice/avatar/index.js.map +1 -0
  716. package/dist/voice/background_audio.cjs +366 -0
  717. package/dist/voice/background_audio.cjs.map +1 -0
  718. package/dist/voice/background_audio.d.cts +121 -0
  719. package/dist/voice/background_audio.d.ts +121 -0
  720. package/dist/voice/background_audio.d.ts.map +1 -0
  721. package/dist/voice/background_audio.js +342 -0
  722. package/dist/voice/background_audio.js.map +1 -0
  723. package/dist/voice/events.cjs +147 -0
  724. package/dist/voice/events.cjs.map +1 -0
  725. package/dist/voice/events.d.cts +127 -0
  726. package/dist/voice/events.d.ts +127 -0
  727. package/dist/voice/events.d.ts.map +1 -0
  728. package/dist/voice/events.js +112 -0
  729. package/dist/voice/events.js.map +1 -0
  730. package/dist/voice/generation.cjs +747 -0
  731. package/dist/voice/generation.cjs.map +1 -0
  732. package/dist/voice/generation.d.cts +116 -0
  733. package/dist/voice/generation.d.ts +116 -0
  734. package/dist/voice/generation.d.ts.map +1 -0
  735. package/dist/voice/generation.js +719 -0
  736. package/dist/voice/generation.js.map +1 -0
  737. package/dist/voice/generation_tools.test.cjs +236 -0
  738. package/dist/voice/generation_tools.test.cjs.map +1 -0
  739. package/dist/voice/generation_tools.test.js +235 -0
  740. package/dist/voice/generation_tools.test.js.map +1 -0
  741. package/dist/voice/index.cjs +49 -0
  742. package/dist/voice/index.cjs.map +1 -0
  743. package/dist/voice/index.d.cts +10 -0
  744. package/dist/voice/index.d.ts +10 -0
  745. package/dist/voice/index.d.ts.map +1 -0
  746. package/dist/voice/index.js +16 -0
  747. package/dist/voice/index.js.map +1 -0
  748. package/dist/voice/interruption_detection.test.cjs +114 -0
  749. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  750. package/dist/voice/interruption_detection.test.js +113 -0
  751. package/dist/voice/interruption_detection.test.js.map +1 -0
  752. package/dist/voice/io.cjs +270 -0
  753. package/dist/voice/io.cjs.map +1 -0
  754. package/dist/voice/io.d.cts +126 -0
  755. package/dist/voice/io.d.ts +126 -0
  756. package/dist/voice/io.d.ts.map +1 -0
  757. package/dist/voice/io.js +242 -0
  758. package/dist/voice/io.js.map +1 -0
  759. package/dist/voice/recorder_io/index.cjs +23 -0
  760. package/dist/voice/recorder_io/index.cjs.map +1 -0
  761. package/dist/voice/recorder_io/index.d.cts +2 -0
  762. package/dist/voice/recorder_io/index.d.ts +2 -0
  763. package/dist/voice/recorder_io/index.d.ts.map +1 -0
  764. package/dist/voice/recorder_io/index.js +2 -0
  765. package/dist/voice/recorder_io/index.js.map +1 -0
  766. package/dist/voice/recorder_io/recorder_io.cjs +542 -0
  767. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
  768. package/dist/voice/recorder_io/recorder_io.d.cts +100 -0
  769. package/dist/voice/recorder_io/recorder_io.d.ts +100 -0
  770. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
  771. package/dist/voice/recorder_io/recorder_io.js +508 -0
  772. package/dist/voice/recorder_io/recorder_io.js.map +1 -0
  773. package/dist/voice/report.cjs +75 -0
  774. package/dist/voice/report.cjs.map +1 -0
  775. package/dist/voice/report.d.cts +42 -0
  776. package/dist/voice/report.d.ts +42 -0
  777. package/dist/voice/report.d.ts.map +1 -0
  778. package/dist/voice/report.js +50 -0
  779. package/dist/voice/report.js.map +1 -0
  780. package/dist/voice/room_io/_input.cjs +133 -0
  781. package/dist/voice/room_io/_input.cjs.map +1 -0
  782. package/dist/voice/room_io/_input.d.cts +24 -0
  783. package/dist/voice/room_io/_input.d.ts +24 -0
  784. package/dist/voice/room_io/_input.d.ts.map +1 -0
  785. package/dist/voice/room_io/_input.js +114 -0
  786. package/dist/voice/room_io/_input.js.map +1 -0
  787. package/dist/voice/room_io/_output.cjs +359 -0
  788. package/dist/voice/room_io/_output.cjs.map +1 -0
  789. package/dist/voice/room_io/_output.d.cts +77 -0
  790. package/dist/voice/room_io/_output.d.ts +77 -0
  791. package/dist/voice/room_io/_output.d.ts.map +1 -0
  792. package/dist/voice/room_io/_output.js +343 -0
  793. package/dist/voice/room_io/_output.js.map +1 -0
  794. package/dist/voice/room_io/index.cjs +25 -0
  795. package/dist/voice/room_io/index.cjs.map +1 -0
  796. package/dist/voice/room_io/index.d.cts +3 -0
  797. package/dist/voice/room_io/index.d.ts +3 -0
  798. package/dist/voice/room_io/index.d.ts.map +1 -0
  799. package/dist/voice/room_io/index.js +3 -0
  800. package/dist/voice/room_io/index.js.map +1 -0
  801. package/dist/voice/room_io/room_io.cjs +373 -0
  802. package/dist/voice/room_io/room_io.cjs.map +1 -0
  803. package/dist/voice/room_io/room_io.d.cts +94 -0
  804. package/dist/voice/room_io/room_io.d.ts +94 -0
  805. package/dist/voice/room_io/room_io.d.ts.map +1 -0
  806. package/dist/voice/room_io/room_io.js +364 -0
  807. package/dist/voice/room_io/room_io.js.map +1 -0
  808. package/dist/voice/run_context.cjs +51 -0
  809. package/dist/voice/run_context.cjs.map +1 -0
  810. package/dist/voice/run_context.d.cts +22 -0
  811. package/dist/voice/run_context.d.ts +22 -0
  812. package/dist/voice/run_context.d.ts.map +1 -0
  813. package/dist/voice/run_context.js +27 -0
  814. package/dist/voice/run_context.js.map +1 -0
  815. package/dist/voice/speech_handle.cjs +228 -0
  816. package/dist/voice/speech_handle.cjs.map +1 -0
  817. package/dist/voice/speech_handle.d.cts +97 -0
  818. package/dist/voice/speech_handle.d.ts +97 -0
  819. package/dist/voice/speech_handle.d.ts.map +1 -0
  820. package/dist/voice/speech_handle.js +204 -0
  821. package/dist/voice/speech_handle.js.map +1 -0
  822. package/dist/voice/transcription/_utils.cjs +45 -0
  823. package/dist/voice/transcription/_utils.cjs.map +1 -0
  824. package/dist/voice/transcription/_utils.d.cts +3 -0
  825. package/dist/voice/transcription/_utils.d.ts +3 -0
  826. package/dist/voice/transcription/_utils.d.ts.map +1 -0
  827. package/dist/voice/transcription/_utils.js +21 -0
  828. package/dist/voice/transcription/_utils.js.map +1 -0
  829. package/dist/voice/transcription/index.cjs +23 -0
  830. package/dist/voice/transcription/index.cjs.map +1 -0
  831. package/dist/voice/transcription/index.d.cts +2 -0
  832. package/dist/voice/transcription/index.d.ts +2 -0
  833. package/dist/voice/transcription/index.d.ts.map +1 -0
  834. package/dist/voice/transcription/index.js +2 -0
  835. package/dist/voice/transcription/index.js.map +1 -0
  836. package/dist/voice/transcription/synchronizer.cjs +379 -0
  837. package/dist/voice/transcription/synchronizer.cjs.map +1 -0
  838. package/dist/voice/transcription/synchronizer.d.cts +87 -0
  839. package/dist/voice/transcription/synchronizer.d.ts +87 -0
  840. package/dist/voice/transcription/synchronizer.d.ts.map +1 -0
  841. package/dist/voice/transcription/synchronizer.js +354 -0
  842. package/dist/voice/transcription/synchronizer.js.map +1 -0
  843. package/dist/worker.cjs +680 -0
  844. package/dist/worker.cjs.map +1 -0
  845. package/dist/worker.d.cts +119 -0
  846. package/dist/worker.d.ts +119 -0
  847. package/dist/worker.d.ts.map +1 -0
  848. package/dist/worker.js +645 -0
  849. package/dist/worker.js.map +1 -0
  850. package/package.json +86 -0
  851. package/resources/NOTICE +2 -0
  852. package/resources/keyboard-typing.ogg +0 -0
  853. package/resources/keyboard-typing2.ogg +0 -0
  854. package/resources/office-ambience.ogg +0 -0
  855. package/src/_exceptions.ts +137 -0
  856. package/src/audio.ts +205 -0
  857. package/src/cli.ts +224 -0
  858. package/src/connection_pool.test.ts +346 -0
  859. package/src/connection_pool.ts +307 -0
  860. package/src/constants.ts +9 -0
  861. package/src/generator.ts +38 -0
  862. package/src/http_server.ts +64 -0
  863. package/src/index.ts +41 -0
  864. package/src/inference/api_protos.ts +82 -0
  865. package/src/inference/index.ts +32 -0
  866. package/src/inference/interruption/AdaptiveInterruptionDetector.ts +166 -0
  867. package/src/inference/interruption/InterruptionStream.ts +397 -0
  868. package/src/inference/interruption/defaults.ts +33 -0
  869. package/src/inference/interruption/errors.ts +0 -0
  870. package/src/inference/interruption/http_transport.ts +61 -0
  871. package/src/inference/interruption/index.ts +4 -0
  872. package/src/inference/interruption/interruption.ts +88 -0
  873. package/src/inference/llm.ts +532 -0
  874. package/src/inference/stt.ts +524 -0
  875. package/src/inference/tts.ts +574 -0
  876. package/src/inference/utils.test.ts +31 -0
  877. package/src/inference/utils.ts +81 -0
  878. package/src/inference_runner.ts +19 -0
  879. package/src/ipc/index.ts +5 -0
  880. package/src/ipc/inference_executor.ts +7 -0
  881. package/src/ipc/inference_proc_executor.ts +101 -0
  882. package/src/ipc/inference_proc_lazy_main.ts +115 -0
  883. package/src/ipc/job_executor.ts +23 -0
  884. package/src/ipc/job_proc_executor.ts +122 -0
  885. package/src/ipc/job_proc_lazy_main.ts +247 -0
  886. package/src/ipc/message.ts +52 -0
  887. package/src/ipc/proc_pool.ts +164 -0
  888. package/src/ipc/supervised_proc.test.ts +153 -0
  889. package/src/ipc/supervised_proc.ts +242 -0
  890. package/src/job.ts +461 -0
  891. package/src/llm/__snapshots__/chat_context.test.ts.snap +527 -0
  892. package/src/llm/__snapshots__/tool_context.test.ts.snap +177 -0
  893. package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
  894. package/src/llm/chat_context.test.ts +1057 -0
  895. package/src/llm/chat_context.ts +759 -0
  896. package/src/llm/fallback_adapter.test.ts +238 -0
  897. package/src/llm/fallback_adapter.ts +391 -0
  898. package/src/llm/index.ts +74 -0
  899. package/src/llm/llm.ts +303 -0
  900. package/src/llm/provider_format/google.test.ts +843 -0
  901. package/src/llm/provider_format/google.ts +134 -0
  902. package/src/llm/provider_format/index.ts +23 -0
  903. package/src/llm/provider_format/openai.test.ts +675 -0
  904. package/src/llm/provider_format/openai.ts +146 -0
  905. package/src/llm/provider_format/utils.ts +187 -0
  906. package/src/llm/realtime.ts +163 -0
  907. package/src/llm/remote_chat_context.test.ts +290 -0
  908. package/src/llm/remote_chat_context.ts +114 -0
  909. package/src/llm/tool_context.test.ts +407 -0
  910. package/src/llm/tool_context.ts +343 -0
  911. package/src/llm/tool_context.type.test.ts +115 -0
  912. package/src/llm/utils.test.ts +670 -0
  913. package/src/llm/utils.ts +336 -0
  914. package/src/llm/zod-utils.test.ts +577 -0
  915. package/src/llm/zod-utils.ts +153 -0
  916. package/src/log.ts +83 -0
  917. package/src/metrics/base.ts +168 -0
  918. package/src/metrics/index.ts +15 -0
  919. package/src/metrics/usage_collector.ts +46 -0
  920. package/src/metrics/utils.ts +64 -0
  921. package/src/plugin.ts +46 -0
  922. package/src/stream/deferred_stream.test.ts +755 -0
  923. package/src/stream/deferred_stream.ts +127 -0
  924. package/src/stream/identity_transform.test.ts +179 -0
  925. package/src/stream/identity_transform.ts +18 -0
  926. package/src/stream/index.ts +7 -0
  927. package/src/stream/merge_readable_streams.ts +40 -0
  928. package/src/stream/stream_channel.test.ts +166 -0
  929. package/src/stream/stream_channel.ts +44 -0
  930. package/src/stt/index.ts +15 -0
  931. package/src/stt/stream_adapter.ts +107 -0
  932. package/src/stt/stt.ts +374 -0
  933. package/src/telemetry/index.ts +28 -0
  934. package/src/telemetry/logging.ts +55 -0
  935. package/src/telemetry/otel_http_exporter.ts +195 -0
  936. package/src/telemetry/pino_otel_transport.ts +265 -0
  937. package/src/telemetry/trace_types.ts +95 -0
  938. package/src/telemetry/traces.ts +612 -0
  939. package/src/telemetry/utils.ts +61 -0
  940. package/src/tokenize/basic/basic.ts +83 -0
  941. package/src/tokenize/basic/hyphenator.ts +434 -0
  942. package/src/tokenize/basic/index.ts +11 -0
  943. package/src/tokenize/basic/paragraph.ts +43 -0
  944. package/src/tokenize/basic/sentence.ts +89 -0
  945. package/src/tokenize/basic/word.ts +27 -0
  946. package/src/tokenize/index.ts +16 -0
  947. package/src/tokenize/token_stream.ts +180 -0
  948. package/src/tokenize/tokenizer.test.ts +255 -0
  949. package/src/tokenize/tokenizer.ts +152 -0
  950. package/src/transcription.ts +307 -0
  951. package/src/tts/index.ts +12 -0
  952. package/src/tts/stream_adapter.ts +110 -0
  953. package/src/tts/tts.ts +598 -0
  954. package/src/types.ts +66 -0
  955. package/src/utils/ws_transport.test.ts +282 -0
  956. package/src/utils/ws_transport.ts +22 -0
  957. package/src/utils.test.ts +651 -0
  958. package/src/utils.ts +871 -0
  959. package/src/vad.ts +262 -0
  960. package/src/version.ts +5 -0
  961. package/src/voice/agent.test.ts +80 -0
  962. package/src/voice/agent.ts +418 -0
  963. package/src/voice/agent_activity.ts +2375 -0
  964. package/src/voice/agent_session.ts +866 -0
  965. package/src/voice/audio_recognition.ts +877 -0
  966. package/src/voice/avatar/datastream_io.ts +247 -0
  967. package/src/voice/avatar/index.ts +4 -0
  968. package/src/voice/background_audio.ts +491 -0
  969. package/src/voice/events.ts +261 -0
  970. package/src/voice/generation.ts +946 -0
  971. package/src/voice/generation_tools.test.ts +268 -0
  972. package/src/voice/index.ts +12 -0
  973. package/src/voice/interruption_detection.test.ts +151 -0
  974. package/src/voice/io.ts +347 -0
  975. package/src/voice/recorder_io/index.ts +4 -0
  976. package/src/voice/recorder_io/recorder_io.ts +690 -0
  977. package/src/voice/report.ts +100 -0
  978. package/src/voice/room_io/_input.ts +162 -0
  979. package/src/voice/room_io/_output.ts +439 -0
  980. package/src/voice/room_io/index.ts +5 -0
  981. package/src/voice/room_io/room_io.ts +518 -0
  982. package/src/voice/run_context.ts +34 -0
  983. package/src/voice/speech_handle.ts +250 -0
  984. package/src/voice/transcription/_utils.ts +25 -0
  985. package/src/voice/transcription/index.ts +4 -0
  986. package/src/voice/transcription/synchronizer.ts +477 -0
  987. package/src/worker.ts +798 -0
@@ -0,0 +1,2375 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { Mutex } from '@livekit/mutex';
5
+ import type { AudioFrame } from '@livekit/rtc-node';
6
+ import type { Span } from '@opentelemetry/api';
7
+ import { ROOT_CONTEXT, trace } from '@opentelemetry/api';
8
+ import { Heap } from 'heap-js';
9
+ import { AsyncLocalStorage } from 'node:async_hooks';
10
+ import { ReadableStream } from 'node:stream/web';
11
+ import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
12
+ import {
13
+ type ChatItem,
14
+ type FunctionCall,
15
+ type FunctionCallOutput,
16
+ type GenerationCreatedEvent,
17
+ type InputSpeechStartedEvent,
18
+ type InputSpeechStoppedEvent,
19
+ type InputTranscriptionCompleted,
20
+ LLM,
21
+ RealtimeModel,
22
+ type RealtimeModelError,
23
+ type RealtimeSession,
24
+ type ToolChoice,
25
+ type ToolContext,
26
+ } from '../llm/index.js';
27
+ import type { LLMError } from '../llm/llm.js';
28
+ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js';
29
+ import { log } from '../log.js';
30
+ import type {
31
+ EOUMetrics,
32
+ LLMMetrics,
33
+ RealtimeModelMetrics,
34
+ STTMetrics,
35
+ TTSMetrics,
36
+ VADMetrics,
37
+ } from '../metrics/base.js';
38
+ import { DeferredReadableStream } from '../stream/deferred_stream.js';
39
+ import { STT, type STTError, type SpeechEvent } from '../stt/stt.js';
40
+ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js';
41
+ import { splitWords } from '../tokenize/basic/word.js';
42
+ import { TTS, type TTSError } from '../tts/tts.js';
43
+ import { Future, Task, cancelAndWait, waitFor } from '../utils.js';
44
+ import type { InterruptionEvent } from '../inference/interruption/interruption.js';
45
+ import { InterruptionEventType } from '../inference/interruption/interruption.js';
46
+ import { VAD, type VADEvent } from '../vad.js';
47
+ import type { Agent, ModelSettings } from './agent.js';
48
+ import { StopResponse, asyncLocalStorage } from './agent.js';
49
+ import { type AgentSession, type TurnDetectionMode } from './agent_session.js';
50
+ import {
51
+ AudioRecognition,
52
+ type EndOfTurnInfo,
53
+ type PreemptiveGenerationInfo,
54
+ type RecognitionHooks,
55
+ type _TurnDetector,
56
+ } from './audio_recognition.js';
57
+ import {
58
+ AgentSessionEventTypes,
59
+ createErrorEvent,
60
+ createFunctionToolsExecutedEvent,
61
+ createMetricsCollectedEvent,
62
+ createSpeechCreatedEvent,
63
+ createUserInputTranscribedEvent,
64
+ } from './events.js';
65
+ import type { ToolExecutionOutput } from './generation.js';
66
+ import {
67
+ type _AudioOut,
68
+ type _TextOut,
69
+ performAudioForwarding,
70
+ performLLMInference,
71
+ performTTSInference,
72
+ performTextForwarding,
73
+ performToolExecutions,
74
+ removeInstructions,
75
+ updateInstructions,
76
+ } from './generation.js';
77
+ import { SpeechHandle } from './speech_handle.js';
78
+
79
// Async-local storage used to make the active SpeechHandle implicitly
// available to code running inside a speech task (e.g. tool executions).
const speechHandleStorage = new AsyncLocalStorage<SpeechHandle>();

// Snapshot of a reply generation started before the user's turn formally
// ended, so the work can be reused (or discarded) once the turn completes.
interface PreemptiveGeneration {
  // Handle owning the in-flight generation.
  speechHandle: SpeechHandle;
  // The (possibly still interim) user message the generation was based on.
  userMessage: ChatMessage;
  // Recognition-side info captured when the preemptive generation started.
  info: PreemptiveGenerationInfo;
  // Chat context snapshot used for the generation.
  chatCtx: ChatContext;
  // Tool context snapshot used for the generation.
  tools: ToolContext;
  // Tool-choice in effect at generation time; null maps to the provider default.
  toolChoice: ToolChoice | null;
  // Timestamp (ms) when the preemptive generation was created.
  createdAt: number;
}
90
+
91
+ export class AgentActivity implements RecognitionHooks {
92
+ private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
93
+ private started = false;
94
+ private audioRecognition?: AudioRecognition;
95
+ private realtimeSession?: RealtimeSession;
96
+ private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
97
+ private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
98
+ private logger = log();
99
+ private _draining = false;
100
+ private _currentSpeech?: SpeechHandle;
101
+ private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle]
102
+ private q_updated: Future;
103
+ private speechTasks: Set<Task<void>> = new Set();
104
+ private lock = new Mutex();
105
+ private audioStream = new DeferredReadableStream<AudioFrame>();
106
+ // default to null as None, which maps to the default provider tool choice value
107
+ private toolChoice: ToolChoice | null = null;
108
+ private _preemptiveGeneration?: PreemptiveGeneration;
109
+
110
+ agent: Agent;
111
+ agentSession: AgentSession;
112
+
113
+ /** @internal */
114
+ _mainTask?: Task<void>;
115
+ _userTurnCompletedTask?: Promise<void>;
116
+
117
/**
 * Signal that the agent began speaking, enabling interruption detection
 * in AudioRecognition for the duration of the agent's speech.
 * @internal
 */
notifyAgentSpeechStarted(): void {
  const recognition = this.audioRecognition;
  if (recognition) {
    recognition.onStartOfAgentSpeech();
  }
}
125
+
126
/**
 * Signal that the agent finished speaking, disabling interruption detection
 * in AudioRecognition until the next agent speech starts.
 * @internal
 */
notifyAgentSpeechEnded(): void {
  const recognition = this.audioRecognition;
  if (recognition) {
    recognition.onEndOfAgentSpeech();
  }
}
134
+
135
/**
 * Binds an Agent to its AgentSession and validates/normalizes the configured
 * turn-detection mode against the models actually available (VAD, STT,
 * LLM vs. RealtimeModel). Invalid combinations are logged and the mode is
 * reset to undefined rather than throwing.
 *
 * @param agent - The agent whose nodes, tools and models drive this activity.
 * @param agentSession - The owning session providing shared models and options.
 */
constructor(agent: Agent, agentSession: AgentSession) {
  this.agent = agent;
  this.agentSession = agentSession;

  /**
   * Custom comparator to prioritize speech handles with higher priority
   * - Prefer higher priority
   * - Prefer earlier timestamp (so calling a sequence of generateReply() will execute in FIFO order)
   */
  this.speechQueue = new Heap<[number, number, SpeechHandle]>(([p1, t1, _], [p2, t2, __]) => {
    return p1 === p2 ? t1 - t2 : p2 - p1;
  });
  this.q_updated = new Future();

  // Only string modes are kept here; a _TurnDetector instance is handled by
  // AudioRecognition directly (see start()).
  this.turnDetectionMode =
    typeof this.turnDetection === 'string' ? this.turnDetection : undefined;

  if (this.turnDetectionMode === 'vad' && this.vad === undefined) {
    this.logger.warn(
      // fixed typo: was "turnDdetection"
      'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting',
    );
    this.turnDetectionMode = undefined;
  }

  if (this.turnDetectionMode === 'stt' && this.stt === undefined) {
    this.logger.warn(
      'turnDetection is set to "stt", but no STT model is provided, ignoring the turnDetection setting',
    );
    this.turnDetectionMode = undefined;
  }

  if (this.llm instanceof RealtimeModel) {
    if (this.llm.capabilities.turnDetection && !this.allowInterruptions) {
      this.logger.warn(
        'the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false, ' +
          'disable turnDetection in the RealtimeModel and use VAD on the AgentSession instead',
      );
    }

    if (this.turnDetectionMode === 'realtime_llm' && !this.llm.capabilities.turnDetection) {
      this.logger.warn(
        'turnDetection is set to "realtime_llm", but the LLM is not a RealtimeModel or the server-side turn detection is not supported/enabled, ignoring the turnDetection setting',
      );
      this.turnDetectionMode = undefined;
    }

    if (this.turnDetectionMode === 'stt') {
      this.logger.warn(
        'turnDetection is set to "stt", but the LLM is a RealtimeModel, ignoring the turnDetection setting',
      );
      this.turnDetectionMode = undefined;
    }

    if (
      this.turnDetectionMode &&
      this.turnDetectionMode !== 'realtime_llm' &&
      this.llm.capabilities.turnDetection
    ) {
      this.logger.warn(
        `turnDetection is set to "${this.turnDetectionMode}", but the LLM is a RealtimeModel and server-side turn detection enabled, ignoring the turnDetection setting`,
      );
      this.turnDetectionMode = undefined;
    }

    // fallback to VAD if server side turn detection is disabled and VAD is available
    if (
      !this.llm.capabilities.turnDetection &&
      this.vad &&
      this.turnDetectionMode === undefined
    ) {
      this.turnDetectionMode = 'vad';
    }
  } else if (this.turnDetectionMode === 'realtime_llm') {
    this.logger.warn(
      'turnDetection is set to "realtime_llm", but the LLM is not a RealtimeModel',
    );
    this.turnDetectionMode = undefined;
  }

  // Advisory only: STT+LLM without VAD still works, but interruptions are
  // detected later (on transcripts) than they would be with VAD.
  if (
    !this.vad &&
    this.stt &&
    this.llm instanceof LLM &&
    this.allowInterruptions &&
    this.turnDetectionMode === undefined
  ) {
    this.logger.warn(
      'VAD is not set. Enabling VAD is recommended when using LLM and STT ' +
        'for more responsive interruption handling.',
    );
  }
}
227
+
228
+ async start(): Promise<void> {
229
+ const unlock = await this.lock.lock();
230
+ try {
231
+ // Create start_agent_activity as a ROOT span (new trace) to match Python behavior
232
+ const startSpan = tracer.startSpan({
233
+ name: 'start_agent_activity',
234
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
235
+ context: ROOT_CONTEXT,
236
+ });
237
+
238
+ this.agent._agentActivity = this;
239
+
240
+ if (this.llm instanceof RealtimeModel) {
241
+ this.realtimeSession = this.llm.session();
242
+ this.realtimeSpans = new Map<string, Span>();
243
+ this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev));
244
+ this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev));
245
+ this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev));
246
+ this.realtimeSession.on('input_audio_transcription_completed', (ev) =>
247
+ this.onInputAudioTranscriptionCompleted(ev),
248
+ );
249
+ this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
250
+ this.realtimeSession.on('error', (ev) => this.onError(ev));
251
+
252
+ removeInstructions(this.agent._chatCtx);
253
+ try {
254
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
255
+ } catch (error) {
256
+ this.logger.error(error, 'failed to update the instructions');
257
+ }
258
+
259
+ try {
260
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
261
+ } catch (error) {
262
+ this.logger.error(error, 'failed to update the chat context');
263
+ }
264
+
265
+ try {
266
+ await this.realtimeSession.updateTools(this.tools);
267
+ } catch (error) {
268
+ this.logger.error(error, 'failed to update the tools');
269
+ }
270
+
271
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
272
+ this.logger.error(
273
+ 'audio output is enabled but RealtimeModel has no audio modality ' +
274
+ 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
275
+ 'or set a TTS model.',
276
+ );
277
+ }
278
+ } else if (this.llm instanceof LLM) {
279
+ try {
280
+ updateInstructions({
281
+ chatCtx: this.agent._chatCtx,
282
+ instructions: this.agent.instructions,
283
+ addIfMissing: true,
284
+ });
285
+ } catch (error) {
286
+ this.logger.error('failed to update the instructions', error);
287
+ }
288
+ }
289
+
290
+ // metrics and error handling
291
+ if (this.llm instanceof LLM) {
292
+ this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
293
+ this.llm.on('error', (ev) => this.onError(ev));
294
+ }
295
+
296
+ if (this.stt instanceof STT) {
297
+ this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
298
+ this.stt.on('error', (ev) => this.onError(ev));
299
+ }
300
+
301
+ if (this.tts instanceof TTS) {
302
+ this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
303
+ this.tts.on('error', (ev) => this.onError(ev));
304
+ }
305
+
306
+ if (this.vad instanceof VAD) {
307
+ this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev));
308
+ }
309
+
310
+ this.audioRecognition = new AudioRecognition({
311
+ recognitionHooks: this,
312
+ // Disable stt node if stt is not provided
313
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined,
314
+ vad: this.vad,
315
+ interruptionDetector: this.agentSession.interruptionDetector,
316
+ turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
317
+ turnDetectionMode: this.turnDetectionMode,
318
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
319
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
320
+ rootSpanContext: this.agentSession.rootSpanContext,
321
+ });
322
+ this.audioRecognition.start();
323
+ this.started = true;
324
+
325
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
326
+
327
+ // Create on_enter as a child of start_agent_activity in the new trace
328
+ const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
329
+ name: 'on_enter',
330
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
331
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
332
+ });
333
+
334
+ this.createSpeechTask({
335
+ task: Task.from(() => onEnterTask),
336
+ name: 'AgentActivity_onEnter',
337
+ });
338
+
339
+ startSpan.end();
340
+ } finally {
341
+ unlock();
342
+ }
343
+ }
344
+
345
  /** The speech handle currently being played by the main task, if any. */
  get currentSpeech(): SpeechHandle | undefined {
    return this._currentSpeech;
  }
348
+
349
+ get vad(): VAD | undefined {
350
+ return this.agent.vad || this.agentSession.vad;
351
+ }
352
+
353
+ get stt(): STT | undefined {
354
+ return this.agent.stt || this.agentSession.stt;
355
+ }
356
+
357
+ get llm(): LLM | RealtimeModel | undefined {
358
+ return this.agent.llm || this.agentSession.llm;
359
+ }
360
+
361
+ get tts(): TTS | undefined {
362
+ return this.agent.tts || this.agentSession.tts;
363
+ }
364
+
365
  /** Tool context exposed by the current agent. */
  get tools(): ToolContext {
    return this.agent.toolCtx;
  }
368
+
369
  /** True once the activity is draining: no new turns are accepted, pending speech finishes. */
  get draining(): boolean {
    return this._draining;
  }
372
+
373
  /** The realtime model session, if the activity runs against a RealtimeModel. */
  get realtimeLLMSession(): RealtimeSession | undefined {
    return this.realtimeSession;
  }
376
+
377
  /** Session-level default for whether agent speech may be interrupted. */
  get allowInterruptions(): boolean {
    // TODO(AJS-51): Allow options to be defined in Agent class
    return this.agentSession.options.allowInterruptions;
  }
381
+
382
  /** Configured turn-detection mode (currently always taken from the session). */
  get turnDetection(): TurnDetectionMode | undefined {
    // TODO(brian): prioritize using agent.turn_detection
    return this.agentSession.turnDetection;
  }
386
+
387
  /** Alias of {@link tools}: the current agent's tool context. */
  get toolCtx(): ToolContext {
    return this.agent.toolCtx;
  }
390
+
391
+ async updateChatCtx(chatCtx: ChatContext): Promise<void> {
392
+ chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
393
+
394
+ this.agent._chatCtx = chatCtx;
395
+
396
+ if (this.realtimeSession) {
397
+ removeInstructions(chatCtx);
398
+ this.realtimeSession.updateChatCtx(chatCtx);
399
+ } else {
400
+ updateInstructions({
401
+ chatCtx,
402
+ instructions: this.agent.instructions,
403
+ addIfMissing: true,
404
+ });
405
+ }
406
+ }
407
+
408
+ updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void {
409
+ if (toolChoice !== undefined) {
410
+ this.toolChoice = toolChoice;
411
+ }
412
+
413
+ if (this.realtimeSession) {
414
+ this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
415
+ }
416
+ }
417
+
418
+ attachAudioInput(audioStream: ReadableStream<AudioFrame>): void {
419
+ if (this.audioStream.isSourceSet) {
420
+ this.logger.debug('detaching existing audio input in agent activity');
421
+ this.audioStream.detachSource();
422
+ }
423
+
424
+ /**
425
+ * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session.
426
+ * The tee() operation should be applied to the deferred stream, not the original audioStream.
427
+ * This is important because teeing the original stream directly makes it very difficult—if not
428
+ * impossible—to implement stream unlock logic cleanly.
429
+ */
430
+ this.audioStream.setSource(audioStream);
431
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
432
+
433
+ if (this.realtimeSession) {
434
+ this.realtimeSession.setInputAudioStream(realtimeAudioStream);
435
+ }
436
+
437
+ if (this.audioRecognition) {
438
+ this.audioRecognition.setInputAudioStream(recognitionAudioStream);
439
+ }
440
+ }
441
+
442
  /** Detach the current input audio source from the deferred audio stream. */
  detachAudioInput(): void {
    this.audioStream.detachSource();
  }
445
+
446
+ commitUserTurn() {
447
+ if (!this.audioRecognition) {
448
+ throw new Error('AudioRecognition is not initialized');
449
+ }
450
+
451
+ // TODO(brian): add audio_detached flag
452
+ const audioDetached = false;
453
+ this.audioRecognition.commitUserTurn(audioDetached);
454
+ }
455
+
456
+ clearUserTurn() {
457
+ this.audioRecognition?.clearUserTurn();
458
+ this.realtimeSession?.clearAudio();
459
+ }
460
+
461
  /**
   * Speak the given text (or text stream) directly, bypassing the LLM.
   *
   * @param text - Text to speak, either a full string or a streaming source.
   * @param options.audio - Pre-synthesized audio to play instead of running TTS.
   * @param options.allowInterruptions - Per-speech override of the session default.
   * @param options.addToChatCtx - Whether to record the utterance as an assistant
   *   message afterwards (default true).
   * @returns The SpeechHandle tracking this utterance.
   * @throws Error when audio output is enabled but neither TTS nor a
   *   pre-synthesized audio stream is available.
   */
  say(
    text: string | ReadableStream<string>,
    options?: {
      audio?: ReadableStream<AudioFrame>;
      allowInterruptions?: boolean;
      addToChatCtx?: boolean;
    },
  ): SpeechHandle {
    const {
      audio,
      allowInterruptions: defaultAllowInterruptions,
      addToChatCtx = true,
    } = options ?? {};
    let allowInterruptions = defaultAllowInterruptions;

    // Audio output is active but there is no way to produce audio for this text.
    if (
      !audio &&
      !this.tts &&
      this.agentSession.output.audio &&
      this.agentSession.output.audioEnabled
    ) {
      throw new Error('trying to generate speech from text without a TTS model');
    }

    // Server-side turn detection can interrupt at any time; forcing
    // allowInterruptions=false cannot be honored, so it is overridden.
    if (
      this.llm instanceof RealtimeModel &&
      this.llm.capabilities.turnDetection &&
      allowInterruptions === false
    ) {
      this.logger.warn(
        'the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false when using VoiceAgent.say(), ' +
          'disable turnDetection in the RealtimeModel and use VAD on the AgentTask/VoiceAgent instead',
      );
      allowInterruptions = true;
    }

    const handle = SpeechHandle.create({
      allowInterruptions: allowInterruptions ?? this.allowInterruptions,
    });

    this.agentSession.emit(
      AgentSessionEventTypes.SpeechCreated,
      createSpeechCreatedEvent({
        userInitiated: true,
        source: 'say',
        speechHandle: handle,
      }),
    );
    // The actual TTS/forwarding work runs on its own tracked speech task.
    const task = this.createSpeechTask({
      task: Task.from((abortController: AbortController) =>
        this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.say_tts',
    });

    task.finally(() => this.onPipelineReplyDone());
    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
    return handle;
  }
521
+
522
+ // -- Metrics and errors --
523
+
524
+ private onMetricsCollected = (
525
+ ev: STTMetrics | TTSMetrics | VADMetrics | LLMMetrics | RealtimeModelMetrics,
526
+ ) => {
527
+ const speechHandle = speechHandleStorage.getStore();
528
+ if (speechHandle && (ev.type === 'llm_metrics' || ev.type === 'tts_metrics')) {
529
+ ev.speechId = speechHandle.id;
530
+ }
531
+
532
+ // Record realtime metrics on the associated span (if available)
533
+ if (ev.type === 'realtime_model_metrics' && this.realtimeSpans) {
534
+ const span = this.realtimeSpans.get(ev.requestId);
535
+ if (span) {
536
+ recordRealtimeMetrics(span, ev);
537
+ this.realtimeSpans.delete(ev.requestId);
538
+ }
539
+ }
540
+
541
+ this.agentSession.emit(
542
+ AgentSessionEventTypes.MetricsCollected,
543
+ createMetricsCollectedEvent({ metrics: ev }),
544
+ );
545
+ };
546
+
547
+ private onError(ev: RealtimeModelError | STTError | TTSError | LLMError): void {
548
+ if (ev.type === 'realtime_model_error') {
549
+ const errorEvent = createErrorEvent(ev.error, this.llm);
550
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
551
+ } else if (ev.type === 'stt_error') {
552
+ const errorEvent = createErrorEvent(ev.error, this.stt);
553
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
554
+ } else if (ev.type === 'tts_error') {
555
+ const errorEvent = createErrorEvent(ev.error, this.tts);
556
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
557
+ } else if (ev.type === 'llm_error') {
558
+ const errorEvent = createErrorEvent(ev.error, this.llm);
559
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
560
+ }
561
+
562
+ this.agentSession._onError(ev);
563
+ }
564
+
565
+ // -- Realtime Session events --
566
+
567
  /**
   * Realtime session hook: the server detected the start of user speech.
   * Updates user state (when no local VAD owns it) and interrupts any current
   * agent speech.
   */
  onInputSpeechStarted(_ev: InputSpeechStartedEvent): void {
    this.logger.info('onInputSpeechStarted');

    if (!this.vad) {
      this.agentSession._updateUserState('speaking');
    }

    // this.interrupt() is going to raise when allow_interruptions is False,
    // llm.InputSpeechStartedEvent is only fired by the server when the turn_detection is enabled.
    try {
      this.interrupt();
    } catch (error) {
      this.logger.error(
        'RealtimeAPI input_speech_started, but current speech is not interruptable, this should never happen!',
        error,
      );
    }
  }
585
+
586
+ onInputSpeechStopped(ev: InputSpeechStoppedEvent): void {
587
+ this.logger.info(ev, 'onInputSpeechStopped');
588
+
589
+ if (!this.vad) {
590
+ this.agentSession._updateUserState('listening');
591
+ }
592
+
593
+ if (ev.userTranscriptionEnabled) {
594
+ this.agentSession.emit(
595
+ AgentSessionEventTypes.UserInputTranscribed,
596
+ createUserInputTranscribedEvent({
597
+ isFinal: false,
598
+ transcript: '',
599
+ }),
600
+ );
601
+ }
602
+ }
603
+
604
+ onInputAudioTranscriptionCompleted(ev: InputTranscriptionCompleted): void {
605
+ this.agentSession.emit(
606
+ AgentSessionEventTypes.UserInputTranscribed,
607
+ createUserInputTranscribedEvent({
608
+ transcript: ev.transcript,
609
+ isFinal: ev.isFinal,
610
+ }),
611
+ );
612
+
613
+ if (ev.isFinal) {
614
+ const message = ChatMessage.create({
615
+ role: 'user',
616
+ content: ev.transcript,
617
+ id: ev.itemId,
618
+ });
619
+ this.agent._chatCtx.items.push(message);
620
+ this.agentSession._conversationItemAdded(message);
621
+ }
622
+ }
623
+
624
  /**
   * Realtime session hook: the server started a new generation.
   * Server-initiated generations get their own speech handle and are scheduled
   * through the speech queue; user-initiated ones are handled elsewhere.
   */
  onGenerationCreated(ev: GenerationCreatedEvent): void {
    if (ev.userInitiated) {
      // user initiated generations are directly handled inside _realtime_reply_task
      return;
    }

    if (this.draining) {
      // TODO(shubhra): should we "forward" this new turn to the next agent?
      this.logger.warn('skipping new realtime generation, the agent is draining');
      return;
    }

    const handle = SpeechHandle.create({
      allowInterruptions: this.allowInterruptions,
    });
    this.agentSession.emit(
      AgentSessionEventTypes.SpeechCreated,
      createSpeechCreatedEvent({
        userInitiated: false,
        source: 'generate_reply',
        speechHandle: handle,
      }),
    );
    this.logger.info({ speech_id: handle.id }, 'Creating speech handle');

    // Drive the generation on its own tracked speech task.
    this.createSpeechTask({
      task: Task.from((abortController: AbortController) =>
        this.realtimeGenerationTask(handle, ev, {}, abortController),
      ),
      ownedSpeechHandle: handle,
      name: 'AgentActivity.realtimeGeneration',
    });

    this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
  }
659
+
660
+ // recognition hooks
661
+
662
  /** Recognition hook: local VAD detected the user starting to speak. */
  onStartOfSpeech(_ev: VADEvent): void {
    this.agentSession._updateUserState('speaking');
  }
665
+
666
+ onEndOfSpeech(ev: VADEvent): void {
667
+ let speechEndTime = Date.now();
668
+ if (ev) {
669
+ speechEndTime = speechEndTime - ev.silenceDuration;
670
+ }
671
+ this.agentSession._updateUserState('listening', speechEndTime);
672
+ }
673
+
674
+ onVADInferenceDone(ev: VADEvent): void {
675
+ if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') {
676
+ // skip speech handle interruption for manual and realtime model
677
+ return;
678
+ }
679
+
680
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
681
+ // skip speech handle interruption if server side turn detection is enabled
682
+ return;
683
+ }
684
+
685
+ if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
686
+ return;
687
+ }
688
+
689
+ // Refactored interruption word count check:
690
+ // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
691
+ // - Apply check to all STT results: empty string, undefined, or any length
692
+ // - This ensures consistent behavior across all interruption scenarios
693
+ if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
694
+ const text = this.audioRecognition.currentTranscript;
695
+ // TODO(shubhra): better word splitting for multi-language
696
+
697
+ // Normalize text: convert undefined/null to empty string for consistent word counting
698
+ const normalizedText = text ?? '';
699
+ const wordCount = splitWords(normalizedText, true).length;
700
+
701
+ // Only allow interruption if word count meets or exceeds minInterruptionWords
702
+ // This applies to all cases: empty strings, partial speech, and full speech
703
+ if (wordCount < this.agentSession.options.minInterruptionWords) {
704
+ return;
705
+ }
706
+ }
707
+
708
+ this.realtimeSession?.startUserActivity();
709
+
710
+ if (
711
+ this._currentSpeech &&
712
+ !this._currentSpeech.interrupted &&
713
+ this._currentSpeech.allowInterruptions
714
+ ) {
715
+ this.logger.info({ 'speech id': this._currentSpeech.id }, 'speech interrupted by VAD');
716
+ this.realtimeSession?.interrupt();
717
+ this._currentSpeech.interrupt();
718
+ }
719
+ }
720
+
721
+ onInterruption(ev: InterruptionEvent): void {
722
+ if (ev.type !== InterruptionEventType.INTERRUPTION) {
723
+ // Only handle actual interruptions, not overlap_speech_ended events
724
+ return;
725
+ }
726
+
727
+ this.logger.info(
728
+ {
729
+ probability: ev.probability,
730
+ detectionDelay: ev.detectionDelay,
731
+ totalDuration: ev.totalDuration,
732
+ },
733
+ 'adaptive interruption detected',
734
+ );
735
+
736
+ // Similar to onVADInferenceDone but triggered by the adaptive interruption detector
737
+ if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') {
738
+ return;
739
+ }
740
+
741
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
742
+ return;
743
+ }
744
+
745
+ this.realtimeSession?.startUserActivity();
746
+
747
+ if (
748
+ this._currentSpeech &&
749
+ !this._currentSpeech.interrupted &&
750
+ this._currentSpeech.allowInterruptions
751
+ ) {
752
+ this.logger.info(
753
+ { 'speech id': this._currentSpeech.id },
754
+ 'speech interrupted by adaptive interruption detector',
755
+ );
756
+ this.realtimeSession?.interrupt();
757
+ this._currentSpeech.interrupt();
758
+ }
759
+ }
760
+
761
+ onInterimTranscript(ev: SpeechEvent): void {
762
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
763
+ // skip stt transcription if userTranscription is enabled on the realtime model
764
+ return;
765
+ }
766
+
767
+ this.agentSession.emit(
768
+ AgentSessionEventTypes.UserInputTranscribed,
769
+ createUserInputTranscribedEvent({
770
+ transcript: ev.alternatives![0].text,
771
+ isFinal: false,
772
+ language: ev.alternatives![0].language,
773
+ // TODO(AJS-106): add multi participant support
774
+ }),
775
+ );
776
+ }
777
+
778
+ onFinalTranscript(ev: SpeechEvent): void {
779
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
780
+ // skip stt transcription if userTranscription is enabled on the realtime model
781
+ return;
782
+ }
783
+
784
+ this.agentSession.emit(
785
+ AgentSessionEventTypes.UserInputTranscribed,
786
+ createUserInputTranscribedEvent({
787
+ transcript: ev.alternatives![0].text,
788
+ isFinal: true,
789
+ language: ev.alternatives![0].language,
790
+ // TODO(AJS-106): add multi participant support
791
+ }),
792
+ );
793
+ }
794
+
795
+ onPreemptiveGeneration(info: PreemptiveGenerationInfo): void {
796
+ if (
797
+ !this.agentSession.options.preemptiveGeneration ||
798
+ this.draining ||
799
+ (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) ||
800
+ !(this.llm instanceof LLM)
801
+ ) {
802
+ return;
803
+ }
804
+
805
+ this.cancelPreemptiveGeneration();
806
+
807
+ this.logger.info(
808
+ {
809
+ newTranscript: info.newTranscript,
810
+ transcriptConfidence: info.transcriptConfidence,
811
+ },
812
+ 'starting preemptive generation',
813
+ );
814
+
815
+ const userMessage = ChatMessage.create({
816
+ role: 'user',
817
+ content: info.newTranscript,
818
+ });
819
+ const chatCtx = this.agent.chatCtx.copy();
820
+ const speechHandle = this.generateReply({
821
+ userMessage,
822
+ chatCtx,
823
+ scheduleSpeech: false,
824
+ });
825
+
826
+ this._preemptiveGeneration = {
827
+ speechHandle,
828
+ userMessage,
829
+ info,
830
+ chatCtx: chatCtx.copy(),
831
+ tools: { ...this.tools },
832
+ toolChoice: this.toolChoice,
833
+ createdAt: Date.now(),
834
+ };
835
+ }
836
+
837
+ private cancelPreemptiveGeneration(): void {
838
+ if (this._preemptiveGeneration !== undefined) {
839
+ this._preemptiveGeneration.speechHandle._cancel();
840
+ this._preemptiveGeneration = undefined;
841
+ }
842
+ }
843
+
844
  /**
   * Register a task in the activity's speech-task set so draining can wait on it.
   *
   * When an owning SpeechHandle is given, the task is attached to the handle and
   * the handle is marked done once all of its tasks complete. Every task wakes
   * the main loop on completion so draining can re-evaluate.
   *
   * @returns The task's result promise.
   */
  private createSpeechTask(options: {
    task: Task<void>;
    ownedSpeechHandle?: SpeechHandle;
    name?: string;
  }): Promise<void> {
    const { task, ownedSpeechHandle } = options;

    this.speechTasks.add(task);
    task.addDoneCallback(() => {
      this.speechTasks.delete(task);
    });

    if (ownedSpeechHandle) {
      ownedSpeechHandle._tasks.push(task);
      task.addDoneCallback(() => {
        // Mark the handle done only when every task it owns has finished.
        if (ownedSpeechHandle._tasks.every((t) => t.done)) {
          ownedSpeechHandle._markDone();
        }
      });
    }

    // Let the main loop re-check the queue / draining condition.
    task.addDoneCallback(() => {
      this.wakeupMainTask();
    });

    return task.result;
  }
871
+
872
  /**
   * Recognition hook: a complete user turn was detected.
   *
   * @returns true when the turn was accepted (or deliberately dropped while
   *   draining); false when it was rejected by the minimum-word-count filter,
   *   letting the recognizer keep accumulating the transcript.
   */
  async onEndOfTurn(info: EndOfTurnInfo): Promise<boolean> {
    if (this.draining) {
      this.cancelPreemptiveGeneration();
      this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining');
      // TODO(shubhra): should we "forward" this new turn to the next agent/activity?
      return true;
    }

    // Word-count filter, consistent with onVADInferenceDone: only applied when
    // STT is present, turn detection is not manual, and there is interruptible,
    // not-yet-interrupted agent speech playing.
    if (
      this.stt &&
      this.turnDetection !== 'manual' &&
      this._currentSpeech &&
      this._currentSpeech.allowInterruptions &&
      !this._currentSpeech.interrupted &&
      this.agentSession.options.minInterruptionWords > 0
    ) {
      const wordCount = splitWords(info.newTranscript, true).length;
      if (wordCount < this.agentSession.options.minInterruptionWords) {
        // avoid interruption if the new_transcript contains fewer words than minInterruptionWords
        this.cancelPreemptiveGeneration();
        this.logger.info(
          {
            wordCount,
            minInterruptionWords: this.agentSession.options.minInterruptionWords,
          },
          'skipping user input, word count below minimum interruption threshold',
        );
        return false;
      }
    }

    // Chain onto any still-running previous turn handler (user code is never
    // cancelled); userTurnCompleted awaits oldTask before doing its work.
    const oldTask = this._userTurnCompletedTask;
    this._userTurnCompletedTask = this.createSpeechTask({
      task: Task.from(() => this.userTurnCompleted(info, oldTask)),
      name: 'AgentActivity.userTurnCompleted',
    });
    return true;
  }
913
+
914
  /** Recognition hook: expose the session's chat context to the recognizer. */
  retrieveChatCtx(): ChatContext {
    return this.agentSession.chatCtx;
  }
917
+
918
  /**
   * Main scheduling loop: drains the priority speech queue, authorizing one
   * speech handle at a time and waiting for its generation to finish.
   *
   * Wakes up whenever q_updated resolves (new speech scheduled or a speech task
   * finished) or the abort signal fires. Exits on abort, or when draining with
   * no speech tasks left.
   */
  private async mainTask(signal: AbortSignal): Promise<void> {
    const abortFuture = new Future();
    const abortHandler = () => {
      abortFuture.resolve();
      signal.removeEventListener('abort', abortHandler);
    };
    signal.addEventListener('abort', abortHandler);

    while (true) {
      await Promise.race([this.q_updated.await, abortFuture.await]);
      if (signal.aborted) break;

      while (this.speechQueue.size() > 0) {
        if (signal.aborted) break;

        const heapItem = this.speechQueue.pop();
        if (!heapItem) {
          throw new Error('Speech queue is empty');
        }
        // heap items are [priority, sequence, handle]; index 2 is the handle
        const speechHandle = heapItem[2];
        this._currentSpeech = speechHandle;
        speechHandle._authorizeGeneration();
        await speechHandle._waitForGeneration();
        this._currentSpeech = undefined;
      }

      // If we're draining and there are no more speech tasks, we can exit.
      // Only speech tasks can bypass draining to create a tool response
      if (this.draining && this.speechTasks.size === 0) {
        this.logger.info('mainTask: draining and no more speech tasks');
        break;
      }

      // Re-arm the wakeup future for the next scheduling round.
      this.q_updated = new Future();
    }

    this.logger.info('AgentActivity mainTask: exiting');
  }
956
+
957
  /** Resolve the current wakeup future so mainTask re-checks the queue. */
  private wakeupMainTask(): void {
    this.q_updated.resolve();
  }
960
+
961
+ generateReply(options: {
962
+ userMessage?: ChatMessage;
963
+ chatCtx?: ChatContext;
964
+ instructions?: string;
965
+ toolChoice?: ToolChoice | null;
966
+ allowInterruptions?: boolean;
967
+ scheduleSpeech?: boolean;
968
+ }): SpeechHandle {
969
+ const {
970
+ userMessage,
971
+ chatCtx,
972
+ instructions: defaultInstructions,
973
+ toolChoice: defaultToolChoice,
974
+ allowInterruptions: defaultAllowInterruptions,
975
+ scheduleSpeech = true,
976
+ } = options;
977
+
978
+ let instructions = defaultInstructions;
979
+ let toolChoice = defaultToolChoice;
980
+ let allowInterruptions = defaultAllowInterruptions;
981
+
982
+ if (
983
+ this.llm instanceof RealtimeModel &&
984
+ this.llm.capabilities.turnDetection &&
985
+ allowInterruptions === false
986
+ ) {
987
+ this.logger.warn(
988
+ 'the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false when using VoiceAgent.generateReply(), ' +
989
+ 'disable turnDetection in the RealtimeModel and use VAD on the AgentTask/VoiceAgent instead',
990
+ );
991
+ allowInterruptions = true;
992
+ }
993
+
994
+ if (this.llm === undefined) {
995
+ throw new Error('trying to generate reply without an LLM model');
996
+ }
997
+
998
+ const functionCall = asyncLocalStorage.getStore()?.functionCall;
999
+ if (toolChoice === undefined && functionCall !== undefined) {
1000
+ // when generateReply is called inside a tool, set toolChoice to 'none' by default
1001
+ toolChoice = 'none';
1002
+ }
1003
+
1004
+ const handle = SpeechHandle.create({
1005
+ allowInterruptions: allowInterruptions ?? this.allowInterruptions,
1006
+ });
1007
+
1008
+ this.agentSession.emit(
1009
+ AgentSessionEventTypes.SpeechCreated,
1010
+ createSpeechCreatedEvent({
1011
+ userInitiated: true,
1012
+ source: 'generate_reply',
1013
+ speechHandle: handle,
1014
+ }),
1015
+ );
1016
+ this.logger.info({ speech_id: handle.id }, 'Creating speech handle');
1017
+
1018
+ if (this.llm instanceof RealtimeModel) {
1019
+ this.createSpeechTask({
1020
+ task: Task.from((abortController: AbortController) =>
1021
+ this.realtimeReplyTask({
1022
+ speechHandle: handle,
1023
+ // TODO(brian): support llm.ChatMessage for the realtime model
1024
+ userInput: userMessage?.textContent,
1025
+ instructions,
1026
+ modelSettings: {
1027
+ // isGiven(toolChoice) = toolChoice !== undefined
1028
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
1029
+ },
1030
+ abortController,
1031
+ }),
1032
+ ),
1033
+ ownedSpeechHandle: handle,
1034
+ name: 'AgentActivity.realtimeReply',
1035
+ });
1036
+ } else if (this.llm instanceof LLM) {
1037
+ // instructions used inside generateReply are "extra" instructions.
1038
+ // this matches the behavior of the Realtime API:
1039
+ // https://platform.openai.com/docs/api-reference/realtime-client-events/response/create
1040
+ if (instructions) {
1041
+ instructions = `${this.agent.instructions}\n${instructions}`;
1042
+ }
1043
+
1044
+ const task = this.createSpeechTask({
1045
+ task: Task.from((abortController: AbortController) =>
1046
+ this.pipelineReplyTask(
1047
+ handle,
1048
+ chatCtx ?? this.agent.chatCtx,
1049
+ this.agent.toolCtx,
1050
+ {
1051
+ toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
1052
+ },
1053
+ abortController,
1054
+ instructions ? `${this.agent.instructions}\n${instructions}` : instructions,
1055
+ userMessage,
1056
+ ),
1057
+ ),
1058
+ ownedSpeechHandle: handle,
1059
+ name: 'AgentActivity.pipelineReply',
1060
+ });
1061
+
1062
+ task.finally(() => this.onPipelineReplyDone());
1063
+ }
1064
+
1065
+ if (scheduleSpeech) {
1066
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
1067
+ }
1068
+ return handle;
1069
+ }
1070
+
1071
+ interrupt(): Future<void> {
1072
+ const future = new Future<void>();
1073
+ const currentSpeech = this._currentSpeech;
1074
+
1075
+ //TODO(AJS-273): add interrupt for background speeches
1076
+
1077
+ currentSpeech?.interrupt();
1078
+
1079
+ for (const [_, __, speech] of this.speechQueue) {
1080
+ speech.interrupt();
1081
+ }
1082
+
1083
+ this.realtimeSession?.interrupt();
1084
+
1085
+ if (currentSpeech === undefined) {
1086
+ future.resolve();
1087
+ } else {
1088
+ currentSpeech.addDoneCallback(() => {
1089
+ if (future.done) return;
1090
+ future.resolve();
1091
+ });
1092
+ }
1093
+
1094
+ return future;
1095
+ }
1096
+
1097
+ private onPipelineReplyDone(): void {
1098
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done())) {
1099
+ this.agentSession._updateAgentState('listening');
1100
+ }
1101
+ }
1102
+
1103
  /**
   * Handle a committed user turn: run the agent's onUserTurnCompleted callback,
   * interrupt current speech if allowed, then reuse a valid preemptive
   * generation or start a fresh reply, and finally emit end-of-utterance metrics.
   */
  private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise<void>): Promise<void> {
    if (oldTask) {
      // We never cancel user code as this is very confusing.
      // So we wait for the old execution of onUserTurnCompleted to finish.
      // In practice this is OK because most speeches will be interrupted if a new turn
      // is detected. So the previous execution should complete quickly.
      await oldTask;
    }

    // When the audio recognition detects the end of a user turn:
    // - check if realtime model server-side turn detection is enabled
    // - check if there is no current generation happening
    // - cancel the current generation if it allows interruptions (otherwise skip this current
    //   turn)
    // - generate a reply to the user input

    if (this.llm instanceof RealtimeModel) {
      if (this.llm.capabilities.turnDetection) {
        return;
      }
      this.realtimeSession?.commitAudio();
    }

    if (this._currentSpeech) {
      if (!this._currentSpeech.allowInterruptions) {
        this.logger.warn(
          { user_input: info.newTranscript },
          'skipping user input, current speech generation cannot be interrupted',
        );
        return;
      }

      this.logger.info(
        { 'speech id': this._currentSpeech.id },
        'speech interrupted, new user turn detected',
      );

      this._currentSpeech.interrupt();
      this.realtimeSession?.interrupt();
    }

    let userMessage: ChatMessage | undefined = ChatMessage.create({
      role: 'user',
      content: info.newTranscript,
    });

    // create a temporary mutable chat context to pass to onUserTurnCompleted
    // the user can edit it for the current generation, but changes will not be kept inside the
    // Agent.chatCtx
    const chatCtx = this.agent.chatCtx.copy();
    const startTime = Date.now();

    try {
      await this.agent.onUserTurnCompleted(chatCtx, userMessage);
    } catch (e) {
      // StopResponse is the documented way for user code to suppress the reply.
      if (e instanceof StopResponse) {
        return;
      }
      this.logger.error({ error: e }, 'error occurred during onUserTurnCompleted');
    }

    const callbackDuration = Date.now() - startTime;

    if (this.llm instanceof RealtimeModel) {
      // ignore stt transcription for realtime model
      userMessage = undefined;
    } else if (this.llm === undefined) {
      return;
    }

    let speechHandle: SpeechHandle | undefined;
    if (this._preemptiveGeneration !== undefined) {
      const preemptive = this._preemptiveGeneration;
      // make sure the onUserTurnCompleted didn't change some request parameters
      // otherwise invalidate the preemptive generation
      if (
        preemptive.info.newTranscript === userMessage?.textContent &&
        preemptive.chatCtx.isEquivalent(chatCtx) &&
        isSameToolContext(preemptive.tools, this.tools) &&
        isSameToolChoice(preemptive.toolChoice, this.toolChoice)
      ) {
        speechHandle = preemptive.speechHandle;
        this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
        this.logger.debug(
          {
            preemptiveLeadTime: Date.now() - preemptive.createdAt,
          },
          'using preemptive generation',
        );
      } else {
        this.logger.warn(
          'preemptive generation enabled but chat context or tools have changed after `onUserTurnCompleted`',
        );
        preemptive.speechHandle._cancel();
      }

      this._preemptiveGeneration = undefined;
    }

    if (speechHandle === undefined) {
      // Ensure the new message is passed to generateReply
      // This preserves the original message id, making it easier for users to track responses
      speechHandle = this.generateReply({ userMessage, chatCtx });
    }

    const eouMetrics: EOUMetrics = {
      type: 'eou_metrics',
      timestamp: Date.now(),
      endOfUtteranceDelayMs: info.endOfUtteranceDelay,
      transcriptionDelayMs: info.transcriptionDelay,
      onUserTurnCompletedDelayMs: callbackDuration,
      lastSpeakingTimeMs: info.stoppedSpeakingAt ?? 0,
      speechId: speechHandle.id,
    };

    this.agentSession.emit(
      AgentSessionEventTypes.MetricsCollected,
      createMetricsCollectedEvent({ metrics: eouMetrics }),
    );
  }
1223
+
1224
  /**
   * Drive a `say()` speech: forward transcription text, synthesize (or play
   * pre-provided) audio, handle interruption cleanup, and optionally record the
   * utterance as an assistant message.
   *
   * @param speechHandle - Handle owning this speech; authorization is awaited
   *   before any output is produced.
   * @param text - The text to speak, as a string or stream.
   * @param addToChatCtx - Whether to append the assistant message afterwards.
   * @param modelSettings - Settings passed through to the TTS node.
   * @param replyAbortController - Aborted on interruption to cancel sub-tasks.
   * @param audio - Pre-synthesized audio; when set, TTS inference is skipped.
   */
  private async ttsTask(
    speechHandle: SpeechHandle,
    text: string | ReadableStream<string>,
    addToChatCtx: boolean,
    modelSettings: ModelSettings,
    replyAbortController: AbortController,
    audio?: ReadableStream<AudioFrame> | null,
  ): Promise<void> {
    // Make the handle visible to metrics hooks via async-local storage.
    speechHandleStorage.enterWith(speechHandle);

    const transcriptionOutput = this.agentSession.output.transcriptionEnabled
      ? this.agentSession.output.transcription
      : null;

    const audioOutput = this.agentSession.output.audioEnabled
      ? this.agentSession.output.audio
      : null;

    // Wait until the main loop authorizes this speech (or it is interrupted).
    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);

    if (speechHandle.interrupted) {
      return;
    }

    // Normalize the text input into a stream.
    let baseStream: ReadableStream<string>;
    if (text instanceof ReadableStream) {
      baseStream = text;
    } else {
      baseStream = new ReadableStream({
        start(controller) {
          controller.enqueue(text);
          controller.close();
        },
      });
    }

    // One branch feeds transcription forwarding, the other feeds TTS.
    const [textSource, audioSource] = baseStream.tee();

    const tasks: Array<Task<void>> = [];

    const trNode = await this.agent.transcriptionNode(textSource, {});
    let textOut: _TextOut | null = null;
    if (trNode) {
      const [textForwardTask, _textOut] = performTextForwarding(
        trNode,
        replyAbortController,
        transcriptionOutput,
      );
      textOut = _textOut;
      tasks.push(textForwardTask);
    }

    // Flip the agent to 'speaking' on the first emitted frame (or first text
    // when there is no audio output).
    const onFirstFrame = () => {
      this.agentSession._updateAgentState('speaking');
    };

    if (!audioOutput) {
      if (textOut) {
        textOut.firstTextFut.await.finally(onFirstFrame);
      }
    } else {
      let audioOut: _AudioOut | null = null;
      if (!audio) {
        // generate audio using TTS
        const [ttsTask, ttsStream] = performTTSInference(
          (...args) => this.agent.ttsNode(...args),
          audioSource,
          modelSettings,
          replyAbortController,
        );
        tasks.push(ttsTask);

        const [forwardTask, _audioOut] = performAudioForwarding(
          ttsStream,
          audioOutput,
          replyAbortController,
        );
        tasks.push(forwardTask);
        audioOut = _audioOut;
      } else {
        // use the provided audio
        const [forwardTask, _audioOut] = performAudioForwarding(
          audio,
          audioOutput,
          replyAbortController,
        );
        tasks.push(forwardTask);
        audioOut = _audioOut;
      }
      audioOut.firstFrameFut.await.finally(onFirstFrame);
    }

    await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));

    if (audioOutput) {
      await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
    }

    // On interruption: abort sub-tasks, drop buffered audio, wait for playout drain.
    if (speechHandle.interrupted) {
      replyAbortController.abort();
      await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      if (audioOutput) {
        audioOutput.clearBuffer();
        await audioOutput.waitForPlayout();
      }
    }

    if (addToChatCtx) {
      const message = ChatMessage.create({
        role: 'assistant',
        content: textOut?.text || '',
        interrupted: speechHandle.interrupted,
      });
      this.agent._chatCtx.insert(message);
      this.agentSession._conversationItemAdded(message);
    }

    if (this.agentSession.agentState === 'speaking') {
      this.agentSession._updateAgentState('listening');
    }
  }
1345
+
1346
  /**
   * Core implementation of a pipeline reply turn (LLM -> optional TTS -> playout).
   *
   * Runs LLM inference over a copy of `chatCtx`, forwards the generated text to the
   * transcription output (and, when audio output is enabled, synthesized audio to the
   * audio output), executes tool calls streamed by the LLM, and finally commits the
   * resulting assistant message (and any triggering tool messages) to the agent's
   * chat context. Interruption is checked at every await point via `speechHandle`;
   * aborting is driven through `replyAbortController`.
   *
   * @param speechHandle - handle tracking scheduling/authorization/interruption of this turn
   * @param chatCtx - chat context to run inference on (copied; the original is not mutated)
   * @param toolCtx - tools available to the LLM for this turn
   * @param modelSettings - per-turn model settings (e.g. toolChoice)
   * @param replyAbortController - aborts all inference/forwarding tasks of this turn
   * @param instructions - optional system instructions injected into the copied context
   * @param newMessage - optional user message that triggered this reply
   * @param toolsMessages - optional tool call/output items that triggered this reply
   * @param span - tracing span for this `agent_turn`
   */
  private _pipelineReplyTaskImpl = async ({
    speechHandle,
    chatCtx,
    toolCtx,
    modelSettings,
    replyAbortController,
    instructions,
    newMessage,
    toolsMessages,
    span,
  }: {
    speechHandle: SpeechHandle;
    chatCtx: ChatContext;
    toolCtx: ToolContext;
    modelSettings: ModelSettings;
    replyAbortController: AbortController;
    instructions?: string;
    newMessage?: ChatMessage;
    toolsMessages?: ChatItem[];
    span: Span;
  }): Promise<void> => {
    span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
    if (instructions) {
      span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
    }
    if (newMessage) {
      span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
    }

    // make the speech handle available to code running within this async context
    speechHandleStorage.enterWith(speechHandle);

    const audioOutput = this.agentSession.output.audioEnabled
      ? this.agentSession.output.audio
      : null;
    const transcriptionOutput = this.agentSession.output.transcriptionEnabled
      ? this.agentSession.output.transcription
      : null;

    // work on a private copy; the real context is only updated once the speech is scheduled
    chatCtx = chatCtx.copy();

    // Insert new message into temporary chat context for LLM inference
    if (newMessage) {
      chatCtx.insert(newMessage);
    }

    if (instructions) {
      try {
        updateInstructions({
          chatCtx,
          instructions,
          addIfMissing: true,
        });
      } catch (e) {
        this.logger.error({ error: e }, 'error occurred during updateInstructions');
      }
    }

    const tasks: Array<Task<void>> = [];
    const [llmTask, llmGenData] = performLLMInference(
      // preserve `this` context in llmNode
      (...args) => this.agent.llmNode(...args),
      chatCtx,
      toolCtx,
      modelSettings,
      replyAbortController,
    );
    tasks.push(llmTask);

    // fork the LLM text stream: one branch feeds TTS, the other transcription forwarding
    const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();

    let ttsTask: Task<void> | null = null;
    let ttsStream: ReadableStream<AudioFrame> | null = null;
    if (audioOutput) {
      [ttsTask, ttsStream] = performTTSInference(
        (...args) => this.agent.ttsNode(...args),
        ttsTextInput,
        modelSettings,
        replyAbortController,
      );
      tasks.push(ttsTask);
    }

    await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]);

    // Add new message to actual chat context if the speech is scheduled
    if (newMessage && speechHandle.scheduled) {
      this.agent._chatCtx.insert(newMessage);
      this.agentSession._conversationItemAdded(newMessage);
    }

    if (speechHandle.interrupted) {
      replyAbortController.abort();
      await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      return;
    }

    this.agentSession._updateAgentState('thinking');

    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
    speechHandle._clearAuthorization();

    // timestamp used for all chat items committed by this turn
    const replyStartedAt = Date.now();
    const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
    let textOut: _TextOut | null = null;
    if (trNodeResult) {
      const [textForwardTask, _textOut] = performTextForwarding(
        trNodeResult,
        replyAbortController,
        transcriptionOutput,
      );
      tasks.push(textForwardTask);
      textOut = _textOut;
    }

    // flip to 'speaking' as soon as the first frame (or first text chunk) goes out
    const onFirstFrame = () => {
      this.agentSession._updateAgentState('speaking');
    };

    let audioOut: _AudioOut | null = null;
    if (audioOutput) {
      if (ttsStream) {
        const [forwardTask, _audioOut] = performAudioForwarding(
          ttsStream,
          audioOutput,
          replyAbortController,
        );
        audioOut = _audioOut;
        tasks.push(forwardTask);
        audioOut.firstFrameFut.await.finally(onFirstFrame);
      } else {
        throw Error('ttsStream is null when audioOutput is enabled');
      }
    } else {
      textOut?.firstTextFut.await.finally(onFirstFrame);
    }

    //TODO(AJS-272): before executing tools, make sure we generated all the text
    // (this ensures everything is kept ordered)

    const onToolExecutionStarted = (_: FunctionCall) => {
      // TODO(brian): handle speech_handle item_added
    };

    const onToolExecutionCompleted = (_: ToolExecutionOutput) => {
      // TODO(brian): handle speech_handle item_added
    };

    const [executeToolsTask, toolOutput] = performToolExecutions({
      session: this.agentSession,
      speechHandle,
      toolCtx,
      toolChoice: modelSettings.toolChoice,
      toolCallStream: llmGenData.toolCallStream,
      controller: replyAbortController,
      onToolExecutionStarted,
      onToolExecutionCompleted,
    });

    await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));

    if (audioOutput) {
      await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
    }

    // add the tool messages that triggered this reply to the chat context
    if (toolsMessages) {
      for (const msg of toolsMessages) {
        msg.createdAt = replyStartedAt;
      }
      this.agent._chatCtx.insert(toolsMessages);
      // Also add to session history (matches Python agent_session.py _tool_items_added)
      this.agentSession._toolItemsAdded(toolsMessages as (FunctionCall | FunctionCallOutput)[]);
    }

    if (speechHandle.interrupted) {
      this.logger.debug(
        { speech_id: speechHandle.id },
        'Aborting all pipeline reply tasks due to interruption',
      );

      // Stop playout ASAP (don't wait for cancellations), otherwise the segment may finish and we
      // will correctly (but undesirably) commit a long transcript even though the user said "stop".
      if (audioOutput) {
        audioOutput.clearBuffer();
      }

      replyAbortController.abort();
      await Promise.allSettled(
        tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT)),
      );

      let forwardedText = textOut?.text || '';

      if (audioOutput) {
        const playbackEv = await audioOutput.waitForPlayout();
        if (audioOut?.firstFrameFut.done) {
          // playback EV is valid only if the first frame was already played
          this.logger.info(
            { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
            'playout interrupted',
          );
          if (playbackEv.synchronizedTranscript) {
            forwardedText = playbackEv.synchronizedTranscript;
          }
        } else {
          forwardedText = '';
        }
      }

      // only commit the partial transcript that was actually played out
      if (forwardedText) {
        const message = ChatMessage.create({
          role: 'assistant',
          content: forwardedText,
          id: llmGenData.id,
          interrupted: true,
          createdAt: replyStartedAt,
        });
        chatCtx.insert(message);
        this.agent._chatCtx.insert(message);
        this.agentSession._conversationItemAdded(message);
      }

      if (this.agentSession.agentState === 'speaking') {
        this.agentSession._updateAgentState('listening');
      }

      this.logger.info(
        { speech_id: speechHandle.id, message: forwardedText },
        'playout completed with interrupt',
      );
      // TODO(shubhra) add chat message to speech handle
      speechHandle._markGenerationDone();
      await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      return;
    }

    if (textOut && textOut.text) {
      const message = ChatMessage.create({
        role: 'assistant',
        id: llmGenData.id,
        interrupted: false,
        createdAt: replyStartedAt,
        content: textOut.text,
      });
      chatCtx.insert(message);
      this.agent._chatCtx.insert(message);
      this.agentSession._conversationItemAdded(message);
      this.logger.info(
        { speech_id: speechHandle.id, message: textOut.text },
        'playout completed without interruption',
      );
    }

    if (toolOutput.output.length > 0) {
      this.agentSession._updateAgentState('thinking');
    } else if (this.agentSession.agentState === 'speaking') {
      this.agentSession._updateAgentState('listening');
    }

    // mark the playout done before waiting for the tool execution
    speechHandle._markGenerationDone();
    await executeToolsTask.result;

    if (toolOutput.output.length === 0) return;

    // important: no agent output should be used after this point
    const { maxToolSteps } = this.agentSession.options;
    if (speechHandle.numSteps >= maxToolSteps) {
      this.logger.warn(
        { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
        'maximum number of function calls steps reached',
      );
      return;
    }

    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
      functionCalls: [],
      functionCallOutputs: [],
    });
    let shouldGenerateToolReply: boolean = false;
    let newAgentTask: Agent | null = null;
    let ignoreTaskSwitch: boolean = false;

    for (const sanitizedOut of toolOutput.output) {
      if (sanitizedOut.toolCallOutput !== undefined) {
        functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
        if (sanitizedOut.replyRequired) {
          shouldGenerateToolReply = true;
        }
      }

      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
        this.logger.error('expected to receive only one agent task from the tool executions');
        ignoreTaskSwitch = true;
        // TODO(brian): should we mark the function call as failed to notify the LLM?
      }

      newAgentTask = sanitizedOut.agentTask ?? null;

      this.logger.debug(
        {
          speechId: speechHandle.id,
          name: sanitizedOut.toolCall?.name,
          // NOTE(review): `name` above uses `?.` but `args` does not — if toolCall
          // can ever be undefined this line throws; confirm and make consistent
          args: sanitizedOut.toolCall.args,
          output: sanitizedOut.toolCallOutput?.output,
          isError: sanitizedOut.toolCallOutput?.isError,
        },
        'Tool call execution finished',
      );
    }

    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );

    // a task switch returned by a tool puts the current activity into draining mode
    let draining = this.draining;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
      draining = true;
    }

    const toolMessages = [
      ...functionToolsExecutedEvent.functionCalls,
      ...functionToolsExecutedEvent.functionCallOutputs,
    ] as ChatItem[];
    if (shouldGenerateToolReply) {
      chatCtx.insert(toolMessages);

      // follow-up speech that carries the tool results back to the LLM
      const handle = SpeechHandle.create({
        allowInterruptions: speechHandle.allowInterruptions,
        stepIndex: speechHandle._stepIndex + 1,
        parent: speechHandle,
      });
      this.agentSession.emit(
        AgentSessionEventTypes.SpeechCreated,
        createSpeechCreatedEvent({
          userInitiated: false,
          source: 'tool_response',
          speechHandle: handle,
        }),
      );

      // Avoid setting tool_choice to "required" or a specific function when
      // passing tool response back to the LLM
      const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';

      const toolResponseTask = this.createSpeechTask({
        task: Task.from(() =>
          this.pipelineReplyTask(
            handle,
            chatCtx,
            toolCtx,
            { toolChoice: respondToolChoice },
            replyAbortController,
            instructions,
            undefined,
            toolMessages,
          ),
        ),
        ownedSpeechHandle: handle,
        name: 'AgentActivity.pipelineReply',
      });

      toolResponseTask.finally(() => this.onPipelineReplyDone());

      // force=true so the tool response can still be scheduled while draining
      this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
    } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
      for (const msg of toolMessages) {
        msg.createdAt = replyStartedAt;
      }
      this.agent._chatCtx.insert(toolMessages);
      this.agentSession._toolItemsAdded(toolMessages as (FunctionCall | FunctionCallOutput)[]);
    }
  };
1722
+
1723
+ private pipelineReplyTask = async (
1724
+ speechHandle: SpeechHandle,
1725
+ chatCtx: ChatContext,
1726
+ toolCtx: ToolContext,
1727
+ modelSettings: ModelSettings,
1728
+ replyAbortController: AbortController,
1729
+ instructions?: string,
1730
+ newMessage?: ChatMessage,
1731
+ toolsMessages?: ChatItem[],
1732
+ ): Promise<void> =>
1733
+ tracer.startActiveSpan(
1734
+ async (span) =>
1735
+ this._pipelineReplyTaskImpl({
1736
+ speechHandle,
1737
+ chatCtx,
1738
+ toolCtx,
1739
+ modelSettings,
1740
+ replyAbortController,
1741
+ instructions,
1742
+ newMessage,
1743
+ toolsMessages,
1744
+ span,
1745
+ }),
1746
+ {
1747
+ name: 'agent_turn',
1748
+ context: this.agentSession.rootSpanContext,
1749
+ },
1750
+ );
1751
+
1752
+ private async realtimeGenerationTask(
1753
+ speechHandle: SpeechHandle,
1754
+ ev: GenerationCreatedEvent,
1755
+ modelSettings: ModelSettings,
1756
+ replyAbortController: AbortController,
1757
+ ): Promise<void> {
1758
+ return tracer.startActiveSpan(
1759
+ async (span) =>
1760
+ this._realtimeGenerationTaskImpl({
1761
+ speechHandle,
1762
+ ev,
1763
+ modelSettings,
1764
+ replyAbortController,
1765
+ span,
1766
+ }),
1767
+ {
1768
+ name: 'agent_turn',
1769
+ context: this.agentSession.rootSpanContext,
1770
+ },
1771
+ );
1772
+ }
1773
+
1774
  /**
   * Core implementation of a realtime-model generation turn.
   *
   * Consumes the message/tool-call streams of a `GenerationCreatedEvent`:
   * forwards transcription text (and audio — native realtime audio, or TTS as
   * a fallback for text-only responses) to the session outputs, executes the
   * streamed tool calls, commits the resulting assistant message to the chat
   * context, and — when tool outputs require it — schedules a follow-up
   * `realtimeReplyTask`. Interruption is handled at each await point via
   * `speechHandle`, with server-side truncation of the partially played message.
   *
   * @param speechHandle - handle tracking authorization/interruption of this turn
   * @param ev - generation event carrying the message and function-call streams
   * @param modelSettings - per-turn model settings (e.g. toolChoice)
   * @param replyAbortController - aborts all forwarding/tool tasks of this turn
   * @param span - tracing span for this `agent_turn`
   * @throws Error when no realtime session is active or the LLM is not a RealtimeModel
   */
  private async _realtimeGenerationTaskImpl({
    speechHandle,
    ev,
    modelSettings,
    replyAbortController,
    span,
  }: {
    speechHandle: SpeechHandle;
    ev: GenerationCreatedEvent;
    modelSettings: ModelSettings;
    replyAbortController: AbortController;
    span: Span;
  }): Promise<void> {
    span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);

    // make the speech handle available to code running within this async context
    speechHandleStorage.enterWith(speechHandle);

    if (!this.realtimeSession) {
      throw new Error('realtime session is not initialized');
    }
    if (!(this.llm instanceof RealtimeModel)) {
      throw new Error('llm is not a realtime model');
    }

    // Store span for metrics recording when they arrive later
    span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.llm.model);
    if (this.realtimeSpans && ev.responseId) {
      this.realtimeSpans.set(ev.responseId, span);
    }

    this.logger.debug(
      { speech_id: speechHandle.id, stepIndex: speechHandle.numSteps },
      'realtime generation started',
    );

    const audioOutput = this.agentSession.output.audioEnabled
      ? this.agentSession.output.audio
      : null;
    const textOutput = this.agentSession.output.transcriptionEnabled
      ? this.agentSession.output.transcription
      : null;
    const toolCtx = this.realtimeSession.tools;

    await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
    speechHandle._clearAuthorization();

    if (speechHandle.interrupted) {
      return;
    }

    // flip to 'speaking' as soon as the first frame (or first text chunk) goes out
    const onFirstFrame = () => {
      this.agentSession._updateAgentState('speaking');
    };

    // Drains ev.messageStream (expects a single message), wiring each message's
    // text/audio to the outputs; collected per-message results land in `outputs`.
    const readMessages = async (
      abortController: AbortController,
      outputs: Array<[string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]>,
    ) => {
      // propagate the reply-level abort to this reader's own controller
      replyAbortController.signal.addEventListener('abort', () => abortController.abort(), {
        once: true,
      });

      const forwardTasks: Array<Task<void>> = [];
      try {
        for await (const msg of ev.messageStream) {
          if (forwardTasks.length > 0) {
            this.logger.warn(
              'expected to receive only one message generation from the realtime API',
            );
            break;
          }

          const msgModalities = msg.modalities ? await msg.modalities : undefined;
          let ttsTextInput: ReadableStream<string> | null = null;
          let trTextInput: ReadableStream<string>;

          // text-only response with a TTS available: fork the text stream so one
          // branch can be synthesized while the other feeds transcription
          if (msgModalities && !msgModalities.includes('audio') && this.tts) {
            if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
              this.logger.warn(
                'text response received from realtime API, falling back to use a TTS model.',
              );
            }
            const [_ttsTextInput, _trTextInput] = msg.textStream.tee();
            ttsTextInput = _ttsTextInput;
            trTextInput = _trTextInput;
          } else {
            trTextInput = msg.textStream;
          }

          const trNodeResult = await this.agent.transcriptionNode(trTextInput, modelSettings);
          let textOut: _TextOut | null = null;
          if (trNodeResult) {
            const [textForwardTask, _textOut] = performTextForwarding(
              trNodeResult,
              abortController,
              textOutput,
            );
            forwardTasks.push(textForwardTask);
            textOut = _textOut;
          }

          let audioOut: _AudioOut | null = null;
          if (audioOutput) {
            let realtimeAudioResult: ReadableStream<AudioFrame> | null = null;

            if (ttsTextInput) {
              const [ttsTask, ttsStream] = performTTSInference(
                (...args) => this.agent.ttsNode(...args),
                ttsTextInput,
                modelSettings,
                abortController,
              );
              // NOTE(review): pushes onto the outer `tasks` array declared below this
              // closure; safe only because readMessages runs after that initializer
              // completes — consider pushing to forwardTasks instead. TODO confirm
              tasks.push(ttsTask);
              realtimeAudioResult = ttsStream;
            } else if (msgModalities && msgModalities.includes('audio')) {
              realtimeAudioResult = await this.agent.realtimeAudioOutputNode(
                msg.audioStream,
                modelSettings,
              );
            } else if (this.llm instanceof RealtimeModel && this.llm.capabilities.audioOutput) {
              this.logger.error(
                'Text message received from Realtime API with audio modality. ' +
                  'This usually happens when text chat context is synced to the API. ' +
                  'Try to add a TTS model as fallback or use text modality with TTS instead.',
              );
            } else {
              this.logger.warn(
                'audio output is enabled but neither tts nor realtime audio is available',
              );
            }

            if (realtimeAudioResult) {
              const [forwardTask, _audioOut] = performAudioForwarding(
                realtimeAudioResult,
                audioOutput,
                abortController,
              );
              forwardTasks.push(forwardTask);
              audioOut = _audioOut;
              audioOut.firstFrameFut.await.finally(onFirstFrame);
            }
          } else if (textOut) {
            textOut.firstTextFut.await.finally(onFirstFrame);
          }
          outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
        }
        await waitFor(forwardTasks);
      } catch (error) {
        this.logger.error(error, 'error reading messages from the realtime API');
      } finally {
        await cancelAndWait(forwardTasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
      }
    };

    // per-message results: [messageId, textOut, audioOut, modalities]
    const messageOutputs: Array<
      [string, _TextOut | null, _AudioOut | null, ('text' | 'audio')[] | undefined]
    > = [];
    const tasks = [
      Task.from(
        (controller) => readMessages(controller, messageOutputs),
        undefined,
        'AgentActivity.realtime_generation.read_messages',
      ),
    ];

    const [toolCallStream, toolCallStreamForTracing] = ev.functionStream.tee();
    // TODO(brian): append to tracing tees
    const toolCalls: FunctionCall[] = [];

    // Drains the teed function-call stream for tracing/bookkeeping purposes.
    const readToolStreamTask = async (
      controller: AbortController,
      stream: ReadableStream<FunctionCall>,
    ) => {
      const reader = stream.getReader();
      try {
        while (!controller.signal.aborted) {
          const { done, value } = await reader.read();
          if (done) break;

          this.logger.debug({ tool_call: value }, 'received tool call from the realtime API');
          toolCalls.push(value);
        }
      } finally {
        reader.releaseLock();
      }
    };

    tasks.push(
      Task.from(
        (controller) => readToolStreamTask(controller, toolCallStreamForTracing),
        replyAbortController,
        'AgentActivity.realtime_generation.read_tool_stream',
      ),
    );

    // tool calls/outputs are committed to history as soon as execution starts/ends
    const onToolExecutionStarted = (f: FunctionCall) => {
      speechHandle._itemAdded([f]);
      this.agent._chatCtx.items.push(f);
      this.agentSession._toolItemsAdded([f]);
    };

    const onToolExecutionCompleted = (out: ToolExecutionOutput) => {
      if (out.toolCallOutput) {
        speechHandle._itemAdded([out.toolCallOutput]);
      }
    };

    const [executeToolsTask, toolOutput] = performToolExecutions({
      session: this.agentSession,
      speechHandle,
      toolCtx,
      toolCallStream,
      toolChoice: modelSettings.toolChoice,
      controller: replyAbortController,
      onToolExecutionStarted,
      onToolExecutionCompleted,
    });

    await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));

    // TODO(brian): add tracing span

    if (audioOutput) {
      await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
      this.agentSession._updateAgentState('listening');
    }

    if (speechHandle.interrupted) {
      this.logger.debug(
        { speech_id: speechHandle.id },
        'Aborting all realtime generation tasks due to interruption',
      );
      replyAbortController.abort();
      await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);

      if (messageOutputs.length > 0) {
        // there should be only one message
        const [msgId, textOut, audioOut, msgModalities] = messageOutputs[0]!;
        let forwardedText = textOut?.text || '';

        if (audioOutput) {
          audioOutput.clearBuffer();
          const playbackEv = await audioOutput.waitForPlayout();
          let playbackPosition = playbackEv.playbackPosition;
          if (audioOut?.firstFrameFut.done) {
            // playback EV is valid only if the first frame was already played
            this.logger.info(
              { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
              'playout interrupted',
            );
            if (playbackEv.synchronizedTranscript) {
              forwardedText = playbackEv.synchronizedTranscript;
            }
          } else {
            forwardedText = '';
            playbackPosition = 0;
          }

          // truncate server-side message
          this.realtimeSession.truncate({
            messageId: msgId,
            audioEndMs: Math.floor(playbackPosition),
            modalities: msgModalities,
            audioTranscript: forwardedText,
          });
        }

        // only commit the partial transcript that was actually played out
        if (forwardedText) {
          const message = ChatMessage.create({
            role: 'assistant',
            content: forwardedText,
            id: msgId,
            interrupted: true,
          });
          this.agent._chatCtx.insert(message);
          speechHandle._itemAdded([message]);
          this.agentSession._conversationItemAdded(message);

          // TODO(brian): add tracing span
        }
        this.logger.info(
          { speech_id: speechHandle.id, message: forwardedText },
          'playout completed with interrupt',
        );
      }
      speechHandle._markGenerationDone();
      await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);

      // TODO(brian): close tees
      return;
    }

    if (messageOutputs.length > 0) {
      // there should be only one message
      const [msgId, textOut, _, __] = messageOutputs[0]!;
      const message = ChatMessage.create({
        role: 'assistant',
        content: textOut?.text || '',
        id: msgId,
        interrupted: false,
      });
      this.agent._chatCtx.insert(message);
      speechHandle._itemAdded([message]);
      this.agentSession._conversationItemAdded(message);
      // TODO(brian): add tracing span
    }

    // mark the playout done before waiting for the tool execution
    speechHandle._markGenerationDone();
    // TODO(brian): close tees

    toolOutput.firstToolStartedFuture.await.finally(() => {
      this.agentSession._updateAgentState('thinking');
    });

    await executeToolsTask.result;

    if (toolOutput.output.length === 0) {
      // return to listening state for thinking-only turns (no audio output, no tools)
      if (!speechHandle.interrupted) {
        this.agentSession._updateAgentState('listening');
      }
      return;
    }

    // important: no agent output should be used after this point
    const { maxToolSteps } = this.agentSession.options;
    if (speechHandle.numSteps >= maxToolSteps) {
      this.logger.warn(
        { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
        'maximum number of function calls steps reached',
      );
      return;
    }

    const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
      functionCalls: [],
      functionCallOutputs: [],
    });
    let shouldGenerateToolReply: boolean = false;
    let newAgentTask: Agent | null = null;
    let ignoreTaskSwitch: boolean = false;

    for (const sanitizedOut of toolOutput.output) {
      if (sanitizedOut.toolCallOutput !== undefined) {
        functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
        if (sanitizedOut.replyRequired) {
          shouldGenerateToolReply = true;
        }
      }

      if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) {
        this.logger.error('expected to receive only one agent task from the tool executions');
        ignoreTaskSwitch = true;
      }

      newAgentTask = sanitizedOut.agentTask ?? null;

      this.logger.debug(
        {
          speechId: speechHandle.id,
          name: sanitizedOut.toolCall?.name,
          // NOTE(review): `name` above uses `?.` but `args` does not — if toolCall
          // can ever be undefined this line throws; confirm and make consistent
          args: sanitizedOut.toolCall.args,
          output: sanitizedOut.toolCallOutput?.output,
          isError: sanitizedOut.toolCallOutput?.isError,
        },
        'Tool call execution finished',
      );
    }

    this.agentSession.emit(
      AgentSessionEventTypes.FunctionToolsExecuted,
      functionToolsExecutedEvent,
    );

    // a task switch returned by a tool puts the current activity into draining mode
    let draining = this.draining;
    if (!ignoreTaskSwitch && newAgentTask !== null) {
      this.agentSession.updateAgent(newAgentTask);
      draining = true;
    }

    if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
      // wait all speeches played before updating the tool output and generating the response
      // most realtime models don't support generating multiple responses at the same time
      while (this.currentSpeech || this.speechQueue.size() > 0) {
        if (
          this.currentSpeech &&
          !this.currentSpeech.done() &&
          this.currentSpeech !== speechHandle
        ) {
          await this.currentSpeech.waitForPlayout();
        } else {
          // Don't block the event loop
          await new Promise((resolve) => setImmediate(resolve));
        }
      }
      const chatCtx = this.realtimeSession.chatCtx.copy();
      chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);

      this.agentSession._toolItemsAdded(
        functionToolsExecutedEvent.functionCallOutputs as FunctionCallOutput[],
      );

      try {
        await this.realtimeSession.updateChatCtx(chatCtx);
      } catch (error) {
        this.logger.warn(
          { error },
          'failed to update chat context before generating the function calls results',
        );
      }
    }

    // skip realtime reply if not required or auto-generated
    if (!shouldGenerateToolReply || this.llm.capabilities.autoToolReplyGeneration) {
      return;
    }

    this.realtimeSession.interrupt();

    // follow-up speech that carries the tool results back to the realtime model
    const replySpeechHandle = SpeechHandle.create({
      allowInterruptions: speechHandle.allowInterruptions,
      stepIndex: speechHandle.numSteps + 1,
      parent: speechHandle,
    });
    this.agentSession.emit(
      AgentSessionEventTypes.SpeechCreated,
      createSpeechCreatedEvent({
        userInitiated: false,
        source: 'tool_response',
        speechHandle: replySpeechHandle,
      }),
    );

    const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto';
    this.createSpeechTask({
      task: Task.from((abortController: AbortController) =>
        this.realtimeReplyTask({
          speechHandle: replySpeechHandle,
          modelSettings: { toolChoice },
          abortController,
        }),
      ),
      ownedSpeechHandle: replySpeechHandle,
      name: 'AgentActivity.realtime_reply',
    });

    // force=true so the tool response can still be scheduled while draining
    this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
  }
2223
+
2224
+ private async realtimeReplyTask({
2225
+ speechHandle,
2226
+ modelSettings: { toolChoice },
2227
+ userInput,
2228
+ instructions,
2229
+ abortController,
2230
+ }: {
2231
+ speechHandle: SpeechHandle;
2232
+ modelSettings: ModelSettings;
2233
+ abortController: AbortController;
2234
+ userInput?: string;
2235
+ instructions?: string;
2236
+ }): Promise<void> {
2237
+ speechHandleStorage.enterWith(speechHandle);
2238
+
2239
+ if (!this.realtimeSession) {
2240
+ throw new Error('realtime session is not available');
2241
+ }
2242
+
2243
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
2244
+
2245
+ if (userInput) {
2246
+ const chatCtx = this.realtimeSession.chatCtx.copy();
2247
+ const message = chatCtx.addMessage({
2248
+ role: 'user',
2249
+ content: userInput,
2250
+ });
2251
+ await this.realtimeSession.updateChatCtx(chatCtx);
2252
+ this.agent._chatCtx.insert(message);
2253
+ this.agentSession._conversationItemAdded(message);
2254
+ }
2255
+
2256
+ const originalToolChoice = this.toolChoice;
2257
+ if (toolChoice !== undefined) {
2258
+ this.realtimeSession.updateOptions({ toolChoice });
2259
+ }
2260
+
2261
+ try {
2262
+ const generationEvent = await this.realtimeSession.generateReply(instructions);
2263
+ await this.realtimeGenerationTask(
2264
+ speechHandle,
2265
+ generationEvent,
2266
+ { toolChoice },
2267
+ abortController,
2268
+ );
2269
+ } finally {
2270
+ // reset toolChoice value
2271
+ if (toolChoice !== undefined && toolChoice !== originalToolChoice) {
2272
+ this.realtimeSession.updateOptions({ toolChoice: originalToolChoice });
2273
+ }
2274
+ }
2275
+ }
2276
+
2277
+ private scheduleSpeech(
2278
+ speechHandle: SpeechHandle,
2279
+ priority: number,
2280
+ force: boolean = false,
2281
+ ): void {
2282
+ // when force=true, we allow tool responses to bypass draining
2283
+ // This allows for tool responses to be generated before the AgentActivity is finalized
2284
+ if (this.draining && !force) {
2285
+ throw new Error('cannot schedule new speech, the agent is draining');
2286
+ }
2287
+
2288
+ // Monotonic time to avoid near 0 collisions
2289
+ this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
2290
+ speechHandle._markScheduled();
2291
+ this.wakeupMainTask();
2292
+ }
2293
+
2294
+ async drain(): Promise<void> {
2295
+ // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior
2296
+ return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
2297
+ name: 'drain_agent_activity',
2298
+ context: ROOT_CONTEXT,
2299
+ });
2300
+ }
2301
+
2302
+ private async _drainImpl(span: Span): Promise<void> {
2303
+ span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
2304
+
2305
+ const unlock = await this.lock.lock();
2306
+ try {
2307
+ if (this._draining) return;
2308
+
2309
+ this.cancelPreemptiveGeneration();
2310
+
2311
+ const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
2312
+ name: 'on_exit',
2313
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
2314
+ });
2315
+
2316
+ this.createSpeechTask({
2317
+ task: Task.from(() => onExitTask),
2318
+ name: 'AgentActivity_onExit',
2319
+ });
2320
+
2321
+ this.wakeupMainTask();
2322
+ this._draining = true;
2323
+ await this._mainTask?.result;
2324
+ } finally {
2325
+ unlock();
2326
+ }
2327
+ }
2328
+
2329
+ async close(): Promise<void> {
2330
+ const unlock = await this.lock.lock();
2331
+ try {
2332
+ if (!this._draining) {
2333
+ this.logger.warn('task closing without draining');
2334
+ }
2335
+
2336
+ this.cancelPreemptiveGeneration();
2337
+ // Unregister event handlers to prevent duplicate metrics
2338
+ if (this.llm instanceof LLM) {
2339
+ this.llm.off('metrics_collected', this.onMetricsCollected);
2340
+ }
2341
+ if (this.realtimeSession) {
2342
+ this.realtimeSession.off('generation_created', this.onGenerationCreated);
2343
+ this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted);
2344
+ this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped);
2345
+ this.realtimeSession.off(
2346
+ 'input_audio_transcription_completed',
2347
+ this.onInputAudioTranscriptionCompleted,
2348
+ );
2349
+ this.realtimeSession.off('metrics_collected', this.onMetricsCollected);
2350
+ }
2351
+ if (this.stt instanceof STT) {
2352
+ this.stt.off('metrics_collected', this.onMetricsCollected);
2353
+ }
2354
+ if (this.tts instanceof TTS) {
2355
+ this.tts.off('metrics_collected', this.onMetricsCollected);
2356
+ }
2357
+ if (this.vad instanceof VAD) {
2358
+ this.vad.off('metrics_collected', this.onMetricsCollected);
2359
+ }
2360
+
2361
+ this.detachAudioInput();
2362
+ this.realtimeSpans?.clear();
2363
+ await this.realtimeSession?.close();
2364
+ await this.audioRecognition?.close();
2365
+ await this._mainTask?.cancelAndWait();
2366
+ } finally {
2367
+ unlock();
2368
+ }
2369
+ }
2370
+ }
2371
+
2372
+ function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined {
2373
+ // we convert null to undefined, which maps to the default provider tool choice value
2374
+ return toolChoice !== null ? toolChoice : undefined;
2375
+ }