@livekit/agents 0.7.9 → 1.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (627)
  1. package/dist/_exceptions.cjs +109 -0
  2. package/dist/_exceptions.cjs.map +1 -0
  3. package/dist/_exceptions.d.cts +64 -0
  4. package/dist/_exceptions.d.ts +64 -0
  5. package/dist/_exceptions.d.ts.map +1 -0
  6. package/dist/_exceptions.js +80 -0
  7. package/dist/_exceptions.js.map +1 -0
  8. package/dist/audio.cjs +10 -3
  9. package/dist/audio.cjs.map +1 -1
  10. package/dist/audio.d.cts +2 -0
  11. package/dist/audio.d.ts +2 -0
  12. package/dist/audio.d.ts.map +1 -1
  13. package/dist/audio.js +8 -2
  14. package/dist/audio.js.map +1 -1
  15. package/dist/cli.cjs +25 -0
  16. package/dist/cli.cjs.map +1 -1
  17. package/dist/cli.d.ts.map +1 -1
  18. package/dist/cli.js +25 -0
  19. package/dist/cli.js.map +1 -1
  20. package/dist/constants.cjs +6 -3
  21. package/dist/constants.cjs.map +1 -1
  22. package/dist/constants.d.cts +2 -1
  23. package/dist/constants.d.ts +2 -1
  24. package/dist/constants.d.ts.map +1 -1
  25. package/dist/constants.js +4 -2
  26. package/dist/constants.js.map +1 -1
  27. package/dist/http_server.cjs.map +1 -1
  28. package/dist/http_server.d.cts +1 -0
  29. package/dist/http_server.d.ts +1 -0
  30. package/dist/http_server.d.ts.map +1 -1
  31. package/dist/http_server.js.map +1 -1
  32. package/dist/index.cjs +27 -20
  33. package/dist/index.cjs.map +1 -1
  34. package/dist/index.d.cts +13 -10
  35. package/dist/index.d.ts +13 -10
  36. package/dist/index.d.ts.map +1 -1
  37. package/dist/index.js +15 -11
  38. package/dist/index.js.map +1 -1
  39. package/dist/inference_runner.cjs +0 -1
  40. package/dist/inference_runner.cjs.map +1 -1
  41. package/dist/inference_runner.d.cts +2 -3
  42. package/dist/inference_runner.d.ts +2 -3
  43. package/dist/inference_runner.d.ts.map +1 -1
  44. package/dist/inference_runner.js +0 -1
  45. package/dist/inference_runner.js.map +1 -1
  46. package/dist/ipc/inference_proc_executor.cjs +2 -2
  47. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  48. package/dist/ipc/inference_proc_executor.js +2 -2
  49. package/dist/ipc/inference_proc_executor.js.map +1 -1
  50. package/dist/ipc/job_executor.cjs.map +1 -1
  51. package/dist/ipc/job_executor.js.map +1 -1
  52. package/dist/ipc/job_proc_executor.cjs +1 -0
  53. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  54. package/dist/ipc/job_proc_executor.js +1 -0
  55. package/dist/ipc/job_proc_executor.js.map +1 -1
  56. package/dist/ipc/job_proc_lazy_main.cjs +1 -1
  57. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  58. package/dist/ipc/job_proc_lazy_main.js +1 -1
  59. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  60. package/dist/ipc/supervised_proc.d.cts +1 -1
  61. package/dist/ipc/supervised_proc.d.ts +1 -1
  62. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  63. package/dist/job.cjs +14 -2
  64. package/dist/job.cjs.map +1 -1
  65. package/dist/job.d.cts +8 -0
  66. package/dist/job.d.ts +8 -0
  67. package/dist/job.d.ts.map +1 -1
  68. package/dist/job.js +12 -1
  69. package/dist/job.js.map +1 -1
  70. package/dist/llm/chat_context.cjs +332 -82
  71. package/dist/llm/chat_context.cjs.map +1 -1
  72. package/dist/llm/chat_context.d.cts +152 -48
  73. package/dist/llm/chat_context.d.ts +152 -48
  74. package/dist/llm/chat_context.d.ts.map +1 -1
  75. package/dist/llm/chat_context.js +327 -81
  76. package/dist/llm/chat_context.js.map +1 -1
  77. package/dist/llm/chat_context.test.cjs +380 -0
  78. package/dist/llm/chat_context.test.cjs.map +1 -0
  79. package/dist/llm/chat_context.test.js +385 -0
  80. package/dist/llm/chat_context.test.js.map +1 -0
  81. package/dist/llm/index.cjs +37 -8
  82. package/dist/llm/index.cjs.map +1 -1
  83. package/dist/llm/index.d.cts +7 -3
  84. package/dist/llm/index.d.ts +7 -3
  85. package/dist/llm/index.d.ts.map +1 -1
  86. package/dist/llm/index.js +39 -9
  87. package/dist/llm/index.js.map +1 -1
  88. package/dist/llm/llm.cjs +97 -33
  89. package/dist/llm/llm.cjs.map +1 -1
  90. package/dist/llm/llm.d.cts +50 -24
  91. package/dist/llm/llm.d.ts +50 -24
  92. package/dist/llm/llm.d.ts.map +1 -1
  93. package/dist/llm/llm.js +98 -33
  94. package/dist/llm/llm.js.map +1 -1
  95. package/dist/llm/provider_format/google.cjs +128 -0
  96. package/dist/llm/provider_format/google.cjs.map +1 -0
  97. package/dist/llm/provider_format/google.d.cts +6 -0
  98. package/dist/llm/provider_format/google.d.ts +6 -0
  99. package/dist/llm/provider_format/google.d.ts.map +1 -0
  100. package/dist/llm/provider_format/google.js +104 -0
  101. package/dist/llm/provider_format/google.js.map +1 -0
  102. package/dist/llm/provider_format/google.test.cjs +676 -0
  103. package/dist/llm/provider_format/google.test.cjs.map +1 -0
  104. package/dist/llm/provider_format/google.test.js +675 -0
  105. package/dist/llm/provider_format/google.test.js.map +1 -0
  106. package/dist/llm/provider_format/index.cjs +40 -0
  107. package/dist/llm/provider_format/index.cjs.map +1 -0
  108. package/dist/llm/provider_format/index.d.cts +4 -0
  109. package/dist/llm/provider_format/index.d.ts +4 -0
  110. package/dist/llm/provider_format/index.d.ts.map +1 -0
  111. package/dist/llm/provider_format/index.js +16 -0
  112. package/dist/llm/provider_format/index.js.map +1 -0
  113. package/dist/llm/provider_format/openai.cjs +116 -0
  114. package/dist/llm/provider_format/openai.cjs.map +1 -0
  115. package/dist/llm/provider_format/openai.d.cts +3 -0
  116. package/dist/llm/provider_format/openai.d.ts +3 -0
  117. package/dist/llm/provider_format/openai.d.ts.map +1 -0
  118. package/dist/llm/provider_format/openai.js +92 -0
  119. package/dist/llm/provider_format/openai.js.map +1 -0
  120. package/dist/llm/provider_format/openai.test.cjs +490 -0
  121. package/dist/llm/provider_format/openai.test.cjs.map +1 -0
  122. package/dist/llm/provider_format/openai.test.js +489 -0
  123. package/dist/llm/provider_format/openai.test.js.map +1 -0
  124. package/dist/llm/provider_format/utils.cjs +146 -0
  125. package/dist/llm/provider_format/utils.cjs.map +1 -0
  126. package/dist/llm/provider_format/utils.d.cts +38 -0
  127. package/dist/llm/provider_format/utils.d.ts +38 -0
  128. package/dist/llm/provider_format/utils.d.ts.map +1 -0
  129. package/dist/llm/provider_format/utils.js +122 -0
  130. package/dist/llm/provider_format/utils.js.map +1 -0
  131. package/dist/llm/realtime.cjs +77 -0
  132. package/dist/llm/realtime.cjs.map +1 -0
  133. package/dist/llm/realtime.d.cts +98 -0
  134. package/dist/llm/realtime.d.ts +98 -0
  135. package/dist/llm/realtime.d.ts.map +1 -0
  136. package/dist/llm/realtime.js +52 -0
  137. package/dist/llm/realtime.js.map +1 -0
  138. package/dist/llm/remote_chat_context.cjs +112 -0
  139. package/dist/llm/remote_chat_context.cjs.map +1 -0
  140. package/dist/llm/remote_chat_context.d.cts +23 -0
  141. package/dist/llm/remote_chat_context.d.ts +23 -0
  142. package/dist/llm/remote_chat_context.d.ts.map +1 -0
  143. package/dist/llm/remote_chat_context.js +88 -0
  144. package/dist/llm/remote_chat_context.js.map +1 -0
  145. package/dist/llm/remote_chat_context.test.cjs +225 -0
  146. package/dist/llm/remote_chat_context.test.cjs.map +1 -0
  147. package/dist/llm/remote_chat_context.test.js +224 -0
  148. package/dist/llm/remote_chat_context.test.js.map +1 -0
  149. package/dist/llm/tool_context.cjs +111 -0
  150. package/dist/llm/tool_context.cjs.map +1 -0
  151. package/dist/llm/tool_context.d.cts +125 -0
  152. package/dist/llm/tool_context.d.ts +125 -0
  153. package/dist/llm/tool_context.d.ts.map +1 -0
  154. package/dist/llm/tool_context.js +80 -0
  155. package/dist/llm/tool_context.js.map +1 -0
  156. package/dist/llm/tool_context.test.cjs +162 -0
  157. package/dist/llm/tool_context.test.cjs.map +1 -0
  158. package/dist/llm/tool_context.test.js +161 -0
  159. package/dist/llm/tool_context.test.js.map +1 -0
  160. package/dist/llm/tool_context.type.test.cjs +92 -0
  161. package/dist/llm/tool_context.type.test.cjs.map +1 -0
  162. package/dist/llm/tool_context.type.test.js +91 -0
  163. package/dist/llm/tool_context.type.test.js.map +1 -0
  164. package/dist/llm/utils.cjs +260 -0
  165. package/dist/llm/utils.cjs.map +1 -0
  166. package/dist/llm/utils.d.cts +42 -0
  167. package/dist/llm/utils.d.ts +42 -0
  168. package/dist/llm/utils.d.ts.map +1 -0
  169. package/dist/llm/utils.js +223 -0
  170. package/dist/llm/utils.js.map +1 -0
  171. package/dist/llm/utils.test.cjs +513 -0
  172. package/dist/llm/utils.test.cjs.map +1 -0
  173. package/dist/llm/utils.test.js +490 -0
  174. package/dist/llm/utils.test.js.map +1 -0
  175. package/dist/metrics/base.cjs +0 -27
  176. package/dist/metrics/base.cjs.map +1 -1
  177. package/dist/metrics/base.d.cts +105 -63
  178. package/dist/metrics/base.d.ts +105 -63
  179. package/dist/metrics/base.d.ts.map +1 -1
  180. package/dist/metrics/base.js +0 -19
  181. package/dist/metrics/base.js.map +1 -1
  182. package/dist/metrics/index.cjs +0 -3
  183. package/dist/metrics/index.cjs.map +1 -1
  184. package/dist/metrics/index.d.cts +2 -3
  185. package/dist/metrics/index.d.ts +2 -3
  186. package/dist/metrics/index.d.ts.map +1 -1
  187. package/dist/metrics/index.js +0 -2
  188. package/dist/metrics/index.js.map +1 -1
  189. package/dist/metrics/usage_collector.cjs +17 -12
  190. package/dist/metrics/usage_collector.cjs.map +1 -1
  191. package/dist/metrics/usage_collector.d.cts +3 -2
  192. package/dist/metrics/usage_collector.d.ts +3 -2
  193. package/dist/metrics/usage_collector.d.ts.map +1 -1
  194. package/dist/metrics/usage_collector.js +17 -12
  195. package/dist/metrics/usage_collector.js.map +1 -1
  196. package/dist/metrics/utils.cjs +22 -59
  197. package/dist/metrics/utils.cjs.map +1 -1
  198. package/dist/metrics/utils.d.cts +1 -8
  199. package/dist/metrics/utils.d.ts +1 -8
  200. package/dist/metrics/utils.d.ts.map +1 -1
  201. package/dist/metrics/utils.js +22 -52
  202. package/dist/metrics/utils.js.map +1 -1
  203. package/dist/multimodal/index.cjs +0 -2
  204. package/dist/multimodal/index.cjs.map +1 -1
  205. package/dist/multimodal/index.d.cts +0 -1
  206. package/dist/multimodal/index.d.ts +0 -1
  207. package/dist/multimodal/index.d.ts.map +1 -1
  208. package/dist/multimodal/index.js +0 -1
  209. package/dist/multimodal/index.js.map +1 -1
  210. package/dist/plugin.cjs +24 -8
  211. package/dist/plugin.cjs.map +1 -1
  212. package/dist/plugin.d.cts +18 -4
  213. package/dist/plugin.d.ts +18 -4
  214. package/dist/plugin.d.ts.map +1 -1
  215. package/dist/plugin.js +22 -7
  216. package/dist/plugin.js.map +1 -1
  217. package/dist/stream/deferred_stream.cjs +98 -0
  218. package/dist/stream/deferred_stream.cjs.map +1 -0
  219. package/dist/stream/deferred_stream.d.cts +27 -0
  220. package/dist/stream/deferred_stream.d.ts +27 -0
  221. package/dist/stream/deferred_stream.d.ts.map +1 -0
  222. package/dist/stream/deferred_stream.js +73 -0
  223. package/dist/stream/deferred_stream.js.map +1 -0
  224. package/dist/stream/deferred_stream.test.cjs +527 -0
  225. package/dist/stream/deferred_stream.test.cjs.map +1 -0
  226. package/dist/stream/deferred_stream.test.js +526 -0
  227. package/dist/stream/deferred_stream.test.js.map +1 -0
  228. package/dist/stream/identity_transform.cjs +42 -0
  229. package/dist/stream/identity_transform.cjs.map +1 -0
  230. package/dist/stream/identity_transform.d.cts +6 -0
  231. package/dist/stream/identity_transform.d.ts +6 -0
  232. package/dist/stream/identity_transform.d.ts.map +1 -0
  233. package/dist/stream/identity_transform.js +18 -0
  234. package/dist/stream/identity_transform.js.map +1 -0
  235. package/dist/stream/identity_transform.test.cjs +125 -0
  236. package/dist/stream/identity_transform.test.cjs.map +1 -0
  237. package/dist/stream/identity_transform.test.js +124 -0
  238. package/dist/stream/identity_transform.test.js.map +1 -0
  239. package/dist/stream/index.cjs +38 -0
  240. package/dist/stream/index.cjs.map +1 -0
  241. package/dist/stream/index.d.cts +5 -0
  242. package/dist/stream/index.d.ts +5 -0
  243. package/dist/stream/index.d.ts.map +1 -0
  244. package/dist/stream/index.js +11 -0
  245. package/dist/stream/index.js.map +1 -0
  246. package/dist/stream/merge_readable_streams.cjs +59 -0
  247. package/dist/stream/merge_readable_streams.cjs.map +1 -0
  248. package/dist/stream/merge_readable_streams.d.cts +4 -0
  249. package/dist/stream/merge_readable_streams.d.ts +4 -0
  250. package/dist/stream/merge_readable_streams.d.ts.map +1 -0
  251. package/dist/stream/merge_readable_streams.js +35 -0
  252. package/dist/stream/merge_readable_streams.js.map +1 -0
  253. package/dist/stream/stream_channel.cjs +47 -0
  254. package/dist/stream/stream_channel.cjs.map +1 -0
  255. package/dist/stream/stream_channel.d.cts +9 -0
  256. package/dist/stream/stream_channel.d.ts +9 -0
  257. package/dist/stream/stream_channel.d.ts.map +1 -0
  258. package/dist/stream/stream_channel.js +23 -0
  259. package/dist/stream/stream_channel.js.map +1 -0
  260. package/dist/stream/stream_channel.test.cjs +97 -0
  261. package/dist/stream/stream_channel.test.cjs.map +1 -0
  262. package/dist/stream/stream_channel.test.js +96 -0
  263. package/dist/stream/stream_channel.test.js.map +1 -0
  264. package/dist/stt/stream_adapter.cjs +3 -4
  265. package/dist/stt/stream_adapter.cjs.map +1 -1
  266. package/dist/stt/stream_adapter.d.cts +1 -0
  267. package/dist/stt/stream_adapter.d.ts +1 -0
  268. package/dist/stt/stream_adapter.d.ts.map +1 -1
  269. package/dist/stt/stream_adapter.js +3 -4
  270. package/dist/stt/stream_adapter.js.map +1 -1
  271. package/dist/stt/stt.cjs +100 -10
  272. package/dist/stt/stt.cjs.map +1 -1
  273. package/dist/stt/stt.d.cts +26 -5
  274. package/dist/stt/stt.d.ts +26 -5
  275. package/dist/stt/stt.d.ts.map +1 -1
  276. package/dist/stt/stt.js +101 -11
  277. package/dist/stt/stt.js.map +1 -1
  278. package/dist/tokenize/basic/basic.cjs +10 -5
  279. package/dist/tokenize/basic/basic.cjs.map +1 -1
  280. package/dist/tokenize/basic/basic.d.cts +7 -1
  281. package/dist/tokenize/basic/basic.d.ts +7 -1
  282. package/dist/tokenize/basic/basic.d.ts.map +1 -1
  283. package/dist/tokenize/basic/basic.js +10 -5
  284. package/dist/tokenize/basic/basic.js.map +1 -1
  285. package/dist/tokenize/basic/sentence.cjs +14 -6
  286. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  287. package/dist/tokenize/basic/sentence.d.cts +1 -1
  288. package/dist/tokenize/basic/sentence.d.ts +1 -1
  289. package/dist/tokenize/basic/sentence.d.ts.map +1 -1
  290. package/dist/tokenize/basic/sentence.js +14 -6
  291. package/dist/tokenize/basic/sentence.js.map +1 -1
  292. package/dist/tokenize/token_stream.cjs +5 -3
  293. package/dist/tokenize/token_stream.cjs.map +1 -1
  294. package/dist/tokenize/token_stream.d.cts +1 -0
  295. package/dist/tokenize/token_stream.d.ts +1 -0
  296. package/dist/tokenize/token_stream.d.ts.map +1 -1
  297. package/dist/tokenize/token_stream.js +6 -4
  298. package/dist/tokenize/token_stream.js.map +1 -1
  299. package/dist/transcription.cjs +1 -2
  300. package/dist/transcription.cjs.map +1 -1
  301. package/dist/transcription.d.ts.map +1 -1
  302. package/dist/transcription.js +2 -3
  303. package/dist/transcription.js.map +1 -1
  304. package/dist/tts/index.cjs +2 -4
  305. package/dist/tts/index.cjs.map +1 -1
  306. package/dist/tts/index.d.cts +1 -1
  307. package/dist/tts/index.d.ts +1 -1
  308. package/dist/tts/index.d.ts.map +1 -1
  309. package/dist/tts/index.js +1 -3
  310. package/dist/tts/index.js.map +1 -1
  311. package/dist/tts/stream_adapter.cjs +26 -13
  312. package/dist/tts/stream_adapter.cjs.map +1 -1
  313. package/dist/tts/stream_adapter.d.cts +1 -1
  314. package/dist/tts/stream_adapter.d.ts +1 -1
  315. package/dist/tts/stream_adapter.d.ts.map +1 -1
  316. package/dist/tts/stream_adapter.js +27 -14
  317. package/dist/tts/stream_adapter.js.map +1 -1
  318. package/dist/tts/tts.cjs +156 -25
  319. package/dist/tts/tts.cjs.map +1 -1
  320. package/dist/tts/tts.d.cts +29 -5
  321. package/dist/tts/tts.d.ts +29 -5
  322. package/dist/tts/tts.d.ts.map +1 -1
  323. package/dist/tts/tts.js +156 -24
  324. package/dist/tts/tts.js.map +1 -1
  325. package/dist/types.cjs +60 -0
  326. package/dist/types.cjs.map +1 -0
  327. package/dist/types.d.cts +13 -0
  328. package/dist/types.d.ts +13 -0
  329. package/dist/types.d.ts.map +1 -0
  330. package/dist/types.js +35 -0
  331. package/dist/types.js.map +1 -0
  332. package/dist/utils.cjs +298 -27
  333. package/dist/utils.cjs.map +1 -1
  334. package/dist/utils.d.cts +145 -9
  335. package/dist/utils.d.ts +145 -9
  336. package/dist/utils.d.ts.map +1 -1
  337. package/dist/utils.js +281 -26
  338. package/dist/utils.js.map +1 -1
  339. package/dist/utils.test.cjs +491 -0
  340. package/dist/utils.test.cjs.map +1 -0
  341. package/dist/utils.test.js +498 -0
  342. package/dist/utils.test.js.map +1 -0
  343. package/dist/vad.cjs +76 -20
  344. package/dist/vad.cjs.map +1 -1
  345. package/dist/vad.d.cts +25 -5
  346. package/dist/vad.d.ts +25 -5
  347. package/dist/vad.d.ts.map +1 -1
  348. package/dist/vad.js +76 -20
  349. package/dist/vad.js.map +1 -1
  350. package/dist/voice/agent.cjs +245 -0
  351. package/dist/voice/agent.cjs.map +1 -0
  352. package/dist/voice/agent.d.cts +78 -0
  353. package/dist/voice/agent.d.ts +78 -0
  354. package/dist/voice/agent.d.ts.map +1 -0
  355. package/dist/voice/agent.js +220 -0
  356. package/dist/voice/agent.js.map +1 -0
  357. package/dist/voice/agent.test.cjs +61 -0
  358. package/dist/voice/agent.test.cjs.map +1 -0
  359. package/dist/voice/agent.test.js +60 -0
  360. package/dist/voice/agent.test.js.map +1 -0
  361. package/dist/voice/agent_activity.cjs +1453 -0
  362. package/dist/voice/agent_activity.cjs.map +1 -0
  363. package/dist/voice/agent_activity.d.cts +94 -0
  364. package/dist/voice/agent_activity.d.ts +94 -0
  365. package/dist/voice/agent_activity.d.ts.map +1 -0
  366. package/dist/voice/agent_activity.js +1449 -0
  367. package/dist/voice/agent_activity.js.map +1 -0
  368. package/dist/voice/agent_session.cjs +312 -0
  369. package/dist/voice/agent_session.cjs.map +1 -0
  370. package/dist/voice/agent_session.d.cts +121 -0
  371. package/dist/voice/agent_session.d.ts +121 -0
  372. package/dist/voice/agent_session.d.ts.map +1 -0
  373. package/dist/voice/agent_session.js +295 -0
  374. package/dist/voice/agent_session.js.map +1 -0
  375. package/dist/voice/audio_recognition.cjs +374 -0
  376. package/dist/voice/audio_recognition.cjs.map +1 -0
  377. package/dist/voice/audio_recognition.d.cts +80 -0
  378. package/dist/voice/audio_recognition.d.ts +80 -0
  379. package/dist/voice/audio_recognition.d.ts.map +1 -0
  380. package/dist/voice/audio_recognition.js +350 -0
  381. package/dist/voice/audio_recognition.js.map +1 -0
  382. package/dist/voice/events.cjs +145 -0
  383. package/dist/voice/events.cjs.map +1 -0
  384. package/dist/voice/events.d.cts +124 -0
  385. package/dist/voice/events.d.ts +124 -0
  386. package/dist/voice/events.d.ts.map +1 -0
  387. package/dist/voice/events.js +110 -0
  388. package/dist/voice/events.js.map +1 -0
  389. package/dist/voice/generation.cjs +700 -0
  390. package/dist/voice/generation.cjs.map +1 -0
  391. package/dist/voice/generation.d.cts +115 -0
  392. package/dist/voice/generation.d.ts +115 -0
  393. package/dist/voice/generation.d.ts.map +1 -0
  394. package/dist/voice/generation.js +672 -0
  395. package/dist/voice/generation.js.map +1 -0
  396. package/dist/voice/index.cjs +40 -0
  397. package/dist/voice/index.cjs.map +1 -0
  398. package/dist/voice/index.d.cts +5 -0
  399. package/dist/voice/index.d.ts +5 -0
  400. package/dist/voice/index.d.ts.map +1 -0
  401. package/dist/voice/index.js +11 -0
  402. package/dist/voice/index.js.map +1 -0
  403. package/dist/voice/io.cjs +245 -0
  404. package/dist/voice/io.cjs.map +1 -0
  405. package/dist/voice/io.d.cts +101 -0
  406. package/dist/voice/io.d.ts +101 -0
  407. package/dist/voice/io.d.ts.map +1 -0
  408. package/dist/voice/io.js +217 -0
  409. package/dist/voice/io.js.map +1 -0
  410. package/dist/voice/room_io/_input.cjs +121 -0
  411. package/dist/voice/room_io/_input.cjs.map +1 -0
  412. package/dist/voice/room_io/_input.d.cts +24 -0
  413. package/dist/voice/room_io/_input.d.ts +24 -0
  414. package/dist/voice/room_io/_input.d.ts.map +1 -0
  415. package/dist/voice/room_io/_input.js +102 -0
  416. package/dist/voice/room_io/_input.js.map +1 -0
  417. package/dist/voice/room_io/_output.cjs +358 -0
  418. package/dist/voice/room_io/_output.cjs.map +1 -0
  419. package/dist/voice/room_io/_output.d.cts +75 -0
  420. package/dist/voice/room_io/_output.d.ts +75 -0
  421. package/dist/voice/room_io/_output.d.ts.map +1 -0
  422. package/dist/voice/room_io/_output.js +342 -0
  423. package/dist/voice/room_io/_output.js.map +1 -0
  424. package/dist/voice/room_io/index.cjs +25 -0
  425. package/dist/voice/room_io/index.cjs.map +1 -0
  426. package/dist/voice/room_io/index.d.cts +3 -0
  427. package/dist/voice/room_io/index.d.ts +3 -0
  428. package/dist/voice/room_io/index.d.ts.map +1 -0
  429. package/dist/voice/room_io/index.js +3 -0
  430. package/dist/voice/room_io/index.js.map +1 -0
  431. package/dist/voice/room_io/room_io.cjs +370 -0
  432. package/dist/voice/room_io/room_io.cjs.map +1 -0
  433. package/dist/voice/room_io/room_io.d.cts +73 -0
  434. package/dist/voice/room_io/room_io.d.ts +73 -0
  435. package/dist/voice/room_io/room_io.d.ts.map +1 -0
  436. package/dist/voice/room_io/room_io.js +361 -0
  437. package/dist/voice/room_io/room_io.js.map +1 -0
  438. package/dist/{pipeline/index.cjs → voice/run_context.cjs} +16 -11
  439. package/dist/voice/run_context.cjs.map +1 -0
  440. package/dist/voice/run_context.d.cts +12 -0
  441. package/dist/voice/run_context.d.ts +12 -0
  442. package/dist/voice/run_context.d.ts.map +1 -0
  443. package/dist/voice/run_context.js +14 -0
  444. package/dist/voice/run_context.js.map +1 -0
  445. package/dist/voice/speech_handle.cjs +105 -0
  446. package/dist/voice/speech_handle.cjs.map +1 -0
  447. package/dist/voice/speech_handle.d.cts +46 -0
  448. package/dist/voice/speech_handle.d.ts +46 -0
  449. package/dist/voice/speech_handle.d.ts.map +1 -0
  450. package/dist/voice/speech_handle.js +81 -0
  451. package/dist/voice/speech_handle.js.map +1 -0
  452. package/dist/voice/transcription/_utils.cjs +45 -0
  453. package/dist/voice/transcription/_utils.cjs.map +1 -0
  454. package/dist/voice/transcription/_utils.d.cts +3 -0
  455. package/dist/voice/transcription/_utils.d.ts +3 -0
  456. package/dist/voice/transcription/_utils.d.ts.map +1 -0
  457. package/dist/voice/transcription/_utils.js +21 -0
  458. package/dist/voice/transcription/_utils.js.map +1 -0
  459. package/dist/voice/transcription/index.cjs +23 -0
  460. package/dist/voice/transcription/index.cjs.map +1 -0
  461. package/dist/voice/transcription/index.d.cts +2 -0
  462. package/dist/voice/transcription/index.d.ts +2 -0
  463. package/dist/voice/transcription/index.d.ts.map +1 -0
  464. package/dist/voice/transcription/index.js +2 -0
  465. package/dist/voice/transcription/index.js.map +1 -0
  466. package/dist/voice/transcription/synchronizer.cjs +379 -0
  467. package/dist/voice/transcription/synchronizer.cjs.map +1 -0
  468. package/dist/voice/transcription/synchronizer.d.cts +86 -0
  469. package/dist/voice/transcription/synchronizer.d.ts +86 -0
  470. package/dist/voice/transcription/synchronizer.d.ts.map +1 -0
  471. package/dist/voice/transcription/synchronizer.js +354 -0
  472. package/dist/voice/transcription/synchronizer.js.map +1 -0
  473. package/dist/worker.cjs +22 -4
  474. package/dist/worker.cjs.map +1 -1
  475. package/dist/worker.d.cts +1 -1
  476. package/dist/worker.d.ts +1 -1
  477. package/dist/worker.d.ts.map +1 -1
  478. package/dist/worker.js +22 -4
  479. package/dist/worker.js.map +1 -1
  480. package/package.json +8 -2
  481. package/src/_exceptions.ts +137 -0
  482. package/src/audio.ts +12 -1
  483. package/src/cli.ts +37 -0
  484. package/src/constants.ts +2 -1
  485. package/src/http_server.ts +1 -0
  486. package/src/index.ts +13 -10
  487. package/src/inference_runner.ts +2 -3
  488. package/src/ipc/inference_proc_executor.ts +2 -2
  489. package/src/ipc/job_executor.ts +1 -1
  490. package/src/ipc/job_proc_executor.ts +1 -1
  491. package/src/ipc/job_proc_lazy_main.ts +1 -1
  492. package/src/job.ts +18 -0
  493. package/src/llm/__snapshots__/chat_context.test.ts.snap +527 -0
  494. package/src/llm/__snapshots__/tool_context.test.ts.snap +177 -0
  495. package/src/llm/__snapshots__/utils.test.ts.snap +65 -0
  496. package/src/llm/chat_context.test.ts +450 -0
  497. package/src/llm/chat_context.ts +501 -103
  498. package/src/llm/index.ts +53 -18
  499. package/src/llm/llm.ts +148 -50
  500. package/src/llm/provider_format/google.test.ts +772 -0
  501. package/src/llm/provider_format/google.ts +130 -0
  502. package/src/llm/provider_format/index.ts +23 -0
  503. package/src/llm/provider_format/openai.test.ts +581 -0
  504. package/src/llm/provider_format/openai.ts +118 -0
  505. package/src/llm/provider_format/utils.ts +183 -0
  506. package/src/llm/realtime.ts +151 -0
  507. package/src/llm/remote_chat_context.test.ts +290 -0
  508. package/src/llm/remote_chat_context.ts +114 -0
  509. package/src/llm/tool_context.test.ts +198 -0
  510. package/src/llm/tool_context.ts +259 -0
  511. package/src/llm/tool_context.type.test.ts +115 -0
  512. package/src/llm/utils.test.ts +670 -0
  513. package/src/llm/utils.ts +324 -0
  514. package/src/metrics/base.ts +110 -78
  515. package/src/metrics/index.ts +3 -9
  516. package/src/metrics/usage_collector.ts +19 -13
  517. package/src/metrics/utils.ts +24 -69
  518. package/src/multimodal/index.ts +0 -1
  519. package/src/plugin.ts +26 -8
  520. package/src/stream/deferred_stream.test.ts +755 -0
  521. package/src/stream/deferred_stream.ts +110 -0
  522. package/src/stream/identity_transform.test.ts +179 -0
  523. package/src/stream/identity_transform.ts +18 -0
  524. package/src/stream/index.ts +7 -0
  525. package/src/stream/merge_readable_streams.ts +40 -0
  526. package/src/stream/stream_channel.test.ts +129 -0
  527. package/src/stream/stream_channel.ts +32 -0
  528. package/src/stt/stream_adapter.ts +3 -5
  529. package/src/stt/stt.ts +134 -17
  530. package/src/tokenize/basic/basic.ts +13 -5
  531. package/src/tokenize/basic/sentence.ts +20 -6
  532. package/src/tokenize/token_stream.ts +7 -4
  533. package/src/transcription.ts +2 -3
  534. package/src/tts/index.ts +0 -1
  535. package/src/tts/stream_adapter.ts +42 -16
  536. package/src/tts/tts.ts +202 -21
  537. package/src/types.ts +42 -0
  538. package/src/utils.test.ts +658 -0
  539. package/src/utils.ts +402 -44
  540. package/src/vad.ts +90 -22
  541. package/src/voice/agent.test.ts +80 -0
  542. package/src/voice/agent.ts +332 -0
  543. package/src/voice/agent_activity.ts +1913 -0
  544. package/src/voice/agent_session.ts +460 -0
  545. package/src/voice/audio_recognition.ts +473 -0
  546. package/src/voice/events.ts +252 -0
  547. package/src/voice/generation.ts +881 -0
  548. package/src/voice/index.ts +7 -0
  549. package/src/voice/io.ts +304 -0
  550. package/src/voice/room_io/_input.ts +144 -0
  551. package/src/voice/room_io/_output.ts +436 -0
  552. package/src/voice/room_io/index.ts +5 -0
  553. package/src/voice/room_io/room_io.ts +495 -0
  554. package/src/voice/run_context.ts +20 -0
  555. package/src/voice/speech_handle.ts +104 -0
  556. package/src/voice/transcription/_utils.ts +25 -0
  557. package/src/voice/transcription/index.ts +4 -0
  558. package/src/voice/transcription/synchronizer.ts +477 -0
  559. package/src/worker.ts +22 -2
  560. package/dist/llm/function_context.cjs +0 -103
  561. package/dist/llm/function_context.cjs.map +0 -1
  562. package/dist/llm/function_context.d.cts +0 -47
  563. package/dist/llm/function_context.d.ts +0 -47
  564. package/dist/llm/function_context.d.ts.map +0 -1
  565. package/dist/llm/function_context.js +0 -78
  566. package/dist/llm/function_context.js.map +0 -1
  567. package/dist/llm/function_context.test.cjs +0 -218
  568. package/dist/llm/function_context.test.cjs.map +0 -1
  569. package/dist/llm/function_context.test.js +0 -217
  570. package/dist/llm/function_context.test.js.map +0 -1
  571. package/dist/multimodal/multimodal_agent.cjs +0 -486
  572. package/dist/multimodal/multimodal_agent.cjs.map +0 -1
  573. package/dist/multimodal/multimodal_agent.d.cts +0 -48
  574. package/dist/multimodal/multimodal_agent.d.ts +0 -48
  575. package/dist/multimodal/multimodal_agent.d.ts.map +0 -1
  576. package/dist/multimodal/multimodal_agent.js +0 -461
  577. package/dist/multimodal/multimodal_agent.js.map +0 -1
  578. package/dist/pipeline/agent_output.cjs +0 -197
  579. package/dist/pipeline/agent_output.cjs.map +0 -1
  580. package/dist/pipeline/agent_output.d.cts +0 -33
  581. package/dist/pipeline/agent_output.d.ts +0 -33
  582. package/dist/pipeline/agent_output.d.ts.map +0 -1
  583. package/dist/pipeline/agent_output.js +0 -172
  584. package/dist/pipeline/agent_output.js.map +0 -1
  585. package/dist/pipeline/agent_playout.cjs +0 -175
  586. package/dist/pipeline/agent_playout.cjs.map +0 -1
  587. package/dist/pipeline/agent_playout.d.cts +0 -40
  588. package/dist/pipeline/agent_playout.d.ts +0 -40
  589. package/dist/pipeline/agent_playout.d.ts.map +0 -1
  590. package/dist/pipeline/agent_playout.js +0 -139
  591. package/dist/pipeline/agent_playout.js.map +0 -1
  592. package/dist/pipeline/human_input.cjs +0 -171
  593. package/dist/pipeline/human_input.cjs.map +0 -1
  594. package/dist/pipeline/human_input.d.cts +0 -30
  595. package/dist/pipeline/human_input.d.ts +0 -30
  596. package/dist/pipeline/human_input.d.ts.map +0 -1
  597. package/dist/pipeline/human_input.js +0 -146
  598. package/dist/pipeline/human_input.js.map +0 -1
  599. package/dist/pipeline/index.cjs.map +0 -1
  600. package/dist/pipeline/index.d.cts +0 -2
  601. package/dist/pipeline/index.d.ts +0 -2
  602. package/dist/pipeline/index.d.ts.map +0 -1
  603. package/dist/pipeline/index.js +0 -11
  604. package/dist/pipeline/index.js.map +0 -1
  605. package/dist/pipeline/pipeline_agent.cjs +0 -859
  606. package/dist/pipeline/pipeline_agent.cjs.map +0 -1
  607. package/dist/pipeline/pipeline_agent.d.cts +0 -150
  608. package/dist/pipeline/pipeline_agent.d.ts +0 -150
  609. package/dist/pipeline/pipeline_agent.d.ts.map +0 -1
  610. package/dist/pipeline/pipeline_agent.js +0 -837
  611. package/dist/pipeline/pipeline_agent.js.map +0 -1
  612. package/dist/pipeline/speech_handle.cjs +0 -176
  613. package/dist/pipeline/speech_handle.cjs.map +0 -1
  614. package/dist/pipeline/speech_handle.d.cts +0 -37
  615. package/dist/pipeline/speech_handle.d.ts +0 -37
  616. package/dist/pipeline/speech_handle.d.ts.map +0 -1
  617. package/dist/pipeline/speech_handle.js +0 -152
  618. package/dist/pipeline/speech_handle.js.map +0 -1
  619. package/src/llm/function_context.test.ts +0 -248
  620. package/src/llm/function_context.ts +0 -142
  621. package/src/multimodal/multimodal_agent.ts +0 -592
  622. package/src/pipeline/agent_output.ts +0 -219
  623. package/src/pipeline/agent_playout.ts +0 -192
  624. package/src/pipeline/human_input.ts +0 -188
  625. package/src/pipeline/index.ts +0 -15
  626. package/src/pipeline/pipeline_agent.ts +0 -1197
  627. package/src/pipeline/speech_handle.ts +0 -201
@@ -0,0 +1,1449 @@
1
+ import { Mutex } from "@livekit/mutex";
2
+ import { Heap } from "heap-js";
3
+ import { AsyncLocalStorage } from "node:async_hooks";
4
+ import { ReadableStream } from "node:stream/web";
5
+ import { ChatMessage } from "../llm/chat_context.js";
6
+ import {
7
+ LLM,
8
+ RealtimeModel
9
+ } from "../llm/index.js";
10
+ import { log } from "../log.js";
11
+ import { DeferredReadableStream } from "../stream/deferred_stream.js";
12
+ import { STT } from "../stt/stt.js";
13
+ import { splitWords } from "../tokenize/basic/word.js";
14
+ import { TTS } from "../tts/tts.js";
15
+ import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
16
+ import { VAD } from "../vad.js";
17
+ import { StopResponse, asyncLocalStorage } from "./agent.js";
18
+ import {} from "./agent_session.js";
19
+ import {
20
+ AudioRecognition
21
+ } from "./audio_recognition.js";
22
+ import {
23
+ AgentSessionEventTypes,
24
+ createErrorEvent,
25
+ createFunctionToolsExecutedEvent,
26
+ createMetricsCollectedEvent,
27
+ createSpeechCreatedEvent,
28
+ createUserInputTranscribedEvent
29
+ } from "./events.js";
30
+ import {
31
+ performAudioForwarding,
32
+ performLLMInference,
33
+ performTTSInference,
34
+ performTextForwarding,
35
+ performToolExecutions,
36
+ removeInstructions,
37
+ updateInstructions
38
+ } from "./generation.js";
39
+ import { SpeechHandle } from "./speech_handle.js";
40
+ const speechHandleStorage = new AsyncLocalStorage();
41
+ class AgentActivity {
42
+ static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
43
+ started = false;
44
+ audioRecognition;
45
+ realtimeSession;
46
+ turnDetectionMode;
47
+ logger = log();
48
+ _draining = false;
49
+ _currentSpeech;
50
+ speechQueue;
51
+ // [priority, timestamp, speechHandle]
52
+ q_updated;
53
+ speechTasks = /* @__PURE__ */ new Set();
54
+ lock = new Mutex();
55
+ audioStream = new DeferredReadableStream();
56
+ // default to null as None, which maps to the default provider tool choice value
57
+ toolChoice = null;
58
+ agent;
59
+ agentSession;
60
+ /** @internal */
61
+ _mainTask;
62
+ _userTurnCompletedTask;
63
+ constructor(agent, agentSession) {
64
+ this.agent = agent;
65
+ this.agentSession = agentSession;
66
+ this.speechQueue = new Heap(([p1, t1, _], [p2, t2, __]) => {
67
+ return p1 === p2 ? t1 - t2 : p2 - p1;
68
+ });
69
+ this.q_updated = new Future();
70
+ this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
71
+ if (this.turnDetectionMode === "vad" && this.vad === void 0) {
72
+ this.logger.warn(
73
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
74
+ );
75
+ this.turnDetectionMode = void 0;
76
+ }
77
+ if (this.turnDetectionMode === "stt" && this.stt === void 0) {
78
+ this.logger.warn(
79
+ 'turnDetection is set to "stt", but no STT model is provided, ignoring the turnDetection setting'
80
+ );
81
+ this.turnDetectionMode = void 0;
82
+ }
83
+ if (this.llm instanceof RealtimeModel) {
84
+ if (this.llm.capabilities.turnDetection && !this.allowInterruptions) {
85
+ this.logger.warn(
86
+ "the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false, disable turnDetection in the RealtimeModel and use VAD on the AgentSession instead"
87
+ );
88
+ }
89
+ if (this.turnDetectionMode === "realtime_llm" && !this.llm.capabilities.turnDetection) {
90
+ this.logger.warn(
91
+ 'turnDetection is set to "realtime_llm", but the LLM is not a RealtimeModel or the server-side turn detection is not supported/enabled, ignoring the turnDetection setting'
92
+ );
93
+ this.turnDetectionMode = void 0;
94
+ }
95
+ if (this.turnDetectionMode === "stt") {
96
+ this.logger.warn(
97
+ 'turnDetection is set to "stt", but the LLM is a RealtimeModel, ignoring the turnDetection setting'
98
+ );
99
+ this.turnDetectionMode = void 0;
100
+ }
101
+ if (this.turnDetectionMode && this.turnDetectionMode !== "realtime_llm" && this.llm.capabilities.turnDetection) {
102
+ this.logger.warn(
103
+ `turnDetection is set to "${this.turnDetectionMode}", but the LLM is a RealtimeModel and server-side turn detection enabled, ignoring the turnDetection setting`
104
+ );
105
+ this.turnDetectionMode = void 0;
106
+ }
107
+ if (!this.llm.capabilities.turnDetection && this.vad && this.turnDetectionMode === void 0) {
108
+ this.turnDetectionMode = "vad";
109
+ }
110
+ } else if (this.turnDetectionMode === "realtime_llm") {
111
+ this.logger.warn(
112
+ 'turnDetection is set to "realtime_llm", but the LLM is not a RealtimeModel'
113
+ );
114
+ this.turnDetectionMode = void 0;
115
+ }
116
+ if (!this.vad && this.stt && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
117
+ this.logger.warn(
118
+ "VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
119
+ );
120
+ }
121
+ }
122
+ async start() {
123
+ const unlock = await this.lock.lock();
124
+ try {
125
+ this.agent._agentActivity = this;
126
+ if (this.llm instanceof RealtimeModel) {
127
+ this.realtimeSession = this.llm.session();
128
+ this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
129
+ this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
130
+ this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
131
+ this.realtimeSession.on(
132
+ "input_audio_transcription_completed",
133
+ (ev) => this.onInputAudioTranscriptionCompleted(ev)
134
+ );
135
+ this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
136
+ this.realtimeSession.on("error", (ev) => this.onError(ev));
137
+ removeInstructions(this.agent._chatCtx);
138
+ try {
139
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
140
+ } catch (error) {
141
+ this.logger.error(error, "failed to update the instructions");
142
+ }
143
+ try {
144
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
145
+ } catch (error) {
146
+ this.logger.error(error, "failed to update the chat context");
147
+ }
148
+ try {
149
+ await this.realtimeSession.updateTools(this.tools);
150
+ } catch (error) {
151
+ this.logger.error(error, "failed to update the tools");
152
+ }
153
+ } else if (this.llm instanceof LLM) {
154
+ try {
155
+ updateInstructions({
156
+ chatCtx: this.agent._chatCtx,
157
+ instructions: this.agent.instructions,
158
+ addIfMissing: true
159
+ });
160
+ } catch (error) {
161
+ this.logger.error("failed to update the instructions", error);
162
+ }
163
+ }
164
+ if (this.llm instanceof LLM) {
165
+ this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
166
+ this.llm.on("error", (ev) => this.onError(ev));
167
+ }
168
+ if (this.stt instanceof STT) {
169
+ this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
170
+ this.stt.on("error", (ev) => this.onError(ev));
171
+ }
172
+ if (this.tts instanceof TTS) {
173
+ this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
174
+ this.tts.on("error", (ev) => this.onError(ev));
175
+ }
176
+ if (this.vad instanceof VAD) {
177
+ this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
178
+ }
179
+ this.audioRecognition = new AudioRecognition({
180
+ recognitionHooks: this,
181
+ // Disable stt node if stt is not provided
182
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
183
+ vad: this.vad,
184
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
185
+ turnDetectionMode: this.turnDetectionMode,
186
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
187
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay
188
+ });
189
+ this.audioRecognition.start();
190
+ this.started = true;
191
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
192
+ this.createSpeechTask({
193
+ promise: this.agent.onEnter(),
194
+ name: "AgentActivity_onEnter"
195
+ });
196
+ } finally {
197
+ unlock();
198
+ }
199
+ }
200
+ get currentSpeech() {
201
+ return this._currentSpeech;
202
+ }
203
+ get vad() {
204
+ return this.agent.vad || this.agentSession.vad;
205
+ }
206
+ get stt() {
207
+ return this.agent.stt || this.agentSession.stt;
208
+ }
209
+ get llm() {
210
+ return this.agent.llm || this.agentSession.llm;
211
+ }
212
+ get tts() {
213
+ return this.agent.tts || this.agentSession.tts;
214
+ }
215
+ get tools() {
216
+ return this.agent.toolCtx;
217
+ }
218
+ get draining() {
219
+ return this._draining;
220
+ }
221
+ get realtimeLLMSession() {
222
+ return this.realtimeSession;
223
+ }
224
+ get allowInterruptions() {
225
+ return this.agentSession.options.allowInterruptions;
226
+ }
227
+ get turnDetection() {
228
+ return this.agentSession.turnDetection;
229
+ }
230
+ get toolCtx() {
231
+ return this.agent.toolCtx;
232
+ }
233
+ async updateChatCtx(chatCtx) {
234
+ chatCtx = chatCtx.copy({ toolCtx: this.toolCtx });
235
+ this.agent._chatCtx = chatCtx;
236
+ if (this.realtimeSession) {
237
+ removeInstructions(chatCtx);
238
+ this.realtimeSession.updateChatCtx(chatCtx);
239
+ } else {
240
+ updateInstructions({
241
+ chatCtx,
242
+ instructions: this.agent.instructions,
243
+ addIfMissing: true
244
+ });
245
+ }
246
+ }
247
+ updateOptions({ toolChoice }) {
248
+ if (toolChoice !== void 0) {
249
+ this.toolChoice = toolChoice;
250
+ }
251
+ if (this.realtimeSession) {
252
+ this.realtimeSession.updateOptions({ toolChoice: this.toolChoice });
253
+ }
254
+ }
255
+ attachAudioInput(audioStream) {
256
+ if (this.audioStream.isSourceSet) {
257
+ this.logger.debug("detaching existing audio input in agent activity");
258
+ this.audioStream.detachSource();
259
+ }
260
+ this.audioStream.setSource(audioStream);
261
+ const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
262
+ if (this.realtimeSession) {
263
+ this.realtimeSession.setInputAudioStream(realtimeAudioStream);
264
+ }
265
+ if (this.audioRecognition) {
266
+ this.audioRecognition.setInputAudioStream(recognitionAudioStream);
267
+ }
268
+ }
269
+ detachAudioInput() {
270
+ this.audioStream.detachSource();
271
+ }
272
+ commitUserTurn() {
273
+ if (!this.audioRecognition) {
274
+ throw new Error("AudioRecognition is not initialized");
275
+ }
276
+ const audioDetached = false;
277
+ this.audioRecognition.commitUserTurn(audioDetached);
278
+ }
279
+ clearUserTurn() {
280
+ var _a, _b;
281
+ (_a = this.audioRecognition) == null ? void 0 : _a.clearUserTurn();
282
+ (_b = this.realtimeSession) == null ? void 0 : _b.clearAudio();
283
+ }
284
+ say(text, options) {
285
+ const {
286
+ audio,
287
+ allowInterruptions: defaultAllowInterruptions,
288
+ addToChatCtx = true
289
+ } = options ?? {};
290
+ let allowInterruptions = defaultAllowInterruptions;
291
+ if (!audio && !this.tts && this.agentSession.output.audio && this.agentSession.output.audioEnabled) {
292
+ throw new Error("trying to generate speech from text without a TTS model");
293
+ }
294
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection && allowInterruptions === false) {
295
+ this.logger.warn(
296
+ "the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false when using VoiceAgent.say(), disable turnDetection in the RealtimeModel and use VAD on the AgentTask/VoiceAgent instead"
297
+ );
298
+ allowInterruptions = true;
299
+ }
300
+ const handle = SpeechHandle.create({
301
+ allowInterruptions: allowInterruptions ?? this.allowInterruptions
302
+ });
303
+ this.agentSession.emit(
304
+ AgentSessionEventTypes.SpeechCreated,
305
+ createSpeechCreatedEvent({
306
+ userInitiated: true,
307
+ source: "say",
308
+ speechHandle: handle
309
+ })
310
+ );
311
+ const task = this.createSpeechTask({
312
+ promise: this.ttsTask(handle, text, addToChatCtx, {}, audio),
313
+ ownedSpeechHandle: handle,
314
+ name: "AgentActivity.say_tts"
315
+ });
316
+ task.finally(() => this.onPipelineReplyDone());
317
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
318
+ return handle;
319
+ }
320
+ // -- Metrics and errors --
321
/**
 * Re-emits a metrics event on the session. LLM and TTS metrics are tagged
 * with the id of the speech handle found in `speechHandleStorage`, when one
 * is set.
 */
onMetricsCollected = (ev) => {
  const activeHandle = speechHandleStorage.getStore();
  if (activeHandle) {
    const taggedTypes = ["llm_metrics", "tts_metrics"];
    if (taggedTypes.includes(ev.type)) {
      ev.speechId = activeHandle.id;
    }
  }
  const metricsEvent = createMetricsCollectedEvent({ metrics: ev });
  this.agentSession.emit(AgentSessionEventTypes.MetricsCollected, metricsEvent);
};
331
+ onError(ev) {
332
+ if (ev.type === "realtime_model_error") {
333
+ const errorEvent = createErrorEvent(ev.error, this.llm);
334
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
335
+ } else if (ev.type === "stt_error") {
336
+ const errorEvent = createErrorEvent(ev.error, this.stt);
337
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
338
+ } else if (ev.type === "tts_error") {
339
+ const errorEvent = createErrorEvent(ev.error, this.tts);
340
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
341
+ } else if (ev.type === "llm_error") {
342
+ const errorEvent = createErrorEvent(ev.error, this.llm);
343
+ this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent);
344
+ }
345
+ this.agentSession._onError(ev);
346
+ }
347
+ // -- Realtime Session events --
348
+ onInputSpeechStarted(_ev) {
349
+ this.logger.info("onInputSpeechStarted");
350
+ if (!this.vad) {
351
+ this.agentSession._updateUserState("speaking");
352
+ }
353
+ try {
354
+ this.interrupt();
355
+ } catch (error) {
356
+ this.logger.error(
357
+ "RealtimeAPI input_speech_started, but current speech is not interruptable, this should never happen!",
358
+ error
359
+ );
360
+ }
361
+ }
362
+ onInputSpeechStopped(ev) {
363
+ this.logger.info(ev, "onInputSpeechStopped");
364
+ if (!this.vad) {
365
+ this.agentSession._updateUserState("listening");
366
+ }
367
+ if (ev.userTranscriptionEnabled) {
368
+ this.agentSession.emit(
369
+ AgentSessionEventTypes.UserInputTranscribed,
370
+ createUserInputTranscribedEvent({
371
+ isFinal: false,
372
+ transcript: ""
373
+ })
374
+ );
375
+ }
376
+ }
377
+ onInputAudioTranscriptionCompleted(ev) {
378
+ this.agentSession.emit(
379
+ AgentSessionEventTypes.UserInputTranscribed,
380
+ createUserInputTranscribedEvent({
381
+ transcript: ev.transcript,
382
+ isFinal: ev.isFinal
383
+ })
384
+ );
385
+ if (ev.isFinal) {
386
+ const message = ChatMessage.create({
387
+ role: "user",
388
+ content: ev.transcript,
389
+ id: ev.itemId
390
+ });
391
+ this.agent._chatCtx.items.push(message);
392
+ this.agentSession._conversationItemAdded(message);
393
+ }
394
+ }
395
+ onGenerationCreated(ev) {
396
+ if (ev.userInitiated) {
397
+ return;
398
+ }
399
+ if (this.draining) {
400
+ this.logger.warn("skipping new realtime generation, the agent is draining");
401
+ return;
402
+ }
403
+ const handle = SpeechHandle.create({
404
+ allowInterruptions: this.allowInterruptions
405
+ });
406
+ this.agentSession.emit(
407
+ AgentSessionEventTypes.SpeechCreated,
408
+ createSpeechCreatedEvent({
409
+ userInitiated: false,
410
+ source: "generate_reply",
411
+ speechHandle: handle
412
+ })
413
+ );
414
+ this.logger.info({ speech_id: handle.id }, "Creating speech handle");
415
+ this.createSpeechTask({
416
+ promise: this.realtimeGenerationTask(handle, ev, {}),
417
+ ownedSpeechHandle: handle,
418
+ name: "AgentActivity.realtimeGeneration"
419
+ });
420
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
421
+ }
422
+ // recognition hooks
423
+ onStartOfSpeech(_ev) {
424
+ this.agentSession._updateUserState("speaking");
425
+ }
426
+ onEndOfSpeech(_ev) {
427
+ this.agentSession._updateUserState("listening");
428
+ }
429
+ onVADInferenceDone(ev) {
430
+ var _a, _b;
431
+ if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
432
+ return;
433
+ }
434
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
435
+ return;
436
+ }
437
+ if (ev.speechDuration < this.agentSession.options.minInterruptionDuration) {
438
+ return;
439
+ }
440
+ if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
441
+ const text = this.audioRecognition.currentTranscript;
442
+ if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
443
+ return;
444
+ }
445
+ }
446
+ (_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
447
+ if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
448
+ this.logger.info({ "speech id": this._currentSpeech.id }, "speech interrupted by VAD");
449
+ (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
450
+ this._currentSpeech.interrupt();
451
+ }
452
+ }
453
+ onInterimTranscript(ev) {
454
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
455
+ return;
456
+ }
457
+ this.agentSession.emit(
458
+ AgentSessionEventTypes.UserInputTranscribed,
459
+ createUserInputTranscribedEvent({
460
+ transcript: ev.alternatives[0].text,
461
+ isFinal: false
462
+ // TODO(AJS-106): add multi participant support
463
+ })
464
+ );
465
+ }
466
+ onFinalTranscript(ev) {
467
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
468
+ return;
469
+ }
470
+ this.agentSession.emit(
471
+ AgentSessionEventTypes.UserInputTranscribed,
472
+ createUserInputTranscribedEvent({
473
+ transcript: ev.alternatives[0].text,
474
+ isFinal: true
475
+ // TODO(AJS-106): add multi participant support
476
+ })
477
+ );
478
+ }
479
+ createSpeechTask(options) {
480
+ const { promise, ownedSpeechHandle } = options;
481
+ this.speechTasks.add(promise);
482
+ promise.finally(() => {
483
+ this.speechTasks.delete(promise);
484
+ if (ownedSpeechHandle) {
485
+ ownedSpeechHandle._markPlayoutDone();
486
+ }
487
+ this.wakeupMainTask();
488
+ });
489
+ return promise;
490
+ }
491
+ async onEndOfTurn(info) {
492
+ if (this.draining) {
493
+ this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
494
+ return true;
495
+ }
496
+ if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0 && info.newTranscript.split(" ").length < this.agentSession.options.minInterruptionWords) {
497
+ this.logger.info("skipping user input, new_transcript is too short");
498
+ return false;
499
+ }
500
+ const oldTask = this._userTurnCompletedTask;
501
+ this._userTurnCompletedTask = this.createSpeechTask({
502
+ promise: this.userTurnCompleted(info, oldTask),
503
+ name: "AgentActivity.userTurnCompleted"
504
+ });
505
+ return true;
506
+ }
507
+ retrieveChatCtx() {
508
+ return this.agentSession.chatCtx;
509
+ }
510
+ async mainTask(signal) {
511
+ const abortFuture = new Future();
512
+ const abortHandler = () => {
513
+ abortFuture.resolve();
514
+ signal.removeEventListener("abort", abortHandler);
515
+ };
516
+ signal.addEventListener("abort", abortHandler);
517
+ while (true) {
518
+ await Promise.race([this.q_updated.await, abortFuture.await]);
519
+ if (signal.aborted) break;
520
+ while (this.speechQueue.size() > 0) {
521
+ if (signal.aborted) break;
522
+ const heapItem = this.speechQueue.pop();
523
+ if (!heapItem) {
524
+ throw new Error("Speech queue is empty");
525
+ }
526
+ const speechHandle = heapItem[2];
527
+ this._currentSpeech = speechHandle;
528
+ speechHandle._authorizePlayout();
529
+ await speechHandle.waitForPlayout();
530
+ this._currentSpeech = void 0;
531
+ }
532
+ if (this.draining && this.speechTasks.size === 0) {
533
+ this.logger.info("mainTask: draining and no more speech tasks");
534
+ break;
535
+ }
536
+ this.q_updated = new Future();
537
+ }
538
+ this.logger.info("AgentActivity mainTask: exiting");
539
+ }
540
+ wakeupMainTask() {
541
+ this.q_updated.resolve();
542
+ }
543
+ generateReply(options) {
544
+ var _a;
545
+ const {
546
+ userMessage,
547
+ chatCtx,
548
+ instructions: defaultInstructions,
549
+ toolChoice: defaultToolChoice,
550
+ allowInterruptions: defaultAllowInterruptions
551
+ } = options;
552
+ let instructions = defaultInstructions;
553
+ let toolChoice = defaultToolChoice;
554
+ let allowInterruptions = defaultAllowInterruptions;
555
+ if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection && allowInterruptions === false) {
556
+ this.logger.warn(
557
+ "the RealtimeModel uses a server-side turn detection, allowInterruptions cannot be false when using VoiceAgent.generateReply(), disable turnDetection in the RealtimeModel and use VAD on the AgentTask/VoiceAgent instead"
558
+ );
559
+ allowInterruptions = true;
560
+ }
561
+ if (this.llm === void 0) {
562
+ throw new Error("trying to generate reply without an LLM model");
563
+ }
564
+ const functionCall = (_a = asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
565
+ if (toolChoice === void 0 && functionCall !== void 0) {
566
+ toolChoice = "none";
567
+ }
568
+ const handle = SpeechHandle.create({
569
+ allowInterruptions: allowInterruptions ?? this.allowInterruptions
570
+ });
571
+ this.agentSession.emit(
572
+ AgentSessionEventTypes.SpeechCreated,
573
+ createSpeechCreatedEvent({
574
+ userInitiated: true,
575
+ source: "generate_reply",
576
+ speechHandle: handle
577
+ })
578
+ );
579
+ this.logger.info({ speech_id: handle.id }, "Creating speech handle");
580
+ if (this.llm instanceof RealtimeModel) {
581
+ this.createSpeechTask({
582
+ promise: this.realtimeReplyTask({
583
+ speechHandle: handle,
584
+ // TODO(brian): support llm.ChatMessage for the realtime model
585
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
586
+ instructions,
587
+ modelSettings: {
588
+ // isGiven(toolChoice) = toolChoice !== undefined
589
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
590
+ }
591
+ }),
592
+ ownedSpeechHandle: handle,
593
+ name: "AgentActivity.realtimeReply"
594
+ });
595
+ } else if (this.llm instanceof LLM) {
596
+ if (instructions) {
597
+ instructions = `${this.agent.instructions}
598
+ ${instructions}`;
599
+ }
600
+ const task = this.createSpeechTask({
601
+ promise: this.pipelineReplyTask(
602
+ handle,
603
+ chatCtx ?? this.agent.chatCtx,
604
+ this.agent.toolCtx,
605
+ { toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice) },
606
+ instructions ? `${this.agent.instructions}
607
+ ${instructions}` : instructions,
608
+ userMessage
609
+ ),
610
+ ownedSpeechHandle: handle,
611
+ name: "AgentActivity.pipelineReply"
612
+ });
613
+ task.finally(() => this.onPipelineReplyDone());
614
+ }
615
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
616
+ return handle;
617
+ }
618
+ interrupt() {
619
+ var _a;
620
+ const future = new Future();
621
+ const currentSpeech = this._currentSpeech;
622
+ currentSpeech == null ? void 0 : currentSpeech.interrupt();
623
+ for (const [_, __, speech] of this.speechQueue) {
624
+ speech.interrupt();
625
+ }
626
+ (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
627
+ if (currentSpeech === void 0) {
628
+ future.resolve();
629
+ } else {
630
+ currentSpeech.then(() => {
631
+ if (future.done) return;
632
+ future.resolve();
633
+ });
634
+ }
635
+ return future;
636
+ }
637
+ onPipelineReplyDone() {
638
+ if (!this.speechQueue.peek() && (!this._currentSpeech || this._currentSpeech.done)) {
639
+ this.agentSession._updateAgentState("listening");
640
+ }
641
+ }
642
+ async userTurnCompleted(info, oldTask) {
643
+ var _a, _b;
644
+ if (oldTask) {
645
+ await oldTask;
646
+ }
647
+ if (this.llm instanceof RealtimeModel) {
648
+ if (this.llm.capabilities.turnDetection) {
649
+ return;
650
+ }
651
+ (_a = this.realtimeSession) == null ? void 0 : _a.commitAudio();
652
+ }
653
+ if (this._currentSpeech) {
654
+ if (!this._currentSpeech.allowInterruptions) {
655
+ this.logger.warn(
656
+ { user_input: info.newTranscript },
657
+ "skipping user input, current speech generation cannot be interrupted"
658
+ );
659
+ return;
660
+ }
661
+ this.logger.info(
662
+ { "speech id": this._currentSpeech.id },
663
+ "speech interrupted, new user turn detected"
664
+ );
665
+ this._currentSpeech.interrupt();
666
+ (_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
667
+ }
668
+ let userMessage = ChatMessage.create({
669
+ role: "user",
670
+ content: info.newTranscript
671
+ });
672
+ const chatCtx = this.agent.chatCtx.copy();
673
+ const startTime = Date.now();
674
+ try {
675
+ await this.agent.onUserTurnCompleted(chatCtx, userMessage);
676
+ } catch (e) {
677
+ if (e instanceof StopResponse) {
678
+ return;
679
+ }
680
+ this.logger.error({ error: e }, "error occurred during onUserTurnCompleted");
681
+ }
682
+ const callbackDuration = Date.now() - startTime;
683
+ if (this.llm instanceof RealtimeModel) {
684
+ userMessage = void 0;
685
+ } else if (this.llm === void 0) {
686
+ return;
687
+ }
688
+ const speechHandle = this.generateReply({ userMessage, chatCtx });
689
+ const eouMetrics = {
690
+ type: "eou_metrics",
691
+ timestamp: Date.now(),
692
+ endOfUtteranceDelay: info.endOfUtteranceDelay,
693
+ transcriptionDelay: info.transcriptionDelay,
694
+ onUserTurnCompletedDelay: callbackDuration,
695
+ speechId: speechHandle.id
696
+ };
697
+ this.agentSession.emit(
698
+ AgentSessionEventTypes.MetricsCollected,
699
+ createMetricsCollectedEvent({ metrics: eouMetrics })
700
+ );
701
+ }
702
+ async ttsTask(speechHandle, text, addToChatCtx, modelSettings, audio) {
703
+ speechHandleStorage.enterWith(speechHandle);
704
+ const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
705
+ const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
706
+ const replyAbortController = new AbortController();
707
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
708
+ if (speechHandle.interrupted) {
709
+ return;
710
+ }
711
+ let baseStream;
712
+ if (text instanceof ReadableStream) {
713
+ baseStream = text;
714
+ } else {
715
+ baseStream = new ReadableStream({
716
+ start(controller) {
717
+ controller.enqueue(text);
718
+ controller.close();
719
+ }
720
+ });
721
+ }
722
+ const [textSource, audioSource] = baseStream.tee();
723
+ const tasks = [];
724
+ const trNode = await this.agent.transcriptionNode(textSource, {});
725
+ let textOut = null;
726
+ if (trNode) {
727
+ const [textForwardTask, _textOut] = performTextForwarding(
728
+ trNode,
729
+ replyAbortController,
730
+ transcriptionOutput
731
+ );
732
+ textOut = _textOut;
733
+ tasks.push(textForwardTask);
734
+ }
735
+ const onFirstFrame = () => {
736
+ this.agentSession._updateAgentState("speaking");
737
+ };
738
+ if (!audioOutput) {
739
+ if (textOut) {
740
+ textOut.firstTextFut.await.finally(onFirstFrame);
741
+ }
742
+ } else {
743
+ let audioOut = null;
744
+ if (!audio) {
745
+ const [ttsTask, ttsStream] = performTTSInference(
746
+ (...args) => this.agent.ttsNode(...args),
747
+ audioSource,
748
+ modelSettings,
749
+ replyAbortController
750
+ );
751
+ tasks.push(ttsTask);
752
+ const [forwardTask, _audioOut] = performAudioForwarding(
753
+ ttsStream,
754
+ audioOutput,
755
+ replyAbortController
756
+ );
757
+ tasks.push(forwardTask);
758
+ audioOut = _audioOut;
759
+ } else {
760
+ const [forwardTask, _audioOut] = performAudioForwarding(
761
+ audio,
762
+ audioOutput,
763
+ replyAbortController
764
+ );
765
+ tasks.push(forwardTask);
766
+ audioOut = _audioOut;
767
+ }
768
+ audioOut.firstFrameFut.await.finally(onFirstFrame);
769
+ }
770
+ await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
771
+ if (audioOutput) {
772
+ await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
773
+ }
774
+ if (speechHandle.interrupted) {
775
+ replyAbortController.abort();
776
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
777
+ if (audioOutput) {
778
+ audioOutput.clearBuffer();
779
+ await audioOutput.waitForPlayout();
780
+ }
781
+ }
782
+ if (addToChatCtx) {
783
+ const message = ChatMessage.create({
784
+ role: "assistant",
785
+ content: (textOut == null ? void 0 : textOut.text) || "",
786
+ interrupted: speechHandle.interrupted
787
+ });
788
+ this.agent._chatCtx.insert(message);
789
+ this.agentSession._conversationItemAdded(message);
790
+ }
791
+ if (this.agentSession.agentState === "speaking") {
792
+ this.agentSession._updateAgentState("listening");
793
+ }
794
+ }
795
+ async pipelineReplyTask(speechHandle, chatCtx, toolCtx, modelSettings, instructions, newMessage, toolsMessages) {
796
+ var _a, _b, _c;
797
+ speechHandleStorage.enterWith(speechHandle);
798
+ const replyAbortController = new AbortController();
799
+ const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
800
+ const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
801
+ chatCtx = chatCtx.copy();
802
+ if (newMessage) {
803
+ chatCtx.insert(newMessage);
804
+ this.agent._chatCtx.insert(newMessage);
805
+ this.agentSession._conversationItemAdded(newMessage);
806
+ }
807
+ if (instructions) {
808
+ try {
809
+ updateInstructions({
810
+ chatCtx,
811
+ instructions,
812
+ addIfMissing: true
813
+ });
814
+ } catch (e) {
815
+ this.logger.error({ error: e }, "error occurred during updateInstructions");
816
+ }
817
+ }
818
+ this.agentSession._updateAgentState("thinking");
819
+ const tasks = [];
820
+ const [llmTask, llmGenData] = performLLMInference(
821
+ // preserve `this` context in llmNode
822
+ (...args) => this.agent.llmNode(...args),
823
+ chatCtx,
824
+ toolCtx,
825
+ modelSettings,
826
+ replyAbortController
827
+ );
828
+ tasks.push(llmTask);
829
+ const [ttsTextInput, llmOutput] = llmGenData.textStream.tee();
830
+ let ttsTask = null;
831
+ let ttsStream = null;
832
+ if (audioOutput) {
833
+ [ttsTask, ttsStream] = performTTSInference(
834
+ (...args) => this.agent.ttsNode(...args),
835
+ ttsTextInput,
836
+ modelSettings,
837
+ replyAbortController
838
+ );
839
+ tasks.push(ttsTask);
840
+ }
841
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
842
+ if (speechHandle.interrupted) {
843
+ replyAbortController.abort();
844
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
845
+ return;
846
+ }
847
+ const replyStartedAt = Date.now();
848
+ const trNodeResult = await this.agent.transcriptionNode(llmOutput, modelSettings);
849
+ let textOut = null;
850
+ if (trNodeResult) {
851
+ const [textForwardTask, _textOut] = performTextForwarding(
852
+ trNodeResult,
853
+ replyAbortController,
854
+ transcriptionOutput
855
+ );
856
+ tasks.push(textForwardTask);
857
+ textOut = _textOut;
858
+ }
859
+ const onFirstFrame = () => {
860
+ this.agentSession._updateAgentState("speaking");
861
+ };
862
+ let audioOut = null;
863
+ if (audioOutput) {
864
+ if (ttsStream) {
865
+ const [forwardTask, _audioOut] = performAudioForwarding(
866
+ ttsStream,
867
+ audioOutput,
868
+ replyAbortController
869
+ );
870
+ audioOut = _audioOut;
871
+ tasks.push(forwardTask);
872
+ audioOut.firstFrameFut.await.finally(onFirstFrame);
873
+ } else {
874
+ throw Error("ttsStream is null when audioOutput is enabled");
875
+ }
876
+ } else {
877
+ textOut == null ? void 0 : textOut.firstTextFut.await.finally(onFirstFrame);
878
+ }
879
+ const onToolExecutionStarted = (_) => {
880
+ };
881
+ const onToolExecutionCompleted = (_) => {
882
+ };
883
+ const [executeToolsTask, toolOutput] = performToolExecutions({
884
+ session: this.agentSession,
885
+ speechHandle,
886
+ toolCtx,
887
+ toolChoice: modelSettings.toolChoice,
888
+ toolCallStream: llmGenData.toolCallStream,
889
+ controller: replyAbortController,
890
+ onToolExecutionStarted,
891
+ onToolExecutionCompleted
892
+ });
893
+ tasks.push(executeToolsTask);
894
+ await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
895
+ if (audioOutput) {
896
+ await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
897
+ }
898
+ if (toolsMessages) {
899
+ for (const msg of toolsMessages) {
900
+ msg.createdAt = replyStartedAt;
901
+ }
902
+ this.agent._chatCtx.insert(toolsMessages);
903
+ }
904
+ if (speechHandle.interrupted) {
905
+ this.logger.debug(
906
+ { speech_id: speechHandle.id },
907
+ "Aborting all pipeline reply tasks due to interruption"
908
+ );
909
+ replyAbortController.abort();
910
+ await Promise.allSettled(
911
+ tasks.map((task) => task.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT))
912
+ );
913
+ let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
914
+ if (audioOutput) {
915
+ audioOutput.clearBuffer();
916
+ const playbackEv = await audioOutput.waitForPlayout();
917
+ if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
918
+ this.logger.info(
919
+ { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
920
+ "playout interrupted"
921
+ );
922
+ if (playbackEv.synchronizedTranscript) {
923
+ forwardedText = playbackEv.synchronizedTranscript;
924
+ }
925
+ } else {
926
+ forwardedText = "";
927
+ }
928
+ }
929
+ if (forwardedText) {
930
+ const message = ChatMessage.create({
931
+ role: "assistant",
932
+ content: forwardedText,
933
+ id: llmGenData.id,
934
+ interrupted: true,
935
+ createdAt: replyStartedAt
936
+ });
937
+ chatCtx.insert(message);
938
+ this.agent._chatCtx.insert(message);
939
+ this.agentSession._conversationItemAdded(message);
940
+ }
941
+ if (this.agentSession.agentState === "speaking") {
942
+ this.agentSession._updateAgentState("listening");
943
+ }
944
+ this.logger.info(
945
+ { speech_id: speechHandle.id, message: forwardedText },
946
+ "playout completed with interrupt"
947
+ );
948
+ speechHandle._markPlayoutDone();
949
+ await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
950
+ return;
951
+ }
952
+ if (textOut && textOut.text) {
953
+ const message = ChatMessage.create({
954
+ role: "assistant",
955
+ id: llmGenData.id,
956
+ interrupted: false,
957
+ createdAt: replyStartedAt,
958
+ content: textOut.text
959
+ });
960
+ chatCtx.insert(message);
961
+ this.agent._chatCtx.insert(message);
962
+ this.agentSession._conversationItemAdded(message);
963
+ this.logger.info(
964
+ { speech_id: speechHandle.id, message: textOut.text },
965
+ "playout completed without interruption"
966
+ );
967
+ }
968
+ if (toolOutput.output.length > 0) {
969
+ this.agentSession._updateAgentState("thinking");
970
+ } else if (this.agentSession.agentState === "speaking") {
971
+ this.agentSession._updateAgentState("listening");
972
+ }
973
+ speechHandle._markPlayoutDone();
974
+ await executeToolsTask.result;
975
+ if (toolOutput.output.length === 0) return;
976
+ const { maxToolSteps } = this.agentSession.options;
977
+ if (speechHandle.stepIndex >= maxToolSteps) {
978
+ this.logger.warn(
979
+ { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
980
+ "maximum number of function calls steps reached"
981
+ );
982
+ return;
983
+ }
984
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
985
+ functionCalls: [],
986
+ functionCallOutputs: []
987
+ });
988
+ let shouldGenerateToolReply = false;
989
+ let newAgentTask = null;
990
+ let ignoreTaskSwitch = false;
991
+ for (const sanitizedOut of toolOutput.output) {
992
+ if (sanitizedOut.toolCallOutput !== void 0) {
993
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
994
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
995
+ if (sanitizedOut.replyRequired) {
996
+ shouldGenerateToolReply = true;
997
+ }
998
+ }
999
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1000
+ this.logger.error("expected to receive only one agent task from the tool executions");
1001
+ ignoreTaskSwitch = true;
1002
+ }
1003
+ newAgentTask = sanitizedOut.agentTask ?? null;
1004
+ this.logger.debug(
1005
+ {
1006
+ speechId: speechHandle.id,
1007
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1008
+ args: sanitizedOut.toolCall.args,
1009
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1010
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1011
+ },
1012
+ "Tool call execution finished"
1013
+ );
1014
+ }
1015
+ this.agentSession.emit(
1016
+ AgentSessionEventTypes.FunctionToolsExecuted,
1017
+ functionToolsExecutedEvent
1018
+ );
1019
+ let draining = this.draining;
1020
+ if (!ignoreTaskSwitch && newAgentTask !== null) {
1021
+ this.agentSession.updateAgent(newAgentTask);
1022
+ draining = true;
1023
+ }
1024
+ const toolMessages = [
1025
+ ...functionToolsExecutedEvent.functionCalls,
1026
+ ...functionToolsExecutedEvent.functionCallOutputs
1027
+ ];
1028
+ if (shouldGenerateToolReply) {
1029
+ chatCtx.insert(toolMessages);
1030
+ const handle = SpeechHandle.create({
1031
+ allowInterruptions: speechHandle.allowInterruptions,
1032
+ stepIndex: speechHandle.stepIndex + 1,
1033
+ parent: speechHandle
1034
+ });
1035
+ this.agentSession.emit(
1036
+ AgentSessionEventTypes.SpeechCreated,
1037
+ createSpeechCreatedEvent({
1038
+ userInitiated: false,
1039
+ source: "tool_response",
1040
+ speechHandle: handle
1041
+ })
1042
+ );
1043
+ const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1044
+ const toolResponseTask = this.createSpeechTask({
1045
+ promise: this.pipelineReplyTask(
1046
+ handle,
1047
+ chatCtx,
1048
+ toolCtx,
1049
+ { toolChoice: respondToolChoice },
1050
+ instructions,
1051
+ void 0,
1052
+ toolMessages
1053
+ ),
1054
+ ownedSpeechHandle: handle,
1055
+ name: "AgentActivity.pipelineReply"
1056
+ });
1057
+ toolResponseTask.finally(() => this.onPipelineReplyDone());
1058
+ this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1059
+ } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1060
+ for (const msg of toolMessages) {
1061
+ msg.createdAt = replyStartedAt;
1062
+ }
1063
+ this.agent._chatCtx.insert(toolMessages);
1064
+ }
1065
+ }
1066
+ async realtimeGenerationTask(speechHandle, ev, modelSettings) {
1067
+ var _a, _b, _c;
1068
+ speechHandleStorage.enterWith(speechHandle);
1069
+ if (!this.realtimeSession) {
1070
+ throw new Error("realtime session is not initialized");
1071
+ }
1072
+ if (!(this.llm instanceof RealtimeModel)) {
1073
+ throw new Error("llm is not a realtime model");
1074
+ }
1075
+ this.logger.debug(
1076
+ { speech_id: speechHandle.id, stepIndex: speechHandle.stepIndex },
1077
+ "realtime generation started"
1078
+ );
1079
+ const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
1080
+ const textOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
1081
+ const toolCtx = this.realtimeSession.tools;
1082
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1083
+ if (speechHandle.interrupted) {
1084
+ return;
1085
+ }
1086
+ const onFirstFrame = () => {
1087
+ this.agentSession._updateAgentState("speaking");
1088
+ };
1089
+ const replyAbortController = new AbortController();
1090
+ const readMessages = async (abortController, outputs) => {
1091
+ const forwardTasks = [];
1092
+ try {
1093
+ for await (const msg of ev.messageStream) {
1094
+ if (forwardTasks.length > 0) {
1095
+ this.logger.warn(
1096
+ "expected to receive only one message generation from the realtime API"
1097
+ );
1098
+ break;
1099
+ }
1100
+ const trNodeResult = await this.agent.transcriptionNode(msg.textStream, modelSettings);
1101
+ let textOut = null;
1102
+ if (trNodeResult) {
1103
+ const [textForwardTask, _textOut] = performTextForwarding(
1104
+ trNodeResult,
1105
+ abortController,
1106
+ textOutput
1107
+ );
1108
+ forwardTasks.push(textForwardTask);
1109
+ textOut = _textOut;
1110
+ }
1111
+ let audioOut = null;
1112
+ if (audioOutput) {
1113
+ const realtimeAudio = await this.agent.realtimeAudioOutputNode(
1114
+ msg.audioStream,
1115
+ modelSettings
1116
+ );
1117
+ if (realtimeAudio) {
1118
+ const [forwardTask, _audioOut] = performAudioForwarding(
1119
+ realtimeAudio,
1120
+ audioOutput,
1121
+ abortController
1122
+ );
1123
+ forwardTasks.push(forwardTask);
1124
+ audioOut = _audioOut;
1125
+ audioOut.firstFrameFut.await.finally(onFirstFrame);
1126
+ } else {
1127
+ this.logger.warn(
1128
+ "audio output is enabled but neither tts nor realtime audio is available"
1129
+ );
1130
+ }
1131
+ } else if (textOut) {
1132
+ textOut.firstTextFut.await.finally(onFirstFrame);
1133
+ }
1134
+ outputs.push([msg.messageId, textOut, audioOut]);
1135
+ }
1136
+ await waitFor(forwardTasks);
1137
+ } catch (error) {
1138
+ this.logger.error(error, "error reading messages from the realtime API");
1139
+ } finally {
1140
+ await cancelAndWait(forwardTasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1141
+ }
1142
+ };
1143
+ const messageOutputs = [];
1144
+ const tasks = [
1145
+ Task.from(
1146
+ (controller) => readMessages(controller, messageOutputs),
1147
+ replyAbortController,
1148
+ "AgentActivity.realtime_generation.read_messages"
1149
+ )
1150
+ ];
1151
+ const [toolCallStream, toolCallStreamForTracing] = ev.functionStream.tee();
1152
+ const toolCalls = [];
1153
+ const readToolStreamTask = async (controller, stream) => {
1154
+ const reader = stream.getReader();
1155
+ try {
1156
+ while (!controller.signal.aborted) {
1157
+ const { done, value } = await reader.read();
1158
+ if (done) break;
1159
+ this.logger.debug({ tool_call: value }, "received tool call from the realtime API");
1160
+ toolCalls.push(value);
1161
+ }
1162
+ } finally {
1163
+ reader.releaseLock();
1164
+ }
1165
+ };
1166
+ tasks.push(
1167
+ Task.from(
1168
+ (controller) => readToolStreamTask(controller, toolCallStreamForTracing),
1169
+ replyAbortController,
1170
+ "AgentActivity.realtime_generation.read_tool_stream"
1171
+ )
1172
+ );
1173
+ const onToolExecutionStarted = (_) => {
1174
+ };
1175
+ const onToolExecutionCompleted = (_) => {
1176
+ };
1177
+ const [executeToolsTask, toolOutput] = performToolExecutions({
1178
+ session: this.agentSession,
1179
+ speechHandle,
1180
+ toolCtx,
1181
+ toolCallStream,
1182
+ toolChoice: modelSettings.toolChoice,
1183
+ controller: replyAbortController,
1184
+ onToolExecutionStarted,
1185
+ onToolExecutionCompleted
1186
+ });
1187
+ await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
1188
+ if (audioOutput) {
1189
+ await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
1190
+ this.agentSession._updateAgentState("listening");
1191
+ }
1192
+ if (speechHandle.interrupted) {
1193
+ this.logger.debug(
1194
+ { speech_id: speechHandle.id },
1195
+ "Aborting all realtime generation tasks due to interruption"
1196
+ );
1197
+ replyAbortController.abort();
1198
+ await cancelAndWait(tasks, AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1199
+ if (messageOutputs.length > 0) {
1200
+ const [msgId, textOut, audioOut] = messageOutputs[0];
1201
+ let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
1202
+ if (audioOutput) {
1203
+ audioOutput.clearBuffer();
1204
+ const playbackEv = await audioOutput.waitForPlayout();
1205
+ let playbackPosition = playbackEv.playbackPosition;
1206
+ if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
1207
+ this.logger.info(
1208
+ { speech_id: speechHandle.id, playbackPosition: playbackEv.playbackPosition },
1209
+ "playout interrupted"
1210
+ );
1211
+ if (playbackEv.synchronizedTranscript) {
1212
+ forwardedText = playbackEv.synchronizedTranscript;
1213
+ }
1214
+ } else {
1215
+ forwardedText = "";
1216
+ playbackPosition = 0;
1217
+ }
1218
+ this.realtimeSession.truncate({
1219
+ messageId: msgId,
1220
+ audioEndMs: Math.floor(playbackPosition)
1221
+ });
1222
+ }
1223
+ if (forwardedText) {
1224
+ const message = ChatMessage.create({
1225
+ role: "assistant",
1226
+ content: forwardedText,
1227
+ id: msgId,
1228
+ interrupted: true
1229
+ });
1230
+ this.agent._chatCtx.insert(message);
1231
+ speechHandle._setChatMessage(message);
1232
+ this.agentSession._conversationItemAdded(message);
1233
+ }
1234
+ this.logger.info(
1235
+ { speech_id: speechHandle.id, message: forwardedText },
1236
+ "playout completed with interrupt"
1237
+ );
1238
+ }
1239
+ speechHandle._markPlayoutDone();
1240
+ await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);
1241
+ return;
1242
+ }
1243
+ if (messageOutputs.length > 0) {
1244
+ const [msgId, textOut, _] = messageOutputs[0];
1245
+ const message = ChatMessage.create({
1246
+ role: "assistant",
1247
+ content: (textOut == null ? void 0 : textOut.text) || "",
1248
+ id: msgId,
1249
+ interrupted: false
1250
+ });
1251
+ this.agent._chatCtx.insert(message);
1252
+ speechHandle._setChatMessage(message);
1253
+ this.agentSession._conversationItemAdded(message);
1254
+ }
1255
+ speechHandle._markPlayoutDone();
1256
+ toolOutput.firstToolStartedFuture.await.finally(() => {
1257
+ this.agentSession._updateAgentState("thinking");
1258
+ });
1259
+ await executeToolsTask.result;
1260
+ if (toolOutput.output.length === 0) return;
1261
+ const { maxToolSteps } = this.agentSession.options;
1262
+ if (speechHandle.stepIndex >= maxToolSteps) {
1263
+ this.logger.warn(
1264
+ { speech_id: speechHandle.id, max_tool_steps: maxToolSteps },
1265
+ "maximum number of function calls steps reached"
1266
+ );
1267
+ return;
1268
+ }
1269
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1270
+ functionCalls: [],
1271
+ functionCallOutputs: []
1272
+ });
1273
+ let shouldGenerateToolReply = false;
1274
+ let newAgentTask = null;
1275
+ let ignoreTaskSwitch = false;
1276
+ for (const sanitizedOut of toolOutput.output) {
1277
+ if (sanitizedOut.toolCallOutput !== void 0) {
1278
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1279
+ if (sanitizedOut.replyRequired) {
1280
+ shouldGenerateToolReply = true;
1281
+ }
1282
+ }
1283
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1284
+ this.logger.error("expected to receive only one agent task from the tool executions");
1285
+ ignoreTaskSwitch = true;
1286
+ }
1287
+ newAgentTask = sanitizedOut.agentTask ?? null;
1288
+ this.logger.debug(
1289
+ {
1290
+ speechId: speechHandle.id,
1291
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1292
+ args: sanitizedOut.toolCall.args,
1293
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1294
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1295
+ },
1296
+ "Tool call execution finished"
1297
+ );
1298
+ }
1299
+ this.agentSession.emit(
1300
+ AgentSessionEventTypes.FunctionToolsExecuted,
1301
+ functionToolsExecutedEvent
1302
+ );
1303
+ let draining = this.draining;
1304
+ if (!ignoreTaskSwitch && newAgentTask !== null) {
1305
+ this.agentSession.updateAgent(newAgentTask);
1306
+ draining = true;
1307
+ }
1308
+ if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1309
+ const chatCtx = this.realtimeSession.chatCtx.copy();
1310
+ chatCtx.items.push(...functionToolsExecutedEvent.functionCallOutputs);
1311
+ try {
1312
+ await this.realtimeSession.updateChatCtx(chatCtx);
1313
+ } catch (error) {
1314
+ this.logger.warn(
1315
+ { error },
1316
+ "failed to update chat context before generating the function calls results"
1317
+ );
1318
+ }
1319
+ }
1320
+ if (!shouldGenerateToolReply || this.llm.capabilities.autoToolReplyGeneration) {
1321
+ return;
1322
+ }
1323
+ this.realtimeSession.interrupt();
1324
+ const replySpeechHandle = SpeechHandle.create({
1325
+ allowInterruptions: speechHandle.allowInterruptions,
1326
+ stepIndex: speechHandle.stepIndex + 1,
1327
+ parent: speechHandle
1328
+ });
1329
+ this.agentSession.emit(
1330
+ AgentSessionEventTypes.SpeechCreated,
1331
+ createSpeechCreatedEvent({
1332
+ userInitiated: false,
1333
+ source: "tool_response",
1334
+ speechHandle: replySpeechHandle
1335
+ })
1336
+ );
1337
+ const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1338
+ this.createSpeechTask({
1339
+ promise: this.realtimeReplyTask({
1340
+ speechHandle: replySpeechHandle,
1341
+ modelSettings: { toolChoice }
1342
+ }),
1343
+ ownedSpeechHandle: replySpeechHandle,
1344
+ name: "AgentActivity.realtime_reply"
1345
+ });
1346
+ this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1347
+ }
1348
+ async realtimeReplyTask({
1349
+ speechHandle,
1350
+ modelSettings: { toolChoice },
1351
+ userInput,
1352
+ instructions
1353
+ }) {
1354
+ speechHandleStorage.enterWith(speechHandle);
1355
+ if (!this.realtimeSession) {
1356
+ throw new Error("realtime session is not available");
1357
+ }
1358
+ await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
1359
+ if (userInput) {
1360
+ const chatCtx = this.realtimeSession.chatCtx.copy();
1361
+ const message = chatCtx.addMessage({
1362
+ role: "user",
1363
+ content: userInput
1364
+ });
1365
+ await this.realtimeSession.updateChatCtx(chatCtx);
1366
+ this.agent._chatCtx.insert(message);
1367
+ this.agentSession._conversationItemAdded(message);
1368
+ }
1369
+ const originalToolChoice = this.toolChoice;
1370
+ if (toolChoice !== void 0) {
1371
+ this.realtimeSession.updateOptions({ toolChoice });
1372
+ }
1373
+ try {
1374
+ const generationEvent = await this.realtimeSession.generateReply(instructions);
1375
+ await this.realtimeGenerationTask(speechHandle, generationEvent, { toolChoice });
1376
+ } finally {
1377
+ if (toolChoice !== void 0 && toolChoice !== originalToolChoice) {
1378
+ this.realtimeSession.updateOptions({ toolChoice: originalToolChoice });
1379
+ }
1380
+ }
1381
+ }
1382
+ scheduleSpeech(speechHandle, priority, bypassDraining = false) {
1383
+ if (this.draining && !bypassDraining) {
1384
+ throw new Error("cannot schedule new speech, the agent is draining");
1385
+ }
1386
+ this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1387
+ this.wakeupMainTask();
1388
+ }
1389
+ async drain() {
1390
+ var _a;
1391
+ const unlock = await this.lock.lock();
1392
+ try {
1393
+ if (this._draining) return;
1394
+ this.createSpeechTask({
1395
+ promise: this.agent.onExit(),
1396
+ name: "AgentActivity_onExit"
1397
+ });
1398
+ this.wakeupMainTask();
1399
+ this._draining = true;
1400
+ await ((_a = this._mainTask) == null ? void 0 : _a.result);
1401
+ } finally {
1402
+ unlock();
1403
+ }
1404
+ }
1405
+ async close() {
1406
+ var _a, _b, _c;
1407
+ const unlock = await this.lock.lock();
1408
+ try {
1409
+ if (!this._draining) {
1410
+ this.logger.warn("task closing without draining");
1411
+ }
1412
+ if (this.llm instanceof LLM) {
1413
+ this.llm.off("metrics_collected", this.onMetricsCollected);
1414
+ }
1415
+ if (this.realtimeSession) {
1416
+ this.realtimeSession.off("generation_created", this.onGenerationCreated);
1417
+ this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1418
+ this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1419
+ this.realtimeSession.off(
1420
+ "input_audio_transcription_completed",
1421
+ this.onInputAudioTranscriptionCompleted
1422
+ );
1423
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1424
+ }
1425
+ if (this.stt instanceof STT) {
1426
+ this.stt.off("metrics_collected", this.onMetricsCollected);
1427
+ }
1428
+ if (this.tts instanceof TTS) {
1429
+ this.tts.off("metrics_collected", this.onMetricsCollected);
1430
+ }
1431
+ if (this.vad instanceof VAD) {
1432
+ this.vad.off("metrics_collected", this.onMetricsCollected);
1433
+ }
1434
+ this.detachAudioInput();
1435
+ await ((_a = this.realtimeSession) == null ? void 0 : _a.close());
1436
+ await ((_b = this.audioRecognition) == null ? void 0 : _b.close());
1437
+ await ((_c = this._mainTask) == null ? void 0 : _c.cancelAndWait());
1438
+ } finally {
1439
+ unlock();
1440
+ }
1441
+ }
1442
+ }
1443
/**
 * Normalizes a tool-choice value by mapping `null` to `undefined`.
 *
 * @param toolChoice - Tool choice value, possibly `null`.
 * @returns `undefined` when `toolChoice` is `null`; otherwise the value unchanged.
 */
function toOaiToolChoice(toolChoice) {
  if (toolChoice === null) {
    return undefined;
  }
  return toolChoice;
}
1446
+ export {
1447
+ AgentActivity
1448
+ };
1449
+ //# sourceMappingURL=agent_activity.js.map