@livekit/agents 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (942)
  1. package/dist/audio.cjs +89 -3
  2. package/dist/audio.cjs.map +1 -1
  3. package/dist/audio.d.cts +36 -1
  4. package/dist/audio.d.ts +36 -1
  5. package/dist/audio.d.ts.map +1 -1
  6. package/dist/audio.js +76 -2
  7. package/dist/audio.js.map +1 -1
  8. package/dist/beta/index.cjs +29 -0
  9. package/dist/beta/index.cjs.map +1 -0
  10. package/dist/beta/index.d.cts +2 -0
  11. package/dist/beta/index.d.ts +2 -0
  12. package/dist/beta/index.d.ts.map +1 -0
  13. package/dist/beta/index.js +7 -0
  14. package/dist/beta/index.js.map +1 -0
  15. package/dist/beta/workflows/index.cjs +29 -0
  16. package/dist/beta/workflows/index.cjs.map +1 -0
  17. package/dist/beta/workflows/index.d.cts +2 -0
  18. package/dist/beta/workflows/index.d.ts +2 -0
  19. package/dist/beta/workflows/index.d.ts.map +1 -0
  20. package/dist/beta/workflows/index.js +7 -0
  21. package/dist/beta/workflows/index.js.map +1 -0
  22. package/dist/beta/workflows/task_group.cjs +162 -0
  23. package/dist/beta/workflows/task_group.cjs.map +1 -0
  24. package/dist/beta/workflows/task_group.d.cts +32 -0
  25. package/dist/beta/workflows/task_group.d.ts +32 -0
  26. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  27. package/dist/beta/workflows/task_group.js +138 -0
  28. package/dist/beta/workflows/task_group.js.map +1 -0
  29. package/dist/cli.cjs +44 -46
  30. package/dist/cli.cjs.map +1 -1
  31. package/dist/cli.d.cts +3 -3
  32. package/dist/cli.d.ts +3 -3
  33. package/dist/cli.d.ts.map +1 -1
  34. package/dist/cli.js +45 -47
  35. package/dist/cli.js.map +1 -1
  36. package/dist/connection_pool.cjs +242 -0
  37. package/dist/connection_pool.cjs.map +1 -0
  38. package/dist/connection_pool.d.cts +123 -0
  39. package/dist/connection_pool.d.ts +123 -0
  40. package/dist/connection_pool.d.ts.map +1 -0
  41. package/dist/connection_pool.js +218 -0
  42. package/dist/connection_pool.js.map +1 -0
  43. package/dist/connection_pool.test.cjs +256 -0
  44. package/dist/connection_pool.test.cjs.map +1 -0
  45. package/dist/connection_pool.test.js +255 -0
  46. package/dist/connection_pool.test.js.map +1 -0
  47. package/dist/constants.cjs +30 -0
  48. package/dist/constants.cjs.map +1 -1
  49. package/dist/constants.d.cts +10 -0
  50. package/dist/constants.d.ts +10 -0
  51. package/dist/constants.d.ts.map +1 -1
  52. package/dist/constants.js +20 -0
  53. package/dist/constants.js.map +1 -1
  54. package/dist/cpu.cjs +189 -0
  55. package/dist/cpu.cjs.map +1 -0
  56. package/dist/cpu.d.cts +24 -0
  57. package/dist/cpu.d.ts +24 -0
  58. package/dist/cpu.d.ts.map +1 -0
  59. package/dist/cpu.js +152 -0
  60. package/dist/cpu.js.map +1 -0
  61. package/dist/cpu.test.cjs +227 -0
  62. package/dist/cpu.test.cjs.map +1 -0
  63. package/dist/cpu.test.js +204 -0
  64. package/dist/cpu.test.js.map +1 -0
  65. package/dist/http_server.cjs +9 -6
  66. package/dist/http_server.cjs.map +1 -1
  67. package/dist/http_server.d.cts +5 -1
  68. package/dist/http_server.d.ts +5 -1
  69. package/dist/http_server.d.ts.map +1 -1
  70. package/dist/http_server.js +9 -6
  71. package/dist/http_server.js.map +1 -1
  72. package/dist/index.cjs +24 -9
  73. package/dist/index.cjs.map +1 -1
  74. package/dist/index.d.cts +15 -11
  75. package/dist/index.d.ts +15 -11
  76. package/dist/index.d.ts.map +1 -1
  77. package/dist/index.js +18 -9
  78. package/dist/index.js.map +1 -1
  79. package/dist/inference/api_protos.cjs +70 -2
  80. package/dist/inference/api_protos.cjs.map +1 -1
  81. package/dist/inference/api_protos.d.cts +373 -32
  82. package/dist/inference/api_protos.d.ts +373 -32
  83. package/dist/inference/api_protos.d.ts.map +1 -1
  84. package/dist/inference/api_protos.js +62 -2
  85. package/dist/inference/api_protos.js.map +1 -1
  86. package/dist/inference/index.cjs +8 -0
  87. package/dist/inference/index.cjs.map +1 -1
  88. package/dist/inference/index.d.cts +3 -4
  89. package/dist/inference/index.d.ts +3 -4
  90. package/dist/inference/index.d.ts.map +1 -1
  91. package/dist/inference/index.js +18 -3
  92. package/dist/inference/index.js.map +1 -1
  93. package/dist/inference/interruption/defaults.cjs +81 -0
  94. package/dist/inference/interruption/defaults.cjs.map +1 -0
  95. package/dist/inference/interruption/defaults.d.cts +19 -0
  96. package/dist/inference/interruption/defaults.d.ts +19 -0
  97. package/dist/inference/interruption/defaults.d.ts.map +1 -0
  98. package/dist/inference/interruption/defaults.js +46 -0
  99. package/dist/inference/interruption/defaults.js.map +1 -0
  100. package/dist/inference/interruption/errors.cjs +44 -0
  101. package/dist/inference/interruption/errors.cjs.map +1 -0
  102. package/dist/inference/interruption/errors.d.cts +12 -0
  103. package/dist/inference/interruption/errors.d.ts +12 -0
  104. package/dist/inference/interruption/errors.d.ts.map +1 -0
  105. package/dist/inference/interruption/errors.js +20 -0
  106. package/dist/inference/interruption/errors.js.map +1 -0
  107. package/dist/inference/interruption/http_transport.cjs +163 -0
  108. package/dist/inference/interruption/http_transport.cjs.map +1 -0
  109. package/dist/inference/interruption/http_transport.d.cts +63 -0
  110. package/dist/inference/interruption/http_transport.d.ts +63 -0
  111. package/dist/inference/interruption/http_transport.d.ts.map +1 -0
  112. package/dist/inference/interruption/http_transport.js +137 -0
  113. package/dist/inference/interruption/http_transport.js.map +1 -0
  114. package/dist/inference/interruption/interruption_cache_entry.cjs +58 -0
  115. package/dist/inference/interruption/interruption_cache_entry.cjs.map +1 -0
  116. package/dist/inference/interruption/interruption_cache_entry.d.cts +30 -0
  117. package/dist/inference/interruption/interruption_cache_entry.d.ts +30 -0
  118. package/dist/inference/interruption/interruption_cache_entry.d.ts.map +1 -0
  119. package/dist/inference/interruption/interruption_cache_entry.js +34 -0
  120. package/dist/inference/interruption/interruption_cache_entry.js.map +1 -0
  121. package/dist/inference/interruption/interruption_detector.cjs +198 -0
  122. package/dist/inference/interruption/interruption_detector.cjs.map +1 -0
  123. package/dist/inference/interruption/interruption_detector.d.cts +59 -0
  124. package/dist/inference/interruption/interruption_detector.d.ts +59 -0
  125. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -0
  126. package/dist/inference/interruption/interruption_detector.js +164 -0
  127. package/dist/inference/interruption/interruption_detector.js.map +1 -0
  128. package/dist/inference/interruption/interruption_stream.cjs +368 -0
  129. package/dist/inference/interruption/interruption_stream.cjs.map +1 -0
  130. package/dist/inference/interruption/interruption_stream.d.cts +46 -0
  131. package/dist/inference/interruption/interruption_stream.d.ts +46 -0
  132. package/dist/inference/interruption/interruption_stream.d.ts.map +1 -0
  133. package/dist/inference/interruption/interruption_stream.js +344 -0
  134. package/dist/inference/interruption/interruption_stream.js.map +1 -0
  135. package/dist/inference/interruption/types.cjs +17 -0
  136. package/dist/inference/interruption/types.cjs.map +1 -0
  137. package/dist/inference/interruption/types.d.cts +66 -0
  138. package/dist/inference/interruption/types.d.ts +66 -0
  139. package/dist/inference/interruption/types.d.ts.map +1 -0
  140. package/dist/inference/interruption/types.js +1 -0
  141. package/dist/inference/interruption/types.js.map +1 -0
  142. package/dist/inference/interruption/utils.cjs +130 -0
  143. package/dist/inference/interruption/utils.cjs.map +1 -0
  144. package/dist/inference/interruption/utils.d.cts +41 -0
  145. package/dist/inference/interruption/utils.d.ts +41 -0
  146. package/dist/inference/interruption/utils.d.ts.map +1 -0
  147. package/dist/inference/interruption/utils.js +105 -0
  148. package/dist/inference/interruption/utils.js.map +1 -0
  149. package/dist/inference/interruption/utils.test.cjs +105 -0
  150. package/dist/inference/interruption/utils.test.cjs.map +1 -0
  151. package/dist/inference/interruption/utils.test.js +104 -0
  152. package/dist/inference/interruption/utils.test.js.map +1 -0
  153. package/dist/inference/interruption/ws_transport.cjs +342 -0
  154. package/dist/inference/interruption/ws_transport.cjs.map +1 -0
  155. package/dist/inference/interruption/ws_transport.d.cts +33 -0
  156. package/dist/inference/interruption/ws_transport.d.ts +33 -0
  157. package/dist/inference/interruption/ws_transport.d.ts.map +1 -0
  158. package/dist/inference/interruption/ws_transport.js +308 -0
  159. package/dist/inference/interruption/ws_transport.js.map +1 -0
  160. package/dist/inference/llm.cjs +106 -66
  161. package/dist/inference/llm.cjs.map +1 -1
  162. package/dist/inference/llm.d.cts +65 -43
  163. package/dist/inference/llm.d.ts +65 -43
  164. package/dist/inference/llm.d.ts.map +1 -1
  165. package/dist/inference/llm.js +100 -66
  166. package/dist/inference/llm.js.map +1 -1
  167. package/dist/inference/stt.cjs +319 -170
  168. package/dist/inference/stt.cjs.map +1 -1
  169. package/dist/inference/stt.d.cts +64 -15
  170. package/dist/inference/stt.d.ts +64 -15
  171. package/dist/inference/stt.d.ts.map +1 -1
  172. package/dist/inference/stt.js +319 -170
  173. package/dist/inference/stt.js.map +1 -1
  174. package/dist/inference/stt.test.cjs +218 -0
  175. package/dist/inference/stt.test.cjs.map +1 -0
  176. package/dist/inference/stt.test.js +217 -0
  177. package/dist/inference/stt.test.js.map +1 -0
  178. package/dist/inference/tts.cjs +249 -71
  179. package/dist/inference/tts.cjs.map +1 -1
  180. package/dist/inference/tts.d.cts +55 -16
  181. package/dist/inference/tts.d.ts +55 -16
  182. package/dist/inference/tts.d.ts.map +1 -1
  183. package/dist/inference/tts.js +249 -77
  184. package/dist/inference/tts.js.map +1 -1
  185. package/dist/inference/tts.test.cjs +233 -0
  186. package/dist/inference/tts.test.cjs.map +1 -0
  187. package/dist/inference/tts.test.js +232 -0
  188. package/dist/inference/tts.test.js.map +1 -0
  189. package/dist/inference/utils.cjs +26 -7
  190. package/dist/inference/utils.cjs.map +1 -1
  191. package/dist/inference/utils.d.cts +14 -1
  192. package/dist/inference/utils.d.ts +14 -1
  193. package/dist/inference/utils.d.ts.map +1 -1
  194. package/dist/inference/utils.js +18 -2
  195. package/dist/inference/utils.js.map +1 -1
  196. package/dist/ipc/inference_proc_executor.cjs +6 -3
  197. package/dist/ipc/inference_proc_executor.cjs.map +1 -1
  198. package/dist/ipc/inference_proc_executor.d.ts.map +1 -1
  199. package/dist/ipc/inference_proc_executor.js +6 -3
  200. package/dist/ipc/inference_proc_executor.js.map +1 -1
  201. package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
  202. package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
  203. package/dist/ipc/inference_proc_lazy_main.js +13 -1
  204. package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
  205. package/dist/ipc/job_proc_executor.cjs +6 -1
  206. package/dist/ipc/job_proc_executor.cjs.map +1 -1
  207. package/dist/ipc/job_proc_executor.d.ts.map +1 -1
  208. package/dist/ipc/job_proc_executor.js +6 -1
  209. package/dist/ipc/job_proc_executor.js.map +1 -1
  210. package/dist/ipc/job_proc_lazy_main.cjs +89 -17
  211. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  212. package/dist/ipc/job_proc_lazy_main.js +68 -18
  213. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  214. package/dist/ipc/supervised_proc.cjs +34 -8
  215. package/dist/ipc/supervised_proc.cjs.map +1 -1
  216. package/dist/ipc/supervised_proc.d.cts +8 -0
  217. package/dist/ipc/supervised_proc.d.ts +8 -0
  218. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  219. package/dist/ipc/supervised_proc.js +34 -8
  220. package/dist/ipc/supervised_proc.js.map +1 -1
  221. package/dist/ipc/supervised_proc.test.cjs +145 -0
  222. package/dist/ipc/supervised_proc.test.cjs.map +1 -0
  223. package/dist/ipc/supervised_proc.test.js +122 -0
  224. package/dist/ipc/supervised_proc.test.js.map +1 -0
  225. package/dist/job.cjs +109 -1
  226. package/dist/job.cjs.map +1 -1
  227. package/dist/job.d.cts +14 -0
  228. package/dist/job.d.ts +14 -0
  229. package/dist/job.d.ts.map +1 -1
  230. package/dist/job.js +99 -1
  231. package/dist/job.js.map +1 -1
  232. package/dist/language.cjs +394 -0
  233. package/dist/language.cjs.map +1 -0
  234. package/dist/language.d.cts +15 -0
  235. package/dist/language.d.ts +15 -0
  236. package/dist/language.d.ts.map +1 -0
  237. package/dist/language.js +363 -0
  238. package/dist/language.js.map +1 -0
  239. package/dist/language.test.cjs +43 -0
  240. package/dist/language.test.cjs.map +1 -0
  241. package/dist/language.test.js +49 -0
  242. package/dist/language.test.js.map +1 -0
  243. package/dist/llm/chat_context.cjs +274 -3
  244. package/dist/llm/chat_context.cjs.map +1 -1
  245. package/dist/llm/chat_context.d.cts +86 -2
  246. package/dist/llm/chat_context.d.ts +86 -2
  247. package/dist/llm/chat_context.d.ts.map +1 -1
  248. package/dist/llm/chat_context.js +273 -3
  249. package/dist/llm/chat_context.js.map +1 -1
  250. package/dist/llm/chat_context.test.cjs +574 -0
  251. package/dist/llm/chat_context.test.cjs.map +1 -1
  252. package/dist/llm/chat_context.test.js +574 -0
  253. package/dist/llm/chat_context.test.js.map +1 -1
  254. package/dist/llm/fallback_adapter.cjs +278 -0
  255. package/dist/llm/fallback_adapter.cjs.map +1 -0
  256. package/dist/llm/fallback_adapter.d.cts +73 -0
  257. package/dist/llm/fallback_adapter.d.ts +73 -0
  258. package/dist/llm/fallback_adapter.d.ts.map +1 -0
  259. package/dist/llm/fallback_adapter.js +254 -0
  260. package/dist/llm/fallback_adapter.js.map +1 -0
  261. package/dist/llm/fallback_adapter.test.cjs +176 -0
  262. package/dist/llm/fallback_adapter.test.cjs.map +1 -0
  263. package/dist/llm/fallback_adapter.test.js +175 -0
  264. package/dist/llm/fallback_adapter.test.js.map +1 -0
  265. package/dist/llm/index.cjs +9 -0
  266. package/dist/llm/index.cjs.map +1 -1
  267. package/dist/llm/index.d.cts +4 -3
  268. package/dist/llm/index.d.ts +4 -3
  269. package/dist/llm/index.d.ts.map +1 -1
  270. package/dist/llm/index.js +11 -1
  271. package/dist/llm/index.js.map +1 -1
  272. package/dist/llm/llm.cjs +65 -11
  273. package/dist/llm/llm.cjs.map +1 -1
  274. package/dist/llm/llm.d.cts +13 -2
  275. package/dist/llm/llm.d.ts +13 -2
  276. package/dist/llm/llm.d.ts.map +1 -1
  277. package/dist/llm/llm.js +65 -11
  278. package/dist/llm/llm.js.map +1 -1
  279. package/dist/llm/provider_format/google.cjs +6 -2
  280. package/dist/llm/provider_format/google.cjs.map +1 -1
  281. package/dist/llm/provider_format/google.d.cts +1 -1
  282. package/dist/llm/provider_format/google.d.ts +1 -1
  283. package/dist/llm/provider_format/google.d.ts.map +1 -1
  284. package/dist/llm/provider_format/google.js +6 -2
  285. package/dist/llm/provider_format/google.js.map +1 -1
  286. package/dist/llm/provider_format/google.test.cjs +48 -0
  287. package/dist/llm/provider_format/google.test.cjs.map +1 -1
  288. package/dist/llm/provider_format/google.test.js +54 -1
  289. package/dist/llm/provider_format/google.test.js.map +1 -1
  290. package/dist/llm/provider_format/index.cjs +2 -0
  291. package/dist/llm/provider_format/index.cjs.map +1 -1
  292. package/dist/llm/provider_format/index.d.cts +2 -2
  293. package/dist/llm/provider_format/index.d.ts +2 -2
  294. package/dist/llm/provider_format/index.d.ts.map +1 -1
  295. package/dist/llm/provider_format/index.js +6 -1
  296. package/dist/llm/provider_format/index.js.map +1 -1
  297. package/dist/llm/provider_format/openai.cjs +126 -24
  298. package/dist/llm/provider_format/openai.cjs.map +1 -1
  299. package/dist/llm/provider_format/openai.d.cts +1 -0
  300. package/dist/llm/provider_format/openai.d.ts +1 -0
  301. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  302. package/dist/llm/provider_format/openai.js +124 -23
  303. package/dist/llm/provider_format/openai.js.map +1 -1
  304. package/dist/llm/provider_format/openai.test.cjs +393 -0
  305. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  306. package/dist/llm/provider_format/openai.test.js +400 -2
  307. package/dist/llm/provider_format/openai.test.js.map +1 -1
  308. package/dist/llm/provider_format/utils.cjs +5 -4
  309. package/dist/llm/provider_format/utils.cjs.map +1 -1
  310. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  311. package/dist/llm/provider_format/utils.js +5 -4
  312. package/dist/llm/provider_format/utils.js.map +1 -1
  313. package/dist/llm/realtime.cjs +3 -0
  314. package/dist/llm/realtime.cjs.map +1 -1
  315. package/dist/llm/realtime.d.cts +15 -1
  316. package/dist/llm/realtime.d.ts +15 -1
  317. package/dist/llm/realtime.d.ts.map +1 -1
  318. package/dist/llm/realtime.js +3 -0
  319. package/dist/llm/realtime.js.map +1 -1
  320. package/dist/llm/remote_chat_context.cjs.map +1 -1
  321. package/dist/llm/remote_chat_context.d.cts +2 -0
  322. package/dist/llm/remote_chat_context.d.ts +2 -0
  323. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  324. package/dist/llm/remote_chat_context.js.map +1 -1
  325. package/dist/llm/tool_context.cjs +50 -2
  326. package/dist/llm/tool_context.cjs.map +1 -1
  327. package/dist/llm/tool_context.d.cts +47 -11
  328. package/dist/llm/tool_context.d.ts +47 -11
  329. package/dist/llm/tool_context.d.ts.map +1 -1
  330. package/dist/llm/tool_context.js +48 -3
  331. package/dist/llm/tool_context.js.map +1 -1
  332. package/dist/llm/tool_context.test.cjs +197 -0
  333. package/dist/llm/tool_context.test.cjs.map +1 -1
  334. package/dist/llm/tool_context.test.js +175 -0
  335. package/dist/llm/tool_context.test.js.map +1 -1
  336. package/dist/llm/utils.cjs +18 -12
  337. package/dist/llm/utils.cjs.map +1 -1
  338. package/dist/llm/utils.d.cts +2 -3
  339. package/dist/llm/utils.d.ts +2 -3
  340. package/dist/llm/utils.d.ts.map +1 -1
  341. package/dist/llm/utils.js +18 -12
  342. package/dist/llm/utils.js.map +1 -1
  343. package/dist/llm/zod-utils.cjs +102 -0
  344. package/dist/llm/zod-utils.cjs.map +1 -0
  345. package/dist/llm/zod-utils.d.cts +65 -0
  346. package/dist/llm/zod-utils.d.ts +65 -0
  347. package/dist/llm/zod-utils.d.ts.map +1 -0
  348. package/dist/llm/zod-utils.js +64 -0
  349. package/dist/llm/zod-utils.js.map +1 -0
  350. package/dist/llm/zod-utils.test.cjs +472 -0
  351. package/dist/llm/zod-utils.test.cjs.map +1 -0
  352. package/dist/llm/zod-utils.test.js +455 -0
  353. package/dist/llm/zod-utils.test.js.map +1 -0
  354. package/dist/log.cjs +45 -14
  355. package/dist/log.cjs.map +1 -1
  356. package/dist/log.d.cts +8 -1
  357. package/dist/log.d.ts +8 -1
  358. package/dist/log.d.ts.map +1 -1
  359. package/dist/log.js +45 -15
  360. package/dist/log.js.map +1 -1
  361. package/dist/metrics/base.cjs.map +1 -1
  362. package/dist/metrics/base.d.cts +75 -19
  363. package/dist/metrics/base.d.ts +75 -19
  364. package/dist/metrics/base.d.ts.map +1 -1
  365. package/dist/metrics/index.cjs +5 -0
  366. package/dist/metrics/index.cjs.map +1 -1
  367. package/dist/metrics/index.d.cts +2 -1
  368. package/dist/metrics/index.d.ts +2 -1
  369. package/dist/metrics/index.d.ts.map +1 -1
  370. package/dist/metrics/index.js +6 -0
  371. package/dist/metrics/index.js.map +1 -1
  372. package/dist/metrics/model_usage.cjs +189 -0
  373. package/dist/metrics/model_usage.cjs.map +1 -0
  374. package/dist/metrics/model_usage.d.cts +92 -0
  375. package/dist/metrics/model_usage.d.ts +92 -0
  376. package/dist/metrics/model_usage.d.ts.map +1 -0
  377. package/dist/metrics/model_usage.js +164 -0
  378. package/dist/metrics/model_usage.js.map +1 -0
  379. package/dist/metrics/model_usage.test.cjs +474 -0
  380. package/dist/metrics/model_usage.test.cjs.map +1 -0
  381. package/dist/metrics/model_usage.test.js +476 -0
  382. package/dist/metrics/model_usage.test.js.map +1 -0
  383. package/dist/metrics/usage_collector.cjs +5 -2
  384. package/dist/metrics/usage_collector.cjs.map +1 -1
  385. package/dist/metrics/usage_collector.d.cts +10 -1
  386. package/dist/metrics/usage_collector.d.ts +10 -1
  387. package/dist/metrics/usage_collector.d.ts.map +1 -1
  388. package/dist/metrics/usage_collector.js +5 -2
  389. package/dist/metrics/usage_collector.js.map +1 -1
  390. package/dist/metrics/utils.cjs +23 -7
  391. package/dist/metrics/utils.cjs.map +1 -1
  392. package/dist/metrics/utils.d.ts.map +1 -1
  393. package/dist/metrics/utils.js +23 -7
  394. package/dist/metrics/utils.js.map +1 -1
  395. package/dist/stream/deferred_stream.cjs +31 -10
  396. package/dist/stream/deferred_stream.cjs.map +1 -1
  397. package/dist/stream/deferred_stream.d.cts +6 -1
  398. package/dist/stream/deferred_stream.d.ts +6 -1
  399. package/dist/stream/deferred_stream.d.ts.map +1 -1
  400. package/dist/stream/deferred_stream.js +31 -10
  401. package/dist/stream/deferred_stream.js.map +1 -1
  402. package/dist/stream/deferred_stream.test.cjs +2 -2
  403. package/dist/stream/deferred_stream.test.cjs.map +1 -1
  404. package/dist/stream/deferred_stream.test.js +2 -2
  405. package/dist/stream/deferred_stream.test.js.map +1 -1
  406. package/dist/stream/index.cjs +3 -0
  407. package/dist/stream/index.cjs.map +1 -1
  408. package/dist/stream/index.d.cts +1 -0
  409. package/dist/stream/index.d.ts +1 -0
  410. package/dist/stream/index.d.ts.map +1 -1
  411. package/dist/stream/index.js +2 -0
  412. package/dist/stream/index.js.map +1 -1
  413. package/dist/stream/multi_input_stream.cjs +139 -0
  414. package/dist/stream/multi_input_stream.cjs.map +1 -0
  415. package/dist/stream/multi_input_stream.d.cts +55 -0
  416. package/dist/stream/multi_input_stream.d.ts +55 -0
  417. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  418. package/dist/stream/multi_input_stream.js +115 -0
  419. package/dist/stream/multi_input_stream.js.map +1 -0
  420. package/dist/stream/multi_input_stream.test.cjs +344 -0
  421. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  422. package/dist/stream/multi_input_stream.test.js +343 -0
  423. package/dist/stream/multi_input_stream.test.js.map +1 -0
  424. package/dist/stream/stream_channel.cjs +39 -1
  425. package/dist/stream/stream_channel.cjs.map +1 -1
  426. package/dist/stream/stream_channel.d.cts +5 -2
  427. package/dist/stream/stream_channel.d.ts +5 -2
  428. package/dist/stream/stream_channel.d.ts.map +1 -1
  429. package/dist/stream/stream_channel.js +39 -1
  430. package/dist/stream/stream_channel.js.map +1 -1
  431. package/dist/stream/stream_channel.test.cjs +27 -0
  432. package/dist/stream/stream_channel.test.cjs.map +1 -1
  433. package/dist/stream/stream_channel.test.js +27 -0
  434. package/dist/stream/stream_channel.test.js.map +1 -1
  435. package/dist/stt/stream_adapter.cjs +24 -9
  436. package/dist/stt/stream_adapter.cjs.map +1 -1
  437. package/dist/stt/stream_adapter.d.cts +7 -3
  438. package/dist/stt/stream_adapter.d.ts +7 -3
  439. package/dist/stt/stream_adapter.d.ts.map +1 -1
  440. package/dist/stt/stream_adapter.js +24 -9
  441. package/dist/stt/stream_adapter.js.map +1 -1
  442. package/dist/stt/stt.cjs +86 -19
  443. package/dist/stt/stt.cjs.map +1 -1
  444. package/dist/stt/stt.d.cts +60 -5
  445. package/dist/stt/stt.d.ts +60 -5
  446. package/dist/stt/stt.d.ts.map +1 -1
  447. package/dist/stt/stt.js +88 -21
  448. package/dist/stt/stt.js.map +1 -1
  449. package/dist/telemetry/index.cjs +72 -0
  450. package/dist/telemetry/index.cjs.map +1 -0
  451. package/dist/telemetry/index.d.cts +7 -0
  452. package/dist/telemetry/index.d.ts +7 -0
  453. package/dist/telemetry/index.d.ts.map +1 -0
  454. package/dist/telemetry/index.js +37 -0
  455. package/dist/telemetry/index.js.map +1 -0
  456. package/dist/telemetry/logging.cjs +65 -0
  457. package/dist/telemetry/logging.cjs.map +1 -0
  458. package/dist/telemetry/logging.d.cts +21 -0
  459. package/dist/telemetry/logging.d.ts +21 -0
  460. package/dist/telemetry/logging.d.ts.map +1 -0
  461. package/dist/telemetry/logging.js +40 -0
  462. package/dist/telemetry/logging.js.map +1 -0
  463. package/dist/telemetry/otel_http_exporter.cjs +166 -0
  464. package/dist/telemetry/otel_http_exporter.cjs.map +1 -0
  465. package/dist/telemetry/otel_http_exporter.d.cts +63 -0
  466. package/dist/telemetry/otel_http_exporter.d.ts +63 -0
  467. package/dist/telemetry/otel_http_exporter.d.ts.map +1 -0
  468. package/dist/telemetry/otel_http_exporter.js +142 -0
  469. package/dist/telemetry/otel_http_exporter.js.map +1 -0
  470. package/dist/telemetry/pino_otel_transport.cjs +217 -0
  471. package/dist/telemetry/pino_otel_transport.cjs.map +1 -0
  472. package/dist/telemetry/pino_otel_transport.d.cts +58 -0
  473. package/dist/telemetry/pino_otel_transport.d.ts +58 -0
  474. package/dist/telemetry/pino_otel_transport.d.ts.map +1 -0
  475. package/dist/telemetry/pino_otel_transport.js +189 -0
  476. package/dist/telemetry/pino_otel_transport.js.map +1 -0
  477. package/dist/telemetry/trace_types.cjs +233 -0
  478. package/dist/telemetry/trace_types.cjs.map +1 -0
  479. package/dist/telemetry/trace_types.d.cts +74 -0
  480. package/dist/telemetry/trace_types.d.ts +74 -0
  481. package/dist/telemetry/trace_types.d.ts.map +1 -0
  482. package/dist/telemetry/trace_types.js +141 -0
  483. package/dist/telemetry/trace_types.js.map +1 -0
  484. package/dist/telemetry/traces.cjs +484 -0
  485. package/dist/telemetry/traces.cjs.map +1 -0
  486. package/dist/telemetry/traces.d.cts +116 -0
  487. package/dist/telemetry/traces.d.ts +116 -0
  488. package/dist/telemetry/traces.d.ts.map +1 -0
  489. package/dist/telemetry/traces.js +449 -0
  490. package/dist/telemetry/traces.js.map +1 -0
  491. package/dist/telemetry/utils.cjs +86 -0
  492. package/dist/telemetry/utils.cjs.map +1 -0
  493. package/dist/telemetry/utils.d.cts +5 -0
  494. package/dist/telemetry/utils.d.ts +5 -0
  495. package/dist/telemetry/utils.d.ts.map +1 -0
  496. package/dist/telemetry/utils.js +51 -0
  497. package/dist/telemetry/utils.js.map +1 -0
  498. package/dist/tokenize/basic/sentence.cjs +3 -3
  499. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  500. package/dist/tokenize/basic/sentence.js +3 -3
  501. package/dist/tokenize/basic/sentence.js.map +1 -1
  502. package/dist/tokenize/tokenizer.test.cjs +3 -1
  503. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  504. package/dist/tokenize/tokenizer.test.js +3 -1
  505. package/dist/tokenize/tokenizer.test.js.map +1 -1
  506. package/dist/transcription.cjs.map +1 -1
  507. package/dist/transcription.d.cts +6 -0
  508. package/dist/transcription.d.ts +6 -0
  509. package/dist/transcription.d.ts.map +1 -1
  510. package/dist/transcription.js.map +1 -1
  511. package/dist/tts/fallback_adapter.cjs +466 -0
  512. package/dist/tts/fallback_adapter.cjs.map +1 -0
  513. package/dist/tts/fallback_adapter.d.cts +110 -0
  514. package/dist/tts/fallback_adapter.d.ts +110 -0
  515. package/dist/tts/fallback_adapter.d.ts.map +1 -0
  516. package/dist/tts/fallback_adapter.js +442 -0
  517. package/dist/tts/fallback_adapter.js.map +1 -0
  518. package/dist/tts/index.cjs +3 -0
  519. package/dist/tts/index.cjs.map +1 -1
  520. package/dist/tts/index.d.cts +1 -0
  521. package/dist/tts/index.d.ts +1 -0
  522. package/dist/tts/index.d.ts.map +1 -1
  523. package/dist/tts/index.js +2 -0
  524. package/dist/tts/index.js.map +1 -1
  525. package/dist/tts/stream_adapter.cjs +25 -8
  526. package/dist/tts/stream_adapter.cjs.map +1 -1
  527. package/dist/tts/stream_adapter.d.cts +6 -3
  528. package/dist/tts/stream_adapter.d.ts +6 -3
  529. package/dist/tts/stream_adapter.d.ts.map +1 -1
  530. package/dist/tts/stream_adapter.js +25 -8
  531. package/dist/tts/stream_adapter.js.map +1 -1
  532. package/dist/tts/tts.cjs +189 -57
  533. package/dist/tts/tts.cjs.map +1 -1
  534. package/dist/tts/tts.d.cts +58 -6
  535. package/dist/tts/tts.d.ts +58 -6
  536. package/dist/tts/tts.d.ts.map +1 -1
  537. package/dist/tts/tts.js +191 -59
  538. package/dist/tts/tts.js.map +1 -1
  539. package/dist/types.cjs +24 -32
  540. package/dist/types.cjs.map +1 -1
  541. package/dist/types.d.cts +45 -10
  542. package/dist/types.d.ts +45 -10
  543. package/dist/types.d.ts.map +1 -1
  544. package/dist/types.js +20 -30
  545. package/dist/types.js.map +1 -1
  546. package/dist/utils.cjs +122 -26
  547. package/dist/utils.cjs.map +1 -1
  548. package/dist/utils.d.cts +41 -1
  549. package/dist/utils.d.ts +41 -1
  550. package/dist/utils.d.ts.map +1 -1
  551. package/dist/utils.js +117 -25
  552. package/dist/utils.js.map +1 -1
  553. package/dist/utils.test.cjs +73 -1
  554. package/dist/utils.test.cjs.map +1 -1
  555. package/dist/utils.test.js +74 -10
  556. package/dist/utils.test.js.map +1 -1
  557. package/dist/vad.cjs +35 -15
  558. package/dist/vad.cjs.map +1 -1
  559. package/dist/vad.d.cts +15 -5
  560. package/dist/vad.d.ts +15 -5
  561. package/dist/vad.d.ts.map +1 -1
  562. package/dist/vad.js +35 -15
  563. package/dist/vad.js.map +1 -1
  564. package/dist/version.cjs +1 -1
  565. package/dist/version.cjs.map +1 -1
  566. package/dist/version.d.cts +1 -1
  567. package/dist/version.d.ts +1 -1
  568. package/dist/version.d.ts.map +1 -1
  569. package/dist/version.js +1 -1
  570. package/dist/version.js.map +1 -1
  571. package/dist/voice/agent.cjs +258 -35
  572. package/dist/voice/agent.cjs.map +1 -1
  573. package/dist/voice/agent.d.cts +54 -13
  574. package/dist/voice/agent.d.ts +54 -13
  575. package/dist/voice/agent.d.ts.map +1 -1
  576. package/dist/voice/agent.js +254 -34
  577. package/dist/voice/agent.js.map +1 -1
  578. package/dist/voice/agent.test.cjs +314 -0
  579. package/dist/voice/agent.test.cjs.map +1 -1
  580. package/dist/voice/agent.test.js +316 -2
  581. package/dist/voice/agent.test.js.map +1 -1
  582. package/dist/voice/agent_activity.cjs +1116 -385
  583. package/dist/voice/agent_activity.cjs.map +1 -1
  584. package/dist/voice/agent_activity.d.cts +72 -11
  585. package/dist/voice/agent_activity.d.ts +72 -11
  586. package/dist/voice/agent_activity.d.ts.map +1 -1
  587. package/dist/voice/agent_activity.js +1119 -383
  588. package/dist/voice/agent_activity.js.map +1 -1
  589. package/dist/voice/agent_activity.test.cjs +135 -0
  590. package/dist/voice/agent_activity.test.cjs.map +1 -0
  591. package/dist/voice/agent_activity.test.js +134 -0
  592. package/dist/voice/agent_activity.test.js.map +1 -0
  593. package/dist/voice/agent_session.cjs +550 -90
  594. package/dist/voice/agent_session.cjs.map +1 -1
  595. package/dist/voice/agent_session.d.cts +185 -25
  596. package/dist/voice/agent_session.d.ts +185 -25
  597. package/dist/voice/agent_session.d.ts.map +1 -1
  598. package/dist/voice/agent_session.js +556 -91
  599. package/dist/voice/agent_session.js.map +1 -1
  600. package/dist/voice/audio_recognition.cjs +605 -46
  601. package/dist/voice/audio_recognition.cjs.map +1 -1
  602. package/dist/voice/audio_recognition.d.cts +96 -4
  603. package/dist/voice/audio_recognition.d.ts +96 -4
  604. package/dist/voice/audio_recognition.d.ts.map +1 -1
  605. package/dist/voice/audio_recognition.js +611 -47
  606. package/dist/voice/audio_recognition.js.map +1 -1
  607. package/dist/voice/audio_recognition_span.test.cjs +295 -0
  608. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  609. package/dist/voice/audio_recognition_span.test.js +299 -0
  610. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  611. package/dist/voice/avatar/datastream_io.cjs +7 -1
  612. package/dist/voice/avatar/datastream_io.cjs.map +1 -1
  613. package/dist/voice/avatar/datastream_io.d.cts +1 -0
  614. package/dist/voice/avatar/datastream_io.d.ts +1 -0
  615. package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
  616. package/dist/voice/avatar/datastream_io.js +7 -1
  617. package/dist/voice/avatar/datastream_io.js.map +1 -1
  618. package/dist/voice/background_audio.cjs +367 -0
  619. package/dist/voice/background_audio.cjs.map +1 -0
  620. package/dist/voice/background_audio.d.cts +123 -0
  621. package/dist/voice/background_audio.d.ts +123 -0
  622. package/dist/voice/background_audio.d.ts.map +1 -0
  623. package/dist/voice/background_audio.js +343 -0
  624. package/dist/voice/background_audio.js.map +1 -0
  625. package/dist/voice/events.cjs +3 -0
  626. package/dist/voice/events.cjs.map +1 -1
  627. package/dist/voice/events.d.cts +16 -9
  628. package/dist/voice/events.d.ts +16 -9
  629. package/dist/voice/events.d.ts.map +1 -1
  630. package/dist/voice/events.js +3 -0
  631. package/dist/voice/events.js.map +1 -1
  632. package/dist/voice/generation.cjs +205 -41
  633. package/dist/voice/generation.cjs.map +1 -1
  634. package/dist/voice/generation.d.cts +21 -5
  635. package/dist/voice/generation.d.ts +21 -5
  636. package/dist/voice/generation.d.ts.map +1 -1
  637. package/dist/voice/generation.js +215 -43
  638. package/dist/voice/generation.js.map +1 -1
  639. package/dist/voice/generation_tools.test.cjs +236 -0
  640. package/dist/voice/generation_tools.test.cjs.map +1 -0
  641. package/dist/voice/generation_tools.test.js +235 -0
  642. package/dist/voice/generation_tools.test.js.map +1 -0
  643. package/dist/voice/index.cjs +33 -2
  644. package/dist/voice/index.cjs.map +1 -1
  645. package/dist/voice/index.d.cts +8 -2
  646. package/dist/voice/index.d.ts +8 -2
  647. package/dist/voice/index.d.ts.map +1 -1
  648. package/dist/voice/index.js +19 -2
  649. package/dist/voice/index.js.map +1 -1
  650. package/dist/voice/interruption_detection.test.cjs +114 -0
  651. package/dist/voice/interruption_detection.test.cjs.map +1 -0
  652. package/dist/voice/interruption_detection.test.js +113 -0
  653. package/dist/voice/interruption_detection.test.js.map +1 -0
  654. package/dist/voice/io.cjs +66 -6
  655. package/dist/voice/io.cjs.map +1 -1
  656. package/dist/voice/io.d.cts +67 -7
  657. package/dist/voice/io.d.ts +67 -7
  658. package/dist/voice/io.d.ts.map +1 -1
  659. package/dist/voice/io.js +62 -5
  660. package/dist/voice/io.js.map +1 -1
  661. package/dist/voice/recorder_io/index.cjs +23 -0
  662. package/dist/voice/recorder_io/index.cjs.map +1 -0
  663. package/dist/voice/recorder_io/index.d.cts +2 -0
  664. package/dist/voice/recorder_io/index.d.ts +2 -0
  665. package/dist/voice/recorder_io/index.d.ts.map +1 -0
  666. package/dist/voice/recorder_io/index.js +2 -0
  667. package/dist/voice/recorder_io/index.js.map +1 -0
  668. package/dist/voice/recorder_io/recorder_io.cjs +607 -0
  669. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -0
  670. package/dist/voice/recorder_io/recorder_io.d.cts +106 -0
  671. package/dist/voice/recorder_io/recorder_io.d.ts +106 -0
  672. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -0
  673. package/dist/voice/recorder_io/recorder_io.js +573 -0
  674. package/dist/voice/recorder_io/recorder_io.js.map +1 -0
  675. package/dist/voice/remote_session.cjs +922 -0
  676. package/dist/voice/remote_session.cjs.map +1 -0
  677. package/dist/voice/remote_session.d.cts +108 -0
  678. package/dist/voice/remote_session.d.ts +108 -0
  679. package/dist/voice/remote_session.d.ts.map +1 -0
  680. package/dist/voice/remote_session.js +887 -0
  681. package/dist/voice/remote_session.js.map +1 -0
  682. package/dist/voice/report.cjs +88 -0
  683. package/dist/voice/report.cjs.map +1 -0
  684. package/dist/voice/report.d.cts +49 -0
  685. package/dist/voice/report.d.ts +49 -0
  686. package/dist/voice/report.d.ts.map +1 -0
  687. package/dist/voice/report.js +63 -0
  688. package/dist/voice/report.js.map +1 -0
  689. package/dist/voice/report.test.cjs +121 -0
  690. package/dist/voice/report.test.cjs.map +1 -0
  691. package/dist/voice/report.test.js +120 -0
  692. package/dist/voice/report.test.js.map +1 -0
  693. package/dist/voice/room_io/_input.cjs +40 -7
  694. package/dist/voice/room_io/_input.cjs.map +1 -1
  695. package/dist/voice/room_io/_input.d.cts +5 -2
  696. package/dist/voice/room_io/_input.d.ts +5 -2
  697. package/dist/voice/room_io/_input.d.ts.map +1 -1
  698. package/dist/voice/room_io/_input.js +41 -8
  699. package/dist/voice/room_io/_input.js.map +1 -1
  700. package/dist/voice/room_io/_output.cjs +19 -11
  701. package/dist/voice/room_io/_output.cjs.map +1 -1
  702. package/dist/voice/room_io/_output.d.cts +7 -4
  703. package/dist/voice/room_io/_output.d.ts +7 -4
  704. package/dist/voice/room_io/_output.d.ts.map +1 -1
  705. package/dist/voice/room_io/_output.js +20 -12
  706. package/dist/voice/room_io/_output.js.map +1 -1
  707. package/dist/voice/room_io/room_io.cjs +33 -6
  708. package/dist/voice/room_io/room_io.cjs.map +1 -1
  709. package/dist/voice/room_io/room_io.d.cts +29 -9
  710. package/dist/voice/room_io/room_io.d.ts +29 -9
  711. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  712. package/dist/voice/room_io/room_io.js +33 -7
  713. package/dist/voice/room_io/room_io.js.map +1 -1
  714. package/dist/voice/speech_handle.cjs +22 -4
  715. package/dist/voice/speech_handle.cjs.map +1 -1
  716. package/dist/voice/speech_handle.d.cts +17 -2
  717. package/dist/voice/speech_handle.d.ts +17 -2
  718. package/dist/voice/speech_handle.d.ts.map +1 -1
  719. package/dist/voice/speech_handle.js +21 -4
  720. package/dist/voice/speech_handle.js.map +1 -1
  721. package/dist/voice/testing/fake_llm.cjs +127 -0
  722. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  723. package/dist/voice/testing/fake_llm.d.cts +30 -0
  724. package/dist/voice/testing/fake_llm.d.ts +30 -0
  725. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  726. package/dist/voice/testing/fake_llm.js +103 -0
  727. package/dist/voice/testing/fake_llm.js.map +1 -0
  728. package/dist/voice/testing/index.cjs +57 -0
  729. package/dist/voice/testing/index.cjs.map +1 -0
  730. package/dist/voice/testing/index.d.cts +21 -0
  731. package/dist/voice/testing/index.d.ts +21 -0
  732. package/dist/voice/testing/index.d.ts.map +1 -0
  733. package/dist/voice/testing/index.js +35 -0
  734. package/dist/voice/testing/index.js.map +1 -0
  735. package/dist/voice/testing/run_result.cjs +817 -0
  736. package/dist/voice/testing/run_result.cjs.map +1 -0
  737. package/dist/voice/testing/run_result.d.cts +385 -0
  738. package/dist/voice/testing/run_result.d.ts +385 -0
  739. package/dist/voice/testing/run_result.d.ts.map +1 -0
  740. package/dist/voice/testing/run_result.js +790 -0
  741. package/dist/voice/testing/run_result.js.map +1 -0
  742. package/dist/voice/testing/types.cjs +46 -0
  743. package/dist/voice/testing/types.cjs.map +1 -0
  744. package/dist/voice/testing/types.d.cts +83 -0
  745. package/dist/voice/testing/types.d.ts +83 -0
  746. package/dist/voice/testing/types.d.ts.map +1 -0
  747. package/dist/voice/testing/types.js +19 -0
  748. package/dist/voice/testing/types.js.map +1 -0
  749. package/dist/voice/transcription/synchronizer.cjs +139 -15
  750. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  751. package/dist/voice/transcription/synchronizer.d.cts +35 -4
  752. package/dist/voice/transcription/synchronizer.d.ts +35 -4
  753. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  754. package/dist/voice/transcription/synchronizer.js +143 -16
  755. package/dist/voice/transcription/synchronizer.js.map +1 -1
  756. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  757. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  758. package/dist/voice/transcription/synchronizer.test.js +150 -0
  759. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  760. package/dist/voice/turn_config/endpointing.cjs +33 -0
  761. package/dist/voice/turn_config/endpointing.cjs.map +1 -0
  762. package/dist/voice/turn_config/endpointing.d.cts +30 -0
  763. package/dist/voice/turn_config/endpointing.d.ts +30 -0
  764. package/dist/voice/turn_config/endpointing.d.ts.map +1 -0
  765. package/dist/voice/turn_config/endpointing.js +9 -0
  766. package/dist/voice/turn_config/endpointing.js.map +1 -0
  767. package/dist/voice/turn_config/interruption.cjs +37 -0
  768. package/dist/voice/turn_config/interruption.cjs.map +1 -0
  769. package/dist/voice/turn_config/interruption.d.cts +53 -0
  770. package/dist/voice/turn_config/interruption.d.ts +53 -0
  771. package/dist/voice/turn_config/interruption.d.ts.map +1 -0
  772. package/dist/voice/turn_config/interruption.js +13 -0
  773. package/dist/voice/turn_config/interruption.js.map +1 -0
  774. package/dist/voice/turn_config/turn_handling.cjs +35 -0
  775. package/dist/voice/turn_config/turn_handling.cjs.map +1 -0
  776. package/dist/voice/turn_config/turn_handling.d.cts +36 -0
  777. package/dist/voice/turn_config/turn_handling.d.ts +36 -0
  778. package/dist/voice/turn_config/turn_handling.d.ts.map +1 -0
  779. package/dist/voice/turn_config/turn_handling.js +11 -0
  780. package/dist/voice/turn_config/turn_handling.js.map +1 -0
  781. package/dist/voice/turn_config/utils.cjs +157 -0
  782. package/dist/voice/turn_config/utils.cjs.map +1 -0
  783. package/dist/voice/turn_config/utils.d.cts +37 -0
  784. package/dist/voice/turn_config/utils.d.ts +37 -0
  785. package/dist/voice/turn_config/utils.d.ts.map +1 -0
  786. package/dist/voice/turn_config/utils.js +131 -0
  787. package/dist/voice/turn_config/utils.js.map +1 -0
  788. package/dist/voice/turn_config/utils.test.cjs +128 -0
  789. package/dist/voice/turn_config/utils.test.cjs.map +1 -0
  790. package/dist/voice/turn_config/utils.test.js +127 -0
  791. package/dist/voice/turn_config/utils.test.js.map +1 -0
  792. package/dist/voice/utils.cjs +47 -0
  793. package/dist/voice/utils.cjs.map +1 -0
  794. package/dist/voice/utils.d.cts +4 -0
  795. package/dist/voice/utils.d.ts +4 -0
  796. package/dist/voice/utils.d.ts.map +1 -0
  797. package/dist/voice/utils.js +23 -0
  798. package/dist/voice/utils.js.map +1 -0
  799. package/dist/worker.cjs +44 -52
  800. package/dist/worker.cjs.map +1 -1
  801. package/dist/worker.d.cts +18 -8
  802. package/dist/worker.d.ts +18 -8
  803. package/dist/worker.d.ts.map +1 -1
  804. package/dist/worker.js +43 -43
  805. package/dist/worker.js.map +1 -1
  806. package/package.json +32 -12
  807. package/resources/NOTICE +2 -0
  808. package/resources/keyboard-typing.ogg +0 -0
  809. package/resources/keyboard-typing2.ogg +0 -0
  810. package/resources/office-ambience.ogg +0 -0
  811. package/src/audio.ts +132 -1
  812. package/src/beta/index.ts +9 -0
  813. package/src/beta/workflows/index.ts +9 -0
  814. package/src/beta/workflows/task_group.ts +194 -0
  815. package/src/cli.ts +57 -66
  816. package/src/connection_pool.test.ts +346 -0
  817. package/src/connection_pool.ts +307 -0
  818. package/src/constants.ts +14 -0
  819. package/src/cpu.test.ts +239 -0
  820. package/src/cpu.ts +173 -0
  821. package/src/http_server.ts +18 -6
  822. package/src/index.ts +15 -13
  823. package/src/inference/api_protos.ts +85 -2
  824. package/src/inference/index.ts +32 -4
  825. package/src/inference/interruption/defaults.ts +51 -0
  826. package/src/inference/interruption/errors.ts +25 -0
  827. package/src/inference/interruption/http_transport.ts +206 -0
  828. package/src/inference/interruption/interruption_cache_entry.ts +50 -0
  829. package/src/inference/interruption/interruption_detector.ts +204 -0
  830. package/src/inference/interruption/interruption_stream.ts +467 -0
  831. package/src/inference/interruption/types.ts +84 -0
  832. package/src/inference/interruption/utils.test.ts +132 -0
  833. package/src/inference/interruption/utils.ts +137 -0
  834. package/src/inference/interruption/ws_transport.ts +406 -0
  835. package/src/inference/llm.ts +214 -163
  836. package/src/inference/stt.test.ts +253 -0
  837. package/src/inference/stt.ts +449 -208
  838. package/src/inference/tts.test.ts +267 -0
  839. package/src/inference/tts.ts +377 -115
  840. package/src/inference/utils.ts +30 -2
  841. package/src/ipc/inference_proc_executor.ts +11 -3
  842. package/src/ipc/inference_proc_lazy_main.ts +13 -1
  843. package/src/ipc/job_proc_executor.ts +11 -1
  844. package/src/ipc/job_proc_lazy_main.ts +86 -20
  845. package/src/ipc/supervised_proc.test.ts +153 -0
  846. package/src/ipc/supervised_proc.ts +39 -10
  847. package/src/job.ts +120 -1
  848. package/src/language.test.ts +62 -0
  849. package/src/language.ts +380 -0
  850. package/src/llm/__snapshots__/zod-utils.test.ts.snap +559 -0
  851. package/src/llm/chat_context.test.ts +655 -0
  852. package/src/llm/chat_context.ts +412 -2
  853. package/src/llm/fallback_adapter.test.ts +238 -0
  854. package/src/llm/fallback_adapter.ts +391 -0
  855. package/src/llm/index.ts +11 -0
  856. package/src/llm/llm.ts +77 -12
  857. package/src/llm/provider_format/google.test.ts +72 -1
  858. package/src/llm/provider_format/google.ts +10 -6
  859. package/src/llm/provider_format/index.ts +7 -2
  860. package/src/llm/provider_format/openai.test.ts +480 -2
  861. package/src/llm/provider_format/openai.ts +152 -21
  862. package/src/llm/provider_format/utils.ts +11 -5
  863. package/src/llm/realtime.ts +23 -2
  864. package/src/llm/remote_chat_context.ts +2 -2
  865. package/src/llm/tool_context.test.ts +210 -1
  866. package/src/llm/tool_context.ts +115 -17
  867. package/src/llm/utils.ts +24 -16
  868. package/src/llm/zod-utils.test.ts +577 -0
  869. package/src/llm/zod-utils.ts +153 -0
  870. package/src/log.ts +71 -19
  871. package/src/metrics/base.ts +78 -19
  872. package/src/metrics/index.ts +12 -0
  873. package/src/metrics/model_usage.test.ts +545 -0
  874. package/src/metrics/model_usage.ts +262 -0
  875. package/src/metrics/usage_collector.ts +14 -3
  876. package/src/metrics/utils.ts +27 -7
  877. package/src/stream/deferred_stream.test.ts +3 -3
  878. package/src/stream/deferred_stream.ts +43 -11
  879. package/src/stream/index.ts +1 -0
  880. package/src/stream/multi_input_stream.test.ts +545 -0
  881. package/src/stream/multi_input_stream.ts +172 -0
  882. package/src/stream/stream_channel.test.ts +37 -0
  883. package/src/stream/stream_channel.ts +43 -3
  884. package/src/stt/stream_adapter.ts +30 -9
  885. package/src/stt/stt.ts +131 -22
  886. package/src/telemetry/index.ts +28 -0
  887. package/src/telemetry/logging.ts +55 -0
  888. package/src/telemetry/otel_http_exporter.ts +218 -0
  889. package/src/telemetry/pino_otel_transport.ts +265 -0
  890. package/src/telemetry/trace_types.ts +109 -0
  891. package/src/telemetry/traces.ts +673 -0
  892. package/src/telemetry/utils.ts +61 -0
  893. package/src/tokenize/basic/sentence.ts +3 -3
  894. package/src/tokenize/tokenizer.test.ts +4 -0
  895. package/src/transcription.ts +6 -0
  896. package/src/tts/fallback_adapter.ts +579 -0
  897. package/src/tts/index.ts +1 -0
  898. package/src/tts/stream_adapter.ts +38 -8
  899. package/src/tts/tts.ts +245 -62
  900. package/src/types.ts +62 -33
  901. package/src/utils.test.ts +90 -10
  902. package/src/utils.ts +176 -31
  903. package/src/vad.ts +42 -18
  904. package/src/version.ts +1 -1
  905. package/src/voice/agent.test.ts +347 -2
  906. package/src/voice/agent.ts +346 -44
  907. package/src/voice/agent_activity.test.ts +194 -0
  908. package/src/voice/agent_activity.ts +1457 -388
  909. package/src/voice/agent_session.ts +817 -112
  910. package/src/voice/audio_recognition.ts +845 -70
  911. package/src/voice/audio_recognition_span.test.ts +341 -0
  912. package/src/voice/avatar/datastream_io.ts +9 -1
  913. package/src/voice/background_audio.ts +494 -0
  914. package/src/voice/events.ts +27 -7
  915. package/src/voice/generation.ts +310 -56
  916. package/src/voice/generation_tools.test.ts +268 -0
  917. package/src/voice/index.ts +17 -3
  918. package/src/voice/interruption_detection.test.ts +151 -0
  919. package/src/voice/io.ts +115 -12
  920. package/src/voice/recorder_io/index.ts +4 -0
  921. package/src/voice/recorder_io/recorder_io.ts +783 -0
  922. package/src/voice/remote_session.ts +1083 -0
  923. package/src/voice/report.test.ts +136 -0
  924. package/src/voice/report.ts +140 -0
  925. package/src/voice/room_io/_input.ts +45 -10
  926. package/src/voice/room_io/_output.ts +26 -14
  927. package/src/voice/room_io/room_io.ts +67 -22
  928. package/src/voice/speech_handle.ts +38 -6
  929. package/src/voice/testing/fake_llm.ts +138 -0
  930. package/src/voice/testing/index.ts +52 -0
  931. package/src/voice/testing/run_result.ts +995 -0
  932. package/src/voice/testing/types.ts +118 -0
  933. package/src/voice/transcription/synchronizer.test.ts +206 -0
  934. package/src/voice/transcription/synchronizer.ts +204 -19
  935. package/src/voice/turn_config/endpointing.ts +33 -0
  936. package/src/voice/turn_config/interruption.ts +56 -0
  937. package/src/voice/turn_config/turn_handling.ts +45 -0
  938. package/src/voice/turn_config/utils.test.ts +148 -0
  939. package/src/voice/turn_config/utils.ts +167 -0
  940. package/src/voice/utils.ts +29 -0
  941. package/src/worker.ts +92 -78
  942. package/src/llm/__snapshots__/utils.test.ts.snap +0 -65
@@ -1,73 +1,150 @@
1
1
  // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
+ import type { ParticipantKind } from '@livekit/rtc-node';
4
5
  import { AudioFrame } from '@livekit/rtc-node';
6
+ import {
7
+ type Context,
8
+ ROOT_CONTEXT,
9
+ type Span,
10
+ context as otelContext,
11
+ trace,
12
+ } from '@opentelemetry/api';
5
13
  import type { WritableStreamDefaultWriter } from 'node:stream/web';
6
14
  import { ReadableStream } from 'node:stream/web';
15
+ import { isAPIError } from '../_exceptions.js';
16
+ import { apiConnectDefaults, intervalForRetry } from '../inference/interruption/defaults.js';
17
+ import { InterruptionDetectionError } from '../inference/interruption/errors.js';
18
+ import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
19
+ import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js';
20
+ import {
21
+ type InterruptionSentinel,
22
+ type OverlappingSpeechEvent,
23
+ } from '../inference/interruption/types.js';
24
+ import type { LanguageCode } from '../language.js';
7
25
  import { type ChatContext } from '../llm/chat_context.js';
8
26
  import { log } from '../log.js';
9
27
  import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
10
28
  import { IdentityTransform } from '../stream/identity_transform.js';
11
29
  import { mergeReadableStreams } from '../stream/merge_readable_streams.js';
30
+ import { type StreamChannel, createStreamChannel } from '../stream/stream_channel.js';
12
31
  import { type SpeechEvent, SpeechEventType } from '../stt/stt.js';
13
- import { Task, delay } from '../utils.js';
32
+ import { traceTypes, tracer } from '../telemetry/index.js';
33
+ import { Task, delay, waitForAbort } from '../utils.js';
14
34
  import { type VAD, type VADEvent, VADEventType } from '../vad.js';
15
35
  import type { TurnDetectionMode } from './agent_session.js';
16
36
  import type { STTNode } from './io.js';
37
+ import { setParticipantSpanAttributes } from './utils.js';
17
38
 
18
39
  export interface EndOfTurnInfo {
40
+ /** The new transcript text from the user's speech. */
19
41
  newTranscript: string;
42
+ /** Confidence score of the transcript (0-1). */
43
+ transcriptConfidence: number;
44
+ /** Delay from speech stop to final transcription in milliseconds. */
20
45
  transcriptionDelay: number;
46
+ /** Delay from speech stop to end of utterance detection in milliseconds. */
21
47
  endOfUtteranceDelay: number;
48
+ /** Timestamp when user started speaking (milliseconds since epoch). */
49
+ startedSpeakingAt: number | undefined;
50
+ /** Timestamp when user stopped speaking (milliseconds since epoch). */
51
+ stoppedSpeakingAt: number | undefined;
52
+ }
53
+
54
+ export interface PreemptiveGenerationInfo {
55
+ newTranscript: string;
56
+ transcriptConfidence: number;
22
57
  }
23
58
 
24
59
  export interface RecognitionHooks {
60
+ onInterruption: (ev: OverlappingSpeechEvent) => void;
25
61
  onStartOfSpeech: (ev: VADEvent) => void;
26
62
  onVADInferenceDone: (ev: VADEvent) => void;
27
63
  onEndOfSpeech: (ev: VADEvent) => void;
28
64
  onInterimTranscript: (ev: SpeechEvent) => void;
29
65
  onFinalTranscript: (ev: SpeechEvent) => void;
30
66
  onEndOfTurn: (info: EndOfTurnInfo) => Promise<boolean>;
67
+ onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
31
68
 
32
69
  retrieveChatCtx: () => ChatContext;
33
70
  }
34
71
 
35
72
  export interface _TurnDetector {
36
- unlikelyThreshold: (language?: string) => Promise<number | undefined>;
37
- supportsLanguage: (language?: string) => Promise<boolean>;
38
- predictEndOfTurn(chatCtx: ChatContext): Promise<number>;
73
+ /** The model name used by this turn detector. */
74
+ readonly model: string;
75
+ /** The provider name for this turn detector. */
76
+ readonly provider: string;
77
+ unlikelyThreshold: (language?: LanguageCode) => Promise<number | undefined>;
78
+ supportsLanguage: (language?: LanguageCode) => Promise<boolean>;
79
+ predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
39
80
  }
40
81
 
41
82
  export interface AudioRecognitionOptions {
83
+ /** Hooks for recognition events. */
42
84
  recognitionHooks: RecognitionHooks;
85
+ /** Speech-to-text node. */
43
86
  stt?: STTNode;
87
+ /** Voice activity detection. */
44
88
  vad?: VAD;
89
+ /** Turn detector for end-of-turn prediction. */
45
90
  turnDetector?: _TurnDetector;
46
- turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
91
+ /** Turn detection mode. */
92
+ turnDetectionMode?: TurnDetectionMode;
93
+ interruptionDetection?: AdaptiveInterruptionDetector;
94
+ /** Minimum endpointing delay in milliseconds. */
47
95
  minEndpointingDelay: number;
96
+ /** Maximum endpointing delay in milliseconds. */
48
97
  maxEndpointingDelay: number;
98
+ /** Root span context for tracing. */
99
+ rootSpanContext?: Context;
100
+ /** STT model name for tracing */
101
+ sttModel?: string;
102
+ /** STT provider name for tracing */
103
+ sttProvider?: string;
104
+ /** Getter for linked participant for span attribution */
105
+ getLinkedParticipant?: () => ParticipantLike | undefined;
106
+ }
107
+
108
+ /**
109
+ * Minimal participant shape for span attribution.
110
+ * Compatible with both `LocalParticipant` and `RemoteParticipant` from `@livekit/rtc-node`.
111
+ */
112
+ export interface ParticipantLike {
113
+ sid: string | undefined;
114
+ identity: string;
115
+ kind: ParticipantKind;
49
116
  }
50
117
 
118
+ // TODO add ability to update stt/vad/interruption-detection
51
119
  export class AudioRecognition {
52
120
  private hooks: RecognitionHooks;
53
121
  private stt?: STTNode;
54
122
  private vad?: VAD;
55
123
  private turnDetector?: _TurnDetector;
56
- private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
124
+ private turnDetectionMode?: TurnDetectionMode;
57
125
  private minEndpointingDelay: number;
58
126
  private maxEndpointingDelay: number;
59
- private lastLanguage?: string;
127
+ private lastLanguage?: LanguageCode;
128
+ private rootSpanContext?: Context;
129
+ private sttModel?: string;
130
+ private sttProvider?: string;
131
+ private getLinkedParticipant?: () => ParticipantLike | undefined;
60
132
 
61
133
  private deferredInputStream: DeferredReadableStream<AudioFrame>;
62
134
  private logger = log();
63
135
  private lastFinalTranscriptTime = 0;
64
136
  private audioTranscript = '';
65
137
  private audioInterimTranscript = '';
66
- private lastSpeakingTime = 0;
138
+ private audioPreflightTranscript = '';
139
+ private finalTranscriptConfidence: number[] = [];
140
+ private lastSpeakingTime: number | undefined;
141
+ private speechStartTime: number | undefined;
67
142
  private userTurnCommitted = false;
68
143
  private speaking = false;
69
144
  private sampleRate?: number;
70
145
 
146
+ private userTurnSpan?: Span;
147
+
71
148
  private vadInputStream: ReadableStream<AudioFrame>;
72
149
  private sttInputStream: ReadableStream<AudioFrame>;
73
150
  private silenceAudioTransform = new IdentityTransform<AudioFrame>();
@@ -78,6 +155,16 @@ export class AudioRecognition {
78
155
  private commitUserTurnTask?: Task<void>;
79
156
  private vadTask?: Task<void>;
80
157
  private sttTask?: Task<void>;
158
+ private interruptionTask?: Task<void>;
159
+
160
+ // interruption detection
161
+ private interruptionDetection?: AdaptiveInterruptionDetector;
162
+ private _inputStartedAt?: number;
163
+ private ignoreUserTranscriptUntil?: number;
164
+ private transcriptBuffer: SpeechEvent[];
165
+ private isInterruptionEnabled: boolean;
166
+ private isAgentSpeaking: boolean;
167
+ private interruptionStreamChannel?: StreamChannel<InterruptionSentinel | AudioFrame>;
81
168
 
82
169
  constructor(opts: AudioRecognitionOptions) {
83
170
  this.hooks = opts.recognitionHooks;
@@ -88,11 +175,35 @@ export class AudioRecognition {
88
175
  this.minEndpointingDelay = opts.minEndpointingDelay;
89
176
  this.maxEndpointingDelay = opts.maxEndpointingDelay;
90
177
  this.lastLanguage = undefined;
178
+ this.rootSpanContext = opts.rootSpanContext;
179
+ this.sttModel = opts.sttModel;
180
+ this.sttProvider = opts.sttProvider;
181
+ this.getLinkedParticipant = opts.getLinkedParticipant;
91
182
 
92
183
  this.deferredInputStream = new DeferredReadableStream<AudioFrame>();
93
- const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
94
- this.vadInputStream = vadInputStream;
95
- this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable);
184
+ this.interruptionDetection = opts.interruptionDetection;
185
+ this.transcriptBuffer = [];
186
+ this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad);
187
+ this.isAgentSpeaking = false;
188
+
189
+ if (opts.interruptionDetection) {
190
+ const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee();
191
+ const [inputStream, sttInputStream] = teedInput.tee();
192
+ this.vadInputStream = vadInputStream;
193
+ this.sttInputStream = mergeReadableStreams(
194
+ sttInputStream,
195
+ this.silenceAudioTransform.readable,
196
+ );
197
+ this.interruptionStreamChannel = createStreamChannel();
198
+ this.interruptionStreamChannel.addStreamInput(inputStream);
199
+ } else {
200
+ const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
201
+ this.vadInputStream = vadInputStream;
202
+ this.sttInputStream = mergeReadableStreams(
203
+ sttInputStream,
204
+ this.silenceAudioTransform.readable,
205
+ );
206
+ }
96
207
  this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
97
208
  }
98
209
 
@@ -106,6 +217,16 @@ export class AudioRecognition {
106
217
  return this.audioTranscript;
107
218
  }
108
219
 
220
+ /** @internal */
221
+ get inputStartedAt() {
222
+ return this._inputStartedAt;
223
+ }
224
+
225
+ /** @internal */
226
+ updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void {
227
+ this.turnDetectionMode = options.turnDetection;
228
+ }
229
+
109
230
  async start() {
110
231
  this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal));
111
232
  this.vadTask.result.catch((err) => {
@@ -116,6 +237,251 @@ export class AudioRecognition {
116
237
  this.sttTask.result.catch((err) => {
117
238
  this.logger.error(`Error running STT task: ${err}`);
118
239
  });
240
+
241
+ this.interruptionTask = Task.from(({ signal }) =>
242
+ this.createInterruptionTask(this.interruptionDetection, signal),
243
+ );
244
+ this.interruptionTask.result.catch((err) => {
245
+ this.logger.error(`Error running interruption task: ${err}`);
246
+ });
247
+ }
248
+
249
+ async stop() {
250
+ await this.sttTask?.cancelAndWait();
251
+ await this.vadTask?.cancelAndWait();
252
+ await this.interruptionTask?.cancelAndWait();
253
+ }
254
+
255
+ async disableInterruptionDetection(): Promise<void> {
256
+ this.isInterruptionEnabled = false;
257
+ this.interruptionDetection = undefined;
258
+ await this.interruptionTask?.cancelAndWait();
259
+ this.interruptionTask = undefined;
260
+ await this.interruptionStreamChannel?.close();
261
+ this.interruptionStreamChannel = undefined;
262
+ }
263
+
264
+ async onStartOfAgentSpeech() {
265
+ this.isAgentSpeaking = true;
266
+ return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted());
267
+ }
268
+
269
+ async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) {
270
+ if (!this.isInterruptionEnabled) {
271
+ this.isAgentSpeaking = false;
272
+ return;
273
+ }
274
+
275
+ const inputOpen = await this.trySendInterruptionSentinel(
276
+ InterruptionStreamSentinel.agentSpeechEnded(),
277
+ );
278
+ if (!inputOpen) {
279
+ this.isAgentSpeaking = false;
280
+ return;
281
+ }
282
+
283
+ if (this.isAgentSpeaking) {
284
+ if (this.ignoreUserTranscriptUntil === undefined) {
285
+ this.onEndOfOverlapSpeech(Date.now());
286
+ }
287
+ this.ignoreUserTranscriptUntil = this.ignoreUserTranscriptUntil
288
+ ? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil)
289
+ : ignoreUserTranscriptUntil;
290
+
291
+ // flush held transcripts if possible
292
+ await this.flushHeldTranscripts();
293
+ }
294
+ this.isAgentSpeaking = false;
295
+ }
296
+
297
+ /** Start interruption inference when agent is speaking and overlap speech starts. */
298
+ async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) {
299
+ if (this.isAgentSpeaking) {
300
+ this.trySendInterruptionSentinel(
301
+ InterruptionStreamSentinel.overlapSpeechStarted(
302
+ speechDuration,
303
+ startedAt,
304
+ userSpeakingSpan,
305
+ ),
306
+ );
307
+ }
308
+ }
309
+
310
+ /** End interruption inference when overlap speech ends. */
311
+ async onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span) {
312
+ if (!this.isInterruptionEnabled) {
313
+ return;
314
+ }
315
+ if (userSpeakingSpan && userSpeakingSpan.isRecording()) {
316
+ userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, 'false');
317
+ }
318
+
319
+ return this.trySendInterruptionSentinel(InterruptionStreamSentinel.overlapSpeechEnded(endedAt));
320
+ }
321
+
322
/**
 * Release buffered STT events once the ignore window no longer applies.
 *
 * The buffer is scanned in order. The first event that does not end before
 * `ignoreUserTranscriptUntil` triggers a flush; events without alternatives that
 * directly precede it are released together with it. Events that ended inside the
 * ignore window are dropped. If timing information is unusable (no input start time,
 * or an alternative whose start and end are both 0) the whole buffer is discarded.
 */
private async flushHeldTranscripts() {
  const nothingToFlush =
    !this.isInterruptionEnabled ||
    this.ignoreUserTranscriptUntil === undefined ||
    this.transcriptBuffer.length === 0;
  if (nothingToFlush) {
    return;
  }

  // Empties the buffer and closes the ignore window.
  const resetHeldState = () => {
    this.transcriptBuffer = [];
    this.ignoreUserTranscriptUntil = undefined;
  };

  if (!this._inputStartedAt) {
    resetHeldState();
    return;
  }

  let releaseFrom: number | null = null;
  let releaseFound = false;

  for (const [position, held] of this.transcriptBuffer.entries()) {
    // Events without alternatives carry no timing; tentatively attach them to
    // whatever timed event follows.
    if (!held || !held.alternatives || held.alternatives.length === 0) {
      releaseFrom ??= position;
      continue;
    }

    const primary = held.alternatives[0];
    const hasZeroTimestamps = primary.startTime === primary.endTime && primary.startTime === 0;
    if (hasZeroTimestamps) {
      resetHeldState();
      return;
    }

    if (this.#alternativeEndsBeforeIgnoreWindow(primary)) {
      // This event (and any untimed events before it) falls inside the ignore window.
      releaseFrom = null;
      continue;
    }

    releaseFrom ??= position;
    releaseFound = true;
    break;
  }

  const released =
    releaseFrom !== null && releaseFound ? this.transcriptBuffer.slice(releaseFrom) : [];

  // Reset before re-emitting so the replayed events are not held again.
  resetHeldState();

  for (const event of released) {
    this.logger.trace(
      {
        event: event.type,
      },
      're-emitting held user transcript',
    );
    this.onSTTEvent(event);
  }
}
385
+
386
/**
 * Report whether a transcript alternative finished before the ignore window closed.
 *
 * The comparison uses the alternative's *end* time: an utterance that begins inside
 * the ignore window but keeps going past it must not be treated as "ended before".
 *
 * @param alternative - The transcript alternative to test.
 * @returns `true` only when the alternative has a positive end time that falls
 *   strictly before `ignoreUserTranscriptUntil`; `false` when there is no ignore
 *   window, no recorded input start time, or no usable end time.
 */
#alternativeEndsBeforeIgnoreWindow(
  alternative: NonNullable<SpeechEvent['alternatives']>[number],
): boolean {
  if (
    this.ignoreUserTranscriptUntil === undefined ||
    !this._inputStartedAt ||
    alternative.endTime <= 0
  ) {
    return false;
  }

  // `SpeechData.endTime` is in seconds relative to audio start, while `inputStartedAt` and
  // `ignoreUserTranscriptUntil` are epoch milliseconds.
  return alternative.endTime * 1000 + this._inputStartedAt < this.ignoreUserTranscriptUntil;
}
401
+
402
/**
 * Decide whether an STT event has to be buffered instead of processed immediately.
 *
 * @param ev - The incoming STT event.
 * @returns `true` when the event must be held back, `false` when it may be processed now.
 */
private shouldHoldSttEvent(ev: SpeechEvent): boolean {
  if (!this.isInterruptionEnabled) return false;

  // Everything is held while the agent is talking.
  if (this.isAgentSpeaking) return true;

  // A new start of user speech after the agent finished clears any pending hold state.
  if (ev.type === SpeechEventType.START_OF_SPEECH) {
    this.ignoreUserTranscriptUntil = undefined;
    this.transcriptBuffer = [];
    return false;
  }

  if (this.ignoreUserTranscriptUntil === undefined) return false;

  // Events without alternatives stay buffered until a timed event releases them.
  const candidates = ev.alternatives;
  if (!candidates || candidates.length === 0) return true;

  const primary = candidates[0];
  return (
    primary.startTime !== primary.endTime && this.#alternativeEndsBeforeIgnoreWindow(primary)
  );
}
435
+
436
/**
 * Best-effort write of a frame or sentinel to the interruption stream channel.
 *
 * @param frame - The audio frame or sentinel to forward.
 * @returns `true` if the write completed; `false` if interruption is disabled, the
 *   channel is missing or closed, or the write failed (the failure is logged as a warning).
 */
private async trySendInterruptionSentinel(
  frame: AudioFrame | InterruptionSentinel,
): Promise<boolean> {
  if (!this.isInterruptionEnabled) {
    return false;
  }
  if (!this.interruptionStreamChannel || this.interruptionStreamChannel.closed) {
    return false;
  }

  try {
    await this.interruptionStreamChannel.write(frame);
    return true;
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    this.logger.warn(`could not forward interruption sentinel: ${reason}`);
    return false;
  }
}
455
+
456
/**
 * Return the active `user_turn` span, starting a new one when none is recording.
 *
 * A newly started span is parented to `rootSpanContext` and annotated with the linked
 * participant and the STT model/provider when those are known.
 *
 * @param startTime - Optional start time handed to the tracer for a new span.
 * @returns The span that is stored in `userTurnSpan` after the call.
 */
private ensureUserTurnSpan(startTime?: number): Span {
  if (this.userTurnSpan?.isRecording()) {
    return this.userTurnSpan;
  }

  const span = tracer.startSpan({
    name: 'user_turn',
    context: this.rootSpanContext,
    startTime,
  });
  this.userTurnSpan = span;

  const linkedParticipant = this.getLinkedParticipant?.();
  if (linkedParticipant) {
    setParticipantSpanAttributes(span, linkedParticipant);
  }

  if (this.sttModel) {
    span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel);
  }
  if (this.sttProvider) {
    span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, this.sttProvider);
  }

  return span;
}
481
+
482
/**
 * Build a context that carries `span`, based on the root span context when one is
 * set and on `ROOT_CONTEXT` otherwise.
 */
private userTurnContext(span: Span): Context {
  return trace.setSpan(this.rootSpanContext ?? ROOT_CONTEXT, span);
}
120
486
 
121
487
  private async onSTTEvent(ev: SpeechEvent) {
@@ -140,10 +506,29 @@ export class AudioRecognition {
140
506
  return;
141
507
  }
142
508
 
509
+ // handle interruption detection
510
+ // - hold the event until the ignore_user_transcript_until expires
511
+ // - release only relevant events
512
+ // - allow RECOGNITION_USAGE to pass through immediately
513
+
514
+ if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) {
515
+ if (this.shouldHoldSttEvent(ev)) {
516
+ this.logger.trace(
517
+ { event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil },
518
+ 'holding STT event until ignore_user_transcript_until expires',
519
+ );
520
+ this.transcriptBuffer.push(ev);
521
+ return;
522
+ } else {
523
+ await this.flushHeldTranscripts();
524
+ // no return here to allow the new event to be processed normally
525
+ }
526
+ }
527
+
143
528
  switch (ev.type) {
144
529
  case SpeechEventType.FINAL_TRANSCRIPT:
145
- this.hooks.onFinalTranscript(ev);
146
530
  const transcript = ev.alternatives?.[0]?.text;
531
+ const confidence = ev.alternatives?.[0]?.confidence ?? 0;
147
532
  this.lastLanguage = ev.alternatives?.[0]?.language;
148
533
 
149
534
  if (!transcript) {
@@ -151,6 +536,8 @@ export class AudioRecognition {
151
536
  return;
152
537
  }
153
538
 
539
+ this.hooks.onFinalTranscript(ev);
540
+
154
541
  this.logger.debug(
155
542
  {
156
543
  user_transcript: transcript,
@@ -162,34 +549,156 @@ export class AudioRecognition {
162
549
  this.lastFinalTranscriptTime = Date.now();
163
550
  this.audioTranscript += ` ${transcript}`;
164
551
  this.audioTranscript = this.audioTranscript.trimStart();
552
+ this.finalTranscriptConfidence.push(confidence);
553
+ const transcriptChanged = this.audioTranscript !== this.audioPreflightTranscript;
165
554
  this.audioInterimTranscript = '';
555
+ this.audioPreflightTranscript = '';
556
+
557
+ if (!this.vad || this.lastSpeakingTime === undefined) {
558
+ // vad disabled, use stt timestamp
559
+ // TODO: this would screw up transcription latency metrics
560
+ // but we'll live with it for now.
561
+ // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
562
+ // and using that timestamp for lastSpeakingTime
563
+ this.lastSpeakingTime = Date.now();
564
+ }
166
565
 
167
- if (!this.speaking) {
168
- if (!this.vad) {
169
- // Copied from python agents:
170
- // vad disabled, use stt timestamp
171
- // TODO: this would screw up transcription latency metrics
172
- // but we'll live with it for now.
173
- // the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
174
- // and using that timestamp for _last_speaking_time
175
- this.lastSpeakingTime = Date.now();
566
+ if (this.vadBaseTurnDetection || this.userTurnCommitted) {
567
+ if (transcriptChanged) {
568
+ this.logger.debug(
569
+ { transcript: this.audioTranscript },
570
+ 'triggering preemptive generation (FINAL_TRANSCRIPT)',
571
+ );
572
+ this.hooks.onPreemptiveGeneration({
573
+ newTranscript: this.audioTranscript,
574
+ transcriptConfidence:
575
+ this.finalTranscriptConfidence.length > 0
576
+ ? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
577
+ this.finalTranscriptConfidence.length
578
+ : 0,
579
+ });
176
580
  }
177
581
 
178
- if (this.vadBaseTurnDetection || this.userTurnCommitted) {
582
+ if (!this.speaking) {
179
583
  const chatCtx = this.hooks.retrieveChatCtx();
180
584
  this.logger.debug('running EOU detection on stt FINAL_TRANSCRIPT');
181
585
  this.runEOUDetection(chatCtx);
182
586
  }
183
587
  }
184
588
  break;
589
+ case SpeechEventType.PREFLIGHT_TRANSCRIPT:
590
+ this.hooks.onInterimTranscript(ev);
591
+ const preflightTranscript = ev.alternatives?.[0]?.text ?? '';
592
+ const preflightConfidence = ev.alternatives?.[0]?.confidence ?? 0;
593
+ const preflightLanguage = ev.alternatives?.[0]?.language;
594
+
595
+ const MIN_LANGUAGE_DETECTION_LENGTH = 5;
596
+ if (
597
+ !this.lastLanguage ||
598
+ (preflightLanguage && preflightTranscript.length > MIN_LANGUAGE_DETECTION_LENGTH)
599
+ ) {
600
+ this.lastLanguage = preflightLanguage;
601
+ }
602
+
603
+ if (!preflightTranscript) {
604
+ return;
605
+ }
606
+
607
+ this.logger.debug(
608
+ {
609
+ user_transcript: preflightTranscript,
610
+ language: this.lastLanguage,
611
+ },
612
+ 'received user preflight transcript',
613
+ );
614
+
615
+ // still need to increment it as it's used for turn detection,
616
+ this.lastFinalTranscriptTime = Date.now();
617
+ // preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
618
+ this.audioPreflightTranscript =
619
+ `${this.audioTranscript} ${preflightTranscript}`.trimStart();
620
+ this.audioInterimTranscript = preflightTranscript;
621
+
622
+ if (!this.vad || this.lastSpeakingTime === undefined) {
623
+ // vad disabled, use stt timestamp
624
+ this.lastSpeakingTime = Date.now();
625
+ }
626
+
627
+ if (this.turnDetectionMode !== 'manual' || this.userTurnCommitted) {
628
+ const confidenceVals = [...this.finalTranscriptConfidence, preflightConfidence];
629
+ this.logger.debug(
630
+ {
631
+ transcript:
632
+ this.audioPreflightTranscript.length > 100
633
+ ? this.audioPreflightTranscript.slice(0, 100) + '...'
634
+ : this.audioPreflightTranscript,
635
+ },
636
+ 'triggering preemptive generation (PREFLIGHT_TRANSCRIPT)',
637
+ );
638
+ this.hooks.onPreemptiveGeneration({
639
+ newTranscript: this.audioPreflightTranscript,
640
+ transcriptConfidence:
641
+ confidenceVals.length > 0
642
+ ? confidenceVals.reduce((a, b) => a + b, 0) / confidenceVals.length
643
+ : 0,
644
+ });
645
+ }
646
+ break;
185
647
  case SpeechEventType.INTERIM_TRANSCRIPT:
186
648
  this.logger.debug({ transcript: ev.alternatives?.[0]?.text }, 'interim transcript');
187
649
  this.hooks.onInterimTranscript(ev);
188
650
  this.audioInterimTranscript = ev.alternatives?.[0]?.text ?? '';
189
651
  break;
652
+ case SpeechEventType.START_OF_SPEECH:
653
+ if (this.turnDetectionMode !== 'stt') break;
654
+ {
655
+ const span = this.ensureUserTurnSpan(Date.now());
656
+ const ctx = this.userTurnContext(span);
657
+ otelContext.with(ctx, () => {
658
+ this.hooks.onStartOfSpeech({
659
+ type: VADEventType.START_OF_SPEECH,
660
+ samplesIndex: 0,
661
+ timestamp: Date.now(),
662
+ speechDuration: 0,
663
+ silenceDuration: 0,
664
+ frames: [],
665
+ probability: 0,
666
+ inferenceDuration: 0,
667
+ speaking: true,
668
+ rawAccumulatedSilence: 0,
669
+ rawAccumulatedSpeech: 0,
670
+ });
671
+ });
672
+ }
673
+ this.speaking = true;
674
+ this.lastSpeakingTime = Date.now();
675
+
676
+ this.bounceEOUTask?.cancel();
677
+ break;
190
678
  case SpeechEventType.END_OF_SPEECH:
191
679
  if (this.turnDetectionMode !== 'stt') break;
680
+ {
681
+ const span = this.ensureUserTurnSpan();
682
+ const ctx = this.userTurnContext(span);
683
+ otelContext.with(ctx, () => {
684
+ this.hooks.onEndOfSpeech({
685
+ type: VADEventType.END_OF_SPEECH,
686
+ samplesIndex: 0,
687
+ timestamp: Date.now(),
688
+ speechDuration: 0,
689
+ silenceDuration: 0,
690
+ frames: [],
691
+ probability: 0,
692
+ inferenceDuration: 0,
693
+ speaking: false,
694
+ rawAccumulatedSilence: 0,
695
+ rawAccumulatedSpeech: 0,
696
+ });
697
+ });
698
+ }
699
+ this.speaking = false;
192
700
  this.userTurnCommitted = true;
701
+ this.lastSpeakingTime = Date.now();
193
702
 
194
703
  if (!this.speaking) {
195
704
  const chatCtx = this.hooks.retrieveChatCtx();
@@ -199,6 +708,12 @@ export class AudioRecognition {
199
708
  }
200
709
  }
201
710
 
711
/** Forward an overlapping-speech event to the hooks, but only when it is flagged as an interruption. */
private onOverlapSpeechEvent(ev: OverlappingSpeechEvent) {
  if (!ev.isInterruption) {
    return;
  }
  this.hooks.onInterruption(ev);
}
716
+
202
717
  private runEOUDetection(chatCtx: ChatContext) {
203
718
  this.logger.debug(
204
719
  {
@@ -222,61 +737,132 @@ export class AudioRecognition {
222
737
  // disable EOU model if manual turn detection enabled
223
738
  this.audioTranscript && this.turnDetectionMode !== 'manual' ? this.turnDetector : undefined;
224
739
 
225
- const bounceEOUTask = (lastSpeakingTime: number) => async (controller: AbortController) => {
226
- let endpointingDelay = this.minEndpointingDelay;
227
-
228
- // TODO(AJS-74): need to support actual turn detection model plugins for following code to run
229
- if (turnDetector) {
230
- this.logger.debug('Running turn detector model');
231
- if (!turnDetector.supportsLanguage(this.lastLanguage)) {
232
- this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
233
- } else {
234
- const endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
235
- this.logger.debug(
236
- { endOfTurnProbability, language: this.lastLanguage },
237
- 'end of turn probability',
238
- );
239
-
240
- const unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
241
- this.logger.debug(
740
+ const bounceEOUTask =
741
+ (
742
+ lastSpeakingTime: number | undefined,
743
+ lastFinalTranscriptTime: number,
744
+ speechStartTime: number | undefined,
745
+ ) =>
746
+ async (controller: AbortController) => {
747
+ let endpointingDelay = this.minEndpointingDelay;
748
+
749
+ const userTurnSpan = this.ensureUserTurnSpan();
750
+ const userTurnCtx = this.userTurnContext(userTurnSpan);
751
+
752
+ if (turnDetector) {
753
+ await tracer.startActiveSpan(
754
+ async (span) => {
755
+ this.logger.debug('Running turn detector model');
756
+
757
+ let endOfTurnProbability = 0.0;
758
+ let unlikelyThreshold: number | undefined;
759
+
760
+ if (!(await turnDetector.supportsLanguage(this.lastLanguage))) {
761
+ this.logger.debug(`Turn detector does not support language ${this.lastLanguage}`);
762
+ } else {
763
+ try {
764
+ endOfTurnProbability = await turnDetector.predictEndOfTurn(chatCtx);
765
+ unlikelyThreshold = await turnDetector.unlikelyThreshold(this.lastLanguage);
766
+
767
+ this.logger.debug(
768
+ { endOfTurnProbability, unlikelyThreshold, language: this.lastLanguage },
769
+ 'end of turn probability',
770
+ );
771
+
772
+ if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
773
+ endpointingDelay = this.maxEndpointingDelay;
774
+ }
775
+ } catch (error) {
776
+ this.logger.error(error, 'Error predicting end of turn');
777
+ }
778
+ }
779
+
780
+ span.setAttribute(
781
+ traceTypes.ATTR_CHAT_CTX,
782
+ JSON.stringify(chatCtx.toJSON({ excludeTimestamp: false })),
783
+ );
784
+ span.setAttribute(traceTypes.ATTR_EOU_PROBABILITY, endOfTurnProbability);
785
+ span.setAttribute(traceTypes.ATTR_EOU_UNLIKELY_THRESHOLD, unlikelyThreshold ?? 0);
786
+ span.setAttribute(traceTypes.ATTR_EOU_DELAY, endpointingDelay);
787
+ span.setAttribute(traceTypes.ATTR_EOU_LANGUAGE, this.lastLanguage ?? '');
788
+ },
242
789
  {
243
- unlikelyThreshold,
244
- endOfTurnProbability,
245
- language: this.lastLanguage,
246
- transcript: this.audioTranscript,
790
+ name: 'eou_detection',
791
+ context: userTurnCtx,
247
792
  },
248
- 'EOU Detection',
249
793
  );
250
-
251
- if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) {
252
- endpointingDelay = this.maxEndpointingDelay;
253
- }
254
794
  }
255
- }
256
795
 
257
- const extraSleep = lastSpeakingTime + endpointingDelay - Date.now();
258
- // add delay to see if there's a potential upcoming EOU task that cancels this one
259
- await delay(Math.max(extraSleep, 0), { signal: controller.signal });
796
+ let extraSleep = endpointingDelay;
797
+ if (lastSpeakingTime !== undefined) {
798
+ extraSleep += lastSpeakingTime - Date.now();
799
+ }
260
800
 
261
- this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
801
+ if (extraSleep > 0) {
802
+ // add delay to see if there's a potential upcoming EOU task that cancels this one
803
+ await delay(Math.max(extraSleep, 0), { signal: controller.signal });
804
+ }
262
805
 
263
- const committed = await this.hooks.onEndOfTurn({
264
- newTranscript: this.audioTranscript,
265
- transcriptionDelay: Math.max(this.lastFinalTranscriptTime - lastSpeakingTime, 0),
266
- endOfUtteranceDelay: Date.now() - lastSpeakingTime,
267
- });
806
+ this.logger.debug({ transcript: this.audioTranscript }, 'end of user turn');
807
+
808
+ const confidenceAvg =
809
+ this.finalTranscriptConfidence.length > 0
810
+ ? this.finalTranscriptConfidence.reduce((a, b) => a + b, 0) /
811
+ this.finalTranscriptConfidence.length
812
+ : 0;
813
+
814
+ let startedSpeakingAt: number | undefined;
815
+ let stoppedSpeakingAt: number | undefined;
816
+ let transcriptionDelay: number | undefined;
817
+ let endOfUtteranceDelay: number | undefined;
818
+
819
+ // sometimes, we can't calculate the metrics because VAD was unreliable.
820
+ // in this case, we just ignore the calculation, it's better than providing likely wrong values
821
+ if (
822
+ lastFinalTranscriptTime !== 0 &&
823
+ lastSpeakingTime !== undefined &&
824
+ speechStartTime !== undefined
825
+ ) {
826
+ startedSpeakingAt = speechStartTime;
827
+ stoppedSpeakingAt = lastSpeakingTime;
828
+ transcriptionDelay = Math.max(lastFinalTranscriptTime - lastSpeakingTime, 0);
829
+ endOfUtteranceDelay = Date.now() - lastSpeakingTime;
830
+ }
268
831
 
269
- if (committed) {
270
- // clear the transcript if the user turn was committed
271
- this.audioTranscript = '';
272
- }
832
+ const committed = await this.hooks.onEndOfTurn({
833
+ newTranscript: this.audioTranscript,
834
+ transcriptConfidence: confidenceAvg,
835
+ transcriptionDelay: transcriptionDelay ?? 0,
836
+ endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
837
+ startedSpeakingAt,
838
+ stoppedSpeakingAt,
839
+ });
840
+
841
+ if (committed) {
842
+ this._endUserTurnSpan({
843
+ transcript: this.audioTranscript,
844
+ confidence: confidenceAvg,
845
+ transcriptionDelay: transcriptionDelay ?? 0,
846
+ endOfUtteranceDelay: endOfUtteranceDelay ?? 0,
847
+ });
848
+
849
+ // clear the transcript if the user turn was committed
850
+ this.audioTranscript = '';
851
+ this.finalTranscriptConfidence = [];
852
+ this.lastSpeakingTime = undefined;
853
+ this.lastFinalTranscriptTime = 0;
854
+ this.speechStartTime = undefined;
855
+ }
273
856
 
274
- this.userTurnCommitted = false;
275
- };
857
+ this.userTurnCommitted = false;
858
+ };
276
859
 
277
860
  // cancel any existing EOU task
278
861
  this.bounceEOUTask?.cancel();
279
- this.bounceEOUTask = Task.from(bounceEOUTask(this.lastSpeakingTime));
862
+ // copy the values before awaiting (the values can change)
863
+ this.bounceEOUTask = Task.from(
864
+ bounceEOUTask(this.lastSpeakingTime, this.lastFinalTranscriptTime, this.speechStartTime),
865
+ );
280
866
 
281
867
  this.bounceEOUTask.result
282
868
  .then(() => {
@@ -364,7 +950,12 @@ export class AudioRecognition {
364
950
  switch (ev.type) {
365
951
  case VADEventType.START_OF_SPEECH:
366
952
  this.logger.debug('VAD task: START_OF_SPEECH');
367
- this.hooks.onStartOfSpeech(ev);
953
+ {
954
+ const startTime = Date.now() - ev.speechDuration;
955
+ const span = this.ensureUserTurnSpan(startTime);
956
+ const ctx = this.userTurnContext(span);
957
+ otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev));
958
+ }
368
959
  this.speaking = true;
369
960
 
370
961
  // Capture sample rate from the first VAD event if not already set
@@ -376,13 +967,27 @@ export class AudioRecognition {
376
967
  break;
377
968
  case VADEventType.INFERENCE_DONE:
378
969
  this.hooks.onVADInferenceDone(ev);
970
+ // for metrics, get the "earliest" signal of speech as possible
971
+ if (ev.rawAccumulatedSpeech > 0.0) {
972
+ this.lastSpeakingTime = Date.now();
973
+
974
+ if (this.speechStartTime === undefined) {
975
+ // Backdate speechStartTime to the actual start of accumulated speech.
976
+ // ev.rawAccumulatedSpeech is in ms (VADEvent durations are all ms in TS).
977
+ this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech;
978
+ }
979
+ }
379
980
  break;
380
981
  case VADEventType.END_OF_SPEECH:
381
982
  this.logger.debug('VAD task: END_OF_SPEECH');
382
- this.hooks.onEndOfSpeech(ev);
383
- this.speaking = false;
983
+ {
984
+ const span = this.ensureUserTurnSpan();
985
+ const ctx = this.userTurnContext(span);
986
+ otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev));
987
+ }
988
+
384
989
  // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
385
- this.lastSpeakingTime = Date.now() - ev.silenceDuration;
990
+ this.speaking = false;
386
991
 
387
992
  if (
388
993
  this.vadBaseTurnDetection ||
@@ -401,6 +1006,136 @@ export class AudioRecognition {
401
1006
  }
402
1007
  }
403
1008
 
1009
/**
 * Drive the interruption detector until `signal` aborts or an unrecoverable error occurs.
 *
 * Each attempt creates a fresh detector stream, forwards everything read from
 * `interruptionStreamChannel` into it, and hands every event the stream produces to
 * `onOverlapSpeechEvent`. API errors marked retryable are retried (with the delay from
 * `intervalForRetry`) up to `apiConnectDefaults.maxRetries`; every failure is also
 * reported through `interruptionDetection.emitError`.
 *
 * @param interruptionDetection - The detector to run; the task is a no-op when absent.
 * @param signal - Abort signal that stops the task.
 */
private async createInterruptionTask(
  interruptionDetection: AdaptiveInterruptionDetector | undefined,
  signal: AbortSignal,
) {
  if (!interruptionDetection || !this.interruptionStreamChannel) return;

  let numRetries = 0;
  const maxRetries = apiConnectDefaults.maxRetries;

  while (!signal.aborted) {
    const stream = interruptionDetection.createStream();
    const eventReader = stream.stream().getReader();

    // Tears down this attempt's stream; also registered as the abort handler below.
    const cleanup = async () => {
      try {
        signal.removeEventListener('abort', cleanup);
        eventReader.releaseLock();
        await stream.close();
      } catch (e) {
        // Pass the error as the structured first argument, matching the other
        // logger calls in this class, so it is not dropped from the log record.
        this.logger.debug({ err: e }, 'createInterruptionTask: error during cleanup');
      }
    };

    signal.addEventListener('abort', cleanup, { once: true });

    let forwardTask: Promise<void> | undefined;

    try {
      // Unlike Python where _agent_speech_started lives on `self` and survives retries,
      // JS creates a fresh InterruptionStreamBase per retry with agentSpeechStarted = false.
      // Re-inject the sentinel so the new stream knows the agent is mid-speech.
      if (numRetries > 0 && this.isAgentSpeaking) {
        await stream.pushFrame(InterruptionStreamSentinel.agentSpeechStarted());
      }

      // Pump frames and sentinels from the shared channel into this attempt's stream.
      forwardTask = (async () => {
        const inputReader = this.interruptionStreamChannel!.stream().getReader();
        const abortPromise = waitForAbort(signal);

        try {
          while (!signal.aborted) {
            const res = await Promise.race([inputReader.read(), abortPromise]);
            if (!res) break;

            const { value, done } = res;
            if (done) break;

            // Remember when input first arrived; for an audio frame, back-date by the
            // frame's own duration.
            if (value instanceof AudioFrame) {
              const frameDurationMs = (value.samplesPerChannel / value.sampleRate) * 1000;
              this._inputStartedAt ??= Date.now() - frameDurationMs;
            } else {
              this._inputStartedAt ??= Date.now();
            }

            await stream.pushFrame(value);
          }
        } finally {
          inputReader.releaseLock();
        }
      })();

      const abortPromise = waitForAbort(signal);

      // Consume detector events until the stream ends or the task is aborted.
      while (!signal.aborted) {
        const res = await Promise.race([eventReader.read(), abortPromise]);
        if (!res) break;
        const { done, value: ev } = res;
        if (done) break;
        this.onOverlapSpeechEvent(ev);
      }
      break;
    } catch (e) {
      if (signal.aborted) break;

      if (isAPIError(e)) {
        if (maxRetries === 0 || !e.retryable) {
          // Retrying is disabled or the error is not retryable: report and stop.
          interruptionDetection.emitError(
            new InterruptionDetectionError(
              e.message,
              Date.now(),
              interruptionDetection.label,
              false,
            ),
          );
          break;
        } else if (numRetries >= maxRetries) {
          // Retry budget exhausted: report and stop.
          interruptionDetection.emitError(
            new InterruptionDetectionError(
              `failed to detect interruption after ${numRetries} attempts`,
              Date.now(),
              interruptionDetection.label,
              false,
            ),
          );
          break;
        } else {
          // Report the failure, wait, and go around the outer loop for another attempt.
          const retryInterval = intervalForRetry(numRetries);
          interruptionDetection.emitError(
            new InterruptionDetectionError(
              e.message,
              Date.now(),
              interruptionDetection.label,
              true,
            ),
          );
          this.logger.warn(
            { model: interruptionDetection.label, attempt: numRetries },
            `failed to detect interruption, retrying in ${retryInterval}ms`,
          );
          numRetries++;
          await delay(retryInterval, { signal });
        }
      } else {
        // Anything that is not an API error is never retried.
        const msg = e instanceof Error ? e.message : String(e);
        interruptionDetection.emitError(
          new InterruptionDetectionError(msg, Date.now(), interruptionDetection.label, false),
        );
        this.logger.error(e, 'Error in interruption task');
        break;
      }
    } finally {
      await cleanup();
      await forwardTask?.catch((e) => {
        this.logger.debug({ err: e }, 'interruption task exited with error');
      });
    }
  }
  this.logger.debug('Interruption task closed');
}
1138
+
404
1139
  setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {
405
1140
  this.deferredInputStream.setSource(audioStream);
406
1141
  }
@@ -412,6 +1147,8 @@ export class AudioRecognition {
412
1147
  clearUserTurn() {
413
1148
  this.audioTranscript = '';
414
1149
  this.audioInterimTranscript = '';
1150
+ this.audioPreflightTranscript = '';
1151
+ this.finalTranscriptConfidence = [];
415
1152
  this.userTurnCommitted = false;
416
1153
 
417
1154
  this.sttTask?.cancelAndWait().finally(() => {
@@ -460,19 +1197,57 @@ export class AudioRecognition {
460
1197
  this.logger.debug('User turn committed');
461
1198
  })
462
1199
  .catch((err: unknown) => {
1200
+ if (err instanceof Error && err.name === 'AbortError') {
1201
+ this.logger.debug('User turn commit task cancelled');
1202
+ return;
1203
+ }
463
1204
  this.logger.error(err, 'Error in user turn commit task:');
464
1205
  });
465
1206
  }
466
1207
 
467
1208
  async close() {
468
1209
  this.detachInputAudioStream();
1210
+ this.silenceAudioWriter.releaseLock();
469
1211
  await this.commitUserTurnTask?.cancelAndWait();
470
1212
  await this.sttTask?.cancelAndWait();
471
1213
  await this.vadTask?.cancelAndWait();
472
1214
  await this.bounceEOUTask?.cancelAndWait();
1215
+ await this.interruptionTask?.cancelAndWait();
1216
+ await this.interruptionStreamChannel?.close();
1217
+ }
1218
+
1219
/**
 * Finish the current user-turn span, if any, after recording the turn's final
 * transcript, confidence, and delay attributes on it. The stored span reference is
 * cleared afterwards.
 */
private _endUserTurnSpan({
  transcript,
  confidence,
  transcriptionDelay,
  endOfUtteranceDelay,
}: {
  transcript: string;
  confidence: number;
  transcriptionDelay: number;
  endOfUtteranceDelay: number;
}): void {
  const span = this.userTurnSpan;
  if (!span) {
    return;
  }

  span.setAttributes({
    [traceTypes.ATTR_USER_TRANSCRIPT]: transcript,
    [traceTypes.ATTR_TRANSCRIPT_CONFIDENCE]: confidence,
    [traceTypes.ATTR_TRANSCRIPTION_DELAY]: transcriptionDelay,
    [traceTypes.ATTR_END_OF_TURN_DELAY]: endOfUtteranceDelay,
  });
  span.end();
  this.userTurnSpan = undefined;
}
474
1241
 
475
1242
  private get vadBaseTurnDetection() {
476
- return ['vad', undefined].includes(this.turnDetectionMode);
1243
+ if (typeof this.turnDetectionMode === 'object') {
1244
+ return false;
1245
+ }
1246
+
1247
+ if (this.turnDetectionMode === undefined || this.turnDetectionMode === 'vad') {
1248
+ return true;
1249
+ }
1250
+
1251
+ return false;
477
1252
  }
478
1253
  }