@livekit/agents 1.1.0-dev.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (292)
  1. package/dist/cli.cjs +2 -0
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +2 -0
  5. package/dist/cli.js.map +1 -1
  6. package/dist/constants.cjs +3 -0
  7. package/dist/constants.cjs.map +1 -1
  8. package/dist/constants.d.cts +1 -0
  9. package/dist/constants.d.ts +1 -0
  10. package/dist/constants.d.ts.map +1 -1
  11. package/dist/constants.js +2 -0
  12. package/dist/constants.js.map +1 -1
  13. package/dist/cpu.cjs +189 -0
  14. package/dist/cpu.cjs.map +1 -0
  15. package/dist/cpu.d.cts +24 -0
  16. package/dist/cpu.d.ts +24 -0
  17. package/dist/cpu.d.ts.map +1 -0
  18. package/dist/cpu.js +152 -0
  19. package/dist/cpu.js.map +1 -0
  20. package/dist/cpu.test.cjs +227 -0
  21. package/dist/cpu.test.cjs.map +1 -0
  22. package/dist/cpu.test.js +204 -0
  23. package/dist/cpu.test.js.map +1 -0
  24. package/dist/index.cjs +12 -10
  25. package/dist/index.cjs.map +1 -1
  26. package/dist/index.d.cts +13 -13
  27. package/dist/index.d.ts +13 -13
  28. package/dist/index.d.ts.map +1 -1
  29. package/dist/index.js +11 -10
  30. package/dist/index.js.map +1 -1
  31. package/dist/inference/interruption/defaults.cjs +1 -1
  32. package/dist/inference/interruption/defaults.cjs.map +1 -1
  33. package/dist/inference/interruption/defaults.d.cts +1 -1
  34. package/dist/inference/interruption/defaults.d.ts +1 -1
  35. package/dist/inference/interruption/defaults.d.ts.map +1 -1
  36. package/dist/inference/interruption/defaults.js +1 -1
  37. package/dist/inference/interruption/defaults.js.map +1 -1
  38. package/dist/inference/interruption/http_transport.cjs +44 -28
  39. package/dist/inference/interruption/http_transport.cjs.map +1 -1
  40. package/dist/inference/interruption/http_transport.d.ts.map +1 -1
  41. package/dist/inference/interruption/http_transport.js +45 -29
  42. package/dist/inference/interruption/http_transport.js.map +1 -1
  43. package/dist/inference/interruption/interruption_detector.cjs +22 -5
  44. package/dist/inference/interruption/interruption_detector.cjs.map +1 -1
  45. package/dist/inference/interruption/interruption_detector.d.cts +2 -2
  46. package/dist/inference/interruption/interruption_detector.d.ts +2 -2
  47. package/dist/inference/interruption/interruption_detector.d.ts.map +1 -1
  48. package/dist/inference/interruption/interruption_detector.js +22 -5
  49. package/dist/inference/interruption/interruption_detector.js.map +1 -1
  50. package/dist/inference/interruption/interruption_stream.cjs +4 -4
  51. package/dist/inference/interruption/interruption_stream.cjs.map +1 -1
  52. package/dist/inference/interruption/interruption_stream.js +4 -4
  53. package/dist/inference/interruption/interruption_stream.js.map +1 -1
  54. package/dist/inference/interruption/types.cjs.map +1 -1
  55. package/dist/inference/interruption/types.d.cts +2 -2
  56. package/dist/inference/interruption/types.d.ts +2 -2
  57. package/dist/inference/interruption/types.d.ts.map +1 -1
  58. package/dist/inference/interruption/ws_transport.cjs +60 -47
  59. package/dist/inference/interruption/ws_transport.cjs.map +1 -1
  60. package/dist/inference/interruption/ws_transport.d.ts.map +1 -1
  61. package/dist/inference/interruption/ws_transport.js +60 -47
  62. package/dist/inference/interruption/ws_transport.js.map +1 -1
  63. package/dist/inference/llm.cjs.map +1 -1
  64. package/dist/inference/llm.d.cts +1 -1
  65. package/dist/inference/llm.d.ts +1 -1
  66. package/dist/inference/llm.d.ts.map +1 -1
  67. package/dist/inference/llm.js.map +1 -1
  68. package/dist/inference/stt.cjs +20 -12
  69. package/dist/inference/stt.cjs.map +1 -1
  70. package/dist/inference/stt.d.cts +3 -2
  71. package/dist/inference/stt.d.ts +3 -2
  72. package/dist/inference/stt.d.ts.map +1 -1
  73. package/dist/inference/stt.js +20 -12
  74. package/dist/inference/stt.js.map +1 -1
  75. package/dist/inference/stt.test.cjs +14 -0
  76. package/dist/inference/stt.test.cjs.map +1 -1
  77. package/dist/inference/stt.test.js +14 -0
  78. package/dist/inference/stt.test.js.map +1 -1
  79. package/dist/inference/tts.cjs +13 -4
  80. package/dist/inference/tts.cjs.map +1 -1
  81. package/dist/inference/tts.d.cts +8 -1
  82. package/dist/inference/tts.d.ts +8 -1
  83. package/dist/inference/tts.d.ts.map +1 -1
  84. package/dist/inference/tts.js +13 -4
  85. package/dist/inference/tts.js.map +1 -1
  86. package/dist/inference/tts.test.cjs +10 -0
  87. package/dist/inference/tts.test.cjs.map +1 -1
  88. package/dist/inference/tts.test.js +10 -0
  89. package/dist/inference/tts.test.js.map +1 -1
  90. package/dist/ipc/job_proc_lazy_main.cjs +41 -23
  91. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  92. package/dist/ipc/job_proc_lazy_main.js +41 -23
  93. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  94. package/dist/job.cjs +1 -1
  95. package/dist/job.cjs.map +1 -1
  96. package/dist/job.js +1 -1
  97. package/dist/job.js.map +1 -1
  98. package/dist/language.cjs +394 -0
  99. package/dist/language.cjs.map +1 -0
  100. package/dist/language.d.cts +15 -0
  101. package/dist/language.d.ts +15 -0
  102. package/dist/language.d.ts.map +1 -0
  103. package/dist/language.js +363 -0
  104. package/dist/language.js.map +1 -0
  105. package/dist/language.test.cjs +43 -0
  106. package/dist/language.test.cjs.map +1 -0
  107. package/dist/language.test.js +49 -0
  108. package/dist/language.test.js.map +1 -0
  109. package/dist/llm/index.cjs +2 -0
  110. package/dist/llm/index.cjs.map +1 -1
  111. package/dist/llm/index.d.cts +1 -1
  112. package/dist/llm/index.d.ts +1 -1
  113. package/dist/llm/index.d.ts.map +1 -1
  114. package/dist/llm/index.js +2 -0
  115. package/dist/llm/index.js.map +1 -1
  116. package/dist/stream/deferred_stream.cjs +6 -2
  117. package/dist/stream/deferred_stream.cjs.map +1 -1
  118. package/dist/stream/deferred_stream.d.ts.map +1 -1
  119. package/dist/stream/deferred_stream.js +6 -2
  120. package/dist/stream/deferred_stream.js.map +1 -1
  121. package/dist/stt/stt.cjs.map +1 -1
  122. package/dist/stt/stt.d.cts +2 -1
  123. package/dist/stt/stt.d.ts +2 -1
  124. package/dist/stt/stt.d.ts.map +1 -1
  125. package/dist/stt/stt.js.map +1 -1
  126. package/dist/utils.cjs +15 -0
  127. package/dist/utils.cjs.map +1 -1
  128. package/dist/utils.d.cts +8 -0
  129. package/dist/utils.d.ts +8 -0
  130. package/dist/utils.d.ts.map +1 -1
  131. package/dist/utils.js +13 -0
  132. package/dist/utils.js.map +1 -1
  133. package/dist/version.cjs +1 -1
  134. package/dist/version.js +1 -1
  135. package/dist/voice/agent.cjs +14 -17
  136. package/dist/voice/agent.cjs.map +1 -1
  137. package/dist/voice/agent.d.cts +10 -11
  138. package/dist/voice/agent.d.ts +10 -11
  139. package/dist/voice/agent.d.ts.map +1 -1
  140. package/dist/voice/agent.js +15 -18
  141. package/dist/voice/agent.js.map +1 -1
  142. package/dist/voice/agent.test.cjs +194 -0
  143. package/dist/voice/agent.test.cjs.map +1 -1
  144. package/dist/voice/agent.test.js +195 -1
  145. package/dist/voice/agent.test.js.map +1 -1
  146. package/dist/voice/agent_activity.cjs +116 -39
  147. package/dist/voice/agent_activity.cjs.map +1 -1
  148. package/dist/voice/agent_activity.d.cts +2 -0
  149. package/dist/voice/agent_activity.d.ts +2 -0
  150. package/dist/voice/agent_activity.d.ts.map +1 -1
  151. package/dist/voice/agent_activity.js +117 -40
  152. package/dist/voice/agent_activity.js.map +1 -1
  153. package/dist/voice/agent_activity.test.cjs +135 -0
  154. package/dist/voice/agent_activity.test.cjs.map +1 -0
  155. package/dist/voice/agent_activity.test.js +134 -0
  156. package/dist/voice/agent_activity.test.js.map +1 -0
  157. package/dist/voice/agent_session.cjs +38 -38
  158. package/dist/voice/agent_session.cjs.map +1 -1
  159. package/dist/voice/agent_session.d.cts +65 -56
  160. package/dist/voice/agent_session.d.ts +65 -56
  161. package/dist/voice/agent_session.d.ts.map +1 -1
  162. package/dist/voice/agent_session.js +37 -37
  163. package/dist/voice/agent_session.js.map +1 -1
  164. package/dist/voice/audio_recognition.cjs +106 -52
  165. package/dist/voice/audio_recognition.cjs.map +1 -1
  166. package/dist/voice/audio_recognition.d.cts +4 -2
  167. package/dist/voice/audio_recognition.d.ts +4 -2
  168. package/dist/voice/audio_recognition.d.ts.map +1 -1
  169. package/dist/voice/audio_recognition.js +106 -52
  170. package/dist/voice/audio_recognition.js.map +1 -1
  171. package/dist/voice/audio_recognition_span.test.cjs +84 -22
  172. package/dist/voice/audio_recognition_span.test.cjs.map +1 -1
  173. package/dist/voice/audio_recognition_span.test.js +90 -23
  174. package/dist/voice/audio_recognition_span.test.js.map +1 -1
  175. package/dist/voice/events.cjs +1 -1
  176. package/dist/voice/events.cjs.map +1 -1
  177. package/dist/voice/events.d.cts +4 -3
  178. package/dist/voice/events.d.ts +4 -3
  179. package/dist/voice/events.d.ts.map +1 -1
  180. package/dist/voice/events.js +1 -1
  181. package/dist/voice/events.js.map +1 -1
  182. package/dist/voice/index.cjs +9 -1
  183. package/dist/voice/index.cjs.map +1 -1
  184. package/dist/voice/index.d.cts +1 -1
  185. package/dist/voice/index.d.ts +1 -1
  186. package/dist/voice/index.d.ts.map +1 -1
  187. package/dist/voice/index.js +10 -1
  188. package/dist/voice/index.js.map +1 -1
  189. package/dist/voice/remote_session.cjs +922 -0
  190. package/dist/voice/remote_session.cjs.map +1 -0
  191. package/dist/voice/remote_session.d.cts +108 -0
  192. package/dist/voice/remote_session.d.ts +108 -0
  193. package/dist/voice/remote_session.d.ts.map +1 -0
  194. package/dist/voice/remote_session.js +887 -0
  195. package/dist/voice/remote_session.js.map +1 -0
  196. package/dist/voice/report.cjs +11 -10
  197. package/dist/voice/report.cjs.map +1 -1
  198. package/dist/voice/report.d.cts +5 -3
  199. package/dist/voice/report.d.ts +5 -3
  200. package/dist/voice/report.d.ts.map +1 -1
  201. package/dist/voice/report.js +11 -10
  202. package/dist/voice/report.js.map +1 -1
  203. package/dist/voice/report.test.cjs +15 -0
  204. package/dist/voice/report.test.cjs.map +1 -1
  205. package/dist/voice/report.test.js +15 -0
  206. package/dist/voice/report.test.js.map +1 -1
  207. package/dist/voice/room_io/room_io.cjs +39 -0
  208. package/dist/voice/room_io/room_io.cjs.map +1 -1
  209. package/dist/voice/room_io/room_io.d.cts +3 -1
  210. package/dist/voice/room_io/room_io.d.ts +3 -1
  211. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  212. package/dist/voice/room_io/room_io.js +40 -1
  213. package/dist/voice/room_io/room_io.js.map +1 -1
  214. package/dist/voice/turn_config/interruption.cjs.map +1 -1
  215. package/dist/voice/turn_config/interruption.d.cts +1 -1
  216. package/dist/voice/turn_config/interruption.d.ts +1 -1
  217. package/dist/voice/turn_config/interruption.d.ts.map +1 -1
  218. package/dist/voice/turn_config/interruption.js.map +1 -1
  219. package/dist/voice/turn_config/utils.cjs +95 -35
  220. package/dist/voice/turn_config/utils.cjs.map +1 -1
  221. package/dist/voice/turn_config/utils.d.cts +17 -5
  222. package/dist/voice/turn_config/utils.d.ts +17 -5
  223. package/dist/voice/turn_config/utils.d.ts.map +1 -1
  224. package/dist/voice/turn_config/utils.js +93 -35
  225. package/dist/voice/turn_config/utils.js.map +1 -1
  226. package/dist/voice/turn_config/utils.test.cjs +83 -41
  227. package/dist/voice/turn_config/utils.test.cjs.map +1 -1
  228. package/dist/voice/turn_config/utils.test.js +84 -42
  229. package/dist/voice/turn_config/utils.test.js.map +1 -1
  230. package/dist/worker.cjs +6 -29
  231. package/dist/worker.cjs.map +1 -1
  232. package/dist/worker.d.ts.map +1 -1
  233. package/dist/worker.js +6 -19
  234. package/dist/worker.js.map +1 -1
  235. package/package.json +3 -2
  236. package/src/cli.ts +2 -0
  237. package/src/constants.ts +1 -0
  238. package/src/cpu.test.ts +239 -0
  239. package/src/cpu.ts +173 -0
  240. package/src/index.ts +13 -15
  241. package/src/inference/interruption/defaults.ts +1 -1
  242. package/src/inference/interruption/http_transport.ts +49 -30
  243. package/src/inference/interruption/interruption_detector.ts +22 -6
  244. package/src/inference/interruption/interruption_stream.ts +4 -4
  245. package/src/inference/interruption/types.ts +2 -2
  246. package/src/inference/interruption/ws_transport.ts +63 -59
  247. package/src/inference/llm.ts +3 -1
  248. package/src/inference/stt.test.ts +17 -0
  249. package/src/inference/stt.ts +22 -14
  250. package/src/inference/tts.test.ts +12 -0
  251. package/src/inference/tts.ts +22 -6
  252. package/src/ipc/job_proc_lazy_main.ts +44 -24
  253. package/src/job.ts +1 -1
  254. package/src/language.test.ts +62 -0
  255. package/src/language.ts +380 -0
  256. package/src/llm/index.ts +2 -0
  257. package/src/stream/deferred_stream.ts +5 -1
  258. package/src/stt/stt.ts +2 -1
  259. package/src/utils.ts +20 -0
  260. package/src/voice/agent.test.ts +208 -1
  261. package/src/voice/agent.ts +21 -22
  262. package/src/voice/agent_activity.test.ts +194 -0
  263. package/src/voice/agent_activity.ts +161 -43
  264. package/src/voice/agent_session.ts +103 -92
  265. package/src/voice/audio_recognition.ts +124 -61
  266. package/src/voice/audio_recognition_span.test.ts +115 -35
  267. package/src/voice/events.ts +4 -3
  268. package/src/voice/index.ts +10 -1
  269. package/src/voice/remote_session.ts +1083 -0
  270. package/src/voice/report.test.ts +22 -3
  271. package/src/voice/report.ts +31 -14
  272. package/src/voice/room_io/room_io.ts +52 -2
  273. package/src/voice/turn_config/interruption.ts +1 -1
  274. package/src/voice/turn_config/utils.test.ts +91 -43
  275. package/src/voice/turn_config/utils.ts +120 -56
  276. package/src/worker.ts +34 -50
  277. package/dist/voice/client_events.cjs +0 -554
  278. package/dist/voice/client_events.cjs.map +0 -1
  279. package/dist/voice/client_events.d.cts +0 -195
  280. package/dist/voice/client_events.d.ts +0 -195
  281. package/dist/voice/client_events.d.ts.map +0 -1
  282. package/dist/voice/client_events.js +0 -548
  283. package/dist/voice/client_events.js.map +0 -1
  284. package/dist/voice/wire_format.cjs +0 -798
  285. package/dist/voice/wire_format.cjs.map +0 -1
  286. package/dist/voice/wire_format.d.cts +0 -5503
  287. package/dist/voice/wire_format.d.ts +0 -5503
  288. package/dist/voice/wire_format.d.ts.map +0 -1
  289. package/dist/voice/wire_format.js +0 -728
  290. package/dist/voice/wire_format.js.map +0 -1
  291. package/src/voice/client_events.ts +0 -838
  292. package/src/voice/wire_format.ts +0 -827
@@ -41,7 +41,6 @@ import type { VAD } from '../vad.js';
41
41
  import type { Agent } from './agent.js';
42
42
  import { AgentActivity } from './agent_activity.js';
43
43
  import type { _TurnDetector } from './audio_recognition.js';
44
- import { ClientEventsHandler } from './client_events.js';
45
44
  import {
46
45
  type AgentEvent,
47
46
  AgentSessionEventTypes,
@@ -65,6 +64,7 @@ import {
65
64
  } from './events.js';
66
65
  import { AgentInput, AgentOutput } from './io.js';
67
66
  import { RecorderIO } from './recorder_io/index.js';
67
+ import { RoomSessionTransport, SessionHost } from './remote_session.js';
68
68
  import {
69
69
  DEFAULT_TEXT_INPUT_CALLBACK,
70
70
  RoomIO,
@@ -87,68 +87,40 @@ export interface AgentSessionUsage {
87
87
  modelUsage: Array<Partial<ModelUsage>>;
88
88
  }
89
89
 
90
- export interface SessionOptions {
90
+ export interface InternalSessionOptions<UserData> extends AgentSessionOptions<UserData> {
91
+ turnHandling: InternalTurnHandlingOptions;
92
+ useTtsAlignedTranscript: boolean;
91
93
  maxToolSteps: number;
92
- /**
93
- * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected.
94
- * When `true`, the agent sends inference calls as soon as a user transcript is received rather
95
- * than waiting for a definitive turn boundary. This can reduce response latency by overlapping
96
- * model inference with user audio, but may incur extra compute if the user interrupts or
97
- * revises mid-utterance.
98
- * @defaultValue false
99
- */
100
- preemptiveGeneration: boolean;
101
-
102
- /**
103
- * If set, set the user state as "away" after this amount of time after user and agent are
104
- * silent. Set to `null` to disable.
105
- * @defaultValue 15.0
106
- */
107
94
  userAwayTimeout: number | null;
108
-
109
- /**
110
- * Duration in milliseconds for AEC (Acoustic Echo Cancellation) warmup, during which
111
- * interruptions from audio activity are suppressed. Set to `null` to disable.
112
- * @defaultValue 3000
113
- */
114
- aecWarmupDuration: number | null;
115
-
116
- /**
117
- * Configuration for turn handling.
118
- */
119
- turnHandling: Partial<TurnHandlingOptions>;
120
-
121
- useTtsAlignedTranscript: boolean;
122
-
123
- /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.mode instead. */
124
- allowInterruptions?: boolean;
125
- /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */
126
- discardAudioIfUninterruptible?: boolean;
127
- /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.minDuration instead. */
128
- minInterruptionDuration?: number;
129
- /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.minWords instead. */
130
- minInterruptionWords?: number;
131
- /** @deprecated Use {@link SessionOptions.turnHandling}.endpointing.minDelay instead. */
132
- minEndpointingDelay?: number;
133
- /** @deprecated Use {@link SessionOptions.turnHandling}.endpointing.maxDelay instead. */
134
- maxEndpointingDelay?: number;
135
- }
136
-
137
- export interface InternalSessionOptions extends SessionOptions {
138
- turnHandling: InternalTurnHandlingOptions;
139
95
  }
140
96
 
141
- export const defaultSessionOptions = {
97
+ export const defaultAgentSessionOptions = {
142
98
  maxToolSteps: 3,
143
- preemptiveGeneration: false,
99
+ preemptiveGeneration: true,
144
100
  userAwayTimeout: 15.0,
145
101
  aecWarmupDuration: 3000,
146
102
  turnHandling: {},
147
103
  useTtsAlignedTranscript: true,
148
- } as const satisfies SessionOptions;
104
+ } as const satisfies AgentSessionOptions;
149
105
 
150
- /** @deprecated {@link VoiceOptions} has been renamed to {@link SessionOptions} */
151
- export type VoiceOptions = SessionOptions;
106
+ /** @deprecated {@link VoiceOptions} has been flattened onto {@link AgentSessionOptions} */
107
+ export type VoiceOptions = {
108
+ maxToolSteps: number;
109
+ preemptiveGeneration: boolean;
110
+ userAwayTimeout?: number | null;
111
+ /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.mode instead. */
112
+ allowInterruptions?: boolean;
113
+ /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */
114
+ discardAudioIfUninterruptible?: boolean;
115
+ /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.minDuration instead. */
116
+ minInterruptionDuration?: number;
117
+ /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.minWords instead. */
118
+ minInterruptionWords?: number;
119
+ /** @deprecated Use {@link AgentSessionOptions.turnHandling}.endpointing.minDelay instead. */
120
+ minEndpointingDelay?: number;
121
+ /** @deprecated Use {@link AgentSessionOptions.turnHandling}.endpointing.maxDelay instead. */
122
+ maxEndpointingDelay?: number;
123
+ };
152
124
 
153
125
  export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
154
126
 
@@ -162,7 +134,7 @@ export type AgentSessionCallbacks = {
162
134
  [AgentSessionEventTypes.SpeechCreated]: (ev: SpeechCreatedEvent) => void;
163
135
  [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void;
164
136
  [AgentSessionEventTypes.Close]: (ev: CloseEvent) => void;
165
- [AgentSessionEventTypes.UserOverlappingSpeech]: (ev: OverlappingSpeechEvent) => void;
137
+ [AgentSessionEventTypes.OverlappingSpeech]: (ev: OverlappingSpeechEvent) => void;
166
138
  };
167
139
 
168
140
  export type AgentSessionOptions<UserData = UnknownUserData> = {
@@ -171,13 +143,44 @@ export type AgentSessionOptions<UserData = UnknownUserData> = {
171
143
  llm?: LLM | RealtimeModel | LLMModels;
172
144
  tts?: TTS | TTSModelString;
173
145
  userData?: UserData;
174
- options?: Partial<SessionOptions>;
175
146
  connOptions?: SessionConnectOptions;
176
147
 
177
- /** @deprecated use {@link AgentSessionOptions.options}.turnHandling.turnDetection instead */
148
+ /** @deprecated use turnHandling.turnDetection instead */
178
149
  turnDetection?: TurnDetectionMode;
179
- /** @deprecated use {@link AgentSessionOptions.options} instead */
150
+ /** @deprecated use the top-level {@link AgentSessionOptions} fields instead */
180
151
  voiceOptions?: Partial<VoiceOptions>;
152
+
153
+ maxToolSteps?: number;
154
+ /**
155
+ * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected.
156
+ * When `true`, the agent sends inference calls as soon as a user transcript is received rather
157
+ * than waiting for a definitive turn boundary. This can reduce response latency by overlapping
158
+ * model inference with user audio, but may incur extra compute if the user interrupts or
159
+ * revises mid-utterance.
160
+ * @defaultValue true
161
+ */
162
+ preemptiveGeneration?: boolean;
163
+
164
+ /**
165
+ * If set, set the user state as "away" after this amount of time after user and agent are
166
+ * silent. Set to `null` to disable.
167
+ * @defaultValue 15.0
168
+ */
169
+ userAwayTimeout?: number | null;
170
+
171
+ /**
172
+ * Duration in milliseconds for AEC (Acoustic Echo Cancellation) warmup, during which
173
+ * interruptions from audio activity are suppressed. Set to `null` to disable.
174
+ * @defaultValue 3000
175
+ */
176
+ aecWarmupDuration?: number | null;
177
+
178
+ /**
179
+ * Configuration for turn handling.
180
+ */
181
+ turnHandling?: Partial<TurnHandlingOptions>;
182
+
183
+ useTtsAlignedTranscript?: boolean;
181
184
  };
182
185
 
183
186
  type ActivityTransitionOptions = {
@@ -196,7 +199,11 @@ export class AgentSession<
196
199
  tts?: TTS;
197
200
  turnDetection?: TurnDetectionMode;
198
201
 
199
- readonly options: InternalSessionOptions;
202
+ /** @deprecated use {@link sessionOptions} instead */
203
+ readonly options: VoiceOptions;
204
+
205
+ readonly sessionOptions: InternalSessionOptions<UserData>;
206
+
200
207
  private readonly activityLock = new Mutex();
201
208
 
202
209
  private agent?: Agent;
@@ -204,7 +211,7 @@ export class AgentSession<
204
211
  private nextActivity?: AgentActivity;
205
212
  private updateActivityTask?: Task<void>;
206
213
  private started = false;
207
- private clientEventsHandler?: ClientEventsHandler;
214
+ private sessionHost?: SessionHost;
208
215
 
209
216
  private _chatCtx: ChatContext;
210
217
  private _userData: UserData | undefined;
@@ -225,14 +232,14 @@ export class AgentSession<
225
232
  // Unrecoverable error counts, reset after agent speaking
226
233
  private llmErrorCounts = 0;
227
234
  private ttsErrorCounts = 0;
228
- private interruptionDetectionErrorCounts = 0;
229
235
 
230
236
  private sessionSpan?: Span;
231
237
  private agentSpeakingSpan?: Span;
232
238
 
233
239
  private _interruptionDetection?: InterruptionOptions['mode'];
234
240
 
235
- private _usageCollector: ModelUsageCollector = new ModelUsageCollector();
241
+ /** @internal */
242
+ _usageCollector: ModelUsageCollector = new ModelUsageCollector();
236
243
 
237
244
  /** @internal */
238
245
  _roomIO?: RoomIO;
@@ -266,9 +273,10 @@ export class AgentSession<
266
273
  constructor(options: AgentSessionOptions<UserData>) {
267
274
  super();
268
275
 
269
- const opts = migrateLegacyOptions<UserData>(options);
276
+ const { agentSessionOptions: opts, legacyVoiceOptions } =
277
+ migrateLegacyOptions<UserData>(options);
270
278
 
271
- const { vad, stt, llm, tts, userData, connOptions, options: sessionOptions } = opts;
279
+ const { vad, stt, llm, tts, userData, connOptions, ...resolvedSessionOptions } = opts;
272
280
  // Merge user-provided connOptions with defaults
273
281
  this._connOptions = {
274
282
  sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions },
@@ -299,8 +307,8 @@ export class AgentSession<
299
307
  this.tts = tts;
300
308
  }
301
309
 
302
- this.turnDetection = sessionOptions?.turnHandling?.turnDetection;
303
- this._interruptionDetection = sessionOptions?.turnHandling?.interruption?.mode;
310
+ this.turnDetection = resolvedSessionOptions.turnHandling.turnDetection;
311
+ this._interruptionDetection = resolvedSessionOptions.turnHandling.interruption?.mode;
304
312
  this._userData = userData;
305
313
 
306
314
  // configurable IO
@@ -309,8 +317,9 @@ export class AgentSession<
309
317
 
310
318
  // This is the "global" chat context, it holds the entire conversation history
311
319
  this._chatCtx = ChatContext.empty();
312
- this.options = opts.options;
313
- this._aecWarmupRemaining = this.options.aecWarmupDuration ?? 0;
320
+ this.sessionOptions = resolvedSessionOptions;
321
+ this.options = legacyVoiceOptions;
322
+ this._aecWarmupRemaining = this.sessionOptions.aecWarmupDuration ?? 0;
314
323
 
315
324
  this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this);
316
325
  this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed);
@@ -322,9 +331,6 @@ export class AgentSession<
322
331
  ): boolean {
323
332
  const eventData = args[0] as AgentEvent;
324
333
  this._recordedEvents.push(eventData);
325
- if (event === AgentSessionEventTypes.MetricsCollected) {
326
- this._usageCollector.collect((eventData as MetricsCollectedEvent).metrics);
327
- }
328
334
  return super.emit(event, ...args);
329
335
  }
330
336
 
@@ -366,7 +372,7 @@ export class AgentSession<
366
372
  }
367
373
 
368
374
  get useTtsAlignedTranscript(): boolean {
369
- return this.options.useTtsAlignedTranscript;
375
+ return this.sessionOptions.useTtsAlignedTranscript;
370
376
  }
371
377
 
372
378
  set userData(value: UserData) {
@@ -422,9 +428,11 @@ export class AgentSession<
422
428
 
423
429
  this._roomIO.start();
424
430
 
425
- this.clientEventsHandler = new ClientEventsHandler(this, this._roomIO);
431
+ const transport = new RoomSessionTransport(room, this._roomIO);
432
+ this.sessionHost = new SessionHost(transport);
433
+ this.sessionHost.registerSession(this);
426
434
  if (inputOptions?.textEnabled !== false) {
427
- this.clientEventsHandler.registerTextInput(
435
+ this.sessionHost.registerTextInput(
428
436
  inputOptions?.textInputCallback ?? DEFAULT_TEXT_INPUT_CALLBACK,
429
437
  );
430
438
  }
@@ -470,8 +478,8 @@ export class AgentSession<
470
478
 
471
479
  await Promise.allSettled(tasks);
472
480
 
473
- if (this.clientEventsHandler) {
474
- await this.clientEventsHandler.start();
481
+ if (this.sessionHost) {
482
+ await this.sessionHost.start();
475
483
  }
476
484
 
477
485
  // Log used IO configuration
@@ -877,7 +885,9 @@ export class AgentSession<
877
885
  if (this.closingTask) {
878
886
  return;
879
887
  }
880
- this.closeImpl(reason, error, drain);
888
+ this.closingTask = this.closeImpl(reason, error, drain).finally(() => {
889
+ this.closingTask = null;
890
+ });
881
891
  }
882
892
 
883
893
  /** @internal */
@@ -900,13 +910,11 @@ export class AgentSession<
900
910
  return;
901
911
  }
902
912
  } else if (error.type === 'interruption_detection_error') {
903
- this.interruptionDetectionErrorCounts += 1;
904
- if (this.interruptionDetectionErrorCounts <= this._connOptions.maxUnrecoverableErrors) {
905
- return;
906
- }
913
+ this.logger.error(error.toString());
914
+ return;
907
915
  }
908
916
 
909
- this.logger.error(error, 'AgentSession is closing due to unrecoverable error');
917
+ this.logger.error(error, 'AgentSession is closing due to an unrecoverable error');
910
918
 
911
919
  this.closingTask = (async () => {
912
920
  await this.closeImpl(CloseReason.ERROR, error);
@@ -935,7 +943,6 @@ export class AgentSession<
935
943
  if (state === 'speaking') {
936
944
  this.llmErrorCounts = 0;
937
945
  this.ttsErrorCounts = 0;
938
- this.interruptionDetectionErrorCounts = 0;
939
946
 
940
947
  if (this.agentSpeakingSpan === undefined) {
941
948
  this.agentSpeakingSpan = tracer.startSpan({
@@ -980,7 +987,10 @@ export class AgentSession<
980
987
  }
981
988
 
982
989
  /** @internal */
983
- _updateUserState(state: UserState, lastSpeakingTime?: number) {
990
+ _updateUserState(
991
+ state: UserState,
992
+ options?: { lastSpeakingTime?: number; otelContext?: Context },
993
+ ) {
984
994
  if (this._userState === state) {
985
995
  return;
986
996
  }
@@ -988,8 +998,8 @@ export class AgentSession<
988
998
  if (state === 'speaking' && this._userSpeakingSpan === undefined) {
989
999
  this._userSpeakingSpan = tracer.startSpan({
990
1000
  name: 'user_speaking',
991
- context: this.rootSpanContext,
992
- startTime: lastSpeakingTime,
1001
+ context: options?.otelContext ?? this.rootSpanContext,
1002
+ startTime: options?.lastSpeakingTime,
993
1003
  });
994
1004
 
995
1005
  const linked = this._roomIO?.linkedParticipant;
@@ -997,7 +1007,7 @@ export class AgentSession<
997
1007
  setParticipantSpanAttributes(this._userSpeakingSpan, linked);
998
1008
  }
999
1009
  } else if (this._userSpeakingSpan !== undefined) {
1000
- this._userSpeakingSpan.end(lastSpeakingTime);
1010
+ this._userSpeakingSpan.end(options?.lastSpeakingTime);
1001
1011
  this._userSpeakingSpan = undefined;
1002
1012
  }
1003
1013
 
@@ -1035,7 +1045,10 @@ export class AgentSession<
1035
1045
  private _setUserAwayTimer(): void {
1036
1046
  this._cancelUserAwayTimer();
1037
1047
 
1038
- if (this.options.userAwayTimeout === null || this.options.userAwayTimeout === undefined) {
1048
+ if (
1049
+ this.sessionOptions.userAwayTimeout === null ||
1050
+ this.sessionOptions.userAwayTimeout === undefined
1051
+ ) {
1039
1052
  return;
1040
1053
  }
1041
1054
 
@@ -1046,7 +1059,7 @@ export class AgentSession<
1046
1059
  this.userAwayTimer = setTimeout(() => {
1047
1060
  this.logger.debug('User away timeout triggered');
1048
1061
  this._updateUserState('away');
1049
- }, this.options.userAwayTimeout * 1000);
1062
+ }, this.sessionOptions.userAwayTimeout * 1000);
1050
1063
  }
1051
1064
 
1052
1065
  private _cancelUserAwayTimer(): void {
@@ -1120,7 +1133,6 @@ export class AgentSession<
1120
1133
  try {
1121
1134
  await this.activity.interrupt({ force: true }).await;
1122
1135
  } catch (error) {
1123
- // Uninterruptible speech can throw during forced interruption.
1124
1136
  this.logger.warn({ error }, 'Error interrupting activity');
1125
1137
  }
1126
1138
  }
@@ -1150,8 +1162,8 @@ export class AgentSession<
1150
1162
  this.output.audio = null;
1151
1163
  this.output.transcription = null;
1152
1164
 
1153
- await this.clientEventsHandler?.close();
1154
- this.clientEventsHandler = undefined;
1165
+ await this.sessionHost?.close();
1166
+ this.sessionHost = undefined;
1155
1167
 
1156
1168
  await this._roomIO?.close();
1157
1169
  this._roomIO = undefined;
@@ -1183,7 +1195,6 @@ export class AgentSession<
1183
1195
  this.rootSpanContext = undefined;
1184
1196
  this.llmErrorCounts = 0;
1185
1197
  this.ttsErrorCounts = 0;
1186
- this.interruptionDetectionErrorCounts = 0;
1187
1198
 
1188
1199
  this.logger.info({ reason, error }, 'AgentSession closed');
1189
1200
  }
@@ -12,6 +12,8 @@ import {
12
12
  } from '@opentelemetry/api';
13
13
  import type { WritableStreamDefaultWriter } from 'node:stream/web';
14
14
  import { ReadableStream } from 'node:stream/web';
15
+ import { isAPIError } from '../_exceptions.js';
16
+ import { apiConnectDefaults, intervalForRetry } from '../inference/interruption/defaults.js';
15
17
  import { InterruptionDetectionError } from '../inference/interruption/errors.js';
16
18
  import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js';
17
19
  import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js';
@@ -19,6 +21,7 @@ import {
19
21
  type InterruptionSentinel,
20
22
  type OverlappingSpeechEvent,
21
23
  } from '../inference/interruption/types.js';
24
+ import type { LanguageCode } from '../language.js';
22
25
  import { type ChatContext } from '../llm/chat_context.js';
23
26
  import { log } from '../log.js';
24
27
  import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
@@ -71,8 +74,8 @@ export interface _TurnDetector {
71
74
  readonly model: string;
72
75
  /** The provider name for this turn detector. */
73
76
  readonly provider: string;
74
- unlikelyThreshold: (language?: string) => Promise<number | undefined>;
75
- supportsLanguage: (language?: string) => Promise<boolean>;
77
+ unlikelyThreshold: (language?: LanguageCode) => Promise<number | undefined>;
78
+ supportsLanguage: (language?: LanguageCode) => Promise<boolean>;
76
79
  predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise<number>;
77
80
  }
78
81
 
@@ -121,7 +124,7 @@ export class AudioRecognition {
121
124
  private turnDetectionMode?: TurnDetectionMode;
122
125
  private minEndpointingDelay: number;
123
126
  private maxEndpointingDelay: number;
124
- private lastLanguage?: string;
127
+ private lastLanguage?: LanguageCode;
125
128
  private rootSpanContext?: Context;
126
129
  private sttModel?: string;
127
130
  private sttProvider?: string;
@@ -249,6 +252,15 @@ export class AudioRecognition {
249
252
  await this.interruptionTask?.cancelAndWait();
250
253
  }
251
254
 
255
+ async disableInterruptionDetection(): Promise<void> {
256
+ this.isInterruptionEnabled = false;
257
+ this.interruptionDetection = undefined;
258
+ await this.interruptionTask?.cancelAndWait();
259
+ this.interruptionTask = undefined;
260
+ await this.interruptionStreamChannel?.close();
261
+ this.interruptionStreamChannel = undefined;
262
+ }
263
+
252
264
  async onStartOfAgentSpeech() {
253
265
  this.isAgentSpeaking = true;
254
266
  return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted());
@@ -1000,77 +1012,128 @@ export class AudioRecognition {
1000
1012
  ) {
1001
1013
  if (!interruptionDetection || !this.interruptionStreamChannel) return;
1002
1014
 
1003
- const stream = interruptionDetection.createStream();
1004
- const inputReader = this.interruptionStreamChannel.stream().getReader();
1015
+ let numRetries = 0;
1016
+ const maxRetries = apiConnectDefaults.maxRetries;
1005
1017
 
1006
- const cleanup = async () => {
1007
- try {
1008
- signal.removeEventListener('abort', abortHandler);
1009
- eventReader.releaseLock();
1010
- await stream.close();
1011
- } catch (e) {
1012
- this.logger.debug('createInterruptionTask: error during abort handler:', e);
1013
- }
1014
- };
1018
+ while (!signal.aborted) {
1019
+ const stream = interruptionDetection.createStream();
1020
+ const eventReader = stream.stream().getReader();
1021
+
1022
+ const cleanup = async () => {
1023
+ try {
1024
+ signal.removeEventListener('abort', cleanup);
1025
+ eventReader.releaseLock();
1026
+ await stream.close();
1027
+ } catch (e) {
1028
+ this.logger.debug('createInterruptionTask: error during cleanup:', e);
1029
+ }
1030
+ };
1031
+
1032
+ signal.addEventListener('abort', cleanup, { once: true });
1033
+
1034
+ let forwardTask: Promise<void> | undefined;
1015
1035
 
1016
- // Forward input frames/sentinels to the interruption stream
1017
- const forwardTask = (async () => {
1018
1036
  try {
1037
+ // Unlike Python where _agent_speech_started lives on `self` and survives retries,
1038
+ // JS creates a fresh InterruptionStreamBase per retry with agentSpeechStarted = false.
1039
+ // Re-inject the sentinel so the new stream knows the agent is mid-speech.
1040
+ if (numRetries > 0 && this.isAgentSpeaking) {
1041
+ await stream.pushFrame(InterruptionStreamSentinel.agentSpeechStarted());
1042
+ }
1043
+
1044
+ forwardTask = (async () => {
1045
+ const inputReader = this.interruptionStreamChannel!.stream().getReader();
1046
+ const abortPromise = waitForAbort(signal);
1047
+
1048
+ try {
1049
+ while (!signal.aborted) {
1050
+ const res = await Promise.race([inputReader.read(), abortPromise]);
1051
+ if (!res) break;
1052
+
1053
+ const { value, done } = res;
1054
+ if (done) break;
1055
+
1056
+ if (value instanceof AudioFrame) {
1057
+ const frameDurationMs = (value.samplesPerChannel / value.sampleRate) * 1000;
1058
+ this._inputStartedAt ??= Date.now() - frameDurationMs;
1059
+ } else {
1060
+ this._inputStartedAt ??= Date.now();
1061
+ }
1062
+
1063
+ await stream.pushFrame(value);
1064
+ }
1065
+ } finally {
1066
+ inputReader.releaseLock();
1067
+ }
1068
+ })();
1069
+
1019
1070
  const abortPromise = waitForAbort(signal);
1071
+
1020
1072
  while (!signal.aborted) {
1021
- const res = await Promise.race([inputReader.read(), abortPromise]);
1073
+ const res = await Promise.race([eventReader.read(), abortPromise]);
1022
1074
  if (!res) break;
1023
- const { value, done } = res;
1075
+ const { done, value: ev } = res;
1024
1076
  if (done) break;
1025
- // Backdate to the actual start of the audio frame, not when it was received.
1026
- if (value instanceof AudioFrame) {
1027
- const frameDurationMs = (value.samplesPerChannel / value.sampleRate) * 1000;
1028
- this._inputStartedAt ??= Date.now() - frameDurationMs;
1077
+ this.onOverlapSpeechEvent(ev);
1078
+ }
1079
+ break;
1080
+ } catch (e) {
1081
+ if (signal.aborted) break;
1082
+
1083
+ if (isAPIError(e)) {
1084
+ if (maxRetries === 0 || !e.retryable) {
1085
+ interruptionDetection.emitError(
1086
+ new InterruptionDetectionError(
1087
+ e.message,
1088
+ Date.now(),
1089
+ interruptionDetection.label,
1090
+ false,
1091
+ ),
1092
+ );
1093
+ break;
1094
+ } else if (numRetries >= maxRetries) {
1095
+ interruptionDetection.emitError(
1096
+ new InterruptionDetectionError(
1097
+ `failed to detect interruption after ${numRetries} attempts`,
1098
+ Date.now(),
1099
+ interruptionDetection.label,
1100
+ false,
1101
+ ),
1102
+ );
1103
+ break;
1029
1104
  } else {
1030
- this._inputStartedAt ??= Date.now();
1105
+ const retryInterval = intervalForRetry(numRetries);
1106
+ interruptionDetection.emitError(
1107
+ new InterruptionDetectionError(
1108
+ e.message,
1109
+ Date.now(),
1110
+ interruptionDetection.label,
1111
+ true,
1112
+ ),
1113
+ );
1114
+ this.logger.warn(
1115
+ { model: interruptionDetection.label, attempt: numRetries },
1116
+ `failed to detect interruption, retrying in ${retryInterval}ms`,
1117
+ );
1118
+ numRetries++;
1119
+ await delay(retryInterval, { signal });
1031
1120
  }
1032
- await stream.pushFrame(value);
1121
+ } else {
1122
+ const msg = e instanceof Error ? e.message : String(e);
1123
+ interruptionDetection.emitError(
1124
+ new InterruptionDetectionError(msg, Date.now(), interruptionDetection.label, false),
1125
+ );
1126
+ this.logger.error(e, 'Error in interruption task');
1127
+ break;
1033
1128
  }
1034
1129
  } finally {
1035
- inputReader.releaseLock();
1036
- }
1037
- })();
1038
-
1039
- // Read output events from the interruption stream
1040
- const eventReader = stream.stream().getReader();
1041
- const abortHandler = async () => {
1042
- await cleanup();
1043
- };
1044
- signal.addEventListener('abort', abortHandler);
1045
-
1046
- try {
1047
- const abortPromise = waitForAbort(signal);
1048
-
1049
- while (!signal.aborted) {
1050
- const res = await Promise.race([eventReader.read(), abortPromise]);
1051
- if (!res) break;
1052
- const { done, value: ev } = res;
1053
- if (done) break;
1054
- this.onOverlapSpeechEvent(ev);
1055
- }
1056
- } catch (e) {
1057
- if (!signal.aborted) {
1058
- const cause = e instanceof Error ? e : new Error(String(e));
1059
- interruptionDetection.emitError(
1060
- new InterruptionDetectionError(
1061
- cause.message,
1062
- Date.now(),
1063
- interruptionDetection.label,
1064
- false,
1065
- ),
1066
- );
1067
- this.logger.error(e, 'Error in interruption task');
1130
+ await cleanup();
1131
+ await forwardTask?.catch((e) => {
1132
+ this.logger.debug({ err: e }, 'interruption task exited with error');
1133
+ });
1068
1134
  }
1069
- } finally {
1070
- await cleanup();
1071
- await forwardTask;
1072
- this.logger.debug('Interruption task closed');
1073
1135
  }
1136
+ this.logger.debug('Interruption task closed');
1074
1137
  }
1075
1138
 
1076
1139
  setInputAudioStream(audioStream: ReadableStream<AudioFrame>) {