@livekit/agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/stream/index.cjs +3 -0
  56. package/dist/stream/index.cjs.map +1 -1
  57. package/dist/stream/index.d.cts +1 -0
  58. package/dist/stream/index.d.ts +1 -0
  59. package/dist/stream/index.d.ts.map +1 -1
  60. package/dist/stream/index.js +2 -0
  61. package/dist/stream/index.js.map +1 -1
  62. package/dist/stream/multi_input_stream.cjs +139 -0
  63. package/dist/stream/multi_input_stream.cjs.map +1 -0
  64. package/dist/stream/multi_input_stream.d.cts +55 -0
  65. package/dist/stream/multi_input_stream.d.ts +55 -0
  66. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  67. package/dist/stream/multi_input_stream.js +115 -0
  68. package/dist/stream/multi_input_stream.js.map +1 -0
  69. package/dist/stream/multi_input_stream.test.cjs +340 -0
  70. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  71. package/dist/stream/multi_input_stream.test.js +339 -0
  72. package/dist/stream/multi_input_stream.test.js.map +1 -0
  73. package/dist/telemetry/trace_types.cjs +42 -0
  74. package/dist/telemetry/trace_types.cjs.map +1 -1
  75. package/dist/telemetry/trace_types.d.cts +14 -0
  76. package/dist/telemetry/trace_types.d.ts +14 -0
  77. package/dist/telemetry/trace_types.d.ts.map +1 -1
  78. package/dist/telemetry/trace_types.js +28 -0
  79. package/dist/telemetry/trace_types.js.map +1 -1
  80. package/dist/utils.cjs +44 -2
  81. package/dist/utils.cjs.map +1 -1
  82. package/dist/utils.d.cts +8 -0
  83. package/dist/utils.d.ts +8 -0
  84. package/dist/utils.d.ts.map +1 -1
  85. package/dist/utils.js +44 -2
  86. package/dist/utils.js.map +1 -1
  87. package/dist/utils.test.cjs +71 -0
  88. package/dist/utils.test.cjs.map +1 -1
  89. package/dist/utils.test.js +71 -0
  90. package/dist/utils.test.js.map +1 -1
  91. package/dist/version.cjs +1 -1
  92. package/dist/version.cjs.map +1 -1
  93. package/dist/version.d.cts +1 -1
  94. package/dist/version.d.ts +1 -1
  95. package/dist/version.d.ts.map +1 -1
  96. package/dist/version.js +1 -1
  97. package/dist/version.js.map +1 -1
  98. package/dist/voice/agent.cjs +144 -12
  99. package/dist/voice/agent.cjs.map +1 -1
  100. package/dist/voice/agent.d.cts +29 -4
  101. package/dist/voice/agent.d.ts +29 -4
  102. package/dist/voice/agent.d.ts.map +1 -1
  103. package/dist/voice/agent.js +140 -11
  104. package/dist/voice/agent.js.map +1 -1
  105. package/dist/voice/agent.test.cjs +120 -0
  106. package/dist/voice/agent.test.cjs.map +1 -1
  107. package/dist/voice/agent.test.js +122 -2
  108. package/dist/voice/agent.test.js.map +1 -1
  109. package/dist/voice/agent_activity.cjs +402 -292
  110. package/dist/voice/agent_activity.cjs.map +1 -1
  111. package/dist/voice/agent_activity.d.cts +35 -7
  112. package/dist/voice/agent_activity.d.ts +35 -7
  113. package/dist/voice/agent_activity.d.ts.map +1 -1
  114. package/dist/voice/agent_activity.js +402 -287
  115. package/dist/voice/agent_activity.js.map +1 -1
  116. package/dist/voice/agent_session.cjs +156 -44
  117. package/dist/voice/agent_session.cjs.map +1 -1
  118. package/dist/voice/agent_session.d.cts +22 -9
  119. package/dist/voice/agent_session.d.ts +22 -9
  120. package/dist/voice/agent_session.d.ts.map +1 -1
  121. package/dist/voice/agent_session.js +156 -44
  122. package/dist/voice/agent_session.js.map +1 -1
  123. package/dist/voice/audio_recognition.cjs +89 -36
  124. package/dist/voice/audio_recognition.cjs.map +1 -1
  125. package/dist/voice/audio_recognition.d.cts +22 -1
  126. package/dist/voice/audio_recognition.d.ts +22 -1
  127. package/dist/voice/audio_recognition.d.ts.map +1 -1
  128. package/dist/voice/audio_recognition.js +93 -36
  129. package/dist/voice/audio_recognition.js.map +1 -1
  130. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  131. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  132. package/dist/voice/audio_recognition_span.test.js +232 -0
  133. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  134. package/dist/voice/generation.cjs +39 -19
  135. package/dist/voice/generation.cjs.map +1 -1
  136. package/dist/voice/generation.d.ts.map +1 -1
  137. package/dist/voice/generation.js +44 -20
  138. package/dist/voice/generation.js.map +1 -1
  139. package/dist/voice/index.cjs +2 -0
  140. package/dist/voice/index.cjs.map +1 -1
  141. package/dist/voice/index.d.cts +1 -1
  142. package/dist/voice/index.d.ts +1 -1
  143. package/dist/voice/index.d.ts.map +1 -1
  144. package/dist/voice/index.js +2 -1
  145. package/dist/voice/index.js.map +1 -1
  146. package/dist/voice/io.cjs +6 -3
  147. package/dist/voice/io.cjs.map +1 -1
  148. package/dist/voice/io.d.cts +3 -2
  149. package/dist/voice/io.d.ts +3 -2
  150. package/dist/voice/io.d.ts.map +1 -1
  151. package/dist/voice/io.js +6 -3
  152. package/dist/voice/io.js.map +1 -1
  153. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  154. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  155. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  156. package/dist/voice/recorder_io/recorder_io.js +3 -1
  157. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  158. package/dist/voice/room_io/_input.cjs +17 -17
  159. package/dist/voice/room_io/_input.cjs.map +1 -1
  160. package/dist/voice/room_io/_input.d.cts +2 -2
  161. package/dist/voice/room_io/_input.d.ts +2 -2
  162. package/dist/voice/room_io/_input.d.ts.map +1 -1
  163. package/dist/voice/room_io/_input.js +7 -6
  164. package/dist/voice/room_io/_input.js.map +1 -1
  165. package/dist/voice/room_io/room_io.cjs +9 -0
  166. package/dist/voice/room_io/room_io.cjs.map +1 -1
  167. package/dist/voice/room_io/room_io.d.cts +3 -1
  168. package/dist/voice/room_io/room_io.d.ts +3 -1
  169. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  170. package/dist/voice/room_io/room_io.js +9 -0
  171. package/dist/voice/room_io/room_io.js.map +1 -1
  172. package/dist/voice/speech_handle.cjs +7 -1
  173. package/dist/voice/speech_handle.cjs.map +1 -1
  174. package/dist/voice/speech_handle.d.cts +2 -0
  175. package/dist/voice/speech_handle.d.ts +2 -0
  176. package/dist/voice/speech_handle.d.ts.map +1 -1
  177. package/dist/voice/speech_handle.js +8 -2
  178. package/dist/voice/speech_handle.js.map +1 -1
  179. package/dist/voice/testing/run_result.cjs +66 -15
  180. package/dist/voice/testing/run_result.cjs.map +1 -1
  181. package/dist/voice/testing/run_result.d.cts +14 -3
  182. package/dist/voice/testing/run_result.d.ts +14 -3
  183. package/dist/voice/testing/run_result.d.ts.map +1 -1
  184. package/dist/voice/testing/run_result.js +66 -15
  185. package/dist/voice/testing/run_result.js.map +1 -1
  186. package/dist/voice/utils.cjs +47 -0
  187. package/dist/voice/utils.cjs.map +1 -0
  188. package/dist/voice/utils.d.cts +4 -0
  189. package/dist/voice/utils.d.ts +4 -0
  190. package/dist/voice/utils.d.ts.map +1 -0
  191. package/dist/voice/utils.js +23 -0
  192. package/dist/voice/utils.js.map +1 -0
  193. package/package.json +1 -1
  194. package/src/cli.ts +20 -33
  195. package/src/ipc/job_proc_lazy_main.ts +16 -5
  196. package/src/llm/chat_context.ts +35 -0
  197. package/src/llm/provider_format/index.ts +7 -2
  198. package/src/llm/provider_format/openai.test.ts +385 -1
  199. package/src/llm/provider_format/openai.ts +103 -0
  200. package/src/llm/provider_format/utils.ts +6 -4
  201. package/src/llm/realtime.ts +1 -0
  202. package/src/log.ts +5 -2
  203. package/src/stream/deferred_stream.ts +17 -6
  204. package/src/stream/index.ts +1 -0
  205. package/src/stream/multi_input_stream.test.ts +540 -0
  206. package/src/stream/multi_input_stream.ts +172 -0
  207. package/src/telemetry/trace_types.ts +18 -0
  208. package/src/utils.test.ts +87 -0
  209. package/src/utils.ts +52 -2
  210. package/src/version.ts +1 -1
  211. package/src/voice/agent.test.ts +140 -2
  212. package/src/voice/agent.ts +189 -10
  213. package/src/voice/agent_activity.ts +449 -286
  214. package/src/voice/agent_session.ts +195 -51
  215. package/src/voice/audio_recognition.ts +118 -38
  216. package/src/voice/audio_recognition_span.test.ts +261 -0
  217. package/src/voice/generation.ts +52 -23
  218. package/src/voice/index.ts +1 -1
  219. package/src/voice/io.ts +7 -4
  220. package/src/voice/recorder_io/recorder_io.ts +2 -1
  221. package/src/voice/room_io/_input.ts +11 -7
  222. package/src/voice/room_io/room_io.ts +12 -0
  223. package/src/voice/speech_handle.ts +9 -2
  224. package/src/voice/testing/run_result.ts +81 -23
  225. package/src/voice/utils.ts +29 -0
@@ -10,14 +10,20 @@ import {
10
10
  } from "../llm/index.js";
11
11
  import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
12
12
  import { log } from "../log.js";
13
- import { DeferredReadableStream } from "../stream/deferred_stream.js";
13
+ import { MultiInputStream } from "../stream/multi_input_stream.js";
14
14
  import { STT } from "../stt/stt.js";
15
15
  import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
16
16
  import { splitWords } from "../tokenize/basic/word.js";
17
17
  import { TTS } from "../tts/tts.js";
18
18
  import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
19
19
  import { VAD } from "../vad.js";
20
- import { StopResponse, asyncLocalStorage } from "./agent.js";
20
+ import {
21
+ StopResponse,
22
+ _getActivityTaskInfo,
23
+ _setActivityTaskInfo,
24
+ functionCallStorage,
25
+ speechHandleStorage
26
+ } from "./agent.js";
21
27
  import {} from "./agent_session.js";
22
28
  import {
23
29
  AudioRecognition
@@ -40,8 +46,11 @@ import {
40
46
  updateInstructions
41
47
  } from "./generation.js";
42
48
  import { SpeechHandle } from "./speech_handle.js";
43
- const speechHandleStorage = new AsyncLocalStorage();
49
+ import { setParticipantSpanAttributes } from "./utils.js";
50
+ const agentActivityStorage = new AsyncLocalStorage();
44
51
  class AgentActivity {
52
+ agent;
53
+ agentSession;
45
54
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
46
55
  started = false;
47
56
  audioRecognition;
@@ -50,22 +59,29 @@ class AgentActivity {
50
59
  // Maps response_id to OTEL span for metrics recording
51
60
  turnDetectionMode;
52
61
  logger = log();
53
- _draining = false;
62
+ _schedulingPaused = true;
63
+ _drainBlockedTasks = [];
54
64
  _currentSpeech;
55
65
  speechQueue;
56
66
  // [priority, timestamp, speechHandle]
57
67
  q_updated;
58
68
  speechTasks = /* @__PURE__ */ new Set();
59
69
  lock = new Mutex();
60
- audioStream = new DeferredReadableStream();
70
+ audioStream = new MultiInputStream();
71
+ audioStreamId;
61
72
  // default to null as None, which maps to the default provider tool choice value
62
73
  toolChoice = null;
63
74
  _preemptiveGeneration;
64
- agent;
65
- agentSession;
66
75
  /** @internal */
67
76
  _mainTask;
77
+ _onEnterTask;
78
+ _onExitTask;
68
79
  _userTurnCompletedTask;
80
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
81
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
82
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
83
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
84
+ onModelError = (ev) => this.onError(ev);
69
85
  constructor(agent, agentSession) {
70
86
  this.agent = agent;
71
87
  this.agentSession = agentSession;
@@ -76,7 +92,7 @@ class AgentActivity {
76
92
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
77
93
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
78
94
  this.logger.warn(
79
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
95
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
80
96
  );
81
97
  this.turnDetectionMode = void 0;
82
98
  }
@@ -128,98 +144,119 @@ class AgentActivity {
128
144
  async start() {
129
145
  const unlock = await this.lock.lock();
130
146
  try {
131
- const startSpan = tracer.startSpan({
132
- name: "start_agent_activity",
133
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
134
- context: ROOT_CONTEXT
135
- });
136
- this.agent._agentActivity = this;
137
- if (this.llm instanceof RealtimeModel) {
138
- this.realtimeSession = this.llm.session();
139
- this.realtimeSpans = /* @__PURE__ */ new Map();
140
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
141
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
142
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
143
- this.realtimeSession.on(
144
- "input_audio_transcription_completed",
145
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
146
- );
147
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
148
- this.realtimeSession.on("error", (ev) => this.onError(ev));
149
- removeInstructions(this.agent._chatCtx);
150
- try {
151
- await this.realtimeSession.updateInstructions(this.agent.instructions);
152
- } catch (error) {
153
- this.logger.error(error, "failed to update the instructions");
154
- }
155
- try {
156
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
157
- } catch (error) {
158
- this.logger.error(error, "failed to update the chat context");
159
- }
160
- try {
161
- await this.realtimeSession.updateTools(this.tools);
162
- } catch (error) {
163
- this.logger.error(error, "failed to update the tools");
164
- }
165
- if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
166
- this.logger.error(
167
- "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
168
- );
169
- }
170
- } else if (this.llm instanceof LLM) {
171
- try {
172
- updateInstructions({
173
- chatCtx: this.agent._chatCtx,
174
- instructions: this.agent.instructions,
175
- addIfMissing: true
176
- });
177
- } catch (error) {
178
- this.logger.error("failed to update the instructions", error);
179
- }
147
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
148
+ } finally {
149
+ unlock();
150
+ }
151
+ }
152
+ async resume() {
153
+ const unlock = await this.lock.lock();
154
+ try {
155
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
156
+ } finally {
157
+ unlock();
158
+ }
159
+ }
160
+ async _startSession(options) {
161
+ var _a;
162
+ const { spanName, runOnEnter } = options;
163
+ const startSpan = tracer.startSpan({
164
+ name: spanName,
165
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
166
+ context: ROOT_CONTEXT
167
+ });
168
+ this.agent._agentActivity = this;
169
+ if (this.llm instanceof RealtimeModel) {
170
+ this.realtimeSession = this.llm.session();
171
+ this.realtimeSpans = /* @__PURE__ */ new Map();
172
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
173
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
174
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
175
+ this.realtimeSession.on(
176
+ "input_audio_transcription_completed",
177
+ this.onRealtimeInputAudioTranscriptionCompleted
178
+ );
179
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
180
+ this.realtimeSession.on("error", this.onModelError);
181
+ removeInstructions(this.agent._chatCtx);
182
+ try {
183
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
184
+ } catch (error) {
185
+ this.logger.error(error, "failed to update the instructions");
180
186
  }
181
- if (this.llm instanceof LLM) {
182
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
183
- this.llm.on("error", (ev) => this.onError(ev));
187
+ try {
188
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
189
+ } catch (error) {
190
+ this.logger.error(error, "failed to update the chat context");
184
191
  }
185
- if (this.stt instanceof STT) {
186
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
187
- this.stt.on("error", (ev) => this.onError(ev));
192
+ try {
193
+ await this.realtimeSession.updateTools(this.tools);
194
+ } catch (error) {
195
+ this.logger.error(error, "failed to update the tools");
188
196
  }
189
- if (this.tts instanceof TTS) {
190
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
191
- this.tts.on("error", (ev) => this.onError(ev));
197
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
198
+ this.logger.error(
199
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
200
+ );
192
201
  }
193
- if (this.vad instanceof VAD) {
194
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
202
+ } else if (this.llm instanceof LLM) {
203
+ try {
204
+ updateInstructions({
205
+ chatCtx: this.agent._chatCtx,
206
+ instructions: this.agent.instructions,
207
+ addIfMissing: true
208
+ });
209
+ } catch (error) {
210
+ this.logger.error("failed to update the instructions", error);
195
211
  }
196
- this.audioRecognition = new AudioRecognition({
197
- recognitionHooks: this,
198
- // Disable stt node if stt is not provided
199
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
200
- vad: this.vad,
201
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
202
- turnDetectionMode: this.turnDetectionMode,
203
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
204
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
205
- rootSpanContext: this.agentSession.rootSpanContext
206
- });
207
- this.audioRecognition.start();
208
- this.started = true;
209
- this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
210
- const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
211
- name: "on_enter",
212
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
213
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
214
- });
215
- this.createSpeechTask({
216
- task: Task.from(() => onEnterTask),
212
+ }
213
+ if (this.llm instanceof LLM) {
214
+ this.llm.on("metrics_collected", this.onMetricsCollected);
215
+ this.llm.on("error", this.onModelError);
216
+ }
217
+ if (this.stt instanceof STT) {
218
+ this.stt.on("metrics_collected", this.onMetricsCollected);
219
+ this.stt.on("error", this.onModelError);
220
+ }
221
+ if (this.tts instanceof TTS) {
222
+ this.tts.on("metrics_collected", this.onMetricsCollected);
223
+ this.tts.on("error", this.onModelError);
224
+ }
225
+ if (this.vad instanceof VAD) {
226
+ this.vad.on("metrics_collected", this.onMetricsCollected);
227
+ }
228
+ this.audioRecognition = new AudioRecognition({
229
+ recognitionHooks: this,
230
+ // Disable stt node if stt is not provided
231
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
232
+ vad: this.vad,
233
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
234
+ turnDetectionMode: this.turnDetectionMode,
235
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
236
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
237
+ rootSpanContext: this.agentSession.rootSpanContext,
238
+ sttModel: (_a = this.stt) == null ? void 0 : _a.label,
239
+ sttProvider: this.getSttProvider(),
240
+ getLinkedParticipant: () => {
241
+ var _a2;
242
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
243
+ }
244
+ });
245
+ this.audioRecognition.start();
246
+ this.started = true;
247
+ this._resumeSchedulingTask();
248
+ if (runOnEnter) {
249
+ this._onEnterTask = this.createSpeechTask({
250
+ taskFn: () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
251
+ name: "on_enter",
252
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
253
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
254
+ }),
255
+ inlineTask: true,
217
256
  name: "AgentActivity_onEnter"
218
257
  });
219
- startSpan.end();
220
- } finally {
221
- unlock();
222
258
  }
259
+ startSpan.end();
223
260
  }
224
261
  get currentSpeech() {
225
262
  return this._currentSpeech;
@@ -230,6 +267,15 @@ class AgentActivity {
230
267
  get stt() {
231
268
  return this.agent.stt || this.agentSession.stt;
232
269
  }
270
+ getSttProvider() {
271
+ var _a;
272
+ const label = (_a = this.stt) == null ? void 0 : _a.label;
273
+ if (!label) {
274
+ return void 0;
275
+ }
276
+ const [provider] = label.split("-", 1);
277
+ return provider || label;
278
+ }
233
279
  get llm() {
234
280
  return this.agent.llm || this.agentSession.llm;
235
281
  }
@@ -239,8 +285,8 @@ class AgentActivity {
239
285
  get tools() {
240
286
  return this.agent.toolCtx;
241
287
  }
242
- get draining() {
243
- return this._draining;
288
+ get schedulingPaused() {
289
+ return this._schedulingPaused;
244
290
  }
245
291
  get realtimeLLMSession() {
246
292
  return this.realtimeSession;
@@ -280,11 +326,9 @@ class AgentActivity {
280
326
  }
281
327
  }
282
328
  attachAudioInput(audioStream) {
283
- if (this.audioStream.isSourceSet) {
284
- this.logger.debug("detaching existing audio input in agent activity");
285
- this.audioStream.detachSource();
286
- }
287
- this.audioStream.setSource(audioStream);
329
+ void this.audioStream.close();
330
+ this.audioStream = new MultiInputStream();
331
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
288
332
  const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
289
333
  if (this.realtimeSession) {
290
334
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -294,13 +338,21 @@ class AgentActivity {
294
338
  }
295
339
  }
296
340
  detachAudioInput() {
297
- this.audioStream.detachSource();
341
+ if (this.audioStreamId === void 0) {
342
+ return;
343
+ }
344
+ void this.audioStream.close();
345
+ this.audioStream = new MultiInputStream();
346
+ this.audioStreamId = void 0;
298
347
  }
299
- commitUserTurn() {
348
+ commitUserTurn(options = {}) {
349
+ const { audioDetached = false, throwIfNotReady = true } = options;
300
350
  if (!this.audioRecognition) {
301
- throw new Error("AudioRecognition is not initialized");
351
+ if (throwIfNotReady) {
352
+ throw new Error("AudioRecognition is not initialized");
353
+ }
354
+ return;
302
355
  }
303
- const audioDetached = false;
304
356
  this.audioRecognition.commitUserTurn(audioDetached);
305
357
  }
306
358
  clearUserTurn() {
@@ -336,13 +388,11 @@ class AgentActivity {
336
388
  })
337
389
  );
338
390
  const task = this.createSpeechTask({
339
- task: Task.from(
340
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
341
- ),
391
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
342
392
  ownedSpeechHandle: handle,
343
393
  name: "AgentActivity.say_tts"
344
394
  });
345
- task.finally(() => this.onPipelineReplyDone());
395
+ task.result.finally(() => this.onPipelineReplyDone());
346
396
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
347
397
  return handle;
348
398
  }
@@ -432,8 +482,8 @@ class AgentActivity {
432
482
  if (ev.userInitiated) {
433
483
  return;
434
484
  }
435
- if (this.draining) {
436
- this.logger.warn("skipping new realtime generation, the agent is draining");
485
+ if (this.schedulingPaused) {
486
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
437
487
  return;
438
488
  }
439
489
  const handle = SpeechHandle.create({
@@ -449,9 +499,7 @@ class AgentActivity {
449
499
  );
450
500
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
451
501
  this.createSpeechTask({
452
- task: Task.from(
453
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
454
- ),
502
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
455
503
  ownedSpeechHandle: handle,
456
504
  name: "AgentActivity.realtimeGeneration"
457
505
  });
@@ -538,7 +586,7 @@ class AgentActivity {
538
586
  }
539
587
  }
540
588
  onPreemptiveGeneration(info) {
541
- if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
589
+ if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
542
590
  return;
543
591
  }
544
592
  this.cancelPreemptiveGeneration();
@@ -576,7 +624,21 @@ class AgentActivity {
576
624
  }
577
625
  }
578
626
  createSpeechTask(options) {
579
- const { task, ownedSpeechHandle } = options;
627
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
628
+ const wrappedFn = (ctrl) => {
629
+ return agentActivityStorage.run(this, () => {
630
+ const currentTask = Task.current();
631
+ if (currentTask) {
632
+ _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
633
+ }
634
+ if (ownedSpeechHandle) {
635
+ return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
636
+ }
637
+ return taskFn(ctrl);
638
+ });
639
+ };
640
+ const task = Task.from(wrappedFn, controller, name);
641
+ _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
580
642
  this.speechTasks.add(task);
581
643
  task.addDoneCallback(() => {
582
644
  this.speechTasks.delete(task);
@@ -592,12 +654,15 @@ class AgentActivity {
592
654
  task.addDoneCallback(() => {
593
655
  this.wakeupMainTask();
594
656
  });
595
- return task.result;
657
+ return task;
596
658
  }
597
659
  async onEndOfTurn(info) {
598
- if (this.draining) {
660
+ if (this.schedulingPaused) {
599
661
  this.cancelPreemptiveGeneration();
600
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
662
+ this.logger.warn(
663
+ { user_input: info.newTranscript },
664
+ "skipping user input, speech scheduling is paused"
665
+ );
601
666
  return true;
602
667
  }
603
668
  if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
@@ -616,7 +681,7 @@ class AgentActivity {
616
681
  }
617
682
  const oldTask = this._userTurnCompletedTask;
618
683
  this._userTurnCompletedTask = this.createSpeechTask({
619
- task: Task.from(() => this.userTurnCompleted(info, oldTask)),
684
+ taskFn: () => this.userTurnCompleted(info, oldTask),
620
685
  name: "AgentActivity.userTurnCompleted"
621
686
  });
622
687
  return true;
@@ -646,14 +711,41 @@ class AgentActivity {
646
711
  await speechHandle._waitForGeneration();
647
712
  this._currentSpeech = void 0;
648
713
  }
649
- if (this.draining && this.speechTasks.size === 0) {
650
- this.logger.info("mainTask: draining and no more speech tasks");
714
+ const toWait = this.getDrainPendingSpeechTasks();
715
+ if (this._schedulingPaused && toWait.length === 0) {
716
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
651
717
  break;
652
718
  }
653
719
  this.q_updated = new Future();
654
720
  }
655
721
  this.logger.info("AgentActivity mainTask: exiting");
656
722
  }
723
+ getDrainPendingSpeechTasks() {
724
+ const blockedHandles = [];
725
+ for (const task of this._drainBlockedTasks) {
726
+ const info = _getActivityTaskInfo(task);
727
+ if (!info) {
728
+ this.logger.error("blocked task without activity info; skipping.");
729
+ continue;
730
+ }
731
+ if (!info.speechHandle) {
732
+ continue;
733
+ }
734
+ blockedHandles.push(info.speechHandle);
735
+ }
736
+ const toWait = [];
737
+ for (const task of this.speechTasks) {
738
+ if (this._drainBlockedTasks.includes(task)) {
739
+ continue;
740
+ }
741
+ const info = _getActivityTaskInfo(task);
742
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
743
+ continue;
744
+ }
745
+ toWait.push(task);
746
+ }
747
+ return toWait;
748
+ }
657
749
  wakeupMainTask() {
658
750
  this.q_updated.resolve();
659
751
  }
@@ -679,7 +771,7 @@ class AgentActivity {
679
771
  if (this.llm === void 0) {
680
772
  throw new Error("trying to generate reply without an LLM model");
681
773
  }
682
- const functionCall = (_a = asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
774
+ const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
683
775
  if (toolChoice === void 0 && functionCall !== void 0) {
684
776
  toolChoice = "none";
685
777
  }
@@ -697,19 +789,17 @@ class AgentActivity {
697
789
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
698
790
  if (this.llm instanceof RealtimeModel) {
699
791
  this.createSpeechTask({
700
- task: Task.from(
701
- (abortController) => this.realtimeReplyTask({
702
- speechHandle: handle,
703
- // TODO(brian): support llm.ChatMessage for the realtime model
704
- userInput: userMessage == null ? void 0 : userMessage.textContent,
705
- instructions,
706
- modelSettings: {
707
- // isGiven(toolChoice) = toolChoice !== undefined
708
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
709
- },
710
- abortController
711
- })
712
- ),
792
+ taskFn: (abortController) => this.realtimeReplyTask({
793
+ speechHandle: handle,
794
+ // TODO(brian): support llm.ChatMessage for the realtime model
795
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
796
+ instructions,
797
+ modelSettings: {
798
+ // isGiven(toolChoice) = toolChoice !== undefined
799
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
800
+ },
801
+ abortController
802
+ }),
713
803
  ownedSpeechHandle: handle,
714
804
  name: "AgentActivity.realtimeReply"
715
805
  });
@@ -719,36 +809,36 @@ class AgentActivity {
719
809
  ${instructions}`;
720
810
  }
721
811
  const task = this.createSpeechTask({
722
- task: Task.from(
723
- (abortController) => this.pipelineReplyTask(
724
- handle,
725
- chatCtx ?? this.agent.chatCtx,
726
- this.agent.toolCtx,
727
- {
728
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
729
- },
730
- abortController,
731
- instructions,
732
- userMessage
733
- )
812
+ taskFn: (abortController) => this.pipelineReplyTask(
813
+ handle,
814
+ chatCtx ?? this.agent.chatCtx,
815
+ this.agent.toolCtx,
816
+ {
817
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
818
+ },
819
+ abortController,
820
+ instructions,
821
+ userMessage
734
822
  ),
735
823
  ownedSpeechHandle: handle,
736
824
  name: "AgentActivity.pipelineReply"
737
825
  });
738
- task.finally(() => this.onPipelineReplyDone());
826
+ task.result.finally(() => this.onPipelineReplyDone());
739
827
  }
740
828
  if (scheduleSpeech) {
741
829
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
742
830
  }
743
831
  return handle;
744
832
  }
745
- interrupt() {
833
+ interrupt(options = {}) {
746
834
  var _a;
835
+ const { force = false } = options;
836
+ this.cancelPreemptiveGeneration();
747
837
  const future = new Future();
748
838
  const currentSpeech = this._currentSpeech;
749
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
839
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
750
840
  for (const [_, __, speech] of this.speechQueue) {
751
- speech.interrupt();
841
+ speech.interrupt(force);
752
842
  }
753
843
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
754
844
  if (currentSpeech === void 0) {
@@ -769,7 +859,7 @@ ${instructions}`;
769
859
  async userTurnCompleted(info, oldTask) {
770
860
  var _a, _b;
771
861
  if (oldTask) {
772
- await oldTask;
862
+ await oldTask.result;
773
863
  }
774
864
  if (this.llm instanceof RealtimeModel) {
775
865
  if (this.llm.capabilities.turnDetection) {
@@ -956,7 +1046,7 @@ ${instructions}`;
956
1046
  toolsMessages,
957
1047
  span
958
1048
  }) => {
959
- var _a, _b, _c, _d;
1049
+ var _a, _b;
960
1050
  speechHandle._agentTurnContext = otelContext.active();
961
1051
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
962
1052
  if (instructions) {
@@ -965,6 +1055,10 @@ ${instructions}`;
965
1055
  if (newMessage) {
966
1056
  span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
967
1057
  }
1058
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1059
+ if (localParticipant) {
1060
+ setParticipantSpanAttributes(span, localParticipant);
1061
+ }
968
1062
  speechHandleStorage.enterWith(speechHandle);
969
1063
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
970
1064
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
@@ -1024,7 +1118,7 @@ ${instructions}`;
1024
1118
  speechHandle._clearAuthorization();
1025
1119
  const replyStartedAt = Date.now();
1026
1120
  let transcriptionInput = llmOutput;
1027
- if (this.useTtsAlignedTranscript && ((_a = this.tts) == null ? void 0 : _a.capabilities.alignedTranscript) && ttsGenData) {
1121
+ if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
1028
1122
  const timedTextsStream = await Promise.race([
1029
1123
  ttsGenData.timedTextsFut.await,
1030
1124
  (ttsTask == null ? void 0 : ttsTask.result.catch(
@@ -1098,11 +1192,11 @@ ${instructions}`;
1098
1192
  for (const msg of toolsMessages) {
1099
1193
  msg.createdAt = replyStartedAt;
1100
1194
  }
1101
- this.agent._chatCtx.insert(toolsMessages);
1102
1195
  const toolCallOutputs = toolsMessages.filter(
1103
1196
  (m) => m.type === "function_call_output"
1104
1197
  );
1105
1198
  if (toolCallOutputs.length > 0) {
1199
+ this.agent._chatCtx.insert(toolCallOutputs);
1106
1200
  this.agentSession._toolItemsAdded(toolCallOutputs);
1107
1201
  }
1108
1202
  }
@@ -1190,45 +1284,15 @@ ${instructions}`;
1190
1284
  );
1191
1285
  return;
1192
1286
  }
1193
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1194
- functionCalls: [],
1195
- functionCallOutputs: []
1196
- });
1197
- let shouldGenerateToolReply = false;
1198
- let newAgentTask = null;
1199
- let ignoreTaskSwitch = false;
1200
- for (const sanitizedOut of toolOutput.output) {
1201
- if (sanitizedOut.toolCallOutput !== void 0) {
1202
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1203
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1204
- if (sanitizedOut.replyRequired) {
1205
- shouldGenerateToolReply = true;
1206
- }
1207
- }
1208
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1209
- this.logger.error("expected to receive only one agent task from the tool executions");
1210
- ignoreTaskSwitch = true;
1211
- }
1212
- newAgentTask = sanitizedOut.agentTask ?? null;
1213
- this.logger.debug(
1214
- {
1215
- speechId: speechHandle.id,
1216
- name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
1217
- args: sanitizedOut.toolCall.args,
1218
- output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
1219
- isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
1220
- },
1221
- "Tool call execution finished"
1222
- );
1223
- }
1287
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1224
1288
  this.agentSession.emit(
1225
1289
  AgentSessionEventTypes.FunctionToolsExecuted,
1226
1290
  functionToolsExecutedEvent
1227
1291
  );
1228
- let draining = this.draining;
1292
+ let schedulingPaused = this.schedulingPaused;
1229
1293
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1230
1294
  this.agentSession.updateAgent(newAgentTask);
1231
- draining = true;
1295
+ schedulingPaused = true;
1232
1296
  }
1233
1297
  const toolMessages = [
1234
1298
  ...functionToolsExecutedEvent.functionCalls,
@@ -1237,34 +1301,32 @@ ${instructions}`;
1237
1301
  if (shouldGenerateToolReply) {
1238
1302
  chatCtx.insert(toolMessages);
1239
1303
  speechHandle._numSteps += 1;
1240
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1304
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1241
1305
  const toolResponseTask = this.createSpeechTask({
1242
- task: Task.from(
1243
- () => this.pipelineReplyTask(
1244
- speechHandle,
1245
- chatCtx,
1246
- toolCtx,
1247
- { toolChoice: respondToolChoice },
1248
- replyAbortController,
1249
- instructions,
1250
- void 0,
1251
- toolMessages
1252
- )
1306
+ taskFn: () => this.pipelineReplyTask(
1307
+ speechHandle,
1308
+ chatCtx,
1309
+ toolCtx,
1310
+ { toolChoice: respondToolChoice },
1311
+ replyAbortController,
1312
+ instructions,
1313
+ void 0,
1314
+ toolMessages
1253
1315
  ),
1254
1316
  ownedSpeechHandle: speechHandle,
1255
1317
  name: "AgentActivity.pipelineReply"
1256
1318
  });
1257
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1319
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1258
1320
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1259
1321
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1260
1322
  for (const msg of toolMessages) {
1261
1323
  msg.createdAt = replyStartedAt;
1262
1324
  }
1263
- this.agent._chatCtx.insert(toolMessages);
1264
1325
  const toolCallOutputs = toolMessages.filter(
1265
1326
  (m) => m.type === "function_call_output"
1266
1327
  );
1267
1328
  if (toolCallOutputs.length > 0) {
1329
+ this.agent._chatCtx.insert(toolCallOutputs);
1268
1330
  this.agentSession._toolItemsAdded(toolCallOutputs);
1269
1331
  }
1270
1332
  }
@@ -1308,9 +1370,13 @@ ${instructions}`;
1308
1370
  replyAbortController,
1309
1371
  span
1310
1372
  }) {
1311
- var _a, _b, _c;
1373
+ var _a;
1312
1374
  speechHandle._agentTurnContext = otelContext.active();
1313
1375
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1376
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1377
+ if (localParticipant) {
1378
+ setParticipantSpanAttributes(span, localParticipant);
1379
+ }
1314
1380
  speechHandleStorage.enterWith(speechHandle);
1315
1381
  if (!this.realtimeSession) {
1316
1382
  throw new Error("realtime session is not initialized");
@@ -1564,44 +1630,15 @@ ${instructions}`;
1564
1630
  );
1565
1631
  return;
1566
1632
  }
1567
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1568
- functionCalls: [],
1569
- functionCallOutputs: []
1570
- });
1571
- let shouldGenerateToolReply = false;
1572
- let newAgentTask = null;
1573
- let ignoreTaskSwitch = false;
1574
- for (const sanitizedOut of toolOutput.output) {
1575
- if (sanitizedOut.toolCallOutput !== void 0) {
1576
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1577
- if (sanitizedOut.replyRequired) {
1578
- shouldGenerateToolReply = true;
1579
- }
1580
- }
1581
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1582
- this.logger.error("expected to receive only one agent task from the tool executions");
1583
- ignoreTaskSwitch = true;
1584
- }
1585
- newAgentTask = sanitizedOut.agentTask ?? null;
1586
- this.logger.debug(
1587
- {
1588
- speechId: speechHandle.id,
1589
- name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1590
- args: sanitizedOut.toolCall.args,
1591
- output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1592
- isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1593
- },
1594
- "Tool call execution finished"
1595
- );
1596
- }
1633
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1597
1634
  this.agentSession.emit(
1598
1635
  AgentSessionEventTypes.FunctionToolsExecuted,
1599
1636
  functionToolsExecutedEvent
1600
1637
  );
1601
- let draining = this.draining;
1638
+ let schedulingPaused = this.schedulingPaused;
1602
1639
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1603
1640
  this.agentSession.updateAgent(newAgentTask);
1604
- draining = true;
1641
+ schedulingPaused = true;
1605
1642
  }
1606
1643
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1607
1644
  while (this.currentSpeech || this.speechQueue.size() > 0) {
@@ -1642,20 +1679,58 @@ ${instructions}`;
1642
1679
  speechHandle: replySpeechHandle
1643
1680
  })
1644
1681
  );
1645
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1682
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1646
1683
  this.createSpeechTask({
1647
- task: Task.from(
1648
- (abortController) => this.realtimeReplyTask({
1649
- speechHandle: replySpeechHandle,
1650
- modelSettings: { toolChoice },
1651
- abortController
1652
- })
1653
- ),
1684
+ taskFn: (abortController) => this.realtimeReplyTask({
1685
+ speechHandle: replySpeechHandle,
1686
+ modelSettings: { toolChoice },
1687
+ abortController
1688
+ }),
1654
1689
  ownedSpeechHandle: replySpeechHandle,
1655
1690
  name: "AgentActivity.realtime_reply"
1656
1691
  });
1657
1692
  this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1658
1693
  }
1694
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1695
+ var _a, _b, _c;
1696
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1697
+ functionCalls: [],
1698
+ functionCallOutputs: []
1699
+ });
1700
+ let shouldGenerateToolReply = false;
1701
+ let newAgentTask = null;
1702
+ let ignoreTaskSwitch = false;
1703
+ for (const sanitizedOut of toolOutput.output) {
1704
+ if (sanitizedOut.toolCallOutput !== void 0) {
1705
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1706
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1707
+ if (sanitizedOut.replyRequired) {
1708
+ shouldGenerateToolReply = true;
1709
+ }
1710
+ }
1711
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1712
+ this.logger.error("expected to receive only one agent task from the tool executions");
1713
+ ignoreTaskSwitch = true;
1714
+ }
1715
+ newAgentTask = sanitizedOut.agentTask ?? null;
1716
+ this.logger.debug(
1717
+ {
1718
+ speechId: speechHandle.id,
1719
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1720
+ args: sanitizedOut.toolCall.args,
1721
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1722
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1723
+ },
1724
+ "Tool call execution finished"
1725
+ );
1726
+ }
1727
+ return {
1728
+ functionToolsExecutedEvent,
1729
+ shouldGenerateToolReply,
1730
+ newAgentTask,
1731
+ ignoreTaskSwitch
1732
+ };
1733
+ }
1659
1734
  async realtimeReplyTask({
1660
1735
  speechHandle,
1661
1736
  modelSettings: { toolChoice },
@@ -1697,13 +1772,45 @@ ${instructions}`;
1697
1772
  }
1698
1773
  }
1699
1774
  scheduleSpeech(speechHandle, priority, force = false) {
1700
- if (this.draining && !force) {
1701
- throw new Error("cannot schedule new speech, the agent is draining");
1775
+ if (this.schedulingPaused && !force) {
1776
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1702
1777
  }
1703
1778
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1704
1779
  speechHandle._markScheduled();
1705
1780
  this.wakeupMainTask();
1706
1781
  }
1782
+ async _pauseSchedulingTask(blockedTasks) {
1783
+ if (this._schedulingPaused) return;
1784
+ this._schedulingPaused = true;
1785
+ this._drainBlockedTasks = blockedTasks;
1786
+ this.wakeupMainTask();
1787
+ if (this._mainTask) {
1788
+ await this._mainTask.result;
1789
+ }
1790
+ }
1791
+ _resumeSchedulingTask() {
1792
+ if (!this._schedulingPaused) return;
1793
+ this._schedulingPaused = false;
1794
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
1795
+ }
1796
+ async pause(options = {}) {
1797
+ const { blockedTasks = [] } = options;
1798
+ const unlock = await this.lock.lock();
1799
+ try {
1800
+ const span = tracer.startSpan({
1801
+ name: "pause_agent_activity",
1802
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1803
+ });
1804
+ try {
1805
+ await this._pauseSchedulingTask(blockedTasks);
1806
+ await this._closeSessionResources();
1807
+ } finally {
1808
+ span.end();
1809
+ }
1810
+ } finally {
1811
+ unlock();
1812
+ }
1813
+ }
1707
1814
  async drain() {
1708
1815
  return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
1709
1816
  name: "drain_agent_activity",
@@ -1711,71 +1818,79 @@ ${instructions}`;
1711
1818
  });
1712
1819
  }
1713
1820
  async _drainImpl(span) {
1714
- var _a;
1715
1821
  span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1716
1822
  const unlock = await this.lock.lock();
1717
1823
  try {
1718
- if (this._draining) return;
1719
- this.cancelPreemptiveGeneration();
1720
- const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
1721
- name: "on_exit",
1722
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1723
- });
1724
- this.createSpeechTask({
1725
- task: Task.from(() => onExitTask),
1824
+ if (this._schedulingPaused) return;
1825
+ this._onExitTask = this.createSpeechTask({
1826
+ taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
1827
+ name: "on_exit",
1828
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1829
+ }),
1830
+ inlineTask: true,
1726
1831
  name: "AgentActivity_onExit"
1727
1832
  });
1728
- this.wakeupMainTask();
1729
- this._draining = true;
1730
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
1833
+ this.cancelPreemptiveGeneration();
1834
+ await this._onExitTask.result;
1835
+ await this._pauseSchedulingTask([]);
1731
1836
  } finally {
1732
1837
  unlock();
1733
1838
  }
1734
1839
  }
1735
1840
  async close() {
1736
- var _a, _b, _c, _d;
1737
1841
  const unlock = await this.lock.lock();
1738
1842
  try {
1739
- if (!this._draining) {
1740
- this.logger.warn("task closing without draining");
1741
- }
1742
1843
  this.cancelPreemptiveGeneration();
1743
- if (this.llm instanceof LLM) {
1744
- this.llm.off("metrics_collected", this.onMetricsCollected);
1745
- }
1746
- if (this.realtimeSession) {
1747
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1748
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1749
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1750
- this.realtimeSession.off(
1751
- "input_audio_transcription_completed",
1752
- this.onInputAudioTranscriptionCompleted
1753
- );
1754
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1755
- }
1756
- if (this.stt instanceof STT) {
1757
- this.stt.off("metrics_collected", this.onMetricsCollected);
1758
- }
1759
- if (this.tts instanceof TTS) {
1760
- this.tts.off("metrics_collected", this.onMetricsCollected);
1844
+ await this._closeSessionResources();
1845
+ if (this._mainTask) {
1846
+ await this._mainTask.cancelAndWait();
1761
1847
  }
1762
- if (this.vad instanceof VAD) {
1763
- this.vad.off("metrics_collected", this.onMetricsCollected);
1764
- }
1765
- this.detachAudioInput();
1766
- (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1767
- await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1768
- await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1769
- await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
1848
+ this.agent._agentActivity = void 0;
1770
1849
  } finally {
1771
1850
  unlock();
1772
1851
  }
1773
1852
  }
1853
+ async _closeSessionResources() {
1854
+ var _a, _b, _c;
1855
+ if (this.llm instanceof LLM) {
1856
+ this.llm.off("metrics_collected", this.onMetricsCollected);
1857
+ this.llm.off("error", this.onModelError);
1858
+ }
1859
+ if (this.realtimeSession) {
1860
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
1861
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
1862
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
1863
+ this.realtimeSession.off(
1864
+ "input_audio_transcription_completed",
1865
+ this.onRealtimeInputAudioTranscriptionCompleted
1866
+ );
1867
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1868
+ this.realtimeSession.off("error", this.onModelError);
1869
+ }
1870
+ if (this.stt instanceof STT) {
1871
+ this.stt.off("metrics_collected", this.onMetricsCollected);
1872
+ this.stt.off("error", this.onModelError);
1873
+ }
1874
+ if (this.tts instanceof TTS) {
1875
+ this.tts.off("metrics_collected", this.onMetricsCollected);
1876
+ this.tts.off("error", this.onModelError);
1877
+ }
1878
+ if (this.vad instanceof VAD) {
1879
+ this.vad.off("metrics_collected", this.onMetricsCollected);
1880
+ }
1881
+ this.detachAudioInput();
1882
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1883
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1884
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1885
+ this.realtimeSession = void 0;
1886
+ this.audioRecognition = void 0;
1887
+ }
1774
1888
  }
1775
1889
  function toOaiToolChoice(toolChoice) {
1776
1890
  return toolChoice !== null ? toolChoice : void 0;
1777
1891
  }
1778
1892
  export {
1779
- AgentActivity
1893
+ AgentActivity,
1894
+ agentActivityStorage
1780
1895
  };
1781
1896
  //# sourceMappingURL=agent_activity.js.map