@livekit/agents 1.0.46 → 1.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/cli.cjs +14 -20
  23. package/dist/cli.cjs.map +1 -1
  24. package/dist/cli.d.ts.map +1 -1
  25. package/dist/cli.js +14 -20
  26. package/dist/cli.js.map +1 -1
  27. package/dist/index.cjs +3 -0
  28. package/dist/index.cjs.map +1 -1
  29. package/dist/index.d.cts +2 -1
  30. package/dist/index.d.ts +2 -1
  31. package/dist/index.d.ts.map +1 -1
  32. package/dist/index.js +2 -0
  33. package/dist/index.js.map +1 -1
  34. package/dist/inference/api_protos.d.cts +59 -59
  35. package/dist/inference/api_protos.d.ts +59 -59
  36. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  37. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  38. package/dist/ipc/job_proc_lazy_main.js +14 -5
  39. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  40. package/dist/llm/chat_context.cjs +108 -1
  41. package/dist/llm/chat_context.cjs.map +1 -1
  42. package/dist/llm/chat_context.d.cts +14 -1
  43. package/dist/llm/chat_context.d.ts +14 -1
  44. package/dist/llm/chat_context.d.ts.map +1 -1
  45. package/dist/llm/chat_context.js +108 -1
  46. package/dist/llm/chat_context.js.map +1 -1
  47. package/dist/llm/chat_context.test.cjs +43 -0
  48. package/dist/llm/chat_context.test.cjs.map +1 -1
  49. package/dist/llm/chat_context.test.js +43 -0
  50. package/dist/llm/chat_context.test.js.map +1 -1
  51. package/dist/llm/index.cjs +2 -0
  52. package/dist/llm/index.cjs.map +1 -1
  53. package/dist/llm/index.d.cts +1 -1
  54. package/dist/llm/index.d.ts +1 -1
  55. package/dist/llm/index.d.ts.map +1 -1
  56. package/dist/llm/index.js +3 -1
  57. package/dist/llm/index.js.map +1 -1
  58. package/dist/llm/provider_format/index.cjs +2 -0
  59. package/dist/llm/provider_format/index.cjs.map +1 -1
  60. package/dist/llm/provider_format/index.d.cts +2 -2
  61. package/dist/llm/provider_format/index.d.ts +2 -2
  62. package/dist/llm/provider_format/index.d.ts.map +1 -1
  63. package/dist/llm/provider_format/index.js +6 -1
  64. package/dist/llm/provider_format/index.js.map +1 -1
  65. package/dist/llm/provider_format/openai.cjs +82 -2
  66. package/dist/llm/provider_format/openai.cjs.map +1 -1
  67. package/dist/llm/provider_format/openai.d.cts +1 -0
  68. package/dist/llm/provider_format/openai.d.ts +1 -0
  69. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  70. package/dist/llm/provider_format/openai.js +80 -1
  71. package/dist/llm/provider_format/openai.js.map +1 -1
  72. package/dist/llm/provider_format/openai.test.cjs +326 -0
  73. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  74. package/dist/llm/provider_format/openai.test.js +327 -1
  75. package/dist/llm/provider_format/openai.test.js.map +1 -1
  76. package/dist/llm/provider_format/utils.cjs +4 -3
  77. package/dist/llm/provider_format/utils.cjs.map +1 -1
  78. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  79. package/dist/llm/provider_format/utils.js +4 -3
  80. package/dist/llm/provider_format/utils.js.map +1 -1
  81. package/dist/llm/realtime.cjs.map +1 -1
  82. package/dist/llm/realtime.d.cts +1 -0
  83. package/dist/llm/realtime.d.ts +1 -0
  84. package/dist/llm/realtime.d.ts.map +1 -1
  85. package/dist/llm/realtime.js.map +1 -1
  86. package/dist/llm/tool_context.cjs +7 -0
  87. package/dist/llm/tool_context.cjs.map +1 -1
  88. package/dist/llm/tool_context.d.cts +10 -2
  89. package/dist/llm/tool_context.d.ts +10 -2
  90. package/dist/llm/tool_context.d.ts.map +1 -1
  91. package/dist/llm/tool_context.js +6 -0
  92. package/dist/llm/tool_context.js.map +1 -1
  93. package/dist/log.cjs +5 -2
  94. package/dist/log.cjs.map +1 -1
  95. package/dist/log.d.ts.map +1 -1
  96. package/dist/log.js +5 -2
  97. package/dist/log.js.map +1 -1
  98. package/dist/stream/deferred_stream.cjs +15 -6
  99. package/dist/stream/deferred_stream.cjs.map +1 -1
  100. package/dist/stream/deferred_stream.d.ts.map +1 -1
  101. package/dist/stream/deferred_stream.js +15 -6
  102. package/dist/stream/deferred_stream.js.map +1 -1
  103. package/dist/utils.cjs +32 -2
  104. package/dist/utils.cjs.map +1 -1
  105. package/dist/utils.d.cts +7 -0
  106. package/dist/utils.d.ts +7 -0
  107. package/dist/utils.d.ts.map +1 -1
  108. package/dist/utils.js +32 -2
  109. package/dist/utils.js.map +1 -1
  110. package/dist/utils.test.cjs +71 -0
  111. package/dist/utils.test.cjs.map +1 -1
  112. package/dist/utils.test.js +71 -0
  113. package/dist/utils.test.js.map +1 -1
  114. package/dist/version.cjs +1 -1
  115. package/dist/version.cjs.map +1 -1
  116. package/dist/version.d.cts +1 -1
  117. package/dist/version.d.ts +1 -1
  118. package/dist/version.d.ts.map +1 -1
  119. package/dist/version.js +1 -1
  120. package/dist/version.js.map +1 -1
  121. package/dist/voice/agent.cjs +153 -12
  122. package/dist/voice/agent.cjs.map +1 -1
  123. package/dist/voice/agent.d.cts +30 -4
  124. package/dist/voice/agent.d.ts +30 -4
  125. package/dist/voice/agent.d.ts.map +1 -1
  126. package/dist/voice/agent.js +149 -11
  127. package/dist/voice/agent.js.map +1 -1
  128. package/dist/voice/agent.test.cjs +120 -0
  129. package/dist/voice/agent.test.cjs.map +1 -1
  130. package/dist/voice/agent.test.js +122 -2
  131. package/dist/voice/agent.test.js.map +1 -1
  132. package/dist/voice/agent_activity.cjs +406 -298
  133. package/dist/voice/agent_activity.cjs.map +1 -1
  134. package/dist/voice/agent_activity.d.cts +41 -7
  135. package/dist/voice/agent_activity.d.ts +41 -7
  136. package/dist/voice/agent_activity.d.ts.map +1 -1
  137. package/dist/voice/agent_activity.js +407 -294
  138. package/dist/voice/agent_activity.js.map +1 -1
  139. package/dist/voice/agent_session.cjs +140 -40
  140. package/dist/voice/agent_session.cjs.map +1 -1
  141. package/dist/voice/agent_session.d.cts +19 -7
  142. package/dist/voice/agent_session.d.ts +19 -7
  143. package/dist/voice/agent_session.d.ts.map +1 -1
  144. package/dist/voice/agent_session.js +137 -37
  145. package/dist/voice/agent_session.js.map +1 -1
  146. package/dist/voice/audio_recognition.cjs +4 -0
  147. package/dist/voice/audio_recognition.cjs.map +1 -1
  148. package/dist/voice/audio_recognition.d.ts.map +1 -1
  149. package/dist/voice/audio_recognition.js +4 -0
  150. package/dist/voice/audio_recognition.js.map +1 -1
  151. package/dist/voice/generation.cjs +39 -19
  152. package/dist/voice/generation.cjs.map +1 -1
  153. package/dist/voice/generation.d.ts.map +1 -1
  154. package/dist/voice/generation.js +44 -20
  155. package/dist/voice/generation.js.map +1 -1
  156. package/dist/voice/index.cjs +2 -0
  157. package/dist/voice/index.cjs.map +1 -1
  158. package/dist/voice/index.d.cts +1 -1
  159. package/dist/voice/index.d.ts +1 -1
  160. package/dist/voice/index.d.ts.map +1 -1
  161. package/dist/voice/index.js +2 -1
  162. package/dist/voice/index.js.map +1 -1
  163. package/dist/voice/room_io/room_io.cjs +11 -2
  164. package/dist/voice/room_io/room_io.cjs.map +1 -1
  165. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  166. package/dist/voice/room_io/room_io.js +12 -3
  167. package/dist/voice/room_io/room_io.js.map +1 -1
  168. package/dist/voice/speech_handle.cjs +7 -1
  169. package/dist/voice/speech_handle.cjs.map +1 -1
  170. package/dist/voice/speech_handle.d.cts +2 -0
  171. package/dist/voice/speech_handle.d.ts +2 -0
  172. package/dist/voice/speech_handle.d.ts.map +1 -1
  173. package/dist/voice/speech_handle.js +8 -2
  174. package/dist/voice/speech_handle.js.map +1 -1
  175. package/dist/voice/testing/fake_llm.cjs +127 -0
  176. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  177. package/dist/voice/testing/fake_llm.d.cts +30 -0
  178. package/dist/voice/testing/fake_llm.d.ts +30 -0
  179. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  180. package/dist/voice/testing/fake_llm.js +103 -0
  181. package/dist/voice/testing/fake_llm.js.map +1 -0
  182. package/dist/voice/testing/index.cjs +3 -0
  183. package/dist/voice/testing/index.cjs.map +1 -1
  184. package/dist/voice/testing/index.d.cts +1 -0
  185. package/dist/voice/testing/index.d.ts +1 -0
  186. package/dist/voice/testing/index.d.ts.map +1 -1
  187. package/dist/voice/testing/index.js +2 -0
  188. package/dist/voice/testing/index.js.map +1 -1
  189. package/dist/voice/testing/run_result.cjs +66 -15
  190. package/dist/voice/testing/run_result.cjs.map +1 -1
  191. package/dist/voice/testing/run_result.d.cts +14 -3
  192. package/dist/voice/testing/run_result.d.ts +14 -3
  193. package/dist/voice/testing/run_result.d.ts.map +1 -1
  194. package/dist/voice/testing/run_result.js +66 -15
  195. package/dist/voice/testing/run_result.js.map +1 -1
  196. package/package.json +1 -1
  197. package/src/beta/index.ts +9 -0
  198. package/src/beta/workflows/index.ts +9 -0
  199. package/src/beta/workflows/task_group.ts +194 -0
  200. package/src/cli.ts +20 -33
  201. package/src/index.ts +2 -1
  202. package/src/ipc/job_proc_lazy_main.ts +16 -5
  203. package/src/llm/chat_context.test.ts +48 -0
  204. package/src/llm/chat_context.ts +158 -0
  205. package/src/llm/index.ts +1 -0
  206. package/src/llm/provider_format/index.ts +7 -2
  207. package/src/llm/provider_format/openai.test.ts +385 -1
  208. package/src/llm/provider_format/openai.ts +103 -0
  209. package/src/llm/provider_format/utils.ts +6 -4
  210. package/src/llm/realtime.ts +1 -0
  211. package/src/llm/tool_context.ts +14 -0
  212. package/src/log.ts +5 -2
  213. package/src/stream/deferred_stream.ts +17 -6
  214. package/src/utils.test.ts +87 -0
  215. package/src/utils.ts +41 -2
  216. package/src/version.ts +1 -1
  217. package/src/voice/agent.test.ts +140 -2
  218. package/src/voice/agent.ts +200 -10
  219. package/src/voice/agent_activity.ts +466 -290
  220. package/src/voice/agent_session.ts +178 -40
  221. package/src/voice/audio_recognition.ts +4 -0
  222. package/src/voice/generation.ts +52 -23
  223. package/src/voice/index.ts +1 -1
  224. package/src/voice/room_io/room_io.ts +14 -3
  225. package/src/voice/speech_handle.ts +9 -2
  226. package/src/voice/testing/fake_llm.ts +138 -0
  227. package/src/voice/testing/index.ts +2 -0
  228. package/src/voice/testing/run_result.ts +81 -23
@@ -6,18 +6,25 @@ import { ReadableStream } from "node:stream/web";
6
6
  import { ChatMessage } from "../llm/chat_context.js";
7
7
  import {
8
8
  LLM,
9
- RealtimeModel
9
+ RealtimeModel,
10
+ ToolFlag
10
11
  } from "../llm/index.js";
11
12
  import { isSameToolChoice, isSameToolContext } from "../llm/tool_context.js";
12
13
  import { log } from "../log.js";
13
- import { DeferredReadableStream } from "../stream/deferred_stream.js";
14
+ import { MultiInputStream } from "../stream/multi_input_stream.js";
14
15
  import { STT } from "../stt/stt.js";
15
16
  import { recordRealtimeMetrics, traceTypes, tracer } from "../telemetry/index.js";
16
17
  import { splitWords } from "../tokenize/basic/word.js";
17
18
  import { TTS } from "../tts/tts.js";
18
19
  import { Future, Task, cancelAndWait, waitFor } from "../utils.js";
19
20
  import { VAD } from "../vad.js";
20
- import { StopResponse, asyncLocalStorage } from "./agent.js";
21
+ import {
22
+ StopResponse,
23
+ _getActivityTaskInfo,
24
+ _setActivityTaskInfo,
25
+ functionCallStorage,
26
+ speechHandleStorage
27
+ } from "./agent.js";
21
28
  import {} from "./agent_session.js";
22
29
  import {
23
30
  AudioRecognition
@@ -41,8 +48,11 @@ import {
41
48
  } from "./generation.js";
42
49
  import { SpeechHandle } from "./speech_handle.js";
43
50
  import { setParticipantSpanAttributes } from "./utils.js";
44
- const speechHandleStorage = new AsyncLocalStorage();
51
+ const agentActivityStorage = new AsyncLocalStorage();
52
+ const onEnterStorage = new AsyncLocalStorage();
45
53
  class AgentActivity {
54
+ agent;
55
+ agentSession;
46
56
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
47
57
  started = false;
48
58
  audioRecognition;
@@ -51,22 +61,29 @@ class AgentActivity {
51
61
  // Maps response_id to OTEL span for metrics recording
52
62
  turnDetectionMode;
53
63
  logger = log();
54
- _draining = false;
64
+ _schedulingPaused = true;
65
+ _drainBlockedTasks = [];
55
66
  _currentSpeech;
56
67
  speechQueue;
57
68
  // [priority, timestamp, speechHandle]
58
69
  q_updated;
59
70
  speechTasks = /* @__PURE__ */ new Set();
60
71
  lock = new Mutex();
61
- audioStream = new DeferredReadableStream();
72
+ audioStream = new MultiInputStream();
73
+ audioStreamId;
62
74
  // default to null as None, which maps to the default provider tool choice value
63
75
  toolChoice = null;
64
76
  _preemptiveGeneration;
65
- agent;
66
- agentSession;
67
77
  /** @internal */
68
78
  _mainTask;
79
+ _onEnterTask;
80
+ _onExitTask;
69
81
  _userTurnCompletedTask;
82
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
83
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
84
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
85
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
86
+ onModelError = (ev) => this.onError(ev);
70
87
  constructor(agent, agentSession) {
71
88
  this.agent = agent;
72
89
  this.agentSession = agentSession;
@@ -77,7 +94,7 @@ class AgentActivity {
77
94
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
78
95
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
79
96
  this.logger.warn(
80
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
97
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
81
98
  );
82
99
  this.turnDetectionMode = void 0;
83
100
  }
@@ -127,107 +144,124 @@ class AgentActivity {
127
144
  }
128
145
  }
129
146
  async start() {
130
- var _a;
131
147
  const unlock = await this.lock.lock();
132
148
  try {
133
- const startSpan = tracer.startSpan({
134
- name: "start_agent_activity",
135
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
136
- context: ROOT_CONTEXT
137
- });
138
- this.agent._agentActivity = this;
139
- if (this.llm instanceof RealtimeModel) {
140
- this.realtimeSession = this.llm.session();
141
- this.realtimeSpans = /* @__PURE__ */ new Map();
142
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
143
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
144
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
145
- this.realtimeSession.on(
146
- "input_audio_transcription_completed",
147
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
148
- );
149
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
150
- this.realtimeSession.on("error", (ev) => this.onError(ev));
151
- removeInstructions(this.agent._chatCtx);
152
- try {
153
- await this.realtimeSession.updateInstructions(this.agent.instructions);
154
- } catch (error) {
155
- this.logger.error(error, "failed to update the instructions");
156
- }
157
- try {
158
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
159
- } catch (error) {
160
- this.logger.error(error, "failed to update the chat context");
161
- }
162
- try {
163
- await this.realtimeSession.updateTools(this.tools);
164
- } catch (error) {
165
- this.logger.error(error, "failed to update the tools");
166
- }
167
- if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
168
- this.logger.error(
169
- "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
170
- );
171
- }
172
- } else if (this.llm instanceof LLM) {
173
- try {
174
- updateInstructions({
175
- chatCtx: this.agent._chatCtx,
176
- instructions: this.agent.instructions,
177
- addIfMissing: true
178
- });
179
- } catch (error) {
180
- this.logger.error("failed to update the instructions", error);
181
- }
149
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
150
+ } finally {
151
+ unlock();
152
+ }
153
+ }
154
+ async resume() {
155
+ const unlock = await this.lock.lock();
156
+ try {
157
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
158
+ } finally {
159
+ unlock();
160
+ }
161
+ }
162
+ async _startSession(options) {
163
+ var _a;
164
+ const { spanName, runOnEnter } = options;
165
+ const startSpan = tracer.startSpan({
166
+ name: spanName,
167
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
168
+ context: ROOT_CONTEXT
169
+ });
170
+ this.agent._agentActivity = this;
171
+ if (this.llm instanceof RealtimeModel) {
172
+ this.realtimeSession = this.llm.session();
173
+ this.realtimeSpans = /* @__PURE__ */ new Map();
174
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
175
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
176
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
177
+ this.realtimeSession.on(
178
+ "input_audio_transcription_completed",
179
+ this.onRealtimeInputAudioTranscriptionCompleted
180
+ );
181
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
182
+ this.realtimeSession.on("error", this.onModelError);
183
+ removeInstructions(this.agent._chatCtx);
184
+ try {
185
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
186
+ } catch (error) {
187
+ this.logger.error(error, "failed to update the instructions");
182
188
  }
183
- if (this.llm instanceof LLM) {
184
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
185
- this.llm.on("error", (ev) => this.onError(ev));
189
+ try {
190
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
191
+ } catch (error) {
192
+ this.logger.error(error, "failed to update the chat context");
186
193
  }
187
- if (this.stt instanceof STT) {
188
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
189
- this.stt.on("error", (ev) => this.onError(ev));
194
+ try {
195
+ await this.realtimeSession.updateTools(this.tools);
196
+ } catch (error) {
197
+ this.logger.error(error, "failed to update the tools");
198
+ }
199
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
200
+ this.logger.error(
201
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
202
+ );
190
203
  }
191
- if (this.tts instanceof TTS) {
192
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
193
- this.tts.on("error", (ev) => this.onError(ev));
204
+ } else if (this.llm instanceof LLM) {
205
+ try {
206
+ updateInstructions({
207
+ chatCtx: this.agent._chatCtx,
208
+ instructions: this.agent.instructions,
209
+ addIfMissing: true
210
+ });
211
+ } catch (error) {
212
+ this.logger.error("failed to update the instructions", error);
194
213
  }
195
- if (this.vad instanceof VAD) {
196
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
214
+ }
215
+ if (this.llm instanceof LLM) {
216
+ this.llm.on("metrics_collected", this.onMetricsCollected);
217
+ this.llm.on("error", this.onModelError);
218
+ }
219
+ if (this.stt instanceof STT) {
220
+ this.stt.on("metrics_collected", this.onMetricsCollected);
221
+ this.stt.on("error", this.onModelError);
222
+ }
223
+ if (this.tts instanceof TTS) {
224
+ this.tts.on("metrics_collected", this.onMetricsCollected);
225
+ this.tts.on("error", this.onModelError);
226
+ }
227
+ if (this.vad instanceof VAD) {
228
+ this.vad.on("metrics_collected", this.onMetricsCollected);
229
+ }
230
+ this.audioRecognition = new AudioRecognition({
231
+ recognitionHooks: this,
232
+ // Disable stt node if stt is not provided
233
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
234
+ vad: this.vad,
235
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
236
+ turnDetectionMode: this.turnDetectionMode,
237
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
238
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
239
+ rootSpanContext: this.agentSession.rootSpanContext,
240
+ sttModel: (_a = this.stt) == null ? void 0 : _a.label,
241
+ sttProvider: this.getSttProvider(),
242
+ getLinkedParticipant: () => {
243
+ var _a2;
244
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
197
245
  }
198
- this.audioRecognition = new AudioRecognition({
199
- recognitionHooks: this,
200
- // Disable stt node if stt is not provided
201
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
202
- vad: this.vad,
203
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
204
- turnDetectionMode: this.turnDetectionMode,
205
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
206
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
207
- rootSpanContext: this.agentSession.rootSpanContext,
208
- sttModel: (_a = this.stt) == null ? void 0 : _a.label,
209
- sttProvider: this.getSttProvider(),
210
- getLinkedParticipant: () => {
211
- var _a2;
212
- return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
213
- }
214
- });
215
- this.audioRecognition.start();
216
- this.started = true;
217
- this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
218
- const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), {
219
- name: "on_enter",
220
- context: trace.setSpan(ROOT_CONTEXT, startSpan),
221
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
222
- });
223
- this.createSpeechTask({
224
- task: Task.from(() => onEnterTask),
246
+ });
247
+ this.audioRecognition.start();
248
+ this.started = true;
249
+ this._resumeSchedulingTask();
250
+ if (runOnEnter) {
251
+ this._onEnterTask = this.createSpeechTask({
252
+ taskFn: () => onEnterStorage.run(
253
+ { session: this.agentSession, agent: this.agent },
254
+ () => tracer.startActiveSpan(async () => this.agent.onEnter(), {
255
+ name: "on_enter",
256
+ context: trace.setSpan(ROOT_CONTEXT, startSpan),
257
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
258
+ })
259
+ ),
260
+ inlineTask: true,
225
261
  name: "AgentActivity_onEnter"
226
262
  });
227
- startSpan.end();
228
- } finally {
229
- unlock();
230
263
  }
264
+ startSpan.end();
231
265
  }
232
266
  get currentSpeech() {
233
267
  return this._currentSpeech;
@@ -256,8 +290,8 @@ class AgentActivity {
256
290
  get tools() {
257
291
  return this.agent.toolCtx;
258
292
  }
259
- get draining() {
260
- return this._draining;
293
+ get schedulingPaused() {
294
+ return this._schedulingPaused;
261
295
  }
262
296
  get realtimeLLMSession() {
263
297
  return this.realtimeSession;
@@ -288,6 +322,16 @@ class AgentActivity {
288
322
  });
289
323
  }
290
324
  }
325
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
326
+ async updateTools(tools) {
327
+ this.agent._tools = { ...tools };
328
+ if (this.realtimeSession) {
329
+ await this.realtimeSession.updateTools(tools);
330
+ }
331
+ if (this.llm instanceof LLM) {
332
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
333
+ }
334
+ }
291
335
  updateOptions({ toolChoice }) {
292
336
  if (toolChoice !== void 0) {
293
337
  this.toolChoice = toolChoice;
@@ -297,11 +341,9 @@ class AgentActivity {
297
341
  }
298
342
  }
299
343
  attachAudioInput(audioStream) {
300
- if (this.audioStream.isSourceSet) {
301
- this.logger.debug("detaching existing audio input in agent activity");
302
- this.audioStream.detachSource();
303
- }
304
- this.audioStream.setSource(audioStream);
344
+ void this.audioStream.close();
345
+ this.audioStream = new MultiInputStream();
346
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
305
347
  const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
306
348
  if (this.realtimeSession) {
307
349
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -311,13 +353,21 @@ class AgentActivity {
311
353
  }
312
354
  }
313
355
  detachAudioInput() {
314
- this.audioStream.detachSource();
356
+ if (this.audioStreamId === void 0) {
357
+ return;
358
+ }
359
+ void this.audioStream.close();
360
+ this.audioStream = new MultiInputStream();
361
+ this.audioStreamId = void 0;
315
362
  }
316
- commitUserTurn() {
363
+ commitUserTurn(options = {}) {
364
+ const { audioDetached = false, throwIfNotReady = true } = options;
317
365
  if (!this.audioRecognition) {
318
- throw new Error("AudioRecognition is not initialized");
366
+ if (throwIfNotReady) {
367
+ throw new Error("AudioRecognition is not initialized");
368
+ }
369
+ return;
319
370
  }
320
- const audioDetached = false;
321
371
  this.audioRecognition.commitUserTurn(audioDetached);
322
372
  }
323
373
  clearUserTurn() {
@@ -353,13 +403,11 @@ class AgentActivity {
353
403
  })
354
404
  );
355
405
  const task = this.createSpeechTask({
356
- task: Task.from(
357
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
358
- ),
406
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
359
407
  ownedSpeechHandle: handle,
360
408
  name: "AgentActivity.say_tts"
361
409
  });
362
- task.finally(() => this.onPipelineReplyDone());
410
+ task.result.finally(() => this.onPipelineReplyDone());
363
411
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
364
412
  return handle;
365
413
  }
@@ -449,8 +497,8 @@ class AgentActivity {
449
497
  if (ev.userInitiated) {
450
498
  return;
451
499
  }
452
- if (this.draining) {
453
- this.logger.warn("skipping new realtime generation, the agent is draining");
500
+ if (this.schedulingPaused) {
501
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
454
502
  return;
455
503
  }
456
504
  const handle = SpeechHandle.create({
@@ -466,9 +514,7 @@ class AgentActivity {
466
514
  );
467
515
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
468
516
  this.createSpeechTask({
469
- task: Task.from(
470
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
471
- ),
517
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
472
518
  ownedSpeechHandle: handle,
473
519
  name: "AgentActivity.realtimeGeneration"
474
520
  });
@@ -555,7 +601,7 @@ class AgentActivity {
555
601
  }
556
602
  }
557
603
  onPreemptiveGeneration(info) {
558
- if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
604
+ if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
559
605
  return;
560
606
  }
561
607
  this.cancelPreemptiveGeneration();
@@ -593,7 +639,21 @@ class AgentActivity {
593
639
  }
594
640
  }
595
641
  createSpeechTask(options) {
596
- const { task, ownedSpeechHandle } = options;
642
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
643
+ const wrappedFn = (ctrl) => {
644
+ return agentActivityStorage.run(this, () => {
645
+ const currentTask = Task.current();
646
+ if (currentTask) {
647
+ _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
648
+ }
649
+ if (ownedSpeechHandle) {
650
+ return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
651
+ }
652
+ return taskFn(ctrl);
653
+ });
654
+ };
655
+ const task = Task.from(wrappedFn, controller, name);
656
+ _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask });
597
657
  this.speechTasks.add(task);
598
658
  task.addDoneCallback(() => {
599
659
  this.speechTasks.delete(task);
@@ -609,12 +669,15 @@ class AgentActivity {
609
669
  task.addDoneCallback(() => {
610
670
  this.wakeupMainTask();
611
671
  });
612
- return task.result;
672
+ return task;
613
673
  }
614
674
  async onEndOfTurn(info) {
615
- if (this.draining) {
675
+ if (this.schedulingPaused) {
616
676
  this.cancelPreemptiveGeneration();
617
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
677
+ this.logger.warn(
678
+ { user_input: info.newTranscript },
679
+ "skipping user input, speech scheduling is paused"
680
+ );
618
681
  return true;
619
682
  }
620
683
  if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
@@ -633,7 +696,7 @@ class AgentActivity {
633
696
  }
634
697
  const oldTask = this._userTurnCompletedTask;
635
698
  this._userTurnCompletedTask = this.createSpeechTask({
636
- task: Task.from(() => this.userTurnCompleted(info, oldTask)),
699
+ taskFn: () => this.userTurnCompleted(info, oldTask),
637
700
  name: "AgentActivity.userTurnCompleted"
638
701
  });
639
702
  return true;
@@ -663,14 +726,41 @@ class AgentActivity {
663
726
  await speechHandle._waitForGeneration();
664
727
  this._currentSpeech = void 0;
665
728
  }
666
- if (this.draining && this.speechTasks.size === 0) {
667
- this.logger.info("mainTask: draining and no more speech tasks");
729
+ const toWait = this.getDrainPendingSpeechTasks();
730
+ if (this._schedulingPaused && toWait.length === 0) {
731
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
668
732
  break;
669
733
  }
670
734
  this.q_updated = new Future();
671
735
  }
672
736
  this.logger.info("AgentActivity mainTask: exiting");
673
737
  }
738
+ getDrainPendingSpeechTasks() {
739
+ const blockedHandles = [];
740
+ for (const task of this._drainBlockedTasks) {
741
+ const info = _getActivityTaskInfo(task);
742
+ if (!info) {
743
+ this.logger.error("blocked task without activity info; skipping.");
744
+ continue;
745
+ }
746
+ if (!info.speechHandle) {
747
+ continue;
748
+ }
749
+ blockedHandles.push(info.speechHandle);
750
+ }
751
+ const toWait = [];
752
+ for (const task of this.speechTasks) {
753
+ if (this._drainBlockedTasks.includes(task)) {
754
+ continue;
755
+ }
756
+ const info = _getActivityTaskInfo(task);
757
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
758
+ continue;
759
+ }
760
+ toWait.push(task);
761
+ }
762
+ return toWait;
763
+ }
674
764
  wakeupMainTask() {
675
765
  this.q_updated.resolve();
676
766
  }
@@ -696,7 +786,7 @@ class AgentActivity {
696
786
  if (this.llm === void 0) {
697
787
  throw new Error("trying to generate reply without an LLM model");
698
788
  }
699
- const functionCall = (_a = asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
789
+ const functionCall = (_a = functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
700
790
  if (toolChoice === void 0 && functionCall !== void 0) {
701
791
  toolChoice = "none";
702
792
  }
@@ -714,19 +804,17 @@ class AgentActivity {
714
804
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
715
805
  if (this.llm instanceof RealtimeModel) {
716
806
  this.createSpeechTask({
717
- task: Task.from(
718
- (abortController) => this.realtimeReplyTask({
719
- speechHandle: handle,
720
- // TODO(brian): support llm.ChatMessage for the realtime model
721
- userInput: userMessage == null ? void 0 : userMessage.textContent,
722
- instructions,
723
- modelSettings: {
724
- // isGiven(toolChoice) = toolChoice !== undefined
725
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
726
- },
727
- abortController
728
- })
729
- ),
807
+ taskFn: (abortController) => this.realtimeReplyTask({
808
+ speechHandle: handle,
809
+ // TODO(brian): support llm.ChatMessage for the realtime model
810
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
811
+ instructions,
812
+ modelSettings: {
813
+ // isGiven(toolChoice) = toolChoice !== undefined
814
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
815
+ },
816
+ abortController
817
+ }),
730
818
  ownedSpeechHandle: handle,
731
819
  name: "AgentActivity.realtimeReply"
732
820
  });
@@ -735,37 +823,44 @@ class AgentActivity {
735
823
  instructions = `${this.agent.instructions}
736
824
  ${instructions}`;
737
825
  }
826
+ const onEnterData = onEnterStorage.getStore();
827
+ const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
828
+ const tools = shouldFilterTools ? Object.fromEntries(
829
+ Object.entries(this.agent.toolCtx).filter(
830
+ ([, fnTool]) => !(fnTool.flags & ToolFlag.IGNORE_ON_ENTER)
831
+ )
832
+ ) : this.agent.toolCtx;
738
833
  const task = this.createSpeechTask({
739
- task: Task.from(
740
- (abortController) => this.pipelineReplyTask(
741
- handle,
742
- chatCtx ?? this.agent.chatCtx,
743
- this.agent.toolCtx,
744
- {
745
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
746
- },
747
- abortController,
748
- instructions,
749
- userMessage
750
- )
834
+ taskFn: (abortController) => this.pipelineReplyTask(
835
+ handle,
836
+ chatCtx ?? this.agent.chatCtx,
837
+ tools,
838
+ {
839
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
840
+ },
841
+ abortController,
842
+ instructions,
843
+ userMessage
751
844
  ),
752
845
  ownedSpeechHandle: handle,
753
846
  name: "AgentActivity.pipelineReply"
754
847
  });
755
- task.finally(() => this.onPipelineReplyDone());
848
+ task.result.finally(() => this.onPipelineReplyDone());
756
849
  }
757
850
  if (scheduleSpeech) {
758
851
  this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
759
852
  }
760
853
  return handle;
761
854
  }
762
- interrupt() {
855
+ interrupt(options = {}) {
763
856
  var _a;
857
+ const { force = false } = options;
858
+ this.cancelPreemptiveGeneration();
764
859
  const future = new Future();
765
860
  const currentSpeech = this._currentSpeech;
766
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
861
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
767
862
  for (const [_, __, speech] of this.speechQueue) {
768
- speech.interrupt();
863
+ speech.interrupt(force);
769
864
  }
770
865
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
771
866
  if (currentSpeech === void 0) {
@@ -786,7 +881,7 @@ ${instructions}`;
786
881
  async userTurnCompleted(info, oldTask) {
787
882
  var _a, _b;
788
883
  if (oldTask) {
789
- await oldTask;
884
+ await oldTask.result;
790
885
  }
791
886
  if (this.llm instanceof RealtimeModel) {
792
887
  if (this.llm.capabilities.turnDetection) {
@@ -973,7 +1068,7 @@ ${instructions}`;
973
1068
  toolsMessages,
974
1069
  span
975
1070
  }) => {
976
- var _a, _b, _c, _d, _e;
1071
+ var _a, _b;
977
1072
  speechHandle._agentTurnContext = otelContext.active();
978
1073
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
979
1074
  if (instructions) {
@@ -1119,11 +1214,11 @@ ${instructions}`;
1119
1214
  for (const msg of toolsMessages) {
1120
1215
  msg.createdAt = replyStartedAt;
1121
1216
  }
1122
- this.agent._chatCtx.insert(toolsMessages);
1123
1217
  const toolCallOutputs = toolsMessages.filter(
1124
1218
  (m) => m.type === "function_call_output"
1125
1219
  );
1126
1220
  if (toolCallOutputs.length > 0) {
1221
+ this.agent._chatCtx.insert(toolCallOutputs);
1127
1222
  this.agentSession._toolItemsAdded(toolCallOutputs);
1128
1223
  }
1129
1224
  }
@@ -1211,45 +1306,15 @@ ${instructions}`;
1211
1306
  );
1212
1307
  return;
1213
1308
  }
1214
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1215
- functionCalls: [],
1216
- functionCallOutputs: []
1217
- });
1218
- let shouldGenerateToolReply = false;
1219
- let newAgentTask = null;
1220
- let ignoreTaskSwitch = false;
1221
- for (const sanitizedOut of toolOutput.output) {
1222
- if (sanitizedOut.toolCallOutput !== void 0) {
1223
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1224
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1225
- if (sanitizedOut.replyRequired) {
1226
- shouldGenerateToolReply = true;
1227
- }
1228
- }
1229
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1230
- this.logger.error("expected to receive only one agent task from the tool executions");
1231
- ignoreTaskSwitch = true;
1232
- }
1233
- newAgentTask = sanitizedOut.agentTask ?? null;
1234
- this.logger.debug(
1235
- {
1236
- speechId: speechHandle.id,
1237
- name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
1238
- args: sanitizedOut.toolCall.args,
1239
- output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
1240
- isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
1241
- },
1242
- "Tool call execution finished"
1243
- );
1244
- }
1309
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1245
1310
  this.agentSession.emit(
1246
1311
  AgentSessionEventTypes.FunctionToolsExecuted,
1247
1312
  functionToolsExecutedEvent
1248
1313
  );
1249
- let draining = this.draining;
1314
+ let schedulingPaused = this.schedulingPaused;
1250
1315
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1251
1316
  this.agentSession.updateAgent(newAgentTask);
1252
- draining = true;
1317
+ schedulingPaused = true;
1253
1318
  }
1254
1319
  const toolMessages = [
1255
1320
  ...functionToolsExecutedEvent.functionCalls,
@@ -1258,34 +1323,32 @@ ${instructions}`;
1258
1323
  if (shouldGenerateToolReply) {
1259
1324
  chatCtx.insert(toolMessages);
1260
1325
  speechHandle._numSteps += 1;
1261
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1326
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1262
1327
  const toolResponseTask = this.createSpeechTask({
1263
- task: Task.from(
1264
- () => this.pipelineReplyTask(
1265
- speechHandle,
1266
- chatCtx,
1267
- toolCtx,
1268
- { toolChoice: respondToolChoice },
1269
- replyAbortController,
1270
- instructions,
1271
- void 0,
1272
- toolMessages
1273
- )
1328
+ taskFn: () => this.pipelineReplyTask(
1329
+ speechHandle,
1330
+ chatCtx,
1331
+ toolCtx,
1332
+ { toolChoice: respondToolChoice },
1333
+ replyAbortController,
1334
+ instructions,
1335
+ void 0,
1336
+ toolMessages
1274
1337
  ),
1275
1338
  ownedSpeechHandle: speechHandle,
1276
1339
  name: "AgentActivity.pipelineReply"
1277
1340
  });
1278
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1341
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1279
1342
  this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1280
1343
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1281
1344
  for (const msg of toolMessages) {
1282
1345
  msg.createdAt = replyStartedAt;
1283
1346
  }
1284
- this.agent._chatCtx.insert(toolMessages);
1285
1347
  const toolCallOutputs = toolMessages.filter(
1286
1348
  (m) => m.type === "function_call_output"
1287
1349
  );
1288
1350
  if (toolCallOutputs.length > 0) {
1351
+ this.agent._chatCtx.insert(toolCallOutputs);
1289
1352
  this.agentSession._toolItemsAdded(toolCallOutputs);
1290
1353
  }
1291
1354
  }
@@ -1329,7 +1392,7 @@ ${instructions}`;
1329
1392
  replyAbortController,
1330
1393
  span
1331
1394
  }) {
1332
- var _a, _b, _c, _d;
1395
+ var _a;
1333
1396
  speechHandle._agentTurnContext = otelContext.active();
1334
1397
  span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1335
1398
  const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
@@ -1589,44 +1652,15 @@ ${instructions}`;
1589
1652
  );
1590
1653
  return;
1591
1654
  }
1592
- const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1593
- functionCalls: [],
1594
- functionCallOutputs: []
1595
- });
1596
- let shouldGenerateToolReply = false;
1597
- let newAgentTask = null;
1598
- let ignoreTaskSwitch = false;
1599
- for (const sanitizedOut of toolOutput.output) {
1600
- if (sanitizedOut.toolCallOutput !== void 0) {
1601
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1602
- if (sanitizedOut.replyRequired) {
1603
- shouldGenerateToolReply = true;
1604
- }
1605
- }
1606
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1607
- this.logger.error("expected to receive only one agent task from the tool executions");
1608
- ignoreTaskSwitch = true;
1609
- }
1610
- newAgentTask = sanitizedOut.agentTask ?? null;
1611
- this.logger.debug(
1612
- {
1613
- speechId: speechHandle.id,
1614
- name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
1615
- args: sanitizedOut.toolCall.args,
1616
- output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
1617
- isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
1618
- },
1619
- "Tool call execution finished"
1620
- );
1621
- }
1655
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1622
1656
  this.agentSession.emit(
1623
1657
  AgentSessionEventTypes.FunctionToolsExecuted,
1624
1658
  functionToolsExecutedEvent
1625
1659
  );
1626
- let draining = this.draining;
1660
+ let schedulingPaused = this.schedulingPaused;
1627
1661
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1628
1662
  this.agentSession.updateAgent(newAgentTask);
1629
- draining = true;
1663
+ schedulingPaused = true;
1630
1664
  }
1631
1665
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1632
1666
  while (this.currentSpeech || this.speechQueue.size() > 0) {
@@ -1667,20 +1701,58 @@ ${instructions}`;
1667
1701
  speechHandle: replySpeechHandle
1668
1702
  })
1669
1703
  );
1670
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1704
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1671
1705
  this.createSpeechTask({
1672
- task: Task.from(
1673
- (abortController) => this.realtimeReplyTask({
1674
- speechHandle: replySpeechHandle,
1675
- modelSettings: { toolChoice },
1676
- abortController
1677
- })
1678
- ),
1706
+ taskFn: (abortController) => this.realtimeReplyTask({
1707
+ speechHandle: replySpeechHandle,
1708
+ modelSettings: { toolChoice },
1709
+ abortController
1710
+ }),
1679
1711
  ownedSpeechHandle: replySpeechHandle,
1680
1712
  name: "AgentActivity.realtime_reply"
1681
1713
  });
1682
1714
  this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1683
1715
  }
1716
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1717
+ var _a, _b, _c;
1718
+ const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({
1719
+ functionCalls: [],
1720
+ functionCallOutputs: []
1721
+ });
1722
+ let shouldGenerateToolReply = false;
1723
+ let newAgentTask = null;
1724
+ let ignoreTaskSwitch = false;
1725
+ for (const sanitizedOut of toolOutput.output) {
1726
+ if (sanitizedOut.toolCallOutput !== void 0) {
1727
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1728
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1729
+ if (sanitizedOut.replyRequired) {
1730
+ shouldGenerateToolReply = true;
1731
+ }
1732
+ }
1733
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1734
+ this.logger.error("expected to receive only one agent task from the tool executions");
1735
+ ignoreTaskSwitch = true;
1736
+ }
1737
+ newAgentTask = sanitizedOut.agentTask ?? null;
1738
+ this.logger.debug(
1739
+ {
1740
+ speechId: speechHandle.id,
1741
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1742
+ args: sanitizedOut.toolCall.args,
1743
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1744
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1745
+ },
1746
+ "Tool call execution finished"
1747
+ );
1748
+ }
1749
+ return {
1750
+ functionToolsExecutedEvent,
1751
+ shouldGenerateToolReply,
1752
+ newAgentTask,
1753
+ ignoreTaskSwitch
1754
+ };
1755
+ }
1684
1756
  async realtimeReplyTask({
1685
1757
  speechHandle,
1686
1758
  modelSettings: { toolChoice },
@@ -1722,13 +1794,45 @@ ${instructions}`;
1722
1794
  }
1723
1795
  }
1724
1796
  scheduleSpeech(speechHandle, priority, force = false) {
1725
- if (this.draining && !force) {
1726
- throw new Error("cannot schedule new speech, the agent is draining");
1797
+ if (this.schedulingPaused && !force) {
1798
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1727
1799
  }
1728
1800
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1729
1801
  speechHandle._markScheduled();
1730
1802
  this.wakeupMainTask();
1731
1803
  }
1804
+ async _pauseSchedulingTask(blockedTasks) {
1805
+ if (this._schedulingPaused) return;
1806
+ this._schedulingPaused = true;
1807
+ this._drainBlockedTasks = blockedTasks;
1808
+ this.wakeupMainTask();
1809
+ if (this._mainTask) {
1810
+ await this._mainTask.result;
1811
+ }
1812
+ }
1813
+ _resumeSchedulingTask() {
1814
+ if (!this._schedulingPaused) return;
1815
+ this._schedulingPaused = false;
1816
+ this._mainTask = Task.from(({ signal }) => this.mainTask(signal));
1817
+ }
1818
+ async pause(options = {}) {
1819
+ const { blockedTasks = [] } = options;
1820
+ const unlock = await this.lock.lock();
1821
+ try {
1822
+ const span = tracer.startSpan({
1823
+ name: "pause_agent_activity",
1824
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1825
+ });
1826
+ try {
1827
+ await this._pauseSchedulingTask(blockedTasks);
1828
+ await this._closeSessionResources();
1829
+ } finally {
1830
+ span.end();
1831
+ }
1832
+ } finally {
1833
+ unlock();
1834
+ }
1835
+ }
1732
1836
  async drain() {
1733
1837
  return tracer.startActiveSpan(async (span) => this._drainImpl(span), {
1734
1838
  name: "drain_agent_activity",
@@ -1736,71 +1840,80 @@ ${instructions}`;
1736
1840
  });
1737
1841
  }
1738
1842
  async _drainImpl(span) {
1739
- var _a;
1740
1843
  span.setAttribute(traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1741
1844
  const unlock = await this.lock.lock();
1742
1845
  try {
1743
- if (this._draining) return;
1744
- this.cancelPreemptiveGeneration();
1745
- const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), {
1746
- name: "on_exit",
1747
- attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1748
- });
1749
- this.createSpeechTask({
1750
- task: Task.from(() => onExitTask),
1846
+ if (this._schedulingPaused) return;
1847
+ this._onExitTask = this.createSpeechTask({
1848
+ taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), {
1849
+ name: "on_exit",
1850
+ attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1851
+ }),
1852
+ inlineTask: true,
1751
1853
  name: "AgentActivity_onExit"
1752
1854
  });
1753
- this.wakeupMainTask();
1754
- this._draining = true;
1755
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
1855
+ this.cancelPreemptiveGeneration();
1856
+ await this._onExitTask.result;
1857
+ await this._pauseSchedulingTask([]);
1756
1858
  } finally {
1757
1859
  unlock();
1758
1860
  }
1759
1861
  }
1760
1862
  async close() {
1761
- var _a, _b, _c, _d;
1762
1863
  const unlock = await this.lock.lock();
1763
1864
  try {
1764
- if (!this._draining) {
1765
- this.logger.warn("task closing without draining");
1766
- }
1767
1865
  this.cancelPreemptiveGeneration();
1768
- if (this.llm instanceof LLM) {
1769
- this.llm.off("metrics_collected", this.onMetricsCollected);
1770
- }
1771
- if (this.realtimeSession) {
1772
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1773
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1774
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1775
- this.realtimeSession.off(
1776
- "input_audio_transcription_completed",
1777
- this.onInputAudioTranscriptionCompleted
1778
- );
1779
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1780
- }
1781
- if (this.stt instanceof STT) {
1782
- this.stt.off("metrics_collected", this.onMetricsCollected);
1866
+ await this._closeSessionResources();
1867
+ if (this._mainTask) {
1868
+ await this._mainTask.cancelAndWait();
1783
1869
  }
1784
- if (this.tts instanceof TTS) {
1785
- this.tts.off("metrics_collected", this.onMetricsCollected);
1786
- }
1787
- if (this.vad instanceof VAD) {
1788
- this.vad.off("metrics_collected", this.onMetricsCollected);
1789
- }
1790
- this.detachAudioInput();
1791
- (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1792
- await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1793
- await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1794
- await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
1870
+ this.agent._agentActivity = void 0;
1795
1871
  } finally {
1796
1872
  unlock();
1797
1873
  }
1798
1874
  }
1875
+ async _closeSessionResources() {
1876
+ var _a, _b, _c;
1877
+ if (this.llm instanceof LLM) {
1878
+ this.llm.off("metrics_collected", this.onMetricsCollected);
1879
+ this.llm.off("error", this.onModelError);
1880
+ }
1881
+ if (this.realtimeSession) {
1882
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
1883
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
1884
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
1885
+ this.realtimeSession.off(
1886
+ "input_audio_transcription_completed",
1887
+ this.onRealtimeInputAudioTranscriptionCompleted
1888
+ );
1889
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1890
+ this.realtimeSession.off("error", this.onModelError);
1891
+ }
1892
+ if (this.stt instanceof STT) {
1893
+ this.stt.off("metrics_collected", this.onMetricsCollected);
1894
+ this.stt.off("error", this.onModelError);
1895
+ }
1896
+ if (this.tts instanceof TTS) {
1897
+ this.tts.off("metrics_collected", this.onMetricsCollected);
1898
+ this.tts.off("error", this.onModelError);
1899
+ }
1900
+ if (this.vad instanceof VAD) {
1901
+ this.vad.off("metrics_collected", this.onMetricsCollected);
1902
+ }
1903
+ this.detachAudioInput();
1904
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1905
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1906
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1907
+ this.realtimeSession = void 0;
1908
+ this.audioRecognition = void 0;
1909
+ }
1799
1910
  }
1800
1911
  function toOaiToolChoice(toolChoice) {
1801
1912
  return toolChoice !== null ? toolChoice : void 0;
1802
1913
  }
1803
1914
  export {
1804
- AgentActivity
1915
+ AgentActivity,
1916
+ agentActivityStorage,
1917
+ onEnterStorage
1805
1918
  };
1806
1919
  //# sourceMappingURL=agent_activity.js.map