@livekit/agents 1.0.46 → 1.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. package/dist/beta/index.cjs +29 -0
  2. package/dist/beta/index.cjs.map +1 -0
  3. package/dist/beta/index.d.cts +2 -0
  4. package/dist/beta/index.d.ts +2 -0
  5. package/dist/beta/index.d.ts.map +1 -0
  6. package/dist/beta/index.js +7 -0
  7. package/dist/beta/index.js.map +1 -0
  8. package/dist/beta/workflows/index.cjs +29 -0
  9. package/dist/beta/workflows/index.cjs.map +1 -0
  10. package/dist/beta/workflows/index.d.cts +2 -0
  11. package/dist/beta/workflows/index.d.ts +2 -0
  12. package/dist/beta/workflows/index.d.ts.map +1 -0
  13. package/dist/beta/workflows/index.js +7 -0
  14. package/dist/beta/workflows/index.js.map +1 -0
  15. package/dist/beta/workflows/task_group.cjs +162 -0
  16. package/dist/beta/workflows/task_group.cjs.map +1 -0
  17. package/dist/beta/workflows/task_group.d.cts +32 -0
  18. package/dist/beta/workflows/task_group.d.ts +32 -0
  19. package/dist/beta/workflows/task_group.d.ts.map +1 -0
  20. package/dist/beta/workflows/task_group.js +138 -0
  21. package/dist/beta/workflows/task_group.js.map +1 -0
  22. package/dist/cli.cjs +14 -20
  23. package/dist/cli.cjs.map +1 -1
  24. package/dist/cli.d.ts.map +1 -1
  25. package/dist/cli.js +14 -20
  26. package/dist/cli.js.map +1 -1
  27. package/dist/index.cjs +3 -0
  28. package/dist/index.cjs.map +1 -1
  29. package/dist/index.d.cts +2 -1
  30. package/dist/index.d.ts +2 -1
  31. package/dist/index.d.ts.map +1 -1
  32. package/dist/index.js +2 -0
  33. package/dist/index.js.map +1 -1
  34. package/dist/inference/api_protos.d.cts +59 -59
  35. package/dist/inference/api_protos.d.ts +59 -59
  36. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  37. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  38. package/dist/ipc/job_proc_lazy_main.js +14 -5
  39. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  40. package/dist/llm/chat_context.cjs +108 -1
  41. package/dist/llm/chat_context.cjs.map +1 -1
  42. package/dist/llm/chat_context.d.cts +14 -1
  43. package/dist/llm/chat_context.d.ts +14 -1
  44. package/dist/llm/chat_context.d.ts.map +1 -1
  45. package/dist/llm/chat_context.js +108 -1
  46. package/dist/llm/chat_context.js.map +1 -1
  47. package/dist/llm/chat_context.test.cjs +43 -0
  48. package/dist/llm/chat_context.test.cjs.map +1 -1
  49. package/dist/llm/chat_context.test.js +43 -0
  50. package/dist/llm/chat_context.test.js.map +1 -1
  51. package/dist/llm/index.cjs +2 -0
  52. package/dist/llm/index.cjs.map +1 -1
  53. package/dist/llm/index.d.cts +1 -1
  54. package/dist/llm/index.d.ts +1 -1
  55. package/dist/llm/index.d.ts.map +1 -1
  56. package/dist/llm/index.js +3 -1
  57. package/dist/llm/index.js.map +1 -1
  58. package/dist/llm/provider_format/index.cjs +2 -0
  59. package/dist/llm/provider_format/index.cjs.map +1 -1
  60. package/dist/llm/provider_format/index.d.cts +2 -2
  61. package/dist/llm/provider_format/index.d.ts +2 -2
  62. package/dist/llm/provider_format/index.d.ts.map +1 -1
  63. package/dist/llm/provider_format/index.js +6 -1
  64. package/dist/llm/provider_format/index.js.map +1 -1
  65. package/dist/llm/provider_format/openai.cjs +82 -2
  66. package/dist/llm/provider_format/openai.cjs.map +1 -1
  67. package/dist/llm/provider_format/openai.d.cts +1 -0
  68. package/dist/llm/provider_format/openai.d.ts +1 -0
  69. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  70. package/dist/llm/provider_format/openai.js +80 -1
  71. package/dist/llm/provider_format/openai.js.map +1 -1
  72. package/dist/llm/provider_format/openai.test.cjs +326 -0
  73. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  74. package/dist/llm/provider_format/openai.test.js +327 -1
  75. package/dist/llm/provider_format/openai.test.js.map +1 -1
  76. package/dist/llm/provider_format/utils.cjs +4 -3
  77. package/dist/llm/provider_format/utils.cjs.map +1 -1
  78. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  79. package/dist/llm/provider_format/utils.js +4 -3
  80. package/dist/llm/provider_format/utils.js.map +1 -1
  81. package/dist/llm/realtime.cjs.map +1 -1
  82. package/dist/llm/realtime.d.cts +1 -0
  83. package/dist/llm/realtime.d.ts +1 -0
  84. package/dist/llm/realtime.d.ts.map +1 -1
  85. package/dist/llm/realtime.js.map +1 -1
  86. package/dist/llm/tool_context.cjs +7 -0
  87. package/dist/llm/tool_context.cjs.map +1 -1
  88. package/dist/llm/tool_context.d.cts +10 -2
  89. package/dist/llm/tool_context.d.ts +10 -2
  90. package/dist/llm/tool_context.d.ts.map +1 -1
  91. package/dist/llm/tool_context.js +6 -0
  92. package/dist/llm/tool_context.js.map +1 -1
  93. package/dist/log.cjs +5 -2
  94. package/dist/log.cjs.map +1 -1
  95. package/dist/log.d.ts.map +1 -1
  96. package/dist/log.js +5 -2
  97. package/dist/log.js.map +1 -1
  98. package/dist/stream/deferred_stream.cjs +15 -6
  99. package/dist/stream/deferred_stream.cjs.map +1 -1
  100. package/dist/stream/deferred_stream.d.ts.map +1 -1
  101. package/dist/stream/deferred_stream.js +15 -6
  102. package/dist/stream/deferred_stream.js.map +1 -1
  103. package/dist/utils.cjs +32 -2
  104. package/dist/utils.cjs.map +1 -1
  105. package/dist/utils.d.cts +7 -0
  106. package/dist/utils.d.ts +7 -0
  107. package/dist/utils.d.ts.map +1 -1
  108. package/dist/utils.js +32 -2
  109. package/dist/utils.js.map +1 -1
  110. package/dist/utils.test.cjs +71 -0
  111. package/dist/utils.test.cjs.map +1 -1
  112. package/dist/utils.test.js +71 -0
  113. package/dist/utils.test.js.map +1 -1
  114. package/dist/version.cjs +1 -1
  115. package/dist/version.cjs.map +1 -1
  116. package/dist/version.d.cts +1 -1
  117. package/dist/version.d.ts +1 -1
  118. package/dist/version.d.ts.map +1 -1
  119. package/dist/version.js +1 -1
  120. package/dist/version.js.map +1 -1
  121. package/dist/voice/agent.cjs +153 -12
  122. package/dist/voice/agent.cjs.map +1 -1
  123. package/dist/voice/agent.d.cts +30 -4
  124. package/dist/voice/agent.d.ts +30 -4
  125. package/dist/voice/agent.d.ts.map +1 -1
  126. package/dist/voice/agent.js +149 -11
  127. package/dist/voice/agent.js.map +1 -1
  128. package/dist/voice/agent.test.cjs +120 -0
  129. package/dist/voice/agent.test.cjs.map +1 -1
  130. package/dist/voice/agent.test.js +122 -2
  131. package/dist/voice/agent.test.js.map +1 -1
  132. package/dist/voice/agent_activity.cjs +406 -298
  133. package/dist/voice/agent_activity.cjs.map +1 -1
  134. package/dist/voice/agent_activity.d.cts +41 -7
  135. package/dist/voice/agent_activity.d.ts +41 -7
  136. package/dist/voice/agent_activity.d.ts.map +1 -1
  137. package/dist/voice/agent_activity.js +407 -294
  138. package/dist/voice/agent_activity.js.map +1 -1
  139. package/dist/voice/agent_session.cjs +140 -40
  140. package/dist/voice/agent_session.cjs.map +1 -1
  141. package/dist/voice/agent_session.d.cts +19 -7
  142. package/dist/voice/agent_session.d.ts +19 -7
  143. package/dist/voice/agent_session.d.ts.map +1 -1
  144. package/dist/voice/agent_session.js +137 -37
  145. package/dist/voice/agent_session.js.map +1 -1
  146. package/dist/voice/audio_recognition.cjs +4 -0
  147. package/dist/voice/audio_recognition.cjs.map +1 -1
  148. package/dist/voice/audio_recognition.d.ts.map +1 -1
  149. package/dist/voice/audio_recognition.js +4 -0
  150. package/dist/voice/audio_recognition.js.map +1 -1
  151. package/dist/voice/generation.cjs +39 -19
  152. package/dist/voice/generation.cjs.map +1 -1
  153. package/dist/voice/generation.d.ts.map +1 -1
  154. package/dist/voice/generation.js +44 -20
  155. package/dist/voice/generation.js.map +1 -1
  156. package/dist/voice/index.cjs +2 -0
  157. package/dist/voice/index.cjs.map +1 -1
  158. package/dist/voice/index.d.cts +1 -1
  159. package/dist/voice/index.d.ts +1 -1
  160. package/dist/voice/index.d.ts.map +1 -1
  161. package/dist/voice/index.js +2 -1
  162. package/dist/voice/index.js.map +1 -1
  163. package/dist/voice/room_io/room_io.cjs +11 -2
  164. package/dist/voice/room_io/room_io.cjs.map +1 -1
  165. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  166. package/dist/voice/room_io/room_io.js +12 -3
  167. package/dist/voice/room_io/room_io.js.map +1 -1
  168. package/dist/voice/speech_handle.cjs +7 -1
  169. package/dist/voice/speech_handle.cjs.map +1 -1
  170. package/dist/voice/speech_handle.d.cts +2 -0
  171. package/dist/voice/speech_handle.d.ts +2 -0
  172. package/dist/voice/speech_handle.d.ts.map +1 -1
  173. package/dist/voice/speech_handle.js +8 -2
  174. package/dist/voice/speech_handle.js.map +1 -1
  175. package/dist/voice/testing/fake_llm.cjs +127 -0
  176. package/dist/voice/testing/fake_llm.cjs.map +1 -0
  177. package/dist/voice/testing/fake_llm.d.cts +30 -0
  178. package/dist/voice/testing/fake_llm.d.ts +30 -0
  179. package/dist/voice/testing/fake_llm.d.ts.map +1 -0
  180. package/dist/voice/testing/fake_llm.js +103 -0
  181. package/dist/voice/testing/fake_llm.js.map +1 -0
  182. package/dist/voice/testing/index.cjs +3 -0
  183. package/dist/voice/testing/index.cjs.map +1 -1
  184. package/dist/voice/testing/index.d.cts +1 -0
  185. package/dist/voice/testing/index.d.ts +1 -0
  186. package/dist/voice/testing/index.d.ts.map +1 -1
  187. package/dist/voice/testing/index.js +2 -0
  188. package/dist/voice/testing/index.js.map +1 -1
  189. package/dist/voice/testing/run_result.cjs +66 -15
  190. package/dist/voice/testing/run_result.cjs.map +1 -1
  191. package/dist/voice/testing/run_result.d.cts +14 -3
  192. package/dist/voice/testing/run_result.d.ts +14 -3
  193. package/dist/voice/testing/run_result.d.ts.map +1 -1
  194. package/dist/voice/testing/run_result.js +66 -15
  195. package/dist/voice/testing/run_result.js.map +1 -1
  196. package/package.json +1 -1
  197. package/src/beta/index.ts +9 -0
  198. package/src/beta/workflows/index.ts +9 -0
  199. package/src/beta/workflows/task_group.ts +194 -0
  200. package/src/cli.ts +20 -33
  201. package/src/index.ts +2 -1
  202. package/src/ipc/job_proc_lazy_main.ts +16 -5
  203. package/src/llm/chat_context.test.ts +48 -0
  204. package/src/llm/chat_context.ts +158 -0
  205. package/src/llm/index.ts +1 -0
  206. package/src/llm/provider_format/index.ts +7 -2
  207. package/src/llm/provider_format/openai.test.ts +385 -1
  208. package/src/llm/provider_format/openai.ts +103 -0
  209. package/src/llm/provider_format/utils.ts +6 -4
  210. package/src/llm/realtime.ts +1 -0
  211. package/src/llm/tool_context.ts +14 -0
  212. package/src/log.ts +5 -2
  213. package/src/stream/deferred_stream.ts +17 -6
  214. package/src/utils.test.ts +87 -0
  215. package/src/utils.ts +41 -2
  216. package/src/version.ts +1 -1
  217. package/src/voice/agent.test.ts +140 -2
  218. package/src/voice/agent.ts +200 -10
  219. package/src/voice/agent_activity.ts +466 -290
  220. package/src/voice/agent_session.ts +178 -40
  221. package/src/voice/audio_recognition.ts +4 -0
  222. package/src/voice/generation.ts +52 -23
  223. package/src/voice/index.ts +1 -1
  224. package/src/voice/room_io/room_io.ts +14 -3
  225. package/src/voice/speech_handle.ts +9 -2
  226. package/src/voice/testing/fake_llm.ts +138 -0
  227. package/src/voice/testing/index.ts +2 -0
  228. package/src/voice/testing/run_result.ts +81 -23
@@ -18,7 +18,9 @@ var __copyProps = (to, from, except, desc) => {
18
18
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
19
  var agent_activity_exports = {};
20
20
  __export(agent_activity_exports, {
21
- AgentActivity: () => AgentActivity
21
+ AgentActivity: () => AgentActivity,
22
+ agentActivityStorage: () => agentActivityStorage,
23
+ onEnterStorage: () => onEnterStorage
22
24
  });
23
25
  module.exports = __toCommonJS(agent_activity_exports);
24
26
  var import_mutex = require("@livekit/mutex");
@@ -30,7 +32,7 @@ var import_chat_context = require("../llm/chat_context.cjs");
30
32
  var import_llm = require("../llm/index.cjs");
31
33
  var import_tool_context = require("../llm/tool_context.cjs");
32
34
  var import_log = require("../log.cjs");
33
- var import_deferred_stream = require("../stream/deferred_stream.cjs");
35
+ var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
34
36
  var import_stt = require("../stt/stt.cjs");
35
37
  var import_telemetry = require("../telemetry/index.cjs");
36
38
  var import_word = require("../tokenize/basic/word.cjs");
@@ -44,8 +46,11 @@ var import_events = require("./events.cjs");
44
46
  var import_generation = require("./generation.cjs");
45
47
  var import_speech_handle = require("./speech_handle.cjs");
46
48
  var import_utils2 = require("./utils.cjs");
47
- const speechHandleStorage = new import_node_async_hooks.AsyncLocalStorage();
49
+ const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
50
+ const onEnterStorage = new import_node_async_hooks.AsyncLocalStorage();
48
51
  class AgentActivity {
52
+ agent;
53
+ agentSession;
49
54
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
50
55
  started = false;
51
56
  audioRecognition;
@@ -54,22 +59,29 @@ class AgentActivity {
54
59
  // Maps response_id to OTEL span for metrics recording
55
60
  turnDetectionMode;
56
61
  logger = (0, import_log.log)();
57
- _draining = false;
62
+ _schedulingPaused = true;
63
+ _drainBlockedTasks = [];
58
64
  _currentSpeech;
59
65
  speechQueue;
60
66
  // [priority, timestamp, speechHandle]
61
67
  q_updated;
62
68
  speechTasks = /* @__PURE__ */ new Set();
63
69
  lock = new import_mutex.Mutex();
64
- audioStream = new import_deferred_stream.DeferredReadableStream();
70
+ audioStream = new import_multi_input_stream.MultiInputStream();
71
+ audioStreamId;
65
72
  // default to null as None, which maps to the default provider tool choice value
66
73
  toolChoice = null;
67
74
  _preemptiveGeneration;
68
- agent;
69
- agentSession;
70
75
  /** @internal */
71
76
  _mainTask;
77
+ _onEnterTask;
78
+ _onExitTask;
72
79
  _userTurnCompletedTask;
80
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
81
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
82
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
83
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
84
+ onModelError = (ev) => this.onError(ev);
73
85
  constructor(agent, agentSession) {
74
86
  this.agent = agent;
75
87
  this.agentSession = agentSession;
@@ -80,7 +92,7 @@ class AgentActivity {
80
92
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
81
93
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
82
94
  this.logger.warn(
83
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
95
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
84
96
  );
85
97
  this.turnDetectionMode = void 0;
86
98
  }
@@ -130,107 +142,124 @@ class AgentActivity {
130
142
  }
131
143
  }
132
144
  async start() {
133
- var _a;
134
145
  const unlock = await this.lock.lock();
135
146
  try {
136
- const startSpan = import_telemetry.tracer.startSpan({
137
- name: "start_agent_activity",
138
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
139
- context: import_api.ROOT_CONTEXT
140
- });
141
- this.agent._agentActivity = this;
142
- if (this.llm instanceof import_llm.RealtimeModel) {
143
- this.realtimeSession = this.llm.session();
144
- this.realtimeSpans = /* @__PURE__ */ new Map();
145
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
146
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
147
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
148
- this.realtimeSession.on(
149
- "input_audio_transcription_completed",
150
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
151
- );
152
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
153
- this.realtimeSession.on("error", (ev) => this.onError(ev));
154
- (0, import_generation.removeInstructions)(this.agent._chatCtx);
155
- try {
156
- await this.realtimeSession.updateInstructions(this.agent.instructions);
157
- } catch (error) {
158
- this.logger.error(error, "failed to update the instructions");
159
- }
160
- try {
161
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
162
- } catch (error) {
163
- this.logger.error(error, "failed to update the chat context");
164
- }
165
- try {
166
- await this.realtimeSession.updateTools(this.tools);
167
- } catch (error) {
168
- this.logger.error(error, "failed to update the tools");
169
- }
170
- if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
171
- this.logger.error(
172
- "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
173
- );
174
- }
175
- } else if (this.llm instanceof import_llm.LLM) {
176
- try {
177
- (0, import_generation.updateInstructions)({
178
- chatCtx: this.agent._chatCtx,
179
- instructions: this.agent.instructions,
180
- addIfMissing: true
181
- });
182
- } catch (error) {
183
- this.logger.error("failed to update the instructions", error);
184
- }
147
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
148
+ } finally {
149
+ unlock();
150
+ }
151
+ }
152
+ async resume() {
153
+ const unlock = await this.lock.lock();
154
+ try {
155
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
156
+ } finally {
157
+ unlock();
158
+ }
159
+ }
160
+ async _startSession(options) {
161
+ var _a;
162
+ const { spanName, runOnEnter } = options;
163
+ const startSpan = import_telemetry.tracer.startSpan({
164
+ name: spanName,
165
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
166
+ context: import_api.ROOT_CONTEXT
167
+ });
168
+ this.agent._agentActivity = this;
169
+ if (this.llm instanceof import_llm.RealtimeModel) {
170
+ this.realtimeSession = this.llm.session();
171
+ this.realtimeSpans = /* @__PURE__ */ new Map();
172
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
173
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
174
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
175
+ this.realtimeSession.on(
176
+ "input_audio_transcription_completed",
177
+ this.onRealtimeInputAudioTranscriptionCompleted
178
+ );
179
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
180
+ this.realtimeSession.on("error", this.onModelError);
181
+ (0, import_generation.removeInstructions)(this.agent._chatCtx);
182
+ try {
183
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
184
+ } catch (error) {
185
+ this.logger.error(error, "failed to update the instructions");
185
186
  }
186
- if (this.llm instanceof import_llm.LLM) {
187
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
188
- this.llm.on("error", (ev) => this.onError(ev));
187
+ try {
188
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
189
+ } catch (error) {
190
+ this.logger.error(error, "failed to update the chat context");
189
191
  }
190
- if (this.stt instanceof import_stt.STT) {
191
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
192
- this.stt.on("error", (ev) => this.onError(ev));
192
+ try {
193
+ await this.realtimeSession.updateTools(this.tools);
194
+ } catch (error) {
195
+ this.logger.error(error, "failed to update the tools");
196
+ }
197
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
198
+ this.logger.error(
199
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
200
+ );
193
201
  }
194
- if (this.tts instanceof import_tts.TTS) {
195
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
196
- this.tts.on("error", (ev) => this.onError(ev));
202
+ } else if (this.llm instanceof import_llm.LLM) {
203
+ try {
204
+ (0, import_generation.updateInstructions)({
205
+ chatCtx: this.agent._chatCtx,
206
+ instructions: this.agent.instructions,
207
+ addIfMissing: true
208
+ });
209
+ } catch (error) {
210
+ this.logger.error("failed to update the instructions", error);
197
211
  }
198
- if (this.vad instanceof import_vad.VAD) {
199
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
212
+ }
213
+ if (this.llm instanceof import_llm.LLM) {
214
+ this.llm.on("metrics_collected", this.onMetricsCollected);
215
+ this.llm.on("error", this.onModelError);
216
+ }
217
+ if (this.stt instanceof import_stt.STT) {
218
+ this.stt.on("metrics_collected", this.onMetricsCollected);
219
+ this.stt.on("error", this.onModelError);
220
+ }
221
+ if (this.tts instanceof import_tts.TTS) {
222
+ this.tts.on("metrics_collected", this.onMetricsCollected);
223
+ this.tts.on("error", this.onModelError);
224
+ }
225
+ if (this.vad instanceof import_vad.VAD) {
226
+ this.vad.on("metrics_collected", this.onMetricsCollected);
227
+ }
228
+ this.audioRecognition = new import_audio_recognition.AudioRecognition({
229
+ recognitionHooks: this,
230
+ // Disable stt node if stt is not provided
231
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
232
+ vad: this.vad,
233
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
234
+ turnDetectionMode: this.turnDetectionMode,
235
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
236
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
237
+ rootSpanContext: this.agentSession.rootSpanContext,
238
+ sttModel: (_a = this.stt) == null ? void 0 : _a.label,
239
+ sttProvider: this.getSttProvider(),
240
+ getLinkedParticipant: () => {
241
+ var _a2;
242
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
200
243
  }
201
- this.audioRecognition = new import_audio_recognition.AudioRecognition({
202
- recognitionHooks: this,
203
- // Disable stt node if stt is not provided
204
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
205
- vad: this.vad,
206
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
207
- turnDetectionMode: this.turnDetectionMode,
208
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
209
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
210
- rootSpanContext: this.agentSession.rootSpanContext,
211
- sttModel: (_a = this.stt) == null ? void 0 : _a.label,
212
- sttProvider: this.getSttProvider(),
213
- getLinkedParticipant: () => {
214
- var _a2;
215
- return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
216
- }
217
- });
218
- this.audioRecognition.start();
219
- this.started = true;
220
- this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
221
- const onEnterTask = import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
222
- name: "on_enter",
223
- context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
224
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
225
- });
226
- this.createSpeechTask({
227
- task: import_utils.Task.from(() => onEnterTask),
244
+ });
245
+ this.audioRecognition.start();
246
+ this.started = true;
247
+ this._resumeSchedulingTask();
248
+ if (runOnEnter) {
249
+ this._onEnterTask = this.createSpeechTask({
250
+ taskFn: () => onEnterStorage.run(
251
+ { session: this.agentSession, agent: this.agent },
252
+ () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
253
+ name: "on_enter",
254
+ context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
255
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
256
+ })
257
+ ),
258
+ inlineTask: true,
228
259
  name: "AgentActivity_onEnter"
229
260
  });
230
- startSpan.end();
231
- } finally {
232
- unlock();
233
261
  }
262
+ startSpan.end();
234
263
  }
235
264
  get currentSpeech() {
236
265
  return this._currentSpeech;
@@ -259,8 +288,8 @@ class AgentActivity {
259
288
  get tools() {
260
289
  return this.agent.toolCtx;
261
290
  }
262
- get draining() {
263
- return this._draining;
291
+ get schedulingPaused() {
292
+ return this._schedulingPaused;
264
293
  }
265
294
  get realtimeLLMSession() {
266
295
  return this.realtimeSession;
@@ -291,6 +320,16 @@ class AgentActivity {
291
320
  });
292
321
  }
293
322
  }
323
+ // TODO: Add when AgentConfigUpdate is ported to ChatContext.
324
+ async updateTools(tools) {
325
+ this.agent._tools = { ...tools };
326
+ if (this.realtimeSession) {
327
+ await this.realtimeSession.updateTools(tools);
328
+ }
329
+ if (this.llm instanceof import_llm.LLM) {
330
+ await this.updateChatCtx(this.agent._chatCtx.copy({ toolCtx: tools }));
331
+ }
332
+ }
294
333
  updateOptions({ toolChoice }) {
295
334
  if (toolChoice !== void 0) {
296
335
  this.toolChoice = toolChoice;
@@ -300,11 +339,9 @@ class AgentActivity {
300
339
  }
301
340
  }
302
341
  attachAudioInput(audioStream) {
303
- if (this.audioStream.isSourceSet) {
304
- this.logger.debug("detaching existing audio input in agent activity");
305
- this.audioStream.detachSource();
306
- }
307
- this.audioStream.setSource(audioStream);
342
+ void this.audioStream.close();
343
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
344
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
308
345
  const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
309
346
  if (this.realtimeSession) {
310
347
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -314,13 +351,21 @@ class AgentActivity {
314
351
  }
315
352
  }
316
353
  detachAudioInput() {
317
- this.audioStream.detachSource();
354
+ if (this.audioStreamId === void 0) {
355
+ return;
356
+ }
357
+ void this.audioStream.close();
358
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
359
+ this.audioStreamId = void 0;
318
360
  }
319
- commitUserTurn() {
361
+ commitUserTurn(options = {}) {
362
+ const { audioDetached = false, throwIfNotReady = true } = options;
320
363
  if (!this.audioRecognition) {
321
- throw new Error("AudioRecognition is not initialized");
364
+ if (throwIfNotReady) {
365
+ throw new Error("AudioRecognition is not initialized");
366
+ }
367
+ return;
322
368
  }
323
- const audioDetached = false;
324
369
  this.audioRecognition.commitUserTurn(audioDetached);
325
370
  }
326
371
  clearUserTurn() {
@@ -356,19 +401,17 @@ class AgentActivity {
356
401
  })
357
402
  );
358
403
  const task = this.createSpeechTask({
359
- task: import_utils.Task.from(
360
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
361
- ),
404
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
362
405
  ownedSpeechHandle: handle,
363
406
  name: "AgentActivity.say_tts"
364
407
  });
365
- task.finally(() => this.onPipelineReplyDone());
408
+ task.result.finally(() => this.onPipelineReplyDone());
366
409
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
367
410
  return handle;
368
411
  }
369
412
  // -- Metrics and errors --
370
413
  onMetricsCollected = (ev) => {
371
- const speechHandle = speechHandleStorage.getStore();
414
+ const speechHandle = import_agent.speechHandleStorage.getStore();
372
415
  if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
373
416
  ev.speechId = speechHandle.id;
374
417
  }
@@ -452,8 +495,8 @@ class AgentActivity {
452
495
  if (ev.userInitiated) {
453
496
  return;
454
497
  }
455
- if (this.draining) {
456
- this.logger.warn("skipping new realtime generation, the agent is draining");
498
+ if (this.schedulingPaused) {
499
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
457
500
  return;
458
501
  }
459
502
  const handle = import_speech_handle.SpeechHandle.create({
@@ -469,9 +512,7 @@ class AgentActivity {
469
512
  );
470
513
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
471
514
  this.createSpeechTask({
472
- task: import_utils.Task.from(
473
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
474
- ),
515
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
475
516
  ownedSpeechHandle: handle,
476
517
  name: "AgentActivity.realtimeGeneration"
477
518
  });
@@ -558,7 +599,7 @@ class AgentActivity {
558
599
  }
559
600
  }
560
601
  onPreemptiveGeneration(info) {
561
- if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
602
+ if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
562
603
  return;
563
604
  }
564
605
  this.cancelPreemptiveGeneration();
@@ -596,7 +637,21 @@ class AgentActivity {
596
637
  }
597
638
  }
598
639
  createSpeechTask(options) {
599
- const { task, ownedSpeechHandle } = options;
640
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
641
+ const wrappedFn = (ctrl) => {
642
+ return agentActivityStorage.run(this, () => {
643
+ const currentTask = import_utils.Task.current();
644
+ if (currentTask) {
645
+ (0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
646
+ }
647
+ if (ownedSpeechHandle) {
648
+ return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
649
+ }
650
+ return taskFn(ctrl);
651
+ });
652
+ };
653
+ const task = import_utils.Task.from(wrappedFn, controller, name);
654
+ (0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
600
655
  this.speechTasks.add(task);
601
656
  task.addDoneCallback(() => {
602
657
  this.speechTasks.delete(task);
@@ -612,12 +667,15 @@ class AgentActivity {
612
667
  task.addDoneCallback(() => {
613
668
  this.wakeupMainTask();
614
669
  });
615
- return task.result;
670
+ return task;
616
671
  }
617
672
  async onEndOfTurn(info) {
618
- if (this.draining) {
673
+ if (this.schedulingPaused) {
619
674
  this.cancelPreemptiveGeneration();
620
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
675
+ this.logger.warn(
676
+ { user_input: info.newTranscript },
677
+ "skipping user input, speech scheduling is paused"
678
+ );
621
679
  return true;
622
680
  }
623
681
  if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
@@ -636,7 +694,7 @@ class AgentActivity {
636
694
  }
637
695
  const oldTask = this._userTurnCompletedTask;
638
696
  this._userTurnCompletedTask = this.createSpeechTask({
639
- task: import_utils.Task.from(() => this.userTurnCompleted(info, oldTask)),
697
+ taskFn: () => this.userTurnCompleted(info, oldTask),
640
698
  name: "AgentActivity.userTurnCompleted"
641
699
  });
642
700
  return true;
@@ -666,14 +724,41 @@ class AgentActivity {
666
724
  await speechHandle._waitForGeneration();
667
725
  this._currentSpeech = void 0;
668
726
  }
669
- if (this.draining && this.speechTasks.size === 0) {
670
- this.logger.info("mainTask: draining and no more speech tasks");
727
+ const toWait = this.getDrainPendingSpeechTasks();
728
+ if (this._schedulingPaused && toWait.length === 0) {
729
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
671
730
  break;
672
731
  }
673
732
  this.q_updated = new import_utils.Future();
674
733
  }
675
734
  this.logger.info("AgentActivity mainTask: exiting");
676
735
  }
736
+ getDrainPendingSpeechTasks() {
737
+ const blockedHandles = [];
738
+ for (const task of this._drainBlockedTasks) {
739
+ const info = (0, import_agent._getActivityTaskInfo)(task);
740
+ if (!info) {
741
+ this.logger.error("blocked task without activity info; skipping.");
742
+ continue;
743
+ }
744
+ if (!info.speechHandle) {
745
+ continue;
746
+ }
747
+ blockedHandles.push(info.speechHandle);
748
+ }
749
+ const toWait = [];
750
+ for (const task of this.speechTasks) {
751
+ if (this._drainBlockedTasks.includes(task)) {
752
+ continue;
753
+ }
754
+ const info = (0, import_agent._getActivityTaskInfo)(task);
755
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
756
+ continue;
757
+ }
758
+ toWait.push(task);
759
+ }
760
+ return toWait;
761
+ }
677
762
  wakeupMainTask() {
678
763
  this.q_updated.resolve();
679
764
  }
@@ -699,7 +784,7 @@ class AgentActivity {
699
784
  if (this.llm === void 0) {
700
785
  throw new Error("trying to generate reply without an LLM model");
701
786
  }
702
- const functionCall = (_a = import_agent.asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
787
+ const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
703
788
  if (toolChoice === void 0 && functionCall !== void 0) {
704
789
  toolChoice = "none";
705
790
  }
@@ -717,19 +802,17 @@ class AgentActivity {
717
802
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
718
803
  if (this.llm instanceof import_llm.RealtimeModel) {
719
804
  this.createSpeechTask({
720
- task: import_utils.Task.from(
721
- (abortController) => this.realtimeReplyTask({
722
- speechHandle: handle,
723
- // TODO(brian): support llm.ChatMessage for the realtime model
724
- userInput: userMessage == null ? void 0 : userMessage.textContent,
725
- instructions,
726
- modelSettings: {
727
- // isGiven(toolChoice) = toolChoice !== undefined
728
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
729
- },
730
- abortController
731
- })
732
- ),
805
+ taskFn: (abortController) => this.realtimeReplyTask({
806
+ speechHandle: handle,
807
+ // TODO(brian): support llm.ChatMessage for the realtime model
808
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
809
+ instructions,
810
+ modelSettings: {
811
+ // isGiven(toolChoice) = toolChoice !== undefined
812
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
813
+ },
814
+ abortController
815
+ }),
733
816
  ownedSpeechHandle: handle,
734
817
  name: "AgentActivity.realtimeReply"
735
818
  });
@@ -738,37 +821,44 @@ class AgentActivity {
738
821
  instructions = `${this.agent.instructions}
739
822
  ${instructions}`;
740
823
  }
824
+ const onEnterData = onEnterStorage.getStore();
825
+ const shouldFilterTools = (onEnterData == null ? void 0 : onEnterData.agent) === this.agent && (onEnterData == null ? void 0 : onEnterData.session) === this.agentSession;
826
+ const tools = shouldFilterTools ? Object.fromEntries(
827
+ Object.entries(this.agent.toolCtx).filter(
828
+ ([, fnTool]) => !(fnTool.flags & import_llm.ToolFlag.IGNORE_ON_ENTER)
829
+ )
830
+ ) : this.agent.toolCtx;
741
831
  const task = this.createSpeechTask({
742
- task: import_utils.Task.from(
743
- (abortController) => this.pipelineReplyTask(
744
- handle,
745
- chatCtx ?? this.agent.chatCtx,
746
- this.agent.toolCtx,
747
- {
748
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
749
- },
750
- abortController,
751
- instructions,
752
- userMessage
753
- )
832
+ taskFn: (abortController) => this.pipelineReplyTask(
833
+ handle,
834
+ chatCtx ?? this.agent.chatCtx,
835
+ tools,
836
+ {
837
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
838
+ },
839
+ abortController,
840
+ instructions,
841
+ userMessage
754
842
  ),
755
843
  ownedSpeechHandle: handle,
756
844
  name: "AgentActivity.pipelineReply"
757
845
  });
758
- task.finally(() => this.onPipelineReplyDone());
846
+ task.result.finally(() => this.onPipelineReplyDone());
759
847
  }
760
848
  if (scheduleSpeech) {
761
849
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
762
850
  }
763
851
  return handle;
764
852
  }
765
- interrupt() {
853
+ interrupt(options = {}) {
766
854
  var _a;
855
+ const { force = false } = options;
856
+ this.cancelPreemptiveGeneration();
767
857
  const future = new import_utils.Future();
768
858
  const currentSpeech = this._currentSpeech;
769
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
859
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
770
860
  for (const [_, __, speech] of this.speechQueue) {
771
- speech.interrupt();
861
+ speech.interrupt(force);
772
862
  }
773
863
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
774
864
  if (currentSpeech === void 0) {
@@ -789,7 +879,7 @@ ${instructions}`;
789
879
  async userTurnCompleted(info, oldTask) {
790
880
  var _a, _b;
791
881
  if (oldTask) {
792
- await oldTask;
882
+ await oldTask.result;
793
883
  }
794
884
  if (this.llm instanceof import_llm.RealtimeModel) {
795
885
  if (this.llm.capabilities.turnDetection) {
@@ -871,7 +961,7 @@ ${instructions}`;
871
961
  }
872
962
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
873
963
  speechHandle._agentTurnContext = import_api.context.active();
874
- speechHandleStorage.enterWith(speechHandle);
964
+ import_agent.speechHandleStorage.enterWith(speechHandle);
875
965
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
876
966
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
877
967
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
@@ -976,7 +1066,7 @@ ${instructions}`;
976
1066
  toolsMessages,
977
1067
  span
978
1068
  }) => {
979
- var _a, _b, _c, _d, _e;
1069
+ var _a, _b;
980
1070
  speechHandle._agentTurnContext = import_api.context.active();
981
1071
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
982
1072
  if (instructions) {
@@ -989,7 +1079,7 @@ ${instructions}`;
989
1079
  if (localParticipant) {
990
1080
  (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
991
1081
  }
992
- speechHandleStorage.enterWith(speechHandle);
1082
+ import_agent.speechHandleStorage.enterWith(speechHandle);
993
1083
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
994
1084
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
995
1085
  chatCtx = chatCtx.copy();
@@ -1122,11 +1212,11 @@ ${instructions}`;
1122
1212
  for (const msg of toolsMessages) {
1123
1213
  msg.createdAt = replyStartedAt;
1124
1214
  }
1125
- this.agent._chatCtx.insert(toolsMessages);
1126
1215
  const toolCallOutputs = toolsMessages.filter(
1127
1216
  (m) => m.type === "function_call_output"
1128
1217
  );
1129
1218
  if (toolCallOutputs.length > 0) {
1219
+ this.agent._chatCtx.insert(toolCallOutputs);
1130
1220
  this.agentSession._toolItemsAdded(toolCallOutputs);
1131
1221
  }
1132
1222
  }
@@ -1214,45 +1304,15 @@ ${instructions}`;
1214
1304
  );
1215
1305
  return;
1216
1306
  }
1217
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1218
- functionCalls: [],
1219
- functionCallOutputs: []
1220
- });
1221
- let shouldGenerateToolReply = false;
1222
- let newAgentTask = null;
1223
- let ignoreTaskSwitch = false;
1224
- for (const sanitizedOut of toolOutput.output) {
1225
- if (sanitizedOut.toolCallOutput !== void 0) {
1226
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1227
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1228
- if (sanitizedOut.replyRequired) {
1229
- shouldGenerateToolReply = true;
1230
- }
1231
- }
1232
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1233
- this.logger.error("expected to receive only one agent task from the tool executions");
1234
- ignoreTaskSwitch = true;
1235
- }
1236
- newAgentTask = sanitizedOut.agentTask ?? null;
1237
- this.logger.debug(
1238
- {
1239
- speechId: speechHandle.id,
1240
- name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
1241
- args: sanitizedOut.toolCall.args,
1242
- output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
1243
- isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
1244
- },
1245
- "Tool call execution finished"
1246
- );
1247
- }
1307
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1248
1308
  this.agentSession.emit(
1249
1309
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1250
1310
  functionToolsExecutedEvent
1251
1311
  );
1252
- let draining = this.draining;
1312
+ let schedulingPaused = this.schedulingPaused;
1253
1313
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1254
1314
  this.agentSession.updateAgent(newAgentTask);
1255
- draining = true;
1315
+ schedulingPaused = true;
1256
1316
  }
1257
1317
  const toolMessages = [
1258
1318
  ...functionToolsExecutedEvent.functionCalls,
@@ -1261,34 +1321,32 @@ ${instructions}`;
1261
1321
  if (shouldGenerateToolReply) {
1262
1322
  chatCtx.insert(toolMessages);
1263
1323
  speechHandle._numSteps += 1;
1264
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1324
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1265
1325
  const toolResponseTask = this.createSpeechTask({
1266
- task: import_utils.Task.from(
1267
- () => this.pipelineReplyTask(
1268
- speechHandle,
1269
- chatCtx,
1270
- toolCtx,
1271
- { toolChoice: respondToolChoice },
1272
- replyAbortController,
1273
- instructions,
1274
- void 0,
1275
- toolMessages
1276
- )
1326
+ taskFn: () => this.pipelineReplyTask(
1327
+ speechHandle,
1328
+ chatCtx,
1329
+ toolCtx,
1330
+ { toolChoice: respondToolChoice },
1331
+ replyAbortController,
1332
+ instructions,
1333
+ void 0,
1334
+ toolMessages
1277
1335
  ),
1278
1336
  ownedSpeechHandle: speechHandle,
1279
1337
  name: "AgentActivity.pipelineReply"
1280
1338
  });
1281
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1339
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1282
1340
  this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1283
1341
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1284
1342
  for (const msg of toolMessages) {
1285
1343
  msg.createdAt = replyStartedAt;
1286
1344
  }
1287
- this.agent._chatCtx.insert(toolMessages);
1288
1345
  const toolCallOutputs = toolMessages.filter(
1289
1346
  (m) => m.type === "function_call_output"
1290
1347
  );
1291
1348
  if (toolCallOutputs.length > 0) {
1349
+ this.agent._chatCtx.insert(toolCallOutputs);
1292
1350
  this.agentSession._toolItemsAdded(toolCallOutputs);
1293
1351
  }
1294
1352
  }
@@ -1332,14 +1390,14 @@ ${instructions}`;
1332
1390
  replyAbortController,
1333
1391
  span
1334
1392
  }) {
1335
- var _a, _b, _c, _d;
1393
+ var _a;
1336
1394
  speechHandle._agentTurnContext = import_api.context.active();
1337
1395
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1338
1396
  const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1339
1397
  if (localParticipant) {
1340
1398
  (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
1341
1399
  }
1342
- speechHandleStorage.enterWith(speechHandle);
1400
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1343
1401
  if (!this.realtimeSession) {
1344
1402
  throw new Error("realtime session is not initialized");
1345
1403
  }
@@ -1592,44 +1650,15 @@ ${instructions}`;
1592
1650
  );
1593
1651
  return;
1594
1652
  }
1595
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1596
- functionCalls: [],
1597
- functionCallOutputs: []
1598
- });
1599
- let shouldGenerateToolReply = false;
1600
- let newAgentTask = null;
1601
- let ignoreTaskSwitch = false;
1602
- for (const sanitizedOut of toolOutput.output) {
1603
- if (sanitizedOut.toolCallOutput !== void 0) {
1604
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1605
- if (sanitizedOut.replyRequired) {
1606
- shouldGenerateToolReply = true;
1607
- }
1608
- }
1609
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1610
- this.logger.error("expected to receive only one agent task from the tool executions");
1611
- ignoreTaskSwitch = true;
1612
- }
1613
- newAgentTask = sanitizedOut.agentTask ?? null;
1614
- this.logger.debug(
1615
- {
1616
- speechId: speechHandle.id,
1617
- name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
1618
- args: sanitizedOut.toolCall.args,
1619
- output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
1620
- isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
1621
- },
1622
- "Tool call execution finished"
1623
- );
1624
- }
1653
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1625
1654
  this.agentSession.emit(
1626
1655
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1627
1656
  functionToolsExecutedEvent
1628
1657
  );
1629
- let draining = this.draining;
1658
+ let schedulingPaused = this.schedulingPaused;
1630
1659
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1631
1660
  this.agentSession.updateAgent(newAgentTask);
1632
- draining = true;
1661
+ schedulingPaused = true;
1633
1662
  }
1634
1663
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1635
1664
  while (this.currentSpeech || this.speechQueue.size() > 0) {
@@ -1670,20 +1699,58 @@ ${instructions}`;
1670
1699
  speechHandle: replySpeechHandle
1671
1700
  })
1672
1701
  );
1673
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1702
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1674
1703
  this.createSpeechTask({
1675
- task: import_utils.Task.from(
1676
- (abortController) => this.realtimeReplyTask({
1677
- speechHandle: replySpeechHandle,
1678
- modelSettings: { toolChoice },
1679
- abortController
1680
- })
1681
- ),
1704
+ taskFn: (abortController) => this.realtimeReplyTask({
1705
+ speechHandle: replySpeechHandle,
1706
+ modelSettings: { toolChoice },
1707
+ abortController
1708
+ }),
1682
1709
  ownedSpeechHandle: replySpeechHandle,
1683
1710
  name: "AgentActivity.realtime_reply"
1684
1711
  });
1685
1712
  this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1686
1713
  }
1714
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1715
+ var _a, _b, _c;
1716
+ const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1717
+ functionCalls: [],
1718
+ functionCallOutputs: []
1719
+ });
1720
+ let shouldGenerateToolReply = false;
1721
+ let newAgentTask = null;
1722
+ let ignoreTaskSwitch = false;
1723
+ for (const sanitizedOut of toolOutput.output) {
1724
+ if (sanitizedOut.toolCallOutput !== void 0) {
1725
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1726
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1727
+ if (sanitizedOut.replyRequired) {
1728
+ shouldGenerateToolReply = true;
1729
+ }
1730
+ }
1731
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1732
+ this.logger.error("expected to receive only one agent task from the tool executions");
1733
+ ignoreTaskSwitch = true;
1734
+ }
1735
+ newAgentTask = sanitizedOut.agentTask ?? null;
1736
+ this.logger.debug(
1737
+ {
1738
+ speechId: speechHandle.id,
1739
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1740
+ args: sanitizedOut.toolCall.args,
1741
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1742
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1743
+ },
1744
+ "Tool call execution finished"
1745
+ );
1746
+ }
1747
+ return {
1748
+ functionToolsExecutedEvent,
1749
+ shouldGenerateToolReply,
1750
+ newAgentTask,
1751
+ ignoreTaskSwitch
1752
+ };
1753
+ }
1687
1754
  async realtimeReplyTask({
1688
1755
  speechHandle,
1689
1756
  modelSettings: { toolChoice },
@@ -1691,7 +1758,7 @@ ${instructions}`;
1691
1758
  instructions,
1692
1759
  abortController
1693
1760
  }) {
1694
- speechHandleStorage.enterWith(speechHandle);
1761
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1695
1762
  if (!this.realtimeSession) {
1696
1763
  throw new Error("realtime session is not available");
1697
1764
  }
@@ -1725,13 +1792,45 @@ ${instructions}`;
1725
1792
  }
1726
1793
  }
1727
1794
  scheduleSpeech(speechHandle, priority, force = false) {
1728
- if (this.draining && !force) {
1729
- throw new Error("cannot schedule new speech, the agent is draining");
1795
+ if (this.schedulingPaused && !force) {
1796
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1730
1797
  }
1731
1798
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1732
1799
  speechHandle._markScheduled();
1733
1800
  this.wakeupMainTask();
1734
1801
  }
1802
+ async _pauseSchedulingTask(blockedTasks) {
1803
+ if (this._schedulingPaused) return;
1804
+ this._schedulingPaused = true;
1805
+ this._drainBlockedTasks = blockedTasks;
1806
+ this.wakeupMainTask();
1807
+ if (this._mainTask) {
1808
+ await this._mainTask.result;
1809
+ }
1810
+ }
1811
+ _resumeSchedulingTask() {
1812
+ if (!this._schedulingPaused) return;
1813
+ this._schedulingPaused = false;
1814
+ this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
1815
+ }
1816
+ async pause(options = {}) {
1817
+ const { blockedTasks = [] } = options;
1818
+ const unlock = await this.lock.lock();
1819
+ try {
1820
+ const span = import_telemetry.tracer.startSpan({
1821
+ name: "pause_agent_activity",
1822
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1823
+ });
1824
+ try {
1825
+ await this._pauseSchedulingTask(blockedTasks);
1826
+ await this._closeSessionResources();
1827
+ } finally {
1828
+ span.end();
1829
+ }
1830
+ } finally {
1831
+ unlock();
1832
+ }
1833
+ }
1735
1834
  async drain() {
1736
1835
  return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
1737
1836
  name: "drain_agent_activity",
@@ -1739,72 +1838,81 @@ ${instructions}`;
1739
1838
  });
1740
1839
  }
1741
1840
  async _drainImpl(span) {
1742
- var _a;
1743
1841
  span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1744
1842
  const unlock = await this.lock.lock();
1745
1843
  try {
1746
- if (this._draining) return;
1747
- this.cancelPreemptiveGeneration();
1748
- const onExitTask = import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
1749
- name: "on_exit",
1750
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1751
- });
1752
- this.createSpeechTask({
1753
- task: import_utils.Task.from(() => onExitTask),
1844
+ if (this._schedulingPaused) return;
1845
+ this._onExitTask = this.createSpeechTask({
1846
+ taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
1847
+ name: "on_exit",
1848
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1849
+ }),
1850
+ inlineTask: true,
1754
1851
  name: "AgentActivity_onExit"
1755
1852
  });
1756
- this.wakeupMainTask();
1757
- this._draining = true;
1758
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
1853
+ this.cancelPreemptiveGeneration();
1854
+ await this._onExitTask.result;
1855
+ await this._pauseSchedulingTask([]);
1759
1856
  } finally {
1760
1857
  unlock();
1761
1858
  }
1762
1859
  }
1763
1860
  async close() {
1764
- var _a, _b, _c, _d;
1765
1861
  const unlock = await this.lock.lock();
1766
1862
  try {
1767
- if (!this._draining) {
1768
- this.logger.warn("task closing without draining");
1769
- }
1770
1863
  this.cancelPreemptiveGeneration();
1771
- if (this.llm instanceof import_llm.LLM) {
1772
- this.llm.off("metrics_collected", this.onMetricsCollected);
1773
- }
1774
- if (this.realtimeSession) {
1775
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1776
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1777
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1778
- this.realtimeSession.off(
1779
- "input_audio_transcription_completed",
1780
- this.onInputAudioTranscriptionCompleted
1781
- );
1782
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1783
- }
1784
- if (this.stt instanceof import_stt.STT) {
1785
- this.stt.off("metrics_collected", this.onMetricsCollected);
1864
+ await this._closeSessionResources();
1865
+ if (this._mainTask) {
1866
+ await this._mainTask.cancelAndWait();
1786
1867
  }
1787
- if (this.tts instanceof import_tts.TTS) {
1788
- this.tts.off("metrics_collected", this.onMetricsCollected);
1789
- }
1790
- if (this.vad instanceof import_vad.VAD) {
1791
- this.vad.off("metrics_collected", this.onMetricsCollected);
1792
- }
1793
- this.detachAudioInput();
1794
- (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1795
- await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1796
- await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1797
- await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
1868
+ this.agent._agentActivity = void 0;
1798
1869
  } finally {
1799
1870
  unlock();
1800
1871
  }
1801
1872
  }
1873
+ async _closeSessionResources() {
1874
+ var _a, _b, _c;
1875
+ if (this.llm instanceof import_llm.LLM) {
1876
+ this.llm.off("metrics_collected", this.onMetricsCollected);
1877
+ this.llm.off("error", this.onModelError);
1878
+ }
1879
+ if (this.realtimeSession) {
1880
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
1881
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
1882
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
1883
+ this.realtimeSession.off(
1884
+ "input_audio_transcription_completed",
1885
+ this.onRealtimeInputAudioTranscriptionCompleted
1886
+ );
1887
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1888
+ this.realtimeSession.off("error", this.onModelError);
1889
+ }
1890
+ if (this.stt instanceof import_stt.STT) {
1891
+ this.stt.off("metrics_collected", this.onMetricsCollected);
1892
+ this.stt.off("error", this.onModelError);
1893
+ }
1894
+ if (this.tts instanceof import_tts.TTS) {
1895
+ this.tts.off("metrics_collected", this.onMetricsCollected);
1896
+ this.tts.off("error", this.onModelError);
1897
+ }
1898
+ if (this.vad instanceof import_vad.VAD) {
1899
+ this.vad.off("metrics_collected", this.onMetricsCollected);
1900
+ }
1901
+ this.detachAudioInput();
1902
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1903
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1904
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1905
+ this.realtimeSession = void 0;
1906
+ this.audioRecognition = void 0;
1907
+ }
1802
1908
  }
1803
1909
  function toOaiToolChoice(toolChoice) {
1804
1910
  return toolChoice !== null ? toolChoice : void 0;
1805
1911
  }
1806
1912
  // Annotate the CommonJS export names for ESM import in node:
1807
1913
  0 && (module.exports = {
1808
- AgentActivity
1914
+ AgentActivity,
1915
+ agentActivityStorage,
1916
+ onEnterStorage
1809
1917
  });
1810
1918
  //# sourceMappingURL=agent_activity.cjs.map