@livekit/agents 1.0.45 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225):
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/stream/index.cjs +3 -0
  56. package/dist/stream/index.cjs.map +1 -1
  57. package/dist/stream/index.d.cts +1 -0
  58. package/dist/stream/index.d.ts +1 -0
  59. package/dist/stream/index.d.ts.map +1 -1
  60. package/dist/stream/index.js +2 -0
  61. package/dist/stream/index.js.map +1 -1
  62. package/dist/stream/multi_input_stream.cjs +139 -0
  63. package/dist/stream/multi_input_stream.cjs.map +1 -0
  64. package/dist/stream/multi_input_stream.d.cts +55 -0
  65. package/dist/stream/multi_input_stream.d.ts +55 -0
  66. package/dist/stream/multi_input_stream.d.ts.map +1 -0
  67. package/dist/stream/multi_input_stream.js +115 -0
  68. package/dist/stream/multi_input_stream.js.map +1 -0
  69. package/dist/stream/multi_input_stream.test.cjs +340 -0
  70. package/dist/stream/multi_input_stream.test.cjs.map +1 -0
  71. package/dist/stream/multi_input_stream.test.js +339 -0
  72. package/dist/stream/multi_input_stream.test.js.map +1 -0
  73. package/dist/telemetry/trace_types.cjs +42 -0
  74. package/dist/telemetry/trace_types.cjs.map +1 -1
  75. package/dist/telemetry/trace_types.d.cts +14 -0
  76. package/dist/telemetry/trace_types.d.ts +14 -0
  77. package/dist/telemetry/trace_types.d.ts.map +1 -1
  78. package/dist/telemetry/trace_types.js +28 -0
  79. package/dist/telemetry/trace_types.js.map +1 -1
  80. package/dist/utils.cjs +44 -2
  81. package/dist/utils.cjs.map +1 -1
  82. package/dist/utils.d.cts +8 -0
  83. package/dist/utils.d.ts +8 -0
  84. package/dist/utils.d.ts.map +1 -1
  85. package/dist/utils.js +44 -2
  86. package/dist/utils.js.map +1 -1
  87. package/dist/utils.test.cjs +71 -0
  88. package/dist/utils.test.cjs.map +1 -1
  89. package/dist/utils.test.js +71 -0
  90. package/dist/utils.test.js.map +1 -1
  91. package/dist/version.cjs +1 -1
  92. package/dist/version.cjs.map +1 -1
  93. package/dist/version.d.cts +1 -1
  94. package/dist/version.d.ts +1 -1
  95. package/dist/version.d.ts.map +1 -1
  96. package/dist/version.js +1 -1
  97. package/dist/version.js.map +1 -1
  98. package/dist/voice/agent.cjs +144 -12
  99. package/dist/voice/agent.cjs.map +1 -1
  100. package/dist/voice/agent.d.cts +29 -4
  101. package/dist/voice/agent.d.ts +29 -4
  102. package/dist/voice/agent.d.ts.map +1 -1
  103. package/dist/voice/agent.js +140 -11
  104. package/dist/voice/agent.js.map +1 -1
  105. package/dist/voice/agent.test.cjs +120 -0
  106. package/dist/voice/agent.test.cjs.map +1 -1
  107. package/dist/voice/agent.test.js +122 -2
  108. package/dist/voice/agent.test.js.map +1 -1
  109. package/dist/voice/agent_activity.cjs +402 -292
  110. package/dist/voice/agent_activity.cjs.map +1 -1
  111. package/dist/voice/agent_activity.d.cts +35 -7
  112. package/dist/voice/agent_activity.d.ts +35 -7
  113. package/dist/voice/agent_activity.d.ts.map +1 -1
  114. package/dist/voice/agent_activity.js +402 -287
  115. package/dist/voice/agent_activity.js.map +1 -1
  116. package/dist/voice/agent_session.cjs +156 -44
  117. package/dist/voice/agent_session.cjs.map +1 -1
  118. package/dist/voice/agent_session.d.cts +22 -9
  119. package/dist/voice/agent_session.d.ts +22 -9
  120. package/dist/voice/agent_session.d.ts.map +1 -1
  121. package/dist/voice/agent_session.js +156 -44
  122. package/dist/voice/agent_session.js.map +1 -1
  123. package/dist/voice/audio_recognition.cjs +89 -36
  124. package/dist/voice/audio_recognition.cjs.map +1 -1
  125. package/dist/voice/audio_recognition.d.cts +22 -1
  126. package/dist/voice/audio_recognition.d.ts +22 -1
  127. package/dist/voice/audio_recognition.d.ts.map +1 -1
  128. package/dist/voice/audio_recognition.js +93 -36
  129. package/dist/voice/audio_recognition.js.map +1 -1
  130. package/dist/voice/audio_recognition_span.test.cjs +233 -0
  131. package/dist/voice/audio_recognition_span.test.cjs.map +1 -0
  132. package/dist/voice/audio_recognition_span.test.js +232 -0
  133. package/dist/voice/audio_recognition_span.test.js.map +1 -0
  134. package/dist/voice/generation.cjs +39 -19
  135. package/dist/voice/generation.cjs.map +1 -1
  136. package/dist/voice/generation.d.ts.map +1 -1
  137. package/dist/voice/generation.js +44 -20
  138. package/dist/voice/generation.js.map +1 -1
  139. package/dist/voice/index.cjs +2 -0
  140. package/dist/voice/index.cjs.map +1 -1
  141. package/dist/voice/index.d.cts +1 -1
  142. package/dist/voice/index.d.ts +1 -1
  143. package/dist/voice/index.d.ts.map +1 -1
  144. package/dist/voice/index.js +2 -1
  145. package/dist/voice/index.js.map +1 -1
  146. package/dist/voice/io.cjs +6 -3
  147. package/dist/voice/io.cjs.map +1 -1
  148. package/dist/voice/io.d.cts +3 -2
  149. package/dist/voice/io.d.ts +3 -2
  150. package/dist/voice/io.d.ts.map +1 -1
  151. package/dist/voice/io.js +6 -3
  152. package/dist/voice/io.js.map +1 -1
  153. package/dist/voice/recorder_io/recorder_io.cjs +3 -1
  154. package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
  155. package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
  156. package/dist/voice/recorder_io/recorder_io.js +3 -1
  157. package/dist/voice/recorder_io/recorder_io.js.map +1 -1
  158. package/dist/voice/room_io/_input.cjs +17 -17
  159. package/dist/voice/room_io/_input.cjs.map +1 -1
  160. package/dist/voice/room_io/_input.d.cts +2 -2
  161. package/dist/voice/room_io/_input.d.ts +2 -2
  162. package/dist/voice/room_io/_input.d.ts.map +1 -1
  163. package/dist/voice/room_io/_input.js +7 -6
  164. package/dist/voice/room_io/_input.js.map +1 -1
  165. package/dist/voice/room_io/room_io.cjs +9 -0
  166. package/dist/voice/room_io/room_io.cjs.map +1 -1
  167. package/dist/voice/room_io/room_io.d.cts +3 -1
  168. package/dist/voice/room_io/room_io.d.ts +3 -1
  169. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  170. package/dist/voice/room_io/room_io.js +9 -0
  171. package/dist/voice/room_io/room_io.js.map +1 -1
  172. package/dist/voice/speech_handle.cjs +7 -1
  173. package/dist/voice/speech_handle.cjs.map +1 -1
  174. package/dist/voice/speech_handle.d.cts +2 -0
  175. package/dist/voice/speech_handle.d.ts +2 -0
  176. package/dist/voice/speech_handle.d.ts.map +1 -1
  177. package/dist/voice/speech_handle.js +8 -2
  178. package/dist/voice/speech_handle.js.map +1 -1
  179. package/dist/voice/testing/run_result.cjs +66 -15
  180. package/dist/voice/testing/run_result.cjs.map +1 -1
  181. package/dist/voice/testing/run_result.d.cts +14 -3
  182. package/dist/voice/testing/run_result.d.ts +14 -3
  183. package/dist/voice/testing/run_result.d.ts.map +1 -1
  184. package/dist/voice/testing/run_result.js +66 -15
  185. package/dist/voice/testing/run_result.js.map +1 -1
  186. package/dist/voice/utils.cjs +47 -0
  187. package/dist/voice/utils.cjs.map +1 -0
  188. package/dist/voice/utils.d.cts +4 -0
  189. package/dist/voice/utils.d.ts +4 -0
  190. package/dist/voice/utils.d.ts.map +1 -0
  191. package/dist/voice/utils.js +23 -0
  192. package/dist/voice/utils.js.map +1 -0
  193. package/package.json +1 -1
  194. package/src/cli.ts +20 -33
  195. package/src/ipc/job_proc_lazy_main.ts +16 -5
  196. package/src/llm/chat_context.ts +35 -0
  197. package/src/llm/provider_format/index.ts +7 -2
  198. package/src/llm/provider_format/openai.test.ts +385 -1
  199. package/src/llm/provider_format/openai.ts +103 -0
  200. package/src/llm/provider_format/utils.ts +6 -4
  201. package/src/llm/realtime.ts +1 -0
  202. package/src/log.ts +5 -2
  203. package/src/stream/deferred_stream.ts +17 -6
  204. package/src/stream/index.ts +1 -0
  205. package/src/stream/multi_input_stream.test.ts +540 -0
  206. package/src/stream/multi_input_stream.ts +172 -0
  207. package/src/telemetry/trace_types.ts +18 -0
  208. package/src/utils.test.ts +87 -0
  209. package/src/utils.ts +52 -2
  210. package/src/version.ts +1 -1
  211. package/src/voice/agent.test.ts +140 -2
  212. package/src/voice/agent.ts +189 -10
  213. package/src/voice/agent_activity.ts +449 -286
  214. package/src/voice/agent_session.ts +195 -51
  215. package/src/voice/audio_recognition.ts +118 -38
  216. package/src/voice/audio_recognition_span.test.ts +261 -0
  217. package/src/voice/generation.ts +52 -23
  218. package/src/voice/index.ts +1 -1
  219. package/src/voice/io.ts +7 -4
  220. package/src/voice/recorder_io/recorder_io.ts +2 -1
  221. package/src/voice/room_io/_input.ts +11 -7
  222. package/src/voice/room_io/room_io.ts +12 -0
  223. package/src/voice/speech_handle.ts +9 -2
  224. package/src/voice/testing/run_result.ts +81 -23
  225. package/src/voice/utils.ts +29 -0
@@ -18,7 +18,8 @@ var __copyProps = (to, from, except, desc) => {
18
18
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
19
  var agent_activity_exports = {};
20
20
  __export(agent_activity_exports, {
21
- AgentActivity: () => AgentActivity
21
+ AgentActivity: () => AgentActivity,
22
+ agentActivityStorage: () => agentActivityStorage
22
23
  });
23
24
  module.exports = __toCommonJS(agent_activity_exports);
24
25
  var import_mutex = require("@livekit/mutex");
@@ -30,7 +31,7 @@ var import_chat_context = require("../llm/chat_context.cjs");
30
31
  var import_llm = require("../llm/index.cjs");
31
32
  var import_tool_context = require("../llm/tool_context.cjs");
32
33
  var import_log = require("../log.cjs");
33
- var import_deferred_stream = require("../stream/deferred_stream.cjs");
34
+ var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
34
35
  var import_stt = require("../stt/stt.cjs");
35
36
  var import_telemetry = require("../telemetry/index.cjs");
36
37
  var import_word = require("../tokenize/basic/word.cjs");
@@ -43,8 +44,11 @@ var import_audio_recognition = require("./audio_recognition.cjs");
43
44
  var import_events = require("./events.cjs");
44
45
  var import_generation = require("./generation.cjs");
45
46
  var import_speech_handle = require("./speech_handle.cjs");
46
- const speechHandleStorage = new import_node_async_hooks.AsyncLocalStorage();
47
+ var import_utils2 = require("./utils.cjs");
48
+ const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
47
49
  class AgentActivity {
50
+ agent;
51
+ agentSession;
48
52
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
49
53
  started = false;
50
54
  audioRecognition;
@@ -53,22 +57,29 @@ class AgentActivity {
53
57
  // Maps response_id to OTEL span for metrics recording
54
58
  turnDetectionMode;
55
59
  logger = (0, import_log.log)();
56
- _draining = false;
60
+ _schedulingPaused = true;
61
+ _drainBlockedTasks = [];
57
62
  _currentSpeech;
58
63
  speechQueue;
59
64
  // [priority, timestamp, speechHandle]
60
65
  q_updated;
61
66
  speechTasks = /* @__PURE__ */ new Set();
62
67
  lock = new import_mutex.Mutex();
63
- audioStream = new import_deferred_stream.DeferredReadableStream();
68
+ audioStream = new import_multi_input_stream.MultiInputStream();
69
+ audioStreamId;
64
70
  // default to null as None, which maps to the default provider tool choice value
65
71
  toolChoice = null;
66
72
  _preemptiveGeneration;
67
- agent;
68
- agentSession;
69
73
  /** @internal */
70
74
  _mainTask;
75
+ _onEnterTask;
76
+ _onExitTask;
71
77
  _userTurnCompletedTask;
78
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
79
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
80
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
81
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
82
+ onModelError = (ev) => this.onError(ev);
72
83
  constructor(agent, agentSession) {
73
84
  this.agent = agent;
74
85
  this.agentSession = agentSession;
@@ -79,7 +90,7 @@ class AgentActivity {
79
90
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
80
91
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
81
92
  this.logger.warn(
82
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
93
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
83
94
  );
84
95
  this.turnDetectionMode = void 0;
85
96
  }
@@ -131,98 +142,119 @@ class AgentActivity {
131
142
  async start() {
132
143
  const unlock = await this.lock.lock();
133
144
  try {
134
- const startSpan = import_telemetry.tracer.startSpan({
135
- name: "start_agent_activity",
136
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
137
- context: import_api.ROOT_CONTEXT
138
- });
139
- this.agent._agentActivity = this;
140
- if (this.llm instanceof import_llm.RealtimeModel) {
141
- this.realtimeSession = this.llm.session();
142
- this.realtimeSpans = /* @__PURE__ */ new Map();
143
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
144
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
145
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
146
- this.realtimeSession.on(
147
- "input_audio_transcription_completed",
148
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
149
- );
150
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
151
- this.realtimeSession.on("error", (ev) => this.onError(ev));
152
- (0, import_generation.removeInstructions)(this.agent._chatCtx);
153
- try {
154
- await this.realtimeSession.updateInstructions(this.agent.instructions);
155
- } catch (error) {
156
- this.logger.error(error, "failed to update the instructions");
157
- }
158
- try {
159
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
160
- } catch (error) {
161
- this.logger.error(error, "failed to update the chat context");
162
- }
163
- try {
164
- await this.realtimeSession.updateTools(this.tools);
165
- } catch (error) {
166
- this.logger.error(error, "failed to update the tools");
167
- }
168
- if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
169
- this.logger.error(
170
- "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
171
- );
172
- }
173
- } else if (this.llm instanceof import_llm.LLM) {
174
- try {
175
- (0, import_generation.updateInstructions)({
176
- chatCtx: this.agent._chatCtx,
177
- instructions: this.agent.instructions,
178
- addIfMissing: true
179
- });
180
- } catch (error) {
181
- this.logger.error("failed to update the instructions", error);
182
- }
145
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
146
+ } finally {
147
+ unlock();
148
+ }
149
+ }
150
+ async resume() {
151
+ const unlock = await this.lock.lock();
152
+ try {
153
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
154
+ } finally {
155
+ unlock();
156
+ }
157
+ }
158
+ async _startSession(options) {
159
+ var _a;
160
+ const { spanName, runOnEnter } = options;
161
+ const startSpan = import_telemetry.tracer.startSpan({
162
+ name: spanName,
163
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
164
+ context: import_api.ROOT_CONTEXT
165
+ });
166
+ this.agent._agentActivity = this;
167
+ if (this.llm instanceof import_llm.RealtimeModel) {
168
+ this.realtimeSession = this.llm.session();
169
+ this.realtimeSpans = /* @__PURE__ */ new Map();
170
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
171
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
172
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
173
+ this.realtimeSession.on(
174
+ "input_audio_transcription_completed",
175
+ this.onRealtimeInputAudioTranscriptionCompleted
176
+ );
177
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
178
+ this.realtimeSession.on("error", this.onModelError);
179
+ (0, import_generation.removeInstructions)(this.agent._chatCtx);
180
+ try {
181
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
182
+ } catch (error) {
183
+ this.logger.error(error, "failed to update the instructions");
183
184
  }
184
- if (this.llm instanceof import_llm.LLM) {
185
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
186
- this.llm.on("error", (ev) => this.onError(ev));
185
+ try {
186
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
187
+ } catch (error) {
188
+ this.logger.error(error, "failed to update the chat context");
187
189
  }
188
- if (this.stt instanceof import_stt.STT) {
189
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
190
- this.stt.on("error", (ev) => this.onError(ev));
190
+ try {
191
+ await this.realtimeSession.updateTools(this.tools);
192
+ } catch (error) {
193
+ this.logger.error(error, "failed to update the tools");
191
194
  }
192
- if (this.tts instanceof import_tts.TTS) {
193
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
194
- this.tts.on("error", (ev) => this.onError(ev));
195
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
196
+ this.logger.error(
197
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
198
+ );
195
199
  }
196
- if (this.vad instanceof import_vad.VAD) {
197
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
200
+ } else if (this.llm instanceof import_llm.LLM) {
201
+ try {
202
+ (0, import_generation.updateInstructions)({
203
+ chatCtx: this.agent._chatCtx,
204
+ instructions: this.agent.instructions,
205
+ addIfMissing: true
206
+ });
207
+ } catch (error) {
208
+ this.logger.error("failed to update the instructions", error);
198
209
  }
199
- this.audioRecognition = new import_audio_recognition.AudioRecognition({
200
- recognitionHooks: this,
201
- // Disable stt node if stt is not provided
202
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
203
- vad: this.vad,
204
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
205
- turnDetectionMode: this.turnDetectionMode,
206
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
207
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
208
- rootSpanContext: this.agentSession.rootSpanContext
209
- });
210
- this.audioRecognition.start();
211
- this.started = true;
212
- this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
213
- const onEnterTask = import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
214
- name: "on_enter",
215
- context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
216
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
217
- });
218
- this.createSpeechTask({
219
- task: import_utils.Task.from(() => onEnterTask),
210
+ }
211
+ if (this.llm instanceof import_llm.LLM) {
212
+ this.llm.on("metrics_collected", this.onMetricsCollected);
213
+ this.llm.on("error", this.onModelError);
214
+ }
215
+ if (this.stt instanceof import_stt.STT) {
216
+ this.stt.on("metrics_collected", this.onMetricsCollected);
217
+ this.stt.on("error", this.onModelError);
218
+ }
219
+ if (this.tts instanceof import_tts.TTS) {
220
+ this.tts.on("metrics_collected", this.onMetricsCollected);
221
+ this.tts.on("error", this.onModelError);
222
+ }
223
+ if (this.vad instanceof import_vad.VAD) {
224
+ this.vad.on("metrics_collected", this.onMetricsCollected);
225
+ }
226
+ this.audioRecognition = new import_audio_recognition.AudioRecognition({
227
+ recognitionHooks: this,
228
+ // Disable stt node if stt is not provided
229
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
230
+ vad: this.vad,
231
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
232
+ turnDetectionMode: this.turnDetectionMode,
233
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
234
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
235
+ rootSpanContext: this.agentSession.rootSpanContext,
236
+ sttModel: (_a = this.stt) == null ? void 0 : _a.label,
237
+ sttProvider: this.getSttProvider(),
238
+ getLinkedParticipant: () => {
239
+ var _a2;
240
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
241
+ }
242
+ });
243
+ this.audioRecognition.start();
244
+ this.started = true;
245
+ this._resumeSchedulingTask();
246
+ if (runOnEnter) {
247
+ this._onEnterTask = this.createSpeechTask({
248
+ taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
249
+ name: "on_enter",
250
+ context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
251
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
252
+ }),
253
+ inlineTask: true,
220
254
  name: "AgentActivity_onEnter"
221
255
  });
222
- startSpan.end();
223
- } finally {
224
- unlock();
225
256
  }
257
+ startSpan.end();
226
258
  }
227
259
  get currentSpeech() {
228
260
  return this._currentSpeech;
@@ -233,6 +265,15 @@ class AgentActivity {
233
265
  get stt() {
234
266
  return this.agent.stt || this.agentSession.stt;
235
267
  }
268
+ getSttProvider() {
269
+ var _a;
270
+ const label = (_a = this.stt) == null ? void 0 : _a.label;
271
+ if (!label) {
272
+ return void 0;
273
+ }
274
+ const [provider] = label.split("-", 1);
275
+ return provider || label;
276
+ }
236
277
  get llm() {
237
278
  return this.agent.llm || this.agentSession.llm;
238
279
  }
@@ -242,8 +283,8 @@ class AgentActivity {
242
283
  get tools() {
243
284
  return this.agent.toolCtx;
244
285
  }
245
- get draining() {
246
- return this._draining;
286
+ get schedulingPaused() {
287
+ return this._schedulingPaused;
247
288
  }
248
289
  get realtimeLLMSession() {
249
290
  return this.realtimeSession;
@@ -283,11 +324,9 @@ class AgentActivity {
283
324
  }
284
325
  }
285
326
  attachAudioInput(audioStream) {
286
- if (this.audioStream.isSourceSet) {
287
- this.logger.debug("detaching existing audio input in agent activity");
288
- this.audioStream.detachSource();
289
- }
290
- this.audioStream.setSource(audioStream);
327
+ void this.audioStream.close();
328
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
329
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
291
330
  const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
292
331
  if (this.realtimeSession) {
293
332
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -297,13 +336,21 @@ class AgentActivity {
297
336
  }
298
337
  }
299
338
  detachAudioInput() {
300
- this.audioStream.detachSource();
339
+ if (this.audioStreamId === void 0) {
340
+ return;
341
+ }
342
+ void this.audioStream.close();
343
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
344
+ this.audioStreamId = void 0;
301
345
  }
302
- commitUserTurn() {
346
+ commitUserTurn(options = {}) {
347
+ const { audioDetached = false, throwIfNotReady = true } = options;
303
348
  if (!this.audioRecognition) {
304
- throw new Error("AudioRecognition is not initialized");
349
+ if (throwIfNotReady) {
350
+ throw new Error("AudioRecognition is not initialized");
351
+ }
352
+ return;
305
353
  }
306
- const audioDetached = false;
307
354
  this.audioRecognition.commitUserTurn(audioDetached);
308
355
  }
309
356
  clearUserTurn() {
@@ -339,19 +386,17 @@ class AgentActivity {
339
386
  })
340
387
  );
341
388
  const task = this.createSpeechTask({
342
- task: import_utils.Task.from(
343
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
344
- ),
389
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
345
390
  ownedSpeechHandle: handle,
346
391
  name: "AgentActivity.say_tts"
347
392
  });
348
- task.finally(() => this.onPipelineReplyDone());
393
+ task.result.finally(() => this.onPipelineReplyDone());
349
394
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
350
395
  return handle;
351
396
  }
352
397
  // -- Metrics and errors --
353
398
  onMetricsCollected = (ev) => {
354
- const speechHandle = speechHandleStorage.getStore();
399
+ const speechHandle = import_agent.speechHandleStorage.getStore();
355
400
  if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
356
401
  ev.speechId = speechHandle.id;
357
402
  }
@@ -435,8 +480,8 @@ class AgentActivity {
435
480
  if (ev.userInitiated) {
436
481
  return;
437
482
  }
438
- if (this.draining) {
439
- this.logger.warn("skipping new realtime generation, the agent is draining");
483
+ if (this.schedulingPaused) {
484
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
440
485
  return;
441
486
  }
442
487
  const handle = import_speech_handle.SpeechHandle.create({
@@ -452,9 +497,7 @@ class AgentActivity {
452
497
  );
453
498
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
454
499
  this.createSpeechTask({
455
- task: import_utils.Task.from(
456
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
457
- ),
500
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
458
501
  ownedSpeechHandle: handle,
459
502
  name: "AgentActivity.realtimeGeneration"
460
503
  });
@@ -541,7 +584,7 @@ class AgentActivity {
541
584
  }
542
585
  }
543
586
  onPreemptiveGeneration(info) {
544
- if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
587
+ if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
545
588
  return;
546
589
  }
547
590
  this.cancelPreemptiveGeneration();
@@ -579,7 +622,21 @@ class AgentActivity {
579
622
  }
580
623
  }
581
624
  createSpeechTask(options) {
582
- const { task, ownedSpeechHandle } = options;
625
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
626
+ const wrappedFn = (ctrl) => {
627
+ return agentActivityStorage.run(this, () => {
628
+ const currentTask = import_utils.Task.current();
629
+ if (currentTask) {
630
+ (0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
631
+ }
632
+ if (ownedSpeechHandle) {
633
+ return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
634
+ }
635
+ return taskFn(ctrl);
636
+ });
637
+ };
638
+ const task = import_utils.Task.from(wrappedFn, controller, name);
639
+ (0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
583
640
  this.speechTasks.add(task);
584
641
  task.addDoneCallback(() => {
585
642
  this.speechTasks.delete(task);
@@ -595,12 +652,15 @@ class AgentActivity {
595
652
  task.addDoneCallback(() => {
596
653
  this.wakeupMainTask();
597
654
  });
598
- return task.result;
655
+ return task;
599
656
  }
600
657
  async onEndOfTurn(info) {
601
- if (this.draining) {
658
+ if (this.schedulingPaused) {
602
659
  this.cancelPreemptiveGeneration();
603
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
660
+ this.logger.warn(
661
+ { user_input: info.newTranscript },
662
+ "skipping user input, speech scheduling is paused"
663
+ );
604
664
  return true;
605
665
  }
606
666
  if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
@@ -619,7 +679,7 @@ class AgentActivity {
619
679
  }
620
680
  const oldTask = this._userTurnCompletedTask;
621
681
  this._userTurnCompletedTask = this.createSpeechTask({
622
- task: import_utils.Task.from(() => this.userTurnCompleted(info, oldTask)),
682
+ taskFn: () => this.userTurnCompleted(info, oldTask),
623
683
  name: "AgentActivity.userTurnCompleted"
624
684
  });
625
685
  return true;
@@ -649,14 +709,41 @@ class AgentActivity {
649
709
  await speechHandle._waitForGeneration();
650
710
  this._currentSpeech = void 0;
651
711
  }
652
- if (this.draining && this.speechTasks.size === 0) {
653
- this.logger.info("mainTask: draining and no more speech tasks");
712
+ const toWait = this.getDrainPendingSpeechTasks();
713
+ if (this._schedulingPaused && toWait.length === 0) {
714
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
654
715
  break;
655
716
  }
656
717
  this.q_updated = new import_utils.Future();
657
718
  }
658
719
  this.logger.info("AgentActivity mainTask: exiting");
659
720
  }
721
+ getDrainPendingSpeechTasks() {
722
+ const blockedHandles = [];
723
+ for (const task of this._drainBlockedTasks) {
724
+ const info = (0, import_agent._getActivityTaskInfo)(task);
725
+ if (!info) {
726
+ this.logger.error("blocked task without activity info; skipping.");
727
+ continue;
728
+ }
729
+ if (!info.speechHandle) {
730
+ continue;
731
+ }
732
+ blockedHandles.push(info.speechHandle);
733
+ }
734
+ const toWait = [];
735
+ for (const task of this.speechTasks) {
736
+ if (this._drainBlockedTasks.includes(task)) {
737
+ continue;
738
+ }
739
+ const info = (0, import_agent._getActivityTaskInfo)(task);
740
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
741
+ continue;
742
+ }
743
+ toWait.push(task);
744
+ }
745
+ return toWait;
746
+ }
660
747
  wakeupMainTask() {
661
748
  this.q_updated.resolve();
662
749
  }
@@ -682,7 +769,7 @@ class AgentActivity {
682
769
  if (this.llm === void 0) {
683
770
  throw new Error("trying to generate reply without an LLM model");
684
771
  }
685
- const functionCall = (_a = import_agent.asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
772
+ const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
686
773
  if (toolChoice === void 0 && functionCall !== void 0) {
687
774
  toolChoice = "none";
688
775
  }
@@ -700,19 +787,17 @@ class AgentActivity {
700
787
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
701
788
  if (this.llm instanceof import_llm.RealtimeModel) {
702
789
  this.createSpeechTask({
703
- task: import_utils.Task.from(
704
- (abortController) => this.realtimeReplyTask({
705
- speechHandle: handle,
706
- // TODO(brian): support llm.ChatMessage for the realtime model
707
- userInput: userMessage == null ? void 0 : userMessage.textContent,
708
- instructions,
709
- modelSettings: {
710
- // isGiven(toolChoice) = toolChoice !== undefined
711
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
712
- },
713
- abortController
714
- })
715
- ),
790
+ taskFn: (abortController) => this.realtimeReplyTask({
791
+ speechHandle: handle,
792
+ // TODO(brian): support llm.ChatMessage for the realtime model
793
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
794
+ instructions,
795
+ modelSettings: {
796
+ // isGiven(toolChoice) = toolChoice !== undefined
797
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
798
+ },
799
+ abortController
800
+ }),
716
801
  ownedSpeechHandle: handle,
717
802
  name: "AgentActivity.realtimeReply"
718
803
  });
@@ -722,36 +807,36 @@ class AgentActivity {
722
807
  ${instructions}`;
723
808
  }
724
809
  const task = this.createSpeechTask({
725
- task: import_utils.Task.from(
726
- (abortController) => this.pipelineReplyTask(
727
- handle,
728
- chatCtx ?? this.agent.chatCtx,
729
- this.agent.toolCtx,
730
- {
731
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
732
- },
733
- abortController,
734
- instructions,
735
- userMessage
736
- )
810
+ taskFn: (abortController) => this.pipelineReplyTask(
811
+ handle,
812
+ chatCtx ?? this.agent.chatCtx,
813
+ this.agent.toolCtx,
814
+ {
815
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
816
+ },
817
+ abortController,
818
+ instructions,
819
+ userMessage
737
820
  ),
738
821
  ownedSpeechHandle: handle,
739
822
  name: "AgentActivity.pipelineReply"
740
823
  });
741
- task.finally(() => this.onPipelineReplyDone());
824
+ task.result.finally(() => this.onPipelineReplyDone());
742
825
  }
743
826
  if (scheduleSpeech) {
744
827
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
745
828
  }
746
829
  return handle;
747
830
  }
748
- interrupt() {
831
+ interrupt(options = {}) {
749
832
  var _a;
833
+ const { force = false } = options;
834
+ this.cancelPreemptiveGeneration();
750
835
  const future = new import_utils.Future();
751
836
  const currentSpeech = this._currentSpeech;
752
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
837
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
753
838
  for (const [_, __, speech] of this.speechQueue) {
754
- speech.interrupt();
839
+ speech.interrupt(force);
755
840
  }
756
841
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
757
842
  if (currentSpeech === void 0) {
@@ -772,7 +857,7 @@ ${instructions}`;
772
857
  async userTurnCompleted(info, oldTask) {
773
858
  var _a, _b;
774
859
  if (oldTask) {
775
- await oldTask;
860
+ await oldTask.result;
776
861
  }
777
862
  if (this.llm instanceof import_llm.RealtimeModel) {
778
863
  if (this.llm.capabilities.turnDetection) {
@@ -854,7 +939,7 @@ ${instructions}`;
854
939
  }
855
940
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
856
941
  speechHandle._agentTurnContext = import_api.context.active();
857
- speechHandleStorage.enterWith(speechHandle);
942
+ import_agent.speechHandleStorage.enterWith(speechHandle);
858
943
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
859
944
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
860
945
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
@@ -959,7 +1044,7 @@ ${instructions}`;
959
1044
  toolsMessages,
960
1045
  span
961
1046
  }) => {
962
- var _a, _b, _c, _d;
1047
+ var _a, _b;
963
1048
  speechHandle._agentTurnContext = import_api.context.active();
964
1049
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
965
1050
  if (instructions) {
@@ -968,7 +1053,11 @@ ${instructions}`;
968
1053
  if (newMessage) {
969
1054
  span.setAttribute(import_telemetry.traceTypes.ATTR_USER_INPUT, newMessage.textContent || "");
970
1055
  }
971
- speechHandleStorage.enterWith(speechHandle);
1056
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1057
+ if (localParticipant) {
1058
+ (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
1059
+ }
1060
+ import_agent.speechHandleStorage.enterWith(speechHandle);
972
1061
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
973
1062
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
974
1063
  chatCtx = chatCtx.copy();
@@ -1027,7 +1116,7 @@ ${instructions}`;
1027
1116
  speechHandle._clearAuthorization();
1028
1117
  const replyStartedAt = Date.now();
1029
1118
  let transcriptionInput = llmOutput;
1030
- if (this.useTtsAlignedTranscript && ((_a = this.tts) == null ? void 0 : _a.capabilities.alignedTranscript) && ttsGenData) {
1119
+ if (this.useTtsAlignedTranscript && ((_b = this.tts) == null ? void 0 : _b.capabilities.alignedTranscript) && ttsGenData) {
1031
1120
  const timedTextsStream = await Promise.race([
1032
1121
  ttsGenData.timedTextsFut.await,
1033
1122
  (ttsTask == null ? void 0 : ttsTask.result.catch(
@@ -1101,11 +1190,11 @@ ${instructions}`;
1101
1190
  for (const msg of toolsMessages) {
1102
1191
  msg.createdAt = replyStartedAt;
1103
1192
  }
1104
- this.agent._chatCtx.insert(toolsMessages);
1105
1193
  const toolCallOutputs = toolsMessages.filter(
1106
1194
  (m) => m.type === "function_call_output"
1107
1195
  );
1108
1196
  if (toolCallOutputs.length > 0) {
1197
+ this.agent._chatCtx.insert(toolCallOutputs);
1109
1198
  this.agentSession._toolItemsAdded(toolCallOutputs);
1110
1199
  }
1111
1200
  }
@@ -1193,45 +1282,15 @@ ${instructions}`;
1193
1282
  );
1194
1283
  return;
1195
1284
  }
1196
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1197
- functionCalls: [],
1198
- functionCallOutputs: []
1199
- });
1200
- let shouldGenerateToolReply = false;
1201
- let newAgentTask = null;
1202
- let ignoreTaskSwitch = false;
1203
- for (const sanitizedOut of toolOutput.output) {
1204
- if (sanitizedOut.toolCallOutput !== void 0) {
1205
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1206
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1207
- if (sanitizedOut.replyRequired) {
1208
- shouldGenerateToolReply = true;
1209
- }
1210
- }
1211
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1212
- this.logger.error("expected to receive only one agent task from the tool executions");
1213
- ignoreTaskSwitch = true;
1214
- }
1215
- newAgentTask = sanitizedOut.agentTask ?? null;
1216
- this.logger.debug(
1217
- {
1218
- speechId: speechHandle.id,
1219
- name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
1220
- args: sanitizedOut.toolCall.args,
1221
- output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
1222
- isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
1223
- },
1224
- "Tool call execution finished"
1225
- );
1226
- }
1285
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1227
1286
  this.agentSession.emit(
1228
1287
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1229
1288
  functionToolsExecutedEvent
1230
1289
  );
1231
- let draining = this.draining;
1290
+ let schedulingPaused = this.schedulingPaused;
1232
1291
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1233
1292
  this.agentSession.updateAgent(newAgentTask);
1234
- draining = true;
1293
+ schedulingPaused = true;
1235
1294
  }
1236
1295
  const toolMessages = [
1237
1296
  ...functionToolsExecutedEvent.functionCalls,
@@ -1240,34 +1299,32 @@ ${instructions}`;
1240
1299
  if (shouldGenerateToolReply) {
1241
1300
  chatCtx.insert(toolMessages);
1242
1301
  speechHandle._numSteps += 1;
1243
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1302
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1244
1303
  const toolResponseTask = this.createSpeechTask({
1245
- task: import_utils.Task.from(
1246
- () => this.pipelineReplyTask(
1247
- speechHandle,
1248
- chatCtx,
1249
- toolCtx,
1250
- { toolChoice: respondToolChoice },
1251
- replyAbortController,
1252
- instructions,
1253
- void 0,
1254
- toolMessages
1255
- )
1304
+ taskFn: () => this.pipelineReplyTask(
1305
+ speechHandle,
1306
+ chatCtx,
1307
+ toolCtx,
1308
+ { toolChoice: respondToolChoice },
1309
+ replyAbortController,
1310
+ instructions,
1311
+ void 0,
1312
+ toolMessages
1256
1313
  ),
1257
1314
  ownedSpeechHandle: speechHandle,
1258
1315
  name: "AgentActivity.pipelineReply"
1259
1316
  });
1260
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1317
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1261
1318
  this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1262
1319
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1263
1320
  for (const msg of toolMessages) {
1264
1321
  msg.createdAt = replyStartedAt;
1265
1322
  }
1266
- this.agent._chatCtx.insert(toolMessages);
1267
1323
  const toolCallOutputs = toolMessages.filter(
1268
1324
  (m) => m.type === "function_call_output"
1269
1325
  );
1270
1326
  if (toolCallOutputs.length > 0) {
1327
+ this.agent._chatCtx.insert(toolCallOutputs);
1271
1328
  this.agentSession._toolItemsAdded(toolCallOutputs);
1272
1329
  }
1273
1330
  }
@@ -1311,10 +1368,14 @@ ${instructions}`;
1311
1368
  replyAbortController,
1312
1369
  span
1313
1370
  }) {
1314
- var _a, _b, _c;
1371
+ var _a;
1315
1372
  speechHandle._agentTurnContext = import_api.context.active();
1316
1373
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1317
- speechHandleStorage.enterWith(speechHandle);
1374
+ const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1375
+ if (localParticipant) {
1376
+ (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
1377
+ }
1378
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1318
1379
  if (!this.realtimeSession) {
1319
1380
  throw new Error("realtime session is not initialized");
1320
1381
  }
@@ -1567,44 +1628,15 @@ ${instructions}`;
1567
1628
  );
1568
1629
  return;
1569
1630
  }
1570
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1571
- functionCalls: [],
1572
- functionCallOutputs: []
1573
- });
1574
- let shouldGenerateToolReply = false;
1575
- let newAgentTask = null;
1576
- let ignoreTaskSwitch = false;
1577
- for (const sanitizedOut of toolOutput.output) {
1578
- if (sanitizedOut.toolCallOutput !== void 0) {
1579
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1580
- if (sanitizedOut.replyRequired) {
1581
- shouldGenerateToolReply = true;
1582
- }
1583
- }
1584
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1585
- this.logger.error("expected to receive only one agent task from the tool executions");
1586
- ignoreTaskSwitch = true;
1587
- }
1588
- newAgentTask = sanitizedOut.agentTask ?? null;
1589
- this.logger.debug(
1590
- {
1591
- speechId: speechHandle.id,
1592
- name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1593
- args: sanitizedOut.toolCall.args,
1594
- output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1595
- isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1596
- },
1597
- "Tool call execution finished"
1598
- );
1599
- }
1631
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1600
1632
  this.agentSession.emit(
1601
1633
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1602
1634
  functionToolsExecutedEvent
1603
1635
  );
1604
- let draining = this.draining;
1636
+ let schedulingPaused = this.schedulingPaused;
1605
1637
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1606
1638
  this.agentSession.updateAgent(newAgentTask);
1607
- draining = true;
1639
+ schedulingPaused = true;
1608
1640
  }
1609
1641
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1610
1642
  while (this.currentSpeech || this.speechQueue.size() > 0) {
@@ -1645,20 +1677,58 @@ ${instructions}`;
1645
1677
  speechHandle: replySpeechHandle
1646
1678
  })
1647
1679
  );
1648
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1680
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1649
1681
  this.createSpeechTask({
1650
- task: import_utils.Task.from(
1651
- (abortController) => this.realtimeReplyTask({
1652
- speechHandle: replySpeechHandle,
1653
- modelSettings: { toolChoice },
1654
- abortController
1655
- })
1656
- ),
1682
+ taskFn: (abortController) => this.realtimeReplyTask({
1683
+ speechHandle: replySpeechHandle,
1684
+ modelSettings: { toolChoice },
1685
+ abortController
1686
+ }),
1657
1687
  ownedSpeechHandle: replySpeechHandle,
1658
1688
  name: "AgentActivity.realtime_reply"
1659
1689
  });
1660
1690
  this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1661
1691
  }
1692
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1693
+ var _a, _b, _c;
1694
+ const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1695
+ functionCalls: [],
1696
+ functionCallOutputs: []
1697
+ });
1698
+ let shouldGenerateToolReply = false;
1699
+ let newAgentTask = null;
1700
+ let ignoreTaskSwitch = false;
1701
+ for (const sanitizedOut of toolOutput.output) {
1702
+ if (sanitizedOut.toolCallOutput !== void 0) {
1703
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1704
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1705
+ if (sanitizedOut.replyRequired) {
1706
+ shouldGenerateToolReply = true;
1707
+ }
1708
+ }
1709
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1710
+ this.logger.error("expected to receive only one agent task from the tool executions");
1711
+ ignoreTaskSwitch = true;
1712
+ }
1713
+ newAgentTask = sanitizedOut.agentTask ?? null;
1714
+ this.logger.debug(
1715
+ {
1716
+ speechId: speechHandle.id,
1717
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1718
+ args: sanitizedOut.toolCall.args,
1719
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1720
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1721
+ },
1722
+ "Tool call execution finished"
1723
+ );
1724
+ }
1725
+ return {
1726
+ functionToolsExecutedEvent,
1727
+ shouldGenerateToolReply,
1728
+ newAgentTask,
1729
+ ignoreTaskSwitch
1730
+ };
1731
+ }
1662
1732
  async realtimeReplyTask({
1663
1733
  speechHandle,
1664
1734
  modelSettings: { toolChoice },
@@ -1666,7 +1736,7 @@ ${instructions}`;
1666
1736
  instructions,
1667
1737
  abortController
1668
1738
  }) {
1669
- speechHandleStorage.enterWith(speechHandle);
1739
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1670
1740
  if (!this.realtimeSession) {
1671
1741
  throw new Error("realtime session is not available");
1672
1742
  }
@@ -1700,13 +1770,45 @@ ${instructions}`;
1700
1770
  }
1701
1771
  }
1702
1772
  scheduleSpeech(speechHandle, priority, force = false) {
1703
- if (this.draining && !force) {
1704
- throw new Error("cannot schedule new speech, the agent is draining");
1773
+ if (this.schedulingPaused && !force) {
1774
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1705
1775
  }
1706
1776
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1707
1777
  speechHandle._markScheduled();
1708
1778
  this.wakeupMainTask();
1709
1779
  }
1780
+ async _pauseSchedulingTask(blockedTasks) {
1781
+ if (this._schedulingPaused) return;
1782
+ this._schedulingPaused = true;
1783
+ this._drainBlockedTasks = blockedTasks;
1784
+ this.wakeupMainTask();
1785
+ if (this._mainTask) {
1786
+ await this._mainTask.result;
1787
+ }
1788
+ }
1789
+ _resumeSchedulingTask() {
1790
+ if (!this._schedulingPaused) return;
1791
+ this._schedulingPaused = false;
1792
+ this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
1793
+ }
1794
+ async pause(options = {}) {
1795
+ const { blockedTasks = [] } = options;
1796
+ const unlock = await this.lock.lock();
1797
+ try {
1798
+ const span = import_telemetry.tracer.startSpan({
1799
+ name: "pause_agent_activity",
1800
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1801
+ });
1802
+ try {
1803
+ await this._pauseSchedulingTask(blockedTasks);
1804
+ await this._closeSessionResources();
1805
+ } finally {
1806
+ span.end();
1807
+ }
1808
+ } finally {
1809
+ unlock();
1810
+ }
1811
+ }
1710
1812
  async drain() {
1711
1813
  return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
1712
1814
  name: "drain_agent_activity",
@@ -1714,72 +1816,80 @@ ${instructions}`;
1714
1816
  });
1715
1817
  }
1716
1818
  async _drainImpl(span) {
1717
- var _a;
1718
1819
  span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1719
1820
  const unlock = await this.lock.lock();
1720
1821
  try {
1721
- if (this._draining) return;
1722
- this.cancelPreemptiveGeneration();
1723
- const onExitTask = import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
1724
- name: "on_exit",
1725
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1726
- });
1727
- this.createSpeechTask({
1728
- task: import_utils.Task.from(() => onExitTask),
1822
+ if (this._schedulingPaused) return;
1823
+ this._onExitTask = this.createSpeechTask({
1824
+ taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
1825
+ name: "on_exit",
1826
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1827
+ }),
1828
+ inlineTask: true,
1729
1829
  name: "AgentActivity_onExit"
1730
1830
  });
1731
- this.wakeupMainTask();
1732
- this._draining = true;
1733
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
1831
+ this.cancelPreemptiveGeneration();
1832
+ await this._onExitTask.result;
1833
+ await this._pauseSchedulingTask([]);
1734
1834
  } finally {
1735
1835
  unlock();
1736
1836
  }
1737
1837
  }
1738
1838
  async close() {
1739
- var _a, _b, _c, _d;
1740
1839
  const unlock = await this.lock.lock();
1741
1840
  try {
1742
- if (!this._draining) {
1743
- this.logger.warn("task closing without draining");
1744
- }
1745
1841
  this.cancelPreemptiveGeneration();
1746
- if (this.llm instanceof import_llm.LLM) {
1747
- this.llm.off("metrics_collected", this.onMetricsCollected);
1748
- }
1749
- if (this.realtimeSession) {
1750
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1751
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1752
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1753
- this.realtimeSession.off(
1754
- "input_audio_transcription_completed",
1755
- this.onInputAudioTranscriptionCompleted
1756
- );
1757
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1758
- }
1759
- if (this.stt instanceof import_stt.STT) {
1760
- this.stt.off("metrics_collected", this.onMetricsCollected);
1761
- }
1762
- if (this.tts instanceof import_tts.TTS) {
1763
- this.tts.off("metrics_collected", this.onMetricsCollected);
1842
+ await this._closeSessionResources();
1843
+ if (this._mainTask) {
1844
+ await this._mainTask.cancelAndWait();
1764
1845
  }
1765
- if (this.vad instanceof import_vad.VAD) {
1766
- this.vad.off("metrics_collected", this.onMetricsCollected);
1767
- }
1768
- this.detachAudioInput();
1769
- (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1770
- await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1771
- await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1772
- await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
1846
+ this.agent._agentActivity = void 0;
1773
1847
  } finally {
1774
1848
  unlock();
1775
1849
  }
1776
1850
  }
1851
+ async _closeSessionResources() {
1852
+ var _a, _b, _c;
1853
+ if (this.llm instanceof import_llm.LLM) {
1854
+ this.llm.off("metrics_collected", this.onMetricsCollected);
1855
+ this.llm.off("error", this.onModelError);
1856
+ }
1857
+ if (this.realtimeSession) {
1858
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
1859
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
1860
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
1861
+ this.realtimeSession.off(
1862
+ "input_audio_transcription_completed",
1863
+ this.onRealtimeInputAudioTranscriptionCompleted
1864
+ );
1865
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1866
+ this.realtimeSession.off("error", this.onModelError);
1867
+ }
1868
+ if (this.stt instanceof import_stt.STT) {
1869
+ this.stt.off("metrics_collected", this.onMetricsCollected);
1870
+ this.stt.off("error", this.onModelError);
1871
+ }
1872
+ if (this.tts instanceof import_tts.TTS) {
1873
+ this.tts.off("metrics_collected", this.onMetricsCollected);
1874
+ this.tts.off("error", this.onModelError);
1875
+ }
1876
+ if (this.vad instanceof import_vad.VAD) {
1877
+ this.vad.off("metrics_collected", this.onMetricsCollected);
1878
+ }
1879
+ this.detachAudioInput();
1880
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1881
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1882
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1883
+ this.realtimeSession = void 0;
1884
+ this.audioRecognition = void 0;
1885
+ }
1777
1886
  }
1778
1887
  function toOaiToolChoice(toolChoice) {
1779
1888
  return toolChoice !== null ? toolChoice : void 0;
1780
1889
  }
1781
1890
  // Annotate the CommonJS export names for ESM import in node:
1782
1891
  0 && (module.exports = {
1783
- AgentActivity
1892
+ AgentActivity,
1893
+ agentActivityStorage
1784
1894
  });
1785
1895
  //# sourceMappingURL=agent_activity.cjs.map