@livekit/agents 1.0.46 → 1.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. package/dist/cli.cjs +14 -20
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +14 -20
  5. package/dist/cli.js.map +1 -1
  6. package/dist/ipc/job_proc_lazy_main.cjs +14 -5
  7. package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
  8. package/dist/ipc/job_proc_lazy_main.js +14 -5
  9. package/dist/ipc/job_proc_lazy_main.js.map +1 -1
  10. package/dist/llm/chat_context.cjs +19 -0
  11. package/dist/llm/chat_context.cjs.map +1 -1
  12. package/dist/llm/chat_context.d.cts +4 -0
  13. package/dist/llm/chat_context.d.ts +4 -0
  14. package/dist/llm/chat_context.d.ts.map +1 -1
  15. package/dist/llm/chat_context.js +19 -0
  16. package/dist/llm/chat_context.js.map +1 -1
  17. package/dist/llm/provider_format/index.cjs +2 -0
  18. package/dist/llm/provider_format/index.cjs.map +1 -1
  19. package/dist/llm/provider_format/index.d.cts +1 -1
  20. package/dist/llm/provider_format/index.d.ts +1 -1
  21. package/dist/llm/provider_format/index.d.ts.map +1 -1
  22. package/dist/llm/provider_format/index.js +6 -1
  23. package/dist/llm/provider_format/index.js.map +1 -1
  24. package/dist/llm/provider_format/openai.cjs +82 -2
  25. package/dist/llm/provider_format/openai.cjs.map +1 -1
  26. package/dist/llm/provider_format/openai.d.cts +1 -0
  27. package/dist/llm/provider_format/openai.d.ts +1 -0
  28. package/dist/llm/provider_format/openai.d.ts.map +1 -1
  29. package/dist/llm/provider_format/openai.js +80 -1
  30. package/dist/llm/provider_format/openai.js.map +1 -1
  31. package/dist/llm/provider_format/openai.test.cjs +326 -0
  32. package/dist/llm/provider_format/openai.test.cjs.map +1 -1
  33. package/dist/llm/provider_format/openai.test.js +327 -1
  34. package/dist/llm/provider_format/openai.test.js.map +1 -1
  35. package/dist/llm/provider_format/utils.cjs +4 -3
  36. package/dist/llm/provider_format/utils.cjs.map +1 -1
  37. package/dist/llm/provider_format/utils.d.ts.map +1 -1
  38. package/dist/llm/provider_format/utils.js +4 -3
  39. package/dist/llm/provider_format/utils.js.map +1 -1
  40. package/dist/llm/realtime.cjs.map +1 -1
  41. package/dist/llm/realtime.d.cts +1 -0
  42. package/dist/llm/realtime.d.ts +1 -0
  43. package/dist/llm/realtime.d.ts.map +1 -1
  44. package/dist/llm/realtime.js.map +1 -1
  45. package/dist/log.cjs +5 -2
  46. package/dist/log.cjs.map +1 -1
  47. package/dist/log.d.ts.map +1 -1
  48. package/dist/log.js +5 -2
  49. package/dist/log.js.map +1 -1
  50. package/dist/stream/deferred_stream.cjs +15 -6
  51. package/dist/stream/deferred_stream.cjs.map +1 -1
  52. package/dist/stream/deferred_stream.d.ts.map +1 -1
  53. package/dist/stream/deferred_stream.js +15 -6
  54. package/dist/stream/deferred_stream.js.map +1 -1
  55. package/dist/utils.cjs +31 -2
  56. package/dist/utils.cjs.map +1 -1
  57. package/dist/utils.d.cts +7 -0
  58. package/dist/utils.d.ts +7 -0
  59. package/dist/utils.d.ts.map +1 -1
  60. package/dist/utils.js +31 -2
  61. package/dist/utils.js.map +1 -1
  62. package/dist/utils.test.cjs +71 -0
  63. package/dist/utils.test.cjs.map +1 -1
  64. package/dist/utils.test.js +71 -0
  65. package/dist/utils.test.js.map +1 -1
  66. package/dist/version.cjs +1 -1
  67. package/dist/version.cjs.map +1 -1
  68. package/dist/version.d.cts +1 -1
  69. package/dist/version.d.ts +1 -1
  70. package/dist/version.d.ts.map +1 -1
  71. package/dist/version.js +1 -1
  72. package/dist/version.js.map +1 -1
  73. package/dist/voice/agent.cjs +144 -12
  74. package/dist/voice/agent.cjs.map +1 -1
  75. package/dist/voice/agent.d.cts +29 -4
  76. package/dist/voice/agent.d.ts +29 -4
  77. package/dist/voice/agent.d.ts.map +1 -1
  78. package/dist/voice/agent.js +140 -11
  79. package/dist/voice/agent.js.map +1 -1
  80. package/dist/voice/agent.test.cjs +120 -0
  81. package/dist/voice/agent.test.cjs.map +1 -1
  82. package/dist/voice/agent.test.js +122 -2
  83. package/dist/voice/agent.test.js.map +1 -1
  84. package/dist/voice/agent_activity.cjs +383 -298
  85. package/dist/voice/agent_activity.cjs.map +1 -1
  86. package/dist/voice/agent_activity.d.cts +34 -7
  87. package/dist/voice/agent_activity.d.ts +34 -7
  88. package/dist/voice/agent_activity.d.ts.map +1 -1
  89. package/dist/voice/agent_activity.js +383 -293
  90. package/dist/voice/agent_activity.js.map +1 -1
  91. package/dist/voice/agent_session.cjs +140 -40
  92. package/dist/voice/agent_session.cjs.map +1 -1
  93. package/dist/voice/agent_session.d.cts +19 -7
  94. package/dist/voice/agent_session.d.ts +19 -7
  95. package/dist/voice/agent_session.d.ts.map +1 -1
  96. package/dist/voice/agent_session.js +137 -37
  97. package/dist/voice/agent_session.js.map +1 -1
  98. package/dist/voice/audio_recognition.cjs +4 -0
  99. package/dist/voice/audio_recognition.cjs.map +1 -1
  100. package/dist/voice/audio_recognition.d.ts.map +1 -1
  101. package/dist/voice/audio_recognition.js +4 -0
  102. package/dist/voice/audio_recognition.js.map +1 -1
  103. package/dist/voice/generation.cjs +39 -19
  104. package/dist/voice/generation.cjs.map +1 -1
  105. package/dist/voice/generation.d.ts.map +1 -1
  106. package/dist/voice/generation.js +44 -20
  107. package/dist/voice/generation.js.map +1 -1
  108. package/dist/voice/index.cjs +2 -0
  109. package/dist/voice/index.cjs.map +1 -1
  110. package/dist/voice/index.d.cts +1 -1
  111. package/dist/voice/index.d.ts +1 -1
  112. package/dist/voice/index.d.ts.map +1 -1
  113. package/dist/voice/index.js +2 -1
  114. package/dist/voice/index.js.map +1 -1
  115. package/dist/voice/speech_handle.cjs +7 -1
  116. package/dist/voice/speech_handle.cjs.map +1 -1
  117. package/dist/voice/speech_handle.d.cts +2 -0
  118. package/dist/voice/speech_handle.d.ts +2 -0
  119. package/dist/voice/speech_handle.d.ts.map +1 -1
  120. package/dist/voice/speech_handle.js +8 -2
  121. package/dist/voice/speech_handle.js.map +1 -1
  122. package/dist/voice/testing/run_result.cjs +66 -15
  123. package/dist/voice/testing/run_result.cjs.map +1 -1
  124. package/dist/voice/testing/run_result.d.cts +14 -3
  125. package/dist/voice/testing/run_result.d.ts +14 -3
  126. package/dist/voice/testing/run_result.d.ts.map +1 -1
  127. package/dist/voice/testing/run_result.js +66 -15
  128. package/dist/voice/testing/run_result.js.map +1 -1
  129. package/package.json +1 -1
  130. package/src/cli.ts +20 -33
  131. package/src/ipc/job_proc_lazy_main.ts +16 -5
  132. package/src/llm/chat_context.ts +35 -0
  133. package/src/llm/provider_format/index.ts +7 -2
  134. package/src/llm/provider_format/openai.test.ts +385 -1
  135. package/src/llm/provider_format/openai.ts +103 -0
  136. package/src/llm/provider_format/utils.ts +6 -4
  137. package/src/llm/realtime.ts +1 -0
  138. package/src/log.ts +5 -2
  139. package/src/stream/deferred_stream.ts +17 -6
  140. package/src/utils.test.ts +87 -0
  141. package/src/utils.ts +36 -2
  142. package/src/version.ts +1 -1
  143. package/src/voice/agent.test.ts +140 -2
  144. package/src/voice/agent.ts +189 -10
  145. package/src/voice/agent_activity.ts +427 -289
  146. package/src/voice/agent_session.ts +178 -40
  147. package/src/voice/audio_recognition.ts +4 -0
  148. package/src/voice/generation.ts +52 -23
  149. package/src/voice/index.ts +1 -1
  150. package/src/voice/speech_handle.ts +9 -2
  151. package/src/voice/testing/run_result.ts +81 -23
@@ -18,7 +18,8 @@ var __copyProps = (to, from, except, desc) => {
18
18
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
19
  var agent_activity_exports = {};
20
20
  __export(agent_activity_exports, {
21
- AgentActivity: () => AgentActivity
21
+ AgentActivity: () => AgentActivity,
22
+ agentActivityStorage: () => agentActivityStorage
22
23
  });
23
24
  module.exports = __toCommonJS(agent_activity_exports);
24
25
  var import_mutex = require("@livekit/mutex");
@@ -30,7 +31,7 @@ var import_chat_context = require("../llm/chat_context.cjs");
30
31
  var import_llm = require("../llm/index.cjs");
31
32
  var import_tool_context = require("../llm/tool_context.cjs");
32
33
  var import_log = require("../log.cjs");
33
- var import_deferred_stream = require("../stream/deferred_stream.cjs");
34
+ var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
34
35
  var import_stt = require("../stt/stt.cjs");
35
36
  var import_telemetry = require("../telemetry/index.cjs");
36
37
  var import_word = require("../tokenize/basic/word.cjs");
@@ -44,8 +45,10 @@ var import_events = require("./events.cjs");
44
45
  var import_generation = require("./generation.cjs");
45
46
  var import_speech_handle = require("./speech_handle.cjs");
46
47
  var import_utils2 = require("./utils.cjs");
47
- const speechHandleStorage = new import_node_async_hooks.AsyncLocalStorage();
48
+ const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
48
49
  class AgentActivity {
50
+ agent;
51
+ agentSession;
49
52
  static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
50
53
  started = false;
51
54
  audioRecognition;
@@ -54,22 +57,29 @@ class AgentActivity {
54
57
  // Maps response_id to OTEL span for metrics recording
55
58
  turnDetectionMode;
56
59
  logger = (0, import_log.log)();
57
- _draining = false;
60
+ _schedulingPaused = true;
61
+ _drainBlockedTasks = [];
58
62
  _currentSpeech;
59
63
  speechQueue;
60
64
  // [priority, timestamp, speechHandle]
61
65
  q_updated;
62
66
  speechTasks = /* @__PURE__ */ new Set();
63
67
  lock = new import_mutex.Mutex();
64
- audioStream = new import_deferred_stream.DeferredReadableStream();
68
+ audioStream = new import_multi_input_stream.MultiInputStream();
69
+ audioStreamId;
65
70
  // default to null as None, which maps to the default provider tool choice value
66
71
  toolChoice = null;
67
72
  _preemptiveGeneration;
68
- agent;
69
- agentSession;
70
73
  /** @internal */
71
74
  _mainTask;
75
+ _onEnterTask;
76
+ _onExitTask;
72
77
  _userTurnCompletedTask;
78
+ onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
79
+ onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
80
+ onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
81
+ onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
82
+ onModelError = (ev) => this.onError(ev);
73
83
  constructor(agent, agentSession) {
74
84
  this.agent = agent;
75
85
  this.agentSession = agentSession;
@@ -80,7 +90,7 @@ class AgentActivity {
80
90
  this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
81
91
  if (this.turnDetectionMode === "vad" && this.vad === void 0) {
82
92
  this.logger.warn(
83
- 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting'
93
+ 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
84
94
  );
85
95
  this.turnDetectionMode = void 0;
86
96
  }
@@ -130,107 +140,121 @@ class AgentActivity {
130
140
  }
131
141
  }
132
142
  async start() {
133
- var _a;
134
143
  const unlock = await this.lock.lock();
135
144
  try {
136
- const startSpan = import_telemetry.tracer.startSpan({
137
- name: "start_agent_activity",
138
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
139
- context: import_api.ROOT_CONTEXT
140
- });
141
- this.agent._agentActivity = this;
142
- if (this.llm instanceof import_llm.RealtimeModel) {
143
- this.realtimeSession = this.llm.session();
144
- this.realtimeSpans = /* @__PURE__ */ new Map();
145
- this.realtimeSession.on("generation_created", (ev) => this.onGenerationCreated(ev));
146
- this.realtimeSession.on("input_speech_started", (ev) => this.onInputSpeechStarted(ev));
147
- this.realtimeSession.on("input_speech_stopped", (ev) => this.onInputSpeechStopped(ev));
148
- this.realtimeSession.on(
149
- "input_audio_transcription_completed",
150
- (ev) => this.onInputAudioTranscriptionCompleted(ev)
151
- );
152
- this.realtimeSession.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
153
- this.realtimeSession.on("error", (ev) => this.onError(ev));
154
- (0, import_generation.removeInstructions)(this.agent._chatCtx);
155
- try {
156
- await this.realtimeSession.updateInstructions(this.agent.instructions);
157
- } catch (error) {
158
- this.logger.error(error, "failed to update the instructions");
159
- }
160
- try {
161
- await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
162
- } catch (error) {
163
- this.logger.error(error, "failed to update the chat context");
164
- }
165
- try {
166
- await this.realtimeSession.updateTools(this.tools);
167
- } catch (error) {
168
- this.logger.error(error, "failed to update the tools");
169
- }
170
- if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
171
- this.logger.error(
172
- "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
173
- );
174
- }
175
- } else if (this.llm instanceof import_llm.LLM) {
176
- try {
177
- (0, import_generation.updateInstructions)({
178
- chatCtx: this.agent._chatCtx,
179
- instructions: this.agent.instructions,
180
- addIfMissing: true
181
- });
182
- } catch (error) {
183
- this.logger.error("failed to update the instructions", error);
184
- }
145
+ await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
146
+ } finally {
147
+ unlock();
148
+ }
149
+ }
150
+ async resume() {
151
+ const unlock = await this.lock.lock();
152
+ try {
153
+ await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
154
+ } finally {
155
+ unlock();
156
+ }
157
+ }
158
+ async _startSession(options) {
159
+ var _a;
160
+ const { spanName, runOnEnter } = options;
161
+ const startSpan = import_telemetry.tracer.startSpan({
162
+ name: spanName,
163
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
164
+ context: import_api.ROOT_CONTEXT
165
+ });
166
+ this.agent._agentActivity = this;
167
+ if (this.llm instanceof import_llm.RealtimeModel) {
168
+ this.realtimeSession = this.llm.session();
169
+ this.realtimeSpans = /* @__PURE__ */ new Map();
170
+ this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
171
+ this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
172
+ this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
173
+ this.realtimeSession.on(
174
+ "input_audio_transcription_completed",
175
+ this.onRealtimeInputAudioTranscriptionCompleted
176
+ );
177
+ this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
178
+ this.realtimeSession.on("error", this.onModelError);
179
+ (0, import_generation.removeInstructions)(this.agent._chatCtx);
180
+ try {
181
+ await this.realtimeSession.updateInstructions(this.agent.instructions);
182
+ } catch (error) {
183
+ this.logger.error(error, "failed to update the instructions");
185
184
  }
186
- if (this.llm instanceof import_llm.LLM) {
187
- this.llm.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
188
- this.llm.on("error", (ev) => this.onError(ev));
185
+ try {
186
+ await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
187
+ } catch (error) {
188
+ this.logger.error(error, "failed to update the chat context");
189
189
  }
190
- if (this.stt instanceof import_stt.STT) {
191
- this.stt.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
192
- this.stt.on("error", (ev) => this.onError(ev));
190
+ try {
191
+ await this.realtimeSession.updateTools(this.tools);
192
+ } catch (error) {
193
+ this.logger.error(error, "failed to update the tools");
193
194
  }
194
- if (this.tts instanceof import_tts.TTS) {
195
- this.tts.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
196
- this.tts.on("error", (ev) => this.onError(ev));
195
+ if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
196
+ this.logger.error(
197
+ "audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
198
+ );
197
199
  }
198
- if (this.vad instanceof import_vad.VAD) {
199
- this.vad.on("metrics_collected", (ev) => this.onMetricsCollected(ev));
200
+ } else if (this.llm instanceof import_llm.LLM) {
201
+ try {
202
+ (0, import_generation.updateInstructions)({
203
+ chatCtx: this.agent._chatCtx,
204
+ instructions: this.agent.instructions,
205
+ addIfMissing: true
206
+ });
207
+ } catch (error) {
208
+ this.logger.error("failed to update the instructions", error);
200
209
  }
201
- this.audioRecognition = new import_audio_recognition.AudioRecognition({
202
- recognitionHooks: this,
203
- // Disable stt node if stt is not provided
204
- stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
205
- vad: this.vad,
206
- turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
207
- turnDetectionMode: this.turnDetectionMode,
208
- minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
209
- maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
210
- rootSpanContext: this.agentSession.rootSpanContext,
211
- sttModel: (_a = this.stt) == null ? void 0 : _a.label,
212
- sttProvider: this.getSttProvider(),
213
- getLinkedParticipant: () => {
214
- var _a2;
215
- return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
216
- }
217
- });
218
- this.audioRecognition.start();
219
- this.started = true;
220
- this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
221
- const onEnterTask = import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
222
- name: "on_enter",
223
- context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
224
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
225
- });
226
- this.createSpeechTask({
227
- task: import_utils.Task.from(() => onEnterTask),
210
+ }
211
+ if (this.llm instanceof import_llm.LLM) {
212
+ this.llm.on("metrics_collected", this.onMetricsCollected);
213
+ this.llm.on("error", this.onModelError);
214
+ }
215
+ if (this.stt instanceof import_stt.STT) {
216
+ this.stt.on("metrics_collected", this.onMetricsCollected);
217
+ this.stt.on("error", this.onModelError);
218
+ }
219
+ if (this.tts instanceof import_tts.TTS) {
220
+ this.tts.on("metrics_collected", this.onMetricsCollected);
221
+ this.tts.on("error", this.onModelError);
222
+ }
223
+ if (this.vad instanceof import_vad.VAD) {
224
+ this.vad.on("metrics_collected", this.onMetricsCollected);
225
+ }
226
+ this.audioRecognition = new import_audio_recognition.AudioRecognition({
227
+ recognitionHooks: this,
228
+ // Disable stt node if stt is not provided
229
+ stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
230
+ vad: this.vad,
231
+ turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
232
+ turnDetectionMode: this.turnDetectionMode,
233
+ minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
234
+ maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
235
+ rootSpanContext: this.agentSession.rootSpanContext,
236
+ sttModel: (_a = this.stt) == null ? void 0 : _a.label,
237
+ sttProvider: this.getSttProvider(),
238
+ getLinkedParticipant: () => {
239
+ var _a2;
240
+ return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
241
+ }
242
+ });
243
+ this.audioRecognition.start();
244
+ this.started = true;
245
+ this._resumeSchedulingTask();
246
+ if (runOnEnter) {
247
+ this._onEnterTask = this.createSpeechTask({
248
+ taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
249
+ name: "on_enter",
250
+ context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
251
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
252
+ }),
253
+ inlineTask: true,
228
254
  name: "AgentActivity_onEnter"
229
255
  });
230
- startSpan.end();
231
- } finally {
232
- unlock();
233
256
  }
257
+ startSpan.end();
234
258
  }
235
259
  get currentSpeech() {
236
260
  return this._currentSpeech;
@@ -259,8 +283,8 @@ class AgentActivity {
259
283
  get tools() {
260
284
  return this.agent.toolCtx;
261
285
  }
262
- get draining() {
263
- return this._draining;
286
+ get schedulingPaused() {
287
+ return this._schedulingPaused;
264
288
  }
265
289
  get realtimeLLMSession() {
266
290
  return this.realtimeSession;
@@ -300,11 +324,9 @@ class AgentActivity {
300
324
  }
301
325
  }
302
326
  attachAudioInput(audioStream) {
303
- if (this.audioStream.isSourceSet) {
304
- this.logger.debug("detaching existing audio input in agent activity");
305
- this.audioStream.detachSource();
306
- }
307
- this.audioStream.setSource(audioStream);
327
+ void this.audioStream.close();
328
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
329
+ this.audioStreamId = this.audioStream.addInputStream(audioStream);
308
330
  const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
309
331
  if (this.realtimeSession) {
310
332
  this.realtimeSession.setInputAudioStream(realtimeAudioStream);
@@ -314,13 +336,21 @@ class AgentActivity {
314
336
  }
315
337
  }
316
338
  detachAudioInput() {
317
- this.audioStream.detachSource();
339
+ if (this.audioStreamId === void 0) {
340
+ return;
341
+ }
342
+ void this.audioStream.close();
343
+ this.audioStream = new import_multi_input_stream.MultiInputStream();
344
+ this.audioStreamId = void 0;
318
345
  }
319
- commitUserTurn() {
346
+ commitUserTurn(options = {}) {
347
+ const { audioDetached = false, throwIfNotReady = true } = options;
320
348
  if (!this.audioRecognition) {
321
- throw new Error("AudioRecognition is not initialized");
349
+ if (throwIfNotReady) {
350
+ throw new Error("AudioRecognition is not initialized");
351
+ }
352
+ return;
322
353
  }
323
- const audioDetached = false;
324
354
  this.audioRecognition.commitUserTurn(audioDetached);
325
355
  }
326
356
  clearUserTurn() {
@@ -356,19 +386,17 @@ class AgentActivity {
356
386
  })
357
387
  );
358
388
  const task = this.createSpeechTask({
359
- task: import_utils.Task.from(
360
- (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
361
- ),
389
+ taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
362
390
  ownedSpeechHandle: handle,
363
391
  name: "AgentActivity.say_tts"
364
392
  });
365
- task.finally(() => this.onPipelineReplyDone());
393
+ task.result.finally(() => this.onPipelineReplyDone());
366
394
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
367
395
  return handle;
368
396
  }
369
397
  // -- Metrics and errors --
370
398
  onMetricsCollected = (ev) => {
371
- const speechHandle = speechHandleStorage.getStore();
399
+ const speechHandle = import_agent.speechHandleStorage.getStore();
372
400
  if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
373
401
  ev.speechId = speechHandle.id;
374
402
  }
@@ -452,8 +480,8 @@ class AgentActivity {
452
480
  if (ev.userInitiated) {
453
481
  return;
454
482
  }
455
- if (this.draining) {
456
- this.logger.warn("skipping new realtime generation, the agent is draining");
483
+ if (this.schedulingPaused) {
484
+ this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
457
485
  return;
458
486
  }
459
487
  const handle = import_speech_handle.SpeechHandle.create({
@@ -469,9 +497,7 @@ class AgentActivity {
469
497
  );
470
498
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
471
499
  this.createSpeechTask({
472
- task: import_utils.Task.from(
473
- (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
474
- ),
500
+ taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
475
501
  ownedSpeechHandle: handle,
476
502
  name: "AgentActivity.realtimeGeneration"
477
503
  });
@@ -558,7 +584,7 @@ class AgentActivity {
558
584
  }
559
585
  }
560
586
  onPreemptiveGeneration(info) {
561
- if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
587
+ if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
562
588
  return;
563
589
  }
564
590
  this.cancelPreemptiveGeneration();
@@ -596,7 +622,21 @@ class AgentActivity {
596
622
  }
597
623
  }
598
624
  createSpeechTask(options) {
599
- const { task, ownedSpeechHandle } = options;
625
+ const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
626
+ const wrappedFn = (ctrl) => {
627
+ return agentActivityStorage.run(this, () => {
628
+ const currentTask = import_utils.Task.current();
629
+ if (currentTask) {
630
+ (0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
631
+ }
632
+ if (ownedSpeechHandle) {
633
+ return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
634
+ }
635
+ return taskFn(ctrl);
636
+ });
637
+ };
638
+ const task = import_utils.Task.from(wrappedFn, controller, name);
639
+ (0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
600
640
  this.speechTasks.add(task);
601
641
  task.addDoneCallback(() => {
602
642
  this.speechTasks.delete(task);
@@ -612,12 +652,15 @@ class AgentActivity {
612
652
  task.addDoneCallback(() => {
613
653
  this.wakeupMainTask();
614
654
  });
615
- return task.result;
655
+ return task;
616
656
  }
617
657
  async onEndOfTurn(info) {
618
- if (this.draining) {
658
+ if (this.schedulingPaused) {
619
659
  this.cancelPreemptiveGeneration();
620
- this.logger.warn({ user_input: info.newTranscript }, "skipping user input, task is draining");
660
+ this.logger.warn(
661
+ { user_input: info.newTranscript },
662
+ "skipping user input, speech scheduling is paused"
663
+ );
621
664
  return true;
622
665
  }
623
666
  if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
@@ -636,7 +679,7 @@ class AgentActivity {
636
679
  }
637
680
  const oldTask = this._userTurnCompletedTask;
638
681
  this._userTurnCompletedTask = this.createSpeechTask({
639
- task: import_utils.Task.from(() => this.userTurnCompleted(info, oldTask)),
682
+ taskFn: () => this.userTurnCompleted(info, oldTask),
640
683
  name: "AgentActivity.userTurnCompleted"
641
684
  });
642
685
  return true;
@@ -666,14 +709,41 @@ class AgentActivity {
666
709
  await speechHandle._waitForGeneration();
667
710
  this._currentSpeech = void 0;
668
711
  }
669
- if (this.draining && this.speechTasks.size === 0) {
670
- this.logger.info("mainTask: draining and no more speech tasks");
712
+ const toWait = this.getDrainPendingSpeechTasks();
713
+ if (this._schedulingPaused && toWait.length === 0) {
714
+ this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
671
715
  break;
672
716
  }
673
717
  this.q_updated = new import_utils.Future();
674
718
  }
675
719
  this.logger.info("AgentActivity mainTask: exiting");
676
720
  }
721
+ getDrainPendingSpeechTasks() {
722
+ const blockedHandles = [];
723
+ for (const task of this._drainBlockedTasks) {
724
+ const info = (0, import_agent._getActivityTaskInfo)(task);
725
+ if (!info) {
726
+ this.logger.error("blocked task without activity info; skipping.");
727
+ continue;
728
+ }
729
+ if (!info.speechHandle) {
730
+ continue;
731
+ }
732
+ blockedHandles.push(info.speechHandle);
733
+ }
734
+ const toWait = [];
735
+ for (const task of this.speechTasks) {
736
+ if (this._drainBlockedTasks.includes(task)) {
737
+ continue;
738
+ }
739
+ const info = (0, import_agent._getActivityTaskInfo)(task);
740
+ if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
741
+ continue;
742
+ }
743
+ toWait.push(task);
744
+ }
745
+ return toWait;
746
+ }
677
747
  wakeupMainTask() {
678
748
  this.q_updated.resolve();
679
749
  }
@@ -699,7 +769,7 @@ class AgentActivity {
699
769
  if (this.llm === void 0) {
700
770
  throw new Error("trying to generate reply without an LLM model");
701
771
  }
702
- const functionCall = (_a = import_agent.asyncLocalStorage.getStore()) == null ? void 0 : _a.functionCall;
772
+ const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
703
773
  if (toolChoice === void 0 && functionCall !== void 0) {
704
774
  toolChoice = "none";
705
775
  }
@@ -717,19 +787,17 @@ class AgentActivity {
717
787
  this.logger.info({ speech_id: handle.id }, "Creating speech handle");
718
788
  if (this.llm instanceof import_llm.RealtimeModel) {
719
789
  this.createSpeechTask({
720
- task: import_utils.Task.from(
721
- (abortController) => this.realtimeReplyTask({
722
- speechHandle: handle,
723
- // TODO(brian): support llm.ChatMessage for the realtime model
724
- userInput: userMessage == null ? void 0 : userMessage.textContent,
725
- instructions,
726
- modelSettings: {
727
- // isGiven(toolChoice) = toolChoice !== undefined
728
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
729
- },
730
- abortController
731
- })
732
- ),
790
+ taskFn: (abortController) => this.realtimeReplyTask({
791
+ speechHandle: handle,
792
+ // TODO(brian): support llm.ChatMessage for the realtime model
793
+ userInput: userMessage == null ? void 0 : userMessage.textContent,
794
+ instructions,
795
+ modelSettings: {
796
+ // isGiven(toolChoice) = toolChoice !== undefined
797
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
798
+ },
799
+ abortController
800
+ }),
733
801
  ownedSpeechHandle: handle,
734
802
  name: "AgentActivity.realtimeReply"
735
803
  });
@@ -739,36 +807,36 @@ class AgentActivity {
739
807
  ${instructions}`;
740
808
  }
741
809
  const task = this.createSpeechTask({
742
- task: import_utils.Task.from(
743
- (abortController) => this.pipelineReplyTask(
744
- handle,
745
- chatCtx ?? this.agent.chatCtx,
746
- this.agent.toolCtx,
747
- {
748
- toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
749
- },
750
- abortController,
751
- instructions,
752
- userMessage
753
- )
810
+ taskFn: (abortController) => this.pipelineReplyTask(
811
+ handle,
812
+ chatCtx ?? this.agent.chatCtx,
813
+ this.agent.toolCtx,
814
+ {
815
+ toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
816
+ },
817
+ abortController,
818
+ instructions,
819
+ userMessage
754
820
  ),
755
821
  ownedSpeechHandle: handle,
756
822
  name: "AgentActivity.pipelineReply"
757
823
  });
758
- task.finally(() => this.onPipelineReplyDone());
824
+ task.result.finally(() => this.onPipelineReplyDone());
759
825
  }
760
826
  if (scheduleSpeech) {
761
827
  this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
762
828
  }
763
829
  return handle;
764
830
  }
765
- interrupt() {
831
+ interrupt(options = {}) {
766
832
  var _a;
833
+ const { force = false } = options;
834
+ this.cancelPreemptiveGeneration();
767
835
  const future = new import_utils.Future();
768
836
  const currentSpeech = this._currentSpeech;
769
- currentSpeech == null ? void 0 : currentSpeech.interrupt();
837
+ currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
770
838
  for (const [_, __, speech] of this.speechQueue) {
771
- speech.interrupt();
839
+ speech.interrupt(force);
772
840
  }
773
841
  (_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
774
842
  if (currentSpeech === void 0) {
@@ -789,7 +857,7 @@ ${instructions}`;
789
857
  async userTurnCompleted(info, oldTask) {
790
858
  var _a, _b;
791
859
  if (oldTask) {
792
- await oldTask;
860
+ await oldTask.result;
793
861
  }
794
862
  if (this.llm instanceof import_llm.RealtimeModel) {
795
863
  if (this.llm.capabilities.turnDetection) {
@@ -871,7 +939,7 @@ ${instructions}`;
871
939
  }
872
940
  async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
873
941
  speechHandle._agentTurnContext = import_api.context.active();
874
- speechHandleStorage.enterWith(speechHandle);
942
+ import_agent.speechHandleStorage.enterWith(speechHandle);
875
943
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
876
944
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
877
945
  await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
@@ -976,7 +1044,7 @@ ${instructions}`;
976
1044
  toolsMessages,
977
1045
  span
978
1046
  }) => {
979
- var _a, _b, _c, _d, _e;
1047
+ var _a, _b;
980
1048
  speechHandle._agentTurnContext = import_api.context.active();
981
1049
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
982
1050
  if (instructions) {
@@ -989,7 +1057,7 @@ ${instructions}`;
989
1057
  if (localParticipant) {
990
1058
  (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
991
1059
  }
992
- speechHandleStorage.enterWith(speechHandle);
1060
+ import_agent.speechHandleStorage.enterWith(speechHandle);
993
1061
  const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
994
1062
  const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
995
1063
  chatCtx = chatCtx.copy();
@@ -1122,11 +1190,11 @@ ${instructions}`;
1122
1190
  for (const msg of toolsMessages) {
1123
1191
  msg.createdAt = replyStartedAt;
1124
1192
  }
1125
- this.agent._chatCtx.insert(toolsMessages);
1126
1193
  const toolCallOutputs = toolsMessages.filter(
1127
1194
  (m) => m.type === "function_call_output"
1128
1195
  );
1129
1196
  if (toolCallOutputs.length > 0) {
1197
+ this.agent._chatCtx.insert(toolCallOutputs);
1130
1198
  this.agentSession._toolItemsAdded(toolCallOutputs);
1131
1199
  }
1132
1200
  }
@@ -1214,45 +1282,15 @@ ${instructions}`;
1214
1282
  );
1215
1283
  return;
1216
1284
  }
1217
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1218
- functionCalls: [],
1219
- functionCallOutputs: []
1220
- });
1221
- let shouldGenerateToolReply = false;
1222
- let newAgentTask = null;
1223
- let ignoreTaskSwitch = false;
1224
- for (const sanitizedOut of toolOutput.output) {
1225
- if (sanitizedOut.toolCallOutput !== void 0) {
1226
- functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1227
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1228
- if (sanitizedOut.replyRequired) {
1229
- shouldGenerateToolReply = true;
1230
- }
1231
- }
1232
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1233
- this.logger.error("expected to receive only one agent task from the tool executions");
1234
- ignoreTaskSwitch = true;
1235
- }
1236
- newAgentTask = sanitizedOut.agentTask ?? null;
1237
- this.logger.debug(
1238
- {
1239
- speechId: speechHandle.id,
1240
- name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
1241
- args: sanitizedOut.toolCall.args,
1242
- output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
1243
- isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
1244
- },
1245
- "Tool call execution finished"
1246
- );
1247
- }
1285
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1248
1286
  this.agentSession.emit(
1249
1287
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1250
1288
  functionToolsExecutedEvent
1251
1289
  );
1252
- let draining = this.draining;
1290
+ let schedulingPaused = this.schedulingPaused;
1253
1291
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1254
1292
  this.agentSession.updateAgent(newAgentTask);
1255
- draining = true;
1293
+ schedulingPaused = true;
1256
1294
  }
1257
1295
  const toolMessages = [
1258
1296
  ...functionToolsExecutedEvent.functionCalls,
@@ -1261,34 +1299,32 @@ ${instructions}`;
1261
1299
  if (shouldGenerateToolReply) {
1262
1300
  chatCtx.insert(toolMessages);
1263
1301
  speechHandle._numSteps += 1;
1264
- const respondToolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1302
+ const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1265
1303
  const toolResponseTask = this.createSpeechTask({
1266
- task: import_utils.Task.from(
1267
- () => this.pipelineReplyTask(
1268
- speechHandle,
1269
- chatCtx,
1270
- toolCtx,
1271
- { toolChoice: respondToolChoice },
1272
- replyAbortController,
1273
- instructions,
1274
- void 0,
1275
- toolMessages
1276
- )
1304
+ taskFn: () => this.pipelineReplyTask(
1305
+ speechHandle,
1306
+ chatCtx,
1307
+ toolCtx,
1308
+ { toolChoice: respondToolChoice },
1309
+ replyAbortController,
1310
+ instructions,
1311
+ void 0,
1312
+ toolMessages
1277
1313
  ),
1278
1314
  ownedSpeechHandle: speechHandle,
1279
1315
  name: "AgentActivity.pipelineReply"
1280
1316
  });
1281
- toolResponseTask.finally(() => this.onPipelineReplyDone());
1317
+ toolResponseTask.result.finally(() => this.onPipelineReplyDone());
1282
1318
  this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1283
1319
  } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1284
1320
  for (const msg of toolMessages) {
1285
1321
  msg.createdAt = replyStartedAt;
1286
1322
  }
1287
- this.agent._chatCtx.insert(toolMessages);
1288
1323
  const toolCallOutputs = toolMessages.filter(
1289
1324
  (m) => m.type === "function_call_output"
1290
1325
  );
1291
1326
  if (toolCallOutputs.length > 0) {
1327
+ this.agent._chatCtx.insert(toolCallOutputs);
1292
1328
  this.agentSession._toolItemsAdded(toolCallOutputs);
1293
1329
  }
1294
1330
  }
@@ -1332,14 +1368,14 @@ ${instructions}`;
1332
1368
  replyAbortController,
1333
1369
  span
1334
1370
  }) {
1335
- var _a, _b, _c, _d;
1371
+ var _a;
1336
1372
  speechHandle._agentTurnContext = import_api.context.active();
1337
1373
  span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
1338
1374
  const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
1339
1375
  if (localParticipant) {
1340
1376
  (0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
1341
1377
  }
1342
- speechHandleStorage.enterWith(speechHandle);
1378
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1343
1379
  if (!this.realtimeSession) {
1344
1380
  throw new Error("realtime session is not initialized");
1345
1381
  }
@@ -1592,44 +1628,15 @@ ${instructions}`;
1592
1628
  );
1593
1629
  return;
1594
1630
  }
1595
- const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1596
- functionCalls: [],
1597
- functionCallOutputs: []
1598
- });
1599
- let shouldGenerateToolReply = false;
1600
- let newAgentTask = null;
1601
- let ignoreTaskSwitch = false;
1602
- for (const sanitizedOut of toolOutput.output) {
1603
- if (sanitizedOut.toolCallOutput !== void 0) {
1604
- functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1605
- if (sanitizedOut.replyRequired) {
1606
- shouldGenerateToolReply = true;
1607
- }
1608
- }
1609
- if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1610
- this.logger.error("expected to receive only one agent task from the tool executions");
1611
- ignoreTaskSwitch = true;
1612
- }
1613
- newAgentTask = sanitizedOut.agentTask ?? null;
1614
- this.logger.debug(
1615
- {
1616
- speechId: speechHandle.id,
1617
- name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
1618
- args: sanitizedOut.toolCall.args,
1619
- output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
1620
- isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
1621
- },
1622
- "Tool call execution finished"
1623
- );
1624
- }
1631
+ const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
1625
1632
  this.agentSession.emit(
1626
1633
  import_events.AgentSessionEventTypes.FunctionToolsExecuted,
1627
1634
  functionToolsExecutedEvent
1628
1635
  );
1629
- let draining = this.draining;
1636
+ let schedulingPaused = this.schedulingPaused;
1630
1637
  if (!ignoreTaskSwitch && newAgentTask !== null) {
1631
1638
  this.agentSession.updateAgent(newAgentTask);
1632
- draining = true;
1639
+ schedulingPaused = true;
1633
1640
  }
1634
1641
  if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
1635
1642
  while (this.currentSpeech || this.speechQueue.size() > 0) {
@@ -1670,20 +1677,58 @@ ${instructions}`;
1670
1677
  speechHandle: replySpeechHandle
1671
1678
  })
1672
1679
  );
1673
- const toolChoice = draining || modelSettings.toolChoice === "none" ? "none" : "auto";
1680
+ const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
1674
1681
  this.createSpeechTask({
1675
- task: import_utils.Task.from(
1676
- (abortController) => this.realtimeReplyTask({
1677
- speechHandle: replySpeechHandle,
1678
- modelSettings: { toolChoice },
1679
- abortController
1680
- })
1681
- ),
1682
+ taskFn: (abortController) => this.realtimeReplyTask({
1683
+ speechHandle: replySpeechHandle,
1684
+ modelSettings: { toolChoice },
1685
+ abortController
1686
+ }),
1682
1687
  ownedSpeechHandle: replySpeechHandle,
1683
1688
  name: "AgentActivity.realtime_reply"
1684
1689
  });
1685
1690
  this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
1686
1691
  }
1692
+ summarizeToolExecutionOutput(toolOutput, speechHandle) {
1693
+ var _a, _b, _c;
1694
+ const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
1695
+ functionCalls: [],
1696
+ functionCallOutputs: []
1697
+ });
1698
+ let shouldGenerateToolReply = false;
1699
+ let newAgentTask = null;
1700
+ let ignoreTaskSwitch = false;
1701
+ for (const sanitizedOut of toolOutput.output) {
1702
+ if (sanitizedOut.toolCallOutput !== void 0) {
1703
+ functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
1704
+ functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
1705
+ if (sanitizedOut.replyRequired) {
1706
+ shouldGenerateToolReply = true;
1707
+ }
1708
+ }
1709
+ if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
1710
+ this.logger.error("expected to receive only one agent task from the tool executions");
1711
+ ignoreTaskSwitch = true;
1712
+ }
1713
+ newAgentTask = sanitizedOut.agentTask ?? null;
1714
+ this.logger.debug(
1715
+ {
1716
+ speechId: speechHandle.id,
1717
+ name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
1718
+ args: sanitizedOut.toolCall.args,
1719
+ output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
1720
+ isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
1721
+ },
1722
+ "Tool call execution finished"
1723
+ );
1724
+ }
1725
+ return {
1726
+ functionToolsExecutedEvent,
1727
+ shouldGenerateToolReply,
1728
+ newAgentTask,
1729
+ ignoreTaskSwitch
1730
+ };
1731
+ }
1687
1732
  async realtimeReplyTask({
1688
1733
  speechHandle,
1689
1734
  modelSettings: { toolChoice },
@@ -1691,7 +1736,7 @@ ${instructions}`;
1691
1736
  instructions,
1692
1737
  abortController
1693
1738
  }) {
1694
- speechHandleStorage.enterWith(speechHandle);
1739
+ import_agent.speechHandleStorage.enterWith(speechHandle);
1695
1740
  if (!this.realtimeSession) {
1696
1741
  throw new Error("realtime session is not available");
1697
1742
  }
@@ -1725,13 +1770,45 @@ ${instructions}`;
1725
1770
  }
1726
1771
  }
1727
1772
  scheduleSpeech(speechHandle, priority, force = false) {
1728
- if (this.draining && !force) {
1729
- throw new Error("cannot schedule new speech, the agent is draining");
1773
+ if (this.schedulingPaused && !force) {
1774
+ throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
1730
1775
  }
1731
1776
  this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
1732
1777
  speechHandle._markScheduled();
1733
1778
  this.wakeupMainTask();
1734
1779
  }
1780
+ async _pauseSchedulingTask(blockedTasks) {
1781
+ if (this._schedulingPaused) return;
1782
+ this._schedulingPaused = true;
1783
+ this._drainBlockedTasks = blockedTasks;
1784
+ this.wakeupMainTask();
1785
+ if (this._mainTask) {
1786
+ await this._mainTask.result;
1787
+ }
1788
+ }
1789
+ _resumeSchedulingTask() {
1790
+ if (!this._schedulingPaused) return;
1791
+ this._schedulingPaused = false;
1792
+ this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
1793
+ }
1794
+ async pause(options = {}) {
1795
+ const { blockedTasks = [] } = options;
1796
+ const unlock = await this.lock.lock();
1797
+ try {
1798
+ const span = import_telemetry.tracer.startSpan({
1799
+ name: "pause_agent_activity",
1800
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1801
+ });
1802
+ try {
1803
+ await this._pauseSchedulingTask(blockedTasks);
1804
+ await this._closeSessionResources();
1805
+ } finally {
1806
+ span.end();
1807
+ }
1808
+ } finally {
1809
+ unlock();
1810
+ }
1811
+ }
1735
1812
  async drain() {
1736
1813
  return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
1737
1814
  name: "drain_agent_activity",
@@ -1739,72 +1816,80 @@ ${instructions}`;
1739
1816
  });
1740
1817
  }
1741
1818
  async _drainImpl(span) {
1742
- var _a;
1743
1819
  span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
1744
1820
  const unlock = await this.lock.lock();
1745
1821
  try {
1746
- if (this._draining) return;
1747
- this.cancelPreemptiveGeneration();
1748
- const onExitTask = import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
1749
- name: "on_exit",
1750
- attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1751
- });
1752
- this.createSpeechTask({
1753
- task: import_utils.Task.from(() => onExitTask),
1822
+ if (this._schedulingPaused) return;
1823
+ this._onExitTask = this.createSpeechTask({
1824
+ taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
1825
+ name: "on_exit",
1826
+ attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
1827
+ }),
1828
+ inlineTask: true,
1754
1829
  name: "AgentActivity_onExit"
1755
1830
  });
1756
- this.wakeupMainTask();
1757
- this._draining = true;
1758
- await ((_a = this._mainTask) == null ? void 0 : _a.result);
1831
+ this.cancelPreemptiveGeneration();
1832
+ await this._onExitTask.result;
1833
+ await this._pauseSchedulingTask([]);
1759
1834
  } finally {
1760
1835
  unlock();
1761
1836
  }
1762
1837
  }
1763
1838
  async close() {
1764
- var _a, _b, _c, _d;
1765
1839
  const unlock = await this.lock.lock();
1766
1840
  try {
1767
- if (!this._draining) {
1768
- this.logger.warn("task closing without draining");
1769
- }
1770
1841
  this.cancelPreemptiveGeneration();
1771
- if (this.llm instanceof import_llm.LLM) {
1772
- this.llm.off("metrics_collected", this.onMetricsCollected);
1773
- }
1774
- if (this.realtimeSession) {
1775
- this.realtimeSession.off("generation_created", this.onGenerationCreated);
1776
- this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
1777
- this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
1778
- this.realtimeSession.off(
1779
- "input_audio_transcription_completed",
1780
- this.onInputAudioTranscriptionCompleted
1781
- );
1782
- this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1783
- }
1784
- if (this.stt instanceof import_stt.STT) {
1785
- this.stt.off("metrics_collected", this.onMetricsCollected);
1842
+ await this._closeSessionResources();
1843
+ if (this._mainTask) {
1844
+ await this._mainTask.cancelAndWait();
1786
1845
  }
1787
- if (this.tts instanceof import_tts.TTS) {
1788
- this.tts.off("metrics_collected", this.onMetricsCollected);
1789
- }
1790
- if (this.vad instanceof import_vad.VAD) {
1791
- this.vad.off("metrics_collected", this.onMetricsCollected);
1792
- }
1793
- this.detachAudioInput();
1794
- (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1795
- await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1796
- await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1797
- await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
1846
+ this.agent._agentActivity = void 0;
1798
1847
  } finally {
1799
1848
  unlock();
1800
1849
  }
1801
1850
  }
1851
+ async _closeSessionResources() {
1852
+ var _a, _b, _c;
1853
+ if (this.llm instanceof import_llm.LLM) {
1854
+ this.llm.off("metrics_collected", this.onMetricsCollected);
1855
+ this.llm.off("error", this.onModelError);
1856
+ }
1857
+ if (this.realtimeSession) {
1858
+ this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
1859
+ this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
1860
+ this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
1861
+ this.realtimeSession.off(
1862
+ "input_audio_transcription_completed",
1863
+ this.onRealtimeInputAudioTranscriptionCompleted
1864
+ );
1865
+ this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
1866
+ this.realtimeSession.off("error", this.onModelError);
1867
+ }
1868
+ if (this.stt instanceof import_stt.STT) {
1869
+ this.stt.off("metrics_collected", this.onMetricsCollected);
1870
+ this.stt.off("error", this.onModelError);
1871
+ }
1872
+ if (this.tts instanceof import_tts.TTS) {
1873
+ this.tts.off("metrics_collected", this.onMetricsCollected);
1874
+ this.tts.off("error", this.onModelError);
1875
+ }
1876
+ if (this.vad instanceof import_vad.VAD) {
1877
+ this.vad.off("metrics_collected", this.onMetricsCollected);
1878
+ }
1879
+ this.detachAudioInput();
1880
+ (_a = this.realtimeSpans) == null ? void 0 : _a.clear();
1881
+ await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
1882
+ await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
1883
+ this.realtimeSession = void 0;
1884
+ this.audioRecognition = void 0;
1885
+ }
1802
1886
  }
1803
1887
  function toOaiToolChoice(toolChoice) {
1804
1888
  return toolChoice !== null ? toolChoice : void 0;
1805
1889
  }
1806
1890
  // Annotate the CommonJS export names for ESM import in node:
1807
1891
  0 && (module.exports = {
1808
- AgentActivity
1892
+ AgentActivity,
1893
+ agentActivityStorage
1809
1894
  });
1810
1895
  //# sourceMappingURL=agent_activity.cjs.map