@livekit/agents 1.0.46 → 1.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +14 -20
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +14 -20
- package/dist/cli.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +14 -5
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +14 -5
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/llm/chat_context.cjs +19 -0
- package/dist/llm/chat_context.cjs.map +1 -1
- package/dist/llm/chat_context.d.cts +4 -0
- package/dist/llm/chat_context.d.ts +4 -0
- package/dist/llm/chat_context.d.ts.map +1 -1
- package/dist/llm/chat_context.js +19 -0
- package/dist/llm/chat_context.js.map +1 -1
- package/dist/llm/provider_format/index.cjs +2 -0
- package/dist/llm/provider_format/index.cjs.map +1 -1
- package/dist/llm/provider_format/index.d.cts +1 -1
- package/dist/llm/provider_format/index.d.ts +1 -1
- package/dist/llm/provider_format/index.d.ts.map +1 -1
- package/dist/llm/provider_format/index.js +6 -1
- package/dist/llm/provider_format/index.js.map +1 -1
- package/dist/llm/provider_format/openai.cjs +82 -2
- package/dist/llm/provider_format/openai.cjs.map +1 -1
- package/dist/llm/provider_format/openai.d.cts +1 -0
- package/dist/llm/provider_format/openai.d.ts +1 -0
- package/dist/llm/provider_format/openai.d.ts.map +1 -1
- package/dist/llm/provider_format/openai.js +80 -1
- package/dist/llm/provider_format/openai.js.map +1 -1
- package/dist/llm/provider_format/openai.test.cjs +326 -0
- package/dist/llm/provider_format/openai.test.cjs.map +1 -1
- package/dist/llm/provider_format/openai.test.js +327 -1
- package/dist/llm/provider_format/openai.test.js.map +1 -1
- package/dist/llm/provider_format/utils.cjs +4 -3
- package/dist/llm/provider_format/utils.cjs.map +1 -1
- package/dist/llm/provider_format/utils.d.ts.map +1 -1
- package/dist/llm/provider_format/utils.js +4 -3
- package/dist/llm/provider_format/utils.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +1 -0
- package/dist/llm/realtime.d.ts +1 -0
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/log.cjs +5 -2
- package/dist/log.cjs.map +1 -1
- package/dist/log.d.ts.map +1 -1
- package/dist/log.js +5 -2
- package/dist/log.js.map +1 -1
- package/dist/stream/deferred_stream.cjs +15 -6
- package/dist/stream/deferred_stream.cjs.map +1 -1
- package/dist/stream/deferred_stream.d.ts.map +1 -1
- package/dist/stream/deferred_stream.js +15 -6
- package/dist/stream/deferred_stream.js.map +1 -1
- package/dist/utils.cjs +31 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +7 -0
- package/dist/utils.d.ts +7 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +31 -2
- package/dist/utils.js.map +1 -1
- package/dist/utils.test.cjs +71 -0
- package/dist/utils.test.cjs.map +1 -1
- package/dist/utils.test.js +71 -0
- package/dist/utils.test.js.map +1 -1
- package/dist/version.cjs +1 -1
- package/dist/version.cjs.map +1 -1
- package/dist/version.d.cts +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.d.ts.map +1 -1
- package/dist/version.js +1 -1
- package/dist/version.js.map +1 -1
- package/dist/voice/agent.cjs +144 -12
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +29 -4
- package/dist/voice/agent.d.ts +29 -4
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +140 -11
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent.test.cjs +120 -0
- package/dist/voice/agent.test.cjs.map +1 -1
- package/dist/voice/agent.test.js +122 -2
- package/dist/voice/agent.test.js.map +1 -1
- package/dist/voice/agent_activity.cjs +383 -298
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +34 -7
- package/dist/voice/agent_activity.d.ts +34 -7
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +383 -293
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +140 -40
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +19 -7
- package/dist/voice/agent_session.d.ts +19 -7
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +137 -37
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +4 -0
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +4 -0
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/generation.cjs +39 -19
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +44 -20
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs +2 -0
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js +2 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/speech_handle.cjs +7 -1
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +2 -0
- package/dist/voice/speech_handle.d.ts +2 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +8 -2
- package/dist/voice/speech_handle.js.map +1 -1
- package/dist/voice/testing/run_result.cjs +66 -15
- package/dist/voice/testing/run_result.cjs.map +1 -1
- package/dist/voice/testing/run_result.d.cts +14 -3
- package/dist/voice/testing/run_result.d.ts +14 -3
- package/dist/voice/testing/run_result.d.ts.map +1 -1
- package/dist/voice/testing/run_result.js +66 -15
- package/dist/voice/testing/run_result.js.map +1 -1
- package/package.json +1 -1
- package/src/cli.ts +20 -33
- package/src/ipc/job_proc_lazy_main.ts +16 -5
- package/src/llm/chat_context.ts +35 -0
- package/src/llm/provider_format/index.ts +7 -2
- package/src/llm/provider_format/openai.test.ts +385 -1
- package/src/llm/provider_format/openai.ts +103 -0
- package/src/llm/provider_format/utils.ts +6 -4
- package/src/llm/realtime.ts +1 -0
- package/src/log.ts +5 -2
- package/src/stream/deferred_stream.ts +17 -6
- package/src/utils.test.ts +87 -0
- package/src/utils.ts +36 -2
- package/src/version.ts +1 -1
- package/src/voice/agent.test.ts +140 -2
- package/src/voice/agent.ts +189 -10
- package/src/voice/agent_activity.ts +427 -289
- package/src/voice/agent_session.ts +178 -40
- package/src/voice/audio_recognition.ts +4 -0
- package/src/voice/generation.ts +52 -23
- package/src/voice/index.ts +1 -1
- package/src/voice/speech_handle.ts +9 -2
- package/src/voice/testing/run_result.ts +81 -23
|
@@ -18,7 +18,8 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
18
18
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
19
|
var agent_activity_exports = {};
|
|
20
20
|
__export(agent_activity_exports, {
|
|
21
|
-
AgentActivity: () => AgentActivity
|
|
21
|
+
AgentActivity: () => AgentActivity,
|
|
22
|
+
agentActivityStorage: () => agentActivityStorage
|
|
22
23
|
});
|
|
23
24
|
module.exports = __toCommonJS(agent_activity_exports);
|
|
24
25
|
var import_mutex = require("@livekit/mutex");
|
|
@@ -30,7 +31,7 @@ var import_chat_context = require("../llm/chat_context.cjs");
|
|
|
30
31
|
var import_llm = require("../llm/index.cjs");
|
|
31
32
|
var import_tool_context = require("../llm/tool_context.cjs");
|
|
32
33
|
var import_log = require("../log.cjs");
|
|
33
|
-
var
|
|
34
|
+
var import_multi_input_stream = require("../stream/multi_input_stream.cjs");
|
|
34
35
|
var import_stt = require("../stt/stt.cjs");
|
|
35
36
|
var import_telemetry = require("../telemetry/index.cjs");
|
|
36
37
|
var import_word = require("../tokenize/basic/word.cjs");
|
|
@@ -44,8 +45,10 @@ var import_events = require("./events.cjs");
|
|
|
44
45
|
var import_generation = require("./generation.cjs");
|
|
45
46
|
var import_speech_handle = require("./speech_handle.cjs");
|
|
46
47
|
var import_utils2 = require("./utils.cjs");
|
|
47
|
-
const
|
|
48
|
+
const agentActivityStorage = new import_node_async_hooks.AsyncLocalStorage();
|
|
48
49
|
class AgentActivity {
|
|
50
|
+
agent;
|
|
51
|
+
agentSession;
|
|
49
52
|
static REPLY_TASK_CANCEL_TIMEOUT = 5e3;
|
|
50
53
|
started = false;
|
|
51
54
|
audioRecognition;
|
|
@@ -54,22 +57,29 @@ class AgentActivity {
|
|
|
54
57
|
// Maps response_id to OTEL span for metrics recording
|
|
55
58
|
turnDetectionMode;
|
|
56
59
|
logger = (0, import_log.log)();
|
|
57
|
-
|
|
60
|
+
_schedulingPaused = true;
|
|
61
|
+
_drainBlockedTasks = [];
|
|
58
62
|
_currentSpeech;
|
|
59
63
|
speechQueue;
|
|
60
64
|
// [priority, timestamp, speechHandle]
|
|
61
65
|
q_updated;
|
|
62
66
|
speechTasks = /* @__PURE__ */ new Set();
|
|
63
67
|
lock = new import_mutex.Mutex();
|
|
64
|
-
audioStream = new
|
|
68
|
+
audioStream = new import_multi_input_stream.MultiInputStream();
|
|
69
|
+
audioStreamId;
|
|
65
70
|
// default to null as None, which maps to the default provider tool choice value
|
|
66
71
|
toolChoice = null;
|
|
67
72
|
_preemptiveGeneration;
|
|
68
|
-
agent;
|
|
69
|
-
agentSession;
|
|
70
73
|
/** @internal */
|
|
71
74
|
_mainTask;
|
|
75
|
+
_onEnterTask;
|
|
76
|
+
_onExitTask;
|
|
72
77
|
_userTurnCompletedTask;
|
|
78
|
+
onRealtimeGenerationCreated = (ev) => this.onGenerationCreated(ev);
|
|
79
|
+
onRealtimeInputSpeechStarted = (ev) => this.onInputSpeechStarted(ev);
|
|
80
|
+
onRealtimeInputSpeechStopped = (ev) => this.onInputSpeechStopped(ev);
|
|
81
|
+
onRealtimeInputAudioTranscriptionCompleted = (ev) => this.onInputAudioTranscriptionCompleted(ev);
|
|
82
|
+
onModelError = (ev) => this.onError(ev);
|
|
73
83
|
constructor(agent, agentSession) {
|
|
74
84
|
this.agent = agent;
|
|
75
85
|
this.agentSession = agentSession;
|
|
@@ -80,7 +90,7 @@ class AgentActivity {
|
|
|
80
90
|
this.turnDetectionMode = typeof this.turnDetection === "string" ? this.turnDetection : void 0;
|
|
81
91
|
if (this.turnDetectionMode === "vad" && this.vad === void 0) {
|
|
82
92
|
this.logger.warn(
|
|
83
|
-
'turnDetection is set to "vad", but no VAD model is provided, ignoring the
|
|
93
|
+
'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting'
|
|
84
94
|
);
|
|
85
95
|
this.turnDetectionMode = void 0;
|
|
86
96
|
}
|
|
@@ -130,107 +140,121 @@ class AgentActivity {
|
|
|
130
140
|
}
|
|
131
141
|
}
|
|
132
142
|
async start() {
|
|
133
|
-
var _a;
|
|
134
143
|
const unlock = await this.lock.lock();
|
|
135
144
|
try {
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
} else if (this.llm instanceof import_llm.LLM) {
|
|
176
|
-
try {
|
|
177
|
-
(0, import_generation.updateInstructions)({
|
|
178
|
-
chatCtx: this.agent._chatCtx,
|
|
179
|
-
instructions: this.agent.instructions,
|
|
180
|
-
addIfMissing: true
|
|
181
|
-
});
|
|
182
|
-
} catch (error) {
|
|
183
|
-
this.logger.error("failed to update the instructions", error);
|
|
184
|
-
}
|
|
145
|
+
await this._startSession({ spanName: "start_agent_activity", runOnEnter: true });
|
|
146
|
+
} finally {
|
|
147
|
+
unlock();
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
async resume() {
|
|
151
|
+
const unlock = await this.lock.lock();
|
|
152
|
+
try {
|
|
153
|
+
await this._startSession({ spanName: "resume_agent_activity", runOnEnter: false });
|
|
154
|
+
} finally {
|
|
155
|
+
unlock();
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
async _startSession(options) {
|
|
159
|
+
var _a;
|
|
160
|
+
const { spanName, runOnEnter } = options;
|
|
161
|
+
const startSpan = import_telemetry.tracer.startSpan({
|
|
162
|
+
name: spanName,
|
|
163
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id },
|
|
164
|
+
context: import_api.ROOT_CONTEXT
|
|
165
|
+
});
|
|
166
|
+
this.agent._agentActivity = this;
|
|
167
|
+
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
168
|
+
this.realtimeSession = this.llm.session();
|
|
169
|
+
this.realtimeSpans = /* @__PURE__ */ new Map();
|
|
170
|
+
this.realtimeSession.on("generation_created", this.onRealtimeGenerationCreated);
|
|
171
|
+
this.realtimeSession.on("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
172
|
+
this.realtimeSession.on("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
173
|
+
this.realtimeSession.on(
|
|
174
|
+
"input_audio_transcription_completed",
|
|
175
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
176
|
+
);
|
|
177
|
+
this.realtimeSession.on("metrics_collected", this.onMetricsCollected);
|
|
178
|
+
this.realtimeSession.on("error", this.onModelError);
|
|
179
|
+
(0, import_generation.removeInstructions)(this.agent._chatCtx);
|
|
180
|
+
try {
|
|
181
|
+
await this.realtimeSession.updateInstructions(this.agent.instructions);
|
|
182
|
+
} catch (error) {
|
|
183
|
+
this.logger.error(error, "failed to update the instructions");
|
|
185
184
|
}
|
|
186
|
-
|
|
187
|
-
this.
|
|
188
|
-
|
|
185
|
+
try {
|
|
186
|
+
await this.realtimeSession.updateChatCtx(this.agent.chatCtx);
|
|
187
|
+
} catch (error) {
|
|
188
|
+
this.logger.error(error, "failed to update the chat context");
|
|
189
189
|
}
|
|
190
|
-
|
|
191
|
-
this.
|
|
192
|
-
|
|
190
|
+
try {
|
|
191
|
+
await this.realtimeSession.updateTools(this.tools);
|
|
192
|
+
} catch (error) {
|
|
193
|
+
this.logger.error(error, "failed to update the tools");
|
|
193
194
|
}
|
|
194
|
-
if (this.tts
|
|
195
|
-
this.
|
|
196
|
-
|
|
195
|
+
if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) {
|
|
196
|
+
this.logger.error(
|
|
197
|
+
"audio output is enabled but RealtimeModel has no audio modality and no TTS is set. Either enable audio modality in the RealtimeModel or set a TTS model."
|
|
198
|
+
);
|
|
197
199
|
}
|
|
198
|
-
|
|
199
|
-
|
|
200
|
+
} else if (this.llm instanceof import_llm.LLM) {
|
|
201
|
+
try {
|
|
202
|
+
(0, import_generation.updateInstructions)({
|
|
203
|
+
chatCtx: this.agent._chatCtx,
|
|
204
|
+
instructions: this.agent.instructions,
|
|
205
|
+
addIfMissing: true
|
|
206
|
+
});
|
|
207
|
+
} catch (error) {
|
|
208
|
+
this.logger.error("failed to update the instructions", error);
|
|
200
209
|
}
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
this
|
|
219
|
-
|
|
220
|
-
this.
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
this.
|
|
227
|
-
|
|
210
|
+
}
|
|
211
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
212
|
+
this.llm.on("metrics_collected", this.onMetricsCollected);
|
|
213
|
+
this.llm.on("error", this.onModelError);
|
|
214
|
+
}
|
|
215
|
+
if (this.stt instanceof import_stt.STT) {
|
|
216
|
+
this.stt.on("metrics_collected", this.onMetricsCollected);
|
|
217
|
+
this.stt.on("error", this.onModelError);
|
|
218
|
+
}
|
|
219
|
+
if (this.tts instanceof import_tts.TTS) {
|
|
220
|
+
this.tts.on("metrics_collected", this.onMetricsCollected);
|
|
221
|
+
this.tts.on("error", this.onModelError);
|
|
222
|
+
}
|
|
223
|
+
if (this.vad instanceof import_vad.VAD) {
|
|
224
|
+
this.vad.on("metrics_collected", this.onMetricsCollected);
|
|
225
|
+
}
|
|
226
|
+
this.audioRecognition = new import_audio_recognition.AudioRecognition({
|
|
227
|
+
recognitionHooks: this,
|
|
228
|
+
// Disable stt node if stt is not provided
|
|
229
|
+
stt: this.stt ? (...args) => this.agent.sttNode(...args) : void 0,
|
|
230
|
+
vad: this.vad,
|
|
231
|
+
turnDetector: typeof this.turnDetection === "string" ? void 0 : this.turnDetection,
|
|
232
|
+
turnDetectionMode: this.turnDetectionMode,
|
|
233
|
+
minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
|
|
234
|
+
maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay,
|
|
235
|
+
rootSpanContext: this.agentSession.rootSpanContext,
|
|
236
|
+
sttModel: (_a = this.stt) == null ? void 0 : _a.label,
|
|
237
|
+
sttProvider: this.getSttProvider(),
|
|
238
|
+
getLinkedParticipant: () => {
|
|
239
|
+
var _a2;
|
|
240
|
+
return (_a2 = this.agentSession._roomIO) == null ? void 0 : _a2.linkedParticipant;
|
|
241
|
+
}
|
|
242
|
+
});
|
|
243
|
+
this.audioRecognition.start();
|
|
244
|
+
this.started = true;
|
|
245
|
+
this._resumeSchedulingTask();
|
|
246
|
+
if (runOnEnter) {
|
|
247
|
+
this._onEnterTask = this.createSpeechTask({
|
|
248
|
+
taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onEnter(), {
|
|
249
|
+
name: "on_enter",
|
|
250
|
+
context: import_api.trace.setSpan(import_api.ROOT_CONTEXT, startSpan),
|
|
251
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
252
|
+
}),
|
|
253
|
+
inlineTask: true,
|
|
228
254
|
name: "AgentActivity_onEnter"
|
|
229
255
|
});
|
|
230
|
-
startSpan.end();
|
|
231
|
-
} finally {
|
|
232
|
-
unlock();
|
|
233
256
|
}
|
|
257
|
+
startSpan.end();
|
|
234
258
|
}
|
|
235
259
|
get currentSpeech() {
|
|
236
260
|
return this._currentSpeech;
|
|
@@ -259,8 +283,8 @@ class AgentActivity {
|
|
|
259
283
|
get tools() {
|
|
260
284
|
return this.agent.toolCtx;
|
|
261
285
|
}
|
|
262
|
-
get
|
|
263
|
-
return this.
|
|
286
|
+
get schedulingPaused() {
|
|
287
|
+
return this._schedulingPaused;
|
|
264
288
|
}
|
|
265
289
|
get realtimeLLMSession() {
|
|
266
290
|
return this.realtimeSession;
|
|
@@ -300,11 +324,9 @@ class AgentActivity {
|
|
|
300
324
|
}
|
|
301
325
|
}
|
|
302
326
|
attachAudioInput(audioStream) {
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
}
|
|
307
|
-
this.audioStream.setSource(audioStream);
|
|
327
|
+
void this.audioStream.close();
|
|
328
|
+
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
329
|
+
this.audioStreamId = this.audioStream.addInputStream(audioStream);
|
|
308
330
|
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
|
|
309
331
|
if (this.realtimeSession) {
|
|
310
332
|
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
|
|
@@ -314,13 +336,21 @@ class AgentActivity {
|
|
|
314
336
|
}
|
|
315
337
|
}
|
|
316
338
|
detachAudioInput() {
|
|
317
|
-
this.
|
|
339
|
+
if (this.audioStreamId === void 0) {
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
void this.audioStream.close();
|
|
343
|
+
this.audioStream = new import_multi_input_stream.MultiInputStream();
|
|
344
|
+
this.audioStreamId = void 0;
|
|
318
345
|
}
|
|
319
|
-
commitUserTurn() {
|
|
346
|
+
commitUserTurn(options = {}) {
|
|
347
|
+
const { audioDetached = false, throwIfNotReady = true } = options;
|
|
320
348
|
if (!this.audioRecognition) {
|
|
321
|
-
|
|
349
|
+
if (throwIfNotReady) {
|
|
350
|
+
throw new Error("AudioRecognition is not initialized");
|
|
351
|
+
}
|
|
352
|
+
return;
|
|
322
353
|
}
|
|
323
|
-
const audioDetached = false;
|
|
324
354
|
this.audioRecognition.commitUserTurn(audioDetached);
|
|
325
355
|
}
|
|
326
356
|
clearUserTurn() {
|
|
@@ -356,19 +386,17 @@ class AgentActivity {
|
|
|
356
386
|
})
|
|
357
387
|
);
|
|
358
388
|
const task = this.createSpeechTask({
|
|
359
|
-
|
|
360
|
-
(abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio)
|
|
361
|
-
),
|
|
389
|
+
taskFn: (abortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio),
|
|
362
390
|
ownedSpeechHandle: handle,
|
|
363
391
|
name: "AgentActivity.say_tts"
|
|
364
392
|
});
|
|
365
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
393
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
366
394
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
367
395
|
return handle;
|
|
368
396
|
}
|
|
369
397
|
// -- Metrics and errors --
|
|
370
398
|
onMetricsCollected = (ev) => {
|
|
371
|
-
const speechHandle = speechHandleStorage.getStore();
|
|
399
|
+
const speechHandle = import_agent.speechHandleStorage.getStore();
|
|
372
400
|
if (speechHandle && (ev.type === "llm_metrics" || ev.type === "tts_metrics")) {
|
|
373
401
|
ev.speechId = speechHandle.id;
|
|
374
402
|
}
|
|
@@ -452,8 +480,8 @@ class AgentActivity {
|
|
|
452
480
|
if (ev.userInitiated) {
|
|
453
481
|
return;
|
|
454
482
|
}
|
|
455
|
-
if (this.
|
|
456
|
-
this.logger.warn("skipping new realtime generation, the
|
|
483
|
+
if (this.schedulingPaused) {
|
|
484
|
+
this.logger.warn("skipping new realtime generation, the speech scheduling is not running");
|
|
457
485
|
return;
|
|
458
486
|
}
|
|
459
487
|
const handle = import_speech_handle.SpeechHandle.create({
|
|
@@ -469,9 +497,7 @@ class AgentActivity {
|
|
|
469
497
|
);
|
|
470
498
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
471
499
|
this.createSpeechTask({
|
|
472
|
-
|
|
473
|
-
(abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController)
|
|
474
|
-
),
|
|
500
|
+
taskFn: (abortController) => this.realtimeGenerationTask(handle, ev, {}, abortController),
|
|
475
501
|
ownedSpeechHandle: handle,
|
|
476
502
|
name: "AgentActivity.realtimeGeneration"
|
|
477
503
|
});
|
|
@@ -558,7 +584,7 @@ class AgentActivity {
|
|
|
558
584
|
}
|
|
559
585
|
}
|
|
560
586
|
onPreemptiveGeneration(info) {
|
|
561
|
-
if (!this.agentSession.options.preemptiveGeneration || this.
|
|
587
|
+
if (!this.agentSession.options.preemptiveGeneration || this.schedulingPaused || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof import_llm.LLM)) {
|
|
562
588
|
return;
|
|
563
589
|
}
|
|
564
590
|
this.cancelPreemptiveGeneration();
|
|
@@ -596,7 +622,21 @@ class AgentActivity {
|
|
|
596
622
|
}
|
|
597
623
|
}
|
|
598
624
|
createSpeechTask(options) {
|
|
599
|
-
const {
|
|
625
|
+
const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options;
|
|
626
|
+
const wrappedFn = (ctrl) => {
|
|
627
|
+
return agentActivityStorage.run(this, () => {
|
|
628
|
+
const currentTask = import_utils.Task.current();
|
|
629
|
+
if (currentTask) {
|
|
630
|
+
(0, import_agent._setActivityTaskInfo)(currentTask, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
631
|
+
}
|
|
632
|
+
if (ownedSpeechHandle) {
|
|
633
|
+
return import_agent.speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl));
|
|
634
|
+
}
|
|
635
|
+
return taskFn(ctrl);
|
|
636
|
+
});
|
|
637
|
+
};
|
|
638
|
+
const task = import_utils.Task.from(wrappedFn, controller, name);
|
|
639
|
+
(0, import_agent._setActivityTaskInfo)(task, { speechHandle: ownedSpeechHandle, inlineTask });
|
|
600
640
|
this.speechTasks.add(task);
|
|
601
641
|
task.addDoneCallback(() => {
|
|
602
642
|
this.speechTasks.delete(task);
|
|
@@ -612,12 +652,15 @@ class AgentActivity {
|
|
|
612
652
|
task.addDoneCallback(() => {
|
|
613
653
|
this.wakeupMainTask();
|
|
614
654
|
});
|
|
615
|
-
return task
|
|
655
|
+
return task;
|
|
616
656
|
}
|
|
617
657
|
async onEndOfTurn(info) {
|
|
618
|
-
if (this.
|
|
658
|
+
if (this.schedulingPaused) {
|
|
619
659
|
this.cancelPreemptiveGeneration();
|
|
620
|
-
this.logger.warn(
|
|
660
|
+
this.logger.warn(
|
|
661
|
+
{ user_input: info.newTranscript },
|
|
662
|
+
"skipping user input, speech scheduling is paused"
|
|
663
|
+
);
|
|
621
664
|
return true;
|
|
622
665
|
}
|
|
623
666
|
if (this.stt && this.turnDetection !== "manual" && this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && this.agentSession.options.minInterruptionWords > 0) {
|
|
@@ -636,7 +679,7 @@ class AgentActivity {
|
|
|
636
679
|
}
|
|
637
680
|
const oldTask = this._userTurnCompletedTask;
|
|
638
681
|
this._userTurnCompletedTask = this.createSpeechTask({
|
|
639
|
-
|
|
682
|
+
taskFn: () => this.userTurnCompleted(info, oldTask),
|
|
640
683
|
name: "AgentActivity.userTurnCompleted"
|
|
641
684
|
});
|
|
642
685
|
return true;
|
|
@@ -666,14 +709,41 @@ class AgentActivity {
|
|
|
666
709
|
await speechHandle._waitForGeneration();
|
|
667
710
|
this._currentSpeech = void 0;
|
|
668
711
|
}
|
|
669
|
-
|
|
670
|
-
|
|
712
|
+
const toWait = this.getDrainPendingSpeechTasks();
|
|
713
|
+
if (this._schedulingPaused && toWait.length === 0) {
|
|
714
|
+
this.logger.info("mainTask: scheduling paused and no more speech tasks to wait");
|
|
671
715
|
break;
|
|
672
716
|
}
|
|
673
717
|
this.q_updated = new import_utils.Future();
|
|
674
718
|
}
|
|
675
719
|
this.logger.info("AgentActivity mainTask: exiting");
|
|
676
720
|
}
|
|
721
|
+
getDrainPendingSpeechTasks() {
|
|
722
|
+
const blockedHandles = [];
|
|
723
|
+
for (const task of this._drainBlockedTasks) {
|
|
724
|
+
const info = (0, import_agent._getActivityTaskInfo)(task);
|
|
725
|
+
if (!info) {
|
|
726
|
+
this.logger.error("blocked task without activity info; skipping.");
|
|
727
|
+
continue;
|
|
728
|
+
}
|
|
729
|
+
if (!info.speechHandle) {
|
|
730
|
+
continue;
|
|
731
|
+
}
|
|
732
|
+
blockedHandles.push(info.speechHandle);
|
|
733
|
+
}
|
|
734
|
+
const toWait = [];
|
|
735
|
+
for (const task of this.speechTasks) {
|
|
736
|
+
if (this._drainBlockedTasks.includes(task)) {
|
|
737
|
+
continue;
|
|
738
|
+
}
|
|
739
|
+
const info = (0, import_agent._getActivityTaskInfo)(task);
|
|
740
|
+
if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) {
|
|
741
|
+
continue;
|
|
742
|
+
}
|
|
743
|
+
toWait.push(task);
|
|
744
|
+
}
|
|
745
|
+
return toWait;
|
|
746
|
+
}
|
|
677
747
|
wakeupMainTask() {
|
|
678
748
|
this.q_updated.resolve();
|
|
679
749
|
}
|
|
@@ -699,7 +769,7 @@ class AgentActivity {
|
|
|
699
769
|
if (this.llm === void 0) {
|
|
700
770
|
throw new Error("trying to generate reply without an LLM model");
|
|
701
771
|
}
|
|
702
|
-
const functionCall = (_a = import_agent.
|
|
772
|
+
const functionCall = (_a = import_agent.functionCallStorage.getStore()) == null ? void 0 : _a.functionCall;
|
|
703
773
|
if (toolChoice === void 0 && functionCall !== void 0) {
|
|
704
774
|
toolChoice = "none";
|
|
705
775
|
}
|
|
@@ -717,19 +787,17 @@ class AgentActivity {
|
|
|
717
787
|
this.logger.info({ speech_id: handle.id }, "Creating speech handle");
|
|
718
788
|
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
719
789
|
this.createSpeechTask({
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
})
|
|
732
|
-
),
|
|
790
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
791
|
+
speechHandle: handle,
|
|
792
|
+
// TODO(brian): support llm.ChatMessage for the realtime model
|
|
793
|
+
userInput: userMessage == null ? void 0 : userMessage.textContent,
|
|
794
|
+
instructions,
|
|
795
|
+
modelSettings: {
|
|
796
|
+
// isGiven(toolChoice) = toolChoice !== undefined
|
|
797
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
798
|
+
},
|
|
799
|
+
abortController
|
|
800
|
+
}),
|
|
733
801
|
ownedSpeechHandle: handle,
|
|
734
802
|
name: "AgentActivity.realtimeReply"
|
|
735
803
|
});
|
|
@@ -739,36 +807,36 @@ class AgentActivity {
|
|
|
739
807
|
${instructions}`;
|
|
740
808
|
}
|
|
741
809
|
const task = this.createSpeechTask({
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
userMessage
|
|
753
|
-
)
|
|
810
|
+
taskFn: (abortController) => this.pipelineReplyTask(
|
|
811
|
+
handle,
|
|
812
|
+
chatCtx ?? this.agent.chatCtx,
|
|
813
|
+
this.agent.toolCtx,
|
|
814
|
+
{
|
|
815
|
+
toolChoice: toOaiToolChoice(toolChoice !== void 0 ? toolChoice : this.toolChoice)
|
|
816
|
+
},
|
|
817
|
+
abortController,
|
|
818
|
+
instructions,
|
|
819
|
+
userMessage
|
|
754
820
|
),
|
|
755
821
|
ownedSpeechHandle: handle,
|
|
756
822
|
name: "AgentActivity.pipelineReply"
|
|
757
823
|
});
|
|
758
|
-
task.finally(() => this.onPipelineReplyDone());
|
|
824
|
+
task.result.finally(() => this.onPipelineReplyDone());
|
|
759
825
|
}
|
|
760
826
|
if (scheduleSpeech) {
|
|
761
827
|
this.scheduleSpeech(handle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
762
828
|
}
|
|
763
829
|
return handle;
|
|
764
830
|
}
|
|
765
|
-
interrupt() {
|
|
831
|
+
interrupt(options = {}) {
|
|
766
832
|
var _a;
|
|
833
|
+
const { force = false } = options;
|
|
834
|
+
this.cancelPreemptiveGeneration();
|
|
767
835
|
const future = new import_utils.Future();
|
|
768
836
|
const currentSpeech = this._currentSpeech;
|
|
769
|
-
currentSpeech == null ? void 0 : currentSpeech.interrupt();
|
|
837
|
+
currentSpeech == null ? void 0 : currentSpeech.interrupt(force);
|
|
770
838
|
for (const [_, __, speech] of this.speechQueue) {
|
|
771
|
-
speech.interrupt();
|
|
839
|
+
speech.interrupt(force);
|
|
772
840
|
}
|
|
773
841
|
(_a = this.realtimeSession) == null ? void 0 : _a.interrupt();
|
|
774
842
|
if (currentSpeech === void 0) {
|
|
@@ -789,7 +857,7 @@ ${instructions}`;
|
|
|
789
857
|
async userTurnCompleted(info, oldTask) {
|
|
790
858
|
var _a, _b;
|
|
791
859
|
if (oldTask) {
|
|
792
|
-
await oldTask;
|
|
860
|
+
await oldTask.result;
|
|
793
861
|
}
|
|
794
862
|
if (this.llm instanceof import_llm.RealtimeModel) {
|
|
795
863
|
if (this.llm.capabilities.turnDetection) {
|
|
@@ -871,7 +939,7 @@ ${instructions}`;
|
|
|
871
939
|
}
|
|
872
940
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
873
941
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
874
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
942
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
875
943
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
876
944
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
877
945
|
await speechHandle.waitIfNotInterrupted([speechHandle._waitForAuthorization()]);
|
|
@@ -976,7 +1044,7 @@ ${instructions}`;
|
|
|
976
1044
|
toolsMessages,
|
|
977
1045
|
span
|
|
978
1046
|
}) => {
|
|
979
|
-
var _a, _b
|
|
1047
|
+
var _a, _b;
|
|
980
1048
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
981
1049
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
982
1050
|
if (instructions) {
|
|
@@ -989,7 +1057,7 @@ ${instructions}`;
|
|
|
989
1057
|
if (localParticipant) {
|
|
990
1058
|
(0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
|
|
991
1059
|
}
|
|
992
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
1060
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
993
1061
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
994
1062
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
995
1063
|
chatCtx = chatCtx.copy();
|
|
@@ -1122,11 +1190,11 @@ ${instructions}`;
|
|
|
1122
1190
|
for (const msg of toolsMessages) {
|
|
1123
1191
|
msg.createdAt = replyStartedAt;
|
|
1124
1192
|
}
|
|
1125
|
-
this.agent._chatCtx.insert(toolsMessages);
|
|
1126
1193
|
const toolCallOutputs = toolsMessages.filter(
|
|
1127
1194
|
(m) => m.type === "function_call_output"
|
|
1128
1195
|
);
|
|
1129
1196
|
if (toolCallOutputs.length > 0) {
|
|
1197
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1130
1198
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1131
1199
|
}
|
|
1132
1200
|
}
|
|
@@ -1214,45 +1282,15 @@ ${instructions}`;
|
|
|
1214
1282
|
);
|
|
1215
1283
|
return;
|
|
1216
1284
|
}
|
|
1217
|
-
const functionToolsExecutedEvent = (
|
|
1218
|
-
functionCalls: [],
|
|
1219
|
-
functionCallOutputs: []
|
|
1220
|
-
});
|
|
1221
|
-
let shouldGenerateToolReply = false;
|
|
1222
|
-
let newAgentTask = null;
|
|
1223
|
-
let ignoreTaskSwitch = false;
|
|
1224
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1225
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1226
|
-
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1227
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1228
|
-
if (sanitizedOut.replyRequired) {
|
|
1229
|
-
shouldGenerateToolReply = true;
|
|
1230
|
-
}
|
|
1231
|
-
}
|
|
1232
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1233
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1234
|
-
ignoreTaskSwitch = true;
|
|
1235
|
-
}
|
|
1236
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1237
|
-
this.logger.debug(
|
|
1238
|
-
{
|
|
1239
|
-
speechId: speechHandle.id,
|
|
1240
|
-
name: (_c = sanitizedOut.toolCall) == null ? void 0 : _c.name,
|
|
1241
|
-
args: sanitizedOut.toolCall.args,
|
|
1242
|
-
output: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.output,
|
|
1243
|
-
isError: (_e = sanitizedOut.toolCallOutput) == null ? void 0 : _e.isError
|
|
1244
|
-
},
|
|
1245
|
-
"Tool call execution finished"
|
|
1246
|
-
);
|
|
1247
|
-
}
|
|
1285
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1248
1286
|
this.agentSession.emit(
|
|
1249
1287
|
import_events.AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1250
1288
|
functionToolsExecutedEvent
|
|
1251
1289
|
);
|
|
1252
|
-
let
|
|
1290
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1253
1291
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1254
1292
|
this.agentSession.updateAgent(newAgentTask);
|
|
1255
|
-
|
|
1293
|
+
schedulingPaused = true;
|
|
1256
1294
|
}
|
|
1257
1295
|
const toolMessages = [
|
|
1258
1296
|
...functionToolsExecutedEvent.functionCalls,
|
|
@@ -1261,34 +1299,32 @@ ${instructions}`;
|
|
|
1261
1299
|
if (shouldGenerateToolReply) {
|
|
1262
1300
|
chatCtx.insert(toolMessages);
|
|
1263
1301
|
speechHandle._numSteps += 1;
|
|
1264
|
-
const respondToolChoice =
|
|
1302
|
+
const respondToolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1265
1303
|
const toolResponseTask = this.createSpeechTask({
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
toolMessages
|
|
1276
|
-
)
|
|
1304
|
+
taskFn: () => this.pipelineReplyTask(
|
|
1305
|
+
speechHandle,
|
|
1306
|
+
chatCtx,
|
|
1307
|
+
toolCtx,
|
|
1308
|
+
{ toolChoice: respondToolChoice },
|
|
1309
|
+
replyAbortController,
|
|
1310
|
+
instructions,
|
|
1311
|
+
void 0,
|
|
1312
|
+
toolMessages
|
|
1277
1313
|
),
|
|
1278
1314
|
ownedSpeechHandle: speechHandle,
|
|
1279
1315
|
name: "AgentActivity.pipelineReply"
|
|
1280
1316
|
});
|
|
1281
|
-
toolResponseTask.finally(() => this.onPipelineReplyDone());
|
|
1317
|
+
toolResponseTask.result.finally(() => this.onPipelineReplyDone());
|
|
1282
1318
|
this.scheduleSpeech(speechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1283
1319
|
} else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1284
1320
|
for (const msg of toolMessages) {
|
|
1285
1321
|
msg.createdAt = replyStartedAt;
|
|
1286
1322
|
}
|
|
1287
|
-
this.agent._chatCtx.insert(toolMessages);
|
|
1288
1323
|
const toolCallOutputs = toolMessages.filter(
|
|
1289
1324
|
(m) => m.type === "function_call_output"
|
|
1290
1325
|
);
|
|
1291
1326
|
if (toolCallOutputs.length > 0) {
|
|
1327
|
+
this.agent._chatCtx.insert(toolCallOutputs);
|
|
1292
1328
|
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1293
1329
|
}
|
|
1294
1330
|
}
|
|
@@ -1332,14 +1368,14 @@ ${instructions}`;
|
|
|
1332
1368
|
replyAbortController,
|
|
1333
1369
|
span
|
|
1334
1370
|
}) {
|
|
1335
|
-
var _a
|
|
1371
|
+
var _a;
|
|
1336
1372
|
speechHandle._agentTurnContext = import_api.context.active();
|
|
1337
1373
|
span.setAttribute(import_telemetry.traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1338
1374
|
const localParticipant = (_a = this.agentSession._roomIO) == null ? void 0 : _a.localParticipant;
|
|
1339
1375
|
if (localParticipant) {
|
|
1340
1376
|
(0, import_utils2.setParticipantSpanAttributes)(span, localParticipant);
|
|
1341
1377
|
}
|
|
1342
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
1378
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
1343
1379
|
if (!this.realtimeSession) {
|
|
1344
1380
|
throw new Error("realtime session is not initialized");
|
|
1345
1381
|
}
|
|
@@ -1592,44 +1628,15 @@ ${instructions}`;
|
|
|
1592
1628
|
);
|
|
1593
1629
|
return;
|
|
1594
1630
|
}
|
|
1595
|
-
const functionToolsExecutedEvent = (
|
|
1596
|
-
functionCalls: [],
|
|
1597
|
-
functionCallOutputs: []
|
|
1598
|
-
});
|
|
1599
|
-
let shouldGenerateToolReply = false;
|
|
1600
|
-
let newAgentTask = null;
|
|
1601
|
-
let ignoreTaskSwitch = false;
|
|
1602
|
-
for (const sanitizedOut of toolOutput.output) {
|
|
1603
|
-
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1604
|
-
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1605
|
-
if (sanitizedOut.replyRequired) {
|
|
1606
|
-
shouldGenerateToolReply = true;
|
|
1607
|
-
}
|
|
1608
|
-
}
|
|
1609
|
-
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1610
|
-
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1611
|
-
ignoreTaskSwitch = true;
|
|
1612
|
-
}
|
|
1613
|
-
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1614
|
-
this.logger.debug(
|
|
1615
|
-
{
|
|
1616
|
-
speechId: speechHandle.id,
|
|
1617
|
-
name: (_b = sanitizedOut.toolCall) == null ? void 0 : _b.name,
|
|
1618
|
-
args: sanitizedOut.toolCall.args,
|
|
1619
|
-
output: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.output,
|
|
1620
|
-
isError: (_d = sanitizedOut.toolCallOutput) == null ? void 0 : _d.isError
|
|
1621
|
-
},
|
|
1622
|
-
"Tool call execution finished"
|
|
1623
|
-
);
|
|
1624
|
-
}
|
|
1631
|
+
const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = this.summarizeToolExecutionOutput(toolOutput, speechHandle);
|
|
1625
1632
|
this.agentSession.emit(
|
|
1626
1633
|
import_events.AgentSessionEventTypes.FunctionToolsExecuted,
|
|
1627
1634
|
functionToolsExecutedEvent
|
|
1628
1635
|
);
|
|
1629
|
-
let
|
|
1636
|
+
let schedulingPaused = this.schedulingPaused;
|
|
1630
1637
|
if (!ignoreTaskSwitch && newAgentTask !== null) {
|
|
1631
1638
|
this.agentSession.updateAgent(newAgentTask);
|
|
1632
|
-
|
|
1639
|
+
schedulingPaused = true;
|
|
1633
1640
|
}
|
|
1634
1641
|
if (functionToolsExecutedEvent.functionCallOutputs.length > 0) {
|
|
1635
1642
|
while (this.currentSpeech || this.speechQueue.size() > 0) {
|
|
@@ -1670,20 +1677,58 @@ ${instructions}`;
|
|
|
1670
1677
|
speechHandle: replySpeechHandle
|
|
1671
1678
|
})
|
|
1672
1679
|
);
|
|
1673
|
-
const toolChoice =
|
|
1680
|
+
const toolChoice = schedulingPaused || modelSettings.toolChoice === "none" ? "none" : "auto";
|
|
1674
1681
|
this.createSpeechTask({
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
})
|
|
1681
|
-
),
|
|
1682
|
+
taskFn: (abortController) => this.realtimeReplyTask({
|
|
1683
|
+
speechHandle: replySpeechHandle,
|
|
1684
|
+
modelSettings: { toolChoice },
|
|
1685
|
+
abortController
|
|
1686
|
+
}),
|
|
1682
1687
|
ownedSpeechHandle: replySpeechHandle,
|
|
1683
1688
|
name: "AgentActivity.realtime_reply"
|
|
1684
1689
|
});
|
|
1685
1690
|
this.scheduleSpeech(replySpeechHandle, import_speech_handle.SpeechHandle.SPEECH_PRIORITY_NORMAL, true);
|
|
1686
1691
|
}
|
|
1692
|
+
summarizeToolExecutionOutput(toolOutput, speechHandle) {
|
|
1693
|
+
var _a, _b, _c;
|
|
1694
|
+
const functionToolsExecutedEvent = (0, import_events.createFunctionToolsExecutedEvent)({
|
|
1695
|
+
functionCalls: [],
|
|
1696
|
+
functionCallOutputs: []
|
|
1697
|
+
});
|
|
1698
|
+
let shouldGenerateToolReply = false;
|
|
1699
|
+
let newAgentTask = null;
|
|
1700
|
+
let ignoreTaskSwitch = false;
|
|
1701
|
+
for (const sanitizedOut of toolOutput.output) {
|
|
1702
|
+
if (sanitizedOut.toolCallOutput !== void 0) {
|
|
1703
|
+
functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall);
|
|
1704
|
+
functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput);
|
|
1705
|
+
if (sanitizedOut.replyRequired) {
|
|
1706
|
+
shouldGenerateToolReply = true;
|
|
1707
|
+
}
|
|
1708
|
+
}
|
|
1709
|
+
if (newAgentTask !== null && sanitizedOut.agentTask !== void 0) {
|
|
1710
|
+
this.logger.error("expected to receive only one agent task from the tool executions");
|
|
1711
|
+
ignoreTaskSwitch = true;
|
|
1712
|
+
}
|
|
1713
|
+
newAgentTask = sanitizedOut.agentTask ?? null;
|
|
1714
|
+
this.logger.debug(
|
|
1715
|
+
{
|
|
1716
|
+
speechId: speechHandle.id,
|
|
1717
|
+
name: (_a = sanitizedOut.toolCall) == null ? void 0 : _a.name,
|
|
1718
|
+
args: sanitizedOut.toolCall.args,
|
|
1719
|
+
output: (_b = sanitizedOut.toolCallOutput) == null ? void 0 : _b.output,
|
|
1720
|
+
isError: (_c = sanitizedOut.toolCallOutput) == null ? void 0 : _c.isError
|
|
1721
|
+
},
|
|
1722
|
+
"Tool call execution finished"
|
|
1723
|
+
);
|
|
1724
|
+
}
|
|
1725
|
+
return {
|
|
1726
|
+
functionToolsExecutedEvent,
|
|
1727
|
+
shouldGenerateToolReply,
|
|
1728
|
+
newAgentTask,
|
|
1729
|
+
ignoreTaskSwitch
|
|
1730
|
+
};
|
|
1731
|
+
}
|
|
1687
1732
|
async realtimeReplyTask({
|
|
1688
1733
|
speechHandle,
|
|
1689
1734
|
modelSettings: { toolChoice },
|
|
@@ -1691,7 +1736,7 @@ ${instructions}`;
|
|
|
1691
1736
|
instructions,
|
|
1692
1737
|
abortController
|
|
1693
1738
|
}) {
|
|
1694
|
-
speechHandleStorage.enterWith(speechHandle);
|
|
1739
|
+
import_agent.speechHandleStorage.enterWith(speechHandle);
|
|
1695
1740
|
if (!this.realtimeSession) {
|
|
1696
1741
|
throw new Error("realtime session is not available");
|
|
1697
1742
|
}
|
|
@@ -1725,13 +1770,45 @@ ${instructions}`;
|
|
|
1725
1770
|
}
|
|
1726
1771
|
}
|
|
1727
1772
|
scheduleSpeech(speechHandle, priority, force = false) {
|
|
1728
|
-
if (this.
|
|
1729
|
-
throw new Error("cannot schedule new speech, the
|
|
1773
|
+
if (this.schedulingPaused && !force) {
|
|
1774
|
+
throw new Error("cannot schedule new speech, the speech scheduling is draining/pausing");
|
|
1730
1775
|
}
|
|
1731
1776
|
this.speechQueue.push([priority, Number(process.hrtime.bigint()), speechHandle]);
|
|
1732
1777
|
speechHandle._markScheduled();
|
|
1733
1778
|
this.wakeupMainTask();
|
|
1734
1779
|
}
|
|
1780
|
+
async _pauseSchedulingTask(blockedTasks) {
|
|
1781
|
+
if (this._schedulingPaused) return;
|
|
1782
|
+
this._schedulingPaused = true;
|
|
1783
|
+
this._drainBlockedTasks = blockedTasks;
|
|
1784
|
+
this.wakeupMainTask();
|
|
1785
|
+
if (this._mainTask) {
|
|
1786
|
+
await this._mainTask.result;
|
|
1787
|
+
}
|
|
1788
|
+
}
|
|
1789
|
+
_resumeSchedulingTask() {
|
|
1790
|
+
if (!this._schedulingPaused) return;
|
|
1791
|
+
this._schedulingPaused = false;
|
|
1792
|
+
this._mainTask = import_utils.Task.from(({ signal }) => this.mainTask(signal));
|
|
1793
|
+
}
|
|
1794
|
+
async pause(options = {}) {
|
|
1795
|
+
const { blockedTasks = [] } = options;
|
|
1796
|
+
const unlock = await this.lock.lock();
|
|
1797
|
+
try {
|
|
1798
|
+
const span = import_telemetry.tracer.startSpan({
|
|
1799
|
+
name: "pause_agent_activity",
|
|
1800
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1801
|
+
});
|
|
1802
|
+
try {
|
|
1803
|
+
await this._pauseSchedulingTask(blockedTasks);
|
|
1804
|
+
await this._closeSessionResources();
|
|
1805
|
+
} finally {
|
|
1806
|
+
span.end();
|
|
1807
|
+
}
|
|
1808
|
+
} finally {
|
|
1809
|
+
unlock();
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1735
1812
|
async drain() {
|
|
1736
1813
|
return import_telemetry.tracer.startActiveSpan(async (span) => this._drainImpl(span), {
|
|
1737
1814
|
name: "drain_agent_activity",
|
|
@@ -1739,72 +1816,80 @@ ${instructions}`;
|
|
|
1739
1816
|
});
|
|
1740
1817
|
}
|
|
1741
1818
|
async _drainImpl(span) {
|
|
1742
|
-
var _a;
|
|
1743
1819
|
span.setAttribute(import_telemetry.traceTypes.ATTR_AGENT_LABEL, this.agent.id);
|
|
1744
1820
|
const unlock = await this.lock.lock();
|
|
1745
1821
|
try {
|
|
1746
|
-
if (this.
|
|
1747
|
-
this.
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
task: import_utils.Task.from(() => onExitTask),
|
|
1822
|
+
if (this._schedulingPaused) return;
|
|
1823
|
+
this._onExitTask = this.createSpeechTask({
|
|
1824
|
+
taskFn: () => import_telemetry.tracer.startActiveSpan(async () => this.agent.onExit(), {
|
|
1825
|
+
name: "on_exit",
|
|
1826
|
+
attributes: { [import_telemetry.traceTypes.ATTR_AGENT_LABEL]: this.agent.id }
|
|
1827
|
+
}),
|
|
1828
|
+
inlineTask: true,
|
|
1754
1829
|
name: "AgentActivity_onExit"
|
|
1755
1830
|
});
|
|
1756
|
-
this.
|
|
1757
|
-
this.
|
|
1758
|
-
await
|
|
1831
|
+
this.cancelPreemptiveGeneration();
|
|
1832
|
+
await this._onExitTask.result;
|
|
1833
|
+
await this._pauseSchedulingTask([]);
|
|
1759
1834
|
} finally {
|
|
1760
1835
|
unlock();
|
|
1761
1836
|
}
|
|
1762
1837
|
}
|
|
1763
1838
|
async close() {
|
|
1764
|
-
var _a, _b, _c, _d;
|
|
1765
1839
|
const unlock = await this.lock.lock();
|
|
1766
1840
|
try {
|
|
1767
|
-
if (!this._draining) {
|
|
1768
|
-
this.logger.warn("task closing without draining");
|
|
1769
|
-
}
|
|
1770
1841
|
this.cancelPreemptiveGeneration();
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
if (this.realtimeSession) {
|
|
1775
|
-
this.realtimeSession.off("generation_created", this.onGenerationCreated);
|
|
1776
|
-
this.realtimeSession.off("input_speech_started", this.onInputSpeechStarted);
|
|
1777
|
-
this.realtimeSession.off("input_speech_stopped", this.onInputSpeechStopped);
|
|
1778
|
-
this.realtimeSession.off(
|
|
1779
|
-
"input_audio_transcription_completed",
|
|
1780
|
-
this.onInputAudioTranscriptionCompleted
|
|
1781
|
-
);
|
|
1782
|
-
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1783
|
-
}
|
|
1784
|
-
if (this.stt instanceof import_stt.STT) {
|
|
1785
|
-
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1842
|
+
await this._closeSessionResources();
|
|
1843
|
+
if (this._mainTask) {
|
|
1844
|
+
await this._mainTask.cancelAndWait();
|
|
1786
1845
|
}
|
|
1787
|
-
|
|
1788
|
-
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1789
|
-
}
|
|
1790
|
-
if (this.vad instanceof import_vad.VAD) {
|
|
1791
|
-
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1792
|
-
}
|
|
1793
|
-
this.detachAudioInput();
|
|
1794
|
-
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1795
|
-
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1796
|
-
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1797
|
-
await ((_d = this._mainTask) == null ? void 0 : _d.cancelAndWait());
|
|
1846
|
+
this.agent._agentActivity = void 0;
|
|
1798
1847
|
} finally {
|
|
1799
1848
|
unlock();
|
|
1800
1849
|
}
|
|
1801
1850
|
}
|
|
1851
|
+
async _closeSessionResources() {
|
|
1852
|
+
var _a, _b, _c;
|
|
1853
|
+
if (this.llm instanceof import_llm.LLM) {
|
|
1854
|
+
this.llm.off("metrics_collected", this.onMetricsCollected);
|
|
1855
|
+
this.llm.off("error", this.onModelError);
|
|
1856
|
+
}
|
|
1857
|
+
if (this.realtimeSession) {
|
|
1858
|
+
this.realtimeSession.off("generation_created", this.onRealtimeGenerationCreated);
|
|
1859
|
+
this.realtimeSession.off("input_speech_started", this.onRealtimeInputSpeechStarted);
|
|
1860
|
+
this.realtimeSession.off("input_speech_stopped", this.onRealtimeInputSpeechStopped);
|
|
1861
|
+
this.realtimeSession.off(
|
|
1862
|
+
"input_audio_transcription_completed",
|
|
1863
|
+
this.onRealtimeInputAudioTranscriptionCompleted
|
|
1864
|
+
);
|
|
1865
|
+
this.realtimeSession.off("metrics_collected", this.onMetricsCollected);
|
|
1866
|
+
this.realtimeSession.off("error", this.onModelError);
|
|
1867
|
+
}
|
|
1868
|
+
if (this.stt instanceof import_stt.STT) {
|
|
1869
|
+
this.stt.off("metrics_collected", this.onMetricsCollected);
|
|
1870
|
+
this.stt.off("error", this.onModelError);
|
|
1871
|
+
}
|
|
1872
|
+
if (this.tts instanceof import_tts.TTS) {
|
|
1873
|
+
this.tts.off("metrics_collected", this.onMetricsCollected);
|
|
1874
|
+
this.tts.off("error", this.onModelError);
|
|
1875
|
+
}
|
|
1876
|
+
if (this.vad instanceof import_vad.VAD) {
|
|
1877
|
+
this.vad.off("metrics_collected", this.onMetricsCollected);
|
|
1878
|
+
}
|
|
1879
|
+
this.detachAudioInput();
|
|
1880
|
+
(_a = this.realtimeSpans) == null ? void 0 : _a.clear();
|
|
1881
|
+
await ((_b = this.realtimeSession) == null ? void 0 : _b.close());
|
|
1882
|
+
await ((_c = this.audioRecognition) == null ? void 0 : _c.close());
|
|
1883
|
+
this.realtimeSession = void 0;
|
|
1884
|
+
this.audioRecognition = void 0;
|
|
1885
|
+
}
|
|
1802
1886
|
}
|
|
1803
1887
|
function toOaiToolChoice(toolChoice) {
|
|
1804
1888
|
return toolChoice !== null ? toolChoice : void 0;
|
|
1805
1889
|
}
|
|
1806
1890
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1807
1891
|
0 && (module.exports = {
|
|
1808
|
-
AgentActivity
|
|
1892
|
+
AgentActivity,
|
|
1893
|
+
agentActivityStorage
|
|
1809
1894
|
});
|
|
1810
1895
|
//# sourceMappingURL=agent_activity.cjs.map
|